diff --git a/brian2cuda/device.py b/brian2cuda/device.py
index b4610ee3..0f134042 100644
--- a/brian2cuda/device.py
+++ b/brian2cuda/device.py
@@ -49,6 +49,17 @@
         ''',
         ),
 
+    gpu_heap_size = BrianPreference(
+        docs='''
+        Size of the heap (in MB) used by malloc() and free() device system calls, which
+        are used in the `cudaVector` implementation. `cudaVectors` are used to
+        dynamically allocate device memory for `Spikemonitors` and the synapse
+        queues in the `CudaSpikeQueue` implementation for networks with
+        heterogeneously distributed delays.
+        ''',
+        validator=lambda v: isinstance(v, int) and v >= 0,
+        default=128),
+
     curand_float_type=BrianPreference(
         docs='''
         Floating point type of generated random numbers (float/double).
@@ -373,7 +384,8 @@ def generate_main_source(self, writer, main_includes):
                                                           code_objects=self.code_objects.values(),
                                                           report_func=self.report_func,
                                                           dt=float(defaultclock.dt),
-                                                          additional_headers=main_includes
+                                                          additional_headers=main_includes,
+                                                          gpu_heap_size=prefs['devices.cuda_standalone.gpu_heap_size']
                                                           )
         writer.write('main.cu', main_tmp)
         
diff --git a/brian2cuda/templates/main.cu b/brian2cuda/templates/main.cu
index af7cedf0..6c56044a 100644
--- a/brian2cuda/templates/main.cu
+++ b/brian2cuda/templates/main.cu
@@ -28,7 +28,7 @@ int main(int argc, char **argv)
 
 	cudaDeviceProp props;
 	cudaGetDeviceProperties(&props, 0);
-	size_t limit = 128 * 1024 * 1024;
+	size_t limit = {{gpu_heap_size}} * 1024 * 1024;
 	cudaDeviceSetLimit(cudaLimitMallocHeapSize, limit);
 	cudaDeviceSynchronize();
 	
diff --git a/brian2cuda/templates/synapses_initialise_queue.cu b/brian2cuda/templates/synapses_initialise_queue.cu
index 13245dd3..0deaabb6 100644
--- a/brian2cuda/templates/synapses_initialise_queue.cu
+++ b/brian2cuda/templates/synapses_initialise_queue.cu
@@ -17,7 +17,7 @@ namespace {
 __global__ void _run_{{codeobj_name}}_kernel(
 	unsigned int _source_N,
 	unsigned int _num_blocks,
-	unsigned int _num_threads_per_block,
+	unsigned int _num_threads,
 	double _dt,
 	unsigned int _syn_N,
 	unsigned int num_delays,
@@ -29,7 +29,7 @@ __global__ void _run_{{codeobj_name}}_kernel(
 
 	{{pathobj}}.queue->prepare(
 		tid,
-		_num_threads_per_block,
+		_num_threads,
 		_num_blocks,
 		0,
 		_source_N,
@@ -364,10 +364,48 @@ void _run_{{pathobj}}_initialise_queue()
 	{
 		num_threads = max_threads_per_block;
 	}
-	_run_{{codeobj_name}}_kernel<<<1, num_threads>>>(
+    unsigned int num_blocks = 1;
+
+    // check if we have enough resources to call kernel with given number
+    // of blocks and threads
+    struct cudaFuncAttributes funcAttrib;
+    cudaFuncGetAttributes(&funcAttrib, _run_{{codeobj_name}}_kernel);
+    if (num_threads > funcAttrib.maxThreadsPerBlock)
+    {
+        // use the max num_threads before launch failure
+        num_threads = funcAttrib.maxThreadsPerBlock;
+        printf("WARNING Not enough ressources available to call "
+               "_run_{{codeobj_name}}_kernel "
+               "with maximum possible threads per block (%u). "
+               "Reducing num_threads to %u. (Kernel needs %i "
+               "registers per block, %i bytes of "
+               "statically-allocated shared memory per block, %i "
+               "bytes of local memory per thread and a total of %i "
+               "bytes of user-allocated constant memory)\n",
+               max_threads_per_block, num_threads, funcAttrib.numRegs,
+               funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes,
+               funcAttrib.constSizeBytes);
+    }
+    else
+    {
+        printf("INFO _run_{{codeobj_name}}_kernel\n"
+               "\t%u blocks\n"
+               "\t%u threads\n"
+               "\t%i registers per block\n"
+               "\t%i bytes statically-allocated shared memory per block\n"
+               "\t%i bytes local memory per thread\n"
+               "\t%i bytes user-allocated constant memory\n"
+               "",
+               num_blocks, num_threads, funcAttrib.numRegs,
+               funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes,
+               funcAttrib.constSizeBytes);
+    }
+
+
+	_run_{{codeobj_name}}_kernel<<<num_blocks, num_threads>>>(
 		source_N,
 		num_parallel_blocks,
-		max_threads_per_block,
+		num_threads,
 		dt,
 		syn_N,
 		num_delays,
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/README.md b/dev/benchmarks/results_2017_04_05_complete_after_talk/README.md
new file mode 100644
index 00000000..d5f47013
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/README.md
@@ -0,0 +1,1752 @@
+
+# Benchmark results from 05.04.2017
+## Description:
+
+
+
+## Last git log:
+```
+commit 49e59d6b8fe0d84a3a1650e30e80e7caa023d987
+Author: Denis Alevi <mail@denisalevi.de>
+Date:   Wed Mar 29 20:14:08 2017 +0200
+
+    Revert to using cudaMemset to reset eventspace counter
+    
+    `__threadfence()` does not work in this
+
+```
+There is also a `git diff` saved in the current directory.
+
+## Results
+
+### AdaptationOscillation
+![](plots/speed_test_AdaptationOscillation_absolute.png)
+![](plots/speed_test_AdaptationOscillation_profiling.png)
+![](plots/speed_test_AdaptationOscillation_relative.png)
+
+<details><summary>Exemplary `nvprof` results for **CUDAStandaloneConfiguration**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==27090== NVPROF is profiling process 27090, command: ./main
+==27090== Profiling application: ./main
+==27090== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 54.38%  151.00ms     10000  15.100us  2.8800us  70.592us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, int*, int, int*, double, double*, int*, int, bool*)
+ 18.09%  50.227ms     10000  5.0220us  4.7040us  6.8800us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double, double*, double*, double*, bool*, float*)
+ 11.30%  31.386ms     10000  3.1380us  3.0400us  4.2560us  [CUDA memset]
+  8.01%  22.246ms     10000  2.2240us  1.8560us  2.7520us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*)
+  7.90%  21.951ms     10000  2.1950us  1.5360us  3.0400us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, int*, double*, double*, bool*)
+  0.32%  881.25us         1  881.25us  881.25us  881.25us  void gen_sequenced<curandStateXORWOW, float2, normal_args_st, __operator_&__(float2 curand_normal_scaled2<curandStateXORWOW>(curandStateXORWOW*, normal_args_st))>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st)
+
+==27090== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 65.77%  370.03ms     40001  9.2500us  8.1820us  8.8454ms  cudaLaunch
+ 16.57%  93.193ms     10000  9.3190us  8.6380us  24.859us  cudaMemset
+ 13.98%  78.650ms    390005     201ns     149ns  319.77us  cudaSetupArgument
+  1.93%  10.868ms     40001     271ns     200ns  313.28us  cudaConfigureCall
+  1.70%  9.5546ms     40002     238ns     207ns  5.1700us  cudaGetLastError
+  0.03%  174.94us         1  174.94us  174.94us  174.94us  cudaMalloc
+  0.01%  50.180us         1  50.180us  50.180us  50.180us  cudaMemGetInfo
+  0.00%  23.192us        38     610ns     476ns  1.5970us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+  0.00%  19.120us         7  2.7310us  2.0000us  5.0070us  cudaFuncGetAttributes
+  0.00%  17.862us         1  17.862us  17.862us  17.862us  cudaDeviceSynchronize
+  0.00%  5.0460us        12     420ns     293ns  1.1020us  cudaDeviceGetAttribute
+  0.00%  3.2580us         3  1.0860us     659ns  1.8660us  cudaGetDevice
+
+```
+
+</p></details>
+
+
+<details><summary>Exemplary `nvprof` results for **GeNNConfigurationOptimized**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==27315== NVPROF is profiling process 27315, command: ./main test 1.0 1
+==27315== Profiling application: ./main test 1.0 1
+==27315== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 53.41%  151.83ms     10000  15.183us  1.9200us  1.1186ms  calcSynapses
+ 46.17%  131.26ms     10000  13.126us  10.560us  20.288us  calcNeurons
+  0.32%  903.46us        48  18.822us     960ns  129.47us  [CUDA memcpy HtoD]
+  0.10%  283.36us        14  20.240us  1.9840us  122.88us  [CUDA memcpy DtoH]
+
+==27315== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 48.83%  298.28ms        13  22.945ms  9.2060us  295.80ms  cudaHostAlloc
+ 46.42%  283.54ms     20000  14.176us  7.6710us  1.1119ms  cudaLaunch
+  2.61%  15.926ms        64  248.85us     409ns  13.875ms  cudaMemcpy
+  1.10%  6.6997ms     20000     334ns     268ns  303.73us  cudaConfigureCall
+  0.84%  5.1253ms     20000     256ns     228ns  5.1490us  cudaSetupArgument
+  0.14%  867.56us        13  66.735us  7.8370us  174.67us  cudaMalloc
+  0.04%  257.35us        83  3.1000us     186ns  109.74us  cuDeviceGetAttribute
+  0.01%  39.793us         1  39.793us  39.793us  39.793us  cuDeviceGetName
+  0.01%  36.797us         1  36.797us  36.797us  36.797us  cuDeviceTotalMem
+  0.00%  16.271us         1  16.271us  16.271us  16.271us  cudaSetDevice
+  0.00%  15.322us        13  1.1780us     539ns  3.3530us  cudaGetSymbolAddress
+  0.00%  2.6060us         2  1.3030us     777ns  1.8290us  cuDeviceGetCount
+  0.00%  1.8590us         1  1.8590us  1.8590us  1.8590us  cudaGetDeviceCount
+  0.00%     975ns         2     487ns     397ns     578ns  cuDeviceGet
+
+```
+
+</p></details>
+
+
+***
+
+### BrunelHakimModelHeterogeneousDelay
+![](plots/speed_test_BrunelHakimModelHeterogeneousDelay_absolute.png)
+![](plots/speed_test_BrunelHakimModelHeterogeneousDelay_profiling.png)
+![](plots/speed_test_BrunelHakimModelHeterogeneousDelay_relative.png)
+
+<details><summary>Exemplary `nvprof` results for **CUDAStandaloneConfiguration**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==13252== NVPROF is profiling process 13252, command: ./main
+==13252== Profiling application: ./main
+==13252== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 86.34%  3.21777s     10000  321.78us  1.5360us  4.9107ms  _run_synapses_pre_push_spikes_push_kernel(unsigned int, unsigned int, unsigned int, int*)
+  9.78%  364.56ms     10000  36.455us  2.2080us  84.928us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, double*, int*)
+  1.24%  46.150ms     10000  4.6140us  4.4480us  6.7520us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double, double*, double*, double*, bool*, float*)
+  0.86%  32.053ms     10000  3.2050us  2.9120us  4.2240us  [CUDA memset]
+  0.70%  25.923ms     10000  2.5920us  2.3680us  3.6160us  _run_synapses_pre_push_spikes_advance_kernel(void)
+  0.58%  21.708ms     10000  2.1700us  1.8880us  2.7200us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*)
+  0.48%  17.725ms     10000  1.7720us  1.6960us  2.0480us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*)
+  0.02%  880.45us         1  880.45us  880.45us  880.45us  void gen_sequenced<curandStateXORWOW, float2, normal_args_st, __operator_&__(float2 curand_normal_scaled2<curandStateXORWOW>(curandStateXORWOW*, normal_args_st))>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st)
+
+==13252== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 93.08%  3.54282s     60001  59.045us  7.8910us  6.6818ms  cudaLaunch
+  2.78%  105.95ms     10000  10.595us  8.3520us  305.06us  cudaMemset
+  1.61%  61.198ms         1  61.198ms  61.198ms  61.198ms  cudaDeviceSynchronize
+  1.60%  60.805ms    370005     164ns     130ns  296.03us  cudaSetupArgument
+  0.49%  18.710ms     60002     311ns     237ns  312.79us  cudaGetLastError
+  0.43%  16.481ms     60001     274ns     181ns  299.24us  cudaConfigureCall
+  0.00%  182.53us         1  182.53us  182.53us  182.53us  cudaMalloc
+  0.00%  71.394us         1  71.394us  71.394us  71.394us  cudaMemGetInfo
+  0.00%  20.387us        38     536ns     474ns  1.4760us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+  0.00%  18.951us         7  2.7070us  1.9760us  5.3870us  cudaFuncGetAttributes
+  0.00%  4.9460us        12     412ns     263ns  1.1520us  cudaDeviceGetAttribute
+  0.00%  2.8500us         3     950ns     608ns  1.6040us  cudaGetDevice
+
+```
+
+</p></details>
+
+
+<details><summary>Exemplary `nvprof` results for **GeNNConfigurationOptimized**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==13488== NVPROF is profiling process 13488, command: ./main test 1.0 1
+==13488== Profiling application: ./main test 1.0 1
+==13488== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 74.47%  118.07ms     10000  11.806us  10.016us  17.664us  calcNeurons
+ 18.42%  29.207ms     10000  2.9200us  1.9200us  17.664us  calcSynapses
+  5.59%  8.8552ms        40  221.38us     960ns  2.5145ms  [CUDA memcpy HtoD]
+  1.52%  2.4178ms        10  241.78us  1.9520us  2.3869ms  [CUDA memcpy DtoH]
+
+==13488== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 58.76%  270.99ms        11  24.635ms  17.531us  265.27ms  cudaHostAlloc
+ 36.00%  166.02ms     20000  8.3000us  7.6090us  315.35us  cudaLaunch
+  2.62%  12.069ms        53  227.72us     334ns  2.5281ms  cudaMemcpy
+  1.36%  6.2887ms     20000     314ns     240ns  302.98us  cudaConfigureCall
+  1.00%  4.6085ms     20000     230ns     217ns  2.8530us  cudaSetupArgument
+  0.19%  860.67us        11  78.243us  12.662us  173.88us  cudaMalloc
+  0.05%  234.84us        83  2.8290us     158ns  100.64us  cuDeviceGetAttribute
+  0.01%  32.245us         1  32.245us  32.245us  32.245us  cuDeviceTotalMem
+  0.01%  27.894us         1  27.894us  27.894us  27.894us  cuDeviceGetName
+  0.00%  14.621us        11  1.3290us     791ns  3.3800us  cudaGetSymbolAddress
+  0.00%  12.561us         1  12.561us  12.561us  12.561us  cudaSetDevice
+  0.00%  1.4740us         2     737ns     495ns     979ns  cuDeviceGetCount
+  0.00%  1.4370us         1  1.4370us  1.4370us  1.4370us  cudaGetDeviceCount
+  0.00%     524ns         2     262ns     227ns     297ns  cuDeviceGet
+
+```
+
+</p></details>
+
+
+***
+
+### BrunelHakimModelScalarDelay
+![](plots/speed_test_BrunelHakimModelScalarDelay_absolute.png)
+![](plots/speed_test_BrunelHakimModelScalarDelay_profiling.png)
+![](plots/speed_test_BrunelHakimModelScalarDelay_relative.png)
+
+<details><summary>Exemplary `nvprof` results for **CUDAStandaloneConfiguration**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==2491== NVPROF is profiling process 2491, command: ./main
+==2491== Profiling application: ./main
+==2491== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 28.57%  48.196ms     10000  4.8190us  4.5440us  6.7840us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double, double*, double*, double*, bool*, float*)
+ 27.77%  46.841ms     10000  4.6840us  2.8800us  31.584us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, double*, int*)
+ 19.44%  32.782ms     10000  3.2780us  3.2320us  3.7760us  [CUDA memset]
+ 12.58%  21.215ms     10000  2.1210us  1.9840us  2.5600us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*)
+ 11.12%  18.762ms     10000  1.8760us  1.7920us  2.1120us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*)
+  0.52%  880.90us         1  880.90us  880.90us  880.90us  void gen_sequenced<curandStateXORWOW, float2, normal_args_st, __operator_&__(float2 curand_normal_scaled2<curandStateXORWOW>(curandStateXORWOW*, normal_args_st))>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st)
+
+==2491== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 67.81%  358.71ms     40001  8.9670us  7.9890us  10.112ms  cudaLaunch
+ 16.69%  88.268ms     10000  8.8260us  8.3570us  34.808us  cudaMemset
+ 11.38%  60.182ms    330005     182ns     150ns  304.26us  cudaSetupArgument
+  2.12%  11.226ms     40001     280ns     197ns  305.80us  cudaConfigureCall
+  1.95%  10.335ms     40002     258ns     217ns  14.869us  cudaGetLastError
+  0.03%  178.47us         1  178.47us  178.47us  178.47us  cudaMalloc
+  0.01%  51.372us         1  51.372us  51.372us  51.372us  cudaMemGetInfo
+  0.00%  21.822us        38     574ns     469ns  3.0220us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+  0.00%  19.460us         7  2.7800us  2.0130us  5.1840us  cudaFuncGetAttributes
+  0.00%  17.572us         1  17.572us  17.572us  17.572us  cudaDeviceSynchronize
+  0.00%  5.0120us        12     417ns     283ns  1.0740us  cudaDeviceGetAttribute
+  0.00%  2.8560us         3     952ns     570ns  1.6710us  cudaGetDevice
+
+```
+
+</p></details>
+
+
+<details><summary>Exemplary `nvprof` results for **GeNNConfigurationOptimized**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==2741== NVPROF is profiling process 2741, command: ./main test 1.0 1
+==2741== Profiling application: ./main test 1.0 1
+==2741== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 71.49%  120.00ms     10000  11.999us  10.016us  18.144us  calcNeurons
+ 21.75%  36.501ms     10000  3.6500us  2.4960us  29.185us  calcSynapses
+  5.33%  8.9404ms        41  218.06us     960ns  2.5144ms  [CUDA memcpy HtoD]
+  1.43%  2.4037ms        10  240.37us  2.0480us  2.3725ms  [CUDA memcpy DtoH]
+
+==2741== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 59.17%  284.47ms        11  25.861ms  13.934us  278.41ms  cudaHostAlloc
+ 35.49%  170.60ms     20000  8.5300us  7.5850us  307.94us  cudaLaunch
+  2.68%  12.860ms        53  242.63us     394ns  2.5288ms  cudaMemcpy
+  1.36%  6.5596ms     20000     327ns     257ns  308.28us  cudaConfigureCall
+  1.04%  5.0131ms     20000     250ns     228ns  9.1940us  cudaSetupArgument
+  0.19%  898.78us        11  81.706us  9.2360us  153.32us  cudaMalloc
+  0.05%  226.47us        83  2.7280us     137ns  97.777us  cuDeviceGetAttribute
+  0.01%  31.138us         1  31.138us  31.138us  31.138us  cuDeviceTotalMem
+  0.01%  27.215us         1  27.215us  27.215us  27.215us  cuDeviceGetName
+  0.00%  12.953us        11  1.1770us     575ns  2.8170us  cudaGetSymbolAddress
+  0.00%  12.076us         1  12.076us  12.076us  12.076us  cudaMemcpyToSymbol
+  0.00%  10.837us         1  10.837us  10.837us  10.837us  cudaSetDevice
+  0.00%  1.5250us         1  1.5250us  1.5250us  1.5250us  cudaGetDeviceCount
+  0.00%  1.4930us         2     746ns     490ns  1.0030us  cuDeviceGetCount
+  0.00%     498ns         2     249ns     224ns     274ns  cuDeviceGet
+
+```
+
+</p></details>
+
+
+***
+
+### BrunelHakimModelScalarDelayNoMultiPrePost
+![](plots/speed_test_BrunelHakimModelScalarDelayNoMultiPrePost_absolute.png)
+![](plots/speed_test_BrunelHakimModelScalarDelayNoMultiPrePost_profiling.png)
+![](plots/speed_test_BrunelHakimModelScalarDelayNoMultiPrePost_relative.png)
+
+<details><summary>Exemplary `nvprof` results for **CUDAStandaloneConfiguration**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==23945== NVPROF is profiling process 23945, command: ./main
+==23945== Profiling application: ./main
+==23945== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 28.82%  47.429ms     10000  4.7420us  2.8800us  34.464us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, double*, int*)
+ 28.42%  46.768ms     10000  4.6760us  4.4480us  6.8800us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double, double*, double*, double*, bool*, float*)
+ 18.77%  30.887ms     10000  3.0880us  3.0400us  3.6160us  [CUDA memset]
+ 13.20%  21.722ms     10000  2.1720us  2.0160us  2.5920us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*)
+ 10.25%  16.871ms     10000  1.6870us  1.5680us  1.9840us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*)
+  0.54%  881.31us         1  881.31us  881.31us  881.31us  void gen_sequenced<curandStateXORWOW, float2, normal_args_st, __operator_&__(float2 curand_normal_scaled2<curandStateXORWOW>(curandStateXORWOW*, normal_args_st))>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st)
+
+==23945== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 68.47%  378.42ms     40001  9.4600us  8.3920us  11.185ms  cudaLaunch
+ 16.96%  93.726ms     10000  9.3720us  8.8820us  22.956us  cudaMemset
+ 10.76%  59.491ms    330005     180ns     148ns  309.86us  cudaSetupArgument
+  1.90%  10.527ms     40001     263ns     182ns  298.24us  cudaConfigureCall
+  1.84%  10.177ms     40002     254ns     225ns  10.282us  cudaGetLastError
+  0.03%  178.62us         1  178.62us  178.62us  178.62us  cudaMalloc
+  0.01%  52.598us         1  52.598us  52.598us  52.598us  cudaMemGetInfo
+  0.00%  25.078us        38     659ns     560ns  2.7750us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+  0.00%  19.936us         7  2.8480us  2.0920us  5.4650us  cudaFuncGetAttributes
+  0.00%  17.187us         1  17.187us  17.187us  17.187us  cudaDeviceSynchronize
+  0.00%  5.0920us        12     424ns     278ns  1.0780us  cudaDeviceGetAttribute
+  0.00%  3.1170us         3  1.0390us     523ns  1.9660us  cudaGetDevice
+
+```
+
+</p></details>
+
+
+<details><summary>Exemplary `nvprof` results for **GeNNConfigurationOptimized**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==24196== NVPROF is profiling process 24196, command: ./main test 1.0 1
+==24196== Profiling application: ./main test 1.0 1
+==24196== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 71.41%  120.56ms     10000  12.055us  10.048us  17.952us  calcNeurons
+ 21.88%  36.941ms     10000  3.6940us  2.5280us  26.912us  calcSynapses
+  5.29%  8.9319ms        41  217.85us     992ns  2.5123ms  [CUDA memcpy HtoD]
+  1.42%  2.3983ms        10  239.83us  2.0160us  2.3673ms  [CUDA memcpy DtoH]
+
+==24196== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 58.26%  272.15ms        11  24.741ms  19.067us  265.67ms  cudaHostAlloc
+ 36.33%  169.74ms     20000  8.4860us  7.6190us  310.62us  cudaLaunch
+  2.72%  12.686ms        53  239.35us     323ns  2.5267ms  cudaMemcpy
+  1.36%  6.3732ms     20000     318ns     242ns  300.70us  cudaConfigureCall
+  1.03%  4.8351ms     20000     241ns     210ns  10.299us  cudaSetupArgument
+  0.22%  1.0265ms        11  93.320us  12.594us  179.95us  cudaMalloc
+  0.05%  240.26us        83  2.8940us     152ns  104.47us  cuDeviceGetAttribute
+  0.01%  32.415us         1  32.415us  32.415us  32.415us  cuDeviceTotalMem
+  0.01%  28.407us         1  28.407us  28.407us  28.407us  cuDeviceGetName
+  0.00%  14.808us        11  1.3460us     741ns  3.2100us  cudaGetSymbolAddress
+  0.00%  14.772us         1  14.772us  14.772us  14.772us  cudaMemcpyToSymbol
+  0.00%  12.168us         1  12.168us  12.168us  12.168us  cudaSetDevice
+  0.00%  1.4860us         1  1.4860us  1.4860us  1.4860us  cudaGetDeviceCount
+  0.00%  1.4580us         2     729ns     473ns     985ns  cuDeviceGetCount
+  0.00%     537ns         2     268ns     226ns     311ns  cuDeviceGet
+
+```
+
+</p></details>
+
+
+***
+
+### COBAHH
+![](plots/speed_test_COBAHH_absolute.png)
+![](plots/speed_test_COBAHH_profiling.png)
+![](plots/speed_test_COBAHH_relative.png)
+
+<details><summary>Exemplary `nvprof` results for **CUDAStandaloneConfiguration**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==11907== NVPROF is profiling process 11907, command: ./main
+==11907== Profiling application: ./main
+==11907== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 39.16%  186.02ms     10000  18.602us  17.856us  21.568us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*, bool*, double*, double*, double*, double*, double, double*)
+ 29.93%  142.18ms     10000  14.218us  3.2320us  35.680us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double*, double, int*, int, int*)
+ 19.08%  90.630ms     10000  9.0620us  3.1680us  24.448us  kernel_synapses_1_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, int*, double*)
+  6.67%  31.670ms     10000  3.1660us  3.0400us  4.1920us  [CUDA memset]
+  5.15%  24.481ms     10000  2.4480us  2.0480us  2.7840us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*)
+
+==11907== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 66.01%  376.74ms     40000  9.4180us  8.4480us  6.9662ms  cudaLaunch
+ 15.97%  91.133ms     10000  9.1130us  8.5190us  28.283us  cudaMemset
+ 13.95%  79.611ms    470000     169ns     149ns  316.22us  cudaSetupArgument
+  2.29%  13.092ms     40000     327ns     202ns  311.93us  cudaConfigureCall
+  1.76%  10.072ms     40000     251ns     230ns  5.0760us  cudaGetLastError
+  0.01%  50.252us         1  50.252us  50.252us  50.252us  cudaMemGetInfo
+  0.00%  22.121us         1  22.121us  22.121us  22.121us  cudaDeviceSynchronize
+  0.00%  16.912us         6  2.8180us  2.0980us  4.5270us  cudaFuncGetAttributes
+  0.00%  13.875us        21     660ns     520ns  1.5110us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+  0.00%  3.9730us         8     496ns     302ns  1.1490us  cudaDeviceGetAttribute
+  0.00%  2.3840us         2  1.1920us     836ns  1.5480us  cudaGetDevice
+
+```
+
+</p></details>
+
+
+<details><summary>Exemplary `nvprof` results for **GeNNConfigurationOptimized**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==12169== NVPROF is profiling process 12169, command: ./main test 1.0 1
+==12169== Profiling application: ./main test 1.0 1
+==12169== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 64.38%  254.25ms     10000  25.425us  23.777us  28.416us  calcNeurons
+ 35.52%  140.25ms     10000  14.025us  2.4320us  41.696us  calcSynapses
+  0.07%  285.47us        68  4.1980us     960ns  42.944us  [CUDA memcpy HtoD]
+  0.03%  108.42us        18  6.0230us  1.9840us  40.736us  [CUDA memcpy DtoH]
+
+==12169== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 52.49%  378.74ms     20000  18.937us  7.6840us  358.81us  cudaLaunch
+ 42.10%  303.75ms        19  15.987ms  8.2320us  301.68ms  cudaHostAlloc
+  3.34%  24.097ms        88  273.83us     330ns  22.690ms  cudaMemcpy
+  1.06%  7.6642ms     20000     383ns     262ns  335.28us  cudaConfigureCall
+  0.86%  6.2250ms     20000     311ns     242ns  336.35us  cudaSetupArgument
+  0.10%  707.36us        19  37.229us  6.2200us  126.23us  cudaMalloc
+  0.03%  241.14us        83  2.9050us     137ns  109.48us  cuDeviceGetAttribute
+  0.00%  31.485us         1  31.485us  31.485us  31.485us  cuDeviceTotalMem
+  0.00%  30.190us         1  30.190us  30.190us  30.190us  cuDeviceGetName
+  0.00%  12.302us        19     647ns     344ns  2.1110us  cudaGetSymbolAddress
+  0.00%  11.562us         1  11.562us  11.562us  11.562us  cudaSetDevice
+  0.00%  1.5290us         2     764ns     561ns     968ns  cuDeviceGetCount
+  0.00%  1.4620us         1  1.4620us  1.4620us  1.4620us  cudaGetDeviceCount
+  0.00%     480ns         2     240ns     218ns     262ns  cuDeviceGet
+
+```
+
+</p></details>
+
+
+***
+
+### COBAHHFixedConnectivity
+![](plots/speed_test_COBAHHFixedConnectivity_absolute.png)
+![](plots/speed_test_COBAHHFixedConnectivity_profiling.png)
+![](plots/speed_test_COBAHHFixedConnectivity_relative.png)
+
+<details><summary>Exemplary `nvprof` results for **CUDAStandaloneConfiguration**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==17632== NVPROF is profiling process 17632, command: ./main
+==17632== Profiling application: ./main
+==17632== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 44.90%  349.33ms     10000  34.933us  1.6640us  111.13ms  kernel_spikemonitor_codeobject(unsigned int, int*, double, int*, int*, int*, int, int*, double*, int, int*, int*)
+ 23.60%  183.61ms     10000  18.361us  17.824us  21.856us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*, bool*, double*, double*, double*, double*, double, double*)
+ 14.85%  115.52ms     10000  11.551us  3.0720us  36.353us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double*, double, int*, int, int*)
+  9.49%  73.847ms     10000  7.3840us  3.0720us  24.064us  kernel_synapses_1_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, int*, double*)
+  4.03%  31.352ms     10000  3.1350us  3.0400us  4.2880us  [CUDA memset]
+  3.12%  24.285ms     10000  2.4280us  2.0480us  2.7840us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*)
+  0.01%  68.000us         1  68.000us  68.000us  68.000us  _run_spikemonitor_codeobject_init(void)
+
+==17632== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 73.31%  632.36ms     50001  12.646us  8.2740us  95.930ms  cudaLaunch
+ 12.10%  104.36ms    590000     176ns     149ns  346.69us  cudaSetupArgument
+ 11.27%  97.201ms     10000  9.7200us  8.6440us  1.1383ms  cudaMemset
+  1.55%  13.390ms     50001     267ns     192ns  331.43us  cudaConfigureCall
+  1.55%  13.349ms     50001     266ns     220ns  330.51us  cudaGetLastError
+  0.21%  1.8328ms         1  1.8328ms  1.8328ms  1.8328ms  cudaDeviceSynchronize
+  0.01%  51.143us         1  51.143us  51.143us  51.143us  cudaMemGetInfo
+  0.00%  18.972us         7  2.7100us  2.0070us  4.6510us  cudaFuncGetAttributes
+  0.00%  14.003us        22     636ns     470ns  1.4930us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+  0.00%  4.3080us         8     538ns     317ns  1.2590us  cudaDeviceGetAttribute
+  0.00%  2.2780us         2  1.1390us     764ns  1.5140us  cudaGetDevice
+
+```
+
+</p></details>
+
+
+<details><summary>Exemplary `nvprof` results for **GeNNConfigurationOptimized**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==17891== NVPROF is profiling process 17891, command: ./main test 1.0 1
+==17891== Profiling application: ./main test 1.0 1
+==17891== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 66.58%  251.53ms     10000  25.153us  23.840us  28.000us  calcNeurons
+ 23.34%  88.193ms     10000  8.8190us  2.4320us  41.472us  calcSynapses
+  9.86%  37.269ms     18461  2.0180us  1.9520us  153.18us  [CUDA memcpy DtoH]
+  0.22%  820.87us        68  12.071us     960ns  164.23us  [CUDA memcpy HtoD]
+
+==17891== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 52.66%  509.16ms     20088  25.346us     320ns  371.03us  cudaMemcpy
+ 26.73%  258.42ms        19  13.601ms  8.8970us  255.30ms  cudaHostAlloc
+ 19.10%  184.67ms     20000  9.2330us  7.8160us  348.55us  cudaLaunch
+  0.81%  7.7916ms     20000     389ns     275ns  331.45us  cudaConfigureCall
+  0.56%  5.4451ms     20000     272ns     241ns  4.6710us  cudaSetupArgument
+  0.10%  1.0098ms        19  53.145us  6.4240us  173.26us  cudaMalloc
+  0.02%  226.52us        83  2.7290us     143ns  97.659us  cuDeviceGetAttribute
+  0.00%  31.331us         1  31.331us  31.331us  31.331us  cuDeviceTotalMem
+  0.00%  30.487us         1  30.487us  30.487us  30.487us  cuDeviceGetName
+  0.00%  18.126us        19     954ns     368ns  3.5740us  cudaGetSymbolAddress
+  0.00%  11.311us         1  11.311us  11.311us  11.311us  cudaSetDevice
+  0.00%  1.7800us         2     890ns     658ns  1.1220us  cuDeviceGetCount
+  0.00%  1.4830us         1  1.4830us  1.4830us  1.4830us  cudaGetDeviceCount
+  0.00%     640ns         2     320ns     242ns     398ns  cuDeviceGet
+
+```
+
+</p></details>
+
+
+***
+
+### CUBA
+![](plots/speed_test_CUBA_absolute.png)
+![](plots/speed_test_CUBA_profiling.png)
+![](plots/speed_test_CUBA_relative.png)
+
+<details><summary>Exemplary `nvprof` results for **CUDAStandaloneConfiguration**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==31291== NVPROF is profiling process 31291, command: ./main
+==31291== Profiling application: ./main
+==31291== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 31.18%  76.419ms     10000  7.6410us  7.3920us  8.7360us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double, double*, double*, double*, bool*)
+ 19.96%  48.924ms     10000  4.8920us  3.4560us  20.384us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double*, double, int*, int, int*, bool*)
+ 18.13%  44.432ms     10000  4.4430us  3.2960us  17.952us  kernel_synapses_1_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, int*, double*, bool*)
+ 13.38%  32.789ms     10000  3.2780us  3.2320us  3.7760us  [CUDA memset]
+  9.59%  23.496ms     10000  2.3490us  2.0480us  2.7520us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*)
+  7.76%  19.020ms     10000  1.9010us  1.6640us  2.0800us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*, bool*)
+
+==31291== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 68.69%  471.10ms     50000  9.4220us  8.2170us  19.231ms  cudaLaunch
+ 13.91%  95.387ms     10000  9.5380us  8.7960us  312.26us  cudaMemset
+ 13.50%  92.578ms    510000     181ns     148ns  324.51us  cudaSetupArgument
+  2.05%  14.040ms     50000     280ns     237ns  5.2940us  cudaConfigureCall
+  1.83%  12.581ms     50000     251ns     217ns  12.226us  cudaGetLastError
+  0.01%  51.575us         1  51.575us  51.575us  51.575us  cudaMemGetInfo
+  0.00%  21.460us        39     550ns     461ns  1.4270us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+  0.00%  21.129us         8  2.6410us  1.9560us  4.4310us  cudaFuncGetAttributes
+  0.00%  16.670us         1  16.670us  16.670us  16.670us  cudaDeviceSynchronize
+  0.00%  5.5840us        12     465ns     285ns  1.2870us  cudaDeviceGetAttribute
+  0.00%  3.3860us         3  1.1280us     653ns  1.8010us  cudaGetDevice
+
+```
+
+</p></details>
+
+
+<details><summary>Exemplary `nvprof` results for **GeNNConfigurationOptimized**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==31529== NVPROF is profiling process 31529, command: ./main test 1.0 1
+==31529== Profiling application: ./main test 1.0 1
+==31529== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 74.56%  131.02ms     10000  13.101us  11.808us  14.624us  calcNeurons
+ 24.85%  43.662ms     10000  4.3660us  2.1760us  25.760us  calcSynapses
+  0.45%  796.80us        56  14.228us     960ns  163.59us  [CUDA memcpy HtoD]
+  0.13%  234.31us        13  18.023us  1.9520us  155.27us  [CUDA memcpy DtoH]
+
+==31529== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 57.53%  276.80ms        16  17.300ms  8.5100us  274.32ms  cudaHostAlloc
+ 38.37%  184.60ms     20000  9.2300us  7.6370us  342.36us  cudaLaunch
+  1.48%  7.1407ms        73  97.817us     343ns  5.2594ms  cudaMemcpy
+  1.31%  6.3266ms     20000     316ns     249ns  315.38us  cudaConfigureCall
+  1.06%  5.1071ms     20000     255ns     220ns  4.6570us  cudaSetupArgument
+  0.17%  819.17us        16  51.198us  6.2400us  136.59us  cudaMalloc
+  0.05%  241.67us        83  2.9110us     138ns  103.86us  cuDeviceGetAttribute
+  0.01%  32.371us         1  32.371us  32.371us  32.371us  cuDeviceTotalMem
+  0.01%  28.436us         1  28.436us  28.436us  28.436us  cuDeviceGetName
+  0.00%  12.399us        16     774ns     424ns  2.0180us  cudaGetSymbolAddress
+  0.00%  12.047us         1  12.047us  12.047us  12.047us  cudaSetDevice
+  0.00%  1.6800us         1  1.6800us  1.6800us  1.6800us  cudaGetDeviceCount
+  0.00%  1.4560us         2     728ns     455ns  1.0010us  cuDeviceGetCount
+  0.00%     575ns         2     287ns     235ns     340ns  cuDeviceGet
+
+```
+
+</p></details>
+
+
+***
+
+### CUBAFixedConnectivity
+![](plots/speed_test_CUBAFixedConnectivity_absolute.png)
+![](plots/speed_test_CUBAFixedConnectivity_profiling.png)
+![](plots/speed_test_CUBAFixedConnectivity_relative.png)
+
+<details><summary>Exemplary `nvprof` results for **CUDAStandaloneConfiguration**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==28333== NVPROF is profiling process 28333, command: ./main
+==28333== Profiling application: ./main
+==28333== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 23.53%  75.188ms     10000  7.5180us  7.1360us  8.8960us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double, double*, double*, double*, bool*)
+ 20.88%  66.723ms     10000  6.6720us  1.6960us  14.967ms  kernel_spikemonitor_codeobject(unsigned int, int*, double, int*, int*, int*, int, int*, double*, int, int*, int*)
+ 17.07%  54.561ms     10000  5.4560us  3.2960us  21.920us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double*, double, int*, int, int*, bool*)
+ 15.31%  48.929ms     10000  4.8920us  3.2960us  18.784us  kernel_synapses_1_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, int*, double*, bool*)
+ 10.24%  32.716ms     10000  3.2710us  3.1360us  4.1920us  [CUDA memset]
+  7.36%  23.508ms     10000  2.3500us  2.0160us  2.7200us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*)
+  5.59%  17.866ms     10000  1.7860us  1.5360us  2.0800us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*, bool*)
+  0.02%  67.328us         1  67.328us  67.328us  67.328us  _run_spikemonitor_codeobject_init(void)
+
+==28333== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 70.32%  550.58ms     60001  9.1760us  8.3390us  6.9445ms  cudaLaunch
+ 14.00%  109.65ms    630000     174ns     148ns  343.93us  cudaSetupArgument
+ 11.69%  91.573ms     10000  9.1570us  8.5300us  165.12us  cudaMemset
+  1.99%  15.611ms     60001     260ns     222ns  327.19us  cudaConfigureCall
+  1.98%  15.472ms     60001     257ns     208ns  1.1493ms  cudaGetLastError
+  0.01%  51.353us         1  51.353us  51.353us  51.353us  cudaMemGetInfo
+  0.00%  24.711us        40     617ns     509ns  1.7610us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+  0.00%  23.494us         9  2.6100us  2.0080us  4.3370us  cudaFuncGetAttributes
+  0.00%  17.566us         1  17.566us  17.566us  17.566us  cudaDeviceSynchronize
+  0.00%  5.4430us        12     453ns     281ns  1.1050us  cudaDeviceGetAttribute
+  0.00%  3.0770us         3  1.0250us     646ns  1.6320us  cudaGetDevice
+
+```
+
+</p></details>
+
+
+<details><summary>Exemplary `nvprof` results for **GeNNConfigurationOptimized**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==28592== NVPROF is profiling process 28592, command: ./main test 1.0 1
+==28592== Profiling application: ./main test 1.0 1
+==28592== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 63.11%  133.95ms     10000  13.394us  12.384us  14.432us  calcNeurons
+ 22.74%  48.266ms     10000  4.8260us  2.7200us  24.896us  calcSynapses
+ 13.78%  29.240ms     14081  2.0760us  2.0160us  154.95us  [CUDA memcpy DtoH]
+  0.37%  793.60us        56  14.171us     960ns  163.11us  [CUDA memcpy HtoD]
+
+==28592== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 38.67%  315.20ms     20073  15.702us     324ns  773.07us  cudaMemcpy
+ 37.36%  304.57ms        16  19.036ms  8.7600us  301.99ms  cudaHostAlloc
+ 22.40%  182.59ms     20000  9.1290us  7.6730us  821.14us  cudaLaunch
+  0.78%  6.3728ms     20000     318ns     250ns  5.2440us  cudaConfigureCall
+  0.66%  5.3441ms     20000     267ns     226ns  332.81us  cudaSetupArgument
+  0.10%  800.29us        16  50.018us  6.1360us  126.53us  cudaMalloc
+  0.03%  230.87us        83  2.7810us     153ns  99.066us  cuDeviceGetAttribute
+  0.00%  32.084us         1  32.084us  32.084us  32.084us  cuDeviceTotalMem
+  0.00%  30.780us         1  30.780us  30.780us  30.780us  cuDeviceGetName
+  0.00%  12.549us        16     784ns     421ns  2.2350us  cudaGetSymbolAddress
+  0.00%  11.671us         1  11.671us  11.671us  11.671us  cudaSetDevice
+  0.00%  1.8440us         1  1.8440us  1.8440us  1.8440us  cudaGetDeviceCount
+  0.00%  1.7500us         2     875ns     690ns  1.0600us  cuDeviceGetCount
+  0.00%     626ns         2     313ns     253ns     373ns  cuDeviceGet
+
+```
+
+</p></details>
+
+
+***
+
+### DenseMediumRateSynapsesOnly
+![](plots/speed_test_DenseMediumRateSynapsesOnly_absolute.png)
+![](plots/speed_test_DenseMediumRateSynapsesOnly_profiling.png)
+![](plots/speed_test_DenseMediumRateSynapsesOnly_relative.png)
+
+<details><summary>Exemplary `nvprof` results for **CUDAStandaloneConfiguration**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==30551== NVPROF is profiling process 30551, command: ./main
+==30551== Profiling application: ./main
+==30551== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 56.01%  59.694ms     10000  5.9690us  5.6000us  6.4960us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, double*, int*, int, int*)
+ 28.93%  30.830ms     10000  3.0820us  3.0400us  3.5200us  [CUDA memset]
+ 15.06%  16.055ms     10000  1.6050us  1.5040us  2.4000us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*)
+
+==30551== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 59.17%  191.07ms     20000  9.5530us  8.3220us  11.129ms  cudaLaunch
+ 27.89%  90.062ms     10000  9.0060us  8.4390us  27.616us  cudaMemset
+  9.32%  30.084ms    170000     176ns     153ns  306.97us  cudaSetupArgument
+  1.82%  5.8925ms     20000     294ns     213ns  303.17us  cudaConfigureCall
+  1.77%  5.7023ms     20000     285ns     216ns  302.98us  cudaGetLastError
+  0.01%  46.403us         1  46.403us  46.403us  46.403us  cudaMemGetInfo
+  0.01%  18.635us         1  18.635us  18.635us  18.635us  cudaDeviceSynchronize
+  0.00%  8.8700us         3  2.9560us  2.1570us  3.7290us  cudaFuncGetAttributes
+  0.00%  6.7130us         3  2.2370us     629ns  3.5200us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+  0.00%  1.7730us         4     443ns     369ns     586ns  cudaDeviceGetAttribute
+  0.00%     848ns         1     848ns     848ns     848ns  cudaGetDevice
+
+```
+
+</p></details>
+
+
+<details><summary>Exemplary `nvprof` results for **GeNNConfigurationOptimized**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==30762== NVPROF is profiling process 30762, command: ./main test 1.0 1
+==30762== Profiling application: ./main test 1.0 1
+==30762== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 64.08%  52.562ms     10000  5.2560us  3.4240us  5.9200us  calcSynapses
+ 35.80%  29.364ms     10000  2.9360us  2.8800us  3.8080us  calcNeurons
+  0.07%  57.888us        44  1.3150us     960ns  2.2400us  [CUDA memcpy HtoD]
+  0.05%  38.240us        14  2.7310us  2.0160us  4.7360us  [CUDA memcpy DtoH]
+
+==30762== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 61.72%  283.35ms        12  23.613ms  14.143us  281.71ms  cudaHostAlloc
+ 35.34%  162.27ms     20000  8.1130us  7.4880us  334.11us  cudaLaunch
+  1.34%  6.1571ms     20000     307ns     256ns  322.44us  cudaConfigureCall
+  1.16%  5.3454ms     20000     267ns     224ns  332.57us  cudaSetupArgument
+  0.23%  1.0363ms        61  16.988us     318ns  37.131us  cudaMemcpy
+  0.14%  644.11us        12  53.676us  11.831us  178.21us  cudaMalloc
+  0.05%  226.72us        83  2.7310us     138ns  97.611us  cuDeviceGetAttribute
+  0.01%  31.315us         1  31.315us  31.315us  31.315us  cuDeviceTotalMem
+  0.01%  26.553us         1  26.553us  26.553us  26.553us  cuDeviceGetName
+  0.00%  13.976us        12  1.1640us     709ns  3.1230us  cudaGetSymbolAddress
+  0.00%  11.238us         1  11.238us  11.238us  11.238us  cudaSetDevice
+  0.00%  1.4430us         2     721ns     438ns  1.0050us  cuDeviceGetCount
+  0.00%  1.4380us         1  1.4380us  1.4380us  1.4380us  cudaGetDeviceCount
+  0.00%     582ns         2     291ns     214ns     368ns  cuDeviceGet
+
+```
+
+</p></details>
+
+
+***
+
+### HHNeuronsOnly
+![](plots/speed_test_HHNeuronsOnly_absolute.png)
+![](plots/speed_test_HHNeuronsOnly_profiling.png)
+![](plots/speed_test_HHNeuronsOnly_relative.png)
+
+<details><summary>Exemplary `nvprof` results for **CUDAStandaloneConfiguration**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==25014== NVPROF is profiling process 25014, command: ./main
+==25014== Profiling application: ./main
+==25014== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 76.60%  171.78ms     10000  17.177us  14.880us  18.080us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, bool*, double*, double*, double*, double*)
+ 13.61%  30.516ms     10000  3.0510us  2.8160us  3.5840us  [CUDA memset]
+  9.79%  21.945ms     10000  2.1940us  1.8240us  2.9120us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*)
+
+==25014== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 58.23%  179.09ms     20000  8.9540us  8.0160us  5.8117ms  cudaLaunch
+ 28.13%  86.520ms     10000  8.6520us  8.0220us  324.89us  cudaMemset
+ 10.05%  30.914ms    160000     193ns     150ns  347.54us  cudaSetupArgument
+  1.94%  5.9702ms     20000     298ns     223ns  315.53us  cudaConfigureCall
+  1.61%  4.9531ms     20000     247ns     210ns  327.22us  cudaGetLastError
+  0.02%  46.728us         1  46.728us  46.728us  46.728us  cudaMemGetInfo
+  0.01%  17.432us        35     498ns     471ns     917ns  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+  0.00%  10.745us         1  10.745us  10.745us  10.745us  cudaDeviceSynchronize
+  0.00%  10.378us         4  2.5940us  2.0060us  3.1740us  cudaFuncGetAttributes
+  0.00%  3.1700us         8     396ns     284ns     677ns  cudaDeviceGetAttribute
+  0.00%  1.6580us         2     829ns     801ns     857ns  cudaGetDevice
+
+```
+
+</p></details>
+
+
+<details><summary>Exemplary `nvprof` results for **GeNNConfigurationOptimized**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==25225== NVPROF is profiling process 25225, command: ./main test 1.0 1
+==25225== Profiling application: ./main test 1.0 1
+==25225== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 99.94%  177.51ms     10000  17.750us  14.944us  26.400us  calcNeurons
+  0.04%  62.626us        40  1.5650us     960ns  2.1760us  [CUDA memcpy HtoD]
+  0.02%  38.560us        11  3.5050us  2.0160us  4.6720us  [CUDA memcpy DtoH]
+
+==25225== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 55.84%  235.54ms        10  23.554ms  16.992us  233.93ms  cudaHostAlloc
+ 37.45%  157.95ms     10000  15.795us  7.9250us  353.53us  cudaLaunch
+  4.97%  20.977ms        53  395.80us     389ns  20.008ms  cudaMemcpy
+  0.81%  3.4097ms     10000     340ns     278ns  5.0220us  cudaConfigureCall
+  0.70%  2.9582ms     10000     295ns     232ns  339.82us  cudaSetupArgument
+  0.15%  630.64us        10  63.063us  12.457us  174.83us  cudaMalloc
+  0.05%  227.15us        83  2.7360us     140ns  98.109us  cuDeviceGetAttribute
+  0.01%  31.635us         1  31.635us  31.635us  31.635us  cuDeviceTotalMem
+  0.01%  31.273us         1  31.273us  31.273us  31.273us  cuDeviceGetName
+  0.00%  12.870us        10  1.2870us     741ns  3.5550us  cudaGetSymbolAddress
+  0.00%  10.918us         1  10.918us  10.918us  10.918us  cudaSetDevice
+  0.00%  1.9240us         2     962ns     718ns  1.2060us  cuDeviceGetCount
+  0.00%  1.4330us         1  1.4330us  1.4330us  1.4330us  cudaGetDeviceCount
+  0.00%     657ns         2     328ns     303ns     354ns  cuDeviceGet
+
+```
+
+</p></details>
+
+
+***
+
+### LinearNeuronsOnly
+![](plots/speed_test_LinearNeuronsOnly_absolute.png)
+![](plots/speed_test_LinearNeuronsOnly_profiling.png)
+![](plots/speed_test_LinearNeuronsOnly_relative.png)
+
+<details><summary>Exemplary `nvprof` results for **CUDAStandaloneConfiguration**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==19640== NVPROF is profiling process 19640, command: ./main
+==19640== Profiling application: ./main
+==19640== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+100.00%  247.35ms    100000  2.4730us  2.3360us  3.6800us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*)
+
+==19640== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 87.43%  837.87ms    100000  8.3780us  7.7260us  7.8274ms  cudaLaunch
+  7.01%  67.186ms    400000     167ns     147ns  10.910us  cudaSetupArgument
+  2.81%  26.904ms    100000     269ns     241ns  10.142us  cudaConfigureCall
+  2.74%  26.287ms    100000     262ns     235ns  11.074us  cudaGetLastError
+  0.01%  70.067us         1  70.067us  70.067us  70.067us  cudaMemGetInfo
+  0.00%  14.560us         2  7.2800us  4.1830us  10.377us  cudaFuncGetAttributes
+  0.00%  9.6320us         1  9.6320us  9.6320us  9.6320us  cudaDeviceSynchronize
+  0.00%  5.2800us         2  2.6400us  1.1150us  4.1650us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+  0.00%  3.9840us         1  3.9840us  3.9840us  3.9840us  cudaGetDevice
+  0.00%  3.7360us         4     934ns     668ns  1.5690us  cudaDeviceGetAttribute
+
+```
+
+</p></details>
+
+
+<details><summary>Exemplary `nvprof` results for **GeNNConfigurationOptimized**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==19869== NVPROF is profiling process 19869, command: ./main test 10.0 1
+==19869== Profiling application: ./main test 10.0 1
+==19869== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 99.99%  264.71ms    100000  2.6470us  2.5920us  3.1680us  calcNeurons
+  0.01%  22.656us        16  1.4160us     960ns  2.0800us  [CUDA memcpy HtoD]
+  0.01%  14.624us         5  2.9240us  2.0480us  4.6720us  [CUDA memcpy DtoH]
+
+==19869== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 73.18%  822.50ms    100000  8.2250us  7.6370us  361.19us  cudaLaunch
+ 21.57%  242.48ms         4  60.620ms  23.163us  240.97ms  cudaHostAlloc
+  2.95%  33.155ms    100000     331ns     251ns  369.91us  cudaConfigureCall
+  2.18%  24.551ms    100000     245ns     222ns  14.790us  cudaSetupArgument
+  0.05%  525.28us         4  131.32us  12.450us  178.02us  cudaMalloc
+  0.04%  460.82us        23  20.035us     384ns  39.476us  cudaMemcpy
+  0.02%  226.65us        83  2.7300us     142ns  97.695us  cuDeviceGetAttribute
+  0.00%  31.478us         1  31.478us  31.478us  31.478us  cuDeviceTotalMem
+  0.00%  30.578us         1  30.578us  30.578us  30.578us  cuDeviceGetName
+  0.00%  10.794us         1  10.794us  10.794us  10.794us  cudaSetDevice
+  0.00%  7.9740us         4  1.9930us     876ns  3.7070us  cudaGetSymbolAddress
+  0.00%  1.5520us         2     776ns     553ns     999ns  cuDeviceGetCount
+  0.00%  1.4290us         1  1.4290us  1.4290us  1.4290us  cudaGetDeviceCount
+  0.00%     545ns         2     272ns     256ns     289ns  cuDeviceGet
+
+```
+
+</p></details>
+
+
+***
+
+### STDP
+![](plots/speed_test_STDP_absolute.png)
+![](plots/speed_test_STDP_profiling.png)
+![](plots/speed_test_STDP_relative.png)
+
+<details><summary>Exemplary `nvprof` results for **CUDAStandaloneConfiguration**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==30259== NVPROF is profiling process 30259, command: ./main
+==30259== Profiling application: ./main
+==30259== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 29.51%  119.04ms     10000  11.903us  1.4720us  28.312ms  kernel_spikemonitor_codeobject(unsigned int, int*, double, int*, int*, int*, int, int*, double*, int, int*, int*)
+ 19.38%  78.154ms     10000  7.8150us  3.0400us  25.729us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, double*, double, double*, int, int*, int, int*, int)
+ 15.01%  60.555ms     20000  3.0270us  2.8480us  4.2880us  [CUDA memset]
+ 13.45%  54.257ms     10000  5.4250us  4.9280us  8.0000us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*)
+  8.78%  35.407ms     10000  3.5400us  3.2000us  7.1360us  kernel_synapses_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, double, double*, int, int*, int*, int)
+  6.25%  25.200ms     10000  2.5190us  2.1760us  2.8800us  kernel_poissongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*, double*, float*)
+  3.84%  15.476ms     10000  1.5470us  1.4080us  2.4960us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*)
+  3.64%  14.677ms     10000  1.4670us  1.3440us  1.9520us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*)
+  0.13%  535.30us         1  535.30us  535.30us  535.30us  void gen_sequenced<curandStateXORWOW, float, int, __operator_&__(float curand_uniform_noargs<curandStateXORWOW>(curandStateXORWOW*, int))>(curandStateXORWOW*, float*, unsigned long, unsigned long, int)
+  0.02%  69.760us         1  69.760us  69.760us  69.760us  _run_spikemonitor_codeobject_init(void)
+
+==30259== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 66.59%  656.39ms     70002  9.3760us  8.0560us  14.291ms  cudaLaunch
+ 18.06%  178.04ms     20000  8.9010us  7.9370us  1.1364ms  cudaMemset
+ 11.56%  113.99ms    680005     167ns     152ns  60.368us  cudaSetupArgument
+  2.00%  19.667ms     70003     280ns     237ns  57.739us  cudaGetLastError
+  1.77%  17.418ms     70002     248ns     194ns  139.14us  cudaConfigureCall
+  0.01%  139.28us         1  139.28us  139.28us  139.28us  cudaMalloc
+  0.00%  48.635us         1  48.635us  48.635us  48.635us  cudaMemGetInfo
+  0.00%  27.603us        11  2.5090us  1.9830us  4.1880us  cudaFuncGetAttributes
+  0.00%  23.673us        42     563ns     472ns  1.2600us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+  0.00%  18.501us         1  18.501us  18.501us  18.501us  cudaDeviceSynchronize
+  0.00%  6.2050us        16     387ns     285ns     719ns  cudaDeviceGetAttribute
+  0.00%  3.4000us         4     850ns     590ns  1.2110us  cudaGetDevice
+
+```
+
+</p></details>
+
+
+<details><summary>Exemplary `nvprof` results for **GeNNConfigurationOptimized**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==30505== NVPROF is profiling process 30505, command: ./main test 1.0 1
+==30505== Profiling application: ./main test 1.0 1
+==30505== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 50.58%  115.54ms     10000  11.553us  1.7280us  50.209us  calcSynapses
+ 21.49%  49.104ms     10000  4.9100us  4.0640us  6.1440us  calcNeurons
+ 16.03%  36.625ms     17853  2.0510us  2.0160us  4.7360us  [CUDA memcpy DtoH]
+ 11.86%  27.088ms     10000  2.7080us  2.5920us  11.392us  learnSynapsesPost
+  0.04%  93.633us        70  1.3370us     960ns  2.1440us  [CUDA memcpy HtoD]
+
+==30505== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 35.14%  309.15ms     20095  15.384us     188ns  352.42us  cudaMemcpy
+ 32.84%  288.94ms        20  14.447ms  7.6290us  287.79ms  cudaHostAlloc
+ 29.91%  263.12ms     30000  8.7700us  7.6720us  331.70us  cudaLaunch
+  1.17%  10.291ms     30000     343ns     248ns  319.74us  cudaConfigureCall
+  0.84%  7.4251ms     30000     247ns     223ns  10.549us  cudaSetupArgument
+  0.06%  487.96us        20  24.398us  6.1080us  126.07us  cudaMalloc
+  0.03%  225.93us        83  2.7220us     138ns  97.475us  cuDeviceGetAttribute
+  0.00%  31.137us         1  31.137us  31.137us  31.137us  cuDeviceTotalMem
+  0.00%  27.695us         1  27.695us  27.695us  27.695us  cuDeviceGetName
+  0.00%  11.547us        20     577ns     375ns  2.1780us  cudaGetSymbolAddress
+  0.00%  11.033us         1  11.033us  11.033us  11.033us  cudaSetDevice
+  0.00%  1.4410us         2     720ns     488ns     953ns  cuDeviceGetCount
+  0.00%  1.3060us         1  1.3060us  1.3060us  1.3060us  cudaGetDeviceCount
+  0.00%     575ns         2     287ns     226ns     349ns  cuDeviceGet
+
+```
+
+</p></details>
+
+
+***
+
+### STDPEventDriven
+![](plots/speed_test_STDPEventDriven_absolute.png)
+![](plots/speed_test_STDPEventDriven_profiling.png)
+![](plots/speed_test_STDPEventDriven_relative.png)
+
+<details><summary>Exemplary `nvprof` results for **CUDAStandaloneConfiguration**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==13883== NVPROF is profiling process 13883, command: ./main
+==13883== Profiling application: ./main
+==13883== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 29.16%  88.869ms     10000  8.8860us  3.4880us  32.064us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, double*, double, double*, int, int*, int, int*, int)
+ 20.89%  63.662ms     20000  3.1830us  3.0400us  3.6800us  [CUDA memset]
+ 17.94%  54.662ms     10000  5.4660us  5.1840us  7.5200us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*)
+ 12.41%  37.829ms     10000  3.7820us  3.6480us  7.2000us  kernel_synapses_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, double, double*, int, int*, int*, int)
+  7.99%  24.357ms     10000  2.4350us  2.1760us  2.8800us  kernel_poissongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*, double*, float*)
+  5.78%  17.601ms     10000  1.7600us  1.5360us  2.4960us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*)
+  5.65%  17.232ms     10000  1.7230us  1.6640us  1.9840us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*)
+  0.17%  532.84us         1  532.84us  532.84us  532.84us  void gen_sequenced<curandStateXORWOW, float, int, __operator_&__(float curand_uniform_noargs<curandStateXORWOW>(curandStateXORWOW*, int))>(curandStateXORWOW*, float*, unsigned long, unsigned long, int)
+
+==13883== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 62.59%  547.05ms     60001  9.1170us  8.1770us  7.2312ms  cudaLaunch
+ 20.36%  177.95ms     20000  8.8970us  8.1030us  336.69us  cudaMemset
+ 13.38%  116.92ms    560005     208ns     150ns  330.03us  cudaSetupArgument
+  1.91%  16.702ms     60001     278ns     208ns  316.80us  cudaConfigureCall
+  1.74%  15.203ms     60002     253ns     222ns  313.88us  cudaGetLastError
+  0.02%  138.47us         1  138.47us  138.47us  138.47us  cudaMalloc
+  0.01%  47.825us         1  47.825us  47.825us  47.825us  cudaMemGetInfo
+  0.00%  24.670us        10  2.4670us  1.9950us  3.8850us  cudaFuncGetAttributes
+  0.00%  22.588us        41     550ns     471ns  1.2300us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+  0.00%  17.416us         1  17.416us  17.416us  17.416us  cudaDeviceSynchronize
+  0.00%  5.6370us        16     352ns     276ns     664ns  cudaDeviceGetAttribute
+  0.00%  3.1450us         4     786ns     601ns  1.1830us  cudaGetDevice
+
+```
+
+</p></details>
+
+
+<details><summary>Exemplary `nvprof` results for **GeNNConfigurationOptimized**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==14124== NVPROF is profiling process 14124, command: ./main test 1.0 1
+==14124== Profiling application: ./main test 1.0 1
+==14124== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 62.29%  109.79ms     10000  10.979us  1.4400us  50.176us  calcSynapses
+ 23.83%  42.003ms     10000  4.2000us  3.3280us  6.2080us  calcNeurons
+ 13.80%  24.321ms     10000  2.4320us  2.0800us  10.848us  learnSynapsesPost
+  0.05%  93.824us        70  1.3400us     960ns  2.1760us  [CUDA memcpy HtoD]
+  0.03%  53.856us        19  2.8340us  1.9520us  4.6400us  [CUDA memcpy DtoH]
+
+==14124== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 54.33%  315.51ms        20  15.776ms  7.4360us  314.37ms  cudaHostAlloc
+ 42.46%  246.58ms     30000  8.2190us  7.6810us  352.29us  cudaLaunch
+  1.62%  9.4165ms     30000     313ns     235ns  338.10us  cudaConfigureCall
+  1.25%  7.2565ms     30000     241ns     219ns  10.061us  cudaSetupArgument
+  0.20%  1.1638ms        95  12.250us     188ns  29.618us  cudaMemcpy
+  0.08%  485.57us        20  24.278us  6.1510us  122.08us  cudaMalloc
+  0.04%  225.75us        83  2.7190us     136ns  97.167us  cuDeviceGetAttribute
+  0.01%  31.148us         1  31.148us  31.148us  31.148us  cuDeviceTotalMem
+  0.00%  27.209us         1  27.209us  27.209us  27.209us  cuDeviceGetName
+  0.00%  25.053us        20  1.2520us     370ns  14.749us  cudaGetSymbolAddress
+  0.00%  11.323us         1  11.323us  11.323us  11.323us  cudaSetDevice
+  0.00%  1.4040us         1  1.4040us  1.4040us  1.4040us  cudaGetDeviceCount
+  0.00%  1.3580us         2     679ns     456ns     902ns  cuDeviceGetCount
+  0.00%     492ns         2     246ns     220ns     272ns  cuDeviceGet
+
+```
+
+</p></details>
+
+
+***
+
+### STDPMultiPost
+![](plots/speed_test_STDPMultiPost_absolute.png)
+![](plots/speed_test_STDPMultiPost_profiling.png)
+![](plots/speed_test_STDPMultiPost_relative.png)
+
+<details><summary>Exemplary `nvprof` results for **CUDAStandaloneConfiguration**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==13752== NVPROF is profiling process 13752, command: ./main
+==13752== Profiling application: ./main
+==13752== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 26.01%  63.681ms     20000  3.1840us  3.0400us  3.8080us  [CUDA memset]
+ 21.90%  53.615ms     10000  5.3610us  5.1840us  7.2640us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*)
+ 16.08%  39.373ms     10000  3.9370us  3.5840us  10.720us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, double*, double, double*, int, int*, int, int*, int)
+ 14.74%  36.097ms     10000  3.6090us  3.4880us  105.60us  kernel_synapses_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, double, double*, int, int*, int*, int)
+  8.31%  20.344ms     10000  2.0340us  1.8560us  2.4320us  kernel_poissongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*, double*, float*)
+  6.61%  16.187ms     10000  1.6180us  1.5040us  2.8160us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*)
+  6.34%  15.535ms     10000  1.5530us  1.4720us  1.9840us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*)
+  0.01%  22.881us         1  22.881us  22.881us  22.881us  void gen_sequenced<curandStateXORWOW, float, int, __operator_&__(float curand_uniform_noargs<curandStateXORWOW>(curandStateXORWOW*, int))>(curandStateXORWOW*, float*, unsigned long, unsigned long, int)
+
+==13752== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 64.39%  566.77ms     60001  9.4450us  8.5300us  7.6226ms  cudaLaunch
+ 20.37%  179.35ms     20000  8.9670us  8.0990us  320.51us  cudaMemset
+ 11.68%  102.80ms    560005     183ns     154ns  320.82us  cudaSetupArgument
+  1.91%  16.807ms     60001     280ns     234ns  314.83us  cudaConfigureCall
+  1.62%  14.260ms     60002     237ns     197ns  325.01us  cudaGetLastError
+  0.01%  125.15us         1  125.15us  125.15us  125.15us  cudaMalloc
+  0.01%  50.027us         1  50.027us  50.027us  50.027us  cudaMemGetInfo
+  0.00%  25.943us        10  2.5940us  1.9990us  4.6510us  cudaFuncGetAttributes
+  0.00%  23.402us        41     570ns     490ns  1.2400us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+  0.00%  17.044us         1  17.044us  17.044us  17.044us  cudaDeviceSynchronize
+  0.00%  6.0160us        16     376ns     279ns  1.0150us  cudaDeviceGetAttribute
+  0.00%  3.0950us         4     773ns     532ns  1.3840us  cudaGetDevice
+
+```
+
+</p></details>
+
+
+<details><summary>Exemplary `nvprof` results for **GeNNConfigurationOptimized**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==13992== NVPROF is profiling process 13992, command: ./main test 1.0 1
+==13992== Profiling application: ./main test 1.0 1
+==13992== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 47.47%  40.621ms     10000  4.0620us  3.9680us  12.064us  calcNeurons
+ 29.19%  24.977ms     10000  2.4970us  2.4000us  360.29us  learnSynapsesPost
+ 23.19%  19.844ms     10000  1.9840us  1.5680us  15.904us  calcSynapses
+  0.10%  83.488us        70  1.1920us     960ns  2.0480us  [CUDA memcpy HtoD]
+  0.05%  45.344us        17  2.6670us  2.0480us  4.7040us  [CUDA memcpy DtoH]
+
+==13992== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 49.24%  255.49ms        20  12.774ms  7.1470us  254.39ms  cudaHostAlloc
+ 47.05%  244.13ms     30000  8.1370us  7.4970us  325.41us  cudaLaunch
+  1.88%  9.7505ms     30000     325ns     240ns  313.30us  cudaConfigureCall
+  1.44%  7.4897ms     30000     249ns     228ns  4.6460us  cudaSetupArgument
+  0.23%  1.1712ms        95  12.328us     191ns  29.827us  cudaMemcpy
+  0.10%  498.07us        20  24.903us  6.1390us  124.17us  cudaMalloc
+  0.04%  225.66us        83  2.7180us     135ns  97.278us  cuDeviceGetAttribute
+  0.01%  31.145us         1  31.145us  31.145us  31.145us  cuDeviceTotalMem
+  0.01%  27.598us         1  27.598us  27.598us  27.598us  cuDeviceGetName
+  0.00%  11.370us        20     568ns     348ns  2.0700us  cudaGetSymbolAddress
+  0.00%  11.183us         1  11.183us  11.183us  11.183us  cudaSetDevice
+  0.00%  1.4160us         2     708ns     453ns     963ns  cuDeviceGetCount
+  0.00%  1.3950us         1  1.3950us  1.3950us  1.3950us  cudaGetDeviceCount
+  0.00%     533ns         2     266ns     241ns     292ns  cuDeviceGet
+
+```
+
+</p></details>
+
+
+***
+
+### STDPMultiPostNeuronalTraces
+![](plots/speed_test_STDPMultiPostNeuronalTraces_absolute.png)
+![](plots/speed_test_STDPMultiPostNeuronalTraces_profiling.png)
+![](plots/speed_test_STDPMultiPostNeuronalTraces_relative.png)
+
+<details><summary>Exemplary `nvprof` results for **CUDAStandaloneConfiguration**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==31645== NVPROF is profiling process 31645, command: ./main
+==31645== Profiling application: ./main
+==31645== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 23.09%  63.632ms     20000  3.1810us  3.0400us  3.8080us  [CUDA memset]
+ 21.51%  59.284ms     10000  5.9280us  5.6320us  7.6160us  kernel_neurongroup_1_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*, double*)
+ 13.19%  36.348ms     10000  3.6340us  3.4240us  12.288us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, double*, double, double*, int, int*, int, int*, int, double*)
+ 12.65%  34.859ms     10000  3.4850us  3.3920us  94.048us  kernel_synapses_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, double, double*, int, double*, int*, int, int)
+  9.89%  27.258ms     10000  2.7250us  2.5280us  2.9760us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*)
+  6.72%  18.518ms     10000  1.8510us  1.7600us  2.8160us  kernel_neurongroup_1_thresholder_codeobject(unsigned int, unsigned int, int*, double*)
+  6.69%  18.444ms     10000  1.8440us  1.6000us  2.4320us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*, double*, float*)
+  6.26%  17.266ms     10000  1.7260us  1.6640us  2.4000us  kernel_neurongroup_1_resetter_codeobject(unsigned int, unsigned int, double*, int*, double*)
+  0.01%  22.689us         1  22.689us  22.689us  22.689us  void gen_sequenced<curandStateXORWOW, float, int, __operator_&__(float curand_uniform_noargs<curandStateXORWOW>(curandStateXORWOW*, int))>(curandStateXORWOW*, float*, unsigned long, unsigned long, int)
+
+==31645== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 66.34%  631.89ms     70001  9.0260us  7.8240us  7.5683ms  cudaLaunch
+ 18.61%  177.26ms     20000  8.8630us  8.0310us  327.63us  cudaMemset
+ 11.06%  105.29ms    570005     184ns     147ns  324.54us  cudaSetupArgument
+  1.98%  18.868ms     70002     269ns     211ns  316.30us  cudaGetLastError
+  1.98%  18.848ms     70001     269ns     196ns  10.259us  cudaConfigureCall
+  0.01%  123.44us         1  123.44us  123.44us  123.44us  cudaMalloc
+  0.01%  48.253us         1  48.253us  48.253us  48.253us  cudaMemGetInfo
+  0.00%  38.693us        74     522ns     468ns  1.2040us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+  0.00%  30.351us        12  2.5290us  2.0130us  4.4000us  cudaFuncGetAttributes
+  0.00%  17.703us         1  17.703us  17.703us  17.703us  cudaDeviceSynchronize
+  0.00%  8.0120us        20     400ns     315ns     771ns  cudaDeviceGetAttribute
+  0.00%  3.7350us         5     747ns     588ns  1.2880us  cudaGetDevice
+
+```
+
+</p></details>
+
+
+<details><summary>Exemplary `nvprof` results for **GeNNConfigurationOptimized**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==31875== NVPROF is profiling process 31875, command: ./main test 1.0 1
+==31875== Profiling application: ./main test 1.0 1
+==31875== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 51.59%  44.978ms     10000  4.4970us  4.4160us  13.216us  calcNeurons
+ 28.08%  24.482ms     10000  2.4480us  2.4000us  108.48us  learnSynapsesPost
+ 20.19%  17.604ms     10000  1.7600us  1.5680us  8.0320us  calcSynapses
+  0.09%  77.888us        70  1.1120us     960ns  2.0160us  [CUDA memcpy HtoD]
+  0.05%  40.704us        17  2.3940us  2.0480us  4.6720us  [CUDA memcpy DtoH]
+
+==31875== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 49.08%  242.98ms     30000  8.0990us  7.4830us  330.16us  cudaLaunch
+ 46.99%  232.62ms        20  11.631ms  13.742us  230.95ms  cudaHostAlloc
+  1.93%  9.5539ms     30000     318ns     249ns  316.27us  cudaConfigureCall
+  1.50%  7.4449ms     30000     248ns     228ns  9.5620us  cudaSetupArgument
+  0.29%  1.4169ms        93  15.235us     341ns  34.925us  cudaMemcpy
+  0.15%  732.26us        20  36.613us  11.241us  173.89us  cudaMalloc
+  0.05%  225.85us        83  2.7210us     144ns  97.097us  cuDeviceGetAttribute
+  0.01%  31.104us         1  31.104us  31.104us  31.104us  cuDeviceTotalMem
+  0.01%  27.342us         1  27.342us  27.342us  27.342us  cuDeviceGetName
+  0.00%  19.527us        20     976ns     638ns  3.5660us  cudaGetSymbolAddress
+  0.00%  11.180us         1  11.180us  11.180us  11.180us  cudaSetDevice
+  0.00%  1.5790us         2     789ns     579ns  1.0000us  cuDeviceGetCount
+  0.00%  1.4070us         1  1.4070us  1.4070us  1.4070us  cudaGetDeviceCount
+  0.00%     534ns         2     267ns     238ns     296ns  cuDeviceGet
+
+```
+
+</p></details>
+
+
+***
+
+### STDPNeuronalTraces
+![](plots/speed_test_STDPNeuronalTraces_absolute.png)
+![](plots/speed_test_STDPNeuronalTraces_profiling.png)
+![](plots/speed_test_STDPNeuronalTraces_relative.png)
+
+<details><summary>Exemplary `nvprof` results for **CUDAStandaloneConfiguration**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==22958== NVPROF is profiling process 22958, command: ./main
+==22958== Profiling application: ./main
+==22958== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 23.34%  76.426ms     10000  7.6420us  3.2960us  26.944us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, double*, double, double*, int, int*, int, int*, int, double*)
+ 19.43%  63.625ms     20000  3.1810us  3.0400us  3.7120us  [CUDA memset]
+ 18.23%  59.686ms     10000  5.9680us  5.6320us  8.0960us  kernel_neurongroup_1_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*, double*)
+ 11.04%  36.142ms     10000  3.6140us  3.3920us  7.0730us  kernel_synapses_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, double, double*, int, double*, int*, int, int)
+  9.09%  29.761ms     10000  2.9760us  2.8800us  3.5840us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*)
+  7.99%  26.155ms     10000  2.6150us  2.2080us  2.8800us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*, double*, float*)
+  5.47%  17.908ms     10000  1.7900us  1.7280us  2.4640us  kernel_neurongroup_1_thresholder_codeobject(unsigned int, unsigned int, int*, double*)
+  5.26%  17.212ms     10000  1.7210us  1.6640us  2.3680us  kernel_neurongroup_1_resetter_codeobject(unsigned int, unsigned int, double*, int*, double*)
+  0.16%  534.91us         1  534.91us  534.91us  534.91us  void gen_sequenced<curandStateXORWOW, float, int, __operator_&__(float curand_uniform_noargs<curandStateXORWOW>(curandStateXORWOW*, int))>(curandStateXORWOW*, float*, unsigned long, unsigned long, int)
+
+==22958== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 66.19%  628.57ms     70001  8.9790us  7.8060us  7.0815ms  cudaLaunch
+ 18.98%  180.22ms     20000  9.0110us  8.1910us  325.17us  cudaMemset
+ 10.84%  102.92ms    570005     180ns     148ns  322.77us  cudaSetupArgument
+  2.05%  19.421ms     70002     277ns     224ns  322.72us  cudaGetLastError
+  1.92%  18.237ms     70001     260ns     204ns  7.6100us  cudaConfigureCall
+  0.01%  139.26us         1  139.26us  139.26us  139.26us  cudaMalloc
+  0.01%  47.740us         1  47.740us  47.740us  47.740us  cudaMemGetInfo
+  0.00%  38.641us        74     522ns     463ns  1.3230us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+  0.00%  31.070us        12  2.5890us  2.0180us  4.6520us  cudaFuncGetAttributes
+  0.00%  17.325us         1  17.325us  17.325us  17.325us  cudaDeviceSynchronize
+  0.00%  7.2280us        20     361ns     279ns     764ns  cudaDeviceGetAttribute
+  0.00%  3.4300us         5     686ns     519ns  1.2200us  cudaGetDevice
+
+```
+
+</p></details>
+
+
+<details><summary>Exemplary `nvprof` results for **GeNNConfigurationOptimized**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==23186== NVPROF is profiling process 23186, command: ./main test 1.0 1
+==23186== Profiling application: ./main test 1.0 1
+==23186== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 45.72%  59.376ms     10000  5.9370us  1.4400us  22.209us  calcSynapses
+ 36.59%  47.519ms     10000  4.7510us  3.7440us  7.2000us  calcNeurons
+ 17.59%  22.844ms     10000  2.2840us  2.0800us  5.8240us  learnSynapsesPost
+  0.07%  90.016us        70  1.2850us     928ns  2.0480us  [CUDA memcpy HtoD]
+  0.04%  51.168us        19  2.6930us  1.9520us  4.6080us  [CUDA memcpy DtoH]
+
+==23186== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 48.78%  251.54ms        20  12.577ms  7.1400us  250.44ms  cudaHostAlloc
+ 47.58%  245.35ms     30000  8.1780us  7.6280us  342.38us  cudaLaunch
+  1.85%  9.5606ms     30000     318ns     255ns  320.84us  cudaConfigureCall
+  1.41%  7.2598ms     30000     241ns     222ns  5.1580us  cudaSetupArgument
+  0.22%  1.1470ms        93  12.333us     278ns  32.150us  cudaMemcpy
+  0.10%  513.51us        20  25.675us  6.0810us  139.05us  cudaMalloc
+  0.04%  228.09us        83  2.7480us     140ns  98.263us  cuDeviceGetAttribute
+  0.01%  31.411us         1  31.411us  31.411us  31.411us  cuDeviceTotalMem
+  0.01%  27.452us         1  27.452us  27.452us  27.452us  cuDeviceGetName
+  0.00%  12.004us         1  12.004us  12.004us  12.004us  cudaSetDevice
+  0.00%  11.525us        20     576ns     352ns  2.0890us  cudaGetSymbolAddress
+  0.00%  1.6280us         2     814ns     489ns  1.1390us  cuDeviceGetCount
+  0.00%  1.5650us         1  1.5650us  1.5650us  1.5650us  cudaGetDeviceCount
+  0.00%     594ns         2     297ns     230ns     364ns  cuDeviceGet
+
+```
+
+</p></details>
+
+
+***
+
+### STDPNotEventDriven
+![](plots/speed_test_STDPNotEventDriven_absolute.png)
+![](plots/speed_test_STDPNotEventDriven_profiling.png)
+![](plots/speed_test_STDPNotEventDriven_relative.png)
+
+<details><summary>Exemplary `nvprof` results for **CUDAStandaloneConfiguration**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==5309== NVPROF is profiling process 5309, command: ./main
+==5309== Profiling application: ./main
+==5309== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 23.35%  73.232ms     10000  7.3230us  3.4560us  24.544us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, int, double*, double, double*, int, int*, int, int*)
+ 20.25%  63.528ms     20000  3.1760us  3.0400us  3.7440us  [CUDA memset]
+ 17.18%  53.899ms     10000  5.3890us  5.0240us  7.6480us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*)
+ 11.40%  35.764ms     10000  3.5760us  3.3920us  6.2720us  kernel_synapses_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, int, double, double*, int, int*)
+  9.18%  28.794ms     10000  2.8790us  2.7840us  3.3600us  kernel_synapses_stateupdater_codeobject(unsigned int, unsigned int, double*, int, double*, int, double*, int*)
+  7.72%  24.206ms     10000  2.4200us  2.2080us  2.8480us  kernel_poissongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*, double*, float*)
+  5.48%  17.200ms     10000  1.7190us  1.6640us  1.9840us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*)
+  5.26%  16.509ms     10000  1.6500us  1.5360us  2.4960us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*)
+  0.17%  534.31us         1  534.31us  534.31us  534.31us  void gen_sequenced<curandStateXORWOW, float, int, __operator_&__(float curand_uniform_noargs<curandStateXORWOW>(curandStateXORWOW*, int))>(curandStateXORWOW*, float*, unsigned long, unsigned long, int)
+
+==5309== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 65.37%  632.10ms     70001  9.0290us  7.8220us  7.1147ms  cudaLaunch
+ 18.21%  176.05ms     20000  8.8020us  7.9140us  65.993us  cudaMemset
+ 11.98%  115.80ms    640005     180ns     150ns  325.82us  cudaSetupArgument
+  2.23%  21.584ms     70002     308ns     218ns  325.68us  cudaGetLastError
+  2.19%  21.175ms     70001     302ns     199ns  314.30us  cudaConfigureCall
+  0.01%  138.56us         1  138.56us  138.56us  138.56us  cudaMalloc
+  0.00%  48.141us         1  48.141us  48.141us  48.141us  cudaMemGetInfo
+  0.00%  40.939us        74     553ns     496ns  1.2830us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+  0.00%  30.402us        12  2.5330us  2.0360us  4.5650us  cudaFuncGetAttributes
+  0.00%  17.493us         1  17.493us  17.493us  17.493us  cudaDeviceSynchronize
+  0.00%  6.8790us        20     343ns     280ns     612ns  cudaDeviceGetAttribute
+  0.00%  3.7860us         5     757ns     587ns  1.2530us  cudaGetDevice
+
+```
+
+</p></details>
+
+
+<details><summary>Exemplary `nvprof` results for **GeNNConfigurationOptimized**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==5547== NVPROF is profiling process 5547, command: ./main test 1.0 1
+==5547== Profiling application: ./main test 1.0 1
+==5547== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 38.01%  64.497ms     10000  6.4490us  1.4720us  25.121us  calcSynapses
+ 24.89%  42.225ms     10000  4.2220us  3.3600us  6.1120us  calcNeurons
+ 22.75%  38.605ms     10000  3.8600us  3.2320us  5.5680us  calcSynapseDynamics
+ 14.26%  24.189ms     10000  2.4180us  2.1120us  6.5920us  learnSynapsesPost
+  0.06%  96.512us        72  1.3400us     928ns  2.0800us  [CUDA memcpy HtoD]
+  0.03%  54.080us        19  2.8460us  1.9840us  4.6720us  [CUDA memcpy DtoH]
+
+==5547== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 53.26%  318.06ms     40000  7.9510us  7.3870us  323.19us  cudaLaunch
+ 42.53%  254.01ms        21  12.096ms  7.5310us  252.89ms  cudaHostAlloc
+  2.21%  13.204ms     40000     330ns     252ns  332.54us  cudaConfigureCall
+  1.66%  9.9116ms     40000     247ns     233ns  5.2730us  cudaSetupArgument
+  0.20%  1.1942ms        97  12.311us     197ns  30.710us  cudaMemcpy
+  0.08%  498.29us        21  23.728us  6.1100us  122.22us  cudaMalloc
+  0.04%  227.33us        83  2.7380us     149ns  97.591us  cuDeviceGetAttribute
+  0.01%  31.273us         1  31.273us  31.273us  31.273us  cuDeviceTotalMem
+  0.00%  27.431us         1  27.431us  27.431us  27.431us  cuDeviceGetName
+  0.00%  11.816us         1  11.816us  11.816us  11.816us  cudaSetDevice
+  0.00%  11.690us        21     556ns     357ns  2.1550us  cudaGetSymbolAddress
+  0.00%  1.4320us         2     716ns     525ns     907ns  cuDeviceGetCount
+  0.00%  1.3390us         1  1.3390us  1.3390us  1.3390us  cudaGetDeviceCount
+  0.00%     577ns         2     288ns     252ns     325ns  cuDeviceGet
+
+```
+
+</p></details>
+
+
+***
+
+### SparseHighRateSynapsesOnly
+![](plots/speed_test_SparseHighRateSynapsesOnly_absolute.png)
+![](plots/speed_test_SparseHighRateSynapsesOnly_profiling.png)
+![](plots/speed_test_SparseHighRateSynapsesOnly_relative.png)
+
+<details><summary>Exemplary `nvprof` results for **CUDAStandaloneConfiguration**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==29929== NVPROF is profiling process 29929, command: ./main
+==29929== Profiling application: ./main
+==29929== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 86.04%  284.29ms     10000  28.429us  27.328us  32.544us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, double*, int*, int, int*)
+  8.93%  29.521ms     10000  2.9520us  2.8800us  4.4480us  [CUDA memset]
+  5.03%  16.619ms     10000  1.6610us  1.5360us  2.4000us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*)
+
+==29929== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 58.38%  206.98ms     20000  10.348us  8.5120us  8.2431ms  cudaLaunch
+ 28.06%  99.491ms     10000  9.9490us  8.5150us  27.390us  cudaMemset
+  8.91%  31.590ms    170000     185ns     150ns  313.25us  cudaSetupArgument
+  1.79%  6.3337ms     20000     316ns     206ns  303.30us  cudaConfigureCall
+  1.73%  6.1183ms     20000     305ns     199ns  315.94us  cudaGetLastError
+  1.12%  3.9780ms         1  3.9780ms  3.9780ms  3.9780ms  cudaDeviceSynchronize
+  0.01%  46.286us         1  46.286us  46.286us  46.286us  cudaMemGetInfo
+  0.00%  8.3370us         3  2.7790us  2.1280us  3.2430us  cudaFuncGetAttributes
+  0.00%  5.4670us         3  1.8220us     649ns  2.4930us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+  0.00%  1.5130us         4     378ns     295ns     546ns  cudaDeviceGetAttribute
+  0.00%     820ns         1     820ns     820ns     820ns  cudaGetDevice
+
+```
+
+</p></details>
+
+
+<details><summary>Exemplary `nvprof` results for **GeNNConfigurationOptimized**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==30148== NVPROF is profiling process 30148, command: ./main test 1.0 1
+==30148== Profiling application: ./main test 1.0 1
+==30148== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 88.25%  301.73ms     10000  30.173us  3.3920us  32.704us  calcSynapses
+ 11.72%  40.058ms     10000  4.0050us  3.8080us  4.8640us  calcNeurons
+  0.02%  61.280us        44  1.3920us     960ns  3.2000us  [CUDA memcpy HtoD]
+  0.01%  39.392us        14  2.8130us  1.9840us  6.8480us  [CUDA memcpy DtoH]
+
+==30148== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 54.90%  442.78ms        12  36.898ms  14.006us  441.12ms  cudaHostAlloc
+ 40.88%  329.68ms     20000  16.483us  7.7050us  338.70us  cudaLaunch
+  2.49%  20.082ms        61  329.22us     400ns  18.995ms  cudaMemcpy
+  0.94%  7.5995ms     20000     379ns     255ns  310.22us  cudaConfigureCall
+  0.67%  5.4120ms     20000     270ns     222ns  314.38us  cudaSetupArgument
+  0.08%  639.34us        12  53.278us  11.895us  172.21us  cudaMalloc
+  0.03%  235.92us        83  2.8420us     155ns  101.36us  cuDeviceGetAttribute
+  0.00%  32.471us         1  32.471us  32.471us  32.471us  cuDeviceTotalMem
+  0.00%  30.953us         1  30.953us  30.953us  30.953us  cuDeviceGetName
+  0.00%  14.056us        12  1.1710us     746ns  3.5320us  cudaGetSymbolAddress
+  0.00%  12.473us         1  12.473us  12.473us  12.473us  cudaSetDevice
+  0.00%  1.5390us         1  1.5390us  1.5390us  1.5390us  cudaGetDeviceCount
+  0.00%  1.4990us         2     749ns     424ns  1.0750us  cuDeviceGetCount
+  0.00%     514ns         2     257ns     199ns     315ns  cuDeviceGet
+
+```
+
+</p></details>
+
+
+***
+
+### SparseLowRateSynapsesOnly
+![](plots/speed_test_SparseLowRateSynapsesOnly_absolute.png)
+![](plots/speed_test_SparseLowRateSynapsesOnly_profiling.png)
+![](plots/speed_test_SparseLowRateSynapsesOnly_relative.png)
+
+<details><summary>Exemplary `nvprof` results for **CUDAStandaloneConfiguration**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==8193== NVPROF is profiling process 8193, command: ./main
+==8193== Profiling application: ./main
+==8193== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 55.84%  593.43ms    100000  5.9340us  5.4400us  6.9120us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, double*, int*, int, int*)
+ 28.97%  307.88ms    100000  3.0780us  3.0400us  3.6800us  [CUDA memset]
+ 15.19%  161.38ms    100000  1.6130us  1.5040us  2.5920us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*)
+
+==8193== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 55.92%  1.79370s    200000  8.9680us  7.6320us  7.2529ms  cudaLaunch
+ 29.82%  956.72ms    100000  9.5670us  8.2580us  21.256ms  cudaMemset
+ 10.51%  337.16ms   1700000     198ns     139ns  340.09us  cudaSetupArgument
+  1.91%  61.333ms    200000     306ns     217ns  368.29us  cudaGetLastError
+  1.83%  58.844ms    200000     294ns     168ns  332.73us  cudaConfigureCall
+  0.00%  45.848us         1  45.848us  45.848us  45.848us  cudaMemGetInfo
+  0.00%  12.992us         1  12.992us  12.992us  12.992us  cudaDeviceSynchronize
+  0.00%  8.6600us         3  2.8860us  2.0910us  3.5820us  cudaFuncGetAttributes
+  0.00%  5.3760us         3  1.7920us     594ns  2.4470us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+  0.00%  1.5830us         4     395ns     305ns     591ns  cudaDeviceGetAttribute
+  0.00%     829ns         1     829ns     829ns     829ns  cudaGetDevice
+
+```
+
+</p></details>
+
+
+<details><summary>Exemplary `nvprof` results for **GeNNConfigurationOptimized**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==8451== NVPROF is profiling process 8451, command: ./main test 10.0 1
+==8451== Profiling application: ./main test 10.0 1
+==8451== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 66.88%  550.62ms    100000  5.5060us  3.4560us  6.4000us  calcSynapses
+ 33.11%  272.64ms    100000  2.7260us  2.6560us  3.7760us  calcNeurons
+  0.01%  53.984us        44  1.2260us     960ns  2.0800us  [CUDA memcpy HtoD]
+  0.00%  35.072us        14  2.5050us  1.9520us  4.7040us  [CUDA memcpy DtoH]
+
+==8451== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 81.32%  1.60600s    200000  8.0290us  7.4920us  354.55us  cudaLaunch
+ 12.69%  250.71ms        12  20.893ms  15.503us  249.06ms  cudaHostAlloc
+  3.37%  66.566ms    200000     332ns     257ns  334.65us  cudaConfigureCall
+  2.52%  49.683ms    200000     248ns     225ns  334.65us  cudaSetupArgument
+  0.05%  1.0155ms        61  16.647us     343ns  35.922us  cudaMemcpy
+  0.03%  641.50us        12  53.458us  12.040us  174.09us  cudaMalloc
+  0.01%  225.49us        83  2.7160us     135ns  97.180us  cuDeviceGetAttribute
+  0.00%  31.170us         1  31.170us  31.170us  31.170us  cuDeviceTotalMem
+  0.00%  26.897us         1  26.897us  26.897us  26.897us  cuDeviceGetName
+  0.00%  13.730us        12  1.1440us     698ns  3.1800us  cudaGetSymbolAddress
+  0.00%  11.132us         1  11.132us  11.132us  11.132us  cudaSetDevice
+  0.00%  1.3520us         2     676ns     376ns     976ns  cuDeviceGetCount
+  0.00%  1.3320us         1  1.3320us  1.3320us  1.3320us  cudaGetDeviceCount
+  0.00%     542ns         2     271ns     213ns     329ns  cuDeviceGet
+
+```
+
+</p></details>
+
+
+***
+
+### SparseMediumRateSynapsesOnly
+![](plots/speed_test_SparseMediumRateSynapsesOnly_absolute.png)
+![](plots/speed_test_SparseMediumRateSynapsesOnly_profiling.png)
+![](plots/speed_test_SparseMediumRateSynapsesOnly_relative.png)
+
+<details><summary>Exemplary `nvprof` results for **CUDAStandaloneConfiguration**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==16276== NVPROF is profiling process 16276, command: ./main
+==16276== Profiling application: ./main
+==16276== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 55.93%  59.598ms     10000  5.9590us  5.6000us  6.8480us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, double*, int*, int, int*)
+ 28.96%  30.864ms     10000  3.0860us  3.0400us  3.5840us  [CUDA memset]
+ 15.11%  16.106ms     10000  1.6100us  1.5040us  2.4000us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*)
+
+==16276== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 57.38%  194.03ms     20000  9.7010us  8.5280us  7.3801ms  cudaLaunch
+ 27.54%  93.116ms     10000  9.3110us  8.6920us  28.380us  cudaMemset
+ 10.82%  36.579ms    170000     215ns     184ns  349.92us  cudaSetupArgument
+  2.15%  7.2682ms     20000     363ns     248ns  327.47us  cudaConfigureCall
+  2.09%  7.0721ms     20000     353ns     266ns  337.12us  cudaGetLastError
+  0.01%  46.564us         1  46.564us  46.564us  46.564us  cudaMemGetInfo
+  0.01%  18.278us         1  18.278us  18.278us  18.278us  cudaDeviceSynchronize
+  0.00%  8.5460us         3  2.8480us  2.1440us  3.4910us  cudaFuncGetAttributes
+  0.00%  5.2380us         3  1.7460us     617ns  2.4330us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+  0.00%  1.7410us         4     435ns     339ns     632ns  cudaDeviceGetAttribute
+  0.00%     956ns         1     956ns     956ns     956ns  cudaGetDevice
+
+```
+
+</p></details>
+
+
+<details><summary>Exemplary `nvprof` results for **GeNNConfigurationOptimized**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==16495== NVPROF is profiling process 16495, command: ./main test 1.0 1
+==16495== Profiling application: ./main test 1.0 1
+==16495== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 67.04%  60.321ms     10000  6.0320us  3.4560us  6.5280us  calcSynapses
+ 32.86%  29.567ms     10000  2.9560us  2.9120us  3.7440us  calcNeurons
+  0.06%  54.017us        44  1.2270us     960ns  2.0480us  [CUDA memcpy HtoD]
+  0.04%  36.032us        14  2.5730us  2.0480us  4.7360us  [CUDA memcpy DtoH]
+
+==16495== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 62.23%  290.68ms        12  24.223ms  7.8400us  289.60ms  cudaHostAlloc
+ 35.13%  164.11ms     20000  8.2050us  7.5690us  348.13us  cudaLaunch
+  1.32%  6.1557ms     20000     307ns     255ns  328.87us  cudaConfigureCall
+  1.01%  4.7095ms     20000     235ns     202ns  341.44us  cudaSetupArgument
+  0.16%  750.68us        61  12.306us     358ns  28.177us  cudaMemcpy
+  0.09%  419.68us        12  34.973us  6.2030us  120.19us  cudaMalloc
+  0.05%  227.14us        83  2.7360us     145ns  97.726us  cuDeviceGetAttribute
+  0.01%  31.327us         1  31.327us  31.327us  31.327us  cuDeviceTotalMem
+  0.01%  26.548us         1  26.548us  26.548us  26.548us  cuDeviceGetName
+  0.00%  11.315us         1  11.315us  11.315us  11.315us  cudaSetDevice
+  0.00%  7.9470us        12     662ns     405ns  1.9600us  cudaGetSymbolAddress
+  0.00%  1.5460us         2     773ns     495ns  1.0510us  cuDeviceGetCount
+  0.00%  1.4000us         1  1.4000us  1.4000us  1.4000us  cudaGetDeviceCount
+  0.00%     578ns         2     289ns     223ns     355ns  cuDeviceGet
+
+```
+
+</p></details>
+
+
+***
+
+### VerySparseMediumRateSynapsesOnly
+![](plots/speed_test_VerySparseMediumRateSynapsesOnly_absolute.png)
+![](plots/speed_test_VerySparseMediumRateSynapsesOnly_profiling.png)
+![](plots/speed_test_VerySparseMediumRateSynapsesOnly_relative.png)
+
+<details><summary>Exemplary `nvprof` results for **CUDAStandaloneConfiguration**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==6005== NVPROF is profiling process 6005, command: ./main
+==6005== Profiling application: ./main
+==6005== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 55.29%  580.67ms    100000  5.8060us  5.2160us  6.6240us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, double*, int*, int, int*)
+ 29.34%  308.08ms    100000  3.0800us  3.0400us  3.7120us  [CUDA memset]
+ 15.37%  161.45ms    100000  1.6140us  1.5040us  2.5920us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*)
+
+==6005== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 56.44%  1.83924s    200000  9.1960us  7.9810us  7.4326ms  cudaLaunch
+ 29.07%  947.22ms    100000  9.4720us  8.1380us  21.897ms  cudaMemset
+ 10.90%  355.11ms   1700000     208ns     171ns  355.90us  cudaSetupArgument
+  1.82%  59.307ms    200000     296ns     177ns  333.92us  cudaConfigureCall
+  1.77%  57.629ms    200000     288ns     202ns  337.07us  cudaGetLastError
+  0.00%  46.411us         1  46.411us  46.411us  46.411us  cudaMemGetInfo
+  0.00%  13.163us         1  13.163us  13.163us  13.163us  cudaDeviceSynchronize
+  0.00%  8.2890us         3  2.7630us  2.0680us  3.3230us  cudaFuncGetAttributes
+  0.00%  5.4810us         3  1.8270us     565ns  2.5590us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+  0.00%  1.5840us         4     396ns     318ns     545ns  cudaDeviceGetAttribute
+  0.00%     924ns         1     924ns     924ns     924ns  cudaGetDevice
+
+```
+
+</p></details>
+
+
+<details><summary>Exemplary `nvprof` results for **GeNNConfigurationOptimized**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==6274== NVPROF is profiling process 6274, command: ./main test 10.0 1
+==6274== Profiling application: ./main test 10.0 1
+==6274== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 69.30%  617.28ms    100000  6.1720us  3.3600us  7.5200us  calcSynapses
+ 30.70%  273.43ms    100000  2.7340us  2.6560us  3.7440us  calcNeurons
+  0.01%  53.472us        44  1.2150us     960ns  2.0480us  [CUDA memcpy HtoD]
+  0.00%  34.560us        14  2.4680us  1.9520us  4.6080us  [CUDA memcpy DtoH]
+
+==6274== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 82.48%  1.61117s    200000  8.0550us  7.0380us  353.83us  cudaLaunch
+ 11.62%  226.99ms        12  18.916ms  7.8850us  225.88ms  cudaHostAlloc
+  3.30%  64.540ms    200000     322ns     238ns  338.74us  cudaConfigureCall
+  2.52%  49.132ms    200000     245ns     211ns  344.36us  cudaSetupArgument
+  0.04%  744.26us        61  12.200us     293ns  32.120us  cudaMemcpy
+  0.02%  421.09us        12  35.090us  6.1780us  119.69us  cudaMalloc
+  0.01%  226.88us        83  2.7330us     137ns  97.756us  cuDeviceGetAttribute
+  0.00%  31.259us         1  31.259us  31.259us  31.259us  cuDeviceTotalMem
+  0.00%  28.119us         1  28.119us  28.119us  28.119us  cuDeviceGetName
+  0.00%  11.457us         1  11.457us  11.457us  11.457us  cudaSetDevice
+  0.00%  8.0410us        12     670ns     397ns  1.9590us  cudaGetSymbolAddress
+  0.00%  1.6770us         2     838ns     479ns  1.1980us  cuDeviceGetCount
+  0.00%  1.4060us         1  1.4060us  1.4060us  1.4060us  cudaGetDeviceCount
+  0.00%     507ns         2     253ns     231ns     276ns  cuDeviceGet
+
+```
+
+</p></details>
+
+
+***
+
+### Vogels
+![](plots/speed_test_Vogels_absolute.png)
+![](plots/speed_test_Vogels_profiling.png)
+![](plots/speed_test_Vogels_relative.png)
+
+<details><summary>Exemplary `nvprof` results for **CUDAStandaloneConfiguration**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==12243== NVPROF is profiling process 12243, command: ./main
+==12243== Profiling application: ./main
+==12243== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 27.91%  192.82ms     10000  19.281us  3.1360us  2.1170ms  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int*, int, int*, double, int*, int)
+ 25.45%  175.79ms     10000  17.578us  3.3280us  1.7610ms  kernel_synapses_2_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, int, double, double*, int, int*)
+ 15.82%  109.25ms     10000  10.925us  3.3600us  1.1837ms  kernel_synapses_2_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, int, int*, int, double, double*, int, double*, int*)
+ 14.27%  98.554ms     10000  9.8550us  3.1680us  1.0373ms  kernel_synapses_1_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, int*, double*, double, int*, int)
+  5.95%  41.110ms     10000  4.1110us  3.7760us  5.3120us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double, double*, double*, double*, bool*)
+  4.53%  31.297ms     10000  3.1290us  2.9440us  4.3200us  [CUDA memset]
+  3.54%  24.435ms     10000  2.4430us  2.0160us  6.0160us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*)
+  2.53%  17.499ms     10000  1.7490us  1.5360us  2.8160us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*, bool*)
+
+==12243== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 69.99%  645.08ms     70000  9.2150us  8.1890us  7.3493ms  cudaLaunch
+ 16.20%  149.30ms    860000     173ns     144ns  1.1943ms  cudaSetupArgument
+ 10.32%  95.084ms     10000  9.5080us  8.7600us  327.83us  cudaMemset
+  1.76%  16.177ms     70000     231ns     200ns  10.120us  cudaGetLastError
+  1.72%  15.875ms     70000     226ns     181ns  5.3450us  cudaConfigureCall
+  0.01%  51.450us         1  51.450us  51.450us  51.450us  cudaMemGetInfo
+  0.00%  25.843us        10  2.5840us  2.0060us  4.6820us  cudaFuncGetAttributes
+  0.00%  25.773us        41     628ns     481ns  2.9340us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+  0.00%  17.259us         1  17.259us  17.259us  17.259us  cudaDeviceSynchronize
+  0.00%  5.8620us        12     488ns     313ns  1.3830us  cudaDeviceGetAttribute
+  0.00%  3.0770us         3  1.0250us     630ns  1.5860us  cudaGetDevice
+
+```
+
+</p></details>
+
+
+<details><summary>Exemplary `nvprof` results for **GeNNConfigurationOptimized**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==12518== NVPROF is profiling process 12518, command: ./main test 1.0 1
+==12518== Profiling application: ./main test 1.0 1
+==12518== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 59.61%  415.51ms     10000  41.550us  2.0480us  6.0015ms  learnSynapsesPost
+ 29.39%  204.87ms     10000  20.486us  1.5680us  2.4941ms  calcSynapses
+ 10.93%  76.180ms     10000  7.6170us  6.6240us  14.560us  calcNeurons
+  0.06%  385.28us        86  4.4800us     960ns  42.752us  [CUDA memcpy HtoD]
+  0.02%  130.11us        20  6.5050us  1.9840us  40.641us  [CUDA memcpy DtoH]
+
+==12518== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 66.01%  690.75ms     30000  23.025us  7.6920us  649.80us  cudaLaunch
+ 29.49%  308.57ms        26  11.868ms  7.6940us  306.48ms  cudaHostAlloc
+  2.65%  27.715ms       112  247.46us     184ns  25.977ms  cudaMemcpy
+  0.97%  10.186ms     30000     339ns     250ns  318.13us  cudaConfigureCall
+  0.77%  8.0652ms     30000     268ns     222ns  319.03us  cudaSetupArgument
+  0.07%  763.51us        26  29.365us  6.1460us  121.30us  cudaMalloc
+  0.02%  226.59us        83  2.7300us     136ns  97.714us  cuDeviceGetAttribute
+  0.00%  31.319us         1  31.319us  31.319us  31.319us  cuDeviceTotalMem
+  0.00%  28.107us         1  28.107us  28.107us  28.107us  cuDeviceGetName
+  0.00%  15.639us        26     601ns     388ns  2.0380us  cudaGetSymbolAddress
+  0.00%  11.574us         1  11.574us  11.574us  11.574us  cudaSetDevice
+  0.00%  1.7010us         2     850ns     538ns  1.1630us  cuDeviceGetCount
+  0.00%  1.5690us         1  1.5690us  1.5690us  1.5690us  cudaGetDeviceCount
+  0.00%     540ns         2     270ns     227ns     313ns  cuDeviceGet
+
+```
+
+</p></details>
+
+
+***
+
+### VogelsWithSynapticDynamic
+![](plots/speed_test_VogelsWithSynapticDynamic_absolute.png)
+![](plots/speed_test_VogelsWithSynapticDynamic_profiling.png)
+![](plots/speed_test_VogelsWithSynapticDynamic_relative.png)
+
+<details><summary>Exemplary `nvprof` results for **CUDAStandaloneConfiguration**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==6312== NVPROF is profiling process 6312, command: ./main
+==6312== Profiling application: ./main
+==6312== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 27.18%  194.20ms     10000  19.419us  3.1680us  2.1194ms  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int*, int, int*, double, int*, int)
+ 22.86%  163.34ms     10000  16.333us  3.1040us  1.6753ms  kernel_synapses_2_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, double*, int, double*, int, int*, double*, int)
+ 14.99%  107.12ms     10000  10.711us  3.2960us  1.1295ms  kernel_synapses_2_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, int*, double, double*, int, double*, double*, int, int*, int, double*, int)
+ 14.22%  101.59ms     10000  10.158us  3.2960us  1.0383ms  kernel_synapses_1_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, int*, double*, double, int*, int)
+  5.84%  41.697ms     10000  4.1690us  3.8720us  5.5360us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double, double*, double*, double*, bool*)
+  4.71%  33.655ms     10000  3.3650us  3.2320us  4.1280us  kernel_synapses_2_stateupdater_codeobject(unsigned int, unsigned int, int*, double*, int, double*, int, double*)
+  4.37%  31.213ms     10000  3.1210us  3.0400us  4.1920us  [CUDA memset]
+  3.37%  24.073ms     10000  2.4070us  2.0160us  5.7920us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*)
+  2.45%  17.497ms     10000  1.7490us  1.5360us  2.7840us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*, bool*)
+
+==6312== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 70.49%  724.20ms     80000  9.0520us  7.8180us  7.3109ms  cudaLaunch
+ 16.18%  166.25ms    940000     176ns     148ns  532.24us  cudaSetupArgument
+  9.28%  95.356ms     10000  9.5350us  8.8100us  1.1346ms  cudaMemset
+  2.07%  21.258ms     80000     265ns     188ns  322.95us  cudaConfigureCall
+  1.97%  20.198ms     80000     252ns     221ns  60.788us  cudaGetLastError
+  0.00%  51.002us         1  51.002us  51.002us  51.002us  cudaMemGetInfo
+  0.00%  42.841us         1  42.841us  42.841us  42.841us  cudaDeviceSynchronize
+  0.00%  41.487us        74     560ns     469ns  2.5840us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+  0.00%  31.858us        12  2.6540us  1.9920us  4.7290us  cudaFuncGetAttributes
+  0.00%  6.5530us        16     409ns     280ns  1.1330us  cudaDeviceGetAttribute
+  0.00%  3.9370us         4     984ns     604ns  1.7060us  cudaGetDevice
+
+```
+
+</p></details>
+
+
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/data/AdaptationOscillation.pkl b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/AdaptationOscillation.pkl
new file mode 100644
index 00000000..8ab15a42
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/AdaptationOscillation.pkl differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/data/BrunelHakimModelHeterogeneousDelay.pkl b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/BrunelHakimModelHeterogeneousDelay.pkl
new file mode 100644
index 00000000..8049b0a5
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/BrunelHakimModelHeterogeneousDelay.pkl differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/data/BrunelHakimModelScalarDelay.pkl b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/BrunelHakimModelScalarDelay.pkl
new file mode 100644
index 00000000..6e2d78a8
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/BrunelHakimModelScalarDelay.pkl differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/data/BrunelHakimModelScalarDelayNoMultiPrePost.pkl b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/BrunelHakimModelScalarDelayNoMultiPrePost.pkl
new file mode 100644
index 00000000..e89cbdf3
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/BrunelHakimModelScalarDelayNoMultiPrePost.pkl differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/data/COBAHH.pkl b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/COBAHH.pkl
new file mode 100644
index 00000000..8e266503
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/COBAHH.pkl differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/data/COBAHHFixedConnectivity.pkl b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/COBAHHFixedConnectivity.pkl
new file mode 100644
index 00000000..fc9586d3
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/COBAHHFixedConnectivity.pkl differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/data/CUBA.pkl b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/CUBA.pkl
new file mode 100644
index 00000000..38353279
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/CUBA.pkl differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/data/CUBAFixedConnectivity.pkl b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/CUBAFixedConnectivity.pkl
new file mode 100644
index 00000000..2902dd42
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/CUBAFixedConnectivity.pkl differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/data/DenseMediumRateSynapsesOnly.pkl b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/DenseMediumRateSynapsesOnly.pkl
new file mode 100644
index 00000000..08896f41
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/DenseMediumRateSynapsesOnly.pkl differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/data/HHNeuronsOnly.pkl b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/HHNeuronsOnly.pkl
new file mode 100644
index 00000000..4c05ff62
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/HHNeuronsOnly.pkl differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/data/LinearNeuronsOnly.pkl b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/LinearNeuronsOnly.pkl
new file mode 100644
index 00000000..671146b7
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/LinearNeuronsOnly.pkl differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/data/STDP.pkl b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/STDP.pkl
new file mode 100644
index 00000000..e2f0bea2
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/STDP.pkl differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/data/STDPEventDriven.pkl b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/STDPEventDriven.pkl
new file mode 100644
index 00000000..07f9c8be
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/STDPEventDriven.pkl differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/data/STDPMultiPost.pkl b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/STDPMultiPost.pkl
new file mode 100644
index 00000000..9e27d459
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/STDPMultiPost.pkl differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/data/STDPMultiPostNeuronalTraces.pkl b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/STDPMultiPostNeuronalTraces.pkl
new file mode 100644
index 00000000..5a3077b1
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/STDPMultiPostNeuronalTraces.pkl differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/data/STDPNeuronalTraces.pkl b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/STDPNeuronalTraces.pkl
new file mode 100644
index 00000000..2fd67b40
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/STDPNeuronalTraces.pkl differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/data/STDPNotEventDriven.pkl b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/STDPNotEventDriven.pkl
new file mode 100644
index 00000000..d3f2bf50
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/STDPNotEventDriven.pkl differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/data/SparseHighRateSynapsesOnly.pkl b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/SparseHighRateSynapsesOnly.pkl
new file mode 100644
index 00000000..e0097385
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/SparseHighRateSynapsesOnly.pkl differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/data/SparseLowRateSynapsesOnly.pkl b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/SparseLowRateSynapsesOnly.pkl
new file mode 100644
index 00000000..874d47a9
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/SparseLowRateSynapsesOnly.pkl differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/data/SparseMediumRateSynapsesOnly.pkl b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/SparseMediumRateSynapsesOnly.pkl
new file mode 100644
index 00000000..f400c591
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/SparseMediumRateSynapsesOnly.pkl differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/data/VerySparseMediumRateSynapsesOnly.pkl b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/VerySparseMediumRateSynapsesOnly.pkl
new file mode 100644
index 00000000..9faa447b
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/VerySparseMediumRateSynapsesOnly.pkl differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/data/Vogels.pkl b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/Vogels.pkl
new file mode 100644
index 00000000..cac93dd3
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/Vogels.pkl differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/data/VogelsWithSynapticDynamic.pkl b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/VogelsWithSynapticDynamic.pkl
new file mode 100644
index 00000000..3d2d4750
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/data/VogelsWithSynapticDynamic.pkl differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/git.diff b/dev/benchmarks/results_2017_04_05_complete_after_talk/git.diff
new file mode 100644
index 00000000..d891c27a
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/git.diff
@@ -0,0 +1,1000 @@
+diff --git a/brian2cuda/device.py b/brian2cuda/device.py
+index 44ca331..b446937 100644
+--- a/brian2cuda/device.py
++++ b/brian2cuda/device.py
+@@ -910,10 +910,13 @@ class CUDAStandaloneDevice(CPPStandaloneDevice):
+             if clock not in all_clocks:
+                 run_lines.append('{net.name}.add(&{clock.name}, NULL, NULL, NULL, NULL);'.format(clock=clock, net=net))
+ 
++        run_lines.append('cudaProfilerStart();')
+         run_lines.append('{net.name}.run({duration!r}, {report_call}, {report_period!r});'.format(net=net,
+                                                                                               duration=float(duration),
+                                                                                               report_call=report_call,
+                                                                                               report_period=float(report_period)))
++        run_lines.append('cudaDeviceSynchronize();')
++        run_lines.append('cudaProfilerStop();')
+         self.main_queue.append(('run_network', (net, run_lines)))
+ 
+         # Manually set the cache for the clocks, simulation scripts might
+diff --git a/brian2cuda/templates/common_group.cu b/brian2cuda/templates/common_group.cu
+index a77b071..85f2639 100644
+--- a/brian2cuda/templates/common_group.cu
++++ b/brian2cuda/templates/common_group.cu
+@@ -145,6 +145,7 @@ void _run_{{codeobj_name}}()
+ 		}
+ 		{% block extra_info_msg %}
+ 		{% endblock %}
++		{% block kernel_info %}
+ 		else
+ 		{
+ 			printf("INFO calling kernel_{{codeobj_name}} with %u blocks and %u threads. "
+@@ -156,6 +157,7 @@ void _run_{{codeobj_name}}()
+ 					funcAttrib.localSizeBytes, funcAttrib.constSizeBytes{% if calc_occupancy %},occupancy{%endif%});
+ 					
+ 		}
++		{% endblock %}
+ 		first_run = false;
+ 	}
+ 	{% endblock prepare_kernel %}
+diff --git a/brian2cuda/templates/main.cu b/brian2cuda/templates/main.cu
+index 8402033..af7cedf 100644
+--- a/brian2cuda/templates/main.cu
++++ b/brian2cuda/templates/main.cu
+@@ -16,6 +16,7 @@
+ 
+ #include <iostream>
+ #include <fstream>
++#include "cuda_profiler_api.h"
+ 
+ {{report_func|autoindent}}
+ 
+diff --git a/brian2cuda/templates/statemonitor.cu b/brian2cuda/templates/statemonitor.cu
+index f43ddf7..73c1b13 100644
+--- a/brian2cuda/templates/statemonitor.cu
++++ b/brian2cuda/templates/statemonitor.cu
+@@ -7,6 +7,8 @@
+ {# remove this once we have properly defined num_threads, num_blocks here... #}
+ {% block occupancy %}
+ {% endblock occupancy %}
++{% block kernel_info %}
++{% endblock %}
+ 
+ {% block prepare_kernel_inner %}
+ {% for varname, var in _recorded_variables | dictsort %}
+diff --git a/brian2cuda/templates/synapses_create_array.cu b/brian2cuda/templates/synapses_create_array.cu
+index 5741b08..e5ae279 100644
+--- a/brian2cuda/templates/synapses_create_array.cu
++++ b/brian2cuda/templates/synapses_create_array.cu
+@@ -17,6 +17,9 @@
+ {% block occupancy %}
+ {% endblock occupancy %}
+ 
++{% block kernel_info %}
++{% endblock %}
++
+ {% block define_N %}
+ {% endblock %}
+ 
+diff --git a/brian2cuda/templates/synapses_create_generator.cu b/brian2cuda/templates/synapses_create_generator.cu
+index cb06a4f..e663cf3 100644
+--- a/brian2cuda/templates/synapses_create_generator.cu
++++ b/brian2cuda/templates/synapses_create_generator.cu
+@@ -20,6 +20,9 @@
+ {% block occupancy %}
+ {% endblock %}
+ 
++{% block kernel_info %}
++{% endblock %}
++
+ {% block define_N %}
+ {% endblock %}
+ 
+diff --git a/brian2cuda/templates/synapses_initialise_queue.cu b/brian2cuda/templates/synapses_initialise_queue.cu
+index 869812f..1fae5cb 100644
+--- a/brian2cuda/templates/synapses_initialise_queue.cu
++++ b/brian2cuda/templates/synapses_initialise_queue.cu
+@@ -161,10 +161,12 @@ void _run_{{pathobj}}_initialise_queue()
+ 	{% endif %}
+ 
+ 
++	int size_connectivity_matrix = 0;
+ 	//fill temp arrays with device pointers
+ 	for(int i = 0; i < num_parallel_blocks*source_N; i++)  // loop through connectivity matrix
+ 	{
+ 		int num_elements = h_synapses_by_pre_id[i].size();
++		size_connectivity_matrix += num_elements;
+ 		temp_size_by_pre_id[i] = num_elements;
+ 		if (num_elements > {{pathobj}}_max_size)
+ 			{{pathobj}}_max_size = num_elements;
+@@ -281,6 +283,7 @@ void _run_{{pathobj}}_initialise_queue()
+ 			{% endif %}
+ 		}
+ 	}
++	printf("INFO connectivity matrix has size %i\n", size_connectivity_matrix);
+ 
+ 
+ 	//copy temp arrays to device
+diff --git a/brian2cuda/tests/features/cuda_configuration.py b/brian2cuda/tests/features/cuda_configuration.py
+index acaae7d..b2b93a1 100644
+--- a/brian2cuda/tests/features/cuda_configuration.py
++++ b/brian2cuda/tests/features/cuda_configuration.py
+@@ -26,6 +26,22 @@ class CUDAStandaloneConfiguration(Configuration):
+         brian2.device.build(directory='cuda_standalone', compile=True, run=True,
+                             with_output=False)
+ 
++class CUDAStandaloneConfigurationNoAssert(Configuration):
++    name = 'CUDA standalone (asserts disabled)'
++    def before_run(self):
++        brian2.set_device('cuda_standalone', build_on_run=False, disable_asserts=True)
++        if socket.gethostname() == 'elnath':
++            if prefs['devices.cpp_standalone.extra_make_args_unix'] == ['-j12']:
++                prefs['devices.cpp_standalone.extra_make_args_unix'] = ['-j24']
++            prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35')
++            prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_20'])
++
++    def after_run(self):
++        if os.path.exists('cuda_standalone'):
++            shutil.rmtree('cuda_standalone')
++        brian2.device.build(directory='cuda_standalone', compile=True, run=True,
++                            with_output=False)
++
+ class CUDAStandaloneConfigurationCurandDouble(Configuration):
+     name = 'CUDA standalone (curand_float_type = double)'
+     def before_run(self):
+diff --git a/brian2cuda/tests/features/speed.py b/brian2cuda/tests/features/speed.py
+index 49c6e52..c0b4887 100644
+--- a/brian2cuda/tests/features/speed.py
++++ b/brian2cuda/tests/features/speed.py
+@@ -7,8 +7,23 @@ from brian2.tests.features.speed import *
+ 
+ from brian2.tests.features.speed import __all__
+ __all__.extend(['AdaptationOscillation',
++                'ThresholderOnlyAlwaysSpiking',
++                'ThresholderOnlyPoissonLowRate',
++                'ThresholderOnlyPoissonMediumRate',
++                'ThresholderOnlyPoissonHighRate',
++                'BrunelHakimNeuronsOnly',
++                'BrunelHakimStateupdateOnly',
++                'BrunelHakimStateupdateOnlyDouble',
++                'BrunelHakimStateupdateOnlyTriple',
++                'BrunelHakimStateupdateThresholdOnly',
++                'BrunelHakimStateupdateThresholdResetOnly',
++                'BrunelHakimNeuronsOnlyNoXi',
++                'BrunelHakimNeuronsOnlyNoRand',
+                 'BrunelHakimModelScalarDelay',
++                'BrunelHakimModelScalarDelayNoMultiPrePost',
++                'BrunelHakimModelScalarDelayShort',
+                 'BrunelHakimModelHeterogeneousDelay',
++                'CUBA',
+                 'COBAHH',
+                 'STDPEventDriven',
+                 'STDPNotEventDriven',
+@@ -73,12 +88,334 @@ class AdaptationOscillation(SpeedTest):
+         
+         self.timed_run(self.duration)
+ 
++class BrunelHakimNeuronsOnly(SpeedTest):
++    
++    category = "Neurons only"
++    name = "Brunel Hakim"
++    tags = ["Neurons"]
++    n_range = [10, 100, 1000, 10000, 20000, 50000, 100000, 1000000, 10000000]
++    n_label = 'Num neurons'
++
++    # configuration options
++    duration = 1*second
++    
++    def run(self):
++        N = self.n
++        Vr = 10*mV
++        theta = 20*mV
++        tau = 20*ms
++        delta = 2*ms
++        taurefr = 2*ms
++        C = 1000
++        sparseness = float(C)/N
++        J = .1*mV
++        muext = 25*mV
++        sigmaext = 1*mV
++        
++        eqs = """
++        dV/dt = (-V+muext + sigmaext * sqrt(tau) * xi)/tau : volt
++        """
++        
++        self.group = group = NeuronGroup(N, eqs, threshold='V>theta',
++                            reset='V=Vr', refractory=taurefr)
++        group.V = Vr
++        
++        self.timed_run(self.duration)
++
++class BrunelHakimStateupdateOnlyTriple(SpeedTest):
++    
++    category = "Neurons only"
++    name = "Brunel Hakim (3 x stateupdate)"
++    tags = ["Neurons"]
++    n_range = [10, 100, 1000, 10000, 20000, 50000, 100000, 1000000, 10000000]
++    n_label = 'Num neurons'
++
++    # configuration options
++    duration = 1*second
++    
++    def run(self):
++        N = self.n
++        Vr = 10*mV
++        theta = 20*mV
++        tau = 20*ms
++        delta = 2*ms
++        taurefr = 2*ms
++        C = 1000
++        sparseness = float(C)/N
++        J = .1*mV
++        muext = 25*mV
++        sigmaext = 1*mV
++        
++        eqs = """
++        dV/dt = (-V+muext + sigmaext * sqrt(tau) * xi)/tau : volt
++        """
++        
++        self.group = group = NeuronGroup(N, eqs)#, threshold='V>theta',
++                            #reset='V=Vr', refractory=taurefr)
++        group.V = Vr
++
++        self.group2 = group2 = NeuronGroup(N, eqs)
++        group2.V = Vr
++
++        self.group3 = group3 = NeuronGroup(N, eqs)
++        group3.V = Vr
++        
++        self.timed_run(self.duration)
++
++
++class BrunelHakimStateupdateOnlyDouble(SpeedTest):
++    
++    category = "Neurons only"
++    name = "Brunel Hakim (2 x stateupdate)"
++    tags = ["Neurons"]
++    n_range = [10, 100, 1000, 10000, 20000, 50000, 100000, 1000000, 10000000]
++    n_label = 'Num neurons'
++
++    # configuration options
++    duration = 1*second
++    
++    def run(self):
++        N = self.n
++        Vr = 10*mV
++        theta = 20*mV
++        tau = 20*ms
++        delta = 2*ms
++        taurefr = 2*ms
++        C = 1000
++        sparseness = float(C)/N
++        J = .1*mV
++        muext = 25*mV
++        sigmaext = 1*mV
++        
++        eqs = """
++        dV/dt = (-V+muext + sigmaext * sqrt(tau) * xi)/tau : volt
++        """
++        
++        self.group = group = NeuronGroup(N, eqs)#, threshold='V>theta',
++                            #reset='V=Vr', refractory=taurefr)
++        group.V = Vr
++
++        self.group2 = group2 = NeuronGroup(N, eqs)
++        group2.V = Vr
++        
++        self.timed_run(self.duration)
++
++
++class BrunelHakimStateupdateOnly(SpeedTest):
++    
++    category = "Neurons only"
++    name = "Brunel Hakim (stateupdate)"
++    tags = ["Neurons"]
++    n_range = [10, 100, 1000, 10000, 20000, 50000, 100000, 1000000, 10000000]
++    n_label = 'Num neurons'
++
++    # configuration options
++    duration = 1*second
++    
++    def run(self):
++        N = self.n
++        Vr = 10*mV
++        theta = 20*mV
++        tau = 20*ms
++        delta = 2*ms
++        taurefr = 2*ms
++        C = 1000
++        sparseness = float(C)/N
++        J = .1*mV
++        muext = 25*mV
++        sigmaext = 1*mV
++        
++        eqs = """
++        dV/dt = (-V+muext + sigmaext * sqrt(tau) * xi)/tau : volt
++        """
++        
++        self.group = group = NeuronGroup(N, eqs)#, threshold='V>theta',
++                            #reset='V=Vr', refractory=taurefr)
++        group.V = Vr
++        
++        self.timed_run(self.duration)
++
++class BrunelHakimStateupdateThresholdOnly(SpeedTest):
++    
++    category = "Neurons only"
++    name = "Brunel Hakim (stateupdate + threshold)"
++    tags = ["Neurons"]
++    n_range = [10, 100, 1000, 10000, 20000, 50000, 100000, 1000000, 10000000]
++    n_label = 'Num neurons'
++
++    # configuration options
++    duration = 1*second
++    
++    def run(self):
++        N = self.n
++        Vr = 10*mV
++        theta = 20*mV
++        tau = 20*ms
++        delta = 2*ms
++        taurefr = 2*ms
++        C = 1000
++        sparseness = float(C)/N
++        J = .1*mV
++        muext = 25*mV
++        sigmaext = 1*mV
++        
++        eqs = """
++        dV/dt = (-V+muext + sigmaext * sqrt(tau) * xi)/tau : volt
++        """
++        
++        self.group = group = NeuronGroup(N, eqs, threshold='V>theta')
++        group.V = Vr
++        
++        self.timed_run(self.duration)
++
++class BrunelHakimStateupdateThresholdResetOnly(SpeedTest):
++    
++    category = "Neurons only"
++    name = "Brunel Hakim (stateupdate + threshold + reset)"
++    tags = ["Neurons"]
++    n_range = [10, 100, 1000, 10000, 20000, 50000, 100000, 1000000, 10000000]
++    n_label = 'Num neurons'
++
++    # configuration options
++    duration = 1*second
++    
++    def run(self):
++        N = self.n
++        Vr = 10*mV
++        theta = 20*mV
++        tau = 20*ms
++        delta = 2*ms
++        taurefr = 2*ms
++        C = 1000
++        sparseness = float(C)/N
++        J = .1*mV
++        muext = 25*mV
++        sigmaext = 1*mV
++        
++        eqs = """
++        dV/dt = (-V+muext + sigmaext * sqrt(tau) * xi)/tau : volt
++        """
++        
++        self.group = group = NeuronGroup(N, eqs, threshold='V>theta',
++                            reset='V=Vr')
++        group.V = Vr
++        
++        self.timed_run(self.duration)
++
++class BrunelHakimNeuronsOnlyNoXi(SpeedTest):
++    
++    category = "Neurons only"
++    name = "Brunel Hakim (no xi)"
++    tags = ["Neurons"]
++    n_range = [10, 100, 1000, 10000, 20000, 50000, 100000, 1000000, 10000000]
++    n_label = 'Num neurons'
++
++    # configuration options
++    duration = 1*second
++    
++    def run(self):
++        N = self.n
++        Vr = 10*mV
++        theta = 20*mV
++        tau = 20*ms
++        delta = 2*ms
++        taurefr = 2*ms
++        C = 1000
++        sparseness = float(C)/N
++        J = .1*mV
++        muext = 25*mV
++        sigmaext = 1*mV
++        
++        eqs = """
++        dV/dt = (-V+muext + sigmaext * sqrt(tau/ms))/tau : volt
++        """
++        
++        self.group = group = NeuronGroup(N, eqs, threshold='V>theta',
++                            reset='V=Vr', refractory=taurefr)
++        group.V = Vr
++        
++        self.timed_run(self.duration)
++
++class BrunelHakimNeuronsOnlyNoRand(SpeedTest):
++    
++    category = "Neurons only"
++    name = "Brunel Hakim (no rand)"
++    tags = ["Neurons"]
++    n_range = [10, 100, 1000, 10000, 20000, 50000, 100000, 1000000, 10000000]
++    n_label = 'Num neurons'
++
++    # configuration options
++    duration = 1*second
++    
++    def run(self):
++        N = self.n
++        Vr = 10*mV
++        theta = 20*mV
++        tau = 20*ms
++        delta = 2*ms
++        taurefr = 2*ms
++        C = 1000
++        sparseness = float(C)/N
++        J = .1*mV
++        muext = 25*mV
++        sigmaext = 1*mV
++        
++        myxi = np.random.randn(N)
++        
++        eqs = """
++        dV/dt = (-V+muext + sigmaext * sqrt(tau) * myxi/sqrt(ms))/tau : volt
++        myxi : 1
++        """
++        
++        self.group = group = NeuronGroup(N, eqs, threshold='V>theta',
++                            reset='V=Vr', refractory=taurefr)
++        group.V = Vr
++        group.myxi = myxi
++        
++        self.timed_run(self.duration)
++
++class BrunelHakimModelScalarDelayNoMultiPrePost(SpeedTest):
++    
++    category = "Full examples"
++    name = "Brunel Hakim with scalar delay (1s, no multip pre-post connections)"
++    tags = ["Neurons", "Synapses"]
++    n_range = [10, 100, 1000, 10000, 20000, 50000, 100000, 250000]#, 350000]#500000, 1000000]
++    n_label = 'Num neurons'
++
++    # configuration options
++    duration = 1*second
++    
++    def run(self):
++        N = self.n
++        Vr = 10*mV
++        theta = 20*mV
++        tau = 20*ms
++        delta = 2*ms
++        taurefr = 2*ms
++        C = 1000
++        sparseness = float(C)/N
++        J = .1*mV
++        muext = 25*mV
++        sigmaext = 1*mV
++        
++        eqs = """
++        dV/dt = (-V+muext + sigmaext * sqrt(tau) * xi)/tau : volt
++        """
++        
++        group = NeuronGroup(N, eqs, threshold='V>theta',
++                            reset='V=Vr', refractory=taurefr)
++        group.V = Vr
++        conn = Synapses(group, group, on_pre='V += -J', delay=delta)
++        conn.connect('i!=j and rand()<sparseness')
++        
++        self.timed_run(self.duration)
++        
+ class BrunelHakimModelScalarDelay(SpeedTest):
+     
+     category = "Full examples"
+-    name = "Brunel Hakim with scalar delay"
++    name = "Brunel Hakim with scalar delay (1s)"
+     tags = ["Neurons", "Synapses"]
+-    n_range = [10, 100, 1000, 10000, 20000, 50000, 100000]
++    n_range = [10, 100, 1000, 10000, 20000, 50000, 100000, 250000]#, 350000]#500000, 1000000]
+     n_label = 'Num neurons'
+ 
+     # configuration options
+@@ -91,7 +428,6 @@ class BrunelHakimModelScalarDelay(SpeedTest):
+         tau = 20*ms
+         delta = 2*ms
+         taurefr = 2*ms
+-        duration = .1*second
+         C = 1000
+         sparseness = float(C)/N
+         J = .1*mV
+@@ -108,8 +444,44 @@ class BrunelHakimModelScalarDelay(SpeedTest):
+         conn = Synapses(group, group, on_pre='V += -J', delay=delta)
+         conn.connect('rand()<sparseness')
+         
+-        self.timed_run(duration)
++        self.timed_run(self.duration)
+         
++class BrunelHakimModelScalarDelayShort(SpeedTest):
++    
++    category = "Full examples"
++    name = "Brunel Hakim with scalar delay (0.01s)"
++    tags = ["Neurons", "Synapses"]
++    n_range = [10, 100, 1000, 10000, 20000, 50000, 100000]#, 200000, 500000, 1000000]
++    n_label = 'Num neurons'
++
++    # configuration options
++    duration = 0.01*second
++    
++    def run(self):
++        N = self.n
++        Vr = 10*mV
++        theta = 20*mV
++        tau = 20*ms
++        delta = 2*ms
++        taurefr = 2*ms
++        C = 1000
++        sparseness = float(C)/N
++        J = .1*mV
++        muext = 25*mV
++        sigmaext = 1*mV
++        
++        eqs = """
++        dV/dt = (-V+muext + sigmaext * sqrt(tau) * xi)/tau : volt
++        """
++        
++        group = NeuronGroup(N, eqs, threshold='V>theta',
++                            reset='V=Vr', refractory=taurefr)
++        group.V = Vr
++        conn = Synapses(group, group, on_pre='V += -J', delay=delta)
++        conn.connect('rand()<sparseness')
++        
++        self.timed_run(self.duration)
++
+ class BrunelHakimModelHeterogeneousDelay(SpeedTest):
+     
+     category = "Full examples"
+@@ -128,7 +500,6 @@ class BrunelHakimModelHeterogeneousDelay(SpeedTest):
+         tau = 20*ms
+         delta = 2*ms
+         taurefr = 2*ms
+-        duration = .1*second
+         C = 1000
+         sparseness = float(C)/N
+         J = .1*mV
+@@ -146,7 +517,86 @@ class BrunelHakimModelHeterogeneousDelay(SpeedTest):
+         conn.connect('rand()<sparseness')
+         conn.delay = "delta * 2 * rand()"
+         
+-        self.timed_run(duration)
++        self.timed_run(self.duration)
++
++class ThresholderOnly(SpeedTest):
++    category = "Neurons only"
++    name = "Thresholder only"
++    tags = ["Neurons"]
++    n_range = [10, 100, 1000, 10000, 20000, 50000, 100000, 1000000, 10000000]
++    n_label = 'Num neurons'
++    rate = None
++
++    # configuration options
++    duration = 1 * second
++
++    def run(self):
++        N = self.n
++        rate = self.rate
++        self.group = group = NeuronGroup(N, 'v:1', threshold=self.threshold_condition)
++        self.timed_run(self.duration)
++
++class ThresholderOnlyAlwaysSpiking(ThresholderOnly, SpeedTest):
++    name = "Thresholder only (always spiking)"
++    threshold_condition = 'True'
++
++class ThresholderOnlyPoissonHighRate(ThresholderOnly, SpeedTest):
++    name = "Thresholder only (high rate)"
++    rate = 100 * Hz
++    threshold_condition = 'rand() < rate*dt'
++
++class ThresholderOnlyPoissonMediumRate(ThresholderOnly, SpeedTest):
++    name = "Thresholder only (medium rate)"
++    rate = 10 * Hz
++    threshold_condition = 'rand() < rate*dt'
++
++class ThresholderOnlyPoissonLowRate(ThresholderOnly, SpeedTest):
++    name = "Thresholder only (low rate)"
++    rate = 1 * Hz
++    threshold_condition = 'rand() < rate*dt'
++
++class CUBA(SpeedTest):
++
++    category = "Full examples"
++    name = "CUBA fixed connectivity"
++    tags = ["Neurons", "Synapses"]
++    n_range = [10, 100, 1000, 10000, 100000, 1000000]
++    n_label = 'Num neurons'
++
++    # configuration options
++    duration = 1 * second
++
++    def run(self):
++        N = self.n
++        Ne = int(.8 * N)
++
++        taum = 20 * ms
++        taue = 5 * ms
++        taui = 10 * ms
++        Vt = -50 * mV
++        Vr = -60 * mV
++        El = -49 * mV
++
++        eqs = '''
++        dv/dt  = (ge+gi-(v-El))/taum : volt (unless refractory)
++        dge/dt = -ge/taue : volt (unless refractory)
++        dgi/dt = -gi/taui : volt (unless refractory)
++        '''
++
++        P = NeuronGroup(
++            N, eqs, threshold='v>Vt', reset='v = Vr', refractory=5 * ms)
++        P.v = 'Vr + rand() * (Vt - Vr)'
++        P.ge = 0 * mV
++        P.gi = 0 * mV
++
++        we = (60 * 0.27 / 10) * mV  # excitatory synaptic weight (voltage)
++        wi = (-20 * 4.5 / 10) * mV  # inhibitory synaptic weight
++        Ce = Synapses(P, P, on_pre='ge += we')
++        Ci = Synapses(P, P, on_pre='gi += wi')
++        Ce.connect('i<Ne', p=80. / N)
++        Ci.connect('i>=Ne', p=80. / N)
++
++        self.timed_run(self.duration)
+ 
+ class COBAHH(SpeedTest):
+     
+@@ -223,7 +673,7 @@ class STDPEventDriven(SpeedTest):
+     category = "Full examples"
+     name = "STDP (event-driven)"
+     tags = ["Neurons", "Synapses"]
+-    n_range = [10, 100, 1000, 10000, 20000, 50000, 100000]
++    n_range = [10, 100, 1000, 10000, 20000, 50000, 100000, 1000000, 5000000]
+     n_label = 'Num neurons'
+ 
+     # configuration options
+@@ -543,18 +993,18 @@ class Vogels(SpeedTest):
+         
+         eqs_stdp_inhib = '''
+         w : 1
+-        dA_pre/dt=-A_pre/tau_stdp : 1 (event-driven)
+-        dA_post/dt=-A_post/tau_stdp : 1 (event-driven)
++        dApre/dt=-Apre/tau_stdp : 1 (event-driven)
++        dApost/dt=-Apost/tau_stdp : 1 (event-driven)
+         '''
+         alpha = 3*Hz*tau_stdp*2  # Target rate parameter
+         gmax = 100               # Maximum inhibitory weight
+         
+         con_ie = Synapses(Pi, Pe, model=eqs_stdp_inhib,
+-                          on_pre='''A_pre += 1.
+-                                 w = clip(w+(A_post-alpha)*eta, 0, gmax)
++                          on_pre='''Apre += 1.
++                                 w = clip(w+(Apost-alpha)*eta, 0, gmax)
+                                  g_gaba += w*nS''',
+-                          on_post='''A_post += 1.
+-                                  w = clip(w+A_pre*eta, 0, gmax)
++                          on_post='''Apost += 1.
++                                  w = clip(w+Apre*eta, 0, gmax)
+                                '''
+                          )
+         con_ie.connect('rand()<epsilon')
+@@ -630,5 +1080,5 @@ class VogelsWithSynapticDynamic(SpeedTest):
+ 
+ if __name__=='__main__':
+     #prefs.codegen.target = 'numpy'
+-    VerySparseMediumRateSynapsesOnly(100000).run()
+-    show()
++    ThresholderOnlyPoissonLowRate(10).run()
++    #show()
+diff --git a/brian2cuda/tests/test_profiling.py b/brian2cuda/tests/test_profiling.py
+index 7f2c7da..687bf7d 100644
+--- a/brian2cuda/tests/test_profiling.py
++++ b/brian2cuda/tests/test_profiling.py
+@@ -80,6 +80,6 @@ def test_profile_build_raises():
+     assert_raises(TypeError, lambda: device.build(profile='string'))
+ 
+ if __name__ == '__main__':
+-    #test_profile_in_run_raises()
++    test_profile_in_run_raises()
+     #test_profile_wrong_raises()
+-    test_profile_build_raises()
++    #test_profile_build_raises()
+diff --git a/dev/benchmarks/run_speed_tests.py b/dev/benchmarks/run_speed_tests.py
+index ea45b35..865118a 100644
+--- a/dev/benchmarks/run_speed_tests.py
++++ b/dev/benchmarks/run_speed_tests.py
+@@ -22,12 +22,14 @@ from brian2.tests.features.base import results
+ 
+ import brian2cuda
+ from brian2cuda.tests.features.cuda_configuration import (CUDAStandaloneConfiguration,
+-                                                          CUDAStandaloneConfigurationUseCudaOccupancyAPI,
+-                                                          CUDAStandaloneConfigurationUseCudaOccupancyAPIProfileCPU,
++                                                          CUDAStandaloneConfigurationNoAssert,
++                                                          CUDAStandaloneConfigurationCurandDouble,
++                                                          CUDAStandaloneConfigurationNoCudaOccupancyAPI,
++                                                          CUDAStandaloneConfigurationNoCudaOccupancyAPIProfileCPU,
+                                                           CUDAStandaloneConfiguration2BlocksPerSM,
+                                                           CUDAStandaloneConfiguration2BlocksPerSMLaunchBounds,
+-                                                          CUDAStandaloneConfigurationSynLaunchBoundsOccup,
+-                                                          CUDAStandaloneConfiguration2BlocksPerSMSynLaunchBoundsOccup,
++                                                          CUDAStandaloneConfigurationSynLaunchBounds,
++                                                          CUDAStandaloneConfiguration2BlocksPerSMSynLaunchBounds,
+                                                           CUDAStandaloneConfigurationProfileGPU,
+                                                           CUDAStandaloneConfigurationProfileCPU)
+ from brian2cuda.tests.features.speed import *
+@@ -48,16 +50,19 @@ configs = [# configuration                          project_directory
+           #(NumpyConfiguration,                     None),
+           #(WeaveConfiguration,                     None),
+           #(LocalConfiguration,                     None),
+-          #(CUDAStandaloneConfiguration,             'cuda_standalone'),
+-          (CUDAStandaloneConfigurationUseCudaOccupancyAPI,      'cuda_standalone'),
+-          #(CUDAStandaloneConfigurationUseCudaOccupancyAPIProfileCPU,    'cuda_standalone'),
++          (CUDAStandaloneConfiguration,             'cuda_standalone'),
++          #(CUDAStandaloneConfigurationNoAssert,             'cuda_standalone'),
++          #(CUDAStandaloneConfigurationNoThreadfence,  'cuda_standalone'),
++          #(CUDAStandaloneConfigurationCurandDouble,              'cuda_standalone'),
++          #(CUDAStandaloneConfigurationNoCudaOccupancyAPI,      'cuda_standalone'),
++          #(CUDAStandaloneConfigurationNoCudaOccupancyAPIProfileCPU,    'cuda_standalone'),
+           #(CUDAStandaloneConfiguration2BlocksPerSM, 'cuda_standalone'),
+           #(CUDAStandaloneConfiguration2BlocksPerSMLaunchBounds, 'cuda_standalone'),
+-          (CUDAStandaloneConfigurationSynLaunchBoundsOccup,     'cuda_standalone'),
+-          (CUDAStandaloneConfiguration2BlocksPerSMSynLaunchBoundsOccup, 'cuda_standalone'),
++          #(CUDAStandaloneConfigurationSynLaunchBounds,     'cuda_standalone'),
++          #(CUDAStandaloneConfiguration2BlocksPerSMSynLaunchBounds, 'cuda_standalone'),
+           #(CUDAStandaloneConfigurationProfileGPU,   'cuda_standalone'),
+           #(CUDAStandaloneConfigurationProfileCPU,   'cuda_standalone'),
+-          #(CPPStandaloneConfiguration,              'cpp_standalone'),
++          (CPPStandaloneConfiguration,              'cpp_standalone'),
+           #(GeNNConfiguration,                       'GeNNworkspace'),
+           #(CPPStandaloneConfigurationOpenMP,        'cpp_standalone'),
+           #(GeNNConfigurationCPU,                    'GeNNworkspace'),
+@@ -65,18 +70,29 @@ configs = [# configuration                          project_directory
+           ]
+ 
+ speed_tests = [# feature_test                     name                                  n_slice
+-               (LinearNeuronsOnly,                     'LinearNeuronsOnly',                   slice(None)         ),
+-               (HHNeuronsOnly,                         'HHNeuronsOnly',                       slice(None)         ),
+-
+-               (BrunelHakimModelScalarDelay,           'BrunelHakimModelScalarDelay',         slice(None)         ),
+-               (BrunelHakimModelHeterogeneousDelay,    'BrunelHakimModelHeterogeneousDelay',  slice(None)         ),
+ 
++               #(ThresholderOnlyPoissonLowRate,                         'ThresholderOnlyPoissonLowRate',                       slice(None)         ),
++               #(ThresholderOnlyPoissonMediumRate,                         'ThresholderOnlyPoissonMediumRate',                       slice(None)         ),
++               #(ThresholderOnlyPoissonHighRate,                         'ThresholderOnlyPoissonHighRate',                       slice(None)         ),
++               #(ThresholderOnlyAlwaysSpiking,                         'ThresholderOnlyAlwaysSpiking',                       slice(None)         ),
++
++               #(BrunelHakimStateupdateOnlyDouble,           'BrunelHakimStateupdateOnlyDouble',         slice(None)         ),
++               #(BrunelHakimStateupdateOnlyTriple,           'BrunelHakimStateupdateOnlyTriple',         slice(None)         ),
++               #(BrunelHakimStateupdateOnly,           'BrunelHakimStateupdateOnly',         slice(None)         ),
++               #(BrunelHakimNeuronsOnly,           'BrunelHakimNeuronsOnly',         slice(None)         ),
++               #(BrunelHakimNeuronsOnlyNoXi,           'BrunelHakimNeuronsOnlyNoXi',         slice(None)         ),
++               #(BrunelHakimNeuronsOnlyNoRand,           'BrunelHakimNeuronsOnlyNoRand',         slice(None)         ),
++               #(BrunelHakimStateupdateThresholdOnly,           'BrunelHakimStateupdateThresholdOnly',         slice(None)         ),
++               #(BrunelHakimStateupdateThresholdResetOnly,           'BrunelHakimStateupdateThresholdResetOnly',         slice(None)         ),
++               #(BrunelHakimModelScalarDelayShort,      'BrunelHakimModelScalarDelayShort',     slice(None)         ),
++              (CUBA,                                 'CUBA',                              slice(None)         ),
++              (COBAHH,                                 'COBAHH',                              slice(None)         ),
++              (AdaptationOscillation,                  'AdaptationOscillation',               slice(None)         ),
++              (Vogels,                                 'Vogels',                              slice(None)         ),
+                (STDP,                                   'STDP',                                slice(None)         ),
+-               (STDPEventDriven,                        'STDPEventDriven',                     slice(None)         ),
+-               (STDPNotEventDriven,                     'STDPNotEventDriven',                  slice(None)         ),
+-               (STDPMultiPost,                          'STDPMultiPost',                        slice(None)         ),
+-               (STDPNeuronalTraces,                     'STDPNeuronalTraces',                   slice(None)         ),
+-               (STDPMultiPostNeuronalTraces,            'STDPMultiPostNeuronalTraces',          slice(None)         ),
++              (STDPEventDriven,                        'STDPEventDriven',                     slice(None)         ),
++              (BrunelHakimModelScalarDelay,           'BrunelHakimModelScalarDelay',         slice(None)         ),
++              (BrunelHakimModelScalarDelayNoMultiPrePost,           'BrunelHakimModelScalarDelayNoMultiPrePost',         slice(None)         ),
+ 
+                (VerySparseMediumRateSynapsesOnly,       'VerySparseMediumRateSynapsesOnly',    slice(None)         ),
+                (SparseMediumRateSynapsesOnly,           'SparseMediumRateSynapsesOnly',        slice(None)         ),
+@@ -84,13 +100,19 @@ speed_tests = [# feature_test                     name
+                (SparseLowRateSynapsesOnly,              'SparseLowRateSynapsesOnly',           slice(None)         ),
+                (SparseHighRateSynapsesOnly,             'SparseHighRateSynapsesOnly',          slice(None)         ),
+ 
+-               (AdaptationOscillation,                  'AdaptationOscillation',               slice(None)         ),
+-               (COBAHH,                                 'COBAHH',                              slice(None)         ),
+-               (Vogels,                                 'Vogels',                              slice(None)         ),
+-               (VogelsWithSynapticDynamic,              'VogelsWithSynapticDynamic',           slice(None)         ),
++               (STDPNotEventDriven,                     'STDPNotEventDriven',                  slice(None)         ),
++               (STDPMultiPost,                          'STDPMultiPost',                        slice(None)         ),
++               (STDPNeuronalTraces,                     'STDPNeuronalTraces',                   slice(None)         ),
++               (STDPMultiPostNeuronalTraces,            'STDPMultiPostNeuronalTraces',          slice(None)         ),
+ 
+-               (COBAHHFixedConnectivity,                'COBAHHFixedConnectivity',             slice(None, -1)     ),
++              (BrunelHakimModelHeterogeneousDelay,    'BrunelHakimModelHeterogeneousDelay',  slice(None)         ),
++
++              (LinearNeuronsOnly,                     'LinearNeuronsOnly',                   slice(None)         ),
++              (HHNeuronsOnly,                         'HHNeuronsOnly',                       slice(None)         ),
++               (VogelsWithSynapticDynamic,              'VogelsWithSynapticDynamic',           slice(None)         ),
++## below uses monitors
+                (CUBAFixedConnectivity,                 'CUBAFixedConnectivity',               slice(None)         ),
++               (COBAHHFixedConnectivity,                'COBAHHFixedConnectivity',             slice(None, -1)     ),
+ ]
+ 
+ configurations = [config[0] for config in configs]
+@@ -140,22 +162,24 @@ try:
+         start = datetime.datetime.fromtimestamp(time.time()).strftime(time_format)
+         print("Starting {} on {}.".format(name, start))
+         maximum_run_time = 1*60*60*second
+-        st.duration = 10*second
++        #st.duration = 10*second
+         res = run_speed_tests(configurations=configurations,
+                               speed_tests=[st],
+                               n_slice=sl,
+-                              #n_slice=slice(0,2,None),
+-                              #run_twice=False,
++                              #n_slice=slice(0,1,None),
++                              run_twice=False,
+                               verbose=True,
+-                              maximum_run_time=maximum_run_time)
++                              maximum_run_time=maximum_run_time,
++                              profile_only_active=True)
++                              #profile_only_active=False)
+         end = datetime.datetime.fromtimestamp(time.time()).strftime(time_format)
+         diff = datetime.datetime.strptime(end, time_format) - datetime.datetime.strptime(start, time_format)
+         print("Running {} took {}.".format(name, diff))
+-        res.plot_all_tests()
++        res.plot_all_tests(print_relative=True)
+         savefig(os.path.join(plot_dir, 'speed_test_{}_absolute.png'.format(speed_tests[n][1])))
+         res.plot_all_tests(relative=True)
+         savefig(os.path.join(plot_dir, 'speed_test_{}_relative.png'.format(name)))
+-        res.plot_all_tests(profiling_minimum=0.15)
++        res.plot_all_tests(profiling_minimum=0.05)
+         savefig(os.path.join(plot_dir, 'speed_test_{}_profiling.png'.format(name)))
+         if 3 != len(get_fignums()):
+             print("WARNING: There were {} plots created, but only {} saved.".format(len(get_fignums()), 3*(n+1)))
+@@ -192,8 +216,9 @@ try:
+                 print("Rerunning {} with n = {} for nvprof profiling".format(conf_name, st.n_range[idx]))
+                 tb, res, runtime, prof_info = results(conf, st, st.n_range[idx], maximum_run_time=maximum_run_time)
+                 if not isinstance(res, Exception) and runtime < max_runtime:
+-                    cmd = 'cd {proj_dir} && nvprof --profile-from-start-off --log-file ../{log_file} ./main {arg}'.format(
+-                        proj_dir=proj_dir, arg=main_arg,
++                    option = '--profile-from-start-off' if proj_dir == 'cuda_standalone' else ''
++                    cmd = 'cd {proj_dir} && nvprof {opt} --log-file ../{log_file} ./main {arg}'.format(
++                        proj_dir=proj_dir, arg=main_arg, opt=option,
+                         log_file=os.path.join(prof_dir, 'nvprof_{st}_{conf}_{n}.log'.format(
+                             st=name, conf=conf_name, n=st.n_range[idx])))
+                     prof_start = datetime.datetime.fromtimestamp(time.time()).strftime(time_format)
+diff --git a/dev/issues/issue9_spikespace/global_atomicAdds/code_objects/neurongroup_thresholder_codeobject.cu b/dev/issues/issue9_spikespace/global_atomicAdds/code_objects/neurongroup_thresholder_codeobject.cu
+index 1c749a9..44ea6cd 100644
+--- a/dev/issues/issue9_spikespace/global_atomicAdds/code_objects/neurongroup_thresholder_codeobject.cu
++++ b/dev/issues/issue9_spikespace/global_atomicAdds/code_objects/neurongroup_thresholder_codeobject.cu
+@@ -88,7 +88,8 @@ __global__ void kernel_neurongroup_thresholder_codeobject(
+ 
+  	
+  const int32_t i = _ptr_array_neurongroup_i[_idx];
+- const double _cond = true;//(i / 2) == ((i + 1) / 2);
++ //const double _cond = true;//(i / 2) == ((i + 1) / 2);
++  const double _cond = fmodf(float(i), float(500)) == 0;//(i / 2) == ((i + 1) / 2);
+ 
+ 	int32_t spike_index;
+ 
+diff --git a/dev/issues/issue9_spikespace/shared_atomicAdds/code_objects/neurongroup_thresholder_codeobject.cu b/dev/issues/issue9_spikespace/shared_atomicAdds/code_objects/neurongroup_thresholder_codeobject.cu
+index aa4b1e2..1a3800f 100644
+--- a/dev/issues/issue9_spikespace/shared_atomicAdds/code_objects/neurongroup_thresholder_codeobject.cu
++++ b/dev/issues/issue9_spikespace/shared_atomicAdds/code_objects/neurongroup_thresholder_codeobject.cu
+@@ -100,7 +100,7 @@ __global__ void kernel_neurongroup_thresholder_codeobject(
+ 
+  	
+  const int32_t i = _ptr_array_neurongroup_i[_idx];
+- const double _cond = true;//(i / 2) == ((i + 1) / 2);
++ const double _cond = fmodf(float(i),float(500)) == 0;//(i / 2) == ((i + 1) / 2);
+ 
+ 
+ 	int32_t spike_index;
+diff --git a/examples/STDP_multipost_neuronaltraces_standalone_cpp.py b/examples/STDP_multipost_neuronaltraces_standalone_cpp.py
+index 5776588..5827061 100644
+--- a/examples/STDP_multipost_neuronaltraces_standalone_cpp.py
++++ b/examples/STDP_multipost_neuronaltraces_standalone_cpp.py
+@@ -6,7 +6,7 @@ Adapted from Song, Miller and Abbott (2000) and Song and Abbott (2001).
+ This example is modified from ``synapses_STDP.py`` and writes a standalone
+ C++ project in the directory ``STDP_standalone``.
+ 
+-This version includes two further modifications: 
++This version includes two further modifications:
+ traces in neurons and multiple pre- _and_ postsynaptic neurons (s.t. no. synpases is N).
+ '''
+ import matplotlib
+diff --git a/examples/STDP_multipost_standalone_cpp.py b/examples/STDP_multipost_standalone_cpp.py
+index 2cec4ae..a43459f 100644
+--- a/examples/STDP_multipost_standalone_cpp.py
++++ b/examples/STDP_multipost_standalone_cpp.py
+@@ -6,7 +6,7 @@ Adapted from Song, Miller and Abbott (2000) and Song and Abbott (2001).
+ This example is modified from ``synapses_STDP.py`` and writes a standalone
+ C++ project in the directory ``STDP_standalone``.
+ 
+-This version includes a further modification: 
++This version includes a further modification:
+ multiple pre- _and_ postsynaptic neurons (s.t. no. synpases is N).
+ '''
+ import matplotlib
+diff --git a/examples/STDP_neuronaltraces_standalone_cpp.py b/examples/STDP_neuronaltraces_standalone_cpp.py
+index 4e0fc21..86590ca 100644
+--- a/examples/STDP_neuronaltraces_standalone_cpp.py
++++ b/examples/STDP_neuronaltraces_standalone_cpp.py
+@@ -15,7 +15,7 @@ import os
+ example_name = os.path.splitext(os.path.basename(__file__))[0]
+ 
+ from brian2 import *
+-set_device('cpp_standalone', directory=example_name, compile=True, run=True, debug=True)
++set_device('cpp_standalone', directory=example_name, compile=True, run=True, debug=False)
+ 
+ N = 1000
+ taum = 10*ms
+@@ -65,6 +65,8 @@ r_mon = PopulationRateMonitor(input_poisson)
+ 
+ run(100*second, report='text')
+ 
++print(profiling_summary())
++
+ subplot(311)
+ suptitle(example_name)
+ plot(S.w / gmax, '.k', ms=1)
+diff --git a/examples/STDP_standalone_cuda.py b/examples/STDP_standalone_cuda.py
+index 8807b28..d73b5a7 100644
+--- a/examples/STDP_standalone_cuda.py
++++ b/examples/STDP_standalone_cuda.py
+@@ -53,7 +53,7 @@ mon = StateMonitor(S, 'w', record=[0, 1])
+ s_mon = SpikeMonitor(input)
+ r_mon = PopulationRateMonitor(input)
+ 
+-run(100*second, report='text')
++run(1*second, report='text')
+ 
+ subplot(311)
+ suptitle('STDP_standalone_cuda')
+diff --git a/examples/cuba_cuda.py b/examples/cuba_cuda.py
+index 417f60d..eacbeae 100644
+--- a/examples/cuba_cuda.py
++++ b/examples/cuba_cuda.py
+@@ -37,11 +37,13 @@ Ci.connect('i>=3200', p=0.02)
+ 
+ s_mon = SpikeMonitor(P)
+ 
+-run(1 * second, report='text')
++run(10 * second, report='text')
+ 
+-plot(s_mon.t/ms, s_mon.i, '.k', ms=1)
+-title('CUBA_CUDA')
+-xlabel('Time (ms)')
+-ylabel('Neuron index')
+-savefig('CUBA_CUDA/CUBA_CUDA_rasterplot.png')
++print(profiling_summary())
++
++#plot(s_mon.t/ms, s_mon.i, '.k', ms=1)
++#title('CUBA_CUDA')
++#xlabel('Time (ms)')
++#ylabel('Neuron index')
++#savefig('CUBA_CUDA/CUBA_CUDA_rasterplot.png')
+ #show()
+diff --git a/frozen_repos/brian2 b/frozen_repos/brian2
+--- a/frozen_repos/brian2
++++ b/frozen_repos/brian2
+@@ -1 +1 @@
+-Subproject commit fadc6a0aeb90d1b4d343470628457d8561536f67
++Subproject commit fadc6a0aeb90d1b4d343470628457d8561536f67-dirty
+diff --git a/frozen_repos/brian2genn b/frozen_repos/brian2genn
+--- a/frozen_repos/brian2genn
++++ b/frozen_repos/brian2genn
+@@ -1 +1 @@
+-Subproject commit 0553cafeab49ea5403c0230411035df504d4db06
++Subproject commit 0553cafeab49ea5403c0230411035df504d4db06-dirty
+diff --git a/frozen_repos/genn b/frozen_repos/genn
+--- a/frozen_repos/genn
++++ b/frozen_repos/genn
+@@ -1 +1 @@
+-Subproject commit e01c85f18339249558d6e570ae976609dc972846
++Subproject commit e01c85f18339249558d6e570ae976609dc972846-dirty
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/log_complete_after_talk b/dev/benchmarks/results_2017_04_05_complete_after_talk/log_complete_after_talk
new file mode 100644
index 00000000..13ab8e8d
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/log_complete_after_talk
@@ -0,0 +1,1903 @@
+Directory with name `results_2017_04_05_complete_after_talk` already exists. Renaming it to `results_2017_04_05_complete_after_talk_bak_1491422611`.
+Saving results in results_2017_04_05_complete_after_talk/plots.
+Starting CUBA on 05.04.2017 at 22:03:31.
+Running speed tests
+Configurations: CUDA standalone, C++ standalone, GeNN_optimized
+Full examples: CUBA fixed connectivity:  n=10 [...] n=100 [...] n=1000 [...] n=10000 [...] n=100000 [...] n=1000000 [E..]WARNING    /home/denisalevi/anaconda2/envs/dev_b2c/lib/python2.7/site-packages/matplotlib/__init__.py:892: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
+  warnings.warn(self.msg_depr % (key, alt_key))
+ [py.warnings]
+
+
+TRACEBACK CUDA standalone N=1000000
+INFO: setting cudaDevice stuff took 0.265023 seconds
+INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject with 977 blocks and 1024 threads. Kernel needs 8 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory
+INFO connectivity matrix has size 15999736
+INFO connectivity matrix has size 64013467
+INFO calling kernel_neurongroup_stateupdater_codeobject with 1303 blocks and 768 threads. Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000.
+ERROR launching kernel_neurongroup_stateupdater_codeobject in code_objects/neurongroup_stateupdater_codeobject.cu:1101 invalid argument
+
+('debug syn effect mdoe ', 'target')
+('debug syn effect mdoe ', 'target')
+INFO: setting cudaDevice stuff took 0.265023 seconds
+INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject with 977 blocks and 1024 threads. Kernel needs 8 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory
+INFO connectivity matrix has size 15999736
+INFO connectivity matrix has size 64013467
+INFO calling kernel_neurongroup_stateupdater_codeobject with 1303 blocks and 768 threads. Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000.
+ERROR launching kernel_neurongroup_stateupdater_codeobject in code_objects/neurongroup_stateupdater_codeobject.cu:1101 invalid argument
+
+
+/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/utils/logger.py:546: UserWarning: Could not copy script file to temp directory: [Errno 2] No such file or directory: '/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/-c'
+  'Could not copy script file to temp directory: %s' % ex)
+INFO       No numerical integration method specified for group 'neurongroup', using method 'linear' (took 1.49s). [brian2.stateupdaters.base.method_choice]
+
+Traceback (most recent call last):
+  File "<string>", line 21, in <module>
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/brian2cuda/tests/features/cuda_configuration.py", line 27, in after_run
+    with_output=False)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/brian2cuda/device.py", line 778, in build
+    self.run(directory, with_output, run_args)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/devices/cpp_standalone/device.py", line 864, in run
+    "%s)" % os.path.abspath(directory))
+RuntimeError: Project run failed (project directory: /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/cuda_standalone/cuda_standalone)
+
+
+
+Running CUBA took 2:08:31.
+INFO relative performance for Full examples: CUBA fixed connectivity N=10 CUDA standalone: 1.0
+INFO relative performance for Full examples: CUBA fixed connectivity N=100 CUDA standalone: 1.0
+INFO relative performance for Full examples: CUBA fixed connectivity N=1000 CUDA standalone: 1.0
+INFO relative performance for Full examples: CUBA fixed connectivity N=10000 CUDA standalone: 1.0
+INFO relative performance for Full examples: CUBA fixed connectivity N=100000 CUDA standalone: 1.0
+INFO relative performance for Full examples: CUBA fixed connectivity N=1000000 CUDA standalone: nan
+INFO relative performance for Full examples: CUBA fixed connectivity N=10 C++ standalone: 9.50331627245
+INFO relative performance for Full examples: CUBA fixed connectivity N=100 C++ standalone: 8.5446853951
+INFO relative performance for Full examples: CUBA fixed connectINFO: setting cudaDevice stuff took 0.172666 seconds
+INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject with 1 blocks and 1024 threads. Kernel needs 8 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory
+INFO connectivity matrix has size 16108
+INFO connectivity matrix has size 64158
+INFO calling kernel_neurongroup_stateupdater_codeobject with 2 blocks and 768 threads. Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000.
+INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 11 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_synapses_1_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_neurongroup_resetter_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+Number of synapses: 64158
+Number of synapses: 16108
+INFO: main_lines took 5.325889 seconds
+INFO: main function took 5.523285 seconds
+mkdir: cannot create directory ‘test_output’: File exists
+# DT 0.000100 
+# totalTime 1.000000 
+# We are running with fixed time step 0.000100 
+0.9999 done ...
+everything finished.
+ivity N=1000 C++ standalone: 3.79737620222
+INFO relative performance for Full examples: CUBA fixed connectivity N=10000 C++ standalone: 0.797678384644
+INFO relative performance for Full examples: CUBA fixed connectivity N=100000 C++ standalone: 0.440223048132
+INFO relative performance for Full examples: CUBA fixed connectivity N=1000000 C++ standalone: nan
+INFO relative performance for Full examples: CUBA fixed connectivity N=10 GeNN_optimized: 2.12816895677
+INFO relative performance for Full examples: CUBA fixed connectivity N=100 GeNN_optimized: 1.93923218414
+INFO relative performance for Full examples: CUBA fixed connectivity N=1000 GeNN_optimized: 1.75435949027
+INFO relative performance for Full examples: CUBA fixed connectivity N=10000 GeNN_optimized: 1.43439787216
+INFO relative performance for Full examples: CUBA fixed connectivity N=100000 GeNN_optimized: 0.625130402294
+INFO relative performance for Full examples: CUBA fixed connectivity N=1000000 GeNN_optimized: nan
+Rerunning CUDAStandaloneConfiguration with n = 1000 for nvprof profiling
+cd cuda_standalone && nvprof --profile-from-start-off --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_CUBA_CUDAStandaloneConfiguration_1000.log ./main 
+Profiling took 0:00:10 for runtime of 0.355534
+Rerunning GeNNConfigurationOptimized with n = 1000 for nvprof profiling
+cd GeNNworkspace && nvprof  --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_CUBA_GeNNConfigurationOptimized_1000.log ./main test 1.0 1
+Profiling took 0:00:02 for runtime of 0.199734
+Starting COBAHH on 06.04.2017 at 00:13:16.
+Running speed tests
+Configurations: CUDA standalone, C++ standalone, GeNN_optimized
+Full examples: COBAHH:  n=10 [...] n=100 [...] n=1000 [...] n=10000 [...] n=20000 [...] n=50000 [...] n=100000 [...]INFO: setting cudaDevice stuff took 0.156221 seconds
+INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory
+INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject_1 with 1 blocks and 1024 threads. Kernel needs 7 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory
+INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject_2 with 1 blocks and 1024 threads. Kernel needs 7 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory
+INFO connectivity matrix has size 3949
+INFO connectivity matrix has size 15742
+INFO calling kernel_neurongroup_stateupdater_codeobject with 2 blocks and 512 threads. Kernel needs 109 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.250000.
+INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 11 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_synapses_1_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+Number of synapses: 3949
+Number of synapses: 15742
+INFO: main_lines took 4.652397 seconds
+INFO: main function took 4.834268 seconds
+mkdir: cannot create directory ‘test_output’: File exists
+# DT 0.000100 
+# totalTime 1.000000 
+# We are running with fixed time step 0.000100 
+0.9999 done ...
+everything finished.
+
+Running COBAHH took 0:55:44.
+INFO relative performance for Full examples: COBAHH N=10 CUDA standalone: 1.0
+INFO relative performance for Full examples: COBAHH N=100 CUDA standalone: 1.0
+INFO relative performance for Full examples: COBAHH N=1000 CUDA standalone: 1.0
+INFO relative performance for Full examples: COBAHH N=10000 CUDA standalone: 1.0
+INFO relative performance for Full examples: COBAHH N=20000 CUDA standalone: 1.0
+INFO relative performance for Full examples: COBAHH N=50000 CUDA standalone: 1.0
+INFO relative performance for Full examples: COBAHH N=100000 CUDA standalone: 1.0
+INFO relative performance for Full examples: COBAHH N=10 C++ standalone: 5.97680043162
+INFO relative performance for Full examples: COBAHH N=100 C++ standalone: 1.00371226203
+INFO relative performance for Full examples: COBAHH N=1000 C++ standalone: 0.154618074816
+INFO relative performance for Full examples: COBAHH N=10000 C++ standalone: 0.0396381192455
+INFO relative performance for Full examples: COBAHH N=20000 C++ standalone: 0.0253531218652
+INFO relative performance for Full examples: COBAHH N=50000 C++ standalone: 0.0106447160514
+INFO relative performance for Full examples: COBAHH N=100000 C++ standalone: 0.00914741719581
+INFO relative performance for Full examples: COBAHH N=10 GeNN_optimized: 1.08714146487
+INFO relative performance for Full examples: COBAHH N=100 GeNN_optimized: 1.20794915957
+INFO relative performance for Full examples: COBAHH N=1000 GeNN_optimized: 1.32626500874
+INFO relative performance for Full examples: COBAHH N=10000 GeNN_optimized: 1.27813023083
+INFO relative performance for Full examples: COBAHH N=20000 GeNN_optimized: 2.12777862025
+INFO relative performance for Full examples: COBAHH N=50000 GeNN_optimized: 1.22069742475
+INFO relative performance for Full examples: COBAHH N=100000 GeNN_optimized: 1.09531764141
+Rerunning CUDAStandaloneConfiguration with n = 1000 for nvprof profiling
+cd cuda_standalone && nvprof --profile-from-start-off --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_COBAHH_CUDAStandaloneConfiguration_1000.log ./main 
+Profiling took 0:00:09 for runtime of 0.531296
+Rerunning GeNNConfigurationOptimized with n = 1000 for nvprof profiling
+cd GeNNworkspace && nvprof  --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_COBAHH_GeNNConfigurationOptimized_1000.log ./main test 1.0 1
+Profiling took 0:00:03 for runtime of 0.409076
+Starting AdaptationOscillation on 06.04.2017 at 01:10:20.
+Running speed tests
+Configurations: CUDA standalone, C++ standalone, GeNN_optimized
+Full examples: Adaptation oscillation:  n=10 [...] n=100 [...] n=1000 [...] n=10000 [...] n=20000 [...] n=50000 [...] n=100000 [E.E]
+
+TRACEBACK CUDA standalone N=100000
+INFO: setting cudaDevice stuff took 0.311282 seconds
+INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject with 98 blocks and 1024 threads. Kernel needs 8 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory
+INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject_1 with 98 blocks and 1024 threads. Kernel needs 8 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory
+
+('debug syn effect mdoe ', 'target')
+INFO: setting cudaDevice stuff took 0.311282 seconds
+INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject with 98 blocks and 1024 threads. Kernel needs 8 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory
+INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject_1 with 98 blocks and 1024 threads. Kernel needs 8 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory
+
+
+/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/utils/logger.py:546: UserWarning: Could not copy script file to temp directory: [Errno 2] No such file or directory: '/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/-c'
+  'Could not copy script file to temp directory: %s' % ex)
+INFO       No numerical integration method specified for group 'neurongroup', using method 'euler' (took 0.14s, trying other methods took 0.00s). [brian2.stateupdaters.base.method_choice]
+terminate called after throwing an instance of 'thrust::system::detail::bad_alloc'
+  what():  std::bad_alloc: out of memory
+
+Traceback (most recent call last):
+  File "<string>", line 21, in <module>
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/brian2cuda/tests/features/cuda_configuration.py", line 27, in after_run
+    with_output=False)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/brian2cuda/device.py", line 778, in build
+    self.run(directory, with_output, run_args)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/devices/cpp_standalone/device.py", line 864, in run
+    "%s)" % os.path.abspath(directory))
+RuntimeError: Project run failed (project directory: /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/cuda_standalone/cuda_standalone)
+
+
+
+
+TRACEBACK GeNN_optimized N=100000
+no stdout file found, cwd = /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNWorkspace/results/stdout.txt
+running brian code generation ...
+building genn executable ...
+executing genn binary on GPU ...
+
+/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/utils/logger.py:546: UserWarning: Could not copy script file to temp directory: [Errno 2] No such file or directory: '/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/-c'
+  'Could not copy script file to temp directory: %s' % ex)
+WARNING    The selected device 'genn' only supports a fixed schedule, but this schedule is not consistent with the network's schedule. The simulation will use the device's schedule.
+Device schedule: ['start', 'synapses', 'groups', 'thresholds', 'resets', 'end']
+Network schedule: ['start', 'groups', 'thresholds', 'synapses', 'resets', 'end']
+Set the network schedule explicitly or set the core.network.default_schedule preference to avoid this warning. [brian2.core.network.schedule_conflict]
+INFO       No numerical integration method specified for group 'neurongroup', using method 'euler' (took 0.12s, trying other methods took 0.00s). [brian2.stateupdaters.base.method_choice]
+mkdir: cannot create directory ‘test_output’: File exists
+# DT 0.000100 
+# totalTime 1.000000 
+definitions.h: 102: cuda runtime error 2: out of memory
+
+Traceback (most recent call last):
+  File "<string>", line 14, in <module>
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/brian2cuda/tests/features/speed.py", line 89, in run
+    self.timed_run(self.duration)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/tests/features/base.py", line 63, in timed_run
+    brian2.run(duration, level=1)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/units/fundamentalunits.py", line 2428, in new_f
+    result = f(*args, **kwds)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/core/magic.py", line 371, in run
+    namespace=namespace, profile=profile, level=2+level)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/core/magic.py", line 231, in run
+    namespace=namespace, profile=profile, level=level+1)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/core/base.py", line 276, in device_override_decorated_function
+    return getattr(curdev, name)(*args, **kwds)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2genn/brian2genn/device.py", line 1206, in network_run
+    super(GeNNDevice, self).network_run(net=net, duration=duration, report=report, report_period=report_period, namespace=namespace, level=level+1)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/devices/cpp_standalone/device.py", line 1171, in network_run
+    self.build(direct_call=False, **self.build_options)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2genn/brian2genn/device.py", line 592, in build
+    returncode=ex.returncode)
+RuntimeError: Project run failed (Command ['./main', 'test', '1.0', '1'] failed with error code 1).
+See the output above (if any) for more details.
+
+
+
+Running AdaptationOscillation took 1:41:42.
+INFO relative performance for Full examples: Adaptation oscillation N=10 CUDA standalone: 1.0
+INFO relative performance for Full examples: Adaptation oscillation N=100 CUDA standalone: 1.0
+INFO relative performance for Full examples: Adaptation oscillation N=1000 CUDA standalone: 1.0
+INFO relative performance for Full examples: Adaptation oscillation N=10000 CUDA standalone: 1.0
+INFO relative performance for Full examples: Adaptation oscillation N=20000 CUDA standalone: 1.0
+INFO relative performance for Full examples: Adaptation oscillation N=50000 CUDA standalone: 1.0
+INFO relative performance for Full examples: Adaptation oscillation N=100000 CUDA standalone: nan
+INFO relative performance for Full examples: Adaptation oscillation N=10 C++ standalone: 8.59983763369
+INFO relative performance for Full examples: Adaptation oscillation N=100 C++ standalone: 2.8440796921
+INFO relative performance for Full examples: Adaptation oscillation N=1000 C++ standalone: 0.506496466226
+INFO relative performance for Full examples: Adaptation oscillation N=10000 C++ standalone: 0.214522626694
+INFO relative performance for Full examples: Adaptation oscillation N=20000 C++ standalone: 0.170045431942
+INFO relative performance for Full examples: Adaptation oscillation N=50000 C++ standalone: 0.109420242272
+INFO relative performance for Full examples: Adaptation oscillation N=100000 C++ standalone: nan
+INFO relative performance for Full examples: Adaptation oscillation N=10 GeNN_optimized: 1.54028322165
+INFO relative performance for Full examples: Adaptation oscillation N=100 GeNN_optimized: 1.37097363869
+INFO relative performance for Full examples: Adaptation oscillation N=1000 GeNN_optimized: 1.2270477INFO: setting cudaDevice stuff took 0.149325 seconds
+INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject with 1 blocks and 1024 threads. Kernel needs 8 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory
+INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject_1 with 1 blocks and 1024 threads. Kernel needs 8 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory
+INFO calling kernel_synapses_group_variable_set_conditional_codeobject with 49 blocks and 1024 threads. Kernel needs 6 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory
+INFO calling kernel_synapses_group_variable_set_conditional_codeobject_1 with 49 blocks and 1024 threads. Kernel needs 6 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory
+INFO connectivity matrix has size 49931
+INFO generating 10000000 randn every 13107 clock cycles for neurongroup_stateupdater_codeobject
+INFO calling kernel_neurongroup_stateupdater_codeobject with 2 blocks and 576 threads. Kernel needs 52 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.562500.
+INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 11 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 28 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_neurongroup_resetter_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+Number of synapses: 49931
+INFO: main_lines took 4.071034 seconds
+INFO: main function took 4.240770 seconds
+mkdir: cannot create directory ‘test_output’: File exists
+# DT 0.000100 
+# totalTime 1.000000 
+# We are running with fixed time step 0.000100 
+0.9999 done ...
+everything finished.
+4328
+INFO relative performance for Full examples: Adaptation oscillation N=10000 GeNN_optimized: 1.08010242282
+INFO relative performance for Full examples: Adaptation oscillation N=20000 GeNN_optimized: 1.04654988332
+INFO relative performance for Full examples: Adaptation oscillation N=50000 GeNN_optimized: 0.961954575197
+INFO relative performance for Full examples: Adaptation oscillation N=100000 GeNN_optimized: nan
+Rerunning CUDAStandaloneConfiguration with n = 1000 for nvprof profiling
+cd cuda_standalone && nvprof --profile-from-start-off --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_AdaptationOscillation_CUDAStandaloneConfiguration_1000.log ./main 
+Profiling took 0:00:07 for runtime of 0.372608
+Rerunning GeNNConfigurationOptimized with n = 1000 for nvprof profiling
+cd GeNNworkspace && nvprof  --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_AdaptationOscillation_GeNNConfigurationOptimized_1000.log ./main test 1.0 1
+Profiling took 0:00:03 for runtime of 0.311908
+Starting Vogels on 06.04.2017 at 02:53:45.
+Running speed tests
+Configurations: CUDA standalone, C++ standalone, GeNN_optimized
+Full examples: Vogels et al 2011 (event-driven synapses):  n=10 [...] n=100 [...] n=1000 [...] n=10000 [...] n=20000 [...] n=50000 [...] n=100000 [E..]
+
+TRACEBACK CUDA standalone N=100000
+INFO: setting cudaDevice stuff took 0.263336 seconds
+INFO connectivity matrix has size 7997654
+INFO connectivity matrix has size 31988320
+INFO connectivity matrix has size 159989507
+INFO connectivity matrix has size 31988320
+INFO calling kernel_neurongroup_stateupdater_codeobject with 131 blocks and 768 threads. Kernel needs 40 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000.
+ERROR launching kernel_neurongroup_stateupdater_codeobject in code_objects/neurongroup_stateupdater_codeobject.cu:1066 invalid argument
+
+('debug syn effect mdoe ', 'target')
+('debug syn effect mdoe ', 'target')
+('debug syn effect mdoe ', 'target')
+('debug syn effect mdoe ', 'synapse')
+INFO: setting cudaDevice stuff took 0.263336 seconds
+INFO connectivity matrix has size 7997654
+INFO connectivity matrix has size 31988320
+INFO connectivity matrix has size 159989507
+INFO connectivity matrix has size 31988320
+INFO calling kernel_neurongroup_stateupdater_codeobject with 131 blocks and 768 threads. Kernel needs 40 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000.
+ERROR launching kernel_neurongroup_stateupdater_codeobject in code_objects/neurongroup_stateupdater_codeobject.cu:1066 invalid argument
+
+
+/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/utils/logger.py:546: UserWarning: Could not copy script file to temp directory: [Errno 2] No such file or directory: '/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/-c'
+  'Could not copy script file to temp directory: %s' % ex)
+INFO       No numerical integration method specified for group 'neurongroup', using method 'euler' (took 0.06s, trying other methods took 0.11s). [brian2.stateupdaters.base.method_choice]
+
+Traceback (most recent call last):
+  File "<string>", line 21, in <module>
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/brian2cuda/tests/features/cuda_configuration.py", line 27, in after_run
+    with_output=False)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/brian2cuda/device.py", line 778, in build
+    self.run(directory, with_output, run_args)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/devices/cpp_standalone/device.py", line 864, in run
+    "%s)" % os.path.abspath(directory))
+RuntimeError: Project run failed (project directory: /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/cuda_standalone/cuda_standalone)
+
+
+
+Running Vogels took 1:24:04.
+INFO relative performance for Full examples: Vogels et al 2011 (event-driven synapses) N=10 CUDA standalone: 1.0
+INFO relative performance for Full examples: Vogels et al 2011 (event-driven synapses) N=100 CUDA standalone: 1.0
+INFO relative performance for Full examples: Vogels et al 2011 (event-driven synapses) N=1000 CUDA standalone: 1.0
+INFO relative performance for Full examples: Vogels et al 2011 (event-driven synapses) N=10000 CUDA standalone: 1.0
+INFO relative performance for Full examples: Vogels et al 2011 (event-driven synapses) N=20000 CUDA standalone: 1.0
+INFO relative performance for Full examples: Vogels et al 2011 (event-driven synapses) N=50000 CUDA standalone: 1.0
+INFO relative performance for Full examples: Vogels et al 2011 (event-driven synapses) N=100000 CUDA standalone: nan
+INFO relative performance for Full examples: Vogels et al 2011 (event-driven synapses) N=10 C++ standalone: 5.81708231538
+INFO relative performance for Full examples: Vogels et al 2011 (event-driven synapses) N=100 C++ standalone: 8.52257702632
+INFO relative performance for Full examples: Vogels et al 2011 (event-driven synapses) N=1000 C++ standalone: 3.6497INFO: setting cudaDevice stuff took 0.201820 seconds
+INFO connectivity matrix has size 776
+INFO connectivity matrix has size 3224
+INFO connectivity matrix has size 15811
+INFO connectivity matrix has size 3224
+INFO calling kernel_neurongroup_stateupdater_codeobject with 2 blocks and 768 threads. Kernel needs 40 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000.
+INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 11 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_synapses_1_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_synapses_2_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 42 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.500000.
+INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_synapses_2_post_codeobject with 15 blocks and 1024 threads. Kernel needs 34 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.500000.
+INFO calling kernel_neurongroup_resetter_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+Number of synapses: 3224
+Number of synapses: 3224
+Number of synapses: 15811
+Number of synapses: 776
+INFO: main_lines took 7.959584 seconds
+INFO: main function took 8.189923 seconds
+mkdir: cannot create directory ‘test_output’: File exists
+# DT 0.000100 
+# totalTime 1.000000 
+# We are running with fixed time step 0.000100 
+0.9999 done ...
+everything finished.
+7700535
+INFO relative performance for Full examples: Vogels et al 2011 (event-driven synapses) N=10000 C++ standalone: 1.27385995798
+INFO relative performance for Full examples: Vogels et al 2011 (event-driven synapses) N=20000 C++ standalone: 0.756183613604
+INFO relative performance for Full examples: Vogels et al 2011 (event-driven synapses) N=50000 C++ standalone: 0.427712217638
+INFO relative performance for Full examples: Vogels et al 2011 (event-driven synapses) N=100000 C++ standalone: nan
+INFO relative performance for Full examples: Vogels et al 2011 (event-driven synapses) N=10 GeNN_optimized: 1.48314163249
+INFO relative performance for Full examples: Vogels et al 2011 (event-driven synapses) N=100 GeNN_optimized: 2.17682514992
+INFO relative performance for Full examples: Vogels et al 2011 (event-driven synapses) N=1000 GeNN_optimized: 1.16598307673
+INFO relative performance for Full examples: Vogels et al 2011 (event-driven synapses) N=10000 GeNN_optimized: 0.540424662951
+INFO relative performance for Full examples: Vogels et al 2011 (event-driven synapses) N=20000 GeNN_optimized: 0.560432413335
+INFO relative performance for Full examples: Vogels et al 2011 (event-driven synapses) N=50000 GeNN_optimized: 0.536937885623
+INFO relative performance for Full examples: Vogels et al 2011 (event-driven synapses) N=100000 GeNN_optimized: nan
+Rerunning CUDAStandaloneConfiguration with n = 1000 for nvprof profiling
+cd cuda_standalone && nvprof --profile-from-start-off --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_Vogels_CUDAStandaloneConfiguration_1000.log ./main 
+Profiling took 0:00:15 for runtime of 0.85962
+Rerunning GeNNConfigurationOptimized with n = 1000 for nvprof profiling
+cd GeNNworkspace && nvprof  --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_Vogels_GeNNConfigurationOptimized_1000.log ./main test 1.0 1
+Profiling took 0:00:04 for runtime of 0.740019
+Starting STDP on 06.04.2017 at 04:19:26.
+Running speed tests
+Configurations: CUDA standalone, C++ standalone, GeNN_optimized
+Full examples: STDP with Poisson input:  n=100 [...] n=500 [...] n=1000 [...] n=5000 [...] n=10000 [...] n=50000 [...] n=100000 [...] n=500000 [...] n=1000000 [...]INFO: setting cudaDevice stuff took 0.313461 seconds
+INFO calling kernel_synapses_group_variable_set_conditional_codeobject with 1 blocks and 1024 threads. Kernel needs 6 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory
+INFO connectivity matrix has size 1000
+INFO connectivity matrix has size 1000
+INFO generating 10000000 rand every 13107 clock cycles for poissongroup_thresholder_codeobject
+INFO calling kernel_neurongroup_stateupdater_codeobject with 1 blocks and 768 threads. Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000.
+INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 9 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_poissongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 15 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_spikemonitor_codeobject with 1 blocks and 1 threads. Kernel needs 37 registers per block, 0 bytes of statically-allocated shared memory per block, 16 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.000000.
+INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 42 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.500000.
+INFO calling kernel_synapses_post_codeobject with 15 blocks and 1024 threads. Kernel needs 34 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.500000.
+INFO calling kernel_neurongroup_resetter_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+Number of synapses: 1000
+Number of synapses: 1000
+INFO: main_lines took 6.937627 seconds
+Number of spikes: 14845
+INFO: main function took 7.299253 seconds
+mkdir: cannot create directory ‘test_output’: File exists
+# DT 0.000100 
+# totalTime 1.000000 
+# We are running with fixed time step 0.000100 
+0.9999 done ...
+everything finished.
+
+Running STDP took 1:20:32.
+INFO relative performance for Full examples: STDP with Poisson input N=100 CUDA standalone: 1.0
+INFO relative performance for Full examples: STDP with Poisson input N=500 CUDA standalone: 1.0
+INFO relative performance for Full examples: STDP with Poisson input N=1000 CUDA standalone: 1.0
+INFO relative performance for Full examples: STDP with Poisson input N=5000 CUDA standalone: 1.0
+INFO relative performance for Full examples: STDP with Poisson input N=10000 CUDA standalone: 1.0
+INFO relative performance for Full examples: STDP with Poisson input N=50000 CUDA standalone: 1.0
+INFO relative performance for Full examples: STDP with Poisson input N=100000 CUDA standalone: 1.0
+INFO relative performance for Full examples: STDP with Poisson input N=500000 CUDA standalone: 1.0
+INFO relative performance for Full examples: STDP with Poisson input N=1000000 CUDA standalone: 1.0
+INFO relative performance for Full examples: STDP with Poisson input N=100 C++ standalone: 8.88338818143
+INFO relative performance for Full examples: STDP with Poisson input N=500 C++ standalone: 5.09683126402
+INFO relative performance for Full examples: STDP with Poisson input N=1000 C++ standalone: 3.43176369337
+INFO relative performance for Full examples: STDP with Poisson input N=5000 C++ standalone: 1.23369418761
+INFO relative performance for Full examples: STDP with Poisson input N=10000 C++ standalone: 0.740582551232
+INFO relative performance for Full examples: STDP with Poisson input N=50000 C++ standalone: 0.373019575212
+INFO relative performance for Full examples: STDP with Poisson input N=100000 C++ standalone: 0.364799663468
+INFO relative performance for Full examples: STDP with Poisson input N=500000 C++ standalone: 2.96351279504
+INFO relative performance for Full examples: STDP with Poisson input N=1000000 C++ standalone: 4.70050294368
+INFO relative performance for Full examples: STDP with Poisson input N=100 GeNN_optimized: 1.30974438102
+INFO relative performance for Full examples: STDP with Poisson input N=500 GeNN_optimized: 1.18471683703
+INFO relative performance for Full examples: STDP with Poisson input N=1000 GeNN_optimized: 1.16060458009
+INFO relative performance for Full examples: STDP with Poisson input N=5000 GeNN_optimized: 1.56393492832
+INFO relative performance for Full examples: STDP with Poisson input N=10000 GeNN_optimized: 1.7254821012
+INFO relative performance for Full examples: STDP with Poisson input N=50000 GeNN_optimized: 1.69780564815
+INFO relative performance for Full examples: STDP with Poisson input N=100000 GeNN_optimized: 1.68674045475
+INFO relative performance for Full examples: STDP with Poisson input N=500000 GeNN_optimized: 14.7726180354
+INFO relative performance for Full examples: STDP with Poisson input N=1000000 GeNN_optimized: 23.369044153
+Rerunning CUDAStandaloneConfiguration with n = 1000 for nvprof profiling
+cd cuda_standalone && nvprof --profile-from-start-off --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_STDP_CUDAStandaloneConfiguration_1000.log ./main 
+Profiling took 0:00:12 for runtime of 0.558918
+Rerunning GeNNConfigurationOptimized with n = 1000 for nvprof profiling
+cd GeNNworkspace && nvprof  --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_STDP_GeNNConfigurationOptimized_1000.log ./main test 1.0 1
+Profiling took 0:00:04 for runtime of 0.479759
+Starting STDPEventDriven on 06.04.2017 at 05:41:19.
+Running speed tests
+Configurations: CUDA standalone, C++ standalone, GeNN_optimized
+Full examples: STDP (event-driven):  n=10 [...] n=100 [...] n=1000 [...] n=10000 [...] n=20000 [...] n=50000 [...] n=100000 [...] n=1000000 [...] n=5000000 [...]INFO: setting cudaDevice stuff took 0.328379 seconds
+INFO calling kernel_synapses_group_variable_set_conditional_codeobject with 1 blocks and 1024 threads. Kernel needs 6 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory
+INFO connectivity matrix has size 1000
+INFO connectivity matrix has size 1000
+INFO generating 10000000 rand every 13107 clock cycles for poissongroup_thresholder_codeobject
+INFO calling kernel_neurongroup_stateupdater_codeobject with 1 blocks and 768 threads. Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000.
+INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 9 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_poissongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 15 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 42 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.500000.
+INFO calling kernel_synapses_post_codeobject with 15 blocks and 1024 threads. Kernel needs 34 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.500000.
+INFO calling kernel_neurongroup_resetter_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+Number of synapses: 1000
+Number of synapses: 1000
+INFO: main_lines took 5.794570 seconds
+INFO: main function took 6.145385 seconds
+mkdir: cannot create directory ‘test_output’: File exists
+# DT 0.000100 
+# totalTime 1.000000 
+# We are running with fixed time step 0.000100 
+0.9999 done ...
+everything finished.
+
+Running STDPEventDriven took 1:19:53.
+INFO relative performance for Full examples: STDP (event-driven) N=10 CUDA standalone: 1.0
+INFO relative performance for Full examples: STDP (event-driven) N=100 CUDA standalone: 1.0
+INFO relative performance for Full examples: STDP (event-driven) N=1000 CUDA standalone: 1.0
+INFO relative performance for Full examples: STDP (event-driven) N=10000 CUDA standalone: 1.0
+INFO relative performance for Full examples: STDP (event-driven) N=20000 CUDA standalone: 1.0
+INFO relative performance for Full examples: STDP (event-driven) N=50000 CUDA standalone: 1.0
+INFO relative performance for Full examples: STDP (event-driven) N=100000 CUDA standalone: 1.0
+INFO relative performance for Full examples: STDP (event-driven) N=1000000 CUDA standalone: 1.0
+INFO relative performance for Full examples: STDP (event-driven) N=5000000 CUDA standalone: 1.0
+INFO relative performance for Full examples: STDP (event-driven) N=10 C++ standalone: 11.2497943354
+INFO relative performance for Full examples: STDP (event-driven) N=100 C++ standalone: 8.31207995112
+INFO relative performance for Full examples: STDP (event-driven) N=1000 C++ standalone: 2.61972877926
+INFO relative performance for Full examples: STDP (event-driven) N=10000 C++ standalone: 0.288795267181
+INFO relative performance for Full examples: STDP (event-driven) N=20000 C++ standalone: 0.142747274164
+INFO relative performance for Full examples: STDP (event-driven) N=50000 C++ standalone: 0.127502322902
+INFO relative performance for Full examples: STDP (event-driven) N=100000 C++ standalone: 0.124020384785
+INFO relative performance for Full examples: STDP (event-driven) N=1000000 C++ standalone: 0.111562986518
+INFO relative performance for Full examples: STDP (event-driven) N=5000000 C++ standalone: 0.115750281196
+INFO relative performance for Full examples: STDP (event-driven) N=10 GeNN_optimized: 2.76946556051
+INFO relative performance for Full examples: STDP (event-driven) N=100 GeNN_optimized: 2.56034789856
+INFO relative performance for Full examples: STDP (event-driven) N=1000 GeNN_optimized: 1.81374117769
+INFO relative performance for Full examples: STDP (event-driven) N=10000 GeNN_optimized: 0.854456214358
+INFO relative performance for Full examples: STDP (event-driven) N=20000 GeNN_optimized: 0.725383414514
+INFO relative performance for Full examples: STDP (event-driven) N=50000 GeNN_optimized: 0.633313911843
+INFO relative performance for Full examples: STDP (event-driven) N=100000 GeNN_optimized: 0.609749101371
+INFO relative performance for Full examples: STDP (event-driven) N=1000000 GeNN_optimized: 0.578689800398
+INFO relative performance for Full examples: STDP (event-driven) N=5000000 GeNN_optimized: 0.607900794592
+Rerunning CUDAStandaloneConfiguration with n = 1000 for nvprof profiling
+cd cuda_standalone && nvprof --profile-from-start-off --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPEventDriven_CUDAStandaloneConfiguration_1000.log ./main 
+Profiling took 0:00:10 for runtime of 0.415781
+Rerunning GeNNConfigurationOptimized with n = 1000 for nvprof profiling
+cd GeNNworkspace && nvprof  --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPEventDriven_GeNNConfigurationOptimized_1000.log ./main test 1.0 1
+Profiling took 0:00:02 for runtime of 0.232967
+Starting BrunelHakimModelScalarDelay on 06.04.2017 at 07:02:28.
+Running speed tests
+Configurations: CUDA standalone, C++ standalone, GeNN_optimized
+Full examples: Brunel Hakim with scalar delay (1s):  n=10 [...] n=100 [...] n=1000 [...] n=10000 [...] n=20000 [...] n=50000 [...] n=100000 [...] n=250000 [...]INFO: setting cudaDevice stuff took 0.303419 seconds
+INFO connectivity matrix has size 1000000
+INFO generating 10000000 randn every 13107 clock cycles for neurongroup_stateupdater_codeobject
+INFO calling kernel_neurongroup_stateupdater_codeobject with 2 blocks and 576 threads. Kernel needs 52 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.562500.
+INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 11 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_neurongroup_resetter_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+Number of synapses: 1000000
+INFO: main_lines took 4.771391 seconds
+INFO: main function took 5.118664 seconds
+mkdir: cannot create directory ‘test_output’: File exists
+# DT 0.000100 
+# totalTime 1.000000 
+# We are running with fixed time step 0.000100 
+0.9999 done ...
+everything finished.
+
+Running BrunelHakimModelScalarDelay took 2:11:11.
+INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s) N=10 CUDA standalone: 1.0
+INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s) N=100 CUDA standalone: 1.0
+INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s) N=1000 CUDA standalone: 1.0
+INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s) N=10000 CUDA standalone: 1.0
+INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s) N=20000 CUDA standalone: 1.0
+INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s) N=50000 CUDA standalone: 1.0
+INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s) N=100000 CUDA standalone: 1.0
+INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s) N=250000 CUDA standalone: 1.0
+INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s) N=10 C++ standalone: 9.78843893145
+INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s) N=100 C++ standalone: 2.88689717066
+INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s) N=1000 C++ standalone: 0.39678676061
+INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s) N=10000 C++ standalone: 0.0579548817132
+INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s) N=20000 C++ standalone: 0.043471563887
+INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s) N=50000 C++ standalone: 0.032781986812
+INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s) N=100000 C++ standalone: 0.0294157569658
+INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s) N=250000 C++ standalone: 0.0305941911305
+INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s) N=10 GeNN_optimized: 1.58033861921
+INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s) N=100 GeNN_optimized: 1.39364952685
+INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s) N=1000 GeNN_optimized: 1.28814277906
+INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s) N=10000 GeNN_optimized: 1.15879715758
+INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s) N=20000 GeNN_optimized: 1.08175176136
+INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s) N=50000 GeNN_optimized: 0.884117890361
+INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s) N=100000 GeNN_optimized: 0.770480233171
+INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s) N=250000 GeNN_optimized: 0.457457403795
+Rerunning CUDAStandaloneConfiguration with n = 1000 for nvprof profiling
+cd cuda_standalone && nvprof --profile-from-start-off --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_BrunelHakimModelScalarDelay_CUDAStandaloneConfiguration_1000.log ./main 
+Profiling took 0:00:09 for runtime of 0.248229
+Rerunning GeNNConfigurationOptimized with n = 1000 for nvprof profiling
+cd GeNNworkspace && nvprof  --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_BrunelHakimModelScalarDelay_GeNNConfigurationOptimized_1000.log ./main test 1.0 1
+Profiling took 0:00:04 for runtime of 0.195981
+Starting BrunelHakimModelScalarDelayNoMultiPrePost on 06.04.2017 at 09:15:13.
+Running speed tests
+Configurations: CUDA standalone, C++ standalone, GeNN_optimized
+Full examples: Brunel Hakim with scalar delay (1s, no multip pre-post connections):  n=10 [...] n=100 [...] n=1000 [...] n=10000 [...] n=20000 [...] n=50000 [...] n=100000 [...] n=250000 [...]INFO: setting cudaDevice stuff took 0.171453 seconds
+INFO connectivity matrix has size 999000
+INFO generating 10000000 randn every 13107 clock cycles for neurongroup_stateupdater_codeobject
+INFO calling kernel_neurongroup_stateupdater_codeobject with 2 blocks and 576 threads. Kernel needs 52 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.562500.
+INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 11 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_neurongroup_resetter_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+Number of synapses: 999000
+INFO: main_lines took 4.843546 seconds
+INFO: main function took 5.059406 seconds
+
+Running BrunelHakimModelScalarDelayNoMultiPrePost took 2:09:50.
+INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s, no multip pre-post connections) N=10 CUDA standalone: 1.0
+INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s, no multip pre-post connections) N=100 CUDA standalone: 1.0
+INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s, no multip pre-post connections) N=1000 CUDA standalone: 1.0
+INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s, no multip pre-post connections) N=10000 CUDA standalone: 1.0
+INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s, no multip pre-post connections) N=20000 CUDA standalone: 1.0
+INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s, no multip pre-post connections) N=50000 CUDA standalone: 1.0
+INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s, no multip pre-post connections) N=100000 CUDA standalone: 1.0
+INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s, no multip pre-post connections) N=250000 CUDA standalone: 1.0
+INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s, no multip pre-post connections) N=10 C++ standalone: 8.20634264047
+INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s, no multip pre-post connections) N=100 C++ standalone: 2.89509603547
+INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s, no multip pre-post connections) N=1000 C++ standalone: 0.362216843115
+INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s, no multip pre-post connections) N=10000 C++ standalone: 0.0614173832433
+INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s, no multip pre-post connections) N=20000 C++ standalone: 0.0440634627728
+INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s, no multip pre-post connections) N=50000 C++ standalone: 0.0323258939735
+INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s, no multip pre-post connections) N=100000 C++ standalone: 0.0295636812952
+INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s, no multip pre-post connections) N=250000 C++ standalone: 0.0301825412509
+INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s, no multip pre-post connections) N=10 GeNN_optimized: 1.55619035168
+INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s, no multip pre-post connections) N=100 GeNN_optimized: 1.39476926975
+INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s, no multip pre-post connections) N=1000 GeNN_optimized: 1.32290219232
+INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s, no multip pre-post connections) N=10000 GeNN_optimized: 1.19834218479
+INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s, no multip pre-post connections) N=20000 GeNN_optimized: 1.08968345615
+INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s, no multip pre-post connections) N=50000 GeNN_optimized: 0.88300034153
+INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s, no multip pre-post connections) N=100000 GeNN_optimized: 0.76787687807
+INFO relative performance for Full examples: Brunel Hakim with scalar delay (1s, no multip pre-post connections) N=250000 GeNN_optimized: 0.4580926795
+Rerunning CUDAStandaloneConfiguration with n = 1000 for nvprof profiling
+cd cuda_standalone && nvprof --profile-from-start-off --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_BrunelHakimModelScalarDelayNoMultiPrePost_CUDAStandaloneConfiguration_1000.log ./main 
+Profiling took 0:00:10 for runtime of 0.251194
+Rerunning GeNNConfigurationOptimized with n = 1000 for nvprof profiling
+cd GeNNworkspace && nvprof  --log-file ../results_2017_04_05_complete_after_talk/mkdir: cannot create directory ‘test_output’: File exists
+# DT 0.000100 
+# totalTime 1.000000 
+# We are running with fixed time step 0.000100 
+0.9999 done ...
+everything finished.
+nvprof/nvprof_BrunelHakimModelScalarDelayNoMultiPrePost_GeNNConfigurationOptimized_1000.log ./main test 1.0 1
+Profiling took 0:00:04 for runtime of 0.194764
+Starting VerySparseMediumRateSynapsesOnly on 06.04.2017 at 11:26:46.
+Running speed tests
+Configurations: CUDA standalone, C++ standalone, GeNN_optimized
+Synapses only: Very sparse, medium rate (10s duration):  n=10 [...] n=100 [...] n=1000 [...] n=10000 [...] n=100000 [...] n=500000 [...]INFO: setting cudaDevice stuff took 0.189294 seconds
+INFO connectivity matrix has size 32
+INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+Number of synapses: 32
+INFO: main_lines took 19.422106 seconds
+INFO: main function took 19.638434 seconds
+mkdir: cannot create directory ‘test_output’: File exists
+# DT 0.000100 
+# totalTime 10.000000 
+# We are running with fixed time step 0.000100 
+9.9999 done ...
+everything finished.
+
+Running VerySparseMediumRateSynapsesOnly took 1:34:44.
+INFO relative performance for Synapses only: Very sparse, medium rate (10s duration) N=10 CUDA standalone: 1.0
+INFO relative performance for Synapses only: Very sparse, medium rate (10s duration) N=100 CUDA standalone: 1.0
+INFO relative performance for Synapses only: Very sparse, medium rate (10s duration) N=1000 CUDA standalone: 1.0
+INFO relative performance for Synapses only: Very sparse, medium rate (10s duration) N=10000 CUDA standalone: 1.0
+INFO relative performance for Synapses only: Very sparse, medium rate (10s duration) N=100000 CUDA standalone: 1.0
+INFO relative performance for Synapses only: Very sparse, medium rate (10s duration) N=500000 CUDA standalone: 1.0
+INFO relative performance for Synapses only: Very sparse, medium rate (10s duration) N=10 C++ standalone: 7.46232140337
+INFO relative performance for Synapses only: Very sparse, medium rate (10s duration) N=100 C++ standalone: 7.35680132588
+INFO relative performance for Synapses only: Very sparse, medium rate (10s duration) N=1000 C++ standalone: 11.8771841425
+INFO relative performance for Synapses only: Very sparse, medium rate (10s duration) N=10000 C++ standalone: 5.02984481209
+INFO relative performance for Synapses only: Very sparse, medium rate (10s duration) N=100000 C++ standalone: 0.318651307727
+INFO relative performance for Synapses only: Very sparse, medium rate (10s duration) N=500000 C++ standalone: 0.0791039271891
+INFO relative performance for Synapses only: Very sparse, medium rate (10s duration) N=10 GeNN_optimized: 0.882604947291
+INFO relative performance for Synapses only: Very sparse, medium rate (10s duration) N=100 GeNN_optimized: 0.865446555301
+INFO relative performance for Synapses only: Very sparse, medium rate (10s duration) N=1000 GeNN_optimized: 1.15609826329
+INFO relative performance for Synapses only: Very sparse, medium rate (10s duration) N=10000 GeNN_optimized: 0.847073701942
+INFO relative performance for Synapses only: Very sparse, medium rate (10s duration) N=100000 GeNN_optimized: 0.706869490288
+INFO relative performance for Synapses only: Very sparse, medium rate (10s duration) N=500000 GeNN_optimized: 0.280107509868
+Rerunning CUDAStandaloneConfiguration with n = 1000 for nvprof profiling
+cd cuda_standalone && nvprof --profile-from-start-off --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_VerySparseMediumRateSynapsesOnly_CUDAStandaloneConfiguration_1000.log ./main 
+Profiling took 0:00:30 for runtime of 1.45008
+Rerunning GeNNConfigurationOptimized with n = 1000 for nvprof profiling
+cd GeNNworkspace && nvprof  --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_VerySparseMediumRateSynapsesOnly_GeNNConfigurationOptimized_1000.log ./main test 10.0 1
+Profiling took 0:00:12 for runtime of 1.244563
+Starting SparseMediumRateSynapsesOnly on 06.04.2017 at 13:03:04.
+Running speed tests
+Configurations: CUDA standalone, C++ standalone, GeNN_optimized
+Synapses only: Sparse, medium rate (1s duration):  n=10 [...] n=100 [...] n=1000 [...] n=10000 [...] n=100000 [...] n=500000 [...]INFO: setting cudaDevice stuff took 0.337554 seconds
+INFO connectivity matrix has size 203
+INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+Number of synapses: 203
+INFO: main_lines took 2.003913 seconds
+INFO: main function took 2.360594 seconds
+mkdir: cannot create directory ‘test_output’: File exists
+# DT 0.000100 
+# totalTime 1.000000 
+# We are running with fixed time step 0.000100 
+0.9999 done ...
+everything finished.
+
+Running SparseMediumRateSynapsesOnly took 1:10:08.
+INFO relative performance for Synapses only: Sparse, medium rate (1s duration) N=10 CUDA standalone: 1.0
+INFO relative performance for Synapses only: Sparse, medium rate (1s duration) N=100 CUDA standalone: 1.0
+INFO relative performance for Synapses only: Sparse, medium rate (1s duration) N=1000 CUDA standalone: 1.0
+INFO relative performance for Synapses only: Sparse, medium rate (1s duration) N=10000 CUDA standalone: 1.0
+INFO relative performance for Synapses only: Sparse, medium rate (1s duration) N=100000 CUDA standalone: 1.0
+INFO relative performance for Synapses only: Sparse, medium rate (1s duration) N=500000 CUDA standalone: 1.0
+INFO relative performance for Synapses only: Sparse, medium rate (1s duration) N=10 C++ standalone: 9.03811951624
+INFO relative performance for Synapses only: Sparse, medium rate (1s duration) N=100 C++ standalone: 12.5731575893
+INFO relative performance for Synapses only: Sparse, medium rate (1s duration) N=1000 C++ standalone: 7.14094900056
+INFO relative performance for Synapses only: Sparse, medium rate (1s duration) N=10000 C++ standalone: 0.618089685795
+INFO relative performance for Synapses only: Sparse, medium rate (1s duration) N=100000 C++ standalone: 0.0489483464931
+INFO relative performance for Synapses only: Sparse, medium rate (1s duration) N=500000 C++ standalone: 0.049623428215
+INFO relative performance for Synapses only: Sparse, medium rate (1s duration) N=10 GeNN_optimized: 1.29097406281
+INFO relative performance for Synapses only: Sparse, medium rate (1s duration) N=100 GeNN_optimized: 1.23388192416
+INFO relative performance for Synapses only: Sparse, medium rate (1s duration) N=1000 GeNN_optimized: 1.20240950591
+INFO relative performance for Synapses only: Sparse, medium rate (1s duration) N=10000 GeNN_optimized: 1.07033546948
+INFO relative performance for Synapses only: Sparse, medium rate (1s duration) N=100000 GeNN_optimized: 0.721667335794
+INFO relative performance for Synapses only: Sparse, medium rate (1s duration) N=500000 GeNN_optimized: 0.715129849711
+Rerunning CUDAStandaloneConfiguration with n = 1000 for nvprof profiling
+cd cuda_standalone && nvprof --profile-from-start-off --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_SparseMediumRateSynapsesOnly_CUDAStandaloneConfiguration_1000.log ./main 
+Profiling took 0:00:04 for runtime of 0.152218
+Rerunning GeNNConfigurationOptimized with n = 1000 for nvprof profiling
+cd GeNNworkspace && nvprof  --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_SparseMediumRateSynapsesOnly_GeNNConfigurationOptimized_1000.log ./main test 1.0 1
+Profiling took 0:00:02 for runtime of 0.128374
+Starting DenseMediumRateSynapsesOnly on 06.04.2017 at 14:14:07.
+Running speed tests
+Configurations: CUDA standalone, C++ standalone, GeNN_optimized
+Synapses only: Dense, medium rate (1s duration):  n=10 [...] n=100 [...] n=1000 [...] n=10000 [...] n=100000 [...] n=500000 [E..]
+
+TRACEBACK CUDA standalone N=500000
+INFO: setting cudaDevice stuff took 0.113374 seconds
+INFO connectivity matrix has size 250000000
+INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+ERROR launching kernel_neurongroup_thresholder_codeobject in code_objects/neurongroup_thresholder_codeobject.cu:1008 invalid argument
+
+('debug syn effect mdoe ', 'target')
+INFO: setting cudaDevice stuff took 0.113374 seconds
+INFO connectivity matrix has size 250000000
+INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+ERROR launching kernel_neurongroup_thresholder_codeobject in code_objects/neurongroup_thresholder_codeobject.cu:1008 invalid argument
+
+
+/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/utils/logger.py:546: UserWarning: Could not copy script file to temp directory: [Errno 2] No such file or directory: '/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/-c'
+  'Could not copy script file to temp directory: %s' % ex)
+
+Traceback (most recent call last):
+  File "<string>", line 21, in <module>
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/brian2cuda/tests/features/cuda_configuration.py", line 27, in after_run
+    with_output=False)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/brian2cuda/device.py", line 778, in build
+    self.run(directory, with_output, run_args)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/devices/cpp_standalone/device.py", line 864, in run
+    "%s)" % os.path.abspath(directory))
+RuntimeError: Project run failed (project directory: /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/cuda_standalone/cuda_standalone)
+
+
+
+Running DenseMediumRateSynapsesOnly took 1:52:54.
+INFO relative performance for Synapses only: Dense, medium rate (1s duration) N=10 CUDA standalone: 1.0
+INFO relative performance for Synapses only: Dense, medium rate (1s duration) N=100 CUDA standalone: 1.0
+INFO relative performance for Synapses only: Dense, medium rate (1s duration) N=1000 CUDA standalone: 1.0
+INFO relative performance for Synapses only: Dense, medium rate (1s duration) N=10000 CUDA standalone: 1.0
+INFO relative performance for Synapses only: Dense, medium rate (1s duration) N=100000 CUDA standalone: 1.0
+INFO relative performance for Synapses only: Dense, medium rate (1s duration) N=500000 CUDA standalone: nan
+INFO relative performance for Synapses only: Dense, medium rate (1s duration) N=10 C++ standalone: 8.7685522422
+INFO relative performance for Synapses only: Dense, medium rate (1s duration) N=100 C++ standalone: 8.55977504315
+INFO relative performance for Synapses only: Dense, medium rate (1s duration) N=1000 C++ standalone: 3.94245172467
+INFO relative performance for Synapses only: Dense, medium rate (1s duration) N=10000 C++ standalone: 0.142538734289
+INFO relative performance for Synapses only: Dense, medium rate (1s duration) N=100000 C++ standalone: 0.0356173923641
+INFO relative performance for Synapses only: Dense, medium rate (1s duration) N=500000 C++ standalone: nan
+INFO relative performance for Synapses only: Dense, medium rate (1s duration) N=10 GeNN_optimized: 1.30697848012
+INFO relative performance for Synapses only: Dense, medium rate (1s duration) N=100 GeNN_optimized: 1.27150537634
+INFO relative performance for Synapses only: Dense, medium rate (1s duration) N=1000 GeNN_optimized: 1.26404476027
+INFO relative performance for Synapses only: DeINFO: setting cudaDevice stuff took 0.177828 seconds
+INFO connectivity matrix has size 1000
+INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+Number of synapses: 1000
+INFO: main_lines took 1.945162 seconds
+INFO: main function took 2.141915 seconds
+mkdir: cannot create directory ‘test_output’: File exists
+# DT 0.000100 
+# totalTime 1.000000 
+# We are running with fixed time step 0.000100 
+0.9999 done ...
+everything finished.
+nse, medium rate (1s duration) N=10000 GeNN_optimized: 1.24454603496
+INFO relative performance for Synapses only: Dense, medium rate (1s duration) N=100000 GeNN_optimized: 1.35448135542
+INFO relative performance for Synapses only: Dense, medium rate (1s duration) N=500000 GeNN_optimized: nan
+Rerunning CUDAStandaloneConfiguration with n = 1000 for nvprof profiling
+cd cuda_standalone && nvprof --profile-from-start-off --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_DenseMediumRateSynapsesOnly_CUDAStandaloneConfiguration_1000.log ./main 
+Profiling took 0:00:04 for runtime of 0.154969
+Rerunning GeNNConfigurationOptimized with n = 1000 for nvprof profiling
+cd GeNNworkspace && nvprof  --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_DenseMediumRateSynapsesOnly_GeNNConfigurationOptimized_1000.log ./main test 1.0 1
+Profiling took 0:00:03 for runtime of 0.120151
+Starting SparseLowRateSynapsesOnly on 06.04.2017 at 16:08:31.
+Running speed tests
+Configurations: CUDA standalone, C++ standalone, GeNN_optimized
+Synapses only: Sparse, low rate (10s duration):  n=10 [...] n=100 [...] n=1000 [...] n=10000 [...] n=100000 [...] n=500000 [...]INFO: setting cudaDevice stuff took 0.315501 seconds
+INFO connectivity matrix has size 191
+INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+Number of synapses: 191
+INFO: main_lines took 18.990017 seconds
+INFO: main function took 19.330690 seconds
+mkdir: cannot create directory ‘test_output’: File exists
+# DT 0.000100 
+# totalTime 10.000000 
+# We are running with fixed time step 0.000100 
+9.9999 done ...
+everything finished.
+
+Running SparseLowRateSynapsesOnly took 1:06:45.
+INFO relative performance for Synapses only: Sparse, low rate (10s duration) N=10 CUDA standalone: 1.0
+INFO relative performance for Synapses only: Sparse, low rate (10s duration) N=100 CUDA standalone: 1.0
+INFO relative performance for Synapses only: Sparse, low rate (10s duration) N=1000 CUDA standalone: 1.0
+INFO relative performance for Synapses only: Sparse, low rate (10s duration) N=10000 CUDA standalone: 1.0
+INFO relative performance for Synapses only: Sparse, low rate (10s duration) N=100000 CUDA standalone: 1.0
+INFO relative performance for Synapses only: Sparse, low rate (10s duration) N=500000 CUDA standalone: 1.0
+INFO relative performance for Synapses only: Sparse, low rate (10s duration) N=10 C++ standalone: 11.3532916318
+INFO relative performance for Synapses only: Sparse, low rate (10s duration) N=100 C++ standalone: 11.7559788497
+INFO relative performance for Synapses only: Sparse, low rate (10s duration) N=1000 C++ standalone: 8.13595845714
+INFO relative performance for Synapses only: Sparse, low rate (10s duration) N=10000 C++ standalone: 2.08954888412
+INFO relative performance for Synapses only: Sparse, low rate (10s duration) N=100000 C++ standalone: 0.0786977354705
+INFO relative performance for Synapses only: Sparse, low rate (10s duration) N=500000 C++ standalone: 0.050991577758
+INFO relative performance for Synapses only: Sparse, low rate (10s duration) N=10 GeNN_optimized: 1.24713089977
+INFO relative performance for Synapses only: Sparse, low rate (10s duration) N=100 GeNN_optimized: 1.20253101745
+INFO relative performance for Synapses only: Sparse, low rate (10s duration) N=1000 GeNN_optimized: 1.16650382499
+INFO relative performance for Synapses only: Sparse, low rate (10s duration) N=10000 GeNN_optimized: 1.13437197572
+INFO relative performance for Synapses only: Sparse, low rate (10s duration) N=100000 GeNN_optimized: 0.553632482835
+INFO relative performance for Synapses only: Sparse, low rate (10s duration) N=500000 GeNN_optimized: 0.521572319612
+Rerunning CUDAStandaloneConfiguration with n = 1000 for nvprof profiling
+cd cuda_standalone && nvprof --profile-from-start-off --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_SparseLowRateSynapsesOnly_CUDAStandaloneConfiguration_1000.log ./main 
+Profiling took 0:00:31 for runtime of 1.45718
+Rerunning GeNNConfigurationOptimized with n = 1000 for nvprof profiling
+cd GeNNworkspace && nvprof  --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_SparseLowRateSynapsesOnly_GeNNConfigurationOptimized_1000.log ./main test 10.0 1
+Profiling took 0:00:12 for runtime of 1.260501
+Starting SparseHighRateSynapsesOnly on 06.04.2017 at 17:16:49.
+Running speed tests
+Configurations: CUDA standalone, C++ standalone, GeNN_optimized
+Synapses only: Sparse, high rate (1s duration):  n=10 [...] n=100 [...] n=1000 [...] n=10000 [...] n=100000 [...] n=500000 [E..]INFO: setting cudaDevice stuff took 0.143830 seconds
+INFO connectivity matrix has size 2030
+INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+Number of synapses: 2030
+INFO: main_lines took 1.986260 seconds
+INFO: main function took 2.148834 seconds
+mkdir: cannot create directory ‘test_output’: File exists
+# DT 0.000100 
+# totalTime 1.000000 
+# We are running with fixed time step 0.000100 
+0.9999 done ...
+everything finished.
+
+
+TRACEBACK CUDA standalone N=500000
+INFO: setting cudaDevice stuff took 0.286275 seconds
+
+('debug syn effect mdoe ', 'target')
+INFO: setting cudaDevice stuff took 0.286275 seconds
+
+
+/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/utils/logger.py:546: UserWarning: Could not copy script file to temp directory: [Errno 2] No such file or directory: '/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/-c'
+  'Could not copy script file to temp directory: %s' % ex)
+terminate called after throwing an instance of 'thrust::system::detail::bad_alloc'
+  what():  std::bad_alloc: out of memory
+
+Traceback (most recent call last):
+  File "<string>", line 21, in <module>
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/brian2cuda/tests/features/cuda_configuration.py", line 27, in after_run
+    with_output=False)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/brian2cuda/device.py", line 778, in build
+    self.run(directory, with_output, run_args)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/devices/cpp_standalone/device.py", line 864, in run
+    "%s)" % os.path.abspath(directory))
+RuntimeError: Project run failed (project directory: /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/cuda_standalone/cuda_standalone)
+
+
+
+Running SparseHighRateSynapsesOnly took 3:04:44.
+INFO relative performance for Synapses only: Sparse, high rate (1s duration) N=10 CUDA standalone: 1.0
+INFO relative performance for Synapses only: Sparse, high rate (1s duration) N=100 CUDA standalone: 1.0
+INFO relative performance for Synapses only: Sparse, high rate (1s duration) N=1000 CUDA standalone: 1.0
+INFO relative performance for Synapses only: Sparse, high rate (1s duration) N=10000 CUDA standalone: 1.0
+INFO relative performance for Synapses only: Sparse, high rate (1s duration) N=100000 CUDA standalone: 1.0
+INFO relative performance for Synapses only: Sparse, high rate (1s duration) N=500000 CUDA standalone: nan
+INFO relative performance for Synapses only: Sparse, high rate (1s duration) N=10 C++ standalone: 13.027787307
+INFO relative performance for Synapses only: Sparse, high rate (1s duration) N=100 C++ standalone: 8.31720400679
+INFO relative performance for Synapses only: Sparse, high rate (1s duration) N=1000 C++ standalone: 5.57225034169
+INFO relative performance for Synapses only: Sparse, high rate (1s duration) N=10000 C++ standalone: 0.42275766627
+INFO relative performance for Synapses only: Sparse, high rate (1s duration) N=100000 C++ standalone: 0.0495905212469
+INFO relative performance for Synapses only: Sparse, high rate (1s duration) N=500000 C++ standalone: nan
+INFO relative performance for Synapses only: Sparse, high rate (1s duration) N=10 GeNN_optimized: 1.29306411522
+INFO relative performance for Synapses only: Sparse, high rate (1s duration) N=100 GeNN_optimized: 1.23894750575
+INFO relative performance for Synapses only: Sparse, high rate (1s duration) N=1000 GeNN_optimized: 1.03327020554
+INFO relative performance for Synapses only: Sparse, high rate (1s duration) N=10000 GeNN_optimized: 1.0713338171
+INFO relative performance for Synapses only: Sparse, high rate (1s duration) N=100000 GeNN_optimized: 0.743955000458
+INFO relative performance for Synapses only: Sparse, high rate (1s duration) N=500000 GeNN_optimized: nan
+Rerunning CUDAStandaloneConfiguration with n = 1000 for nvprof profiling
+cd cuda_standalone && nvprof --profile-from-start-off --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_SparseHighRateSynapsesOnly_CUDAStandaloneConfiguration_1000.log ./main 
+Profiling took 0:00:04 for runtime of 0.374927
+Rerunning GeNNConfigurationOptimized with n = 1000 for nvprof profiling
+cd GeNNworkspace && nvprof  --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_SparseHighRateSynapsesOnly_GeNNConfigurationOptimized_1000.log ./main test 1.0 1
+Profiling took 0:00:03 for runtime of 0.362964
+Starting STDPNotEventDriven on 06.04.2017 at 20:23:12.
+Running speed tests
+Configurations: CUDA standalone, C++ standalone, GeNN_optimized
+Full examples: STDP (not event-driven):  n=10 [...] n=100 [...] n=1000 [...] n=10000 [...] n=20000 [...] n=50000 [...] n=100000 [...]INFO: setting cudaDevice stuff took 0.153160 seconds
+INFO calling kernel_synapses_group_variable_set_conditional_codeobject with 1 blocks and 1024 threads. Kernel needs 6 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory
+INFO connectivity matrix has size 1000
+INFO connectivity matrix has size 1000
+INFO generating 10000000 rand every 13107 clock cycles for poissongroup_thresholder_codeobject
+INFO calling kernel_neurongroup_stateupdater_codeobject with 1 blocks and 768 threads. Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000.
+INFO calling kernel_synapses_stateupdater_codeobject with 2 blocks and 768 threads. Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000.
+INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 9 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_poissongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 15 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 33 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.500000.
+INFO calling kernel_synapses_post_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_neurongroup_resetter_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+Number of synapses: 1000
+Number of synapses: 1000
+INFO: main_lines took 6.799618 seconds
+INFO: main function took 6.974729 seconds
+mkdir: cannot create directory ‘test_output’: File exists
+# DT 0.000100 
+# totalTime 1.000000 
+# We are running with fixed time step 0.000100 
+0.9999 done ...
+everything finished.
+
+Running STDPNotEventDriven took 0:08:16.
+INFO relative performance for Full examples: STDP (not event-driven) N=10 CUDA standalone: 1.0
+INFO relative performance for Full examples: STDP (not event-driven) N=100 CUDA standalone: 1.0
+INFO relative performance for Full examples: STDP (not event-driven) N=1000 CUDA standalone: 1.0
+INFO relative performance for Full examples: STDP (not event-driven) N=10000 CUDA standalone: 1.0
+INFO relative performance for Full examples: STDP (not event-driven) N=20000 CUDA standalone: 1.0
+INFO relative performance for Full examples: STDP (not event-driven) N=50000 CUDA standalone: 1.0
+INFO relative performance for Full examples: STDP (not event-driven) N=100000 CUDA standalone: 1.0
+INFO relative performance for Full examples: STDP (not event-driven) N=10 C++ standalone: 13.4856872038
+INFO relative performance for Full examples: STDP (not event-driven) N=100 C++ standalone: 8.68819435966
+INFO relative performance for Full examples: STDP (not event-driven) N=1000 C++ standalone: 2.66795987943
+INFO relative performance for Full examples: STDP (not event-driven) N=10000 C++ standalone: 0.487504968552
+INFO relative performance for Full examples: STDP (not event-driven) N=20000 C++ standalone: 0.335405214753
+INFO relative performance for Full examples: STDP (not event-driven) N=50000 C++ standalone: 0.278810141551
+INFO relative performance for Full examples: STDP (not event-driven) N=100000 C++ standalone: 0.248035751829
+INFO relative performance for Full examples: STDP (not event-driven) N=10 GeNN_optimized: 2.23964192763
+INFO relative performance for Full examples: STDP (not event-driven) N=100 GeNN_optimized: 2.14290015959
+INFO relative performance for Full examples: STDP (not event-driven) N=1000 GeNN_optimized: 1.86995794984
+INFO relative performance for Full examples: STDP (not event-driven) N=10000 GeNN_optimized: 1.30586540569
+INFO relative performance for Full examples: STDP (not event-driven) N=20000 GeNN_optimized: 1.1519252236
+INFO relative performance for Full examples: STDP (not event-driven) N=50000 GeNN_optimized: 0.970393398749
+INFO relative performance for Full examples: STDP (not event-driven) N=100000 GeNN_optimized: 0.923158814456
+Rerunning CUDAStandaloneConfiguration with n = 1000 for nvprof profiling
+cd cuda_standalone && nvprof --profile-from-start-off --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPNotEventDriven_CUDAStandaloneConfiguration_1000.log ./main 
+Profiling took 0:00:11 for runtime of 0.459476
+Rerunning GeNNConfigurationOptimized with n = 1000 for nvprof profiling
+cd GeNNworkspace && nvprof  --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPNotEventDriven_GeNNConfigurationOptimized_1000.log ./main test 1.0 1
+Profiling took 0:00:03 for runtime of 0.244255
+Starting STDPMultiPost on 06.04.2017 at 20:32:42.
+Running speed tests
+Configurations: CUDA standalone, C++ standalone, GeNN_optimized
+Full examples: STDP with multiple pre- and postsynaptic neurons:  n=10 [...] n=100 [...] n=1000 [...] n=10000 [...] n=20000 [...] n=50000 [...] n=100000 [...] n=1000000 [...]INFO: setting cudaDevice stuff took 0.144291 seconds
+INFO calling kernel_synapses_group_variable_set_conditional_codeobject with 1 blocks and 1024 threads. Kernel needs 6 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory
+INFO connectivity matrix has size 961
+INFO connectivity matrix has size 961
+INFO generating 310000 rand every 422812 clock cycles for poissongroup_thresholder_codeobject
+INFO calling kernel_neurongroup_stateupdater_codeobject with 1 blocks and 768 threads. Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000.
+INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 9 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_poissongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 15 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 42 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.500000.
+INFO calling kernel_synapses_post_codeobject with 15 blocks and 1024 threads. Kernel needs 34 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.500000.
+INFO calling kernel_neurongroup_resetter_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+Number of synapses: 961
+Number of synapses: 961
+INFO: main_lines took 5.925237 seconds
+INFO: main function took 6.091588 seconds
+mkdir: cannot create directory ‘test_output’: File exists
+# DT 0.000100 
+# totalTime 1.000000 
+# We are running with fixed time step 0.000100 
+0.9999 done ...
+everything finished.
+
+Running STDPMultiPost took 0:09:20.
+INFO relative performance for Full examples: STDP with multiple pre- and postsynaptic neurons N=10 CUDA standalone: 1.0
+INFO relative performance for Full examples: STDP with multiple pre- and postsynaptic neurons N=100 CUDA standalone: 1.0
+INFO relative performance for Full examples: STDP with multiple pre- and postsynaptic neurons N=1000 CUDA standalone: 1.0
+INFO relative performance for Full examples: STDP with multiple pre- and postsynaptic neurons N=10000 CUDA standalone: 1.0
+INFO relative performance for Full examples: STDP with multiple pre- and postsynaptic neurons N=20000 CUDA standalone: 1.0
+INFO relative performance for Full examples: STDP with multiple pre- and postsynaptic neurons N=50000 CUDA standalone: 1.0
+INFO relative performance for Full examples: STDP with multiple pre- and postsynaptic neurons N=100000 CUDA standalone: 1.0
+INFO relative performance for Full examples: STDP with multiple pre- and postsynaptic neurons N=1000000 CUDA standalone: 1.0
+INFO relative performance for Full examples: STDP with multiple pre- and postsynaptic neurons N=10 C++ standalone: 12.7474018667
+INFO relative performance for Full examples: STDP with multiple pre- and postsynaptic neurons N=100 C++ standalone: 11.5913458596
+INFO relative performance for Full examples: STDP with multiple pre- and postsynaptic neurons N=1000 C++ standalone: 9.54428741786
+INFO relative performance for Full examples: STDP with multiple pre- and postsynaptic neurons N=10000 C++ standalone: 7.34011397663
+INFO relative performance for Full examples: STDP with multiple pre- and postsynaptic neurons N=20000 C++ standalone: 4.6434939759
+INFO relative performance for Full examples: STDP with multiple pre- and postsynaptic neurons N=50000 C++ standalone: 3.54781164235
+INFO relative performance for Full examples: STDP with multiple pre- and postsynaptic neurons N=100000 C++ standalone: 3.07057913489
+INFO relative performance for Full examples: STDP with multiple pre- and postsynaptic neurons N=1000000 C++ standalone: 0.10910483462
+INFO relative performance for Full examples: STDP with multiple pre- and postsynaptic neurons N=10 GeNN_optimized: 2.81045227011
+INFO relative performance for Full examples: STDP with multiple pre- and postsynaptic neurons N=100 GeNN_optimized: 2.74426018098
+INFO relative performance for Full examples: STDP with multiple pre- and postsynaptic neurons N=1000 GeNN_optimized: 2.62544842893
+INFO relative performance for Full examples: STDP with multiple pre- and postsynaptic neurons N=10000 GeNN_optimized: 2.48962114561
+INFO relative performance for Full examples: STDP with multiple pre- and postsynaptic neurons N=20000 GeNN_optimized: 2.45380922669
+INFO relative performance for Full examples: STDP with multiple pre- and postsynaptic neurons N=50000 GeNN_optimized: 2.3428861001
+INFO relative performance for Full examples: STDP with multiple pre- and postsynaptic neurons N=100000 GeNN_optimized: 2.18909976103
+INFO relative performance for Full examples: STDP with multiple pre- and postsynaptic neurons N=1000000 GeNN_optimized: 0.801428258577
+Rerunning CUDAStandaloneConfiguration with n = 1000 for nvprof profiling
+cd cuda_standalone && nvprof --profile-from-start-off --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPMultiPost_CUDAStandaloneConfiguration_1000.log ./main 
+Profiling took 0:00:10 for runtime of 0.379254
+Rerunning GeNNConfigurationOptimized with n = 1000 for nvprof profiling
+cd GeNNworkspace && nvprof  --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPMultiPost_GeNNConfigurationOptimized_1000.log ./main test 1.0 1
+Profiling took 0:00:03 for runtime of 0.142824
+Starting STDPNeuronalTraces on 06.04.2017 at 20:43:19.
+Running speed tests
+Configurations: CUDA standalone, C++ standalone, GeNN_optimized
+Full examples: STDP with traces in neurons:  n=10 [...] n=100 [...] n=1000 [...] n=10000 [...] n=20000 [...] n=50000 [...] n=100000 [...] n=1000000 [...]INFO: setting cudaDevice stuff took 0.176256 seconds
+INFO calling kernel_synapses_group_variable_set_conditional_codeobject with 1 blocks and 1024 threads. Kernel needs 6 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory
+INFO connectivity matrix has size 1000
+INFO connectivity matrix has size 1000
+INFO generating 10000000 rand every 13107 clock cycles for neurongroup_thresholder_codeobject
+INFO calling kernel_neurongroup_1_stateupdater_codeobject with 1 blocks and 768 threads. Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000.
+INFO calling kernel_neurongroup_stateupdater_codeobject with 2 blocks and 768 threads. Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000.
+INFO calling kernel_neurongroup_1_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 9 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 15 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 31 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_synapses_post_codeobject with 15 blocks and 1024 threads. Kernel needs 28 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_neurongroup_1_resetter_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+Number of synapses: 1000
+Number of synapses: 1000
+INFO: main_lines took 6.268831 seconds
+INFO: main function took 6.467315 seconds
+mkdir: cannot create directory ‘test_output’: File exists
+# DT 0.000100 
+# totalTime 1.000000 
+# We are running with fixed time step 0.000100 
+0.9999 done ...
+everything finished.
+
+Running STDPNeuronalTraces took 0:14:08.
+INFO relative performance for Full examples: STDP with traces in neurons N=10 CUDA standalone: 1.0
+INFO relative performance for Full examples: STDP with traces in neurons N=100 CUDA standalone: 1.0
+INFO relative performance for Full examples: STDP with traces in neurons N=1000 CUDA standalone: 1.0
+INFO relative performance for Full examples: STDP with traces in neurons N=10000 CUDA standalone: 1.0
+INFO relative performance for Full examples: STDP with traces in neurons N=20000 CUDA standalone: 1.0
+INFO relative performance for Full examples: STDP with traces in neurons N=50000 CUDA standalone: 1.0
+INFO relative performance for Full examples: STDP with traces in neurons N=100000 CUDA standalone: 1.0
+INFO relative performance for Full examples: STDP with traces in neurons N=1000000 CUDA standalone: 1.0
+INFO relative performance for Full examples: STDP with traces in neurons N=10 C++ standalone: 11.6440202443
+INFO relative performance for Full examples: STDP with traces in neurons N=100 C++ standalone: 8.66784869976
+INFO relative performance for Full examples: STDP with traces in neurons N=1000 C++ standalone: 2.18593367648
+INFO relative performance for Full examples: STDP with traces in neurons N=10000 C++ standalone: 0.621996996463
+INFO relative performance for Full examples: STDP with traces in neurons N=20000 C++ standalone: 0.439453583339
+INFO relative performance for Full examples: STDP with traces in neurons N=50000 C++ standalone: 0.385339081617
+INFO relative performance for Full examples: STDP with traces in neurons N=100000 C++ standalone: 0.355185290586
+INFO relative performance for Full examples: STDP with traces in neurons N=1000000 C++ standalone: 0.288422965605
+INFO relative performance for Full examples: STDP with traces in neurons N=10 GeNN_optimized: 2.95565780195
+INFO relative performance for Full examples: STDP with traces in neurons N=100 GeNN_optimized: 2.81320236796
+INFO relative performance for Full examples: STDP with traces in neurons N=1000 GeNN_optimized: 2.48979624527
+INFO relative performance for Full examples: STDP with traces in neurons N=10000 GeNN_optimized: 1.74573261233
+INFO relative performance for Full examples: STDP with traces in neurons N=20000 GeNN_optimized: 1.53949294157
+INFO relative performance for Full examples: STDP with traces in neurons N=50000 GeNN_optimized: 1.27630217316
+INFO relative performance for Full examples: STDP with traces in neurons N=100000 GeNN_optimized: 1.20707261203
+INFO relative performance for Full examples: STDP with traces in neurons N=1000000 GeNN_optimized: 1.15485155451
+Rerunning CUDAStandaloneConfiguration with n = 1000 for nvprof profiling
+cd cuda_standalone && nvprof --profile-from-start-off --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPNeuronalTraces_CUDAStandaloneConfiguration_1000.log ./main 
+Profiling took 0:00:12 for runtime of 0.465582
+Rerunning GeNNConfigurationOptimized with n = 1000 for nvprof profiling
+cd GeNNworkspace && nvprof  --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPNeuronalTraces_GeNNConfigurationOptimized_1000.log ./main test 1.0 1
+Profiling took 0:00:03 for runtime of 0.186743
+Starting STDPMultiPostNeuronalTraces on 06.04.2017 at 20:58:43.
+Running speed tests
+Configurations: CUDA standalone, C++ standalone, GeNN_optimized
+Full examples: STDP with multiple postsynaptic neurons and traces in neurons:  n=10 [...] n=100 [...] n=1000 [...] n=10000 [...] n=20000 [...] n=50000 [...] n=100000 [...] n=1000000 [...]INFO: setting cudaDevice stuff took 0.186410 seconds
+INFO calling kernel_synapses_group_variable_set_conditional_codeobject with 1 blocks and 1024 threads. Kernel needs 6 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory
+INFO connectivity matrix has size 961
+INFO connectivity matrix has size 961
+INFO generating 310000 rand every 422812 clock cycles for neurongroup_thresholder_codeobject
+INFO calling kernel_neurongroup_1_stateupdater_codeobject with 1 blocks and 768 threads. Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000.
+INFO calling kernel_neurongroup_stateupdater_codeobject with 1 blocks and 768 threads. Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000.
+INFO calling kernel_neurongroup_1_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 9 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 15 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 31 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_synapses_post_codeobject with 15 blocks and 1024 threads. Kernel needs 28 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_neurongroup_1_resetter_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+Number of synapses: 961
+Number of synapses: 961
+INFO: main_lines took 6.372802 seconds
+INFO: main function took 6.581250 seconds
+mkdir: cannot create directory ‘test_output’: File exists
+# DT 0.000100 
+# totalTime 1.000000 
+# We are running with fixed time step 0.000100 
+0.9999 done ...
+everything finished.
+
+Running STDPMultiPostNeuronalTraces took 0:08:49.
+INFO relative performance for Full examples: STDP with multiple postsynaptic neurons and traces in neurons N=10 CUDA standalone: 1.0
+INFO relative performance for Full examples: STDP with multiple postsynaptic neurons and traces in neurons N=100 CUDA standalone: 1.0
+INFO relative performance for Full examples: STDP with multiple postsynaptic neurons and traces in neurons N=1000 CUDA standalone: 1.0
+INFO relative performance for Full examples: STDP with multiple postsynaptic neurons and traces in neurons N=10000 CUDA standalone: 1.0
+INFO relative performance for Full examples: STDP with multiple postsynaptic neurons and traces in neurons N=20000 CUDA standalone: 1.0
+INFO relative performance for Full examples: STDP with multiple postsynaptic neurons and traces in neurons N=50000 CUDA standalone: 1.0
+INFO relative performance for Full examples: STDP with multiple postsynaptic neurons and traces in neurons N=100000 CUDA standalone: 1.0
+INFO relative performance for Full examples: STDP with multiple postsynaptic neurons and traces in neurons N=1000000 CUDA standalone: 1.0
+INFO relative performance for Full examples: STDP with multiple postsynaptic neurons and traces in neurons N=10 C++ standalone: 11.2776436504
+INFO relative performance for Full examples: STDP with multiple postsynaptic neurons and traces in neurons N=100 C++ standalone: 11.2039193789
+INFO relative performance for Full examples: STDP with multiple postsynaptic neurons and traces in neurons N=1000 C++ standalone: 10.2038728186
+INFO relative performance for Full examples: STDP with multiple postsynaptic neurons and traces in neurons N=10000 C++ standalone: 8.2861849847
+INFO relative performance for Full examples: STDP with multiple postsynaptic neurons and traces in neurons N=20000 C++ standalone: 7.36851429155
+INFO relative performance for Full examples: STDP with multiple postsynaptic neurons and traces in neurons N=50000 C++ standalone: 6.35608139671
+INFO relative performance for Full examples: STDP with multiple postsynaptic neurons and traces in neurons N=100000 C++ standalone: 4.82692286419
+INFO relative performance for Full examples: STDP with multiple postsynaptic neurons and traces in neurons N=1000000 C++ standalone: 0.603177630071
+INFO relative performance for Full examples: STDP with multiple postsynaptic neurons and traces in neurons N=10 GeNN_optimized: 2.94135590849
+INFO relative performance for Full examples: STDP with multiple postsynaptic neurons and traces in neurons N=100 GeNN_optimized: 2.95301604211
+INFO relative performance for Full examples: STDP with multiple postsynaptic neurons and traces in neurons N=1000 GeNN_optimized: 2.90776556826
+INFO relative performance for Full examples: STDP with multiple postsynaptic neurons and traces in neurons N=10000 GeNN_optimized: 2.78124403426
+INFO relative performance for Full examples: STDP with multiple postsynaptic neurons and traces in neurons N=20000 GeNN_optimized: 2.82306874025
+INFO relative performance for Full examples: STDP with multiple postsynaptic neurons and traces in neurons N=50000 GeNN_optimized: 2.77392207329
+INFO relative performance for Full examples: STDP with multiple postsynaptic neurons and traces in neurons N=100000 GeNN_optimized: 2.67728338099
+INFO relative performance for Full examples: STDP with multiple postsynaptic neurons and traces in neurons N=1000000 GeNN_optimized: 2.1046509768
+Rerunning CUDAStandaloneConfiguration with n = 1000 for nvprof profiling
+cd cuda_standalone && nvprof --profile-from-start-off --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPMultiPostNeuronalTraces_CUDAStandaloneConfiguration_1000.log ./main 
+Profiling took 0:00:12 for runtime of 0.422016
+Rerunning GeNNConfigurationOptimized with n = 1000 for nvprof profiling
+cd GeNNworkspace && nvprof  --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPMultiPostNeuronalTraces_GeNNConfigurationOptimized_1000.log ./main test 1.0 1
+Profiling took 0:00:03 for runtime of 0.146966
+Starting BrunelHakimModelHeterogeneousDelay on 06.04.2017 at 21:08:46.
+Running speed tests
+Configurations: CUDA standalone, C++ standalone, GeNN_optimized
+Full examples: Brunel Hakim with heterogenous delays:  n=10 [...] n=100 [...] n=1000 [...] n=10000 [...] n=20000 [...] n=50000 [...] n=100000 [E..]
+
+TRACEBACK CUDA standalone N=100000
+INFO: setting cudaDevice stuff took 0.094517 seconds
+INFO calling kernel_synapses_group_variable_set_conditional_codeobject with 97661 blocks and 1024 threads. Kernel needs 6 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory
+INFO connectivity matrix has size 100004409
+INFO not enough memory available to generate 247808 random numbers for neurongroup_stateupdater_codeobject, reducing the buffer size
+INFO not enough memory available to generate 247808 random numbers for neurongroup_stateupdater_codeobject, reducing the buffer size
+INFO not enough memory available to generate 247808 random numbers for neurongroup_stateupdater_codeobject, reducing the buffer size
+INFO not enough memory available to generate 247808 random numbers for neurongroup_stateupdater_codeobject, reducing the buffer size
+INFO not enough memory available to generate 247808 random numbers for neurongroup_stateupdater_codeobject, reducing the buffer size
+INFO not enough memory available to generate 247808 random numbers for neurongroup_stateupdater_codeobject, reducing the buffer size
+INFO generating 204687 randn every 131 clock cycles for neurongroup_stateupdater_codeobject
+INFO calling kernel_neurongroup_stateupdater_codeobject with 174 blocks and 576 threads. Kernel needs 52 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.562500.
+ERROR launching kernel_neurongroup_stateupdater_codeobject in code_objects/neurongroup_stateupdater_codeobject.cu:1051 out of memory
+
+('debug syn effect mdoe ', 'target')
+INFO: setting cudaDevice stuff took 0.094517 seconds
+INFO calling kernel_synapses_group_variable_set_conditional_codeobject with 97661 blocks and 1024 threads. Kernel needs 6 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory
+INFO connectivity matrix has size 100004409
+INFO not enough memory available to generate 247808 random numbers for neurongroup_stateupdater_codeobject, reducing the buffer size
+INFO not enough memory available to generate 247808 random numbers for neurongroup_stateupdater_codeobject, reducing the buffer size
+INFO not enough memory available to generate 247808 random numbers for neurongroup_stateupdater_codeobject, reducing the buffer size
+INFO not enough memory available to generate 247808 random numbers for neurongroup_stateupdater_codeobject, reducing the buffer size
+INFO not enough memory available to generate 247808 random numbers for neurongroup_stateupdater_codeobject, reducing the buffer size
+INFO not enough memory available to generate 247808 random numbers for neurongroup_stateupdater_codeobject, reducing the buffer size
+INFO generating 204687 randn every 131 clock cycles for neurongroup_stateupdater_codeobject
+INFO calling kernel_neurongroup_stateupdater_codeobject with 174 blocks and 576 threads. Kernel needs 52 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.562500.
+ERROR launching kernel_neurongroup_stateupdater_codeobject in code_objects/neurongroup_stateupdater_codeobject.cu:1051 out of memory
+
+
+/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/utils/logger.py:546: UserWarning: Could not copy script file to temp directory: [Errno 2] No such file or directory: '/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/-c'
+  'Could not copy script file to temp directory: %s' % ex)
+INFO       No numerical integration method specified for group 'neurongroup', using method 'euler' (took 0.07s, trying other methods took 0.00s). [brian2.stateupdaters.base.method_choice]
+terminINFO: setting cudaDevice stuff took 0.179905 seconds
+INFO calling kernel_synapses_group_variable_set_conditional_codeobject with 977 blocks and 1024 threads. Kernel needs 6 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory
+INFO connectivity matrix has size 1000000
+INFO generating 10000000 randn every 13107 clock cycles for neurongroup_stateupdater_codeobject
+INFO calling kernel_neurongroup_stateupdater_codeobject with 2 blocks and 576 threads. Kernel needs 52 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.562500.
+INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 11 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.000000.
+INFO calling kernel_neurongroup_resetter_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+Number of synapses: 1000000
+INFO: main_lines took 9.590645 seconds
+INFO: main function took 9.829623 seconds
+mkdir: cannot create directory ‘test_output’: File exists
+# DT 0.000100 
+# totalTime 1.000000 
+# We are running with fixed time step 0.000100 
+0.9999 done ...
+everything finished.
+ate called after throwing an instance of 'thrust::system::system_error'
+  what():  cudaFree in free: an illegal memory access was encountered
+
+Traceback (most recent call last):
+  File "<string>", line 21, in <module>
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/brian2cuda/tests/features/cuda_configuration.py", line 27, in after_run
+    with_output=False)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/brian2cuda/device.py", line 778, in build
+    self.run(directory, with_output, run_args)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/devices/cpp_standalone/device.py", line 864, in run
+    "%s)" % os.path.abspath(directory))
+RuntimeError: Project run failed (project directory: /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/cuda_standalone/cuda_standalone)
+
+
+
+Running BrunelHakimModelHeterogeneousDelay took 1:11:22.
+INFO relative performance for Full examples: Brunel Hakim with heterogenous delays N=10 CUDA standalone: 1.0
+INFO relative performance for Full examples: Brunel Hakim with heterogenous delays N=100 CUDA standalone: 1.0
+INFO relative performance for Full examples: Brunel Hakim with heterogenous delays N=1000 CUDA standalone: 1.0
+INFO relative performance for Full examples: Brunel Hakim with heterogenous delays N=10000 CUDA standalone: 1.0
+INFO relative performance for Full examples: Brunel Hakim with heterogenous delays N=20000 CUDA standalone: 1.0
+INFO relative performance for Full examples: Brunel Hakim with heterogenous delays N=50000 CUDA standalone: 1.0
+INFO relative performance for Full examples: Brunel Hakim with heterogenous delays N=100000 CUDA standalone: nan
+INFO relative performance for Full examples: Brunel Hakim with heterogenous delays N=10 C++ standalone: 11.4029279828
+INFO relative performance for Full examples: Brunel Hakim with heterogenous delays N=100 C++ standalone: 8.7440567782
+INFO relative performance for Full examples: Brunel Hakim with heterogenous delays N=1000 C++ standalone: 5.15311757921
+INFO relative performance for Full examples: Brunel Hakim with heterogenous delays N=10000 C++ standalone: 6.3911140671
+INFO relative performance for Full examples: Brunel Hakim with heterogenous delays N=20000 C++ standalone: 12.3422729105
+INFO relative performance for Full examples: Brunel Hakim with heterogenous delays N=50000 C++ standalone: 15.0350916602
+INFO relative performance for Full examples: Brunel Hakim with heterogenous delays N=100000 C++ standalone: nan
+INFO relative performance for Full examples: Brunel Hakim with heterogenous delays N=10 GeNN_optimized: 2.23856530406
+INFO relative performance for Full examples: Brunel Hakim with heterogenous delays N=100 GeNN_optimized: 4.36762611789
+INFO relative performance for Full examples: Brunel Hakim with heterogenous delays N=1000 GeNN_optimized: 20.15374328
+INFO relative performance for Full examples: Brunel Hakim with heterogenous delays N=10000 GeNN_optimized: 146.679676807
+INFO relative performance for Full examples: Brunel Hakim with heterogenous delays N=20000 GeNN_optimized: 357.995725987
+INFO relative performance for Full examples: Brunel Hakim with heterogenous delays N=50000 GeNN_optimized: 492.68840486
+INFO relative performance for Full examples: Brunel Hakim with heterogenous delays N=100000 GeNN_optimized: nan
+Rerunning CUDAStandaloneConfiguration with n = 1000 for nvprof profiling
+cd cuda_standalone && nvprof --profile-from-start-off --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfiguration_1000.log ./main 
+Profiling took 0:00:15 for runtime of 3.81853
+Rerunning GeNNConfigurationOptimized with n = 1000 for nvprof profiling
+cd GeNNworkspace && nvprof  --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_GeNNConfigurationOptimized_1000.log ./main test 1.0 1
+Profiling took 0:00:04 for runtime of 0.188529
+Starting LinearNeuronsOnly on 06.04.2017 at 22:21:28.
+Running speed tests
+Configurations: CUDA standalone, C++ standalone, GeNN_optimized
+Neurons only: Linear 1D:  n=10 [...] n=100 [...] n=1000 [...] n=10000 [...] n=100000 [...] n=1000000 [...] n=10000000 [...]INFO: setting cudaDevice stuff took 0.326520 seconds
+INFO calling kernel_neurongroup_stateupdater_codeobject with 1 blocks and 1024 threads. Kernel needs 12 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 0 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO: main_lines took 5.766263 seconds
+INFO: main function took 6.105103 seconds
+mkdir: cannot create directory ‘test_output’: File exists
+# DT 0.000100 
+# totalTime 10.000000 
+# We are running with fixed time step 0.000100 
+9.9999 done ...
+everything finished.
+
+Running LinearNeuronsOnly took 0:33:02.
+INFO relative performance for Neurons only: Linear 1D N=10 CUDA standalone: 1.0
+INFO relative performance for Neurons only: Linear 1D N=100 CUDA standalone: 1.0
+INFO relative performance for Neurons only: Linear 1D N=1000 CUDA standalone: 1.0
+INFO relative performance for Neurons only: Linear 1D N=10000 CUDA standalone: 1.0
+INFO relative performance for Neurons only: Linear 1D N=100000 CUDA standalone: 1.0
+INFO relative performance for Neurons only: Linear 1D N=1000000 CUDA standalone: 1.0
+INFO relative performance for Neurons only: Linear 1D N=10000000 CUDA standalone: 1.0
+INFO relative performance for Neurons only: Linear 1D N=10 C++ standalone: 7.51083775606
+INFO relative performance for Neurons only: Linear 1D N=100 C++ standalone: 6.6944353215
+INFO relative performance for Neurons only: Linear 1D N=1000 C++ standalone: 3.76942753323
+INFO relative performance for Neurons only: Linear 1D N=10000 C++ standalone: 0.799040137212
+INFO relative performance for Neurons only: Linear 1D N=100000 C++ standalone: 0.144238328209
+INFO relative performance for Neurons only: Linear 1D N=1000000 C++ standalone: 0.104833409762
+INFO relative performance for Neurons only: Linear 1D N=10000000 C++ standalone: 0.0490665649822
+INFO relative performance for Neurons only: Linear 1D N=10 GeNN_optimized: 0.973583595511
+INFO relative performance for Neurons only: Linear 1D N=100 GeNN_optimized: 0.984380282075
+INFO relative performance for Neurons only: Linear 1D N=1000 GeNN_optimized: 1.0333623878
+INFO relative performance for Neurons only: Linear 1D N=10000 GeNN_optimized: 0.907366342055
+INFO relative performance for Neurons only: Linear 1D N=100000 GeNN_optimized: 0.65065534916
+INFO relative performance for Neurons only: Linear 1D N=1000000 GeNN_optimized: 0.603726581657
+INFO relative performance for Neurons only: Linear 1D N=10000000 GeNN_optimized: 0.583361983096
+Rerunning CUDAStandaloneConfiguration with n = 1000 for nvprof profiling
+cd cuda_standalone && nvprof --profile-from-start-off --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_LinearNeuronsOnly_CUDAStandaloneConfiguration_1000.log ./main 
+Profiling took 0:00:10 for runtime of 0.48209
+Rerunning GeNNConfigurationOptimized with n = 1000 for nvprof profiling
+cd GeNNworkspace && nvprof  --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_LinearNeuronsOnly_GeNNConfigurationOptimized_1000.log ./main test 10.0 1
+Profiling took 0:00:06 for runtime of 0.45749
+Starting HHNeuronsOnly on 06.04.2017 at 22:55:24.
+Running speed tests
+Configurations: CUDA standalone, C++ standalone, GeNN_optimized
+Neurons only: Hodgkin-Huxley:  n=10 [...] n=100 [...] n=1000 [...] n=10000 [...] n=100000 [...] n=1000000 [...]INFO: setting cudaDevice stuff took 0.143355 seconds
+INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject with 1 blocks and 1024 threads. Kernel needs 7 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 0 bytes of user-allocated constant memory
+INFO calling kernel_neurongroup_stateupdater_codeobject with 1 blocks and 1024 threads. Kernel needs 62 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 0 bytes of user-allocated constant memory. Theoretical occupancy is 0.500000.
+INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 11 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 0 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO: main_lines took 1.875142 seconds
+INFO: main function took 2.032155 seconds
+mkdir: cannot create directory ‘test_output’: File exists
+# DT 0.000100 
+# totalTime 1.000000 
+# We are running with fixed time step 0.000100 
+0.9999 done ...
+everything finished.
+
+Running HHNeuronsOnly took 0:21:09.
+INFO relative performance for Neurons only: Hodgkin-Huxley N=10 CUDA standalone: 1.0
+INFO relative performance for Neurons only: Hodgkin-Huxley N=100 CUDA standalone: 1.0
+INFO relative performance for Neurons only: Hodgkin-Huxley N=1000 CUDA standalone: 1.0
+INFO relative performance for Neurons only: Hodgkin-Huxley N=10000 CUDA standalone: 1.0
+INFO relative performance for Neurons only: Hodgkin-Huxley N=100000 CUDA standalone: 1.0
+INFO relative performance for Neurons only: Hodgkin-Huxley N=1000000 CUDA standalone: 1.0
+INFO relative performance for Neurons only: Hodgkin-Huxley N=10 C++ standalone: 8.88708231981
+INFO relative performance for Neurons only: Hodgkin-Huxley N=100 C++ standalone: 1.93233246921
+INFO relative performance for Neurons only: Hodgkin-Huxley N=1000 C++ standalone: 0.273829639851
+INFO relative performance for Neurons only: Hodgkin-Huxley N=10000 C++ standalone: 0.0301623262336
+INFO relative performance for Neurons only: Hodgkin-Huxley N=100000 C++ standalone: 0.0140154114169
+INFO relative performance for Neurons only: Hodgkin-Huxley N=1000000 C++ standalone: 0.0117289494235
+INFO relative performance for Neurons only: Hodgkin-Huxley N=10 GeNN_optimized: 1.17820348169
+INFO relative performance for Neurons only: Hodgkin-Huxley N=100 GeNN_optimized: 1.1193610315
+INFO relative performance for Neurons only: Hodgkin-Huxley N=1000 GeNN_optimized: 1.40084936828
+INFO relative performance for Neurons only: Hodgkin-Huxley N=10000 GeNN_optimized: 1.15423786453
+INFO relative performance for Neurons only: Hodgkin-Huxley N=100000 GeNN_optimized: 1.03595178005
+INFO relative performance for Neurons only: Hodgkin-Huxley N=1000000 GeNN_optimized: 0.955525776817
+Rerunning CUDAStandaloneConfiguration with n = 1000 for nvprof profiling
+cd cuda_standalone && nvprof --profile-from-start-off --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_HHNeuronsOnly_CUDAStandaloneConfiguration_1000.log ./main 
+Profiling took 0:00:04 for runtime of 0.251227
+Rerunning GeNNConfigurationOptimized with n = 1000 for nvprof profiling
+cd GeNNworkspace && nvprof  --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_HHNeuronsOnly_GeNNConfigurationOptimized_1000.log ./main test 1.0 1
+Profiling took 0:00:02 for runtime of 0.179052
+Starting VogelsWithSynapticDynamic on 06.04.2017 at 23:17:20.
+Running speed tests
+Configurations: CUDA standalone, C++ standalone, GeNN_optimized
+Full examples: Vogels et al 2011 (not event-driven synapses):  n=10 [E.E] n=100 [..E] n=1000 [..E] n=10000 [..E] n=20000 [..E] n=50000 [..E] n=100000 [E.E]
+
+TRACEBACK CUDA standalone N=10
+INFO: setting cudaDevice stuff took 0.268672 seconds
+INFO connectivity matrix has size 1
+INFO calling kernel_neurongroup_stateupdater_codeobject with 1 blocks and 768 threads. Kernel needs 40 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000.
+INFO calling kernel_synapses_2_stateupdater_codeobject with 0 blocks and 768 threads. Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000.
+ERROR launching kernel_synapses_2_stateupdater_codeobject in code_objects/synapses_2_stateupdater_codeobject.cu:1044 invalid configuration argument
+
+('debug syn effect mdoe ', 'target')
+('debug syn effect mdoe ', 'target')
+('debug syn effect mdoe ', 'target')
+('debug syn effect mdoe ', 'synapse')
+INFO: setting cudaDevice stuff took 0.268672 seconds
+INFO connectivity matrix has size 1
+INFO calling kernel_neurongroup_stateupdater_codeobject with 1 blocks and 768 threads. Kernel needs 40 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000.
+INFO calling kernel_synapses_2_stateupdater_codeobject with 0 blocks and 768 threads. Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000.
+ERROR launching kernel_synapses_2_stateupdater_codeobject in code_objects/synapses_2_stateupdater_codeobject.cu:1044 invalid configuration argument
+
+
+/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/utils/logger.py:546: UserWarning: Could not copy script file to temp directory: [Errno 2] No such file or directory: '/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/-c'
+  'Could not copy script file to temp directory: %s' % ex)
+INFO       The synaptic equation for the variable A_pre does not specify whether it should be integrated at every timestep ("clock-driven") or only at spiking events ("event-driven"). It will be integrated at every timestep which can slow down your simulation unnecessarily if you only need the values of this variable whenever a spike occurs. Specify the equation as clock-driven explicitly to avoid this warning. [brian2.synapses.synapses.clock_driven]
+INFO       The synaptic equation for the variable A_post does not specify whether it should be integrated at every timestep ("clock-driven") or only at spiking events ("event-driven"). It will be integrated at every timestep which can slow down your simulation unnecessarily if you only need the values of this variable whenever a spike occurs. Specify the equation as clock-driven explicitly to avoid this warning. [brian2.synapses.synapses.clock_driven]
+INFO       No numerical integration method specified for group 'neurongroup', using method 'euler' (took 0.06s, trying other methods took 0.12s). [brian2.stateupdaters.base.method_choice]
+INFO       No numerical integration method specified for group 'synapses_2', using method 'linear' (took 0.14s). [brian2.stateupdaters.base.method_choice]
+
+Traceback (most recent call last):
+  File "<string>", line 21, in <module>
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/brian2cuda/tests/features/cuda_configuration.py", line 27, in after_run
+    with_output=False)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/brian2cuda/device.py", line 778, in build
+    self.run(directory, with_output, run_args)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/devices/cpp_standalone/device.py", line 864, in run
+    "%s)" % os.path.abspath(directory))
+RuntimeError: Project run failed (project directory: /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/cuda_standalone/cuda_standalone)
+
+
+
+
+TRACEBACK GeNN_optimized N=10
+no stdout file found, cwd = /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNWorkspace/results/stdout.txt
+running brian code generation ...
+building genn executable ...
+ar -rcs /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/lib/libgenn.a /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/global.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/modelSpec.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/neuronModels.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/synapseModels.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/postSynapseModels.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/utils.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/stringUtils.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/sparseUtils.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/hr_time.o
+g++ -std=c++11 -DNVCC=\""/usr/local/cuda/bin/nvcc"\" -DMODEL=\"/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/magicnetwork_model.cpp\" -o /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/generateALL /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/src/generate*.cc -I"/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/include" -I"/usr/local/cuda/include" -L"/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/lib" -L"/usr/local/cuda/lib64" -lgenn -lcuda -lcudart
+call was ./generateALL /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace 
+optimizing block size...
+Global memory required for core model: 0.000654 MB. 
+6440894464 for device 0
+dry-run compile for device 0
+"/usr/local/cuda/bin/nvcc" -cubin -x cu -arch sm_35 -O3 -I"$GENN_PATH/lib/include" -o "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/runner.cubin" "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/magicnetwork_model_CODE/runner.cc"
+genn-buildmodel.sh:70: error 50: command failure
+
+/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/utils/logger.py:546: UserWarning: Could not copy script file to temp directory: [Errno 2] No such file or directory: '/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/-c'
+  'Could not copy script file to temp directory: %s' % ex)
+INFO       The synaptic equation for the variable A_pre does not specify whether it should be integrated at every timestep ("clock-driven") or only at spiking events ("event-driven"). It will be integrated at every timestep which can slow down your simulation unnecessarily if you only need the values of this variable whenever a spike occurs. Specify the equation as clock-driven explicitly to avoid this warning. [brian2.synapses.synapses.clock_driven]
+INFO       The synaptic equation for the variable A_post does not specify whether it should be integrated at every timestep ("clock-driven") or only at spiking events ("event-driven"). It will be integrated at every timestep which can slow down your simulation unnecessarily if you only need the values of this variable whenever a spike occurs. Specify the equation as clock-driven explicitly to avoid this warning. [brian2.synapses.synapses.clock_driven]
+WARNING    The selected device 'genn' only supports a fixed schedule, but this schedule is not consistent with the network's schedule. The simulation will use the device's schedule.
+Device schedule: ['start', 'synapses', 'groups', 'thresholds', 'resets', 'end']
+Network schedule: ['start', 'groups', 'thresholds', 'synapses', 'resets', 'end']
+Set the network schedule explicitly or set the core.network.default_schedule preference to avoid this warning. [brian2.core.network.schedule_conflict]
+INFO       No numerical integration method specified for group 'neurongroup', using method 'euler' (took 0.06s, trying other methods took 0.11s). [brian2.stateupdaters.base.method_choice]
+INFO       No numerical integration method specified for group 'synapses_2', using method 'linear' (took 0.15s). [brian2.stateupdaters.base.method_choice]
+/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/magicnetwork_model_CODE/synapseKrnl.cc(23): error: name must be a namespace name
+
+1 error detected in the compilation of "/tmp/tmpxft_000066aa_00000000-7_runner.cpp1.ii".
+/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/src/generateALL.cc: 258: cuda driver error 301: CUDA_ERROR_FILE_NOT_FOUND
+
+Traceback (most recent call last):
+  File "<string>", line 14, in <module>
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/brian2cuda/tests/features/speed.py", line 1075, in run
+    self.timed_run(self.duration)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/tests/features/base.py", line 63, in timed_run
+    brian2.run(duration, level=1)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/units/fundamentalunits.py", line 2428, in new_f
+    result = f(*args, **kwds)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/core/magic.py", line 371, in run
+    namespace=namespace, profile=profile, level=2+level)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/core/magic.py", line 231, in run
+    namespace=namespace, profile=profile, level=level+1)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/core/base.py", line 276, in device_override_decorated_function
+    return getattr(curdev, name)(*args, **kwds)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2genn/brian2genn/device.py", line 1206, in network_run
+    super(GeNNDevice, self).network_run(net=net, duration=duration, report=report, report_period=report_period, namespace=namespace, level=level+1)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/devices/cpp_standalone/device.py", line 1171, in network_run
+    self.build(direct_call=False, **self.build_options)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2genn/brian2genn/device.py", line 582, in build
+    returncode=ex.returncode)
+RuntimeError: Project compilation failed (Command ['genn-buildmodel.sh', 'magicnetwork_model.cpp'] failed with error code 50).
+See the output above (if any) for more details.
+
+
+
+
+TRACEBACK GeNN_optimized N=100
+no stdout file found, cwd = /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNWorkspace/results/stdout.txt
+running brian code generation ...
+building genn executable ...
+ar -rcs /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/lib/libgenn.a /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/global.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/modelSpec.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/neuronModels.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/synapseModels.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/postSynapseModels.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/utils.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/stringUtils.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/sparseUtils.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/hr_time.o
+g++ -std=c++11 -DNVCC=\""/usr/local/cuda/bin/nvcc"\" -DMODEL=\"/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/magicnetwork_model.cpp\" -o /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/generateALL /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/src/generate*.cc -I"/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/include" -I"/usr/local/cuda/include" -L"/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/lib" -L"/usr/local/cuda/lib64" -lgenn -lcuda -lcudart
+call was ./generateALL /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace 
+optimizing block size...
+Global memory required for core model: 0.006504 MB. 
+6440894464 for device 0
+dry-run compile for device 0
+"/usr/local/cuda/bin/nvcc" -cubin -x cu -arch sm_35 -O3 -I"$GENN_PATH/lib/include" -o "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/runner.cubin" "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/magicnetwork_model_CODE/runner.cc"
+genn-buildmodel.sh:70: error 50: command failure
+
+/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/utils/logger.py:546: UserWarning: Could not copy script file to temp directory: [Errno 2] No such file or directory: '/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/-c'
+  'Could not copy script file to temp directory: %s' % ex)
+INFO       The synaptic equation for the variable A_pre does not specify whether it should be integrated at every timestep ("clock-driven") or only at spiking events ("event-driven"). It will be integrated at every timestep which can slow down your simulation unnecessarily if you only need the values of this variable whenever a spike occurs. Specify the equation as clock-driven explicitly to avoid this warning. [brian2.synapses.synapses.clock_driven]
+INFO       The synaptic equation for the variable A_post does not specify whether it should be integrated at every timestep ("clock-driven") or only at spiking events ("event-driven"). It will be integrated at every timestep which can slow down your simulation unnecessarily if you only need the values of this variable whenever a spike occurs. Specify the equation as clock-driven explicitly to avoid this warning. [brian2.synapses.synapses.clock_driven]
+WARNING    The selected device 'genn' only supports a fixed schedule, but this schedule is not consistent with the network's schedule. The simulation will use the device's schedule.
+Device schedule: ['start', 'synapses', 'groups', 'thresholds', 'resets', 'end']
+Network schedule: ['start', 'groups', 'thresholds', 'synapses', 'resets', 'end']
+Set the network schedule explicitly or set the core.network.default_schedule preference to avoid this warning. [brian2.core.network.schedule_conflict]
+INFO       No numerical integration method specified for group 'neurongroup', using method 'euler' (took 0.07s, trying other methods took 0.14s). [brian2.stateupdaters.base.method_choice]
+INFO       No numerical integration method specified for group 'synapses_2', using method 'linear' (took 0.13s). [brian2.stateupdaters.base.method_choice]
+/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/magicnetwork_model_CODE/synapseKrnl.cc(23): error: name must be a namespace name
+
+1 error detected in the compilation of "/tmp/tmpxft_00006ab9_00000000-7_runner.cpp1.ii".
+/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/src/generateALL.cc: 258: cuda driver error 301: CUDA_ERROR_FILE_NOT_FOUND
+
+Traceback (most recent call last):
+  File "<string>", line 14, in <module>
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/brian2cuda/tests/features/speed.py", line 1075, in run
+    self.timed_run(self.duration)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/tests/features/base.py", line 63, in timed_run
+    brian2.run(duration, level=1)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/units/fundamentalunits.py", line 2428, in new_f
+    result = f(*args, **kwds)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/core/magic.py", line 371, in run
+    namespace=namespace, profile=profile, level=2+level)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/core/magic.py", line 231, in run
+    namespace=namespace, profile=profile, level=level+1)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/core/base.py", line 276, in device_override_decorated_function
+    return getattr(curdev, name)(*args, **kwds)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2genn/brian2genn/device.py", line 1206, in network_run
+    super(GeNNDevice, self).network_run(net=net, duration=duration, report=report, report_period=report_period, namespace=namespace, level=level+1)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/devices/cpp_standalone/device.py", line 1171, in network_run
+    self.build(direct_call=False, **self.build_options)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2genn/brian2genn/device.py", line 582, in build
+    returncode=ex.returncode)
+RuntimeError: Project compilation failed (Command ['genn-buildmodel.sh', 'magicnetwork_model.cpp'] failed with error code 50).
+See the output above (if any) for more details.
+
+
+
+
+TRACEBACK GeNN_optimized N=1000
+no stdout file found, cwd = /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNWorkspace/results/stdout.txt
+running brian code generation ...
+building genn executable ...
+ar -rcs /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/lib/libgenn.a /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/global.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/modelSpec.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/neuronModels.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/synapseModels.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/postSynapseModels.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/utils.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/stringUtils.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/sparseUtils.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/hr_time.o
+g++ -std=c++11 -DNVCC=\""/usr/local/cuda/bin/nvcc"\" -DMODEL=\"/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/magicnetwork_model.cpp\" -o /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/generateALL /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/src/generate*.cc -I"/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/include" -I"/usr/local/cuda/include" -L"/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/lib" -L"/usr/local/cuda/lib64" -lgenn -lcuda -lcudart
+call was ./generateALL /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace 
+optimizing block size...
+Global memory required for core model: 0.065004 MB. 
+6440894464 for device 0
+dry-run compile for device 0
+"/usr/local/cuda/bin/nvcc" -cubin -x cu -arch sm_35 -O3 -I"$GENN_PATH/lib/include" -o "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/runner.cubin" "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/magicnetwork_model_CODE/runner.cc"
+genn-buildmodel.sh:70: error 50: command failure
+
+/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/utils/logger.py:546: UserWarning: Could not copy script file to temp directory: [Errno 2] No such file or directory: '/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/-c'
+  'Could not copy script file to temp directory: %s' % ex)
+INFO       The synaptic equation for the variable A_pre does not specify whether it should be integrated at every timestep ("clock-driven") or only at spiking events ("event-driven"). It will be integrated at every timestep which can slow down your simulation unnecessarily if you only need the values of this variable whenever a spike occurs. Specify the equation as clock-driven explicitly to avoid this warning. [brian2.synapses.synapses.clock_driven]
+INFO       The synaptic equation for the variable A_post does not specify whether it should be integrated at every timestep ("clock-driven") or only at spiking events ("event-driven"). It will be integrated at every timestep which can slow down your simulation unnecessarily if you only need the values of this variable whenever a spike occurs. Specify the equation as clock-driven explicitly to avoid this warning. [brian2.synapses.synapses.clock_driven]
+WARNING    The selected device 'genn' only supports a fixed schedule, but this schedule is not consistent with the network's schedule. The simulation will use the device's schedule.
+Device schedule: ['start', 'synapses', 'groups', 'thresholds', 'resets', 'end']
+Network schedule: ['start', 'groups', 'thresholds', 'synapses', 'resets', 'end']
+Set the network schedule explicitly or set the core.network.default_schedule preference to avoid this warning. [brian2.core.network.schedule_conflict]
+INFO       No numerical integration method specified for group 'neurongroup', using method 'euler' (took 0.07s, trying other methods took 0.12s). [brian2.stateupdaters.base.method_choice]
+INFO       No numerical integration method specified for group 'synapses_2', using method 'linear' (took 0.16s). [brian2.stateupdaters.base.method_choice]
+/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/magicnetwork_model_CODE/synapseKrnl.cc(23): error: name must be a namespace name
+
+1 error detected in the compilation of "/tmp/tmpxft_00006ee5_00000000-7_runner.cpp1.ii".
+/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/src/generateALL.cc: 258: cuda driver error 301: CUDA_ERROR_FILE_NOT_FOUND
+
+Traceback (most recent call last):
+  File "<string>", line 14, in <module>
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/brian2cuda/tests/features/speed.py", line 1075, in run
+    self.timed_run(self.duration)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/tests/features/base.py", line 63, in timed_run
+    brian2.run(duration, level=1)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/units/fundamentalunits.py", line 2428, in new_f
+    result = f(*args, **kwds)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/core/magic.py", line 371, in run
+    namespace=namespace, profile=profile, level=2+level)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/core/magic.py", line 231, in run
+    namespace=namespace, profile=profile, level=level+1)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/core/base.py", line 276, in device_override_decorated_function
+    return getattr(curdev, name)(*args, **kwds)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2genn/brian2genn/device.py", line 1206, in network_run
+    super(GeNNDevice, self).network_run(net=net, duration=duration, report=report, report_period=report_period, namespace=namespace, level=level+1)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/devices/cpp_standalone/device.py", line 1171, in network_run
+    self.build(direct_call=False, **self.build_options)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2genn/brian2genn/device.py", line 582, in build
+    returncode=ex.returncode)
+RuntimeError: Project compilation failed (Command ['genn-buildmodel.sh', 'magicnetwork_model.cpp'] failed with error code 50).
+See the output above (if any) for more details.
+
+
+
+
+TRACEBACK GeNN_optimized N=10000
+no stdout file found, cwd = /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNWorkspace/results/stdout.txt
+running brian code generation ...
+building genn executable ...
+ar -rcs /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/lib/libgenn.a /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/global.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/modelSpec.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/neuronModels.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/synapseModels.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/postSynapseModels.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/utils.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/stringUtils.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/sparseUtils.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/hr_time.o
+g++ -std=c++11 -DNVCC=\""/usr/local/cuda/bin/nvcc"\" -DMODEL=\"/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/magicnetwork_model.cpp\" -o /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/generateALL /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/src/generate*.cc -I"/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/include" -I"/usr/local/cuda/include" -L"/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/lib" -L"/usr/local/cuda/lib64" -lgenn -lcuda -lcudart
+call was ./generateALL /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace 
+optimizing block size...
+Global memory required for core model: 0.650004 MB. 
+6440894464 for device 0
+dry-run compile for device 0
+"/usr/local/cuda/bin/nvcc" -cubin -x cu -arch sm_35 -O3 -I"$GENN_PATH/lib/include" -o "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/runner.cubin" "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/magicnetwork_model_CODE/runner.cc"
+genn-buildmodel.sh:70: error 50: command failure
+
+/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/utils/logger.py:546: UserWarning: Could not copy script file to temp directory: [Errno 2] No such file or directory: '/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/-c'
+  'Could not copy script file to temp directory: %s' % ex)
+INFO       The synaptic equation for the variable A_pre does not specify whether it should be integrated at every timestep ("clock-driven") or only at spiking events ("event-driven"). It will be integrated at every timestep which can slow down your simulation unnecessarily if you only need the values of this variable whenever a spike occurs. Specify the equation as clock-driven explicitly to avoid this warning. [brian2.synapses.synapses.clock_driven]
+INFO       The synaptic equation for the variable A_post does not specify whether it should be integrated at every timestep ("clock-driven") or only at spiking events ("event-driven"). It will be integrated at every timestep which can slow down your simulation unnecessarily if you only need the values of this variable whenever a spike occurs. Specify the equation as clock-driven explicitly to avoid this warning. [brian2.synapses.synapses.clock_driven]
+WARNING    The selected device 'genn' only supports a fixed schedule, but this schedule is not consistent with the network's schedule. The simulation will use the device's schedule.
+Device schedule: ['start', 'synapses', 'groups', 'thresholds', 'resets', 'end']
+Network schedule: ['start', 'groups', 'thresholds', 'synapses', 'resets', 'end']
+Set the network schedule explicitly or set the core.network.default_schedule preference to avoid this warning. [brian2.core.network.schedule_conflict]
+INFO       No numerical integration method specified for group 'neurongroup', using method 'euler' (took 0.06s, trying other methods took 0.14s). [brian2.stateupdaters.base.method_choice]
+INFO       No numerical integration method specified for group 'synapses_2', using method 'linear' (took 0.13s). [brian2.stateupdaters.base.method_choice]
+/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/magicnetwork_model_CODE/synapseKrnl.cc(23): error: name must be a namespace name
+
+1 error detected in the compilation of "/tmp/tmpxft_0000733c_00000000-7_runner.cpp1.ii".
+/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/src/generateALL.cc: 258: cuda driver error 301: CUDA_ERROR_FILE_NOT_FOUND
+
+Traceback (most recent call last):
+  File "<string>", line 14, in <module>
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/brian2cuda/tests/features/speed.py", line 1075, in run
+    self.timed_run(self.duration)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/tests/features/base.py", line 63, in timed_run
+    brian2.run(duration, level=1)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/units/fundamentalunits.py", line 2428, in new_f
+    result = f(*args, **kwds)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/core/magic.py", line 371, in run
+    namespace=namespace, profile=profile, level=2+level)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/core/magic.py", line 231, in run
+    namespace=namespace, profile=profile, level=level+1)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/core/base.py", line 276, in device_override_decorated_function
+    return getattr(curdev, name)(*args, **kwds)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2genn/brian2genn/device.py", line 1206, in network_run
+    super(GeNNDevice, self).network_run(net=net, duration=duration, report=report, report_period=report_period, namespace=namespace, level=level+1)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/devices/cpp_standalone/device.py", line 1171, in network_run
+    self.build(direct_call=False, **self.build_options)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2genn/brian2genn/device.py", line 582, in build
+    returncode=ex.returncode)
+RuntimeError: Project compilation failed (Command ['genn-buildmodel.sh', 'magicnetwork_model.cpp'] failed with error code 50).
+See the output above (if any) for more details.
+
+
+
+
+TRACEBACK GeNN_optimized N=20000
+no stdout file found, cwd = /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNWorkspace/results/stdout.txt
+running brian code generation ...
+building genn executable ...
+ar -rcs /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/lib/libgenn.a /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/global.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/modelSpec.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/neuronModels.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/synapseModels.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/postSynapseModels.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/utils.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/stringUtils.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/sparseUtils.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/hr_time.o
+g++ -std=c++11 -DNVCC=\""/usr/local/cuda/bin/nvcc"\" -DMODEL=\"/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/magicnetwork_model.cpp\" -o /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/generateALL /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/src/generate*.cc -I"/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/include" -I"/usr/local/cuda/include" -L"/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/lib" -L"/usr/local/cuda/lib64" -lgenn -lcuda -lcudart
+call was ./generateALL /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace 
+optimizing block size...
+Global memory required for core model: 1.3 MB. 
+6440894464 for device 0
+dry-run compile for device 0
+"/usr/local/cuda/bin/nvcc" -cubin -x cu -arch sm_35 -O3 -I"$GENN_PATH/lib/include" -o "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/runner.cubin" "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/magicnetwork_model_CODE/runner.cc"
+genn-buildmodel.sh:70: error 50: command failure
+
+/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/utils/logger.py:546: UserWarning: Could not copy script file to temp directory: [Errno 2] No such file or directory: '/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/-c'
+  'Could not copy script file to temp directory: %s' % ex)
+INFO       The synaptic equation for the variable A_pre does not specify whether it should be integrated at every timestep ("clock-driven") or only at spiking events ("event-driven"). It will be integrated at every timestep which can slow down your simulation unnecessarily if you only need the values of this variable whenever a spike occurs. Specify the equation as clock-driven explicitly to avoid this warning. [brian2.synapses.synapses.clock_driven]
+INFO       The synaptic equation for the variable A_post does not specify whether it should be integrated at every timestep ("clock-driven") or only at spiking events ("event-driven"). It will be integrated at every timestep which can slow down your simulation unnecessarily if you only need the values of this variable whenever a spike occurs. Specify the equation as clock-driven explicitly to avoid this warning. [brian2.synapses.synapses.clock_driven]
+WARNING    The selected device 'genn' only supports a fixed schedule, but this schedule is not consistent with the network's schedule. The simulation will use the device's schedule.
+Device schedule: ['start', 'synapses', 'groups', 'thresholds', 'resets', 'end']
+Network schedule: ['start', 'groups', 'thresholds', 'synapses', 'resets', 'end']
+Set the network schedule explicitly or set the core.network.default_schedule preference to avoid this warning. [brian2.core.network.schedule_conflict]
+INFO       No numerical integration method specified for group 'neurongroup', using method 'euler' (took 0.07s, trying other methods took 0.17s). [brian2.stateupdaters.base.method_choice]
+INFO       No numerical integration method specified for group 'synapses_2', using method 'linear' (took 0.14s). [brian2.stateupdaters.base.method_choice]
+/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/magicnetwork_model_CODE/synapseKrnl.cc(23): error: name must be a namespace name
+
+1 error detected in the compilation of "/tmp/tmpxft_0000782e_00000000-7_runner.cpp1.ii".
+/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/src/generateALL.cc: 258: cuda driver error 301: CUDA_ERROR_FILE_NOT_FOUND
+
+Traceback (most recent call last):
+  File "<string>", line 14, in <module>
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/brian2cuda/tests/features/speed.py", line 1075, in run
+    self.timed_run(self.duration)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/tests/features/base.py", line 63, in timed_run
+    brian2.run(duration, level=1)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/units/fundamentalunits.py", line 2428, in new_f
+    result = f(*args, **kwds)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/core/magic.py", line 371, in run
+    namespace=namespace, profile=profile, level=2+level)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/core/magic.py", line 231, in run
+    namespace=namespace, profile=profile, level=level+1)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/core/base.py", line 276, in device_override_decorated_function
+    return getattr(curdev, name)(*args, **kwds)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2genn/brian2genn/device.py", line 1206, in network_run
+    super(GeNNDevice, self).network_run(net=net, duration=duration, report=report, report_period=report_period, namespace=namespace, level=level+1)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/devices/cpp_standalone/device.py", line 1171, in network_run
+    self.build(direct_call=False, **self.build_options)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2genn/brian2genn/device.py", line 582, in build
+    returncode=ex.returncode)
+RuntimeError: Project compilation failed (Command ['genn-buildmodel.sh', 'magicnetwork_model.cpp'] failed with error code 50).
+See the output above (if any) for more details.
+
+
+
+
+TRACEBACK GeNN_optimized N=50000
+no stdout file found, cwd = /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNWorkspace/results/stdout.txt
+running brian code generation ...
+building genn executable ...
+ar -rcs /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/lib/libgenn.a /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/global.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/modelSpec.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/neuronModels.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/synapseModels.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/postSynapseModels.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/utils.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/stringUtils.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/sparseUtils.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/hr_time.o
+g++ -std=c++11 -DNVCC=\""/usr/local/cuda/bin/nvcc"\" -DMODEL=\"/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/magicnetwork_model.cpp\" -o /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/generateALL /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/src/generate*.cc -I"/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/include" -I"/usr/local/cuda/include" -L"/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/lib" -L"/usr/local/cuda/lib64" -lgenn -lcuda -lcudart
+call was ./generateALL /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace 
+optimizing block size...
+Global memory required for core model: 3.25 MB. 
+6440894464 for device 0
+dry-run compile for device 0
+"/usr/local/cuda/bin/nvcc" -cubin -x cu -arch sm_35 -O3 -I"$GENN_PATH/lib/include" -o "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/runner.cubin" "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/magicnetwork_model_CODE/runner.cc"
+genn-buildmodel.sh:70: error 50: command failure
+
+/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/utils/logger.py:546: UserWarning: Could not copy script file to temp directory: [Errno 2] No such file or directory: '/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/-c'
+  'Could not copy script file to temp directory: %s' % ex)
+INFO       The synaptic equation for the variable A_pre does not specify whether it should be integrated at every timestep ("clock-driven") or only at spiking events ("event-driven"). It will be integrated at every timestep which can slow down your simulation unnecessarily if you only need the values of this variable whenever a spike occurs. Specify the equation as clock-driven explicitly to avoid this warning. [brian2.synapses.synapses.clock_driven]
+INFO       The synaptic equation for the variable A_post does not specify whether it should be integrated at every timestep ("clock-driven") or only at spiking events ("event-driven"). It will be integrated at every timestep which can slow down your simulation unnecessarily if you only need the values of this variable whenever a spike occurs. Specify the equation as clock-driven explicitly to avoid this warning. [brian2.synapses.synapses.clock_driven]
+WARNING    The selected device 'genn' only supports a fixed schedule, but this schedule is not consistent with the network's schedule. The simulation will use the device's schedule.
+Device schedule: ['start', 'synapses', 'groups', 'thresholds', 'resets', 'end']
+Network schedule: ['start', 'groups', 'thresholds', 'synapses', 'resets', 'end']
+Set the network schedule explicitly or set the core.network.default_schedule preference to avoid this warning. [brian2.core.network.schedule_conflict]
+INFO       No numerical integration method specified for group 'neurongroup', using method 'euler' (took 0.06s, trying other methods took 0.12s). [brian2.stateupdaters.base.method_choice]
+INFO       No numerical integration method specified for group 'synapses_2', using method 'linear' (took 0.14s). [brian2.stateupdaters.base.method_choice]
+/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/magicnetwork_model_CODE/synapseKrnl.cc(23): error: name must be a namespace name
+
+1 error detected in the compilation of "/tmp/tmpxft_00000209_00000000-7_runner.cpp1.ii".
+/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/src/generateALL.cc: 258: cuda driver error 301: CUDA_ERROR_FILE_NOT_FOUND
+
+Traceback (most recent call last):
+  File "<string>", line 14, in <module>
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/brian2cuda/tests/features/speed.py", line 1075, in run
+    self.timed_run(self.duration)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/tests/features/base.py", line 63, in timed_run
+    brian2.run(duration, level=1)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/units/fundamentalunits.py", line 2428, in new_f
+    result = f(*args, **kwds)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/core/magic.py", line 371, in run
+    namespace=namespace, profile=profile, level=2+level)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/core/magic.py", line 231, in run
+    namespace=namespace, profile=profile, level=level+1)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/core/base.py", line 276, in device_override_decorated_function
+    return getattr(curdev, name)(*args, **kwds)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2genn/brian2genn/device.py", line 1206, in network_run
+    super(GeNNDevice, self).network_run(net=net, duration=duration, report=report, report_period=report_period, namespace=namespace, level=level+1)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/devices/cpp_standalone/device.py", line 1171, in network_run
+    self.build(direct_call=False, **self.build_options)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2genn/brian2genn/device.py", line 582, in build
+    returncode=ex.returncode)
+RuntimeError: Project compilation failed (Command ['genn-buildmodel.sh', 'magicnetwork_model.cpp'] failed with error code 50).
+See the output above (if any) for more details.
+
+
+
+
+TRACEBACK CUDA standalone N=100000
+INFO: setting cudaDevice stuff took 0.256535 seconds
+INFO connectivity matrix has size 7999692
+INFO connectivity matrix has size 32002251
+INFO connectivity matrix has size 160017025
+INFO connectivity matrix has size 32002251
+INFO calling kernel_neurongroup_stateupdater_codeobject with 131 blocks and 768 threads. Kernel needs 40 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000.
+ERROR launching kernel_neurongroup_stateupdater_codeobject in code_objects/neurongroup_stateupdater_codeobject.cu:1066 invalid argument
+
+('debug syn effect mdoe ', 'target')
+('debug syn effect mdoe ', 'target')
+('debug syn effect mdoe ', 'target')
+('debug syn effect mdoe ', 'synapse')
+INFO: setting cudaDevice stuff took 0.256535 seconds
+INFO connectivity matrix has size 7999692
+INFO connectivity matrix has size 32002251
+INFO connectivity matrix has size 160017025
+INFO connectivity matrix has size 32002251
+INFO calling kernel_neurongroup_stateupdater_codeobject with 131 blocks and 768 threads. Kernel needs 40 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000.
+ERROR launching kernel_neurongroup_stateupdater_codeobject in code_objects/neurongroup_stateupdater_codeobject.cu:1066 invalid argument
+
+
+/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/utils/logger.py:546: UserWarning: Could not copy script file to temp directory: [Errno 2] No such file or directory: '/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/-c'
+  'Could not copy script file to temp directory: %s' % ex)
+INFO       The synaptic equation for the variable A_pre does not specify whether it should be integrated at every timestep ("clock-driven") or only at spiking events ("event-driven"). It will be integrated at every timestep which can slow down your simulation unnecessarily if you only need the values of this variable whenever a spike occurs. Specify the equation as clock-driven explicitly to avoid this warning. [brian2.synapses.synapses.clock_driven]
+INFO       The synaptic equation for the variable A_post does not specify whether it should be integrated at every timestep ("clock-driven") or only at spiking events ("event-driven"). It will be integrated at every timestep which can slow down your simulation unnecessarily if you only need the values of this variable whenever a spike occurs. Specify the equation as clock-driven explicitly to avoid this warning. [brian2.synapses.synapses.clock_driven]
+INFO       No numerical integration method specified for group 'neurongroup', using method 'euler' (took 0.06s, trying other methods took 0.12s). [brian2.stateupdaters.base.method_choice]
+INFO       No numerical integration method specified for group 'synapses_2', using method 'linear' (took 0.19s). [brian2.stateupdaters.base.method_choice]
+
+Traceback (most recent call last):
+  File "<string>", line 21, in <module>
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/brian2cuda/tests/features/cuda_configuration.py", line 27, in after_run
+    with_output=False)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/brian2cuda/device.py", line 778, in build
+    self.run(directory, with_output, run_args)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/devices/cpp_standalone/device.py", line 864, in run
+    "%s)" % os.path.abspath(directory))
+RuntimeError: Project run failed (project directory: /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/cuda_standalone/cuda_standalone)
+
+
+
+
+TRACEBACK GeNN_optimized N=100000
+no stdout file found, cwd = /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNWorkspace/results/stdout.txt
+running brian code generation ...
+building genn executable ...
+ar -rcs /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/lib/libgenn.a /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/global.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/modelSpec.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/neuronModels.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/synapseModels.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/postSynapseModels.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/utils.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/stringUtils.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/sparseUtils.o /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/obj/hr_time.o
+g++ -std=c++11 -DNVCC=\""/usr/local/cuda/bin/nvcc"\" -DMODEL=\"/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/magicnetwork_model.cpp\" -o /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/generateALL /home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/src/generate*.cc -I"/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/include" -I"/usr/local/cuda/include" -L"/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/lib" -L"/usr/local/cuda/lib64" -lgenn -lcuda -lcudart
+call was ./generateALL /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace 
+optimizing block size...
+Global memory required for core model: 6.5 MB. 
+6440894464 for device 0
+dry-run compile for device 0
+"/usr/local/cuda/bin/nvcc" -cubin -x cu -arch sm_35 -O3 -I"$GENN_PATH/lib/include" -o "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/runner.cubin" "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/magicnetwork_model_CODE/runner.cc"
+genn-buildmodel.sh:70: error 50: command failure
+
+/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/utils/logger.py:546: UserWarning: Could not copy script file to temp directory: [Errno 2] No such file or directory: '/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/-c'
+  'Could not copy script file to temp directory: %s' % ex)
+INFO       The synaptic equation for the variable A_pre does not specify whether it should be integrated at every timestep ("clock-driven") or only at spiking events ("event-driven"). It will be integrated at every timestep which can slow down your simulation unnecessarily if you only need the values of this variable whenever a spike occurs. Specify the equation as clock-driven explicitly to avoid this warning. [brian2.synapses.synapses.clock_driven]
+INFO       The synaptic equation for the variable A_post does not specify whether it should be integrated at every timestep ("clock-driven") or only at spiking events ("event-driven"). It will be integrated at every timestep which can slow down your simulation unnecessarily if you only need the values of this variable whenever a spike occurs. Specify the equation as clock-driven explicitly to avoid this warning. [brian2.synapses.synapses.clock_driven]
+WARNING    The selected device 'genn' only supports a fixed schedule, but this schedule is not consistent with the network's schedule. The simulation will use the device's schedule.
+Device schedule: ['start', 'synapses', 'groups', 'thresholds', 'resets', 'end']
+Network schedule: ['start', 'groups', 'thresholds', 'synapses', 'resets', 'end']
+Set the network schedule explicitly or set the core.network.default_schedule preference to avoid this warning. [brian2.core.network.schedule_conflict]
+INFO       No numerical integration method specified for group 'neurongroup', using method 'euler' (took 0.06s, trying other methods took 0.11s). [brian2.stateupdaters.base.method_choice]
+INFO       No numerical integration method specified for group 'synapses_2', using method 'linear' (took 0.20s). [brian2.stateupdaters.base.method_choice]
+/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/GeNNworkspace/magicnetwork_model_CODE/synapseKrnl.cc(23): error: name must be a namespace name
+
+1 error detected in the compilation of "/tmp/tmpxft_00001531_00000000-7_runner.cpp1.ii".
+/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/genn/lib/src/generateALL.cc: 258: cuda driver error 301: CUDA_ERROR_FILE_NOT_FOUND
+
+Traceback (most recent call last):
+  File "<string>", line 14, in <module>
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/brian2cuda/tests/features/speed.py", line 1075, in run
+    self.timed_run(self.duration)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/tests/features/base.py", line 63, in timed_run
+    brian2.run(duration, level=1)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/units/fundamentalunits.py", line 2428, in new_f
+    result = f(*args, **kwds)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/core/magic.py", line 371, in run
+    namespace=namespace, profile=profile, level=2+leINFO: setting cudaDevice stuff took 0.170318 seconds
+INFO connectivity matrix has size 812
+INFO connectivity matrix has size 3241
+INFO connectivity matrix has size 16057
+INFO connectivity matrix has size 3241
+INFO calling kernel_neurongroup_stateupdater_codeobject with 2 blocks and 768 threads. Kernel needs 40 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000.
+INFO calling kernel_synapses_2_stateupdater_codeobject with 5 blocks and 768 threads. Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000.
+INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 11 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_synapses_1_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_synapses_2_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 31 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_synapses_2_post_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_neurongroup_resetter_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+Number of synapses: 3241
+Number of synapses: 3241
+Number of synapses: 16057
+Number of synapses: 812
+INFO: main_lines took 8.665926 seconds
+INFO: main function took 8.864357 seconds
+vel)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/core/magic.py", line 231, in run
+    namespace=namespace, profile=profile, level=level+1)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/core/base.py", line 276, in device_override_decorated_function
+    return getattr(curdev, name)(*args, **kwds)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2genn/brian2genn/device.py", line 1206, in network_run
+    super(GeNNDevice, self).network_run(net=net, duration=duration, report=report, report_period=report_period, namespace=namespace, level=level+1)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/devices/cpp_standalone/device.py", line 1171, in network_run
+    self.build(direct_call=False, **self.build_options)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2genn/brian2genn/device.py", line 582, in build
+    returncode=ex.returncode)
+RuntimeError: Project compilation failed (Command ['genn-buildmodel.sh', 'magicnetwork_model.cpp'] failed with error code 50).
+See the output above (if any) for more details.
+
+
+
+Running VogelsWithSynapticDynamic took 1:02:13.
+INFO relative performance for Full examples: Vogels et al 2011 (not event-driven synapses) N=10 CUDA standalone: nan
+INFO relative performance for Full examples: Vogels et al 2011 (not event-driven synapses) N=100 CUDA standalone: 1.0
+INFO relative performance for Full examples: Vogels et al 2011 (not event-driven synapses) N=1000 CUDA standalone: 1.0
+INFO relative performance for Full examples: Vogels et al 2011 (not event-driven synapses) N=10000 CUDA standalone: 1.0
+INFO relative performance for Full examples: Vogels et al 2011 (not event-driven synapses) N=20000 CUDA standalone: 1.0
+INFO relative performance for Full examples: Vogels et al 2011 (not event-driven synapses) N=50000 CUDA standalone: 1.0
+INFO relative performance for Full examples: Vogels et al 2011 (not event-driven synapses) N=100000 CUDA standalone: nan
+INFO relative performance for Full examples: Vogels et al 2011 (not event-driven synapses) N=10 C++ standalone: nan
+INFO relative performance for Full examples: Vogels et al 2011 (not event-driven synapses) N=100 C++ standalone: 8.86897457548
+INFO relative performance for Full examples: Vogels et al 2011 (not event-driven synapses) N=1000 C++ standalone: 4.54459720371
+INFO relative performance for Full examples: Vogels et al 2011 (not event-driven synapses) N=10000 C++ standalone: 1.30294701267
+INFO relative performance for Full examples: Vogels et al 2011 (not event-driven synapses) N=20000 C++ standalone: 0.533376382918
+INFO relative performance for Full examples: Vogels et al 2011 (not event-driven synapses) N=50000 C++ standalone: 0.285758703759
+INFO relative performance for Full examples: Vogels et al 2011 (not event-driven synapses) N=100000 C++ standalone: nan
+Rerunning CUDAStandaloneConfiguration with n = 1000 for nvprof profiling
+cd cuda_standalone && nvprof --profile-from-start-off --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_VogelsWithSynapticDynamic_CUDAStandaloneConfiguration_1000.log ./main 
+Profiling took 0:00:15 for runtime of 0.894734
+Rerunning GeNNConfigurationOptimized with n = 1000 for nvprof profiling
+Starting CUBAFixedConnectivity on 07.04.2017 at 00:20:43.
+Running speed tests
+Configurations: CUDA standalone, C++ standalone, GeNN_optimized
+Full examples: CUBA fixed connectivity:  n=10 [...] n=100 [...] n=1000 [...] n=10000 [...] n=100000 [...] n=1000000 [E..]
+
+TRACEBACK CUDA standalone N=1000000
+INFO: setting cudaDevice stuff took 0.269366 seconds
+INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject with 977 blocks and 1024 threads. Kernel needs 8 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory
+INFO connectivity matrix has size 15997784
+INFO connectivity matrix has size 63997779
+INFO calling kernel_neurongroup_stateupdater_codeobject with 1303 blocks and 768 threads. Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000.
+ERROR launching kernel_neurongroup_stateupdater_codeobject in code_objects/neurongroup_stateupdater_codeobject.cu:1101 invalid argument
+
+('debug syn effect mdoe ', 'target')
+('debug syn effect mdoe ', 'target')
+INFO: setting cudaDevice stuff took 0.269366 seconds
+INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject with 977 blocks and 1024 threads. Kernel needs 8 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory
+INFO connectivity matrix has size 15997784
+INFO connectivity matrix has size 63997779
+INFO calling kernel_neurongroup_stateupdater_codeobject with 1303 blocks and 768 threads. Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000.
+ERROR launching kernel_neurongroup_stateupdater_codeobject in code_objects/neurongroup_stateupdater_codeobject.cu:1101 invalid argument
+
+
+/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/utils/logger.py:546: UserWarning: Could not copy script file to temp directory: [Errno 2] No such file or directory: '/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/-c'
+  'Could not copy script file to temp directory: %s' % ex)
+INFO       No numerical integration method specified for group 'neurongroup', using method 'linear' (took 1.52s). [brian2.stateupdaters.base.method_choice]
+
+Traceback (most recent call last):
+  File "<string>", line 21, in <module>
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/brian2cuda/tests/features/cuda_configuration.py", line 27, in after_run
+    with_output=False)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/brian2cuda/device.py", line 778, in build
+    self.run(directory, with_output, run_args)
+  File "/mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/frozen_repos/brian2/brian2/devices/cpp_standalone/device.py", line 864, in run
+    "%s)" % os.path.abspath(directory))
+RuntimeError: Project run failed (project directory: /mnt/antares_raid/home/denisalevi/projects/dev_brian2cuda/brian2cuda_repo/dev/benchmarks/cuda_standalone/cuda_standalone)
+
+
+
+Running CUBAFixedConnectivity took 2:08:42.
+INFO relative performance for Full examples: CUBA fixed connectivity N=10 CUDA standalone: 1.0
+INFO relative performance for Full examples: CUBA fixed connectivity N=100 CUDA standalone: 1.0
+INFO relative performance for Full examples: CUBA fixed connectivity N=1000 CUDA standalone: 1.0
+INFO relative performance for Full examples: CUBA fixed connectivity N=10000 CUDA standalone: 1.0
+INFO relative performance for Full examples: CUBA fixed connectivity N=100000 CUDA standalone: 1.0
+INFO relative performance for Full examples: CUBA fixed connectivity N=1000000 CUDA standalone: nan
+INFO relative performance for Full examples: CUBA fixed connectivity N=10 C++ standalone: 10.4753271459
+INFO relative performance for Full examples: CUBA fixed connectivity N=100 C++ standalone: 9.11815288259
+INFO relative performance for Full examples: INFO: setting cudaDevice stuff took 0.159560 seconds
+INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject with 1 blocks and 1024 threads. Kernel needs 8 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory
+INFO connectivity matrix has size 15885
+INFO connectivity matrix has size 63887
+INFO calling kernel_neurongroup_stateupdater_codeobject with 2 blocks and 768 threads. Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000.
+INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 11 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_spikemonitor_codeobject with 1 blocks and 1 threads. Kernel needs 37 registers per block, 0 bytes of statically-allocated shared memory per block, 16 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.000000.
+INFO calling kernel_synapses_1_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_neurongroup_resetter_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+Number of synapses: 63887
+Number of synapses: 15885
+INFO: main_lines took 6.371682 seconds
+Number of spikes: 5793
+INFO: main function took 6.564151 seconds
+mkdir: cannot create directory ‘test_output’: File exists
+# DT 0.000100 
+# totalTime 1.000000 
+# We are running with fixed time step 0.000100 
+0.9999 done ...
+everything finished.
+CUBA fixed connectivity N=1000 C++ standalone: 4.59531396439
+INFO relative performance for Full examples: CUBA fixed connectivity N=10000 C++ standalone: 1.31468051433
+INFO relative performance for Full examples: CUBA fixed connectivity N=100000 C++ standalone: 1.18548470084
+INFO relative performance for Full examples: CUBA fixed connectivity N=1000000 C++ standalone: nan
+INFO relative performance for Full examples: CUBA fixed connectivity N=10 GeNN_optimized: 1.11042839814
+INFO relative performance for Full examples: CUBA fixed connectivity N=100 GeNN_optimized: 1.04833449905
+INFO relative performance for Full examples: CUBA fixed connectivity N=1000 GeNN_optimized: 1.04557236752
+INFO relative performance for Full examples: CUBA fixed connectivity N=10000 GeNN_optimized: 1.50988461982
+INFO relative performance for Full examples: CUBA fixed connectivity N=100000 GeNN_optimized: 1.45615031203
+INFO relative performance for Full examples: CUBA fixed connectivity N=1000000 GeNN_optimized: nan
+Rerunning CUDAStandaloneConfiguration with n = 1000 for nvprof profiling
+cd cuda_standalone && nvprof --profile-from-start-off --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_CUBAFixedConnectivity_CUDAStandaloneConfiguration_1000.log ./main 
+Profiling took 0:00:12 for runtime of 0.436839
+Rerunning GeNNConfigurationOptimized with n = 1000 for nvprof profiling
+cd GeNNworkspace && nvprof  --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_CUBAFixedConnectivity_GeNNConfigurationOptimized_1000.log ./main test 1.0 1
+Profiling took 0:00:03 for runtime of 0.414081
+Starting COBAHHFixedConnectivity on 07.04.2017 at 02:30:45.
+Running speed tests
+Configurations: CUDA standalone, C++ standalone, GeNN_optimized
+Full examples: COBAHH fixed connectivity:  n=100 [...] n=500 [...] n=1000 [...] n=5000 [...] n=10000 [...] n=50000 [...] n=100000 [...] n=500000 [...]INFO: setting cudaDevice stuff took 0.332072 seconds
+INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory
+INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject_1 with 1 blocks and 1024 threads. Kernel needs 7 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory
+INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject_2 with 1 blocks and 1024 threads. Kernel needs 7 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory
+INFO connectivity matrix has size 15964
+INFO connectivity matrix has size 64222
+INFO calling kernel_neurongroup_stateupdater_codeobject with 2 blocks and 512 threads. Kernel needs 109 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.250000.
+INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 11 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_spikemonitor_codeobject with 1 blocks and 1 threads. Kernel needs 37 registers per block, 0 bytes of statically-allocated shared memory per block, 16 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.000000.
+INFO calling kernel_synapses_1_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+Number of synapses: 15964
+Number of synapses: 64222
+INFO: main_lines took 6.054959 seconds
+Number of spikes: 37082
+INFO: main function took 6.485835 seconds
+mkdir: cannot create directory ‘test_output’: File exists
+# DT 0.000100 
+# totalTime 1.000000 
+# We are running with fixed time step 0.000100 
+0.9999 done ...
+everything finished.
+
+Running COBAHHFixedConnectivity took 2:02:00.
+INFO relative performance for Full examples: COBAHH fixed connectivity N=100 CUDA standalone: 1.0
+INFO relative performance for Full examples: COBAHH fixed connectivity N=500 CUDA standalone: 1.0
+INFO relative performance for Full examples: COBAHH fixed connectivity N=1000 CUDA standalone: 1.0
+INFO relative performance for Full examples: COBAHH fixed connectivity N=5000 CUDA standalone: 1.0
+INFO relative performance for Full examples: COBAHH fixed connectivity N=10000 CUDA standalone: 1.0
+INFO relative performance for Full examples: COBAHH fixed connectivity N=50000 CUDA standalone: 1.0
+INFO relative performance for Full examples: COBAHH fixed connectivity N=100000 CUDA standalone: 1.0
+INFO relative performance for Full examples: COBAHH fixed connectivity N=500000 CUDA standalone: 1.0
+INFO relative performance for Full examples: COBAHH fixed connectivity N=100 C++ standalone: 1.08329904788
+INFO relative performance for Full examples: COBAHH fixed connectivity N=500 C++ standalone: 0.396811267606
+INFO relative performance for Full examples: COBAHH fixed connectivity N=1000 C++ standalone: 0.263636826134
+INFO relative performance for Full examples: COBAHH fixed connectivity N=5000 C++ standalone: 0.156838480711
+INFO relative performance for Full examples: COBAHH fixed connectivity N=10000 C++ standalone: 0.144342764605
+INFO relative performance for Full examples: COBAHH fixed connectivity N=50000 C++ standalone: 0.124221429972
+INFO relative performance for Full examples: COBAHH fixed connectivity N=100000 C++ standalone: 0.125880488867
+INFO relative performance for Full examples: COBAHH fixed connectivity N=500000 C++ standalone: 1.77958110994
+INFO relative performance for Full examples: COBAHH fixed connectivity N=100 GeNN_optimized: 0.849264812988
+INFO relative performance for Full examples: COBAHH fixed connectivity N=500 GeNN_optimized: 1.14449843751
+INFO relative performance for Full examples: COBAHH fixed connectivity N=1000 GeNN_optimized: 1.29478794328
+INFO relative performance for Full examples: COBAHH fixed connectivity N=5000 GeNN_optimized: 2.10583634597
+INFO relative performance for Full examples: COBAHH fixed connectivity N=10000 GeNN_optimized: 2.64633239801
+INFO relative performance for Full examples: COBAHH fixed connectivity N=50000 GeNN_optimized: 2.23178989574
+INFO relative performance for Full examples: COBAHH fixed connectivity N=100000 GeNN_optimized: 1.21355706552
+INFO relative performance for Full examples: COBAHH fixed connectivity N=500000 GeNN_optimized: 3.68221005988
+Rerunning CUDAStandaloneConfiguration with n = 1000 for nvprof profiling
+cd cuda_standalone && nvprof --profile-from-start-off --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_COBAHHFixedConnectivity_CUDAStandaloneConfiguration_1000.log ./main 
+Profiling took 0:00:11 for runtime of 0.881464
+Rerunning GeNNConfigurationOptimized with n = 1000 for nvprof profiling
+cd GeNNworkspace && nvprof  --log-file ../results_2017_04_05_complete_after_talk/nvprof/nvprof_COBAHHFixedConnectivity_GeNNConfigurationOptimized_1000.log ./main test 1.0 1
+Profiling took 0:00:04 for runtime of 0.6583
+
+Summarized speed test results in results_2017_04_05_complete_after_talk/README.md
+Finished speed test on 07.04.2017 at 04:34:29. Total time = 1 day, 6:30:58.
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_AdaptationOscillation_cpp_standalone_100000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_AdaptationOscillation_cpp_standalone_100000.txt
new file mode 100644
index 00000000..e0192f4f
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_AdaptationOscillation_cpp_standalone_100000.txt
@@ -0,0 +1 @@
+Number of synapses: 500007644
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_AdaptationOscillation_cuda_standalone_100000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_AdaptationOscillation_cuda_standalone_100000.txt
new file mode 100644
index 00000000..dfcb71f6
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_AdaptationOscillation_cuda_standalone_100000.txt
@@ -0,0 +1,3 @@
+INFO: setting cudaDevice stuff took 0.311282 seconds
+INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject with 98 blocks and 1024 threads. Kernel needs 8 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory
+INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject_1 with 98 blocks and 1024 threads. Kernel needs 8 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_BrunelHakimModelHeterogeneousDelay_cpp_standalone_100000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_BrunelHakimModelHeterogeneousDelay_cpp_standalone_100000.txt
new file mode 100644
index 00000000..074364c4
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_BrunelHakimModelHeterogeneousDelay_cpp_standalone_100000.txt
@@ -0,0 +1 @@
+Number of synapses: 100003108
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_BrunelHakimModelHeterogeneousDelay_cuda_standalone_100000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_BrunelHakimModelHeterogeneousDelay_cuda_standalone_100000.txt
new file mode 100644
index 00000000..1be9a968
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_BrunelHakimModelHeterogeneousDelay_cuda_standalone_100000.txt
@@ -0,0 +1,12 @@
+INFO: setting cudaDevice stuff took 0.094517 seconds
+INFO calling kernel_synapses_group_variable_set_conditional_codeobject with 97661 blocks and 1024 threads. Kernel needs 6 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory
+INFO connectivity matrix has size 100004409
+INFO not enough memory available to generate 247808 random numbers for neurongroup_stateupdater_codeobject, reducing the buffer size
+INFO not enough memory available to generate 247808 random numbers for neurongroup_stateupdater_codeobject, reducing the buffer size
+INFO not enough memory available to generate 247808 random numbers for neurongroup_stateupdater_codeobject, reducing the buffer size
+INFO not enough memory available to generate 247808 random numbers for neurongroup_stateupdater_codeobject, reducing the buffer size
+INFO not enough memory available to generate 247808 random numbers for neurongroup_stateupdater_codeobject, reducing the buffer size
+INFO not enough memory available to generate 247808 random numbers for neurongroup_stateupdater_codeobject, reducing the buffer size
+INFO generating 204687 randn every 131 clock cycles for neurongroup_stateupdater_codeobject
+INFO calling kernel_neurongroup_stateupdater_codeobject with 174 blocks and 576 threads. Kernel needs 52 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.562500.
+ERROR launching kernel_neurongroup_stateupdater_codeobject in code_objects/neurongroup_stateupdater_codeobject.cu:1051 out of memory
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_BrunelHakimModelScalarDelayNoMultiPrePost_cpp_standalone_250000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_BrunelHakimModelScalarDelayNoMultiPrePost_cpp_standalone_250000.txt
new file mode 100644
index 00000000..797f590d
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_BrunelHakimModelScalarDelayNoMultiPrePost_cpp_standalone_250000.txt
@@ -0,0 +1 @@
+Number of synapses: 250027209
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_BrunelHakimModelScalarDelayNoMultiPrePost_cuda_standalone_250000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_BrunelHakimModelScalarDelayNoMultiPrePost_cuda_standalone_250000.txt
new file mode 100644
index 00000000..03cbcd84
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_BrunelHakimModelScalarDelayNoMultiPrePost_cuda_standalone_250000.txt
@@ -0,0 +1,10 @@
+INFO: setting cudaDevice stuff took 0.282011 seconds
+INFO connectivity matrix has size 249991134
+INFO generating 13000000 randn every 52 clock cycles for neurongroup_stateupdater_codeobject
+INFO calling kernel_neurongroup_stateupdater_codeobject with 435 blocks and 576 threads. Kernel needs 52 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.562500.
+INFO calling kernel_neurongroup_thresholder_codeobject with 245 blocks and 1024 threads. Kernel needs 11 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_neurongroup_resetter_codeobject with 245 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+Number of synapses: 249991134
+INFO: main_lines took 851.693748 seconds
+INFO: main function took 860.695222 seconds
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_BrunelHakimModelScalarDelay_cpp_standalone_250000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_BrunelHakimModelScalarDelay_cpp_standalone_250000.txt
new file mode 100644
index 00000000..15884f65
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_BrunelHakimModelScalarDelay_cpp_standalone_250000.txt
@@ -0,0 +1 @@
+Number of synapses: 250014804
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_BrunelHakimModelScalarDelay_cuda_standalone_250000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_BrunelHakimModelScalarDelay_cuda_standalone_250000.txt
new file mode 100644
index 00000000..ae002382
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_BrunelHakimModelScalarDelay_cuda_standalone_250000.txt
@@ -0,0 +1,10 @@
+INFO: setting cudaDevice stuff took 0.232506 seconds
+INFO connectivity matrix has size 249995860
+INFO generating 13000000 randn every 52 clock cycles for neurongroup_stateupdater_codeobject
+INFO calling kernel_neurongroup_stateupdater_codeobject with 435 blocks and 576 threads. Kernel needs 52 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.562500.
+INFO calling kernel_neurongroup_thresholder_codeobject with 245 blocks and 1024 threads. Kernel needs 11 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_neurongroup_resetter_codeobject with 245 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+Number of synapses: 249995860
+INFO: main_lines took 871.342261 seconds
+INFO: main function took 880.904159 seconds
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_COBAHHFixedConnectivity_cpp_standalone_500000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_COBAHHFixedConnectivity_cpp_standalone_500000.txt
new file mode 100644
index 00000000..99f726d6
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_COBAHHFixedConnectivity_cpp_standalone_500000.txt
@@ -0,0 +1,3 @@
+Number of spikes: 18384816
+Number of synapses: 7999481
+Number of synapses: 32003913
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_COBAHHFixedConnectivity_cuda_standalone_500000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_COBAHHFixedConnectivity_cuda_standalone_500000.txt
new file mode 100644
index 00000000..e62943fc
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_COBAHHFixedConnectivity_cuda_standalone_500000.txt
@@ -0,0 +1,20 @@
+INFO: setting cudaDevice stuff took 0.273473 seconds
+INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject with 489 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory
+INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject_1 with 489 blocks and 1024 threads. Kernel needs 7 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory
+INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject_2 with 489 blocks and 1024 threads. Kernel needs 7 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory
+INFO connectivity matrix has size 7997964
+INFO connectivity matrix has size 31986009
+INFO calling kernel_neurongroup_stateupdater_codeobject with 977 blocks and 512 threads. Kernel needs 109 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.250000.
+INFO calling kernel_neurongroup_thresholder_codeobject with 489 blocks and 1024 threads. Kernel needs 11 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_spikemonitor_codeobject with 1 blocks and 1 threads. Kernel needs 37 registers per block, 0 bytes of statically-allocated shared memory per block, 16 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.000000.
+INFO calling kernel_synapses_1_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+...
+ERROR while allocating 33554428 bytes in cudaVector.h/reserve()
+ERROR while allocating 67108856 bytes in cudaVector.h/reserve()
+...
+Number of synapses: 7997964
+Number of synapses: 31986009
+INFO: main_lines took 3155.450103 seconds
+Number of spikes: 4194303
+INFO: main function took 3164.489584 seconds
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_COBAHH_cpp_standalone_100000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_COBAHH_cpp_standalone_100000.txt
new file mode 100644
index 00000000..057aa12b
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_COBAHH_cpp_standalone_100000.txt
@@ -0,0 +1,2 @@
+Number of synapses: 40003665
+Number of synapses: 160004814
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_COBAHH_cuda_standalone_100000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_COBAHH_cuda_standalone_100000.txt
new file mode 100644
index 00000000..2c98a46b
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_COBAHH_cuda_standalone_100000.txt
@@ -0,0 +1,14 @@
+INFO: setting cudaDevice stuff took 0.293608 seconds
+INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject with 98 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory
+INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject_1 with 98 blocks and 1024 threads. Kernel needs 7 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory
+INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject_2 with 98 blocks and 1024 threads. Kernel needs 7 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory
+INFO connectivity matrix has size 40000194
+INFO connectivity matrix has size 159996627
+INFO calling kernel_neurongroup_stateupdater_codeobject with 196 blocks and 512 threads. Kernel needs 109 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.250000.
+INFO calling kernel_neurongroup_thresholder_codeobject with 98 blocks and 1024 threads. Kernel needs 11 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_synapses_1_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+Number of synapses: 40000194
+Number of synapses: 159996627
+INFO: main_lines took 362.221271 seconds
+INFO: main function took 368.960768 seconds
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_CUBAFixedConnectivity_cpp_standalone_1000000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_CUBAFixedConnectivity_cpp_standalone_1000000.txt
new file mode 100644
index 00000000..3d0c9a4f
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_CUBAFixedConnectivity_cpp_standalone_1000000.txt
@@ -0,0 +1,3 @@
+Number of spikes: 5834425
+Number of synapses: 63999971
+Number of synapses: 16002058
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_CUBAFixedConnectivity_cuda_standalone_1000000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_CUBAFixedConnectivity_cuda_standalone_1000000.txt
new file mode 100644
index 00000000..487ad897
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_CUBAFixedConnectivity_cuda_standalone_1000000.txt
@@ -0,0 +1,6 @@
+INFO: setting cudaDevice stuff took 0.269366 seconds
+INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject with 977 blocks and 1024 threads. Kernel needs 8 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory
+INFO connectivity matrix has size 15997784
+INFO connectivity matrix has size 63997779
+INFO calling kernel_neurongroup_stateupdater_codeobject with 1303 blocks and 768 threads. Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000.
+ERROR launching kernel_neurongroup_stateupdater_codeobject in code_objects/neurongroup_stateupdater_codeobject.cu:1101 invalid argument
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_CUBA_cpp_standalone_1000000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_CUBA_cpp_standalone_1000000.txt
new file mode 100644
index 00000000..39e2b088
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_CUBA_cpp_standalone_1000000.txt
@@ -0,0 +1,2 @@
+Number of synapses: 63985232
+Number of synapses: 16002041
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_CUBA_cuda_standalone_1000000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_CUBA_cuda_standalone_1000000.txt
new file mode 100644
index 00000000..7ce8ad87
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_CUBA_cuda_standalone_1000000.txt
@@ -0,0 +1,6 @@
+INFO: setting cudaDevice stuff took 0.265023 seconds
+INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject with 977 blocks and 1024 threads. Kernel needs 8 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory
+INFO connectivity matrix has size 15999736
+INFO connectivity matrix has size 64013467
+INFO calling kernel_neurongroup_stateupdater_codeobject with 1303 blocks and 768 threads. Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000.
+ERROR launching kernel_neurongroup_stateupdater_codeobject in code_objects/neurongroup_stateupdater_codeobject.cu:1101 invalid argument
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_DenseMediumRateSynapsesOnly_cpp_standalone_500000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_DenseMediumRateSynapsesOnly_cpp_standalone_500000.txt
new file mode 100644
index 00000000..d304292b
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_DenseMediumRateSynapsesOnly_cpp_standalone_500000.txt
@@ -0,0 +1 @@
+Number of synapses: 250000000
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_DenseMediumRateSynapsesOnly_cuda_standalone_500000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_DenseMediumRateSynapsesOnly_cuda_standalone_500000.txt
new file mode 100644
index 00000000..8209f380
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_DenseMediumRateSynapsesOnly_cuda_standalone_500000.txt
@@ -0,0 +1,4 @@
+INFO: setting cudaDevice stuff took 0.113374 seconds
+INFO connectivity matrix has size 250000000
+INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+ERROR launching kernel_neurongroup_thresholder_codeobject in code_objects/neurongroup_thresholder_codeobject.cu:1008 invalid argument
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_HHNeuronsOnly_cpp_standalone_1000000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_HHNeuronsOnly_cpp_standalone_1000000.txt
new file mode 100644
index 00000000..e69de29b
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_HHNeuronsOnly_cuda_standalone_1000000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_HHNeuronsOnly_cuda_standalone_1000000.txt
new file mode 100644
index 00000000..0de01ad8
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_HHNeuronsOnly_cuda_standalone_1000000.txt
@@ -0,0 +1,6 @@
+INFO: setting cudaDevice stuff took 0.135925 seconds
+INFO calling kernel_neurongroup_group_variable_set_conditional_codeobject with 977 blocks and 1024 threads. Kernel needs 7 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 0 bytes of user-allocated constant memory
+INFO calling kernel_neurongroup_stateupdater_codeobject with 977 blocks and 1024 threads. Kernel needs 62 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 0 bytes of user-allocated constant memory. Theoretical occupancy is 0.500000.
+INFO calling kernel_neurongroup_thresholder_codeobject with 977 blocks and 1024 threads. Kernel needs 11 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 0 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO: main_lines took 10.586006 seconds
+INFO: main function took 10.832985 seconds
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_LinearNeuronsOnly_cpp_standalone_10000000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_LinearNeuronsOnly_cpp_standalone_10000000.txt
new file mode 100644
index 00000000..e69de29b
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_LinearNeuronsOnly_cuda_standalone_10000000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_LinearNeuronsOnly_cuda_standalone_10000000.txt
new file mode 100644
index 00000000..3d020cad
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_LinearNeuronsOnly_cuda_standalone_10000000.txt
@@ -0,0 +1,4 @@
+INFO: setting cudaDevice stuff took 0.274551 seconds
+INFO calling kernel_neurongroup_stateupdater_codeobject with 9766 blocks and 1024 threads. Kernel needs 12 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 0 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO: main_lines took 68.949131 seconds
+INFO: main function took 69.505944 seconds
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPEventDriven_cpp_standalone_5000000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPEventDriven_cpp_standalone_5000000.txt
new file mode 100644
index 00000000..0d502595
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPEventDriven_cpp_standalone_5000000.txt
@@ -0,0 +1,2 @@
+Number of synapses: 5000000
+Number of synapses: 5000000
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPEventDriven_cuda_standalone_5000000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPEventDriven_cuda_standalone_5000000.txt
new file mode 100644
index 00000000..5b160ed4
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPEventDriven_cuda_standalone_5000000.txt
@@ -0,0 +1,15 @@
+INFO: setting cudaDevice stuff took 0.241168 seconds
+INFO calling kernel_synapses_group_variable_set_conditional_codeobject with 4883 blocks and 1024 threads. Kernel needs 6 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory
+INFO connectivity matrix has size 5000000
+INFO connectivity matrix has size 5000000
+INFO generating 10000000 rand every 2 clock cycles for poissongroup_thresholder_codeobject
+INFO calling kernel_neurongroup_stateupdater_codeobject with 1 blocks and 768 threads. Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000.
+INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 9 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_poissongroup_thresholder_codeobject with 4883 blocks and 1024 threads. Kernel needs 15 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 42 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.500000.
+INFO calling kernel_synapses_post_codeobject with 15 blocks and 1024 threads. Kernel needs 34 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.500000.
+INFO calling kernel_neurongroup_resetter_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+Number of synapses: 5000000
+Number of synapses: 5000000
+INFO: main_lines took 382.905001 seconds
+INFO: main function took 383.607174 seconds
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPMultiPostNeuronalTraces_cpp_standalone_1000000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPMultiPostNeuronalTraces_cpp_standalone_1000000.txt
new file mode 100644
index 00000000..f8e93de7
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPMultiPostNeuronalTraces_cpp_standalone_1000000.txt
@@ -0,0 +1,2 @@
+Number of synapses: 1000000
+Number of synapses: 1000000
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPMultiPostNeuronalTraces_cuda_standalone_1000000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPMultiPostNeuronalTraces_cuda_standalone_1000000.txt
new file mode 100644
index 00000000..3278dcc4
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPMultiPostNeuronalTraces_cuda_standalone_1000000.txt
@@ -0,0 +1,16 @@
+INFO: setting cudaDevice stuff took 0.264461 seconds
+INFO calling kernel_synapses_group_variable_set_conditional_codeobject with 977 blocks and 1024 threads. Kernel needs 6 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory
+INFO connectivity matrix has size 1000000
+INFO connectivity matrix has size 1000000
+INFO generating 10000000 rand every 13107 clock cycles for neurongroup_thresholder_codeobject
+INFO calling kernel_neurongroup_1_stateupdater_codeobject with 2 blocks and 768 threads. Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000.
+INFO calling kernel_neurongroup_stateupdater_codeobject with 2 blocks and 768 threads. Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000.
+INFO calling kernel_neurongroup_1_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 9 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 15 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 31 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_synapses_post_codeobject with 15 blocks and 1024 threads. Kernel needs 28 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_neurongroup_1_resetter_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+Number of synapses: 1000000
+Number of synapses: 1000000
+INFO: main_lines took 1.961364 seconds
+INFO: main function took 2.291993 seconds
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPMultiPost_cpp_standalone_1000000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPMultiPost_cpp_standalone_1000000.txt
new file mode 100644
index 00000000..f8e93de7
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPMultiPost_cpp_standalone_1000000.txt
@@ -0,0 +1,2 @@
+Number of synapses: 1000000
+Number of synapses: 1000000
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPMultiPost_cuda_standalone_1000000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPMultiPost_cuda_standalone_1000000.txt
new file mode 100644
index 00000000..0d03941f
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPMultiPost_cuda_standalone_1000000.txt
@@ -0,0 +1,15 @@
+INFO: setting cudaDevice stuff took 0.135497 seconds
+INFO calling kernel_synapses_group_variable_set_conditional_codeobject with 977 blocks and 1024 threads. Kernel needs 6 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory
+INFO connectivity matrix has size 1000000
+INFO connectivity matrix has size 1000000
+INFO generating 10000000 rand every 13107 clock cycles for poissongroup_thresholder_codeobject
+INFO calling kernel_neurongroup_stateupdater_codeobject with 2 blocks and 768 threads. Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000.
+INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 9 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_poissongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 15 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 42 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.500000.
+INFO calling kernel_synapses_post_codeobject with 15 blocks and 1024 threads. Kernel needs 34 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.500000.
+INFO calling kernel_neurongroup_resetter_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+Number of synapses: 1000000
+Number of synapses: 1000000
+INFO: main_lines took 2.266672 seconds
+INFO: main function took 2.490897 seconds
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPNeuronalTraces_cpp_standalone_1000000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPNeuronalTraces_cpp_standalone_1000000.txt
new file mode 100644
index 00000000..f8e93de7
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPNeuronalTraces_cpp_standalone_1000000.txt
@@ -0,0 +1,2 @@
+Number of synapses: 1000000
+Number of synapses: 1000000
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPNeuronalTraces_cuda_standalone_1000000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPNeuronalTraces_cuda_standalone_1000000.txt
new file mode 100644
index 00000000..54a46611
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPNeuronalTraces_cuda_standalone_1000000.txt
@@ -0,0 +1,16 @@
+INFO: setting cudaDevice stuff took 0.104386 seconds
+INFO calling kernel_synapses_group_variable_set_conditional_codeobject with 977 blocks and 1024 threads. Kernel needs 6 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory
+INFO connectivity matrix has size 1000000
+INFO connectivity matrix has size 1000000
+INFO generating 13000000 rand every 13 clock cycles for neurongroup_thresholder_codeobject
+INFO calling kernel_neurongroup_1_stateupdater_codeobject with 1 blocks and 768 threads. Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000.
+INFO calling kernel_neurongroup_stateupdater_codeobject with 1303 blocks and 768 threads. Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000.
+INFO calling kernel_neurongroup_1_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 9 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_neurongroup_thresholder_codeobject with 977 blocks and 1024 threads. Kernel needs 15 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 31 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_synapses_post_codeobject with 15 blocks and 1024 threads. Kernel needs 28 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_neurongroup_1_resetter_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+Number of synapses: 1000000
+Number of synapses: 1000000
+INFO: main_lines took 65.679538 seconds
+INFO: main function took 65.888984 seconds
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPNotEventDriven_cpp_standalone_100000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPNotEventDriven_cpp_standalone_100000.txt
new file mode 100644
index 00000000..2ed54ef7
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPNotEventDriven_cpp_standalone_100000.txt
@@ -0,0 +1,2 @@
+Number of synapses: 100000
+Number of synapses: 100000
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPNotEventDriven_cuda_standalone_100000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPNotEventDriven_cuda_standalone_100000.txt
new file mode 100644
index 00000000..289fd80d
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDPNotEventDriven_cuda_standalone_100000.txt
@@ -0,0 +1,16 @@
+INFO: setting cudaDevice stuff took 0.096839 seconds
+INFO calling kernel_synapses_group_variable_set_conditional_codeobject with 98 blocks and 1024 threads. Kernel needs 6 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory
+INFO connectivity matrix has size 100000
+INFO connectivity matrix has size 100000
+INFO generating 13100000 rand every 131 clock cycles for poissongroup_thresholder_codeobject
+INFO calling kernel_neurongroup_stateupdater_codeobject with 1 blocks and 768 threads. Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000.
+INFO calling kernel_synapses_stateupdater_codeobject with 131 blocks and 768 threads. Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000.
+INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 9 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_poissongroup_thresholder_codeobject with 98 blocks and 1024 threads. Kernel needs 15 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 33 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.500000.
+INFO calling kernel_synapses_post_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_neurongroup_resetter_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+Number of synapses: 100000
+Number of synapses: 100000
+INFO: main_lines took 6.587633 seconds
+INFO: main function took 6.708345 seconds
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDP_cpp_standalone_1000000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDP_cpp_standalone_1000000.txt
new file mode 100644
index 00000000..f9d65d91
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDP_cpp_standalone_1000000.txt
@@ -0,0 +1,3 @@
+Number of synapses: 1000000
+Number of spikes: 15000963
+Number of synapses: 1000000
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDP_cuda_standalone_1000000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDP_cuda_standalone_1000000.txt
new file mode 100644
index 00000000..5b8787a6
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_STDP_cuda_standalone_1000000.txt
@@ -0,0 +1,21 @@
+INFO: setting cudaDevice stuff took 0.275236 seconds
+INFO calling kernel_synapses_group_variable_set_conditional_codeobject with 977 blocks and 1024 threads. Kernel needs 6 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory
+INFO connectivity matrix has size 1000000
+INFO connectivity matrix has size 1000000
+INFO generating 13000000 rand every 13 clock cycles for poissongroup_thresholder_codeobject
+INFO calling kernel_neurongroup_stateupdater_codeobject with 1 blocks and 768 threads. Kernel needs 36 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000.
+INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 9 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_poissongroup_thresholder_codeobject with 977 blocks and 1024 threads. Kernel needs 15 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_spikemonitor_codeobject with 1 blocks and 1 threads. Kernel needs 37 registers per block, 0 bytes of statically-allocated shared memory per block, 16 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.000000.
+INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 42 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.500000.
+INFO calling kernel_synapses_post_codeobject with 15 blocks and 1024 threads. Kernel needs 34 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.500000.
+INFO calling kernel_neurongroup_resetter_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+...
+ERROR while allocating 33554428 bytes in cudaVector.h/reserve()
+ERROR while allocating 67108856 bytes in cudaVector.h/reserve()
+...
+Number of synapses: 1000000
+Number of synapses: 1000000
+INFO: main_lines took 2321.434166 seconds
+Number of spikes: 4194303
+INFO: main function took 2329.753789 seconds
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_SparseHighRateSynapsesOnly_cpp_standalone_500000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_SparseHighRateSynapsesOnly_cpp_standalone_500000.txt
new file mode 100644
index 00000000..d6ff351a
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_SparseHighRateSynapsesOnly_cpp_standalone_500000.txt
@@ -0,0 +1 @@
+Number of synapses: 500027365
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_SparseHighRateSynapsesOnly_cuda_standalone_500000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_SparseHighRateSynapsesOnly_cuda_standalone_500000.txt
new file mode 100644
index 00000000..13230949
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_SparseHighRateSynapsesOnly_cuda_standalone_500000.txt
@@ -0,0 +1 @@
+INFO: setting cudaDevice stuff took 0.286275 seconds
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_SparseLowRateSynapsesOnly_cpp_standalone_500000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_SparseLowRateSynapsesOnly_cpp_standalone_500000.txt
new file mode 100644
index 00000000..a4e24f5d
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_SparseLowRateSynapsesOnly_cpp_standalone_500000.txt
@@ -0,0 +1 @@
+Number of synapses: 5004129
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_SparseLowRateSynapsesOnly_cuda_standalone_500000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_SparseLowRateSynapsesOnly_cuda_standalone_500000.txt
new file mode 100644
index 00000000..2e03a450
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_SparseLowRateSynapsesOnly_cuda_standalone_500000.txt
@@ -0,0 +1,7 @@
+INFO: setting cudaDevice stuff took 0.127070 seconds
+INFO connectivity matrix has size 5000898
+INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+Number of synapses: 5000898
+INFO: main_lines took 162.284184 seconds
+INFO: main function took 162.573362 seconds
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_SparseMediumRateSynapsesOnly_cpp_standalone_500000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_SparseMediumRateSynapsesOnly_cpp_standalone_500000.txt
new file mode 100644
index 00000000..f5dcf831
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_SparseMediumRateSynapsesOnly_cpp_standalone_500000.txt
@@ -0,0 +1 @@
+Number of synapses: 50001289
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_SparseMediumRateSynapsesOnly_cuda_standalone_500000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_SparseMediumRateSynapsesOnly_cuda_standalone_500000.txt
new file mode 100644
index 00000000..d6f0b0c1
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_SparseMediumRateSynapsesOnly_cuda_standalone_500000.txt
@@ -0,0 +1,7 @@
+INFO: setting cudaDevice stuff took 0.129188 seconds
+INFO connectivity matrix has size 49998381
+INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+Number of synapses: 49998381
+INFO: main_lines took 223.189514 seconds
+INFO: main function took 224.554622 seconds
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_VerySparseMediumRateSynapsesOnly_cpp_standalone_500000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_VerySparseMediumRateSynapsesOnly_cpp_standalone_500000.txt
new file mode 100644
index 00000000..6e23dd8f
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_VerySparseMediumRateSynapsesOnly_cpp_standalone_500000.txt
@@ -0,0 +1 @@
+Number of synapses: 5000485
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_VerySparseMediumRateSynapsesOnly_cuda_standalone_500000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_VerySparseMediumRateSynapsesOnly_cuda_standalone_500000.txt
new file mode 100644
index 00000000..9111f9e9
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_VerySparseMediumRateSynapsesOnly_cuda_standalone_500000.txt
@@ -0,0 +1,7 @@
+INFO: setting cudaDevice stuff took 0.117737 seconds
+INFO connectivity matrix has size 5002948
+INFO calling kernel_neurongroup_thresholder_codeobject with 1 blocks and 1024 threads. Kernel needs 14 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+INFO calling kernel_synapses_pre_codeobject with 15 blocks and 1024 threads. Kernel needs 27 registers per block, 0 bytes of statically-allocated shared memory per block, 8 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 1.000000.
+Number of synapses: 5002948
+INFO: main_lines took 316.438656 seconds
+INFO: main function took 316.716762 seconds
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_VogelsWithSynapticDynamic_cpp_standalone_100000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_VogelsWithSynapticDynamic_cpp_standalone_100000.txt
new file mode 100644
index 00000000..5fbe1aaf
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_VogelsWithSynapticDynamic_cpp_standalone_100000.txt
@@ -0,0 +1,4 @@
+Number of synapses: 32011543
+Number of synapses: 32011543
+Number of synapses: 160020171
+Number of synapses: 8000773
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_VogelsWithSynapticDynamic_cuda_standalone_100000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_VogelsWithSynapticDynamic_cuda_standalone_100000.txt
new file mode 100644
index 00000000..58c2f26b
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_VogelsWithSynapticDynamic_cuda_standalone_100000.txt
@@ -0,0 +1,7 @@
+INFO: setting cudaDevice stuff took 0.256535 seconds
+INFO connectivity matrix has size 7999692
+INFO connectivity matrix has size 32002251
+INFO connectivity matrix has size 160017025
+INFO connectivity matrix has size 32002251
+INFO calling kernel_neurongroup_stateupdater_codeobject with 131 blocks and 768 threads. Kernel needs 40 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000.
+ERROR launching kernel_neurongroup_stateupdater_codeobject in code_objects/neurongroup_stateupdater_codeobject.cu:1066 invalid argument
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_Vogels_cpp_standalone_100000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_Vogels_cpp_standalone_100000.txt
new file mode 100644
index 00000000..7990bb47
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_Vogels_cpp_standalone_100000.txt
@@ -0,0 +1,4 @@
+Number of synapses: 32000183
+Number of synapses: 32000183
+Number of synapses: 159996515
+Number of synapses: 7996913
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_Vogels_cuda_standalone_100000.txt b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_Vogels_cuda_standalone_100000.txt
new file mode 100644
index 00000000..da99c451
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/logs/stdout_Vogels_cuda_standalone_100000.txt
@@ -0,0 +1,7 @@
+INFO: setting cudaDevice stuff took 0.263336 seconds
+INFO connectivity matrix has size 7997654
+INFO connectivity matrix has size 31988320
+INFO connectivity matrix has size 159989507
+INFO connectivity matrix has size 31988320
+INFO calling kernel_neurongroup_stateupdater_codeobject with 131 blocks and 768 threads. Kernel needs 40 registers per block, 0 bytes of statically-allocated shared memory per block, 0 bytes of local memory per thread and a total of 256 bytes of user-allocated constant memory. Theoretical occupancy is 0.750000.
+ERROR launching kernel_neurongroup_stateupdater_codeobject in code_objects/neurongroup_stateupdater_codeobject.cu:1066 invalid argument
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_AdaptationOscillation_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_AdaptationOscillation_CUDAStandaloneConfiguration_1000.log
new file mode 100644
index 00000000..c0fa0dd0
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_AdaptationOscillation_CUDAStandaloneConfiguration_1000.log
@@ -0,0 +1,25 @@
+==27090== NVPROF is profiling process 27090, command: ./main
+==27090== Profiling application: ./main
+==27090== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 54.38%  151.00ms     10000  15.100us  2.8800us  70.592us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, int*, int, int*, double, double*, int*, int, bool*)
+ 18.09%  50.227ms     10000  5.0220us  4.7040us  6.8800us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double, double*, double*, double*, bool*, float*)
+ 11.30%  31.386ms     10000  3.1380us  3.0400us  4.2560us  [CUDA memset]
+  8.01%  22.246ms     10000  2.2240us  1.8560us  2.7520us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*)
+  7.90%  21.951ms     10000  2.1950us  1.5360us  3.0400us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, int*, double*, double*, bool*)
+  0.32%  881.25us         1  881.25us  881.25us  881.25us  void gen_sequenced<curandStateXORWOW, float2, normal_args_st, __operator_&__(float2 curand_normal_scaled2<curandStateXORWOW>(curandStateXORWOW*, normal_args_st))>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st)
+
+==27090== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 65.77%  370.03ms     40001  9.2500us  8.1820us  8.8454ms  cudaLaunch
+ 16.57%  93.193ms     10000  9.3190us  8.6380us  24.859us  cudaMemset
+ 13.98%  78.650ms    390005     201ns     149ns  319.77us  cudaSetupArgument
+  1.93%  10.868ms     40001     271ns     200ns  313.28us  cudaConfigureCall
+  1.70%  9.5546ms     40002     238ns     207ns  5.1700us  cudaGetLastError
+  0.03%  174.94us         1  174.94us  174.94us  174.94us  cudaMalloc
+  0.01%  50.180us         1  50.180us  50.180us  50.180us  cudaMemGetInfo
+  0.00%  23.192us        38     610ns     476ns  1.5970us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+  0.00%  19.120us         7  2.7310us  2.0000us  5.0070us  cudaFuncGetAttributes
+  0.00%  17.862us         1  17.862us  17.862us  17.862us  cudaDeviceSynchronize
+  0.00%  5.0460us        12     420ns     293ns  1.1020us  cudaDeviceGetAttribute
+  0.00%  3.2580us         3  1.0860us     659ns  1.8660us  cudaGetDevice
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_AdaptationOscillation_GeNNConfigurationOptimized_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_AdaptationOscillation_GeNNConfigurationOptimized_1000.log
new file mode 100644
index 00000000..47f193b0
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_AdaptationOscillation_GeNNConfigurationOptimized_1000.log
@@ -0,0 +1,25 @@
+==27315== NVPROF is profiling process 27315, command: ./main test 1.0 1
+==27315== Profiling application: ./main test 1.0 1
+==27315== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 53.41%  151.83ms     10000  15.183us  1.9200us  1.1186ms  calcSynapses
+ 46.17%  131.26ms     10000  13.126us  10.560us  20.288us  calcNeurons
+  0.32%  903.46us        48  18.822us     960ns  129.47us  [CUDA memcpy HtoD]
+  0.10%  283.36us        14  20.240us  1.9840us  122.88us  [CUDA memcpy DtoH]
+
+==27315== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 48.83%  298.28ms        13  22.945ms  9.2060us  295.80ms  cudaHostAlloc
+ 46.42%  283.54ms     20000  14.176us  7.6710us  1.1119ms  cudaLaunch
+  2.61%  15.926ms        64  248.85us     409ns  13.875ms  cudaMemcpy
+  1.10%  6.6997ms     20000     334ns     268ns  303.73us  cudaConfigureCall
+  0.84%  5.1253ms     20000     256ns     228ns  5.1490us  cudaSetupArgument
+  0.14%  867.56us        13  66.735us  7.8370us  174.67us  cudaMalloc
+  0.04%  257.35us        83  3.1000us     186ns  109.74us  cuDeviceGetAttribute
+  0.01%  39.793us         1  39.793us  39.793us  39.793us  cuDeviceGetName
+  0.01%  36.797us         1  36.797us  36.797us  36.797us  cuDeviceTotalMem
+  0.00%  16.271us         1  16.271us  16.271us  16.271us  cudaSetDevice
+  0.00%  15.322us        13  1.1780us     539ns  3.3530us  cudaGetSymbolAddress
+  0.00%  2.6060us         2  1.3030us     777ns  1.8290us  cuDeviceGetCount
+  0.00%  1.8590us         1  1.8590us  1.8590us  1.8590us  cudaGetDeviceCount
+  0.00%     975ns         2     487ns     397ns     578ns  cuDeviceGet
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfiguration_1000.log
new file mode 100644
index 00000000..0d6b351a
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfiguration_1000.log
@@ -0,0 +1,27 @@
+==13252== NVPROF is profiling process 13252, command: ./main
+==13252== Profiling application: ./main
+==13252== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 86.34%  3.21777s     10000  321.78us  1.5360us  4.9107ms  _run_synapses_pre_push_spikes_push_kernel(unsigned int, unsigned int, unsigned int, int*)
+  9.78%  364.56ms     10000  36.455us  2.2080us  84.928us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, double*, int*)
+  1.24%  46.150ms     10000  4.6140us  4.4480us  6.7520us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double, double*, double*, double*, bool*, float*)
+  0.86%  32.053ms     10000  3.2050us  2.9120us  4.2240us  [CUDA memset]
+  0.70%  25.923ms     10000  2.5920us  2.3680us  3.6160us  _run_synapses_pre_push_spikes_advance_kernel(void)
+  0.58%  21.708ms     10000  2.1700us  1.8880us  2.7200us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*)
+  0.48%  17.725ms     10000  1.7720us  1.6960us  2.0480us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*)
+  0.02%  880.45us         1  880.45us  880.45us  880.45us  void gen_sequenced<curandStateXORWOW, float2, normal_args_st, __operator_&__(float2 curand_normal_scaled2<curandStateXORWOW>(curandStateXORWOW*, normal_args_st))>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st)
+
+==13252== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 93.08%  3.54282s     60001  59.045us  7.8910us  6.6818ms  cudaLaunch
+  2.78%  105.95ms     10000  10.595us  8.3520us  305.06us  cudaMemset
+  1.61%  61.198ms         1  61.198ms  61.198ms  61.198ms  cudaDeviceSynchronize
+  1.60%  60.805ms    370005     164ns     130ns  296.03us  cudaSetupArgument
+  0.49%  18.710ms     60002     311ns     237ns  312.79us  cudaGetLastError
+  0.43%  16.481ms     60001     274ns     181ns  299.24us  cudaConfigureCall
+  0.00%  182.53us         1  182.53us  182.53us  182.53us  cudaMalloc
+  0.00%  71.394us         1  71.394us  71.394us  71.394us  cudaMemGetInfo
+  0.00%  20.387us        38     536ns     474ns  1.4760us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+  0.00%  18.951us         7  2.7070us  1.9760us  5.3870us  cudaFuncGetAttributes
+  0.00%  4.9460us        12     412ns     263ns  1.1520us  cudaDeviceGetAttribute
+  0.00%  2.8500us         3     950ns     608ns  1.6040us  cudaGetDevice
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_GeNNConfigurationOptimized_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_GeNNConfigurationOptimized_1000.log
new file mode 100644
index 00000000..1c7f7e6f
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_GeNNConfigurationOptimized_1000.log
@@ -0,0 +1,25 @@
+==13488== NVPROF is profiling process 13488, command: ./main test 1.0 1
+==13488== Profiling application: ./main test 1.0 1
+==13488== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 74.47%  118.07ms     10000  11.806us  10.016us  17.664us  calcNeurons
+ 18.42%  29.207ms     10000  2.9200us  1.9200us  17.664us  calcSynapses
+  5.59%  8.8552ms        40  221.38us     960ns  2.5145ms  [CUDA memcpy HtoD]
+  1.52%  2.4178ms        10  241.78us  1.9520us  2.3869ms  [CUDA memcpy DtoH]
+
+==13488== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 58.76%  270.99ms        11  24.635ms  17.531us  265.27ms  cudaHostAlloc
+ 36.00%  166.02ms     20000  8.3000us  7.6090us  315.35us  cudaLaunch
+  2.62%  12.069ms        53  227.72us     334ns  2.5281ms  cudaMemcpy
+  1.36%  6.2887ms     20000     314ns     240ns  302.98us  cudaConfigureCall
+  1.00%  4.6085ms     20000     230ns     217ns  2.8530us  cudaSetupArgument
+  0.19%  860.67us        11  78.243us  12.662us  173.88us  cudaMalloc
+  0.05%  234.84us        83  2.8290us     158ns  100.64us  cuDeviceGetAttribute
+  0.01%  32.245us         1  32.245us  32.245us  32.245us  cuDeviceTotalMem
+  0.01%  27.894us         1  27.894us  27.894us  27.894us  cuDeviceGetName
+  0.00%  14.621us        11  1.3290us     791ns  3.3800us  cudaGetSymbolAddress
+  0.00%  12.561us         1  12.561us  12.561us  12.561us  cudaSetDevice
+  0.00%  1.4740us         2     737ns     495ns     979ns  cuDeviceGetCount
+  0.00%  1.4370us         1  1.4370us  1.4370us  1.4370us  cudaGetDeviceCount
+  0.00%     524ns         2     262ns     227ns     297ns  cuDeviceGet
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_BrunelHakimModelScalarDelayNoMultiPrePost_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_BrunelHakimModelScalarDelayNoMultiPrePost_CUDAStandaloneConfiguration_1000.log
new file mode 100644
index 00000000..607585c5
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_BrunelHakimModelScalarDelayNoMultiPrePost_CUDAStandaloneConfiguration_1000.log
@@ -0,0 +1,25 @@
+==23945== NVPROF is profiling process 23945, command: ./main
+==23945== Profiling application: ./main
+==23945== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 28.82%  47.429ms     10000  4.7420us  2.8800us  34.464us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, double*, int*)
+ 28.42%  46.768ms     10000  4.6760us  4.4480us  6.8800us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double, double*, double*, double*, bool*, float*)
+ 18.77%  30.887ms     10000  3.0880us  3.0400us  3.6160us  [CUDA memset]
+ 13.20%  21.722ms     10000  2.1720us  2.0160us  2.5920us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*)
+ 10.25%  16.871ms     10000  1.6870us  1.5680us  1.9840us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*)
+  0.54%  881.31us         1  881.31us  881.31us  881.31us  void gen_sequenced<curandStateXORWOW, float2, normal_args_st, __operator_&__(float2 curand_normal_scaled2<curandStateXORWOW>(curandStateXORWOW*, normal_args_st))>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st)
+
+==23945== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 68.47%  378.42ms     40001  9.4600us  8.3920us  11.185ms  cudaLaunch
+ 16.96%  93.726ms     10000  9.3720us  8.8820us  22.956us  cudaMemset
+ 10.76%  59.491ms    330005     180ns     148ns  309.86us  cudaSetupArgument
+  1.90%  10.527ms     40001     263ns     182ns  298.24us  cudaConfigureCall
+  1.84%  10.177ms     40002     254ns     225ns  10.282us  cudaGetLastError
+  0.03%  178.62us         1  178.62us  178.62us  178.62us  cudaMalloc
+  0.01%  52.598us         1  52.598us  52.598us  52.598us  cudaMemGetInfo
+  0.00%  25.078us        38     659ns     560ns  2.7750us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+  0.00%  19.936us         7  2.8480us  2.0920us  5.4650us  cudaFuncGetAttributes
+  0.00%  17.187us         1  17.187us  17.187us  17.187us  cudaDeviceSynchronize
+  0.00%  5.0920us        12     424ns     278ns  1.0780us  cudaDeviceGetAttribute
+  0.00%  3.1170us         3  1.0390us     523ns  1.9660us  cudaGetDevice
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_BrunelHakimModelScalarDelayNoMultiPrePost_GeNNConfigurationOptimized_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_BrunelHakimModelScalarDelayNoMultiPrePost_GeNNConfigurationOptimized_1000.log
new file mode 100644
index 00000000..1c090bcc
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_BrunelHakimModelScalarDelayNoMultiPrePost_GeNNConfigurationOptimized_1000.log
@@ -0,0 +1,26 @@
+==24196== NVPROF is profiling process 24196, command: ./main test 1.0 1
+==24196== Profiling application: ./main test 1.0 1
+==24196== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 71.41%  120.56ms     10000  12.055us  10.048us  17.952us  calcNeurons
+ 21.88%  36.941ms     10000  3.6940us  2.5280us  26.912us  calcSynapses
+  5.29%  8.9319ms        41  217.85us     992ns  2.5123ms  [CUDA memcpy HtoD]
+  1.42%  2.3983ms        10  239.83us  2.0160us  2.3673ms  [CUDA memcpy DtoH]
+
+==24196== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 58.26%  272.15ms        11  24.741ms  19.067us  265.67ms  cudaHostAlloc
+ 36.33%  169.74ms     20000  8.4860us  7.6190us  310.62us  cudaLaunch
+  2.72%  12.686ms        53  239.35us     323ns  2.5267ms  cudaMemcpy
+  1.36%  6.3732ms     20000     318ns     242ns  300.70us  cudaConfigureCall
+  1.03%  4.8351ms     20000     241ns     210ns  10.299us  cudaSetupArgument
+  0.22%  1.0265ms        11  93.320us  12.594us  179.95us  cudaMalloc
+  0.05%  240.26us        83  2.8940us     152ns  104.47us  cuDeviceGetAttribute
+  0.01%  32.415us         1  32.415us  32.415us  32.415us  cuDeviceTotalMem
+  0.01%  28.407us         1  28.407us  28.407us  28.407us  cuDeviceGetName
+  0.00%  14.808us        11  1.3460us     741ns  3.2100us  cudaGetSymbolAddress
+  0.00%  14.772us         1  14.772us  14.772us  14.772us  cudaMemcpyToSymbol
+  0.00%  12.168us         1  12.168us  12.168us  12.168us  cudaSetDevice
+  0.00%  1.4860us         1  1.4860us  1.4860us  1.4860us  cudaGetDeviceCount
+  0.00%  1.4580us         2     729ns     473ns     985ns  cuDeviceGetCount
+  0.00%     537ns         2     268ns     226ns     311ns  cuDeviceGet
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_BrunelHakimModelScalarDelay_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_BrunelHakimModelScalarDelay_CUDAStandaloneConfiguration_1000.log
new file mode 100644
index 00000000..a21ea00e
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_BrunelHakimModelScalarDelay_CUDAStandaloneConfiguration_1000.log
@@ -0,0 +1,25 @@
+==2491== NVPROF is profiling process 2491, command: ./main
+==2491== Profiling application: ./main
+==2491== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 28.57%  48.196ms     10000  4.8190us  4.5440us  6.7840us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double, double*, double*, double*, bool*, float*)
+ 27.77%  46.841ms     10000  4.6840us  2.8800us  31.584us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, double*, int*)
+ 19.44%  32.782ms     10000  3.2780us  3.2320us  3.7760us  [CUDA memset]
+ 12.58%  21.215ms     10000  2.1210us  1.9840us  2.5600us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*)
+ 11.12%  18.762ms     10000  1.8760us  1.7920us  2.1120us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*)
+  0.52%  880.90us         1  880.90us  880.90us  880.90us  void gen_sequenced<curandStateXORWOW, float2, normal_args_st, __operator_&__(float2 curand_normal_scaled2<curandStateXORWOW>(curandStateXORWOW*, normal_args_st))>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st)
+
+==2491== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 67.81%  358.71ms     40001  8.9670us  7.9890us  10.112ms  cudaLaunch
+ 16.69%  88.268ms     10000  8.8260us  8.3570us  34.808us  cudaMemset
+ 11.38%  60.182ms    330005     182ns     150ns  304.26us  cudaSetupArgument
+  2.12%  11.226ms     40001     280ns     197ns  305.80us  cudaConfigureCall
+  1.95%  10.335ms     40002     258ns     217ns  14.869us  cudaGetLastError
+  0.03%  178.47us         1  178.47us  178.47us  178.47us  cudaMalloc
+  0.01%  51.372us         1  51.372us  51.372us  51.372us  cudaMemGetInfo
+  0.00%  21.822us        38     574ns     469ns  3.0220us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+  0.00%  19.460us         7  2.7800us  2.0130us  5.1840us  cudaFuncGetAttributes
+  0.00%  17.572us         1  17.572us  17.572us  17.572us  cudaDeviceSynchronize
+  0.00%  5.0120us        12     417ns     283ns  1.0740us  cudaDeviceGetAttribute
+  0.00%  2.8560us         3     952ns     570ns  1.6710us  cudaGetDevice
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_BrunelHakimModelScalarDelay_GeNNConfigurationOptimized_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_BrunelHakimModelScalarDelay_GeNNConfigurationOptimized_1000.log
new file mode 100644
index 00000000..e28fbaf4
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_BrunelHakimModelScalarDelay_GeNNConfigurationOptimized_1000.log
@@ -0,0 +1,26 @@
+==2741== NVPROF is profiling process 2741, command: ./main test 1.0 1
+==2741== Profiling application: ./main test 1.0 1
+==2741== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 71.49%  120.00ms     10000  11.999us  10.016us  18.144us  calcNeurons
+ 21.75%  36.501ms     10000  3.6500us  2.4960us  29.185us  calcSynapses
+  5.33%  8.9404ms        41  218.06us     960ns  2.5144ms  [CUDA memcpy HtoD]
+  1.43%  2.4037ms        10  240.37us  2.0480us  2.3725ms  [CUDA memcpy DtoH]
+
+==2741== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 59.17%  284.47ms        11  25.861ms  13.934us  278.41ms  cudaHostAlloc
+ 35.49%  170.60ms     20000  8.5300us  7.5850us  307.94us  cudaLaunch
+  2.68%  12.860ms        53  242.63us     394ns  2.5288ms  cudaMemcpy
+  1.36%  6.5596ms     20000     327ns     257ns  308.28us  cudaConfigureCall
+  1.04%  5.0131ms     20000     250ns     228ns  9.1940us  cudaSetupArgument
+  0.19%  898.78us        11  81.706us  9.2360us  153.32us  cudaMalloc
+  0.05%  226.47us        83  2.7280us     137ns  97.777us  cuDeviceGetAttribute
+  0.01%  31.138us         1  31.138us  31.138us  31.138us  cuDeviceTotalMem
+  0.01%  27.215us         1  27.215us  27.215us  27.215us  cuDeviceGetName
+  0.00%  12.953us        11  1.1770us     575ns  2.8170us  cudaGetSymbolAddress
+  0.00%  12.076us         1  12.076us  12.076us  12.076us  cudaMemcpyToSymbol
+  0.00%  10.837us         1  10.837us  10.837us  10.837us  cudaSetDevice
+  0.00%  1.5250us         1  1.5250us  1.5250us  1.5250us  cudaGetDeviceCount
+  0.00%  1.4930us         2     746ns     490ns  1.0030us  cuDeviceGetCount
+  0.00%     498ns         2     249ns     224ns     274ns  cuDeviceGet
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_COBAHHFixedConnectivity_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_COBAHHFixedConnectivity_CUDAStandaloneConfiguration_1000.log
new file mode 100644
index 00000000..4beaf7b5
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_COBAHHFixedConnectivity_CUDAStandaloneConfiguration_1000.log
@@ -0,0 +1,25 @@
+==17632== NVPROF is profiling process 17632, command: ./main
+==17632== Profiling application: ./main
+==17632== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 44.90%  349.33ms     10000  34.933us  1.6640us  111.13ms  kernel_spikemonitor_codeobject(unsigned int, int*, double, int*, int*, int*, int, int*, double*, int, int*, int*)
+ 23.60%  183.61ms     10000  18.361us  17.824us  21.856us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*, bool*, double*, double*, double*, double*, double, double*)
+ 14.85%  115.52ms     10000  11.551us  3.0720us  36.353us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double*, double, int*, int, int*)
+  9.49%  73.847ms     10000  7.3840us  3.0720us  24.064us  kernel_synapses_1_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, int*, double*)
+  4.03%  31.352ms     10000  3.1350us  3.0400us  4.2880us  [CUDA memset]
+  3.12%  24.285ms     10000  2.4280us  2.0480us  2.7840us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*)
+  0.01%  68.000us         1  68.000us  68.000us  68.000us  _run_spikemonitor_codeobject_init(void)
+
+==17632== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 73.31%  632.36ms     50001  12.646us  8.2740us  95.930ms  cudaLaunch
+ 12.10%  104.36ms    590000     176ns     149ns  346.69us  cudaSetupArgument
+ 11.27%  97.201ms     10000  9.7200us  8.6440us  1.1383ms  cudaMemset
+  1.55%  13.390ms     50001     267ns     192ns  331.43us  cudaConfigureCall
+  1.55%  13.349ms     50001     266ns     220ns  330.51us  cudaGetLastError
+  0.21%  1.8328ms         1  1.8328ms  1.8328ms  1.8328ms  cudaDeviceSynchronize
+  0.01%  51.143us         1  51.143us  51.143us  51.143us  cudaMemGetInfo
+  0.00%  18.972us         7  2.7100us  2.0070us  4.6510us  cudaFuncGetAttributes
+  0.00%  14.003us        22     636ns     470ns  1.4930us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+  0.00%  4.3080us         8     538ns     317ns  1.2590us  cudaDeviceGetAttribute
+  0.00%  2.2780us         2  1.1390us     764ns  1.5140us  cudaGetDevice
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_COBAHHFixedConnectivity_GeNNConfigurationOptimized_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_COBAHHFixedConnectivity_GeNNConfigurationOptimized_1000.log
new file mode 100644
index 00000000..32b766b8
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_COBAHHFixedConnectivity_GeNNConfigurationOptimized_1000.log
@@ -0,0 +1,25 @@
+==17891== NVPROF is profiling process 17891, command: ./main test 1.0 1
+==17891== Profiling application: ./main test 1.0 1
+==17891== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 66.58%  251.53ms     10000  25.153us  23.840us  28.000us  calcNeurons
+ 23.34%  88.193ms     10000  8.8190us  2.4320us  41.472us  calcSynapses
+  9.86%  37.269ms     18461  2.0180us  1.9520us  153.18us  [CUDA memcpy DtoH]
+  0.22%  820.87us        68  12.071us     960ns  164.23us  [CUDA memcpy HtoD]
+
+==17891== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 52.66%  509.16ms     20088  25.346us     320ns  371.03us  cudaMemcpy
+ 26.73%  258.42ms        19  13.601ms  8.8970us  255.30ms  cudaHostAlloc
+ 19.10%  184.67ms     20000  9.2330us  7.8160us  348.55us  cudaLaunch
+  0.81%  7.7916ms     20000     389ns     275ns  331.45us  cudaConfigureCall
+  0.56%  5.4451ms     20000     272ns     241ns  4.6710us  cudaSetupArgument
+  0.10%  1.0098ms        19  53.145us  6.4240us  173.26us  cudaMalloc
+  0.02%  226.52us        83  2.7290us     143ns  97.659us  cuDeviceGetAttribute
+  0.00%  31.331us         1  31.331us  31.331us  31.331us  cuDeviceTotalMem
+  0.00%  30.487us         1  30.487us  30.487us  30.487us  cuDeviceGetName
+  0.00%  18.126us        19     954ns     368ns  3.5740us  cudaGetSymbolAddress
+  0.00%  11.311us         1  11.311us  11.311us  11.311us  cudaSetDevice
+  0.00%  1.7800us         2     890ns     658ns  1.1220us  cuDeviceGetCount
+  0.00%  1.4830us         1  1.4830us  1.4830us  1.4830us  cudaGetDeviceCount
+  0.00%     640ns         2     320ns     242ns     398ns  cuDeviceGet
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_COBAHH_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_COBAHH_CUDAStandaloneConfiguration_1000.log
new file mode 100644
index 00000000..3e647a1c
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_COBAHH_CUDAStandaloneConfiguration_1000.log
@@ -0,0 +1,23 @@
+==11907== NVPROF is profiling process 11907, command: ./main
+==11907== Profiling application: ./main
+==11907== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 39.16%  186.02ms     10000  18.602us  17.856us  21.568us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*, bool*, double*, double*, double*, double*, double, double*)
+ 29.93%  142.18ms     10000  14.218us  3.2320us  35.680us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double*, double, int*, int, int*)
+ 19.08%  90.630ms     10000  9.0620us  3.1680us  24.448us  kernel_synapses_1_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, int*, double*)
+  6.67%  31.670ms     10000  3.1660us  3.0400us  4.1920us  [CUDA memset]
+  5.15%  24.481ms     10000  2.4480us  2.0480us  2.7840us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*)
+
+==11907== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 66.01%  376.74ms     40000  9.4180us  8.4480us  6.9662ms  cudaLaunch
+ 15.97%  91.133ms     10000  9.1130us  8.5190us  28.283us  cudaMemset
+ 13.95%  79.611ms    470000     169ns     149ns  316.22us  cudaSetupArgument
+  2.29%  13.092ms     40000     327ns     202ns  311.93us  cudaConfigureCall
+  1.76%  10.072ms     40000     251ns     230ns  5.0760us  cudaGetLastError
+  0.01%  50.252us         1  50.252us  50.252us  50.252us  cudaMemGetInfo
+  0.00%  22.121us         1  22.121us  22.121us  22.121us  cudaDeviceSynchronize
+  0.00%  16.912us         6  2.8180us  2.0980us  4.5270us  cudaFuncGetAttributes
+  0.00%  13.875us        21     660ns     520ns  1.5110us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+  0.00%  3.9730us         8     496ns     302ns  1.1490us  cudaDeviceGetAttribute
+  0.00%  2.3840us         2  1.1920us     836ns  1.5480us  cudaGetDevice
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_COBAHH_GeNNConfigurationOptimized_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_COBAHH_GeNNConfigurationOptimized_1000.log
new file mode 100644
index 00000000..21ebd2b2
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_COBAHH_GeNNConfigurationOptimized_1000.log
@@ -0,0 +1,25 @@
+==12169== NVPROF is profiling process 12169, command: ./main test 1.0 1
+==12169== Profiling application: ./main test 1.0 1
+==12169== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 64.38%  254.25ms     10000  25.425us  23.777us  28.416us  calcNeurons
+ 35.52%  140.25ms     10000  14.025us  2.4320us  41.696us  calcSynapses
+  0.07%  285.47us        68  4.1980us     960ns  42.944us  [CUDA memcpy HtoD]
+  0.03%  108.42us        18  6.0230us  1.9840us  40.736us  [CUDA memcpy DtoH]
+
+==12169== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 52.49%  378.74ms     20000  18.937us  7.6840us  358.81us  cudaLaunch
+ 42.10%  303.75ms        19  15.987ms  8.2320us  301.68ms  cudaHostAlloc
+  3.34%  24.097ms        88  273.83us     330ns  22.690ms  cudaMemcpy
+  1.06%  7.6642ms     20000     383ns     262ns  335.28us  cudaConfigureCall
+  0.86%  6.2250ms     20000     311ns     242ns  336.35us  cudaSetupArgument
+  0.10%  707.36us        19  37.229us  6.2200us  126.23us  cudaMalloc
+  0.03%  241.14us        83  2.9050us     137ns  109.48us  cuDeviceGetAttribute
+  0.00%  31.485us         1  31.485us  31.485us  31.485us  cuDeviceTotalMem
+  0.00%  30.190us         1  30.190us  30.190us  30.190us  cuDeviceGetName
+  0.00%  12.302us        19     647ns     344ns  2.1110us  cudaGetSymbolAddress
+  0.00%  11.562us         1  11.562us  11.562us  11.562us  cudaSetDevice
+  0.00%  1.5290us         2     764ns     561ns     968ns  cuDeviceGetCount
+  0.00%  1.4620us         1  1.4620us  1.4620us  1.4620us  cudaGetDeviceCount
+  0.00%     480ns         2     240ns     218ns     262ns  cuDeviceGet
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_CUBAFixedConnectivity_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_CUBAFixedConnectivity_CUDAStandaloneConfiguration_1000.log
new file mode 100644
index 00000000..60479a60
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_CUBAFixedConnectivity_CUDAStandaloneConfiguration_1000.log
@@ -0,0 +1,26 @@
+==28333== NVPROF is profiling process 28333, command: ./main
+==28333== Profiling application: ./main
+==28333== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 23.53%  75.188ms     10000  7.5180us  7.1360us  8.8960us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double, double*, double*, double*, bool*)
+ 20.88%  66.723ms     10000  6.6720us  1.6960us  14.967ms  kernel_spikemonitor_codeobject(unsigned int, int*, double, int*, int*, int*, int, int*, double*, int, int*, int*)
+ 17.07%  54.561ms     10000  5.4560us  3.2960us  21.920us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double*, double, int*, int, int*, bool*)
+ 15.31%  48.929ms     10000  4.8920us  3.2960us  18.784us  kernel_synapses_1_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, int*, double*, bool*)
+ 10.24%  32.716ms     10000  3.2710us  3.1360us  4.1920us  [CUDA memset]
+  7.36%  23.508ms     10000  2.3500us  2.0160us  2.7200us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*)
+  5.59%  17.866ms     10000  1.7860us  1.5360us  2.0800us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*, bool*)
+  0.02%  67.328us         1  67.328us  67.328us  67.328us  _run_spikemonitor_codeobject_init(void)
+
+==28333== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 70.32%  550.58ms     60001  9.1760us  8.3390us  6.9445ms  cudaLaunch
+ 14.00%  109.65ms    630000     174ns     148ns  343.93us  cudaSetupArgument
+ 11.69%  91.573ms     10000  9.1570us  8.5300us  165.12us  cudaMemset
+  1.99%  15.611ms     60001     260ns     222ns  327.19us  cudaConfigureCall
+  1.98%  15.472ms     60001     257ns     208ns  1.1493ms  cudaGetLastError
+  0.01%  51.353us         1  51.353us  51.353us  51.353us  cudaMemGetInfo
+  0.00%  24.711us        40     617ns     509ns  1.7610us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+  0.00%  23.494us         9  2.6100us  2.0080us  4.3370us  cudaFuncGetAttributes
+  0.00%  17.566us         1  17.566us  17.566us  17.566us  cudaDeviceSynchronize
+  0.00%  5.4430us        12     453ns     281ns  1.1050us  cudaDeviceGetAttribute
+  0.00%  3.0770us         3  1.0250us     646ns  1.6320us  cudaGetDevice
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_CUBAFixedConnectivity_GeNNConfigurationOptimized_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_CUBAFixedConnectivity_GeNNConfigurationOptimized_1000.log
new file mode 100644
index 00000000..2b127f8e
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_CUBAFixedConnectivity_GeNNConfigurationOptimized_1000.log
@@ -0,0 +1,25 @@
+==28592== NVPROF is profiling process 28592, command: ./main test 1.0 1
+==28592== Profiling application: ./main test 1.0 1
+==28592== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 63.11%  133.95ms     10000  13.394us  12.384us  14.432us  calcNeurons
+ 22.74%  48.266ms     10000  4.8260us  2.7200us  24.896us  calcSynapses
+ 13.78%  29.240ms     14081  2.0760us  2.0160us  154.95us  [CUDA memcpy DtoH]
+  0.37%  793.60us        56  14.171us     960ns  163.11us  [CUDA memcpy HtoD]
+
+==28592== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 38.67%  315.20ms     20073  15.702us     324ns  773.07us  cudaMemcpy
+ 37.36%  304.57ms        16  19.036ms  8.7600us  301.99ms  cudaHostAlloc
+ 22.40%  182.59ms     20000  9.1290us  7.6730us  821.14us  cudaLaunch
+  0.78%  6.3728ms     20000     318ns     250ns  5.2440us  cudaConfigureCall
+  0.66%  5.3441ms     20000     267ns     226ns  332.81us  cudaSetupArgument
+  0.10%  800.29us        16  50.018us  6.1360us  126.53us  cudaMalloc
+  0.03%  230.87us        83  2.7810us     153ns  99.066us  cuDeviceGetAttribute
+  0.00%  32.084us         1  32.084us  32.084us  32.084us  cuDeviceTotalMem
+  0.00%  30.780us         1  30.780us  30.780us  30.780us  cuDeviceGetName
+  0.00%  12.549us        16     784ns     421ns  2.2350us  cudaGetSymbolAddress
+  0.00%  11.671us         1  11.671us  11.671us  11.671us  cudaSetDevice
+  0.00%  1.8440us         1  1.8440us  1.8440us  1.8440us  cudaGetDeviceCount
+  0.00%  1.7500us         2     875ns     690ns  1.0600us  cuDeviceGetCount
+  0.00%     626ns         2     313ns     253ns     373ns  cuDeviceGet
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_CUBA_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_CUBA_CUDAStandaloneConfiguration_1000.log
new file mode 100644
index 00000000..69e1a4bd
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_CUBA_CUDAStandaloneConfiguration_1000.log
@@ -0,0 +1,24 @@
+==31291== NVPROF is profiling process 31291, command: ./main
+==31291== Profiling application: ./main
+==31291== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 31.18%  76.419ms     10000  7.6410us  7.3920us  8.7360us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double, double*, double*, double*, bool*)
+ 19.96%  48.924ms     10000  4.8920us  3.4560us  20.384us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double*, double, int*, int, int*, bool*)
+ 18.13%  44.432ms     10000  4.4430us  3.2960us  17.952us  kernel_synapses_1_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, int*, double*, bool*)
+ 13.38%  32.789ms     10000  3.2780us  3.2320us  3.7760us  [CUDA memset]
+  9.59%  23.496ms     10000  2.3490us  2.0480us  2.7520us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*)
+  7.76%  19.020ms     10000  1.9010us  1.6640us  2.0800us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*, bool*)
+
+==31291== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 68.69%  471.10ms     50000  9.4220us  8.2170us  19.231ms  cudaLaunch
+ 13.91%  95.387ms     10000  9.5380us  8.7960us  312.26us  cudaMemset
+ 13.50%  92.578ms    510000     181ns     148ns  324.51us  cudaSetupArgument
+  2.05%  14.040ms     50000     280ns     237ns  5.2940us  cudaConfigureCall
+  1.83%  12.581ms     50000     251ns     217ns  12.226us  cudaGetLastError
+  0.01%  51.575us         1  51.575us  51.575us  51.575us  cudaMemGetInfo
+  0.00%  21.460us        39     550ns     461ns  1.4270us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+  0.00%  21.129us         8  2.6410us  1.9560us  4.4310us  cudaFuncGetAttributes
+  0.00%  16.670us         1  16.670us  16.670us  16.670us  cudaDeviceSynchronize
+  0.00%  5.5840us        12     465ns     285ns  1.2870us  cudaDeviceGetAttribute
+  0.00%  3.3860us         3  1.1280us     653ns  1.8010us  cudaGetDevice
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_CUBA_GeNNConfigurationOptimized_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_CUBA_GeNNConfigurationOptimized_1000.log
new file mode 100644
index 00000000..5ec6b1a0
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_CUBA_GeNNConfigurationOptimized_1000.log
@@ -0,0 +1,25 @@
+==31529== NVPROF is profiling process 31529, command: ./main test 1.0 1
+==31529== Profiling application: ./main test 1.0 1
+==31529== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 74.56%  131.02ms     10000  13.101us  11.808us  14.624us  calcNeurons
+ 24.85%  43.662ms     10000  4.3660us  2.1760us  25.760us  calcSynapses
+  0.45%  796.80us        56  14.228us     960ns  163.59us  [CUDA memcpy HtoD]
+  0.13%  234.31us        13  18.023us  1.9520us  155.27us  [CUDA memcpy DtoH]
+
+==31529== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 57.53%  276.80ms        16  17.300ms  8.5100us  274.32ms  cudaHostAlloc
+ 38.37%  184.60ms     20000  9.2300us  7.6370us  342.36us  cudaLaunch
+  1.48%  7.1407ms        73  97.817us     343ns  5.2594ms  cudaMemcpy
+  1.31%  6.3266ms     20000     316ns     249ns  315.38us  cudaConfigureCall
+  1.06%  5.1071ms     20000     255ns     220ns  4.6570us  cudaSetupArgument
+  0.17%  819.17us        16  51.198us  6.2400us  136.59us  cudaMalloc
+  0.05%  241.67us        83  2.9110us     138ns  103.86us  cuDeviceGetAttribute
+  0.01%  32.371us         1  32.371us  32.371us  32.371us  cuDeviceTotalMem
+  0.01%  28.436us         1  28.436us  28.436us  28.436us  cuDeviceGetName
+  0.00%  12.399us        16     774ns     424ns  2.0180us  cudaGetSymbolAddress
+  0.00%  12.047us         1  12.047us  12.047us  12.047us  cudaSetDevice
+  0.00%  1.6800us         1  1.6800us  1.6800us  1.6800us  cudaGetDeviceCount
+  0.00%  1.4560us         2     728ns     455ns  1.0010us  cuDeviceGetCount
+  0.00%     575ns         2     287ns     235ns     340ns  cuDeviceGet
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_DenseMediumRateSynapsesOnly_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_DenseMediumRateSynapsesOnly_CUDAStandaloneConfiguration_1000.log
new file mode 100644
index 00000000..9f083553
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_DenseMediumRateSynapsesOnly_CUDAStandaloneConfiguration_1000.log
@@ -0,0 +1,21 @@
+==30551== NVPROF is profiling process 30551, command: ./main
+==30551== Profiling application: ./main
+==30551== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 56.01%  59.694ms     10000  5.9690us  5.6000us  6.4960us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, double*, int*, int, int*)
+ 28.93%  30.830ms     10000  3.0820us  3.0400us  3.5200us  [CUDA memset]
+ 15.06%  16.055ms     10000  1.6050us  1.5040us  2.4000us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*)
+
+==30551== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 59.17%  191.07ms     20000  9.5530us  8.3220us  11.129ms  cudaLaunch
+ 27.89%  90.062ms     10000  9.0060us  8.4390us  27.616us  cudaMemset
+  9.32%  30.084ms    170000     176ns     153ns  306.97us  cudaSetupArgument
+  1.82%  5.8925ms     20000     294ns     213ns  303.17us  cudaConfigureCall
+  1.77%  5.7023ms     20000     285ns     216ns  302.98us  cudaGetLastError
+  0.01%  46.403us         1  46.403us  46.403us  46.403us  cudaMemGetInfo
+  0.01%  18.635us         1  18.635us  18.635us  18.635us  cudaDeviceSynchronize
+  0.00%  8.8700us         3  2.9560us  2.1570us  3.7290us  cudaFuncGetAttributes
+  0.00%  6.7130us         3  2.2370us     629ns  3.5200us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+  0.00%  1.7730us         4     443ns     369ns     586ns  cudaDeviceGetAttribute
+  0.00%     848ns         1     848ns     848ns     848ns  cudaGetDevice
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_DenseMediumRateSynapsesOnly_GeNNConfigurationOptimized_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_DenseMediumRateSynapsesOnly_GeNNConfigurationOptimized_1000.log
new file mode 100644
index 00000000..05b6bca3
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_DenseMediumRateSynapsesOnly_GeNNConfigurationOptimized_1000.log
@@ -0,0 +1,25 @@
+==30762== NVPROF is profiling process 30762, command: ./main test 1.0 1
+==30762== Profiling application: ./main test 1.0 1
+==30762== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 64.08%  52.562ms     10000  5.2560us  3.4240us  5.9200us  calcSynapses
+ 35.80%  29.364ms     10000  2.9360us  2.8800us  3.8080us  calcNeurons
+  0.07%  57.888us        44  1.3150us     960ns  2.2400us  [CUDA memcpy HtoD]
+  0.05%  38.240us        14  2.7310us  2.0160us  4.7360us  [CUDA memcpy DtoH]
+
+==30762== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 61.72%  283.35ms        12  23.613ms  14.143us  281.71ms  cudaHostAlloc
+ 35.34%  162.27ms     20000  8.1130us  7.4880us  334.11us  cudaLaunch
+  1.34%  6.1571ms     20000     307ns     256ns  322.44us  cudaConfigureCall
+  1.16%  5.3454ms     20000     267ns     224ns  332.57us  cudaSetupArgument
+  0.23%  1.0363ms        61  16.988us     318ns  37.131us  cudaMemcpy
+  0.14%  644.11us        12  53.676us  11.831us  178.21us  cudaMalloc
+  0.05%  226.72us        83  2.7310us     138ns  97.611us  cuDeviceGetAttribute
+  0.01%  31.315us         1  31.315us  31.315us  31.315us  cuDeviceTotalMem
+  0.01%  26.553us         1  26.553us  26.553us  26.553us  cuDeviceGetName
+  0.00%  13.976us        12  1.1640us     709ns  3.1230us  cudaGetSymbolAddress
+  0.00%  11.238us         1  11.238us  11.238us  11.238us  cudaSetDevice
+  0.00%  1.4430us         2     721ns     438ns  1.0050us  cuDeviceGetCount
+  0.00%  1.4380us         1  1.4380us  1.4380us  1.4380us  cudaGetDeviceCount
+  0.00%     582ns         2     291ns     214ns     368ns  cuDeviceGet
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_HHNeuronsOnly_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_HHNeuronsOnly_CUDAStandaloneConfiguration_1000.log
new file mode 100644
index 00000000..c3db833d
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_HHNeuronsOnly_CUDAStandaloneConfiguration_1000.log
@@ -0,0 +1,21 @@
+==25014== NVPROF is profiling process 25014, command: ./main
+==25014== Profiling application: ./main
+==25014== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 76.60%  171.78ms     10000  17.177us  14.880us  18.080us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, bool*, double*, double*, double*, double*)
+ 13.61%  30.516ms     10000  3.0510us  2.8160us  3.5840us  [CUDA memset]
+  9.79%  21.945ms     10000  2.1940us  1.8240us  2.9120us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*)
+
+==25014== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 58.23%  179.09ms     20000  8.9540us  8.0160us  5.8117ms  cudaLaunch
+ 28.13%  86.520ms     10000  8.6520us  8.0220us  324.89us  cudaMemset
+ 10.05%  30.914ms    160000     193ns     150ns  347.54us  cudaSetupArgument
+  1.94%  5.9702ms     20000     298ns     223ns  315.53us  cudaConfigureCall
+  1.61%  4.9531ms     20000     247ns     210ns  327.22us  cudaGetLastError
+  0.02%  46.728us         1  46.728us  46.728us  46.728us  cudaMemGetInfo
+  0.01%  17.432us        35     498ns     471ns     917ns  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+  0.00%  10.745us         1  10.745us  10.745us  10.745us  cudaDeviceSynchronize
+  0.00%  10.378us         4  2.5940us  2.0060us  3.1740us  cudaFuncGetAttributes
+  0.00%  3.1700us         8     396ns     284ns     677ns  cudaDeviceGetAttribute
+  0.00%  1.6580us         2     829ns     801ns     857ns  cudaGetDevice
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_HHNeuronsOnly_GeNNConfigurationOptimized_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_HHNeuronsOnly_GeNNConfigurationOptimized_1000.log
new file mode 100644
index 00000000..798b69a4
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_HHNeuronsOnly_GeNNConfigurationOptimized_1000.log
@@ -0,0 +1,24 @@
+==25225== NVPROF is profiling process 25225, command: ./main test 1.0 1
+==25225== Profiling application: ./main test 1.0 1
+==25225== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 99.94%  177.51ms     10000  17.750us  14.944us  26.400us  calcNeurons
+  0.04%  62.626us        40  1.5650us     960ns  2.1760us  [CUDA memcpy HtoD]
+  0.02%  38.560us        11  3.5050us  2.0160us  4.6720us  [CUDA memcpy DtoH]
+
+==25225== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 55.84%  235.54ms        10  23.554ms  16.992us  233.93ms  cudaHostAlloc
+ 37.45%  157.95ms     10000  15.795us  7.9250us  353.53us  cudaLaunch
+  4.97%  20.977ms        53  395.80us     389ns  20.008ms  cudaMemcpy
+  0.81%  3.4097ms     10000     340ns     278ns  5.0220us  cudaConfigureCall
+  0.70%  2.9582ms     10000     295ns     232ns  339.82us  cudaSetupArgument
+  0.15%  630.64us        10  63.063us  12.457us  174.83us  cudaMalloc
+  0.05%  227.15us        83  2.7360us     140ns  98.109us  cuDeviceGetAttribute
+  0.01%  31.635us         1  31.635us  31.635us  31.635us  cuDeviceTotalMem
+  0.01%  31.273us         1  31.273us  31.273us  31.273us  cuDeviceGetName
+  0.00%  12.870us        10  1.2870us     741ns  3.5550us  cudaGetSymbolAddress
+  0.00%  10.918us         1  10.918us  10.918us  10.918us  cudaSetDevice
+  0.00%  1.9240us         2     962ns     718ns  1.2060us  cuDeviceGetCount
+  0.00%  1.4330us         1  1.4330us  1.4330us  1.4330us  cudaGetDeviceCount
+  0.00%     657ns         2     328ns     303ns     354ns  cuDeviceGet
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_LinearNeuronsOnly_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_LinearNeuronsOnly_CUDAStandaloneConfiguration_1000.log
new file mode 100644
index 00000000..1b216fb6
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_LinearNeuronsOnly_CUDAStandaloneConfiguration_1000.log
@@ -0,0 +1,18 @@
+==19640== NVPROF is profiling process 19640, command: ./main
+==19640== Profiling application: ./main
+==19640== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+100.00%  247.35ms    100000  2.4730us  2.3360us  3.6800us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*)
+
+==19640== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 87.43%  837.87ms    100000  8.3780us  7.7260us  7.8274ms  cudaLaunch
+  7.01%  67.186ms    400000     167ns     147ns  10.910us  cudaSetupArgument
+  2.81%  26.904ms    100000     269ns     241ns  10.142us  cudaConfigureCall
+  2.74%  26.287ms    100000     262ns     235ns  11.074us  cudaGetLastError
+  0.01%  70.067us         1  70.067us  70.067us  70.067us  cudaMemGetInfo
+  0.00%  14.560us         2  7.2800us  4.1830us  10.377us  cudaFuncGetAttributes
+  0.00%  9.6320us         1  9.6320us  9.6320us  9.6320us  cudaDeviceSynchronize
+  0.00%  5.2800us         2  2.6400us  1.1150us  4.1650us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+  0.00%  3.9840us         1  3.9840us  3.9840us  3.9840us  cudaGetDevice
+  0.00%  3.7360us         4     934ns     668ns  1.5690us  cudaDeviceGetAttribute
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_LinearNeuronsOnly_GeNNConfigurationOptimized_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_LinearNeuronsOnly_GeNNConfigurationOptimized_1000.log
new file mode 100644
index 00000000..a5d528ae
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_LinearNeuronsOnly_GeNNConfigurationOptimized_1000.log
@@ -0,0 +1,24 @@
+==19869== NVPROF is profiling process 19869, command: ./main test 10.0 1
+==19869== Profiling application: ./main test 10.0 1
+==19869== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 99.99%  264.71ms    100000  2.6470us  2.5920us  3.1680us  calcNeurons
+  0.01%  22.656us        16  1.4160us     960ns  2.0800us  [CUDA memcpy HtoD]
+  0.01%  14.624us         5  2.9240us  2.0480us  4.6720us  [CUDA memcpy DtoH]
+
+==19869== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 73.18%  822.50ms    100000  8.2250us  7.6370us  361.19us  cudaLaunch
+ 21.57%  242.48ms         4  60.620ms  23.163us  240.97ms  cudaHostAlloc
+  2.95%  33.155ms    100000     331ns     251ns  369.91us  cudaConfigureCall
+  2.18%  24.551ms    100000     245ns     222ns  14.790us  cudaSetupArgument
+  0.05%  525.28us         4  131.32us  12.450us  178.02us  cudaMalloc
+  0.04%  460.82us        23  20.035us     384ns  39.476us  cudaMemcpy
+  0.02%  226.65us        83  2.7300us     142ns  97.695us  cuDeviceGetAttribute
+  0.00%  31.478us         1  31.478us  31.478us  31.478us  cuDeviceTotalMem
+  0.00%  30.578us         1  30.578us  30.578us  30.578us  cuDeviceGetName
+  0.00%  10.794us         1  10.794us  10.794us  10.794us  cudaSetDevice
+  0.00%  7.9740us         4  1.9930us     876ns  3.7070us  cudaGetSymbolAddress
+  0.00%  1.5520us         2     776ns     553ns     999ns  cuDeviceGetCount
+  0.00%  1.4290us         1  1.4290us  1.4290us  1.4290us  cudaGetDeviceCount
+  0.00%     545ns         2     272ns     256ns     289ns  cuDeviceGet
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPEventDriven_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPEventDriven_CUDAStandaloneConfiguration_1000.log
new file mode 100644
index 00000000..f45b844f
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPEventDriven_CUDAStandaloneConfiguration_1000.log
@@ -0,0 +1,27 @@
+==13883== NVPROF is profiling process 13883, command: ./main
+==13883== Profiling application: ./main
+==13883== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 29.16%  88.869ms     10000  8.8860us  3.4880us  32.064us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, double*, double, double*, int, int*, int, int*, int)
+ 20.89%  63.662ms     20000  3.1830us  3.0400us  3.6800us  [CUDA memset]
+ 17.94%  54.662ms     10000  5.4660us  5.1840us  7.5200us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*)
+ 12.41%  37.829ms     10000  3.7820us  3.6480us  7.2000us  kernel_synapses_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, double, double*, int, int*, int*, int)
+  7.99%  24.357ms     10000  2.4350us  2.1760us  2.8800us  kernel_poissongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*, double*, float*)
+  5.78%  17.601ms     10000  1.7600us  1.5360us  2.4960us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*)
+  5.65%  17.232ms     10000  1.7230us  1.6640us  1.9840us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*)
+  0.17%  532.84us         1  532.84us  532.84us  532.84us  void gen_sequenced<curandStateXORWOW, float, int, __operator_&__(float curand_uniform_noargs<curandStateXORWOW>(curandStateXORWOW*, int))>(curandStateXORWOW*, float*, unsigned long, unsigned long, int)
+
+==13883== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 62.59%  547.05ms     60001  9.1170us  8.1770us  7.2312ms  cudaLaunch
+ 20.36%  177.95ms     20000  8.8970us  8.1030us  336.69us  cudaMemset
+ 13.38%  116.92ms    560005     208ns     150ns  330.03us  cudaSetupArgument
+  1.91%  16.702ms     60001     278ns     208ns  316.80us  cudaConfigureCall
+  1.74%  15.203ms     60002     253ns     222ns  313.88us  cudaGetLastError
+  0.02%  138.47us         1  138.47us  138.47us  138.47us  cudaMalloc
+  0.01%  47.825us         1  47.825us  47.825us  47.825us  cudaMemGetInfo
+  0.00%  24.670us        10  2.4670us  1.9950us  3.8850us  cudaFuncGetAttributes
+  0.00%  22.588us        41     550ns     471ns  1.2300us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+  0.00%  17.416us         1  17.416us  17.416us  17.416us  cudaDeviceSynchronize
+  0.00%  5.6370us        16     352ns     276ns     664ns  cudaDeviceGetAttribute
+  0.00%  3.1450us         4     786ns     601ns  1.1830us  cudaGetDevice
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPEventDriven_GeNNConfigurationOptimized_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPEventDriven_GeNNConfigurationOptimized_1000.log
new file mode 100644
index 00000000..e7d20d38
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPEventDriven_GeNNConfigurationOptimized_1000.log
@@ -0,0 +1,26 @@
+==14124== NVPROF is profiling process 14124, command: ./main test 1.0 1
+==14124== Profiling application: ./main test 1.0 1
+==14124== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 62.29%  109.79ms     10000  10.979us  1.4400us  50.176us  calcSynapses
+ 23.83%  42.003ms     10000  4.2000us  3.3280us  6.2080us  calcNeurons
+ 13.80%  24.321ms     10000  2.4320us  2.0800us  10.848us  learnSynapsesPost
+  0.05%  93.824us        70  1.3400us     960ns  2.1760us  [CUDA memcpy HtoD]
+  0.03%  53.856us        19  2.8340us  1.9520us  4.6400us  [CUDA memcpy DtoH]
+
+==14124== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 54.33%  315.51ms        20  15.776ms  7.4360us  314.37ms  cudaHostAlloc
+ 42.46%  246.58ms     30000  8.2190us  7.6810us  352.29us  cudaLaunch
+  1.62%  9.4165ms     30000     313ns     235ns  338.10us  cudaConfigureCall
+  1.25%  7.2565ms     30000     241ns     219ns  10.061us  cudaSetupArgument
+  0.20%  1.1638ms        95  12.250us     188ns  29.618us  cudaMemcpy
+  0.08%  485.57us        20  24.278us  6.1510us  122.08us  cudaMalloc
+  0.04%  225.75us        83  2.7190us     136ns  97.167us  cuDeviceGetAttribute
+  0.01%  31.148us         1  31.148us  31.148us  31.148us  cuDeviceTotalMem
+  0.00%  27.209us         1  27.209us  27.209us  27.209us  cuDeviceGetName
+  0.00%  25.053us        20  1.2520us     370ns  14.749us  cudaGetSymbolAddress
+  0.00%  11.323us         1  11.323us  11.323us  11.323us  cudaSetDevice
+  0.00%  1.4040us         1  1.4040us  1.4040us  1.4040us  cudaGetDeviceCount
+  0.00%  1.3580us         2     679ns     456ns     902ns  cuDeviceGetCount
+  0.00%     492ns         2     246ns     220ns     272ns  cuDeviceGet
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPMultiPostNeuronalTraces_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPMultiPostNeuronalTraces_CUDAStandaloneConfiguration_1000.log
new file mode 100644
index 00000000..c05b8c43
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPMultiPostNeuronalTraces_CUDAStandaloneConfiguration_1000.log
@@ -0,0 +1,28 @@
+==31645== NVPROF is profiling process 31645, command: ./main
+==31645== Profiling application: ./main
+==31645== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 23.09%  63.632ms     20000  3.1810us  3.0400us  3.8080us  [CUDA memset]
+ 21.51%  59.284ms     10000  5.9280us  5.6320us  7.6160us  kernel_neurongroup_1_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*, double*)
+ 13.19%  36.348ms     10000  3.6340us  3.4240us  12.288us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, double*, double, double*, int, int*, int, int*, int, double*)
+ 12.65%  34.859ms     10000  3.4850us  3.3920us  94.048us  kernel_synapses_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, double, double*, int, double*, int*, int, int)
+  9.89%  27.258ms     10000  2.7250us  2.5280us  2.9760us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*)
+  6.72%  18.518ms     10000  1.8510us  1.7600us  2.8160us  kernel_neurongroup_1_thresholder_codeobject(unsigned int, unsigned int, int*, double*)
+  6.69%  18.444ms     10000  1.8440us  1.6000us  2.4320us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*, double*, float*)
+  6.26%  17.266ms     10000  1.7260us  1.6640us  2.4000us  kernel_neurongroup_1_resetter_codeobject(unsigned int, unsigned int, double*, int*, double*)
+  0.01%  22.689us         1  22.689us  22.689us  22.689us  void gen_sequenced<curandStateXORWOW, float, int, __operator_&__(float curand_uniform_noargs<curandStateXORWOW>(curandStateXORWOW*, int))>(curandStateXORWOW*, float*, unsigned long, unsigned long, int)
+
+==31645== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 66.34%  631.89ms     70001  9.0260us  7.8240us  7.5683ms  cudaLaunch
+ 18.61%  177.26ms     20000  8.8630us  8.0310us  327.63us  cudaMemset
+ 11.06%  105.29ms    570005     184ns     147ns  324.54us  cudaSetupArgument
+  1.98%  18.868ms     70002     269ns     211ns  316.30us  cudaGetLastError
+  1.98%  18.848ms     70001     269ns     196ns  10.259us  cudaConfigureCall
+  0.01%  123.44us         1  123.44us  123.44us  123.44us  cudaMalloc
+  0.01%  48.253us         1  48.253us  48.253us  48.253us  cudaMemGetInfo
+  0.00%  38.693us        74     522ns     468ns  1.2040us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+  0.00%  30.351us        12  2.5290us  2.0130us  4.4000us  cudaFuncGetAttributes
+  0.00%  17.703us         1  17.703us  17.703us  17.703us  cudaDeviceSynchronize
+  0.00%  8.0120us        20     400ns     315ns     771ns  cudaDeviceGetAttribute
+  0.00%  3.7350us         5     747ns     588ns  1.2880us  cudaGetDevice
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPMultiPostNeuronalTraces_GeNNConfigurationOptimized_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPMultiPostNeuronalTraces_GeNNConfigurationOptimized_1000.log
new file mode 100644
index 00000000..7556474c
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPMultiPostNeuronalTraces_GeNNConfigurationOptimized_1000.log
@@ -0,0 +1,26 @@
+==31875== NVPROF is profiling process 31875, command: ./main test 1.0 1
+==31875== Profiling application: ./main test 1.0 1
+==31875== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 51.59%  44.978ms     10000  4.4970us  4.4160us  13.216us  calcNeurons
+ 28.08%  24.482ms     10000  2.4480us  2.4000us  108.48us  learnSynapsesPost
+ 20.19%  17.604ms     10000  1.7600us  1.5680us  8.0320us  calcSynapses
+  0.09%  77.888us        70  1.1120us     960ns  2.0160us  [CUDA memcpy HtoD]
+  0.05%  40.704us        17  2.3940us  2.0480us  4.6720us  [CUDA memcpy DtoH]
+
+==31875== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 49.08%  242.98ms     30000  8.0990us  7.4830us  330.16us  cudaLaunch
+ 46.99%  232.62ms        20  11.631ms  13.742us  230.95ms  cudaHostAlloc
+  1.93%  9.5539ms     30000     318ns     249ns  316.27us  cudaConfigureCall
+  1.50%  7.4449ms     30000     248ns     228ns  9.5620us  cudaSetupArgument
+  0.29%  1.4169ms        93  15.235us     341ns  34.925us  cudaMemcpy
+  0.15%  732.26us        20  36.613us  11.241us  173.89us  cudaMalloc
+  0.05%  225.85us        83  2.7210us     144ns  97.097us  cuDeviceGetAttribute
+  0.01%  31.104us         1  31.104us  31.104us  31.104us  cuDeviceTotalMem
+  0.01%  27.342us         1  27.342us  27.342us  27.342us  cuDeviceGetName
+  0.00%  19.527us        20     976ns     638ns  3.5660us  cudaGetSymbolAddress
+  0.00%  11.180us         1  11.180us  11.180us  11.180us  cudaSetDevice
+  0.00%  1.5790us         2     789ns     579ns  1.0000us  cuDeviceGetCount
+  0.00%  1.4070us         1  1.4070us  1.4070us  1.4070us  cudaGetDeviceCount
+  0.00%     534ns         2     267ns     238ns     296ns  cuDeviceGet
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPMultiPost_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPMultiPost_CUDAStandaloneConfiguration_1000.log
new file mode 100644
index 00000000..d11b4f61
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPMultiPost_CUDAStandaloneConfiguration_1000.log
@@ -0,0 +1,27 @@
+==13752== NVPROF is profiling process 13752, command: ./main
+==13752== Profiling application: ./main
+==13752== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 26.01%  63.681ms     20000  3.1840us  3.0400us  3.8080us  [CUDA memset]
+ 21.90%  53.615ms     10000  5.3610us  5.1840us  7.2640us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*)
+ 16.08%  39.373ms     10000  3.9370us  3.5840us  10.720us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, double*, double, double*, int, int*, int, int*, int)
+ 14.74%  36.097ms     10000  3.6090us  3.4880us  105.60us  kernel_synapses_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, double, double*, int, int*, int*, int)
+  8.31%  20.344ms     10000  2.0340us  1.8560us  2.4320us  kernel_poissongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*, double*, float*)
+  6.61%  16.187ms     10000  1.6180us  1.5040us  2.8160us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*)
+  6.34%  15.535ms     10000  1.5530us  1.4720us  1.9840us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*)
+  0.01%  22.881us         1  22.881us  22.881us  22.881us  void gen_sequenced<curandStateXORWOW, float, int, __operator_&__(float curand_uniform_noargs<curandStateXORWOW>(curandStateXORWOW*, int))>(curandStateXORWOW*, float*, unsigned long, unsigned long, int)
+
+==13752== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 64.39%  566.77ms     60001  9.4450us  8.5300us  7.6226ms  cudaLaunch
+ 20.37%  179.35ms     20000  8.9670us  8.0990us  320.51us  cudaMemset
+ 11.68%  102.80ms    560005     183ns     154ns  320.82us  cudaSetupArgument
+  1.91%  16.807ms     60001     280ns     234ns  314.83us  cudaConfigureCall
+  1.62%  14.260ms     60002     237ns     197ns  325.01us  cudaGetLastError
+  0.01%  125.15us         1  125.15us  125.15us  125.15us  cudaMalloc
+  0.01%  50.027us         1  50.027us  50.027us  50.027us  cudaMemGetInfo
+  0.00%  25.943us        10  2.5940us  1.9990us  4.6510us  cudaFuncGetAttributes
+  0.00%  23.402us        41     570ns     490ns  1.2400us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+  0.00%  17.044us         1  17.044us  17.044us  17.044us  cudaDeviceSynchronize
+  0.00%  6.0160us        16     376ns     279ns  1.0150us  cudaDeviceGetAttribute
+  0.00%  3.0950us         4     773ns     532ns  1.3840us  cudaGetDevice
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPMultiPost_GeNNConfigurationOptimized_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPMultiPost_GeNNConfigurationOptimized_1000.log
new file mode 100644
index 00000000..08c38fd6
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPMultiPost_GeNNConfigurationOptimized_1000.log
@@ -0,0 +1,26 @@
+==13992== NVPROF is profiling process 13992, command: ./main test 1.0 1
+==13992== Profiling application: ./main test 1.0 1
+==13992== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 47.47%  40.621ms     10000  4.0620us  3.9680us  12.064us  calcNeurons
+ 29.19%  24.977ms     10000  2.4970us  2.4000us  360.29us  learnSynapsesPost
+ 23.19%  19.844ms     10000  1.9840us  1.5680us  15.904us  calcSynapses
+  0.10%  83.488us        70  1.1920us     960ns  2.0480us  [CUDA memcpy HtoD]
+  0.05%  45.344us        17  2.6670us  2.0480us  4.7040us  [CUDA memcpy DtoH]
+
+==13992== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 49.24%  255.49ms        20  12.774ms  7.1470us  254.39ms  cudaHostAlloc
+ 47.05%  244.13ms     30000  8.1370us  7.4970us  325.41us  cudaLaunch
+  1.88%  9.7505ms     30000     325ns     240ns  313.30us  cudaConfigureCall
+  1.44%  7.4897ms     30000     249ns     228ns  4.6460us  cudaSetupArgument
+  0.23%  1.1712ms        95  12.328us     191ns  29.827us  cudaMemcpy
+  0.10%  498.07us        20  24.903us  6.1390us  124.17us  cudaMalloc
+  0.04%  225.66us        83  2.7180us     135ns  97.278us  cuDeviceGetAttribute
+  0.01%  31.145us         1  31.145us  31.145us  31.145us  cuDeviceTotalMem
+  0.01%  27.598us         1  27.598us  27.598us  27.598us  cuDeviceGetName
+  0.00%  11.370us        20     568ns     348ns  2.0700us  cudaGetSymbolAddress
+  0.00%  11.183us         1  11.183us  11.183us  11.183us  cudaSetDevice
+  0.00%  1.4160us         2     708ns     453ns     963ns  cuDeviceGetCount
+  0.00%  1.3950us         1  1.3950us  1.3950us  1.3950us  cudaGetDeviceCount
+  0.00%     533ns         2     266ns     241ns     292ns  cuDeviceGet
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPNeuronalTraces_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPNeuronalTraces_CUDAStandaloneConfiguration_1000.log
new file mode 100644
index 00000000..df801df7
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPNeuronalTraces_CUDAStandaloneConfiguration_1000.log
@@ -0,0 +1,28 @@
+==22958== NVPROF is profiling process 22958, command: ./main
+==22958== Profiling application: ./main
+==22958== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 23.34%  76.426ms     10000  7.6420us  3.2960us  26.944us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, double*, double, double*, int, int*, int, int*, int, double*)
+ 19.43%  63.625ms     20000  3.1810us  3.0400us  3.7120us  [CUDA memset]
+ 18.23%  59.686ms     10000  5.9680us  5.6320us  8.0960us  kernel_neurongroup_1_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*, double*)
+ 11.04%  36.142ms     10000  3.6140us  3.3920us  7.0730us  kernel_synapses_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, double, double*, int, double*, int*, int, int)
+  9.09%  29.761ms     10000  2.9760us  2.8800us  3.5840us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*)
+  7.99%  26.155ms     10000  2.6150us  2.2080us  2.8800us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*, double*, float*)
+  5.47%  17.908ms     10000  1.7900us  1.7280us  2.4640us  kernel_neurongroup_1_thresholder_codeobject(unsigned int, unsigned int, int*, double*)
+  5.26%  17.212ms     10000  1.7210us  1.6640us  2.3680us  kernel_neurongroup_1_resetter_codeobject(unsigned int, unsigned int, double*, int*, double*)
+  0.16%  534.91us         1  534.91us  534.91us  534.91us  void gen_sequenced<curandStateXORWOW, float, int, __operator_&__(float curand_uniform_noargs<curandStateXORWOW>(curandStateXORWOW*, int))>(curandStateXORWOW*, float*, unsigned long, unsigned long, int)
+
+==22958== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 66.19%  628.57ms     70001  8.9790us  7.8060us  7.0815ms  cudaLaunch
+ 18.98%  180.22ms     20000  9.0110us  8.1910us  325.17us  cudaMemset
+ 10.84%  102.92ms    570005     180ns     148ns  322.77us  cudaSetupArgument
+  2.05%  19.421ms     70002     277ns     224ns  322.72us  cudaGetLastError
+  1.92%  18.237ms     70001     260ns     204ns  7.6100us  cudaConfigureCall
+  0.01%  139.26us         1  139.26us  139.26us  139.26us  cudaMalloc
+  0.01%  47.740us         1  47.740us  47.740us  47.740us  cudaMemGetInfo
+  0.00%  38.641us        74     522ns     463ns  1.3230us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+  0.00%  31.070us        12  2.5890us  2.0180us  4.6520us  cudaFuncGetAttributes
+  0.00%  17.325us         1  17.325us  17.325us  17.325us  cudaDeviceSynchronize
+  0.00%  7.2280us        20     361ns     279ns     764ns  cudaDeviceGetAttribute
+  0.00%  3.4300us         5     686ns     519ns  1.2200us  cudaGetDevice
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPNeuronalTraces_GeNNConfigurationOptimized_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPNeuronalTraces_GeNNConfigurationOptimized_1000.log
new file mode 100644
index 00000000..4a1b6afb
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPNeuronalTraces_GeNNConfigurationOptimized_1000.log
@@ -0,0 +1,26 @@
+==23186== NVPROF is profiling process 23186, command: ./main test 1.0 1
+==23186== Profiling application: ./main test 1.0 1
+==23186== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 45.72%  59.376ms     10000  5.9370us  1.4400us  22.209us  calcSynapses
+ 36.59%  47.519ms     10000  4.7510us  3.7440us  7.2000us  calcNeurons
+ 17.59%  22.844ms     10000  2.2840us  2.0800us  5.8240us  learnSynapsesPost
+  0.07%  90.016us        70  1.2850us     928ns  2.0480us  [CUDA memcpy HtoD]
+  0.04%  51.168us        19  2.6930us  1.9520us  4.6080us  [CUDA memcpy DtoH]
+
+==23186== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 48.78%  251.54ms        20  12.577ms  7.1400us  250.44ms  cudaHostAlloc
+ 47.58%  245.35ms     30000  8.1780us  7.6280us  342.38us  cudaLaunch
+  1.85%  9.5606ms     30000     318ns     255ns  320.84us  cudaConfigureCall
+  1.41%  7.2598ms     30000     241ns     222ns  5.1580us  cudaSetupArgument
+  0.22%  1.1470ms        93  12.333us     278ns  32.150us  cudaMemcpy
+  0.10%  513.51us        20  25.675us  6.0810us  139.05us  cudaMalloc
+  0.04%  228.09us        83  2.7480us     140ns  98.263us  cuDeviceGetAttribute
+  0.01%  31.411us         1  31.411us  31.411us  31.411us  cuDeviceTotalMem
+  0.01%  27.452us         1  27.452us  27.452us  27.452us  cuDeviceGetName
+  0.00%  12.004us         1  12.004us  12.004us  12.004us  cudaSetDevice
+  0.00%  11.525us        20     576ns     352ns  2.0890us  cudaGetSymbolAddress
+  0.00%  1.6280us         2     814ns     489ns  1.1390us  cuDeviceGetCount
+  0.00%  1.5650us         1  1.5650us  1.5650us  1.5650us  cudaGetDeviceCount
+  0.00%     594ns         2     297ns     230ns     364ns  cuDeviceGet
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPNotEventDriven_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPNotEventDriven_CUDAStandaloneConfiguration_1000.log
new file mode 100644
index 00000000..f8401320
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPNotEventDriven_CUDAStandaloneConfiguration_1000.log
@@ -0,0 +1,28 @@
+==5309== NVPROF is profiling process 5309, command: ./main
+==5309== Profiling application: ./main
+==5309== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 23.35%  73.232ms     10000  7.3230us  3.4560us  24.544us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, int, double*, double, double*, int, int*, int, int*)
+ 20.25%  63.528ms     20000  3.1760us  3.0400us  3.7440us  [CUDA memset]
+ 17.18%  53.899ms     10000  5.3890us  5.0240us  7.6480us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*)
+ 11.40%  35.764ms     10000  3.5760us  3.3920us  6.2720us  kernel_synapses_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, int, double, double*, int, int*)
+  9.18%  28.794ms     10000  2.8790us  2.7840us  3.3600us  kernel_synapses_stateupdater_codeobject(unsigned int, unsigned int, double*, int, double*, int, double*, int*)
+  7.72%  24.206ms     10000  2.4200us  2.2080us  2.8480us  kernel_poissongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*, double*, float*)
+  5.48%  17.200ms     10000  1.7190us  1.6640us  1.9840us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*)
+  5.26%  16.509ms     10000  1.6500us  1.5360us  2.4960us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*)
+  0.17%  534.31us         1  534.31us  534.31us  534.31us  void gen_sequenced<curandStateXORWOW, float, int, __operator_&__(float curand_uniform_noargs<curandStateXORWOW>(curandStateXORWOW*, int))>(curandStateXORWOW*, float*, unsigned long, unsigned long, int)
+
+==5309== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 65.37%  632.10ms     70001  9.0290us  7.8220us  7.1147ms  cudaLaunch
+ 18.21%  176.05ms     20000  8.8020us  7.9140us  65.993us  cudaMemset
+ 11.98%  115.80ms    640005     180ns     150ns  325.82us  cudaSetupArgument
+  2.23%  21.584ms     70002     308ns     218ns  325.68us  cudaGetLastError
+  2.19%  21.175ms     70001     302ns     199ns  314.30us  cudaConfigureCall
+  0.01%  138.56us         1  138.56us  138.56us  138.56us  cudaMalloc
+  0.00%  48.141us         1  48.141us  48.141us  48.141us  cudaMemGetInfo
+  0.00%  40.939us        74     553ns     496ns  1.2830us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+  0.00%  30.402us        12  2.5330us  2.0360us  4.5650us  cudaFuncGetAttributes
+  0.00%  17.493us         1  17.493us  17.493us  17.493us  cudaDeviceSynchronize
+  0.00%  6.8790us        20     343ns     280ns     612ns  cudaDeviceGetAttribute
+  0.00%  3.7860us         5     757ns     587ns  1.2530us  cudaGetDevice
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPNotEventDriven_GeNNConfigurationOptimized_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPNotEventDriven_GeNNConfigurationOptimized_1000.log
new file mode 100644
index 00000000..e8654e31
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDPNotEventDriven_GeNNConfigurationOptimized_1000.log
@@ -0,0 +1,27 @@
+==5547== NVPROF is profiling process 5547, command: ./main test 1.0 1
+==5547== Profiling application: ./main test 1.0 1
+==5547== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 38.01%  64.497ms     10000  6.4490us  1.4720us  25.121us  calcSynapses
+ 24.89%  42.225ms     10000  4.2220us  3.3600us  6.1120us  calcNeurons
+ 22.75%  38.605ms     10000  3.8600us  3.2320us  5.5680us  calcSynapseDynamics
+ 14.26%  24.189ms     10000  2.4180us  2.1120us  6.5920us  learnSynapsesPost
+  0.06%  96.512us        72  1.3400us     928ns  2.0800us  [CUDA memcpy HtoD]
+  0.03%  54.080us        19  2.8460us  1.9840us  4.6720us  [CUDA memcpy DtoH]
+
+==5547== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 53.26%  318.06ms     40000  7.9510us  7.3870us  323.19us  cudaLaunch
+ 42.53%  254.01ms        21  12.096ms  7.5310us  252.89ms  cudaHostAlloc
+  2.21%  13.204ms     40000     330ns     252ns  332.54us  cudaConfigureCall
+  1.66%  9.9116ms     40000     247ns     233ns  5.2730us  cudaSetupArgument
+  0.20%  1.1942ms        97  12.311us     197ns  30.710us  cudaMemcpy
+  0.08%  498.29us        21  23.728us  6.1100us  122.22us  cudaMalloc
+  0.04%  227.33us        83  2.7380us     149ns  97.591us  cuDeviceGetAttribute
+  0.01%  31.273us         1  31.273us  31.273us  31.273us  cuDeviceTotalMem
+  0.00%  27.431us         1  27.431us  27.431us  27.431us  cuDeviceGetName
+  0.00%  11.816us         1  11.816us  11.816us  11.816us  cudaSetDevice
+  0.00%  11.690us        21     556ns     357ns  2.1550us  cudaGetSymbolAddress
+  0.00%  1.4320us         2     716ns     525ns     907ns  cuDeviceGetCount
+  0.00%  1.3390us         1  1.3390us  1.3390us  1.3390us  cudaGetDeviceCount
+  0.00%     577ns         2     288ns     252ns     325ns  cuDeviceGet
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDP_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDP_CUDAStandaloneConfiguration_1000.log
new file mode 100644
index 00000000..3a956eee
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDP_CUDAStandaloneConfiguration_1000.log
@@ -0,0 +1,29 @@
+==30259== NVPROF is profiling process 30259, command: ./main
+==30259== Profiling application: ./main
+==30259== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 29.51%  119.04ms     10000  11.903us  1.4720us  28.312ms  kernel_spikemonitor_codeobject(unsigned int, int*, double, int*, int*, int*, int, int*, double*, int, int*, int*)
+ 19.38%  78.154ms     10000  7.8150us  3.0400us  25.729us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, double*, double, double*, int, int*, int, int*, int)
+ 15.01%  60.555ms     20000  3.0270us  2.8480us  4.2880us  [CUDA memset]
+ 13.45%  54.257ms     10000  5.4250us  4.9280us  8.0000us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*)
+  8.78%  35.407ms     10000  3.5400us  3.2000us  7.1360us  kernel_synapses_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, double, double*, int, int*, int*, int)
+  6.25%  25.200ms     10000  2.5190us  2.1760us  2.8800us  kernel_poissongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*, double*, float*)
+  3.84%  15.476ms     10000  1.5470us  1.4080us  2.4960us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*)
+  3.64%  14.677ms     10000  1.4670us  1.3440us  1.9520us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*)
+  0.13%  535.30us         1  535.30us  535.30us  535.30us  void gen_sequenced<curandStateXORWOW, float, int, __operator_&__(float curand_uniform_noargs<curandStateXORWOW>(curandStateXORWOW*, int))>(curandStateXORWOW*, float*, unsigned long, unsigned long, int)
+  0.02%  69.760us         1  69.760us  69.760us  69.760us  _run_spikemonitor_codeobject_init(void)
+
+==30259== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 66.59%  656.39ms     70002  9.3760us  8.0560us  14.291ms  cudaLaunch
+ 18.06%  178.04ms     20000  8.9010us  7.9370us  1.1364ms  cudaMemset
+ 11.56%  113.99ms    680005     167ns     152ns  60.368us  cudaSetupArgument
+  2.00%  19.667ms     70003     280ns     237ns  57.739us  cudaGetLastError
+  1.77%  17.418ms     70002     248ns     194ns  139.14us  cudaConfigureCall
+  0.01%  139.28us         1  139.28us  139.28us  139.28us  cudaMalloc
+  0.00%  48.635us         1  48.635us  48.635us  48.635us  cudaMemGetInfo
+  0.00%  27.603us        11  2.5090us  1.9830us  4.1880us  cudaFuncGetAttributes
+  0.00%  23.673us        42     563ns     472ns  1.2600us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+  0.00%  18.501us         1  18.501us  18.501us  18.501us  cudaDeviceSynchronize
+  0.00%  6.2050us        16     387ns     285ns     719ns  cudaDeviceGetAttribute
+  0.00%  3.4000us         4     850ns     590ns  1.2110us  cudaGetDevice
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDP_GeNNConfigurationOptimized_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDP_GeNNConfigurationOptimized_1000.log
new file mode 100644
index 00000000..7c0d0855
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_STDP_GeNNConfigurationOptimized_1000.log
@@ -0,0 +1,26 @@
+==30505== NVPROF is profiling process 30505, command: ./main test 1.0 1
+==30505== Profiling application: ./main test 1.0 1
+==30505== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 50.58%  115.54ms     10000  11.553us  1.7280us  50.209us  calcSynapses
+ 21.49%  49.104ms     10000  4.9100us  4.0640us  6.1440us  calcNeurons
+ 16.03%  36.625ms     17853  2.0510us  2.0160us  4.7360us  [CUDA memcpy DtoH]
+ 11.86%  27.088ms     10000  2.7080us  2.5920us  11.392us  learnSynapsesPost
+  0.04%  93.633us        70  1.3370us     960ns  2.1440us  [CUDA memcpy HtoD]
+
+==30505== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 35.14%  309.15ms     20095  15.384us     188ns  352.42us  cudaMemcpy
+ 32.84%  288.94ms        20  14.447ms  7.6290us  287.79ms  cudaHostAlloc
+ 29.91%  263.12ms     30000  8.7700us  7.6720us  331.70us  cudaLaunch
+  1.17%  10.291ms     30000     343ns     248ns  319.74us  cudaConfigureCall
+  0.84%  7.4251ms     30000     247ns     223ns  10.549us  cudaSetupArgument
+  0.06%  487.96us        20  24.398us  6.1080us  126.07us  cudaMalloc
+  0.03%  225.93us        83  2.7220us     138ns  97.475us  cuDeviceGetAttribute
+  0.00%  31.137us         1  31.137us  31.137us  31.137us  cuDeviceTotalMem
+  0.00%  27.695us         1  27.695us  27.695us  27.695us  cuDeviceGetName
+  0.00%  11.547us        20     577ns     375ns  2.1780us  cudaGetSymbolAddress
+  0.00%  11.033us         1  11.033us  11.033us  11.033us  cudaSetDevice
+  0.00%  1.4410us         2     720ns     488ns     953ns  cuDeviceGetCount
+  0.00%  1.3060us         1  1.3060us  1.3060us  1.3060us  cudaGetDeviceCount
+  0.00%     575ns         2     287ns     226ns     349ns  cuDeviceGet
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_SparseHighRateSynapsesOnly_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_SparseHighRateSynapsesOnly_CUDAStandaloneConfiguration_1000.log
new file mode 100644
index 00000000..1ecd96c1
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_SparseHighRateSynapsesOnly_CUDAStandaloneConfiguration_1000.log
@@ -0,0 +1,21 @@
+==29929== NVPROF is profiling process 29929, command: ./main
+==29929== Profiling application: ./main
+==29929== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 86.04%  284.29ms     10000  28.429us  27.328us  32.544us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, double*, int*, int, int*)
+  8.93%  29.521ms     10000  2.9520us  2.8800us  4.4480us  [CUDA memset]
+  5.03%  16.619ms     10000  1.6610us  1.5360us  2.4000us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*)
+
+==29929== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 58.38%  206.98ms     20000  10.348us  8.5120us  8.2431ms  cudaLaunch
+ 28.06%  99.491ms     10000  9.9490us  8.5150us  27.390us  cudaMemset
+  8.91%  31.590ms    170000     185ns     150ns  313.25us  cudaSetupArgument
+  1.79%  6.3337ms     20000     316ns     206ns  303.30us  cudaConfigureCall
+  1.73%  6.1183ms     20000     305ns     199ns  315.94us  cudaGetLastError
+  1.12%  3.9780ms         1  3.9780ms  3.9780ms  3.9780ms  cudaDeviceSynchronize
+  0.01%  46.286us         1  46.286us  46.286us  46.286us  cudaMemGetInfo
+  0.00%  8.3370us         3  2.7790us  2.1280us  3.2430us  cudaFuncGetAttributes
+  0.00%  5.4670us         3  1.8220us     649ns  2.4930us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+  0.00%  1.5130us         4     378ns     295ns     546ns  cudaDeviceGetAttribute
+  0.00%     820ns         1     820ns     820ns     820ns  cudaGetDevice
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_SparseHighRateSynapsesOnly_GeNNConfigurationOptimized_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_SparseHighRateSynapsesOnly_GeNNConfigurationOptimized_1000.log
new file mode 100644
index 00000000..0d0d08dd
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_SparseHighRateSynapsesOnly_GeNNConfigurationOptimized_1000.log
@@ -0,0 +1,25 @@
+==30148== NVPROF is profiling process 30148, command: ./main test 1.0 1
+==30148== Profiling application: ./main test 1.0 1
+==30148== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 88.25%  301.73ms     10000  30.173us  3.3920us  32.704us  calcSynapses
+ 11.72%  40.058ms     10000  4.0050us  3.8080us  4.8640us  calcNeurons
+  0.02%  61.280us        44  1.3920us     960ns  3.2000us  [CUDA memcpy HtoD]
+  0.01%  39.392us        14  2.8130us  1.9840us  6.8480us  [CUDA memcpy DtoH]
+
+==30148== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 54.90%  442.78ms        12  36.898ms  14.006us  441.12ms  cudaHostAlloc
+ 40.88%  329.68ms     20000  16.483us  7.7050us  338.70us  cudaLaunch
+  2.49%  20.082ms        61  329.22us     400ns  18.995ms  cudaMemcpy
+  0.94%  7.5995ms     20000     379ns     255ns  310.22us  cudaConfigureCall
+  0.67%  5.4120ms     20000     270ns     222ns  314.38us  cudaSetupArgument
+  0.08%  639.34us        12  53.278us  11.895us  172.21us  cudaMalloc
+  0.03%  235.92us        83  2.8420us     155ns  101.36us  cuDeviceGetAttribute
+  0.00%  32.471us         1  32.471us  32.471us  32.471us  cuDeviceTotalMem
+  0.00%  30.953us         1  30.953us  30.953us  30.953us  cuDeviceGetName
+  0.00%  14.056us        12  1.1710us     746ns  3.5320us  cudaGetSymbolAddress
+  0.00%  12.473us         1  12.473us  12.473us  12.473us  cudaSetDevice
+  0.00%  1.5390us         1  1.5390us  1.5390us  1.5390us  cudaGetDeviceCount
+  0.00%  1.4990us         2     749ns     424ns  1.0750us  cuDeviceGetCount
+  0.00%     514ns         2     257ns     199ns     315ns  cuDeviceGet
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_SparseLowRateSynapsesOnly_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_SparseLowRateSynapsesOnly_CUDAStandaloneConfiguration_1000.log
new file mode 100644
index 00000000..d7d74aa0
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_SparseLowRateSynapsesOnly_CUDAStandaloneConfiguration_1000.log
@@ -0,0 +1,21 @@
+==8193== NVPROF is profiling process 8193, command: ./main
+==8193== Profiling application: ./main
+==8193== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 55.84%  593.43ms    100000  5.9340us  5.4400us  6.9120us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, double*, int*, int, int*)
+ 28.97%  307.88ms    100000  3.0780us  3.0400us  3.6800us  [CUDA memset]
+ 15.19%  161.38ms    100000  1.6130us  1.5040us  2.5920us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*)
+
+==8193== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 55.92%  1.79370s    200000  8.9680us  7.6320us  7.2529ms  cudaLaunch
+ 29.82%  956.72ms    100000  9.5670us  8.2580us  21.256ms  cudaMemset
+ 10.51%  337.16ms   1700000     198ns     139ns  340.09us  cudaSetupArgument
+  1.91%  61.333ms    200000     306ns     217ns  368.29us  cudaGetLastError
+  1.83%  58.844ms    200000     294ns     168ns  332.73us  cudaConfigureCall
+  0.00%  45.848us         1  45.848us  45.848us  45.848us  cudaMemGetInfo
+  0.00%  12.992us         1  12.992us  12.992us  12.992us  cudaDeviceSynchronize
+  0.00%  8.6600us         3  2.8860us  2.0910us  3.5820us  cudaFuncGetAttributes
+  0.00%  5.3760us         3  1.7920us     594ns  2.4470us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+  0.00%  1.5830us         4     395ns     305ns     591ns  cudaDeviceGetAttribute
+  0.00%     829ns         1     829ns     829ns     829ns  cudaGetDevice
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_SparseLowRateSynapsesOnly_GeNNConfigurationOptimized_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_SparseLowRateSynapsesOnly_GeNNConfigurationOptimized_1000.log
new file mode 100644
index 00000000..c1775029
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_SparseLowRateSynapsesOnly_GeNNConfigurationOptimized_1000.log
@@ -0,0 +1,25 @@
+==8451== NVPROF is profiling process 8451, command: ./main test 10.0 1
+==8451== Profiling application: ./main test 10.0 1
+==8451== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 66.88%  550.62ms    100000  5.5060us  3.4560us  6.4000us  calcSynapses
+ 33.11%  272.64ms    100000  2.7260us  2.6560us  3.7760us  calcNeurons
+  0.01%  53.984us        44  1.2260us     960ns  2.0800us  [CUDA memcpy HtoD]
+  0.00%  35.072us        14  2.5050us  1.9520us  4.7040us  [CUDA memcpy DtoH]
+
+==8451== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 81.32%  1.60600s    200000  8.0290us  7.4920us  354.55us  cudaLaunch
+ 12.69%  250.71ms        12  20.893ms  15.503us  249.06ms  cudaHostAlloc
+  3.37%  66.566ms    200000     332ns     257ns  334.65us  cudaConfigureCall
+  2.52%  49.683ms    200000     248ns     225ns  334.65us  cudaSetupArgument
+  0.05%  1.0155ms        61  16.647us     343ns  35.922us  cudaMemcpy
+  0.03%  641.50us        12  53.458us  12.040us  174.09us  cudaMalloc
+  0.01%  225.49us        83  2.7160us     135ns  97.180us  cuDeviceGetAttribute
+  0.00%  31.170us         1  31.170us  31.170us  31.170us  cuDeviceTotalMem
+  0.00%  26.897us         1  26.897us  26.897us  26.897us  cuDeviceGetName
+  0.00%  13.730us        12  1.1440us     698ns  3.1800us  cudaGetSymbolAddress
+  0.00%  11.132us         1  11.132us  11.132us  11.132us  cudaSetDevice
+  0.00%  1.3520us         2     676ns     376ns     976ns  cuDeviceGetCount
+  0.00%  1.3320us         1  1.3320us  1.3320us  1.3320us  cudaGetDeviceCount
+  0.00%     542ns         2     271ns     213ns     329ns  cuDeviceGet
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_SparseMediumRateSynapsesOnly_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_SparseMediumRateSynapsesOnly_CUDAStandaloneConfiguration_1000.log
new file mode 100644
index 00000000..4d4e1ebe
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_SparseMediumRateSynapsesOnly_CUDAStandaloneConfiguration_1000.log
@@ -0,0 +1,21 @@
+==16276== NVPROF is profiling process 16276, command: ./main
+==16276== Profiling application: ./main
+==16276== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 55.93%  59.598ms     10000  5.9590us  5.6000us  6.8480us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, double*, int*, int, int*)
+ 28.96%  30.864ms     10000  3.0860us  3.0400us  3.5840us  [CUDA memset]
+ 15.11%  16.106ms     10000  1.6100us  1.5040us  2.4000us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*)
+
+==16276== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 57.38%  194.03ms     20000  9.7010us  8.5280us  7.3801ms  cudaLaunch
+ 27.54%  93.116ms     10000  9.3110us  8.6920us  28.380us  cudaMemset
+ 10.82%  36.579ms    170000     215ns     184ns  349.92us  cudaSetupArgument
+  2.15%  7.2682ms     20000     363ns     248ns  327.47us  cudaConfigureCall
+  2.09%  7.0721ms     20000     353ns     266ns  337.12us  cudaGetLastError
+  0.01%  46.564us         1  46.564us  46.564us  46.564us  cudaMemGetInfo
+  0.01%  18.278us         1  18.278us  18.278us  18.278us  cudaDeviceSynchronize
+  0.00%  8.5460us         3  2.8480us  2.1440us  3.4910us  cudaFuncGetAttributes
+  0.00%  5.2380us         3  1.7460us     617ns  2.4330us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+  0.00%  1.7410us         4     435ns     339ns     632ns  cudaDeviceGetAttribute
+  0.00%     956ns         1     956ns     956ns     956ns  cudaGetDevice
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_SparseMediumRateSynapsesOnly_GeNNConfigurationOptimized_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_SparseMediumRateSynapsesOnly_GeNNConfigurationOptimized_1000.log
new file mode 100644
index 00000000..639f22a9
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_SparseMediumRateSynapsesOnly_GeNNConfigurationOptimized_1000.log
@@ -0,0 +1,25 @@
+==16495== NVPROF is profiling process 16495, command: ./main test 1.0 1
+==16495== Profiling application: ./main test 1.0 1
+==16495== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 67.04%  60.321ms     10000  6.0320us  3.4560us  6.5280us  calcSynapses
+ 32.86%  29.567ms     10000  2.9560us  2.9120us  3.7440us  calcNeurons
+  0.06%  54.017us        44  1.2270us     960ns  2.0480us  [CUDA memcpy HtoD]
+  0.04%  36.032us        14  2.5730us  2.0480us  4.7360us  [CUDA memcpy DtoH]
+
+==16495== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 62.23%  290.68ms        12  24.223ms  7.8400us  289.60ms  cudaHostAlloc
+ 35.13%  164.11ms     20000  8.2050us  7.5690us  348.13us  cudaLaunch
+  1.32%  6.1557ms     20000     307ns     255ns  328.87us  cudaConfigureCall
+  1.01%  4.7095ms     20000     235ns     202ns  341.44us  cudaSetupArgument
+  0.16%  750.68us        61  12.306us     358ns  28.177us  cudaMemcpy
+  0.09%  419.68us        12  34.973us  6.2030us  120.19us  cudaMalloc
+  0.05%  227.14us        83  2.7360us     145ns  97.726us  cuDeviceGetAttribute
+  0.01%  31.327us         1  31.327us  31.327us  31.327us  cuDeviceTotalMem
+  0.01%  26.548us         1  26.548us  26.548us  26.548us  cuDeviceGetName
+  0.00%  11.315us         1  11.315us  11.315us  11.315us  cudaSetDevice
+  0.00%  7.9470us        12     662ns     405ns  1.9600us  cudaGetSymbolAddress
+  0.00%  1.5460us         2     773ns     495ns  1.0510us  cuDeviceGetCount
+  0.00%  1.4000us         1  1.4000us  1.4000us  1.4000us  cudaGetDeviceCount
+  0.00%     578ns         2     289ns     223ns     355ns  cuDeviceGet
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_VerySparseMediumRateSynapsesOnly_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_VerySparseMediumRateSynapsesOnly_CUDAStandaloneConfiguration_1000.log
new file mode 100644
index 00000000..782334df
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_VerySparseMediumRateSynapsesOnly_CUDAStandaloneConfiguration_1000.log
@@ -0,0 +1,21 @@
+==6005== NVPROF is profiling process 6005, command: ./main
+==6005== Profiling application: ./main
+==6005== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 55.29%  580.67ms    100000  5.8060us  5.2160us  6.6240us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, double*, int*, int, int*)
+ 29.34%  308.08ms    100000  3.0800us  3.0400us  3.7120us  [CUDA memset]
+ 15.37%  161.45ms    100000  1.6140us  1.5040us  2.5920us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*)
+
+==6005== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 56.44%  1.83924s    200000  9.1960us  7.9810us  7.4326ms  cudaLaunch
+ 29.07%  947.22ms    100000  9.4720us  8.1380us  21.897ms  cudaMemset
+ 10.90%  355.11ms   1700000     208ns     171ns  355.90us  cudaSetupArgument
+  1.82%  59.307ms    200000     296ns     177ns  333.92us  cudaConfigureCall
+  1.77%  57.629ms    200000     288ns     202ns  337.07us  cudaGetLastError
+  0.00%  46.411us         1  46.411us  46.411us  46.411us  cudaMemGetInfo
+  0.00%  13.163us         1  13.163us  13.163us  13.163us  cudaDeviceSynchronize
+  0.00%  8.2890us         3  2.7630us  2.0680us  3.3230us  cudaFuncGetAttributes
+  0.00%  5.4810us         3  1.8270us     565ns  2.5590us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+  0.00%  1.5840us         4     396ns     318ns     545ns  cudaDeviceGetAttribute
+  0.00%     924ns         1     924ns     924ns     924ns  cudaGetDevice
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_VerySparseMediumRateSynapsesOnly_GeNNConfigurationOptimized_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_VerySparseMediumRateSynapsesOnly_GeNNConfigurationOptimized_1000.log
new file mode 100644
index 00000000..da3ef851
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_VerySparseMediumRateSynapsesOnly_GeNNConfigurationOptimized_1000.log
@@ -0,0 +1,25 @@
+==6274== NVPROF is profiling process 6274, command: ./main test 10.0 1
+==6274== Profiling application: ./main test 10.0 1
+==6274== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 69.30%  617.28ms    100000  6.1720us  3.3600us  7.5200us  calcSynapses
+ 30.70%  273.43ms    100000  2.7340us  2.6560us  3.7440us  calcNeurons
+  0.01%  53.472us        44  1.2150us     960ns  2.0480us  [CUDA memcpy HtoD]
+  0.00%  34.560us        14  2.4680us  1.9520us  4.6080us  [CUDA memcpy DtoH]
+
+==6274== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 82.48%  1.61117s    200000  8.0550us  7.0380us  353.83us  cudaLaunch
+ 11.62%  226.99ms        12  18.916ms  7.8850us  225.88ms  cudaHostAlloc
+  3.30%  64.540ms    200000     322ns     238ns  338.74us  cudaConfigureCall
+  2.52%  49.132ms    200000     245ns     211ns  344.36us  cudaSetupArgument
+  0.04%  744.26us        61  12.200us     293ns  32.120us  cudaMemcpy
+  0.02%  421.09us        12  35.090us  6.1780us  119.69us  cudaMalloc
+  0.01%  226.88us        83  2.7330us     137ns  97.756us  cuDeviceGetAttribute
+  0.00%  31.259us         1  31.259us  31.259us  31.259us  cuDeviceTotalMem
+  0.00%  28.119us         1  28.119us  28.119us  28.119us  cuDeviceGetName
+  0.00%  11.457us         1  11.457us  11.457us  11.457us  cudaSetDevice
+  0.00%  8.0410us        12     670ns     397ns  1.9590us  cudaGetSymbolAddress
+  0.00%  1.6770us         2     838ns     479ns  1.1980us  cuDeviceGetCount
+  0.00%  1.4060us         1  1.4060us  1.4060us  1.4060us  cudaGetDeviceCount
+  0.00%     507ns         2     253ns     231ns     276ns  cuDeviceGet
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_VogelsWithSynapticDynamic_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_VogelsWithSynapticDynamic_CUDAStandaloneConfiguration_1000.log
new file mode 100644
index 00000000..2d7eef06
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_VogelsWithSynapticDynamic_CUDAStandaloneConfiguration_1000.log
@@ -0,0 +1,27 @@
+==6312== NVPROF is profiling process 6312, command: ./main
+==6312== Profiling application: ./main
+==6312== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 27.18%  194.20ms     10000  19.419us  3.1680us  2.1194ms  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int*, int, int*, double, int*, int)
+ 22.86%  163.34ms     10000  16.333us  3.1040us  1.6753ms  kernel_synapses_2_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, double*, int, double*, int, int*, double*, int)
+ 14.99%  107.12ms     10000  10.711us  3.2960us  1.1295ms  kernel_synapses_2_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, int*, double, double*, int, double*, double*, int, int*, int, double*, int)
+ 14.22%  101.59ms     10000  10.158us  3.2960us  1.0383ms  kernel_synapses_1_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, int*, double*, double, int*, int)
+  5.84%  41.697ms     10000  4.1690us  3.8720us  5.5360us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double, double*, double*, double*, bool*)
+  4.71%  33.655ms     10000  3.3650us  3.2320us  4.1280us  kernel_synapses_2_stateupdater_codeobject(unsigned int, unsigned int, int*, double*, int, double*, int, double*)
+  4.37%  31.213ms     10000  3.1210us  3.0400us  4.1920us  [CUDA memset]
+  3.37%  24.073ms     10000  2.4070us  2.0160us  5.7920us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*)
+  2.45%  17.497ms     10000  1.7490us  1.5360us  2.7840us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*, bool*)
+
+==6312== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 70.49%  724.20ms     80000  9.0520us  7.8180us  7.3109ms  cudaLaunch
+ 16.18%  166.25ms    940000     176ns     148ns  532.24us  cudaSetupArgument
+  9.28%  95.356ms     10000  9.5350us  8.8100us  1.1346ms  cudaMemset
+  2.07%  21.258ms     80000     265ns     188ns  322.95us  cudaConfigureCall
+  1.97%  20.198ms     80000     252ns     221ns  60.788us  cudaGetLastError
+  0.00%  51.002us         1  51.002us  51.002us  51.002us  cudaMemGetInfo
+  0.00%  42.841us         1  42.841us  42.841us  42.841us  cudaDeviceSynchronize
+  0.00%  41.487us        74     560ns     469ns  2.5840us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+  0.00%  31.858us        12  2.6540us  1.9920us  4.7290us  cudaFuncGetAttributes
+  0.00%  6.5530us        16     409ns     280ns  1.1330us  cudaDeviceGetAttribute
+  0.00%  3.9370us         4     984ns     604ns  1.7060us  cudaGetDevice
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_Vogels_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_Vogels_CUDAStandaloneConfiguration_1000.log
new file mode 100644
index 00000000..6f6703a2
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_Vogels_CUDAStandaloneConfiguration_1000.log
@@ -0,0 +1,26 @@
+==12243== NVPROF is profiling process 12243, command: ./main
+==12243== Profiling application: ./main
+==12243== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 27.91%  192.82ms     10000  19.281us  3.1360us  2.1170ms  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int*, int, int*, double, int*, int)
+ 25.45%  175.79ms     10000  17.578us  3.3280us  1.7610ms  kernel_synapses_2_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, int, double, double*, int, int*)
+ 15.82%  109.25ms     10000  10.925us  3.3600us  1.1837ms  kernel_synapses_2_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, int, int*, int, double, double*, int, double*, int*)
+ 14.27%  98.554ms     10000  9.8550us  3.1680us  1.0373ms  kernel_synapses_1_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, int*, double*, double, int*, int)
+  5.95%  41.110ms     10000  4.1110us  3.7760us  5.3120us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double, double*, double*, double*, bool*)
+  4.53%  31.297ms     10000  3.1290us  2.9440us  4.3200us  [CUDA memset]
+  3.54%  24.435ms     10000  2.4430us  2.0160us  6.0160us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*)
+  2.53%  17.499ms     10000  1.7490us  1.5360us  2.8160us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*, bool*)
+
+==12243== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 69.99%  645.08ms     70000  9.2150us  8.1890us  7.3493ms  cudaLaunch
+ 16.20%  149.30ms    860000     173ns     144ns  1.1943ms  cudaSetupArgument
+ 10.32%  95.084ms     10000  9.5080us  8.7600us  327.83us  cudaMemset
+  1.76%  16.177ms     70000     231ns     200ns  10.120us  cudaGetLastError
+  1.72%  15.875ms     70000     226ns     181ns  5.3450us  cudaConfigureCall
+  0.01%  51.450us         1  51.450us  51.450us  51.450us  cudaMemGetInfo
+  0.00%  25.843us        10  2.5840us  2.0060us  4.6820us  cudaFuncGetAttributes
+  0.00%  25.773us        41     628ns     481ns  2.9340us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+  0.00%  17.259us         1  17.259us  17.259us  17.259us  cudaDeviceSynchronize
+  0.00%  5.8620us        12     488ns     313ns  1.3830us  cudaDeviceGetAttribute
+  0.00%  3.0770us         3  1.0250us     630ns  1.5860us  cudaGetDevice
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_Vogels_GeNNConfigurationOptimized_1000.log b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_Vogels_GeNNConfigurationOptimized_1000.log
new file mode 100644
index 00000000..c45cf1a3
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/nvprof/nvprof_Vogels_GeNNConfigurationOptimized_1000.log
@@ -0,0 +1,26 @@
+==12518== NVPROF is profiling process 12518, command: ./main test 1.0 1
+==12518== Profiling application: ./main test 1.0 1
+==12518== Profiling result:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 59.61%  415.51ms     10000  41.550us  2.0480us  6.0015ms  learnSynapsesPost
+ 29.39%  204.87ms     10000  20.486us  1.5680us  2.4941ms  calcSynapses
+ 10.93%  76.180ms     10000  7.6170us  6.6240us  14.560us  calcNeurons
+  0.06%  385.28us        86  4.4800us     960ns  42.752us  [CUDA memcpy HtoD]
+  0.02%  130.11us        20  6.5050us  1.9840us  40.641us  [CUDA memcpy DtoH]
+
+==12518== API calls:
+Time(%)      Time     Calls       Avg       Min       Max  Name
+ 66.01%  690.75ms     30000  23.025us  7.6920us  649.80us  cudaLaunch
+ 29.49%  308.57ms        26  11.868ms  7.6940us  306.48ms  cudaHostAlloc
+  2.65%  27.715ms       112  247.46us     184ns  25.977ms  cudaMemcpy
+  0.97%  10.186ms     30000     339ns     250ns  318.13us  cudaConfigureCall
+  0.77%  8.0652ms     30000     268ns     222ns  319.03us  cudaSetupArgument
+  0.07%  763.51us        26  29.365us  6.1460us  121.30us  cudaMalloc
+  0.02%  226.59us        83  2.7300us     136ns  97.714us  cuDeviceGetAttribute
+  0.00%  31.319us         1  31.319us  31.319us  31.319us  cuDeviceTotalMem
+  0.00%  28.107us         1  28.107us  28.107us  28.107us  cuDeviceGetName
+  0.00%  15.639us        26     601ns     388ns  2.0380us  cudaGetSymbolAddress
+  0.00%  11.574us         1  11.574us  11.574us  11.574us  cudaSetDevice
+  0.00%  1.7010us         2     850ns     538ns  1.1630us  cuDeviceGetCount
+  0.00%  1.5690us         1  1.5690us  1.5690us  1.5690us  cudaGetDeviceCount
+  0.00%     540ns         2     270ns     227ns     313ns  cuDeviceGet
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_AdaptationOscillation_absolute.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_AdaptationOscillation_absolute.png
new file mode 100644
index 00000000..e1ba28f3
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_AdaptationOscillation_absolute.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_AdaptationOscillation_profiling.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_AdaptationOscillation_profiling.png
new file mode 100644
index 00000000..d19c440d
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_AdaptationOscillation_profiling.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_AdaptationOscillation_relative.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_AdaptationOscillation_relative.png
new file mode 100644
index 00000000..c270386e
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_AdaptationOscillation_relative.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_BrunelHakimModelHeterogeneousDelay_absolute.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_BrunelHakimModelHeterogeneousDelay_absolute.png
new file mode 100644
index 00000000..84a46b24
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_BrunelHakimModelHeterogeneousDelay_absolute.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_BrunelHakimModelHeterogeneousDelay_profiling.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_BrunelHakimModelHeterogeneousDelay_profiling.png
new file mode 100644
index 00000000..bd726111
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_BrunelHakimModelHeterogeneousDelay_profiling.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_BrunelHakimModelHeterogeneousDelay_relative.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_BrunelHakimModelHeterogeneousDelay_relative.png
new file mode 100644
index 00000000..ffea3406
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_BrunelHakimModelHeterogeneousDelay_relative.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_BrunelHakimModelScalarDelayNoMultiPrePost_absolute.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_BrunelHakimModelScalarDelayNoMultiPrePost_absolute.png
new file mode 100644
index 00000000..58e7259f
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_BrunelHakimModelScalarDelayNoMultiPrePost_absolute.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_BrunelHakimModelScalarDelayNoMultiPrePost_profiling.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_BrunelHakimModelScalarDelayNoMultiPrePost_profiling.png
new file mode 100644
index 00000000..399afa8c
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_BrunelHakimModelScalarDelayNoMultiPrePost_profiling.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_BrunelHakimModelScalarDelayNoMultiPrePost_relative.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_BrunelHakimModelScalarDelayNoMultiPrePost_relative.png
new file mode 100644
index 00000000..919caf69
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_BrunelHakimModelScalarDelayNoMultiPrePost_relative.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_BrunelHakimModelScalarDelay_absolute.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_BrunelHakimModelScalarDelay_absolute.png
new file mode 100644
index 00000000..0411270d
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_BrunelHakimModelScalarDelay_absolute.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_BrunelHakimModelScalarDelay_profiling.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_BrunelHakimModelScalarDelay_profiling.png
new file mode 100644
index 00000000..6cc9c92e
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_BrunelHakimModelScalarDelay_profiling.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_BrunelHakimModelScalarDelay_relative.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_BrunelHakimModelScalarDelay_relative.png
new file mode 100644
index 00000000..7a5d4fc2
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_BrunelHakimModelScalarDelay_relative.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_COBAHHFixedConnectivity_absolute.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_COBAHHFixedConnectivity_absolute.png
new file mode 100644
index 00000000..e4edc8b6
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_COBAHHFixedConnectivity_absolute.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_COBAHHFixedConnectivity_profiling.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_COBAHHFixedConnectivity_profiling.png
new file mode 100644
index 00000000..44159113
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_COBAHHFixedConnectivity_profiling.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_COBAHHFixedConnectivity_relative.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_COBAHHFixedConnectivity_relative.png
new file mode 100644
index 00000000..7257cc46
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_COBAHHFixedConnectivity_relative.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_COBAHH_absolute.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_COBAHH_absolute.png
new file mode 100644
index 00000000..a69e0cdb
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_COBAHH_absolute.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_COBAHH_profiling.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_COBAHH_profiling.png
new file mode 100644
index 00000000..f44f9932
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_COBAHH_profiling.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_COBAHH_relative.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_COBAHH_relative.png
new file mode 100644
index 00000000..3b17d55f
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_COBAHH_relative.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_CUBAFixedConnectivity_absolute.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_CUBAFixedConnectivity_absolute.png
new file mode 100644
index 00000000..c18c0899
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_CUBAFixedConnectivity_absolute.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_CUBAFixedConnectivity_profiling.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_CUBAFixedConnectivity_profiling.png
new file mode 100644
index 00000000..aa7b06d1
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_CUBAFixedConnectivity_profiling.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_CUBAFixedConnectivity_relative.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_CUBAFixedConnectivity_relative.png
new file mode 100644
index 00000000..9a8341ed
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_CUBAFixedConnectivity_relative.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_CUBA_absolute.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_CUBA_absolute.png
new file mode 100644
index 00000000..2a605e63
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_CUBA_absolute.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_CUBA_profiling.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_CUBA_profiling.png
new file mode 100644
index 00000000..d7b98866
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_CUBA_profiling.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_CUBA_relative.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_CUBA_relative.png
new file mode 100644
index 00000000..7fd8860d
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_CUBA_relative.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_DenseMediumRateSynapsesOnly_absolute.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_DenseMediumRateSynapsesOnly_absolute.png
new file mode 100644
index 00000000..ee874308
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_DenseMediumRateSynapsesOnly_absolute.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_DenseMediumRateSynapsesOnly_profiling.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_DenseMediumRateSynapsesOnly_profiling.png
new file mode 100644
index 00000000..07ce992e
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_DenseMediumRateSynapsesOnly_profiling.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_DenseMediumRateSynapsesOnly_relative.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_DenseMediumRateSynapsesOnly_relative.png
new file mode 100644
index 00000000..a844b061
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_DenseMediumRateSynapsesOnly_relative.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_HHNeuronsOnly_absolute.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_HHNeuronsOnly_absolute.png
new file mode 100644
index 00000000..5453d4bd
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_HHNeuronsOnly_absolute.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_HHNeuronsOnly_profiling.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_HHNeuronsOnly_profiling.png
new file mode 100644
index 00000000..56192923
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_HHNeuronsOnly_profiling.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_HHNeuronsOnly_relative.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_HHNeuronsOnly_relative.png
new file mode 100644
index 00000000..98ee6788
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_HHNeuronsOnly_relative.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_LinearNeuronsOnly_absolute.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_LinearNeuronsOnly_absolute.png
new file mode 100644
index 00000000..5d629b4b
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_LinearNeuronsOnly_absolute.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_LinearNeuronsOnly_profiling.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_LinearNeuronsOnly_profiling.png
new file mode 100644
index 00000000..e26ef41a
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_LinearNeuronsOnly_profiling.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_LinearNeuronsOnly_relative.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_LinearNeuronsOnly_relative.png
new file mode 100644
index 00000000..255f8cdf
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_LinearNeuronsOnly_relative.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPEventDriven_absolute.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPEventDriven_absolute.png
new file mode 100644
index 00000000..0599d925
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPEventDriven_absolute.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPEventDriven_profiling.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPEventDriven_profiling.png
new file mode 100644
index 00000000..8a2f88e0
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPEventDriven_profiling.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPEventDriven_relative.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPEventDriven_relative.png
new file mode 100644
index 00000000..29696eb2
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPEventDriven_relative.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPMultiPostNeuronalTraces_absolute.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPMultiPostNeuronalTraces_absolute.png
new file mode 100644
index 00000000..66bf3e36
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPMultiPostNeuronalTraces_absolute.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPMultiPostNeuronalTraces_profiling.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPMultiPostNeuronalTraces_profiling.png
new file mode 100644
index 00000000..03d00681
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPMultiPostNeuronalTraces_profiling.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPMultiPostNeuronalTraces_relative.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPMultiPostNeuronalTraces_relative.png
new file mode 100644
index 00000000..11ef77a4
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPMultiPostNeuronalTraces_relative.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPMultiPost_absolute.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPMultiPost_absolute.png
new file mode 100644
index 00000000..65f2d9d4
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPMultiPost_absolute.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPMultiPost_profiling.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPMultiPost_profiling.png
new file mode 100644
index 00000000..15f230be
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPMultiPost_profiling.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPMultiPost_relative.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPMultiPost_relative.png
new file mode 100644
index 00000000..a709f40e
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPMultiPost_relative.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPNeuronalTraces_absolute.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPNeuronalTraces_absolute.png
new file mode 100644
index 00000000..4507a51e
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPNeuronalTraces_absolute.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPNeuronalTraces_profiling.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPNeuronalTraces_profiling.png
new file mode 100644
index 00000000..e9b440b3
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPNeuronalTraces_profiling.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPNeuronalTraces_relative.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPNeuronalTraces_relative.png
new file mode 100644
index 00000000..23ce5dc6
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPNeuronalTraces_relative.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPNotEventDriven_absolute.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPNotEventDriven_absolute.png
new file mode 100644
index 00000000..56bd1ec9
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPNotEventDriven_absolute.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPNotEventDriven_profiling.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPNotEventDriven_profiling.png
new file mode 100644
index 00000000..e1f9b29e
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPNotEventDriven_profiling.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPNotEventDriven_relative.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPNotEventDriven_relative.png
new file mode 100644
index 00000000..a114006a
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDPNotEventDriven_relative.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDP_absolute.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDP_absolute.png
new file mode 100644
index 00000000..9a6e8891
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDP_absolute.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDP_profiling.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDP_profiling.png
new file mode 100644
index 00000000..54ddab08
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDP_profiling.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDP_relative.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDP_relative.png
new file mode 100644
index 00000000..ceac1522
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_STDP_relative.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_SparseHighRateSynapsesOnly_absolute.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_SparseHighRateSynapsesOnly_absolute.png
new file mode 100644
index 00000000..3e2bd173
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_SparseHighRateSynapsesOnly_absolute.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_SparseHighRateSynapsesOnly_profiling.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_SparseHighRateSynapsesOnly_profiling.png
new file mode 100644
index 00000000..4494d0a3
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_SparseHighRateSynapsesOnly_profiling.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_SparseHighRateSynapsesOnly_relative.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_SparseHighRateSynapsesOnly_relative.png
new file mode 100644
index 00000000..85ddf107
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_SparseHighRateSynapsesOnly_relative.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_SparseLowRateSynapsesOnly_absolute.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_SparseLowRateSynapsesOnly_absolute.png
new file mode 100644
index 00000000..5cb5a1a8
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_SparseLowRateSynapsesOnly_absolute.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_SparseLowRateSynapsesOnly_profiling.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_SparseLowRateSynapsesOnly_profiling.png
new file mode 100644
index 00000000..5562caa9
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_SparseLowRateSynapsesOnly_profiling.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_SparseLowRateSynapsesOnly_relative.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_SparseLowRateSynapsesOnly_relative.png
new file mode 100644
index 00000000..70d71c34
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_SparseLowRateSynapsesOnly_relative.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_SparseMediumRateSynapsesOnly_absolute.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_SparseMediumRateSynapsesOnly_absolute.png
new file mode 100644
index 00000000..bc14e017
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_SparseMediumRateSynapsesOnly_absolute.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_SparseMediumRateSynapsesOnly_profiling.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_SparseMediumRateSynapsesOnly_profiling.png
new file mode 100644
index 00000000..574e7d10
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_SparseMediumRateSynapsesOnly_profiling.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_SparseMediumRateSynapsesOnly_relative.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_SparseMediumRateSynapsesOnly_relative.png
new file mode 100644
index 00000000..954b7f19
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_SparseMediumRateSynapsesOnly_relative.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_VerySparseMediumRateSynapsesOnly_absolute.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_VerySparseMediumRateSynapsesOnly_absolute.png
new file mode 100644
index 00000000..85709744
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_VerySparseMediumRateSynapsesOnly_absolute.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_VerySparseMediumRateSynapsesOnly_profiling.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_VerySparseMediumRateSynapsesOnly_profiling.png
new file mode 100644
index 00000000..76f619d0
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_VerySparseMediumRateSynapsesOnly_profiling.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_VerySparseMediumRateSynapsesOnly_relative.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_VerySparseMediumRateSynapsesOnly_relative.png
new file mode 100644
index 00000000..17a234eb
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_VerySparseMediumRateSynapsesOnly_relative.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_VogelsWithSynapticDynamic_absolute.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_VogelsWithSynapticDynamic_absolute.png
new file mode 100644
index 00000000..207758e2
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_VogelsWithSynapticDynamic_absolute.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_VogelsWithSynapticDynamic_profiling.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_VogelsWithSynapticDynamic_profiling.png
new file mode 100644
index 00000000..199ec4f8
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_VogelsWithSynapticDynamic_profiling.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_VogelsWithSynapticDynamic_relative.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_VogelsWithSynapticDynamic_relative.png
new file mode 100644
index 00000000..378cdc89
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_VogelsWithSynapticDynamic_relative.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_Vogels_absolute.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_Vogels_absolute.png
new file mode 100644
index 00000000..8f6a8e3c
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_Vogels_absolute.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_Vogels_profiling.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_Vogels_profiling.png
new file mode 100644
index 00000000..c9d9e666
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_Vogels_profiling.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_Vogels_relative.png b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_Vogels_relative.png
new file mode 100644
index 00000000..48573e36
Binary files /dev/null and b/dev/benchmarks/results_2017_04_05_complete_after_talk/plots/speed_test_Vogels_relative.png differ
diff --git a/dev/benchmarks/results_2017_04_05_complete_after_talk/run_speed_test_script.py b/dev/benchmarks/results_2017_04_05_complete_after_talk/run_speed_test_script.py
new file mode 100644
index 00000000..865118ae
--- /dev/null
+++ b/dev/benchmarks/results_2017_04_05_complete_after_talk/run_speed_test_script.py
@@ -0,0 +1,249 @@
+import os
+import shutil
+import glob
+import subprocess
+import sys
+
+# run tests without X-server
+import matplotlib
+matplotlib.use('Agg')
+
+# pretty plots
+import seaborn
+
+import time
+import datetime
+import cPickle as pickle
+
+from brian2 import *
+from brian2.tests.features import *
+from brian2.tests.features.base import *
+from brian2.tests.features.base import results
+
+import brian2cuda
+from brian2cuda.tests.features.cuda_configuration import (CUDAStandaloneConfiguration,
+                                                          CUDAStandaloneConfigurationNoAssert,
+                                                          CUDAStandaloneConfigurationCurandDouble,
+                                                          CUDAStandaloneConfigurationNoCudaOccupancyAPI,
+                                                          CUDAStandaloneConfigurationNoCudaOccupancyAPIProfileCPU,
+                                                          CUDAStandaloneConfiguration2BlocksPerSM,
+                                                          CUDAStandaloneConfiguration2BlocksPerSMLaunchBounds,
+                                                          CUDAStandaloneConfigurationSynLaunchBounds,
+                                                          CUDAStandaloneConfiguration2BlocksPerSMSynLaunchBounds,
+                                                          CUDAStandaloneConfigurationProfileGPU,
+                                                          CUDAStandaloneConfigurationProfileCPU)
+from brian2cuda.tests.features.speed import *
+
+from brian2genn.correctness_testing import GeNNConfiguration, GeNNConfigurationCPU, GeNNConfigurationOptimized
+
+from create_readme import create_readme
+
+assert len(sys.argv)<= 2, 'Only one command line argument supported! Got {}'.format(len(sys.argv)-1)
+if len(sys.argv) == 2:
+    additional_dir_name = '_' + sys.argv[1]
+else:
+    additional_dir_name = ''
+
+prefs['devices.cpp_standalone.extra_make_args_unix'] = ['-j12']
+
+configs = [# configuration                          project_directory
+          #(NumpyConfiguration,                     None),
+          #(WeaveConfiguration,                     None),
+          #(LocalConfiguration,                     None),
+          (CUDAStandaloneConfiguration,             'cuda_standalone'),
+          #(CUDAStandaloneConfigurationNoAssert,             'cuda_standalone'),
+          #(CUDAStandaloneConfigurationNoThreadfence,  'cuda_standalone'),
+          #(CUDAStandaloneConfigurationCurandDouble,              'cuda_standalone'),
+          #(CUDAStandaloneConfigurationNoCudaOccupancyAPI,      'cuda_standalone'),
+          #(CUDAStandaloneConfigurationNoCudaOccupancyAPIProfileCPU,    'cuda_standalone'),
+          #(CUDAStandaloneConfiguration2BlocksPerSM, 'cuda_standalone'),
+          #(CUDAStandaloneConfiguration2BlocksPerSMLaunchBounds, 'cuda_standalone'),
+          #(CUDAStandaloneConfigurationSynLaunchBounds,     'cuda_standalone'),
+          #(CUDAStandaloneConfiguration2BlocksPerSMSynLaunchBounds, 'cuda_standalone'),
+          #(CUDAStandaloneConfigurationProfileGPU,   'cuda_standalone'),
+          #(CUDAStandaloneConfigurationProfileCPU,   'cuda_standalone'),
+          (CPPStandaloneConfiguration,              'cpp_standalone'),
+          #(GeNNConfiguration,                       'GeNNworkspace'),
+          #(CPPStandaloneConfigurationOpenMP,        'cpp_standalone'),
+          #(GeNNConfigurationCPU,                    'GeNNworkspace'),
+          (GeNNConfigurationOptimized,              'GeNNworkspace')
+          ]
+
+speed_tests = [# feature_test                     name                                  n_slice
+
+               #(ThresholderOnlyPoissonLowRate,                         'ThresholderOnlyPoissonLowRate',                       slice(None)         ),
+               #(ThresholderOnlyPoissonMediumRate,                         'ThresholderOnlyPoissonMediumRate',                       slice(None)         ),
+               #(ThresholderOnlyPoissonHighRate,                         'ThresholderOnlyPoissonHighRate',                       slice(None)         ),
+               #(ThresholderOnlyAlwaysSpiking,                         'ThresholderOnlyAlwaysSpiking',                       slice(None)         ),
+
+               #(BrunelHakimStateupdateOnlyDouble,           'BrunelHakimStateupdateOnlyDouble',         slice(None)         ),
+               #(BrunelHakimStateupdateOnlyTriple,           'BrunelHakimStateupdateOnlyTriple',         slice(None)         ),
+               #(BrunelHakimStateupdateOnly,           'BrunelHakimStateupdateOnly',         slice(None)         ),
+               #(BrunelHakimNeuronsOnly,           'BrunelHakimNeuronsOnly',         slice(None)         ),
+               #(BrunelHakimNeuronsOnlyNoXi,           'BrunelHakimNeuronsOnlyNoXi',         slice(None)         ),
+               #(BrunelHakimNeuronsOnlyNoRand,           'BrunelHakimNeuronsOnlyNoRand',         slice(None)         ),
+               #(BrunelHakimStateupdateThresholdOnly,           'BrunelHakimStateupdateThresholdOnly',         slice(None)         ),
+               #(BrunelHakimStateupdateThresholdResetOnly,           'BrunelHakimStateupdateThresholdResetOnly',         slice(None)         ),
+               #(BrunelHakimModelScalarDelayShort,      'BrunelHakimModelScalarDelayShort',     slice(None)         ),
+              (CUBA,                                 'CUBA',                              slice(None)         ),
+              (COBAHH,                                 'COBAHH',                              slice(None)         ),
+              (AdaptationOscillation,                  'AdaptationOscillation',               slice(None)         ),
+              (Vogels,                                 'Vogels',                              slice(None)         ),
+               (STDP,                                   'STDP',                                slice(None)         ),
+              (STDPEventDriven,                        'STDPEventDriven',                     slice(None)         ),
+              (BrunelHakimModelScalarDelay,           'BrunelHakimModelScalarDelay',         slice(None)         ),
+              (BrunelHakimModelScalarDelayNoMultiPrePost,           'BrunelHakimModelScalarDelayNoMultiPrePost',         slice(None)         ),
+
+               (VerySparseMediumRateSynapsesOnly,       'VerySparseMediumRateSynapsesOnly',    slice(None)         ),
+               (SparseMediumRateSynapsesOnly,           'SparseMediumRateSynapsesOnly',        slice(None)         ),
+               (DenseMediumRateSynapsesOnly,            'DenseMediumRateSynapsesOnly',         slice(None)         ),
+               (SparseLowRateSynapsesOnly,              'SparseLowRateSynapsesOnly',           slice(None)         ),
+               (SparseHighRateSynapsesOnly,             'SparseHighRateSynapsesOnly',          slice(None)         ),
+
+               (STDPNotEventDriven,                     'STDPNotEventDriven',                  slice(None)         ),
+               (STDPMultiPost,                          'STDPMultiPost',                        slice(None)         ),
+               (STDPNeuronalTraces,                     'STDPNeuronalTraces',                   slice(None)         ),
+               (STDPMultiPostNeuronalTraces,            'STDPMultiPostNeuronalTraces',          slice(None)         ),
+
+              (BrunelHakimModelHeterogeneousDelay,    'BrunelHakimModelHeterogeneousDelay',  slice(None)         ),
+
+              (LinearNeuronsOnly,                     'LinearNeuronsOnly',                   slice(None)         ),
+              (HHNeuronsOnly,                         'HHNeuronsOnly',                       slice(None)         ),
+               (VogelsWithSynapticDynamic,              'VogelsWithSynapticDynamic',           slice(None)         ),
+## below uses monitors
+               (CUBAFixedConnectivity,                 'CUBAFixedConnectivity',               slice(None)         ),
+               (COBAHHFixedConnectivity,                'COBAHHFixedConnectivity',             slice(None, -1)     ),
+]
+
+configurations = [config[0] for config in configs]
+project_dirs = [config[1] for config in configs]
+
+# Warn when several configurations share one project directory and record, per directory, the last configuration using it
+last_idx = {}  # project directory -> index (into `configurations`) of the last configuration that uses it
+for proj_dir in project_dirs:
+    if proj_dir is not None:
+        first_i = project_dirs.index(proj_dir)
+        last_i = len(project_dirs) - 1 - project_dirs[::-1].index(proj_dir)  # last occurrence, found via the reversed list
+        if first_i != last_i:
+            print("WARNING there are multiple configurations using {d} as project "
+                  "directory. Profiling and logfiles will only be saved for the last one {c}.".format(
+                  d=proj_dir, c=configurations[last_i].__name__))
+        last_idx[proj_dir] = last_i
+
+time_stemp = time.time()  # only used to build the date part of the results directory name
+date_str = datetime.datetime.fromtimestamp(time_stemp).strftime('%Y_%m_%d')
+
+directory = 'results_{}{}'.format(date_str, additional_dir_name)
+if os.path.exists(directory):  # keep earlier results: the existing directory is renamed, not overwritten
+    new_dir = directory + '_bak_' + str(int(time.time()))
+    print("Directory with name `{}` already exists. Renaming it to `{}`.".format(directory, new_dir))
+    os.rename(directory, new_dir)
+os.makedirs(directory)
+data_dir = os.path.join(directory, 'data')  # pickled result objects
+plot_dir = os.path.join(directory, 'plots')  # saved figures
+log_dir = os.path.join(directory, 'logs')  # copied stdout logs
+prof_dir = os.path.join(directory, 'nvprof')  # nvprof log files
+os.makedirs(data_dir)
+os.makedirs(plot_dir)
+os.makedirs(log_dir)
+os.makedirs(prof_dir)
+print("Saving results in {}.".format(plot_dir))  # NOTE(review): reports only the plots subdirectory
+
+shutil.copy(os.path.realpath(__file__), os.path.join(directory, 'run_speed_test_script.py'))
+
+time_format = '%d.%m.%Y at %H:%M:%S'
+script_start = datetime.datetime.fromtimestamp(time.time()).strftime(time_format)
+
+with open(os.path.join(directory, 'git.diff'), 'w') as diff_file:
+    subprocess.call(['git', 'diff'], stdout=diff_file)
+
+try:
+    for n, (st, name, sl) in enumerate(speed_tests):
+        start = datetime.datetime.fromtimestamp(time.time()).strftime(time_format)
+        print("Starting {} on {}.".format(name, start))
+        maximum_run_time = 1*60*60*second  # 1*60*60 seconds, i.e. one hour
+        #st.duration = 10*second
+        res = run_speed_tests(configurations=configurations,
+                              speed_tests=[st],
+                              n_slice=sl,
+                              #n_slice=slice(0,1,None),
+                              run_twice=False,
+                              verbose=True,
+                              maximum_run_time=maximum_run_time,
+                              profile_only_active=True)
+                              #profile_only_active=False)
+        end = datetime.datetime.fromtimestamp(time.time()).strftime(time_format)
+        diff = datetime.datetime.strptime(end, time_format) - datetime.datetime.strptime(start, time_format)  # whole-second resolution: both ends went through time_format
+        print("Running {} took {}.".format(name, diff))
+        res.plot_all_tests(print_relative=True)
+        savefig(os.path.join(plot_dir, 'speed_test_{}_absolute.png'.format(speed_tests[n][1])))  # speed_tests[n][1] is the same string as `name`
+        res.plot_all_tests(relative=True)
+        savefig(os.path.join(plot_dir, 'speed_test_{}_relative.png'.format(name)))
+        res.plot_all_tests(profiling_minimum=0.05)
+        savefig(os.path.join(plot_dir, 'speed_test_{}_profiling.png'.format(name)))
+        if 3 != len(get_fignums()):
+            print("WARNING: There were {} plots created, but only {} saved.".format(len(get_fignums()), 3*(n+1)))
+        for n in get_fignums():
+            close(n)
+
+        # pickle results object to disk
+        pkl_file = os.path.join(data_dir, name + '.pkl' )
+        with open(pkl_file, 'wb') as output:
+                pickle.dump(res, output, pickle.HIGHEST_PROTOCOL)
+
+        # save stdout log of last run (the other are deleted in run_speed_tests())
+        for proj_dir in set(project_dirs):
+            if not proj_dir is None and proj_dir in ['cuda_standalone', 'cpp_standalone']:
+                config = configurations[last_idx[proj_dir]]
+                stdout_file = os.path.join(proj_dir, 'results/stdout.txt')
+                if os.path.exists(stdout_file):
+                    shutil.copy(stdout_file,
+                                os.path.join(log_dir, 'stdout_{st}_{conf}_{n}.txt'.format(st=name, conf=proj_dir,
+                                                                                           n=st.n_range[sl][-1])))
+                else:
+                    print("WARNING Couldn't save {},file not found.".format(stdout_file))
+
+        # rerun one network size (n_range[2]) under nvprof for configurations built in cuda_standalone or GeNNworkspace
+        for conf, proj_dir in zip(configurations, project_dirs):
+            main_arg = ''
+            if proj_dir in ['cuda_standalone', 'GeNNworkspace']:
+                if proj_dir == 'GeNNworkspace':
+                    main_arg = 'test {time} 1'.format(time=st.duration/second)
+                ns = st.n_range[sl]  # NOTE(review): `ns` is never read; the unsliced st.n_range[idx] is used below
+                idx = 2  # NOTE(review): assumes st.n_range has at least 3 entries
+                max_runtime = 20  # profile only if the plain rerun was faster than this (presumably seconds - confirm)
+                conf_name = conf.__name__
+                print("Rerunning {} with n = {} for nvprof profiling".format(conf_name, st.n_range[idx]))
+                tb, res, runtime, prof_info = results(conf, st, st.n_range[idx], maximum_run_time=maximum_run_time)  # rebinds `res` (previously the run_speed_tests() result)
+                if not isinstance(res, Exception) and runtime < max_runtime:
+                    option = '--profile-from-start-off' if proj_dir == 'cuda_standalone' else ''
+                    cmd = 'cd {proj_dir} && nvprof {opt} --log-file ../{log_file} ./main {arg}'.format(  # log path is relative to the script's cwd, hence `../` after the `cd`
+                        proj_dir=proj_dir, arg=main_arg, opt=option,
+                        log_file=os.path.join(prof_dir, 'nvprof_{st}_{conf}_{n}.log'.format(
+                            st=name, conf=conf_name, n=st.n_range[idx])))
+                    prof_start = datetime.datetime.fromtimestamp(time.time()).strftime(time_format)
+                    print(cmd)
+                    x = os.system(cmd)
+                    if x:  # non-zero return value from os.system
+                        print('nvprof failed with {}'.format(x))
+                    prof_end = datetime.datetime.fromtimestamp(time.time()).strftime(time_format)
+                    prof_diff = datetime.datetime.strptime(prof_end, time_format) - datetime.datetime.strptime(prof_start, time_format)
+                    print("Profiling took {} for runtime of {}".format(prof_diff, runtime))
+finally:
+    create_readme(directory)
+    print("\nSummarized speed test results in {}".format(directory + '/README.md'))
+    script_end = datetime.datetime.fromtimestamp(time.time()).strftime(time_format)
+    script_diff = datetime.datetime.strptime(script_end, time_format) - datetime.datetime.strptime(script_start, time_format)
+    print("Finished speed test on {}. Total time = {}.".format(
+        datetime.datetime.fromtimestamp(time.time()).strftime(time_format), script_diff))
+
+
+##res.plot_all_tests(relative=True)
+#for n in get_fignums():
+#    plt.figure(n)
+#    savefig(plot_dir + '/speed_test_{}.png'.format(speed_tests[n-1][1]))
+
+## Debug (includes profiling infos)
+#from brian2.tests.features.base import results
+#for x in results(LocalConfiguration, LinearNeuronsOnly, 10, maximum_run_time=10*second):
+#    print x
diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/README.md b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/README.md
new file mode 100644
index 00000000..258f1cd7
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/README.md
@@ -0,0 +1,7 @@
+[cuda_cpp_comparison_for_heterogenous_delay_mode](cuda_cpp_comparison_for_heterogenous_delay_mode)
+
+[cuda_atomics_original_and_atomics_effects_profiled](cuda_atomics_original_and_atomics_effects_profiled)
+
+[cuda_atomics_in_heterogenous_delay_mode](cuda_atomics_in_heterogenous_delay_mode)
+
+[cuda_atomics_effects_and_queue_resize_profiled](cuda_atomics_effects_and_queue_resize_profiled)
diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/README.md b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/README.md
new file mode 100644
index 00000000..86d6251e
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/README.md
@@ -0,0 +1,98 @@
+
+# Benchmark results from 28.11.2017
+## Description:
+
+
+
+## Last git log:
+```
+commit 8987de24ed9f4a3b1a276496407fca1087f04004
+Author: Denis Alevi <mail@denisalevi.de>
+Date:   Mon Nov 20 14:31:09 2017 +0100
+
+    Fix critical section to include the actual pushing
+
+```
+There is also a `git diff` saved in the current directory.
+
+## Results
+
+### BrunelHakimModelHeterogeneousDelay
+![](plots/speed_test_BrunelHakimModelHeterogeneousDelay_absolute.svg)
+![](plots/speed_test_BrunelHakimModelHeterogeneousDelay_profiling.svg)
+![](plots/speed_test_BrunelHakimModelHeterogeneousDelay_relative.svg)
+
+<details><summary>Exemplary `nvprof` results for **CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResizeProfileCPU**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==24531== NVPROF is profiling process 24531, command: ./main
+==24531== Profiling application: ./main
+==24531== Profiling result:
+            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
+ GPU activities:   37.91%  132.38ms      2521  52.511us  14.048us  1.0672ms  _run_synapses_pre_push_spikes_push_kernel(unsigned int, unsigned int, unsigned int, int*)
+                   18.34%  64.052ms     10000  6.4050us  3.5520us  8.3520us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, double*, int*)
+                   13.11%  45.786ms     10000  4.5780us  4.3840us  5.6320us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double, double*, double*, double*, bool*, float*)
+                    7.89%  27.566ms     10000  2.7560us  2.7200us  4.1280us  _run_synapses_pre_push_spikes_advance_kernel(void)
+                    6.60%  23.060ms     10000  2.3050us  2.0800us  2.8480us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*)
+                    5.89%  20.553ms     10000  2.0550us  2.0160us  4.1920us  [CUDA memcpy DtoH]
+                    5.25%  18.329ms     10000  1.8320us  1.6640us  2.1760us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*)
+                    4.80%  16.747ms     10000  1.6740us  1.6000us  2.2080us  _GLOBAL__N__69_tmpxft_00005e15_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_97ebdcc0::_reset_neurongroup_thresholder_codeobject(int*)
+                    0.21%  731.84us         1  731.84us  731.84us  731.84us  void gen_sequenced<curandStateXORWOW, float2, normal_args_st, __operator_&__(float2 curand_normal_scaled2<curandStateXORWOW>(curandStateXORWOW*, normal_args_st))>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st)
+      API calls:   46.45%  719.26ms     62522  11.504us  9.5190us  8.6020ms  cudaLaunch
+                   35.15%  544.39ms     60001  9.0720us  2.4110us  1.0720ms  cudaDeviceSynchronize
+                   12.81%  198.43ms     10000  19.842us  18.034us  330.92us  cudaMemcpy
+                    3.57%  55.321ms    350089     158ns     123ns  330.68us  cudaSetupArgument
+                    1.15%  17.835ms     62522     285ns     182ns  10.032us  cudaConfigureCall
+                    0.83%  12.881ms     52523     245ns     209ns  9.8600us  cudaGetLastError
+                    0.02%  250.79us         1  250.79us  250.79us  250.79us  cudaMalloc
+                    0.01%  147.52us         1  147.52us  147.52us  147.52us  cudaMemGetInfo
+                    0.00%  28.259us         8  3.5320us  2.7680us  5.4040us  cudaFuncGetAttributes
+                    0.00%  26.485us        39     679ns     562ns  1.7750us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+                    0.00%  6.3090us        12     525ns     358ns  1.3730us  cudaDeviceGetAttribute
+                    0.00%  2.9100us         3     970ns     717ns  1.4180us  cudaGetDevice
+
+```
+
+</p></details>
+
+
+<details><summary>Exemplary `nvprof` results for **CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==23837== NVPROF is profiling process 23837, command: ./main
+==23837== Profiling application: ./main
+==23837== Profiling result:
+            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
+ GPU activities:   44.49%  157.31ms     10000  15.731us  1.8560us  1.1459ms  _run_synapses_pre_push_spikes_push_kernel(unsigned int, unsigned int, unsigned int, int*)
+                   17.67%  62.479ms     10000  6.2470us  3.4240us  7.9360us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, double*, int*)
+                   12.96%  45.814ms     10000  4.5810us  4.3530us  5.4080us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double, double*, double*, double*, bool*, float*)
+                    7.81%  27.614ms     10000  2.7610us  2.7200us  4.1920us  _run_synapses_pre_push_spikes_advance_kernel(void)
+                    6.48%  22.902ms     10000  2.2900us  2.0160us  2.8170us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*)
+                    5.57%  19.698ms     10000  1.9690us  1.6960us  2.2080us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*)
+                    4.81%  17.002ms     10000  1.7000us  1.6320us  2.2400us  _GLOBAL__N__69_tmpxft_00005b5a_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_97ebdcc0::_reset_neurongroup_thresholder_codeobject(int*)
+                    0.21%  731.94us         1  731.94us  731.94us  731.94us  void gen_sequenced<curandStateXORWOW, float2, normal_args_st, __operator_&__(float2 curand_normal_scaled2<curandStateXORWOW>(curandStateXORWOW*, normal_args_st))>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st)
+      API calls:   54.28%  776.23ms     70001  11.088us  9.1570us  9.5432ms  cudaLaunch
+                   39.57%  565.83ms     60001  9.4300us  2.4970us  1.1523ms  cudaDeviceSynchronize
+                    3.99%  57.063ms    380005     150ns     121ns  325.75us  cudaSetupArgument
+                    1.16%  16.531ms     70001     236ns     172ns  25.540us  cudaConfigureCall
+                    0.96%  13.788ms     60002     229ns     191ns  12.473us  cudaGetLastError
+                    0.02%  304.60us         1  304.60us  304.60us  304.60us  cudaMalloc
+                    0.01%  168.13us         1  168.13us  168.13us  168.13us  cudaMemGetInfo
+                    0.00%  31.295us        39     802ns     568ns  4.4480us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+                    0.00%  29.348us         8  3.6680us  2.8260us  5.7480us  cudaFuncGetAttributes
+                    0.00%  6.1630us        12     513ns     356ns  1.2920us  cudaDeviceGetAttribute
+                    0.00%  3.1870us         3  1.0620us     733ns  1.7050us  cudaGetDevice
+
+```
+
+</p></details>
+
+
+***
+
+### BrunelHakimModelHeterogeneousDelay - display fewer kernels in profiling
+![](plots/speed_test_min_15_BrunelHakimModelHeterogeneousDelay_profiling.svg)
+
+
diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/data/BrunelHakimModelHeterogeneousDelay.pkl b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/data/BrunelHakimModelHeterogeneousDelay.pkl
new file mode 100644
index 00000000..207f984f
Binary files /dev/null and b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/data/BrunelHakimModelHeterogeneousDelay.pkl differ
diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/git.diff b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/git.diff
new file mode 100644
index 00000000..44a84fa2
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/git.diff
@@ -0,0 +1,305 @@
+diff --git a/brian2cuda/tests/features/cuda_configuration.py b/brian2cuda/tests/features/cuda_configuration.py
+index 250687d..622f73a 100644
+--- a/brian2cuda/tests/features/cuda_configuration.py
++++ b/brian2cuda/tests/features/cuda_configuration.py
+@@ -225,7 +225,7 @@ class CUDAStandaloneConfigurationProfileCPU(Configuration):
+                             with_output=False)
+ 
+ class CUDAStandaloneConfigurationTestBrunelHeteroAtomics(Configuration):
+-    name = 'CUDA standalone with atomics in heterog delay mode'
++    name = 'CUDA standalone with atomics in effect application'
+     def before_run(self):
+         brian2.set_device('cuda_standalone', build_on_run=False)
+         prefs["devices.cuda_standalone.test_brunel_hetero_atomics"] = True
+@@ -248,7 +248,7 @@ class CUDAStandaloneConfigurationTestBrunelHeteroAtomics(Configuration):
+                             with_output=False)
+ 
+ class CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU(Configuration):
+-    name = "CUDA standalone with atomics in heterog delay mode (profile='blocking')"
++    name = "CUDA standalone with atomics in effect application (profile='blocking')"
+     def before_run(self):
+         brian2.set_device('cuda_standalone', build_on_run=False, profile='blocking')
+         prefs["devices.cuda_standalone.test_brunel_hetero_atomics"] = True
+@@ -270,12 +270,10 @@ class CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU(Configuration
+         brian2.device.build(directory='cuda_standalone', compile=True, run=True,
+                             with_output=False)
+ 
+-
+ class CUDAStandaloneConfigurationPushAtomicResize(Configuration):
+-    name = "CUDA standalone with atomics in queue resize"
++    name = "CUDA standalone with atomics in spikequeue resize"
+     def before_run(self):
+         brian2.set_device('cuda_standalone', build_on_run=False)
+-        prefs["devices.cuda_standalone.test_brunel_hetero_atomics"] = True
+         prefs["devices.cuda_standalone.push_atomic_resize"] = True
+         if socket.gethostname() == 'elnath':
+             if prefs['devices.cpp_standalone.extra_make_args_unix'] == ['-j12']:
+@@ -295,8 +293,82 @@ class CUDAStandaloneConfigurationPushAtomicResize(Configuration):
+         brian2.device.build(directory='cuda_standalone', compile=True, run=True,
+                             with_output=False)
+ 
++
+ class CUDAStandaloneConfigurationPushAtomicResizeProfileCPU(Configuration):
+-    name = "CUDA standalone with atomics in queue resize (profile='blocking')"
++    name = "CUDA standalone with atomics in spikequeue resize (profile='blocking')"
++    def before_run(self):
++        brian2.set_device('cuda_standalone', build_on_run=False, profile='blocking')
++        prefs["devices.cuda_standalone.push_atomic_resize"] = True
++        if socket.gethostname() == 'elnath':
++            if prefs['devices.cpp_standalone.extra_make_args_unix'] == ['-j12']:
++                prefs['devices.cpp_standalone.extra_make_args_unix'] = ['-j24']
++            prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35')
++            prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_20'])
++        elif socket.gethostname() == 'sabik':
++            prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35')
++            prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_52'])
++        elif socket.gethostname() == 'eltanin':
++            prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35')
++            prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_61'])
++
++    def after_run(self):
++        if os.path.exists('cuda_standalone'):
++            shutil.rmtree('cuda_standalone')
++        brian2.device.build(directory='cuda_standalone', compile=True, run=True,
++                            with_output=False)
++
++
++class CUDAStandaloneConfigurationPushAtomicResizProfileCPU(Configuration):
++    name = "CUDA standalone with atomics in spikequeue resize (profile='blocking)"
++    def before_run(self):
++        brian2.set_device('cuda_standalone', build_on_run=False,
++                          profile='blocking')
++        prefs["devices.cuda_standalone.push_atomic_resize"] = True
++        if socket.gethostname() == 'elnath':
++            if prefs['devices.cpp_standalone.extra_make_args_unix'] == ['-j12']:
++                prefs['devices.cpp_standalone.extra_make_args_unix'] = ['-j24']
++            prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35')
++            prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_20'])
++        elif socket.gethostname() == 'sabik':
++            prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35')
++            prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_52'])
++        elif socket.gethostname() == 'eltanin':
++            prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35')
++            prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_61'])
++
++    def after_run(self):
++        if os.path.exists('cuda_standalone'):
++            shutil.rmtree('cuda_standalone')
++        brian2.device.build(directory='cuda_standalone', compile=True, run=True,
++                            with_output=False)
++
++
++class CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize(Configuration):
++    name = "CUDA standalone with atomics in effect application and in spikequeue resize"
++    def before_run(self):
++        brian2.set_device('cuda_standalone', build_on_run=False)
++        prefs["devices.cuda_standalone.test_brunel_hetero_atomics"] = True
++        prefs["devices.cuda_standalone.push_atomic_resize"] = True
++        if socket.gethostname() == 'elnath':
++            if prefs['devices.cpp_standalone.extra_make_args_unix'] == ['-j12']:
++                prefs['devices.cpp_standalone.extra_make_args_unix'] = ['-j24']
++            prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35')
++            prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_20'])
++        elif socket.gethostname() == 'sabik':
++            prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35')
++            prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_52'])
++        elif socket.gethostname() == 'eltanin':
++            prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35')
++            prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_61'])
++
++    def after_run(self):
++        if os.path.exists('cuda_standalone'):
++            shutil.rmtree('cuda_standalone')
++        brian2.device.build(directory='cuda_standalone', compile=True, run=True,
++                            with_output=False)
++
++class CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResizeProfileCPU(Configuration):
++    name = "CUDA standalone with atomics in effect application and in spikequeue resize (profile='blocking')"
+     def before_run(self):
+         brian2.set_device('cuda_standalone', build_on_run=False, profile='blocking')
+         prefs["devices.cuda_standalone.test_brunel_hetero_atomics"] = True
+diff --git a/dev/benchmarks/run_speed_tests.py b/dev/benchmarks/run_speed_tests.py
+index 2518634..b525e97 100644
+--- a/dev/benchmarks/run_speed_tests.py
++++ b/dev/benchmarks/run_speed_tests.py
+@@ -37,6 +37,7 @@ from brian2cuda.tests.features.cuda_configuration import (CUDAStandaloneConfigur
+                                                           CUDAStandaloneConfigurationTestBrunelHeteroAtomics,
+                                                           CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU,
+                                                           CUDAStandaloneConfigurationPushAtomicResize,
++                                                          CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize,
+                                                           CUDAStandaloneConfigurationPushAtomicResizeProfileCPU,
+                                                           CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpy,
+                                                           CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpyProfileCPU)
+@@ -61,79 +62,80 @@ if socket.gethostname() == 'elnath':
+     prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_20'])
+ 
+ configs = [# configuration                          project_directory
+-          (NumpyConfiguration,                     None),
+-          (WeaveConfiguration,                     None),
+-          (LocalConfiguration,                     None),
++          #(NumpyConfiguration,                     None),
++          #(WeaveConfiguration,                     None),
++          #(LocalConfiguration,                     None),
++          #(CPPStandaloneConfiguration,              'cpp_standalone'),
++          #(CPPStandaloneConfigurationOpenMP,        'cpp_standalone'),
+           (CUDAStandaloneConfiguration,             'cuda_standalone'),
+-          (CUDAStandaloneConfigurationExtraThresholdKernel,             'cuda_standalone'),
+-          (CUDAStandaloneConfigurationNoAssert,             'cuda_standalone'),
+-          (CUDAStandaloneConfigurationCurandDouble,              'cuda_standalone'),
+-          (CUDAStandaloneConfigurationNoCudaOccupancyAPI,      'cuda_standalone'),
+-          (CUDAStandaloneConfigurationNoCudaOccupancyAPIProfileCPU,    'cuda_standalone'),
+-          (CUDAStandaloneConfiguration2BlocksPerSM, 'cuda_standalone'),
+-          (CUDAStandaloneConfiguration2BlocksPerSMLaunchBounds, 'cuda_standalone'),
+-          (CUDAStandaloneConfigurationSynLaunchBounds,     'cuda_standalone'),
+-          (CUDAStandaloneConfiguration2BlocksPerSMSynLaunchBounds, 'cuda_standalone'),
+-          (CUDAStandaloneConfigurationProfileGPU,   'cuda_standalone'),
+-          (CUDAStandaloneConfigurationProfileCPU,   'cuda_standalone'),
+-          (CPPStandaloneConfiguration,              'cpp_standalone'),
+-          (CUDAStandaloneConfigurationTestBrunelHeteroAtomics,   'cuda_standalone'),
+-          (CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU,   'cuda_standalone'),
+           (CUDAStandaloneConfigurationPushAtomicResize,   'cuda_standalone'),
+-          (CUDAStandaloneConfigurationPushAtomicResizeProfileCPU,   'cuda_standalone'),
+-          (CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpy,   'cuda_standalone'),
+-          (CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpyProfileCPU,   'cuda_standalone'),
+-          (GeNNConfiguration,                       'GeNNworkspace'),
+-          (CPPStandaloneConfigurationOpenMP,        'cpp_standalone'),
+-          (GeNNConfigurationCPU,                    'GeNNworkspace'),
+-          (GeNNConfigurationOptimized,              'GeNNworkspace')
++          (CUDAStandaloneConfigurationTestBrunelHeteroAtomics,   'cuda_standalone'),
++          (CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize,     'cuda_standalone'),
++          #(CUDAStandaloneConfigurationExtraThresholdKernel,             'cuda_standalone'),
++          #(CUDAStandaloneConfigurationNoAssert,             'cuda_standalone'),
++          #(CUDAStandaloneConfigurationCurandDouble,              'cuda_standalone'),
++          #(CUDAStandaloneConfigurationNoCudaOccupancyAPI,      'cuda_standalone'),
++          #(CUDAStandaloneConfigurationNoCudaOccupancyAPIProfileCPU,    'cuda_standalone'),
++          #(CUDAStandaloneConfiguration2BlocksPerSM, 'cuda_standalone'),
++          #(CUDAStandaloneConfiguration2BlocksPerSMLaunchBounds, 'cuda_standalone'),
++          #(CUDAStandaloneConfigurationSynLaunchBounds,     'cuda_standalone'),
++          #(CUDAStandaloneConfiguration2BlocksPerSMSynLaunchBounds, 'cuda_standalone'),
++          #(CUDAStandaloneConfigurationProfileGPU,   'cuda_standalone'),
++          #(CUDAStandaloneConfigurationProfileCPU,   'cuda_standalone'),
++          #(CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU,   'cuda_standalone'),
++          #(CUDAStandaloneConfigurationPushAtomicResizeProfileCPU,   'cuda_standalone'),
++          #(CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpy,   'cuda_standalone'),
++          #(CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpyProfileCPU,   'cuda_standalone'),
++          #(GeNNConfiguration,                       'GeNNworkspace'),
++          #(GeNNConfigurationCPU,                    'GeNNworkspace'),
++          #(GeNNConfigurationOptimized,              'GeNNworkspace')
+           ]
+ 
+ speed_tests = [# feature_test                     name                                  n_slice
+ 
+-               (ThresholderOnlyPoissonLowRate,                  'ThresholderOnlyPoissonLowRate',                slice(None)         ),
+-               (ThresholderOnlyPoissonMediumRate,               'ThresholderOnlyPoissonMediumRate',             slice(None)         ),
+-               (ThresholderOnlyPoissonHighRate,                 'ThresholderOnlyPoissonHighRate',               slice(None)         ),
+-               (ThresholderOnlyAlwaysSpiking,                   'ThresholderOnlyAlwaysSpiking',                 slice(None)         ),
+-
+-               (BrunelHakimStateupdateOnlyDouble,               'BrunelHakimStateupdateOnlyDouble',             slice(None)         ),
+-               (BrunelHakimStateupdateOnlyTriple,               'BrunelHakimStateupdateOnlyTriple',             slice(None)         ),
+-               (BrunelHakimStateupdateOnly,                     'BrunelHakimStateupdateOnly',                   slice(None)         ),
+-               (BrunelHakimNeuronsOnly,                         'BrunelHakimNeuronsOnly',                       slice(None)         ),
+-               (BrunelHakimNeuronsOnlyNoXi,                     'BrunelHakimNeuronsOnlyNoXi',                   slice(None)         ),
+-               (BrunelHakimNeuronsOnlyNoRand,                   'BrunelHakimNeuronsOnlyNoRand',                 slice(None)         ),
+-               (BrunelHakimStateupdateThresholdOnly,            'BrunelHakimStateupdateThresholdOnly',          slice(None)         ),
+-               (BrunelHakimStateupdateThresholdResetOnly,       'BrunelHakimStateupdateThresholdResetOnly',     slice(None)         ),
+-               (BrunelHakimModelScalarDelayShort,               'BrunelHakimModelScalarDelayShort',             slice(None)         ),
+-               (BrunelHakimModelScalarDelayNoSelfConnections,   'BrunelHakimModelScalarDelayNoSelfConnections', slice(None)         ),
+-               (CUBA,                                           'CUBA',                                         slice(None)         ),
+-               (COBAHH,                                         'COBAHH',                                       slice(None)         ),
+-               (AdaptationOscillation,                          'AdaptationOscillation',                        slice(None)         ),
+-               (Vogels,                                         'Vogels',                                       slice(None)         ),
+-               (STDP,                                           'STDP',                                         slice(None)         ),
+-               (STDPEventDriven,                                'STDPEventDriven',                              slice(None)         ),
+-               (BrunelHakimModelScalarDelay,                    'BrunelHakimModelScalarDelay',                  slice(None)         ),
+-
+-               (VerySparseMediumRateSynapsesOnly,               'VerySparseMediumRateSynapsesOnly',             slice(None)         ),
+-               (SparseMediumRateSynapsesOnly,                   'SparseMediumRateSynapsesOnly',                 slice(None)         ),
+-               (DenseMediumRateSynapsesOnly,                    'DenseMediumRateSynapsesOnly',                  slice(None)         ),
+-               (SparseLowRateSynapsesOnly,                      'SparseLowRateSynapsesOnly',                    slice(None)         ),
+-               (SparseHighRateSynapsesOnly,                     'SparseHighRateSynapsesOnly',                   slice(None)         ),
+-
+-               (STDPNotEventDriven,                             'STDPNotEventDriven',                           slice(None)         ),
+-               (STDPMultiPost,                                  'STDPMultiPost',                                slice(None)         ),
+-               (STDPNeuronalTraces,                             'STDPNeuronalTraces',                           slice(None)         ),
+-               (STDPMultiPostNeuronalTraces,                    'STDPMultiPostNeuronalTraces',                  slice(None)         ),
++               #(ThresholderOnlyPoissonLowRate,                  'ThresholderOnlyPoissonLowRate',                slice(None)         ),
++               #(ThresholderOnlyPoissonMediumRate,               'ThresholderOnlyPoissonMediumRate',             slice(None)         ),
++               #(ThresholderOnlyPoissonHighRate,                 'ThresholderOnlyPoissonHighRate',               slice(None)         ),
++               #(ThresholderOnlyAlwaysSpiking,                   'ThresholderOnlyAlwaysSpiking',                 slice(None)         ),
++
++               #(BrunelHakimStateupdateOnlyDouble,               'BrunelHakimStateupdateOnlyDouble',             slice(None)         ),
++               #(BrunelHakimStateupdateOnlyTriple,               'BrunelHakimStateupdateOnlyTriple',             slice(None)         ),
++               #(BrunelHakimStateupdateOnly,                     'BrunelHakimStateupdateOnly',                   slice(None)         ),
++               #(BrunelHakimNeuronsOnly,                         'BrunelHakimNeuronsOnly',                       slice(None)         ),
++               #(BrunelHakimNeuronsOnlyNoXi,                     'BrunelHakimNeuronsOnlyNoXi',                   slice(None)         ),
++               #(BrunelHakimNeuronsOnlyNoRand,                   'BrunelHakimNeuronsOnlyNoRand',                 slice(None)         ),
++               #(BrunelHakimStateupdateThresholdOnly,            'BrunelHakimStateupdateThresholdOnly',          slice(None)         ),
++               #(BrunelHakimStateupdateThresholdResetOnly,       'BrunelHakimStateupdateThresholdResetOnly',     slice(None)         ),
++               #(BrunelHakimModelScalarDelayShort,               'BrunelHakimModelScalarDelayShort',             slice(None)         ),
++               #(BrunelHakimModelScalarDelayNoSelfConnections,   'BrunelHakimModelScalarDelayNoSelfConnections', slice(None)         ),
++               #(CUBA,                                           'CUBA',                                         slice(None)         ),
++               #(COBAHH,                                         'COBAHH',                                       slice(None)         ),
++               #(AdaptationOscillation,                          'AdaptationOscillation',                        slice(None)         ),
++               #(Vogels,                                         'Vogels',                                       slice(None)         ),
++               #(STDP,                                           'STDP',                                         slice(None)         ),
++               #(STDPEventDriven,                                'STDPEventDriven',                              slice(None)         ),
++               #(BrunelHakimModelScalarDelay,                    'BrunelHakimModelScalarDelay',                  slice(None)         ),
++
++               #(VerySparseMediumRateSynapsesOnly,               'VerySparseMediumRateSynapsesOnly',             slice(None)         ),
++               #(SparseMediumRateSynapsesOnly,                   'SparseMediumRateSynapsesOnly',                 slice(None)         ),
++               #(DenseMediumRateSynapsesOnly,                    'DenseMediumRateSynapsesOnly',                  slice(None)         ),
++               #(SparseLowRateSynapsesOnly,                      'SparseLowRateSynapsesOnly',                    slice(None)         ),
++               #(SparseHighRateSynapsesOnly,                     'SparseHighRateSynapsesOnly',                   slice(None)         ),
++
++               #(STDPNotEventDriven,                             'STDPNotEventDriven',                           slice(None)         ),
++               #(STDPMultiPost,                                  'STDPMultiPost',                                slice(None)         ),
++               #(STDPNeuronalTraces,                             'STDPNeuronalTraces',                           slice(None)         ),
++               #(STDPMultiPostNeuronalTraces,                    'STDPMultiPostNeuronalTraces',                  slice(None)         ),
+ 
+                (BrunelHakimModelHeterogeneousDelay,             'BrunelHakimModelHeterogeneousDelay',           slice(None)         ),
+ 
+-               (LinearNeuronsOnly,                              'LinearNeuronsOnly',                            slice(None)         ),
+-               (HHNeuronsOnly,                                  'HHNeuronsOnly',                                slice(None)         ),
+-               (VogelsWithSynapticDynamic,                      'VogelsWithSynapticDynamic',                    slice(None)         ),
++               #(LinearNeuronsOnly,                              'LinearNeuronsOnly',                            slice(None)         ),
++               #(HHNeuronsOnly,                                  'HHNeuronsOnly',                                slice(None)         ),
++               #(VogelsWithSynapticDynamic,                      'VogelsWithSynapticDynamic',                    slice(None)         ),
+ 
+-               ## below uses monitors
+-               (CUBAFixedConnectivity,                          'CUBAFixedConnectivity',                        slice(None)         ),
+-               (COBAHHFixedConnectivity,                        'COBAHHFixedConnectivity',                      slice(None, -1)     ),
++               ### below uses monitors
++               #(CUBAFixedConnectivity,                          'CUBAFixedConnectivity',                        slice(None)         ),
++               #(COBAHHFixedConnectivity,                        'COBAHHFixedConnectivity',                      slice(None, -1)     ),
+ ]
+ 
+ configurations = [config[0] for config in configs]
+@@ -205,6 +207,16 @@ try:
+         savefig(os.path.join(plot_dir, 'speed_test_{}_relative.png'.format(name)))
+         res.plot_all_tests(profiling_minimum=0.05)
+         savefig(os.path.join(plot_dir, 'speed_test_{}_profiling.png'.format(name)))
++
++        res.plot_all_tests()
++        ## this needs modification of brian2 code
++        #res.plot_all_tests(print_relative=True)
++        savefig(os.path.join(plot_dir, 'speed_test_{}_absolute.svg'.format(speed_tests[n][1])))
++        res.plot_all_tests(relative=True)
++        savefig(os.path.join(plot_dir, 'speed_test_{}_relative.svg'.format(name)))
++        res.plot_all_tests(profiling_minimum=0.05)
++        savefig(os.path.join(plot_dir, 'speed_test_{}_profiling.svg'.format(name)))
++
+         if 3 != len(get_fignums()):
+             print("WARNING: There were {} plots created, but only {} saved.".format(len(get_fignums()), 3*(n+1)))
+         for n in get_fignums():
+diff --git a/frozen_repos/brian2 b/frozen_repos/brian2
+--- a/frozen_repos/brian2
++++ b/frozen_repos/brian2
+@@ -1 +1 @@
+-Subproject commit fadc6a0aeb90d1b4d343470628457d8561536f67
++Subproject commit fadc6a0aeb90d1b4d343470628457d8561536f67-dirty
+diff --git a/frozen_repos/brian2genn b/frozen_repos/brian2genn
+--- a/frozen_repos/brian2genn
++++ b/frozen_repos/brian2genn
+@@ -1 +1 @@
+-Subproject commit 0553cafeab49ea5403c0230411035df504d4db06
++Subproject commit 0553cafeab49ea5403c0230411035df504d4db06-dirty
diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/logs/stdout_BrunelHakimModelHeterogeneousDelay_cuda_standalone_50000.txt b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/logs/stdout_BrunelHakimModelHeterogeneousDelay_cuda_standalone_50000.txt
new file mode 100644
index 00000000..9bfd72cb
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/logs/stdout_BrunelHakimModelHeterogeneousDelay_cuda_standalone_50000.txt
@@ -0,0 +1,53 @@
+INFO: setting cudaDevice stuff took 0.304642 seconds
+INFO kernel_synapses_group_variable_set_conditional_codeobject
+	48824 blocks
+	1024 threads
+	8 registers per block
+	0 bytes statically-allocated shared memory per block
+	0 bytes local memory per thread
+	304 bytes user-allocated constant memory
+INFO connectivity matrix has size 49994896
+INFO generating 13100000 randn every 262 clock cycles for neurongroup_stateupdater_codeobject
+INFO kernel_neurongroup_stateupdater_codeobject
+	66 blocks
+	768 threads
+	36 registers per block
+	0 bytes statically-allocated shared memory per block
+	8 bytes local memory per thread
+	304 bytes user-allocated constant memory
+	0.750 theoretical occupancy
+INFO kernel_neurongroup_thresholder_codeobject
+	49 blocks
+	1024 threads
+	15 registers per block
+	0 bytes statically-allocated shared memory per block
+	0 bytes local memory per thread
+	304 bytes user-allocated constant memory
+	1.000 theoretical occupancy
+INFO _run_synapses_pre_push_spikes_push_kernel
+	15 blocks
+	108 threads
+	78 registers per block
+	0 bytes statically-allocated shared memory per block
+	16 bytes local memory per thread
+	304 bytes user-allocated constant memory
+	0.312 theoretical occupancy
+INFO kernel_synapses_pre_codeobject
+	15 blocks
+	1024 threads
+	21 registers per block
+	0 bytes statically-allocated shared memory per block
+	8 bytes local memory per thread
+	304 bytes user-allocated constant memory
+	1.000 theoretical occupancy
+INFO kernel_neurongroup_resetter_codeobject
+	49 blocks
+	1024 threads
+	14 registers per block
+	0 bytes statically-allocated shared memory per block
+	0 bytes local memory per thread
+	304 bytes user-allocated constant memory
+	1.000 theoretical occupancy
+Number of synapses: 49994896
+INFO: main_lines took 138.483894 seconds
+INFO: main function took 140.094220 seconds
diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResizeProfileCPU_1000.log b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResizeProfileCPU_1000.log
new file mode 100644
index 00000000..da67b355
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResizeProfileCPU_1000.log
@@ -0,0 +1,25 @@
+==24531== NVPROF is profiling process 24531, command: ./main
+==24531== Profiling application: ./main
+==24531== Profiling result:
+            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
+ GPU activities:   37.91%  132.38ms      2521  52.511us  14.048us  1.0672ms  _run_synapses_pre_push_spikes_push_kernel(unsigned int, unsigned int, unsigned int, int*)
+                   18.34%  64.052ms     10000  6.4050us  3.5520us  8.3520us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, double*, int*)
+                   13.11%  45.786ms     10000  4.5780us  4.3840us  5.6320us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double, double*, double*, double*, bool*, float*)
+                    7.89%  27.566ms     10000  2.7560us  2.7200us  4.1280us  _run_synapses_pre_push_spikes_advance_kernel(void)
+                    6.60%  23.060ms     10000  2.3050us  2.0800us  2.8480us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*)
+                    5.89%  20.553ms     10000  2.0550us  2.0160us  4.1920us  [CUDA memcpy DtoH]
+                    5.25%  18.329ms     10000  1.8320us  1.6640us  2.1760us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*)
+                    4.80%  16.747ms     10000  1.6740us  1.6000us  2.2080us  _GLOBAL__N__69_tmpxft_00005e15_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_97ebdcc0::_reset_neurongroup_thresholder_codeobject(int*)
+                    0.21%  731.84us         1  731.84us  731.84us  731.84us  void gen_sequenced<curandStateXORWOW, float2, normal_args_st, __operator_&__(float2 curand_normal_scaled2<curandStateXORWOW>(curandStateXORWOW*, normal_args_st))>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st)
+      API calls:   46.45%  719.26ms     62522  11.504us  9.5190us  8.6020ms  cudaLaunch
+                   35.15%  544.39ms     60001  9.0720us  2.4110us  1.0720ms  cudaDeviceSynchronize
+                   12.81%  198.43ms     10000  19.842us  18.034us  330.92us  cudaMemcpy
+                    3.57%  55.321ms    350089     158ns     123ns  330.68us  cudaSetupArgument
+                    1.15%  17.835ms     62522     285ns     182ns  10.032us  cudaConfigureCall
+                    0.83%  12.881ms     52523     245ns     209ns  9.8600us  cudaGetLastError
+                    0.02%  250.79us         1  250.79us  250.79us  250.79us  cudaMalloc
+                    0.01%  147.52us         1  147.52us  147.52us  147.52us  cudaMemGetInfo
+                    0.00%  28.259us         8  3.5320us  2.7680us  5.4040us  cudaFuncGetAttributes
+                    0.00%  26.485us        39     679ns     562ns  1.7750us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+                    0.00%  6.3090us        12     525ns     358ns  1.3730us  cudaDeviceGetAttribute
+                    0.00%  2.9100us         3     970ns     717ns  1.4180us  cudaGetDevice
diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU_1000.log b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU_1000.log
new file mode 100644
index 00000000..666b0e65
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU_1000.log
@@ -0,0 +1,23 @@
+==23837== NVPROF is profiling process 23837, command: ./main
+==23837== Profiling application: ./main
+==23837== Profiling result:
+            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
+ GPU activities:   44.49%  157.31ms     10000  15.731us  1.8560us  1.1459ms  _run_synapses_pre_push_spikes_push_kernel(unsigned int, unsigned int, unsigned int, int*)
+                   17.67%  62.479ms     10000  6.2470us  3.4240us  7.9360us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, double*, int*)
+                   12.96%  45.814ms     10000  4.5810us  4.3530us  5.4080us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double, double*, double*, double*, bool*, float*)
+                    7.81%  27.614ms     10000  2.7610us  2.7200us  4.1920us  _run_synapses_pre_push_spikes_advance_kernel(void)
+                    6.48%  22.902ms     10000  2.2900us  2.0160us  2.8170us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*)
+                    5.57%  19.698ms     10000  1.9690us  1.6960us  2.2080us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*)
+                    4.81%  17.002ms     10000  1.7000us  1.6320us  2.2400us  _GLOBAL__N__69_tmpxft_00005b5a_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_97ebdcc0::_reset_neurongroup_thresholder_codeobject(int*)
+                    0.21%  731.94us         1  731.94us  731.94us  731.94us  void gen_sequenced<curandStateXORWOW, float2, normal_args_st, __operator_&__(float2 curand_normal_scaled2<curandStateXORWOW>(curandStateXORWOW*, normal_args_st))>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st)
+      API calls:   54.28%  776.23ms     70001  11.088us  9.1570us  9.5432ms  cudaLaunch
+                   39.57%  565.83ms     60001  9.4300us  2.4970us  1.1523ms  cudaDeviceSynchronize
+                    3.99%  57.063ms    380005     150ns     121ns  325.75us  cudaSetupArgument
+                    1.16%  16.531ms     70001     236ns     172ns  25.540us  cudaConfigureCall
+                    0.96%  13.788ms     60002     229ns     191ns  12.473us  cudaGetLastError
+                    0.02%  304.60us         1  304.60us  304.60us  304.60us  cudaMalloc
+                    0.01%  168.13us         1  168.13us  168.13us  168.13us  cudaMemGetInfo
+                    0.00%  31.295us        39     802ns     568ns  4.4480us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+                    0.00%  29.348us         8  3.6680us  2.8260us  5.7480us  cudaFuncGetAttributes
+                    0.00%  6.1630us        12     513ns     356ns  1.2920us  cudaDeviceGetAttribute
+                    0.00%  3.1870us         3  1.0620us     733ns  1.7050us  cudaGetDevice
diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/plots/speed_test_BrunelHakimModelHeterogeneousDelay_absolute.png b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/plots/speed_test_BrunelHakimModelHeterogeneousDelay_absolute.png
new file mode 100644
index 00000000..de995116
Binary files /dev/null and b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/plots/speed_test_BrunelHakimModelHeterogeneousDelay_absolute.png differ
diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/plots/speed_test_BrunelHakimModelHeterogeneousDelay_profiling.png b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/plots/speed_test_BrunelHakimModelHeterogeneousDelay_profiling.png
new file mode 100644
index 00000000..d94f0997
Binary files /dev/null and b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/plots/speed_test_BrunelHakimModelHeterogeneousDelay_profiling.png differ
diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/plots/speed_test_BrunelHakimModelHeterogeneousDelay_relative.png b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/plots/speed_test_BrunelHakimModelHeterogeneousDelay_relative.png
new file mode 100644
index 00000000..11df764b
Binary files /dev/null and b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/plots/speed_test_BrunelHakimModelHeterogeneousDelay_relative.png differ
diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/plots/speed_test_min_15_BrunelHakimModelHeterogeneousDelay_profiling.png b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/plots/speed_test_min_15_BrunelHakimModelHeterogeneousDelay_profiling.png
new file mode 100644
index 00000000..50724fc3
Binary files /dev/null and b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/plots/speed_test_min_15_BrunelHakimModelHeterogeneousDelay_profiling.png differ
diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/run_speed_test_script.py b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/run_speed_test_script.py
new file mode 100644
index 00000000..c88ac141
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_effects_and_queue_resize_profiled/run_speed_test_script.py
@@ -0,0 +1,291 @@
+import os
+import shutil
+import glob
+import subprocess
+import sys
+import socket
+
+# run tests without X-server
+import matplotlib
+matplotlib.use('Agg')
+
+# pretty plots
+import seaborn
+
+import time
+import datetime
+import cPickle as pickle
+
+from brian2 import *
+from brian2.tests.features import *
+from brian2.tests.features.base import *
+from brian2.tests.features.base import results
+
+import brian2cuda
+from brian2cuda.tests.features.cuda_configuration import (CUDAStandaloneConfiguration,
+                                                          CUDAStandaloneConfigurationNoAssert,
+                                                          CUDAStandaloneConfigurationExtraThresholdKernel,
+                                                          CUDAStandaloneConfigurationCurandDouble,
+                                                          CUDAStandaloneConfigurationNoCudaOccupancyAPI,
+                                                          CUDAStandaloneConfigurationNoCudaOccupancyAPIProfileCPU,
+                                                          CUDAStandaloneConfiguration2BlocksPerSM,
+                                                          CUDAStandaloneConfiguration2BlocksPerSMLaunchBounds,
+                                                          CUDAStandaloneConfigurationSynLaunchBounds,
+                                                          CUDAStandaloneConfiguration2BlocksPerSMSynLaunchBounds,
+                                                          CUDAStandaloneConfigurationProfileGPU,
+                                                          CUDAStandaloneConfigurationProfileCPU,
+                                                          CUDAStandaloneConfigurationTestBrunelHeteroAtomics,
+                                                          CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU,
+                                                          CUDAStandaloneConfigurationPushAtomicResize,
+                                                          CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize,
+                                                          CUDAStandaloneConfigurationPushAtomicResizeProfileCPU,
+                                                          CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResizeProfileCPU,
+                                                          CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpy,
+                                                          CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpyProfileCPU)
+from brian2cuda.tests.features.speed import *
+
+from brian2genn.correctness_testing import GeNNConfiguration, GeNNConfigurationCPU, GeNNConfigurationOptimized
+
+from create_readme import create_readme
+
+assert len(sys.argv)<= 2, 'Only one command line argument supported! Got {}'.format(len(sys.argv)-1)
+if len(sys.argv) == 2:
+    additional_dir_name = '_' + sys.argv[1]
+else:
+    additional_dir_name = ''
+
+prefs['devices.cpp_standalone.extra_make_args_unix'] = ['-j12']
+
+# host specific settings
+if socket.gethostname() == 'elnath':
+    prefs['devices.cpp_standalone.extra_make_args_unix'] = ['-j24']
+    prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35')
+    prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_20'])
+
+configs = [# configuration                          project_directory
+          #(NumpyConfiguration,                     None),
+          #(WeaveConfiguration,                     None),
+          #(LocalConfiguration,                     None),
+          #(CPPStandaloneConfiguration,              'cpp_standalone'),
+          #(CPPStandaloneConfigurationOpenMP,        'cpp_standalone'),
+          #(CUDAStandaloneConfiguration,             'cuda_standalone'),
+          #(CUDAStandaloneConfigurationPushAtomicResize,   'cuda_standalone'),
+          #(CUDAStandaloneConfigurationTestBrunelHeteroAtomics,   'cuda_standalone'),
+          #(CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize,     'cuda_standalone'),
+          #(CUDAStandaloneConfigurationExtraThresholdKernel,             'cuda_standalone'),
+          #(CUDAStandaloneConfigurationNoAssert,             'cuda_standalone'),
+          #(CUDAStandaloneConfigurationCurandDouble,              'cuda_standalone'),
+          #(CUDAStandaloneConfigurationNoCudaOccupancyAPI,      'cuda_standalone'),
+          #(CUDAStandaloneConfigurationNoCudaOccupancyAPIProfileCPU,    'cuda_standalone'),
+          #(CUDAStandaloneConfiguration2BlocksPerSM, 'cuda_standalone'),
+          #(CUDAStandaloneConfiguration2BlocksPerSMLaunchBounds, 'cuda_standalone'),
+          #(CUDAStandaloneConfigurationSynLaunchBounds,     'cuda_standalone'),
+          #(CUDAStandaloneConfiguration2BlocksPerSMSynLaunchBounds, 'cuda_standalone'),
+          #(CUDAStandaloneConfigurationProfileGPU,   'cuda_standalone'),
+          #(CUDAStandaloneConfigurationProfileCPU,   'cuda_standalone'),
+          (CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU,   'cuda_standalone'),
+          #(CUDAStandaloneConfigurationPushAtomicResizeProfileCPU,   'cuda_standalone'),
+          (CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResizeProfileCPU,     'cuda_standalone'),
+          #(CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpy,   'cuda_standalone'),
+          #(CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpyProfileCPU,   'cuda_standalone'),
+          #(GeNNConfiguration,                       'GeNNworkspace'),
+          #(GeNNConfigurationCPU,                    'GeNNworkspace'),
+          #(GeNNConfigurationOptimized,              'GeNNworkspace')
+          ]
+
+speed_tests = [# feature_test                     name                                  n_slice
+
+               #(ThresholderOnlyPoissonLowRate,                  'ThresholderOnlyPoissonLowRate',                slice(None)         ),
+               #(ThresholderOnlyPoissonMediumRate,               'ThresholderOnlyPoissonMediumRate',             slice(None)         ),
+               #(ThresholderOnlyPoissonHighRate,                 'ThresholderOnlyPoissonHighRate',               slice(None)         ),
+               #(ThresholderOnlyAlwaysSpiking,                   'ThresholderOnlyAlwaysSpiking',                 slice(None)         ),
+
+               #(BrunelHakimStateupdateOnlyDouble,               'BrunelHakimStateupdateOnlyDouble',             slice(None)         ),
+               #(BrunelHakimStateupdateOnlyTriple,               'BrunelHakimStateupdateOnlyTriple',             slice(None)         ),
+               #(BrunelHakimStateupdateOnly,                     'BrunelHakimStateupdateOnly',                   slice(None)         ),
+               #(BrunelHakimNeuronsOnly,                         'BrunelHakimNeuronsOnly',                       slice(None)         ),
+               #(BrunelHakimNeuronsOnlyNoXi,                     'BrunelHakimNeuronsOnlyNoXi',                   slice(None)         ),
+               #(BrunelHakimNeuronsOnlyNoRand,                   'BrunelHakimNeuronsOnlyNoRand',                 slice(None)         ),
+               #(BrunelHakimStateupdateThresholdOnly,            'BrunelHakimStateupdateThresholdOnly',          slice(None)         ),
+               #(BrunelHakimStateupdateThresholdResetOnly,       'BrunelHakimStateupdateThresholdResetOnly',     slice(None)         ),
+               #(BrunelHakimModelScalarDelayShort,               'BrunelHakimModelScalarDelayShort',             slice(None)         ),
+               #(BrunelHakimModelScalarDelayNoSelfConnections,   'BrunelHakimModelScalarDelayNoSelfConnections', slice(None)         ),
+               #(CUBA,                                           'CUBA',                                         slice(None)         ),
+               #(COBAHH,                                         'COBAHH',                                       slice(None)         ),
+               #(AdaptationOscillation,                          'AdaptationOscillation',                        slice(None)         ),
+               #(Vogels,                                         'Vogels',                                       slice(None)         ),
+               #(STDP,                                           'STDP',                                         slice(None)         ),
+               #(STDPEventDriven,                                'STDPEventDriven',                              slice(None)         ),
+               #(BrunelHakimModelScalarDelay,                    'BrunelHakimModelScalarDelay',                  slice(None)         ),
+
+               #(VerySparseMediumRateSynapsesOnly,               'VerySparseMediumRateSynapsesOnly',             slice(None)         ),
+               #(SparseMediumRateSynapsesOnly,                   'SparseMediumRateSynapsesOnly',                 slice(None)         ),
+               #(DenseMediumRateSynapsesOnly,                    'DenseMediumRateSynapsesOnly',                  slice(None)         ),
+               #(SparseLowRateSynapsesOnly,                      'SparseLowRateSynapsesOnly',                    slice(None)         ),
+               #(SparseHighRateSynapsesOnly,                     'SparseHighRateSynapsesOnly',                   slice(None)         ),
+
+               #(STDPNotEventDriven,                             'STDPNotEventDriven',                           slice(None)         ),
+               #(STDPMultiPost,                                  'STDPMultiPost',                                slice(None)         ),
+               #(STDPNeuronalTraces,                             'STDPNeuronalTraces',                           slice(None)         ),
+               #(STDPMultiPostNeuronalTraces,                    'STDPMultiPostNeuronalTraces',                  slice(None)         ),
+
+               (BrunelHakimModelHeterogeneousDelay,             'BrunelHakimModelHeterogeneousDelay',           slice(0,-1,1)         ),
+
+               #(LinearNeuronsOnly,                              'LinearNeuronsOnly',                            slice(None)         ),
+               #(HHNeuronsOnly,                                  'HHNeuronsOnly',                                slice(None)         ),
+               #(VogelsWithSynapticDynamic,                      'VogelsWithSynapticDynamic',                    slice(None)         ),
+
+               ### below uses monitors
+               #(CUBAFixedConnectivity,                          'CUBAFixedConnectivity',                        slice(None)         ),
+               #(COBAHHFixedConnectivity,                        'COBAHHFixedConnectivity',                      slice(None, -1)     ),
+]
+
+configurations = [config[0] for config in configs]
+project_dirs = [config[1] for config in configs]
+
+# check if multiple Configurations with same project_dirs are specified
+last_idx = {}
+for proj_dir in project_dirs:
+    if proj_dir is not None:
+        first_i = project_dirs.index(proj_dir)
+        last_i = len(project_dirs) - 1 - project_dirs[::-1].index(proj_dir)
+        if first_i != last_i:
+            print("WARNING there are multiple configurations using {d} as project "
+                  "directory. Profiling and logfiles will only be saved for the last one {c}.".format(
+                  d=proj_dir, c=configurations[last_i].__name__))
+        last_idx[proj_dir] = last_i
+
+time_stemp = time.time()
+date_str = datetime.datetime.fromtimestamp(time_stemp).strftime('%Y_%m_%d')
+
+directory = 'results_{}{}'.format(date_str, additional_dir_name)
+if os.path.exists(directory):
+    new_dir = directory + '_bak_' + str(int(time.time()))
+    print("Directory with name `{}` already exists. Renaming it to `{}`.".format(directory, new_dir))
+    os.rename(directory, new_dir)
+os.makedirs(directory)
+data_dir = os.path.join(directory, 'data')
+plot_dir = os.path.join(directory, 'plots')
+log_dir = os.path.join(directory, 'logs')
+prof_dir = os.path.join(directory, 'nvprof')
+os.makedirs(data_dir)
+os.makedirs(plot_dir)
+os.makedirs(log_dir)
+os.makedirs(prof_dir)
+print("Saving results in {}.".format(plot_dir))
+
+shutil.copy(os.path.realpath(__file__), os.path.join(directory, 'run_speed_test_script.py'))
+
+time_format = '%d.%m.%Y at %H:%M:%S'
+script_start = datetime.datetime.fromtimestamp(time.time()).strftime(time_format)
+
+with open(os.path.join(directory, 'git.diff'), 'w') as diff_file:
+    subprocess.call(['git', 'diff'], stdout=diff_file)
+
+try:
+    for n, (st, name, sl) in enumerate(speed_tests):
+        start = datetime.datetime.fromtimestamp(time.time()).strftime(time_format)
+        print("Starting {} on {}.".format(name, start))
+        maximum_run_time = 1*60*60*second
+        res = run_speed_tests(configurations=configurations,
+                              speed_tests=[st],
+                              n_slice=sl,
+                              #n_slice=slice(0,1,None),
+                              run_twice=False,
+                              verbose=True,
+                              maximum_run_time=maximum_run_time#,
+                              ## this needs modification of brian2 code
+                              #profile_only_active=True 
+                              #profile_only_active=False
+                             )
+        end = datetime.datetime.fromtimestamp(time.time()).strftime(time_format)
+        diff = datetime.datetime.strptime(end, time_format) - datetime.datetime.strptime(start, time_format)
+        print("Running {} took {}.".format(name, diff))
+        res.plot_all_tests()
+        ## this needs modification of brian2 code
+        #res.plot_all_tests(print_relative=True)
+        savefig(os.path.join(plot_dir, 'speed_test_{}_absolute.png'.format(speed_tests[n][1])))
+        res.plot_all_tests(relative=True)
+        savefig(os.path.join(plot_dir, 'speed_test_{}_relative.png'.format(name)))
+        res.plot_all_tests(profiling_minimum=0.05)
+        savefig(os.path.join(plot_dir, 'speed_test_{}_profiling.png'.format(name)))
+        res.plot_all_tests(profiling_minimum=0.15)
+        savefig(os.path.join(plot_dir, 'speed_test_min_15_{}_profiling.png'.format(name)))
+
+        res.plot_all_tests()
+        ## this needs modification of brian2 code
+        #res.plot_all_tests(print_relative=True)
+        savefig(os.path.join(plot_dir, 'speed_test_{}_absolute.svg'.format(speed_tests[n][1])))
+        res.plot_all_tests(relative=True)
+        savefig(os.path.join(plot_dir, 'speed_test_{}_relative.svg'.format(name)))
+        res.plot_all_tests(profiling_minimum=0.05)
+        savefig(os.path.join(plot_dir, 'speed_test_{}_profiling.svg'.format(name)))
+        res.plot_all_tests(profiling_minimum=0.15)
+        savefig(os.path.join(plot_dir, 'speed_test_min_15_{}_profiling.svg'.format(name)))
+
+        if 3 != len(get_fignums()):
+            print("WARNING: There were {} plots created, but only {} saved.".format(len(get_fignums()), 3*(n+1)))
+        for n in get_fignums():
+            close(n)
+
+        # pickle results object to disk
+        pkl_file = os.path.join(data_dir, name + '.pkl' )
+        with open(pkl_file, 'wb') as output:
+                pickle.dump(res, output, pickle.HIGHEST_PROTOCOL)
+
+        # save stdout log of last run (the others are deleted in run_speed_tests())
+        for proj_dir in set(project_dirs):
+            if not proj_dir is None and proj_dir in ['cuda_standalone', 'cpp_standalone']:
+                config = configurations[last_idx[proj_dir]]
+                stdout_file = os.path.join(proj_dir, 'results/stdout.txt')
+                if os.path.exists(stdout_file):
+                    shutil.copy(stdout_file,
+                                os.path.join(log_dir, 'stdout_{st}_{conf}_{n}.txt'.format(st=name, conf=proj_dir,
+                                                                                           n=st.n_range[sl][-1])))
+                else:
+                    print("WARNING Couldn't save {},file not found.".format(stdout_file))
+
+        # run nvprof on n_range[2]
+        for conf, proj_dir in zip(configurations, project_dirs):
+            main_arg = ''
+            if proj_dir in ['cuda_standalone', 'GeNNworkspace']:
+                if proj_dir == 'GeNNworkspace':
+                    main_arg = 'test {time} 1'.format(time=st.duration/second)
+                ns = st.n_range[sl]
+                idx = 2
+                max_runtime = 20
+                conf_name = conf.__name__
+                print("Rerunning {} with n = {} for nvprof profiling".format(conf_name, st.n_range[idx]))
+                tb, res, runtime, prof_info = results(conf, st, st.n_range[idx], maximum_run_time=maximum_run_time)
+                if not isinstance(res, Exception) and runtime < max_runtime:
+                    option = '--profile-from-start-off' if proj_dir == 'cuda_standalone' else ''
+                    cmd = 'cd {proj_dir} && nvprof {opt} --log-file ../{log_file} ./main {arg}'.format(
+                        proj_dir=proj_dir, arg=main_arg, opt=option,
+                        log_file=os.path.join(prof_dir, 'nvprof_{st}_{conf}_{n}.log'.format(
+                            st=name, conf=conf_name, n=st.n_range[idx])))
+                    prof_start = datetime.datetime.fromtimestamp(time.time()).strftime(time_format)
+                    print(cmd)
+                    x = os.system(cmd)
+                    if x:
+                        print('nvprof failed with {}'.format(x))
+                    prof_end = datetime.datetime.fromtimestamp(time.time()).strftime(time_format)
+                    prof_diff = datetime.datetime.strptime(prof_end, time_format) - datetime.datetime.strptime(prof_start, time_format)
+                    print("Profiling took {} for runtime of {}".format(prof_diff, runtime))
+finally:
+    create_readme(directory)
+    print("\nSummarized speed test results in {}".format(directory + '/README.md'))
+    script_end = datetime.datetime.fromtimestamp(time.time()).strftime(time_format)
+    script_diff = datetime.datetime.strptime(script_end, time_format) - datetime.datetime.strptime(script_start, time_format)
+    print("Finished speed test on {}. Total time = {}.".format(
+        datetime.datetime.fromtimestamp(time.time()).strftime(time_format), script_diff))
+
+
+##res.plot_all_tests(relative=True)
+#for n in get_fignums():
+#    plt.figure(n)
+#    savefig(plot_dir + '/speed_test_{}.png'.format(speed_tests[n-1][1]))
+
+## Debug (includes profiling infos)
+#from brian2.tests.features.base import results
+#for x in results(LocalConfiguration, LinearNeuronsOnly, 10, maximum_run_time=10*second):
+#    print x
diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/README.md b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/README.md
new file mode 100644
index 00000000..99a61ae2
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/README.md
@@ -0,0 +1,160 @@
+
+# Benchmark results from 28.11.2017
+## Description:
+
+
+
+## Last git log:
+```
+commit 8987de24ed9f4a3b1a276496407fca1087f04004
+Author: Denis Alevi <mail@denisalevi.de>
+Date:   Mon Nov 20 14:31:09 2017 +0100
+
+    Fix critical section to include the actual pushing
+
+```
+There is also a `git diff` saved in the current directory.
+
+## Results
+
+### BrunelHakimModelHeterogeneousDelay
+![](plots/speed_test_BrunelHakimModelHeterogeneousDelay_absolute.svg)
+![](plots/speed_test_BrunelHakimModelHeterogeneousDelay_profiling.svg)
+![](plots/speed_test_BrunelHakimModelHeterogeneousDelay_relative.svg)
+
+<details><summary>Exemplary `nvprof` results for **CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==11697== NVPROF is profiling process 11697, command: ./main
+==11697== Profiling application: ./main
+==11697== Profiling result:
+            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
+ GPU activities:   35.88%  122.08ms      2517  48.504us  14.144us  1.1319ms  _run_synapses_pre_push_spikes_push_kernel(unsigned int, unsigned int, unsigned int, int*)
+                   18.92%  64.378ms     10000  6.4370us  3.5520us  8.5120us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, double*, int*)
+                   13.37%  45.473ms     10000  4.5470us  4.2560us  5.4400us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double, double*, double*, double*, bool*, float*)
+                    8.57%  29.165ms     10000  2.9160us  2.7200us  4.3200us  _run_synapses_pre_push_spikes_advance_kernel(void)
+                    6.76%  22.989ms     10000  2.2980us  2.0480us  2.8160us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*)
+                    6.05%  20.569ms     10000  2.0560us  2.0160us  4.1290us  [CUDA memcpy DtoH]
+                    5.33%  18.127ms     10000  1.8120us  1.6320us  3.0080us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*)
+                    4.91%  16.719ms     10000  1.6710us  1.3440us  2.6240us  _GLOBAL__N__69_tmpxft_00002bf1_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_97ebdcc0::_reset_neurongroup_thresholder_codeobject(int*)
+                    0.22%  732.58us         1  732.58us  732.58us  732.58us  void gen_sequenced<curandStateXORWOW, float2, normal_args_st, __operator_&__(float2 curand_normal_scaled2<curandStateXORWOW>(curandStateXORWOW*, normal_args_st))>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st)
+      API calls:   63.87%  668.18ms     62518  10.687us  8.5490us  9.4480ms  cudaLaunch
+                   27.88%  291.67ms     10000  29.166us  18.638us  1.1319ms  cudaMemcpy
+                    5.59%  58.512ms    350073     167ns     127ns  325.26us  cudaSetupArgument
+                    1.38%  14.484ms     62518     231ns     156ns  327.91us  cudaConfigureCall
+                    1.23%  12.835ms     52519     244ns     175ns  326.82us  cudaGetLastError
+                    0.03%  305.95us         1  305.95us  305.95us  305.95us  cudaMalloc
+                    0.02%  159.37us         1  159.37us  159.37us  159.37us  cudaMemGetInfo
+                    0.00%  31.728us        39     813ns     618ns  3.2440us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+                    0.00%  31.555us         8  3.9440us  2.9150us  7.0990us  cudaFuncGetAttributes
+                    0.00%  7.5860us         1  7.5860us  7.5860us  7.5860us  cudaDeviceSynchronize
+                    0.00%  6.3490us        12     529ns     354ns  1.3930us  cudaDeviceGetAttribute
+                    0.00%  4.5310us         3  1.5100us  1.0030us  2.4860us  cudaGetDevice
+
+```
+
+</p></details>
+
+
+<details><summary>Exemplary `nvprof` results for **CUDAStandaloneConfigurationPushAtomicResize**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==10355== NVPROF is profiling process 10355, command: ./main
+==10355== Profiling application: ./main
+==10355== Profiling result:
+            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
+ GPU activities:   57.51%  359.36ms     10000  35.935us  2.0800us  84.257us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, double*, int*)
+                   19.13%  119.52ms      2474  48.310us  13.376us  1.5396ms  _run_synapses_pre_push_spikes_push_kernel(unsigned int, unsigned int, unsigned int, int*)
+                    6.91%  43.163ms     10000  4.3160us  4.0640us  6.1440us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double, double*, double*, double*, bool*, float*)
+                    4.20%  26.272ms     10000  2.6270us  2.5600us  4.0960us  _run_synapses_pre_push_spikes_advance_kernel(void)
+                    3.48%  21.766ms     10000  2.1760us  1.9200us  2.8480us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*)
+                    3.22%  20.128ms     10000  2.0120us  1.9520us  4.3200us  [CUDA memcpy DtoH]
+                    2.80%  17.511ms     10000  1.7510us  1.5360us  3.0080us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*)
+                    2.62%  16.370ms     10000  1.6370us  1.5360us  2.7200us  _GLOBAL__N__69_tmpxft_000026b5_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_97ebdcc0::_reset_neurongroup_thresholder_codeobject(int*)
+                    0.12%  733.19us         1  733.19us  733.19us  733.19us  void gen_sequenced<curandStateXORWOW, float2, normal_args_st, __operator_&__(float2 curand_normal_scaled2<curandStateXORWOW>(curandStateXORWOW*, normal_args_st))>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st)
+      API calls:   58.61%  668.77ms     62475  10.704us  8.6680us  8.8399ms  cudaLaunch
+                   33.75%  385.14ms     10000  38.513us  18.463us  1.5525ms  cudaMemcpy
+                    5.06%  57.722ms    349901     164ns     123ns  329.86us  cudaSetupArgument
+                    1.38%  15.780ms     62475     252ns     172ns  322.70us  cudaConfigureCall
+                    1.15%  13.126ms     52476     250ns     203ns  308.93us  cudaGetLastError
+                    0.02%  253.22us         1  253.22us  253.22us  253.22us  cudaMalloc
+                    0.01%  144.31us         1  144.31us  144.31us  144.31us  cudaMemGetInfo
+                    0.00%  29.459us         8  3.6820us  2.8650us  6.3900us  cudaFuncGetAttributes
+                    0.00%  28.673us        39     735ns     615ns  2.0410us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+                    0.00%  11.375us         1  11.375us  11.375us  11.375us  cudaDeviceSynchronize
+                    0.00%  6.4040us        12     533ns     341ns  1.4790us  cudaDeviceGetAttribute
+                    0.00%  3.9250us         3  1.3080us     891ns  2.0500us  cudaGetDevice
+
+```
+
+</p></details>
+
+
+<details><summary>Exemplary `nvprof` results for **CUDAStandaloneConfigurationTestBrunelHeteroAtomics**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==11034== NVPROF is profiling process 11034, command: ./main
+==11034== Profiling application: ./main
+==11034== Profiling result:
+            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
+ GPU activities:   45.01%  151.33ms     10000  15.133us  1.6960us  1.0793ms  _run_synapses_pre_push_spikes_push_kernel(unsigned int, unsigned int, unsigned int, int*)
+                   18.06%  60.710ms     10000  6.0710us  3.4240us  8.0000us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, double*, int*)
+                   12.22%  41.097ms     10000  4.1090us  3.9040us  5.3760us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double, double*, double*, double*, bool*, float*)
+                    8.12%  27.312ms     10000  2.7310us  2.6560us  4.4800us  _run_synapses_pre_push_spikes_advance_kernel(void)
+                    6.93%  23.311ms     10000  2.3310us  2.1440us  3.7440us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*)
+                    5.17%  17.383ms     10000  1.7380us  1.5680us  3.7120us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*)
+                    4.27%  14.359ms     10000  1.4350us  1.3120us  3.7440us  _GLOBAL__N__69_tmpxft_0000295b_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_97ebdcc0::_reset_neurongroup_thresholder_codeobject(int*)
+                    0.22%  732.61us         1  732.61us  732.61us  732.61us  void gen_sequenced<curandStateXORWOW, float2, normal_args_st, __operator_&__(float2 curand_normal_scaled2<curandStateXORWOW>(curandStateXORWOW*, normal_args_st))>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st)
+      API calls:   88.41%  688.82ms     70001  9.8400us  8.6450us  8.6670ms  cudaLaunch
+                    7.80%  60.794ms    380005     159ns     122ns  319.71us  cudaSetupArgument
+                    1.88%  14.685ms     70001     209ns     166ns  314.53us  cudaConfigureCall
+                    1.84%  14.372ms     60002     239ns     197ns  307.61us  cudaGetLastError
+                    0.03%  259.35us         1  259.35us  259.35us  259.35us  cudaMalloc
+                    0.02%  147.03us         1  147.03us  147.03us  147.03us  cudaMemGetInfo
+                    0.00%  29.491us        39     756ns     620ns  1.8670us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+                    0.00%  28.449us         8  3.5560us  2.7550us  5.7160us  cudaFuncGetAttributes
+                    0.00%  13.847us         1  13.847us  13.847us  13.847us  cudaDeviceSynchronize
+                    0.00%  6.0720us        12     506ns     338ns  1.3940us  cudaDeviceGetAttribute
+                    0.00%  3.5940us         3  1.1980us     850ns  1.8350us  cudaGetDevice
+
+```
+
+</p></details>
+
+
+<details><summary>Exemplary `nvprof` results for **CUDAStandaloneConfiguration**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==9683== NVPROF is profiling process 9683, command: ./main
+==9683== Profiling application: ./main
+==9683== Profiling result:
+            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
+ GPU activities:   57.63%  349.62ms     10000  34.962us  1.8880us  75.808us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, double*, int*)
+                   22.08%  133.93ms     10000  13.392us  1.6960us  947.33us  _run_synapses_pre_push_spikes_push_kernel(unsigned int, unsigned int, unsigned int, int*)
+                    6.72%  40.747ms     10000  4.0740us  3.8720us  5.4080us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double, double*, double*, double*, bool*, float*)
+                    4.24%  25.697ms     10000  2.5690us  2.4640us  4.1920us  _run_synapses_pre_push_spikes_advance_kernel(void)
+                    3.74%  22.682ms     10000  2.2680us  2.0160us  4.0320us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*)
+                    3.10%  18.782ms     10000  1.8780us  1.7280us  3.9040us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*)
+                    2.39%  14.485ms     10000  1.4480us  1.2800us  3.9040us  _GLOBAL__N__69_tmpxft_00002413_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_97ebdcc0::_reset_neurongroup_thresholder_codeobject(int*)
+                    0.12%  732.29us         1  732.29us  732.29us  732.29us  void gen_sequenced<curandStateXORWOW, float2, normal_args_st, __operator_&__(float2 curand_normal_scaled2<curandStateXORWOW>(curandStateXORWOW*, normal_args_st))>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st)
+      API calls:   88.80%  701.01ms     70001  10.014us  8.3580us  9.2616ms  cudaLaunch
+                    7.49%  59.115ms    380005     155ns     121ns  308.33us  cudaSetupArgument
+                    1.87%  14.751ms     70001     210ns     163ns  298.58us  cudaConfigureCall
+                    1.77%  13.941ms     60002     232ns     183ns  295.21us  cudaGetLastError
+                    0.04%  282.56us         1  282.56us  282.56us  282.56us  cudaMalloc
+                    0.02%  148.30us         1  148.30us  148.30us  148.30us  cudaMemGetInfo
+                    0.01%  81.989us         1  81.989us  81.989us  81.989us  cudaDeviceSynchronize
+                    0.00%  30.446us         8  3.8050us  2.9540us  6.3180us  cudaFuncGetAttributes
+                    0.00%  27.544us        39     706ns     585ns  1.9970us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+                    0.00%  6.1470us        12     512ns     337ns  1.2730us  cudaDeviceGetAttribute
+                    0.00%  4.1110us         3  1.3700us     857ns  2.3570us  cudaGetDevice
+
+```
+
+</p></details>
+
+
diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/data/BrunelHakimModelHeterogeneousDelay.pkl b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/data/BrunelHakimModelHeterogeneousDelay.pkl
new file mode 100644
index 00000000..f0d1527a
Binary files /dev/null and b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/data/BrunelHakimModelHeterogeneousDelay.pkl differ
diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/git.diff b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/git.diff
new file mode 100644
index 00000000..44a84fa2
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/git.diff
@@ -0,0 +1,305 @@
+diff --git a/brian2cuda/tests/features/cuda_configuration.py b/brian2cuda/tests/features/cuda_configuration.py
+index 250687d..622f73a 100644
+--- a/brian2cuda/tests/features/cuda_configuration.py
++++ b/brian2cuda/tests/features/cuda_configuration.py
+@@ -225,7 +225,7 @@ class CUDAStandaloneConfigurationProfileCPU(Configuration):
+                             with_output=False)
+ 
+ class CUDAStandaloneConfigurationTestBrunelHeteroAtomics(Configuration):
+-    name = 'CUDA standalone with atomics in heterog delay mode'
++    name = 'CUDA standalone with atomics in effect application'
+     def before_run(self):
+         brian2.set_device('cuda_standalone', build_on_run=False)
+         prefs["devices.cuda_standalone.test_brunel_hetero_atomics"] = True
+@@ -248,7 +248,7 @@ class CUDAStandaloneConfigurationTestBrunelHeteroAtomics(Configuration):
+                             with_output=False)
+ 
+ class CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU(Configuration):
+-    name = "CUDA standalone with atomics in heterog delay mode (profile='blocking')"
++    name = "CUDA standalone with atomics in effect application (profile='blocking')"
+     def before_run(self):
+         brian2.set_device('cuda_standalone', build_on_run=False, profile='blocking')
+         prefs["devices.cuda_standalone.test_brunel_hetero_atomics"] = True
+@@ -270,12 +270,10 @@ class CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU(Configuration
+         brian2.device.build(directory='cuda_standalone', compile=True, run=True,
+                             with_output=False)
+ 
+-
+ class CUDAStandaloneConfigurationPushAtomicResize(Configuration):
+-    name = "CUDA standalone with atomics in queue resize"
++    name = "CUDA standalone with atomics in spikequeue resize"
+     def before_run(self):
+         brian2.set_device('cuda_standalone', build_on_run=False)
+-        prefs["devices.cuda_standalone.test_brunel_hetero_atomics"] = True
+         prefs["devices.cuda_standalone.push_atomic_resize"] = True
+         if socket.gethostname() == 'elnath':
+             if prefs['devices.cpp_standalone.extra_make_args_unix'] == ['-j12']:
+@@ -295,8 +293,82 @@ class CUDAStandaloneConfigurationPushAtomicResize(Configuration):
+         brian2.device.build(directory='cuda_standalone', compile=True, run=True,
+                             with_output=False)
+ 
++
+ class CUDAStandaloneConfigurationPushAtomicResizeProfileCPU(Configuration):
+-    name = "CUDA standalone with atomics in queue resize (profile='blocking')"
++    name = "CUDA standalone with atomics in spikequeue resize (profile='blocking')"
++    def before_run(self):
++        brian2.set_device('cuda_standalone', build_on_run=False, profile='blocking')
++        prefs["devices.cuda_standalone.push_atomic_resize"] = True
++        if socket.gethostname() == 'elnath':
++            if prefs['devices.cpp_standalone.extra_make_args_unix'] == ['-j12']:
++                prefs['devices.cpp_standalone.extra_make_args_unix'] = ['-j24']
++            prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35')
++            prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_20'])
++        elif socket.gethostname() == 'sabik':
++            prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35')
++            prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_52'])
++        elif socket.gethostname() == 'eltanin':
++            prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35')
++            prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_61'])
++
++    def after_run(self):
++        if os.path.exists('cuda_standalone'):
++            shutil.rmtree('cuda_standalone')
++        brian2.device.build(directory='cuda_standalone', compile=True, run=True,
++                            with_output=False)
++
++
++class CUDAStandaloneConfigurationPushAtomicResizProfileCPU(Configuration):
++    name = "CUDA standalone with atomics in spikequeue resize (profile='blocking)"
++    def before_run(self):
++        brian2.set_device('cuda_standalone', build_on_run=False,
++                          profile='blocking')
++        prefs["devices.cuda_standalone.push_atomic_resize"] = True
++        if socket.gethostname() == 'elnath':
++            if prefs['devices.cpp_standalone.extra_make_args_unix'] == ['-j12']:
++                prefs['devices.cpp_standalone.extra_make_args_unix'] = ['-j24']
++            prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35')
++            prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_20'])
++        elif socket.gethostname() == 'sabik':
++            prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35')
++            prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_52'])
++        elif socket.gethostname() == 'eltanin':
++            prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35')
++            prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_61'])
++
++    def after_run(self):
++        if os.path.exists('cuda_standalone'):
++            shutil.rmtree('cuda_standalone')
++        brian2.device.build(directory='cuda_standalone', compile=True, run=True,
++                            with_output=False)
++
++
++class CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize(Configuration):
++    name = "CUDA standalone with atomics in effect application and in spikequeue resize"
++    def before_run(self):
++        brian2.set_device('cuda_standalone', build_on_run=False)
++        prefs["devices.cuda_standalone.test_brunel_hetero_atomics"] = True
++        prefs["devices.cuda_standalone.push_atomic_resize"] = True
++        if socket.gethostname() == 'elnath':
++            if prefs['devices.cpp_standalone.extra_make_args_unix'] == ['-j12']:
++                prefs['devices.cpp_standalone.extra_make_args_unix'] = ['-j24']
++            prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35')
++            prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_20'])
++        elif socket.gethostname() == 'sabik':
++            prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35')
++            prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_52'])
++        elif socket.gethostname() == 'eltanin':
++            prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35')
++            prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_61'])
++
++    def after_run(self):
++        if os.path.exists('cuda_standalone'):
++            shutil.rmtree('cuda_standalone')
++        brian2.device.build(directory='cuda_standalone', compile=True, run=True,
++                            with_output=False)
++
++class CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResizeProfileCPU(Configuration):
++    name = "CUDA standalone with atomics in effect application and in spikequeue resize (profile='blocking')"
+     def before_run(self):
+         brian2.set_device('cuda_standalone', build_on_run=False, profile='blocking')
+         prefs["devices.cuda_standalone.test_brunel_hetero_atomics"] = True
+diff --git a/dev/benchmarks/run_speed_tests.py b/dev/benchmarks/run_speed_tests.py
+index 2518634..b525e97 100644
+--- a/dev/benchmarks/run_speed_tests.py
++++ b/dev/benchmarks/run_speed_tests.py
+@@ -37,6 +37,7 @@ from brian2cuda.tests.features.cuda_configuration import (CUDAStandaloneConfigur
+                                                           CUDAStandaloneConfigurationTestBrunelHeteroAtomics,
+                                                           CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU,
+                                                           CUDAStandaloneConfigurationPushAtomicResize,
++                                                          CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize,
+                                                           CUDAStandaloneConfigurationPushAtomicResizeProfileCPU,
+                                                           CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpy,
+                                                           CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpyProfileCPU)
+@@ -61,79 +62,80 @@ if socket.gethostname() == 'elnath':
+     prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_20'])
+ 
+ configs = [# configuration                          project_directory
+-          (NumpyConfiguration,                     None),
+-          (WeaveConfiguration,                     None),
+-          (LocalConfiguration,                     None),
++          #(NumpyConfiguration,                     None),
++          #(WeaveConfiguration,                     None),
++          #(LocalConfiguration,                     None),
++          #(CPPStandaloneConfiguration,              'cpp_standalone'),
++          #(CPPStandaloneConfigurationOpenMP,        'cpp_standalone'),
+           (CUDAStandaloneConfiguration,             'cuda_standalone'),
+-          (CUDAStandaloneConfigurationExtraThresholdKernel,             'cuda_standalone'),
+-          (CUDAStandaloneConfigurationNoAssert,             'cuda_standalone'),
+-          (CUDAStandaloneConfigurationCurandDouble,              'cuda_standalone'),
+-          (CUDAStandaloneConfigurationNoCudaOccupancyAPI,      'cuda_standalone'),
+-          (CUDAStandaloneConfigurationNoCudaOccupancyAPIProfileCPU,    'cuda_standalone'),
+-          (CUDAStandaloneConfiguration2BlocksPerSM, 'cuda_standalone'),
+-          (CUDAStandaloneConfiguration2BlocksPerSMLaunchBounds, 'cuda_standalone'),
+-          (CUDAStandaloneConfigurationSynLaunchBounds,     'cuda_standalone'),
+-          (CUDAStandaloneConfiguration2BlocksPerSMSynLaunchBounds, 'cuda_standalone'),
+-          (CUDAStandaloneConfigurationProfileGPU,   'cuda_standalone'),
+-          (CUDAStandaloneConfigurationProfileCPU,   'cuda_standalone'),
+-          (CPPStandaloneConfiguration,              'cpp_standalone'),
+-          (CUDAStandaloneConfigurationTestBrunelHeteroAtomics,   'cuda_standalone'),
+-          (CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU,   'cuda_standalone'),
+           (CUDAStandaloneConfigurationPushAtomicResize,   'cuda_standalone'),
+-          (CUDAStandaloneConfigurationPushAtomicResizeProfileCPU,   'cuda_standalone'),
+-          (CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpy,   'cuda_standalone'),
+-          (CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpyProfileCPU,   'cuda_standalone'),
+-          (GeNNConfiguration,                       'GeNNworkspace'),
+-          (CPPStandaloneConfigurationOpenMP,        'cpp_standalone'),
+-          (GeNNConfigurationCPU,                    'GeNNworkspace'),
+-          (GeNNConfigurationOptimized,              'GeNNworkspace')
++          (CUDAStandaloneConfigurationTestBrunelHeteroAtomics,   'cuda_standalone'),
++          (CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize,     'cuda_standalone'),
++          #(CUDAStandaloneConfigurationExtraThresholdKernel,             'cuda_standalone'),
++          #(CUDAStandaloneConfigurationNoAssert,             'cuda_standalone'),
++          #(CUDAStandaloneConfigurationCurandDouble,              'cuda_standalone'),
++          #(CUDAStandaloneConfigurationNoCudaOccupancyAPI,      'cuda_standalone'),
++          #(CUDAStandaloneConfigurationNoCudaOccupancyAPIProfileCPU,    'cuda_standalone'),
++          #(CUDAStandaloneConfiguration2BlocksPerSM, 'cuda_standalone'),
++          #(CUDAStandaloneConfiguration2BlocksPerSMLaunchBounds, 'cuda_standalone'),
++          #(CUDAStandaloneConfigurationSynLaunchBounds,     'cuda_standalone'),
++          #(CUDAStandaloneConfiguration2BlocksPerSMSynLaunchBounds, 'cuda_standalone'),
++          #(CUDAStandaloneConfigurationProfileGPU,   'cuda_standalone'),
++          #(CUDAStandaloneConfigurationProfileCPU,   'cuda_standalone'),
++          #(CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU,   'cuda_standalone'),
++          #(CUDAStandaloneConfigurationPushAtomicResizeProfileCPU,   'cuda_standalone'),
++          #(CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpy,   'cuda_standalone'),
++          #(CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpyProfileCPU,   'cuda_standalone'),
++          #(GeNNConfiguration,                       'GeNNworkspace'),
++          #(GeNNConfigurationCPU,                    'GeNNworkspace'),
++          #(GeNNConfigurationOptimized,              'GeNNworkspace')
+           ]
+ 
+ speed_tests = [# feature_test                     name                                  n_slice
+ 
+-               (ThresholderOnlyPoissonLowRate,                  'ThresholderOnlyPoissonLowRate',                slice(None)         ),
+-               (ThresholderOnlyPoissonMediumRate,               'ThresholderOnlyPoissonMediumRate',             slice(None)         ),
+-               (ThresholderOnlyPoissonHighRate,                 'ThresholderOnlyPoissonHighRate',               slice(None)         ),
+-               (ThresholderOnlyAlwaysSpiking,                   'ThresholderOnlyAlwaysSpiking',                 slice(None)         ),
+-
+-               (BrunelHakimStateupdateOnlyDouble,               'BrunelHakimStateupdateOnlyDouble',             slice(None)         ),
+-               (BrunelHakimStateupdateOnlyTriple,               'BrunelHakimStateupdateOnlyTriple',             slice(None)         ),
+-               (BrunelHakimStateupdateOnly,                     'BrunelHakimStateupdateOnly',                   slice(None)         ),
+-               (BrunelHakimNeuronsOnly,                         'BrunelHakimNeuronsOnly',                       slice(None)         ),
+-               (BrunelHakimNeuronsOnlyNoXi,                     'BrunelHakimNeuronsOnlyNoXi',                   slice(None)         ),
+-               (BrunelHakimNeuronsOnlyNoRand,                   'BrunelHakimNeuronsOnlyNoRand',                 slice(None)         ),
+-               (BrunelHakimStateupdateThresholdOnly,            'BrunelHakimStateupdateThresholdOnly',          slice(None)         ),
+-               (BrunelHakimStateupdateThresholdResetOnly,       'BrunelHakimStateupdateThresholdResetOnly',     slice(None)         ),
+-               (BrunelHakimModelScalarDelayShort,               'BrunelHakimModelScalarDelayShort',             slice(None)         ),
+-               (BrunelHakimModelScalarDelayNoSelfConnections,   'BrunelHakimModelScalarDelayNoSelfConnections', slice(None)         ),
+-               (CUBA,                                           'CUBA',                                         slice(None)         ),
+-               (COBAHH,                                         'COBAHH',                                       slice(None)         ),
+-               (AdaptationOscillation,                          'AdaptationOscillation',                        slice(None)         ),
+-               (Vogels,                                         'Vogels',                                       slice(None)         ),
+-               (STDP,                                           'STDP',                                         slice(None)         ),
+-               (STDPEventDriven,                                'STDPEventDriven',                              slice(None)         ),
+-               (BrunelHakimModelScalarDelay,                    'BrunelHakimModelScalarDelay',                  slice(None)         ),
+-
+-               (VerySparseMediumRateSynapsesOnly,               'VerySparseMediumRateSynapsesOnly',             slice(None)         ),
+-               (SparseMediumRateSynapsesOnly,                   'SparseMediumRateSynapsesOnly',                 slice(None)         ),
+-               (DenseMediumRateSynapsesOnly,                    'DenseMediumRateSynapsesOnly',                  slice(None)         ),
+-               (SparseLowRateSynapsesOnly,                      'SparseLowRateSynapsesOnly',                    slice(None)         ),
+-               (SparseHighRateSynapsesOnly,                     'SparseHighRateSynapsesOnly',                   slice(None)         ),
+-
+-               (STDPNotEventDriven,                             'STDPNotEventDriven',                           slice(None)         ),
+-               (STDPMultiPost,                                  'STDPMultiPost',                                slice(None)         ),
+-               (STDPNeuronalTraces,                             'STDPNeuronalTraces',                           slice(None)         ),
+-               (STDPMultiPostNeuronalTraces,                    'STDPMultiPostNeuronalTraces',                  slice(None)         ),
++               #(ThresholderOnlyPoissonLowRate,                  'ThresholderOnlyPoissonLowRate',                slice(None)         ),
++               #(ThresholderOnlyPoissonMediumRate,               'ThresholderOnlyPoissonMediumRate',             slice(None)         ),
++               #(ThresholderOnlyPoissonHighRate,                 'ThresholderOnlyPoissonHighRate',               slice(None)         ),
++               #(ThresholderOnlyAlwaysSpiking,                   'ThresholderOnlyAlwaysSpiking',                 slice(None)         ),
++
++               #(BrunelHakimStateupdateOnlyDouble,               'BrunelHakimStateupdateOnlyDouble',             slice(None)         ),
++               #(BrunelHakimStateupdateOnlyTriple,               'BrunelHakimStateupdateOnlyTriple',             slice(None)         ),
++               #(BrunelHakimStateupdateOnly,                     'BrunelHakimStateupdateOnly',                   slice(None)         ),
++               #(BrunelHakimNeuronsOnly,                         'BrunelHakimNeuronsOnly',                       slice(None)         ),
++               #(BrunelHakimNeuronsOnlyNoXi,                     'BrunelHakimNeuronsOnlyNoXi',                   slice(None)         ),
++               #(BrunelHakimNeuronsOnlyNoRand,                   'BrunelHakimNeuronsOnlyNoRand',                 slice(None)         ),
++               #(BrunelHakimStateupdateThresholdOnly,            'BrunelHakimStateupdateThresholdOnly',          slice(None)         ),
++               #(BrunelHakimStateupdateThresholdResetOnly,       'BrunelHakimStateupdateThresholdResetOnly',     slice(None)         ),
++               #(BrunelHakimModelScalarDelayShort,               'BrunelHakimModelScalarDelayShort',             slice(None)         ),
++               #(BrunelHakimModelScalarDelayNoSelfConnections,   'BrunelHakimModelScalarDelayNoSelfConnections', slice(None)         ),
++               #(CUBA,                                           'CUBA',                                         slice(None)         ),
++               #(COBAHH,                                         'COBAHH',                                       slice(None)         ),
++               #(AdaptationOscillation,                          'AdaptationOscillation',                        slice(None)         ),
++               #(Vogels,                                         'Vogels',                                       slice(None)         ),
++               #(STDP,                                           'STDP',                                         slice(None)         ),
++               #(STDPEventDriven,                                'STDPEventDriven',                              slice(None)         ),
++               #(BrunelHakimModelScalarDelay,                    'BrunelHakimModelScalarDelay',                  slice(None)         ),
++
++               #(VerySparseMediumRateSynapsesOnly,               'VerySparseMediumRateSynapsesOnly',             slice(None)         ),
++               #(SparseMediumRateSynapsesOnly,                   'SparseMediumRateSynapsesOnly',                 slice(None)         ),
++               #(DenseMediumRateSynapsesOnly,                    'DenseMediumRateSynapsesOnly',                  slice(None)         ),
++               #(SparseLowRateSynapsesOnly,                      'SparseLowRateSynapsesOnly',                    slice(None)         ),
++               #(SparseHighRateSynapsesOnly,                     'SparseHighRateSynapsesOnly',                   slice(None)         ),
++
++               #(STDPNotEventDriven,                             'STDPNotEventDriven',                           slice(None)         ),
++               #(STDPMultiPost,                                  'STDPMultiPost',                                slice(None)         ),
++               #(STDPNeuronalTraces,                             'STDPNeuronalTraces',                           slice(None)         ),
++               #(STDPMultiPostNeuronalTraces,                    'STDPMultiPostNeuronalTraces',                  slice(None)         ),
+ 
+                (BrunelHakimModelHeterogeneousDelay,             'BrunelHakimModelHeterogeneousDelay',           slice(None)         ),
+ 
+-               (LinearNeuronsOnly,                              'LinearNeuronsOnly',                            slice(None)         ),
+-               (HHNeuronsOnly,                                  'HHNeuronsOnly',                                slice(None)         ),
+-               (VogelsWithSynapticDynamic,                      'VogelsWithSynapticDynamic',                    slice(None)         ),
++               #(LinearNeuronsOnly,                              'LinearNeuronsOnly',                            slice(None)         ),
++               #(HHNeuronsOnly,                                  'HHNeuronsOnly',                                slice(None)         ),
++               #(VogelsWithSynapticDynamic,                      'VogelsWithSynapticDynamic',                    slice(None)         ),
+ 
+-               ## below uses monitors
+-               (CUBAFixedConnectivity,                          'CUBAFixedConnectivity',                        slice(None)         ),
+-               (COBAHHFixedConnectivity,                        'COBAHHFixedConnectivity',                      slice(None, -1)     ),
++               ### below uses monitors
++               #(CUBAFixedConnectivity,                          'CUBAFixedConnectivity',                        slice(None)         ),
++               #(COBAHHFixedConnectivity,                        'COBAHHFixedConnectivity',                      slice(None, -1)     ),
+ ]
+ 
+ configurations = [config[0] for config in configs]
+@@ -205,6 +207,16 @@ try:
+         savefig(os.path.join(plot_dir, 'speed_test_{}_relative.png'.format(name)))
+         res.plot_all_tests(profiling_minimum=0.05)
+         savefig(os.path.join(plot_dir, 'speed_test_{}_profiling.png'.format(name)))
++
++        res.plot_all_tests()
++        ## this needs modification of brian2 code
++        #res.plot_all_tests(print_relative=True)
++        savefig(os.path.join(plot_dir, 'speed_test_{}_absolute.svg'.format(speed_tests[n][1])))
++        res.plot_all_tests(relative=True)
++        savefig(os.path.join(plot_dir, 'speed_test_{}_relative.svg'.format(name)))
++        res.plot_all_tests(profiling_minimum=0.05)
++        savefig(os.path.join(plot_dir, 'speed_test_{}_profiling.svg'.format(name)))
++
+         if 3 != len(get_fignums()):
+             print("WARNING: There were {} plots created, but only {} saved.".format(len(get_fignums()), 3*(n+1)))
+         for n in get_fignums():
+diff --git a/frozen_repos/brian2 b/frozen_repos/brian2
+--- a/frozen_repos/brian2
++++ b/frozen_repos/brian2
+@@ -1 +1 @@
+-Subproject commit fadc6a0aeb90d1b4d343470628457d8561536f67
++Subproject commit fadc6a0aeb90d1b4d343470628457d8561536f67-dirty
+diff --git a/frozen_repos/brian2genn b/frozen_repos/brian2genn
+--- a/frozen_repos/brian2genn
++++ b/frozen_repos/brian2genn
+@@ -1 +1 @@
+-Subproject commit 0553cafeab49ea5403c0230411035df504d4db06
++Subproject commit 0553cafeab49ea5403c0230411035df504d4db06-dirty
diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/logs/stdout_BrunelHakimModelHeterogeneousDelay_cuda_standalone_100000.txt b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/logs/stdout_BrunelHakimModelHeterogeneousDelay_cuda_standalone_100000.txt
new file mode 100644
index 00000000..e69de29b
diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize_1000.log b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize_1000.log
new file mode 100644
index 00000000..caa4f6e7
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize_1000.log
@@ -0,0 +1,25 @@
+==11697== NVPROF is profiling process 11697, command: ./main
+==11697== Profiling application: ./main
+==11697== Profiling result:
+            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
+ GPU activities:   35.88%  122.08ms      2517  48.504us  14.144us  1.1319ms  _run_synapses_pre_push_spikes_push_kernel(unsigned int, unsigned int, unsigned int, int*)
+                   18.92%  64.378ms     10000  6.4370us  3.5520us  8.5120us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, double*, int*)
+                   13.37%  45.473ms     10000  4.5470us  4.2560us  5.4400us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double, double*, double*, double*, bool*, float*)
+                    8.57%  29.165ms     10000  2.9160us  2.7200us  4.3200us  _run_synapses_pre_push_spikes_advance_kernel(void)
+                    6.76%  22.989ms     10000  2.2980us  2.0480us  2.8160us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*)
+                    6.05%  20.569ms     10000  2.0560us  2.0160us  4.1290us  [CUDA memcpy DtoH]
+                    5.33%  18.127ms     10000  1.8120us  1.6320us  3.0080us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*)
+                    4.91%  16.719ms     10000  1.6710us  1.3440us  2.6240us  _GLOBAL__N__69_tmpxft_00002bf1_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_97ebdcc0::_reset_neurongroup_thresholder_codeobject(int*)
+                    0.22%  732.58us         1  732.58us  732.58us  732.58us  void gen_sequenced<curandStateXORWOW, float2, normal_args_st, __operator_&__(float2 curand_normal_scaled2<curandStateXORWOW>(curandStateXORWOW*, normal_args_st))>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st)
+      API calls:   63.87%  668.18ms     62518  10.687us  8.5490us  9.4480ms  cudaLaunch
+                   27.88%  291.67ms     10000  29.166us  18.638us  1.1319ms  cudaMemcpy
+                    5.59%  58.512ms    350073     167ns     127ns  325.26us  cudaSetupArgument
+                    1.38%  14.484ms     62518     231ns     156ns  327.91us  cudaConfigureCall
+                    1.23%  12.835ms     52519     244ns     175ns  326.82us  cudaGetLastError
+                    0.03%  305.95us         1  305.95us  305.95us  305.95us  cudaMalloc
+                    0.02%  159.37us         1  159.37us  159.37us  159.37us  cudaMemGetInfo
+                    0.00%  31.728us        39     813ns     618ns  3.2440us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+                    0.00%  31.555us         8  3.9440us  2.9150us  7.0990us  cudaFuncGetAttributes
+                    0.00%  7.5860us         1  7.5860us  7.5860us  7.5860us  cudaDeviceSynchronize
+                    0.00%  6.3490us        12     529ns     354ns  1.3930us  cudaDeviceGetAttribute
+                    0.00%  4.5310us         3  1.5100us  1.0030us  2.4860us  cudaGetDevice
diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfigurationPushAtomicResize_1000.log b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfigurationPushAtomicResize_1000.log
new file mode 100644
index 00000000..acabeb8f
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfigurationPushAtomicResize_1000.log
@@ -0,0 +1,25 @@
+==10355== NVPROF is profiling process 10355, command: ./main
+==10355== Profiling application: ./main
+==10355== Profiling result:
+            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
+ GPU activities:   57.51%  359.36ms     10000  35.935us  2.0800us  84.257us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, double*, int*)
+                   19.13%  119.52ms      2474  48.310us  13.376us  1.5396ms  _run_synapses_pre_push_spikes_push_kernel(unsigned int, unsigned int, unsigned int, int*)
+                    6.91%  43.163ms     10000  4.3160us  4.0640us  6.1440us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double, double*, double*, double*, bool*, float*)
+                    4.20%  26.272ms     10000  2.6270us  2.5600us  4.0960us  _run_synapses_pre_push_spikes_advance_kernel(void)
+                    3.48%  21.766ms     10000  2.1760us  1.9200us  2.8480us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*)
+                    3.22%  20.128ms     10000  2.0120us  1.9520us  4.3200us  [CUDA memcpy DtoH]
+                    2.80%  17.511ms     10000  1.7510us  1.5360us  3.0080us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*)
+                    2.62%  16.370ms     10000  1.6370us  1.5360us  2.7200us  _GLOBAL__N__69_tmpxft_000026b5_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_97ebdcc0::_reset_neurongroup_thresholder_codeobject(int*)
+                    0.12%  733.19us         1  733.19us  733.19us  733.19us  void gen_sequenced<curandStateXORWOW, float2, normal_args_st, __operator_&__(float2 curand_normal_scaled2<curandStateXORWOW>(curandStateXORWOW*, normal_args_st))>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st)
+      API calls:   58.61%  668.77ms     62475  10.704us  8.6680us  8.8399ms  cudaLaunch
+                   33.75%  385.14ms     10000  38.513us  18.463us  1.5525ms  cudaMemcpy
+                    5.06%  57.722ms    349901     164ns     123ns  329.86us  cudaSetupArgument
+                    1.38%  15.780ms     62475     252ns     172ns  322.70us  cudaConfigureCall
+                    1.15%  13.126ms     52476     250ns     203ns  308.93us  cudaGetLastError
+                    0.02%  253.22us         1  253.22us  253.22us  253.22us  cudaMalloc
+                    0.01%  144.31us         1  144.31us  144.31us  144.31us  cudaMemGetInfo
+                    0.00%  29.459us         8  3.6820us  2.8650us  6.3900us  cudaFuncGetAttributes
+                    0.00%  28.673us        39     735ns     615ns  2.0410us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+                    0.00%  11.375us         1  11.375us  11.375us  11.375us  cudaDeviceSynchronize
+                    0.00%  6.4040us        12     533ns     341ns  1.4790us  cudaDeviceGetAttribute
+                    0.00%  3.9250us         3  1.3080us     891ns  2.0500us  cudaGetDevice
diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfigurationTestBrunelHeteroAtomics_1000.log b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfigurationTestBrunelHeteroAtomics_1000.log
new file mode 100644
index 00000000..f9dac10c
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfigurationTestBrunelHeteroAtomics_1000.log
@@ -0,0 +1,23 @@
+==11034== NVPROF is profiling process 11034, command: ./main
+==11034== Profiling application: ./main
+==11034== Profiling result:
+            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
+ GPU activities:   45.01%  151.33ms     10000  15.133us  1.6960us  1.0793ms  _run_synapses_pre_push_spikes_push_kernel(unsigned int, unsigned int, unsigned int, int*)
+                   18.06%  60.710ms     10000  6.0710us  3.4240us  8.0000us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, double*, int*)
+                   12.22%  41.097ms     10000  4.1090us  3.9040us  5.3760us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double, double*, double*, double*, bool*, float*)
+                    8.12%  27.312ms     10000  2.7310us  2.6560us  4.4800us  _run_synapses_pre_push_spikes_advance_kernel(void)
+                    6.93%  23.311ms     10000  2.3310us  2.1440us  3.7440us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*)
+                    5.17%  17.383ms     10000  1.7380us  1.5680us  3.7120us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*)
+                    4.27%  14.359ms     10000  1.4350us  1.3120us  3.7440us  _GLOBAL__N__69_tmpxft_0000295b_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_97ebdcc0::_reset_neurongroup_thresholder_codeobject(int*)
+                    0.22%  732.61us         1  732.61us  732.61us  732.61us  void gen_sequenced<curandStateXORWOW, float2, normal_args_st, __operator_&__(float2 curand_normal_scaled2<curandStateXORWOW>(curandStateXORWOW*, normal_args_st))>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st)
+      API calls:   88.41%  688.82ms     70001  9.8400us  8.6450us  8.6670ms  cudaLaunch
+                    7.80%  60.794ms    380005     159ns     122ns  319.71us  cudaSetupArgument
+                    1.88%  14.685ms     70001     209ns     166ns  314.53us  cudaConfigureCall
+                    1.84%  14.372ms     60002     239ns     197ns  307.61us  cudaGetLastError
+                    0.03%  259.35us         1  259.35us  259.35us  259.35us  cudaMalloc
+                    0.02%  147.03us         1  147.03us  147.03us  147.03us  cudaMemGetInfo
+                    0.00%  29.491us        39     756ns     620ns  1.8670us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+                    0.00%  28.449us         8  3.5560us  2.7550us  5.7160us  cudaFuncGetAttributes
+                    0.00%  13.847us         1  13.847us  13.847us  13.847us  cudaDeviceSynchronize
+                    0.00%  6.0720us        12     506ns     338ns  1.3940us  cudaDeviceGetAttribute
+                    0.00%  3.5940us         3  1.1980us     850ns  1.8350us  cudaGetDevice
diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfiguration_1000.log
new file mode 100644
index 00000000..5a56d8cc
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfiguration_1000.log
@@ -0,0 +1,23 @@
+==9683== NVPROF is profiling process 9683, command: ./main
+==9683== Profiling application: ./main
+==9683== Profiling result:
+            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
+ GPU activities:   57.63%  349.62ms     10000  34.962us  1.8880us  75.808us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, double*, int*)
+                   22.08%  133.93ms     10000  13.392us  1.6960us  947.33us  _run_synapses_pre_push_spikes_push_kernel(unsigned int, unsigned int, unsigned int, int*)
+                    6.72%  40.747ms     10000  4.0740us  3.8720us  5.4080us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double, double*, double*, double*, bool*, float*)
+                    4.24%  25.697ms     10000  2.5690us  2.4640us  4.1920us  _run_synapses_pre_push_spikes_advance_kernel(void)
+                    3.74%  22.682ms     10000  2.2680us  2.0160us  4.0320us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*)
+                    3.10%  18.782ms     10000  1.8780us  1.7280us  3.9040us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*)
+                    2.39%  14.485ms     10000  1.4480us  1.2800us  3.9040us  _GLOBAL__N__69_tmpxft_00002413_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_97ebdcc0::_reset_neurongroup_thresholder_codeobject(int*)
+                    0.12%  732.29us         1  732.29us  732.29us  732.29us  void gen_sequenced<curandStateXORWOW, float2, normal_args_st, __operator_&__(float2 curand_normal_scaled2<curandStateXORWOW>(curandStateXORWOW*, normal_args_st))>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st)
+      API calls:   88.80%  701.01ms     70001  10.014us  8.3580us  9.2616ms  cudaLaunch
+                    7.49%  59.115ms    380005     155ns     121ns  308.33us  cudaSetupArgument
+                    1.87%  14.751ms     70001     210ns     163ns  298.58us  cudaConfigureCall
+                    1.77%  13.941ms     60002     232ns     183ns  295.21us  cudaGetLastError
+                    0.04%  282.56us         1  282.56us  282.56us  282.56us  cudaMalloc
+                    0.02%  148.30us         1  148.30us  148.30us  148.30us  cudaMemGetInfo
+                    0.01%  81.989us         1  81.989us  81.989us  81.989us  cudaDeviceSynchronize
+                    0.00%  30.446us         8  3.8050us  2.9540us  6.3180us  cudaFuncGetAttributes
+                    0.00%  27.544us        39     706ns     585ns  1.9970us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+                    0.00%  6.1470us        12     512ns     337ns  1.2730us  cudaDeviceGetAttribute
+                    0.00%  4.1110us         3  1.3700us     857ns  2.3570us  cudaGetDevice
diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/plots/speed_test_BrunelHakimModelHeterogeneousDelay_absolute.png b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/plots/speed_test_BrunelHakimModelHeterogeneousDelay_absolute.png
new file mode 100644
index 00000000..ac9bf015
Binary files /dev/null and b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/plots/speed_test_BrunelHakimModelHeterogeneousDelay_absolute.png differ
diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/plots/speed_test_BrunelHakimModelHeterogeneousDelay_profiling.png b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/plots/speed_test_BrunelHakimModelHeterogeneousDelay_profiling.png
new file mode 100644
index 00000000..e707e501
Binary files /dev/null and b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/plots/speed_test_BrunelHakimModelHeterogeneousDelay_profiling.png differ
diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/plots/speed_test_BrunelHakimModelHeterogeneousDelay_relative.png b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/plots/speed_test_BrunelHakimModelHeterogeneousDelay_relative.png
new file mode 100644
index 00000000..32f9e08a
Binary files /dev/null and b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/plots/speed_test_BrunelHakimModelHeterogeneousDelay_relative.png differ
diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/run_speed_test_script.py b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/run_speed_test_script.py
new file mode 100644
index 00000000..b525e975
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_in_heterogenous_delay_mode/run_speed_test_script.py
@@ -0,0 +1,285 @@
+import os
+import shutil
+import glob
+import subprocess
+import sys
+import socket
+
+# run tests without X-server
+import matplotlib
+matplotlib.use('Agg')
+
+# pretty plots
+import seaborn
+
+import time
+import datetime
+import cPickle as pickle
+
+from brian2 import *
+from brian2.tests.features import *
+from brian2.tests.features.base import *
+from brian2.tests.features.base import results
+
+import brian2cuda
+from brian2cuda.tests.features.cuda_configuration import (CUDAStandaloneConfiguration,
+                                                          CUDAStandaloneConfigurationNoAssert,
+                                                          CUDAStandaloneConfigurationExtraThresholdKernel,
+                                                          CUDAStandaloneConfigurationCurandDouble,
+                                                          CUDAStandaloneConfigurationNoCudaOccupancyAPI,
+                                                          CUDAStandaloneConfigurationNoCudaOccupancyAPIProfileCPU,
+                                                          CUDAStandaloneConfiguration2BlocksPerSM,
+                                                          CUDAStandaloneConfiguration2BlocksPerSMLaunchBounds,
+                                                          CUDAStandaloneConfigurationSynLaunchBounds,
+                                                          CUDAStandaloneConfiguration2BlocksPerSMSynLaunchBounds,
+                                                          CUDAStandaloneConfigurationProfileGPU,
+                                                          CUDAStandaloneConfigurationProfileCPU,
+                                                          CUDAStandaloneConfigurationTestBrunelHeteroAtomics,
+                                                          CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU,
+                                                          CUDAStandaloneConfigurationPushAtomicResize,
+                                                          CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize,
+                                                          CUDAStandaloneConfigurationPushAtomicResizeProfileCPU,
+                                                          CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpy,
+                                                          CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpyProfileCPU)
+from brian2cuda.tests.features.speed import *
+
+from brian2genn.correctness_testing import GeNNConfiguration, GeNNConfigurationCPU, GeNNConfigurationOptimized
+
+from create_readme import create_readme
+
+assert len(sys.argv)<= 2, 'Only one command line argument supported! Got {}'.format(len(sys.argv)-1)
+if len(sys.argv) == 2:
+    additional_dir_name = '_' + sys.argv[1]
+else:
+    additional_dir_name = ''
+
+prefs['devices.cpp_standalone.extra_make_args_unix'] = ['-j12']
+
+# host specific settings
+if socket.gethostname() == 'elnath':
+    prefs['devices.cpp_standalone.extra_make_args_unix'] = ['-j24']
+    prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35')
+    prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_20'])
+
+configs = [# configuration                          project_directory
+          #(NumpyConfiguration,                     None),
+          #(WeaveConfiguration,                     None),
+          #(LocalConfiguration,                     None),
+          #(CPPStandaloneConfiguration,              'cpp_standalone'),
+          #(CPPStandaloneConfigurationOpenMP,        'cpp_standalone'),
+          (CUDAStandaloneConfiguration,             'cuda_standalone'),
+          (CUDAStandaloneConfigurationPushAtomicResize,   'cuda_standalone'),
+          (CUDAStandaloneConfigurationTestBrunelHeteroAtomics,   'cuda_standalone'),
+          (CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize,     'cuda_standalone'),
+          #(CUDAStandaloneConfigurationExtraThresholdKernel,             'cuda_standalone'),
+          #(CUDAStandaloneConfigurationNoAssert,             'cuda_standalone'),
+          #(CUDAStandaloneConfigurationCurandDouble,              'cuda_standalone'),
+          #(CUDAStandaloneConfigurationNoCudaOccupancyAPI,      'cuda_standalone'),
+          #(CUDAStandaloneConfigurationNoCudaOccupancyAPIProfileCPU,    'cuda_standalone'),
+          #(CUDAStandaloneConfiguration2BlocksPerSM, 'cuda_standalone'),
+          #(CUDAStandaloneConfiguration2BlocksPerSMLaunchBounds, 'cuda_standalone'),
+          #(CUDAStandaloneConfigurationSynLaunchBounds,     'cuda_standalone'),
+          #(CUDAStandaloneConfiguration2BlocksPerSMSynLaunchBounds, 'cuda_standalone'),
+          #(CUDAStandaloneConfigurationProfileGPU,   'cuda_standalone'),
+          #(CUDAStandaloneConfigurationProfileCPU,   'cuda_standalone'),
+          #(CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU,   'cuda_standalone'),
+          #(CUDAStandaloneConfigurationPushAtomicResizeProfileCPU,   'cuda_standalone'),
+          #(CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpy,   'cuda_standalone'),
+          #(CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpyProfileCPU,   'cuda_standalone'),
+          #(GeNNConfiguration,                       'GeNNworkspace'),
+          #(GeNNConfigurationCPU,                    'GeNNworkspace'),
+          #(GeNNConfigurationOptimized,              'GeNNworkspace')
+          ]
+
+speed_tests = [# feature_test                     name                                  n_slice
+
+               #(ThresholderOnlyPoissonLowRate,                  'ThresholderOnlyPoissonLowRate',                slice(None)         ),
+               #(ThresholderOnlyPoissonMediumRate,               'ThresholderOnlyPoissonMediumRate',             slice(None)         ),
+               #(ThresholderOnlyPoissonHighRate,                 'ThresholderOnlyPoissonHighRate',               slice(None)         ),
+               #(ThresholderOnlyAlwaysSpiking,                   'ThresholderOnlyAlwaysSpiking',                 slice(None)         ),
+
+               #(BrunelHakimStateupdateOnlyDouble,               'BrunelHakimStateupdateOnlyDouble',             slice(None)         ),
+               #(BrunelHakimStateupdateOnlyTriple,               'BrunelHakimStateupdateOnlyTriple',             slice(None)         ),
+               #(BrunelHakimStateupdateOnly,                     'BrunelHakimStateupdateOnly',                   slice(None)         ),
+               #(BrunelHakimNeuronsOnly,                         'BrunelHakimNeuronsOnly',                       slice(None)         ),
+               #(BrunelHakimNeuronsOnlyNoXi,                     'BrunelHakimNeuronsOnlyNoXi',                   slice(None)         ),
+               #(BrunelHakimNeuronsOnlyNoRand,                   'BrunelHakimNeuronsOnlyNoRand',                 slice(None)         ),
+               #(BrunelHakimStateupdateThresholdOnly,            'BrunelHakimStateupdateThresholdOnly',          slice(None)         ),
+               #(BrunelHakimStateupdateThresholdResetOnly,       'BrunelHakimStateupdateThresholdResetOnly',     slice(None)         ),
+               #(BrunelHakimModelScalarDelayShort,               'BrunelHakimModelScalarDelayShort',             slice(None)         ),
+               #(BrunelHakimModelScalarDelayNoSelfConnections,   'BrunelHakimModelScalarDelayNoSelfConnections', slice(None)         ),
+               #(CUBA,                                           'CUBA',                                         slice(None)         ),
+               #(COBAHH,                                         'COBAHH',                                       slice(None)         ),
+               #(AdaptationOscillation,                          'AdaptationOscillation',                        slice(None)         ),
+               #(Vogels,                                         'Vogels',                                       slice(None)         ),
+               #(STDP,                                           'STDP',                                         slice(None)         ),
+               #(STDPEventDriven,                                'STDPEventDriven',                              slice(None)         ),
+               #(BrunelHakimModelScalarDelay,                    'BrunelHakimModelScalarDelay',                  slice(None)         ),
+
+               #(VerySparseMediumRateSynapsesOnly,               'VerySparseMediumRateSynapsesOnly',             slice(None)         ),
+               #(SparseMediumRateSynapsesOnly,                   'SparseMediumRateSynapsesOnly',                 slice(None)         ),
+               #(DenseMediumRateSynapsesOnly,                    'DenseMediumRateSynapsesOnly',                  slice(None)         ),
+               #(SparseLowRateSynapsesOnly,                      'SparseLowRateSynapsesOnly',                    slice(None)         ),
+               #(SparseHighRateSynapsesOnly,                     'SparseHighRateSynapsesOnly',                   slice(None)         ),
+
+               #(STDPNotEventDriven,                             'STDPNotEventDriven',                           slice(None)         ),
+               #(STDPMultiPost,                                  'STDPMultiPost',                                slice(None)         ),
+               #(STDPNeuronalTraces,                             'STDPNeuronalTraces',                           slice(None)         ),
+               #(STDPMultiPostNeuronalTraces,                    'STDPMultiPostNeuronalTraces',                  slice(None)         ),
+
+               (BrunelHakimModelHeterogeneousDelay,             'BrunelHakimModelHeterogeneousDelay',           slice(None)         ),
+
+               #(LinearNeuronsOnly,                              'LinearNeuronsOnly',                            slice(None)         ),
+               #(HHNeuronsOnly,                                  'HHNeuronsOnly',                                slice(None)         ),
+               #(VogelsWithSynapticDynamic,                      'VogelsWithSynapticDynamic',                    slice(None)         ),
+
+               ### below uses monitors
+               #(CUBAFixedConnectivity,                          'CUBAFixedConnectivity',                        slice(None)         ),
+               #(COBAHHFixedConnectivity,                        'COBAHHFixedConnectivity',                      slice(None, -1)     ),
+]
+
+configurations = [config[0] for config in configs]
+project_dirs = [config[1] for config in configs]
+
+# check if multiple Configurations with same project_dirs are specified
+last_idx = {}
+for proj_dir in project_dirs:
+    if proj_dir is not None:
+        first_i = project_dirs.index(proj_dir)
+        last_i = len(project_dirs) - 1 - project_dirs[::-1].index(proj_dir)
+        if first_i != last_i:
+            print("WARNING there are multiple configurations using {d} as project "
+                  "directory. Profiling and logfiles will only be saved for the last one {c}.".format(
+                  d=proj_dir, c=configurations[last_i].__name__))
+        last_idx[proj_dir] = last_i
+
+time_stemp = time.time()
+date_str = datetime.datetime.fromtimestamp(time_stemp).strftime('%Y_%m_%d')
+
+directory = 'results_{}{}'.format(date_str, additional_dir_name)
+if os.path.exists(directory):
+    new_dir = directory + '_bak_' + str(int(time.time()))
+    print("Directory with name `{}` already exists. Renaming it to `{}`.".format(directory, new_dir))
+    os.rename(directory, new_dir)
+os.makedirs(directory)
+data_dir = os.path.join(directory, 'data')
+plot_dir = os.path.join(directory, 'plots')
+log_dir = os.path.join(directory, 'logs')
+prof_dir = os.path.join(directory, 'nvprof')
+os.makedirs(data_dir)
+os.makedirs(plot_dir)
+os.makedirs(log_dir)
+os.makedirs(prof_dir)
+print("Saving results in {}.".format(plot_dir))
+
+shutil.copy(os.path.realpath(__file__), os.path.join(directory, 'run_speed_test_script.py'))
+
+time_format = '%d.%m.%Y at %H:%M:%S'
+script_start = datetime.datetime.fromtimestamp(time.time()).strftime(time_format)
+
+with open(os.path.join(directory, 'git.diff'), 'w') as diff_file:
+    subprocess.call(['git', 'diff'], stdout=diff_file)
+
+try:
+    for n, (st, name, sl) in enumerate(speed_tests):
+        start = datetime.datetime.fromtimestamp(time.time()).strftime(time_format)
+        print("Starting {} on {}.".format(name, start))
+        maximum_run_time = 1*60*60*second
+        res = run_speed_tests(configurations=configurations,
+                              speed_tests=[st],
+                              n_slice=sl,
+                              #n_slice=slice(0,1,None),
+                              run_twice=False,
+                              verbose=True,
+                              maximum_run_time=maximum_run_time#,
+                              ## this needs modification of brian2 code
+                              #profile_only_active=True 
+                              #profile_only_active=False
+                             )
+        end = datetime.datetime.fromtimestamp(time.time()).strftime(time_format)
+        diff = datetime.datetime.strptime(end, time_format) - datetime.datetime.strptime(start, time_format)
+        print("Running {} took {}.".format(name, diff))
+        res.plot_all_tests()
+        ## this needs modification of brian2 code
+        #res.plot_all_tests(print_relative=True)
+        savefig(os.path.join(plot_dir, 'speed_test_{}_absolute.png'.format(speed_tests[n][1])))
+        res.plot_all_tests(relative=True)
+        savefig(os.path.join(plot_dir, 'speed_test_{}_relative.png'.format(name)))
+        res.plot_all_tests(profiling_minimum=0.05)
+        savefig(os.path.join(plot_dir, 'speed_test_{}_profiling.png'.format(name)))
+
+        res.plot_all_tests()
+        ## this needs modification of brian2 code
+        #res.plot_all_tests(print_relative=True)
+        savefig(os.path.join(plot_dir, 'speed_test_{}_absolute.svg'.format(speed_tests[n][1])))
+        res.plot_all_tests(relative=True)
+        savefig(os.path.join(plot_dir, 'speed_test_{}_relative.svg'.format(name)))
+        res.plot_all_tests(profiling_minimum=0.05)
+        savefig(os.path.join(plot_dir, 'speed_test_{}_profiling.svg'.format(name)))
+
+        if 3 != len(get_fignums()):
+            print("WARNING: There were {} plots created, but only {} saved.".format(len(get_fignums()), 3*(n+1)))
+        for n in get_fignums():
+            close(n)
+
+        # pickle results object to disk
+        pkl_file = os.path.join(data_dir, name + '.pkl' )
+        with open(pkl_file, 'wb') as output:
+                pickle.dump(res, output, pickle.HIGHEST_PROTOCOL)
+
+        # save stdout log of last run (the others are deleted in run_speed_tests())
+        for proj_dir in set(project_dirs):
+            if not proj_dir is None and proj_dir in ['cuda_standalone', 'cpp_standalone']:
+                config = configurations[last_idx[proj_dir]]
+                stdout_file = os.path.join(proj_dir, 'results/stdout.txt')
+                if os.path.exists(stdout_file):
+                    shutil.copy(stdout_file,
+                                os.path.join(log_dir, 'stdout_{st}_{conf}_{n}.txt'.format(st=name, conf=proj_dir,
+                                                                                           n=st.n_range[sl][-1])))
+                else:
+                    print("WARNING Couldn't save {},file not found.".format(stdout_file))
+
+        # run nvprof on n_range[2]
+        for conf, proj_dir in zip(configurations, project_dirs):
+            main_arg = ''
+            if proj_dir in ['cuda_standalone', 'GeNNworkspace']:
+                if proj_dir == 'GeNNworkspace':
+                    main_arg = 'test {time} 1'.format(time=st.duration/second)
+                ns = st.n_range[sl]
+                idx = 2
+                max_runtime = 20
+                conf_name = conf.__name__
+                print("Rerunning {} with n = {} for nvprof profiling".format(conf_name, st.n_range[idx]))
+                tb, res, runtime, prof_info = results(conf, st, st.n_range[idx], maximum_run_time=maximum_run_time)
+                if not isinstance(res, Exception) and runtime < max_runtime:
+                    option = '--profile-from-start-off' if proj_dir == 'cuda_standalone' else ''
+                    cmd = 'cd {proj_dir} && nvprof {opt} --log-file ../{log_file} ./main {arg}'.format(
+                        proj_dir=proj_dir, arg=main_arg, opt=option,
+                        log_file=os.path.join(prof_dir, 'nvprof_{st}_{conf}_{n}.log'.format(
+                            st=name, conf=conf_name, n=st.n_range[idx])))
+                    prof_start = datetime.datetime.fromtimestamp(time.time()).strftime(time_format)
+                    print(cmd)
+                    x = os.system(cmd)
+                    if x:
+                        print('nvprof failed with {}'.format(x))
+                    prof_end = datetime.datetime.fromtimestamp(time.time()).strftime(time_format)
+                    prof_diff = datetime.datetime.strptime(prof_end, time_format) - datetime.datetime.strptime(prof_start, time_format)
+                    print("Profiling took {} for runtime of {}".format(prof_diff, runtime))
+finally:
+    create_readme(directory)
+    print("\nSummarized speed test results in {}".format(directory + '/README.md'))
+    script_end = datetime.datetime.fromtimestamp(time.time()).strftime(time_format)
+    script_diff = datetime.datetime.strptime(script_end, time_format) - datetime.datetime.strptime(script_start, time_format)
+    print("Finished speed test on {}. Total time = {}.".format(
+        datetime.datetime.fromtimestamp(time.time()).strftime(time_format), script_diff))
+
+
+##res.plot_all_tests(relative=True)
+#for n in get_fignums():
+#    plt.figure(n)
+#    savefig(plot_dir + '/speed_test_{}.png'.format(speed_tests[n-1][1]))
+
+## Debug (includes profiling infos)
+#from brian2.tests.features.base import results
+#for x in results(LocalConfiguration, LinearNeuronsOnly, 10, maximum_run_time=10*second):
+#    print x
diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/README.md b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/README.md
new file mode 100644
index 00000000..db395a43
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/README.md
@@ -0,0 +1,96 @@
+
+# Benchmark results from 28.11.2017
+## Description:
+
+
+
+## Last git log:
+```
+commit 8987de24ed9f4a3b1a276496407fca1087f04004
+Author: Denis Alevi <mail@denisalevi.de>
+Date:   Mon Nov 20 14:31:09 2017 +0100
+
+    Fix critical section to include the actual pushing
+
+```
+There is also a `git diff` saved in the current directory.
+
+## Results
+
+### BrunelHakimModelHeterogeneousDelay
+![](plots/speed_test_BrunelHakimModelHeterogeneousDelay_absolute.svg)
+![](plots/speed_test_BrunelHakimModelHeterogeneousDelay_profiling.svg)
+![](plots/speed_test_BrunelHakimModelHeterogeneousDelay_relative.svg)
+
+<details><summary>Exemplary `nvprof` results for **CUDAStandaloneConfigurationProfileCPU**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==22819== NVPROF is profiling process 22819, command: ./main
+==22819== Profiling application: ./main
+==22819== Profiling result:
+            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
+ GPU activities:   57.40%  374.75ms     10000  37.475us  2.2080us  87.617us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, double*, int*)
+                   22.46%  146.65ms     10000  14.664us  1.7280us  960.13us  _run_synapses_pre_push_spikes_push_kernel(unsigned int, unsigned int, unsigned int, int*)
+                    6.51%  42.496ms     10000  4.2490us  4.0640us  5.4400us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double, double*, double*, double*, bool*, float*)
+                    4.30%  28.046ms     10000  2.8040us  2.7200us  4.3840us  _run_synapses_pre_push_spikes_advance_kernel(void)
+                    3.74%  24.420ms     10000  2.4420us  2.1120us  2.9120us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*)
+                    3.10%  20.239ms     10000  2.0230us  1.9520us  2.2400us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*)
+                    2.38%  15.522ms     10000  1.5520us  1.4400us  1.9520us  _GLOBAL__N__69_tmpxft_0000578e_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_97ebdcc0::_reset_neurongroup_thresholder_codeobject(int*)
+                    0.11%  732.55us         1  732.55us  732.55us  732.55us  void gen_sequenced<curandStateXORWOW, float2, normal_args_st, __operator_&__(float2 curand_normal_scaled2<curandStateXORWOW>(curandStateXORWOW*, normal_args_st))>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st)
+      API calls:   48.78%  862.78ms     60001  14.379us  2.6750us  965.88us  cudaDeviceSynchronize
+                   45.73%  808.74ms     70001  11.553us  9.9030us  8.8572ms  cudaLaunch
+                    3.59%  63.553ms    380005     167ns     135ns  324.80us  cudaSetupArgument
+                    1.05%  18.499ms     70001     264ns     187ns  12.032us  cudaConfigureCall
+                    0.82%  14.507ms     60002     241ns     191ns  11.691us  cudaGetLastError
+                    0.02%  272.19us         1  272.19us  272.19us  272.19us  cudaMalloc
+                    0.01%  153.60us         1  153.60us  153.60us  153.60us  cudaMemGetInfo
+                    0.00%  30.182us         8  3.7720us  3.0160us  5.4880us  cudaFuncGetAttributes
+                    0.00%  28.610us        39     733ns     616ns  2.0690us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+                    0.00%  6.6020us        12     550ns     360ns  1.2870us  cudaDeviceGetAttribute
+                    0.00%  2.7290us         3     909ns     660ns  1.3620us  cudaGetDevice
+
+```
+
+</p></details>
+
+
+<details><summary>Exemplary `nvprof` results for **CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==23535== NVPROF is profiling process 23535, command: ./main
+==23535== Profiling application: ./main
+==23535== Profiling result:
+            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
+ GPU activities:   42.76%  145.65ms     10000  14.565us  1.9840us  1.1359ms  _run_synapses_pre_push_spikes_push_kernel(unsigned int, unsigned int, unsigned int, int*)
+                   18.94%  64.522ms     10000  6.4520us  3.6480us  8.1280us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, double*, int*)
+                   12.92%  43.998ms     10000  4.3990us  4.1280us  5.5360us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double, double*, double*, double*, bool*, float*)
+                    8.09%  27.540ms     10000  2.7530us  2.6880us  4.5120us  _run_synapses_pre_push_spikes_advance_kernel(void)
+                    6.73%  22.929ms     10000  2.2920us  2.0800us  2.8170us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*)
+                    5.80%  19.741ms     10000  1.9740us  1.8560us  2.1760us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*)
+                    4.55%  15.502ms     10000  1.5500us  1.4400us  2.1760us  _GLOBAL__N__69_tmpxft_00005a32_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_97ebdcc0::_reset_neurongroup_thresholder_codeobject(int*)
+                    0.21%  731.97us         1  731.97us  731.97us  731.97us  void gen_sequenced<curandStateXORWOW, float2, normal_args_st, __operator_&__(float2 curand_normal_scaled2<curandStateXORWOW>(curandStateXORWOW*, normal_args_st))>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st)
+      API calls:   55.46%  798.58ms     70001  11.408us  9.3530us  8.8769ms  cudaLaunch
+                   37.87%  545.25ms     60001  9.0870us  2.4500us  1.1408ms  cudaDeviceSynchronize
+                    4.14%  59.601ms    380005     156ns     125ns  312.80us  cudaSetupArgument
+                    1.35%  19.377ms     70001     276ns     185ns  13.408us  cudaConfigureCall
+                    1.16%  16.665ms     60002     277ns     203ns  11.227us  cudaGetLastError
+                    0.02%  277.76us         1  277.76us  277.76us  277.76us  cudaMalloc
+                    0.01%  156.74us         1  156.74us  156.74us  156.74us  cudaMemGetInfo
+                    0.00%  32.165us         8  4.0200us  2.8580us  7.6380us  cudaFuncGetAttributes
+                    0.00%  27.873us        39     714ns     603ns  1.7260us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+                    0.00%  6.6820us        12     556ns     373ns  1.4220us  cudaDeviceGetAttribute
+                    0.00%  2.8330us         3     944ns     675ns  1.4210us  cudaGetDevice
+
+```
+
+</p></details>
+
+
+***
+
+### BrunelHakimModelHeterogeneousDelay - display fewer kernels in profiling
+![](plots/speed_test_min_15_BrunelHakimModelHeterogeneousDelay_profiling.svg)
+
+
diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/data/BrunelHakimModelHeterogeneousDelay.pkl b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/data/BrunelHakimModelHeterogeneousDelay.pkl
new file mode 100644
index 00000000..a7ddffe9
Binary files /dev/null and b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/data/BrunelHakimModelHeterogeneousDelay.pkl differ
diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/git.diff b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/git.diff
new file mode 100644
index 00000000..44a84fa2
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/git.diff
@@ -0,0 +1,305 @@
+diff --git a/brian2cuda/tests/features/cuda_configuration.py b/brian2cuda/tests/features/cuda_configuration.py
+index 250687d..622f73a 100644
+--- a/brian2cuda/tests/features/cuda_configuration.py
++++ b/brian2cuda/tests/features/cuda_configuration.py
+@@ -225,7 +225,7 @@ class CUDAStandaloneConfigurationProfileCPU(Configuration):
+                             with_output=False)
+ 
+ class CUDAStandaloneConfigurationTestBrunelHeteroAtomics(Configuration):
+-    name = 'CUDA standalone with atomics in heterog delay mode'
++    name = 'CUDA standalone with atomics in effect application'
+     def before_run(self):
+         brian2.set_device('cuda_standalone', build_on_run=False)
+         prefs["devices.cuda_standalone.test_brunel_hetero_atomics"] = True
+@@ -248,7 +248,7 @@ class CUDAStandaloneConfigurationTestBrunelHeteroAtomics(Configuration):
+                             with_output=False)
+ 
+ class CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU(Configuration):
+-    name = "CUDA standalone with atomics in heterog delay mode (profile='blocking')"
++    name = "CUDA standalone with atomics in effect application (profile='blocking')"
+     def before_run(self):
+         brian2.set_device('cuda_standalone', build_on_run=False, profile='blocking')
+         prefs["devices.cuda_standalone.test_brunel_hetero_atomics"] = True
+@@ -270,12 +270,10 @@ class CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU(Configuration
+         brian2.device.build(directory='cuda_standalone', compile=True, run=True,
+                             with_output=False)
+ 
+-
+ class CUDAStandaloneConfigurationPushAtomicResize(Configuration):
+-    name = "CUDA standalone with atomics in queue resize"
++    name = "CUDA standalone with atomics in spikequeue resize"
+     def before_run(self):
+         brian2.set_device('cuda_standalone', build_on_run=False)
+-        prefs["devices.cuda_standalone.test_brunel_hetero_atomics"] = True
+         prefs["devices.cuda_standalone.push_atomic_resize"] = True
+         if socket.gethostname() == 'elnath':
+             if prefs['devices.cpp_standalone.extra_make_args_unix'] == ['-j12']:
+@@ -295,8 +293,82 @@ class CUDAStandaloneConfigurationPushAtomicResize(Configuration):
+         brian2.device.build(directory='cuda_standalone', compile=True, run=True,
+                             with_output=False)
+ 
++
+ class CUDAStandaloneConfigurationPushAtomicResizeProfileCPU(Configuration):
+-    name = "CUDA standalone with atomics in queue resize (profile='blocking')"
++    name = "CUDA standalone with atomics in spikequeue resize (profile='blocking')"
++    def before_run(self):
++        brian2.set_device('cuda_standalone', build_on_run=False, profile='blocking')
++        prefs["devices.cuda_standalone.push_atomic_resize"] = True
++        if socket.gethostname() == 'elnath':
++            if prefs['devices.cpp_standalone.extra_make_args_unix'] == ['-j12']:
++                prefs['devices.cpp_standalone.extra_make_args_unix'] = ['-j24']
++            prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35')
++            prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_20'])
++        elif socket.gethostname() == 'sabik':
++            prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35')
++            prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_52'])
++        elif socket.gethostname() == 'eltanin':
++            prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35')
++            prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_61'])
++
++    def after_run(self):
++        if os.path.exists('cuda_standalone'):
++            shutil.rmtree('cuda_standalone')
++        brian2.device.build(directory='cuda_standalone', compile=True, run=True,
++                            with_output=False)
++
++
++class CUDAStandaloneConfigurationPushAtomicResizProfileCPU(Configuration):
++    name = "CUDA standalone with atomics in spikequeue resize (profile='blocking)"
++    def before_run(self):
++        brian2.set_device('cuda_standalone', build_on_run=False,
++                          profile='blocking')
++        prefs["devices.cuda_standalone.push_atomic_resize"] = True
++        if socket.gethostname() == 'elnath':
++            if prefs['devices.cpp_standalone.extra_make_args_unix'] == ['-j12']:
++                prefs['devices.cpp_standalone.extra_make_args_unix'] = ['-j24']
++            prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35')
++            prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_20'])
++        elif socket.gethostname() == 'sabik':
++            prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35')
++            prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_52'])
++        elif socket.gethostname() == 'eltanin':
++            prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35')
++            prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_61'])
++
++    def after_run(self):
++        if os.path.exists('cuda_standalone'):
++            shutil.rmtree('cuda_standalone')
++        brian2.device.build(directory='cuda_standalone', compile=True, run=True,
++                            with_output=False)
++
++
++class CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize(Configuration):
++    name = "CUDA standalone with atomics in effect application and in spikequeue resize"
++    def before_run(self):
++        brian2.set_device('cuda_standalone', build_on_run=False)
++        prefs["devices.cuda_standalone.test_brunel_hetero_atomics"] = True
++        prefs["devices.cuda_standalone.push_atomic_resize"] = True
++        if socket.gethostname() == 'elnath':
++            if prefs['devices.cpp_standalone.extra_make_args_unix'] == ['-j12']:
++                prefs['devices.cpp_standalone.extra_make_args_unix'] = ['-j24']
++            prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35')
++            prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_20'])
++        elif socket.gethostname() == 'sabik':
++            prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35')
++            prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_52'])
++        elif socket.gethostname() == 'eltanin':
++            prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35')
++            prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_61'])
++
++    def after_run(self):
++        if os.path.exists('cuda_standalone'):
++            shutil.rmtree('cuda_standalone')
++        brian2.device.build(directory='cuda_standalone', compile=True, run=True,
++                            with_output=False)
++
++class CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResizeProfileCPU(Configuration):
++    name = "CUDA standalone with atomics in effect application and in spikequeue resize (profile='blocking')"
+     def before_run(self):
+         brian2.set_device('cuda_standalone', build_on_run=False, profile='blocking')
+         prefs["devices.cuda_standalone.test_brunel_hetero_atomics"] = True
+diff --git a/dev/benchmarks/run_speed_tests.py b/dev/benchmarks/run_speed_tests.py
+index 2518634..b525e97 100644
+--- a/dev/benchmarks/run_speed_tests.py
++++ b/dev/benchmarks/run_speed_tests.py
+@@ -37,6 +37,7 @@ from brian2cuda.tests.features.cuda_configuration import (CUDAStandaloneConfigur
+                                                           CUDAStandaloneConfigurationTestBrunelHeteroAtomics,
+                                                           CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU,
+                                                           CUDAStandaloneConfigurationPushAtomicResize,
++                                                          CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize,
+                                                           CUDAStandaloneConfigurationPushAtomicResizeProfileCPU,
+                                                           CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpy,
+                                                           CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpyProfileCPU)
+@@ -61,79 +62,80 @@ if socket.gethostname() == 'elnath':
+     prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_20'])
+ 
+ configs = [# configuration                          project_directory
+-          (NumpyConfiguration,                     None),
+-          (WeaveConfiguration,                     None),
+-          (LocalConfiguration,                     None),
++          #(NumpyConfiguration,                     None),
++          #(WeaveConfiguration,                     None),
++          #(LocalConfiguration,                     None),
++          #(CPPStandaloneConfiguration,              'cpp_standalone'),
++          #(CPPStandaloneConfigurationOpenMP,        'cpp_standalone'),
+           (CUDAStandaloneConfiguration,             'cuda_standalone'),
+-          (CUDAStandaloneConfigurationExtraThresholdKernel,             'cuda_standalone'),
+-          (CUDAStandaloneConfigurationNoAssert,             'cuda_standalone'),
+-          (CUDAStandaloneConfigurationCurandDouble,              'cuda_standalone'),
+-          (CUDAStandaloneConfigurationNoCudaOccupancyAPI,      'cuda_standalone'),
+-          (CUDAStandaloneConfigurationNoCudaOccupancyAPIProfileCPU,    'cuda_standalone'),
+-          (CUDAStandaloneConfiguration2BlocksPerSM, 'cuda_standalone'),
+-          (CUDAStandaloneConfiguration2BlocksPerSMLaunchBounds, 'cuda_standalone'),
+-          (CUDAStandaloneConfigurationSynLaunchBounds,     'cuda_standalone'),
+-          (CUDAStandaloneConfiguration2BlocksPerSMSynLaunchBounds, 'cuda_standalone'),
+-          (CUDAStandaloneConfigurationProfileGPU,   'cuda_standalone'),
+-          (CUDAStandaloneConfigurationProfileCPU,   'cuda_standalone'),
+-          (CPPStandaloneConfiguration,              'cpp_standalone'),
+-          (CUDAStandaloneConfigurationTestBrunelHeteroAtomics,   'cuda_standalone'),
+-          (CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU,   'cuda_standalone'),
+           (CUDAStandaloneConfigurationPushAtomicResize,   'cuda_standalone'),
+-          (CUDAStandaloneConfigurationPushAtomicResizeProfileCPU,   'cuda_standalone'),
+-          (CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpy,   'cuda_standalone'),
+-          (CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpyProfileCPU,   'cuda_standalone'),
+-          (GeNNConfiguration,                       'GeNNworkspace'),
+-          (CPPStandaloneConfigurationOpenMP,        'cpp_standalone'),
+-          (GeNNConfigurationCPU,                    'GeNNworkspace'),
+-          (GeNNConfigurationOptimized,              'GeNNworkspace')
++          (CUDAStandaloneConfigurationTestBrunelHeteroAtomics,   'cuda_standalone'),
++          (CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize,     'cuda_standalone'),
++          #(CUDAStandaloneConfigurationExtraThresholdKernel,             'cuda_standalone'),
++          #(CUDAStandaloneConfigurationNoAssert,             'cuda_standalone'),
++          #(CUDAStandaloneConfigurationCurandDouble,              'cuda_standalone'),
++          #(CUDAStandaloneConfigurationNoCudaOccupancyAPI,      'cuda_standalone'),
++          #(CUDAStandaloneConfigurationNoCudaOccupancyAPIProfileCPU,    'cuda_standalone'),
++          #(CUDAStandaloneConfiguration2BlocksPerSM, 'cuda_standalone'),
++          #(CUDAStandaloneConfiguration2BlocksPerSMLaunchBounds, 'cuda_standalone'),
++          #(CUDAStandaloneConfigurationSynLaunchBounds,     'cuda_standalone'),
++          #(CUDAStandaloneConfiguration2BlocksPerSMSynLaunchBounds, 'cuda_standalone'),
++          #(CUDAStandaloneConfigurationProfileGPU,   'cuda_standalone'),
++          #(CUDAStandaloneConfigurationProfileCPU,   'cuda_standalone'),
++          #(CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU,   'cuda_standalone'),
++          #(CUDAStandaloneConfigurationPushAtomicResizeProfileCPU,   'cuda_standalone'),
++          #(CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpy,   'cuda_standalone'),
++          #(CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpyProfileCPU,   'cuda_standalone'),
++          #(GeNNConfiguration,                       'GeNNworkspace'),
++          #(GeNNConfigurationCPU,                    'GeNNworkspace'),
++          #(GeNNConfigurationOptimized,              'GeNNworkspace')
+           ]
+ 
+ speed_tests = [# feature_test                     name                                  n_slice
+ 
+-               (ThresholderOnlyPoissonLowRate,                  'ThresholderOnlyPoissonLowRate',                slice(None)         ),
+-               (ThresholderOnlyPoissonMediumRate,               'ThresholderOnlyPoissonMediumRate',             slice(None)         ),
+-               (ThresholderOnlyPoissonHighRate,                 'ThresholderOnlyPoissonHighRate',               slice(None)         ),
+-               (ThresholderOnlyAlwaysSpiking,                   'ThresholderOnlyAlwaysSpiking',                 slice(None)         ),
+-
+-               (BrunelHakimStateupdateOnlyDouble,               'BrunelHakimStateupdateOnlyDouble',             slice(None)         ),
+-               (BrunelHakimStateupdateOnlyTriple,               'BrunelHakimStateupdateOnlyTriple',             slice(None)         ),
+-               (BrunelHakimStateupdateOnly,                     'BrunelHakimStateupdateOnly',                   slice(None)         ),
+-               (BrunelHakimNeuronsOnly,                         'BrunelHakimNeuronsOnly',                       slice(None)         ),
+-               (BrunelHakimNeuronsOnlyNoXi,                     'BrunelHakimNeuronsOnlyNoXi',                   slice(None)         ),
+-               (BrunelHakimNeuronsOnlyNoRand,                   'BrunelHakimNeuronsOnlyNoRand',                 slice(None)         ),
+-               (BrunelHakimStateupdateThresholdOnly,            'BrunelHakimStateupdateThresholdOnly',          slice(None)         ),
+-               (BrunelHakimStateupdateThresholdResetOnly,       'BrunelHakimStateupdateThresholdResetOnly',     slice(None)         ),
+-               (BrunelHakimModelScalarDelayShort,               'BrunelHakimModelScalarDelayShort',             slice(None)         ),
+-               (BrunelHakimModelScalarDelayNoSelfConnections,   'BrunelHakimModelScalarDelayNoSelfConnections', slice(None)         ),
+-               (CUBA,                                           'CUBA',                                         slice(None)         ),
+-               (COBAHH,                                         'COBAHH',                                       slice(None)         ),
+-               (AdaptationOscillation,                          'AdaptationOscillation',                        slice(None)         ),
+-               (Vogels,                                         'Vogels',                                       slice(None)         ),
+-               (STDP,                                           'STDP',                                         slice(None)         ),
+-               (STDPEventDriven,                                'STDPEventDriven',                              slice(None)         ),
+-               (BrunelHakimModelScalarDelay,                    'BrunelHakimModelScalarDelay',                  slice(None)         ),
+-
+-               (VerySparseMediumRateSynapsesOnly,               'VerySparseMediumRateSynapsesOnly',             slice(None)         ),
+-               (SparseMediumRateSynapsesOnly,                   'SparseMediumRateSynapsesOnly',                 slice(None)         ),
+-               (DenseMediumRateSynapsesOnly,                    'DenseMediumRateSynapsesOnly',                  slice(None)         ),
+-               (SparseLowRateSynapsesOnly,                      'SparseLowRateSynapsesOnly',                    slice(None)         ),
+-               (SparseHighRateSynapsesOnly,                     'SparseHighRateSynapsesOnly',                   slice(None)         ),
+-
+-               (STDPNotEventDriven,                             'STDPNotEventDriven',                           slice(None)         ),
+-               (STDPMultiPost,                                  'STDPMultiPost',                                slice(None)         ),
+-               (STDPNeuronalTraces,                             'STDPNeuronalTraces',                           slice(None)         ),
+-               (STDPMultiPostNeuronalTraces,                    'STDPMultiPostNeuronalTraces',                  slice(None)         ),
++               #(ThresholderOnlyPoissonLowRate,                  'ThresholderOnlyPoissonLowRate',                slice(None)         ),
++               #(ThresholderOnlyPoissonMediumRate,               'ThresholderOnlyPoissonMediumRate',             slice(None)         ),
++               #(ThresholderOnlyPoissonHighRate,                 'ThresholderOnlyPoissonHighRate',               slice(None)         ),
++               #(ThresholderOnlyAlwaysSpiking,                   'ThresholderOnlyAlwaysSpiking',                 slice(None)         ),
++
++               #(BrunelHakimStateupdateOnlyDouble,               'BrunelHakimStateupdateOnlyDouble',             slice(None)         ),
++               #(BrunelHakimStateupdateOnlyTriple,               'BrunelHakimStateupdateOnlyTriple',             slice(None)         ),
++               #(BrunelHakimStateupdateOnly,                     'BrunelHakimStateupdateOnly',                   slice(None)         ),
++               #(BrunelHakimNeuronsOnly,                         'BrunelHakimNeuronsOnly',                       slice(None)         ),
++               #(BrunelHakimNeuronsOnlyNoXi,                     'BrunelHakimNeuronsOnlyNoXi',                   slice(None)         ),
++               #(BrunelHakimNeuronsOnlyNoRand,                   'BrunelHakimNeuronsOnlyNoRand',                 slice(None)         ),
++               #(BrunelHakimStateupdateThresholdOnly,            'BrunelHakimStateupdateThresholdOnly',          slice(None)         ),
++               #(BrunelHakimStateupdateThresholdResetOnly,       'BrunelHakimStateupdateThresholdResetOnly',     slice(None)         ),
++               #(BrunelHakimModelScalarDelayShort,               'BrunelHakimModelScalarDelayShort',             slice(None)         ),
++               #(BrunelHakimModelScalarDelayNoSelfConnections,   'BrunelHakimModelScalarDelayNoSelfConnections', slice(None)         ),
++               #(CUBA,                                           'CUBA',                                         slice(None)         ),
++               #(COBAHH,                                         'COBAHH',                                       slice(None)         ),
++               #(AdaptationOscillation,                          'AdaptationOscillation',                        slice(None)         ),
++               #(Vogels,                                         'Vogels',                                       slice(None)         ),
++               #(STDP,                                           'STDP',                                         slice(None)         ),
++               #(STDPEventDriven,                                'STDPEventDriven',                              slice(None)         ),
++               #(BrunelHakimModelScalarDelay,                    'BrunelHakimModelScalarDelay',                  slice(None)         ),
++
++               #(VerySparseMediumRateSynapsesOnly,               'VerySparseMediumRateSynapsesOnly',             slice(None)         ),
++               #(SparseMediumRateSynapsesOnly,                   'SparseMediumRateSynapsesOnly',                 slice(None)         ),
++               #(DenseMediumRateSynapsesOnly,                    'DenseMediumRateSynapsesOnly',                  slice(None)         ),
++               #(SparseLowRateSynapsesOnly,                      'SparseLowRateSynapsesOnly',                    slice(None)         ),
++               #(SparseHighRateSynapsesOnly,                     'SparseHighRateSynapsesOnly',                   slice(None)         ),
++
++               #(STDPNotEventDriven,                             'STDPNotEventDriven',                           slice(None)         ),
++               #(STDPMultiPost,                                  'STDPMultiPost',                                slice(None)         ),
++               #(STDPNeuronalTraces,                             'STDPNeuronalTraces',                           slice(None)         ),
++               #(STDPMultiPostNeuronalTraces,                    'STDPMultiPostNeuronalTraces',                  slice(None)         ),
+ 
+                (BrunelHakimModelHeterogeneousDelay,             'BrunelHakimModelHeterogeneousDelay',           slice(None)         ),
+ 
+-               (LinearNeuronsOnly,                              'LinearNeuronsOnly',                            slice(None)         ),
+-               (HHNeuronsOnly,                                  'HHNeuronsOnly',                                slice(None)         ),
+-               (VogelsWithSynapticDynamic,                      'VogelsWithSynapticDynamic',                    slice(None)         ),
++               #(LinearNeuronsOnly,                              'LinearNeuronsOnly',                            slice(None)         ),
++               #(HHNeuronsOnly,                                  'HHNeuronsOnly',                                slice(None)         ),
++               #(VogelsWithSynapticDynamic,                      'VogelsWithSynapticDynamic',                    slice(None)         ),
+ 
+-               ## below uses monitors
+-               (CUBAFixedConnectivity,                          'CUBAFixedConnectivity',                        slice(None)         ),
+-               (COBAHHFixedConnectivity,                        'COBAHHFixedConnectivity',                      slice(None, -1)     ),
++               ### below uses monitors
++               #(CUBAFixedConnectivity,                          'CUBAFixedConnectivity',                        slice(None)         ),
++               #(COBAHHFixedConnectivity,                        'COBAHHFixedConnectivity',                      slice(None, -1)     ),
+ ]
+ 
+ configurations = [config[0] for config in configs]
+@@ -205,6 +207,16 @@ try:
+         savefig(os.path.join(plot_dir, 'speed_test_{}_relative.png'.format(name)))
+         res.plot_all_tests(profiling_minimum=0.05)
+         savefig(os.path.join(plot_dir, 'speed_test_{}_profiling.png'.format(name)))
++
++        res.plot_all_tests()
++        ## this needs modification of brian2 code
++        #res.plot_all_tests(print_relative=True)
++        savefig(os.path.join(plot_dir, 'speed_test_{}_absolute.svg'.format(speed_tests[n][1])))
++        res.plot_all_tests(relative=True)
++        savefig(os.path.join(plot_dir, 'speed_test_{}_relative.svg'.format(name)))
++        res.plot_all_tests(profiling_minimum=0.05)
++        savefig(os.path.join(plot_dir, 'speed_test_{}_profiling.svg'.format(name)))
++
+         if 3 != len(get_fignums()):
+             print("WARNING: There were {} plots created, but only {} saved.".format(len(get_fignums()), 3*(n+1)))
+         for n in get_fignums():
+diff --git a/frozen_repos/brian2 b/frozen_repos/brian2
+--- a/frozen_repos/brian2
++++ b/frozen_repos/brian2
+@@ -1 +1 @@
+-Subproject commit fadc6a0aeb90d1b4d343470628457d8561536f67
++Subproject commit fadc6a0aeb90d1b4d343470628457d8561536f67-dirty
+diff --git a/frozen_repos/brian2genn b/frozen_repos/brian2genn
+--- a/frozen_repos/brian2genn
++++ b/frozen_repos/brian2genn
+@@ -1 +1 @@
+-Subproject commit 0553cafeab49ea5403c0230411035df504d4db06
++Subproject commit 0553cafeab49ea5403c0230411035df504d4db06-dirty
diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/logs/stdout_BrunelHakimModelHeterogeneousDelay_cuda_standalone_100000.txt b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/logs/stdout_BrunelHakimModelHeterogeneousDelay_cuda_standalone_100000.txt
new file mode 100644
index 00000000..e69de29b
diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfigurationProfileCPU_1000.log b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfigurationProfileCPU_1000.log
new file mode 100644
index 00000000..a7caa3d5
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfigurationProfileCPU_1000.log
@@ -0,0 +1,23 @@
+==22819== NVPROF is profiling process 22819, command: ./main
+==22819== Profiling application: ./main
+==22819== Profiling result:
+            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
+ GPU activities:   57.40%  374.75ms     10000  37.475us  2.2080us  87.617us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, double*, int*)
+                   22.46%  146.65ms     10000  14.664us  1.7280us  960.13us  _run_synapses_pre_push_spikes_push_kernel(unsigned int, unsigned int, unsigned int, int*)
+                    6.51%  42.496ms     10000  4.2490us  4.0640us  5.4400us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double, double*, double*, double*, bool*, float*)
+                    4.30%  28.046ms     10000  2.8040us  2.7200us  4.3840us  _run_synapses_pre_push_spikes_advance_kernel(void)
+                    3.74%  24.420ms     10000  2.4420us  2.1120us  2.9120us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*)
+                    3.10%  20.239ms     10000  2.0230us  1.9520us  2.2400us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*)
+                    2.38%  15.522ms     10000  1.5520us  1.4400us  1.9520us  _GLOBAL__N__69_tmpxft_0000578e_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_97ebdcc0::_reset_neurongroup_thresholder_codeobject(int*)
+                    0.11%  732.55us         1  732.55us  732.55us  732.55us  void gen_sequenced<curandStateXORWOW, float2, normal_args_st, __operator_&__(float2 curand_normal_scaled2<curandStateXORWOW>(curandStateXORWOW*, normal_args_st))>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st)
+      API calls:   48.78%  862.78ms     60001  14.379us  2.6750us  965.88us  cudaDeviceSynchronize
+                   45.73%  808.74ms     70001  11.553us  9.9030us  8.8572ms  cudaLaunch
+                    3.59%  63.553ms    380005     167ns     135ns  324.80us  cudaSetupArgument
+                    1.05%  18.499ms     70001     264ns     187ns  12.032us  cudaConfigureCall
+                    0.82%  14.507ms     60002     241ns     191ns  11.691us  cudaGetLastError
+                    0.02%  272.19us         1  272.19us  272.19us  272.19us  cudaMalloc
+                    0.01%  153.60us         1  153.60us  153.60us  153.60us  cudaMemGetInfo
+                    0.00%  30.182us         8  3.7720us  3.0160us  5.4880us  cudaFuncGetAttributes
+                    0.00%  28.610us        39     733ns     616ns  2.0690us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+                    0.00%  6.6020us        12     550ns     360ns  1.2870us  cudaDeviceGetAttribute
+                    0.00%  2.7290us         3     909ns     660ns  1.3620us  cudaGetDevice
diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU_1000.log b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU_1000.log
new file mode 100644
index 00000000..3937cc58
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU_1000.log
@@ -0,0 +1,23 @@
+==23535== NVPROF is profiling process 23535, command: ./main
+==23535== Profiling application: ./main
+==23535== Profiling result:
+            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
+ GPU activities:   42.76%  145.65ms     10000  14.565us  1.9840us  1.1359ms  _run_synapses_pre_push_spikes_push_kernel(unsigned int, unsigned int, unsigned int, int*)
+                   18.94%  64.522ms     10000  6.4520us  3.6480us  8.1280us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, double*, int*)
+                   12.92%  43.998ms     10000  4.3990us  4.1280us  5.5360us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double, double*, double*, double*, bool*, float*)
+                    8.09%  27.540ms     10000  2.7530us  2.6880us  4.5120us  _run_synapses_pre_push_spikes_advance_kernel(void)
+                    6.73%  22.929ms     10000  2.2920us  2.0800us  2.8170us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*)
+                    5.80%  19.741ms     10000  1.9740us  1.8560us  2.1760us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*)
+                    4.55%  15.502ms     10000  1.5500us  1.4400us  2.1760us  _GLOBAL__N__69_tmpxft_00005a32_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_97ebdcc0::_reset_neurongroup_thresholder_codeobject(int*)
+                    0.21%  731.97us         1  731.97us  731.97us  731.97us  void gen_sequenced<curandStateXORWOW, float2, normal_args_st, __operator_&__(float2 curand_normal_scaled2<curandStateXORWOW>(curandStateXORWOW*, normal_args_st))>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st)
+      API calls:   55.46%  798.58ms     70001  11.408us  9.3530us  8.8769ms  cudaLaunch
+                   37.87%  545.25ms     60001  9.0870us  2.4500us  1.1408ms  cudaDeviceSynchronize
+                    4.14%  59.601ms    380005     156ns     125ns  312.80us  cudaSetupArgument
+                    1.35%  19.377ms     70001     276ns     185ns  13.408us  cudaConfigureCall
+                    1.16%  16.665ms     60002     277ns     203ns  11.227us  cudaGetLastError
+                    0.02%  277.76us         1  277.76us  277.76us  277.76us  cudaMalloc
+                    0.01%  156.74us         1  156.74us  156.74us  156.74us  cudaMemGetInfo
+                    0.00%  32.165us         8  4.0200us  2.8580us  7.6380us  cudaFuncGetAttributes
+                    0.00%  27.873us        39     714ns     603ns  1.7260us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+                    0.00%  6.6820us        12     556ns     373ns  1.4220us  cudaDeviceGetAttribute
+                    0.00%  2.8330us         3     944ns     675ns  1.4210us  cudaGetDevice
diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/plots/speed_test_BrunelHakimModelHeterogeneousDelay_absolute.png b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/plots/speed_test_BrunelHakimModelHeterogeneousDelay_absolute.png
new file mode 100644
index 00000000..0631b80b
Binary files /dev/null and b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/plots/speed_test_BrunelHakimModelHeterogeneousDelay_absolute.png differ
diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/plots/speed_test_BrunelHakimModelHeterogeneousDelay_profiling.png b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/plots/speed_test_BrunelHakimModelHeterogeneousDelay_profiling.png
new file mode 100644
index 00000000..41b4424e
Binary files /dev/null and b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/plots/speed_test_BrunelHakimModelHeterogeneousDelay_profiling.png differ
diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/plots/speed_test_BrunelHakimModelHeterogeneousDelay_relative.png b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/plots/speed_test_BrunelHakimModelHeterogeneousDelay_relative.png
new file mode 100644
index 00000000..00de07cb
Binary files /dev/null and b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/plots/speed_test_BrunelHakimModelHeterogeneousDelay_relative.png differ
diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/plots/speed_test_min_15_BrunelHakimModelHeterogeneousDelay_profiling.png b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/plots/speed_test_min_15_BrunelHakimModelHeterogeneousDelay_profiling.png
new file mode 100644
index 00000000..f78a4f2a
Binary files /dev/null and b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/plots/speed_test_min_15_BrunelHakimModelHeterogeneousDelay_profiling.png differ
diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/run_speed_test_script.py b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/run_speed_test_script.py
new file mode 100644
index 00000000..f2e265e8
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_atomics_original_and_atomics_effects_profiled/run_speed_test_script.py
@@ -0,0 +1,291 @@
+import os
+import shutil
+import glob
+import subprocess
+import sys
+import socket
+
+# run tests without X-server
+import matplotlib
+matplotlib.use('Agg')
+
+# pretty plots
+import seaborn
+
+import time
+import datetime
+import cPickle as pickle
+
+from brian2 import *
+from brian2.tests.features import *
+from brian2.tests.features.base import *
+from brian2.tests.features.base import results
+
+import brian2cuda
+from brian2cuda.tests.features.cuda_configuration import (CUDAStandaloneConfiguration,
+                                                          CUDAStandaloneConfigurationNoAssert,
+                                                          CUDAStandaloneConfigurationExtraThresholdKernel,
+                                                          CUDAStandaloneConfigurationCurandDouble,
+                                                          CUDAStandaloneConfigurationNoCudaOccupancyAPI,
+                                                          CUDAStandaloneConfigurationNoCudaOccupancyAPIProfileCPU,
+                                                          CUDAStandaloneConfiguration2BlocksPerSM,
+                                                          CUDAStandaloneConfiguration2BlocksPerSMLaunchBounds,
+                                                          CUDAStandaloneConfigurationSynLaunchBounds,
+                                                          CUDAStandaloneConfiguration2BlocksPerSMSynLaunchBounds,
+                                                          CUDAStandaloneConfigurationProfileGPU,
+                                                          CUDAStandaloneConfigurationProfileCPU,
+                                                          CUDAStandaloneConfigurationTestBrunelHeteroAtomics,
+                                                          CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU,
+                                                          CUDAStandaloneConfigurationPushAtomicResize,
+                                                          CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize,
+                                                          CUDAStandaloneConfigurationPushAtomicResizeProfileCPU,
+                                                          CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResizeProfileCPU,
+                                                          CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpy,
+                                                          CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpyProfileCPU)
+from brian2cuda.tests.features.speed import *
+
+from brian2genn.correctness_testing import GeNNConfiguration, GeNNConfigurationCPU, GeNNConfigurationOptimized
+
+from create_readme import create_readme
+
+# Optional single argument: a suffix for the results directory name.
+if len(sys.argv) > 2:
+    # Raise explicitly; an `assert` would be stripped when running with -O.
+    raise ValueError('Only one command line argument supported! Got {}'.format(len(sys.argv)-1))
+additional_dir_name = '_' + sys.argv[1] if len(sys.argv) == 2 else ''
+
+prefs['devices.cpp_standalone.extra_make_args_unix'] = ['-j12']
+
+# host specific settings
+if socket.gethostname() == 'elnath':
+    prefs['devices.cpp_standalone.extra_make_args_unix'] = ['-j24']
+    prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35')
+    prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_20'])
+
+configs = [# configuration                          project_directory
+          #(NumpyConfiguration,                     None),
+          #(WeaveConfiguration,                     None),
+          #(LocalConfiguration,                     None),
+          #(CPPStandaloneConfiguration,              'cpp_standalone'),
+          #(CPPStandaloneConfigurationOpenMP,        'cpp_standalone'),
+          #(CUDAStandaloneConfiguration,             'cuda_standalone'),
+          #(CUDAStandaloneConfigurationPushAtomicResize,   'cuda_standalone'),
+          #(CUDAStandaloneConfigurationTestBrunelHeteroAtomics,   'cuda_standalone'),
+          #(CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize,     'cuda_standalone'),
+          #(CUDAStandaloneConfigurationExtraThresholdKernel,             'cuda_standalone'),
+          #(CUDAStandaloneConfigurationNoAssert,             'cuda_standalone'),
+          #(CUDAStandaloneConfigurationCurandDouble,              'cuda_standalone'),
+          #(CUDAStandaloneConfigurationNoCudaOccupancyAPI,      'cuda_standalone'),
+          #(CUDAStandaloneConfigurationNoCudaOccupancyAPIProfileCPU,    'cuda_standalone'),
+          #(CUDAStandaloneConfiguration2BlocksPerSM, 'cuda_standalone'),
+          #(CUDAStandaloneConfiguration2BlocksPerSMLaunchBounds, 'cuda_standalone'),
+          #(CUDAStandaloneConfigurationSynLaunchBounds,     'cuda_standalone'),
+          #(CUDAStandaloneConfiguration2BlocksPerSMSynLaunchBounds, 'cuda_standalone'),
+          #(CUDAStandaloneConfigurationProfileGPU,   'cuda_standalone'),
+          (CUDAStandaloneConfigurationProfileCPU,   'cuda_standalone'),
+          (CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU,   'cuda_standalone'),
+          #(CUDAStandaloneConfigurationPushAtomicResizeProfileCPU,   'cuda_standalone'),
+          #(CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResizeProfileCPU,     'cuda_standalone'),
+          #(CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpy,   'cuda_standalone'),
+          #(CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpyProfileCPU,   'cuda_standalone'),
+          #(GeNNConfiguration,                       'GeNNworkspace'),
+          #(GeNNConfigurationCPU,                    'GeNNworkspace'),
+          #(GeNNConfigurationOptimized,              'GeNNworkspace')
+          ]
+
+speed_tests = [# feature_test                     name                                  n_slice
+
+               #(ThresholderOnlyPoissonLowRate,                  'ThresholderOnlyPoissonLowRate',                slice(None)         ),
+               #(ThresholderOnlyPoissonMediumRate,               'ThresholderOnlyPoissonMediumRate',             slice(None)         ),
+               #(ThresholderOnlyPoissonHighRate,                 'ThresholderOnlyPoissonHighRate',               slice(None)         ),
+               #(ThresholderOnlyAlwaysSpiking,                   'ThresholderOnlyAlwaysSpiking',                 slice(None)         ),
+
+               #(BrunelHakimStateupdateOnlyDouble,               'BrunelHakimStateupdateOnlyDouble',             slice(None)         ),
+               #(BrunelHakimStateupdateOnlyTriple,               'BrunelHakimStateupdateOnlyTriple',             slice(None)         ),
+               #(BrunelHakimStateupdateOnly,                     'BrunelHakimStateupdateOnly',                   slice(None)         ),
+               #(BrunelHakimNeuronsOnly,                         'BrunelHakimNeuronsOnly',                       slice(None)         ),
+               #(BrunelHakimNeuronsOnlyNoXi,                     'BrunelHakimNeuronsOnlyNoXi',                   slice(None)         ),
+               #(BrunelHakimNeuronsOnlyNoRand,                   'BrunelHakimNeuronsOnlyNoRand',                 slice(None)         ),
+               #(BrunelHakimStateupdateThresholdOnly,            'BrunelHakimStateupdateThresholdOnly',          slice(None)         ),
+               #(BrunelHakimStateupdateThresholdResetOnly,       'BrunelHakimStateupdateThresholdResetOnly',     slice(None)         ),
+               #(BrunelHakimModelScalarDelayShort,               'BrunelHakimModelScalarDelayShort',             slice(None)         ),
+               #(BrunelHakimModelScalarDelayNoSelfConnections,   'BrunelHakimModelScalarDelayNoSelfConnections', slice(None)         ),
+               #(CUBA,                                           'CUBA',                                         slice(None)         ),
+               #(COBAHH,                                         'COBAHH',                                       slice(None)         ),
+               #(AdaptationOscillation,                          'AdaptationOscillation',                        slice(None)         ),
+               #(Vogels,                                         'Vogels',                                       slice(None)         ),
+               #(STDP,                                           'STDP',                                         slice(None)         ),
+               #(STDPEventDriven,                                'STDPEventDriven',                              slice(None)         ),
+               #(BrunelHakimModelScalarDelay,                    'BrunelHakimModelScalarDelay',                  slice(None)         ),
+
+               #(VerySparseMediumRateSynapsesOnly,               'VerySparseMediumRateSynapsesOnly',             slice(None)         ),
+               #(SparseMediumRateSynapsesOnly,                   'SparseMediumRateSynapsesOnly',                 slice(None)         ),
+               #(DenseMediumRateSynapsesOnly,                    'DenseMediumRateSynapsesOnly',                  slice(None)         ),
+               #(SparseLowRateSynapsesOnly,                      'SparseLowRateSynapsesOnly',                    slice(None)         ),
+               #(SparseHighRateSynapsesOnly,                     'SparseHighRateSynapsesOnly',                   slice(None)         ),
+
+               #(STDPNotEventDriven,                             'STDPNotEventDriven',                           slice(None)         ),
+               #(STDPMultiPost,                                  'STDPMultiPost',                                slice(None)         ),
+               #(STDPNeuronalTraces,                             'STDPNeuronalTraces',                           slice(None)         ),
+               #(STDPMultiPostNeuronalTraces,                    'STDPMultiPostNeuronalTraces',                  slice(None)         ),
+
+               (BrunelHakimModelHeterogeneousDelay,             'BrunelHakimModelHeterogeneousDelay',           slice(None)         ),
+
+               #(LinearNeuronsOnly,                              'LinearNeuronsOnly',                            slice(None)         ),
+               #(HHNeuronsOnly,                                  'HHNeuronsOnly',                                slice(None)         ),
+               #(VogelsWithSynapticDynamic,                      'VogelsWithSynapticDynamic',                    slice(None)         ),
+
+               ### below uses monitors
+               #(CUBAFixedConnectivity,                          'CUBAFixedConnectivity',                        slice(None)         ),
+               #(COBAHHFixedConnectivity,                        'COBAHHFixedConnectivity',                      slice(None, -1)     ),
+]
+
+configurations = [config[0] for config in configs]
+project_dirs = [config[1] for config in configs]
+
+# check if multiple Configurations with same project_dirs are specified
+last_idx = {}
+# Visit each distinct directory once so the warning is not repeated per occurrence.
+for proj_dir in sorted(set(project_dirs) - {None}):
+    first_i = project_dirs.index(proj_dir)
+    last_i = len(project_dirs) - 1 - project_dirs[::-1].index(proj_dir)
+    if first_i != last_i:
+        print("WARNING there are multiple configurations using {d} as project "
+              "directory. Profiling and logfiles will only be saved for the last one {c}.".format(
+              d=proj_dir, c=configurations[last_i].__name__))
+    last_idx[proj_dir] = last_i
+
+time_stemp = time.time()
+date_str = datetime.datetime.fromtimestamp(time_stemp).strftime('%Y_%m_%d')
+
+directory = 'results_{}{}'.format(date_str, additional_dir_name)
+if os.path.exists(directory):
+    new_dir = directory + '_bak_' + str(int(time.time()))
+    print("Directory with name `{}` already exists. Renaming it to `{}`.".format(directory, new_dir))
+    os.rename(directory, new_dir)
+os.makedirs(directory)
+data_dir = os.path.join(directory, 'data')
+plot_dir = os.path.join(directory, 'plots')
+log_dir = os.path.join(directory, 'logs')
+prof_dir = os.path.join(directory, 'nvprof')
+os.makedirs(data_dir)
+os.makedirs(plot_dir)
+os.makedirs(log_dir)
+os.makedirs(prof_dir)
+print("Saving results in {}.".format(plot_dir))
+
+shutil.copy(os.path.realpath(__file__), os.path.join(directory, 'run_speed_test_script.py'))
+
+time_format = '%d.%m.%Y at %H:%M:%S'
+script_start = datetime.datetime.fromtimestamp(time.time()).strftime(time_format)
+
+with open(os.path.join(directory, 'git.diff'), 'w') as diff_file:
+    subprocess.call(['git', 'diff'], stdout=diff_file)
+
+try:
+    for n, (st, name, sl) in enumerate(speed_tests):
+        start = datetime.datetime.fromtimestamp(time.time()).strftime(time_format)
+        print("Starting {} on {}.".format(name, start))
+        maximum_run_time = 1*60*60*second
+        res = run_speed_tests(configurations=configurations,
+                              speed_tests=[st],
+                              n_slice=sl,
+                              #n_slice=slice(0,1,None),
+                              run_twice=False,
+                              verbose=True,
+                              maximum_run_time=maximum_run_time#,
+                              ## this needs modification of brian2 code
+                              #profile_only_active=True 
+                              #profile_only_active=False
+                             )
+        end = datetime.datetime.fromtimestamp(time.time()).strftime(time_format)
+        diff = datetime.datetime.strptime(end, time_format) - datetime.datetime.strptime(start, time_format)
+        print("Running {} took {}.".format(name, diff))
+        res.plot_all_tests()
+        ## this needs modification of brian2 code
+        #res.plot_all_tests(print_relative=True)
+        savefig(os.path.join(plot_dir, 'speed_test_{}_absolute.png'.format(speed_tests[n][1])))
+        res.plot_all_tests(relative=True)
+        savefig(os.path.join(plot_dir, 'speed_test_{}_relative.png'.format(name)))
+        res.plot_all_tests(profiling_minimum=0.05)
+        savefig(os.path.join(plot_dir, 'speed_test_{}_profiling.png'.format(name)))
+        res.plot_all_tests(profiling_minimum=0.15)
+        savefig(os.path.join(plot_dir, 'speed_test_min_15_{}_profiling.png'.format(name)))
+
+        res.plot_all_tests()
+        ## this needs modification of brian2 code
+        #res.plot_all_tests(print_relative=True)
+        savefig(os.path.join(plot_dir, 'speed_test_{}_absolute.svg'.format(speed_tests[n][1])))
+        res.plot_all_tests(relative=True)
+        savefig(os.path.join(plot_dir, 'speed_test_{}_relative.svg'.format(name)))
+        res.plot_all_tests(profiling_minimum=0.05)
+        savefig(os.path.join(plot_dir, 'speed_test_{}_profiling.svg'.format(name)))
+        res.plot_all_tests(profiling_minimum=0.15)
+        savefig(os.path.join(plot_dir, 'speed_test_min_15_{}_profiling.svg'.format(name)))
+
+        if 3 != len(get_fignums()):
+            print("WARNING: There were {} plots created, but only {} saved.".format(len(get_fignums()), 3*(n+1)))
+        for fignum in get_fignums():
+            close(fignum)
+
+        # pickle results object to disk
+        pkl_file = os.path.join(data_dir, name + '.pkl' )
+        with open(pkl_file, 'wb') as output:
+                pickle.dump(res, output, pickle.HIGHEST_PROTOCOL)
+
+        # save stdout log of last run (the others are deleted in run_speed_tests())
+        for proj_dir in set(project_dirs):
+            # `None` never matches the list below, so no separate None check is needed.
+            if proj_dir in ['cuda_standalone', 'cpp_standalone']:
+                stdout_file = os.path.join(proj_dir, 'results/stdout.txt')
+                if os.path.exists(stdout_file):
+                    shutil.copy(stdout_file,
+                                os.path.join(log_dir, 'stdout_{st}_{conf}_{n}.txt'.format(st=name, conf=proj_dir,
+                                                                                           n=st.n_range[sl][-1])))
+                else:
+                    print("WARNING Couldn't save {},file not found.".format(stdout_file))
+
+        # run nvprof on n_range[2]
+        for conf, proj_dir in zip(configurations, project_dirs):
+            main_arg = ''
+            if proj_dir in ['cuda_standalone', 'GeNNworkspace']:
+                if proj_dir == 'GeNNworkspace':
+                    main_arg = 'test {time} 1'.format(time=st.duration/second)
+                # only the third network size is profiled, and only if a plain rerun is fast enough
+                idx = 2
+                max_runtime = 20
+                conf_name = conf.__name__
+                print("Rerunning {} with n = {} for nvprof profiling".format(conf_name, st.n_range[idx]))
+                tb, res, runtime, prof_info = results(conf, st, st.n_range[idx], maximum_run_time=maximum_run_time)
+                if not isinstance(res, Exception) and runtime < max_runtime:
+                    option = '--profile-from-start-off' if proj_dir == 'cuda_standalone' else ''
+                    cmd = 'cd {proj_dir} && nvprof {opt} --log-file ../{log_file} ./main {arg}'.format(
+                        proj_dir=proj_dir, arg=main_arg, opt=option,
+                        log_file=os.path.join(prof_dir, 'nvprof_{st}_{conf}_{n}.log'.format(
+                            st=name, conf=conf_name, n=st.n_range[idx])))
+                    prof_start = datetime.datetime.fromtimestamp(time.time()).strftime(time_format)
+                    print(cmd)
+                    x = os.system(cmd)
+                    if x:
+                        print('nvprof failed with {}'.format(x))
+                    prof_end = datetime.datetime.fromtimestamp(time.time()).strftime(time_format)
+                    prof_diff = datetime.datetime.strptime(prof_end, time_format) - datetime.datetime.strptime(prof_start, time_format)
+                    print("Profiling took {} for runtime of {}".format(prof_diff, runtime))
+finally:
+    create_readme(directory)
+    print("\nSummarized speed test results in {}".format(directory + '/README.md'))
+    script_end = datetime.datetime.fromtimestamp(time.time()).strftime(time_format)
+    script_diff = datetime.datetime.strptime(script_end, time_format) - datetime.datetime.strptime(script_start, time_format)
+    print("Finished speed test on {}. Total time = {}.".format(
+        datetime.datetime.fromtimestamp(time.time()).strftime(time_format), script_diff))
+
+
+##res.plot_all_tests(relative=True)
+#for n in get_fignums():
+#    plt.figure(n)
+#    savefig(plot_dir + '/speed_test_{}.png'.format(speed_tests[n-1][1]))
+
+## Debug (includes profiling infos)
+#from brian2.tests.features.base import results
+#for x in results(LocalConfiguration, LinearNeuronsOnly, 10, maximum_run_time=10*second):
+#    print x
diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/README.md b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/README.md
new file mode 100644
index 00000000..d43429c2
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/README.md
@@ -0,0 +1,92 @@
+
+# Benchmark results from 28.11.2017
+## Description:
+
+
+
+## Last git log:
+```
+commit 8987de24ed9f4a3b1a276496407fca1087f04004
+Author: Denis Alevi <mail@denisalevi.de>
+Date:   Mon Nov 20 14:31:09 2017 +0100
+
+    Fix critical section to include the actual pushing
+
+```
+There is also a `git diff` saved in the current directory.
+
+## Results
+
+### BrunelHakimModelHeterogeneousDelay
+![](plots/speed_test_BrunelHakimModelHeterogeneousDelay_absolute.svg)
+![](plots/speed_test_BrunelHakimModelHeterogeneousDelay_profiling.svg)
+![](plots/speed_test_BrunelHakimModelHeterogeneousDelay_relative.svg)
+
+<details><summary>Exemplary `nvprof` results for **CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==2700== NVPROF is profiling process 2700, command: ./main
+==2700== Profiling application: ./main
+==2700== Profiling result:
+            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
+ GPU activities:   36.32%  123.91ms      2523  49.113us  14.176us  1.3924ms  _run_synapses_pre_push_spikes_push_kernel(unsigned int, unsigned int, unsigned int, int*)
+                   18.81%  64.168ms     10000  6.4160us  3.5840us  8.5440us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, double*, int*)
+                   12.89%  43.962ms     10000  4.3960us  4.1600us  5.4080us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double, double*, double*, double*, bool*, float*)
+                    8.62%  29.419ms     10000  2.9410us  2.8800us  4.2880us  _run_synapses_pre_push_spikes_advance_kernel(void)
+                    6.74%  22.995ms     10000  2.2990us  2.0160us  2.8800us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*)
+                    6.03%  20.585ms     10000  2.0580us  2.0160us  4.0960us  [CUDA memcpy DtoH]
+                    5.48%  18.689ms     10000  1.8680us  1.7280us  2.2400us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*)
+                    4.89%  16.676ms     10000  1.6670us  1.6000us  2.7520us  _GLOBAL__N__69_tmpxft_000008bc_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_97ebdcc0::_reset_neurongroup_thresholder_codeobject(int*)
+                    0.21%  732.10us         1  732.10us  732.10us  732.10us  void gen_sequenced<curandStateXORWOW, float2, normal_args_st, __operator_&__(float2 curand_normal_scaled2<curandStateXORWOW>(curandStateXORWOW*, normal_args_st))>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st)
+      API calls:   63.43%  648.25ms     62524  10.368us  8.7500us  8.8943ms  cudaLaunch
+                   28.22%  288.45ms     10000  28.844us  18.477us  1.3838ms  cudaMemcpy
+                    5.61%  57.386ms    350097     163ns     124ns  335.99us  cudaSetupArgument
+                    1.38%  14.127ms     62524     225ns     161ns  321.95us  cudaConfigureCall
+                    1.30%  13.336ms     52525     253ns     200ns  300.39us  cudaGetLastError
+                    0.03%  268.04us         1  268.04us  268.04us  268.04us  cudaMalloc
+                    0.02%  166.72us         1  166.72us  166.72us  166.72us  cudaMemGetInfo
+                    0.00%  30.363us        39     778ns     650ns  2.4670us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+                    0.00%  29.284us         8  3.6600us  2.8650us  6.1410us  cudaFuncGetAttributes
+                    0.00%  13.545us         1  13.545us  13.545us  13.545us  cudaDeviceSynchronize
+                    0.00%  6.1940us        12     516ns     337ns  1.4590us  cudaDeviceGetAttribute
+                    0.00%  3.8130us         3  1.2710us     863ns  1.9980us  cudaGetDevice
+
+```
+
+</p></details>
+
+
+<details><summary>Exemplary `nvprof` results for **CUDAStandaloneConfiguration**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==1993== NVPROF is profiling process 1993, command: ./main
+==1993== Profiling application: ./main
+==1993== Profiling result:
+            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
+ GPU activities:   56.03%  352.83ms     10000  35.283us  2.0480us  87.201us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, double*, int*)
+                   23.69%  149.15ms     10000  14.915us  1.6320us  1.3164ms  _run_synapses_pre_push_spikes_push_kernel(unsigned int, unsigned int, unsigned int, int*)
+                    6.70%  42.158ms     10000  4.2150us  3.8080us  5.6320us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double, double*, double*, double*, bool*, float*)
+                    4.32%  27.228ms     10000  2.7220us  2.4960us  4.5120us  _run_synapses_pre_push_spikes_advance_kernel(void)
+                    3.61%  22.747ms     10000  2.2740us  1.9200us  3.7760us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*)
+                    3.00%  18.918ms     10000  1.8910us  1.7280us  3.7440us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*)
+                    2.53%  15.914ms     10000  1.5910us  1.3440us  3.8080us  _GLOBAL__N__69_tmpxft_000005de_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_97ebdcc0::_reset_neurongroup_thresholder_codeobject(int*)
+                    0.12%  731.65us         1  731.65us  731.65us  731.65us  void gen_sequenced<curandStateXORWOW, float2, normal_args_st, __operator_&__(float2 curand_normal_scaled2<curandStateXORWOW>(curandStateXORWOW*, normal_args_st))>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st)
+      API calls:   89.24%  744.81ms     70001  10.639us  8.5770us  8.8242ms  cudaLaunch
+                    7.22%  60.281ms    380005     158ns     121ns  336.34us  cudaSetupArgument
+                    1.85%  15.427ms     70001     220ns     159ns  319.22us  cudaConfigureCall
+                    1.60%  13.340ms     60002     222ns     175ns  326.16us  cudaGetLastError
+                    0.04%  332.76us         1  332.76us  332.76us  332.76us  cudaDeviceSynchronize
+                    0.03%  253.93us         1  253.93us  253.93us  253.93us  cudaMalloc
+                    0.02%  146.47us         1  146.47us  146.47us  146.47us  cudaMemGetInfo
+                    0.00%  29.198us         8  3.6490us  2.7670us  6.3670us  cudaFuncGetAttributes
+                    0.00%  27.382us        39     702ns     578ns  1.8100us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+                    0.00%  6.0870us        12     507ns     326ns  1.3870us  cudaDeviceGetAttribute
+                    0.00%  3.7450us         3  1.2480us     822ns  2.0410us  cudaGetDevice
+
+```
+
+</p></details>
+
+
diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/data/BrunelHakimModelHeterogeneousDelay.pkl b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/data/BrunelHakimModelHeterogeneousDelay.pkl
new file mode 100644
index 00000000..51f30c2c
Binary files /dev/null and b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/data/BrunelHakimModelHeterogeneousDelay.pkl differ
diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/git.diff b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/git.diff
new file mode 100644
index 00000000..44a84fa2
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/git.diff
@@ -0,0 +1,305 @@
+diff --git a/brian2cuda/tests/features/cuda_configuration.py b/brian2cuda/tests/features/cuda_configuration.py
+index 250687d..622f73a 100644
+--- a/brian2cuda/tests/features/cuda_configuration.py
++++ b/brian2cuda/tests/features/cuda_configuration.py
+@@ -225,7 +225,7 @@ class CUDAStandaloneConfigurationProfileCPU(Configuration):
+                             with_output=False)
+ 
+ class CUDAStandaloneConfigurationTestBrunelHeteroAtomics(Configuration):
+-    name = 'CUDA standalone with atomics in heterog delay mode'
++    name = 'CUDA standalone with atomics in effect application'
+     def before_run(self):
+         brian2.set_device('cuda_standalone', build_on_run=False)
+         prefs["devices.cuda_standalone.test_brunel_hetero_atomics"] = True
+@@ -248,7 +248,7 @@ class CUDAStandaloneConfigurationTestBrunelHeteroAtomics(Configuration):
+                             with_output=False)
+ 
+ class CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU(Configuration):
+-    name = "CUDA standalone with atomics in heterog delay mode (profile='blocking')"
++    name = "CUDA standalone with atomics in effect application (profile='blocking')"
+     def before_run(self):
+         brian2.set_device('cuda_standalone', build_on_run=False, profile='blocking')
+         prefs["devices.cuda_standalone.test_brunel_hetero_atomics"] = True
+@@ -270,12 +270,10 @@ class CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU(Configuration
+         brian2.device.build(directory='cuda_standalone', compile=True, run=True,
+                             with_output=False)
+ 
+-
+ class CUDAStandaloneConfigurationPushAtomicResize(Configuration):
+-    name = "CUDA standalone with atomics in queue resize"
++    name = "CUDA standalone with atomics in spikequeue resize"
+     def before_run(self):
+         brian2.set_device('cuda_standalone', build_on_run=False)
+-        prefs["devices.cuda_standalone.test_brunel_hetero_atomics"] = True
+         prefs["devices.cuda_standalone.push_atomic_resize"] = True
+         if socket.gethostname() == 'elnath':
+             if prefs['devices.cpp_standalone.extra_make_args_unix'] == ['-j12']:
+@@ -295,8 +293,82 @@ class CUDAStandaloneConfigurationPushAtomicResize(Configuration):
+         brian2.device.build(directory='cuda_standalone', compile=True, run=True,
+                             with_output=False)
+ 
++
+ class CUDAStandaloneConfigurationPushAtomicResizeProfileCPU(Configuration):
+-    name = "CUDA standalone with atomics in queue resize (profile='blocking')"
++    name = "CUDA standalone with atomics in spikequeue resize (profile='blocking')"
++    def before_run(self):
++        brian2.set_device('cuda_standalone', build_on_run=False, profile='blocking')
++        prefs["devices.cuda_standalone.push_atomic_resize"] = True
++        if socket.gethostname() == 'elnath':
++            if prefs['devices.cpp_standalone.extra_make_args_unix'] == ['-j12']:
++                prefs['devices.cpp_standalone.extra_make_args_unix'] = ['-j24']
++            prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35')
++            prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_20'])
++        elif socket.gethostname() == 'sabik':
++            prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35')
++            prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_52'])
++        elif socket.gethostname() == 'eltanin':
++            prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35')
++            prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_61'])
++
++    def after_run(self):
++        if os.path.exists('cuda_standalone'):
++            shutil.rmtree('cuda_standalone')
++        brian2.device.build(directory='cuda_standalone', compile=True, run=True,
++                            with_output=False)
++
++
++class CUDAStandaloneConfigurationPushAtomicResizProfileCPU(Configuration):
++    name = "CUDA standalone with atomics in spikequeue resize (profile='blocking)"
++    def before_run(self):
++        brian2.set_device('cuda_standalone', build_on_run=False,
++                          profile='blocking')
++        prefs["devices.cuda_standalone.push_atomic_resize"] = True
++        if socket.gethostname() == 'elnath':
++            if prefs['devices.cpp_standalone.extra_make_args_unix'] == ['-j12']:
++                prefs['devices.cpp_standalone.extra_make_args_unix'] = ['-j24']
++            prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35')
++            prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_20'])
++        elif socket.gethostname() == 'sabik':
++            prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35')
++            prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_52'])
++        elif socket.gethostname() == 'eltanin':
++            prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35')
++            prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_61'])
++
++    def after_run(self):
++        if os.path.exists('cuda_standalone'):
++            shutil.rmtree('cuda_standalone')
++        brian2.device.build(directory='cuda_standalone', compile=True, run=True,
++                            with_output=False)
++
++
++class CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize(Configuration):
++    name = "CUDA standalone with atomics in effect application and in spikequeue resize"
++    def before_run(self):
++        brian2.set_device('cuda_standalone', build_on_run=False)
++        prefs["devices.cuda_standalone.test_brunel_hetero_atomics"] = True
++        prefs["devices.cuda_standalone.push_atomic_resize"] = True
++        if socket.gethostname() == 'elnath':
++            if prefs['devices.cpp_standalone.extra_make_args_unix'] == ['-j12']:
++                prefs['devices.cpp_standalone.extra_make_args_unix'] = ['-j24']
++            prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35')
++            prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_20'])
++        elif socket.gethostname() == 'sabik':
++            prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35')
++            prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_52'])
++        elif socket.gethostname() == 'eltanin':
++            prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35')
++            prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_61'])
++
++    def after_run(self):
++        if os.path.exists('cuda_standalone'):
++            shutil.rmtree('cuda_standalone')
++        brian2.device.build(directory='cuda_standalone', compile=True, run=True,
++                            with_output=False)
++
++class CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResizeProfileCPU(Configuration):
++    name = "CUDA standalone with atomics in effect application and in spikequeue resize (profile='blocking')"
+     def before_run(self):
+         brian2.set_device('cuda_standalone', build_on_run=False, profile='blocking')
+         prefs["devices.cuda_standalone.test_brunel_hetero_atomics"] = True
+diff --git a/dev/benchmarks/run_speed_tests.py b/dev/benchmarks/run_speed_tests.py
+index 2518634..b525e97 100644
+--- a/dev/benchmarks/run_speed_tests.py
++++ b/dev/benchmarks/run_speed_tests.py
+@@ -37,6 +37,7 @@ from brian2cuda.tests.features.cuda_configuration import (CUDAStandaloneConfigur
+                                                           CUDAStandaloneConfigurationTestBrunelHeteroAtomics,
+                                                           CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU,
+                                                           CUDAStandaloneConfigurationPushAtomicResize,
++                                                          CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize,
+                                                           CUDAStandaloneConfigurationPushAtomicResizeProfileCPU,
+                                                           CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpy,
+                                                           CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpyProfileCPU)
+@@ -61,79 +62,80 @@ if socket.gethostname() == 'elnath':
+     prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_20'])
+ 
+ configs = [# configuration                          project_directory
+-          (NumpyConfiguration,                     None),
+-          (WeaveConfiguration,                     None),
+-          (LocalConfiguration,                     None),
++          #(NumpyConfiguration,                     None),
++          #(WeaveConfiguration,                     None),
++          #(LocalConfiguration,                     None),
++          #(CPPStandaloneConfiguration,              'cpp_standalone'),
++          #(CPPStandaloneConfigurationOpenMP,        'cpp_standalone'),
+           (CUDAStandaloneConfiguration,             'cuda_standalone'),
+-          (CUDAStandaloneConfigurationExtraThresholdKernel,             'cuda_standalone'),
+-          (CUDAStandaloneConfigurationNoAssert,             'cuda_standalone'),
+-          (CUDAStandaloneConfigurationCurandDouble,              'cuda_standalone'),
+-          (CUDAStandaloneConfigurationNoCudaOccupancyAPI,      'cuda_standalone'),
+-          (CUDAStandaloneConfigurationNoCudaOccupancyAPIProfileCPU,    'cuda_standalone'),
+-          (CUDAStandaloneConfiguration2BlocksPerSM, 'cuda_standalone'),
+-          (CUDAStandaloneConfiguration2BlocksPerSMLaunchBounds, 'cuda_standalone'),
+-          (CUDAStandaloneConfigurationSynLaunchBounds,     'cuda_standalone'),
+-          (CUDAStandaloneConfiguration2BlocksPerSMSynLaunchBounds, 'cuda_standalone'),
+-          (CUDAStandaloneConfigurationProfileGPU,   'cuda_standalone'),
+-          (CUDAStandaloneConfigurationProfileCPU,   'cuda_standalone'),
+-          (CPPStandaloneConfiguration,              'cpp_standalone'),
+-          (CUDAStandaloneConfigurationTestBrunelHeteroAtomics,   'cuda_standalone'),
+-          (CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU,   'cuda_standalone'),
+           (CUDAStandaloneConfigurationPushAtomicResize,   'cuda_standalone'),
+-          (CUDAStandaloneConfigurationPushAtomicResizeProfileCPU,   'cuda_standalone'),
+-          (CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpy,   'cuda_standalone'),
+-          (CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpyProfileCPU,   'cuda_standalone'),
+-          (GeNNConfiguration,                       'GeNNworkspace'),
+-          (CPPStandaloneConfigurationOpenMP,        'cpp_standalone'),
+-          (GeNNConfigurationCPU,                    'GeNNworkspace'),
+-          (GeNNConfigurationOptimized,              'GeNNworkspace')
++          (CUDAStandaloneConfigurationTestBrunelHeteroAtomics,   'cuda_standalone'),
++          (CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize,     'cuda_standalone'),
++          #(CUDAStandaloneConfigurationExtraThresholdKernel,             'cuda_standalone'),
++          #(CUDAStandaloneConfigurationNoAssert,             'cuda_standalone'),
++          #(CUDAStandaloneConfigurationCurandDouble,              'cuda_standalone'),
++          #(CUDAStandaloneConfigurationNoCudaOccupancyAPI,      'cuda_standalone'),
++          #(CUDAStandaloneConfigurationNoCudaOccupancyAPIProfileCPU,    'cuda_standalone'),
++          #(CUDAStandaloneConfiguration2BlocksPerSM, 'cuda_standalone'),
++          #(CUDAStandaloneConfiguration2BlocksPerSMLaunchBounds, 'cuda_standalone'),
++          #(CUDAStandaloneConfigurationSynLaunchBounds,     'cuda_standalone'),
++          #(CUDAStandaloneConfiguration2BlocksPerSMSynLaunchBounds, 'cuda_standalone'),
++          #(CUDAStandaloneConfigurationProfileGPU,   'cuda_standalone'),
++          #(CUDAStandaloneConfigurationProfileCPU,   'cuda_standalone'),
++          #(CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU,   'cuda_standalone'),
++          #(CUDAStandaloneConfigurationPushAtomicResizeProfileCPU,   'cuda_standalone'),
++          #(CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpy,   'cuda_standalone'),
++          #(CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpyProfileCPU,   'cuda_standalone'),
++          #(GeNNConfiguration,                       'GeNNworkspace'),
++          #(GeNNConfigurationCPU,                    'GeNNworkspace'),
++          #(GeNNConfigurationOptimized,              'GeNNworkspace')
+           ]
+ 
+ speed_tests = [# feature_test                     name                                  n_slice
+ 
+-               (ThresholderOnlyPoissonLowRate,                  'ThresholderOnlyPoissonLowRate',                slice(None)         ),
+-               (ThresholderOnlyPoissonMediumRate,               'ThresholderOnlyPoissonMediumRate',             slice(None)         ),
+-               (ThresholderOnlyPoissonHighRate,                 'ThresholderOnlyPoissonHighRate',               slice(None)         ),
+-               (ThresholderOnlyAlwaysSpiking,                   'ThresholderOnlyAlwaysSpiking',                 slice(None)         ),
+-
+-               (BrunelHakimStateupdateOnlyDouble,               'BrunelHakimStateupdateOnlyDouble',             slice(None)         ),
+-               (BrunelHakimStateupdateOnlyTriple,               'BrunelHakimStateupdateOnlyTriple',             slice(None)         ),
+-               (BrunelHakimStateupdateOnly,                     'BrunelHakimStateupdateOnly',                   slice(None)         ),
+-               (BrunelHakimNeuronsOnly,                         'BrunelHakimNeuronsOnly',                       slice(None)         ),
+-               (BrunelHakimNeuronsOnlyNoXi,                     'BrunelHakimNeuronsOnlyNoXi',                   slice(None)         ),
+-               (BrunelHakimNeuronsOnlyNoRand,                   'BrunelHakimNeuronsOnlyNoRand',                 slice(None)         ),
+-               (BrunelHakimStateupdateThresholdOnly,            'BrunelHakimStateupdateThresholdOnly',          slice(None)         ),
+-               (BrunelHakimStateupdateThresholdResetOnly,       'BrunelHakimStateupdateThresholdResetOnly',     slice(None)         ),
+-               (BrunelHakimModelScalarDelayShort,               'BrunelHakimModelScalarDelayShort',             slice(None)         ),
+-               (BrunelHakimModelScalarDelayNoSelfConnections,   'BrunelHakimModelScalarDelayNoSelfConnections', slice(None)         ),
+-               (CUBA,                                           'CUBA',                                         slice(None)         ),
+-               (COBAHH,                                         'COBAHH',                                       slice(None)         ),
+-               (AdaptationOscillation,                          'AdaptationOscillation',                        slice(None)         ),
+-               (Vogels,                                         'Vogels',                                       slice(None)         ),
+-               (STDP,                                           'STDP',                                         slice(None)         ),
+-               (STDPEventDriven,                                'STDPEventDriven',                              slice(None)         ),
+-               (BrunelHakimModelScalarDelay,                    'BrunelHakimModelScalarDelay',                  slice(None)         ),
+-
+-               (VerySparseMediumRateSynapsesOnly,               'VerySparseMediumRateSynapsesOnly',             slice(None)         ),
+-               (SparseMediumRateSynapsesOnly,                   'SparseMediumRateSynapsesOnly',                 slice(None)         ),
+-               (DenseMediumRateSynapsesOnly,                    'DenseMediumRateSynapsesOnly',                  slice(None)         ),
+-               (SparseLowRateSynapsesOnly,                      'SparseLowRateSynapsesOnly',                    slice(None)         ),
+-               (SparseHighRateSynapsesOnly,                     'SparseHighRateSynapsesOnly',                   slice(None)         ),
+-
+-               (STDPNotEventDriven,                             'STDPNotEventDriven',                           slice(None)         ),
+-               (STDPMultiPost,                                  'STDPMultiPost',                                slice(None)         ),
+-               (STDPNeuronalTraces,                             'STDPNeuronalTraces',                           slice(None)         ),
+-               (STDPMultiPostNeuronalTraces,                    'STDPMultiPostNeuronalTraces',                  slice(None)         ),
++               #(ThresholderOnlyPoissonLowRate,                  'ThresholderOnlyPoissonLowRate',                slice(None)         ),
++               #(ThresholderOnlyPoissonMediumRate,               'ThresholderOnlyPoissonMediumRate',             slice(None)         ),
++               #(ThresholderOnlyPoissonHighRate,                 'ThresholderOnlyPoissonHighRate',               slice(None)         ),
++               #(ThresholderOnlyAlwaysSpiking,                   'ThresholderOnlyAlwaysSpiking',                 slice(None)         ),
++
++               #(BrunelHakimStateupdateOnlyDouble,               'BrunelHakimStateupdateOnlyDouble',             slice(None)         ),
++               #(BrunelHakimStateupdateOnlyTriple,               'BrunelHakimStateupdateOnlyTriple',             slice(None)         ),
++               #(BrunelHakimStateupdateOnly,                     'BrunelHakimStateupdateOnly',                   slice(None)         ),
++               #(BrunelHakimNeuronsOnly,                         'BrunelHakimNeuronsOnly',                       slice(None)         ),
++               #(BrunelHakimNeuronsOnlyNoXi,                     'BrunelHakimNeuronsOnlyNoXi',                   slice(None)         ),
++               #(BrunelHakimNeuronsOnlyNoRand,                   'BrunelHakimNeuronsOnlyNoRand',                 slice(None)         ),
++               #(BrunelHakimStateupdateThresholdOnly,            'BrunelHakimStateupdateThresholdOnly',          slice(None)         ),
++               #(BrunelHakimStateupdateThresholdResetOnly,       'BrunelHakimStateupdateThresholdResetOnly',     slice(None)         ),
++               #(BrunelHakimModelScalarDelayShort,               'BrunelHakimModelScalarDelayShort',             slice(None)         ),
++               #(BrunelHakimModelScalarDelayNoSelfConnections,   'BrunelHakimModelScalarDelayNoSelfConnections', slice(None)         ),
++               #(CUBA,                                           'CUBA',                                         slice(None)         ),
++               #(COBAHH,                                         'COBAHH',                                       slice(None)         ),
++               #(AdaptationOscillation,                          'AdaptationOscillation',                        slice(None)         ),
++               #(Vogels,                                         'Vogels',                                       slice(None)         ),
++               #(STDP,                                           'STDP',                                         slice(None)         ),
++               #(STDPEventDriven,                                'STDPEventDriven',                              slice(None)         ),
++               #(BrunelHakimModelScalarDelay,                    'BrunelHakimModelScalarDelay',                  slice(None)         ),
++
++               #(VerySparseMediumRateSynapsesOnly,               'VerySparseMediumRateSynapsesOnly',             slice(None)         ),
++               #(SparseMediumRateSynapsesOnly,                   'SparseMediumRateSynapsesOnly',                 slice(None)         ),
++               #(DenseMediumRateSynapsesOnly,                    'DenseMediumRateSynapsesOnly',                  slice(None)         ),
++               #(SparseLowRateSynapsesOnly,                      'SparseLowRateSynapsesOnly',                    slice(None)         ),
++               #(SparseHighRateSynapsesOnly,                     'SparseHighRateSynapsesOnly',                   slice(None)         ),
++
++               #(STDPNotEventDriven,                             'STDPNotEventDriven',                           slice(None)         ),
++               #(STDPMultiPost,                                  'STDPMultiPost',                                slice(None)         ),
++               #(STDPNeuronalTraces,                             'STDPNeuronalTraces',                           slice(None)         ),
++               #(STDPMultiPostNeuronalTraces,                    'STDPMultiPostNeuronalTraces',                  slice(None)         ),
+ 
+                (BrunelHakimModelHeterogeneousDelay,             'BrunelHakimModelHeterogeneousDelay',           slice(None)         ),
+ 
+-               (LinearNeuronsOnly,                              'LinearNeuronsOnly',                            slice(None)         ),
+-               (HHNeuronsOnly,                                  'HHNeuronsOnly',                                slice(None)         ),
+-               (VogelsWithSynapticDynamic,                      'VogelsWithSynapticDynamic',                    slice(None)         ),
++               #(LinearNeuronsOnly,                              'LinearNeuronsOnly',                            slice(None)         ),
++               #(HHNeuronsOnly,                                  'HHNeuronsOnly',                                slice(None)         ),
++               #(VogelsWithSynapticDynamic,                      'VogelsWithSynapticDynamic',                    slice(None)         ),
+ 
+-               ## below uses monitors
+-               (CUBAFixedConnectivity,                          'CUBAFixedConnectivity',                        slice(None)         ),
+-               (COBAHHFixedConnectivity,                        'COBAHHFixedConnectivity',                      slice(None, -1)     ),
++               ### below uses monitors
++               #(CUBAFixedConnectivity,                          'CUBAFixedConnectivity',                        slice(None)         ),
++               #(COBAHHFixedConnectivity,                        'COBAHHFixedConnectivity',                      slice(None, -1)     ),
+ ]
+ 
+ configurations = [config[0] for config in configs]
+@@ -205,6 +207,16 @@ try:
+         savefig(os.path.join(plot_dir, 'speed_test_{}_relative.png'.format(name)))
+         res.plot_all_tests(profiling_minimum=0.05)
+         savefig(os.path.join(plot_dir, 'speed_test_{}_profiling.png'.format(name)))
++
++        res.plot_all_tests()
++        ## this needs modification of brian2 code
++        #res.plot_all_tests(print_relative=True)
++        savefig(os.path.join(plot_dir, 'speed_test_{}_absolute.svg'.format(speed_tests[n][1])))
++        res.plot_all_tests(relative=True)
++        savefig(os.path.join(plot_dir, 'speed_test_{}_relative.svg'.format(name)))
++        res.plot_all_tests(profiling_minimum=0.05)
++        savefig(os.path.join(plot_dir, 'speed_test_{}_profiling.svg'.format(name)))
++
+         if 3 != len(get_fignums()):
+             print("WARNING: There were {} plots created, but only {} saved.".format(len(get_fignums()), 3*(n+1)))
+         for n in get_fignums():
+diff --git a/frozen_repos/brian2 b/frozen_repos/brian2
+--- a/frozen_repos/brian2
++++ b/frozen_repos/brian2
+@@ -1 +1 @@
+-Subproject commit fadc6a0aeb90d1b4d343470628457d8561536f67
++Subproject commit fadc6a0aeb90d1b4d343470628457d8561536f67-dirty
+diff --git a/frozen_repos/brian2genn b/frozen_repos/brian2genn
+--- a/frozen_repos/brian2genn
++++ b/frozen_repos/brian2genn
+@@ -1 +1 @@
+-Subproject commit 0553cafeab49ea5403c0230411035df504d4db06
++Subproject commit 0553cafeab49ea5403c0230411035df504d4db06-dirty
diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/logs/stdout_BrunelHakimModelHeterogeneousDelay_cpp_standalone_50000.txt b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/logs/stdout_BrunelHakimModelHeterogeneousDelay_cpp_standalone_50000.txt
new file mode 100644
index 00000000..0cd8723a
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/logs/stdout_BrunelHakimModelHeterogeneousDelay_cpp_standalone_50000.txt
@@ -0,0 +1 @@
+Number of synapses: 50008503
diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/logs/stdout_BrunelHakimModelHeterogeneousDelay_cuda_standalone_50000.txt b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/logs/stdout_BrunelHakimModelHeterogeneousDelay_cuda_standalone_50000.txt
new file mode 100644
index 00000000..8a832b67
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/logs/stdout_BrunelHakimModelHeterogeneousDelay_cuda_standalone_50000.txt
@@ -0,0 +1,53 @@
+INFO: setting cudaDevice stuff took 0.359616 seconds
+INFO kernel_synapses_group_variable_set_conditional_codeobject
+	48825 blocks
+	1024 threads
+	8 registers per block
+	0 bytes statically-allocated shared memory per block
+	0 bytes local memory per thread
+	304 bytes user-allocated constant memory
+INFO connectivity matrix has size 49995881
+INFO generating 13100000 randn every 262 clock cycles for neurongroup_stateupdater_codeobject
+INFO kernel_neurongroup_stateupdater_codeobject
+	66 blocks
+	768 threads
+	36 registers per block
+	0 bytes statically-allocated shared memory per block
+	8 bytes local memory per thread
+	304 bytes user-allocated constant memory
+	0.750 theoretical occupancy
+INFO kernel_neurongroup_thresholder_codeobject
+	49 blocks
+	1024 threads
+	15 registers per block
+	0 bytes statically-allocated shared memory per block
+	0 bytes local memory per thread
+	304 bytes user-allocated constant memory
+	1.000 theoretical occupancy
+INFO _run_synapses_pre_push_spikes_push_kernel
+	15 blocks
+	110 threads
+	78 registers per block
+	0 bytes statically-allocated shared memory per block
+	16 bytes local memory per thread
+	304 bytes user-allocated constant memory
+	0.312 theoretical occupancy
+INFO kernel_synapses_pre_codeobject
+	15 blocks
+	1024 threads
+	21 registers per block
+	0 bytes statically-allocated shared memory per block
+	8 bytes local memory per thread
+	304 bytes user-allocated constant memory
+	1.000 theoretical occupancy
+INFO kernel_neurongroup_resetter_codeobject
+	49 blocks
+	1024 threads
+	14 registers per block
+	0 bytes statically-allocated shared memory per block
+	0 bytes local memory per thread
+	304 bytes user-allocated constant memory
+	1.000 theoretical occupancy
+Number of synapses: 49995881
+INFO: main_lines took 138.083454 seconds
+INFO: main function took 139.763504 seconds
diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize_1000.log b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize_1000.log
new file mode 100644
index 00000000..d3b0f919
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize_1000.log
@@ -0,0 +1,25 @@
+==2700== NVPROF is profiling process 2700, command: ./main
+==2700== Profiling application: ./main
+==2700== Profiling result:
+            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
+ GPU activities:   36.32%  123.91ms      2523  49.113us  14.176us  1.3924ms  _run_synapses_pre_push_spikes_push_kernel(unsigned int, unsigned int, unsigned int, int*)
+                   18.81%  64.168ms     10000  6.4160us  3.5840us  8.5440us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, double*, int*)
+                   12.89%  43.962ms     10000  4.3960us  4.1600us  5.4080us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double, double*, double*, double*, bool*, float*)
+                    8.62%  29.419ms     10000  2.9410us  2.8800us  4.2880us  _run_synapses_pre_push_spikes_advance_kernel(void)
+                    6.74%  22.995ms     10000  2.2990us  2.0160us  2.8800us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*)
+                    6.03%  20.585ms     10000  2.0580us  2.0160us  4.0960us  [CUDA memcpy DtoH]
+                    5.48%  18.689ms     10000  1.8680us  1.7280us  2.2400us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*)
+                    4.89%  16.676ms     10000  1.6670us  1.6000us  2.7520us  _GLOBAL__N__69_tmpxft_000008bc_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_97ebdcc0::_reset_neurongroup_thresholder_codeobject(int*)
+                    0.21%  732.10us         1  732.10us  732.10us  732.10us  void gen_sequenced<curandStateXORWOW, float2, normal_args_st, __operator_&__(float2 curand_normal_scaled2<curandStateXORWOW>(curandStateXORWOW*, normal_args_st))>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st)
+      API calls:   63.43%  648.25ms     62524  10.368us  8.7500us  8.8943ms  cudaLaunch
+                   28.22%  288.45ms     10000  28.844us  18.477us  1.3838ms  cudaMemcpy
+                    5.61%  57.386ms    350097     163ns     124ns  335.99us  cudaSetupArgument
+                    1.38%  14.127ms     62524     225ns     161ns  321.95us  cudaConfigureCall
+                    1.30%  13.336ms     52525     253ns     200ns  300.39us  cudaGetLastError
+                    0.03%  268.04us         1  268.04us  268.04us  268.04us  cudaMalloc
+                    0.02%  166.72us         1  166.72us  166.72us  166.72us  cudaMemGetInfo
+                    0.00%  30.363us        39     778ns     650ns  2.4670us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+                    0.00%  29.284us         8  3.6600us  2.8650us  6.1410us  cudaFuncGetAttributes
+                    0.00%  13.545us         1  13.545us  13.545us  13.545us  cudaDeviceSynchronize
+                    0.00%  6.1940us        12     516ns     337ns  1.4590us  cudaDeviceGetAttribute
+                    0.00%  3.8130us         3  1.2710us     863ns  1.9980us  cudaGetDevice
diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfiguration_1000.log
new file mode 100644
index 00000000..fa2e29f5
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/nvprof/nvprof_BrunelHakimModelHeterogeneousDelay_CUDAStandaloneConfiguration_1000.log
@@ -0,0 +1,23 @@
+==1993== NVPROF is profiling process 1993, command: ./main
+==1993== Profiling application: ./main
+==1993== Profiling result:
+            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
+ GPU activities:   56.03%  352.83ms     10000  35.283us  2.0480us  87.201us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, double*, int*)
+                   23.69%  149.15ms     10000  14.915us  1.6320us  1.3164ms  _run_synapses_pre_push_spikes_push_kernel(unsigned int, unsigned int, unsigned int, int*)
+                    6.70%  42.158ms     10000  4.2150us  3.8080us  5.6320us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double, double*, double*, double*, bool*, float*)
+                    4.32%  27.228ms     10000  2.7220us  2.4960us  4.5120us  _run_synapses_pre_push_spikes_advance_kernel(void)
+                    3.61%  22.747ms     10000  2.2740us  1.9200us  3.7760us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*)
+                    3.00%  18.918ms     10000  1.8910us  1.7280us  3.7440us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*)
+                    2.53%  15.914ms     10000  1.5910us  1.3440us  3.8080us  _GLOBAL__N__69_tmpxft_000005de_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_97ebdcc0::_reset_neurongroup_thresholder_codeobject(int*)
+                    0.12%  731.65us         1  731.65us  731.65us  731.65us  void gen_sequenced<curandStateXORWOW, float2, normal_args_st, __operator_&__(float2 curand_normal_scaled2<curandStateXORWOW>(curandStateXORWOW*, normal_args_st))>(curandStateXORWOW*, float2*, unsigned long, unsigned long, normal_args_st)
+      API calls:   89.24%  744.81ms     70001  10.639us  8.5770us  8.8242ms  cudaLaunch
+                    7.22%  60.281ms    380005     158ns     121ns  336.34us  cudaSetupArgument
+                    1.85%  15.427ms     70001     220ns     159ns  319.22us  cudaConfigureCall
+                    1.60%  13.340ms     60002     222ns     175ns  326.16us  cudaGetLastError
+                    0.04%  332.76us         1  332.76us  332.76us  332.76us  cudaDeviceSynchronize
+                    0.03%  253.93us         1  253.93us  253.93us  253.93us  cudaMalloc
+                    0.02%  146.47us         1  146.47us  146.47us  146.47us  cudaMemGetInfo
+                    0.00%  29.198us         8  3.6490us  2.7670us  6.3670us  cudaFuncGetAttributes
+                    0.00%  27.382us        39     702ns     578ns  1.8100us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+                    0.00%  6.0870us        12     507ns     326ns  1.3870us  cudaDeviceGetAttribute
+                    0.00%  3.7450us         3  1.2480us     822ns  2.0410us  cudaGetDevice
diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/plots/speed_test_BrunelHakimModelHeterogeneousDelay_absolute.png b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/plots/speed_test_BrunelHakimModelHeterogeneousDelay_absolute.png
new file mode 100644
index 00000000..16721228
Binary files /dev/null and b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/plots/speed_test_BrunelHakimModelHeterogeneousDelay_absolute.png differ
diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/plots/speed_test_BrunelHakimModelHeterogeneousDelay_profiling.png b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/plots/speed_test_BrunelHakimModelHeterogeneousDelay_profiling.png
new file mode 100644
index 00000000..b9973c28
Binary files /dev/null and b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/plots/speed_test_BrunelHakimModelHeterogeneousDelay_profiling.png differ
diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/plots/speed_test_BrunelHakimModelHeterogeneousDelay_relative.png b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/plots/speed_test_BrunelHakimModelHeterogeneousDelay_relative.png
new file mode 100644
index 00000000..d9ec9aa8
Binary files /dev/null and b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/plots/speed_test_BrunelHakimModelHeterogeneousDelay_relative.png differ
diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/run_speed_test_script.py b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/run_speed_test_script.py
new file mode 100644
index 00000000..97f4c06f
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/cuda_cpp_comparison_for_heterogenous_delay_mode/run_speed_test_script.py
@@ -0,0 +1,285 @@
+import os
+import shutil
+import glob
+import subprocess
+import sys
+import socket
+
+# run tests without X-server
+import matplotlib
+matplotlib.use('Agg')
+
+# pretty plots
+import seaborn
+
+import time
+import datetime
+import cPickle as pickle
+
+from brian2 import *
+from brian2.tests.features import *
+from brian2.tests.features.base import *
+from brian2.tests.features.base import results
+
+import brian2cuda
+from brian2cuda.tests.features.cuda_configuration import (CUDAStandaloneConfiguration,
+                                                          CUDAStandaloneConfigurationNoAssert,
+                                                          CUDAStandaloneConfigurationExtraThresholdKernel,
+                                                          CUDAStandaloneConfigurationCurandDouble,
+                                                          CUDAStandaloneConfigurationNoCudaOccupancyAPI,
+                                                          CUDAStandaloneConfigurationNoCudaOccupancyAPIProfileCPU,
+                                                          CUDAStandaloneConfiguration2BlocksPerSM,
+                                                          CUDAStandaloneConfiguration2BlocksPerSMLaunchBounds,
+                                                          CUDAStandaloneConfigurationSynLaunchBounds,
+                                                          CUDAStandaloneConfiguration2BlocksPerSMSynLaunchBounds,
+                                                          CUDAStandaloneConfigurationProfileGPU,
+                                                          CUDAStandaloneConfigurationProfileCPU,
+                                                          CUDAStandaloneConfigurationTestBrunelHeteroAtomics,
+                                                          CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU,
+                                                          CUDAStandaloneConfigurationPushAtomicResize,
+                                                          CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize,
+                                                          CUDAStandaloneConfigurationPushAtomicResizeProfileCPU,
+                                                          CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpy,
+                                                          CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpyProfileCPU)
+from brian2cuda.tests.features.speed import *
+
+from brian2genn.correctness_testing import GeNNConfiguration, GeNNConfigurationCPU, GeNNConfigurationOptimized
+
+from create_readme import create_readme
+
+assert len(sys.argv)<= 2, 'Only one command line argument supported! Got {}'.format(len(sys.argv)-1)
+if len(sys.argv) == 2:
+    additional_dir_name = '_' + sys.argv[1]
+else:
+    additional_dir_name = ''
+
+prefs['devices.cpp_standalone.extra_make_args_unix'] = ['-j12']
+# NOTE(review): '-j12' / '-j24' are presumably the parallel job counts passed to make -- confirm
+# host specific settings: on host 'elnath' replace the nvcc flag '-arch=sm_35' by '-arch=sm_20'
+if socket.gethostname() == 'elnath':
+    prefs['devices.cpp_standalone.extra_make_args_unix'] = ['-j24']
+    prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35')
+    prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_20'])
+
+configs = [# configuration                          project_directory (None entries are skipped when saving logs/profiles)
+          #(NumpyConfiguration,                     None),
+          #(WeaveConfiguration,                     None),
+          #(LocalConfiguration,                     None),
+          (CPPStandaloneConfiguration,              'cpp_standalone'),
+          (CPPStandaloneConfigurationOpenMP,        'cpp_standalone'),
+          (CUDAStandaloneConfiguration,             'cuda_standalone'),
+          #(CUDAStandaloneConfigurationPushAtomicResize,   'cuda_standalone'),
+          #(CUDAStandaloneConfigurationTestBrunelHeteroAtomics,   'cuda_standalone'),
+          (CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize,     'cuda_standalone'),
+          #(CUDAStandaloneConfigurationExtraThresholdKernel,             'cuda_standalone'),
+          #(CUDAStandaloneConfigurationNoAssert,             'cuda_standalone'),
+          #(CUDAStandaloneConfigurationCurandDouble,              'cuda_standalone'),
+          #(CUDAStandaloneConfigurationNoCudaOccupancyAPI,      'cuda_standalone'),
+          #(CUDAStandaloneConfigurationNoCudaOccupancyAPIProfileCPU,    'cuda_standalone'),
+          #(CUDAStandaloneConfiguration2BlocksPerSM, 'cuda_standalone'),
+          #(CUDAStandaloneConfiguration2BlocksPerSMLaunchBounds, 'cuda_standalone'),
+          #(CUDAStandaloneConfigurationSynLaunchBounds,     'cuda_standalone'),
+          #(CUDAStandaloneConfiguration2BlocksPerSMSynLaunchBounds, 'cuda_standalone'),
+          #(CUDAStandaloneConfigurationProfileGPU,   'cuda_standalone'),
+          #(CUDAStandaloneConfigurationProfileCPU,   'cuda_standalone'),
+          #(CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU,   'cuda_standalone'),
+          #(CUDAStandaloneConfigurationPushAtomicResizeProfileCPU,   'cuda_standalone'),
+          #(CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpy,   'cuda_standalone'),
+          #(CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpyProfileCPU,   'cuda_standalone'),
+          #(GeNNConfiguration,                       'GeNNworkspace'),
+          #(GeNNConfigurationCPU,                    'GeNNworkspace'),
+          #(GeNNConfigurationOptimized,              'GeNNworkspace')
+          ]
+
+speed_tests = [# feature_test                     name                                  n_slice
+               # n_slice is forwarded to run_speed_tests() and used as st.n_range[n_slice] when naming the stdout log
+               #(ThresholderOnlyPoissonLowRate,                  'ThresholderOnlyPoissonLowRate',                slice(None)         ),
+               #(ThresholderOnlyPoissonMediumRate,               'ThresholderOnlyPoissonMediumRate',             slice(None)         ),
+               #(ThresholderOnlyPoissonHighRate,                 'ThresholderOnlyPoissonHighRate',               slice(None)         ),
+               #(ThresholderOnlyAlwaysSpiking,                   'ThresholderOnlyAlwaysSpiking',                 slice(None)         ),
+
+               #(BrunelHakimStateupdateOnlyDouble,               'BrunelHakimStateupdateOnlyDouble',             slice(None)         ),
+               #(BrunelHakimStateupdateOnlyTriple,               'BrunelHakimStateupdateOnlyTriple',             slice(None)         ),
+               #(BrunelHakimStateupdateOnly,                     'BrunelHakimStateupdateOnly',                   slice(None)         ),
+               #(BrunelHakimNeuronsOnly,                         'BrunelHakimNeuronsOnly',                       slice(None)         ),
+               #(BrunelHakimNeuronsOnlyNoXi,                     'BrunelHakimNeuronsOnlyNoXi',                   slice(None)         ),
+               #(BrunelHakimNeuronsOnlyNoRand,                   'BrunelHakimNeuronsOnlyNoRand',                 slice(None)         ),
+               #(BrunelHakimStateupdateThresholdOnly,            'BrunelHakimStateupdateThresholdOnly',          slice(None)         ),
+               #(BrunelHakimStateupdateThresholdResetOnly,       'BrunelHakimStateupdateThresholdResetOnly',     slice(None)         ),
+               #(BrunelHakimModelScalarDelayShort,               'BrunelHakimModelScalarDelayShort',             slice(None)         ),
+               #(BrunelHakimModelScalarDelayNoSelfConnections,   'BrunelHakimModelScalarDelayNoSelfConnections', slice(None)         ),
+               #(CUBA,                                           'CUBA',                                         slice(None)         ),
+               #(COBAHH,                                         'COBAHH',                                       slice(None)         ),
+               #(AdaptationOscillation,                          'AdaptationOscillation',                        slice(None)         ),
+               #(Vogels,                                         'Vogels',                                       slice(None)         ),
+               #(STDP,                                           'STDP',                                         slice(None)         ),
+               #(STDPEventDriven,                                'STDPEventDriven',                              slice(None)         ),
+               #(BrunelHakimModelScalarDelay,                    'BrunelHakimModelScalarDelay',                  slice(None)         ),
+
+               #(VerySparseMediumRateSynapsesOnly,               'VerySparseMediumRateSynapsesOnly',             slice(None)         ),
+               #(SparseMediumRateSynapsesOnly,                   'SparseMediumRateSynapsesOnly',                 slice(None)         ),
+               #(DenseMediumRateSynapsesOnly,                    'DenseMediumRateSynapsesOnly',                  slice(None)         ),
+               #(SparseLowRateSynapsesOnly,                      'SparseLowRateSynapsesOnly',                    slice(None)         ),
+               #(SparseHighRateSynapsesOnly,                     'SparseHighRateSynapsesOnly',                   slice(None)         ),
+
+               #(STDPNotEventDriven,                             'STDPNotEventDriven',                           slice(None)         ),
+               #(STDPMultiPost,                                  'STDPMultiPost',                                slice(None)         ),
+               #(STDPNeuronalTraces,                             'STDPNeuronalTraces',                           slice(None)         ),
+               #(STDPMultiPostNeuronalTraces,                    'STDPMultiPostNeuronalTraces',                  slice(None)         ),
+               # the only test enabled in this run; st.n_range[slice(0,-1,1)] leaves out the last entry of n_range
+               (BrunelHakimModelHeterogeneousDelay,             'BrunelHakimModelHeterogeneousDelay',           slice(0,-1,1)         ),
+
+               #(LinearNeuronsOnly,                              'LinearNeuronsOnly',                            slice(None)         ),
+               #(HHNeuronsOnly,                                  'HHNeuronsOnly',                                slice(None)         ),
+               #(VogelsWithSynapticDynamic,                      'VogelsWithSynapticDynamic',                    slice(None)         ),
+
+               ### below uses monitors
+               #(CUBAFixedConnectivity,                          'CUBAFixedConnectivity',                        slice(None)         ),
+               #(COBAHHFixedConnectivity,                        'COBAHHFixedConnectivity',                      slice(None, -1)     ),
+]
+
+configurations = [config[0] for config in configs]
+project_dirs = [config[1] for config in configs]
+
+# check if multiple Configurations with same project_dirs are specified
+last_idx = {}
+for proj_dir in project_dirs:
+    if proj_dir is not None:
+        first_i = project_dirs.index(proj_dir)
+        last_i = len(project_dirs) - 1 - project_dirs[::-1].index(proj_dir)
+        if first_i != last_i:
+            print("WARNING there are multiple configurations using {d} as project "
+                  "directory. Profiling and logfiles will only be saved for the last one {c}.".format(
+                  d=proj_dir, c=configurations[last_i].__name__))
+        last_idx[proj_dir] = last_i
+
+time_stemp = time.time()
+date_str = datetime.datetime.fromtimestamp(time_stemp).strftime('%Y_%m_%d')
+
+directory = 'results_{}{}'.format(date_str, additional_dir_name)
+if os.path.exists(directory):
+    new_dir = directory + '_bak_' + str(int(time.time()))
+    print("Directory with name `{}` already exists. Renaming it to `{}`.".format(directory, new_dir))
+    os.rename(directory, new_dir)
+os.makedirs(directory)
+data_dir = os.path.join(directory, 'data')
+plot_dir = os.path.join(directory, 'plots')
+log_dir = os.path.join(directory, 'logs')
+prof_dir = os.path.join(directory, 'nvprof')
+os.makedirs(data_dir)
+os.makedirs(plot_dir)
+os.makedirs(log_dir)
+os.makedirs(prof_dir)
+print("Saving results in {}.".format(plot_dir))
+
+shutil.copy(os.path.realpath(__file__), os.path.join(directory, 'run_speed_test_script.py'))
+
+time_format = '%d.%m.%Y at %H:%M:%S'
+script_start = datetime.datetime.fromtimestamp(time.time()).strftime(time_format)
+
+with open(os.path.join(directory, 'git.diff'), 'w') as diff_file:
+    subprocess.call(['git', 'diff'], stdout=diff_file)
+
+try:
+    for n, (st, name, sl) in enumerate(speed_tests):
+        start = datetime.datetime.fromtimestamp(time.time()).strftime(time_format)
+        print("Starting {} on {}.".format(name, start))
+        maximum_run_time = 1*60*60*second
+        res = run_speed_tests(configurations=configurations,
+                              speed_tests=[st],
+                              n_slice=sl,
+                              #n_slice=slice(0,1,None),
+                              run_twice=False,
+                              verbose=True,
+                              maximum_run_time=maximum_run_time#,
+                              ## this needs modification of brian2 code
+                              #profile_only_active=True 
+                              #profile_only_active=False
+                             )
+        end = datetime.datetime.fromtimestamp(time.time()).strftime(time_format)
+        diff = datetime.datetime.strptime(end, time_format) - datetime.datetime.strptime(start, time_format)
+        print("Running {} took {}.".format(name, diff))
+        res.plot_all_tests()
+        ## this needs modification of brian2 code
+        #res.plot_all_tests(print_relative=True)
+        savefig(os.path.join(plot_dir, 'speed_test_{}_absolute.png'.format(speed_tests[n][1])))
+        res.plot_all_tests(relative=True)
+        savefig(os.path.join(plot_dir, 'speed_test_{}_relative.png'.format(name)))
+        res.plot_all_tests(profiling_minimum=0.05)
+        savefig(os.path.join(plot_dir, 'speed_test_{}_profiling.png'.format(name)))
+
+        res.plot_all_tests()
+        ## this needs modification of brian2 code
+        #res.plot_all_tests(print_relative=True)
+        savefig(os.path.join(plot_dir, 'speed_test_{}_absolute.svg'.format(speed_tests[n][1])))
+        res.plot_all_tests(relative=True)
+        savefig(os.path.join(plot_dir, 'speed_test_{}_relative.svg'.format(name)))
+        res.plot_all_tests(profiling_minimum=0.05)
+        savefig(os.path.join(plot_dir, 'speed_test_{}_profiling.svg'.format(name)))
+
+        if 3 != len(get_fignums()):
+            print("WARNING: There were {} plots created, but only {} saved.".format(len(get_fignums()), 3*(n+1)))
+        for n in get_fignums():
+            close(n)
+
+        # pickel results object to disk
+        pkl_file = os.path.join(data_dir, name + '.pkl' )
+        with open(pkl_file, 'wb') as output:
+                pickle.dump(res, output, pickle.HIGHEST_PROTOCOL)
+
+        # save stdout log of last run (the other are deleted in run_speed_tests())
+        for proj_dir in set(project_dirs):
+            if not proj_dir is None and proj_dir in ['cuda_standalone', 'cpp_standalone']:
+                config = configurations[last_idx[proj_dir]]
+                stdout_file = os.path.join(proj_dir, 'results/stdout.txt')
+                if os.path.exists(stdout_file):
+                    shutil.copy(stdout_file,
+                                os.path.join(log_dir, 'stdout_{st}_{conf}_{n}.txt'.format(st=name, conf=proj_dir,
+                                                                                           n=st.n_range[sl][-1])))
+                else:
+                    print("WARNING Couldn't save {},file not found.".format(stdout_file))
+
+        # run nvprof on n_range[2]
+        for conf, proj_dir in zip(configurations, project_dirs):
+            main_arg = ''
+            if proj_dir in ['cuda_standalone', 'GeNNworkspace']:
+                if proj_dir == 'GeNNworkspace':
+                    main_arg = 'test {time} 1'.format(time=st.duration/second)
+                ns = st.n_range[sl]
+                idx = 2
+                max_runtime = 20
+                conf_name = conf.__name__
+                print("Rerunning {} with n = {} for nvprof profiling".format(conf_name, st.n_range[idx]))
+                tb, res, runtime, prof_info = results(conf, st, st.n_range[idx], maximum_run_time=maximum_run_time)
+                if not isinstance(res, Exception) and runtime < max_runtime:
+                    option = '--profile-from-start-off' if proj_dir == 'cuda_standalone' else ''
+                    cmd = 'cd {proj_dir} && nvprof {opt} --log-file ../{log_file} ./main {arg}'.format(
+                        proj_dir=proj_dir, arg=main_arg, opt=option,
+                        log_file=os.path.join(prof_dir, 'nvprof_{st}_{conf}_{n}.log'.format(
+                            st=name, conf=conf_name, n=st.n_range[idx])))
+                    prof_start = datetime.datetime.fromtimestamp(time.time()).strftime(time_format)
+                    print(cmd)
+                    x = os.system(cmd)
+                    if x:
+                        print('nvprof failed with {}'.format(x))
+                    prof_end = datetime.datetime.fromtimestamp(time.time()).strftime(time_format)
+                    prof_diff = datetime.datetime.strptime(prof_end, time_format) - datetime.datetime.strptime(prof_start, time_format)
+                    print("Profiling took {} for runtime of {}".format(prof_diff, runtime))
+finally:
+    create_readme(directory)
+    print("\nSummarized speed test results in {}".format(directory + '/README.md'))
+    script_end = datetime.datetime.fromtimestamp(time.time()).strftime(time_format)
+    script_diff = datetime.datetime.strptime(script_end, time_format) - datetime.datetime.strptime(script_start, time_format)
+    print("Finished speed test on {}. Total time = {}.".format(
+        datetime.datetime.fromtimestamp(time.time()).strftime(time_format), script_diff))
+
+
+##res.plot_all_tests(relative=True)
+#for n in get_fignums():
+#    plt.figure(n)
+#    savefig(plot_dir + '/speed_test_{}.png'.format(speed_tests[n-1][1]))
+
+## Debug (includes profiling infos)
+#from brian2.tests.features.base import results
+#for x in results(LocalConfiguration, LinearNeuronsOnly, 10, maximum_run_time=10*second):
+#    print x
diff --git a/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/update_readme.py b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/update_readme.py
new file mode 100644
index 00000000..b68b1a9e
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_28_atomics_for_heterogenous_delay_mode_parallelisations/update_readme.py
@@ -0,0 +1,18 @@
+import os
+from glob import glob
+
+def update_benchmark_readme():
+    filedir = os.path.dirname(os.path.realpath(__file__))
+    
+    lines = []
+    for readme in sorted(glob(filedir + '/*/README.md'), reverse=True):
+        d = os.path.split(readme)[0]
+        lines.append("[{d}]({d})\n".format(d=os.path.basename(os.path.normpath(d))))
+    
+    readme_md = '\n'.join(lines)
+    
+    with open(filedir + "/README.md", "w") as readme_file:
+        readme_file.write(readme_md)
+
+if __name__ == '__main__':
+    update_benchmark_readme()
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/README.md b/dev/benchmarks/results_2017_11_30_cuba_stdp/README.md
new file mode 100644
index 00000000..1bdd7b98
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/README.md
@@ -0,0 +1,3 @@
+[cuba_stdp_profiled](cuba_stdp_profiled)
+
+[cuba_stdp](cuba_stdp)
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/README.md b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/README.md
new file mode 100644
index 00000000..edb94c86
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/README.md
@@ -0,0 +1,311 @@
+
+# Benchmark results from 29.11.2017
+## Description:
+
+
+
+## Last git log:
+```
+commit 65e51048f25caaee2a6e0396269f90821d994f85
+Author: Denis Alevi <mail@denisalevi.de>
+Date:   Mon Nov 27 17:55:05 2017 +0100
+
+    Add recent benchmark results
+
+```
+There is also a `git diff` saved in the current directory.
+
+## Results
+
+### CUBA
+![](plots/speed_test_CUBA_absolute.svg)
+![](plots/speed_test_CUBA_profiling.svg)
+![](plots/speed_test_CUBA_relative.svg)
+
+<details><summary>Exemplary `nvprof` results for **CUDAStandaloneConfiguration**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==9905== NVPROF is profiling process 9905, command: ./main
+==9905== Profiling application: ./main
+==9905== Profiling result:
+            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
+ GPU activities:   27.60%  60.179ms     10000  6.0170us  5.8560us  7.0720us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double, double*, double*, double*, bool*)
+                   23.08%  50.318ms     10000  5.0310us  3.2960us  23.232us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double*, double, int*, int, int*, bool*)
+                   21.95%  47.850ms     10000  4.7840us  3.2960us  19.968us  kernel_synapses_1_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, int*, double*, bool*)
+                   11.42%  24.905ms     10000  2.4900us  2.2720us  2.8800us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*)
+                    8.26%  18.018ms     10000  1.8010us  1.6640us  2.1120us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*, bool*)
+                    7.68%  16.743ms     10000  1.6740us  1.5360us  2.0800us  _GLOBAL__N__69_tmpxft_000024a7_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_97ebdcc0::_reset_neurongroup_thresholder_codeobject(int*)
+      API calls:   85.15%  643.72ms     60000  10.728us  9.3820us  9.0568ms  cudaLaunch
+                   11.40%  86.186ms    520000     165ns     134ns  363.54us  cudaSetupArgument
+                    1.93%  14.574ms     60000     242ns     182ns  349.11us  cudaConfigureCall
+                    1.50%  11.304ms     50000     226ns     194ns  14.049us  cudaGetLastError
+                    0.02%  134.84us         1  134.84us  134.84us  134.84us  cudaMemGetInfo
+                    0.00%  31.105us         8  3.8880us  3.0120us  5.6890us  cudaFuncGetAttributes
+                    0.00%  30.378us        39     778ns     653ns  1.8930us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+                    0.00%  13.359us         1  13.359us  13.359us  13.359us  cudaDeviceSynchronize
+                    0.00%  6.2530us        12     521ns     334ns  1.3690us  cudaDeviceGetAttribute
+                    0.00%  3.6520us         3  1.2170us     789ns  1.6000us  cudaGetDevice
+
+```
+
+</p></details>
+
+
+<details><summary>Exemplary `nvprof` results for **GeNNConfigurationOptimized**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==10402== NVPROF is profiling process 10402, command: ./main test 1.0 1
+==10402== Profiling application: ./main test 1.0 1
+==10402== Profiling result:
+            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
+ GPU activities:   63.74%  78.512ms     10000  7.8510us  7.0080us  10.336us  calcNeurons
+                   35.42%  43.636ms     10000  4.3630us  1.9840us  20.512us  calcSynapses
+                    0.65%  799.04us        56  14.268us     960ns  163.46us  [CUDA memcpy HtoD]
+                    0.19%  234.66us        13  18.050us  1.9840us  155.30us  [CUDA memcpy DtoH]
+      API calls:   67.87%  468.06ms        16  29.253ms  15.634us  464.71ms  cudaHostAlloc
+                   29.73%  204.98ms     20000  10.248us  9.4610us  337.99us  cudaLaunch
+                    1.01%  6.9362ms     20000     346ns     275ns  331.07us  cudaConfigureCall
+                    0.81%  5.6041ms     20000     280ns     221ns  329.96us  cudaSetupArgument
+                    0.31%  2.1674ms        73  29.690us     512ns  179.18us  cudaMemcpy
+                    0.18%  1.2374ms        16  77.339us  9.8610us  230.18us  cudaMalloc
+                    0.06%  398.46us        94  4.2380us     154ns  155.40us  cuDeviceGetAttribute
+                    0.02%  118.62us         1  118.62us  118.62us  118.62us  cuDeviceTotalMem
+                    0.01%  48.855us         1  48.855us  48.855us  48.855us  cuDeviceGetName
+                    0.00%  22.545us        16  1.4090us     582ns  3.4920us  cudaGetSymbolAddress
+                    0.00%  9.5420us         1  9.5420us  9.5420us  9.5420us  cudaSetDevice
+                    0.00%  3.6290us         3  1.2090us     200ns  2.4090us  cuDeviceGetCount
+                    0.00%  1.5380us         1  1.5380us  1.5380us  1.5380us  cudaGetDeviceCount
+                    0.00%  1.1220us         2     561ns     362ns     760ns  cuDeviceGet
+
+```
+
+</p></details>
+
+
+***
+
+### STDP (with SpikeMonitor)
+![](plots/speed_test_STDP_absolute.svg)
+![](plots/speed_test_STDP_profiling.svg)
+![](plots/speed_test_STDP_relative.svg)
+
+<details><summary>Exemplary `nvprof` results for **CUDAStandaloneConfiguration**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==8893== NVPROF is profiling process 8893, command: ./main
+==8893== Profiling application: ./main
+==8893== Profiling result:
+            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
+ GPU activities:   32.18%  119.34ms     10000  11.934us  1.6000us  26.926ms  kernel_spikemonitor_codeobject(unsigned int, int*, double, int*, int*, int*, int, int*, double*, int, int*, int*)
+                   20.94%  77.684ms     10000  7.7680us  3.3600us  25.728us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, double*, double, double*, int, int*, int, int*, int)
+                   11.71%  43.439ms     10000  4.3430us  3.8400us  6.0800us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*)
+                    9.85%  36.549ms     10000  3.6540us  3.5520us  7.0080us  kernel_synapses_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, double, double*, int, int*, int*, int)
+                    6.79%  25.173ms     10000  2.5170us  2.1760us  3.6800us  kernel_poissongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*, double*, float*)
+                    4.91%  18.216ms     10000  1.8210us  1.7280us  4.3200us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*)
+                    4.78%  17.745ms     10000  1.7740us  1.5680us  3.6800us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*)
+                    4.51%  16.723ms     10000  1.6720us  1.6000us  3.2640us  _GLOBAL__N__70_tmpxft_00002089_00000000_6_poissongroup_thresholder_codeobject_cpp1_ii_7314966e::_reset_poissongroup_thresholder_codeobject(int*)
+                    4.22%  15.645ms     10000  1.5640us  1.3760us  3.5200us  _GLOBAL__N__69_tmpxft_00002086_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_c0b8948b::_reset_neurongroup_thresholder_codeobject(int*)
+                    0.09%  330.21us         1  330.21us  330.21us  330.21us  void gen_sequenced<curandStateXORWOW, float, int, __operator_&__(float curand_uniform_noargs<curandStateXORWOW>(curandStateXORWOW*, int))>(curandStateXORWOW*, float*, unsigned long, unsigned long, int)
+                    0.02%  68.192us         1  68.192us  68.192us  68.192us  _run_spikemonitor_codeobject_init(void)
+      API calls:   85.86%  977.23ms     90002  10.857us  9.1510us  10.887ms  cudaLaunch
+                   10.27%  116.88ms    700005     166ns     137ns  331.60us  cudaSetupArgument
+                    2.12%  24.161ms     90002     268ns     176ns  335.73us  cudaConfigureCall
+                    1.71%  19.473ms     70003     278ns     205ns  329.44us  cudaGetLastError
+                    0.02%  213.58us         1  213.58us  213.58us  213.58us  cudaMalloc
+                    0.01%  133.47us         1  133.47us  133.47us  133.47us  cudaMemGetInfo
+                    0.00%  43.181us        11  3.9250us  3.2280us  6.4450us  cudaFuncGetAttributes
+                    0.00%  32.048us        42     763ns     627ns  1.7760us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+                    0.00%  13.358us         1  13.358us  13.358us  13.358us  cudaDeviceSynchronize
+                    0.00%  7.7520us        16     484ns     356ns  1.1240us  cudaDeviceGetAttribute
+                    0.00%  4.0510us         4  1.0120us     822ns  1.5030us  cudaGetDevice
+
+```
+
+</p></details>
+
+
+<details><summary>Exemplary `nvprof` results for **GeNNConfigurationOptimized**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==9420== NVPROF is profiling process 9420, command: ./main test 1.0 1
+==9420== Profiling application: ./main test 1.0 1
+==9420== Profiling result:
+            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
+ GPU activities:   50.20%  103.62ms     10000  10.362us  1.5680us  45.248us  calcSynapses
+                   19.97%  41.214ms     10000  4.1210us  3.1040us  6.7200us  calcNeurons
+                   17.73%  36.597ms     17812  2.0540us  2.0160us  4.7360us  [CUDA memcpy DtoH]
+                   12.06%  24.885ms     10000  2.4880us  2.3680us  10.848us  learnSynapsesPost
+                    0.05%  94.016us        70  1.3430us     960ns  2.0480us  [CUDA memcpy HtoD]
+      API calls:   34.18%  358.40ms        20  17.920ms  8.3270us  356.55ms  cudaHostAlloc
+                   32.17%  337.26ms     30000  11.241us  9.5510us  356.07us  cudaLaunch
+                   31.72%  332.57ms     20095  16.549us     231ns  988.56us  cudaMemcpy
+                    1.03%  10.770ms     30000     358ns     283ns  331.73us  cudaConfigureCall
+                    0.77%  8.0617ms     30000     268ns     208ns  334.35us  cudaSetupArgument
+                    0.08%  809.75us        20  40.487us  8.0280us  232.94us  cudaMalloc
+                    0.04%  401.78us        94  4.2740us     161ns  156.02us  cuDeviceGetAttribute
+                    0.01%  113.16us         1  113.16us  113.16us  113.16us  cuDeviceTotalMem
+                    0.00%  37.103us         1  37.103us  37.103us  37.103us  cuDeviceGetName
+                    0.00%  22.451us        20  1.1220us     525ns  5.8000us  cudaGetSymbolAddress
+                    0.00%  9.5720us         1  9.5720us  9.5720us  9.5720us  cudaSetDevice
+                    0.00%  3.2610us         3  1.0870us     219ns  2.3710us  cuDeviceGetCount
+                    0.00%  1.6100us         1  1.6100us  1.6100us  1.6100us  cudaGetDeviceCount
+                    0.00%  1.0470us         2     523ns     250ns     797ns  cuDeviceGet
+
+```
+
+</p></details>
+
+
+***
+
+### STDPEventDriven
+![](plots/speed_test_STDPEventDriven_absolute.svg)
+![](plots/speed_test_STDPEventDriven_profiling.svg)
+![](plots/speed_test_STDPEventDriven_relative.svg)
+
+<details><summary>Exemplary `nvprof` results for **CUDAStandaloneConfiguration**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==19561== NVPROF is profiling process 19561, command: ./main
+==19561== Profiling application: ./main
+==19561== Profiling result:
+            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
+ GPU activities:   33.06%  85.737ms     10000  8.5730us  3.3600us  26.176us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, double*, double, double*, int, int*, int, int*, int)
+                   16.85%  43.713ms     10000  4.3710us  3.8720us  6.4320us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*)
+                   13.67%  35.462ms     10000  3.5460us  3.4560us  7.1040us  kernel_synapses_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, double, double*, int, int*, int*, int)
+                    9.83%  25.505ms     10000  2.5500us  2.2400us  2.8480us  kernel_poissongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*, double*, float*)
+                    7.03%  18.243ms     10000  1.8240us  1.7600us  2.5920us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*)
+                    7.01%  18.182ms     10000  1.8180us  1.6960us  2.2080us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*)
+                    6.41%  16.614ms     10000  1.6610us  1.5360us  1.9520us  _GLOBAL__N__70_tmpxft_00004a64_00000000_6_poissongroup_thresholder_codeobject_cpp1_ii_7314966e::_reset_poissongroup_thresholder_codeobject(int*)
+                    6.01%  15.583ms     10000  1.5580us  1.4720us  1.7280us  _GLOBAL__N__69_tmpxft_00004a60_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_c0b8948b::_reset_neurongroup_thresholder_codeobject(int*)
+                    0.13%  330.21us         1  330.21us  330.21us  330.21us  void gen_sequenced<curandStateXORWOW, float, int, __operator_&__(float curand_uniform_noargs<curandStateXORWOW>(curandStateXORWOW*, int))>(curandStateXORWOW*, float*, unsigned long, unsigned long, int)
+      API calls:   84.19%  838.94ms     80001  10.486us  9.1520us  8.9317ms  cudaLaunch
+                   11.76%  117.21ms    580005     202ns     157ns  419.79us  cudaSetupArgument
+                    2.27%  22.642ms     80001     283ns     205ns  337.22us  cudaConfigureCall
+                    1.74%  17.290ms     60002     288ns     220ns  371.39us  cudaGetLastError
+                    0.02%  198.76us         1  198.76us  198.76us  198.76us  cudaMalloc
+                    0.01%  139.83us         1  139.83us  139.83us  139.83us  cudaMemGetInfo
+                    0.00%  37.555us        10  3.7550us  3.0440us  6.0110us  cudaFuncGetAttributes
+                    0.00%  31.926us        41     778ns     680ns  1.6620us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+                    0.00%  13.490us         1  13.490us  13.490us  13.490us  cudaDeviceSynchronize
+                    0.00%  8.0030us        16     500ns     369ns  1.0430us  cudaDeviceGetAttribute
+                    0.00%  4.0740us         4  1.0180us     792ns  1.5710us  cudaGetDevice
+
+```
+
+</p></details>
+
+
+<details><summary>Exemplary `nvprof` results for **GeNNConfigurationOptimized**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==20069== NVPROF is profiling process 20069, command: ./main test 1.0 1
+==20069== Profiling application: ./main test 1.0 1
+==20069== Profiling result:
+            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
+ GPU activities:   61.14%  103.55ms     10000  10.355us  1.5680us  49.760us  calcSynapses
+                   24.06%  40.741ms     10000  4.0740us  3.0720us  6.6880us  calcNeurons
+                   14.71%  24.918ms     10000  2.4910us  2.3360us  10.560us  learnSynapsesPost
+                    0.06%  94.593us        70  1.3510us     960ns  2.0800us  [CUDA memcpy HtoD]
+                    0.03%  55.553us        19  2.9230us  2.0160us  4.8010us  [CUDA memcpy DtoH]
+      API calls:   56.62%  434.02ms        20  21.701ms  16.555us  432.05ms  cudaHostAlloc
+                   40.47%  310.26ms     30000  10.341us  9.4140us  347.35us  cudaLaunch
+                    1.37%  10.508ms     30000     350ns     275ns  330.81us  cudaConfigureCall
+                    1.08%  8.2824ms     30000     276ns     221ns  333.57us  cudaSetupArgument
+                    0.25%  1.9098ms        95  20.103us     434ns  41.200us  cudaMemcpy
+                    0.13%  998.34us        20  49.917us  13.252us  259.26us  cudaMalloc
+                    0.05%  419.41us        94  4.4610us     183ns  162.60us  cuDeviceGetAttribute
+                    0.02%  126.30us         1  126.30us  126.30us  126.30us  cuDeviceTotalMem
+                    0.00%  38.221us         1  38.221us  38.221us  38.221us  cuDeviceGetName
+                    0.00%  29.710us        20  1.4850us     972ns  6.2560us  cudaGetSymbolAddress
+                    0.00%  9.9350us         1  9.9350us  9.9350us  9.9350us  cudaSetDevice
+                    0.00%  3.4560us         3  1.1520us     236ns  2.5630us  cuDeviceGetCount
+                    0.00%  1.7080us         1  1.7080us  1.7080us  1.7080us  cudaGetDeviceCount
+                    0.00%  1.2540us         2     627ns     267ns     987ns  cuDeviceGet
+
+```
+
+</p></details>
+
+
+***
+
+### STDPNotEventDriven
+![](plots/speed_test_STDPNotEventDriven_absolute.svg)
+![](plots/speed_test_STDPNotEventDriven_profiling.svg)
+![](plots/speed_test_STDPNotEventDriven_relative.svg)
+
+<details><summary>Exemplary `nvprof` results for **CUDAStandaloneConfiguration**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==18533== NVPROF is profiling process 18533, command: ./main
+==18533== Profiling application: ./main
+==18533== Profiling result:
+            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
+ GPU activities:   27.01%  73.513ms     10000  7.3510us  3.2960us  29.120us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, int, double*, double, double*, int, int*, int, int*)
+                   16.08%  43.771ms     10000  4.3770us  3.9040us  6.3360us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*)
+                   12.83%  34.925ms     10000  3.4920us  3.3920us  6.4000us  kernel_synapses_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, int, double, double*, int, int*)
+                   10.01%  27.253ms     10000  2.7250us  2.6240us  3.2000us  kernel_synapses_stateupdater_codeobject(unsigned int, unsigned int, double*, int, double*, int, double*, int*)
+                    9.18%  24.982ms     10000  2.4980us  2.2080us  2.6880us  kernel_poissongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*, double*, float*)
+                    6.70%  18.244ms     10000  1.8240us  1.7280us  2.0800us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*)
+                    6.70%  18.236ms     10000  1.8230us  1.7280us  2.6240us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*)
+                    5.76%  15.677ms     10000  1.5670us  1.4720us  1.6960us  _GLOBAL__N__69_tmpxft_00004642_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_c0b8948b::_reset_neurongroup_thresholder_codeobject(int*)
+                    5.59%  15.219ms     10000  1.5210us  1.4400us  1.9520us  _GLOBAL__N__70_tmpxft_00004643_00000000_6_poissongroup_thresholder_codeobject_cpp1_ii_7314966e::_reset_poissongroup_thresholder_codeobject(int*)
+                    0.12%  330.63us         1  330.63us  330.63us  330.63us  void gen_sequenced<curandStateXORWOW, float, int, __operator_&__(float curand_uniform_noargs<curandStateXORWOW>(curandStateXORWOW*, int))>(curandStateXORWOW*, float*, unsigned long, unsigned long, int)
+      API calls:   85.78%  968.25ms     90001  10.758us  9.3110us  9.2828ms  cudaLaunch
+                   10.41%  117.51ms    660005     178ns     137ns  367.56us  cudaSetupArgument
+                    2.19%  24.694ms     90001     274ns     200ns  349.17us  cudaConfigureCall
+                    1.59%  17.897ms     70002     255ns     203ns  333.24us  cudaGetLastError
+                    0.02%  201.51us         1  201.51us  201.51us  201.51us  cudaMalloc
+                    0.01%  131.77us         1  131.77us  131.77us  131.77us  cudaMemGetInfo
+                    0.00%  51.691us        74     698ns     591ns  1.8080us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+                    0.00%  47.398us        12  3.9490us  3.1850us  6.2570us  cudaFuncGetAttributes
+                    0.00%  13.229us         1  13.229us  13.229us  13.229us  cudaDeviceSynchronize
+                    0.00%  9.0230us        20     451ns     339ns     845ns  cudaDeviceGetAttribute
+                    0.00%  4.9330us         5     986ns     852ns  1.4630us  cudaGetDevice
+
+```
+
+</p></details>
+
+
+<details><summary>Exemplary `nvprof` results for **GeNNConfigurationOptimized**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==19043== NVPROF is profiling process 19043, command: ./main test 1.0 1
+==19043== Profiling application: ./main test 1.0 1
+==19043== Profiling result:
+            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
+ GPU activities:   39.69%  65.436ms     10000  6.5430us  1.5680us  25.984us  calcSynapses
+                   24.07%  39.692ms     10000  3.9690us  3.1040us  6.4320us  calcNeurons
+                   20.72%  34.165ms     10000  3.4160us  3.1040us  6.0800us  calcSynapseDynamics
+                   15.42%  25.426ms     10000  2.5420us  2.3680us  6.6880us  learnSynapsesPost
+                    0.06%  96.800us        72  1.3440us     960ns  2.0800us  [CUDA memcpy HtoD]
+                    0.04%  59.552us        21  2.8350us  2.0480us  4.7680us  [CUDA memcpy DtoH]
+      API calls:   51.75%  397.10ms     40000  9.9270us  9.2210us  345.12us  cudaLaunch
+                   44.63%  342.53ms        21  16.311ms  16.914us  340.57ms  cudaHostAlloc
+                    1.75%  13.449ms     40000     336ns     278ns  330.54us  cudaConfigureCall
+                    1.40%  10.778ms     40000     269ns     210ns  335.21us  cudaSetupArgument
+                    0.26%  1.9587ms        97  20.192us     407ns  40.743us  cudaMemcpy
+                    0.13%  990.84us        21  47.182us  13.183us  232.13us  cudaMalloc
+                    0.05%  400.17us        94  4.2570us     154ns  155.67us  cuDeviceGetAttribute
+                    0.01%  113.90us         1  113.90us  113.90us  113.90us  cuDeviceTotalMem
+                    0.00%  36.839us         1  36.839us  36.839us  36.839us  cuDeviceGetName
+                    0.00%  30.866us        21  1.4690us     942ns  6.1960us  cudaGetSymbolAddress
+                    0.00%  9.2900us         1  9.2900us  9.2900us  9.2900us  cudaSetDevice
+                    0.00%  3.2500us         3  1.0830us     238ns  2.4920us  cuDeviceGetCount
+                    0.00%  1.6970us         1  1.6970us  1.6970us  1.6970us  cudaGetDeviceCount
+                    0.00%  1.0870us         2     543ns     238ns     849ns  cuDeviceGet
+
+```
+
+</p></details>
+
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/data/CUBA.pkl b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/data/CUBA.pkl
new file mode 100644
index 00000000..d65d8af9
Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/data/CUBA.pkl differ
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/data/STDP.pkl b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/data/STDP.pkl
new file mode 100644
index 00000000..53cfc7de
Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/data/STDP.pkl differ
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/data/STDPEventDriven.pkl b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/data/STDPEventDriven.pkl
new file mode 100644
index 00000000..bf3ce605
Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/data/STDPEventDriven.pkl differ
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/data/STDPNotEventDriven.pkl b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/data/STDPNotEventDriven.pkl
new file mode 100644
index 00000000..b0a7f42a
Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/data/STDPNotEventDriven.pkl differ
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/git.diff b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/git.diff
new file mode 100644
index 00000000..7737e913
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/git.diff
@@ -0,0 +1,60 @@
+diff --git a/brian2cuda/device.py b/brian2cuda/device.py
+index b4610ee..e032d32 100644
+--- a/brian2cuda/device.py
++++ b/brian2cuda/device.py
+@@ -919,13 +919,13 @@ class CUDAStandaloneDevice(CPPStandaloneDevice):
+             if clock not in all_clocks:
+                 run_lines.append('{net.name}.add(&{clock.name}, NULL, NULL, NULL, NULL);'.format(clock=clock, net=net))
+ 
+-        if self.profile and self.profile != 'blocking':  # self.profile == True
++        if True:#self.profile and self.profile != 'blocking':  # self.profile == True
+             run_lines.append('cudaProfilerStart();')
+         run_lines.append('{net.name}.run({duration!r}, {report_call}, {report_period!r});'.format(net=net,
+                                                                                               duration=float(duration),
+                                                                                               report_call=report_call,
+                                                                                               report_period=float(report_period)))
+-        if self.profile and self.profile != 'blocking':  # self.profile == True
++        if True:#self.profile and self.profile != 'blocking':  # self.profile == True
+             run_lines.append('cudaDeviceSynchronize();')
+             run_lines.append('cudaProfilerStop();')
+         self.main_queue.append(('run_network', (net, run_lines)))
+diff --git a/brian2cuda/tests/features/speed.py b/brian2cuda/tests/features/speed.py
+index 2293533..c093bc9 100644
+--- a/brian2cuda/tests/features/speed.py
++++ b/brian2cuda/tests/features/speed.py
+@@ -558,7 +558,7 @@ class CUBA(SpeedTest):
+     category = "Full examples"
+     name = "CUBA fixed connectivity"
+     tags = ["Neurons", "Synapses"]
+-    n_range = [10, 100, 1000, 10000, 100000, 1000000]
++    n_range = [10, 100, 1000, 10000, 100000, 200000, 500000, 1000000]
+     n_label = 'Num neurons'
+ 
+     # configuration options
+@@ -720,7 +720,7 @@ class STDPNotEventDriven(SpeedTest):
+     category = "Full examples"
+     name = "STDP (not event-driven)"
+     tags = ["Neurons", "Synapses"]
+-    n_range = [10, 100, 1000, 10000, 20000, 50000, 100000]
++    n_range = [10, 100, 1000, 10000, 20000, 50000, 100000, 1000000, 5000000]
+     n_label = 'Num neurons'
+ 
+     # configuration options
+diff --git a/frozen_repos/brian2 b/frozen_repos/brian2
+--- a/frozen_repos/brian2
++++ b/frozen_repos/brian2
+@@ -1 +1 @@
+-Subproject commit fadc6a0aeb90d1b4d343470628457d8561536f67
++Subproject commit fadc6a0aeb90d1b4d343470628457d8561536f67-dirty
+diff --git a/frozen_repos/brian2genn b/frozen_repos/brian2genn
+--- a/frozen_repos/brian2genn
++++ b/frozen_repos/brian2genn
+@@ -1 +1 @@
+-Subproject commit 0553cafeab49ea5403c0230411035df504d4db06
++Subproject commit 0553cafeab49ea5403c0230411035df504d4db06-dirty
+diff --git a/frozen_repos/genn b/frozen_repos/genn
+--- a/frozen_repos/genn
++++ b/frozen_repos/genn
+@@ -1 +1 @@
+-Subproject commit e01c85f18339249558d6e570ae976609dc972846
++Subproject commit e01c85f18339249558d6e570ae976609dc972846-dirty
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/logs/stdout_CUBA_cpp_standalone_1000000.txt b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/logs/stdout_CUBA_cpp_standalone_1000000.txt
new file mode 100644
index 00000000..765e61ed
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/logs/stdout_CUBA_cpp_standalone_1000000.txt
@@ -0,0 +1,2 @@
+Number of synapses: 64000702
+Number of synapses: 15999195
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/logs/stdout_CUBA_cuda_standalone_1000000.txt b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/logs/stdout_CUBA_cuda_standalone_1000000.txt
new file mode 100644
index 00000000..b923caf1
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/logs/stdout_CUBA_cuda_standalone_1000000.txt
@@ -0,0 +1,9 @@
+INFO: setting cudaDevice stuff took 0.347672 seconds
+INFO kernel_neurongroup_group_variable_set_conditional_codeobject
+	977 blocks
+	1024 threads
+	12 registers per block
+	0 bytes statically-allocated shared memory per block
+	0 bytes local memory per thread
+	304 bytes user-allocated constant memory
+INFO connectivity matrix has size 15994612
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/logs/stdout_STDPEventDriven_cpp_standalone_5000000.txt b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/logs/stdout_STDPEventDriven_cpp_standalone_5000000.txt
new file mode 100644
index 00000000..0d502595
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/logs/stdout_STDPEventDriven_cpp_standalone_5000000.txt
@@ -0,0 +1,2 @@
+Number of synapses: 5000000
+Number of synapses: 5000000
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/logs/stdout_STDPEventDriven_cuda_standalone_5000000.txt b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/logs/stdout_STDPEventDriven_cuda_standalone_5000000.txt
new file mode 100644
index 00000000..d310f8f6
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/logs/stdout_STDPEventDriven_cuda_standalone_5000000.txt
@@ -0,0 +1,63 @@
+INFO: setting cudaDevice stuff took 0.343461 seconds
+INFO kernel_synapses_group_variable_set_conditional_codeobject
+	4883 blocks
+	1024 threads
+	8 registers per block
+	0 bytes statically-allocated shared memory per block
+	0 bytes local memory per thread
+	304 bytes user-allocated constant memory
+INFO connectivity matrix has size 5000000
+INFO connectivity matrix has size 5000000
+INFO generating 10000000 rand every 2 clock cycles for poissongroup_thresholder_codeobject
+INFO kernel_neurongroup_stateupdater_codeobject
+	1 blocks
+	768 threads
+	35 registers per block
+	0 bytes statically-allocated shared memory per block
+	0 bytes local memory per thread
+	304 bytes user-allocated constant memory
+	0.750 theoretical occupancy
+INFO kernel_neurongroup_thresholder_codeobject
+	1 blocks
+	1024 threads
+	14 registers per block
+	0 bytes statically-allocated shared memory per block
+	0 bytes local memory per thread
+	304 bytes user-allocated constant memory
+	1.000 theoretical occupancy
+INFO kernel_poissongroup_thresholder_codeobject
+	4883 blocks
+	1024 threads
+	14 registers per block
+	0 bytes statically-allocated shared memory per block
+	0 bytes local memory per thread
+	304 bytes user-allocated constant memory
+	1.000 theoretical occupancy
+INFO kernel_synapses_pre_codeobject
+	15 blocks
+	1024 threads
+	40 registers per block
+	0 bytes statically-allocated shared memory per block
+	8 bytes local memory per thread
+	304 bytes user-allocated constant memory
+	0.500 theoretical occupancy
+INFO kernel_synapses_post_codeobject
+	15 blocks
+	1024 threads
+	34 registers per block
+	0 bytes statically-allocated shared memory per block
+	8 bytes local memory per thread
+	304 bytes user-allocated constant memory
+	0.500 theoretical occupancy
+INFO kernel_neurongroup_resetter_codeobject
+	1 blocks
+	1024 threads
+	14 registers per block
+	0 bytes statically-allocated shared memory per block
+	0 bytes local memory per thread
+	304 bytes user-allocated constant memory
+	1.000 theoretical occupancy
+Number of synapses: 5000000
+Number of synapses: 5000000
+INFO: main_lines took 343.866984 seconds
+INFO: main function took 344.732117 seconds
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/logs/stdout_STDPNotEventDriven_cpp_standalone_5000000.txt b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/logs/stdout_STDPNotEventDriven_cpp_standalone_5000000.txt
new file mode 100644
index 00000000..0d502595
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/logs/stdout_STDPNotEventDriven_cpp_standalone_5000000.txt
@@ -0,0 +1,2 @@
+Number of synapses: 5000000
+Number of synapses: 5000000
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/logs/stdout_STDPNotEventDriven_cuda_standalone_5000000.txt b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/logs/stdout_STDPNotEventDriven_cuda_standalone_5000000.txt
new file mode 100644
index 00000000..4c247b04
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/logs/stdout_STDPNotEventDriven_cuda_standalone_5000000.txt
@@ -0,0 +1,71 @@
+INFO: setting cudaDevice stuff took 0.345858 seconds
+INFO kernel_synapses_group_variable_set_conditional_codeobject
+	4883 blocks
+	1024 threads
+	8 registers per block
+	0 bytes statically-allocated shared memory per block
+	0 bytes local memory per thread
+	304 bytes user-allocated constant memory
+INFO connectivity matrix has size 5000000
+INFO connectivity matrix has size 5000000
+INFO generating 10000000 rand every 2 clock cycles for poissongroup_thresholder_codeobject
+INFO kernel_neurongroup_stateupdater_codeobject
+	1 blocks
+	768 threads
+	35 registers per block
+	0 bytes statically-allocated shared memory per block
+	0 bytes local memory per thread
+	304 bytes user-allocated constant memory
+	0.750 theoretical occupancy
+INFO kernel_synapses_stateupdater_codeobject
+	6511 blocks
+	768 threads
+	35 registers per block
+	0 bytes statically-allocated shared memory per block
+	0 bytes local memory per thread
+	304 bytes user-allocated constant memory
+	0.750 theoretical occupancy
+INFO kernel_neurongroup_thresholder_codeobject
+	1 blocks
+	1024 threads
+	14 registers per block
+	0 bytes statically-allocated shared memory per block
+	0 bytes local memory per thread
+	304 bytes user-allocated constant memory
+	1.000 theoretical occupancy
+INFO kernel_poissongroup_thresholder_codeobject
+	4883 blocks
+	1024 threads
+	14 registers per block
+	0 bytes statically-allocated shared memory per block
+	0 bytes local memory per thread
+	304 bytes user-allocated constant memory
+	1.000 theoretical occupancy
+INFO kernel_synapses_pre_codeobject
+	15 blocks
+	1024 threads
+	28 registers per block
+	0 bytes statically-allocated shared memory per block
+	8 bytes local memory per thread
+	304 bytes user-allocated constant memory
+	1.000 theoretical occupancy
+INFO kernel_synapses_post_codeobject
+	15 blocks
+	1024 threads
+	26 registers per block
+	0 bytes statically-allocated shared memory per block
+	8 bytes local memory per thread
+	304 bytes user-allocated constant memory
+	1.000 theoretical occupancy
+INFO kernel_neurongroup_resetter_codeobject
+	1 blocks
+	1024 threads
+	14 registers per block
+	0 bytes statically-allocated shared memory per block
+	0 bytes local memory per thread
+	304 bytes user-allocated constant memory
+	1.000 theoretical occupancy
+Number of synapses: 5000000
+Number of synapses: 5000000
+INFO: main_lines took 328.762289 seconds
+INFO: main function took 329.622826 seconds
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/logs/stdout_STDP_cpp_standalone_1000000.txt b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/logs/stdout_STDP_cpp_standalone_1000000.txt
new file mode 100644
index 00000000..eb1d6c28
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/logs/stdout_STDP_cpp_standalone_1000000.txt
@@ -0,0 +1,3 @@
+Number of synapses: 1000000
+Number of spikes: 14994297
+Number of synapses: 1000000
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/logs/stdout_STDP_cuda_standalone_1000000.txt b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/logs/stdout_STDP_cuda_standalone_1000000.txt
new file mode 100644
index 00000000..f36afa97
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/logs/stdout_STDP_cuda_standalone_1000000.txt
@@ -0,0 +1,76 @@
+INFO: setting cudaDevice stuff took 0.189500 seconds
+INFO kernel_synapses_group_variable_set_conditional_codeobject
+	977 blocks
+	1024 threads
+	8 registers per block
+	0 bytes statically-allocated shared memory per block
+	0 bytes local memory per thread
+	304 bytes user-allocated constant memory
+INFO connectivity matrix has size 1000000
+INFO connectivity matrix has size 1000000
+INFO generating 13000000 rand every 13 clock cycles for poissongroup_thresholder_codeobject
+INFO kernel_neurongroup_stateupdater_codeobject
+	1 blocks
+	768 threads
+	35 registers per block
+	0 bytes statically-allocated shared memory per block
+	0 bytes local memory per thread
+	304 bytes user-allocated constant memory
+	0.750 theoretical occupancy
+INFO kernel_neurongroup_thresholder_codeobject
+	1 blocks
+	1024 threads
+	14 registers per block
+	0 bytes statically-allocated shared memory per block
+	0 bytes local memory per thread
+	304 bytes user-allocated constant memory
+	1.000 theoretical occupancy
+INFO kernel_poissongroup_thresholder_codeobject
+	977 blocks
+	1024 threads
+	14 registers per block
+	0 bytes statically-allocated shared memory per block
+	0 bytes local memory per thread
+	304 bytes user-allocated constant memory
+	1.000 theoretical occupancy
+INFO kernel_spikemonitor_codeobject
+	1 blocks
+	1 threads
+	30 registers per block
+	0 bytes statically-allocated shared memory per block
+	16 bytes local memory per thread
+	304 bytes user-allocated constant memory
+	0.000 theoretical occupancy
+INFO kernel_synapses_pre_codeobject
+	15 blocks
+	1024 threads
+	40 registers per block
+	0 bytes statically-allocated shared memory per block
+	8 bytes local memory per thread
+	304 bytes user-allocated constant memory
+	0.500 theoretical occupancy
+INFO kernel_synapses_post_codeobject
+	15 blocks
+	1024 threads
+	34 registers per block
+	0 bytes statically-allocated shared memory per block
+	8 bytes local memory per thread
+	304 bytes user-allocated constant memory
+	0.500 theoretical occupancy
+INFO kernel_neurongroup_resetter_codeobject
+	1 blocks
+	1024 threads
+	14 registers per block
+	0 bytes statically-allocated shared memory per block
+	0 bytes local memory per thread
+	304 bytes user-allocated constant memory
+	1.000 theoretical occupancy
+...
+ERROR while allocating 33554428 bytes in cudaVector.h/reserve()
+ERROR while allocating 67108856 bytes in cudaVector.h/reserve()
+...
+Number of synapses: 1000000
+Number of synapses: 1000000
+INFO: main_lines took 2374.681343 seconds
+Number of spikes: 4194303
+INFO: main function took 2382.821595 seconds
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/nvprof/nvprof_CUBA_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/nvprof/nvprof_CUBA_CUDAStandaloneConfiguration_1000.log
new file mode 100644
index 00000000..3ee9ab79
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/nvprof/nvprof_CUBA_CUDAStandaloneConfiguration_1000.log
@@ -0,0 +1,20 @@
+==9905== NVPROF is profiling process 9905, command: ./main
+==9905== Profiling application: ./main
+==9905== Profiling result:
+            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
+ GPU activities:   27.60%  60.179ms     10000  6.0170us  5.8560us  7.0720us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double, double*, double*, double*, bool*)
+                   23.08%  50.318ms     10000  5.0310us  3.2960us  23.232us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double*, double, int*, int, int*, bool*)
+                   21.95%  47.850ms     10000  4.7840us  3.2960us  19.968us  kernel_synapses_1_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, int*, double*, bool*)
+                   11.42%  24.905ms     10000  2.4900us  2.2720us  2.8800us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*)
+                    8.26%  18.018ms     10000  1.8010us  1.6640us  2.1120us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*, bool*)
+                    7.68%  16.743ms     10000  1.6740us  1.5360us  2.0800us  _GLOBAL__N__69_tmpxft_000024a7_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_97ebdcc0::_reset_neurongroup_thresholder_codeobject(int*)
+      API calls:   85.15%  643.72ms     60000  10.728us  9.3820us  9.0568ms  cudaLaunch
+                   11.40%  86.186ms    520000     165ns     134ns  363.54us  cudaSetupArgument
+                    1.93%  14.574ms     60000     242ns     182ns  349.11us  cudaConfigureCall
+                    1.50%  11.304ms     50000     226ns     194ns  14.049us  cudaGetLastError
+                    0.02%  134.84us         1  134.84us  134.84us  134.84us  cudaMemGetInfo
+                    0.00%  31.105us         8  3.8880us  3.0120us  5.6890us  cudaFuncGetAttributes
+                    0.00%  30.378us        39     778ns     653ns  1.8930us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+                    0.00%  13.359us         1  13.359us  13.359us  13.359us  cudaDeviceSynchronize
+                    0.00%  6.2530us        12     521ns     334ns  1.3690us  cudaDeviceGetAttribute
+                    0.00%  3.6520us         3  1.2170us     789ns  1.6000us  cudaGetDevice
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/nvprof/nvprof_CUBA_GeNNConfigurationOptimized_1000.log b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/nvprof/nvprof_CUBA_GeNNConfigurationOptimized_1000.log
new file mode 100644
index 00000000..9add1dc1
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/nvprof/nvprof_CUBA_GeNNConfigurationOptimized_1000.log
@@ -0,0 +1,22 @@
+==10402== NVPROF is profiling process 10402, command: ./main test 1.0 1
+==10402== Profiling application: ./main test 1.0 1
+==10402== Profiling result:
+            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
+ GPU activities:   63.74%  78.512ms     10000  7.8510us  7.0080us  10.336us  calcNeurons
+                   35.42%  43.636ms     10000  4.3630us  1.9840us  20.512us  calcSynapses
+                    0.65%  799.04us        56  14.268us     960ns  163.46us  [CUDA memcpy HtoD]
+                    0.19%  234.66us        13  18.050us  1.9840us  155.30us  [CUDA memcpy DtoH]
+      API calls:   67.87%  468.06ms        16  29.253ms  15.634us  464.71ms  cudaHostAlloc
+                   29.73%  204.98ms     20000  10.248us  9.4610us  337.99us  cudaLaunch
+                    1.01%  6.9362ms     20000     346ns     275ns  331.07us  cudaConfigureCall
+                    0.81%  5.6041ms     20000     280ns     221ns  329.96us  cudaSetupArgument
+                    0.31%  2.1674ms        73  29.690us     512ns  179.18us  cudaMemcpy
+                    0.18%  1.2374ms        16  77.339us  9.8610us  230.18us  cudaMalloc
+                    0.06%  398.46us        94  4.2380us     154ns  155.40us  cuDeviceGetAttribute
+                    0.02%  118.62us         1  118.62us  118.62us  118.62us  cuDeviceTotalMem
+                    0.01%  48.855us         1  48.855us  48.855us  48.855us  cuDeviceGetName
+                    0.00%  22.545us        16  1.4090us     582ns  3.4920us  cudaGetSymbolAddress
+                    0.00%  9.5420us         1  9.5420us  9.5420us  9.5420us  cudaSetDevice
+                    0.00%  3.6290us         3  1.2090us     200ns  2.4090us  cuDeviceGetCount
+                    0.00%  1.5380us         1  1.5380us  1.5380us  1.5380us  cudaGetDeviceCount
+                    0.00%  1.1220us         2     561ns     362ns     760ns  cuDeviceGet
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/nvprof/nvprof_STDPEventDriven_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/nvprof/nvprof_STDPEventDriven_CUDAStandaloneConfiguration_1000.log
new file mode 100644
index 00000000..d67458c4
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/nvprof/nvprof_STDPEventDriven_CUDAStandaloneConfiguration_1000.log
@@ -0,0 +1,24 @@
+==19561== NVPROF is profiling process 19561, command: ./main
+==19561== Profiling application: ./main
+==19561== Profiling result:
+            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
+ GPU activities:   33.06%  85.737ms     10000  8.5730us  3.3600us  26.176us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, double*, double, double*, int, int*, int, int*, int)
+                   16.85%  43.713ms     10000  4.3710us  3.8720us  6.4320us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*)
+                   13.67%  35.462ms     10000  3.5460us  3.4560us  7.1040us  kernel_synapses_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, double, double*, int, int*, int*, int)
+                    9.83%  25.505ms     10000  2.5500us  2.2400us  2.8480us  kernel_poissongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*, double*, float*)
+                    7.03%  18.243ms     10000  1.8240us  1.7600us  2.5920us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*)
+                    7.01%  18.182ms     10000  1.8180us  1.6960us  2.2080us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*)
+                    6.41%  16.614ms     10000  1.6610us  1.5360us  1.9520us  _GLOBAL__N__70_tmpxft_00004a64_00000000_6_poissongroup_thresholder_codeobject_cpp1_ii_7314966e::_reset_poissongroup_thresholder_codeobject(int*)
+                    6.01%  15.583ms     10000  1.5580us  1.4720us  1.7280us  _GLOBAL__N__69_tmpxft_00004a60_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_c0b8948b::_reset_neurongroup_thresholder_codeobject(int*)
+                    0.13%  330.21us         1  330.21us  330.21us  330.21us  void gen_sequenced<curandStateXORWOW, float, int, __operator_&__(float curand_uniform_noargs<curandStateXORWOW>(curandStateXORWOW*, int))>(curandStateXORWOW*, float*, unsigned long, unsigned long, int)
+      API calls:   84.19%  838.94ms     80001  10.486us  9.1520us  8.9317ms  cudaLaunch
+                   11.76%  117.21ms    580005     202ns     157ns  419.79us  cudaSetupArgument
+                    2.27%  22.642ms     80001     283ns     205ns  337.22us  cudaConfigureCall
+                    1.74%  17.290ms     60002     288ns     220ns  371.39us  cudaGetLastError
+                    0.02%  198.76us         1  198.76us  198.76us  198.76us  cudaMalloc
+                    0.01%  139.83us         1  139.83us  139.83us  139.83us  cudaMemGetInfo
+                    0.00%  37.555us        10  3.7550us  3.0440us  6.0110us  cudaFuncGetAttributes
+                    0.00%  31.926us        41     778ns     680ns  1.6620us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+                    0.00%  13.490us         1  13.490us  13.490us  13.490us  cudaDeviceSynchronize
+                    0.00%  8.0030us        16     500ns     369ns  1.0430us  cudaDeviceGetAttribute
+                    0.00%  4.0740us         4  1.0180us     792ns  1.5710us  cudaGetDevice
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/nvprof/nvprof_STDPEventDriven_GeNNConfigurationOptimized_1000.log b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/nvprof/nvprof_STDPEventDriven_GeNNConfigurationOptimized_1000.log
new file mode 100644
index 00000000..e03d86b4
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/nvprof/nvprof_STDPEventDriven_GeNNConfigurationOptimized_1000.log
@@ -0,0 +1,23 @@
+==20069== NVPROF is profiling process 20069, command: ./main test 1.0 1
+==20069== Profiling application: ./main test 1.0 1
+==20069== Profiling result:
+            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
+ GPU activities:   61.14%  103.55ms     10000  10.355us  1.5680us  49.760us  calcSynapses
+                   24.06%  40.741ms     10000  4.0740us  3.0720us  6.6880us  calcNeurons
+                   14.71%  24.918ms     10000  2.4910us  2.3360us  10.560us  learnSynapsesPost
+                    0.06%  94.593us        70  1.3510us     960ns  2.0800us  [CUDA memcpy HtoD]
+                    0.03%  55.553us        19  2.9230us  2.0160us  4.8010us  [CUDA memcpy DtoH]
+      API calls:   56.62%  434.02ms        20  21.701ms  16.555us  432.05ms  cudaHostAlloc
+                   40.47%  310.26ms     30000  10.341us  9.4140us  347.35us  cudaLaunch
+                    1.37%  10.508ms     30000     350ns     275ns  330.81us  cudaConfigureCall
+                    1.08%  8.2824ms     30000     276ns     221ns  333.57us  cudaSetupArgument
+                    0.25%  1.9098ms        95  20.103us     434ns  41.200us  cudaMemcpy
+                    0.13%  998.34us        20  49.917us  13.252us  259.26us  cudaMalloc
+                    0.05%  419.41us        94  4.4610us     183ns  162.60us  cuDeviceGetAttribute
+                    0.02%  126.30us         1  126.30us  126.30us  126.30us  cuDeviceTotalMem
+                    0.00%  38.221us         1  38.221us  38.221us  38.221us  cuDeviceGetName
+                    0.00%  29.710us        20  1.4850us     972ns  6.2560us  cudaGetSymbolAddress
+                    0.00%  9.9350us         1  9.9350us  9.9350us  9.9350us  cudaSetDevice
+                    0.00%  3.4560us         3  1.1520us     236ns  2.5630us  cuDeviceGetCount
+                    0.00%  1.7080us         1  1.7080us  1.7080us  1.7080us  cudaGetDeviceCount
+                    0.00%  1.2540us         2     627ns     267ns     987ns  cuDeviceGet
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/nvprof/nvprof_STDPNotEventDriven_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/nvprof/nvprof_STDPNotEventDriven_CUDAStandaloneConfiguration_1000.log
new file mode 100644
index 00000000..75f290df
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/nvprof/nvprof_STDPNotEventDriven_CUDAStandaloneConfiguration_1000.log
@@ -0,0 +1,25 @@
+==18533== NVPROF is profiling process 18533, command: ./main
+==18533== Profiling application: ./main
+==18533== Profiling result:
+            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
+ GPU activities:   27.01%  73.513ms     10000  7.3510us  3.2960us  29.120us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, int, double*, double, double*, int, int*, int, int*)
+                   16.08%  43.771ms     10000  4.3770us  3.9040us  6.3360us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*)
+                   12.83%  34.925ms     10000  3.4920us  3.3920us  6.4000us  kernel_synapses_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, int, double, double*, int, int*)
+                   10.01%  27.253ms     10000  2.7250us  2.6240us  3.2000us  kernel_synapses_stateupdater_codeobject(unsigned int, unsigned int, double*, int, double*, int, double*, int*)
+                    9.18%  24.982ms     10000  2.4980us  2.2080us  2.6880us  kernel_poissongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*, double*, float*)
+                    6.70%  18.244ms     10000  1.8240us  1.7280us  2.0800us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*)
+                    6.70%  18.236ms     10000  1.8230us  1.7280us  2.6240us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*)
+                    5.76%  15.677ms     10000  1.5670us  1.4720us  1.6960us  _GLOBAL__N__69_tmpxft_00004642_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_c0b8948b::_reset_neurongroup_thresholder_codeobject(int*)
+                    5.59%  15.219ms     10000  1.5210us  1.4400us  1.9520us  _GLOBAL__N__70_tmpxft_00004643_00000000_6_poissongroup_thresholder_codeobject_cpp1_ii_7314966e::_reset_poissongroup_thresholder_codeobject(int*)
+                    0.12%  330.63us         1  330.63us  330.63us  330.63us  void gen_sequenced<curandStateXORWOW, float, int, __operator_&__(float curand_uniform_noargs<curandStateXORWOW>(curandStateXORWOW*, int))>(curandStateXORWOW*, float*, unsigned long, unsigned long, int)
+      API calls:   85.78%  968.25ms     90001  10.758us  9.3110us  9.2828ms  cudaLaunch
+                   10.41%  117.51ms    660005     178ns     137ns  367.56us  cudaSetupArgument
+                    2.19%  24.694ms     90001     274ns     200ns  349.17us  cudaConfigureCall
+                    1.59%  17.897ms     70002     255ns     203ns  333.24us  cudaGetLastError
+                    0.02%  201.51us         1  201.51us  201.51us  201.51us  cudaMalloc
+                    0.01%  131.77us         1  131.77us  131.77us  131.77us  cudaMemGetInfo
+                    0.00%  51.691us        74     698ns     591ns  1.8080us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+                    0.00%  47.398us        12  3.9490us  3.1850us  6.2570us  cudaFuncGetAttributes
+                    0.00%  13.229us         1  13.229us  13.229us  13.229us  cudaDeviceSynchronize
+                    0.00%  9.0230us        20     451ns     339ns     845ns  cudaDeviceGetAttribute
+                    0.00%  4.9330us         5     986ns     852ns  1.4630us  cudaGetDevice
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/nvprof/nvprof_STDPNotEventDriven_GeNNConfigurationOptimized_1000.log b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/nvprof/nvprof_STDPNotEventDriven_GeNNConfigurationOptimized_1000.log
new file mode 100644
index 00000000..aa874e31
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/nvprof/nvprof_STDPNotEventDriven_GeNNConfigurationOptimized_1000.log
@@ -0,0 +1,24 @@
+==19043== NVPROF is profiling process 19043, command: ./main test 1.0 1
+==19043== Profiling application: ./main test 1.0 1
+==19043== Profiling result:
+            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
+ GPU activities:   39.69%  65.436ms     10000  6.5430us  1.5680us  25.984us  calcSynapses
+                   24.07%  39.692ms     10000  3.9690us  3.1040us  6.4320us  calcNeurons
+                   20.72%  34.165ms     10000  3.4160us  3.1040us  6.0800us  calcSynapseDynamics
+                   15.42%  25.426ms     10000  2.5420us  2.3680us  6.6880us  learnSynapsesPost
+                    0.06%  96.800us        72  1.3440us     960ns  2.0800us  [CUDA memcpy HtoD]
+                    0.04%  59.552us        21  2.8350us  2.0480us  4.7680us  [CUDA memcpy DtoH]
+      API calls:   51.75%  397.10ms     40000  9.9270us  9.2210us  345.12us  cudaLaunch
+                   44.63%  342.53ms        21  16.311ms  16.914us  340.57ms  cudaHostAlloc
+                    1.75%  13.449ms     40000     336ns     278ns  330.54us  cudaConfigureCall
+                    1.40%  10.778ms     40000     269ns     210ns  335.21us  cudaSetupArgument
+                    0.26%  1.9587ms        97  20.192us     407ns  40.743us  cudaMemcpy
+                    0.13%  990.84us        21  47.182us  13.183us  232.13us  cudaMalloc
+                    0.05%  400.17us        94  4.2570us     154ns  155.67us  cuDeviceGetAttribute
+                    0.01%  113.90us         1  113.90us  113.90us  113.90us  cuDeviceTotalMem
+                    0.00%  36.839us         1  36.839us  36.839us  36.839us  cuDeviceGetName
+                    0.00%  30.866us        21  1.4690us     942ns  6.1960us  cudaGetSymbolAddress
+                    0.00%  9.2900us         1  9.2900us  9.2900us  9.2900us  cudaSetDevice
+                    0.00%  3.2500us         3  1.0830us     238ns  2.4920us  cuDeviceGetCount
+                    0.00%  1.6970us         1  1.6970us  1.6970us  1.6970us  cudaGetDeviceCount
+                    0.00%  1.0870us         2     543ns     238ns     849ns  cuDeviceGet
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/nvprof/nvprof_STDP_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/nvprof/nvprof_STDP_CUDAStandaloneConfiguration_1000.log
new file mode 100644
index 00000000..69fa92bb
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/nvprof/nvprof_STDP_CUDAStandaloneConfiguration_1000.log
@@ -0,0 +1,26 @@
+==8893== NVPROF is profiling process 8893, command: ./main
+==8893== Profiling application: ./main
+==8893== Profiling result:
+            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
+ GPU activities:   32.18%  119.34ms     10000  11.934us  1.6000us  26.926ms  kernel_spikemonitor_codeobject(unsigned int, int*, double, int*, int*, int*, int, int*, double*, int, int*, int*)
+                   20.94%  77.684ms     10000  7.7680us  3.3600us  25.728us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, double*, double, double*, int, int*, int, int*, int)
+                   11.71%  43.439ms     10000  4.3430us  3.8400us  6.0800us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*)
+                    9.85%  36.549ms     10000  3.6540us  3.5520us  7.0080us  kernel_synapses_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, double, double*, int, int*, int*, int)
+                    6.79%  25.173ms     10000  2.5170us  2.1760us  3.6800us  kernel_poissongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*, double*, float*)
+                    4.91%  18.216ms     10000  1.8210us  1.7280us  4.3200us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*)
+                    4.78%  17.745ms     10000  1.7740us  1.5680us  3.6800us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*)
+                    4.51%  16.723ms     10000  1.6720us  1.6000us  3.2640us  _GLOBAL__N__70_tmpxft_00002089_00000000_6_poissongroup_thresholder_codeobject_cpp1_ii_7314966e::_reset_poissongroup_thresholder_codeobject(int*)
+                    4.22%  15.645ms     10000  1.5640us  1.3760us  3.5200us  _GLOBAL__N__69_tmpxft_00002086_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_c0b8948b::_reset_neurongroup_thresholder_codeobject(int*)
+                    0.09%  330.21us         1  330.21us  330.21us  330.21us  void gen_sequenced<curandStateXORWOW, float, int, __operator_&__(float curand_uniform_noargs<curandStateXORWOW>(curandStateXORWOW*, int))>(curandStateXORWOW*, float*, unsigned long, unsigned long, int)
+                    0.02%  68.192us         1  68.192us  68.192us  68.192us  _run_spikemonitor_codeobject_init(void)
+      API calls:   85.86%  977.23ms     90002  10.857us  9.1510us  10.887ms  cudaLaunch
+                   10.27%  116.88ms    700005     166ns     137ns  331.60us  cudaSetupArgument
+                    2.12%  24.161ms     90002     268ns     176ns  335.73us  cudaConfigureCall
+                    1.71%  19.473ms     70003     278ns     205ns  329.44us  cudaGetLastError
+                    0.02%  213.58us         1  213.58us  213.58us  213.58us  cudaMalloc
+                    0.01%  133.47us         1  133.47us  133.47us  133.47us  cudaMemGetInfo
+                    0.00%  43.181us        11  3.9250us  3.2280us  6.4450us  cudaFuncGetAttributes
+                    0.00%  32.048us        42     763ns     627ns  1.7760us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+                    0.00%  13.358us         1  13.358us  13.358us  13.358us  cudaDeviceSynchronize
+                    0.00%  7.7520us        16     484ns     356ns  1.1240us  cudaDeviceGetAttribute
+                    0.00%  4.0510us         4  1.0120us     822ns  1.5030us  cudaGetDevice
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/nvprof/nvprof_STDP_GeNNConfigurationOptimized_1000.log b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/nvprof/nvprof_STDP_GeNNConfigurationOptimized_1000.log
new file mode 100644
index 00000000..222cc004
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/nvprof/nvprof_STDP_GeNNConfigurationOptimized_1000.log
@@ -0,0 +1,23 @@
+==9420== NVPROF is profiling process 9420, command: ./main test 1.0 1
+==9420== Profiling application: ./main test 1.0 1
+==9420== Profiling result:
+            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
+ GPU activities:   50.20%  103.62ms     10000  10.362us  1.5680us  45.248us  calcSynapses
+                   19.97%  41.214ms     10000  4.1210us  3.1040us  6.7200us  calcNeurons
+                   17.73%  36.597ms     17812  2.0540us  2.0160us  4.7360us  [CUDA memcpy DtoH]
+                   12.06%  24.885ms     10000  2.4880us  2.3680us  10.848us  learnSynapsesPost
+                    0.05%  94.016us        70  1.3430us     960ns  2.0480us  [CUDA memcpy HtoD]
+      API calls:   34.18%  358.40ms        20  17.920ms  8.3270us  356.55ms  cudaHostAlloc
+                   32.17%  337.26ms     30000  11.241us  9.5510us  356.07us  cudaLaunch
+                   31.72%  332.57ms     20095  16.549us     231ns  988.56us  cudaMemcpy
+                    1.03%  10.770ms     30000     358ns     283ns  331.73us  cudaConfigureCall
+                    0.77%  8.0617ms     30000     268ns     208ns  334.35us  cudaSetupArgument
+                    0.08%  809.75us        20  40.487us  8.0280us  232.94us  cudaMalloc
+                    0.04%  401.78us        94  4.2740us     161ns  156.02us  cuDeviceGetAttribute
+                    0.01%  113.16us         1  113.16us  113.16us  113.16us  cuDeviceTotalMem
+                    0.00%  37.103us         1  37.103us  37.103us  37.103us  cuDeviceGetName
+                    0.00%  22.451us        20  1.1220us     525ns  5.8000us  cudaGetSymbolAddress
+                    0.00%  9.5720us         1  9.5720us  9.5720us  9.5720us  cudaSetDevice
+                    0.00%  3.2610us         3  1.0870us     219ns  2.3710us  cuDeviceGetCount
+                    0.00%  1.6100us         1  1.6100us  1.6100us  1.6100us  cudaGetDeviceCount
+                    0.00%  1.0470us         2     523ns     250ns     797ns  cuDeviceGet
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_CUBA_absolute.png b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_CUBA_absolute.png
new file mode 100644
index 00000000..16624531
Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_CUBA_absolute.png differ
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_CUBA_profiling.png b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_CUBA_profiling.png
new file mode 100644
index 00000000..f5bf2a7b
Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_CUBA_profiling.png differ
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_CUBA_relative.png b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_CUBA_relative.png
new file mode 100644
index 00000000..8e3fd355
Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_CUBA_relative.png differ
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_STDPEventDriven_absolute.png b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_STDPEventDriven_absolute.png
new file mode 100644
index 00000000..5c7b42a0
Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_STDPEventDriven_absolute.png differ
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_STDPEventDriven_profiling.png b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_STDPEventDriven_profiling.png
new file mode 100644
index 00000000..f15e8c3d
Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_STDPEventDriven_profiling.png differ
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_STDPEventDriven_relative.png b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_STDPEventDriven_relative.png
new file mode 100644
index 00000000..a5144e31
Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_STDPEventDriven_relative.png differ
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_STDPNotEventDriven_absolute.png b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_STDPNotEventDriven_absolute.png
new file mode 100644
index 00000000..585942c0
Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_STDPNotEventDriven_absolute.png differ
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_STDPNotEventDriven_profiling.png b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_STDPNotEventDriven_profiling.png
new file mode 100644
index 00000000..f411ff6d
Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_STDPNotEventDriven_profiling.png differ
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_STDPNotEventDriven_relative.png b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_STDPNotEventDriven_relative.png
new file mode 100644
index 00000000..cf91a072
Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_STDPNotEventDriven_relative.png differ
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_STDP_absolute.png b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_STDP_absolute.png
new file mode 100644
index 00000000..4f20032c
Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_STDP_absolute.png differ
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_STDP_profiling.png b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_STDP_profiling.png
new file mode 100644
index 00000000..c4aad905
Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_STDP_profiling.png differ
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_STDP_relative.png b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_STDP_relative.png
new file mode 100644
index 00000000..7b88c45e
Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/plots/speed_test_STDP_relative.png differ
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/run_speed_test_script.py b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/run_speed_test_script.py
new file mode 100644
index 00000000..beeb8f92
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp/run_speed_test_script.py
@@ -0,0 +1,291 @@
+import os
+import shutil
+import glob
+import subprocess
+import sys
+import socket
+
+# run tests without X-server
+import matplotlib
+matplotlib.use('Agg')
+
+# pretty plots
+import seaborn
+
+import time
+import datetime
+import cPickle as pickle
+
+from brian2 import *
+from brian2.tests.features import *
+from brian2.tests.features.base import *
+from brian2.tests.features.base import results
+
+import brian2cuda
+from brian2cuda.tests.features.cuda_configuration import (CUDAStandaloneConfiguration,
+                                                          CUDAStandaloneConfigurationNoAssert,
+                                                          CUDAStandaloneConfigurationExtraThresholdKernel,
+                                                          CUDAStandaloneConfigurationCurandDouble,
+                                                          CUDAStandaloneConfigurationNoCudaOccupancyAPI,
+                                                          CUDAStandaloneConfigurationNoCudaOccupancyAPIProfileCPU,
+                                                          CUDAStandaloneConfiguration2BlocksPerSM,
+                                                          CUDAStandaloneConfiguration2BlocksPerSMLaunchBounds,
+                                                          CUDAStandaloneConfigurationSynLaunchBounds,
+                                                          CUDAStandaloneConfiguration2BlocksPerSMSynLaunchBounds,
+                                                          CUDAStandaloneConfigurationProfileGPU,
+                                                          CUDAStandaloneConfigurationProfileCPU)
+                                                          #CUDAStandaloneConfigurationTestBrunelHeteroAtomics,
+                                                          #CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU,
+                                                          #CUDAStandaloneConfigurationPushAtomicResize,
+                                                          #CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize,
+                                                          #CUDAStandaloneConfigurationPushAtomicResizeProfileCPU,
+                                                          #CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResizeProfileCPU,
+                                                          #CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpy,
+                                                          #CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpyProfileCPU)
+from brian2cuda.tests.features.speed import *
+
+from brian2genn.correctness_testing import GeNNConfiguration, GeNNConfigurationCPU, GeNNConfigurationOptimized
+
+from create_readme import create_readme
+
+assert len(sys.argv)<= 2, 'Only one command line argument supported! Got {}'.format(len(sys.argv)-1)
+if len(sys.argv) == 2:
+    additional_dir_name = '_' + sys.argv[1]
+else:
+    additional_dir_name = ''
+
+prefs['devices.cpp_standalone.extra_make_args_unix'] = ['-j12']
+
+# host specific settings
+if socket.gethostname() == 'elnath':
+    prefs['devices.cpp_standalone.extra_make_args_unix'] = ['-j24']
+    prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35')
+    prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_20'])
+
+configs = [# configuration                          project_directory
+          #(NumpyConfiguration,                     None),
+          #(WeaveConfiguration,                     None),
+          #(LocalConfiguration,                     None),
+          (CPPStandaloneConfiguration,              'cpp_standalone'),
+          (CPPStandaloneConfigurationOpenMP,        'cpp_standalone'),
+          (CUDAStandaloneConfiguration,             'cuda_standalone'),
+          #(CUDAStandaloneConfigurationPushAtomicResize,   'cuda_standalone'),
+          #(CUDAStandaloneConfigurationTestBrunelHeteroAtomics,   'cuda_standalone'),
+          #(CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize,     'cuda_standalone'),
+          #(CUDAStandaloneConfigurationExtraThresholdKernel,             'cuda_standalone'),
+          #(CUDAStandaloneConfigurationNoAssert,             'cuda_standalone'),
+          #(CUDAStandaloneConfigurationCurandDouble,              'cuda_standalone'),
+          #(CUDAStandaloneConfigurationNoCudaOccupancyAPI,      'cuda_standalone'),
+          #(CUDAStandaloneConfigurationNoCudaOccupancyAPIProfileCPU,    'cuda_standalone'),
+          #(CUDAStandaloneConfiguration2BlocksPerSM, 'cuda_standalone'),
+          #(CUDAStandaloneConfiguration2BlocksPerSMLaunchBounds, 'cuda_standalone'),
+          #(CUDAStandaloneConfigurationSynLaunchBounds,     'cuda_standalone'),
+          #(CUDAStandaloneConfiguration2BlocksPerSMSynLaunchBounds, 'cuda_standalone'),
+          #(CUDAStandaloneConfigurationProfileGPU,   'cuda_standalone'),
+          #(CUDAStandaloneConfigurationProfileCPU,   'cuda_standalone'),
+          #(CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU,   'cuda_standalone'),
+          #(CUDAStandaloneConfigurationPushAtomicResizeProfileCPU,   'cuda_standalone'),
+          #(CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResizeProfileCPU,     'cuda_standalone'),
+          #(CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpy,   'cuda_standalone'),
+          #(CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpyProfileCPU,   'cuda_standalone'),
+          #(GeNNConfiguration,                       'GeNNworkspace'),
+          #(GeNNConfigurationCPU,                    'GeNNworkspace'),
+          (GeNNConfigurationOptimized,              'GeNNworkspace')
+          ]  # each entry: (configuration class, project directory name or None); comment entries in/out to select
+
+speed_tests = [# feature_test                     name                                  n_slice
+
+               #(ThresholderOnlyPoissonLowRate,                  'ThresholderOnlyPoissonLowRate',                slice(None)         ),
+               #(ThresholderOnlyPoissonMediumRate,               'ThresholderOnlyPoissonMediumRate',             slice(None)         ),
+               #(ThresholderOnlyPoissonHighRate,                 'ThresholderOnlyPoissonHighRate',               slice(None)         ),
+               #(ThresholderOnlyAlwaysSpiking,                   'ThresholderOnlyAlwaysSpiking',                 slice(None)         ),
+
+               #(BrunelHakimStateupdateOnlyDouble,               'BrunelHakimStateupdateOnlyDouble',             slice(None)         ),
+               #(BrunelHakimStateupdateOnlyTriple,               'BrunelHakimStateupdateOnlyTriple',             slice(None)         ),
+               #(BrunelHakimStateupdateOnly,                     'BrunelHakimStateupdateOnly',                   slice(None)         ),
+               #(BrunelHakimNeuronsOnly,                         'BrunelHakimNeuronsOnly',                       slice(None)         ),
+               #(BrunelHakimNeuronsOnlyNoXi,                     'BrunelHakimNeuronsOnlyNoXi',                   slice(None)         ),
+               #(BrunelHakimNeuronsOnlyNoRand,                   'BrunelHakimNeuronsOnlyNoRand',                 slice(None)         ),
+               #(BrunelHakimStateupdateThresholdOnly,            'BrunelHakimStateupdateThresholdOnly',          slice(None)         ),
+               #(BrunelHakimStateupdateThresholdResetOnly,       'BrunelHakimStateupdateThresholdResetOnly',     slice(None)         ),
+               #(BrunelHakimModelScalarDelayShort,               'BrunelHakimModelScalarDelayShort',             slice(None)         ),
+               #(BrunelHakimModelScalarDelayNoSelfConnections,   'BrunelHakimModelScalarDelayNoSelfConnections', slice(None)         ),
+               #(COBAHH,                                         'COBAHH',                                       slice(None)         ),
+               #(AdaptationOscillation,                          'AdaptationOscillation',                        slice(None)         ),
+               #(Vogels,                                         'Vogels',                                       slice(None)         ),
+               (STDP,                                           'STDP',                                         slice(None)         ),
+               (STDPEventDriven,                                'STDPEventDriven',                              slice(None)         ),
+               (CUBA,                                           'CUBA',                                         slice(None)         ),
+               #(BrunelHakimModelScalarDelay,                    'BrunelHakimModelScalarDelay',                  slice(None)         ),
+
+               #(VerySparseMediumRateSynapsesOnly,               'VerySparseMediumRateSynapsesOnly',             slice(None)         ),
+               #(SparseMediumRateSynapsesOnly,                   'SparseMediumRateSynapsesOnly',                 slice(None)         ),
+               #(DenseMediumRateSynapsesOnly,                    'DenseMediumRateSynapsesOnly',                  slice(None)         ),
+               #(SparseLowRateSynapsesOnly,                      'SparseLowRateSynapsesOnly',                    slice(None)         ),
+               #(SparseHighRateSynapsesOnly,                     'SparseHighRateSynapsesOnly',                   slice(None)         ),
+
+               #(STDPNotEventDriven,                             'STDPNotEventDriven',                           slice(None)         ),
+               #(STDPMultiPost,                                  'STDPMultiPost',                                slice(None)         ),
+               #(STDPNeuronalTraces,                             'STDPNeuronalTraces',                           slice(None)         ),
+               #(STDPMultiPostNeuronalTraces,                    'STDPMultiPostNeuronalTraces',                  slice(None)         ),
+
+               #(BrunelHakimModelHeterogeneousDelay,             'BrunelHakimModelHeterogeneousDelay',           slice(0,-1,1)         ),
+
+               #(LinearNeuronsOnly,                              'LinearNeuronsOnly',                            slice(None)         ),
+               #(HHNeuronsOnly,                                  'HHNeuronsOnly',                                slice(None)         ),
+               #(VogelsWithSynapticDynamic,                      'VogelsWithSynapticDynamic',                    slice(None)         ),
+
+               ### below uses monitors
+               #(CUBAFixedConnectivity,                          'CUBAFixedConnectivity',                        slice(None)         ),
+               #(COBAHHFixedConnectivity,                        'COBAHHFixedConnectivity',                      slice(None, -1)     ),
+]  # each entry: (speed test class, name used in messages and output file names, slice applied to the test's n_range)
+
+configurations = [config[0] for config in configs]
+project_dirs = [config[1] for config in configs]
+
+# check if multiple Configurations with same project_dirs are specified
+last_idx = {}
+for proj_dir in project_dirs:
+    if proj_dir is not None:
+        first_i = project_dirs.index(proj_dir)
+        last_i = len(project_dirs) - 1 - project_dirs[::-1].index(proj_dir)
+        if first_i != last_i:
+            print("WARNING there are multiple configurations using {d} as project "
+                  "directory. Profiling and logfiles will only be saved for the last one {c}.".format(
+                  d=proj_dir, c=configurations[last_i].__name__))
+        last_idx[proj_dir] = last_i
+
+time_stemp = time.time()
+date_str = datetime.datetime.fromtimestamp(time_stemp).strftime('%Y_%m_%d')
+
+directory = 'results_{}{}'.format(date_str, additional_dir_name)
+if os.path.exists(directory):
+    new_dir = directory + '_bak_' + str(int(time.time()))
+    print("Directory with name `{}` already exists. Renaming it to `{}`.".format(directory, new_dir))
+    os.rename(directory, new_dir)
+os.makedirs(directory)
+data_dir = os.path.join(directory, 'data')
+plot_dir = os.path.join(directory, 'plots')
+log_dir = os.path.join(directory, 'logs')
+prof_dir = os.path.join(directory, 'nvprof')
+os.makedirs(data_dir)
+os.makedirs(plot_dir)
+os.makedirs(log_dir)
+os.makedirs(prof_dir)
+print("Saving results in {}.".format(plot_dir))
+
+shutil.copy(os.path.realpath(__file__), os.path.join(directory, 'run_speed_test_script.py'))
+
+time_format = '%d.%m.%Y at %H:%M:%S'
+script_start = datetime.datetime.fromtimestamp(time.time()).strftime(time_format)
+
+with open(os.path.join(directory, 'git.diff'), 'w') as diff_file:
+    subprocess.call(['git', 'diff'], stdout=diff_file)
+
+try:
+    for n, (st, name, sl) in enumerate(speed_tests):
+        start = datetime.datetime.fromtimestamp(time.time()).strftime(time_format)
+        print("Starting {} on {}.".format(name, start))
+        maximum_run_time = 1*60*60*second
+        res = run_speed_tests(configurations=configurations,
+                              speed_tests=[st],
+                              n_slice=sl,
+                              #n_slice=slice(0,1,None),
+                              run_twice=False,
+                              verbose=True,
+                              maximum_run_time=maximum_run_time#,
+                              ## this needs modification of brian2 code
+                              #profile_only_active=True 
+                              #profile_only_active=False
+                             )
+        end = datetime.datetime.fromtimestamp(time.time()).strftime(time_format)
+        diff = datetime.datetime.strptime(end, time_format) - datetime.datetime.strptime(start, time_format)
+        print("Running {} took {}.".format(name, diff))
+        res.plot_all_tests()
+        ## this needs modification of brian2 code
+        #res.plot_all_tests(print_relative=True)
+        savefig(os.path.join(plot_dir, 'speed_test_{}_absolute.png'.format(speed_tests[n][1])))
+        res.plot_all_tests(relative=True)
+        savefig(os.path.join(plot_dir, 'speed_test_{}_relative.png'.format(name)))
+        res.plot_all_tests(profiling_minimum=0.05)
+        savefig(os.path.join(plot_dir, 'speed_test_{}_profiling.png'.format(name)))
+        res.plot_all_tests(profiling_minimum=0.15)
+        savefig(os.path.join(plot_dir, 'speed_test_min_15_{}_profiling.png'.format(name)))
+
+        res.plot_all_tests()
+        ## this needs modification of brian2 code
+        #res.plot_all_tests(print_relative=True)
+        savefig(os.path.join(plot_dir, 'speed_test_{}_absolute.svg'.format(speed_tests[n][1])))
+        res.plot_all_tests(relative=True)
+        savefig(os.path.join(plot_dir, 'speed_test_{}_relative.svg'.format(name)))
+        res.plot_all_tests(profiling_minimum=0.05)
+        savefig(os.path.join(plot_dir, 'speed_test_{}_profiling.svg'.format(name)))
+        res.plot_all_tests(profiling_minimum=0.15)
+        savefig(os.path.join(plot_dir, 'speed_test_min_15_{}_profiling.svg'.format(name)))
+
+        if 3 != len(get_fignums()):
+            print("WARNING: There were {} plots created, but only {} saved.".format(len(get_fignums()), 3*(n+1)))
+        for n in get_fignums():
+            close(n)
+
+        # pickel results object to disk
+        pkl_file = os.path.join(data_dir, name + '.pkl' )
+        with open(pkl_file, 'wb') as output:
+                pickle.dump(res, output, pickle.HIGHEST_PROTOCOL)
+
+        # save stdout log of last run (the other are deleted in run_speed_tests())
+        for proj_dir in set(project_dirs):
+            if not proj_dir is None and proj_dir in ['cuda_standalone', 'cpp_standalone']:
+                config = configurations[last_idx[proj_dir]]
+                stdout_file = os.path.join(proj_dir, 'results/stdout.txt')
+                if os.path.exists(stdout_file):
+                    shutil.copy(stdout_file,
+                                os.path.join(log_dir, 'stdout_{st}_{conf}_{n}.txt'.format(st=name, conf=proj_dir,
+                                                                                           n=st.n_range[sl][-1])))
+                else:
+                    print("WARNING Couldn't save {},file not found.".format(stdout_file))
+
+        # run nvprof on n_range[2]
+        for conf, proj_dir in zip(configurations, project_dirs):
+            main_arg = ''
+            if proj_dir in ['cuda_standalone', 'GeNNworkspace']:
+                if proj_dir == 'GeNNworkspace':
+                    main_arg = 'test {time} 1'.format(time=st.duration/second)
+                ns = st.n_range[sl]
+                idx = 2
+                max_runtime = 20
+                conf_name = conf.__name__
+                print("Rerunning {} with n = {} for nvprof profiling".format(conf_name, st.n_range[idx]))
+                tb, res, runtime, prof_info = results(conf, st, st.n_range[idx], maximum_run_time=maximum_run_time)
+                if not isinstance(res, Exception) and runtime < max_runtime:
+                    option = '--profile-from-start-off' if proj_dir == 'cuda_standalone' else ''
+                    cmd = 'cd {proj_dir} && nvprof {opt} --log-file ../{log_file} ./main {arg}'.format(
+                        proj_dir=proj_dir, arg=main_arg, opt=option,
+                        log_file=os.path.join(prof_dir, 'nvprof_{st}_{conf}_{n}.log'.format(
+                            st=name, conf=conf_name, n=st.n_range[idx])))
+                    prof_start = datetime.datetime.fromtimestamp(time.time()).strftime(time_format)
+                    print(cmd)
+                    x = os.system(cmd)
+                    if x:
+                        print('nvprof failed with {}'.format(x))
+                    prof_end = datetime.datetime.fromtimestamp(time.time()).strftime(time_format)
+                    prof_diff = datetime.datetime.strptime(prof_end, time_format) - datetime.datetime.strptime(prof_start, time_format)
+                    print("Profiling took {} for runtime of {}".format(prof_diff, runtime))
+finally:
+    create_readme(directory)
+    print("\nSummarized speed test results in {}".format(directory + '/README.md'))
+    script_end = datetime.datetime.fromtimestamp(time.time()).strftime(time_format)
+    script_diff = datetime.datetime.strptime(script_end, time_format) - datetime.datetime.strptime(script_start, time_format)
+    print("Finished speed test on {}. Total time = {}.".format(
+        datetime.datetime.fromtimestamp(time.time()).strftime(time_format), script_diff))
+
+
+##res.plot_all_tests(relative=True)
+#for n in get_fignums():
+#    plt.figure(n)
+#    savefig(plot_dir + '/speed_test_{}.png'.format(speed_tests[n-1][1]))
+
+## Debug (includes profiling infos)
+#from brian2.tests.features.base import results
+#for x in results(LocalConfiguration, LinearNeuronsOnly, 10, maximum_run_time=10*second):
+#    print x
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/README.md b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/README.md
new file mode 100644
index 00000000..364f3d4c
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/README.md
@@ -0,0 +1,254 @@
+
+# Benchmark results from 29.11.2017
+## Description:
+
+
+
+## Last git log:
+```
+commit 65e51048f25caaee2a6e0396269f90821d994f85
+Author: Denis Alevi <mail@denisalevi.de>
+Date:   Mon Nov 27 17:55:05 2017 +0100
+
+    Add recent benchmark results
+
+```
+There is also a `git diff` saved in the current directory.
+
+## Results
+
+### CUBA
+![](plots/speed_test_CUBA_absolute.svg)
+![](plots/speed_test_CUBA_profiling.svg)
+![](plots/speed_test_CUBA_relative.svg)
+
+<details><summary>Exemplary `nvprof` results for **CUDAStandaloneConfigurationProfileCPU**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==6637== NVPROF is profiling process 6637, command: ./main
+==6637== Profiling application: ./main
+==6637== Profiling result:
+            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
+ GPU activities:   27.59%  59.367ms     10000  5.9360us  5.7280us  6.9130us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double, double*, double*, double*, bool*)
+                   23.11%  49.736ms     10000  4.9730us  3.2960us  20.256us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double*, double, int*, int, int*, bool*)
+                   21.48%  46.232ms     10000  4.6230us  3.2960us  15.424us  kernel_synapses_1_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, int*, double*, bool*)
+                   11.66%  25.090ms     10000  2.5080us  2.2720us  3.0080us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*)
+                    8.37%  18.003ms     10000  1.8000us  1.6640us  2.1760us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*, bool*)
+                    7.79%  16.764ms     10000  1.6760us  1.6000us  2.0480us  _GLOBAL__N__69_tmpxft_000017f7_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_97ebdcc0::_reset_neurongroup_thresholder_codeobject(int*)
+      API calls:   55.27%  767.72ms     60000  12.795us  10.547us  9.0021ms  cudaLaunch
+                   35.89%  498.54ms     80001  6.2310us  2.4830us  372.18us  cudaDeviceSynchronize
+                    6.50%  90.343ms    520000     173ns     138ns  371.16us  cudaSetupArgument
+                    1.33%  18.502ms     60000     308ns     238ns  364.34us  cudaConfigureCall
+                    0.99%  13.745ms     50000     274ns     217ns  21.746us  cudaGetLastError
+                    0.01%  138.51us         1  138.51us  138.51us  138.51us  cudaMemGetInfo
+                    0.00%  33.472us        39     858ns     721ns  1.8600us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+                    0.00%  30.648us         8  3.8310us  3.1320us  5.3030us  cudaFuncGetAttributes
+                    0.00%  6.3800us        12     531ns     343ns  1.3920us  cudaDeviceGetAttribute
+                    0.00%  2.9800us         3     993ns     737ns  1.3910us  cudaGetDevice
+
+```
+
+</p></details>
+
+
+<details><summary>Exemplary `nvprof` results for **CUDAStandaloneConfiguration**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==5900== NVPROF is profiling process 5900, command: ./main
+==5900== Profiling application: ./main
+==5900== Profiling result:
+            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
+ GPU activities:   27.83%  60.653ms     10000  6.0650us  5.7920us  7.0400us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double, double*, double*, double*, bool*)
+                   23.00%  50.122ms     10000  5.0120us  3.2960us  24.320us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double*, double, int*, int, int*, bool*)
+                   20.65%  45.008ms     10000  4.5000us  3.2960us  17.824us  kernel_synapses_1_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, int*, double*, bool*)
+                   11.47%  25.008ms     10000  2.5000us  2.2720us  3.1680us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*)
+                    9.22%  20.085ms     10000  2.0080us  1.8560us  2.1760us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*, bool*)
+                    7.83%  17.069ms     10000  1.7060us  1.6320us  2.2400us  _GLOBAL__N__69_tmpxft_00001511_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_97ebdcc0::_reset_neurongroup_thresholder_codeobject(int*)
+      API calls:   85.16%  640.31ms     60000  10.671us  9.6060us  9.0686ms  cudaLaunch
+                   11.50%  86.475ms    520000     166ns     135ns  344.23us  cudaSetupArgument
+                    1.87%  14.092ms     60000     234ns     176ns  334.30us  cudaConfigureCall
+                    1.43%  10.785ms     50000     215ns     189ns  10.220us  cudaGetLastError
+                    0.02%  139.19us         1  139.19us  139.19us  139.19us  cudaMemGetInfo
+                    0.00%  31.512us         8  3.9390us  3.0080us  5.7970us  cudaFuncGetAttributes
+                    0.00%  29.967us        39     768ns     653ns  1.9770us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+                    0.00%  12.868us         1  12.868us  12.868us  12.868us  cudaDeviceSynchronize
+                    0.00%  6.2440us        12     520ns     331ns  1.3150us  cudaDeviceGetAttribute
+                    0.00%  3.7510us         3  1.2500us     823ns  1.7170us  cudaGetDevice
+
+```
+
+</p></details>
+
+
+***
+
+### CUBA - less kernels displayed
+![](plots/speed_test_CUBA-less_kernels_displayed_min_15_profiling.svg)
+
+
+***
+
+### STDPNotEventDriven
+![](plots/speed_test_STDP_absolute.svg)
+![](plots/speed_test_STDP_profiling.svg)
+![](plots/speed_test_STDP_relative.svg)
+
+<details><summary>Exemplary `nvprof` results for **CUDAStandaloneConfigurationProfileCPU**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==28576== NVPROF is profiling process 28576, command: ./main
+==28576== Profiling application: ./main
+==28576== Profiling result:
+            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
+ GPU activities:   26.71%  73.256ms     10000  7.3250us  3.2960us  22.720us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, int, double*, double, double*, int, int*, int, int*)
+                   15.80%  43.329ms     10000  4.3320us  3.8720us  6.2400us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*)
+                   12.77%  35.035ms     10000  3.5030us  3.3920us  6.3360us  kernel_synapses_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, int, double, double*, int, int*)
+                    9.94%  27.271ms     10000  2.7270us  2.6240us  3.1680us  kernel_synapses_stateupdater_codeobject(unsigned int, unsigned int, double*, int, double*, int, double*, int*)
+                    9.28%  25.455ms     10000  2.5450us  2.2400us  2.9120us  kernel_poissongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*, double*, float*)
+                    6.66%  18.254ms     10000  1.8250us  1.7600us  2.0800us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*)
+                    6.65%  18.226ms     10000  1.8220us  1.7600us  2.5600us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*)
+                    6.20%  16.991ms     10000  1.6990us  1.6000us  1.9200us  _GLOBAL__N__70_tmpxft_00006daf_00000000_6_poissongroup_thresholder_codeobject_cpp1_ii_7314966e::_reset_poissongroup_thresholder_codeobject(int*)
+                    5.88%  16.118ms     10000  1.6110us  1.4720us  1.8560us  _GLOBAL__N__69_tmpxft_00006dad_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_c0b8948b::_reset_neurongroup_thresholder_codeobject(int*)
+                    0.12%  330.53us         1  330.53us  330.53us  330.53us  void gen_sequenced<curandStateXORWOW, float, int, __operator_&__(float curand_uniform_noargs<curandStateXORWOW>(curandStateXORWOW*, int))>(curandStateXORWOW*, float*, unsigned long, unsigned long, int)
+      API calls:   58.50%  1.10914s     90001  12.323us  9.6560us  9.1188ms  cudaLaunch
+                   32.75%  621.00ms    100001  6.2090us  2.3660us  355.02us  cudaDeviceSynchronize
+                    5.78%  109.54ms    660005     165ns     124ns  14.341us  cudaSetupArgument
+                    1.49%  28.313ms     90001     314ns     245ns  12.028us  cudaConfigureCall
+                    1.45%  27.511ms     70002     393ns     230ns  366.98us  cudaGetLastError
+                    0.01%  208.18us         1  208.18us  208.18us  208.18us  cudaMalloc
+                    0.01%  131.79us         1  131.79us  131.79us  131.79us  cudaMemGetInfo
+                    0.00%  55.331us        74     747ns     647ns  1.4820us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+                    0.00%  44.531us        12  3.7100us  3.1290us  4.8360us  cudaFuncGetAttributes
+                    0.00%  9.1380us        20     456ns     333ns     893ns  cudaDeviceGetAttribute
+                    0.00%  4.2750us         5     855ns     719ns  1.3080us  cudaGetDevice
+
+```
+
+</p></details>
+
+
+<details><summary>Exemplary `nvprof` results for **CUDAStandaloneConfiguration**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==27879== NVPROF is profiling process 27879, command: ./main
+==27879== Profiling application: ./main
+==27879== Profiling result:
+            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
+ GPU activities:   26.99%  74.731ms     10000  7.4730us  3.2960us  27.648us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, int, double*, double, double*, int, int*, int, int*)
+                   15.88%  43.964ms     10000  4.3960us  3.9360us  6.4000us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*)
+                   12.62%  34.946ms     10000  3.4940us  3.3920us  6.4960us  kernel_synapses_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, int, double, double*, int, int*)
+                    9.80%  27.129ms     10000  2.7120us  2.3680us  2.9440us  kernel_poissongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*, double*, float*)
+                    9.58%  26.535ms     10000  2.6530us  2.5600us  3.0400us  kernel_synapses_stateupdater_codeobject(unsigned int, unsigned int, double*, int, double*, int, double*, int*)
+                    6.59%  18.247ms     10000  1.8240us  1.7280us  2.0480us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*)
+                    6.58%  18.231ms     10000  1.8230us  1.7600us  2.5600us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*)
+                    6.20%  17.155ms     10000  1.7150us  1.6320us  1.9520us  _GLOBAL__N__70_tmpxft_00006ae9_00000000_6_poissongroup_thresholder_codeobject_cpp1_ii_7314966e::_reset_poissongroup_thresholder_codeobject(int*)
+                    5.65%  15.632ms     10000  1.5630us  1.4720us  1.6960us  _GLOBAL__N__69_tmpxft_00006ae5_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_c0b8948b::_reset_neurongroup_thresholder_codeobject(int*)
+                    0.12%  329.57us         1  329.57us  329.57us  329.57us  void gen_sequenced<curandStateXORWOW, float, int, __operator_&__(float curand_uniform_noargs<curandStateXORWOW>(curandStateXORWOW*, int))>(curandStateXORWOW*, float*, unsigned long, unsigned long, int)
+      API calls:   85.57%  910.45ms     90001  10.116us  8.9060us  9.1466ms  cudaLaunch
+                   10.88%  115.80ms    660005     175ns     132ns  353.03us  cudaSetupArgument
+                    2.00%  21.262ms     90001     236ns     181ns  330.07us  cudaConfigureCall
+                    1.50%  15.984ms     70002     228ns     182ns  318.18us  cudaGetLastError
+                    0.02%  207.89us         1  207.89us  207.89us  207.89us  cudaMalloc
+                    0.01%  132.37us         1  132.37us  132.37us  132.37us  cudaMemGetInfo
+                    0.01%  55.857us        74     754ns     674ns  1.5500us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+                    0.00%  44.986us        12  3.7480us  3.0050us  5.8370us  cudaFuncGetAttributes
+                    0.00%  13.864us         1  13.864us  13.864us  13.864us  cudaDeviceSynchronize
+                    0.00%  9.5470us        20     477ns     338ns  1.1980us  cudaDeviceGetAttribute
+                    0.00%  4.8700us         5     974ns     851ns  1.4220us  cudaGetDevice
+
+```
+
+</p></details>
+
+
+***
+
+### STDPNotEventDriven - fewer kernels displayed
+![](plots/speed_test_STDP-less_kernels_displayed_min_15_profiling.svg)
+
+
+***
+
+### STDPEventDriven
+![](plots/speed_test_STDPEventDriven_absolute.svg)
+![](plots/speed_test_STDPEventDriven_profiling.svg)
+![](plots/speed_test_STDPEventDriven_relative.svg)
+
+<details><summary>Exemplary `nvprof` results for **CUDAStandaloneConfigurationProfileCPU**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==18877== NVPROF is profiling process 18877, command: ./main
+==18877== Profiling application: ./main
+==18877== Profiling result:
+            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
+ GPU activities:   33.24%  85.455ms     10000  8.5450us  3.3280us  25.984us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, double*, double, double*, int, int*, int, int*, int)
+                   16.85%  43.327ms     10000  4.3320us  3.8400us  6.2080us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*)
+                   13.77%  35.393ms     10000  3.5390us  3.4240us  7.2320us  kernel_synapses_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, double, double*, int, int*, int*, int)
+                    9.92%  25.503ms     10000  2.5500us  2.2400us  2.9760us  kernel_poissongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*, double*, float*)
+                    7.11%  18.278ms     10000  1.8270us  1.7600us  2.0800us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*)
+                    6.37%  16.365ms     10000  1.6360us  1.4080us  1.7920us  _GLOBAL__N__70_tmpxft_00004798_00000000_6_poissongroup_thresholder_codeobject_cpp1_ii_7314966e::_reset_poissongroup_thresholder_codeobject(int*)
+                    6.31%  16.219ms     10000  1.6210us  1.5040us  1.8560us  _GLOBAL__N__69_tmpxft_00004796_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_c0b8948b::_reset_neurongroup_thresholder_codeobject(int*)
+                    6.31%  16.209ms     10000  1.6200us  1.5360us  2.5920us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*)
+                    0.13%  330.27us         1  330.27us  330.27us  330.27us  void gen_sequenced<curandStateXORWOW, float, int, __operator_&__(float curand_uniform_noargs<curandStateXORWOW>(curandStateXORWOW*, int))>(curandStateXORWOW*, float*, unsigned long, unsigned long, int)
+      API calls:   57.23%  936.43ms     80001  11.705us  9.9320us  9.2809ms  cudaLaunch
+                   34.35%  562.06ms     90001  6.2450us  2.4600us  359.92us  cudaDeviceSynchronize
+                    5.96%  97.491ms    580005     168ns     132ns  357.12us  cudaSetupArgument
+                    1.41%  23.032ms     80001     287ns     242ns  13.914us  cudaConfigureCall
+                    1.02%  16.685ms     60002     278ns     235ns  14.273us  cudaGetLastError
+                    0.01%  200.02us         1  200.02us  200.02us  200.02us  cudaMalloc
+                    0.01%  134.78us         1  134.78us  134.78us  134.78us  cudaMemGetInfo
+                    0.00%  36.321us        10  3.6320us  3.0320us  4.6100us  cudaFuncGetAttributes
+                    0.00%  28.911us        41     705ns     592ns  1.5350us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+                    0.00%  7.7890us        16     486ns     346ns  1.1310us  cudaDeviceGetAttribute
+                    0.00%  3.2980us         4     824ns     736ns  1.0260us  cudaGetDevice
+
+```
+
+</p></details>
+
+
+<details><summary>Exemplary `nvprof` results for **CUDAStandaloneConfiguration**</summary><p>
+Profile summary for `N = 1000`:
+
+```
+==18067== NVPROF is profiling process 18067, command: ./main
+==18067== Profiling application: ./main
+==18067== Profiling result:
+            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
+ GPU activities:   33.20%  86.044ms     10000  8.6040us  3.3600us  26.176us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, double*, double, double*, int, int*, int, int*, int)
+                   16.74%  43.393ms     10000  4.3390us  3.8080us  5.9840us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*)
+                   13.67%  35.442ms     10000  3.5440us  3.4560us  7.0400us  kernel_synapses_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, double, double*, int, int*, int*, int)
+                    9.83%  25.469ms     10000  2.5460us  2.2400us  2.7520us  kernel_poissongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*, double*, float*)
+                    7.17%  18.573ms     10000  1.8570us  1.7280us  2.0800us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*)
+                    7.03%  18.222ms     10000  1.8220us  1.7280us  2.6240us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*)
+                    6.26%  16.215ms     10000  1.6210us  1.4080us  1.7920us  _GLOBAL__N__70_tmpxft_0000448e_00000000_6_poissongroup_thresholder_codeobject_cpp1_ii_7314966e::_reset_poissongroup_thresholder_codeobject(int*)
+                    5.98%  15.512ms     10000  1.5510us  1.4400us  1.6960us  _GLOBAL__N__69_tmpxft_0000448c_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_c0b8948b::_reset_neurongroup_thresholder_codeobject(int*)
+                    0.13%  330.56us         1  330.56us  330.56us  330.56us  void gen_sequenced<curandStateXORWOW, float, int, __operator_&__(float curand_uniform_noargs<curandStateXORWOW>(curandStateXORWOW*, int))>(curandStateXORWOW*, float*, unsigned long, unsigned long, int)
+      API calls:   83.75%  838.49ms     80001  10.480us  9.1490us  9.2085ms  cudaLaunch
+                   12.30%  123.18ms    580005     212ns     154ns  365.89us  cudaSetupArgument
+                    2.22%  22.230ms     80001     277ns     208ns  341.41us  cudaConfigureCall
+                    1.68%  16.830ms     60002     280ns     217ns  348.09us  cudaGetLastError
+                    0.02%  200.11us         1  200.11us  200.11us  200.11us  cudaMalloc
+                    0.01%  131.26us         1  131.26us  131.26us  131.26us  cudaMemGetInfo
+                    0.00%  37.933us        10  3.7930us  3.0410us  5.6940us  cudaFuncGetAttributes
+                    0.00%  33.513us        41     817ns     707ns  1.6920us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+                    0.00%  13.505us         1  13.505us  13.505us  13.505us  cudaDeviceSynchronize
+                    0.00%  7.9010us        16     493ns     368ns  1.1420us  cudaDeviceGetAttribute
+                    0.00%  4.0280us         4  1.0070us     817ns  1.4860us  cudaGetDevice
+
+```
+
+</p></details>
+
+
+***
+
+### STDPEventDriven - fewer kernels displayed
+![](plots/speed_test_STDPEventDriven-less_kernels_displayed_min_15_profiling.svg)
+
+
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/data/CUBA.pkl b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/data/CUBA.pkl
new file mode 100644
index 00000000..e0e2942b
Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/data/CUBA.pkl differ
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/data/STDP.pkl b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/data/STDP.pkl
new file mode 100644
index 00000000..1c5601a6
Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/data/STDP.pkl differ
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/data/STDPEventDriven.pkl b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/data/STDPEventDriven.pkl
new file mode 100644
index 00000000..9e7fcd6a
Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/data/STDPEventDriven.pkl differ
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/git.diff b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/git.diff
new file mode 100644
index 00000000..7737e913
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/git.diff
@@ -0,0 +1,60 @@
+diff --git a/brian2cuda/device.py b/brian2cuda/device.py
+index b4610ee..e032d32 100644
+--- a/brian2cuda/device.py
++++ b/brian2cuda/device.py
+@@ -919,13 +919,13 @@ class CUDAStandaloneDevice(CPPStandaloneDevice):
+             if clock not in all_clocks:
+                 run_lines.append('{net.name}.add(&{clock.name}, NULL, NULL, NULL, NULL);'.format(clock=clock, net=net))
+ 
+-        if self.profile and self.profile != 'blocking':  # self.profile == True
++        if True:#self.profile and self.profile != 'blocking':  # self.profile == True
+             run_lines.append('cudaProfilerStart();')
+         run_lines.append('{net.name}.run({duration!r}, {report_call}, {report_period!r});'.format(net=net,
+                                                                                               duration=float(duration),
+                                                                                               report_call=report_call,
+                                                                                               report_period=float(report_period)))
+-        if self.profile and self.profile != 'blocking':  # self.profile == True
++        if True:#self.profile and self.profile != 'blocking':  # self.profile == True
+             run_lines.append('cudaDeviceSynchronize();')
+             run_lines.append('cudaProfilerStop();')
+         self.main_queue.append(('run_network', (net, run_lines)))
+diff --git a/brian2cuda/tests/features/speed.py b/brian2cuda/tests/features/speed.py
+index 2293533..c093bc9 100644
+--- a/brian2cuda/tests/features/speed.py
++++ b/brian2cuda/tests/features/speed.py
+@@ -558,7 +558,7 @@ class CUBA(SpeedTest):
+     category = "Full examples"
+     name = "CUBA fixed connectivity"
+     tags = ["Neurons", "Synapses"]
+-    n_range = [10, 100, 1000, 10000, 100000, 1000000]
++    n_range = [10, 100, 1000, 10000, 100000, 200000, 500000, 1000000]
+     n_label = 'Num neurons'
+ 
+     # configuration options
+@@ -720,7 +720,7 @@ class STDPNotEventDriven(SpeedTest):
+     category = "Full examples"
+     name = "STDP (not event-driven)"
+     tags = ["Neurons", "Synapses"]
+-    n_range = [10, 100, 1000, 10000, 20000, 50000, 100000]
++    n_range = [10, 100, 1000, 10000, 20000, 50000, 100000, 1000000, 5000000]
+     n_label = 'Num neurons'
+ 
+     # configuration options
+diff --git a/frozen_repos/brian2 b/frozen_repos/brian2
+--- a/frozen_repos/brian2
++++ b/frozen_repos/brian2
+@@ -1 +1 @@
+-Subproject commit fadc6a0aeb90d1b4d343470628457d8561536f67
++Subproject commit fadc6a0aeb90d1b4d343470628457d8561536f67-dirty
+diff --git a/frozen_repos/brian2genn b/frozen_repos/brian2genn
+--- a/frozen_repos/brian2genn
++++ b/frozen_repos/brian2genn
+@@ -1 +1 @@
+-Subproject commit 0553cafeab49ea5403c0230411035df504d4db06
++Subproject commit 0553cafeab49ea5403c0230411035df504d4db06-dirty
+diff --git a/frozen_repos/genn b/frozen_repos/genn
+--- a/frozen_repos/genn
++++ b/frozen_repos/genn
+@@ -1 +1 @@
+-Subproject commit e01c85f18339249558d6e570ae976609dc972846
++Subproject commit e01c85f18339249558d6e570ae976609dc972846-dirty
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/logs/stdout_CUBA_cuda_standalone_500000.txt b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/logs/stdout_CUBA_cuda_standalone_500000.txt
new file mode 100644
index 00000000..29393927
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/logs/stdout_CUBA_cuda_standalone_500000.txt
@@ -0,0 +1,54 @@
+INFO: setting cudaDevice stuff took 0.353296 seconds
+INFO kernel_neurongroup_group_variable_set_conditional_codeobject
+	489 blocks
+	1024 threads
+	12 registers per block
+	0 bytes statically-allocated shared memory per block
+	0 bytes local memory per thread
+	304 bytes user-allocated constant memory
+INFO connectivity matrix has size 7999406
+INFO connectivity matrix has size 32005238
+INFO kernel_neurongroup_stateupdater_codeobject
+	652 blocks
+	768 threads
+	35 registers per block
+	0 bytes statically-allocated shared memory per block
+	0 bytes local memory per thread
+	304 bytes user-allocated constant memory
+	0.750 theoretical occupancy
+INFO kernel_neurongroup_thresholder_codeobject
+	489 blocks
+	1024 threads
+	15 registers per block
+	0 bytes statically-allocated shared memory per block
+	0 bytes local memory per thread
+	304 bytes user-allocated constant memory
+	1.000 theoretical occupancy
+INFO kernel_synapses_1_pre_codeobject
+	15 blocks
+	1024 threads
+	22 registers per block
+	0 bytes statically-allocated shared memory per block
+	8 bytes local memory per thread
+	304 bytes user-allocated constant memory
+	1.000 theoretical occupancy
+INFO kernel_synapses_pre_codeobject
+	15 blocks
+	1024 threads
+	22 registers per block
+	0 bytes statically-allocated shared memory per block
+	8 bytes local memory per thread
+	304 bytes user-allocated constant memory
+	1.000 theoretical occupancy
+INFO kernel_neurongroup_resetter_codeobject
+	489 blocks
+	1024 threads
+	14 registers per block
+	0 bytes statically-allocated shared memory per block
+	0 bytes local memory per thread
+	304 bytes user-allocated constant memory
+	1.000 theoretical occupancy
+Number of synapses: 32005238
+Number of synapses: 7999406
+INFO: main_lines took 198.948620 seconds
+INFO: main function took 200.907001 seconds
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/logs/stdout_STDPEventDriven_cuda_standalone_5000000.txt b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/logs/stdout_STDPEventDriven_cuda_standalone_5000000.txt
new file mode 100644
index 00000000..eb3a2e32
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/logs/stdout_STDPEventDriven_cuda_standalone_5000000.txt
@@ -0,0 +1,63 @@
+INFO: setting cudaDevice stuff took 0.345850 seconds
+INFO kernel_synapses_group_variable_set_conditional_codeobject
+	4883 blocks
+	1024 threads
+	8 registers per block
+	0 bytes statically-allocated shared memory per block
+	0 bytes local memory per thread
+	304 bytes user-allocated constant memory
+INFO connectivity matrix has size 5000000
+INFO connectivity matrix has size 5000000
+INFO generating 10000000 rand every 2 clock cycles for poissongroup_thresholder_codeobject
+INFO kernel_neurongroup_stateupdater_codeobject
+	1 blocks
+	768 threads
+	35 registers per block
+	0 bytes statically-allocated shared memory per block
+	0 bytes local memory per thread
+	304 bytes user-allocated constant memory
+	0.750 theoretical occupancy
+INFO kernel_neurongroup_thresholder_codeobject
+	1 blocks
+	1024 threads
+	14 registers per block
+	0 bytes statically-allocated shared memory per block
+	0 bytes local memory per thread
+	304 bytes user-allocated constant memory
+	1.000 theoretical occupancy
+INFO kernel_poissongroup_thresholder_codeobject
+	4883 blocks
+	1024 threads
+	14 registers per block
+	0 bytes statically-allocated shared memory per block
+	0 bytes local memory per thread
+	304 bytes user-allocated constant memory
+	1.000 theoretical occupancy
+INFO kernel_synapses_pre_codeobject
+	15 blocks
+	1024 threads
+	40 registers per block
+	0 bytes statically-allocated shared memory per block
+	8 bytes local memory per thread
+	304 bytes user-allocated constant memory
+	0.500 theoretical occupancy
+INFO kernel_synapses_post_codeobject
+	15 blocks
+	1024 threads
+	34 registers per block
+	0 bytes statically-allocated shared memory per block
+	8 bytes local memory per thread
+	304 bytes user-allocated constant memory
+	0.500 theoretical occupancy
+INFO kernel_neurongroup_resetter_codeobject
+	1 blocks
+	1024 threads
+	14 registers per block
+	0 bytes statically-allocated shared memory per block
+	0 bytes local memory per thread
+	304 bytes user-allocated constant memory
+	1.000 theoretical occupancy
+Number of synapses: 5000000
+Number of synapses: 5000000
+INFO: main_lines took 349.461367 seconds
+INFO: main function took 350.342466 seconds
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/logs/stdout_STDP_cuda_standalone_5000000.txt b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/logs/stdout_STDP_cuda_standalone_5000000.txt
new file mode 100644
index 00000000..0708f3c6
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/logs/stdout_STDP_cuda_standalone_5000000.txt
@@ -0,0 +1,71 @@
+INFO: setting cudaDevice stuff took 0.318718 seconds
+INFO kernel_synapses_group_variable_set_conditional_codeobject
+	4883 blocks
+	1024 threads
+	8 registers per block
+	0 bytes statically-allocated shared memory per block
+	0 bytes local memory per thread
+	304 bytes user-allocated constant memory
+INFO connectivity matrix has size 5000000
+INFO connectivity matrix has size 5000000
+INFO generating 10000000 rand every 2 clock cycles for poissongroup_thresholder_codeobject
+INFO kernel_neurongroup_stateupdater_codeobject
+	1 blocks
+	768 threads
+	35 registers per block
+	0 bytes statically-allocated shared memory per block
+	0 bytes local memory per thread
+	304 bytes user-allocated constant memory
+	0.750 theoretical occupancy
+INFO kernel_synapses_stateupdater_codeobject
+	6511 blocks
+	768 threads
+	35 registers per block
+	0 bytes statically-allocated shared memory per block
+	0 bytes local memory per thread
+	304 bytes user-allocated constant memory
+	0.750 theoretical occupancy
+INFO kernel_neurongroup_thresholder_codeobject
+	1 blocks
+	1024 threads
+	14 registers per block
+	0 bytes statically-allocated shared memory per block
+	0 bytes local memory per thread
+	304 bytes user-allocated constant memory
+	1.000 theoretical occupancy
+INFO kernel_poissongroup_thresholder_codeobject
+	4883 blocks
+	1024 threads
+	14 registers per block
+	0 bytes statically-allocated shared memory per block
+	0 bytes local memory per thread
+	304 bytes user-allocated constant memory
+	1.000 theoretical occupancy
+INFO kernel_synapses_pre_codeobject
+	15 blocks
+	1024 threads
+	28 registers per block
+	0 bytes statically-allocated shared memory per block
+	8 bytes local memory per thread
+	304 bytes user-allocated constant memory
+	1.000 theoretical occupancy
+INFO kernel_synapses_post_codeobject
+	15 blocks
+	1024 threads
+	26 registers per block
+	0 bytes statically-allocated shared memory per block
+	8 bytes local memory per thread
+	304 bytes user-allocated constant memory
+	1.000 theoretical occupancy
+INFO kernel_neurongroup_resetter_codeobject
+	1 blocks
+	1024 threads
+	14 registers per block
+	0 bytes statically-allocated shared memory per block
+	0 bytes local memory per thread
+	304 bytes user-allocated constant memory
+	1.000 theoretical occupancy
+Number of synapses: 5000000
+Number of synapses: 5000000
+INFO: main_lines took 325.966205 seconds
+INFO: main function took 326.809723 seconds
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/nvprof/nvprof_CUBA_CUDAStandaloneConfigurationProfileCPU_1000.log b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/nvprof/nvprof_CUBA_CUDAStandaloneConfigurationProfileCPU_1000.log
new file mode 100644
index 00000000..36aaed97
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/nvprof/nvprof_CUBA_CUDAStandaloneConfigurationProfileCPU_1000.log
@@ -0,0 +1,20 @@
+==6637== NVPROF is profiling process 6637, command: ./main
+==6637== Profiling application: ./main
+==6637== Profiling result:
+            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
+ GPU activities:   27.59%  59.367ms     10000  5.9360us  5.7280us  6.9130us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double, double*, double*, double*, bool*)
+                   23.11%  49.736ms     10000  4.9730us  3.2960us  20.256us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double*, double, int*, int, int*, bool*)
+                   21.48%  46.232ms     10000  4.6230us  3.2960us  15.424us  kernel_synapses_1_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, int*, double*, bool*)
+                   11.66%  25.090ms     10000  2.5080us  2.2720us  3.0080us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*)
+                    8.37%  18.003ms     10000  1.8000us  1.6640us  2.1760us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*, bool*)
+                    7.79%  16.764ms     10000  1.6760us  1.6000us  2.0480us  _GLOBAL__N__69_tmpxft_000017f7_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_97ebdcc0::_reset_neurongroup_thresholder_codeobject(int*)
+      API calls:   55.27%  767.72ms     60000  12.795us  10.547us  9.0021ms  cudaLaunch
+                   35.89%  498.54ms     80001  6.2310us  2.4830us  372.18us  cudaDeviceSynchronize
+                    6.50%  90.343ms    520000     173ns     138ns  371.16us  cudaSetupArgument
+                    1.33%  18.502ms     60000     308ns     238ns  364.34us  cudaConfigureCall
+                    0.99%  13.745ms     50000     274ns     217ns  21.746us  cudaGetLastError
+                    0.01%  138.51us         1  138.51us  138.51us  138.51us  cudaMemGetInfo
+                    0.00%  33.472us        39     858ns     721ns  1.8600us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+                    0.00%  30.648us         8  3.8310us  3.1320us  5.3030us  cudaFuncGetAttributes
+                    0.00%  6.3800us        12     531ns     343ns  1.3920us  cudaDeviceGetAttribute
+                    0.00%  2.9800us         3     993ns     737ns  1.3910us  cudaGetDevice
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/nvprof/nvprof_CUBA_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/nvprof/nvprof_CUBA_CUDAStandaloneConfiguration_1000.log
new file mode 100644
index 00000000..e6fd195f
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/nvprof/nvprof_CUBA_CUDAStandaloneConfiguration_1000.log
@@ -0,0 +1,20 @@
+==5900== NVPROF is profiling process 5900, command: ./main
+==5900== Profiling application: ./main
+==5900== Profiling result:
+            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
+ GPU activities:   27.83%  60.653ms     10000  6.0650us  5.7920us  7.0400us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double, double*, double*, double*, bool*)
+                   23.00%  50.122ms     10000  5.0120us  3.2960us  24.320us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double*, double, int*, int, int*, bool*)
+                   20.65%  45.008ms     10000  4.5000us  3.2960us  17.824us  kernel_synapses_1_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, int*, int, double, int*, int, int*, double*, bool*)
+                   11.47%  25.008ms     10000  2.5000us  2.2720us  3.1680us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double, double*, double*, bool*)
+                    9.22%  20.085ms     10000  2.0080us  1.8560us  2.1760us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*, bool*)
+                    7.83%  17.069ms     10000  1.7060us  1.6320us  2.2400us  _GLOBAL__N__69_tmpxft_00001511_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_97ebdcc0::_reset_neurongroup_thresholder_codeobject(int*)
+      API calls:   85.16%  640.31ms     60000  10.671us  9.6060us  9.0686ms  cudaLaunch
+                   11.50%  86.475ms    520000     166ns     135ns  344.23us  cudaSetupArgument
+                    1.87%  14.092ms     60000     234ns     176ns  334.30us  cudaConfigureCall
+                    1.43%  10.785ms     50000     215ns     189ns  10.220us  cudaGetLastError
+                    0.02%  139.19us         1  139.19us  139.19us  139.19us  cudaMemGetInfo
+                    0.00%  31.512us         8  3.9390us  3.0080us  5.7970us  cudaFuncGetAttributes
+                    0.00%  29.967us        39     768ns     653ns  1.9770us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+                    0.00%  12.868us         1  12.868us  12.868us  12.868us  cudaDeviceSynchronize
+                    0.00%  6.2440us        12     520ns     331ns  1.3150us  cudaDeviceGetAttribute
+                    0.00%  3.7510us         3  1.2500us     823ns  1.7170us  cudaGetDevice
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/nvprof/nvprof_STDPEventDriven_CUDAStandaloneConfigurationProfileCPU_1000.log b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/nvprof/nvprof_STDPEventDriven_CUDAStandaloneConfigurationProfileCPU_1000.log
new file mode 100644
index 00000000..196a1cb2
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/nvprof/nvprof_STDPEventDriven_CUDAStandaloneConfigurationProfileCPU_1000.log
@@ -0,0 +1,24 @@
+==18877== NVPROF is profiling process 18877, command: ./main
+==18877== Profiling application: ./main
+==18877== Profiling result:
+            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
+ GPU activities:   33.24%  85.455ms     10000  8.5450us  3.3280us  25.984us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, double*, double, double*, int, int*, int, int*, int)
+                   16.85%  43.327ms     10000  4.3320us  3.8400us  6.2080us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*)
+                   13.77%  35.393ms     10000  3.5390us  3.4240us  7.2320us  kernel_synapses_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, double, double*, int, int*, int*, int)
+                    9.92%  25.503ms     10000  2.5500us  2.2400us  2.9760us  kernel_poissongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*, double*, float*)
+                    7.11%  18.278ms     10000  1.8270us  1.7600us  2.0800us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*)
+                    6.37%  16.365ms     10000  1.6360us  1.4080us  1.7920us  _GLOBAL__N__70_tmpxft_00004798_00000000_6_poissongroup_thresholder_codeobject_cpp1_ii_7314966e::_reset_poissongroup_thresholder_codeobject(int*)
+                    6.31%  16.219ms     10000  1.6210us  1.5040us  1.8560us  _GLOBAL__N__69_tmpxft_00004796_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_c0b8948b::_reset_neurongroup_thresholder_codeobject(int*)
+                    6.31%  16.209ms     10000  1.6200us  1.5360us  2.5920us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*)
+                    0.13%  330.27us         1  330.27us  330.27us  330.27us  void gen_sequenced<curandStateXORWOW, float, int, __operator_&__(float curand_uniform_noargs<curandStateXORWOW>(curandStateXORWOW*, int))>(curandStateXORWOW*, float*, unsigned long, unsigned long, int)
+      API calls:   57.23%  936.43ms     80001  11.705us  9.9320us  9.2809ms  cudaLaunch
+                   34.35%  562.06ms     90001  6.2450us  2.4600us  359.92us  cudaDeviceSynchronize
+                    5.96%  97.491ms    580005     168ns     132ns  357.12us  cudaSetupArgument
+                    1.41%  23.032ms     80001     287ns     242ns  13.914us  cudaConfigureCall
+                    1.02%  16.685ms     60002     278ns     235ns  14.273us  cudaGetLastError
+                    0.01%  200.02us         1  200.02us  200.02us  200.02us  cudaMalloc
+                    0.01%  134.78us         1  134.78us  134.78us  134.78us  cudaMemGetInfo
+                    0.00%  36.321us        10  3.6320us  3.0320us  4.6100us  cudaFuncGetAttributes
+                    0.00%  28.911us        41     705ns     592ns  1.5350us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+                    0.00%  7.7890us        16     486ns     346ns  1.1310us  cudaDeviceGetAttribute
+                    0.00%  3.2980us         4     824ns     736ns  1.0260us  cudaGetDevice
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/nvprof/nvprof_STDPEventDriven_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/nvprof/nvprof_STDPEventDriven_CUDAStandaloneConfiguration_1000.log
new file mode 100644
index 00000000..89cca30a
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/nvprof/nvprof_STDPEventDriven_CUDAStandaloneConfiguration_1000.log
@@ -0,0 +1,24 @@
+==18067== NVPROF is profiling process 18067, command: ./main
+==18067== Profiling application: ./main
+==18067== Profiling result:
+            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
+ GPU activities:   33.20%  86.044ms     10000  8.6040us  3.3600us  26.176us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, double*, double, double*, int, int*, int, int*, int)
+                   16.74%  43.393ms     10000  4.3390us  3.8080us  5.9840us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*)
+                   13.67%  35.442ms     10000  3.5440us  3.4560us  7.0400us  kernel_synapses_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, double, double*, int, int*, int*, int)
+                    9.83%  25.469ms     10000  2.5460us  2.2400us  2.7520us  kernel_poissongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*, double*, float*)
+                    7.17%  18.573ms     10000  1.8570us  1.7280us  2.0800us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*)
+                    7.03%  18.222ms     10000  1.8220us  1.7280us  2.6240us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*)
+                    6.26%  16.215ms     10000  1.6210us  1.4080us  1.7920us  _GLOBAL__N__70_tmpxft_0000448e_00000000_6_poissongroup_thresholder_codeobject_cpp1_ii_7314966e::_reset_poissongroup_thresholder_codeobject(int*)
+                    5.98%  15.512ms     10000  1.5510us  1.4400us  1.6960us  _GLOBAL__N__69_tmpxft_0000448c_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_c0b8948b::_reset_neurongroup_thresholder_codeobject(int*)
+                    0.13%  330.56us         1  330.56us  330.56us  330.56us  void gen_sequenced<curandStateXORWOW, float, int, __operator_&__(float curand_uniform_noargs<curandStateXORWOW>(curandStateXORWOW*, int))>(curandStateXORWOW*, float*, unsigned long, unsigned long, int)
+      API calls:   83.75%  838.49ms     80001  10.480us  9.1490us  9.2085ms  cudaLaunch
+                   12.30%  123.18ms    580005     212ns     154ns  365.89us  cudaSetupArgument
+                    2.22%  22.230ms     80001     277ns     208ns  341.41us  cudaConfigureCall
+                    1.68%  16.830ms     60002     280ns     217ns  348.09us  cudaGetLastError
+                    0.02%  200.11us         1  200.11us  200.11us  200.11us  cudaMalloc
+                    0.01%  131.26us         1  131.26us  131.26us  131.26us  cudaMemGetInfo
+                    0.00%  37.933us        10  3.7930us  3.0410us  5.6940us  cudaFuncGetAttributes
+                    0.00%  33.513us        41     817ns     707ns  1.6920us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+                    0.00%  13.505us         1  13.505us  13.505us  13.505us  cudaDeviceSynchronize
+                    0.00%  7.9010us        16     493ns     368ns  1.1420us  cudaDeviceGetAttribute
+                    0.00%  4.0280us         4  1.0070us     817ns  1.4860us  cudaGetDevice
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/nvprof/nvprof_STDP_CUDAStandaloneConfigurationProfileCPU_1000.log b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/nvprof/nvprof_STDP_CUDAStandaloneConfigurationProfileCPU_1000.log
new file mode 100644
index 00000000..d8883015
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/nvprof/nvprof_STDP_CUDAStandaloneConfigurationProfileCPU_1000.log
@@ -0,0 +1,25 @@
+==28576== NVPROF is profiling process 28576, command: ./main
+==28576== Profiling application: ./main
+==28576== Profiling result:
+            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
+ GPU activities:   26.71%  73.256ms     10000  7.3250us  3.2960us  22.720us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, int, double*, double, double*, int, int*, int, int*)
+                   15.80%  43.329ms     10000  4.3320us  3.8720us  6.2400us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*)
+                   12.77%  35.035ms     10000  3.5030us  3.3920us  6.3360us  kernel_synapses_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, int, double, double*, int, int*)
+                    9.94%  27.271ms     10000  2.7270us  2.6240us  3.1680us  kernel_synapses_stateupdater_codeobject(unsigned int, unsigned int, double*, int, double*, int, double*, int*)
+                    9.28%  25.455ms     10000  2.5450us  2.2400us  2.9120us  kernel_poissongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*, double*, float*)
+                    6.66%  18.254ms     10000  1.8250us  1.7600us  2.0800us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*)
+                    6.65%  18.226ms     10000  1.8220us  1.7600us  2.5600us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*)
+                    6.20%  16.991ms     10000  1.6990us  1.6000us  1.9200us  _GLOBAL__N__70_tmpxft_00006daf_00000000_6_poissongroup_thresholder_codeobject_cpp1_ii_7314966e::_reset_poissongroup_thresholder_codeobject(int*)
+                    5.88%  16.118ms     10000  1.6110us  1.4720us  1.8560us  _GLOBAL__N__69_tmpxft_00006dad_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_c0b8948b::_reset_neurongroup_thresholder_codeobject(int*)
+                    0.12%  330.53us         1  330.53us  330.53us  330.53us  void gen_sequenced<curandStateXORWOW, float, int, __operator_&__(float curand_uniform_noargs<curandStateXORWOW>(curandStateXORWOW*, int))>(curandStateXORWOW*, float*, unsigned long, unsigned long, int)
+      API calls:   58.50%  1.10914s     90001  12.323us  9.6560us  9.1188ms  cudaLaunch
+                   32.75%  621.00ms    100001  6.2090us  2.3660us  355.02us  cudaDeviceSynchronize
+                    5.78%  109.54ms    660005     165ns     124ns  14.341us  cudaSetupArgument
+                    1.49%  28.313ms     90001     314ns     245ns  12.028us  cudaConfigureCall
+                    1.45%  27.511ms     70002     393ns     230ns  366.98us  cudaGetLastError
+                    0.01%  208.18us         1  208.18us  208.18us  208.18us  cudaMalloc
+                    0.01%  131.79us         1  131.79us  131.79us  131.79us  cudaMemGetInfo
+                    0.00%  55.331us        74     747ns     647ns  1.4820us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+                    0.00%  44.531us        12  3.7100us  3.1290us  4.8360us  cudaFuncGetAttributes
+                    0.00%  9.1380us        20     456ns     333ns     893ns  cudaDeviceGetAttribute
+                    0.00%  4.2750us         5     855ns     719ns  1.3080us  cudaGetDevice
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/nvprof/nvprof_STDP_CUDAStandaloneConfiguration_1000.log b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/nvprof/nvprof_STDP_CUDAStandaloneConfiguration_1000.log
new file mode 100644
index 00000000..e670ef10
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/nvprof/nvprof_STDP_CUDAStandaloneConfiguration_1000.log
@@ -0,0 +1,25 @@
+==27879== NVPROF is profiling process 27879, command: ./main
+==27879== Profiling application: ./main
+==27879== Profiling result:
+            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
+ GPU activities:   26.99%  74.731ms     10000  7.4730us  3.2960us  27.648us  kernel_synapses_pre_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, int, double*, double, double*, int, int*, int, int*)
+                   15.88%  43.964ms     10000  4.3960us  3.9360us  6.4000us  kernel_neurongroup_stateupdater_codeobject(unsigned int, unsigned int, double*, double*, double*)
+                   12.62%  34.946ms     10000  3.4940us  3.3920us  6.4960us  kernel_synapses_post_codeobject(unsigned int, unsigned int, unsigned int, int*, unsigned int, double*, int, double*, int, double*, int, int*, int, double, double*, int, int*)
+                    9.80%  27.129ms     10000  2.7120us  2.3680us  2.9440us  kernel_poissongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*, double*, float*)
+                    9.58%  26.535ms     10000  2.6530us  2.5600us  3.0400us  kernel_synapses_stateupdater_codeobject(unsigned int, unsigned int, double*, int, double*, int, double*, int*)
+                    6.59%  18.247ms     10000  1.8240us  1.7280us  2.0480us  kernel_neurongroup_resetter_codeobject(unsigned int, unsigned int, double*, int*)
+                    6.58%  18.231ms     10000  1.8230us  1.7600us  2.5600us  kernel_neurongroup_thresholder_codeobject(unsigned int, unsigned int, int*, double*)
+                    6.20%  17.155ms     10000  1.7150us  1.6320us  1.9520us  _GLOBAL__N__70_tmpxft_00006ae9_00000000_6_poissongroup_thresholder_codeobject_cpp1_ii_7314966e::_reset_poissongroup_thresholder_codeobject(int*)
+                    5.65%  15.632ms     10000  1.5630us  1.4720us  1.6960us  _GLOBAL__N__69_tmpxft_00006ae5_00000000_6_neurongroup_thresholder_codeobject_cpp1_ii_c0b8948b::_reset_neurongroup_thresholder_codeobject(int*)
+                    0.12%  329.57us         1  329.57us  329.57us  329.57us  void gen_sequenced<curandStateXORWOW, float, int, __operator_&__(float curand_uniform_noargs<curandStateXORWOW>(curandStateXORWOW*, int))>(curandStateXORWOW*, float*, unsigned long, unsigned long, int)
+      API calls:   85.57%  910.45ms     90001  10.116us  8.9060us  9.1466ms  cudaLaunch
+                   10.88%  115.80ms    660005     175ns     132ns  353.03us  cudaSetupArgument
+                    2.00%  21.262ms     90001     236ns     181ns  330.07us  cudaConfigureCall
+                    1.50%  15.984ms     70002     228ns     182ns  318.18us  cudaGetLastError
+                    0.02%  207.89us         1  207.89us  207.89us  207.89us  cudaMalloc
+                    0.01%  132.37us         1  132.37us  132.37us  132.37us  cudaMemGetInfo
+                    0.01%  55.857us        74     754ns     674ns  1.5500us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+                    0.00%  44.986us        12  3.7480us  3.0050us  5.8370us  cudaFuncGetAttributes
+                    0.00%  13.864us         1  13.864us  13.864us  13.864us  cudaDeviceSynchronize
+                    0.00%  9.5470us        20     477ns     338ns  1.1980us  cudaDeviceGetAttribute
+                    0.00%  4.8700us         5     974ns     851ns  1.4220us  cudaGetDevice
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_CUBA-less_kernels_displayed_min_15_profiling.png b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_CUBA-less_kernels_displayed_min_15_profiling.png
new file mode 100644
index 00000000..0abbb193
Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_CUBA-less_kernels_displayed_min_15_profiling.png differ
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_CUBA_absolute.png b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_CUBA_absolute.png
new file mode 100644
index 00000000..499bbcec
Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_CUBA_absolute.png differ
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_CUBA_profiling.png b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_CUBA_profiling.png
new file mode 100644
index 00000000..36649122
Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_CUBA_profiling.png differ
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_CUBA_relative.png b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_CUBA_relative.png
new file mode 100644
index 00000000..47c92e41
Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_CUBA_relative.png differ
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_STDP-less_kernels_displayed_min_15_profiling.png b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_STDP-less_kernels_displayed_min_15_profiling.png
new file mode 100644
index 00000000..70188222
Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_STDP-less_kernels_displayed_min_15_profiling.png differ
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_STDPEventDriven-less_kernels_displayed_min_15_profiling.png b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_STDPEventDriven-less_kernels_displayed_min_15_profiling.png
new file mode 100644
index 00000000..f64a187e
Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_STDPEventDriven-less_kernels_displayed_min_15_profiling.png differ
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_STDPEventDriven_absolute.png b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_STDPEventDriven_absolute.png
new file mode 100644
index 00000000..6754d066
Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_STDPEventDriven_absolute.png differ
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_STDPEventDriven_profiling.png b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_STDPEventDriven_profiling.png
new file mode 100644
index 00000000..bfaebf31
Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_STDPEventDriven_profiling.png differ
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_STDPEventDriven_relative.png b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_STDPEventDriven_relative.png
new file mode 100644
index 00000000..d32ac119
Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_STDPEventDriven_relative.png differ
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_STDP_absolute.png b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_STDP_absolute.png
new file mode 100644
index 00000000..0032b713
Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_STDP_absolute.png differ
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_STDP_profiling.png b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_STDP_profiling.png
new file mode 100644
index 00000000..b98b4ec2
Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_STDP_profiling.png differ
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_STDP_relative.png b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_STDP_relative.png
new file mode 100644
index 00000000..6fe3a90c
Binary files /dev/null and b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/plots/speed_test_STDP_relative.png differ
diff --git a/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/run_speed_test_script.py b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/run_speed_test_script.py
new file mode 100644
index 00000000..0941e299
--- /dev/null
+++ b/dev/benchmarks/results_2017_11_30_cuba_stdp/cuba_stdp_profiled/run_speed_test_script.py
@@ -0,0 +1,284 @@
+import os
+import shutil
+import glob
+import subprocess
+import sys
+import socket
+
+# run tests without X-server
+import matplotlib
+matplotlib.use('Agg')
+
+# pretty plots
+import seaborn
+
+import time
+import datetime
+import cPickle as pickle
+
+from brian2 import *
+from brian2.tests.features import *
+from brian2.tests.features.base import *
+from brian2.tests.features.base import results
+
+import brian2cuda
+from brian2cuda.tests.features.cuda_configuration import (CUDAStandaloneConfiguration,
+                                                          CUDAStandaloneConfigurationNoAssert,
+                                                          CUDAStandaloneConfigurationExtraThresholdKernel,
+                                                          CUDAStandaloneConfigurationCurandDouble,
+                                                          CUDAStandaloneConfigurationNoCudaOccupancyAPI,
+                                                          CUDAStandaloneConfigurationNoCudaOccupancyAPIProfileCPU,
+                                                          CUDAStandaloneConfiguration2BlocksPerSM,
+                                                          CUDAStandaloneConfiguration2BlocksPerSMLaunchBounds,
+                                                          CUDAStandaloneConfigurationSynLaunchBounds,
+                                                          CUDAStandaloneConfiguration2BlocksPerSMSynLaunchBounds,
+                                                          CUDAStandaloneConfigurationProfileGPU,
+                                                          CUDAStandaloneConfigurationProfileCPU)
+                                                          #CUDAStandaloneConfigurationTestBrunelHeteroAtomics,
+                                                          #CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU,
+                                                          #CUDAStandaloneConfigurationPushAtomicResize,
+                                                          #CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize,
+                                                          #CUDAStandaloneConfigurationPushAtomicResizeProfileCPU,
+                                                          #CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResizeProfileCPU,
+                                                          #CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpy,
+                                                          #CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpyProfileCPU)
+from brian2cuda.tests.features.speed import *
+
+from brian2genn.correctness_testing import GeNNConfiguration, GeNNConfigurationCPU, GeNNConfigurationOptimized
+
+from create_readme import create_readme
+
+# Accept at most one optional argument: a suffix for the results directory name.
+# Checked explicitly because an `assert` is stripped when Python runs with -O.
+if len(sys.argv) > 2:
+    sys.exit('Only one command line argument supported! Got {}'.format(len(sys.argv)-1))
+additional_dir_name = ('_' + sys.argv[1]) if len(sys.argv) == 2 else ''
+
+prefs['devices.cpp_standalone.extra_make_args_unix'] = ['-j12']  # default value of the extra_make_args_unix preference
+
+# host specific settings: on 'elnath' use '-j24' and swap the nvcc flag '-arch=sm_35' for '-arch=sm_20'
+if socket.gethostname() == 'elnath':
+    prefs['devices.cpp_standalone.extra_make_args_unix'] = ['-j24']
+    prefs['codegen.cuda.extra_compile_args_nvcc'].remove('-arch=sm_35')  # NOTE(review): assuming this pref is a plain list, remove() raises ValueError if the flag is absent
+    prefs['codegen.cuda.extra_compile_args_nvcc'].extend(['-arch=sm_20'])
+
+configs = [# configuration                          project_directory (where ./main and results/stdout.txt are looked up; None is skipped)
+          #(NumpyConfiguration,                     None),
+          #(WeaveConfiguration,                     None),
+          #(LocalConfiguration,                     None),
+          #(CPPStandaloneConfiguration,              'cpp_standalone'),
+          #(CPPStandaloneConfigurationOpenMP,        'cpp_standalone'),
+          (CUDAStandaloneConfiguration,             'cuda_standalone'),
+          #(CUDAStandaloneConfigurationPushAtomicResize,   'cuda_standalone'),
+          #(CUDAStandaloneConfigurationTestBrunelHeteroAtomics,   'cuda_standalone'),
+          #(CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResize,     'cuda_standalone'),
+          #(CUDAStandaloneConfigurationExtraThresholdKernel,             'cuda_standalone'),
+          #(CUDAStandaloneConfigurationNoAssert,             'cuda_standalone'),
+          #(CUDAStandaloneConfigurationCurandDouble,              'cuda_standalone'),
+          #(CUDAStandaloneConfigurationNoCudaOccupancyAPI,      'cuda_standalone'),
+          #(CUDAStandaloneConfigurationNoCudaOccupancyAPIProfileCPU,    'cuda_standalone'),
+          #(CUDAStandaloneConfiguration2BlocksPerSM, 'cuda_standalone'),
+          #(CUDAStandaloneConfiguration2BlocksPerSMLaunchBounds, 'cuda_standalone'),
+          #(CUDAStandaloneConfigurationSynLaunchBounds,     'cuda_standalone'),
+          #(CUDAStandaloneConfiguration2BlocksPerSMSynLaunchBounds, 'cuda_standalone'),
+          #(CUDAStandaloneConfigurationProfileGPU,   'cuda_standalone'),
+          (CUDAStandaloneConfigurationProfileCPU,   'cuda_standalone'),
+          #(CUDAStandaloneConfigurationTestBrunelHeteroAtomicsProfileCPU,   'cuda_standalone'),
+          #(CUDAStandaloneConfigurationPushAtomicResizeProfileCPU,   'cuda_standalone'),
+          #(CUDAStandaloneConfigurationBrunelHeterogAndPushAtomicResizeProfileCPU,     'cuda_standalone'),
+          #(CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpy,   'cuda_standalone'),
+          #(CUDAStandaloneConfigurationPushAtomicResizeAsyncMemcpyProfileCPU,   'cuda_standalone'),
+          #(GeNNConfiguration,                       'GeNNworkspace'),
+          #(GeNNConfigurationCPU,                    'GeNNworkspace'),
+          #(GeNNConfigurationOptimized,              'GeNNworkspace')
+          ]
+
+speed_tests = [# feature_test                     name                                  n_slice
+               # `name` labels the saved plot/pickle/log files; `n_slice` is passed to run_speed_tests() and indexes the test's n_range
+               #(ThresholderOnlyPoissonLowRate,                  'ThresholderOnlyPoissonLowRate',                slice(None)         ),
+               #(ThresholderOnlyPoissonMediumRate,               'ThresholderOnlyPoissonMediumRate',             slice(None)         ),
+               #(ThresholderOnlyPoissonHighRate,                 'ThresholderOnlyPoissonHighRate',               slice(None)         ),
+               #(ThresholderOnlyAlwaysSpiking,                   'ThresholderOnlyAlwaysSpiking',                 slice(None)         ),
+
+               #(BrunelHakimStateupdateOnlyDouble,               'BrunelHakimStateupdateOnlyDouble',             slice(None)         ),
+               #(BrunelHakimStateupdateOnlyTriple,               'BrunelHakimStateupdateOnlyTriple',             slice(None)         ),
+               #(BrunelHakimStateupdateOnly,                     'BrunelHakimStateupdateOnly',                   slice(None)         ),
+               #(BrunelHakimNeuronsOnly,                         'BrunelHakimNeuronsOnly',                       slice(None)         ),
+               #(BrunelHakimNeuronsOnlyNoXi,                     'BrunelHakimNeuronsOnlyNoXi',                   slice(None)         ),
+               #(BrunelHakimNeuronsOnlyNoRand,                   'BrunelHakimNeuronsOnlyNoRand',                 slice(None)         ),
+               #(BrunelHakimStateupdateThresholdOnly,            'BrunelHakimStateupdateThresholdOnly',          slice(None)         ),
+               #(BrunelHakimStateupdateThresholdResetOnly,       'BrunelHakimStateupdateThresholdResetOnly',     slice(None)         ),
+               #(BrunelHakimModelScalarDelayShort,               'BrunelHakimModelScalarDelayShort',             slice(None)         ),
+               #(BrunelHakimModelScalarDelayNoSelfConnections,   'BrunelHakimModelScalarDelayNoSelfConnections', slice(None)         ),
+               #(COBAHH,                                         'COBAHH',                                       slice(None)         ),
+               #(AdaptationOscillation,                          'AdaptationOscillation',                        slice(None)         ),
+               #(Vogels,                                         'Vogels',                                       slice(None)         ),
+               (CUBA,                                           'CUBA',                                         slice(0,-1,1)         ),
+               (STDPNotEventDriven,                                           'STDP',                                         slice(None)         ),
+               (STDPEventDriven,                                'STDPEventDriven',                              slice(None)         ),
+               #(BrunelHakimModelScalarDelay,                    'BrunelHakimModelScalarDelay',                  slice(None)         ),
+
+               #(VerySparseMediumRateSynapsesOnly,               'VerySparseMediumRateSynapsesOnly',             slice(None)         ),
+               #(SparseMediumRateSynapsesOnly,                   'SparseMediumRateSynapsesOnly',                 slice(None)         ),
+               #(DenseMediumRateSynapsesOnly,                    'DenseMediumRateSynapsesOnly',                  slice(None)         ),
+               #(SparseLowRateSynapsesOnly,                      'SparseLowRateSynapsesOnly',                    slice(None)         ),
+               #(SparseHighRateSynapsesOnly,                     'SparseHighRateSynapsesOnly',                   slice(None)         ),
+
+               #(STDPNotEventDriven,                             'STDPNotEventDriven',                           slice(None)         ),
+               #(STDPMultiPost,                                  'STDPMultiPost',                                slice(None)         ),
+               #(STDPNeuronalTraces,                             'STDPNeuronalTraces',                           slice(None)         ),
+               #(STDPMultiPostNeuronalTraces,                    'STDPMultiPostNeuronalTraces',                  slice(None)         ),
+
+               #(BrunelHakimModelHeterogeneousDelay,             'BrunelHakimModelHeterogeneousDelay',           slice(0,-1,1)         ),
+
+               #(LinearNeuronsOnly,                              'LinearNeuronsOnly',                            slice(None)         ),
+               #(HHNeuronsOnly,                                  'HHNeuronsOnly',                                slice(None)         ),
+               #(VogelsWithSynapticDynamic,                      'VogelsWithSynapticDynamic',                    slice(None)         ),
+
+               ### below uses monitors
+               #(CUBAFixedConnectivity,                          'CUBAFixedConnectivity',                        slice(None)         ),
+               #(COBAHHFixedConnectivity,                        'COBAHHFixedConnectivity',                      slice(None, -1)     ),
+]
+
+configurations = [config[0] for config in configs]
+project_dirs = [config[1] for config in configs]
+
+# last_idx maps each project directory to the index of the LAST configuration using
+# it; every directory shared by several configurations is warned about exactly once
+last_idx = {}
+for cfg_idx, proj_dir in enumerate(project_dirs):
+    if proj_dir is not None:
+        last_idx[proj_dir] = cfg_idx
+for proj_dir, last_i in sorted(last_idx.items(), key=lambda item: item[1]):
+    if project_dirs.count(proj_dir) > 1:
+        print("WARNING there are multiple configurations using {d} as project "
+              "directory. Profiling and logfiles will only be saved for the last one {c}.".format(
+              d=proj_dir, c=configurations[last_i].__name__))
+
+# The results directory is named 'results_<YYYY_MM_DD><suffix>'; a directory already
+# carrying that name is moved aside to '<name>_bak_<unix time>' before it is recreated.
+time_stemp = time.time()
+date_str = datetime.datetime.fromtimestamp(time_stemp).strftime('%Y_%m_%d')
+directory = 'results_{}{}'.format(date_str, additional_dir_name)
+if os.path.exists(directory):
+    new_dir = directory + '_bak_' + str(int(time.time()))
+    print("Directory with name `{}` already exists. Renaming it to `{}`.".format(directory, new_dir))
+    os.rename(directory, new_dir)
+os.makedirs(directory)
+
+# one subdirectory per kind of output
+data_dir = os.path.join(directory, 'data')
+plot_dir = os.path.join(directory, 'plots')
+log_dir = os.path.join(directory, 'logs')
+prof_dir = os.path.join(directory, 'nvprof')
+for sub_dir in (data_dir, plot_dir, log_dir, prof_dir):
+    os.makedirs(sub_dir)
+print("Saving results in {}.".format(plot_dir))
+
+# archive this script and the current `git diff` output next to the results
+shutil.copy(os.path.realpath(__file__), os.path.join(directory, 'run_speed_test_script.py'))
+time_format = '%d.%m.%Y at %H:%M:%S'
+script_start = datetime.datetime.fromtimestamp(time.time()).strftime(time_format)
+with open(os.path.join(directory, 'git.diff'), 'w') as diff_file:
+    subprocess.call(['git', 'diff'], stdout=diff_file)
+
+try:
+    for n, (st, name, sl) in enumerate(speed_tests):
+        start = datetime.datetime.fromtimestamp(time.time()).strftime(time_format)
+        print("Starting {} on {}.".format(name, start))
+        maximum_run_time = 1*60*60*second
+        res = run_speed_tests(configurations=configurations,
+                              speed_tests=[st],
+                              n_slice=sl,
+                              #n_slice=slice(0,1,None),
+                              run_twice=False,
+                              verbose=True,
+                              maximum_run_time=maximum_run_time#,
+                              ## this needs modification of brian2 code
+                              #profile_only_active=True 
+                              #profile_only_active=False
+                             )
+        end = datetime.datetime.fromtimestamp(time.time()).strftime(time_format)
+        diff = datetime.datetime.strptime(end, time_format) - datetime.datetime.strptime(start, time_format)
+        print("Running {} took {}.".format(name, diff))
+        res.plot_all_tests()
+        ## this needs modification of brian2 code
+        #res.plot_all_tests(print_relative=True)
+
+        res.plot_all_tests()
+        ## this needs modification of brian2 code
+        #res.plot_all_tests(print_relative=True)
+        savefig(os.path.join(plot_dir, 'speed_test_{}_absolute.svg'.format(speed_tests[n][1])))
+        res.plot_all_tests(relative=True)
+        savefig(os.path.join(plot_dir, 'speed_test_{}_relative.svg'.format(name)))
+        res.plot_all_tests(profiling_minimum=0.05)
+        savefig(os.path.join(plot_dir, 'speed_test_{}_profiling.svg'.format(name)))
+        res.plot_all_tests(profiling_minimum=0.15)
+        savefig(os.path.join(plot_dir, 'speed_test_{}-less_kernels_displayed_min_15_profiling.svg'.format(name)))
+
+        if 3 != len(get_fignums()):
+            print("WARNING: There were {} plots created, but only {} saved.".format(len(get_fignums()), 3*(n+1)))
+        for n in get_fignums():
+            close(n)
+
+        # pickle results object to disk
+        pkl_file = os.path.join(data_dir, name + '.pkl' )
+        with open(pkl_file, 'wb') as output:
+                pickle.dump(res, output, pickle.HIGHEST_PROTOCOL)
+
+        # save stdout log of last run (the others are deleted in run_speed_tests())
+        for proj_dir in set(project_dirs):
+            if not proj_dir is None and proj_dir in ['cuda_standalone', 'cpp_standalone']:
+                config = configurations[last_idx[proj_dir]]
+                stdout_file = os.path.join(proj_dir, 'results/stdout.txt')
+                if os.path.exists(stdout_file):
+                    shutil.copy(stdout_file,
+                                os.path.join(log_dir, 'stdout_{st}_{conf}_{n}.txt'.format(st=name, conf=proj_dir,
+                                                                                           n=st.n_range[sl][-1])))
+                else:
+                    print("WARNING Couldn't save {},file not found.".format(stdout_file))
+
+        # rerun the cuda_standalone and GeNN configurations at n_range[2] and profile them with nvprof
+        for conf, proj_dir in zip(configurations, project_dirs):
+            main_arg = ''
+            if proj_dir in ['cuda_standalone', 'GeNNworkspace']:
+                if proj_dir == 'GeNNworkspace':
+                    main_arg = 'test {time} 1'.format(time=st.duration/second)
+                ns = st.n_range[sl]
+                idx = 2
+                max_runtime = 20
+                conf_name = conf.__name__
+                print("Rerunning {} with n = {} for nvprof profiling".format(conf_name, st.n_range[idx]))
+                tb, res, runtime, prof_info = results(conf, st, st.n_range[idx], maximum_run_time=maximum_run_time)
+                if not isinstance(res, Exception) and runtime < max_runtime:
+                    option = '--profile-from-start-off' if proj_dir == 'cuda_standalone' else ''
+                    cmd = 'cd {proj_dir} && nvprof {opt} --log-file ../{log_file} ./main {arg}'.format(
+                        proj_dir=proj_dir, arg=main_arg, opt=option,
+                        log_file=os.path.join(prof_dir, 'nvprof_{st}_{conf}_{n}.log'.format(
+                            st=name, conf=conf_name, n=st.n_range[idx])))
+                    prof_start = datetime.datetime.fromtimestamp(time.time()).strftime(time_format)
+                    print(cmd)
+                    x = os.system(cmd)
+                    if x:
+                        print('nvprof failed with {}'.format(x))
+                    prof_end = datetime.datetime.fromtimestamp(time.time()).strftime(time_format)
+                    prof_diff = datetime.datetime.strptime(prof_end, time_format) - datetime.datetime.strptime(prof_start, time_format)
+                    print("Profiling took {} for runtime of {}".format(prof_diff, runtime))
+finally:
+    create_readme(directory)
+    print("\nSummarized speed test results in {}".format(directory + '/README.md'))
+    script_end = datetime.datetime.fromtimestamp(time.time()).strftime(time_format)
+    script_diff = datetime.datetime.strptime(script_end, time_format) - datetime.datetime.strptime(script_start, time_format)
+    print("Finished speed test on {}. Total time = {}.".format(
+        datetime.datetime.fromtimestamp(time.time()).strftime(time_format), script_diff))
+
+
+##res.plot_all_tests(relative=True)
+#for n in get_fignums():
+#    plt.figure(n)
+#    savefig(plot_dir + '/speed_test_{}.png'.format(speed_tests[n-1][1]))
+
+## Debug (includes profiling infos)
+#from brian2.tests.features.base import results
+#for x in results(LocalConfiguration, LinearNeuronsOnly, 10, maximum_run_time=10*second):
+#    print x