diff --git a/brian2cuda/brianlib/cudaVector.h b/brian2cuda/brianlib/cudaVector.h
index 1bf30fd8..8344bbd3 100644
--- a/brian2cuda/brianlib/cudaVector.h
+++ b/brian2cuda/brianlib/cudaVector.h
@@ -51,6 +51,19 @@ class cudaVector
         return m_data;
     };
 
+    __device__ void set_size_address(volatile size_type* size)
+    {
+        // Placeholder: currently a no-op and `size` is unused. TODO:
+        //  1. Modify m_size to be a pointer.
+        //  2. Change its address in here (not sure if method call needs 'volatile'?).
+        //  3. In spikequeue.h, after initializing the queues, declare a
+        //     volatile array of size of queues and change each queue's m_size
+        //     parameter with this function here.
+        //  4. In synapses.cu, get the queue size by indexing this new array
+        //     with current_offset and Memcpy it to host to set the correct kernel
+        //     dimensions.
+    };
+
     __device__ scalar& at(size_type index)
     {
         if (index < 0 || index >= m_size)
diff --git a/brian2cuda/templates/synapses_push_spikes.cu b/brian2cuda/templates/synapses_push_spikes.cu
index 16b408ce..57228ef5 100644
--- a/brian2cuda/templates/synapses_push_spikes.cu
+++ b/brian2cuda/templates/synapses_push_spikes.cu
@@ -1054,12 +1054,16 @@ void _run_{{codeobj_name}}()
      */
     needed_shared_memory = (2 * {{owner.name}}_max_num_unique_delays + 1) * sizeof(int);
     assert (needed_shared_memory <= max_shared_mem_size);
+    // We don't need more than max(num_synapses) threads per block.
+    num_threads = {{owner.name}}_max_size;
     {% else %}{# bundle_mode #}
     needed_shared_memory = 0;
+    // We don't need more than max(num_delays) threads per block.
+    num_threads = {{owner.name}}_bundle_size_max;
     {% endif %}{# not bundle_mode #}
 
-    // We don't need more then max(num_synapses) threads per block.
-    num_threads = {{owner.name}}_max_size;
+    // TODO: warp size multiple for num_threads?
+
     if (num_threads > max_threads_per_block)
     {
         num_threads = max_threads_per_block;