diff --git a/brian2cuda/brianlib/cudaVector.h b/brian2cuda/brianlib/cudaVector.h
index 1bf30fd8..8344bbd3 100644
--- a/brian2cuda/brianlib/cudaVector.h
+++ b/brian2cuda/brianlib/cudaVector.h
@@ -51,6 +51,19 @@ class cudaVector
         return m_data;
     };
 
+    __device__ void set_size_address(volatile size_type* size)
+    {
+        // TODO: Not implemented yet (this method is currently a no-op). Plan:
+        // 1. Modify m_size to be a pointer
+        // 2. Change its address in here (not sure if method call needs 'volatile'?)
+        // 3. In spikequeue.h, after initializing the queues, declare a
+        //    volatile array of size of queues and change each queue's m_size
+        //    parameter with this function here.
+        // 4. In synapses.cu, get the queue size by indexing this new array
+        //    with current_offset and Memcpy it to host to set the correct kernel
+        //    dimensions.
+    };
+
     __device__ scalar& at(size_type index)
     {
         if (index < 0 || index >= m_size)
diff --git a/brian2cuda/templates/synapses_push_spikes.cu b/brian2cuda/templates/synapses_push_spikes.cu
index 16b408ce..57228ef5 100644
--- a/brian2cuda/templates/synapses_push_spikes.cu
+++ b/brian2cuda/templates/synapses_push_spikes.cu
@@ -1054,12 +1054,16 @@ void _run_{{codeobj_name}}()
      */
     needed_shared_memory = (2 * {{owner.name}}_max_num_unique_delays + 1) * sizeof(int);
     assert (needed_shared_memory <= max_shared_mem_size);
+    // We don't need more than max(num_synapses) threads per block.
+    num_threads = {{owner.name}}_max_size;
     {% else %}{# bundle_mode #}
     needed_shared_memory = 0;
+    // We don't need more than max(num_delays) threads per block.
+    num_threads = {{owner.name}}_bundle_size_max;
     {% endif %}{# not bundle_mode #}
 
-    // We don't need more then max(num_synapses) threads per block.
-    num_threads = {{owner.name}}_max_size;
+    // TODO: warp size multiple for num_threads?
+
     if (num_threads > max_threads_per_block)
     {
         num_threads = max_threads_per_block;