[DO NOT MERGE] Parallel execution #259

Open · wants to merge 10 commits into base: master
Changes from 1 commit
Parallelize code execution
SudeshnaBora committed Nov 20, 2021
commit 97a8116ed6f9caf61acfbd770d855ab855b695b2
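
This commit moves each code object's kernel launches and host/device copies onto a named CUDA stream so that independent code objects can run concurrently on the GPU. The sketch below (not part of the diff) illustrates the launch pattern applied throughout the hunks; the stream variables such as neurongroup_stream and spikemonitor_stream1 are assumed to be created once during initialisation, which is not visible in these hunks.

// Minimal sketch, assuming the streams are created once at start-up.
#include <cuda_runtime.h>

__global__ void _example_kernel(int _N) { /* device work */ }

void run_step()
{
    static cudaStream_t neurongroup_stream, spikemonitor_stream1;
    static bool first_run = true;
    if (first_run)
    {
        cudaStreamCreate(&neurongroup_stream);
        cudaStreamCreate(&spikemonitor_stream1);
        first_run = false;
    }

    const int num_blocks = 32, num_threads = 128, _N = 4096;

    // Launch syntax: <<<grid, block, dynamic shared memory, stream>>>.
    // Kernels enqueued on different streams may overlap on the device.
    _example_kernel<<<num_blocks, num_threads, 0, neurongroup_stream>>>(_N);
    _example_kernel<<<num_blocks, num_threads, 0, spikemonitor_stream1>>>(_N);

    // Device-wide barrier, as added at the end of the simulation step in
    // this commit: orders all streams before dependent host code continues.
    cudaDeviceSynchronize();
}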
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -437,7 +437,7 @@ void _run_neurongroup_1_stateupdater_codeobject()
}


_run_kernel_neurongroup_1_stateupdater_codeobject<<<num_blocks, num_threads>>>(
_run_kernel_neurongroup_1_stateupdater_codeobject<<<num_blocks, num_threads,0, neurongroup_stream1>>>(
_N,
num_threads,
///// HOST_PARAMETERS /////
Binary file not shown.
@@ -384,13 +384,13 @@ void _run_neurongroup_1_thresholder_codeobject()
first_run = false;
}

_reset_neurongroup_1_thresholder_codeobject<<<num_blocks, num_threads>>>(
_reset_neurongroup_1_thresholder_codeobject<<<num_blocks, num_threads,0, neurongroup_stream1>>>(
dev_array_neurongroup_1__spikespace[current_idx_array_neurongroup_1__spikespace]
);

CUDA_CHECK_ERROR("_reset_neurongroup_1_thresholder_codeobject");

_run_kernel_neurongroup_1_thresholder_codeobject<<<num_blocks, num_threads>>>(
_run_kernel_neurongroup_1_thresholder_codeobject<<<num_blocks, num_threads, 0, neurongroup_stream1>>>(
_N,
num_threads,
///// HOST_PARAMETERS /////
@@ -403,7 +403,6 @@ void _run_neurongroup_1_thresholder_codeobject()

CUDA_CHECK_ERROR("_run_kernel_neurongroup_1_thresholder_codeobject");


}


Binary file not shown.
@@ -428,7 +428,7 @@ void _run_neurongroup_stateupdater_codeobject()
}


_run_kernel_neurongroup_stateupdater_codeobject<<<num_blocks, num_threads>>>(
_run_kernel_neurongroup_stateupdater_codeobject<<<num_blocks, num_threads,0,neurongroup_stream>>>(
_N,
num_threads,
///// HOST_PARAMETERS /////
Binary file not shown.
@@ -384,13 +384,13 @@ void _run_neurongroup_thresholder_codeobject()
first_run = false;
}

_reset_neurongroup_thresholder_codeobject<<<num_blocks, num_threads>>>(
_reset_neurongroup_thresholder_codeobject<<<num_blocks, num_threads,0, neurongroup_stream>>>(
dev_array_neurongroup__spikespace[current_idx_array_neurongroup__spikespace]
);

CUDA_CHECK_ERROR("_reset_neurongroup_thresholder_codeobject");

_run_kernel_neurongroup_thresholder_codeobject<<<num_blocks, num_threads>>>(
_run_kernel_neurongroup_thresholder_codeobject<<<num_blocks, num_threads, 0, neurongroup_stream>>>(
_N,
num_threads,
///// HOST_PARAMETERS /////
Binary file not shown.
@@ -439,7 +439,7 @@ void _run_spikegeneratorgroup_codeobject()

// Note: If we have no delays, there is only one spikespace and
// current_idx equals previous_idx.
_reset_spikegeneratorgroup_codeobject<<<num_blocks, num_threads>>>(
_reset_spikegeneratorgroup_codeobject<<<num_blocks, num_threads, 0, spikegenerator_stream>>>(
dev_array_spikegeneratorgroup__spikespace[previous_idx_array_spikegeneratorgroup__spikespace],
///// HOST_PARAMETERS /////
dev_array_spikegeneratorgroup__lastindex,
@@ -456,7 +456,7 @@

CUDA_CHECK_ERROR("_reset_spikegeneratorgroup_codeobject");

_run_kernel_spikegeneratorgroup_codeobject<<<num_blocks, num_threads>>>(
_run_kernel_spikegeneratorgroup_codeobject<<<num_blocks, num_threads,0, spikegenerator_stream>>>(
_N,
num_threads,
///// HOST_PARAMETERS /////
@@ -473,6 +473,7 @@ void _run_spikegeneratorgroup_codeobject()
);

CUDA_CHECK_ERROR("_run_kernel_spikegeneratorgroup_codeobject");
CUDA_SAFE_CALL(cudaDeviceSynchronize());


}
Binary file not shown.
@@ -304,7 +304,7 @@ void _run_spikemonitor_1_codeobject()
static bool first_run = true;
if (first_run)
{
_init_kernel_spikemonitor_1_codeobject<<<1,1>>>();
_init_kernel_spikemonitor_1_codeobject<<<1,1,0,spikemonitor_stream1>>>();

CUDA_CHECK_ERROR("_init_kernel_spikemonitor_1_codeobject");
num_blocks = 1;
@@ -374,7 +374,7 @@ num_threads = 1;
}


_run_kernel_spikemonitor_1_codeobject<<<num_blocks, num_threads>>>(
_run_kernel_spikemonitor_1_codeobject<<<num_blocks, num_threads,0,spikemonitor_stream1>>>(
_num_spikespace-1,
dev_array_spikemonitor_1_count,
// HOST_PARAMETERS
@@ -519,7 +519,7 @@ void _copyToHost_spikemonitor_1_codeobject()
double* const dev_array_spikemonitor_1_t = thrust::raw_pointer_cast(&dev_dynamic_array_spikemonitor_1_t[0]);
const int _numt = dev_dynamic_array_spikemonitor_1_t.size();

_count_kernel_spikemonitor_1_codeobject<<<1,1>>>(
_count_kernel_spikemonitor_1_codeobject<<<1,1,0,spikemonitor_stream1>>>(
dev_num_events,
// HOST_PARAMETERS
dev_array_spikemonitor_1_N,
@@ -537,7 +537,7 @@ void _copyToHost_spikemonitor_1_codeobject()
CUDA_CHECK_ERROR("_count_kernel_spikemonitor_1_codeobject");

CUDA_SAFE_CALL(
cudaMemcpy(&host_num_events, dev_num_events, sizeof(int), cudaMemcpyDeviceToHost)
cudaMemcpyAsync(&host_num_events, dev_num_events, sizeof(int), cudaMemcpyDeviceToHost,spikemonitor_stream1)
);

// resize monitor device vectors
@@ -548,7 +548,7 @@ void _copyToHost_spikemonitor_1_codeobject()
dev_dynamic_array_spikemonitor_1_i.resize(host_num_events)
);

_copy_kernel_spikemonitor_1_codeobject<<<1,1>>>(
_copy_kernel_spikemonitor_1_codeobject<<<1,1,0,spikemonitor_stream1>>>(
thrust::raw_pointer_cast(&dev_dynamic_array_spikemonitor_1_t[0]),
thrust::raw_pointer_cast(&dev_dynamic_array_spikemonitor_1_i[0]),
0 );
@@ -574,7 +574,7 @@ void _debugmsg_spikemonitor_1_codeobject()
// TODO: can't we access the correct _array_eventmonitor_N[0]
// value here without any kernel call?
// Yes: use _array_spikemonitor_1_N
_debugmsg_kernel_spikemonitor_1_codeobject<<<1,1>>>(
_debugmsg_kernel_spikemonitor_1_codeobject<<<1,1,0,spikemonitor_stream1>>>(
// HOST_PARAMETERS
dev_array_spikemonitor_1_N,
dev_array_neurongroup_i,
Binary file not shown.
@@ -304,7 +304,7 @@ void _run_spikemonitor_2_codeobject()
static bool first_run = true;
if (first_run)
{
_init_kernel_spikemonitor_2_codeobject<<<1,1>>>();
_init_kernel_spikemonitor_2_codeobject<<<1,1,0,spikemonitor_stream2>>>();

CUDA_CHECK_ERROR("_init_kernel_spikemonitor_2_codeobject");
num_blocks = 1;
@@ -374,7 +374,7 @@ num_threads = 1;
}


_run_kernel_spikemonitor_2_codeobject<<<num_blocks, num_threads>>>(
_run_kernel_spikemonitor_2_codeobject<<<num_blocks, num_threads,0,spikemonitor_stream2>>>(
_num_spikespace-1,
dev_array_spikemonitor_2_count,
// HOST_PARAMETERS
@@ -519,7 +519,7 @@ void _copyToHost_spikemonitor_2_codeobject()
double* const dev_array_spikemonitor_2_t = thrust::raw_pointer_cast(&dev_dynamic_array_spikemonitor_2_t[0]);
const int _numt = dev_dynamic_array_spikemonitor_2_t.size();

_count_kernel_spikemonitor_2_codeobject<<<1,1>>>(
_count_kernel_spikemonitor_2_codeobject<<<1,1,0,spikemonitor_stream2>>>(
dev_num_events,
// HOST_PARAMETERS
dev_array_spikemonitor_2_N,
@@ -537,7 +537,7 @@ void _copyToHost_spikemonitor_2_codeobject()
CUDA_CHECK_ERROR("_count_kernel_spikemonitor_2_codeobject");

CUDA_SAFE_CALL(
cudaMemcpy(&host_num_events, dev_num_events, sizeof(int), cudaMemcpyDeviceToHost)
cudaMemcpyAsync(&host_num_events, dev_num_events, sizeof(int), cudaMemcpyDeviceToHost,spikemonitor_stream2)
);

// resize monitor device vectors
Expand All @@ -548,7 +548,7 @@ void _copyToHost_spikemonitor_2_codeobject()
dev_dynamic_array_spikemonitor_2_i.resize(host_num_events)
);

_copy_kernel_spikemonitor_2_codeobject<<<1,1>>>(
_copy_kernel_spikemonitor_2_codeobject<<<1,1,0,spikemonitor_stream2>>>(
thrust::raw_pointer_cast(&dev_dynamic_array_spikemonitor_2_t[0]),
thrust::raw_pointer_cast(&dev_dynamic_array_spikemonitor_2_i[0]),
0 );
@@ -574,7 +574,7 @@ void _debugmsg_spikemonitor_2_codeobject()
// TODO: can't we access the correct _array_eventmonitor_N[0]
// value here without any kernel call?
// Yes: use _array_spikemonitor_2_N
_debugmsg_kernel_spikemonitor_2_codeobject<<<1,1>>>(
_debugmsg_kernel_spikemonitor_2_codeobject<<<1,1,0,spikemonitor_stream2>>>(
// HOST_PARAMETERS
dev_array_spikemonitor_2_N,
dev_array_neurongroup_1_i,
@@ -589,5 +589,7 @@ void _debugmsg_spikemonitor_2_codeobject()
);

CUDA_CHECK_ERROR("_debugmsg_kernel_spikemonitor_2_codeobject");
CUDA_SAFE_CALL(cudaDeviceSynchronize());
}


Binary file not shown.
@@ -304,7 +304,7 @@ void _run_spikemonitor_codeobject()
static bool first_run = true;
if (first_run)
{
_init_kernel_spikemonitor_codeobject<<<1,1>>>();
_init_kernel_spikemonitor_codeobject<<<1,1,0,spikemonitor_stream>>>();

CUDA_CHECK_ERROR("_init_kernel_spikemonitor_codeobject");
num_blocks = 1;
@@ -374,7 +374,7 @@ num_threads = 1;
}


_run_kernel_spikemonitor_codeobject<<<num_blocks, num_threads>>>(
_run_kernel_spikemonitor_codeobject<<<num_blocks, num_threads,0, spikemonitor_stream>>>(
_num_spikespace-1,
dev_array_spikemonitor_count,
// HOST_PARAMETERS
@@ -519,7 +519,7 @@ void _copyToHost_spikemonitor_codeobject()
double* const dev_array_spikemonitor_t = thrust::raw_pointer_cast(&dev_dynamic_array_spikemonitor_t[0]);
const int _numt = dev_dynamic_array_spikemonitor_t.size();

_count_kernel_spikemonitor_codeobject<<<1,1>>>(
_count_kernel_spikemonitor_codeobject<<<1,1,0,spikemonitor_stream>>>(
dev_num_events,
// HOST_PARAMETERS
dev_array_spikemonitor_N,
@@ -537,7 +537,7 @@ void _copyToHost_spikemonitor_codeobject()
CUDA_CHECK_ERROR("_count_kernel_spikemonitor_codeobject");

CUDA_SAFE_CALL(
cudaMemcpy(&host_num_events, dev_num_events, sizeof(int), cudaMemcpyDeviceToHost)
cudaMemcpyAsync(&host_num_events, dev_num_events, sizeof(int), cudaMemcpyDeviceToHost,spikemonitor_stream)
);

// resize monitor device vectors
Expand All @@ -548,7 +548,7 @@ void _copyToHost_spikemonitor_codeobject()
dev_dynamic_array_spikemonitor_i.resize(host_num_events)
);

_copy_kernel_spikemonitor_codeobject<<<1,1>>>(
_copy_kernel_spikemonitor_codeobject<<<1,1,0,spikemonitor_stream>>>(
thrust::raw_pointer_cast(&dev_dynamic_array_spikemonitor_t[0]),
thrust::raw_pointer_cast(&dev_dynamic_array_spikemonitor_i[0]),
0 );
@@ -574,7 +574,7 @@ void _debugmsg_spikemonitor_codeobject()
// TODO: can't we access the correct _array_eventmonitor_N[0]
// value here without any kernel call?
// Yes: use _array_spikemonitor_N
_debugmsg_kernel_spikemonitor_codeobject<<<1,1>>>(
_debugmsg_kernel_spikemonitor_codeobject<<<1,1,0,spikemonitor_stream>>>(
// HOST_PARAMETERS
dev_array_spikemonitor_N,
dev_array_spikegeneratorgroup_i,
Binary file not shown.
@@ -376,7 +376,7 @@ void _run_synapses_1_group_variable_set_conditional_codeobject()
}


_run_kernel_synapses_1_group_variable_set_conditional_codeobject<<<num_blocks, num_threads>>>(
_run_kernel_synapses_1_group_variable_set_conditional_codeobject<<<num_blocks, num_threads,0,stream1>>>(
_N,
num_threads,
///// HOST_PARAMETERS /////
Binary file not shown.
@@ -401,7 +401,7 @@ void _run_synapses_1_group_variable_set_conditional_codeobject_1()
}


_run_kernel_synapses_1_group_variable_set_conditional_codeobject_1<<<num_blocks, num_threads>>>(
_run_kernel_synapses_1_group_variable_set_conditional_codeobject_1<<<num_blocks, num_threads,0,stream1>>>(
_N,
num_threads,
///// HOST_PARAMETERS /////
Binary file not shown.
@@ -509,7 +509,7 @@ if (synapses_1_post_max_size > 0)
{
for(int bid_offset = 0; bid_offset < num_loops; bid_offset++)
{
_run_kernel_synapses_1_post_codeobject<<<num_blocks, num_threads>>>(
_run_kernel_synapses_1_post_codeobject<<<num_blocks, num_threads,0,stream1>>>(
_N,
bid_offset,
defaultclock.timestep[0],
@@ -534,6 +534,7 @@ if (synapses_1_post_max_size > 0)
}

CUDA_CHECK_ERROR("_run_kernel_synapses_1_post_codeobject");
CUDA_SAFE_CALL(cudaDeviceSynchronize());
}


Binary file not shown.
@@ -265,13 +265,13 @@ void _run_synapses_1_post_push_spikes()
// get the number of spiking neurons
int32_t num_spiking_neurons;
CUDA_SAFE_CALL(
cudaMemcpy(&num_spiking_neurons,
cudaMemcpyAsync(&num_spiking_neurons,
dev_array_neurongroup_1__spikespace[current_idx_array_neurongroup_1__spikespace] + _num_spikespace - 1,
sizeof(int32_t), cudaMemcpyDeviceToHost)
sizeof(int32_t), cudaMemcpyDeviceToHost,stream1)
);

// advance spike queues
_advance_kernel_synapses_1_post_push_spikes<<<1, num_parallel_blocks>>>();
_advance_kernel_synapses_1_post_push_spikes<<<1, num_parallel_blocks,0,stream1>>>();

CUDA_CHECK_ERROR("_advance_kernel_synapses_1_post_push_spikes");

@@ -359,7 +359,7 @@ void _run_synapses_1_post_push_spikes()
{
num_blocks = num_parallel_blocks * num_spiking_neurons;

_run_kernel_synapses_1_post_push_spikes<<<num_blocks, num_threads, needed_shared_memory>>>(
_run_kernel_synapses_1_post_push_spikes<<<num_blocks, num_threads, needed_shared_memory,stream1>>>(
num_parallel_blocks,
num_blocks,
num_threads,
Binary file not shown.
@@ -534,9 +534,9 @@ if (synapses_1_pre_max_size > 0)
{
if (defaultclock.timestep[0] >= synapses_1_pre_delay)
{
cudaMemcpy(&num_spiking_neurons,
cudaMemcpyAsync(&num_spiking_neurons,
&dev_array_neurongroup__spikespace[synapses_1_pre_eventspace_idx][_num__array_neurongroup__spikespace - 1],
sizeof(int32_t), cudaMemcpyDeviceToHost);
sizeof(int32_t), cudaMemcpyDeviceToHost,stream1);
num_blocks = num_parallel_blocks * num_spiking_neurons;
//TODO collect info abt mean, std of num spiking neurons per time
//step and print INFO at end of simulation
Binary file not shown.
@@ -265,13 +265,13 @@ void _run_synapses_1_pre_push_spikes()
// get the number of spiking neurons
int32_t num_spiking_neurons;
CUDA_SAFE_CALL(
cudaMemcpy(&num_spiking_neurons,
cudaMemcpyAsync(&num_spiking_neurons,
dev_array_neurongroup__spikespace[current_idx_array_neurongroup__spikespace] + _num_spikespace - 1,
sizeof(int32_t), cudaMemcpyDeviceToHost)
sizeof(int32_t), cudaMemcpyDeviceToHost,stream1)
);

// advance spike queues
_advance_kernel_synapses_1_pre_push_spikes<<<1, num_parallel_blocks>>>();
_advance_kernel_synapses_1_pre_push_spikes<<<1, num_parallel_blocks,0,stream1>>>();

CUDA_CHECK_ERROR("_advance_kernel_synapses_1_pre_push_spikes");

Binary file not shown.
@@ -480,10 +480,10 @@ size_t used_device_memory_start = used_device_memory;
dev_dynamic_array_synapses_1__synaptic_pre = _dynamic_array_synapses_1__synaptic_pre;
dev_dynamic_array_synapses_1__synaptic_post = _dynamic_array_synapses_1__synaptic_post;
CUDA_SAFE_CALL(
cudaMemcpy(dev_array_synapses_1_N,
cudaMemcpyAsync(dev_array_synapses_1_N,
_array_synapses_1_N,
sizeof(int32_t),
cudaMemcpyHostToDevice)
cudaMemcpyHostToDevice,stream1)
);


Binary file not shown.
@@ -443,9 +443,9 @@ if (synapses_2_pre_max_size > 0)
{
if (defaultclock.timestep[0] >= synapses_2_pre_delay)
{
cudaMemcpy(&num_spiking_neurons,
cudaMemcpyAsync(&num_spiking_neurons,
&dev_array_neurongroup_1__spikespace[synapses_2_pre_eventspace_idx][_num__array_neurongroup_1__spikespace - 1],
sizeof(int32_t), cudaMemcpyDeviceToHost);
sizeof(int32_t), cudaMemcpyDeviceToHost,stream2);
num_blocks = num_parallel_blocks * num_spiking_neurons;
//TODO collect info abt mean, std of num spiking neurons per time
//step and print INFO at end of simulation
Binary file not shown.
@@ -265,13 +265,13 @@ void _run_synapses_2_pre_push_spikes()
// get the number of spiking neurons
int32_t num_spiking_neurons;
CUDA_SAFE_CALL(
cudaMemcpy(&num_spiking_neurons,
cudaMemcpyAsync(&num_spiking_neurons,
dev_array_neurongroup_1__spikespace[current_idx_array_neurongroup_1__spikespace] + _num_spikespace - 1,
sizeof(int32_t), cudaMemcpyDeviceToHost)
sizeof(int32_t), cudaMemcpyDeviceToHost, stream2)
);

// advance spike queues
_advance_kernel_synapses_2_pre_push_spikes<<<1, num_parallel_blocks>>>();
_advance_kernel_synapses_2_pre_push_spikes<<<1, num_parallel_blocks,0,stream2>>>();

CUDA_CHECK_ERROR("_advance_kernel_synapses_2_pre_push_spikes");

Binary file not shown.
@@ -460,10 +460,10 @@ size_t used_device_memory_start = used_device_memory;
dev_dynamic_array_synapses_2__synaptic_pre = _dynamic_array_synapses_2__synaptic_pre;
dev_dynamic_array_synapses_2__synaptic_post = _dynamic_array_synapses_2__synaptic_post;
CUDA_SAFE_CALL(
cudaMemcpy(dev_array_synapses_2_N,
cudaMemcpyAsync(dev_array_synapses_2_N,
_array_synapses_2_N,
sizeof(int32_t),
cudaMemcpyHostToDevice)
cudaMemcpyHostToDevice, stream2)
);


Binary file not shown.
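
For reference, a minimal sketch of the cudaMemcpy to cudaMemcpyAsync substitution made in the hunks above; the variable names are illustrative, and stream1 stands in for the per-pathway streams used in the diff. An asynchronous device-to-host copy into pageable host memory is only guaranteed to be visible on the host after the stream has been synchronised, so the sketch reads the value only after cudaStreamSynchronize.

// Illustrative only: async device-to-host copy of a single counter,
// mirroring the _copyToHost_* and push_spikes changes above.
#include <cuda_runtime.h>
#include <cstdint>
#include <cstdio>

int main()
{
    cudaStream_t stream1;
    cudaStreamCreate(&stream1);

    int32_t *dev_num_events;
    cudaMalloc((void**)&dev_num_events, sizeof(int32_t));
    cudaMemsetAsync(dev_num_events, 0, sizeof(int32_t), stream1);

    int32_t host_num_events = -1;
    cudaMemcpyAsync(&host_num_events, dev_num_events, sizeof(int32_t),
                    cudaMemcpyDeviceToHost, stream1);

    // Without this synchronisation the host value may not be ready yet.
    cudaStreamSynchronize(stream1);
    printf("num events: %d\n", host_num_events);

    cudaFree(dev_num_events);
    cudaStreamDestroy(stream1);
    return 0;
}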