[DO NOT MERGE] Parallel execution #259

Open
wants to merge 10 commits into master
Add cuda stream in network
SudeshnaBora committed Apr 18, 2022
commit 45fbc5702938ce927928174c30a67e38f5318655
5 changes: 4 additions & 1 deletion brian2cuda/device.py
@@ -995,11 +995,14 @@ def generate_network_source(self, writer):
         maximum_run_time = self._maximum_run_time
         if maximum_run_time is not None:
             maximum_run_time = float(maximum_run_time)
+        num_stream = max(Counter(self.stream_info).values())
         network_tmp = self.code_object_class().templater.network(None, None,
                                        maximum_run_time=maximum_run_time,
                                        eventspace_arrays=self.eventspace_arrays,
                                        spikegenerator_eventspaces=self.spikegenerator_eventspaces,
-                                       stream_info=self.stream_info)
+                                       parallelize=True,
+                                       stream_info=self.stream_info,
+                                       num_stream=num_stream)
         writer.write('network.*', network_tmp)

def generate_synapses_classes_source(self, writer):
29 changes: 22 additions & 7 deletions brian2cuda/templates/network.cu
@@ -14,10 +14,18 @@

 double Network::_last_run_time = 0.0;
 double Network::_last_run_completed_fraction = 0.0;
+{% if parallelize %}
+cudaStream_t custom_stream[{{num_stream}}];
+{% endif %}

 Network::Network()
 {
     t = 0.0;
+    {% if parallelize %}
+    for(int i=0;i<{{num_stream}};i++){
+        CUDA_SAFE_CALL(cudaStreamCreate(&(custom_stream[i])));
+    }
+    {% endif %}
 }

void Network::clear()
@@ -90,8 +98,8 @@ void Network::run(const double duration, void (*report_func)(const double, const
         int func_group_int = std::get<2>(objects[i]);
         if (func) // code objects can be NULL in cases where we store just the clock
         {
-            //func_groups[func_group_int].push_back(func);
-            func_groups.push_back(std::make_pair(func_group_int,func));
+            func_groups[func_group_int].push_back(func);
+            //func_groups.push_back(std::make_pair(func_group_int,func));
             //func();
             // [[func1,func2,func3],[func4...]]
         }
@@ -101,10 +109,13 @@
     // get maximum in objects.cu array

     // go through each list of func group - 2 loops
-    for(int i=0; i<func_groups.size(); i++) {
-        codeobj_func func = func_groups[i].second;
-        //func(cuda_streams[i]);
-        func();
+    for(int i=0; i<func_groups.size(); i++){
+        for(int j=0; j<func_groups.size(); j++){

Review comment (Member):

The second loop is wrong: it iterates over the number of groups instead of the number of functions in group i.

Suggested change:
-        for(int j=0; j<func_groups.size(); j++){
+        for(int j=0; j<func_groups[i].size(); j++){

+            codeobj_func func = func_groups[i][j];
+            func(custom_stream[j]);
+        }
+        // reset the func group for that sub stream
+        func_groups.resize(0);

Review comment (Member):

After each function group, you need to synchronize host and device. Check the documentation to see whether cudaDeviceSynchronize() will do the job or whether you need to synchronize all streams.
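
For reference, a minimal sketch (not part of this PR) of the two options the reviewer mentions, reusing the custom_stream array and the num_streams member that appear in this diff:

    // Option 1: one coarse barrier. cudaDeviceSynchronize() blocks the host
    // until all previously launched work in every stream has finished, so it
    // is sufficient (if conservative) between function groups.
    CUDA_SAFE_CALL(cudaDeviceSynchronize());

    // Option 2: per-stream barriers. Wait only on the streams this group used.
    for (int j = 0; j < num_streams; j++) {
        CUDA_SAFE_CALL(cudaStreamSynchronize(custom_stream[j]));
    }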

}

for(std::set<Clock*>::iterator i=curclocks.begin(); i!=curclocks.end(); i++)
@@ -209,11 +220,15 @@ public:
     // TODO: vector of tuples having clock, codeobj_func and stream integer
     std::vector< std::tuple< Clock*, codeobj_func, int > > objects;
     //std::vector< std::pair< Clock*, codeobj_func > > objects;
-    std::vector<std::pair< int, codeobj_func >> func_groups;
+    std::vector<std::vector<codeobj_func >> func_groups = std::vector<std::vector<codeobj_func >>({{num_stream}});
+    //std::vector<std::pair< int, codeobj_func >> func_groups;
     double t;
     static double _last_run_time;
     static double _last_run_completed_fraction;
     int num_streams;
+    {% if parallelize %}
+    cudaStream_t custom_stream[{{num_stream}}];
+    {% endif %}

Network();
void clear();
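Taken together, and with the reviewer's fixes applied, the template is aiming at roughly the following pattern. This is a self-contained sketch, not code from the PR; the codeobj_func signature taking a cudaStream_t is an assumption based on the func(custom_stream[j]) call above, and example_codeobj / dummy_kernel are hypothetical stand-ins:

    #include <cuda_runtime.h>
    #include <cstdio>
    #include <vector>

    #define CUDA_SAFE_CALL(call)                                        \
        do {                                                            \
            cudaError_t err = (call);                                   \
            if (err != cudaSuccess)                                     \
                fprintf(stderr, "CUDA error: %s\n",                     \
                        cudaGetErrorString(err));                       \
        } while (0)

    __global__ void dummy_kernel() {}

    // Assumed shape of a code object function: it launches its kernels on
    // the stream it is handed instead of the default stream.
    typedef void (*codeobj_func)(cudaStream_t);

    void example_codeobj(cudaStream_t stream)
    {
        dummy_kernel<<<1, 1, 0, stream>>>();
    }

    int main()
    {
        const int num_streams = 2;
        cudaStream_t custom_stream[num_streams];
        for (int i = 0; i < num_streams; i++)
            CUDA_SAFE_CALL(cudaStreamCreate(&custom_stream[i]));

        // func_groups[i] holds the code objects of group i; within a group,
        // member j is launched on stream j so the group runs concurrently.
        std::vector<std::vector<codeobj_func>> func_groups(num_streams);
        func_groups[0].push_back(example_codeobj);
        func_groups[0].push_back(example_codeobj);
        func_groups[1].push_back(example_codeobj);

        for (size_t i = 0; i < func_groups.size(); i++) {
            for (size_t j = 0; j < func_groups[i].size(); j++)  // [i].size(), per the review
                func_groups[i][j](custom_stream[j]);
            // barrier between groups, as the reviewer requests
            CUDA_SAFE_CALL(cudaDeviceSynchronize());
        }

        for (int i = 0; i < num_streams; i++)
            CUDA_SAFE_CALL(cudaStreamDestroy(custom_stream[i]));
        return 0;
    }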
22 changes: 11 additions & 11 deletions brian2cuda/templates/objects.cu
@@ -42,9 +42,9 @@ const int brian::_num_{{varname}} = {{var.size}};


///////////////// array of streams for parallelization //////////////////////////
-{% if parallelize %}
-cudaStream_t brian::custom_stream[{{stream_size}}];
-{% endif %}
+// {% if parallelize %}
+// cudaStream_t brian::custom_stream[{{stream_size}}];
+// {% endif %}

//////////////// eventspaces ///////////////
// we dynamically create multiple eventspaces in no_or_const_delay_mode
@@ -232,11 +232,11 @@ void _init_arrays()
);
{% endif %}

-    {% if parallelize %}
-    for(int i=0;i<{{stream_size}};i++){
-        CUDA_SAFE_CALL(cudaStreamCreate(&(custom_stream[i])));
-    }
-    {% endif %}
+    // {% if parallelize %}
+    // for(int i=0;i<{{stream_size}};i++){
+    //     CUDA_SAFE_CALL(cudaStreamCreate(&(custom_stream[i])));
+    // }
+    // {% endif %}



@@ -613,9 +613,9 @@ extern thrust::device_vector<{{c_data_type(var.dtype)}}>* {{varname}};
{% endfor %}

//////////////// stream ////////////
-{% if parallelize %}
-extern cudaStream_t custom_stream[{{stream_size}}];
-{% endif %}
+// {% if parallelize %}
+// extern cudaStream_t custom_stream[{{stream_size}}];
+// {% endif %}


/////////////// static arrays /////////////