[DO NOT MERGE] Parallel execution #259

Open
wants to merge 10 commits into master
Add cuda stream in network
SudeshnaBora committed Apr 18, 2022
commit 45fbc5702938ce927928174c30a67e38f5318655
5 changes: 4 additions & 1 deletion brian2cuda/device.py
@@ -995,11 +995,14 @@ def generate_network_source(self, writer):
         maximum_run_time = self._maximum_run_time
         if maximum_run_time is not None:
             maximum_run_time = float(maximum_run_time)
+        num_stream = max(Counter(self.stream_info).values())
         network_tmp = self.code_object_class().templater.network(None, None,
                                        maximum_run_time=maximum_run_time,
                                        eventspace_arrays=self.eventspace_arrays,
                                        spikegenerator_eventspaces=self.spikegenerator_eventspaces,
-                                       stream_info=self.stream_info)
+                                       parallelize=True,
+                                       stream_info=self.stream_info,
+                                       num_stream=num_stream)
         writer.write('network.*', network_tmp)

def generate_synapses_classes_source(self, writer):
29 changes: 22 additions & 7 deletions brian2cuda/templates/network.cu
@@ -14,10 +14,18 @@

 double Network::_last_run_time = 0.0;
 double Network::_last_run_completed_fraction = 0.0;
+{% if parallelize %}
+cudaStream_t custom_stream[{{num_stream}}];
+{% endif %}

 Network::Network()
 {
     t = 0.0;
+    {% if parallelize %}
+    for(int i=0;i<{{num_stream}};i++){
+        CUDA_SAFE_CALL(cudaStreamCreate(&(custom_stream[i])));
+    }
+    {% endif %}
 }

void Network::clear()
@@ -90,8 +98,8 @@ void Network::run(const double duration, void (*report_func)(const double, const
         int func_group_int = std::get<2>(objects[i]);
         if (func) // code objects can be NULL in cases where we store just the clock
         {
-            //func_groups[func_group_int].push_back(func);
-            func_groups.push_back(std::make_pair(func_group_int,func));
+            func_groups[func_group_int].push_back(func);
+            //func_groups.push_back(std::make_pair(func_group_int,func));
             //func();
             // [[func1,func2,func3],[func4...]]
         }
@@ -101,10 +109,13 @@
     // get maximum in objects.cu array

     // go through each list of func group - 2 loops
-    for(int i=0; i<func_groups.size(); i++) {
-        codeobj_func func = func_groups[i].second;
-        //func(cuda_streams[i]);
-        func();
+    for(int i=0; i<func_groups.size(); i++){
+        for(int j=0; j<func_groups.size(); j++){

Review comment (Member):

The second loop is wrong: it iterates over the number of groups instead of the number of functions in group i.

Suggested change:
-        for(int j=0; j<func_groups.size(); j++){
+        for(int j=0; j<func_groups[i].size(); j++){

+            codeobj_func func = func_groups[i][j];
+            func(custom_stream[j]);
+        }
+        // reset the func group for that sub stream
+        func_groups.resize(0);

Review comment (Member):

After each function group, you need to synchronize host and device. Check the documentation to see whether cudaDeviceSynchronize() will do the job or whether you need to synchronize all streams.
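
For reference, a minimal sketch (not part of this PR) of the two options the reviewer mentions, reusing the custom_stream array and the num_streams member that appear in this diff:

    // Option 1: one coarse barrier. cudaDeviceSynchronize() blocks the host
    // until all previously launched work in every stream has finished, so it
    // is sufficient (if conservative) between function groups.
    CUDA_SAFE_CALL(cudaDeviceSynchronize());

    // Option 2: per-stream barriers. Wait only on the streams this group used.
    for (int j = 0; j < num_streams; j++) {
        CUDA_SAFE_CALL(cudaStreamSynchronize(custom_stream[j]));
    }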

}

for(std::set<Clock*>::iterator i=curclocks.begin(); i!=curclocks.end(); i++)
@@ -209,11 +220,15 @@ public:
     // TODO: vector of tuples having clock, codeobj_func and stream integer
     std::vector< std::tuple< Clock*, codeobj_func, int > > objects;
     //std::vector< std::pair< Clock*, codeobj_func > > objects;
-    std::vector<std::pair< int, codeobj_func >> func_groups;
+    std::vector<std::vector<codeobj_func >> func_groups = std::vector<std::vector<codeobj_func >>({{num_stream}});
+    //std::vector<std::pair< int, codeobj_func >> func_groups;
     double t;
     static double _last_run_time;
     static double _last_run_completed_fraction;
     int num_streams;
+    {% if parallelize %}
+    cudaStream_t custom_stream[{{num_stream}}];
+    {% endif %}

Network();
void clear();
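Taken together, and with the reviewer's fixes applied, the template is aiming at roughly the following pattern. This is a self-contained sketch, not code from the PR; the codeobj_func signature taking a cudaStream_t is an assumption based on the func(custom_stream[j]) call above, and example_codeobj / dummy_kernel are hypothetical stand-ins:

    #include <cuda_runtime.h>
    #include <cstdio>
    #include <vector>

    #define CUDA_SAFE_CALL(call)                                        \
        do {                                                            \
            cudaError_t err = (call);                                   \
            if (err != cudaSuccess)                                     \
                fprintf(stderr, "CUDA error: %s\n",                     \
                        cudaGetErrorString(err));                       \
        } while (0)

    __global__ void dummy_kernel() {}

    // Assumed shape of a code object function: it launches its kernels on
    // the stream it is handed instead of the default stream.
    typedef void (*codeobj_func)(cudaStream_t);

    void example_codeobj(cudaStream_t stream)
    {
        dummy_kernel<<<1, 1, 0, stream>>>();
    }

    int main()
    {
        const int num_streams = 2;
        cudaStream_t custom_stream[num_streams];
        for (int i = 0; i < num_streams; i++)
            CUDA_SAFE_CALL(cudaStreamCreate(&custom_stream[i]));

        // func_groups[i] holds the code objects of group i; within a group,
        // member j is launched on stream j so the group runs concurrently.
        std::vector<std::vector<codeobj_func>> func_groups(num_streams);
        func_groups[0].push_back(example_codeobj);
        func_groups[0].push_back(example_codeobj);
        func_groups[1].push_back(example_codeobj);

        for (size_t i = 0; i < func_groups.size(); i++) {
            for (size_t j = 0; j < func_groups[i].size(); j++)  // [i].size(), per the review
                func_groups[i][j](custom_stream[j]);
            // barrier between groups, as the reviewer requests
            CUDA_SAFE_CALL(cudaDeviceSynchronize());
        }

        for (int i = 0; i < num_streams; i++)
            CUDA_SAFE_CALL(cudaStreamDestroy(custom_stream[i]));
        return 0;
    }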
22 changes: 11 additions & 11 deletions brian2cuda/templates/objects.cu
@@ -42,9 +42,9 @@ const int brian::_num_{{varname}} = {{var.size}};


///////////////// array of streams for parallelization //////////////////////////
-{% if parallelize %}
-cudaStream_t brian::custom_stream[{{stream_size}}];
-{% endif %}
+// {% if parallelize %}
+// cudaStream_t brian::custom_stream[{{stream_size}}];
+// {% endif %}

//////////////// eventspaces ///////////////
// we dynamically create multiple eventspaces in no_or_const_delay_mode
@@ -232,11 +232,11 @@ void _init_arrays()
);
{% endif %}

-    {% if parallelize %}
-    for(int i=0;i<{{stream_size}};i++){
-        CUDA_SAFE_CALL(cudaStreamCreate(&(custom_stream[i])));
-    }
-    {% endif %}
+    // {% if parallelize %}
+    // for(int i=0;i<{{stream_size}};i++){
+    //     CUDA_SAFE_CALL(cudaStreamCreate(&(custom_stream[i])));
+    // }
+    // {% endif %}



@@ -613,9 +613,9 @@ extern thrust::device_vector<{{c_data_type(var.dtype)}}>* {{varname}};
{% endfor %}

//////////////// stream ////////////
-{% if parallelize %}
-extern cudaStream_t custom_stream[{{stream_size}}];
-{% endif %}
+// {% if parallelize %}
+// extern cudaStream_t custom_stream[{{stream_size}}];
+// {% endif %}


/////////////// static arrays /////////////