diff --git a/brian2cuda/device.py b/brian2cuda/device.py index 7d783fd0..dc378da9 100644 --- a/brian2cuda/device.py +++ b/brian2cuda/device.py @@ -89,6 +89,8 @@ def __init__(self): # list of pre/post ID arrays that are not needed in device memory self.delete_synaptic_pre = {} self.delete_synaptic_post = {} + # dictionary to store parallalelization information + self.stream_info = {} # The following nested dictionary collects all codeobjects that use random # number generation (RNG). self.codeobjects_with_rng = { @@ -359,6 +361,7 @@ def code_object(self, owner, name, abstract_code, variables, template_name, template_kwds["sm_multiplier"] = prefs["devices.cuda_standalone.SM_multiplier"] template_kwds["syn_launch_bounds"] = prefs["devices.cuda_standalone.syn_launch_bounds"] template_kwds["calc_occupancy"] = prefs["devices.cuda_standalone.calc_occupancy"] + template_kwds["stream_info"] = self.stream_info if template_name in ["threshold", "spikegenerator"]: template_kwds["extra_threshold_kernel"] = prefs["devices.cuda_standalone.extra_threshold_kernel"] codeobj = super(CUDAStandaloneDevice, self).code_object(owner, name, abstract_code, variables, @@ -374,7 +377,7 @@ def check_openmp_compatible(self, nb_threads): if nb_threads > 0: raise NotImplementedError("Using OpenMP in a CUDA standalone project is not supported") - def generate_objects_source(self, writer, arange_arrays, synapses, static_array_specs, networks): + def generate_objects_source(self, writer, arange_arrays, synapses, static_array_specs, networks, stream_info): sm_multiplier = prefs.devices.cuda_standalone.SM_multiplier num_parallel_blocks = prefs.devices.cuda_standalone.parallel_blocks curand_generator_type = prefs.devices.cuda_standalone.random_number_generator_type @@ -393,6 +396,9 @@ def generate_objects_source(self, writer, arange_arrays, synapses, static_array_ for syn in synapses: if syn.multisynaptic_index is not None: multisyn_vars.append(syn.variables[syn.multisynaptic_index]) + # get number of unique streams + + num_stream = max(Counter(stream_info).values()) arr_tmp = self.code_object_class().templater.objects( None, None, array_specs=self.arrays, @@ -415,7 +421,9 @@ def generate_objects_source(self, writer, arange_arrays, synapses, static_array_ eventspace_arrays=self.eventspace_arrays, spikegenerator_eventspaces=self.spikegenerator_eventspaces, multisynaptic_idx_vars=multisyn_vars, - profiled_codeobjects=self.profiled_codeobjects) + profiled_codeobjects=self.profiled_codeobjects, + parallelize=True, + stream_size=num_stream) # Reinsert deleted entries, in case we use self.arrays later? maybe unnecassary... 
self.arrays.update(self.eventspace_arrays) writer.write('objects.*', arr_tmp) @@ -445,7 +453,8 @@ def generate_main_source(self, writer): # For codeobjects run every tick, this happens in the init() of # the random number buffer called at first clock cycle of the network main_lines.append('random_number_buffer.ensure_enough_curand_states();') - main_lines.append(f'_run_{codeobj.name}();') + # add stream - default + main_lines.append(f'_run_{codeobj.name}(0);') elif func == 'after_run_code_object': codeobj, = args main_lines.append(f'_after_run_{codeobj.name}();') @@ -986,10 +995,14 @@ def generate_network_source(self, writer): maximum_run_time = self._maximum_run_time if maximum_run_time is not None: maximum_run_time = float(maximum_run_time) + num_stream = max(Counter(self.stream_info).values()) network_tmp = self.code_object_class().templater.network(None, None, maximum_run_time=maximum_run_time, eventspace_arrays=self.eventspace_arrays, - spikegenerator_eventspaces=self.spikegenerator_eventspaces) + spikegenerator_eventspaces=self.spikegenerator_eventspaces, + parallelize = True, + stream_info = self.stream_info, + num_stream= num_stream) writer.write('network.*', network_tmp) def generate_synapses_classes_source(self, writer): @@ -1310,7 +1323,7 @@ def build(self, directory='output', self.generate_objects_source(self.writer, self.arange_arrays, net_synapses, self.static_array_specs, - self.networks) + self.networks, self.stream_info) self.generate_network_source(self.writer) self.generate_synapses_classes_source(self.writer) self.generate_run_source(self.writer) @@ -1382,6 +1395,25 @@ def network_run(self, net, duration, report=None, report_period=10*second, self.clocks.update(net._clocks) net.t_ = float(t_end) + + # Create dictionary for parallelisation with stream + streams_organization = defaultdict(list) + for obj in net.sorted_objects: + streams_organization[(obj.when, obj.order)].append(obj) + + # associate each code object with a particular stream + streams_details = defaultdict(list) + count = 1 + for key in streams_organization: + for object in streams_organization[key]: + streams_details[object.name] = count + count +=1 + + self.stream_info = streams_details + self.stream_info['default'] = 0 + + + # TODO: remove this horrible hack for clock in self.clocks: if clock.name=='clock': @@ -1516,11 +1548,21 @@ def network_run(self, net, duration, report=None, report_period=10*second, # create all random numbers needed for the next clock cycle for clock in net._clocks: - run_lines.append(f'{net.name}.add(&{clock.name}, _run_random_number_buffer);') + run_lines.append(f'{net.name}.add(&{clock.name}, _run_random_number_buffer, {self.stream_info["default"]});') all_clocks = set() + # TODO add for every code object -> add where in the list are there. + # TODO create new dic (code object, position in list) for clock, codeobj in code_objects: - run_lines.append(f'{net.name}.add(&{clock.name}, _run_{codeobj.name});') + # add this position as additional number here + # check if codeobj.name has _codeobject in it + name = codeobj.name + if "_codeobject" in codeobj.name: + name = codeobj.name[:-11] + if name in self.stream_info.keys(): + run_lines.append(f'{net.name}.add(&{clock.name}, _run_{codeobj.name}, {self.stream_info[name]});') + else: + run_lines.append(f'{net.name}.add(&{clock.name}, _run_{codeobj.name}, {self.stream_info["default"]});') all_clocks.add(clock) # Under some rare circumstances (e.g. 
a NeuronGroup only defining a diff --git a/brian2cuda/templates/common_group.cu b/brian2cuda/templates/common_group.cu index cc959fa0..c3896e32 100644 --- a/brian2cuda/templates/common_group.cu +++ b/brian2cuda/templates/common_group.cu @@ -155,7 +155,7 @@ _run_kernel_{{codeobj_name}}( {% endblock kernel %} -void _run_{{codeobj_name}}() +void _run_{{codeobj_name}}(cudaStream_t stream) { using namespace brian; @@ -292,7 +292,7 @@ void _run_{{codeobj_name}}() {% endblock %} {% block kernel_call %} - _run_kernel_{{codeobj_name}}<<>>( + _run_kernel_{{codeobj_name}}<<>>( _N, num_threads, ///// HOST_PARAMETERS ///// @@ -326,7 +326,7 @@ void _run_{{codeobj_name}}() #ifndef _INCLUDED_{{codeobj_name}} #define _INCLUDED_{{codeobj_name}} -void _run_{{codeobj_name}}(); +void _run_{{codeobj_name}}(cudaStream_t); {% block extra_functions_h %} {% endblock %} @@ -362,7 +362,7 @@ void _after_run_{{codeobj_name}}() } {% endmacro %} - +// {{codeobj_name}} {% macro after_run_h_file() %} #ifndef _INCLUDED_{{codeobj_name}}_after #define _INCLUDED_{{codeobj_name}}_affer diff --git a/brian2cuda/templates/makefile b/brian2cuda/templates/makefile index 6bcf06a1..02c6acef 100644 --- a/brian2cuda/templates/makefile +++ b/brian2cuda/templates/makefile @@ -7,7 +7,7 @@ OBJS := ${OBJS:.cpp=.o} OBJS := ${OBJS:.c=.o} NVCC = @{{ nvcc_path }} -ccbin $(CXX) NVCCFLAGS = -I. -std=c++11 {{gpu_arch_flags}} {{nvcc_compiler_flags}} {{compiler_debug_flags}} -Xcompiler "{{cpp_compiler_flags}}" -LFLAGS = -lcurand -I. {{gpu_arch_flags}} {{cpp_linker_flags}} {{linker_debug_flags}} +LFLAGS = -lcurand -lcudart -I. {{gpu_arch_flags}} {{cpp_linker_flags}} {{linker_debug_flags}} all: $(PROGRAM) diff --git a/brian2cuda/templates/network.cu b/brian2cuda/templates/network.cu index 51723178..e0eb5419 100644 --- a/brian2cuda/templates/network.cu +++ b/brian2cuda/templates/network.cu @@ -14,10 +14,18 @@ double Network::_last_run_time = 0.0; double Network::_last_run_completed_fraction = 0.0; +{% if parallelize %} +cudaStream_t custom_stream[{{num_stream}}]; +{% endif %} Network::Network() { t = 0.0; + {% if parallelize %} + for(int i=0;i<{{num_stream}};i++){ + CUDA_SAFE_CALL(cudaStreamCreate(&(custom_stream[i]))); + } + {% endif %} } void Network::clear() @@ -25,12 +33,14 @@ void Network::clear() objects.clear(); } -void Network::add(Clock *clock, codeobj_func func) +// TODO have to makr change in objects - make it a tuple +// make decision which bject has which stream +void Network::add(Clock *clock, codeobj_func func, int group_num) { #if defined(_MSC_VER) && (_MSC_VER>=1700) - objects.push_back(std::make_pair(std::move(clock), std::move(func))); + objects.push_back(std::make_tuple(std::move(clock), std::move(func), std::move(group_num))); #else - objects.push_back(std::make_pair(clock, func)); + objects.push_back(std::make_tuple(clock, func, group_num)); #endif } @@ -56,7 +66,7 @@ void Network::run(const double duration, void (*report_func)(const double, const Clock* clock = next_clocks(); double elapsed_realtime; bool did_break_early = false; - + //TODO here while(clock && clock->running()) { t = clock->t[0]; @@ -73,17 +83,42 @@ void Network::run(const double duration, void (*report_func)(const double, const next_report_time += report_period; } } - Clock *obj_clock = objects[i].first; + // TODO tuple of clock and function + //Clock *obj_clock = objects[i].first; + Clock *obj_clock = std::get<0>(objects[i]); + int group_int = std::get<2>(objects[i]); // Only execute the object if it uses the right clock for this step if 
(curclocks.find(obj_clock) != curclocks.end()) { - codeobj_func func = objects[i].second; + // function -> whixh is in templates like common_group.cu + // sort the code object - waiting mechanism between groups + // cudaEvent or cudaSynchronise + //codeobj_func func = objects[i].second; + codeobj_func func = std::get<1>(objects[i]); + int func_group_int = std::get<2>(objects[i]); if (func) // code objects can be NULL in cases where we store just the clock { - func(); + func_groups[func_group_int].push_back(func); + //func_groups.push_back(std::make_pair(func_group_int,func)); + //func(); + // [[func1,func2,func3],[func4...]] } } } + + // get maximum in objects.cu array + + // go through each list of func group - 2 loops + for(int i=0; i::iterator i=curclocks.begin(); i!=curclocks.end(); i++) (*i)->tick(); clock = next_clocks(); @@ -129,7 +164,8 @@ void Network::compute_clocks() clocks.clear(); for(int i=0; i(objects[i]); + // Clock *clock = std::get<0>()objects[i].first; clocks.insert(clock); } } @@ -174,7 +210,7 @@ Clock* Network::next_clocks() #include #include "brianlib/clocks.h" -typedef void (*codeobj_func)(); +typedef void (*codeobj_func)(cudaStream_t); class Network { @@ -182,14 +218,22 @@ class Network void compute_clocks(); Clock* next_clocks(); public: - std::vector< std::pair< Clock*, codeobj_func > > objects; +// TODO vectory of tuples having clock , codeobj_func and stread integer + std::vector< std::tuple< Clock*, codeobj_func, int > > objects; + //std::vector< std::pair< Clock*, codeobj_func > > objects; + std::vector> func_groups = std::vector>({{num_stream}}); + //std::vector> func_groups; double t; static double _last_run_time; static double _last_run_completed_fraction; + int num_streams; + {% if parallelize %} + cudaStream_t custom_stream[{{num_stream}}]; + {% endif %} Network(); void clear(); - void add(Clock *clock, codeobj_func func); + void add(Clock *clock, codeobj_func func, int num_streams); void run(const double duration, void (*report_func)(const double, const double, const double, const double), const double report_period); }; diff --git a/brian2cuda/templates/objects.cu b/brian2cuda/templates/objects.cu index 0a7891c8..812f9408 100644 --- a/brian2cuda/templates/objects.cu +++ b/brian2cuda/templates/objects.cu @@ -40,6 +40,12 @@ const int brian::_num_{{varname}} = {{var.size}}; {% endif %} {% endfor %} + +///////////////// array of streams for parallelization ////////////////////////// +// {% if parallelize %} +// cudaStream_t brian::custom_stream[{{stream_size}}]; +// {% endif %} + //////////////// eventspaces /////////////// // we dynamically create multiple eventspaces in no_or_const_delay_mode // for initiating the first spikespace, we need a host pointer @@ -226,6 +232,14 @@ void _init_arrays() ); {% endif %} +// {% if parallelize %} +// for(int i=0;i<{{stream_size}};i++){ +// CUDA_SAFE_CALL(cudaStreamCreate(&(custom_stream[i]))); +// } +// {% endif %} + + + // this sets seed for host and device api RNG random_number_buffer.set_seed(seed); @@ -546,6 +560,7 @@ typedef {{curand_float_type}} randomNumber_t; // random number type #include "network.h" #include "rand.h" +#include #include #include #include @@ -597,6 +612,12 @@ extern thrust::device_vector<{{c_data_type(var.dtype)}}*> addresses_monitor_{{va extern thrust::device_vector<{{c_data_type(var.dtype)}}>* {{varname}}; {% endfor %} +//////////////// stream //////////// +// {% if parallelize %} +// extern cudaStream_t custom_stream[{{stream_size}}]; +// {% endif %} + + /////////////// static arrays 
///////////// {% for (name, dtype_spec, N, filename) in static_array_specs | sort %} {# arrays that are initialized from static data are already declared #} diff --git a/brian2cuda/templates/rand.cu b/brian2cuda/templates/rand.cu index 08208d5a..06e7eae2 100644 --- a/brian2cuda/templates/rand.cu +++ b/brian2cuda/templates/rand.cu @@ -44,8 +44,9 @@ namespace { // need a function pointer for Network::add(), can't pass a pointer to a class -// method, which is of different type -void _run_random_number_buffer() +// method, which is of different type. Random number buffer runs in default +// stream always, the `stream` parameter is not used. +void _run_random_number_buffer(cudaStream_t stream) { // random_number_buffer is a RandomNumberBuffer instance, declared in objects.cu random_number_buffer.next_time_step(); @@ -472,7 +473,7 @@ void RandomNumberBuffer::next_time_step() #include -void _run_random_number_buffer(); +void _run_random_number_buffer(cudaStream_t); class RandomNumberBuffer { @@ -562,4 +563,4 @@ public: #endif -{% endmacro %} +{% endmacro %} \ No newline at end of file diff --git a/brian2cuda/templates/synapses.cu b/brian2cuda/templates/synapses.cu index daf73bca..1b8f61b8 100644 --- a/brian2cuda/templates/synapses.cu +++ b/brian2cuda/templates/synapses.cu @@ -254,9 +254,10 @@ if ({{pathway.name}}_max_size > 0) { if (defaultclock.timestep[0] >= {{pathway.name}}_delay) { - cudaMemcpy(&num_spiking_neurons, + CUDA_SAFE_CALL(cudaMemcpyAsync(&num_spiking_neurons, &dev{{_eventspace}}[{{pathway.name}}_eventspace_idx][_num_{{_eventspace}} - 1], - sizeof(int32_t), cudaMemcpyDeviceToHost); + sizeof(int32_t), cudaMemcpyDeviceToHost, stream)); + CUDA_SAFE_CALL(cudaStreamSynchronize(stream)); num_blocks = num_parallel_blocks * num_spiking_neurons; //TODO collect info abt mean, std of num spiking neurons per time //step and print INFO at end of simulation diff --git a/brian2cuda/templates/synapses_push_spikes.cu b/brian2cuda/templates/synapses_push_spikes.cu index e6b4dd13..45856d0f 100644 --- a/brian2cuda/templates/synapses_push_spikes.cu +++ b/brian2cuda/templates/synapses_push_spikes.cu @@ -1014,7 +1014,7 @@ void _run_{{codeobj_name}}() ); // advance spike queues - _advance_kernel_{{codeobj_name}}<<<1, num_parallel_blocks>>>(); + _advance_kernel_{{codeobj_name}}<<<1, num_parallel_blocks, 0, stream>>>(); CUDA_CHECK_ERROR("_advance_kernel_{{codeobj_name}}"); diff --git a/dev/issues/issue179_speedup/speed_up/code/MushroomBody/MBody_cuda.py b/dev/issues/issue179_speedup/speed_up/code/MushroomBody/MBody_cuda.py new file mode 100755 index 00000000..2a60e584 --- /dev/null +++ b/dev/issues/issue179_speedup/speed_up/code/MushroomBody/MBody_cuda.py @@ -0,0 +1,204 @@ +import random as py_random + +from brian2 import * +import brian2genn +import sys +from utils import get_directory + +MB_scaling = float(sys.argv[1]) + +extra_args = {} +device = sys.argv[2] +threads = int(sys.argv[3]) +use_spikemon = sys.argv[4] == 'true' +do_run = sys.argv[5] == 'true' +if threads == -1: + extra_args = {'use_GPU': False} +else: + prefs.devices.cpp_standalone.openmp_threads = threads + +prefs.devices.genn.path = "/cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn" + +codefolder = get_directory(device, delete_dir=False) + +set_device(device,directory=codefolder,**extra_args) + +print('Running with arguments: ', sys.argv) + +# defaultclock.dt = 0.025*ms +# Constants +g_Na = 7.15*uS +E_Na = 50*mV +g_K = 1.43*uS +E_K = -95*mV +g_leak = 0.0267*uS +E_leak = -63.56*mV +C = 0.3*nF +# Those two 
constants are dummy constants, only used when populations only have +# either inhibitory or excitatory inputs +E_e = 0*mV +E_i = -92*mV +# Actual constants used for synapses +tau_PN_LHI = 1*ms +tau_LHI_iKC = 3*ms +tau_PN_iKC = 2*ms +tau_iKC_eKC = 10*ms +tau_eKC_eKC = 5*ms +w_LHI_iKC = 8.75*nS +w_eKC_eKC = 75*nS +tau_pre = tau_post = 10*ms +dApre = 0.1*nS/MB_scaling +dApost = -dApre +# tau_STDP = 10*ms +# g_0 = 0.125*nS +g_max = 3.75*nS/MB_scaling +# g_mid = g_max/2 +# g_slope = g_mid +# tau_decay = 10e5*ms +# A = g_max/4 +# offset = 0.01*A + +scale = .675 + +# Number of neurons +N_AL = 100 +N_MB = int(2500*MB_scaling) +N_LB = 100 + +traub_miles = ''' +dV/dt = -(1/C)*(g_Na*m**3*h*(V - E_Na) + + g_K*n**4*(V - E_K) + + g_leak*(V - E_leak) + + I_syn) : volt +dm/dt = alpha_m*(1 - m) - beta_m*m : 1 +dn/dt = alpha_n*(1 - n) - beta_n*n : 1 +dh/dt = alpha_h*(1 - h) - beta_h*h : 1 + +alpha_m = 0.32*(-52 - V/mV)/(exp((-52 - V/mV)/4) - 1)/ms: Hz +beta_m = 0.28*(25 + V/mV)/(exp((25 + V/mV)/5) - 1)/ms: Hz +alpha_h = 0.128*exp((-48 - V/mV)/18)/ms: Hz +beta_h = 4/(exp((-25 - V/mV)/5) + 1)/ms : Hz +alpha_n = 0.032*(-50 - V/mV)/(exp((-50 - V/mV)/5) - 1)/ms: Hz +beta_n = 0.5*exp((-55 - V/mV)/40)/ms : Hz +''' + +# Principal neurons (Antennal Lobe) +n_patterns = 10 +n_repeats = 10 +p_perturb = 0.1 +patterns = np.repeat(np.array([np.random.choice(N_AL, int(0.2*N_AL), replace=False) for _ in range(n_patterns)]), n_repeats, axis=0) +# Make variants of the patterns +to_replace = np.random.binomial(int(0.2*N_AL), p=p_perturb, size=n_patterns*n_repeats) +variants = [] +for idx, variant in enumerate(patterns): + np.random.shuffle(variant) + if to_replace[idx] > 0: + variant = variant[:-to_replace[idx]] + new_indices = np.random.randint(N_AL, size=to_replace[idx]) + variant = np.unique(np.concatenate([variant, new_indices])) + variants.append(variant) + +training_size = (n_repeats-10) +training_variants = [] +for p in range(n_patterns): + training_variants.extend(variants[n_repeats * p:n_repeats * p + training_size]) +py_random.shuffle(training_variants) +sorted_variants = list(training_variants) +for p in range(n_patterns): + sorted_variants.extend(variants[n_repeats * p + training_size:n_repeats * (p + 1)]) + +# all_patterns = np.zeros((n_patterns*n_repeats, N_AL)) +# for idx, p in enumerate(sorted_variants): +# all_patterns[idx, p] = 1 +# plt.imshow(all_patterns[-10*n_patterns:, :], interpolation='none') +# plt.show() + +spike_times = np.arange(n_patterns*n_repeats)*50*ms + 1*ms + rand(n_patterns*n_repeats)*2*ms +spike_times = spike_times.repeat([len(p) for p in sorted_variants]) +spike_indices = np.concatenate(sorted_variants) + +PN = SpikeGeneratorGroup(N_AL, spike_indices, spike_times) + +# iKC of the mushroom body +I_syn = '''I_syn = g_PN_iKC*(V - E_e): amp + dg_PN_iKC/dt = -g_PN_iKC/tau_PN_iKC : siemens''' +eqs_iKC = Equations(traub_miles) + Equations(I_syn) +iKC = NeuronGroup(N_MB, eqs_iKC, threshold='V>0*mV', refractory='V>0*mV', + method='exponential_euler') +iKC.V = E_leak +iKC.h = 1 +iKC.m = 0 +iKC.n = .5 + +# eKCs of the mushroom body lobe +I_syn = '''I_syn = g_iKC_eKC*(V - E_e) + g_eKC_eKC*(V - E_i): amp + dg_iKC_eKC/dt = -g_iKC_eKC/tau_iKC_eKC : siemens + dg_eKC_eKC/dt = -g_eKC_eKC/tau_eKC_eKC : siemens''' +eqs_eKC = Equations(traub_miles) + Equations(I_syn) +eKC = NeuronGroup(N_LB, eqs_eKC, threshold='V>0*mV', refractory='V>0*mV', + method='exponential_euler') +eKC.V = E_leak +eKC.h = 1 +eKC.m = 0 +eKC.n = .5 + +# Synapses +PN_iKC = Synapses(PN, iKC, 'weight : siemens', on_pre='g_PN_iKC += 
scale*weight') +PN_iKC.connect(p=0.15) +PN_iKC.weight = '4.545*nS + 1.25*nS*randn()' + +# iKC_eKC = Synapses(iKC, eKC, +# ''' +# dg_raw/dt = (g_0 - g_raw)/tau_decay : siemens (event-driven) +# g_syn = g_max*(tanh((g_raw - g_mid)/g_slope) + 1)/2 : siemens +# dapre/dt = -apre/tau_stdp : siemens (event-driven) +# dapost/dt = -apost/tau_stdp : siemens (event-driven) +# ''', +# on_pre=''' +# apre += A +# g_iKC_eKC += g_max*(tanh((g_raw - g_mid)/g_slope) + 1)/2 +# ''', +# on_post=''' +# g_raw += apre - offset +# ''') +iKC_eKC = Synapses(iKC, eKC, + '''g_raw : siemens + dApre/dt = -Apre / tau_pre : siemens (event-driven) + dApost/dt = -Apost / tau_post : siemens (event-driven) + ''', + on_pre='''g_iKC_eKC += g_raw + Apre += dApre + g_raw = clip(g_raw + Apost, 0*siemens, g_max) + ''', + on_post=''' + Apost += dApost + g_raw = clip(g_raw + Apre, 0*siemens, g_max)''', + ) +iKC_eKC.connect() +# First set all synapses as "inactive", then set 20% to active +iKC_eKC.g_raw = 'rand()*g_max/10' +iKC_eKC.g_raw['rand() < 0.2'] = '1.25*nS + 0.25*nS*randn()' + +eKC_eKC = Synapses(eKC, eKC, on_pre='g_eKC_eKC += scale*w_eKC_eKC') +eKC_eKC.connect() + +if use_spikemon: + PN_spikes = SpikeMonitor(PN) + iKC_spikes = SpikeMonitor(iKC) + eKC_spikes = SpikeMonitor(eKC) + +import time +if do_run: + runtime = (n_patterns*n_repeats+1)*50*ms +else: + runtime = 0*second +start = time.time() +run(runtime, report='text') +took = (time.time()-start) +print('took %.1fs' % took) +neurons = N_AL + N_MB + N_LB +synapses = len(PN_iKC) + len(iKC_eKC) + len(eKC_eKC) + +with open('benchmarks.txt', 'a') as f: + data = [neurons, synapses, device, threads, use_spikemon, do_run, took] + f.write('\t'.join('%s' % d for d in data) + '\n') diff --git a/parallel_execution/parallel_execution/MushroomBody.py b/parallel_execution/parallel_execution/MushroomBody.py new file mode 100644 index 00000000..b5960cdb --- /dev/null +++ b/parallel_execution/parallel_execution/MushroomBody.py @@ -0,0 +1,220 @@ +import random as py_random +from brian2 import * +import brian2cuda +import os +import matplotlib.pyplot as plt +from utils import get_directory +import sys +import brian2genn +plt.switch_backend('agg') + + +#seed +np.random.seed(123) +py_random.seed(123) + +device_name = "cuda_standalone" +print("Running in device:") +print(device_name) +prefs.devices.cuda_standalone.cuda_backend.detect_gpus = False +prefs.devices.cuda_standalone.cuda_backend.compute_capability = 7.5 +prefs.devices.cuda_standalone.cuda_backend.gpu_id = 0 + +codefolder = get_directory(device_name, delete_dir=False) + +# preference for memory saving +set_device(device = device_name, directory=codefolder, debug=True) + + +category = "Full examples" +name = "MushroomBody" + +# configuration options +duration = 10*second + + +# Number of neurons +N_AL = 100 +N_MB = 2500 +N_LB = 100 +# Constants +g_Na = 7.15*uS +E_Na = 50*mV +g_K = 1.43*uS +E_K = -95*mV +g_leak = 0.0267*uS +E_leak = -63.56*mV +C = 0.3*nF +VT = -63*mV +# Those two constants are dummy constants, only used when populations only have +# either inhibitory or excitatory inputs +E_e = 0*mV +E_i = -92*mV +# Actual constants used for synapses +NKCKC= N_MB +if NKCKC > 10000: + NKCKC = 10000 +g_scaling = NKCKC/2500 +if g_scaling < 1: + g_scaling= 1 +tau_PN_LHI = 1*ms +tau_LHI_iKC = 3*ms +tau_PN_iKC = 2*ms +tau_iKC_eKC = 10*ms +tau_eKC_eKC = 5*ms +w_LHI_iKC = 8.75*nS +w_eKC_eKC = 75*nS +tau_pre = tau_post = 10*ms +dApre = 0.1*nS/g_scaling +dApost = -dApre +g_max = 3.75*nS/g_scaling + +scale = .675 + +traub_miles = ''' +dV/dt = 
-(1./C)*(g_Na*m**3.*h*(V - E_Na) + + g_K*n**4.*(V - E_K) + + g_leak*(V - E_leak) + + I_syn) : volt +dm/dt = alpha_m*(1. - m) - beta_m*m : 1 +dn/dt = alpha_n*(1. - n) - beta_n*n : 1 +dh/dt = alpha_h*(1. - h) - beta_h*h : 1 +alpha_m = 0.32*(mV**-1.)*(13.*mV-V+VT)/ + (exp((13.*mV-V+VT)/(4.*mV))-1.)/ms : Hz +beta_m = 0.28*(mV**-1.)*(V-VT-40.*mV)/ + (exp((V-VT-40.*mV)/(5.*mV))-1.)/ms : Hz +alpha_h = 0.128*exp((17.*mV-V+VT)/(18.*mV))/ms : Hz +beta_h = 4./(1+exp((40.*mV-V+VT)/(5.*mV)))/ms : Hz +alpha_n = 0.032*(mV**-1)*(15.*mV-V+VT)/ + (exp((15.*mV-V+VT)/(5.*mV))-1.)/ms : Hz +beta_n = .5*exp((10.*mV-V+VT)/(40.*mV))/ms : Hz +''' + +# Principal neurons (Antennal Lobe) +n_patterns = 10 +n_repeats = int(duration/second*10.) +p_perturb = 0.1 + +patterns = np.repeat(np.array([np.random.choice(N_AL, int(0.2*N_AL), replace=False) for _ in range(n_patterns)]), n_repeats, axis=0) + +# Make variants of the patterns +to_replace = np.random.binomial(int(0.2*N_AL), p=p_perturb, size=n_patterns*n_repeats) + +variants = [] +for idx, variant in enumerate(patterns): + np.random.shuffle(variant) + if to_replace[idx] > 0: + variant = variant[:-to_replace[idx]] + new_indices = np.random.randint(N_AL, size=to_replace[idx]) + variant = np.unique(np.concatenate([variant, new_indices])) + variants.append(variant) + +training_size = (n_repeats-10) +training_variants = [] +for p in range(n_patterns): + training_variants.extend(variants[n_repeats * p:n_repeats * p + training_size]) +py_random.shuffle(training_variants) +sorted_variants = list(training_variants) +for p in range(n_patterns): + sorted_variants.extend(variants[n_repeats * p + training_size:n_repeats * (p + 1)]) + +spike_time_randomness = rand(n_patterns*n_repeats)*2*ms + +spike_times = np.arange(n_patterns*n_repeats)*50*ms + 1*ms + spike_time_randomness +spike_times = spike_times.repeat([len(p) for p in sorted_variants]) +spike_indices = np.concatenate(sorted_variants) + +PN = SpikeGeneratorGroup(N_AL, spike_indices, spike_times) + +# iKC of the mushroom body +I_syn = '''I_syn = g_PN_iKC*(V - E_e): amp + dg_PN_iKC/dt = -g_PN_iKC/tau_PN_iKC : siemens''' +eqs_iKC = Equations(traub_miles) + Equations(I_syn) +iKC = NeuronGroup(N_MB, eqs_iKC, threshold='V>0*mV', refractory='V>0*mV', + method='exponential_euler') + +# eKCs of the mushroom body lobe +I_syn = '''I_syn = g_iKC_eKC*(V - E_e) + g_eKC_eKC*(V - E_i): amp + dg_iKC_eKC/dt = -g_iKC_eKC/tau_iKC_eKC : siemens + dg_eKC_eKC/dt = -g_eKC_eKC/tau_eKC_eKC : siemens''' +eqs_eKC = Equations(traub_miles) + Equations(I_syn) +eKC = NeuronGroup(N_LB, eqs_eKC, threshold='V>0.*mV', refractory='V>0.*mV', + method='exponential_euler') + +# Synapses +PN_iKC = Synapses(PN, iKC, 'weight : siemens', on_pre='g_PN_iKC += scale*weight') +iKC_eKC = Synapses(iKC, eKC, + '''g_raw : siemens + dApre/dt = -Apre / tau_pre : siemens (event-driven) + dApost/dt = -Apost / tau_post : siemens (event-driven) + ''', + on_pre='''g_iKC_eKC += g_raw + Apre += dApre + g_raw = clip(g_raw + Apost, 0*siemens, g_max) + ''', + on_post=''' + Apost += dApost + g_raw = clip(g_raw + Apre, 0*siemens, g_max)''', + delay=0*ms) +eKC_eKC = Synapses(eKC, eKC, on_pre='g_eKC_eKC += scale*w_eKC_eKC', delay=0*ms) +# bu.insert_benchmark_point() +pn_ikc_max_synapses = N_AL*N_MB +#p_e_array = TimedArray(np.random.rand(1,pn_ikc_max_synapses), dt=duration) +PN_iKC.connect(p=0.15) +#PN_iKC.connect('p_e_array(0*ms, i)<0.15') + +if (N_MB > 10000): + iKC_eKC.connect(p=float(10000)/N_MB) +else: + iKC_eKC.connect() +eKC_eKC.connect() +# bu.insert_benchmark_point() + +# First set 
all synapses as "inactive", then set 20% to active +#pn_ikc_array = TimedArray(np.random.randn(1, pn_ikc_max_synapses), dt= duration) +#PN_iKC.weight = '10*nS + 1.25*nS*pn_ikc_array(0.*ms, i + j*N_pre)' +PN_iKC.weight = '10*nS + 1.25*nS*randn()' + +#ikc_ekc_max_synapses = N_MB*N_LB +#ikc_ekc_array1 = TimedArray(np.random.rand(1, ikc_ekc_max_synapses), dt= duration) +#iKC_eKC.g_raw = 'ikc_ekc_array1(0.*ms, i +j*N_pre)*g_max/10./g_scaling' +iKC_eKC.g_raw = 'rand()*g_max/10/g_scaling' +#ikc_ekc_array2 = TimedArray(np.random.rand(1, ikc_ekc_max_synapses), dt= duration) +#ikc_ekc_array3 = TimedArray(np.random.randn(1, ikc_ekc_max_synapses), dt= duration) +#iKC_eKC.g_raw['ikc_ekc_array2(0.*ms, i+j*N_pre) < 0.2'] = '(2.5*nS + 0.5*nS*ikc_ekc_array3(0.*ms, i+j*N_pre))/g_scaling' +iKC_eKC.g_raw['rand() < 0.2'] = '(2.5*nS + 0.5*nS*randn())/g_scaling' +iKC.V = E_leak +iKC.h = 1 +iKC.m = 0 +iKC.n = .5 +eKC.V = E_leak +eKC.h = 1 +eKC.m = 0 +eKC.n = .5 + +#if use_spikemon: +PN_spikes = SpikeMonitor(PN) +iKC_spikes = SpikeMonitor(iKC) +eKC_spikes = SpikeMonitor(eKC) +run(duration) + +if not os.path.exists(codefolder): + os.mkdir(codefolder) # for plots and profiling txt file + +plot_array = [PN_spikes, iKC_spikes, eKC_spikes] +plot_array_name = ['PN_spikes', 'iKC_spikes', 'eKC_spikes'] + +for p, M in enumerate(plot_array): + subplot(3, 1, p+1) + plot(M.t/ms, M.i, ',k') + ylabel(plot_array_name[p]) + print('SpikeMon %s, average rate %.1f sp/s' % + (plot_array_name[p], M.num_spikes/(duration/second*len(M.source)))) + #show() + +plotfolder = get_directory(device_name, basedir='plots') +os.makedirs(plotfolder, exist_ok=True) +plotpath = os.path.join(plotfolder, '{}_{}.pdf'.format(name,device_name)) +savefig(plotpath) +print('plot saved in {}'.format(plotpath)) +print('the generated model in {} needs to removed manually if wanted'.format(codefolder)) diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/.objects.cu.swp b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/.objects.cu.swp new file mode 100644 index 00000000..c1385002 Binary files /dev/null and b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/.objects.cu.swp differ diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/brianlib/clocks.h b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/brianlib/clocks.h new file mode 100644 index 00000000..0a5a46be --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/brianlib/clocks.h @@ -0,0 +1,51 @@ +#ifndef _BRIAN_CLOCKS_H +#define _BRIAN_CLOCKS_H +#include +#include +#include +#include + +namespace { + inline int fround(double x) + { + return (int)(x+0.5); + }; +}; + +class Clock +{ +public: + double epsilon; + double *dt; + int64_t *timestep; + double *t; + int64_t i_end; + Clock(double _epsilon=1e-14) : epsilon(_epsilon) { i_end = 0;}; + inline void tick() + { + timestep[0] += 1; + t[0] = timestep[0] * dt[0]; + } + inline bool running() { return timestep[0] +#include + +#define inf (std::numeric_limits::infinity()) +#ifdef _MSC_VER +#define INFINITY (std::numeric_limits::infinity()) +#define NAN (std::numeric_limits::quiet_NaN()) +#define M_PI 3.14159265358979323846 +#endif + +#endif diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/brianlib/cudaVector.h b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/brianlib/cudaVector.h new file mode 100644 index 00000000..1bf30fd8 --- 
/dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/brianlib/cudaVector.h @@ -0,0 +1,160 @@ +#ifndef _CUDA_VECTOR_H_ +#define _CUDA_VECTOR_H_ + +#include +#include + +/* + * current memory allocation strategy: + * only grow larger (new_size = old_size*2 + 1) ~= 2^n + */ + +#define INITIAL_SIZE 1 + +typedef int size_type; + +template +class cudaVector +{ +private: + // TODO: consider using data of type char*, since it does not have a cunstructor + scalar* volatile m_data; //pointer to allocated memory + volatile size_type m_capacity; //how much memory is allocated, should ALWAYS >= size + volatile size_type m_size; //how many elements are stored in this vector + +public: + __device__ cudaVector() + { + m_size = 0; + if(INITIAL_SIZE > 0) + { + m_data = (scalar*)malloc(sizeof(scalar) * INITIAL_SIZE); + if(m_data != NULL) + { + m_capacity = INITIAL_SIZE; + } + else + { + printf("ERROR while creating cudaVector with size %ld in cudaVector.h (constructor)\n", sizeof(scalar)*INITIAL_SIZE); + assert(m_data != NULL); + } + } + }; + + __device__ ~cudaVector() + { + free(m_data); + }; + + __device__ scalar* getDataPointer() + { + return m_data; + }; + + __device__ scalar& at(size_type index) + { + if (index < 0 || index >= m_size) + { + // TODO: check for proper exception throwing in cuda kernels + printf("ERROR returning a reference to index %d in cudaVector::at() (size = %u)\n", index, m_size); + assert(index < m_size); + } + return m_data[index]; + }; + + __device__ void push(scalar elem) + { + assert(m_size <= m_capacity); + if(m_capacity == m_size) + { + // increase capacity + reserve(m_capacity*2 + 1); + } + if(m_size < m_capacity) + { + m_data[m_size] = elem; + m_size++; + } + }; + + __device__ void update(size_type pos, scalar elem) + { + if(pos <= m_size) + { + m_data[pos] = elem; + } + else + { + printf("ERROR invalid index %d, must be in range 0 - %d\n", pos, m_size); + assert(pos <= m_size); + } + }; + + __device__ void resize(size_type new_size) + { + if (new_size > m_capacity) + reserve(new_size * 2); + m_size = new_size; + } + + __device__ size_type increaseSizeBy(size_type add_size) + { + size_type old_size = m_size; + size_type new_size = old_size + add_size; + if (new_size > m_capacity) + reserve(new_size * 2); + m_size = new_size; + return old_size; + } + + __device__ void reserve(size_type new_capacity) + { + if(new_capacity > m_capacity) + { + //realloc larger memory (deviceside realloc doesn't exist, so we write our own) + scalar* new_data = (scalar*)malloc(sizeof(scalar) * new_capacity); + // TODO: use C++ version, is there a way to copy data in parallel here? + // since only num_unique_delays threads resize, the other threads could help copy? + //scalar* new_data = new scalar[new_capacity]; + //if (new_data) + //{ + // for (size_type i = 0; i < m_size; i++) + // new_data[i] = m_data[i]; + // + // delete [] m_data; + // m_data = new_data; + // m_capacity = new_capacity; + //} + if (new_data != NULL) + { + memcpy(new_data, m_data, sizeof(scalar) * size()); + free(m_data); + m_data = new_data; + m_capacity = new_capacity; + } + else + { + printf("ERROR while allocating %ld bytes in cudaVector.h/reserve()\n", sizeof(scalar)*new_capacity); + assert(new_data != NULL); + } + } + else + { + //kleiner reallocen? 
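+            // ("kleiner reallocen?" is German for "reallocate to something smaller?";
+            //  as written, shrinking only lowers m_capacity, the existing allocation is kept)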
+ m_capacity = new_capacity; + }; + }; + + //does not overwrite old data, just resets number of elements stored to 0 + __device__ void reset() + { + m_size = 0; + }; + + __device__ size_type size() + { + return m_size; + }; +}; + +#endif diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/brianlib/cuda_utils.h b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/brianlib/cuda_utils.h new file mode 100644 index 00000000..b8154d23 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/brianlib/cuda_utils.h @@ -0,0 +1,180 @@ +#ifndef BRIAN2CUDA_ERROR_CHECK_H +#define BRIAN2CUDA_ERROR_CHECK_H +#include +#include +#include "objects.h" +#include "curand.h" + +// Define this to turn on error checking +#define BRIAN2CUDA_ERROR_CHECK +// Define this to synchronize device before checking errors +//#define BRIAN2CUDA_ERROR_CHECK_BLOCKING + +// Define this to turn on memory checking +//#define BRIAN2CUDA_MEMORY_CHECK +// Define this to synchronize device before checking memory +//#define BRIAN2CUDA_MEMORY_CHECK_BLOCKING + + +// partly adapted from https://gist.github.com/ashwin/2652488 +#define CUDA_SAFE_CALL(err) _cudaSafeCall(err, __FILE__, __LINE__, #err) +#define CUDA_CHECK_ERROR(msg) _cudaCheckError(__FILE__, __LINE__, #msg) +#define CUDA_CHECK_MEMORY() _cudaCheckMemory(__FILE__, __LINE__) +#define THRUST_CHECK_ERROR(code) { try {code;} \ + catch(...) {_thrustCheckError(__FILE__, __LINE__, #code);} } + + +// adapted from NVIDIA cuda samples, shipped with cuda 10.1 (common/inc/helper_cuda.h) +#ifdef CURAND_H_ +// cuRAND API errors +static const char *_curandGetErrorEnum(curandStatus_t error) { + switch (error) { + case CURAND_STATUS_SUCCESS: + return "CURAND_STATUS_SUCCESS"; + + case CURAND_STATUS_VERSION_MISMATCH: + return "CURAND_STATUS_VERSION_MISMATCH"; + + case CURAND_STATUS_NOT_INITIALIZED: + return "CURAND_STATUS_NOT_INITIALIZED"; + + case CURAND_STATUS_ALLOCATION_FAILED: + return "CURAND_STATUS_ALLOCATION_FAILED"; + + case CURAND_STATUS_TYPE_ERROR: + return "CURAND_STATUS_TYPE_ERROR"; + + case CURAND_STATUS_OUT_OF_RANGE: + return "CURAND_STATUS_OUT_OF_RANGE"; + + case CURAND_STATUS_LENGTH_NOT_MULTIPLE: + return "CURAND_STATUS_LENGTH_NOT_MULTIPLE"; + + case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: + return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED"; + + case CURAND_STATUS_LAUNCH_FAILURE: + return "CURAND_STATUS_LAUNCH_FAILURE"; + + case CURAND_STATUS_PREEXISTING_FAILURE: + return "CURAND_STATUS_PREEXISTING_FAILURE"; + + case CURAND_STATUS_INITIALIZATION_FAILED: + return "CURAND_STATUS_INITIALIZATION_FAILED"; + + case CURAND_STATUS_ARCH_MISMATCH: + return "CURAND_STATUS_ARCH_MISMATCH"; + + case CURAND_STATUS_INTERNAL_ERROR: + return "CURAND_STATUS_INTERNAL_ERROR"; + } + + return ""; +} +#endif + + +inline void _cudaSafeCall(cudaError err, const char *file, const int line, const char *call = "") +{ +#ifdef BRIAN2CUDA_ERROR_CHECK + if (cudaSuccess != err) + { + fprintf(stderr, "ERROR: %s failed at %s:%i : %s\n", + call, file, line, cudaGetErrorString(err)); + exit(-1); + } +#endif + + return; +} + + +inline void _cudaSafeCall(curandStatus_t err, const char *file, const int line, const char *call = "") +{ +#ifdef BRIAN2CUDA_ERROR_CHECK + if (CURAND_STATUS_SUCCESS != err) + { + fprintf(stderr, "ERROR: %s failed at %s:%i : %s\n", + call, file, line, _curandGetErrorEnum(err)); + exit(-1); + } +#endif + + return; +} + + +inline void _cudaCheckError(const char *file, const int line, const char *msg) +{ +#ifdef 
BRIAN2CUDA_ERROR_CHECK_BLOCKING + // More careful checking. However, this will affect performance. + cudaError err = cudaDeviceSynchronize(); + if(cudaSuccess != err) + { + fprintf(stderr, "ERROR: CUDA_CHECK_ERROR() failed after %s at %s:%i : %s\n", + msg, file, line, cudaGetErrorString(err)); + exit(-1); + } +#else +#ifdef BRIAN2CUDA_ERROR_CHECK + cudaError err = cudaGetLastError(); + if (cudaSuccess != err) + { + fprintf(stderr, "ERROR: CUDA_CHECK_ERROR() failed at %s:%i : %s\n", + file, line, cudaGetErrorString(err)); + exit(-1); + } + +#endif +#endif + + return; +} + + +// Report device memory usage. The memory diff is reported with respect to the +// global brian::used_device_memory as reference, which was set in the last +// _cudaCheckMemory call. +inline void _cudaCheckMemory(const char *file, const int line) +{ +#ifdef BRIAN2CUDA_MEMORY_CHECK +#ifdef BRIAN2CUDA_MEMORY_CHECK_BLOCKING + cudaDeviceSynchronize(); +#endif + const double to_MB = 1.0 / (1024.0 * 1024.0); + size_t avail, total, used, diff; + cudaMemGetInfo(&avail, &total); + used = total - avail; + diff = used - brian::used_device_memory; + // print memory information only if device memory usage changed + // NOTE: Device memory is allocated in chunks. When allocating only little + // memory, the memory usage reported by cudaMemGetInfo might not change if + // the previously allocated chunk has enough free memory to be used for the + // newly requested allocation. + if (diff > 0) + { + fprintf(stdout, "INFO: cuda device memory usage in %s:%i\n" + "\t used: \t %f MB\n" + "\t avail: \t %f MB\n" + "\t total: \t %f MB\n" + "\t diff: \t %f MB \t (%zu bytes)\n", + file, line, + double(used) * to_MB, + double(avail) * to_MB, + double(total) * to_MB, + double(diff) * to_MB, diff); + brian::used_device_memory = used; + } +#endif +} + + +inline void _thrustCheckError(const char *file, const int line, + const char *code) +{ + fprintf(stderr, "ERROR: THRUST_CHECK_ERROR() caught an exception from %s at %s:%i\n", + code, file, line); + throw; +} + +#endif // BRIAN2CUDA_ERROR_CHECK_H diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/brianlib/curand_buffer.h b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/brianlib/curand_buffer.h new file mode 100644 index 00000000..54f1a379 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/brianlib/curand_buffer.h @@ -0,0 +1,194 @@ +#ifndef _CURAND_BUFFER_H +#define _CURAND_BUFFER_H + +#include +#include +#include + + +// XXX: for some documentation on random number generation, check out our wiki: +// https://github.com/brian-team/brian2cuda/wiki/Random-number-generation + + +enum ProbDistr +{ + RAND, // uniform distribution over [0,1) + RANDN // standard normal distribution with mean 0 and std 1 +}; + + +template // random number type +// only float and double are supported as template types +class CurandBuffer +/* This class generates a fixed sized buffer of random numbers on a cuda device, + * copies them to the host and whenever the operater[] is called from the host + * it returns the next random number. After all random numbers returned once, + * a new set of numbers is generated. 
+ */ +{ +private: + int buffer_size; + int current_idx; + bool memory_allocated; + randomNumber_t* host_data; + randomNumber_t* dev_data; + curandGenerator_t* generator; + ProbDistr distribution; + + void generate_numbers() + { + if (current_idx != buffer_size && memory_allocated) + { + printf("WARNING: CurandBuffer::generate_numbers() called before " + "buffer was empty (current_idx = %u, buffer_size = %u)", + current_idx, buffer_size); + } + // TODO: should we allocate the memory in the constructor (even if we end up not using it)? + if (!memory_allocated) + { + // allocate host memory + host_data = new randomNumber_t[buffer_size]; + if (!host_data) + { + printf("ERROR allocating host_data for CurandBuffer (size %ld)\n", sizeof(randomNumber_t)*buffer_size); + exit(EXIT_FAILURE); + } + // allocate device memory + cudaError_t status = cudaMalloc((void **)&dev_data, buffer_size*sizeof(randomNumber_t)); + if (status != cudaSuccess) + { + printf("ERROR allocating memory on device (size = %ld) in %s(%d):\n\t%s\n", + buffer_size*sizeof(randomNumber_t), __FILE__, __LINE__, + cudaGetErrorString(status)); + exit(EXIT_FAILURE); + } + memory_allocated = true; + } + // generate random numbers on device + if (distribution == RAND) + { + curandStatus_t status = generateUniform(*generator, dev_data, buffer_size); + if (status != CURAND_STATUS_SUCCESS) + { + printf("ERROR generating random numbers in %s(%d):\n", __FILE__, __LINE__); + exit(EXIT_FAILURE); + } + } + else // distribution == RANDN + { + curandStatus_t status = generateNormal(*generator, dev_data, buffer_size, 0, 1); + if (status != CURAND_STATUS_SUCCESS) + { + printf("ERROR generating normal distributed random numbers in %s(%d):\n", + __FILE__, __LINE__); + exit(EXIT_FAILURE); + } + } + // copy random numbers to host + cudaError_t status = cudaMemcpy(host_data, dev_data, buffer_size*sizeof(randomNumber_t), cudaMemcpyDeviceToHost); + if (status != cudaSuccess) + { + printf("ERROR copying device to host memory (size = %ld) in %s(%d):\n\t%s\n", + buffer_size*sizeof(randomNumber_t), __FILE__, __LINE__, + cudaGetErrorString(status)); + exit(EXIT_FAILURE); + } + // reset buffer index + current_idx = 0; + } + + curandStatus_t generateUniform(curandGenerator_t generator, randomNumber_t *outputPtr, size_t num) + { + printf("ERROR curand can only generate random numbers as 'float' or 'double' types.\n"); + exit(EXIT_FAILURE); + } + + curandStatus_t generateNormal(curandGenerator_t generator, randomNumber_t *outputPtr, + size_t n, randomNumber_t mean, randomNumber_t stddev) + { + printf("ERROR curand can only generate random numbers as 'float' or 'double' types.\n"); + exit(EXIT_FAILURE); + } + +public: + CurandBuffer(curandGenerator_t* gen, ProbDistr distr) + { + generator = gen; + distribution = distr; + buffer_size = 10000; + current_idx = 0; + memory_allocated = false; + } + + ~CurandBuffer() + { + if (memory_allocated) + { + free_memory(); + } + } + + // We declare the CurandBuffer in anonymous namespace (file global + // variable) in the synapses_create_generator template, therefore its + // declaration scope only ends at program termination, but then the CUDA + // device is already detached, which results in an error when freeing the + // device memory in the destructor. This method can be called to free + // device memory manually before the destructor is called. 
+ void free_memory() + { + delete[] host_data; + cudaError_t status = cudaFree(dev_data); + if (status != cudaSuccess) + { + printf("ERROR freeing device memory in %s(%d):%s\n", + __FILE__, __LINE__, cudaGetErrorString(status)); + exit(EXIT_FAILURE); + } + memory_allocated = false; + } + + // don't return reference to prohibit assignment + randomNumber_t operator[](const int dummy) + { + // we ignore dummy and just return the next number in the buffer + if (current_idx == buffer_size || !memory_allocated) + generate_numbers(); + randomNumber_t number = host_data[current_idx]; + current_idx += 1; + return number; + } +}; // class CurandBuffer + + +// define generator functions depending on curand float type +// normal (RANDN) +template <> inline +curandStatus_t CurandBuffer::generateNormal(curandGenerator_t generator, + float *outputPtr, size_t n, float mean, float stddev) +{ + return curandGenerateNormal(generator, outputPtr, n, mean, stddev); +} + +template <> inline +curandStatus_t CurandBuffer::generateNormal(curandGenerator_t generator, + double *outputPtr, size_t n, double mean, double stddev) +{ + return curandGenerateNormalDouble(generator, outputPtr, n, mean, stddev); +} + +// uniform (RAND) +template <> inline +curandStatus_t CurandBuffer::generateUniform(curandGenerator_t generator, + float *outputPtr, size_t num) +{ + return curandGenerateUniform(generator, outputPtr, num); +} + +template <> inline +curandStatus_t CurandBuffer::generateUniform(curandGenerator_t generator, + double *outputPtr, size_t num) +{ + return curandGenerateUniformDouble(generator, outputPtr, num); +} + +#endif diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/brianlib/dynamic_array.h b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/brianlib/dynamic_array.h new file mode 100644 index 00000000..5e52e467 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/brianlib/dynamic_array.h @@ -0,0 +1,81 @@ +#ifndef _BRIAN_DYNAMIC_ARRAY_H +#define _BRIAN_DYNAMIC_ARRAY_H + +#include + +/* + * 2D Dynamic array class + * + * Efficiency note: if you are regularly resizing, make sure it is the first dimension that + * is resized, not the second one. 
+ * + */ +template +class DynamicArray2D +{ + int old_n, old_m; + std::vector< std::vector* > data; +public: + int n, m; + DynamicArray2D(int _n=0, int _m=0) + { + old_n = 0; + old_m = 0; + resize(_n, _m); + }; + ~DynamicArray2D() + { + resize(0, 0); // handles deallocation + } + void resize() + { + if(old_n!=n) + { + if(nold_n) + { + for(int i=old_n; i; + } + } + if(old_m!=m) + { + for(int i=0; iresize(m); + } else if(n>old_n) + { + for(int i=old_n; iresize(m); + } + } else if(old_m!=m) + { + for(int i=0; iresize(m); + } + } + old_n = n; + old_m = m; + }; + void resize(int _n, int _m) + { + n = _n; + m = _m; + resize(); + } + inline T& operator()(int i, int j) + { + return (*data[i])[j]; + } +}; + +#endif diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/brianlib/spikequeue.h b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/brianlib/spikequeue.h new file mode 100644 index 00000000..acff738d --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/brianlib/spikequeue.h @@ -0,0 +1,454 @@ +#include +#include +#include +#include + +#include "cudaVector.h" +#include + +#include + +using namespace std; + +//TODO: The data type for indices is currently fixed (int), all floating point +// variables (delays, dt) are assumed to use the same data type +typedef int32_t DTYPE_int; + +class CudaSpikeQueue +{ +private: + // critical path coding, taken from + // https://stackoverflow.com/questions/18963293/cuda-atomics-change-flag/18968893#18968893 + volatile int* semaphore; // controll data access when reallocating + __device__ void acquire_semaphore(volatile int *lock){ + while (atomicCAS((int *)lock, 0, 1) != 0); + } + + __device__ void release_semaphore(volatile int *lock){ + *lock = 0; + __threadfence(); + } + +public: + //these vectors should ALWAYS be the same size, since each index refers to a triple of (pre_id, syn_id, post_id) + cudaVector** synapses_queue; + + //our connectivity matrix with dimensions (num_blocks) * neuron_N + //each element + int* num_synapses_by_pre; + int* num_synapses_by_bundle; + int* num_unique_delays_by_pre; + int* unique_delays; + int* global_bundle_id_start_by_pre; + int* synapses_offset_by_bundle; + DTYPE_int* synapse_ids; + DTYPE_int** synapse_ids_by_pre; + int* unique_delays_offset_by_pre; + int* unique_delay_start_idcs; + int current_offset; // offset in circular queue structure + int num_queues; + //int max_num_delays_per_block; + int num_blocks; + int neuron_N; // number of neurons in source of SynapticPathway + int syn_N; + + // When we have 0 synapses, prepare() is not called in synapses_initialise_queue.cu + // and for destroy() to still work, synapses_queue needs to be a null pointer + __device__ CudaSpikeQueue(): synapses_queue(0) {}; + + //Since we can't have a destructor, we need to call this function manually + __device__ void destroy() + { + if(synapses_queue) + { + delete [] synapses_queue; + synapses_queue = 0; + } + } + + /* this function also initiliases all variables, allocs arrays, etc. 
+ * so we need to call it before using the queue + */ + __device__ void prepare( + int tid, + int num_threads, + int _num_blocks, + double _dt, + int _neuron_N, + int _syn_N, + int _num_queues, + int* _num_synapses_by_pre, + int* _num_synapses_by_bundle, + int* _num_unique_delays_by_pre, + int* _unique_delays, + int* _global_bundle_id_start_by_pre, + int* _synapses_offset_by_bundle, + DTYPE_int* _synapse_ids, + DTYPE_int** _synapse_ids_by_pre, + int* _unique_delays_offset_by_pre, + int* _unique_delay_start_idcs + ) + { + if(tid == 0) + { + // TODO add comments + + semaphore = new int[_num_blocks]; + current_offset = 0; + num_blocks = _num_blocks; + neuron_N = _neuron_N; + syn_N = _syn_N; + num_queues = _num_queues; + + // TODO: do we need num_synapses_by_pre? is num_synapses_by_pre[pre_post_block_id] faster then synapses_by_pre[pre_post_block_id].size()? + // if so, add unique_num_synapses_by_pre as well! + num_synapses_by_pre = _num_synapses_by_pre; + num_synapses_by_bundle = _num_synapses_by_bundle; + num_unique_delays_by_pre = _num_unique_delays_by_pre; + unique_delays = _unique_delays; + global_bundle_id_start_by_pre = _global_bundle_id_start_by_pre; + synapses_offset_by_bundle = _synapses_offset_by_bundle; + synapse_ids = _synapse_ids; + synapse_ids_by_pre = _synapse_ids_by_pre; + unique_delays_offset_by_pre = _unique_delays_offset_by_pre; + unique_delay_start_idcs = _unique_delay_start_idcs; + + synapses_queue = new cudaVector*[num_queues]; + if(!synapses_queue) + { + printf("ERROR while allocating memory with size %ld in spikequeue.h/prepare()\n", sizeof(cudaVector*)*num_queues); + } + } + __syncthreads(); + + for (int i = tid; i < _num_blocks; i+=num_threads) + { + semaphore[i] = 0; + } + + for(int i = tid; i < num_queues; i+=num_threads) + { + synapses_queue[i] = new cudaVector[num_blocks]; + if(!synapses_queue[i]) + { + printf("ERROR while allocating memory with size %ld in spikequeue.h/prepare()\n", sizeof(cudaVector)*num_blocks); + } + } + }; + + __device__ void push_synapses( + char* _shared_mem, + int post_neuron_bid, + int tid, + int num_threads, + int spiking_neuron_id) + { + + // following arrays are in global device memory: + // + // synapse_ids_by_pre + // (size == number of synapses) + // + // unique_delays + // delay_start_idx + // (size == number of unique delays) + + assert(blockDim.x == num_threads); + + // idx in the connectivity matrix for this (preID, postBlock) pair + int pre_post_block_id = spiking_neuron_id * num_blocks + post_neuron_bid; + int num_synapses = num_synapses_by_pre[pre_post_block_id]; + int num_unique_delays = num_unique_delays_by_pre[pre_post_block_id]; + // offset in unique_delays and unique_delay_start_idcs arrays (which + // store all delay data, first sorted by pre_post_block, then by delay) + int delay_offset = unique_delays_offset_by_pre[pre_post_block_id]; + // shared_mem is allocated in push_spikes + int* shared_mem_unique_delay_start_idcs = (int*)_shared_mem; + // shared memory for inter thread communication needs to be volatile + volatile int* shared_mem_size_before_resize = shared_mem_unique_delay_start_idcs + num_unique_delays; + volatile int* shared_mem_last_cycle_size_before_resize = shared_mem_size_before_resize + num_unique_delays; + + // spiking_neuron_id should be in range [0,neuron_N] + assert(spiking_neuron_id < neuron_N); + + // Copy to shared memory. If more entries then threads, loop. 
+ for (int i = tid; i < num_unique_delays; i += num_threads) + { + shared_mem_unique_delay_start_idcs[i] = unique_delay_start_idcs[delay_offset + i]; + } + __syncthreads(); + + // ( thread <-> synapse ) correspondence + // If num_threads < num_synapses, loop. + // syn is synapse number (not ID!) + int delay_previous_loop_cycle, size_before_resize; + for (int i = 0; i < num_synapses; i += num_threads) + { + /////////////////////////////////////////////////////////////////////////////////////// + // Example values and code paths for each thread for given delays, num_threads=3, num_synapses=12: + // + // syn (range(0,num_synapses), not ID!) 0 1 2 | 3 4 5 | 6 7 8 | 9 10 11 + // delay 0 0 0 | 0 0 0 | 0 1 1 | 1 2 2 + // + // tid 0 1 2 | 0 1 2 | 0 1 2 | 0 1 2 + // i 0 0 0 | 3 3 3 | 6 6 6 | 9 9 9 + // loop cycle (i/num_threads) 0 0 0 | 1 1 1 | 2 2 2 | 3 3 3 + // delay_start_idx_in_synapses_id 0 0 0 | 0 0 0 | 0 7 7 | 7 10 10 + // next_delay_start_idx_in_synapses_id 7 7 7 | 7 7 7 | 7 10 10 |10 12 12 + // delay_occurrence 7 7 7 | 7 7 7 | 7 3 3 | 3 2 2 + // + // Different code paths (see below): + // ^ ^ ^ ^ ^ ^ ^ ^ + // a + // * * * + // b + /////////////////////////////////////////////////////////////////////////////////////// + + // start loop at 0 to make sure all threads are executing the same number of loops (for __syncthread()) + int syn = i + tid; + // declare variables which we will need after __syncthread() call + int delay_queue, delay_start_idx_in_synapses_id, delay, delay_occurrence, idx_in_unique_delays; + int next_delay_start_idx_in_synapses_id = 0; + + if (syn < num_synapses) + { + // find the starting index in synapse_id_by_pre for the delay corresponding + // to the current synapse and the starting index for the next delay + for (int j = 1; j < num_unique_delays; j++) + { + delay_start_idx_in_synapses_id = next_delay_start_idx_in_synapses_id; + next_delay_start_idx_in_synapses_id = shared_mem_unique_delay_start_idcs[j]; + if (next_delay_start_idx_in_synapses_id > syn) + { + idx_in_unique_delays = j-1; + break; + } + if (j == num_unique_delays - 1) // end of loop + { + // this synapse has the highest delay for the current pre_neuron and post_neuron_block + delay_start_idx_in_synapses_id = next_delay_start_idx_in_synapses_id; + idx_in_unique_delays = j; + // there is no next delay, for the calculation of delay_occurrence we need + next_delay_start_idx_in_synapses_id = num_synapses; + } + } + + // TODO: remove this if statement once we have no_or_const_delay_mode implementation and add + // assert(num_unique_delays > 1) + // otherwise aboves loop is not entered and results in wrong delay_start_idx values + if (num_unique_delays == 1) + { + delay_start_idx_in_synapses_id = 0; + next_delay_start_idx_in_synapses_id = num_synapses; + idx_in_unique_delays = 0; + } + + assert(delay_start_idx_in_synapses_id <= syn && syn < next_delay_start_idx_in_synapses_id); + + // get the delay of the current synapse and the number of synapses with that delay + assert(unique_delays); + delay = unique_delays[delay_offset + idx_in_unique_delays]; + delay_occurrence = next_delay_start_idx_in_synapses_id - delay_start_idx_in_synapses_id; + + // find the spike queue corresponding to this synapses delay + delay_queue = (current_offset + delay) % num_queues; + } + + // make sure only one block resizes (and therefore possibly + // reallocates) and fills this CudaSpikeQueues' CudaVectors. 
+ __syncthreads(); + if (tid == 0) + acquire_semaphore(semaphore + post_neuron_bid); + __syncthreads(); + + // begin critical section + + // Check if there will be at least one resize this loop cycle. + // We always resize in the first cycle (i == 0). + // We have NO resize this loop cycle only if delay_start_idx was in + // the last loop cycle (< i) and next_delay_start_idx is in the + // next cycle loop cycle (>= i + num_threads). + if (i == 0 || + !(delay_start_idx_in_synapses_id < i && + i + num_threads <= next_delay_start_idx_in_synapses_id)) + { + if (syn < num_synapses) // for __synchtheads() + { + // RESIZE QUEUES + // TODO: if we use pointers for cudaVector::m_size, consecutive threads should to the resize + // in order to get coalesced memory access, e.g. by letting the threads that copy the start_idx + // to shared memory then perform aboves code until resize and then let all threads do it again + // for their respective syn number + // -> we get coalesced memory access but have to do more shared mem reads and numerics + if (syn == delay_start_idx_in_synapses_id) // only one thread for each unique delay + { + // only the first thread for each delay does the resizing, in example marked as (*) + // and copies its offset from atomic resizing into shared memory for broadcasting to + // the other threads with same delay + shared_mem_size_before_resize[idx_in_unique_delays] = synapses_queue[delay_queue][post_neuron_bid].increaseSizeBy( + delay_occurrence); + } + } + // make sure size_before_resize is written to shared mem before read by other threads + __syncthreads(); + } + + if (syn < num_synapses) + { + // uncoalseced memory access, TODO: use pointers to consecutive memory locations for cudaVector::m_size + // currently multiple consecutive threads read same global memory address, + // then next consecutive threads read next global memory address + // TODO check memory broadcasting mechanism + // maybe copy size_before_resize into shared memory when copying the unique delay start idx + if (i == 0 || delay != delay_previous_loop_cycle) + { + // only update size_before_resize if we are pushing into a new delay_queue (for the same tid) + // or if we are in the 1. loop cycle (i/num_threads==0) + // in example marked as (^) + if (delay_start_idx_in_synapses_id < i) + { + // if in the previous loop cycle we were not done with that delay, then the delay_queue is + // already resized and we need to take the size_before_resize we saved to shared memory + // in example marked as (a) + // TODO just access the shared mem from the last delay of last cycle? 
+ size_before_resize = shared_mem_last_cycle_size_before_resize[0]; + } + else + { + // the size_before_resize for this delay has not been broadcasted yet + // in example marked all (^), except of (a) + size_before_resize = shared_mem_size_before_resize[idx_in_unique_delays]; + } + } + delay_previous_loop_cycle = delay; + + // PUSH INTO QUEUES + int syn_id = synapse_ids_by_pre[pre_post_block_id][syn]; + // find position in queue for syn + int idx_in_queue = size_before_resize + (syn - delay_start_idx_in_synapses_id); + // each thread updates one value in queue + synapses_queue[delay_queue][post_neuron_bid].at(idx_in_queue) = syn_id; + + } // end if + //end critical section + + __syncthreads(); + if (tid == 0) + release_semaphore(semaphore + post_neuron_bid); + __syncthreads(); + + if (syn < num_synapses) + { + + // TODO: we could do this block right after resizing and save + // the __syncthread below, but would spend more time in the + // critical section, possibly blocking entire blocks + // --> needs benchmarking + if (syn == delay_start_idx_in_synapses_id && (num_threads - tid) < delay_occurrence && tid != 0) + { + // If pushing into this delay queue will not be finished within this loop cycle, + // then in the next loop cycle the queue will already be resized and we won't + // have access to size_before_resize. --> save it to shared memory + // If tid==0, all threads in this loop cycle will push into the same delay queue + // and the size_before_resize will be unchanged next loop cycle if the delay didn't change. + // in example marked as (b) + shared_mem_last_cycle_size_before_resize[0] = size_before_resize; + } + } // end if + + if (num_synapses - i > num_threads) // true if there is another loop cycle needed + { + // make sure shared_mem_last_cycle_size_before_resize is written for the next loop cycle + __syncthreads(); + } + } // end for + } // end push_synapses() + + __device__ void push_bundles( + int post_neuron_bid, + int tid, + int num_threads, + int spiking_neuron_id) + { + + // following arrays are in global device memory: + // + // synapse_ids_by_pre + // (size == number of synapses) + // + // unique_delays + // delay_start_idx + // (size == number of unique delays) + + assert(blockDim.x == num_threads); + + int pre_post_block_id = spiking_neuron_id * num_blocks + post_neuron_bid; + int global_bundle_id_start_idx = global_bundle_id_start_by_pre[pre_post_block_id]; + // num_unique_delays == num_bundles + int num_unique_delays = global_bundle_id_start_by_pre[pre_post_block_id + 1] + - global_bundle_id_start_idx; + + // spiking_neuron_id should be in range [0,neuron_N] + assert(spiking_neuron_id < neuron_N); + + + // ( thread <-> synapse_bundle ) correspondence + // If num_threads < num_unique_delays, loop. + // bundle_idx is bundle number per block (not global bundle ID!) 
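+        // Illustration of the ( thread <-> bundle ) mapping for num_threads=3 and
+        // num_unique_delays=5 (example values only):
+        //
+        //   i (outer loop)         0 0 0 | 3 3
+        //   tid                    0 1 2 | 0 1
+        //   bundle_idx = i + tid   0 1 2 | 3 4
+        //   global_bundle_id       global_bundle_id_start_idx + bundle_idx
+        //
+        // Each participating thread pushes exactly one global bundle ID into the
+        // queue matching its delay, so unlike push_synapses() above there is no
+        // size_before_resize bookkeeping needed here.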
+ for (int i = 0; i < num_unique_delays; i += num_threads) + { + // start loop at 0 to make sure all threads are executing the same number of loops (for __syncthread()) + int bundle_idx = i + tid; + + int global_bundle_id, delay_queue; + if (bundle_idx < num_unique_delays) + { + // we have per pre_post_block_id (total of num_blocks * source_N) a + // local bundle index going from 0 to num_delays for that + // pre_post_block_id + global_bundle_id = global_bundle_id_start_idx + bundle_idx; + + int delay = unique_delays[global_bundle_id]; + // find the spike queue corresponding to this synapses delay + delay_queue = (current_offset + delay) % num_queues; + } + + // make sure only one block resizes (and therefore possibly + // reallocates) and fills this CudaSpikeQueues' CudaVectors. + __syncthreads(); + if (tid == 0) + acquire_semaphore(semaphore + post_neuron_bid); + __syncthreads(); + + if (bundle_idx < num_unique_delays) + { + // begin critical section + synapses_queue[delay_queue][post_neuron_bid].push(global_bundle_id); + // end critical section + } + + __syncthreads(); + if (tid == 0) + release_semaphore(semaphore + post_neuron_bid); + __syncthreads(); + + } // end for + + } // end push_bundles() + + __device__ void advance( + int tid) + { + assert(tid < num_blocks && current_offset < num_queues); + synapses_queue[current_offset][tid].reset(); + __syncthreads(); //TODO no need for this?... + if(tid == 0) + current_offset = (current_offset + 1)%num_queues; + } + + __device__ void peek( + cudaVector** _synapses_queue) + { + *(_synapses_queue) = &(synapses_queue[current_offset][0]); + } +}; diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/brianlib/stdint_compat.h b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/brianlib/stdint_compat.h new file mode 100644 index 00000000..66a75b95 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/brianlib/stdint_compat.h @@ -0,0 +1,13 @@ +#ifndef _BRIAN_STDINT_COMPAT_H +#define _BRIAN_STDINT_COMPAT_H + +// Work around the fact that older MSVC versions don't have stdint.h +#ifdef _MSC_VER +typedef __int32 int32_t; +typedef __int64 int64_t; +typedef unsigned __int64 uint64_t; +#else +#include +#endif + +#endif diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/before_run_synapses_1_post_push_spikes.cu b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/before_run_synapses_1_post_push_spikes.cu new file mode 100644 index 00000000..3da384e0 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/before_run_synapses_1_post_push_spikes.cu @@ -0,0 +1,706 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "objects.h" +#include "code_objects/synapses_1_post_push_spikes.h" +#include "brianlib/cuda_utils.h" + +// Makro for file and line information in _cudaSafeCall +#define COPY_HOST_ARRAY_TO_DEVICE_SYMBOL(a, b, c, d) \ + _copyHostArrayToDeviceSymbol(a, b, c, d, __FILE__, __LINE__) + +namespace { + // vector_t is an alias for thrust:host_vector + template using vector_t = thrust::host_vector; + // tuple type typedef + typedef std::tuple tuple_t; + + std::vector memory_recorder; + + // Functions for online update of mean and std + // for a new value newValue, compute the new count, new mean, the new M2. 
+ // mean accumulates the mean of the entire dataset + // M2 aggregates the squared distance from the mean + // count aggregates the number of samples seen so far + inline void updateMeanStd(int &count, double &mean, double& M2, double newValue){ + count += 1; + double delta = newValue - mean; + mean += delta / count; + double delta2 = newValue - mean; + M2 += delta * delta2; + } + + // get std from aggregated M2 value + double getStd(int count, double M2){ + if (count < 2){ + return NAN; + } + double variance = M2 / (count - 1); + double stdValue = sqrt(variance); + return stdValue; + } + + // Copy the data from a host array to global device memory and copy the + // symbol to a global device variable. + // host_array: host array with data to copy + // device_symbol: global __device__ variable of same type as `host_array` + // num_elements: number of elements in host_array to copy + // NOTE: T can be a pointer variable itself (when copying 2D arrays) + template + inline void _copyHostArrayToDeviceSymbol(const T *host_array, T *&device_symbol, + int num_elements, const char* name, const char* file, + const int line){ + T *d_ptr_tmp; + size_t bytes = sizeof(T) * num_elements; + // allocate device memory + _cudaSafeCall( + cudaMalloc((void**)&d_ptr_tmp, bytes), + file, line, "cudaMalloc"); + // copy data from host array to device + _cudaSafeCall( + cudaMemcpy(d_ptr_tmp, host_array, bytes, cudaMemcpyHostToDevice), + file, line, "cudaMemcpy"); + // copy the device data pointer to the global device symbol + _cudaSafeCall( + cudaMemcpyToSymbol(device_symbol, &d_ptr_tmp, sizeof(T*)), + file, line, "cudaMemcpyToSymbol"); + memory_recorder.push_back(std::make_tuple(name, bytes, num_elements)); + } +} + + +__global__ void _before_run_kernel_synapses_1_post_push_spikes( + int _source_N, + int _num_blocks, + int _num_threads, + double _dt, + int _syn_N, + int num_queues, + bool new_mode) +{ + using namespace brian; + + int tid = threadIdx.x; + + synapses_1_post.queue->prepare( + tid, + _num_threads, + _num_blocks, + 0, + _source_N, + _syn_N, + num_queues, + synapses_1_post_num_synapses_by_pre, + synapses_1_post_num_synapses_by_bundle, + synapses_1_post_num_unique_delays_by_pre, + synapses_1_post_unique_delays, + synapses_1_post_global_bundle_id_start_by_pre, + synapses_1_post_synapses_offset_by_bundle, + synapses_1_post_synapse_ids, + synapses_1_post_synapse_ids_by_pre, + synapses_1_post_unique_delays_offset_by_pre, + synapses_1_post_unique_delay_start_idcs); + synapses_1_post.no_or_const_delay_mode = new_mode; +} + +void _before_run_synapses_1_post_push_spikes() +{ + using namespace brian; + + std::clock_t start_timer = std::clock(); + const double to_MB = 1.0 / (1024.0 * 1024.0); + + CUDA_CHECK_MEMORY(); + size_t used_device_memory_start = used_device_memory; + + ///// HOST_CONSTANTS /////////// + const int _numN = 1; + const int _num_spikespace = 101; + double* const _array_synapses_1_delay_1 = thrust::raw_pointer_cast(&_dynamic_array_synapses_1_delay_1[0]); + const int _numdelay = _dynamic_array_synapses_1_delay_1.size(); + double* const dev_array_synapses_1_delay_1 = thrust::raw_pointer_cast(&dev_dynamic_array_synapses_1_delay_1[0]); + + ///// pointers_lines ///// + + double* _ptr_array_defaultclock_dt = _array_defaultclock_dt; + double* __restrict _ptr_array_synapses_1_delay_1 = _array_synapses_1_delay_1; + int32_t* __restrict _ptr_array_neurongroup_1__spikespace = _array_neurongroup_1__spikespace; + int32_t* _ptr_array_synapses_1_N = _array_synapses_1_N; + + + int64_t syn_N_check = 
_ptr_array_synapses_1_N[0]; + + if (syn_N_check == 0){ + return; + } + else if (syn_N_check > INT_MAX){ + printf("ERROR: There are more Synapses (%lu) than an int can " + "hold on this system (%u).\n", syn_N_check, INT_MAX); + } + // total number of synapses + int syn_N = (int)syn_N_check; + + // simulation time step + double dt = _ptr_array_defaultclock_dt[0]; + // number of neurons in source group + int source_N = 100; + // number of neurons in target group + int target_N = 2500; + + // TODO: for multiple SynapticPathways for the same Synapses object (on_pre and on_post) the following copy is identical in both pathways initialise templates + // delay (on device) was potentially set in group_variable_set_conditional and needs to be copied to host + _dynamic_array_synapses_1_delay_1 = dev_dynamic_array_synapses_1_delay_1; + + ////////////////////// + // Scalar variables // + ////////////////////// + + // total number of (preID, postBlock) pairs + int num_pre_post_blocks = num_parallel_blocks * source_N; + // size of the connectivity matrix (equal number of synapses) + int size_connectivity_matrix = 0; + + // statistics of number of synapses per (preID, postBlock) pair + int sum_num_elements = 0; + int count_num_elements = 0; + double mean_num_elements = 0; + double M2_num_elements = 0; + + // statistics of number of unique delays per (preID, postBlock) pair + int sum_num_unique_elements = 0; + int count_num_unique_elements = 0; + double mean_num_unique_elements = 0; + double M2_num_unique_elements = 0; + + // total number of bundles in all (preID, postBlock) pairs (not known yet) + int num_bundle_ids = 0; + + // statistics of number of synapses per bundle + int sum_bundle_sizes = 0; + int count_bundle_sizes = 0; + double mean_bundle_sizes = 0; + double M2_bundle_sizes = 0; + + + //////////////////////////////////////////////////////// + // Create array and vector variables (in host memory) // + //////////////////////////////////////////////////////// + + /* VARIABLE NAMING: + * Not scalar variables are named after TYPE_NAME_STRUCTURE, with: + * STRUCTURE: the first array dimensions structure (`by_pre`, `by_bundle` or none) + * `by_pre`: Array (host pointer type) of size `num_pre_post_blocks`, + * which is the number of (preID, postBlock) pairs. + * `by_bundle`: thrust::host_vector, size of total number of bundles, + * which is one for each delay in each (preID, postBlock) pair. 
+ * Different (preID, postBlock) pairs can have different sets + * of delay values -> each bundle gets a global bundleID + * none: If no STRUCTURE given, it's a one dim array storing everything + * TYPE: data type in STRUCTURE (`h`, `h_vec`, `h_ptr`, `d_ptr`), with + * `h`: host value, `h_vec`: host vector, `h_ptr`: host pointer, + * `d_ptr`: device pointer (pointing to device, stored in host memory) + * NAME: the variable name + * + * EXAMPLES: + * `h_vec_delays_by_pre` - an array [size = num_pre_post_blocks] of host + * vectors, each storing delay values of a + * (preID, postBlock) pair + * `h_num_synapses_by_bundle` - a host vector of integers specifying the + * number of synapses in a bundle + * `d_ptr_synapse_ids` - a device pointer to synapse IDs (all of them) + */ + + // synapse IDs for each (preID, postBlock) pair + vector_t* h_vec_synapse_ids_by_pre = new vector_t[num_pre_post_blocks]; + // array of synapse IDs in device memory for each (preID, postBlock) pair + int32_t** d_ptr_synapse_ids_by_pre; + // number of synapses for each (preID, postBlock) pair + int* h_num_synapses_by_pre; + + // delay for each synapse in `h_vec_synapse_ids_by_pre`, + // only used to sort synapses by delay + vector_t* h_vec_delays_by_pre = new vector_t[num_pre_post_blocks]; + // array of vectors with unique delays and start indices in synapses arrays + vector_t* h_vec_unique_delays_by_pre; + vector_t* h_vec_unique_delay_start_idcs_by_pre; + // offset in array of all synapse IDs sorted by bundles (we are storing the + // offset as 32bit int instead of a 64bit pointer to the bundle start) + vector_t h_synapses_offset_by_bundle; + // number of synapses in each bundle + vector_t h_num_synapses_by_bundle; + // start of global bundle ID per (preID, postBlock) pair (+ total num bundles) + int* h_global_bundle_id_start_by_pre = new int[num_pre_post_blocks + 1]; + + + // we need to allocate device memory for synapse IDs independent of delay mode + int32_t* d_ptr_synapse_ids; + size_t memory_synapse_ids = sizeof(int32_t) * syn_N; + CUDA_SAFE_CALL( + cudaMalloc((void**)&d_ptr_synapse_ids, memory_synapse_ids) + ); + memory_recorder.push_back(std::make_tuple("synapse IDs", memory_synapse_ids, syn_N)); + + + //fill vectors of connectivity matrix with synapse IDs and delays (in units of simulation time step) + int max_delay = (int)(_dynamic_array_synapses_1_delay_1[0] / dt + 0.5); + int min_delay = max_delay; + for(int syn_id = 0; syn_id < syn_N; syn_id++) // loop through all synapses + { + + + // Code generation checks + assert(0 == 0); + + assert(0 == 0); + + // pre/post_neuron_id are integers from 0 to Nsource/Ntarget (from corresponding + // SynapticPathway) this is relevant only when using Subgroups where they might + // be NOT equal to the idx in their NeuronGroup + int32_t pre_neuron_id = _dynamic_array_synapses_1__synaptic_post[syn_id] - 0; + int32_t post_neuron_id = _dynamic_array_synapses_1__synaptic_pre[syn_id] - 0; + + int delay = (int)(_dynamic_array_synapses_1_delay_1[syn_id] / dt + 0.5); + if (delay > max_delay) + max_delay = delay; + if (delay < min_delay) + min_delay = delay; + + // each parallel executed cuda block gets an equal part of post neuron IDs + int post_block_id = (post_neuron_id * num_parallel_blocks) / target_N; + // we store synapses for each pre neuron and post block + int pre_post_block_id = pre_neuron_id * num_parallel_blocks + post_block_id; + + h_vec_synapse_ids_by_pre[pre_post_block_id].push_back(syn_id); + h_vec_delays_by_pre[pre_post_block_id].push_back(delay); + } + int 
num_queues = max_delay + 1; // we also need a current step + + bool scalar_delay = (max_delay == min_delay); + if (scalar_delay) + synapses_1_post_delay = max_delay; + // Delete delay (in sec) on device, we don't need it + // TODO: don't copy these delays to the device in first place, see #83 + dev_dynamic_array_synapses_1_delay_1.clear(); + dev_dynamic_array_synapses_1_delay_1.shrink_to_fit(); + CUDA_CHECK_MEMORY(); + size_t used_device_memory_after_dealloc = used_device_memory; + + /////////////////////////////////////////////////////// + // Memory allocations which depend on the delay mode // + /////////////////////////////////////////////////////// + + if (scalar_delay) + { + h_num_synapses_by_pre = new int[num_pre_post_blocks]; + d_ptr_synapse_ids_by_pre = new int32_t*[num_pre_post_blocks]; + } + + // allocate memory only if the delays are not all the same + if (!scalar_delay) + { + + h_vec_unique_delay_start_idcs_by_pre = new vector_t[num_pre_post_blocks]; + h_vec_unique_delays_by_pre = new vector_t[num_pre_post_blocks]; + + } + int global_bundle_id_start = 0; + + // loop through connectivity matrix [(preID, postBlock) pairs] + for(int i = 0; i < num_pre_post_blocks; i++) + { + // i is pre_post_block_id + + int num_elements = h_vec_synapse_ids_by_pre[i].size(); + size_connectivity_matrix += num_elements; + if (num_elements > synapses_1_post_max_size) + synapses_1_post_max_size = num_elements; + + if (!scalar_delay) + { + // for this (preID, postBlock), sort all synapses by delay, + // reduce the delay arrays to unique delays and store the + // start indices in the synapses array for each unique delay + + typedef vector_t::iterator itr; + + // sort synapses (values) and delays (keys) by delay + thrust::sort_by_key( + h_vec_delays_by_pre[i].begin(), // keys start + h_vec_delays_by_pre[i].end(), // keys end + h_vec_synapse_ids_by_pre[i].begin() // values start + ); + + // worst case: number of unique delays is num_elements + h_vec_unique_delay_start_idcs_by_pre[i].resize(num_elements); + + // Initialise the unique delay start idcs array as a sequence + thrust::sequence(h_vec_unique_delay_start_idcs_by_pre[i].begin(), + h_vec_unique_delay_start_idcs_by_pre[i].end()); + + // get delays (keys) and values (indices) for first occurence of each delay value + thrust::pair end_pair = thrust::unique_by_key( + h_vec_delays_by_pre[i].begin(), // keys start + h_vec_delays_by_pre[i].end(), // keys end + h_vec_unique_delay_start_idcs_by_pre[i].begin() // values start (position in original delay array) + ); + + itr unique_delay_end = end_pair.first; + itr idx_end = end_pair.second; + + // erase unneded vector entries + h_vec_unique_delay_start_idcs_by_pre[i].erase( + idx_end, h_vec_unique_delay_start_idcs_by_pre[i].end()); + // free not used but allocated host memory + h_vec_unique_delay_start_idcs_by_pre[i].shrink_to_fit(); + h_vec_delays_by_pre[i].erase(unique_delay_end, + h_vec_delays_by_pre[i].end()); + // delay_by_pre holds the set of unique delays now + // we don't need shrink_to_fit, swap takes care of that + h_vec_unique_delays_by_pre[i].swap(h_vec_delays_by_pre[i]); + + int num_unique_elements = h_vec_unique_delays_by_pre[i].size(); + sum_num_unique_elements += num_unique_elements; + + if (num_unique_elements > synapses_1_post_max_num_unique_delays) + synapses_1_post_max_num_unique_delays = num_unique_elements; + + // we need a start ID per i (pre_post_block_id) to calc the global + // bundle ID from the local bundle_idx when pushing + h_global_bundle_id_start_by_pre[i] = 
global_bundle_id_start; + global_bundle_id_start += num_unique_elements; + // the local bundle_idx goes from 0 to num_bundles for each i (pre_post_block_id) + for (int bundle_idx = 0; bundle_idx < num_unique_elements; bundle_idx++) + { + // find the start idx in the synapses array for this delay (bundle) + int synapses_start_idx = h_vec_unique_delay_start_idcs_by_pre[i][bundle_idx]; + // find the number of synapses for this delay (bundle) + int num_synapses; + if (bundle_idx == num_unique_elements - 1) + num_synapses = num_elements - synapses_start_idx; + else + num_synapses = h_vec_unique_delay_start_idcs_by_pre[i][bundle_idx + 1] - synapses_start_idx; + h_num_synapses_by_bundle.push_back(num_synapses); + if (num_synapses > synapses_1_post_max_bundle_size) + synapses_1_post_max_bundle_size = num_synapses; + + // copy this bundle to device and store the device pointer + int32_t* d_this_bundle = d_ptr_synapse_ids + sum_bundle_sizes; + int32_t* h_this_bundle = thrust::raw_pointer_cast(&h_vec_synapse_ids_by_pre[i][synapses_start_idx]); + size_t memory_size = sizeof(int32_t) * num_synapses; + CUDA_SAFE_CALL( + cudaMemcpy(d_this_bundle, h_this_bundle, memory_size, cudaMemcpyHostToDevice) + ); + + h_synapses_offset_by_bundle.push_back(sum_bundle_sizes); + sum_bundle_sizes += num_synapses; + updateMeanStd(count_bundle_sizes, mean_bundle_sizes, M2_bundle_sizes, num_synapses); + } + + updateMeanStd(count_num_unique_elements, mean_num_unique_elements, + M2_num_unique_elements, num_unique_elements); + + } // end if (!scalar_delay) + else // scalar_delay + { + // copy the synapse IDs and the number of synapses for this + // (preID, postBlock) to device and store the device pointer + + h_num_synapses_by_pre[i] = num_elements; + + d_ptr_synapse_ids_by_pre[i] = d_ptr_synapse_ids + sum_num_elements; + CUDA_SAFE_CALL( + cudaMemcpy(d_ptr_synapse_ids_by_pre[i], + thrust::raw_pointer_cast(&(h_vec_synapse_ids_by_pre[i][0])), + sizeof(int32_t) * num_elements, + cudaMemcpyHostToDevice) + ); + } + + sum_num_elements += num_elements; + updateMeanStd(count_num_elements, mean_num_elements, M2_num_elements, num_elements); + } // end for loop through connectivity matrix + printf("INFO connectivity matrix has size %i, number of (pre neuron ID, post neuron block) pairs is %u\n", + size_connectivity_matrix, num_pre_post_blocks); + + if (scalar_delay) + { + // synapses size + COPY_HOST_ARRAY_TO_DEVICE_SYMBOL(h_num_synapses_by_pre, + synapses_1_post_num_synapses_by_pre, num_pre_post_blocks, + "number of synapses per pre/post block"); + // synapses id + COPY_HOST_ARRAY_TO_DEVICE_SYMBOL(d_ptr_synapse_ids_by_pre, + synapses_1_post_synapse_ids_by_pre, num_pre_post_blocks, + "pointers to synapse IDs"); + } + + else // not scalar_delay + { + // Since we now know the total number of unique delays over all + // (preID, postBlock) pairs, we can allocate the device memory + size_t memory_unique_delays_by_pre = sizeof(int) * sum_num_unique_elements; + assert(sum_bundle_sizes == syn_N); + + // array of all unique delas, sorted first by pre_post_block and per + // pre_post_block by delay + int *d_ptr_unique_delays; + CUDA_SAFE_CALL( + cudaMalloc((void**)&d_ptr_unique_delays, memory_unique_delays_by_pre) + ); + memory_recorder.push_back(std::make_tuple( + "unique delays", memory_unique_delays_by_pre, + sum_num_unique_elements)); + + int sum_num_unique_elements_bak = sum_num_unique_elements; + + // reset sum_num_unique_elements, we will use it to offset cudaMemcy correctly + sum_num_unique_elements = 0; + for(int i = 0; i < 
num_pre_post_blocks; i++) // loop through connectivity matrix again + { + + int num_elements = h_vec_synapse_ids_by_pre[i].size(); + int num_unique_elements = h_vec_unique_delays_by_pre[i].size(); + + if(num_elements > 0) + { + // copy the unique delays to the device and store the device pointers + CUDA_SAFE_CALL( + cudaMemcpy(d_ptr_unique_delays + + sum_num_unique_elements, + thrust::raw_pointer_cast( + &(h_vec_unique_delays_by_pre[i][0])), + sizeof(int)*num_unique_elements, + cudaMemcpyHostToDevice) + ); + + + sum_num_unique_elements += num_unique_elements; + } // end if(num_elements < 0) + } // end second loop connectivity matrix + assert(sum_num_unique_elements_bak == sum_num_unique_elements); + + // pointer to start of unique delays array + CUDA_SAFE_CALL( + cudaMemcpyToSymbol(synapses_1_post_unique_delays, + &d_ptr_unique_delays, + sizeof(d_ptr_unique_delays)) + ); + + num_bundle_ids = sum_num_unique_elements; + + // add num_bundle_ids as last entry + h_global_bundle_id_start_by_pre[num_pre_post_blocks] = num_bundle_ids; + + // floor(mean(h_num_synapses_by_bundle)) + synapses_1_post_mean_bundle_size = sum_bundle_sizes / num_bundle_ids; + + // pointer to start of synapse IDs array + CUDA_SAFE_CALL( + cudaMemcpyToSymbol(synapses_1_post_synapse_ids, &d_ptr_synapse_ids, + sizeof(d_ptr_synapse_ids)) + ); + + // size by bundle + COPY_HOST_ARRAY_TO_DEVICE_SYMBOL( + thrust::raw_pointer_cast(&h_num_synapses_by_bundle[0]), + synapses_1_post_num_synapses_by_bundle, num_bundle_ids, + "number of synapses per bundle"); + + // synapses offset by bundle + COPY_HOST_ARRAY_TO_DEVICE_SYMBOL( + thrust::raw_pointer_cast(&h_synapses_offset_by_bundle[0]), + synapses_1_post_synapses_offset_by_bundle, num_bundle_ids, + "synapses bundle offset"); + + // global bundle id start idx by pre + COPY_HOST_ARRAY_TO_DEVICE_SYMBOL( + h_global_bundle_id_start_by_pre, + synapses_1_post_global_bundle_id_start_by_pre, + num_pre_post_blocks + 1, "global bundle ID start"); + + + } // end if (!scalar_delay) + + //////////////////////////////////////////////////// + //// PRINT INFORMATION ON MEMORY USAGE AND TIME //// + //////////////////////////////////////////////////// + + // TODO print statistics! 
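+    // The per-block statistics gathered above use Welford's online algorithm via
+    // updateMeanStd()/getStd() (defined at the top of this file). A minimal usage
+    // sketch with made-up values, for reference only:
+    //
+    //     int count = 0; double mean = 0.0, M2 = 0.0;
+    //     for (double x : {3.0, 5.0, 7.0})
+    //         updateMeanStd(count, mean, M2, x);
+    //     // now mean == 5.0 and getStd(count, M2) == 2.0
+    //     // (sample standard deviation, i.e. sqrt(M2 / (count - 1)))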
+ + // sum all allocated memory + size_t total_memory = 0; + int max_string_length = 0; + for(auto const& tuple: memory_recorder){ + total_memory += std::get<1>(tuple); + int str_len = std::get<0>(tuple).length(); + if (str_len > max_string_length) + max_string_length = str_len; + } + double total_memory_MB = total_memory * to_MB; + max_string_length += 5; + + // sort tuples by used memory + std::sort(begin(memory_recorder), end(memory_recorder), + [](tuple_t const &t1, tuple_t const &t2) { + return std::get<1>(t1) > std::get<1>(t2); // or use a custom compare function + } + ); + + double std_num_elements = getStd(count_num_elements, M2_num_elements); + double std_bundle_sizes = getStd(count_bundle_sizes, M2_bundle_sizes); + double std_num_unique_elements = getStd(count_num_unique_elements, M2_num_unique_elements); + + // print memory information + std::cout.precision(1); + std::cout.setf(std::ios::fixed, std::ios::floatfield); + std::cout << "INFO: synapse statistics and memory usage for synapses_1_post:\n" + << "\tnumber of synapses: " << syn_N << "\n" + << "\tnumber of bundles: " << num_bundle_ids << "\n" + << "\tnumber of pre/post blocks: " << num_pre_post_blocks << "\n" + << "\tnumber of synapses over all pre/post blocks:\n" + << "\t\tmean: " << mean_num_elements << "\tstd: " + << std_num_elements << "\n" + << "\tnumber of unique delays over all pre/post blocks:\n" + << "\t\tmean: " << mean_num_unique_elements << "\tstd: " + << std_num_unique_elements << "\n" + << "\tbundle size over all bundles:\n" + << "\t\tmean: " << mean_bundle_sizes << "\tstd: " + << std_bundle_sizes << "\n" + << "\n\tmemory usage: TOTAL: " << total_memory_MB << " MB (~" + << total_memory_MB / syn_N * 1024.0 * 1024.0 << " byte per synapse)" + << std::endl; + + for(auto const& tuple: memory_recorder){ + std::string name; + size_t bytes; + int num_elements; + std::tie(name, bytes, num_elements) = tuple; + double memory = bytes * to_MB; + double fraction = memory / total_memory_MB * 100; + std::cout << "\t\t" << std::setprecision(1) << std::fixed << fraction + << "%\t" << std::setprecision(3) << std::fixed << memory << " MB\t" + << name << " [" << num_elements << "]" << std::endl; + } + + + // Create circular eventspaces in no_or_const_delay_mode + if (scalar_delay) + { + int num_spikespaces = dev_array_neurongroup_1__spikespace.size(); + if (num_queues > num_spikespaces) + { + for (int i = num_spikespaces; i < num_queues; i++) + { + int32_t* new_eventspace; + cudaError_t status = cudaMalloc((void**)&new_eventspace, + sizeof(int32_t)*_num__array_neurongroup_1__spikespace); + if (status != cudaSuccess) + { + printf("ERROR while allocating momory for dev_array_neurongroup_1__spikespace[%i] on device: %s %s %d\n", + i, cudaGetErrorString(status), __FILE__, __LINE__); + exit(status); + } + dev_array_neurongroup_1__spikespace.push_back(new_eventspace); + } + } + } + + int num_threads = num_queues; + if(num_threads >= max_threads_per_block) + { + num_threads = max_threads_per_block; + } + int num_blocks = 1; + + // check if we have enough ressources to call kernel with given number + // of blocks and threads + struct cudaFuncAttributes funcAttrib; + cudaFuncGetAttributes(&funcAttrib, _before_run_kernel_synapses_1_post_push_spikes); + if (num_threads > funcAttrib.maxThreadsPerBlock) + { + // use the max num_threads before launch failure + num_threads = funcAttrib.maxThreadsPerBlock; + printf("WARNING Not enough ressources available to call " + "_before_run_kernel_synapses_1_post_push_spikes" + "with maximum possible 
threads per block (%u). " + "Reducing num_threads to %u. (Kernel needs %i " + "registers per block, %i bytes of " + "statically-allocated shared memory per block, %i " + "bytes of local memory per thread and a total of %i " + "bytes of user-allocated constant memory)\n", + max_threads_per_block, num_threads, funcAttrib.numRegs, + funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, + funcAttrib.constSizeBytes); + } + else + { + printf("INFO _before_run_kernel_synapses_1_post_push_spikes\n" + "\t%u blocks\n" + "\t%u threads\n" + "\t%i registers per block\n" + "\t%i bytes statically-allocated shared memory per block\n" + "\t%i bytes local memory per thread\n" + "\t%i bytes user-allocated constant memory\n" + "", + num_blocks, num_threads, funcAttrib.numRegs, + funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, + funcAttrib.constSizeBytes); + } + + _before_run_kernel_synapses_1_post_push_spikes<<>>( + source_N, + num_parallel_blocks, + num_threads, + dt, + syn_N, + num_queues, + scalar_delay + ); + + if (scalar_delay) + { + delete [] h_num_synapses_by_pre; + delete [] d_ptr_synapse_ids_by_pre; + } + + //delete temp arrays + delete [] h_vec_synapse_ids_by_pre; + delete [] h_vec_delays_by_pre; + if (!scalar_delay) + { + delete [] h_vec_unique_delay_start_idcs_by_pre; + delete [] h_vec_unique_delays_by_pre; + delete [] h_global_bundle_id_start_by_pre; + } + + synapses_1_post_scalar_delay = scalar_delay; + + cudaError_t status = cudaGetLastError(); + if (status != cudaSuccess) + { + printf("ERROR initialising synapses_1_post in %s:%d %s\n", + __FILE__, __LINE__, cudaGetErrorString(status)); + _dealloc_arrays(); + exit(status); + } + + CUDA_CHECK_MEMORY(); + double time_passed = (double)(std::clock() - start_timer) / CLOCKS_PER_SEC; + std::cout << "INFO: synapses_1_post initialisation took " << time_passed << "s"; + if (used_device_memory_after_dealloc < used_device_memory_start){ + size_t freed_bytes = used_device_memory_start - used_device_memory_after_dealloc; + std::cout << ", freed " << freed_bytes * to_MB << "MB"; + } + if (used_device_memory > used_device_memory_start){ + size_t used_bytes = used_device_memory - used_device_memory_start; + std::cout << " and used " << used_bytes * to_MB << "MB of device memory."; + } + std::cout << std::endl; +} diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/before_run_synapses_1_post_push_spikes.h b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/before_run_synapses_1_post_push_spikes.h new file mode 100644 index 00000000..83719d4f --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/before_run_synapses_1_post_push_spikes.h @@ -0,0 +1,6 @@ +#ifndef _INCLUDED_synapses_1_post_push_spikes_before +#define _INCLUDED_synapses_1_post_push_spikes_before + +void _before_run_synapses_1_post_push_spikes(); + +#endif diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/before_run_synapses_1_pre_push_spikes.cu b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/before_run_synapses_1_pre_push_spikes.cu new file mode 100644 index 00000000..79175459 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/before_run_synapses_1_pre_push_spikes.cu @@ -0,0 +1,466 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "objects.h" +#include 
"code_objects/synapses_1_pre_push_spikes.h" +#include "brianlib/cuda_utils.h" + +// Makro for file and line information in _cudaSafeCall +#define COPY_HOST_ARRAY_TO_DEVICE_SYMBOL(a, b, c, d) \ + _copyHostArrayToDeviceSymbol(a, b, c, d, __FILE__, __LINE__) + +namespace { + // vector_t is an alias for thrust:host_vector + template using vector_t = thrust::host_vector; + // tuple type typedef + typedef std::tuple tuple_t; + + std::vector memory_recorder; + + // Functions for online update of mean and std + // for a new value newValue, compute the new count, new mean, the new M2. + // mean accumulates the mean of the entire dataset + // M2 aggregates the squared distance from the mean + // count aggregates the number of samples seen so far + inline void updateMeanStd(int &count, double &mean, double& M2, double newValue){ + count += 1; + double delta = newValue - mean; + mean += delta / count; + double delta2 = newValue - mean; + M2 += delta * delta2; + } + + // get std from aggregated M2 value + double getStd(int count, double M2){ + if (count < 2){ + return NAN; + } + double variance = M2 / (count - 1); + double stdValue = sqrt(variance); + return stdValue; + } + + // Copy the data from a host array to global device memory and copy the + // symbol to a global device variable. + // host_array: host array with data to copy + // device_symbol: global __device__ variable of same type as `host_array` + // num_elements: number of elements in host_array to copy + // NOTE: T can be a pointer variable itself (when copying 2D arrays) + template + inline void _copyHostArrayToDeviceSymbol(const T *host_array, T *&device_symbol, + int num_elements, const char* name, const char* file, + const int line){ + T *d_ptr_tmp; + size_t bytes = sizeof(T) * num_elements; + // allocate device memory + _cudaSafeCall( + cudaMalloc((void**)&d_ptr_tmp, bytes), + file, line, "cudaMalloc"); + // copy data from host array to device + _cudaSafeCall( + cudaMemcpy(d_ptr_tmp, host_array, bytes, cudaMemcpyHostToDevice), + file, line, "cudaMemcpy"); + // copy the device data pointer to the global device symbol + _cudaSafeCall( + cudaMemcpyToSymbol(device_symbol, &d_ptr_tmp, sizeof(T*)), + file, line, "cudaMemcpyToSymbol"); + memory_recorder.push_back(std::make_tuple(name, bytes, num_elements)); + } +} + + +__global__ void _before_run_kernel_synapses_1_pre_push_spikes( + int _source_N, + int _num_blocks, + int _num_threads, + double _dt, + int _syn_N, + int num_queues, + bool new_mode) +{ + using namespace brian; + + int tid = threadIdx.x; + + synapses_1_pre.queue->prepare( + tid, + _num_threads, + _num_blocks, + 0, + _source_N, + _syn_N, + num_queues, + synapses_1_pre_num_synapses_by_pre, + synapses_1_pre_num_synapses_by_bundle, + synapses_1_pre_num_unique_delays_by_pre, + synapses_1_pre_unique_delays, + synapses_1_pre_global_bundle_id_start_by_pre, + synapses_1_pre_synapses_offset_by_bundle, + synapses_1_pre_synapse_ids, + synapses_1_pre_synapse_ids_by_pre, + synapses_1_pre_unique_delays_offset_by_pre, + synapses_1_pre_unique_delay_start_idcs); + synapses_1_pre.no_or_const_delay_mode = new_mode; +} + +void _before_run_synapses_1_pre_push_spikes() +{ + using namespace brian; + + std::clock_t start_timer = std::clock(); + const double to_MB = 1.0 / (1024.0 * 1024.0); + + CUDA_CHECK_MEMORY(); + size_t used_device_memory_start = used_device_memory; + + ///// HOST_CONSTANTS /////////// + const int _numN = 1; + const int _num_spikespace = 2501; + double* const _array_synapses_1_delay = 
thrust::raw_pointer_cast(&_dynamic_array_synapses_1_delay[0]); + const int _numdelay = _dynamic_array_synapses_1_delay.size(); + double* const dev_array_synapses_1_delay = thrust::raw_pointer_cast(&dev_dynamic_array_synapses_1_delay[0]); + + ///// pointers_lines ///// + + double* _ptr_array_defaultclock_dt = _array_defaultclock_dt; + double* _ptr_array_synapses_1_delay = _array_synapses_1_delay; + int32_t* __restrict _ptr_array_neurongroup__spikespace = _array_neurongroup__spikespace; + int32_t* _ptr_array_synapses_1_N = _array_synapses_1_N; + + + int64_t syn_N_check = _ptr_array_synapses_1_N[0]; + + if (syn_N_check == 0){ + return; + } + else if (syn_N_check > INT_MAX){ + printf("ERROR: There are more Synapses (%lu) than an int can " + "hold on this system (%u).\n", syn_N_check, INT_MAX); + } + // total number of synapses + int syn_N = (int)syn_N_check; + + // simulation time step + double dt = _ptr_array_defaultclock_dt[0]; + // number of neurons in source group + int source_N = 2500; + // number of neurons in target group + int target_N = 100; + + // TODO: for multiple SynapticPathways for the same Synapses object (on_pre and on_post) the following copy is identical in both pathways initialise templates + + ////////////////////// + // Scalar variables // + ////////////////////// + + // total number of (preID, postBlock) pairs + int num_pre_post_blocks = num_parallel_blocks * source_N; + // size of the connectivity matrix (equal number of synapses) + int size_connectivity_matrix = 0; + + // statistics of number of synapses per (preID, postBlock) pair + int sum_num_elements = 0; + int count_num_elements = 0; + double mean_num_elements = 0; + double M2_num_elements = 0; + + + //////////////////////////////////////////////////////// + // Create array and vector variables (in host memory) // + //////////////////////////////////////////////////////// + + /* VARIABLE NAMING: + * Not scalar variables are named after TYPE_NAME_STRUCTURE, with: + * STRUCTURE: the first array dimensions structure (`by_pre`, `by_bundle` or none) + * `by_pre`: Array (host pointer type) of size `num_pre_post_blocks`, + * which is the number of (preID, postBlock) pairs. + * `by_bundle`: thrust::host_vector, size of total number of bundles, + * which is one for each delay in each (preID, postBlock) pair. 
+ * Different (preID, postBlock) pairs can have different sets + * of delay values -> each bundle gets a global bundleID + * none: If no STRUCTURE given, it's a one dim array storing everything + * TYPE: data type in STRUCTURE (`h`, `h_vec`, `h_ptr`, `d_ptr`), with + * `h`: host value, `h_vec`: host vector, `h_ptr`: host pointer, + * `d_ptr`: device pointer (pointing to device, stored in host memory) + * NAME: the variable name + * + * EXAMPLES: + * `h_vec_delays_by_pre` - an array [size = num_pre_post_blocks] of host + * vectors, each storing delay values of a + * (preID, postBlock) pair + * `h_num_synapses_by_bundle` - a host vector of integers specifying the + * number of synapses in a bundle + * `d_ptr_synapse_ids` - a device pointer to synapse IDs (all of them) + */ + + // synapse IDs for each (preID, postBlock) pair + vector_t* h_vec_synapse_ids_by_pre = new vector_t[num_pre_post_blocks]; + // array of synapse IDs in device memory for each (preID, postBlock) pair + int32_t** d_ptr_synapse_ids_by_pre; + // number of synapses for each (preID, postBlock) pair + int* h_num_synapses_by_pre; + + + + // we need to allocate device memory for synapse IDs independent of delay mode + int32_t* d_ptr_synapse_ids; + size_t memory_synapse_ids = sizeof(int32_t) * syn_N; + CUDA_SAFE_CALL( + cudaMalloc((void**)&d_ptr_synapse_ids, memory_synapse_ids) + ); + memory_recorder.push_back(std::make_tuple("synapse IDs", memory_synapse_ids, syn_N)); + + + //fill vectors of connectivity matrix with synapse IDs and delays (in units of simulation time step) + int max_delay = (int)(_dynamic_array_synapses_1_delay[0] / dt + 0.5); + for(int syn_id = 0; syn_id < syn_N; syn_id++) // loop through all synapses + { + + + // Code generation checks + assert(0 == 0); + + assert(0 == 0); + + // pre/post_neuron_id are integers from 0 to Nsource/Ntarget (from corresponding + // SynapticPathway) this is relevant only when using Subgroups where they might + // be NOT equal to the idx in their NeuronGroup + int32_t pre_neuron_id = _dynamic_array_synapses_1__synaptic_pre[syn_id] - 0; + int32_t post_neuron_id = _dynamic_array_synapses_1__synaptic_post[syn_id] - 0; + + + // each parallel executed cuda block gets an equal part of post neuron IDs + int post_block_id = (post_neuron_id * num_parallel_blocks) / target_N; + // we store synapses for each pre neuron and post block + int pre_post_block_id = pre_neuron_id * num_parallel_blocks + post_block_id; + + h_vec_synapse_ids_by_pre[pre_post_block_id].push_back(syn_id); + } + int num_queues = max_delay + 1; // we also need a current step + + synapses_1_pre_delay = max_delay; + // Delete delay (in sec) on device, we don't need it + // TODO: don't copy these delays to the device in first place, see #83 + dev_dynamic_array_synapses_1_delay.clear(); + dev_dynamic_array_synapses_1_delay.shrink_to_fit(); + CUDA_CHECK_MEMORY(); + size_t used_device_memory_after_dealloc = used_device_memory; + + /////////////////////////////////////////////////////// + // Memory allocations which depend on the delay mode // + /////////////////////////////////////////////////////// + + { + h_num_synapses_by_pre = new int[num_pre_post_blocks]; + d_ptr_synapse_ids_by_pre = new int32_t*[num_pre_post_blocks]; + } + + + // loop through connectivity matrix [(preID, postBlock) pairs] + for(int i = 0; i < num_pre_post_blocks; i++) + { + // i is pre_post_block_id + + int num_elements = h_vec_synapse_ids_by_pre[i].size(); + size_connectivity_matrix += num_elements; + if (num_elements > synapses_1_pre_max_size) + 
synapses_1_pre_max_size = num_elements; + + { + // copy the synapse IDs and the number of synapses for this + // (preID, postBlock) to device and store the device pointer + + h_num_synapses_by_pre[i] = num_elements; + + d_ptr_synapse_ids_by_pre[i] = d_ptr_synapse_ids + sum_num_elements; + CUDA_SAFE_CALL( + cudaMemcpy(d_ptr_synapse_ids_by_pre[i], + thrust::raw_pointer_cast(&(h_vec_synapse_ids_by_pre[i][0])), + sizeof(int32_t) * num_elements, + cudaMemcpyHostToDevice) + ); + } + + sum_num_elements += num_elements; + updateMeanStd(count_num_elements, mean_num_elements, M2_num_elements, num_elements); + } // end for loop through connectivity matrix + printf("INFO connectivity matrix has size %i, number of (pre neuron ID, post neuron block) pairs is %u\n", + size_connectivity_matrix, num_pre_post_blocks); + + { + // synapses size + COPY_HOST_ARRAY_TO_DEVICE_SYMBOL(h_num_synapses_by_pre, + synapses_1_pre_num_synapses_by_pre, num_pre_post_blocks, + "number of synapses per pre/post block"); + // synapses id + COPY_HOST_ARRAY_TO_DEVICE_SYMBOL(d_ptr_synapse_ids_by_pre, + synapses_1_pre_synapse_ids_by_pre, num_pre_post_blocks, + "pointers to synapse IDs"); + } + + + //////////////////////////////////////////////////// + //// PRINT INFORMATION ON MEMORY USAGE AND TIME //// + //////////////////////////////////////////////////// + + // TODO print statistics! + + // sum all allocated memory + size_t total_memory = 0; + int max_string_length = 0; + for(auto const& tuple: memory_recorder){ + total_memory += std::get<1>(tuple); + int str_len = std::get<0>(tuple).length(); + if (str_len > max_string_length) + max_string_length = str_len; + } + double total_memory_MB = total_memory * to_MB; + max_string_length += 5; + + // sort tuples by used memory + std::sort(begin(memory_recorder), end(memory_recorder), + [](tuple_t const &t1, tuple_t const &t2) { + return std::get<1>(t1) > std::get<1>(t2); // or use a custom compare function + } + ); + + double std_num_elements = getStd(count_num_elements, M2_num_elements); + + // print memory information + std::cout.precision(1); + std::cout.setf(std::ios::fixed, std::ios::floatfield); + std::cout << "INFO: synapse statistics and memory usage for synapses_1_pre:\n" + << "\tnumber of synapses: " << syn_N << "\n" + << "\tnumber of pre/post blocks: " << num_pre_post_blocks << "\n" + << "\tnumber of synapses over all pre/post blocks:\n" + << "\t\tmean: " << mean_num_elements << "\tstd: " + << std_num_elements << "\n" + << "\n\tmemory usage: TOTAL: " << total_memory_MB << " MB (~" + << total_memory_MB / syn_N * 1024.0 * 1024.0 << " byte per synapse)" + << std::endl; + + for(auto const& tuple: memory_recorder){ + std::string name; + size_t bytes; + int num_elements; + std::tie(name, bytes, num_elements) = tuple; + double memory = bytes * to_MB; + double fraction = memory / total_memory_MB * 100; + std::cout << "\t\t" << std::setprecision(1) << std::fixed << fraction + << "%\t" << std::setprecision(3) << std::fixed << memory << " MB\t" + << name << " [" << num_elements << "]" << std::endl; + } + + + // Create circular eventspaces in no_or_const_delay_mode + { + int num_spikespaces = dev_array_neurongroup__spikespace.size(); + if (num_queues > num_spikespaces) + { + for (int i = num_spikespaces; i < num_queues; i++) + { + int32_t* new_eventspace; + cudaError_t status = cudaMalloc((void**)&new_eventspace, + sizeof(int32_t)*_num__array_neurongroup__spikespace); + if (status != cudaSuccess) + { + printf("ERROR while allocating momory for dev_array_neurongroup__spikespace[%i] on 
device: %s %s %d\n", + i, cudaGetErrorString(status), __FILE__, __LINE__); + exit(status); + } + dev_array_neurongroup__spikespace.push_back(new_eventspace); + } + } + } + + int num_threads = num_queues; + if(num_threads >= max_threads_per_block) + { + num_threads = max_threads_per_block; + } + int num_blocks = 1; + + // check if we have enough ressources to call kernel with given number + // of blocks and threads + struct cudaFuncAttributes funcAttrib; + cudaFuncGetAttributes(&funcAttrib, _before_run_kernel_synapses_1_pre_push_spikes); + if (num_threads > funcAttrib.maxThreadsPerBlock) + { + // use the max num_threads before launch failure + num_threads = funcAttrib.maxThreadsPerBlock; + printf("WARNING Not enough ressources available to call " + "_before_run_kernel_synapses_1_pre_push_spikes" + "with maximum possible threads per block (%u). " + "Reducing num_threads to %u. (Kernel needs %i " + "registers per block, %i bytes of " + "statically-allocated shared memory per block, %i " + "bytes of local memory per thread and a total of %i " + "bytes of user-allocated constant memory)\n", + max_threads_per_block, num_threads, funcAttrib.numRegs, + funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, + funcAttrib.constSizeBytes); + } + else + { + printf("INFO _before_run_kernel_synapses_1_pre_push_spikes\n" + "\t%u blocks\n" + "\t%u threads\n" + "\t%i registers per block\n" + "\t%i bytes statically-allocated shared memory per block\n" + "\t%i bytes local memory per thread\n" + "\t%i bytes user-allocated constant memory\n" + "", + num_blocks, num_threads, funcAttrib.numRegs, + funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, + funcAttrib.constSizeBytes); + } + + _before_run_kernel_synapses_1_pre_push_spikes<<>>( + source_N, + num_parallel_blocks, + num_threads, + dt, + syn_N, + num_queues, + true + ); + + { + delete [] h_num_synapses_by_pre; + delete [] d_ptr_synapse_ids_by_pre; + } + + //delete temp arrays + delete [] h_vec_synapse_ids_by_pre; + + synapses_1_pre_scalar_delay = true; + + cudaError_t status = cudaGetLastError(); + if (status != cudaSuccess) + { + printf("ERROR initialising synapses_1_pre in %s:%d %s\n", + __FILE__, __LINE__, cudaGetErrorString(status)); + _dealloc_arrays(); + exit(status); + } + + CUDA_CHECK_MEMORY(); + double time_passed = (double)(std::clock() - start_timer) / CLOCKS_PER_SEC; + std::cout << "INFO: synapses_1_pre initialisation took " << time_passed << "s"; + if (used_device_memory_after_dealloc < used_device_memory_start){ + size_t freed_bytes = used_device_memory_start - used_device_memory_after_dealloc; + std::cout << ", freed " << freed_bytes * to_MB << "MB"; + } + if (used_device_memory > used_device_memory_start){ + size_t used_bytes = used_device_memory - used_device_memory_start; + std::cout << " and used " << used_bytes * to_MB << "MB of device memory."; + } + std::cout << std::endl; +} diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/before_run_synapses_1_pre_push_spikes.h b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/before_run_synapses_1_pre_push_spikes.h new file mode 100644 index 00000000..2e0f7d44 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/before_run_synapses_1_pre_push_spikes.h @@ -0,0 +1,6 @@ +#ifndef _INCLUDED_synapses_1_pre_push_spikes_before +#define _INCLUDED_synapses_1_pre_push_spikes_before + +void _before_run_synapses_1_pre_push_spikes(); + +#endif diff --git 
a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/before_run_synapses_2_pre_push_spikes.cu b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/before_run_synapses_2_pre_push_spikes.cu new file mode 100644 index 00000000..f2ab3820 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/before_run_synapses_2_pre_push_spikes.cu @@ -0,0 +1,466 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "objects.h" +#include "code_objects/synapses_2_pre_push_spikes.h" +#include "brianlib/cuda_utils.h" + +// Makro for file and line information in _cudaSafeCall +#define COPY_HOST_ARRAY_TO_DEVICE_SYMBOL(a, b, c, d) \ + _copyHostArrayToDeviceSymbol(a, b, c, d, __FILE__, __LINE__) + +namespace { + // vector_t is an alias for thrust:host_vector + template using vector_t = thrust::host_vector; + // tuple type typedef + typedef std::tuple tuple_t; + + std::vector memory_recorder; + + // Functions for online update of mean and std + // for a new value newValue, compute the new count, new mean, the new M2. + // mean accumulates the mean of the entire dataset + // M2 aggregates the squared distance from the mean + // count aggregates the number of samples seen so far + inline void updateMeanStd(int &count, double &mean, double& M2, double newValue){ + count += 1; + double delta = newValue - mean; + mean += delta / count; + double delta2 = newValue - mean; + M2 += delta * delta2; + } + + // get std from aggregated M2 value + double getStd(int count, double M2){ + if (count < 2){ + return NAN; + } + double variance = M2 / (count - 1); + double stdValue = sqrt(variance); + return stdValue; + } + + // Copy the data from a host array to global device memory and copy the + // symbol to a global device variable. 
+ // host_array: host array with data to copy + // device_symbol: global __device__ variable of same type as `host_array` + // num_elements: number of elements in host_array to copy + // NOTE: T can be a pointer variable itself (when copying 2D arrays) + template + inline void _copyHostArrayToDeviceSymbol(const T *host_array, T *&device_symbol, + int num_elements, const char* name, const char* file, + const int line){ + T *d_ptr_tmp; + size_t bytes = sizeof(T) * num_elements; + // allocate device memory + _cudaSafeCall( + cudaMalloc((void**)&d_ptr_tmp, bytes), + file, line, "cudaMalloc"); + // copy data from host array to device + _cudaSafeCall( + cudaMemcpy(d_ptr_tmp, host_array, bytes, cudaMemcpyHostToDevice), + file, line, "cudaMemcpy"); + // copy the device data pointer to the global device symbol + _cudaSafeCall( + cudaMemcpyToSymbol(device_symbol, &d_ptr_tmp, sizeof(T*)), + file, line, "cudaMemcpyToSymbol"); + memory_recorder.push_back(std::make_tuple(name, bytes, num_elements)); + } +} + + +__global__ void _before_run_kernel_synapses_2_pre_push_spikes( + int _source_N, + int _num_blocks, + int _num_threads, + double _dt, + int _syn_N, + int num_queues, + bool new_mode) +{ + using namespace brian; + + int tid = threadIdx.x; + + synapses_2_pre.queue->prepare( + tid, + _num_threads, + _num_blocks, + 0, + _source_N, + _syn_N, + num_queues, + synapses_2_pre_num_synapses_by_pre, + synapses_2_pre_num_synapses_by_bundle, + synapses_2_pre_num_unique_delays_by_pre, + synapses_2_pre_unique_delays, + synapses_2_pre_global_bundle_id_start_by_pre, + synapses_2_pre_synapses_offset_by_bundle, + synapses_2_pre_synapse_ids, + synapses_2_pre_synapse_ids_by_pre, + synapses_2_pre_unique_delays_offset_by_pre, + synapses_2_pre_unique_delay_start_idcs); + synapses_2_pre.no_or_const_delay_mode = new_mode; +} + +void _before_run_synapses_2_pre_push_spikes() +{ + using namespace brian; + + std::clock_t start_timer = std::clock(); + const double to_MB = 1.0 / (1024.0 * 1024.0); + + CUDA_CHECK_MEMORY(); + size_t used_device_memory_start = used_device_memory; + + ///// HOST_CONSTANTS /////////// + const int _numN = 1; + const int _num_spikespace = 101; + double* const _array_synapses_2_delay = thrust::raw_pointer_cast(&_dynamic_array_synapses_2_delay[0]); + const int _numdelay = _dynamic_array_synapses_2_delay.size(); + double* const dev_array_synapses_2_delay = thrust::raw_pointer_cast(&dev_dynamic_array_synapses_2_delay[0]); + + ///// pointers_lines ///// + + double* _ptr_array_defaultclock_dt = _array_defaultclock_dt; + double* _ptr_array_synapses_2_delay = _array_synapses_2_delay; + int32_t* __restrict _ptr_array_neurongroup_1__spikespace = _array_neurongroup_1__spikespace; + int32_t* _ptr_array_synapses_2_N = _array_synapses_2_N; + + + int64_t syn_N_check = _ptr_array_synapses_2_N[0]; + + if (syn_N_check == 0){ + return; + } + else if (syn_N_check > INT_MAX){ + printf("ERROR: There are more Synapses (%lu) than an int can " + "hold on this system (%u).\n", syn_N_check, INT_MAX); + } + // total number of synapses + int syn_N = (int)syn_N_check; + + // simulation time step + double dt = _ptr_array_defaultclock_dt[0]; + // number of neurons in source group + int source_N = 100; + // number of neurons in target group + int target_N = 100; + + // TODO: for multiple SynapticPathways for the same Synapses object (on_pre and on_post) the following copy is identical in both pathways initialise templates + + ////////////////////// + // Scalar variables // + ////////////////////// + + // total number of (preID, 
postBlock) pairs + int num_pre_post_blocks = num_parallel_blocks * source_N; + // size of the connectivity matrix (equal number of synapses) + int size_connectivity_matrix = 0; + + // statistics of number of synapses per (preID, postBlock) pair + int sum_num_elements = 0; + int count_num_elements = 0; + double mean_num_elements = 0; + double M2_num_elements = 0; + + + //////////////////////////////////////////////////////// + // Create array and vector variables (in host memory) // + //////////////////////////////////////////////////////// + + /* VARIABLE NAMING: + * Not scalar variables are named after TYPE_NAME_STRUCTURE, with: + * STRUCTURE: the first array dimensions structure (`by_pre`, `by_bundle` or none) + * `by_pre`: Array (host pointer type) of size `num_pre_post_blocks`, + * which is the number of (preID, postBlock) pairs. + * `by_bundle`: thrust::host_vector, size of total number of bundles, + * which is one for each delay in each (preID, postBlock) pair. + * Different (preID, postBlock) pairs can have different sets + * of delay values -> each bundle gets a global bundleID + * none: If no STRUCTURE given, it's a one dim array storing everything + * TYPE: data type in STRUCTURE (`h`, `h_vec`, `h_ptr`, `d_ptr`), with + * `h`: host value, `h_vec`: host vector, `h_ptr`: host pointer, + * `d_ptr`: device pointer (pointing to device, stored in host memory) + * NAME: the variable name + * + * EXAMPLES: + * `h_vec_delays_by_pre` - an array [size = num_pre_post_blocks] of host + * vectors, each storing delay values of a + * (preID, postBlock) pair + * `h_num_synapses_by_bundle` - a host vector of integers specifying the + * number of synapses in a bundle + * `d_ptr_synapse_ids` - a device pointer to synapse IDs (all of them) + */ + + // synapse IDs for each (preID, postBlock) pair + vector_t* h_vec_synapse_ids_by_pre = new vector_t[num_pre_post_blocks]; + // array of synapse IDs in device memory for each (preID, postBlock) pair + int32_t** d_ptr_synapse_ids_by_pre; + // number of synapses for each (preID, postBlock) pair + int* h_num_synapses_by_pre; + + + + // we need to allocate device memory for synapse IDs independent of delay mode + int32_t* d_ptr_synapse_ids; + size_t memory_synapse_ids = sizeof(int32_t) * syn_N; + CUDA_SAFE_CALL( + cudaMalloc((void**)&d_ptr_synapse_ids, memory_synapse_ids) + ); + memory_recorder.push_back(std::make_tuple("synapse IDs", memory_synapse_ids, syn_N)); + + + //fill vectors of connectivity matrix with synapse IDs and delays (in units of simulation time step) + int max_delay = (int)(_dynamic_array_synapses_2_delay[0] / dt + 0.5); + for(int syn_id = 0; syn_id < syn_N; syn_id++) // loop through all synapses + { + + + // Code generation checks + assert(0 == 0); + + assert(0 == 0); + + // pre/post_neuron_id are integers from 0 to Nsource/Ntarget (from corresponding + // SynapticPathway) this is relevant only when using Subgroups where they might + // be NOT equal to the idx in their NeuronGroup + int32_t pre_neuron_id = _dynamic_array_synapses_2__synaptic_pre[syn_id] - 0; + int32_t post_neuron_id = _dynamic_array_synapses_2__synaptic_post[syn_id] - 0; + + + // each parallel executed cuda block gets an equal part of post neuron IDs + int post_block_id = (post_neuron_id * num_parallel_blocks) / target_N; + // we store synapses for each pre neuron and post block + int pre_post_block_id = pre_neuron_id * num_parallel_blocks + post_block_id; + + h_vec_synapse_ids_by_pre[pre_post_block_id].push_back(syn_id); + } + int num_queues = max_delay + 1; // we also need 
a current step + + synapses_2_pre_delay = max_delay; + // Delete delay (in sec) on device, we don't need it + // TODO: don't copy these delays to the device in first place, see #83 + dev_dynamic_array_synapses_2_delay.clear(); + dev_dynamic_array_synapses_2_delay.shrink_to_fit(); + CUDA_CHECK_MEMORY(); + size_t used_device_memory_after_dealloc = used_device_memory; + + /////////////////////////////////////////////////////// + // Memory allocations which depend on the delay mode // + /////////////////////////////////////////////////////// + + { + h_num_synapses_by_pre = new int[num_pre_post_blocks]; + d_ptr_synapse_ids_by_pre = new int32_t*[num_pre_post_blocks]; + } + + + // loop through connectivity matrix [(preID, postBlock) pairs] + for(int i = 0; i < num_pre_post_blocks; i++) + { + // i is pre_post_block_id + + int num_elements = h_vec_synapse_ids_by_pre[i].size(); + size_connectivity_matrix += num_elements; + if (num_elements > synapses_2_pre_max_size) + synapses_2_pre_max_size = num_elements; + + { + // copy the synapse IDs and the number of synapses for this + // (preID, postBlock) to device and store the device pointer + + h_num_synapses_by_pre[i] = num_elements; + + d_ptr_synapse_ids_by_pre[i] = d_ptr_synapse_ids + sum_num_elements; + CUDA_SAFE_CALL( + cudaMemcpy(d_ptr_synapse_ids_by_pre[i], + thrust::raw_pointer_cast(&(h_vec_synapse_ids_by_pre[i][0])), + sizeof(int32_t) * num_elements, + cudaMemcpyHostToDevice) + ); + } + + sum_num_elements += num_elements; + updateMeanStd(count_num_elements, mean_num_elements, M2_num_elements, num_elements); + } // end for loop through connectivity matrix + printf("INFO connectivity matrix has size %i, number of (pre neuron ID, post neuron block) pairs is %u\n", + size_connectivity_matrix, num_pre_post_blocks); + + { + // synapses size + COPY_HOST_ARRAY_TO_DEVICE_SYMBOL(h_num_synapses_by_pre, + synapses_2_pre_num_synapses_by_pre, num_pre_post_blocks, + "number of synapses per pre/post block"); + // synapses id + COPY_HOST_ARRAY_TO_DEVICE_SYMBOL(d_ptr_synapse_ids_by_pre, + synapses_2_pre_synapse_ids_by_pre, num_pre_post_blocks, + "pointers to synapse IDs"); + } + + + //////////////////////////////////////////////////// + //// PRINT INFORMATION ON MEMORY USAGE AND TIME //// + //////////////////////////////////////////////////// + + // TODO print statistics! 
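+    // Note (illustration only): the mean/std statistics printed below are
+    // accumulated with Welford's online algorithm (see updateMeanStd/getStd
+    // above). E.g. feeding the values 1, 2, 3 gives count = 3, mean = 2,
+    // M2 = 2, and getStd returns sqrt(M2 / (count - 1)) = 1.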
+ + // sum all allocated memory + size_t total_memory = 0; + int max_string_length = 0; + for(auto const& tuple: memory_recorder){ + total_memory += std::get<1>(tuple); + int str_len = std::get<0>(tuple).length(); + if (str_len > max_string_length) + max_string_length = str_len; + } + double total_memory_MB = total_memory * to_MB; + max_string_length += 5; + + // sort tuples by used memory + std::sort(begin(memory_recorder), end(memory_recorder), + [](tuple_t const &t1, tuple_t const &t2) { + return std::get<1>(t1) > std::get<1>(t2); // or use a custom compare function + } + ); + + double std_num_elements = getStd(count_num_elements, M2_num_elements); + + // print memory information + std::cout.precision(1); + std::cout.setf(std::ios::fixed, std::ios::floatfield); + std::cout << "INFO: synapse statistics and memory usage for synapses_2_pre:\n" + << "\tnumber of synapses: " << syn_N << "\n" + << "\tnumber of pre/post blocks: " << num_pre_post_blocks << "\n" + << "\tnumber of synapses over all pre/post blocks:\n" + << "\t\tmean: " << mean_num_elements << "\tstd: " + << std_num_elements << "\n" + << "\n\tmemory usage: TOTAL: " << total_memory_MB << " MB (~" + << total_memory_MB / syn_N * 1024.0 * 1024.0 << " byte per synapse)" + << std::endl; + + for(auto const& tuple: memory_recorder){ + std::string name; + size_t bytes; + int num_elements; + std::tie(name, bytes, num_elements) = tuple; + double memory = bytes * to_MB; + double fraction = memory / total_memory_MB * 100; + std::cout << "\t\t" << std::setprecision(1) << std::fixed << fraction + << "%\t" << std::setprecision(3) << std::fixed << memory << " MB\t" + << name << " [" << num_elements << "]" << std::endl; + } + + + // Create circular eventspaces in no_or_const_delay_mode + { + int num_spikespaces = dev_array_neurongroup_1__spikespace.size(); + if (num_queues > num_spikespaces) + { + for (int i = num_spikespaces; i < num_queues; i++) + { + int32_t* new_eventspace; + cudaError_t status = cudaMalloc((void**)&new_eventspace, + sizeof(int32_t)*_num__array_neurongroup_1__spikespace); + if (status != cudaSuccess) + { + printf("ERROR while allocating momory for dev_array_neurongroup_1__spikespace[%i] on device: %s %s %d\n", + i, cudaGetErrorString(status), __FILE__, __LINE__); + exit(status); + } + dev_array_neurongroup_1__spikespace.push_back(new_eventspace); + } + } + } + + int num_threads = num_queues; + if(num_threads >= max_threads_per_block) + { + num_threads = max_threads_per_block; + } + int num_blocks = 1; + + // check if we have enough ressources to call kernel with given number + // of blocks and threads + struct cudaFuncAttributes funcAttrib; + cudaFuncGetAttributes(&funcAttrib, _before_run_kernel_synapses_2_pre_push_spikes); + if (num_threads > funcAttrib.maxThreadsPerBlock) + { + // use the max num_threads before launch failure + num_threads = funcAttrib.maxThreadsPerBlock; + printf("WARNING Not enough ressources available to call " + "_before_run_kernel_synapses_2_pre_push_spikes" + "with maximum possible threads per block (%u). " + "Reducing num_threads to %u. 
(Kernel needs %i " + "registers per block, %i bytes of " + "statically-allocated shared memory per block, %i " + "bytes of local memory per thread and a total of %i " + "bytes of user-allocated constant memory)\n", + max_threads_per_block, num_threads, funcAttrib.numRegs, + funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, + funcAttrib.constSizeBytes); + } + else + { + printf("INFO _before_run_kernel_synapses_2_pre_push_spikes\n" + "\t%u blocks\n" + "\t%u threads\n" + "\t%i registers per block\n" + "\t%i bytes statically-allocated shared memory per block\n" + "\t%i bytes local memory per thread\n" + "\t%i bytes user-allocated constant memory\n" + "", + num_blocks, num_threads, funcAttrib.numRegs, + funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, + funcAttrib.constSizeBytes); + } + + _before_run_kernel_synapses_2_pre_push_spikes<<>>( + source_N, + num_parallel_blocks, + num_threads, + dt, + syn_N, + num_queues, + true + ); + + { + delete [] h_num_synapses_by_pre; + delete [] d_ptr_synapse_ids_by_pre; + } + + //delete temp arrays + delete [] h_vec_synapse_ids_by_pre; + + synapses_2_pre_scalar_delay = true; + + cudaError_t status = cudaGetLastError(); + if (status != cudaSuccess) + { + printf("ERROR initialising synapses_2_pre in %s:%d %s\n", + __FILE__, __LINE__, cudaGetErrorString(status)); + _dealloc_arrays(); + exit(status); + } + + CUDA_CHECK_MEMORY(); + double time_passed = (double)(std::clock() - start_timer) / CLOCKS_PER_SEC; + std::cout << "INFO: synapses_2_pre initialisation took " << time_passed << "s"; + if (used_device_memory_after_dealloc < used_device_memory_start){ + size_t freed_bytes = used_device_memory_start - used_device_memory_after_dealloc; + std::cout << ", freed " << freed_bytes * to_MB << "MB"; + } + if (used_device_memory > used_device_memory_start){ + size_t used_bytes = used_device_memory - used_device_memory_start; + std::cout << " and used " << used_bytes * to_MB << "MB of device memory."; + } + std::cout << std::endl; +} diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/before_run_synapses_2_pre_push_spikes.h b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/before_run_synapses_2_pre_push_spikes.h new file mode 100644 index 00000000..d5e092a9 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/before_run_synapses_2_pre_push_spikes.h @@ -0,0 +1,6 @@ +#ifndef _INCLUDED_synapses_2_pre_push_spikes_before +#define _INCLUDED_synapses_2_pre_push_spikes_before + +void _before_run_synapses_2_pre_push_spikes(); + +#endif diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/before_run_synapses_pre_push_spikes.cu b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/before_run_synapses_pre_push_spikes.cu new file mode 100644 index 00000000..66954c95 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/before_run_synapses_pre_push_spikes.cu @@ -0,0 +1,706 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "objects.h" +#include "code_objects/synapses_pre_push_spikes.h" +#include "brianlib/cuda_utils.h" + +// Makro for file and line information in _cudaSafeCall +#define COPY_HOST_ARRAY_TO_DEVICE_SYMBOL(a, b, c, d) \ + _copyHostArrayToDeviceSymbol(a, b, c, d, __FILE__, __LINE__) + +namespace { + // vector_t is an alias for thrust:host_vector 
+    template <typename T> using vector_t = thrust::host_vector<T>;
+    // tuple type typedef
+    typedef std::tuple<std::string, size_t, int> tuple_t;
+
+    std::vector<tuple_t> memory_recorder;
+
+    // Functions for online update of mean and std
+    // for a new value newValue, compute the new count, new mean, the new M2.
+    // mean accumulates the mean of the entire dataset
+    // M2 aggregates the squared distance from the mean
+    // count aggregates the number of samples seen so far
+    inline void updateMeanStd(int &count, double &mean, double& M2, double newValue){
+        count += 1;
+        double delta = newValue - mean;
+        mean += delta / count;
+        double delta2 = newValue - mean;
+        M2 += delta * delta2;
+    }
+
+    // get std from aggregated M2 value
+    double getStd(int count, double M2){
+        if (count < 2){
+            return NAN;
+        }
+        double variance = M2 / (count - 1);
+        double stdValue = sqrt(variance);
+        return stdValue;
+    }
+
+    // Copy the data from a host array to global device memory and copy the
+    // symbol to a global device variable.
+    // host_array: host array with data to copy
+    // device_symbol: global __device__ variable of same type as `host_array`
+    // num_elements: number of elements in host_array to copy
+    // NOTE: T can be a pointer variable itself (when copying 2D arrays)
+    template <typename T>
+    inline void _copyHostArrayToDeviceSymbol(const T *host_array, T *&device_symbol,
+            int num_elements, const char* name, const char* file,
+            const int line){
+        T *d_ptr_tmp;
+        size_t bytes = sizeof(T) * num_elements;
+        // allocate device memory
+        _cudaSafeCall(
+                cudaMalloc((void**)&d_ptr_tmp, bytes),
+                file, line, "cudaMalloc");
+        // copy data from host array to device
+        _cudaSafeCall(
+                cudaMemcpy(d_ptr_tmp, host_array, bytes, cudaMemcpyHostToDevice),
+                file, line, "cudaMemcpy");
+        // copy the device data pointer to the global device symbol
+        _cudaSafeCall(
+                cudaMemcpyToSymbol(device_symbol, &d_ptr_tmp, sizeof(T*)),
+                file, line, "cudaMemcpyToSymbol");
+        memory_recorder.push_back(std::make_tuple(name, bytes, num_elements));
+    }
+}
+
+
+__global__ void _before_run_kernel_synapses_pre_push_spikes(
+    int _source_N,
+    int _num_blocks,
+    int _num_threads,
+    double _dt,
+    int _syn_N,
+    int num_queues,
+    bool new_mode)
+{
+    using namespace brian;
+
+    int tid = threadIdx.x;
+
+    synapses_pre.queue->prepare(
+        tid,
+        _num_threads,
+        _num_blocks,
+        0,
+        _source_N,
+        _syn_N,
+        num_queues,
+        synapses_pre_num_synapses_by_pre,
+        synapses_pre_num_synapses_by_bundle,
+        synapses_pre_num_unique_delays_by_pre,
+        synapses_pre_unique_delays,
+        synapses_pre_global_bundle_id_start_by_pre,
+        synapses_pre_synapses_offset_by_bundle,
+        synapses_pre_synapse_ids,
+        synapses_pre_synapse_ids_by_pre,
+        synapses_pre_unique_delays_offset_by_pre,
+        synapses_pre_unique_delay_start_idcs);
+    synapses_pre.no_or_const_delay_mode = new_mode;
+}
+
+void _before_run_synapses_pre_push_spikes()
+{
+    using namespace brian;
+
+    std::clock_t start_timer = std::clock();
+    const double to_MB = 1.0 / (1024.0 * 1024.0);
+
+    CUDA_CHECK_MEMORY();
+    size_t used_device_memory_start = used_device_memory;
+
+    ///// HOST_CONSTANTS ///////////
+    const int _numN = 1;
+    const int _num_spikespace = 101;
+    double* const _array_synapses_delay = thrust::raw_pointer_cast(&_dynamic_array_synapses_delay[0]);
+    const int _numdelay = _dynamic_array_synapses_delay.size();
+    double* const dev_array_synapses_delay = thrust::raw_pointer_cast(&dev_dynamic_array_synapses_delay[0]);
+
+    ///// pointers_lines /////
+
+    double* _ptr_array_defaultclock_dt = _array_defaultclock_dt;
+    double* __restrict _ptr_array_synapses_delay = _array_synapses_delay;
+    int32_t* __restrict _ptr_array_spikegeneratorgroup__spikespace = _array_spikegeneratorgroup__spikespace;
+    int32_t* _ptr_array_synapses_N = _array_synapses_N;
+
+
+    int64_t syn_N_check = _ptr_array_synapses_N[0];
+
+    if (syn_N_check == 0){
+        return;
+    }
+    else if (syn_N_check > INT_MAX){
+        printf("ERROR: There are more Synapses (%lu) than an int can "
+               "hold on this system (%u).\n", syn_N_check, INT_MAX);
+    }
+    // total number of synapses
+    int syn_N = (int)syn_N_check;
+
+    // simulation time step
+    double dt = _ptr_array_defaultclock_dt[0];
+    // number of neurons in source group
+    int source_N = 100;
+    // number of neurons in target group
+    int target_N = 2500;
+
+    // TODO: for multiple SynapticPathways for the same Synapses object (on_pre and on_post) the following copy is identical in both pathways initialise templates
+    // delay (on device) was potentially set in group_variable_set_conditional and needs to be copied to host
+    _dynamic_array_synapses_delay = dev_dynamic_array_synapses_delay;
+
+    //////////////////////
+    // Scalar variables //
+    //////////////////////
+
+    // total number of (preID, postBlock) pairs
+    int num_pre_post_blocks = num_parallel_blocks * source_N;
+    // size of the connectivity matrix (equal number of synapses)
+    int size_connectivity_matrix = 0;
+
+    // statistics of number of synapses per (preID, postBlock) pair
+    int sum_num_elements = 0;
+    int count_num_elements = 0;
+    double mean_num_elements = 0;
+    double M2_num_elements = 0;
+
+    // statistics of number of unique delays per (preID, postBlock) pair
+    int sum_num_unique_elements = 0;
+    int count_num_unique_elements = 0;
+    double mean_num_unique_elements = 0;
+    double M2_num_unique_elements = 0;
+
+    // total number of bundles in all (preID, postBlock) pairs (not known yet)
+    int num_bundle_ids = 0;
+
+    // statistics of number of synapses per bundle
+    int sum_bundle_sizes = 0;
+    int count_bundle_sizes = 0;
+    double mean_bundle_sizes = 0;
+    double M2_bundle_sizes = 0;
+
+
+    ////////////////////////////////////////////////////////
+    // Create array and vector variables (in host memory) //
+    ////////////////////////////////////////////////////////
+
+    /* VARIABLE NAMING:
+     * Non-scalar variables are named after TYPE_NAME_STRUCTURE, with:
+     * STRUCTURE: the first array dimensions structure (`by_pre`, `by_bundle` or none)
+     *   `by_pre`: Array (host pointer type) of size `num_pre_post_blocks`,
+     *             which is the number of (preID, postBlock) pairs.
+     *   `by_bundle`: thrust::host_vector, size of total number of bundles,
+     *                which is one for each delay in each (preID, postBlock) pair.
+     *                Different (preID, postBlock) pairs can have different sets
+     *                of delay values -> each bundle gets a global bundleID
+     *   none: If no STRUCTURE given, it's a one dim array storing everything
+     * TYPE: data type in STRUCTURE (`h`, `h_vec`, `h_ptr`, `d_ptr`), with
+     *       `h`: host value, `h_vec`: host vector, `h_ptr`: host pointer,
+     *       `d_ptr`: device pointer (pointing to device, stored in host memory)
+     * NAME: the variable name
+     *
+     * EXAMPLES:
+     * `h_vec_delays_by_pre` - an array [size = num_pre_post_blocks] of host
+     *                         vectors, each storing delay values of a
+     *                         (preID, postBlock) pair
+     * `h_num_synapses_by_bundle` - a host vector of integers specifying the
+     *                              number of synapses in a bundle
+     * `d_ptr_synapse_ids` - a device pointer to synapse IDs (all of them)
+     */
+
+    // synapse IDs for each (preID, postBlock) pair
+    vector_t<int32_t>* h_vec_synapse_ids_by_pre = new vector_t<int32_t>[num_pre_post_blocks];
+    // array of synapse IDs in device memory for each (preID, postBlock) pair
+    int32_t** d_ptr_synapse_ids_by_pre;
+    // number of synapses for each (preID, postBlock) pair
+    int* h_num_synapses_by_pre;
+
+    // delay for each synapse in `h_vec_synapse_ids_by_pre`,
+    // only used to sort synapses by delay
+    vector_t<int>* h_vec_delays_by_pre = new vector_t<int>[num_pre_post_blocks];
+    // array of vectors with unique delays and start indices in synapses arrays
+    vector_t<int>* h_vec_unique_delays_by_pre;
+    vector_t<int>* h_vec_unique_delay_start_idcs_by_pre;
+    // offset in array of all synapse IDs sorted by bundles (we are storing the
+    // offset as 32bit int instead of a 64bit pointer to the bundle start)
+    vector_t<int> h_synapses_offset_by_bundle;
+    // number of synapses in each bundle
+    vector_t<int> h_num_synapses_by_bundle;
+    // start of global bundle ID per (preID, postBlock) pair (+ total num bundles)
+    int* h_global_bundle_id_start_by_pre = new int[num_pre_post_blocks + 1];
+
+
+    // we need to allocate device memory for synapse IDs independent of delay mode
+    int32_t* d_ptr_synapse_ids;
+    size_t memory_synapse_ids = sizeof(int32_t) * syn_N;
+    CUDA_SAFE_CALL(
+            cudaMalloc((void**)&d_ptr_synapse_ids, memory_synapse_ids)
+            );
+    memory_recorder.push_back(std::make_tuple("synapse IDs", memory_synapse_ids, syn_N));
+
+
+    //fill vectors of connectivity matrix with synapse IDs and delays (in units of simulation time step)
+    int max_delay = (int)(_dynamic_array_synapses_delay[0] / dt + 0.5);
+    int min_delay = max_delay;
+    for(int syn_id = 0; syn_id < syn_N; syn_id++)  // loop through all synapses
+    {
+
+
+        // Code generation checks
+        assert(0 == 0);
+
+        assert(0 == 0);
+
+        // pre/post_neuron_id are integers from 0 to Nsource/Ntarget (from corresponding
+        // SynapticPathway) this is relevant only when using Subgroups where they might
+        // be NOT equal to the idx in their NeuronGroup
+        int32_t pre_neuron_id = _dynamic_array_synapses__synaptic_pre[syn_id] - 0;
+        int32_t post_neuron_id = _dynamic_array_synapses__synaptic_post[syn_id] - 0;
+
+        int delay = (int)(_dynamic_array_synapses_delay[syn_id] / dt + 0.5);
+        if (delay > max_delay)
+            max_delay = delay;
+        if (delay < min_delay)
+            min_delay = delay;
+
+        // each parallel executed cuda block gets an equal part of post neuron IDs
+        int post_block_id = (post_neuron_id * num_parallel_blocks) / target_N;
+        // we store synapses for each pre neuron and post block
+        int pre_post_block_id = pre_neuron_id * num_parallel_blocks + post_block_id;
+
+        h_vec_synapse_ids_by_pre[pre_post_block_id].push_back(syn_id);
+        h_vec_delays_by_pre[pre_post_block_id].push_back(delay);
+    }
+    int num_queues = max_delay + 1;  // we also need a current step
+
+    bool scalar_delay = (max_delay == min_delay);
+    if (scalar_delay)
+        synapses_pre_delay = max_delay;
+    // Delete delay (in sec) on device, we don't need it
+    // TODO: don't copy these delays to the device in first place, see #83
+    dev_dynamic_array_synapses_delay.clear();
+    dev_dynamic_array_synapses_delay.shrink_to_fit();
+    CUDA_CHECK_MEMORY();
+    size_t used_device_memory_after_dealloc = used_device_memory;
+
+    ///////////////////////////////////////////////////////
+    // Memory allocations which depend on the delay mode //
+    ///////////////////////////////////////////////////////
+
+    if (scalar_delay)
+    {
+        h_num_synapses_by_pre = new int[num_pre_post_blocks];
+        d_ptr_synapse_ids_by_pre = new int32_t*[num_pre_post_blocks];
+    }
+
+    // allocate memory only if the delays are not all the same
+    if (!scalar_delay)
+    {
+
+        h_vec_unique_delay_start_idcs_by_pre = new vector_t<int>[num_pre_post_blocks];
+        h_vec_unique_delays_by_pre = new vector_t<int>[num_pre_post_blocks];
+
+    }
+    int global_bundle_id_start = 0;
+
+    // loop through connectivity matrix [(preID, postBlock) pairs]
+    for(int i = 0; i < num_pre_post_blocks; i++)
+    {
+        // i is pre_post_block_id
+
+        int num_elements = h_vec_synapse_ids_by_pre[i].size();
+        size_connectivity_matrix += num_elements;
+        if (num_elements > synapses_pre_max_size)
+            synapses_pre_max_size = num_elements;
+
+        if (!scalar_delay)
+        {
+            // for this (preID, postBlock), sort all synapses by delay,
+            // reduce the delay arrays to unique delays and store the
+            // start indices in the synapses array for each unique delay
+
+            typedef vector_t<int>::iterator itr;
+
+            // sort synapses (values) and delays (keys) by delay
+            thrust::sort_by_key(
+                    h_vec_delays_by_pre[i].begin(),     // keys start
+                    h_vec_delays_by_pre[i].end(),       // keys end
+                    h_vec_synapse_ids_by_pre[i].begin() // values start
+                    );
+
+            // worst case: number of unique delays is num_elements
+            h_vec_unique_delay_start_idcs_by_pre[i].resize(num_elements);
+
+            // Initialise the unique delay start idcs array as a sequence
+            thrust::sequence(h_vec_unique_delay_start_idcs_by_pre[i].begin(),
+                    h_vec_unique_delay_start_idcs_by_pre[i].end());
+
+            // get delays (keys) and values (indices) for first occurrence of each delay value
+            thrust::pair<itr, itr> end_pair = thrust::unique_by_key(
+                    h_vec_delays_by_pre[i].begin(),                  // keys start
+                    h_vec_delays_by_pre[i].end(),                    // keys end
+                    h_vec_unique_delay_start_idcs_by_pre[i].begin()  // values start (position in original delay array)
+                    );
+
+            itr unique_delay_end = end_pair.first;
+            itr idx_end = end_pair.second;
+
+            // erase unneeded vector entries
+            h_vec_unique_delay_start_idcs_by_pre[i].erase(
+                    idx_end, h_vec_unique_delay_start_idcs_by_pre[i].end());
+            // free allocated but no longer needed host memory
+            h_vec_unique_delay_start_idcs_by_pre[i].shrink_to_fit();
+            h_vec_delays_by_pre[i].erase(unique_delay_end,
+                    h_vec_delays_by_pre[i].end());
+            // delay_by_pre holds the set of unique delays now
+            // we don't need shrink_to_fit, swap takes care of that
+            h_vec_unique_delays_by_pre[i].swap(h_vec_delays_by_pre[i]);
+
+            int num_unique_elements = h_vec_unique_delays_by_pre[i].size();
+            sum_num_unique_elements += num_unique_elements;
+
+            if (num_unique_elements > synapses_pre_max_num_unique_delays)
+                synapses_pre_max_num_unique_delays = num_unique_elements;
+
+            // we need a start ID per i (pre_post_block_id) to calc the global
+            // bundle ID from the local bundle_idx when pushing
+            h_global_bundle_id_start_by_pre[i] = global_bundle_id_start;
+            global_bundle_id_start +=
num_unique_elements; + // the local bundle_idx goes from 0 to num_bundles for each i (pre_post_block_id) + for (int bundle_idx = 0; bundle_idx < num_unique_elements; bundle_idx++) + { + // find the start idx in the synapses array for this delay (bundle) + int synapses_start_idx = h_vec_unique_delay_start_idcs_by_pre[i][bundle_idx]; + // find the number of synapses for this delay (bundle) + int num_synapses; + if (bundle_idx == num_unique_elements - 1) + num_synapses = num_elements - synapses_start_idx; + else + num_synapses = h_vec_unique_delay_start_idcs_by_pre[i][bundle_idx + 1] - synapses_start_idx; + h_num_synapses_by_bundle.push_back(num_synapses); + if (num_synapses > synapses_pre_max_bundle_size) + synapses_pre_max_bundle_size = num_synapses; + + // copy this bundle to device and store the device pointer + int32_t* d_this_bundle = d_ptr_synapse_ids + sum_bundle_sizes; + int32_t* h_this_bundle = thrust::raw_pointer_cast(&h_vec_synapse_ids_by_pre[i][synapses_start_idx]); + size_t memory_size = sizeof(int32_t) * num_synapses; + CUDA_SAFE_CALL( + cudaMemcpy(d_this_bundle, h_this_bundle, memory_size, cudaMemcpyHostToDevice) + ); + + h_synapses_offset_by_bundle.push_back(sum_bundle_sizes); + sum_bundle_sizes += num_synapses; + updateMeanStd(count_bundle_sizes, mean_bundle_sizes, M2_bundle_sizes, num_synapses); + } + + updateMeanStd(count_num_unique_elements, mean_num_unique_elements, + M2_num_unique_elements, num_unique_elements); + + } // end if (!scalar_delay) + else // scalar_delay + { + // copy the synapse IDs and the number of synapses for this + // (preID, postBlock) to device and store the device pointer + + h_num_synapses_by_pre[i] = num_elements; + + d_ptr_synapse_ids_by_pre[i] = d_ptr_synapse_ids + sum_num_elements; + CUDA_SAFE_CALL( + cudaMemcpy(d_ptr_synapse_ids_by_pre[i], + thrust::raw_pointer_cast(&(h_vec_synapse_ids_by_pre[i][0])), + sizeof(int32_t) * num_elements, + cudaMemcpyHostToDevice) + ); + } + + sum_num_elements += num_elements; + updateMeanStd(count_num_elements, mean_num_elements, M2_num_elements, num_elements); + } // end for loop through connectivity matrix + printf("INFO connectivity matrix has size %i, number of (pre neuron ID, post neuron block) pairs is %u\n", + size_connectivity_matrix, num_pre_post_blocks); + + if (scalar_delay) + { + // synapses size + COPY_HOST_ARRAY_TO_DEVICE_SYMBOL(h_num_synapses_by_pre, + synapses_pre_num_synapses_by_pre, num_pre_post_blocks, + "number of synapses per pre/post block"); + // synapses id + COPY_HOST_ARRAY_TO_DEVICE_SYMBOL(d_ptr_synapse_ids_by_pre, + synapses_pre_synapse_ids_by_pre, num_pre_post_blocks, + "pointers to synapse IDs"); + } + + else // not scalar_delay + { + // Since we now know the total number of unique delays over all + // (preID, postBlock) pairs, we can allocate the device memory + size_t memory_unique_delays_by_pre = sizeof(int) * sum_num_unique_elements; + assert(sum_bundle_sizes == syn_N); + + // array of all unique delas, sorted first by pre_post_block and per + // pre_post_block by delay + int *d_ptr_unique_delays; + CUDA_SAFE_CALL( + cudaMalloc((void**)&d_ptr_unique_delays, memory_unique_delays_by_pre) + ); + memory_recorder.push_back(std::make_tuple( + "unique delays", memory_unique_delays_by_pre, + sum_num_unique_elements)); + + int sum_num_unique_elements_bak = sum_num_unique_elements; + + // reset sum_num_unique_elements, we will use it to offset cudaMemcy correctly + sum_num_unique_elements = 0; + for(int i = 0; i < num_pre_post_blocks; i++) // loop through connectivity matrix again + { + + 
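+            // Second pass over the connectivity matrix: copy this pair's
+            // unique delays into the single device array d_ptr_unique_delays,
+            // using sum_num_unique_elements as the running offset.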
int num_elements = h_vec_synapse_ids_by_pre[i].size(); + int num_unique_elements = h_vec_unique_delays_by_pre[i].size(); + + if(num_elements > 0) + { + // copy the unique delays to the device and store the device pointers + CUDA_SAFE_CALL( + cudaMemcpy(d_ptr_unique_delays + + sum_num_unique_elements, + thrust::raw_pointer_cast( + &(h_vec_unique_delays_by_pre[i][0])), + sizeof(int)*num_unique_elements, + cudaMemcpyHostToDevice) + ); + + + sum_num_unique_elements += num_unique_elements; + } // end if(num_elements < 0) + } // end second loop connectivity matrix + assert(sum_num_unique_elements_bak == sum_num_unique_elements); + + // pointer to start of unique delays array + CUDA_SAFE_CALL( + cudaMemcpyToSymbol(synapses_pre_unique_delays, + &d_ptr_unique_delays, + sizeof(d_ptr_unique_delays)) + ); + + num_bundle_ids = sum_num_unique_elements; + + // add num_bundle_ids as last entry + h_global_bundle_id_start_by_pre[num_pre_post_blocks] = num_bundle_ids; + + // floor(mean(h_num_synapses_by_bundle)) + synapses_pre_mean_bundle_size = sum_bundle_sizes / num_bundle_ids; + + // pointer to start of synapse IDs array + CUDA_SAFE_CALL( + cudaMemcpyToSymbol(synapses_pre_synapse_ids, &d_ptr_synapse_ids, + sizeof(d_ptr_synapse_ids)) + ); + + // size by bundle + COPY_HOST_ARRAY_TO_DEVICE_SYMBOL( + thrust::raw_pointer_cast(&h_num_synapses_by_bundle[0]), + synapses_pre_num_synapses_by_bundle, num_bundle_ids, + "number of synapses per bundle"); + + // synapses offset by bundle + COPY_HOST_ARRAY_TO_DEVICE_SYMBOL( + thrust::raw_pointer_cast(&h_synapses_offset_by_bundle[0]), + synapses_pre_synapses_offset_by_bundle, num_bundle_ids, + "synapses bundle offset"); + + // global bundle id start idx by pre + COPY_HOST_ARRAY_TO_DEVICE_SYMBOL( + h_global_bundle_id_start_by_pre, + synapses_pre_global_bundle_id_start_by_pre, + num_pre_post_blocks + 1, "global bundle ID start"); + + + } // end if (!scalar_delay) + + //////////////////////////////////////////////////// + //// PRINT INFORMATION ON MEMORY USAGE AND TIME //// + //////////////////////////////////////////////////// + + // TODO print statistics! 
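+    // The summary below adds up all allocations recorded in memory_recorder,
+    // sorts them by size and prints, for each entry, its absolute size and its
+    // share of the total device memory used for this pathway.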
+ + // sum all allocated memory + size_t total_memory = 0; + int max_string_length = 0; + for(auto const& tuple: memory_recorder){ + total_memory += std::get<1>(tuple); + int str_len = std::get<0>(tuple).length(); + if (str_len > max_string_length) + max_string_length = str_len; + } + double total_memory_MB = total_memory * to_MB; + max_string_length += 5; + + // sort tuples by used memory + std::sort(begin(memory_recorder), end(memory_recorder), + [](tuple_t const &t1, tuple_t const &t2) { + return std::get<1>(t1) > std::get<1>(t2); // or use a custom compare function + } + ); + + double std_num_elements = getStd(count_num_elements, M2_num_elements); + double std_bundle_sizes = getStd(count_bundle_sizes, M2_bundle_sizes); + double std_num_unique_elements = getStd(count_num_unique_elements, M2_num_unique_elements); + + // print memory information + std::cout.precision(1); + std::cout.setf(std::ios::fixed, std::ios::floatfield); + std::cout << "INFO: synapse statistics and memory usage for synapses_pre:\n" + << "\tnumber of synapses: " << syn_N << "\n" + << "\tnumber of bundles: " << num_bundle_ids << "\n" + << "\tnumber of pre/post blocks: " << num_pre_post_blocks << "\n" + << "\tnumber of synapses over all pre/post blocks:\n" + << "\t\tmean: " << mean_num_elements << "\tstd: " + << std_num_elements << "\n" + << "\tnumber of unique delays over all pre/post blocks:\n" + << "\t\tmean: " << mean_num_unique_elements << "\tstd: " + << std_num_unique_elements << "\n" + << "\tbundle size over all bundles:\n" + << "\t\tmean: " << mean_bundle_sizes << "\tstd: " + << std_bundle_sizes << "\n" + << "\n\tmemory usage: TOTAL: " << total_memory_MB << " MB (~" + << total_memory_MB / syn_N * 1024.0 * 1024.0 << " byte per synapse)" + << std::endl; + + for(auto const& tuple: memory_recorder){ + std::string name; + size_t bytes; + int num_elements; + std::tie(name, bytes, num_elements) = tuple; + double memory = bytes * to_MB; + double fraction = memory / total_memory_MB * 100; + std::cout << "\t\t" << std::setprecision(1) << std::fixed << fraction + << "%\t" << std::setprecision(3) << std::fixed << memory << " MB\t" + << name << " [" << num_elements << "]" << std::endl; + } + + + // Create circular eventspaces in no_or_const_delay_mode + if (scalar_delay) + { + int num_spikespaces = dev_array_spikegeneratorgroup__spikespace.size(); + if (num_queues > num_spikespaces) + { + for (int i = num_spikespaces; i < num_queues; i++) + { + int32_t* new_eventspace; + cudaError_t status = cudaMalloc((void**)&new_eventspace, + sizeof(int32_t)*_num__array_spikegeneratorgroup__spikespace); + if (status != cudaSuccess) + { + printf("ERROR while allocating momory for dev_array_spikegeneratorgroup__spikespace[%i] on device: %s %s %d\n", + i, cudaGetErrorString(status), __FILE__, __LINE__); + exit(status); + } + dev_array_spikegeneratorgroup__spikespace.push_back(new_eventspace); + } + } + } + + int num_threads = num_queues; + if(num_threads >= max_threads_per_block) + { + num_threads = max_threads_per_block; + } + int num_blocks = 1; + + // check if we have enough ressources to call kernel with given number + // of blocks and threads + struct cudaFuncAttributes funcAttrib; + cudaFuncGetAttributes(&funcAttrib, _before_run_kernel_synapses_pre_push_spikes); + if (num_threads > funcAttrib.maxThreadsPerBlock) + { + // use the max num_threads before launch failure + num_threads = funcAttrib.maxThreadsPerBlock; + printf("WARNING Not enough ressources available to call " + "_before_run_kernel_synapses_pre_push_spikes" + "with maximum 
possible threads per block (%u). " + "Reducing num_threads to %u. (Kernel needs %i " + "registers per block, %i bytes of " + "statically-allocated shared memory per block, %i " + "bytes of local memory per thread and a total of %i " + "bytes of user-allocated constant memory)\n", + max_threads_per_block, num_threads, funcAttrib.numRegs, + funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, + funcAttrib.constSizeBytes); + } + else + { + printf("INFO _before_run_kernel_synapses_pre_push_spikes\n" + "\t%u blocks\n" + "\t%u threads\n" + "\t%i registers per block\n" + "\t%i bytes statically-allocated shared memory per block\n" + "\t%i bytes local memory per thread\n" + "\t%i bytes user-allocated constant memory\n" + "", + num_blocks, num_threads, funcAttrib.numRegs, + funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, + funcAttrib.constSizeBytes); + } + + _before_run_kernel_synapses_pre_push_spikes<<>>( + source_N, + num_parallel_blocks, + num_threads, + dt, + syn_N, + num_queues, + scalar_delay + ); + + if (scalar_delay) + { + delete [] h_num_synapses_by_pre; + delete [] d_ptr_synapse_ids_by_pre; + } + + //delete temp arrays + delete [] h_vec_synapse_ids_by_pre; + delete [] h_vec_delays_by_pre; + if (!scalar_delay) + { + delete [] h_vec_unique_delay_start_idcs_by_pre; + delete [] h_vec_unique_delays_by_pre; + delete [] h_global_bundle_id_start_by_pre; + } + + synapses_pre_scalar_delay = scalar_delay; + + cudaError_t status = cudaGetLastError(); + if (status != cudaSuccess) + { + printf("ERROR initialising synapses_pre in %s:%d %s\n", + __FILE__, __LINE__, cudaGetErrorString(status)); + _dealloc_arrays(); + exit(status); + } + + CUDA_CHECK_MEMORY(); + double time_passed = (double)(std::clock() - start_timer) / CLOCKS_PER_SEC; + std::cout << "INFO: synapses_pre initialisation took " << time_passed << "s"; + if (used_device_memory_after_dealloc < used_device_memory_start){ + size_t freed_bytes = used_device_memory_start - used_device_memory_after_dealloc; + std::cout << ", freed " << freed_bytes * to_MB << "MB"; + } + if (used_device_memory > used_device_memory_start){ + size_t used_bytes = used_device_memory - used_device_memory_start; + std::cout << " and used " << used_bytes * to_MB << "MB of device memory."; + } + std::cout << std::endl; +} diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/before_run_synapses_pre_push_spikes.h b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/before_run_synapses_pre_push_spikes.h new file mode 100644 index 00000000..662d0691 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/before_run_synapses_pre_push_spikes.h @@ -0,0 +1,6 @@ +#ifndef _INCLUDED_synapses_pre_push_spikes_before +#define _INCLUDED_synapses_pre_push_spikes_before + +void _before_run_synapses_pre_push_spikes(); + +#endif diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/neurongroup_1_stateupdater_codeobject.cu b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/neurongroup_1_stateupdater_codeobject.cu new file mode 100644 index 00000000..f24fc55f --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/neurongroup_1_stateupdater_codeobject.cu @@ -0,0 +1,459 @@ +#include "code_objects/neurongroup_1_stateupdater_codeobject.h" +#include "objects.h" +#include "brianlib/common_math.h" +#include "brianlib/cuda_utils.h" +#include 
"brianlib/stdint_compat.h" +#include +#include +#include +#include + + + +////// SUPPORT CODE /////// +namespace { + double _host_rand(const int _vectorisation_idx); + double _host_randn(const int _vectorisation_idx); + int32_t _host_poisson(double _lambda, const int _vectorisation_idx); + + ///// block extra_device_helper ///// + + ///// support_code_lines ///// + + template + __host__ __device__ + double _brian_exp(T value) + { + #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 0)) + return exp((double)value); + #else + return exp(value); + #endif + } + inline __host__ __device__ + float _brian_exp(float value) + { + return exp(value); + } + template < typename T1, typename T2 > struct _higher_type; + template < > struct _higher_type { typedef int type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < typename T1, typename T2 > + __host__ __device__ static inline typename _higher_type::type + _brian_mod(T1 x, T2 y) + {{ + return x-y*floor(1.0*x/y); + }} + template < typename T1, typename T2 > + __host__ __device__ static inline typename _higher_type::type + _brian_floordiv(T1 x, T2 y) + {{ + return floor(1.0*x/y); + }} + #ifdef _MSC_VER + #define _brian_pow(x, y) (pow((double)(x), (y))) + #else + #define _brian_pow(x, y) (pow((x), (y))) + #endif + inline __device__ int _brian_atomicAdd(int* address, int val) + { + // hardware implementation + return atomicAdd(address, val); + } + inline __device__ float _brian_atomicAdd(float* address, float val) + { + // hardware implementation + return atomicAdd(address, val); + } + inline __device__ double _brian_atomicAdd(double* address, double val) + { + #if (__CUDA_ARCH__ >= 600) + // hardware implementation + return atomicAdd(address, val); + #else + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val + + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != 
old); + return __longlong_as_double(old); + #endif + } + inline __device__ int _brian_atomicMul(int* address, int val) + { + // software implementation + int old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, val * assumed); + } while (assumed != old); + return old; + } + inline __device__ float _brian_atomicMul(float* address, float val) + { + // software implementation + int* address_as_int = (int*)address; + int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __float_as_int(val * + __int_as_float(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __int_as_float(old); + } + inline __device__ double _brian_atomicMul(double* address, double val) + { + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val * + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + } + inline __device__ int _brian_atomicDiv(int* address, int val) + { + // software implementation + int old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, val / assumed); + } while (assumed != old); + return old; + } + inline __device__ float _brian_atomicDiv(float* address, float val) + { + // software implementation + int* address_as_int = (int*)address; + int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __float_as_int(val / + __int_as_float(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __int_as_float(old); + } + inline __device__ double _brian_atomicDiv(double* address, double val) + { + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val / + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + } + + + // Implement dummy functions such that the host compiled code of binomial + // functions works. Hacky, hacky ... 
+ double _host_rand(const int _vectorisation_idx) + { + printf("ERROR: Called dummy function `_host_rand` in %s:%d\n", __FILE__, + __LINE__); + exit(EXIT_FAILURE); + } + double _host_randn(const int _vectorisation_idx) + { + printf("ERROR: Called dummy function `_host_rand` in %s:%d\n", __FILE__, + __LINE__); + exit(EXIT_FAILURE); + } + int32_t _host_poisson(double _lambda, const int _vectorisation_idx) + { + printf("ERROR: Called dummy function `_host_poisson` in %s:%d\n", __FILE__, + __LINE__); + exit(EXIT_FAILURE); + } +} + +////// hashdefine_lines /////// + + + +__global__ void +_run_kernel_neurongroup_1_stateupdater_codeobject( + int _N, + int THREADS_PER_BLOCK, + ///// KERNEL_PARAMETERS ///// + double* _ptr_array_neurongroup_1_V, + const double _value_array_defaultclock_dt, + double* _ptr_array_neurongroup_1_g_eKC_eKC, + double* _ptr_array_neurongroup_1_g_iKC_eKC, + double* _ptr_array_neurongroup_1_h, + double* _ptr_array_neurongroup_1_m, + double* _ptr_array_neurongroup_1_n, + char* _ptr_array_neurongroup_1_not_refractory + ) +{ + using namespace brian; + + int tid = threadIdx.x; + int bid = blockIdx.x; + int _idx = bid * THREADS_PER_BLOCK + tid; + int _vectorisation_idx = _idx; + + ///// KERNEL_CONSTANTS ///// + const int _numV = 100; + const int _numg_eKC_eKC = 100; + const int _numg_iKC_eKC = 100; + const int _numh = 100; + const int _numm = 100; + const int _numn = 100; + const int _numnot_refractory = 100; + + ///// kernel_lines ///// + + const double* _ptr_array_defaultclock_dt = &_value_array_defaultclock_dt; + + + assert(THREADS_PER_BLOCK == blockDim.x); + + + if(_idx >= _N) + { + return; + } + + + ///// scalar_code ///// + + const double dt = _ptr_array_defaultclock_dt[0]; + const double _lio_1 = 1.0f*((- 0.06356) * 2.67e-08)/3e-10; + const double _lio_2 = 1.0f*((- 0.095) * 1.4299999999999999e-06)/3e-10; + const double _lio_3 = 1.0f*(0.05 * 7.15e-06)/3e-10; + const double _lio_4 = 1.0f*0.0/3e-10; + const double _lio_5 = 1.0f*(- 0.092)/3e-10; + const double _lio_6 = 0.0 - (1.0f*2.67e-08/3e-10); + const double _lio_7 = 1.0f*((- 1.0) * 1.4299999999999999e-06)/3e-10; + const double _lio_8 = 1.0f*7.15e-06/3e-10; + const double _lio_9 = 1.0f*1.0/3e-10; + const double _lio_10 = _brian_exp(1.0f*(- dt)/0.005); + const double _lio_11 = _brian_exp(1.0f*(- dt)/0.01); + const double _lio_12 = 1.0f*(0.329137207652868 * _brian_exp(1.0f*(0.0555555555555556 * (- 0.063))/0.001))/0.001; + const double _lio_13 = 1.0f*(- 0.0555555555555556)/0.001; + const double _lio_14 = 2980.95798704173 * (0.001 * _brian_exp(1.0f*(0.2 * (- 0.063))/0.001)); + const double _lio_15 = 1.0f*(- 0.2)/0.001; + const double _lio_16 = ((- 1.0) * (_brian_pow(0.001, 1.0))) * 0.001; + const double _lio_17 = 25.7903399171931 * (((_brian_pow(0.001, 1.0)) * 0.001) * _brian_exp(1.0f*(0.25 * (- 0.063))/0.001)); + const double _lio_18 = 1.0f*(- 0.25)/0.001; + const double _lio_19 = 0.32 * (- 0.063); + const double _lio_20 = 4.16 * 0.001; + const double _lio_21 = 0.0 - ((_brian_pow(0.001, 1.0)) * 0.001); + const double _lio_22 = 0.000335462627902512 * (((_brian_pow(0.001, 1.0)) * 0.001) * _brian_exp(1.0f*((- 0.2) * (- 0.063))/0.001)); + const double _lio_23 = 1.0f*0.2/0.001; + const double _lio_24 = 0.28 * (- 0.063); + const double _lio_25 = 11.2 * 0.001; + const double _lio_26 = ((- 1.0) * 0.001) * 0.001; + const double _lio_27 = 20.0855369231877 * ((0.001 * 0.001) * _brian_exp(1.0f*(0.2 * (- 0.063))/0.001)); + const double _lio_28 = 0.032 * (- 0.063); + const double _lio_29 = 0.48 * 0.001; + const double _lio_30 = 
1.0f*(0.642012708343871 * _brian_exp(1.0f*(0.025 * (- 0.063))/0.001))/0.001; + const double _lio_31 = 1.0f*(- 0.025)/0.001; + + + { + ///// vector_code ///// + + double m = _ptr_array_neurongroup_1_m[_idx]; + double g_eKC_eKC = _ptr_array_neurongroup_1_g_eKC_eKC[_idx]; + char not_refractory = _ptr_array_neurongroup_1_not_refractory[_idx]; + double n = _ptr_array_neurongroup_1_n[_idx]; + double h = _ptr_array_neurongroup_1_h[_idx]; + double V = _ptr_array_neurongroup_1_V[_idx]; + const double dt = _ptr_array_defaultclock_dt[0]; + double g_iKC_eKC = _ptr_array_neurongroup_1_g_iKC_eKC[_idx]; + if(!not_refractory) + not_refractory = false || (! (V > 0.0)); + else + not_refractory = true || (! (V > 0.0)); + const double _BA_V = 1.0f*(_lio_1 + ((((_lio_2 * (_brian_pow(n, 4.0))) + (_lio_3 * (h * (_brian_pow(m, 3.0))))) + (_lio_4 * g_iKC_eKC)) + (_lio_5 * g_eKC_eKC)))/((_lio_6 + (_lio_7 * (_brian_pow(n, 4.0)))) - (((_lio_8 * (h * (_brian_pow(m, 3.0)))) + (_lio_9 * g_eKC_eKC)) + (_lio_9 * g_iKC_eKC))); + const double _V = (- _BA_V) + ((V + _BA_V) * _brian_exp(dt * ((_lio_6 + (_lio_7 * (_brian_pow(n, 4.0)))) - (((_lio_8 * (h * (_brian_pow(m, 3.0)))) + (_lio_9 * g_eKC_eKC)) + (_lio_9 * g_iKC_eKC))))); + const double _g_eKC_eKC = _lio_10 * g_eKC_eKC; + const double _g_iKC_eKC = _lio_11 * g_iKC_eKC; + const double _BA_h = 1.0f*(_lio_12 * _brian_exp(_lio_13 * V))/((1.0f*(- 4.0)/(0.001 + (_lio_14 * _brian_exp(_lio_15 * V)))) - (_lio_12 * _brian_exp(_lio_13 * V))); + const double _h = (- _BA_h) + ((_BA_h + h) * _brian_exp(dt * ((1.0f*(- 4.0)/(0.001 + (_lio_14 * _brian_exp(_lio_15 * V)))) - (_lio_12 * _brian_exp(_lio_13 * V))))); + const double _BA_m = 1.0f*(((1.0f*((- 0.32) * V)/(_lio_16 + (_lio_17 * _brian_exp(_lio_18 * V)))) + (1.0f*_lio_19/(_lio_16 + (_lio_17 * _brian_exp(_lio_18 * V))))) + (1.0f*_lio_20/(_lio_16 + (_lio_17 * _brian_exp(_lio_18 * V)))))/(((((1.0f*((- 0.28) * V)/(_lio_21 + (_lio_22 * _brian_exp(_lio_23 * V)))) + (1.0f*(0.32 * V)/(_lio_16 + (_lio_17 * _brian_exp(_lio_18 * V))))) + (1.0f*_lio_24/(_lio_21 + (_lio_22 * _brian_exp(_lio_23 * V))))) + (1.0f*_lio_25/(_lio_21 + (_lio_22 * _brian_exp(_lio_23 * V))))) - ((1.0f*_lio_19/(_lio_16 + (_lio_17 * _brian_exp(_lio_18 * V)))) + (1.0f*_lio_20/(_lio_16 + (_lio_17 * _brian_exp(_lio_18 * V)))))); + const double _m = (- _BA_m) + ((_BA_m + m) * _brian_exp(dt * (((((1.0f*((- 0.28) * V)/(_lio_21 + (_lio_22 * _brian_exp(_lio_23 * V)))) + (1.0f*(0.32 * V)/(_lio_16 + (_lio_17 * _brian_exp(_lio_18 * V))))) + (1.0f*_lio_24/(_lio_21 + (_lio_22 * _brian_exp(_lio_23 * V))))) + (1.0f*_lio_25/(_lio_21 + (_lio_22 * _brian_exp(_lio_23 * V))))) - ((1.0f*_lio_19/(_lio_16 + (_lio_17 * _brian_exp(_lio_18 * V)))) + (1.0f*_lio_20/(_lio_16 + (_lio_17 * _brian_exp(_lio_18 * V)))))))); + const double _BA_n = 1.0f*(((1.0f*((- 0.032) * V)/(_lio_26 + (_lio_27 * _brian_exp(_lio_15 * V)))) + (1.0f*_lio_28/(_lio_26 + (_lio_27 * _brian_exp(_lio_15 * V))))) + (1.0f*_lio_29/(_lio_26 + (_lio_27 * _brian_exp(_lio_15 * V)))))/((1.0f*(0.032 * V)/(_lio_26 + (_lio_27 * _brian_exp(_lio_15 * V)))) - (((1.0f*_lio_28/(_lio_26 + (_lio_27 * _brian_exp(_lio_15 * V)))) + (1.0f*_lio_29/(_lio_26 + (_lio_27 * _brian_exp(_lio_15 * V))))) + (_lio_30 * _brian_exp(_lio_31 * V)))); + const double _n = (- _BA_n) + ((_BA_n + n) * _brian_exp(dt * ((1.0f*(0.032 * V)/(_lio_26 + (_lio_27 * _brian_exp(_lio_15 * V)))) - (((1.0f*_lio_28/(_lio_26 + (_lio_27 * _brian_exp(_lio_15 * V)))) + (1.0f*_lio_29/(_lio_26 + (_lio_27 * _brian_exp(_lio_15 * V))))) + (_lio_30 * _brian_exp(_lio_31 * V)))))); + V = _V; + 
g_eKC_eKC = _g_eKC_eKC; + g_iKC_eKC = _g_iKC_eKC; + h = _h; + m = _m; + n = _n; + _ptr_array_neurongroup_1_g_eKC_eKC[_idx] = g_eKC_eKC; + _ptr_array_neurongroup_1_m[_idx] = m; + _ptr_array_neurongroup_1_not_refractory[_idx] = not_refractory; + _ptr_array_neurongroup_1_n[_idx] = n; + _ptr_array_neurongroup_1_h[_idx] = h; + _ptr_array_neurongroup_1_V[_idx] = V; + _ptr_array_neurongroup_1_g_iKC_eKC[_idx] = g_iKC_eKC; + + + } +} + + +void _run_neurongroup_1_stateupdater_codeobject() +{ + using namespace brian; + + + const int _N = 100; + + ///// HOST_CONSTANTS /////////// + const int _numV = 100; + const int _numg_eKC_eKC = 100; + const int _numg_iKC_eKC = 100; + const int _numh = 100; + const int _numm = 100; + const int _numn = 100; + const int _numnot_refractory = 100; + + + static int num_threads, num_blocks; + static size_t needed_shared_memory = 0; + static bool first_run = true; + if (first_run) + { + // get number of blocks and threads + int min_num_threads; // The minimum grid size needed to achieve the + // maximum occupancy for a full device launch + + CUDA_SAFE_CALL( + cudaOccupancyMaxPotentialBlockSize(&min_num_threads, &num_threads, + _run_kernel_neurongroup_1_stateupdater_codeobject, 0, 0) // last args: dynamicSMemSize, blockSizeLimit + ); + + // Round up according to array size + num_blocks = (_N + num_threads - 1) / num_threads; + + + + + + // calculate theoretical occupancy + int max_active_blocks; + CUDA_SAFE_CALL( + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, + _run_kernel_neurongroup_1_stateupdater_codeobject, num_threads, needed_shared_memory) + ); + + float occupancy = (max_active_blocks * num_threads / num_threads_per_warp) / + (float)(max_threads_per_sm / num_threads_per_warp); + + + // check if we have enough ressources to call kernel with given number + // of blocks and threads (can only occur for the else case above as for the + // first max. occupancy) + struct cudaFuncAttributes funcAttrib; + CUDA_SAFE_CALL( + cudaFuncGetAttributes(&funcAttrib, _run_kernel_neurongroup_1_stateupdater_codeobject) + ); + if (num_threads > funcAttrib.maxThreadsPerBlock) + { + // use the max num_threads before launch failure + num_threads = funcAttrib.maxThreadsPerBlock; + printf("WARNING Not enough ressources available to call " + "_run_kernel_neurongroup_1_stateupdater_codeobject " + "with maximum possible threads per block (%u). " + "Reducing num_threads to %u. 
(Kernel needs %i " + "registers per block, %i bytes of " + "statically-allocated shared memory per block, %i " + "bytes of local memory per thread and a total of %i " + "bytes of user-allocated constant memory)\n", + max_threads_per_block, num_threads, funcAttrib.numRegs, + funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, + funcAttrib.constSizeBytes); + + // calculate theoretical occupancy for new num_threads + CUDA_SAFE_CALL( + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, + _run_kernel_neurongroup_1_stateupdater_codeobject, num_threads, needed_shared_memory) + ); + + occupancy = (max_active_blocks * num_threads / num_threads_per_warp) / + (float)(max_threads_per_sm / num_threads_per_warp); + } + + + else + { + printf("INFO _run_kernel_neurongroup_1_stateupdater_codeobject\n" + "\t%u blocks\n" + "\t%u threads\n" + "\t%i registers per block\n" + "\t%i bytes statically-allocated shared memory per block\n" + "\t%i bytes local memory per thread\n" + "\t%i bytes user-allocated constant memory\n" + "\t%.3f theoretical occupancy\n", + num_blocks, num_threads, funcAttrib.numRegs, + funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, + funcAttrib.constSizeBytes, occupancy); + } + first_run = false; + } + + + _run_kernel_neurongroup_1_stateupdater_codeobject<<>>( + _N, + num_threads, + ///// HOST_PARAMETERS ///// + dev_array_neurongroup_1_V, + _array_defaultclock_dt[0], + dev_array_neurongroup_1_g_eKC_eKC, + dev_array_neurongroup_1_g_iKC_eKC, + dev_array_neurongroup_1_h, + dev_array_neurongroup_1_m, + dev_array_neurongroup_1_n, + dev_array_neurongroup_1_not_refractory + ); + + CUDA_CHECK_ERROR("_run_kernel_neurongroup_1_stateupdater_codeobject"); + + +} + + diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/neurongroup_1_stateupdater_codeobject.h b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/neurongroup_1_stateupdater_codeobject.h new file mode 100644 index 00000000..466ea7ec --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/neurongroup_1_stateupdater_codeobject.h @@ -0,0 +1,7 @@ +#ifndef _INCLUDED_neurongroup_1_stateupdater_codeobject +#define _INCLUDED_neurongroup_1_stateupdater_codeobject + +void _run_neurongroup_1_stateupdater_codeobject(); + + +#endif diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/neurongroup_1_thresholder_codeobject.cu b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/neurongroup_1_thresholder_codeobject.cu new file mode 100644 index 00000000..ff5e5f8f --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/neurongroup_1_thresholder_codeobject.cu @@ -0,0 +1,408 @@ +#include "code_objects/neurongroup_1_thresholder_codeobject.h" +#include "objects.h" +#include "brianlib/common_math.h" +#include "brianlib/cuda_utils.h" +#include "brianlib/stdint_compat.h" +#include +#include +#include +#include + + + +////// SUPPORT CODE /////// +namespace { + double _host_rand(const int _vectorisation_idx); + double _host_randn(const int _vectorisation_idx); + int32_t _host_poisson(double _lambda, const int _vectorisation_idx); + + ///// block extra_device_helper ///// + __global__ void + _reset_neurongroup_1_thresholder_codeobject( + int32_t* eventspace + ) + { + using namespace brian; + + int _idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (_idx >= 100) { + return; + } + + if (_idx == 
0) { + // reset eventspace counter + eventspace[100] = 0; + } + + // reset eventspace + eventspace[_idx] = -1; + } + + ///// support_code_lines ///// + + template < typename T1, typename T2 > struct _higher_type; + template < > struct _higher_type { typedef int type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < typename T1, typename T2 > + __host__ __device__ static inline typename _higher_type::type + _brian_mod(T1 x, T2 y) + {{ + return x-y*floor(1.0*x/y); + }} + template < typename T1, typename T2 > + __host__ __device__ static inline typename _higher_type::type + _brian_floordiv(T1 x, T2 y) + {{ + return floor(1.0*x/y); + }} + #ifdef _MSC_VER + #define _brian_pow(x, y) (pow((double)(x), (y))) + #else + #define _brian_pow(x, y) (pow((x), (y))) + #endif + inline __device__ int _brian_atomicAdd(int* address, int val) + { + // hardware implementation + return atomicAdd(address, val); + } + inline __device__ float _brian_atomicAdd(float* address, float val) + { + // hardware implementation + return atomicAdd(address, val); + } + inline __device__ double _brian_atomicAdd(double* address, double val) + { + #if (__CUDA_ARCH__ >= 600) + // hardware implementation + return atomicAdd(address, val); + #else + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val + + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + #endif + } + inline __device__ int _brian_atomicMul(int* address, int val) + { + // software implementation + int old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, val * assumed); + } while (assumed != old); + return old; + } + inline __device__ float _brian_atomicMul(float* address, float val) + { + // software implementation + int* address_as_int = (int*)address; + int old = *address_as_int, assumed; + do { + 
assumed = old; + old = atomicCAS(address_as_int, assumed, + __float_as_int(val * + __int_as_float(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __int_as_float(old); + } + inline __device__ double _brian_atomicMul(double* address, double val) + { + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val * + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + } + inline __device__ int _brian_atomicDiv(int* address, int val) + { + // software implementation + int old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, val / assumed); + } while (assumed != old); + return old; + } + inline __device__ float _brian_atomicDiv(float* address, float val) + { + // software implementation + int* address_as_int = (int*)address; + int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __float_as_int(val / + __int_as_float(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __int_as_float(old); + } + inline __device__ double _brian_atomicDiv(double* address, double val) + { + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val / + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + } + + + // Implement dummy functions such that the host compiled code of binomial + // functions works. Hacky, hacky ... 
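+ // (These host-side stubs are never meant to run: they only satisfy host compilation/linkage and abort immediately if called.)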
+ double _host_rand(const int _vectorisation_idx) + { + printf("ERROR: Called dummy function `_host_rand` in %s:%d\n", __FILE__, + __LINE__); + exit(EXIT_FAILURE); + } + double _host_randn(const int _vectorisation_idx) + { + printf("ERROR: Called dummy function `_host_rand` in %s:%d\n", __FILE__, + __LINE__); + exit(EXIT_FAILURE); + } + int32_t _host_poisson(double _lambda, const int _vectorisation_idx) + { + printf("ERROR: Called dummy function `_host_poisson` in %s:%d\n", __FILE__, + __LINE__); + exit(EXIT_FAILURE); + } +} + +////// hashdefine_lines /////// + + + +__global__ void +_run_kernel_neurongroup_1_thresholder_codeobject( + int _N, + int THREADS_PER_BLOCK, + ///// KERNEL_PARAMETERS ///// + double* _ptr_array_neurongroup_1_V, + int32_t* _ptr_array_neurongroup_1__spikespace, + double* _ptr_array_neurongroup_1_lastspike, + char* _ptr_array_neurongroup_1_not_refractory, + const double _value_array_defaultclock_t + ) +{ + using namespace brian; + + int tid = threadIdx.x; + int bid = blockIdx.x; + int _idx = bid * THREADS_PER_BLOCK + tid; + int _vectorisation_idx = _idx; + + ///// KERNEL_CONSTANTS ///// + const int _numV = 100; + const int _num_spikespace = 101; + const int _numlastspike = 100; + const int _numnot_refractory = 100; + + ///// kernel_lines ///// + + const double* _ptr_array_defaultclock_t = &_value_array_defaultclock_t; + + + assert(THREADS_PER_BLOCK == blockDim.x); + + + if(_idx >= _N) + { + return; + } + + + ///// scalar_code ///// + + + + + {// there might be the same variable defined in scalar and vector code + ///// vector_code ///// + + const double V = _ptr_array_neurongroup_1_V[_idx]; + const char not_refractory = _ptr_array_neurongroup_1_not_refractory[_idx]; + char _cond; + if(!not_refractory) + _cond = (V > 0.0) && false; + else + _cond = (V > 0.0) && true; + + + if (_cond) + { + int32_t spike_index = atomicAdd(&_ptr_array_neurongroup_1__spikespace[_N], 1); + _ptr_array_neurongroup_1__spikespace[spike_index] = _idx; + // We have to use the pointer names directly here: The condition + // might contain references to not_refractory or lastspike and in + // that case the names will refer to a single entry. 
+ _ptr_array_neurongroup_1_not_refractory[_idx] = false; + _ptr_array_neurongroup_1_lastspike[_idx] = _ptr_array_defaultclock_t[0]; + } + } +} + + +void _run_neurongroup_1_thresholder_codeobject() +{ + using namespace brian; + + + const int _N = 100; + + ///// HOST_CONSTANTS /////////// + const int _numV = 100; + const int _num_spikespace = 101; + const int _numlastspike = 100; + const int _numnot_refractory = 100; + + + static int num_threads, num_blocks; + static size_t needed_shared_memory = 0; + static bool first_run = true; + if (first_run) + { + // get number of blocks and threads + int min_num_threads; // The minimum grid size needed to achieve the + // maximum occupancy for a full device launch + + CUDA_SAFE_CALL( + cudaOccupancyMaxPotentialBlockSize(&min_num_threads, &num_threads, + _run_kernel_neurongroup_1_thresholder_codeobject, 0, 0) // last args: dynamicSMemSize, blockSizeLimit + ); + + // Round up according to array size + num_blocks = (_N + num_threads - 1) / num_threads; + + + + + + // calculate theoretical occupancy + int max_active_blocks; + CUDA_SAFE_CALL( + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, + _run_kernel_neurongroup_1_thresholder_codeobject, num_threads, needed_shared_memory) + ); + + float occupancy = (max_active_blocks * num_threads / num_threads_per_warp) / + (float)(max_threads_per_sm / num_threads_per_warp); + + + // check if we have enough ressources to call kernel with given number + // of blocks and threads (can only occur for the else case above as for the + // first max. occupancy) + struct cudaFuncAttributes funcAttrib; + CUDA_SAFE_CALL( + cudaFuncGetAttributes(&funcAttrib, _run_kernel_neurongroup_1_thresholder_codeobject) + ); + if (num_threads > funcAttrib.maxThreadsPerBlock) + { + // use the max num_threads before launch failure + num_threads = funcAttrib.maxThreadsPerBlock; + printf("WARNING Not enough ressources available to call " + "_run_kernel_neurongroup_1_thresholder_codeobject " + "with maximum possible threads per block (%u). " + "Reducing num_threads to %u. 
(Kernel needs %i " + "registers per block, %i bytes of " + "statically-allocated shared memory per block, %i " + "bytes of local memory per thread and a total of %i " + "bytes of user-allocated constant memory)\n", + max_threads_per_block, num_threads, funcAttrib.numRegs, + funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, + funcAttrib.constSizeBytes); + + // calculate theoretical occupancy for new num_threads + CUDA_SAFE_CALL( + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, + _run_kernel_neurongroup_1_thresholder_codeobject, num_threads, needed_shared_memory) + ); + + occupancy = (max_active_blocks * num_threads / num_threads_per_warp) / + (float)(max_threads_per_sm / num_threads_per_warp); + } + + + else + { + printf("INFO _run_kernel_neurongroup_1_thresholder_codeobject\n" + "\t%u blocks\n" + "\t%u threads\n" + "\t%i registers per block\n" + "\t%i bytes statically-allocated shared memory per block\n" + "\t%i bytes local memory per thread\n" + "\t%i bytes user-allocated constant memory\n" + "\t%.3f theoretical occupancy\n", + num_blocks, num_threads, funcAttrib.numRegs, + funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, + funcAttrib.constSizeBytes, occupancy); + } + first_run = false; + } + + _reset_neurongroup_1_thresholder_codeobject<<>>( + dev_array_neurongroup_1__spikespace[current_idx_array_neurongroup_1__spikespace] + ); + + CUDA_CHECK_ERROR("_reset_neurongroup_1_thresholder_codeobject"); + + _run_kernel_neurongroup_1_thresholder_codeobject<<>>( + _N, + num_threads, + ///// HOST_PARAMETERS ///// + dev_array_neurongroup_1_V, + dev_array_neurongroup_1__spikespace[current_idx_array_neurongroup_1__spikespace], + dev_array_neurongroup_1_lastspike, + dev_array_neurongroup_1_not_refractory, + _array_defaultclock_t[0] + ); + + CUDA_CHECK_ERROR("_run_kernel_neurongroup_1_thresholder_codeobject"); + +} + + diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/neurongroup_1_thresholder_codeobject.h b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/neurongroup_1_thresholder_codeobject.h new file mode 100644 index 00000000..284749de --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/neurongroup_1_thresholder_codeobject.h @@ -0,0 +1,7 @@ +#ifndef _INCLUDED_neurongroup_1_thresholder_codeobject +#define _INCLUDED_neurongroup_1_thresholder_codeobject + +void _run_neurongroup_1_thresholder_codeobject(); + + +#endif diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/neurongroup_stateupdater_codeobject.cu b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/neurongroup_stateupdater_codeobject.cu new file mode 100644 index 00000000..88efe8f7 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/neurongroup_stateupdater_codeobject.cu @@ -0,0 +1,449 @@ +#include "code_objects/neurongroup_stateupdater_codeobject.h" +#include "objects.h" +#include "brianlib/common_math.h" +#include "brianlib/cuda_utils.h" +#include "brianlib/stdint_compat.h" +#include +#include +#include +#include + + + +////// SUPPORT CODE /////// +namespace { + double _host_rand(const int _vectorisation_idx); + double _host_randn(const int _vectorisation_idx); + int32_t _host_poisson(double _lambda, const int _vectorisation_idx); + + ///// block extra_device_helper ///// + + ///// support_code_lines ///// + + template + __host__ __device__ + 
double _brian_exp(T value) + { + #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 0)) + return exp((double)value); + #else + return exp(value); + #endif + } + inline __host__ __device__ + float _brian_exp(float value) + { + return exp(value); + } + template < typename T1, typename T2 > struct _higher_type; + template < > struct _higher_type { typedef int type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < typename T1, typename T2 > + __host__ __device__ static inline typename _higher_type::type + _brian_mod(T1 x, T2 y) + {{ + return x-y*floor(1.0*x/y); + }} + template < typename T1, typename T2 > + __host__ __device__ static inline typename _higher_type::type + _brian_floordiv(T1 x, T2 y) + {{ + return floor(1.0*x/y); + }} + #ifdef _MSC_VER + #define _brian_pow(x, y) (pow((double)(x), (y))) + #else + #define _brian_pow(x, y) (pow((x), (y))) + #endif + inline __device__ int _brian_atomicAdd(int* address, int val) + { + // hardware implementation + return atomicAdd(address, val); + } + inline __device__ float _brian_atomicAdd(float* address, float val) + { + // hardware implementation + return atomicAdd(address, val); + } + inline __device__ double _brian_atomicAdd(double* address, double val) + { + #if (__CUDA_ARCH__ >= 600) + // hardware implementation + return atomicAdd(address, val); + #else + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val + + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + #endif + } + inline __device__ int _brian_atomicMul(int* address, int val) + { + // software implementation + int old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, val * assumed); + } while (assumed != old); + return old; + } + inline __device__ float _brian_atomicMul(float* address, float val) + { + // software 
implementation + int* address_as_int = (int*)address; + int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __float_as_int(val * + __int_as_float(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __int_as_float(old); + } + inline __device__ double _brian_atomicMul(double* address, double val) + { + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val * + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + } + inline __device__ int _brian_atomicDiv(int* address, int val) + { + // software implementation + int old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, val / assumed); + } while (assumed != old); + return old; + } + inline __device__ float _brian_atomicDiv(float* address, float val) + { + // software implementation + int* address_as_int = (int*)address; + int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __float_as_int(val / + __int_as_float(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __int_as_float(old); + } + inline __device__ double _brian_atomicDiv(double* address, double val) + { + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val / + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + } + + + // Implement dummy functions such that the host compiled code of binomial + // functions works. Hacky, hacky ... 
+ double _host_rand(const int _vectorisation_idx) + { + printf("ERROR: Called dummy function `_host_rand` in %s:%d\n", __FILE__, + __LINE__); + exit(EXIT_FAILURE); + } + double _host_randn(const int _vectorisation_idx) + { + printf("ERROR: Called dummy function `_host_rand` in %s:%d\n", __FILE__, + __LINE__); + exit(EXIT_FAILURE); + } + int32_t _host_poisson(double _lambda, const int _vectorisation_idx) + { + printf("ERROR: Called dummy function `_host_poisson` in %s:%d\n", __FILE__, + __LINE__); + exit(EXIT_FAILURE); + } +} + +////// hashdefine_lines /////// + + + +__global__ void +_run_kernel_neurongroup_stateupdater_codeobject( + int _N, + int THREADS_PER_BLOCK, + ///// KERNEL_PARAMETERS ///// + double* _ptr_array_neurongroup_V, + const double _value_array_defaultclock_dt, + double* _ptr_array_neurongroup_g_PN_iKC, + double* _ptr_array_neurongroup_h, + double* _ptr_array_neurongroup_m, + double* _ptr_array_neurongroup_n, + char* _ptr_array_neurongroup_not_refractory + ) +{ + using namespace brian; + + int tid = threadIdx.x; + int bid = blockIdx.x; + int _idx = bid * THREADS_PER_BLOCK + tid; + int _vectorisation_idx = _idx; + + ///// KERNEL_CONSTANTS ///// + const int _numV = 2500; + const int _numg_PN_iKC = 2500; + const int _numh = 2500; + const int _numm = 2500; + const int _numn = 2500; + const int _numnot_refractory = 2500; + + ///// kernel_lines ///// + + const double* _ptr_array_defaultclock_dt = &_value_array_defaultclock_dt; + + + assert(THREADS_PER_BLOCK == blockDim.x); + + + if(_idx >= _N) + { + return; + } + + + ///// scalar_code ///// + + const double dt = _ptr_array_defaultclock_dt[0]; + const double _lio_1 = 1.0f*((- 0.06356) * 2.67e-08)/3e-10; + const double _lio_2 = 1.0f*((- 0.095) * 1.4299999999999999e-06)/3e-10; + const double _lio_3 = 1.0f*(0.05 * 7.15e-06)/3e-10; + const double _lio_4 = 1.0f*0.0/3e-10; + const double _lio_5 = 0.0 - (1.0f*2.67e-08/3e-10); + const double _lio_6 = 1.0f*((- 1.0) * 1.4299999999999999e-06)/3e-10; + const double _lio_7 = 1.0f*7.15e-06/3e-10; + const double _lio_8 = 1.0f*1.0/3e-10; + const double _lio_9 = _brian_exp(1.0f*(- dt)/0.002); + const double _lio_10 = 1.0f*(0.329137207652868 * _brian_exp(1.0f*(0.0555555555555556 * (- 0.063))/0.001))/0.001; + const double _lio_11 = 1.0f*(- 0.0555555555555556)/0.001; + const double _lio_12 = 2980.95798704173 * (0.001 * _brian_exp(1.0f*(0.2 * (- 0.063))/0.001)); + const double _lio_13 = 1.0f*(- 0.2)/0.001; + const double _lio_14 = ((- 1.0) * (_brian_pow(0.001, 1.0))) * 0.001; + const double _lio_15 = 25.7903399171931 * (((_brian_pow(0.001, 1.0)) * 0.001) * _brian_exp(1.0f*(0.25 * (- 0.063))/0.001)); + const double _lio_16 = 1.0f*(- 0.25)/0.001; + const double _lio_17 = 0.32 * (- 0.063); + const double _lio_18 = 4.16 * 0.001; + const double _lio_19 = 0.0 - ((_brian_pow(0.001, 1.0)) * 0.001); + const double _lio_20 = 0.000335462627902512 * (((_brian_pow(0.001, 1.0)) * 0.001) * _brian_exp(1.0f*((- 0.2) * (- 0.063))/0.001)); + const double _lio_21 = 1.0f*0.2/0.001; + const double _lio_22 = 0.28 * (- 0.063); + const double _lio_23 = 11.2 * 0.001; + const double _lio_24 = ((- 1.0) * 0.001) * 0.001; + const double _lio_25 = 20.0855369231877 * ((0.001 * 0.001) * _brian_exp(1.0f*(0.2 * (- 0.063))/0.001)); + const double _lio_26 = 0.032 * (- 0.063); + const double _lio_27 = 0.48 * 0.001; + const double _lio_28 = 1.0f*(0.642012708343871 * _brian_exp(1.0f*(0.025 * (- 0.063))/0.001))/0.001; + const double _lio_29 = 1.0f*(- 0.025)/0.001; + + + { + ///// vector_code ///// + + double m = 
_ptr_array_neurongroup_m[_idx]; + char not_refractory = _ptr_array_neurongroup_not_refractory[_idx]; + double g_PN_iKC = _ptr_array_neurongroup_g_PN_iKC[_idx]; + double n = _ptr_array_neurongroup_n[_idx]; + double h = _ptr_array_neurongroup_h[_idx]; + double V = _ptr_array_neurongroup_V[_idx]; + const double dt = _ptr_array_defaultclock_dt[0]; + if(!not_refractory) + not_refractory = false || (! (V > 0.0)); + else + not_refractory = true || (! (V > 0.0)); + const double _BA_V = 1.0f*(_lio_1 + (((_lio_2 * (_brian_pow(n, 4.0))) + (_lio_3 * (h * (_brian_pow(m, 3.0))))) + (_lio_4 * g_PN_iKC)))/((_lio_5 + (_lio_6 * (_brian_pow(n, 4.0)))) - ((_lio_7 * (h * (_brian_pow(m, 3.0)))) + (_lio_8 * g_PN_iKC))); + const double _V = (- _BA_V) + ((V + _BA_V) * _brian_exp(dt * ((_lio_5 + (_lio_6 * (_brian_pow(n, 4.0)))) - ((_lio_7 * (h * (_brian_pow(m, 3.0)))) + (_lio_8 * g_PN_iKC))))); + const double _g_PN_iKC = _lio_9 * g_PN_iKC; + const double _BA_h = 1.0f*(_lio_10 * _brian_exp(_lio_11 * V))/((1.0f*(- 4.0)/(0.001 + (_lio_12 * _brian_exp(_lio_13 * V)))) - (_lio_10 * _brian_exp(_lio_11 * V))); + const double _h = (- _BA_h) + ((_BA_h + h) * _brian_exp(dt * ((1.0f*(- 4.0)/(0.001 + (_lio_12 * _brian_exp(_lio_13 * V)))) - (_lio_10 * _brian_exp(_lio_11 * V))))); + const double _BA_m = 1.0f*(((1.0f*((- 0.32) * V)/(_lio_14 + (_lio_15 * _brian_exp(_lio_16 * V)))) + (1.0f*_lio_17/(_lio_14 + (_lio_15 * _brian_exp(_lio_16 * V))))) + (1.0f*_lio_18/(_lio_14 + (_lio_15 * _brian_exp(_lio_16 * V)))))/(((((1.0f*((- 0.28) * V)/(_lio_19 + (_lio_20 * _brian_exp(_lio_21 * V)))) + (1.0f*(0.32 * V)/(_lio_14 + (_lio_15 * _brian_exp(_lio_16 * V))))) + (1.0f*_lio_22/(_lio_19 + (_lio_20 * _brian_exp(_lio_21 * V))))) + (1.0f*_lio_23/(_lio_19 + (_lio_20 * _brian_exp(_lio_21 * V))))) - ((1.0f*_lio_17/(_lio_14 + (_lio_15 * _brian_exp(_lio_16 * V)))) + (1.0f*_lio_18/(_lio_14 + (_lio_15 * _brian_exp(_lio_16 * V)))))); + const double _m = (- _BA_m) + ((_BA_m + m) * _brian_exp(dt * (((((1.0f*((- 0.28) * V)/(_lio_19 + (_lio_20 * _brian_exp(_lio_21 * V)))) + (1.0f*(0.32 * V)/(_lio_14 + (_lio_15 * _brian_exp(_lio_16 * V))))) + (1.0f*_lio_22/(_lio_19 + (_lio_20 * _brian_exp(_lio_21 * V))))) + (1.0f*_lio_23/(_lio_19 + (_lio_20 * _brian_exp(_lio_21 * V))))) - ((1.0f*_lio_17/(_lio_14 + (_lio_15 * _brian_exp(_lio_16 * V)))) + (1.0f*_lio_18/(_lio_14 + (_lio_15 * _brian_exp(_lio_16 * V)))))))); + const double _BA_n = 1.0f*(((1.0f*((- 0.032) * V)/(_lio_24 + (_lio_25 * _brian_exp(_lio_13 * V)))) + (1.0f*_lio_26/(_lio_24 + (_lio_25 * _brian_exp(_lio_13 * V))))) + (1.0f*_lio_27/(_lio_24 + (_lio_25 * _brian_exp(_lio_13 * V)))))/((1.0f*(0.032 * V)/(_lio_24 + (_lio_25 * _brian_exp(_lio_13 * V)))) - (((1.0f*_lio_26/(_lio_24 + (_lio_25 * _brian_exp(_lio_13 * V)))) + (1.0f*_lio_27/(_lio_24 + (_lio_25 * _brian_exp(_lio_13 * V))))) + (_lio_28 * _brian_exp(_lio_29 * V)))); + const double _n = (- _BA_n) + ((_BA_n + n) * _brian_exp(dt * ((1.0f*(0.032 * V)/(_lio_24 + (_lio_25 * _brian_exp(_lio_13 * V)))) - (((1.0f*_lio_26/(_lio_24 + (_lio_25 * _brian_exp(_lio_13 * V)))) + (1.0f*_lio_27/(_lio_24 + (_lio_25 * _brian_exp(_lio_13 * V))))) + (_lio_28 * _brian_exp(_lio_29 * V)))))); + V = _V; + g_PN_iKC = _g_PN_iKC; + h = _h; + m = _m; + n = _n; + _ptr_array_neurongroup_m[_idx] = m; + _ptr_array_neurongroup_not_refractory[_idx] = not_refractory; + _ptr_array_neurongroup_g_PN_iKC[_idx] = g_PN_iKC; + _ptr_array_neurongroup_n[_idx] = n; + _ptr_array_neurongroup_h[_idx] = h; + _ptr_array_neurongroup_V[_idx] = V; + + + } +} + + +void _run_neurongroup_stateupdater_codeobject() 
+{ + using namespace brian; + + + const int _N = 2500; + + ///// HOST_CONSTANTS /////////// + const int _numV = 2500; + const int _numg_PN_iKC = 2500; + const int _numh = 2500; + const int _numm = 2500; + const int _numn = 2500; + const int _numnot_refractory = 2500; + + + static int num_threads, num_blocks; + static size_t needed_shared_memory = 0; + static bool first_run = true; + if (first_run) + { + // get number of blocks and threads + int min_num_threads; // The minimum grid size needed to achieve the + // maximum occupancy for a full device launch + + CUDA_SAFE_CALL( + cudaOccupancyMaxPotentialBlockSize(&min_num_threads, &num_threads, + _run_kernel_neurongroup_stateupdater_codeobject, 0, 0) // last args: dynamicSMemSize, blockSizeLimit + ); + + // Round up according to array size + num_blocks = (_N + num_threads - 1) / num_threads; + + + + + + // calculate theoretical occupancy + int max_active_blocks; + CUDA_SAFE_CALL( + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, + _run_kernel_neurongroup_stateupdater_codeobject, num_threads, needed_shared_memory) + ); + + float occupancy = (max_active_blocks * num_threads / num_threads_per_warp) / + (float)(max_threads_per_sm / num_threads_per_warp); + + + // check if we have enough ressources to call kernel with given number + // of blocks and threads (can only occur for the else case above as for the + // first max. occupancy) + struct cudaFuncAttributes funcAttrib; + CUDA_SAFE_CALL( + cudaFuncGetAttributes(&funcAttrib, _run_kernel_neurongroup_stateupdater_codeobject) + ); + if (num_threads > funcAttrib.maxThreadsPerBlock) + { + // use the max num_threads before launch failure + num_threads = funcAttrib.maxThreadsPerBlock; + printf("WARNING Not enough ressources available to call " + "_run_kernel_neurongroup_stateupdater_codeobject " + "with maximum possible threads per block (%u). " + "Reducing num_threads to %u. 
(Kernel needs %i " + "registers per block, %i bytes of " + "statically-allocated shared memory per block, %i " + "bytes of local memory per thread and a total of %i " + "bytes of user-allocated constant memory)\n", + max_threads_per_block, num_threads, funcAttrib.numRegs, + funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, + funcAttrib.constSizeBytes); + + // calculate theoretical occupancy for new num_threads + CUDA_SAFE_CALL( + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, + _run_kernel_neurongroup_stateupdater_codeobject, num_threads, needed_shared_memory) + ); + + occupancy = (max_active_blocks * num_threads / num_threads_per_warp) / + (float)(max_threads_per_sm / num_threads_per_warp); + } + + + else + { + printf("INFO _run_kernel_neurongroup_stateupdater_codeobject\n" + "\t%u blocks\n" + "\t%u threads\n" + "\t%i registers per block\n" + "\t%i bytes statically-allocated shared memory per block\n" + "\t%i bytes local memory per thread\n" + "\t%i bytes user-allocated constant memory\n" + "\t%.3f theoretical occupancy\n", + num_blocks, num_threads, funcAttrib.numRegs, + funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, + funcAttrib.constSizeBytes, occupancy); + } + first_run = false; + } + + + _run_kernel_neurongroup_stateupdater_codeobject<<>>( + _N, + num_threads, + ///// HOST_PARAMETERS ///// + dev_array_neurongroup_V, + _array_defaultclock_dt[0], + dev_array_neurongroup_g_PN_iKC, + dev_array_neurongroup_h, + dev_array_neurongroup_m, + dev_array_neurongroup_n, + dev_array_neurongroup_not_refractory + ); + + CUDA_CHECK_ERROR("_run_kernel_neurongroup_stateupdater_codeobject"); + + +} + + diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/neurongroup_stateupdater_codeobject.h b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/neurongroup_stateupdater_codeobject.h new file mode 100644 index 00000000..3d4e7427 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/neurongroup_stateupdater_codeobject.h @@ -0,0 +1,7 @@ +#ifndef _INCLUDED_neurongroup_stateupdater_codeobject +#define _INCLUDED_neurongroup_stateupdater_codeobject + +void _run_neurongroup_stateupdater_codeobject(); + + +#endif diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/neurongroup_thresholder_codeobject.cu b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/neurongroup_thresholder_codeobject.cu new file mode 100644 index 00000000..6b5309e0 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/neurongroup_thresholder_codeobject.cu @@ -0,0 +1,409 @@ +#include "code_objects/neurongroup_thresholder_codeobject.h" +#include "objects.h" +#include "brianlib/common_math.h" +#include "brianlib/cuda_utils.h" +#include "brianlib/stdint_compat.h" +#include +#include +#include +#include + + + +////// SUPPORT CODE /////// +namespace { + double _host_rand(const int _vectorisation_idx); + double _host_randn(const int _vectorisation_idx); + int32_t _host_poisson(double _lambda, const int _vectorisation_idx); + + ///// block extra_device_helper ///// + __global__ void + _reset_neurongroup_thresholder_codeobject( + int32_t* eventspace + ) + { + using namespace brian; + + int _idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (_idx >= 2500) { + return; + } + + if (_idx == 0) { + // reset eventspace counter + eventspace[2500] = 0; + } + + // reset 
eventspace + eventspace[_idx] = -1; + } + + ///// support_code_lines ///// + + template < typename T1, typename T2 > struct _higher_type; + template < > struct _higher_type { typedef int type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < typename T1, typename T2 > + __host__ __device__ static inline typename _higher_type::type + _brian_mod(T1 x, T2 y) + {{ + return x-y*floor(1.0*x/y); + }} + template < typename T1, typename T2 > + __host__ __device__ static inline typename _higher_type::type + _brian_floordiv(T1 x, T2 y) + {{ + return floor(1.0*x/y); + }} + #ifdef _MSC_VER + #define _brian_pow(x, y) (pow((double)(x), (y))) + #else + #define _brian_pow(x, y) (pow((x), (y))) + #endif + inline __device__ int _brian_atomicAdd(int* address, int val) + { + // hardware implementation + return atomicAdd(address, val); + } + inline __device__ float _brian_atomicAdd(float* address, float val) + { + // hardware implementation + return atomicAdd(address, val); + } + inline __device__ double _brian_atomicAdd(double* address, double val) + { + #if (__CUDA_ARCH__ >= 600) + // hardware implementation + return atomicAdd(address, val); + #else + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val + + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + #endif + } + inline __device__ int _brian_atomicMul(int* address, int val) + { + // software implementation + int old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, val * assumed); + } while (assumed != old); + return old; + } + inline __device__ float _brian_atomicMul(float* address, float val) + { + // software implementation + int* address_as_int = (int*)address; + int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __float_as_int(val * 
+ __int_as_float(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __int_as_float(old); + } + inline __device__ double _brian_atomicMul(double* address, double val) + { + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val * + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + } + inline __device__ int _brian_atomicDiv(int* address, int val) + { + // software implementation + int old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, val / assumed); + } while (assumed != old); + return old; + } + inline __device__ float _brian_atomicDiv(float* address, float val) + { + // software implementation + int* address_as_int = (int*)address; + int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __float_as_int(val / + __int_as_float(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __int_as_float(old); + } + inline __device__ double _brian_atomicDiv(double* address, double val) + { + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val / + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + } + + + // Implement dummy functions such that the host compiled code of binomial + // functions works. Hacky, hacky ... 
+ double _host_rand(const int _vectorisation_idx) + { + printf("ERROR: Called dummy function `_host_rand` in %s:%d\n", __FILE__, + __LINE__); + exit(EXIT_FAILURE); + } + double _host_randn(const int _vectorisation_idx) + { + printf("ERROR: Called dummy function `_host_rand` in %s:%d\n", __FILE__, + __LINE__); + exit(EXIT_FAILURE); + } + int32_t _host_poisson(double _lambda, const int _vectorisation_idx) + { + printf("ERROR: Called dummy function `_host_poisson` in %s:%d\n", __FILE__, + __LINE__); + exit(EXIT_FAILURE); + } +} + +////// hashdefine_lines /////// + + + +__global__ void +_run_kernel_neurongroup_thresholder_codeobject( + int _N, + int THREADS_PER_BLOCK, + ///// KERNEL_PARAMETERS ///// + double* _ptr_array_neurongroup_V, + int32_t* _ptr_array_neurongroup__spikespace, + double* _ptr_array_neurongroup_lastspike, + char* _ptr_array_neurongroup_not_refractory, + const double _value_array_defaultclock_t + ) +{ + using namespace brian; + + int tid = threadIdx.x; + int bid = blockIdx.x; + int _idx = bid * THREADS_PER_BLOCK + tid; + int _vectorisation_idx = _idx; + + ///// KERNEL_CONSTANTS ///// + const int _numV = 2500; + const int _num_spikespace = 2501; + const int _numlastspike = 2500; + const int _numnot_refractory = 2500; + + ///// kernel_lines ///// + + const double* _ptr_array_defaultclock_t = &_value_array_defaultclock_t; + + + assert(THREADS_PER_BLOCK == blockDim.x); + + + if(_idx >= _N) + { + return; + } + + + ///// scalar_code ///// + + + + + {// there might be the same variable defined in scalar and vector code + ///// vector_code ///// + + const double V = _ptr_array_neurongroup_V[_idx]; + const char not_refractory = _ptr_array_neurongroup_not_refractory[_idx]; + char _cond; + if(!not_refractory) + _cond = (V > 0.0) && false; + else + _cond = (V > 0.0) && true; + + + if (_cond) + { + int32_t spike_index = atomicAdd(&_ptr_array_neurongroup__spikespace[_N], 1); + _ptr_array_neurongroup__spikespace[spike_index] = _idx; + // We have to use the pointer names directly here: The condition + // might contain references to not_refractory or lastspike and in + // that case the names will refer to a single entry. 
+ _ptr_array_neurongroup_not_refractory[_idx] = false; + _ptr_array_neurongroup_lastspike[_idx] = _ptr_array_defaultclock_t[0]; + } + } +} + + +void _run_neurongroup_thresholder_codeobject() +{ + using namespace brian; + + + const int _N = 2500; + + ///// HOST_CONSTANTS /////////// + const int _numV = 2500; + const int _num_spikespace = 2501; + const int _numlastspike = 2500; + const int _numnot_refractory = 2500; + + + static int num_threads, num_blocks; + static size_t needed_shared_memory = 0; + static bool first_run = true; + if (first_run) + { + // get number of blocks and threads + int min_num_threads; // The minimum grid size needed to achieve the + // maximum occupancy for a full device launch + + CUDA_SAFE_CALL( + cudaOccupancyMaxPotentialBlockSize(&min_num_threads, &num_threads, + _run_kernel_neurongroup_thresholder_codeobject, 0, 0) // last args: dynamicSMemSize, blockSizeLimit + ); + + // Round up according to array size + num_blocks = (_N + num_threads - 1) / num_threads; + + + + + + // calculate theoretical occupancy + int max_active_blocks; + CUDA_SAFE_CALL( + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, + _run_kernel_neurongroup_thresholder_codeobject, num_threads, needed_shared_memory) + ); + + float occupancy = (max_active_blocks * num_threads / num_threads_per_warp) / + (float)(max_threads_per_sm / num_threads_per_warp); + + + // check if we have enough ressources to call kernel with given number + // of blocks and threads (can only occur for the else case above as for the + // first max. occupancy) + struct cudaFuncAttributes funcAttrib; + CUDA_SAFE_CALL( + cudaFuncGetAttributes(&funcAttrib, _run_kernel_neurongroup_thresholder_codeobject) + ); + if (num_threads > funcAttrib.maxThreadsPerBlock) + { + // use the max num_threads before launch failure + num_threads = funcAttrib.maxThreadsPerBlock; + printf("WARNING Not enough ressources available to call " + "_run_kernel_neurongroup_thresholder_codeobject " + "with maximum possible threads per block (%u). " + "Reducing num_threads to %u. 
(Kernel needs %i " + "registers per block, %i bytes of " + "statically-allocated shared memory per block, %i " + "bytes of local memory per thread and a total of %i " + "bytes of user-allocated constant memory)\n", + max_threads_per_block, num_threads, funcAttrib.numRegs, + funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, + funcAttrib.constSizeBytes); + + // calculate theoretical occupancy for new num_threads + CUDA_SAFE_CALL( + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, + _run_kernel_neurongroup_thresholder_codeobject, num_threads, needed_shared_memory) + ); + + occupancy = (max_active_blocks * num_threads / num_threads_per_warp) / + (float)(max_threads_per_sm / num_threads_per_warp); + } + + + else + { + printf("INFO _run_kernel_neurongroup_thresholder_codeobject\n" + "\t%u blocks\n" + "\t%u threads\n" + "\t%i registers per block\n" + "\t%i bytes statically-allocated shared memory per block\n" + "\t%i bytes local memory per thread\n" + "\t%i bytes user-allocated constant memory\n" + "\t%.3f theoretical occupancy\n", + num_blocks, num_threads, funcAttrib.numRegs, + funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, + funcAttrib.constSizeBytes, occupancy); + } + first_run = false; + } + + _reset_neurongroup_thresholder_codeobject<<>>( + dev_array_neurongroup__spikespace[current_idx_array_neurongroup__spikespace] + ); + + CUDA_CHECK_ERROR("_reset_neurongroup_thresholder_codeobject"); + + _run_kernel_neurongroup_thresholder_codeobject<<>>( + _N, + num_threads, + ///// HOST_PARAMETERS ///// + dev_array_neurongroup_V, + dev_array_neurongroup__spikespace[current_idx_array_neurongroup__spikespace], + dev_array_neurongroup_lastspike, + dev_array_neurongroup_not_refractory, + _array_defaultclock_t[0] + ); + + CUDA_CHECK_ERROR("_run_kernel_neurongroup_thresholder_codeobject"); + + +} + + diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/neurongroup_thresholder_codeobject.h b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/neurongroup_thresholder_codeobject.h new file mode 100644 index 00000000..c9d8e893 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/neurongroup_thresholder_codeobject.h @@ -0,0 +1,7 @@ +#ifndef _INCLUDED_neurongroup_thresholder_codeobject +#define _INCLUDED_neurongroup_thresholder_codeobject + +void _run_neurongroup_thresholder_codeobject(); + + +#endif diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/spikegeneratorgroup_codeobject.cu b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/spikegeneratorgroup_codeobject.cu new file mode 100644 index 00000000..e1dfbf39 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/spikegeneratorgroup_codeobject.cu @@ -0,0 +1,481 @@ +#include "code_objects/spikegeneratorgroup_codeobject.h" +#include "objects.h" +#include "brianlib/common_math.h" +#include "brianlib/cuda_utils.h" +#include "brianlib/stdint_compat.h" +#include +#include +#include +#include + + + +////// SUPPORT CODE /////// +namespace { + double _host_rand(const int _vectorisation_idx); + double _host_randn(const int _vectorisation_idx); + int32_t _host_poisson(double _lambda, const int _vectorisation_idx); + + ///// block extra_device_helper ///// + // Function to reset spikespace and set lastindex + __global__ void + _reset_spikegeneratorgroup_codeobject( + //int32_t* 
_spikespace, + int32_t* _previous_spikespace, + ///// KERNEL_PARAMETERS ///// + int32_t* _ptr_array_spikegeneratorgroup__lastindex, + int32_t* _ptr_array_spikegeneratorgroup__period_bins, + int32_t* _ptr_array_spikegeneratorgroup__spikespace, + int32_t* _ptr_array_spikegeneratorgroup__timebins, + const int _num_timebins, + int32_t* _ptr_array_spikegeneratorgroup_neuron_index, + const int _numneuron_index, + int32_t* _ptr_array_spikegeneratorgroup_spike_number, + const int _numspike_number, + const int64_t _value_array_defaultclock_timestep + ) + { + using namespace brian; + + int _idx = blockIdx.x * blockDim.x + threadIdx.x; + + // We need kernel_lines for time variables + ///// kernel_lines ///// + + const int64_t* _ptr_array_defaultclock_timestep = &_value_array_defaultclock_timestep; + + + if (_idx >= 100) { + return; + } + + if (_idx == 0) + { + // The period in multiples of dt + const int32_t _the_period = _ptr_array_spikegeneratorgroup__period_bins[0]; + // The spike times in multiples of dt + int32_t _timebin = _ptr_array_defaultclock_timestep[0]; + // index of the last spiking neuron in this spikespace + int32_t _lastindex = _ptr_array_spikegeneratorgroup__lastindex[0]; + + // Update the lastindex variable with the number of spikes from the + // spikespace from the previous time step + _lastindex += _previous_spikespace[100]; + + // Now reset the _lastindex if the priod has passed + if (_the_period > 0) { + _timebin %= _the_period; + // If there is a periodicity in the SpikeGenerator, we need to reset the + // lastindex when the period has passed + if (_lastindex > 0 && _ptr_array_spikegeneratorgroup__timebins[_lastindex - 1] >= _timebin) + _lastindex = 0; + } + _ptr_array_spikegeneratorgroup__lastindex[0] = _lastindex; + + // Reset spikespace counter for this time step + _ptr_array_spikegeneratorgroup__spikespace[100] = 0; + } + + // Reset the entire spikespace + _ptr_array_spikegeneratorgroup__spikespace[_idx] = -1; + } + + ///// support_code_lines ///// + + template < typename T1, typename T2 > struct _higher_type; + template < > struct _higher_type { typedef int type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct 
_higher_type { typedef double type; }; + template < typename T1, typename T2 > + __host__ __device__ static inline typename _higher_type::type + _brian_mod(T1 x, T2 y) + {{ + return x-y*floor(1.0*x/y); + }} + template < typename T1, typename T2 > + __host__ __device__ static inline typename _higher_type::type + _brian_floordiv(T1 x, T2 y) + {{ + return floor(1.0*x/y); + }} + #ifdef _MSC_VER + #define _brian_pow(x, y) (pow((double)(x), (y))) + #else + #define _brian_pow(x, y) (pow((x), (y))) + #endif + inline __device__ int _brian_atomicAdd(int* address, int val) + { + // hardware implementation + return atomicAdd(address, val); + } + inline __device__ float _brian_atomicAdd(float* address, float val) + { + // hardware implementation + return atomicAdd(address, val); + } + inline __device__ double _brian_atomicAdd(double* address, double val) + { + #if (__CUDA_ARCH__ >= 600) + // hardware implementation + return atomicAdd(address, val); + #else + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val + + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + #endif + } + inline __device__ int _brian_atomicMul(int* address, int val) + { + // software implementation + int old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, val * assumed); + } while (assumed != old); + return old; + } + inline __device__ float _brian_atomicMul(float* address, float val) + { + // software implementation + int* address_as_int = (int*)address; + int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __float_as_int(val * + __int_as_float(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __int_as_float(old); + } + inline __device__ double _brian_atomicMul(double* address, double val) + { + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val * + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + } + inline __device__ int _brian_atomicDiv(int* address, int val) + { + // software implementation + int old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, val / assumed); + } while (assumed != old); + return old; + } + inline __device__ float _brian_atomicDiv(float* address, float val) + { + // software implementation + int* address_as_int = (int*)address; + int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __float_as_int(val / + __int_as_float(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __int_as_float(old); + } + inline __device__ double _brian_atomicDiv(double* address, double val) + { + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, 
assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val / + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + } + + + // Implement dummy functions such that the host compiled code of binomial + // functions works. Hacky, hacky ... + double _host_rand(const int _vectorisation_idx) + { + printf("ERROR: Called dummy function `_host_rand` in %s:%d\n", __FILE__, + __LINE__); + exit(EXIT_FAILURE); + } + double _host_randn(const int _vectorisation_idx) + { + printf("ERROR: Called dummy function `_host_rand` in %s:%d\n", __FILE__, + __LINE__); + exit(EXIT_FAILURE); + } + int32_t _host_poisson(double _lambda, const int _vectorisation_idx) + { + printf("ERROR: Called dummy function `_host_poisson` in %s:%d\n", __FILE__, + __LINE__); + exit(EXIT_FAILURE); + } +} + +////// hashdefine_lines /////// + + + +__global__ void +_run_kernel_spikegeneratorgroup_codeobject( + int _N, + int THREADS_PER_BLOCK, + ///// KERNEL_PARAMETERS ///// + int32_t* _ptr_array_spikegeneratorgroup__lastindex, + int32_t* _ptr_array_spikegeneratorgroup__period_bins, + int32_t* _ptr_array_spikegeneratorgroup__spikespace, + int32_t* _ptr_array_spikegeneratorgroup__timebins, + const int _num_timebins, + int32_t* _ptr_array_spikegeneratorgroup_neuron_index, + const int _numneuron_index, + int32_t* _ptr_array_spikegeneratorgroup_spike_number, + const int _numspike_number, + const int64_t _value_array_defaultclock_timestep + ) +{ + using namespace brian; + + int tid = threadIdx.x; + int bid = blockIdx.x; + int _idx = bid * THREADS_PER_BLOCK + tid; + int _vectorisation_idx = _idx; + + ///// KERNEL_CONSTANTS ///// + const int _num_lastindex = 1; + const int _num_period_bins = 1; + const int _num_spikespace = 101; + + ///// kernel_lines ///// + + const int64_t* _ptr_array_defaultclock_timestep = &_value_array_defaultclock_timestep; + + + assert(THREADS_PER_BLOCK == blockDim.x); + + + if(_idx >= _N) + { + return; + } + + // The period in multiples of dt + const int32_t _the_period = _ptr_array_spikegeneratorgroup__period_bins[0]; + // The spike times in multiples of dt + int32_t _timebin = _ptr_array_defaultclock_timestep[0]; + + if (_the_period > 0) + _timebin %= _the_period; + + // We can have at most one spike per neuron per time step, which is the number of + // threads we call this kernel with. Hence, no need for any loops. + + // _spike_idx runs through the spikes in the spike generator + int _spike_idx = _idx + _ptr_array_spikegeneratorgroup__lastindex[0]; + + // TODO: Solve this smarter. Currently, we will call the reset kernel and this + // kernel at each time step even if the spikegenerator has emitted all its spikes! + // Instead, we should know on the host when this happened and not call any kernels. 
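+ // (Since all spike times are known up front, the host could track how many spikes were already emitted and skip both launches once every spike is done.)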
+ // See also #193 + if (_spike_idx >= _num_timebins) + return; + + // If the spike time of this spike comes after the current time bin, do nothing + if (_ptr_array_spikegeneratorgroup__timebins[_spike_idx] > _timebin) + { + return; + } + + // Else add the spiking neuron to the spikespace + int32_t _neuron_id = _ptr_array_spikegeneratorgroup_neuron_index[_spike_idx]; + int32_t _spikespace_index = atomicAdd(&_ptr_array_spikegeneratorgroup__spikespace[100], 1); + _ptr_array_spikegeneratorgroup__spikespace[_spikespace_index] = _neuron_id; + +} + + +void _run_spikegeneratorgroup_codeobject() +{ + using namespace brian; + + + const int _N = 100; + + ///// HOST_CONSTANTS /////////// + const int _num_lastindex = 1; + const int _num_period_bins = 1; + const int _num_spikespace = 101; + int32_t* const dev_array_spikegeneratorgroup__timebins = thrust::raw_pointer_cast(&dev_dynamic_array_spikegeneratorgroup__timebins[0]); + const int _num_timebins = dev_dynamic_array_spikegeneratorgroup__timebins.size(); + int32_t* const dev_array_spikegeneratorgroup_neuron_index = thrust::raw_pointer_cast(&dev_dynamic_array_spikegeneratorgroup_neuron_index[0]); + const int _numneuron_index = dev_dynamic_array_spikegeneratorgroup_neuron_index.size(); + int32_t* const dev_array_spikegeneratorgroup_spike_number = thrust::raw_pointer_cast(&dev_dynamic_array_spikegeneratorgroup_spike_number[0]); + const int _numspike_number = dev_dynamic_array_spikegeneratorgroup_spike_number.size(); + + + static int num_threads, num_blocks; + static size_t needed_shared_memory = 0; + static bool first_run = true; + if (first_run) + { + // get number of blocks and threads + int min_num_threads; // The minimum grid size needed to achieve the + // maximum occupancy for a full device launch + + CUDA_SAFE_CALL( + cudaOccupancyMaxPotentialBlockSize(&min_num_threads, &num_threads, + _run_kernel_spikegeneratorgroup_codeobject, 0, 0) // last args: dynamicSMemSize, blockSizeLimit + ); + + // Round up according to array size + num_blocks = (_N + num_threads - 1) / num_threads; + + + + + + // calculate theoretical occupancy + int max_active_blocks; + CUDA_SAFE_CALL( + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, + _run_kernel_spikegeneratorgroup_codeobject, num_threads, needed_shared_memory) + ); + + float occupancy = (max_active_blocks * num_threads / num_threads_per_warp) / + (float)(max_threads_per_sm / num_threads_per_warp); + + + // check if we have enough ressources to call kernel with given number + // of blocks and threads (can only occur for the else case above as for the + // first max. occupancy) + struct cudaFuncAttributes funcAttrib; + CUDA_SAFE_CALL( + cudaFuncGetAttributes(&funcAttrib, _run_kernel_spikegeneratorgroup_codeobject) + ); + if (num_threads > funcAttrib.maxThreadsPerBlock) + { + // use the max num_threads before launch failure + num_threads = funcAttrib.maxThreadsPerBlock; + printf("WARNING Not enough ressources available to call " + "_run_kernel_spikegeneratorgroup_codeobject " + "with maximum possible threads per block (%u). " + "Reducing num_threads to %u. 
(Kernel needs %i " + "registers per block, %i bytes of " + "statically-allocated shared memory per block, %i " + "bytes of local memory per thread and a total of %i " + "bytes of user-allocated constant memory)\n", + max_threads_per_block, num_threads, funcAttrib.numRegs, + funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, + funcAttrib.constSizeBytes); + + // calculate theoretical occupancy for new num_threads + CUDA_SAFE_CALL( + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, + _run_kernel_spikegeneratorgroup_codeobject, num_threads, needed_shared_memory) + ); + + occupancy = (max_active_blocks * num_threads / num_threads_per_warp) / + (float)(max_threads_per_sm / num_threads_per_warp); + } + + + else + { + printf("INFO _run_kernel_spikegeneratorgroup_codeobject\n" + "\t%u blocks\n" + "\t%u threads\n" + "\t%i registers per block\n" + "\t%i bytes statically-allocated shared memory per block\n" + "\t%i bytes local memory per thread\n" + "\t%i bytes user-allocated constant memory\n" + "\t%.3f theoretical occupancy\n", + num_blocks, num_threads, funcAttrib.numRegs, + funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, + funcAttrib.constSizeBytes, occupancy); + } + first_run = false; + } + + // Note: If we have no delays, there is only one spikespace and + // current_idx equals previous_idx. + _reset_spikegeneratorgroup_codeobject<<>>( + dev_array_spikegeneratorgroup__spikespace[previous_idx_array_spikegeneratorgroup__spikespace], + ///// HOST_PARAMETERS ///// + dev_array_spikegeneratorgroup__lastindex, + dev_array_spikegeneratorgroup__period_bins, + dev_array_spikegeneratorgroup__spikespace[current_idx_array_spikegeneratorgroup__spikespace], + dev_array_spikegeneratorgroup__timebins, + _num_timebins, + dev_array_spikegeneratorgroup_neuron_index, + _numneuron_index, + dev_array_spikegeneratorgroup_spike_number, + _numspike_number, + _array_defaultclock_timestep[0] + ); + + CUDA_CHECK_ERROR("_reset_spikegeneratorgroup_codeobject"); + + _run_kernel_spikegeneratorgroup_codeobject<<>>( + _N, + num_threads, + ///// HOST_PARAMETERS ///// + dev_array_spikegeneratorgroup__lastindex, + dev_array_spikegeneratorgroup__period_bins, + dev_array_spikegeneratorgroup__spikespace[current_idx_array_spikegeneratorgroup__spikespace], + dev_array_spikegeneratorgroup__timebins, + _num_timebins, + dev_array_spikegeneratorgroup_neuron_index, + _numneuron_index, + dev_array_spikegeneratorgroup_spike_number, + _numspike_number, + _array_defaultclock_timestep[0] + ); + + CUDA_CHECK_ERROR("_run_kernel_spikegeneratorgroup_codeobject"); + CUDA_SAFE_CALL(cudaDeviceSynchronize()); + + +} + + diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/spikegeneratorgroup_codeobject.h b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/spikegeneratorgroup_codeobject.h new file mode 100644 index 00000000..5afcfe58 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/spikegeneratorgroup_codeobject.h @@ -0,0 +1,7 @@ +#ifndef _INCLUDED_spikegeneratorgroup_codeobject +#define _INCLUDED_spikegeneratorgroup_codeobject + +void _run_spikegeneratorgroup_codeobject(); + + +#endif diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/spikemonitor_1_codeobject.cu b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/spikemonitor_1_codeobject.cu new file mode 100644 index 00000000..f4f22951 --- /dev/null +++ 
b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/spikemonitor_1_codeobject.cu @@ -0,0 +1,593 @@ +#include "code_objects/spikemonitor_1_codeobject.h" +#include "objects.h" +#include "brianlib/common_math.h" +#include "brianlib/cuda_utils.h" +#include "brianlib/stdint_compat.h" +#include +#include +#include +#include + + + +////// SUPPORT CODE /////// +namespace { + double _host_rand(const int _vectorisation_idx); + double _host_randn(const int _vectorisation_idx); + int32_t _host_poisson(double _lambda, const int _vectorisation_idx); + + ///// block extra_device_helper ///// + // declare monitor cudaVectors + __device__ cudaVector* monitor_t; + // declare monitor cudaVectors + __device__ cudaVector* monitor_i; + + ///// support_code_lines ///// + + template < typename T1, typename T2 > struct _higher_type; + template < > struct _higher_type { typedef int type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < typename T1, typename T2 > + __host__ __device__ static inline typename _higher_type::type + _brian_mod(T1 x, T2 y) + {{ + return x-y*floor(1.0*x/y); + }} + template < typename T1, typename T2 > + __host__ __device__ static inline typename _higher_type::type + _brian_floordiv(T1 x, T2 y) + {{ + return floor(1.0*x/y); + }} + #ifdef _MSC_VER + #define _brian_pow(x, y) (pow((double)(x), (y))) + #else + #define _brian_pow(x, y) (pow((x), (y))) + #endif + inline __device__ int _brian_atomicAdd(int* address, int val) + { + // hardware implementation + return atomicAdd(address, val); + } + inline __device__ float _brian_atomicAdd(float* address, float val) + { + // hardware implementation + return atomicAdd(address, val); + } + inline __device__ double _brian_atomicAdd(double* address, double val) + { + #if (__CUDA_ARCH__ >= 600) + // hardware implementation + return atomicAdd(address, val); + #else + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val 
+ + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + #endif + } + inline __device__ int _brian_atomicMul(int* address, int val) + { + // software implementation + int old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, val * assumed); + } while (assumed != old); + return old; + } + inline __device__ float _brian_atomicMul(float* address, float val) + { + // software implementation + int* address_as_int = (int*)address; + int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __float_as_int(val * + __int_as_float(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __int_as_float(old); + } + inline __device__ double _brian_atomicMul(double* address, double val) + { + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val * + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + } + inline __device__ int _brian_atomicDiv(int* address, int val) + { + // software implementation + int old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, val / assumed); + } while (assumed != old); + return old; + } + inline __device__ float _brian_atomicDiv(float* address, float val) + { + // software implementation + int* address_as_int = (int*)address; + int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __float_as_int(val / + __int_as_float(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __int_as_float(old); + } + inline __device__ double _brian_atomicDiv(double* address, double val) + { + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val / + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + } + + + // Implement dummy functions such that the host compiled code of binomial + // functions works. Hacky, hacky ... 
+ double _host_rand(const int _vectorisation_idx) + { + printf("ERROR: Called dummy function `_host_rand` in %s:%d\n", __FILE__, + __LINE__); + exit(EXIT_FAILURE); + } + double _host_randn(const int _vectorisation_idx) + { + printf("ERROR: Called dummy function `_host_rand` in %s:%d\n", __FILE__, + __LINE__); + exit(EXIT_FAILURE); + } + int32_t _host_poisson(double _lambda, const int _vectorisation_idx) + { + printf("ERROR: Called dummy function `_host_poisson` in %s:%d\n", __FILE__, + __LINE__); + exit(EXIT_FAILURE); + } +} + +////// hashdefine_lines /////// + + + +__global__ void _init_kernel_spikemonitor_1_codeobject() +{ + monitor_t = new cudaVector(); + monitor_i = new cudaVector(); +} + +__global__ void +_run_kernel_spikemonitor_1_codeobject( + int neurongroup_N, + int32_t* count, + // KERNEL_PARAMETERS + int32_t* _ptr_array_spikemonitor_1_N, + int32_t* _ptr_array_neurongroup_i, + int32_t* _ptr_array_spikemonitor_1__source_idx, + const double _value_array_defaultclock_t, + int32_t* _ptr_array_neurongroup__spikespace, + int32_t* _ptr_array_spikemonitor_1_count, + int32_t* _ptr_array_spikemonitor_1_i, + const int _numi, + double* _ptr_array_spikemonitor_1_t, + const int _numt + ) +{ + using namespace brian; + int tid = threadIdx.x; + int bid = blockIdx.x; + + + // KERNEL_CONSTANTS + const int _numN = 1; + const int _num_source_i = 2500; + const int _num_source_idx = 2500; + const int _num_spikespace = 2501; + const int _numcount = 2500; + + ///// kernel_lines ///// + + const double* _ptr_array_defaultclock_t = &_value_array_defaultclock_t; + + + // scalar_code + + + + // using not parallel spikespace: filled from left with all spiking neuron IDs, -1 ends the list + for(int i = 0; i < neurongroup_N; i++) + { + int32_t spiking_neuron = _ptr_array_neurongroup__spikespace[i]; + if(spiking_neuron != -1) + { + if(0 <= spiking_neuron && spiking_neuron < 2500) + { + int _idx = spiking_neuron; + int _vectorisation_idx = _idx; + + // vector_code + + const int32_t _source_i = _ptr_array_neurongroup_i[_idx]; + const double _source_t = _ptr_array_defaultclock_t[0]; + const double _to_record_t = _source_t; + const int32_t _to_record_i = _source_i; + + + // push to monitors + monitor_t->push(_to_record_t); + monitor_i->push(_to_record_i); + + count[_idx -0]++; + + } + } + else + { + + break; + } + } +} + + +void _run_spikemonitor_1_codeobject() +{ + using namespace brian; + + + const int _N = _array_spikemonitor_1_N[0]; + + ///// HOST_CONSTANTS /////////// + const int _numN = 1; + const int _num_source_i = 2500; + const int _num_source_idx = 2500; + const int _num_spikespace = 2501; + const int _numcount = 2500; + int32_t* const dev_array_spikemonitor_1_i = thrust::raw_pointer_cast(&dev_dynamic_array_spikemonitor_1_i[0]); + const int _numi = dev_dynamic_array_spikemonitor_1_i.size(); + double* const dev_array_spikemonitor_1_t = thrust::raw_pointer_cast(&dev_dynamic_array_spikemonitor_1_t[0]); + const int _numt = dev_dynamic_array_spikemonitor_1_t.size(); + + + static int num_threads, num_blocks; + static size_t needed_shared_memory = 0; + static bool first_run = true; + if (first_run) + { +_init_kernel_spikemonitor_1_codeobject<<<1,1,0,spikemonitor_stream1>>>(); + +CUDA_CHECK_ERROR("_init_kernel_spikemonitor_1_codeobject"); +num_blocks = 1; +num_threads = 1; + + + // calculate theoretical occupancy + int max_active_blocks; + CUDA_SAFE_CALL( + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, + _run_kernel_spikemonitor_1_codeobject, num_threads, needed_shared_memory) + ); + + float 
occupancy = (max_active_blocks * num_threads / num_threads_per_warp) / + (float)(max_threads_per_sm / num_threads_per_warp); + + + // check if we have enough ressources to call kernel with given number + // of blocks and threads (can only occur for the else case above as for the + // first max. occupancy) + struct cudaFuncAttributes funcAttrib; + CUDA_SAFE_CALL( + cudaFuncGetAttributes(&funcAttrib, _run_kernel_spikemonitor_1_codeobject) + ); + if (num_threads > funcAttrib.maxThreadsPerBlock) + { + // use the max num_threads before launch failure + num_threads = funcAttrib.maxThreadsPerBlock; + printf("WARNING Not enough ressources available to call " + "_run_kernel_spikemonitor_1_codeobject " + "with maximum possible threads per block (%u). " + "Reducing num_threads to %u. (Kernel needs %i " + "registers per block, %i bytes of " + "statically-allocated shared memory per block, %i " + "bytes of local memory per thread and a total of %i " + "bytes of user-allocated constant memory)\n", + max_threads_per_block, num_threads, funcAttrib.numRegs, + funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, + funcAttrib.constSizeBytes); + + // calculate theoretical occupancy for new num_threads + CUDA_SAFE_CALL( + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, + _run_kernel_spikemonitor_1_codeobject, num_threads, needed_shared_memory) + ); + + occupancy = (max_active_blocks * num_threads / num_threads_per_warp) / + (float)(max_threads_per_sm / num_threads_per_warp); + } + + + else + { + printf("INFO _run_kernel_spikemonitor_1_codeobject\n" + "\t%u blocks\n" + "\t%u threads\n" + "\t%i registers per block\n" + "\t%i bytes statically-allocated shared memory per block\n" + "\t%i bytes local memory per thread\n" + "\t%i bytes user-allocated constant memory\n" + "\t%.3f theoretical occupancy\n", + num_blocks, num_threads, funcAttrib.numRegs, + funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, + funcAttrib.constSizeBytes, occupancy); + } + first_run = false; + } + + +_run_kernel_spikemonitor_1_codeobject<<>>( + _num_spikespace-1, + dev_array_spikemonitor_1_count, + // HOST_PARAMETERS + dev_array_spikemonitor_1_N, + dev_array_neurongroup_i, + dev_array_spikemonitor_1__source_idx, + _array_defaultclock_t[0], + dev_array_neurongroup__spikespace[current_idx_array_neurongroup__spikespace], + dev_array_spikemonitor_1_count, + dev_array_spikemonitor_1_i, + _numi, + dev_array_spikemonitor_1_t, + _numt); + +CUDA_CHECK_ERROR("_run_kernel_spikemonitor_1_codeobject"); + + +} + +__global__ void _debugmsg_kernel_spikemonitor_1_codeobject( + // KERNEL_PARAMETERS + int32_t* _ptr_array_spikemonitor_1_N, + int32_t* _ptr_array_neurongroup_i, + int32_t* _ptr_array_spikemonitor_1__source_idx, + const double _value_array_defaultclock_t, + int32_t* _ptr_array_neurongroup__spikespace, + int32_t* _ptr_array_spikemonitor_1_count, + int32_t* _ptr_array_spikemonitor_1_i, + const int _numi, + double* _ptr_array_spikemonitor_1_t, + const int _numt +) +{ + using namespace brian; + + // KERNEL_CONSTANTS + const int _numN = 1; + const int _num_source_i = 2500; + const int _num_source_idx = 2500; + const int _num_spikespace = 2501; + const int _numcount = 2500; + + ///// kernel_lines ///// + + const double* _ptr_array_defaultclock_t = &_value_array_defaultclock_t; + + + printf("Number of spikes: %d\n", _ptr_array_spikemonitor_1_N[0]); +} + +__global__ void _count_kernel_spikemonitor_1_codeobject( + int* dev_num_events, + // KERNEL_PARAMETERS + int32_t* _ptr_array_spikemonitor_1_N, + int32_t* _ptr_array_neurongroup_i, 
+ int32_t* _ptr_array_spikemonitor_1__source_idx, + const double _value_array_defaultclock_t, + int32_t* _ptr_array_neurongroup__spikespace, + int32_t* _ptr_array_spikemonitor_1_count, + int32_t* _ptr_array_spikemonitor_1_i, + const int _numi, + double* _ptr_array_spikemonitor_1_t, + const int _numt +) +{ + using namespace brian; + // TODO: fix int types, num_events and cudaVector::size() are int but _ptr_array_spikemonitor_1_N[0] is size32_t + int num_events; + + // KERNEL_CONSTANTS + const int _numN = 1; + const int _num_source_i = 2500; + const int _num_source_idx = 2500; + const int _num_spikespace = 2501; + const int _numcount = 2500; + + ///// kernel_lines ///// + + const double* _ptr_array_defaultclock_t = &_value_array_defaultclock_t; + + + num_events = monitor_t->size(); + _ptr_array_spikemonitor_1_N[0] = num_events; + + *dev_num_events = num_events; +} + +__global__ void _copy_kernel_spikemonitor_1_codeobject( + double* dev_monitor_t, + int32_t* dev_monitor_i, + int dummy ) +{ + using namespace brian; + int index = 0; + + // copy monitors + index = 0; + for(int j = 0; j < monitor_t->size(); j++) + { + dev_monitor_t[index] = monitor_t->at(j); + index++; + } + index = 0; + for(int j = 0; j < monitor_i->size(); j++) + { + dev_monitor_i[index] = monitor_i->at(j); + index++; + } +} + +void _copyToHost_spikemonitor_1_codeobject() +{ + using namespace brian; + + const std::clock_t _start_time = std::clock(); + + // TODO: Use the correct dev_eventmonitor_N instead of dev_num_events + // and the correct _array_eventmonitor_N instead of host_num_events. + // use: dev_array_spikemonitor_1_N and _array_spikemonitor_1_N + // dev_array_.. gets copied to _array_... in objects.cu::write_arrays() + // copying it here would result in copying it twice. + // monitor_... and dev_monitor... store the exact same values, but we + // need monitor_... as cudaVector for changing size from device funtions. + // Maybe use cudaVector as default for dynamic arrays, then we would not + // need monitor... at all. This would mean changing the copying in objects.cu + // for dynamic arrays (currently we just use thrust device to host vector). 
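The TODO note above points at the reason these monitors keep a device-side cudaVector (monitor_t, monitor_i) next to the thrust device vectors: a thrust::device_vector can only be resized from the host, while the recording kernel has to append an unknown number of spikes from device code. The snippet below is a minimal, self-contained sketch of that idea, not the real cudaVector; the type name device_push_buffer, the fixed capacity and the main() driver are illustrative assumptions only.

#include <cstdio>

// Stand-in for a device-side vector with an atomic push(); the real
// cudaVector grows its storage instead of using a fixed capacity.
struct device_push_buffer {
    double* data;      // pre-allocated device storage
    int capacity;      // fixed size of that storage (illustration only)
    int count;         // number of elements pushed so far

    __device__ void push(double value)
    {
        int slot = atomicAdd(&count, 1);   // reserve a slot atomically
        if (slot < capacity)
            data[slot] = value;
    }
};

__global__ void record(device_push_buffer* buf, double t)
{
    buf->push(t);                          // each thread appends one sample
}

int main()
{
    device_push_buffer host_buf = {nullptr, 128, 0};
    device_push_buffer* dev_buf;
    cudaMalloc(&host_buf.data, 128 * sizeof(double));
    cudaMalloc(&dev_buf, sizeof(device_push_buffer));
    cudaMemcpy(dev_buf, &host_buf, sizeof(device_push_buffer), cudaMemcpyHostToDevice);

    record<<<1, 32>>>(dev_buf, 0.1);       // 32 concurrent push() calls

    cudaMemcpy(&host_buf, dev_buf, sizeof(device_push_buffer), cudaMemcpyDeviceToHost);
    printf("recorded %d samples\n", host_buf.count);   // prints 32

    cudaFree(host_buf.data);
    cudaFree(dev_buf);
    return 0;
}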
+ int host_num_events; + int* dev_num_events; + + CUDA_SAFE_CALL( + cudaMalloc((void**)&dev_num_events, sizeof(int)) + ); + + // HOST_CONSTANTS + const int _numN = 1; + const int _num_source_i = 2500; + const int _num_source_idx = 2500; + const int _num_spikespace = 2501; + const int _numcount = 2500; + int32_t* const dev_array_spikemonitor_1_i = thrust::raw_pointer_cast(&dev_dynamic_array_spikemonitor_1_i[0]); + const int _numi = dev_dynamic_array_spikemonitor_1_i.size(); + double* const dev_array_spikemonitor_1_t = thrust::raw_pointer_cast(&dev_dynamic_array_spikemonitor_1_t[0]); + const int _numt = dev_dynamic_array_spikemonitor_1_t.size(); + + _count_kernel_spikemonitor_1_codeobject<<<1,1,0,spikemonitor_stream1>>>( + dev_num_events, + // HOST_PARAMETERS + dev_array_spikemonitor_1_N, + dev_array_neurongroup_i, + dev_array_spikemonitor_1__source_idx, + _array_defaultclock_t[0], + dev_array_neurongroup__spikespace[current_idx_array_neurongroup__spikespace], + dev_array_spikemonitor_1_count, + dev_array_spikemonitor_1_i, + _numi, + dev_array_spikemonitor_1_t, + _numt + ); + + CUDA_CHECK_ERROR("_count_kernel_spikemonitor_1_codeobject"); + + CUDA_SAFE_CALL( + cudaMemcpyAsync(&host_num_events, dev_num_events, sizeof(int), cudaMemcpyDeviceToHost,spikemonitor_stream1) + ); + + // resize monitor device vectors + THRUST_CHECK_ERROR( + dev_dynamic_array_spikemonitor_1_t.resize(host_num_events) + ); + THRUST_CHECK_ERROR( + dev_dynamic_array_spikemonitor_1_i.resize(host_num_events) + ); + + _copy_kernel_spikemonitor_1_codeobject<<<1,1,0,spikemonitor_stream1>>>( + thrust::raw_pointer_cast(&dev_dynamic_array_spikemonitor_1_t[0]), + thrust::raw_pointer_cast(&dev_dynamic_array_spikemonitor_1_i[0]), + 0 ); + + CUDA_CHECK_ERROR("_copy_kernel_spikemonitor_1_codeobject"); +} + +void _debugmsg_spikemonitor_1_codeobject() +{ + using namespace brian; + + // HOST_CONSTANTS + const int _numN = 1; + const int _num_source_i = 2500; + const int _num_source_idx = 2500; + const int _num_spikespace = 2501; + const int _numcount = 2500; + int32_t* const dev_array_spikemonitor_1_i = thrust::raw_pointer_cast(&dev_dynamic_array_spikemonitor_1_i[0]); + const int _numi = dev_dynamic_array_spikemonitor_1_i.size(); + double* const dev_array_spikemonitor_1_t = thrust::raw_pointer_cast(&dev_dynamic_array_spikemonitor_1_t[0]); + const int _numt = dev_dynamic_array_spikemonitor_1_t.size(); + + // TODO: can't we acces the correct _array_eventmonitor_N[0] + // value here without any kernel call? 
+ // Yes: use _array_spikemonitor_1_N + _debugmsg_kernel_spikemonitor_1_codeobject<<<1,1,0,spikemonitor_stream1>>>( + // HOST_PARAMETERS + dev_array_spikemonitor_1_N, + dev_array_neurongroup_i, + dev_array_spikemonitor_1__source_idx, + _array_defaultclock_t[0], + dev_array_neurongroup__spikespace[current_idx_array_neurongroup__spikespace], + dev_array_spikemonitor_1_count, + dev_array_spikemonitor_1_i, + _numi, + dev_array_spikemonitor_1_t, + _numt + ); + + CUDA_CHECK_ERROR("_debugmsg_kernel_spikemonitor_1_codeobject"); +} + diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/spikemonitor_1_codeobject.h b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/spikemonitor_1_codeobject.h new file mode 100644 index 00000000..2b55c3e2 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/spikemonitor_1_codeobject.h @@ -0,0 +1,9 @@ +#ifndef _INCLUDED_spikemonitor_1_codeobject +#define _INCLUDED_spikemonitor_1_codeobject + +void _run_spikemonitor_1_codeobject(); + +void _copyToHost_spikemonitor_1_codeobject(); +void _debugmsg_spikemonitor_1_codeobject(); + +#endif diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/spikemonitor_2_codeobject.cu b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/spikemonitor_2_codeobject.cu new file mode 100644 index 00000000..12651158 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/spikemonitor_2_codeobject.cu @@ -0,0 +1,595 @@ +#include "code_objects/spikemonitor_2_codeobject.h" +#include "objects.h" +#include "brianlib/common_math.h" +#include "brianlib/cuda_utils.h" +#include "brianlib/stdint_compat.h" +#include +#include +#include +#include + + + +////// SUPPORT CODE /////// +namespace { + double _host_rand(const int _vectorisation_idx); + double _host_randn(const int _vectorisation_idx); + int32_t _host_poisson(double _lambda, const int _vectorisation_idx); + + ///// block extra_device_helper ///// + // declare monitor cudaVectors + __device__ cudaVector* monitor_t; + // declare monitor cudaVectors + __device__ cudaVector* monitor_i; + + ///// support_code_lines ///// + + template < typename T1, typename T2 > struct _higher_type; + template < > struct _higher_type { typedef int type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct 
_higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < typename T1, typename T2 > + __host__ __device__ static inline typename _higher_type::type + _brian_mod(T1 x, T2 y) + {{ + return x-y*floor(1.0*x/y); + }} + template < typename T1, typename T2 > + __host__ __device__ static inline typename _higher_type::type + _brian_floordiv(T1 x, T2 y) + {{ + return floor(1.0*x/y); + }} + #ifdef _MSC_VER + #define _brian_pow(x, y) (pow((double)(x), (y))) + #else + #define _brian_pow(x, y) (pow((x), (y))) + #endif + inline __device__ int _brian_atomicAdd(int* address, int val) + { + // hardware implementation + return atomicAdd(address, val); + } + inline __device__ float _brian_atomicAdd(float* address, float val) + { + // hardware implementation + return atomicAdd(address, val); + } + inline __device__ double _brian_atomicAdd(double* address, double val) + { + #if (__CUDA_ARCH__ >= 600) + // hardware implementation + return atomicAdd(address, val); + #else + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val + + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + #endif + } + inline __device__ int _brian_atomicMul(int* address, int val) + { + // software implementation + int old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, val * assumed); + } while (assumed != old); + return old; + } + inline __device__ float _brian_atomicMul(float* address, float val) + { + // software implementation + int* address_as_int = (int*)address; + int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __float_as_int(val * + __int_as_float(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __int_as_float(old); + } + inline __device__ double _brian_atomicMul(double* address, double val) + { + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val * + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + } + inline __device__ int _brian_atomicDiv(int* address, int val) + { + // software implementation + int old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, val / assumed); + } while (assumed != old); + return old; + } + inline __device__ float _brian_atomicDiv(float* address, float val) + { + // software implementation + int* address_as_int = (int*)address; + int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __float_as_int(val / + __int_as_float(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since 
NaN != NaN) + } while (assumed != old); + return __int_as_float(old); + } + inline __device__ double _brian_atomicDiv(double* address, double val) + { + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val / + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + } + + + // Implement dummy functions such that the host compiled code of binomial + // functions works. Hacky, hacky ... + double _host_rand(const int _vectorisation_idx) + { + printf("ERROR: Called dummy function `_host_rand` in %s:%d\n", __FILE__, + __LINE__); + exit(EXIT_FAILURE); + } + double _host_randn(const int _vectorisation_idx) + { + printf("ERROR: Called dummy function `_host_rand` in %s:%d\n", __FILE__, + __LINE__); + exit(EXIT_FAILURE); + } + int32_t _host_poisson(double _lambda, const int _vectorisation_idx) + { + printf("ERROR: Called dummy function `_host_poisson` in %s:%d\n", __FILE__, + __LINE__); + exit(EXIT_FAILURE); + } +} + +////// hashdefine_lines /////// + + + +__global__ void _init_kernel_spikemonitor_2_codeobject() +{ + monitor_t = new cudaVector(); + monitor_i = new cudaVector(); +} + +__global__ void +_run_kernel_spikemonitor_2_codeobject( + int neurongroup_N, + int32_t* count, + // KERNEL_PARAMETERS + int32_t* _ptr_array_spikemonitor_2_N, + int32_t* _ptr_array_neurongroup_1_i, + int32_t* _ptr_array_spikemonitor_2__source_idx, + const double _value_array_defaultclock_t, + int32_t* _ptr_array_neurongroup_1__spikespace, + int32_t* _ptr_array_spikemonitor_2_count, + int32_t* _ptr_array_spikemonitor_2_i, + const int _numi, + double* _ptr_array_spikemonitor_2_t, + const int _numt + ) +{ + using namespace brian; + int tid = threadIdx.x; + int bid = blockIdx.x; + + + // KERNEL_CONSTANTS + const int _numN = 1; + const int _num_source_i = 100; + const int _num_source_idx = 100; + const int _num_spikespace = 101; + const int _numcount = 100; + + ///// kernel_lines ///// + + const double* _ptr_array_defaultclock_t = &_value_array_defaultclock_t; + + + // scalar_code + + + + // using not parallel spikespace: filled from left with all spiking neuron IDs, -1 ends the list + for(int i = 0; i < neurongroup_N; i++) + { + int32_t spiking_neuron = _ptr_array_neurongroup_1__spikespace[i]; + if(spiking_neuron != -1) + { + if(0 <= spiking_neuron && spiking_neuron < 100) + { + int _idx = spiking_neuron; + int _vectorisation_idx = _idx; + + // vector_code + + const int32_t _source_i = _ptr_array_neurongroup_1_i[_idx]; + const double _source_t = _ptr_array_defaultclock_t[0]; + const double _to_record_t = _source_t; + const int32_t _to_record_i = _source_i; + + + // push to monitors + monitor_t->push(_to_record_t); + monitor_i->push(_to_record_i); + + count[_idx -0]++; + + } + } + else + { + + break; + } + } +} + + +void _run_spikemonitor_2_codeobject() +{ + using namespace brian; + + + const int _N = _array_spikemonitor_2_N[0]; + + ///// HOST_CONSTANTS /////////// + const int _numN = 1; + const int _num_source_i = 100; + const int _num_source_idx = 100; + const int _num_spikespace = 101; + const int _numcount = 100; + int32_t* const dev_array_spikemonitor_2_i = thrust::raw_pointer_cast(&dev_dynamic_array_spikemonitor_2_i[0]); + const int _numi = dev_dynamic_array_spikemonitor_2_i.size(); + double* const 
dev_array_spikemonitor_2_t = thrust::raw_pointer_cast(&dev_dynamic_array_spikemonitor_2_t[0]); + const int _numt = dev_dynamic_array_spikemonitor_2_t.size(); + + + static int num_threads, num_blocks; + static size_t needed_shared_memory = 0; + static bool first_run = true; + if (first_run) + { +_init_kernel_spikemonitor_2_codeobject<<<1,1,0,spikemonitor_stream2>>>(); + +CUDA_CHECK_ERROR("_init_kernel_spikemonitor_2_codeobject"); +num_blocks = 1; +num_threads = 1; + + + // calculate theoretical occupancy + int max_active_blocks; + CUDA_SAFE_CALL( + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, + _run_kernel_spikemonitor_2_codeobject, num_threads, needed_shared_memory) + ); + + float occupancy = (max_active_blocks * num_threads / num_threads_per_warp) / + (float)(max_threads_per_sm / num_threads_per_warp); + + + // check if we have enough ressources to call kernel with given number + // of blocks and threads (can only occur for the else case above as for the + // first max. occupancy) + struct cudaFuncAttributes funcAttrib; + CUDA_SAFE_CALL( + cudaFuncGetAttributes(&funcAttrib, _run_kernel_spikemonitor_2_codeobject) + ); + if (num_threads > funcAttrib.maxThreadsPerBlock) + { + // use the max num_threads before launch failure + num_threads = funcAttrib.maxThreadsPerBlock; + printf("WARNING Not enough ressources available to call " + "_run_kernel_spikemonitor_2_codeobject " + "with maximum possible threads per block (%u). " + "Reducing num_threads to %u. (Kernel needs %i " + "registers per block, %i bytes of " + "statically-allocated shared memory per block, %i " + "bytes of local memory per thread and a total of %i " + "bytes of user-allocated constant memory)\n", + max_threads_per_block, num_threads, funcAttrib.numRegs, + funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, + funcAttrib.constSizeBytes); + + // calculate theoretical occupancy for new num_threads + CUDA_SAFE_CALL( + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, + _run_kernel_spikemonitor_2_codeobject, num_threads, needed_shared_memory) + ); + + occupancy = (max_active_blocks * num_threads / num_threads_per_warp) / + (float)(max_threads_per_sm / num_threads_per_warp); + } + + + else + { + printf("INFO _run_kernel_spikemonitor_2_codeobject\n" + "\t%u blocks\n" + "\t%u threads\n" + "\t%i registers per block\n" + "\t%i bytes statically-allocated shared memory per block\n" + "\t%i bytes local memory per thread\n" + "\t%i bytes user-allocated constant memory\n" + "\t%.3f theoretical occupancy\n", + num_blocks, num_threads, funcAttrib.numRegs, + funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, + funcAttrib.constSizeBytes, occupancy); + } + first_run = false; + } + + +_run_kernel_spikemonitor_2_codeobject<<>>( + _num_spikespace-1, + dev_array_spikemonitor_2_count, + // HOST_PARAMETERS + dev_array_spikemonitor_2_N, + dev_array_neurongroup_1_i, + dev_array_spikemonitor_2__source_idx, + _array_defaultclock_t[0], + dev_array_neurongroup_1__spikespace[current_idx_array_neurongroup_1__spikespace], + dev_array_spikemonitor_2_count, + dev_array_spikemonitor_2_i, + _numi, + dev_array_spikemonitor_2_t, + _numt); + +CUDA_CHECK_ERROR("_run_kernel_spikemonitor_2_codeobject"); + + +} + +__global__ void _debugmsg_kernel_spikemonitor_2_codeobject( + // KERNEL_PARAMETERS + int32_t* _ptr_array_spikemonitor_2_N, + int32_t* _ptr_array_neurongroup_1_i, + int32_t* _ptr_array_spikemonitor_2__source_idx, + const double _value_array_defaultclock_t, + int32_t* _ptr_array_neurongroup_1__spikespace, + int32_t* 
_ptr_array_spikemonitor_2_count, + int32_t* _ptr_array_spikemonitor_2_i, + const int _numi, + double* _ptr_array_spikemonitor_2_t, + const int _numt +) +{ + using namespace brian; + + // KERNEL_CONSTANTS + const int _numN = 1; + const int _num_source_i = 100; + const int _num_source_idx = 100; + const int _num_spikespace = 101; + const int _numcount = 100; + + ///// kernel_lines ///// + + const double* _ptr_array_defaultclock_t = &_value_array_defaultclock_t; + + + printf("Number of spikes: %d\n", _ptr_array_spikemonitor_2_N[0]); +} + +__global__ void _count_kernel_spikemonitor_2_codeobject( + int* dev_num_events, + // KERNEL_PARAMETERS + int32_t* _ptr_array_spikemonitor_2_N, + int32_t* _ptr_array_neurongroup_1_i, + int32_t* _ptr_array_spikemonitor_2__source_idx, + const double _value_array_defaultclock_t, + int32_t* _ptr_array_neurongroup_1__spikespace, + int32_t* _ptr_array_spikemonitor_2_count, + int32_t* _ptr_array_spikemonitor_2_i, + const int _numi, + double* _ptr_array_spikemonitor_2_t, + const int _numt +) +{ + using namespace brian; + // TODO: fix int types, num_events and cudaVector::size() are int but _ptr_array_spikemonitor_2_N[0] is size32_t + int num_events; + + // KERNEL_CONSTANTS + const int _numN = 1; + const int _num_source_i = 100; + const int _num_source_idx = 100; + const int _num_spikespace = 101; + const int _numcount = 100; + + ///// kernel_lines ///// + + const double* _ptr_array_defaultclock_t = &_value_array_defaultclock_t; + + + num_events = monitor_t->size(); + _ptr_array_spikemonitor_2_N[0] = num_events; + + *dev_num_events = num_events; +} + +__global__ void _copy_kernel_spikemonitor_2_codeobject( + double* dev_monitor_t, + int32_t* dev_monitor_i, + int dummy ) +{ + using namespace brian; + int index = 0; + + // copy monitors + index = 0; + for(int j = 0; j < monitor_t->size(); j++) + { + dev_monitor_t[index] = monitor_t->at(j); + index++; + } + index = 0; + for(int j = 0; j < monitor_i->size(); j++) + { + dev_monitor_i[index] = monitor_i->at(j); + index++; + } +} + +void _copyToHost_spikemonitor_2_codeobject() +{ + using namespace brian; + + const std::clock_t _start_time = std::clock(); + + // TODO: Use the correct dev_eventmonitor_N instead of dev_num_events + // and the correct _array_eventmonitor_N instead of host_num_events. + // use: dev_array_spikemonitor_2_N and _array_spikemonitor_2_N + // dev_array_.. gets copied to _array_... in objects.cu::write_arrays() + // copying it here would result in copying it twice. + // monitor_... and dev_monitor... store the exact same values, but we + // need monitor_... as cudaVector for changing size from device funtions. + // Maybe use cudaVector as default for dynamic arrays, then we would not + // need monitor... at all. This would mean changing the copying in objects.cu + // for dynamic arrays (currently we just use thrust device to host vector). 
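The calls that follow hand the recorded data over to the thrust arrays in three steps: a one-thread kernel publishes the number of recorded events, the host copies that count back and resizes the thrust vectors, and only then a second kernel writes the recorded values into the freshly sized arrays. Below is a compact, self-contained sketch of the same flow; the names g_num_recorded, g_recorded, record_kernel, count_kernel, copy_kernel and dev_times are illustrative stand-ins, and the dedicated stream is left out for brevity.

#include <thrust/device_vector.h>
#include <cstdio>

__device__ int    g_num_recorded;        // stands in for monitor_t->size()
__device__ double g_recorded[1024];      // stands in for the device cudaVector

__global__ void record_kernel(double t)
{
    int slot = atomicAdd(&g_num_recorded, 1);   // emulates monitor_t->push(t)
    g_recorded[slot] = t;
}

__global__ void count_kernel(int* dev_num_events)
{
    *dev_num_events = g_num_recorded;    // publish the element count
}

__global__ void copy_kernel(double* dst)
{
    for (int j = 0; j < g_num_recorded; j++)
        dst[j] = g_recorded[j];          // serial copy, as in the generated code
}

int main()
{
    thrust::device_vector<double> dev_times;   // initially empty
    int  host_num_events = 0;
    int* dev_num_events  = nullptr;
    cudaMalloc(&dev_num_events, sizeof(int));

    record_kernel<<<1, 8>>>(0.5);              // pretend 8 spikes were recorded

    count_kernel<<<1, 1>>>(dev_num_events);
    cudaMemcpy(&host_num_events, dev_num_events, sizeof(int), cudaMemcpyDeviceToHost);

    dev_times.resize(host_num_events);         // size now matches the count
    copy_kernel<<<1, 1>>>(thrust::raw_pointer_cast(dev_times.data()));
    cudaDeviceSynchronize();

    printf("copied %d recorded values\n", host_num_events);
    cudaFree(dev_num_events);
    return 0;
}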
+ int host_num_events; + int* dev_num_events; + + CUDA_SAFE_CALL( + cudaMalloc((void**)&dev_num_events, sizeof(int)) + ); + + // HOST_CONSTANTS + const int _numN = 1; + const int _num_source_i = 100; + const int _num_source_idx = 100; + const int _num_spikespace = 101; + const int _numcount = 100; + int32_t* const dev_array_spikemonitor_2_i = thrust::raw_pointer_cast(&dev_dynamic_array_spikemonitor_2_i[0]); + const int _numi = dev_dynamic_array_spikemonitor_2_i.size(); + double* const dev_array_spikemonitor_2_t = thrust::raw_pointer_cast(&dev_dynamic_array_spikemonitor_2_t[0]); + const int _numt = dev_dynamic_array_spikemonitor_2_t.size(); + + _count_kernel_spikemonitor_2_codeobject<<<1,1,0,spikemonitor_stream2>>>( + dev_num_events, + // HOST_PARAMETERS + dev_array_spikemonitor_2_N, + dev_array_neurongroup_1_i, + dev_array_spikemonitor_2__source_idx, + _array_defaultclock_t[0], + dev_array_neurongroup_1__spikespace[current_idx_array_neurongroup_1__spikespace], + dev_array_spikemonitor_2_count, + dev_array_spikemonitor_2_i, + _numi, + dev_array_spikemonitor_2_t, + _numt + ); + + CUDA_CHECK_ERROR("_count_kernel_spikemonitor_2_codeobject"); + + CUDA_SAFE_CALL( + cudaMemcpyAsync(&host_num_events, dev_num_events, sizeof(int), cudaMemcpyDeviceToHost,spikemonitor_stream2) + ); + + // resize monitor device vectors + THRUST_CHECK_ERROR( + dev_dynamic_array_spikemonitor_2_t.resize(host_num_events) + ); + THRUST_CHECK_ERROR( + dev_dynamic_array_spikemonitor_2_i.resize(host_num_events) + ); + + _copy_kernel_spikemonitor_2_codeobject<<<1,1,0,spikemonitor_stream2>>>( + thrust::raw_pointer_cast(&dev_dynamic_array_spikemonitor_2_t[0]), + thrust::raw_pointer_cast(&dev_dynamic_array_spikemonitor_2_i[0]), + 0 ); + + CUDA_CHECK_ERROR("_copy_kernel_spikemonitor_2_codeobject"); +} + +void _debugmsg_spikemonitor_2_codeobject() +{ + using namespace brian; + + // HOST_CONSTANTS + const int _numN = 1; + const int _num_source_i = 100; + const int _num_source_idx = 100; + const int _num_spikespace = 101; + const int _numcount = 100; + int32_t* const dev_array_spikemonitor_2_i = thrust::raw_pointer_cast(&dev_dynamic_array_spikemonitor_2_i[0]); + const int _numi = dev_dynamic_array_spikemonitor_2_i.size(); + double* const dev_array_spikemonitor_2_t = thrust::raw_pointer_cast(&dev_dynamic_array_spikemonitor_2_t[0]); + const int _numt = dev_dynamic_array_spikemonitor_2_t.size(); + + // TODO: can't we acces the correct _array_eventmonitor_N[0] + // value here without any kernel call? 
+ // Yes: use _array_spikemonitor_2_N + _debugmsg_kernel_spikemonitor_2_codeobject<<<1,1,0,spikemonitor_stream2>>>( + // HOST_PARAMETERS + dev_array_spikemonitor_2_N, + dev_array_neurongroup_1_i, + dev_array_spikemonitor_2__source_idx, + _array_defaultclock_t[0], + dev_array_neurongroup_1__spikespace[current_idx_array_neurongroup_1__spikespace], + dev_array_spikemonitor_2_count, + dev_array_spikemonitor_2_i, + _numi, + dev_array_spikemonitor_2_t, + _numt + ); + + CUDA_CHECK_ERROR("_debugmsg_kernel_spikemonitor_2_codeobject"); + CUDA_SAFE_CALL(cudaDeviceSynchronize()); +} + + diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/spikemonitor_2_codeobject.h b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/spikemonitor_2_codeobject.h new file mode 100644 index 00000000..f803e377 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/spikemonitor_2_codeobject.h @@ -0,0 +1,9 @@ +#ifndef _INCLUDED_spikemonitor_2_codeobject +#define _INCLUDED_spikemonitor_2_codeobject + +void _run_spikemonitor_2_codeobject(); + +void _copyToHost_spikemonitor_2_codeobject(); +void _debugmsg_spikemonitor_2_codeobject(); + +#endif diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/spikemonitor_codeobject.cu b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/spikemonitor_codeobject.cu new file mode 100644 index 00000000..22adb85e --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/spikemonitor_codeobject.cu @@ -0,0 +1,593 @@ +#include "code_objects/spikemonitor_codeobject.h" +#include "objects.h" +#include "brianlib/common_math.h" +#include "brianlib/cuda_utils.h" +#include "brianlib/stdint_compat.h" +#include +#include +#include +#include + + + +////// SUPPORT CODE /////// +namespace { + double _host_rand(const int _vectorisation_idx); + double _host_randn(const int _vectorisation_idx); + int32_t _host_poisson(double _lambda, const int _vectorisation_idx); + + ///// block extra_device_helper ///// + // declare monitor cudaVectors + __device__ cudaVector* monitor_t; + // declare monitor cudaVectors + __device__ cudaVector* monitor_i; + + ///// support_code_lines ///// + + template < typename T1, typename T2 > struct _higher_type; + template < > struct _higher_type { typedef int type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef 
float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < typename T1, typename T2 > + __host__ __device__ static inline typename _higher_type::type + _brian_mod(T1 x, T2 y) + {{ + return x-y*floor(1.0*x/y); + }} + template < typename T1, typename T2 > + __host__ __device__ static inline typename _higher_type::type + _brian_floordiv(T1 x, T2 y) + {{ + return floor(1.0*x/y); + }} + #ifdef _MSC_VER + #define _brian_pow(x, y) (pow((double)(x), (y))) + #else + #define _brian_pow(x, y) (pow((x), (y))) + #endif + inline __device__ int _brian_atomicAdd(int* address, int val) + { + // hardware implementation + return atomicAdd(address, val); + } + inline __device__ float _brian_atomicAdd(float* address, float val) + { + // hardware implementation + return atomicAdd(address, val); + } + inline __device__ double _brian_atomicAdd(double* address, double val) + { + #if (__CUDA_ARCH__ >= 600) + // hardware implementation + return atomicAdd(address, val); + #else + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val + + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + #endif + } + inline __device__ int _brian_atomicMul(int* address, int val) + { + // software implementation + int old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, val * assumed); + } while (assumed != old); + return old; + } + inline __device__ float _brian_atomicMul(float* address, float val) + { + // software implementation + int* address_as_int = (int*)address; + int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __float_as_int(val * + __int_as_float(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __int_as_float(old); + } + inline __device__ double _brian_atomicMul(double* address, double val) + { + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val * + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + } + inline __device__ int _brian_atomicDiv(int* address, int val) + { + // software implementation + int old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, val / assumed); + } while (assumed != old); + return old; + } + inline __device__ float _brian_atomicDiv(float* address, float val) + { + // software implementation + int* address_as_int = (int*)address; + int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __float_as_int(val / + __int_as_float(assumed))); + // Note: uses integer comparison 
to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __int_as_float(old); + } + inline __device__ double _brian_atomicDiv(double* address, double val) + { + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val / + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + } + + + // Implement dummy functions such that the host compiled code of binomial + // functions works. Hacky, hacky ... + double _host_rand(const int _vectorisation_idx) + { + printf("ERROR: Called dummy function `_host_rand` in %s:%d\n", __FILE__, + __LINE__); + exit(EXIT_FAILURE); + } + double _host_randn(const int _vectorisation_idx) + { + printf("ERROR: Called dummy function `_host_rand` in %s:%d\n", __FILE__, + __LINE__); + exit(EXIT_FAILURE); + } + int32_t _host_poisson(double _lambda, const int _vectorisation_idx) + { + printf("ERROR: Called dummy function `_host_poisson` in %s:%d\n", __FILE__, + __LINE__); + exit(EXIT_FAILURE); + } +} + +////// hashdefine_lines /////// + + + +__global__ void _init_kernel_spikemonitor_codeobject() +{ + monitor_t = new cudaVector(); + monitor_i = new cudaVector(); +} + +__global__ void +_run_kernel_spikemonitor_codeobject( + int neurongroup_N, + int32_t* count, + // KERNEL_PARAMETERS + int32_t* _ptr_array_spikemonitor_N, + int32_t* _ptr_array_spikegeneratorgroup_i, + int32_t* _ptr_array_spikemonitor__source_idx, + const double _value_array_defaultclock_t, + int32_t* _ptr_array_spikegeneratorgroup__spikespace, + int32_t* _ptr_array_spikemonitor_count, + int32_t* _ptr_array_spikemonitor_i, + const int _numi, + double* _ptr_array_spikemonitor_t, + const int _numt + ) +{ + using namespace brian; + int tid = threadIdx.x; + int bid = blockIdx.x; + + + // KERNEL_CONSTANTS + const int _numN = 1; + const int _num_source_i = 100; + const int _num_source_idx = 100; + const int _num_spikespace = 101; + const int _numcount = 100; + + ///// kernel_lines ///// + + const double* _ptr_array_defaultclock_t = &_value_array_defaultclock_t; + + + // scalar_code + + + + // using not parallel spikespace: filled from left with all spiking neuron IDs, -1 ends the list + for(int i = 0; i < neurongroup_N; i++) + { + int32_t spiking_neuron = _ptr_array_spikegeneratorgroup__spikespace[i]; + if(spiking_neuron != -1) + { + if(0 <= spiking_neuron && spiking_neuron < 100) + { + int _idx = spiking_neuron; + int _vectorisation_idx = _idx; + + // vector_code + + const int32_t _source_i = _ptr_array_spikegeneratorgroup_i[_idx]; + const double _source_t = _ptr_array_defaultclock_t[0]; + const double _to_record_t = _source_t; + const int32_t _to_record_i = _source_i; + + + // push to monitors + monitor_t->push(_to_record_t); + monitor_i->push(_to_record_i); + + count[_idx -0]++; + + } + } + else + { + + break; + } + } +} + + +void _run_spikemonitor_codeobject() +{ + using namespace brian; + + + const int _N = _array_spikemonitor_N[0]; + + ///// HOST_CONSTANTS /////////// + const int _numN = 1; + const int _num_source_i = 100; + const int _num_source_idx = 100; + const int _num_spikespace = 101; + const int _numcount = 100; + int32_t* const dev_array_spikemonitor_i = thrust::raw_pointer_cast(&dev_dynamic_array_spikemonitor_i[0]); + const int _numi = 
dev_dynamic_array_spikemonitor_i.size(); + double* const dev_array_spikemonitor_t = thrust::raw_pointer_cast(&dev_dynamic_array_spikemonitor_t[0]); + const int _numt = dev_dynamic_array_spikemonitor_t.size(); + + + static int num_threads, num_blocks; + static size_t needed_shared_memory = 0; + static bool first_run = true; + if (first_run) + { +_init_kernel_spikemonitor_codeobject<<<1,1,0,spikemonitor_stream>>>(); + +CUDA_CHECK_ERROR("_init_kernel_spikemonitor_codeobject"); +num_blocks = 1; +num_threads = 1; + + + // calculate theoretical occupancy + int max_active_blocks; + CUDA_SAFE_CALL( + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, + _run_kernel_spikemonitor_codeobject, num_threads, needed_shared_memory) + ); + + float occupancy = (max_active_blocks * num_threads / num_threads_per_warp) / + (float)(max_threads_per_sm / num_threads_per_warp); + + + // check if we have enough ressources to call kernel with given number + // of blocks and threads (can only occur for the else case above as for the + // first max. occupancy) + struct cudaFuncAttributes funcAttrib; + CUDA_SAFE_CALL( + cudaFuncGetAttributes(&funcAttrib, _run_kernel_spikemonitor_codeobject) + ); + if (num_threads > funcAttrib.maxThreadsPerBlock) + { + // use the max num_threads before launch failure + num_threads = funcAttrib.maxThreadsPerBlock; + printf("WARNING Not enough ressources available to call " + "_run_kernel_spikemonitor_codeobject " + "with maximum possible threads per block (%u). " + "Reducing num_threads to %u. (Kernel needs %i " + "registers per block, %i bytes of " + "statically-allocated shared memory per block, %i " + "bytes of local memory per thread and a total of %i " + "bytes of user-allocated constant memory)\n", + max_threads_per_block, num_threads, funcAttrib.numRegs, + funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, + funcAttrib.constSizeBytes); + + // calculate theoretical occupancy for new num_threads + CUDA_SAFE_CALL( + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, + _run_kernel_spikemonitor_codeobject, num_threads, needed_shared_memory) + ); + + occupancy = (max_active_blocks * num_threads / num_threads_per_warp) / + (float)(max_threads_per_sm / num_threads_per_warp); + } + + + else + { + printf("INFO _run_kernel_spikemonitor_codeobject\n" + "\t%u blocks\n" + "\t%u threads\n" + "\t%i registers per block\n" + "\t%i bytes statically-allocated shared memory per block\n" + "\t%i bytes local memory per thread\n" + "\t%i bytes user-allocated constant memory\n" + "\t%.3f theoretical occupancy\n", + num_blocks, num_threads, funcAttrib.numRegs, + funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, + funcAttrib.constSizeBytes, occupancy); + } + first_run = false; + } + + +_run_kernel_spikemonitor_codeobject<<>>( + _num_spikespace-1, + dev_array_spikemonitor_count, + // HOST_PARAMETERS + dev_array_spikemonitor_N, + dev_array_spikegeneratorgroup_i, + dev_array_spikemonitor__source_idx, + _array_defaultclock_t[0], + dev_array_spikegeneratorgroup__spikespace[current_idx_array_spikegeneratorgroup__spikespace], + dev_array_spikemonitor_count, + dev_array_spikemonitor_i, + _numi, + dev_array_spikemonitor_t, + _numt); + +CUDA_CHECK_ERROR("_run_kernel_spikemonitor_codeobject"); + + +} + +__global__ void _debugmsg_kernel_spikemonitor_codeobject( + // KERNEL_PARAMETERS + int32_t* _ptr_array_spikemonitor_N, + int32_t* _ptr_array_spikegeneratorgroup_i, + int32_t* _ptr_array_spikemonitor__source_idx, + const double _value_array_defaultclock_t, + int32_t* 
_ptr_array_spikegeneratorgroup__spikespace, + int32_t* _ptr_array_spikemonitor_count, + int32_t* _ptr_array_spikemonitor_i, + const int _numi, + double* _ptr_array_spikemonitor_t, + const int _numt +) +{ + using namespace brian; + + // KERNEL_CONSTANTS + const int _numN = 1; + const int _num_source_i = 100; + const int _num_source_idx = 100; + const int _num_spikespace = 101; + const int _numcount = 100; + + ///// kernel_lines ///// + + const double* _ptr_array_defaultclock_t = &_value_array_defaultclock_t; + + + printf("Number of spikes: %d\n", _ptr_array_spikemonitor_N[0]); +} + +__global__ void _count_kernel_spikemonitor_codeobject( + int* dev_num_events, + // KERNEL_PARAMETERS + int32_t* _ptr_array_spikemonitor_N, + int32_t* _ptr_array_spikegeneratorgroup_i, + int32_t* _ptr_array_spikemonitor__source_idx, + const double _value_array_defaultclock_t, + int32_t* _ptr_array_spikegeneratorgroup__spikespace, + int32_t* _ptr_array_spikemonitor_count, + int32_t* _ptr_array_spikemonitor_i, + const int _numi, + double* _ptr_array_spikemonitor_t, + const int _numt +) +{ + using namespace brian; + // TODO: fix int types, num_events and cudaVector::size() are int but _ptr_array_spikemonitor_N[0] is size32_t + int num_events; + + // KERNEL_CONSTANTS + const int _numN = 1; + const int _num_source_i = 100; + const int _num_source_idx = 100; + const int _num_spikespace = 101; + const int _numcount = 100; + + ///// kernel_lines ///// + + const double* _ptr_array_defaultclock_t = &_value_array_defaultclock_t; + + + num_events = monitor_t->size(); + _ptr_array_spikemonitor_N[0] = num_events; + + *dev_num_events = num_events; +} + +__global__ void _copy_kernel_spikemonitor_codeobject( + double* dev_monitor_t, + int32_t* dev_monitor_i, + int dummy ) +{ + using namespace brian; + int index = 0; + + // copy monitors + index = 0; + for(int j = 0; j < monitor_t->size(); j++) + { + dev_monitor_t[index] = monitor_t->at(j); + index++; + } + index = 0; + for(int j = 0; j < monitor_i->size(); j++) + { + dev_monitor_i[index] = monitor_i->at(j); + index++; + } +} + +void _copyToHost_spikemonitor_codeobject() +{ + using namespace brian; + + const std::clock_t _start_time = std::clock(); + + // TODO: Use the correct dev_eventmonitor_N instead of dev_num_events + // and the correct _array_eventmonitor_N instead of host_num_events. + // use: dev_array_spikemonitor_N and _array_spikemonitor_N + // dev_array_.. gets copied to _array_... in objects.cu::write_arrays() + // copying it here would result in copying it twice. + // monitor_... and dev_monitor... store the exact same values, but we + // need monitor_... as cudaVector for changing size from device funtions. + // Maybe use cudaVector as default for dynamic arrays, then we would not + // need monitor... at all. This would mean changing the copying in objects.cu + // for dynamic arrays (currently we just use thrust device to host vector). 
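Note that every kernel launch and the asynchronous copy in this code object are issued into the dedicated spikemonitor_stream rather than the default stream, which is the point of this parallelization change: work queued into one stream executes in issue order, so the count kernel is guaranteed to complete before the copy of its result is started, while code objects on other streams are free to overlap. A minimal, self-contained sketch of that ordering guarantee follows; the kernel and stream names are illustrative, not the generated ones.

#include <cstdio>

__global__ void tagged_kernel(int tag) { printf("kernel %d\n", tag); }

int main()
{
    cudaStream_t monitor_stream, other_stream;
    cudaStreamCreate(&monitor_stream);
    cudaStreamCreate(&other_stream);

    tagged_kernel<<<1, 1, 0, monitor_stream>>>(1);  // runs before ...
    tagged_kernel<<<1, 1, 0, monitor_stream>>>(2);  // ... this one (same stream)
    tagged_kernel<<<1, 1, 0, other_stream>>>(3);    // may overlap with both

    cudaStreamSynchronize(monitor_stream);          // wait for monitor work only
    cudaDeviceSynchronize();                        // flush remaining work and printf

    cudaStreamDestroy(monitor_stream);
    cudaStreamDestroy(other_stream);
    return 0;
}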
+ int host_num_events; + int* dev_num_events; + + CUDA_SAFE_CALL( + cudaMalloc((void**)&dev_num_events, sizeof(int)) + ); + + // HOST_CONSTANTS + const int _numN = 1; + const int _num_source_i = 100; + const int _num_source_idx = 100; + const int _num_spikespace = 101; + const int _numcount = 100; + int32_t* const dev_array_spikemonitor_i = thrust::raw_pointer_cast(&dev_dynamic_array_spikemonitor_i[0]); + const int _numi = dev_dynamic_array_spikemonitor_i.size(); + double* const dev_array_spikemonitor_t = thrust::raw_pointer_cast(&dev_dynamic_array_spikemonitor_t[0]); + const int _numt = dev_dynamic_array_spikemonitor_t.size(); + + _count_kernel_spikemonitor_codeobject<<<1,1,0,spikemonitor_stream>>>( + dev_num_events, + // HOST_PARAMETERS + dev_array_spikemonitor_N, + dev_array_spikegeneratorgroup_i, + dev_array_spikemonitor__source_idx, + _array_defaultclock_t[0], + dev_array_spikegeneratorgroup__spikespace[current_idx_array_spikegeneratorgroup__spikespace], + dev_array_spikemonitor_count, + dev_array_spikemonitor_i, + _numi, + dev_array_spikemonitor_t, + _numt + ); + + CUDA_CHECK_ERROR("_count_kernel_spikemonitor_codeobject"); + + CUDA_SAFE_CALL( + cudaMemcpyAsync(&host_num_events, dev_num_events, sizeof(int), cudaMemcpyDeviceToHost,spikemonitor_stream) + ); + + // resize monitor device vectors + THRUST_CHECK_ERROR( + dev_dynamic_array_spikemonitor_t.resize(host_num_events) + ); + THRUST_CHECK_ERROR( + dev_dynamic_array_spikemonitor_i.resize(host_num_events) + ); + + _copy_kernel_spikemonitor_codeobject<<<1,1,0,spikemonitor_stream>>>( + thrust::raw_pointer_cast(&dev_dynamic_array_spikemonitor_t[0]), + thrust::raw_pointer_cast(&dev_dynamic_array_spikemonitor_i[0]), + 0 ); + + CUDA_CHECK_ERROR("_copy_kernel_spikemonitor_codeobject"); +} + +void _debugmsg_spikemonitor_codeobject() +{ + using namespace brian; + + // HOST_CONSTANTS + const int _numN = 1; + const int _num_source_i = 100; + const int _num_source_idx = 100; + const int _num_spikespace = 101; + const int _numcount = 100; + int32_t* const dev_array_spikemonitor_i = thrust::raw_pointer_cast(&dev_dynamic_array_spikemonitor_i[0]); + const int _numi = dev_dynamic_array_spikemonitor_i.size(); + double* const dev_array_spikemonitor_t = thrust::raw_pointer_cast(&dev_dynamic_array_spikemonitor_t[0]); + const int _numt = dev_dynamic_array_spikemonitor_t.size(); + + // TODO: can't we acces the correct _array_eventmonitor_N[0] + // value here without any kernel call? 
+ // Yes: use _array_spikemonitor_N + _debugmsg_kernel_spikemonitor_codeobject<<<1,1,0,spikemonitor_stream>>>( + // HOST_PARAMETERS + dev_array_spikemonitor_N, + dev_array_spikegeneratorgroup_i, + dev_array_spikemonitor__source_idx, + _array_defaultclock_t[0], + dev_array_spikegeneratorgroup__spikespace[current_idx_array_spikegeneratorgroup__spikespace], + dev_array_spikemonitor_count, + dev_array_spikemonitor_i, + _numi, + dev_array_spikemonitor_t, + _numt + ); + + CUDA_CHECK_ERROR("_debugmsg_kernel_spikemonitor_codeobject"); +} + diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/spikemonitor_codeobject.h b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/spikemonitor_codeobject.h new file mode 100644 index 00000000..10715398 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/spikemonitor_codeobject.h @@ -0,0 +1,9 @@ +#ifndef _INCLUDED_spikemonitor_codeobject +#define _INCLUDED_spikemonitor_codeobject + +void _run_spikemonitor_codeobject(); + +void _copyToHost_spikemonitor_codeobject(); +void _debugmsg_spikemonitor_codeobject(); + +#endif diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_1_group_variable_set_conditional_codeobject.cu b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_1_group_variable_set_conditional_codeobject.cu new file mode 100644 index 00000000..18051afc --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_1_group_variable_set_conditional_codeobject.cu @@ -0,0 +1,394 @@ +#include "code_objects/synapses_1_group_variable_set_conditional_codeobject.h" +#include "objects.h" +#include "brianlib/common_math.h" +#include "brianlib/cuda_utils.h" +#include "brianlib/stdint_compat.h" +#include +#include +#include +#include + + + +////// SUPPORT CODE /////// +namespace { + double _host_rand(const int _vectorisation_idx); + double _host_randn(const int _vectorisation_idx); + int32_t _host_poisson(double _lambda, const int _vectorisation_idx); + + ///// block extra_device_helper ///// + + ///// support_code_lines ///// + + #define _rand(vectorisation_idx) (_ptr_array_synapses_1_group_variable_set_conditional_codeobject_rand[vectorisation_idx]) + template < typename T1, typename T2 > struct _higher_type; + template < > struct _higher_type { typedef int type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + 
template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < typename T1, typename T2 > + __host__ __device__ static inline typename _higher_type::type + _brian_mod(T1 x, T2 y) + {{ + return x-y*floor(1.0*x/y); + }} + template < typename T1, typename T2 > + __host__ __device__ static inline typename _higher_type::type + _brian_floordiv(T1 x, T2 y) + {{ + return floor(1.0*x/y); + }} + #ifdef _MSC_VER + #define _brian_pow(x, y) (pow((double)(x), (y))) + #else + #define _brian_pow(x, y) (pow((x), (y))) + #endif + inline __device__ int _brian_atomicAdd(int* address, int val) + { + // hardware implementation + return atomicAdd(address, val); + } + inline __device__ float _brian_atomicAdd(float* address, float val) + { + // hardware implementation + return atomicAdd(address, val); + } + inline __device__ double _brian_atomicAdd(double* address, double val) + { + #if (__CUDA_ARCH__ >= 600) + // hardware implementation + return atomicAdd(address, val); + #else + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val + + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + #endif + } + inline __device__ int _brian_atomicMul(int* address, int val) + { + // software implementation + int old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, val * assumed); + } while (assumed != old); + return old; + } + inline __device__ float _brian_atomicMul(float* address, float val) + { + // software implementation + int* address_as_int = (int*)address; + int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __float_as_int(val * + __int_as_float(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __int_as_float(old); + } + inline __device__ double _brian_atomicMul(double* address, double val) + { + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val * + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + } + inline __device__ int _brian_atomicDiv(int* address, int val) + { + // software implementation + int old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, val / assumed); + } while (assumed != old); + return old; + } + inline __device__ float _brian_atomicDiv(float* address, float val) + { + // software implementation + int* address_as_int = (int*)address; + int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __float_as_int(val / + 
__int_as_float(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __int_as_float(old); + } + inline __device__ double _brian_atomicDiv(double* address, double val) + { + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val / + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + } + + + // Implement dummy functions such that the host compiled code of binomial + // functions works. Hacky, hacky ... + double _host_rand(const int _vectorisation_idx) + { + printf("ERROR: Called dummy function `_host_rand` in %s:%d\n", __FILE__, + __LINE__); + exit(EXIT_FAILURE); + } + double _host_randn(const int _vectorisation_idx) + { + printf("ERROR: Called dummy function `_host_rand` in %s:%d\n", __FILE__, + __LINE__); + exit(EXIT_FAILURE); + } + int32_t _host_poisson(double _lambda, const int _vectorisation_idx) + { + printf("ERROR: Called dummy function `_host_poisson` in %s:%d\n", __FILE__, + __LINE__); + exit(EXIT_FAILURE); + } +} + +////// hashdefine_lines /////// + + + +__global__ void +_run_kernel_synapses_1_group_variable_set_conditional_codeobject( + int _N, + int THREADS_PER_BLOCK, + ///// KERNEL_PARAMETERS ///// + int32_t* _ptr_array_synapses_1_N, + double* _ptr_array_synapses_1_g_raw, + const int _numg_raw, + double* _ptr_array_synapses_1_group_variable_set_conditional_codeobject_rand + ) +{ + using namespace brian; + + int tid = threadIdx.x; + int bid = blockIdx.x; + int _idx = bid * THREADS_PER_BLOCK + tid; + int _vectorisation_idx = _idx; + + ///// KERNEL_CONSTANTS ///// + const int _numN = 1; + + ///// kernel_lines ///// + + + + assert(THREADS_PER_BLOCK == blockDim.x); + + + if(_idx >= _N) + { + return; + } + + ///// block kernel_maincode ///// + + ///// scalar_code['condition'] ///// + + + + ///// scalar_code['statement'] ///// + + const double _lio_statement_1 = 1.0f*(0.1 * 3.7500000000000005e-09)/1.0; + + + ///// vector_code['condition'] ///// + + const char _cond = true; + + + if (_cond) + { + ///// vector_code['statement'] ///// + + double g_raw; + g_raw = _lio_statement_1 * _rand(_vectorisation_idx + 0 * _N); + _ptr_array_synapses_1_g_raw[_idx] = g_raw; + + } + + ///// endblock kernel_maincode ///// +} + + +void _run_synapses_1_group_variable_set_conditional_codeobject() +{ + using namespace brian; + + + const int _N = _array_synapses_1_N[0]; + + ///// HOST_CONSTANTS /////////// + const int _numN = 1; + double* const dev_array_synapses_1_g_raw = thrust::raw_pointer_cast(&dev_dynamic_array_synapses_1_g_raw[0]); + const int _numg_raw = dev_dynamic_array_synapses_1_g_raw.size(); + + // Genenerate an array of random numbers on the device + // Make sure we generate an even number of random numbers + int32_t _rand_N = (_N % 2 == 0) ? 
_N : _N + 1; + double* dev_array_rand; + CUDA_SAFE_CALL( + cudaMalloc( + (void**)&dev_array_rand, + sizeof(double)*_rand_N*1 + ) + ); + CUDA_SAFE_CALL( + curandGenerateUniformDouble( + curand_generator, + dev_array_rand, + _rand_N*1 + ) + ); + + + + static int num_threads, num_blocks; + static size_t needed_shared_memory = 0; + static bool first_run = true; + if (first_run) + { + // get number of blocks and threads + int min_num_threads; // The minimum grid size needed to achieve the + // maximum occupancy for a full device launch + + CUDA_SAFE_CALL( + cudaOccupancyMaxPotentialBlockSize(&min_num_threads, &num_threads, + _run_kernel_synapses_1_group_variable_set_conditional_codeobject, 0, 0) // last args: dynamicSMemSize, blockSizeLimit + ); + + // Round up according to array size + num_blocks = (_N + num_threads - 1) / num_threads; + + + + + + // calculate theoretical occupancy + int max_active_blocks; + CUDA_SAFE_CALL( + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, + _run_kernel_synapses_1_group_variable_set_conditional_codeobject, num_threads, needed_shared_memory) + ); + + float occupancy = (max_active_blocks * num_threads / num_threads_per_warp) / + (float)(max_threads_per_sm / num_threads_per_warp); + + + // check if we have enough ressources to call kernel with given number + // of blocks and threads (can only occur for the else case above as for the + // first max. occupancy) + struct cudaFuncAttributes funcAttrib; + CUDA_SAFE_CALL( + cudaFuncGetAttributes(&funcAttrib, _run_kernel_synapses_1_group_variable_set_conditional_codeobject) + ); + if (num_threads > funcAttrib.maxThreadsPerBlock) + { + // use the max num_threads before launch failure + num_threads = funcAttrib.maxThreadsPerBlock; + printf("WARNING Not enough ressources available to call " + "_run_kernel_synapses_1_group_variable_set_conditional_codeobject " + "with maximum possible threads per block (%u). " + "Reducing num_threads to %u. 
(Kernel needs %i " + "registers per block, %i bytes of " + "statically-allocated shared memory per block, %i " + "bytes of local memory per thread and a total of %i " + "bytes of user-allocated constant memory)\n", + max_threads_per_block, num_threads, funcAttrib.numRegs, + funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, + funcAttrib.constSizeBytes); + + // calculate theoretical occupancy for new num_threads + CUDA_SAFE_CALL( + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, + _run_kernel_synapses_1_group_variable_set_conditional_codeobject, num_threads, needed_shared_memory) + ); + + occupancy = (max_active_blocks * num_threads / num_threads_per_warp) / + (float)(max_threads_per_sm / num_threads_per_warp); + } + + + else + { + printf("INFO _run_kernel_synapses_1_group_variable_set_conditional_codeobject\n" + "\t%u blocks\n" + "\t%u threads\n" + "\t%i registers per block\n" + "\t%i bytes statically-allocated shared memory per block\n" + "\t%i bytes local memory per thread\n" + "\t%i bytes user-allocated constant memory\n" + "\t%.3f theoretical occupancy\n", + num_blocks, num_threads, funcAttrib.numRegs, + funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, + funcAttrib.constSizeBytes, occupancy); + } + first_run = false; + } + + + _run_kernel_synapses_1_group_variable_set_conditional_codeobject<<>>( + _N, + num_threads, + ///// HOST_PARAMETERS ///// + dev_array_synapses_1_N, + dev_array_synapses_1_g_raw, + _numg_raw, + dev_array_rand + ); + + CUDA_CHECK_ERROR("_run_kernel_synapses_1_group_variable_set_conditional_codeobject"); + + +} + + diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_1_group_variable_set_conditional_codeobject.h b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_1_group_variable_set_conditional_codeobject.h new file mode 100644 index 00000000..d3ba3171 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_1_group_variable_set_conditional_codeobject.h @@ -0,0 +1,7 @@ +#ifndef _INCLUDED_synapses_1_group_variable_set_conditional_codeobject +#define _INCLUDED_synapses_1_group_variable_set_conditional_codeobject + +void _run_synapses_1_group_variable_set_conditional_codeobject(); + + +#endif diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_1_group_variable_set_conditional_codeobject_1.cu b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_1_group_variable_set_conditional_codeobject_1.cu new file mode 100644 index 00000000..fa83530f --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_1_group_variable_set_conditional_codeobject_1.cu @@ -0,0 +1,420 @@ +#include "code_objects/synapses_1_group_variable_set_conditional_codeobject_1.h" +#include "objects.h" +#include "brianlib/common_math.h" +#include "brianlib/cuda_utils.h" +#include "brianlib/stdint_compat.h" +#include +#include +#include +#include + + + +////// SUPPORT CODE /////// +namespace { + double _host_rand(const int _vectorisation_idx); + double _host_randn(const int _vectorisation_idx); + int32_t _host_poisson(double _lambda, const int _vectorisation_idx); + + ///// block extra_device_helper ///// + + ///// support_code_lines ///// + + #define _randn(vectorisation_idx) (_ptr_array_synapses_1_group_variable_set_conditional_codeobject_1_randn[vectorisation_idx]) + #define 
_rand(vectorisation_idx) (_ptr_array_synapses_1_group_variable_set_conditional_codeobject_1_rand[vectorisation_idx]) + template < typename T1, typename T2 > struct _higher_type; + template < > struct _higher_type { typedef int type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < typename T1, typename T2 > + __host__ __device__ static inline typename _higher_type::type + _brian_mod(T1 x, T2 y) + {{ + return x-y*floor(1.0*x/y); + }} + template < typename T1, typename T2 > + __host__ __device__ static inline typename _higher_type::type + _brian_floordiv(T1 x, T2 y) + {{ + return floor(1.0*x/y); + }} + #ifdef _MSC_VER + #define _brian_pow(x, y) (pow((double)(x), (y))) + #else + #define _brian_pow(x, y) (pow((x), (y))) + #endif + inline __device__ int _brian_atomicAdd(int* address, int val) + { + // hardware implementation + return atomicAdd(address, val); + } + inline __device__ float _brian_atomicAdd(float* address, float val) + { + // hardware implementation + return atomicAdd(address, val); + } + inline __device__ double _brian_atomicAdd(double* address, double val) + { + #if (__CUDA_ARCH__ >= 600) + // hardware implementation + return atomicAdd(address, val); + #else + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val + + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + #endif + } + inline __device__ int _brian_atomicMul(int* address, int val) + { + // software implementation + int old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, val * assumed); + } while (assumed != old); + return old; + } + inline __device__ float _brian_atomicMul(float* address, float val) + { + // software implementation + int* address_as_int = (int*)address; + int old = *address_as_int, assumed; + do { + assumed = old; + old = 
atomicCAS(address_as_int, assumed, + __float_as_int(val * + __int_as_float(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __int_as_float(old); + } + inline __device__ double _brian_atomicMul(double* address, double val) + { + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val * + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + } + inline __device__ int _brian_atomicDiv(int* address, int val) + { + // software implementation + int old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, val / assumed); + } while (assumed != old); + return old; + } + inline __device__ float _brian_atomicDiv(float* address, float val) + { + // software implementation + int* address_as_int = (int*)address; + int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __float_as_int(val / + __int_as_float(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __int_as_float(old); + } + inline __device__ double _brian_atomicDiv(double* address, double val) + { + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val / + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + } + + + // Implement dummy functions such that the host compiled code of binomial + // functions works. Hacky, hacky ... 
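+    // In the kernel itself, `_rand(idx)` and `_randn(idx)` are only macros that
+    // index pre-generated device arrays of uniform / normal random numbers
+    // (filled with curandGenerateUniformDouble / curandGenerateNormalDouble in
+    // the host function below), so no RNG state is needed on the device. The
+    // `_host_rand` / `_host_randn` / `_host_poisson` definitions that follow
+    // exist only so that the host-compiled support code links; they print an
+    // error and abort if they are ever called.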
+ double _host_rand(const int _vectorisation_idx) + { + printf("ERROR: Called dummy function `_host_rand` in %s:%d\n", __FILE__, + __LINE__); + exit(EXIT_FAILURE); + } + double _host_randn(const int _vectorisation_idx) + { + printf("ERROR: Called dummy function `_host_rand` in %s:%d\n", __FILE__, + __LINE__); + exit(EXIT_FAILURE); + } + int32_t _host_poisson(double _lambda, const int _vectorisation_idx) + { + printf("ERROR: Called dummy function `_host_poisson` in %s:%d\n", __FILE__, + __LINE__); + exit(EXIT_FAILURE); + } +} + +////// hashdefine_lines /////// + + + +__global__ void +_run_kernel_synapses_1_group_variable_set_conditional_codeobject_1( + int _N, + int THREADS_PER_BLOCK, + ///// KERNEL_PARAMETERS ///// + int32_t* _ptr_array_synapses_1_N, + double* _ptr_array_synapses_1_g_raw, + const int _numg_raw, + double* _ptr_array_synapses_1_group_variable_set_conditional_codeobject_1_rand, + double* _ptr_array_synapses_1_group_variable_set_conditional_codeobject_1_randn + ) +{ + using namespace brian; + + int tid = threadIdx.x; + int bid = blockIdx.x; + int _idx = bid * THREADS_PER_BLOCK + tid; + int _vectorisation_idx = _idx; + + ///// KERNEL_CONSTANTS ///// + const int _numN = 1; + + ///// kernel_lines ///// + + + + assert(THREADS_PER_BLOCK == blockDim.x); + + + if(_idx >= _N) + { + return; + } + + ///// block kernel_maincode ///// + + ///// scalar_code['condition'] ///// + + + + ///// scalar_code['statement'] ///// + + const double _lio_statement_1 = 1.0f*1.0/1.0; + const double _lio_statement_2 = 2.5 * 1e-09; + const double _lio_statement_3 = 0.5 * 1e-09; + + + ///// vector_code['condition'] ///// + + const char _cond = _rand(_vectorisation_idx + 0 * _N) < 0.2; + + + if (_cond) + { + ///// vector_code['statement'] ///// + + double g_raw; + g_raw = _lio_statement_1 * (_lio_statement_2 + (_lio_statement_3 * _randn(_vectorisation_idx + 0 * _N))); + _ptr_array_synapses_1_g_raw[_idx] = g_raw; + + } + + ///// endblock kernel_maincode ///// +} + + +void _run_synapses_1_group_variable_set_conditional_codeobject_1() +{ + using namespace brian; + + + const int _N = _array_synapses_1_N[0]; + + ///// HOST_CONSTANTS /////////// + const int _numN = 1; + double* const dev_array_synapses_1_g_raw = thrust::raw_pointer_cast(&dev_dynamic_array_synapses_1_g_raw[0]); + const int _numg_raw = dev_dynamic_array_synapses_1_g_raw.size(); + + // Genenerate an array of random numbers on the device + // Make sure we generate an even number of random numbers + int32_t _rand_N = (_N % 2 == 0) ? _N : _N + 1; + double* dev_array_rand; + CUDA_SAFE_CALL( + cudaMalloc( + (void**)&dev_array_rand, + sizeof(double)*_rand_N*1 + ) + ); + CUDA_SAFE_CALL( + curandGenerateUniformDouble( + curand_generator, + dev_array_rand, + _rand_N*1 + ) + ); + + + // Genenerate an array of random numbers on the device + // Make sure we generate an even number of random numbers + int32_t _randn_N = (_N % 2 == 0) ? 
_N : _N + 1; + double* dev_array_randn; + CUDA_SAFE_CALL( + cudaMalloc( + (void**)&dev_array_randn, + sizeof(double)*_randn_N*1 + ) + ); + CUDA_SAFE_CALL( + curandGenerateNormalDouble( + curand_generator, + dev_array_randn, + _randn_N*1, + 0, // mean + 1 // stddev + ) + ); + + + + static int num_threads, num_blocks; + static size_t needed_shared_memory = 0; + static bool first_run = true; + if (first_run) + { + // get number of blocks and threads + int min_num_threads; // The minimum grid size needed to achieve the + // maximum occupancy for a full device launch + + CUDA_SAFE_CALL( + cudaOccupancyMaxPotentialBlockSize(&min_num_threads, &num_threads, + _run_kernel_synapses_1_group_variable_set_conditional_codeobject_1, 0, 0) // last args: dynamicSMemSize, blockSizeLimit + ); + + // Round up according to array size + num_blocks = (_N + num_threads - 1) / num_threads; + + + + + + // calculate theoretical occupancy + int max_active_blocks; + CUDA_SAFE_CALL( + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, + _run_kernel_synapses_1_group_variable_set_conditional_codeobject_1, num_threads, needed_shared_memory) + ); + + float occupancy = (max_active_blocks * num_threads / num_threads_per_warp) / + (float)(max_threads_per_sm / num_threads_per_warp); + + + // check if we have enough ressources to call kernel with given number + // of blocks and threads (can only occur for the else case above as for the + // first max. occupancy) + struct cudaFuncAttributes funcAttrib; + CUDA_SAFE_CALL( + cudaFuncGetAttributes(&funcAttrib, _run_kernel_synapses_1_group_variable_set_conditional_codeobject_1) + ); + if (num_threads > funcAttrib.maxThreadsPerBlock) + { + // use the max num_threads before launch failure + num_threads = funcAttrib.maxThreadsPerBlock; + printf("WARNING Not enough ressources available to call " + "_run_kernel_synapses_1_group_variable_set_conditional_codeobject_1 " + "with maximum possible threads per block (%u). " + "Reducing num_threads to %u. 
(Kernel needs %i " + "registers per block, %i bytes of " + "statically-allocated shared memory per block, %i " + "bytes of local memory per thread and a total of %i " + "bytes of user-allocated constant memory)\n", + max_threads_per_block, num_threads, funcAttrib.numRegs, + funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, + funcAttrib.constSizeBytes); + + // calculate theoretical occupancy for new num_threads + CUDA_SAFE_CALL( + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, + _run_kernel_synapses_1_group_variable_set_conditional_codeobject_1, num_threads, needed_shared_memory) + ); + + occupancy = (max_active_blocks * num_threads / num_threads_per_warp) / + (float)(max_threads_per_sm / num_threads_per_warp); + } + + + else + { + printf("INFO _run_kernel_synapses_1_group_variable_set_conditional_codeobject_1\n" + "\t%u blocks\n" + "\t%u threads\n" + "\t%i registers per block\n" + "\t%i bytes statically-allocated shared memory per block\n" + "\t%i bytes local memory per thread\n" + "\t%i bytes user-allocated constant memory\n" + "\t%.3f theoretical occupancy\n", + num_blocks, num_threads, funcAttrib.numRegs, + funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, + funcAttrib.constSizeBytes, occupancy); + } + first_run = false; + } + + + _run_kernel_synapses_1_group_variable_set_conditional_codeobject_1<<>>( + _N, + num_threads, + ///// HOST_PARAMETERS ///// + dev_array_synapses_1_N, + dev_array_synapses_1_g_raw, + _numg_raw, + dev_array_rand, + dev_array_randn + ); + + CUDA_CHECK_ERROR("_run_kernel_synapses_1_group_variable_set_conditional_codeobject_1"); + + +} + + diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_1_group_variable_set_conditional_codeobject_1.h b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_1_group_variable_set_conditional_codeobject_1.h new file mode 100644 index 00000000..b45c23fe --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_1_group_variable_set_conditional_codeobject_1.h @@ -0,0 +1,7 @@ +#ifndef _INCLUDED_synapses_1_group_variable_set_conditional_codeobject_1 +#define _INCLUDED_synapses_1_group_variable_set_conditional_codeobject_1 + +void _run_synapses_1_group_variable_set_conditional_codeobject_1(); + + +#endif diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_1_post_codeobject.cu b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_1_post_codeobject.cu new file mode 100644 index 00000000..4db164c8 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_1_post_codeobject.cu @@ -0,0 +1,548 @@ +#include "code_objects/synapses_1_post_codeobject.h" +#include "objects.h" +#include "brianlib/common_math.h" +#include "brianlib/cuda_utils.h" +#include "brianlib/stdint_compat.h" +#include +#include +#include +#include + +#include +#include "synapses_classes.h" + + +////// SUPPORT CODE /////// +namespace { + double _host_rand(const int _vectorisation_idx); + double _host_randn(const int _vectorisation_idx); + int32_t _host_poisson(double _lambda, const int _vectorisation_idx); + + ///// block extra_device_helper ///// + + ///// support_code_lines ///// + + template + __host__ __device__ + double _brian_exp(T value) + { + #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 0)) + return exp((double)value); + #else + return 
exp(value); + #endif + } + inline __host__ __device__ + float _brian_exp(float value) + { + return exp(value); + } + inline __host__ __device__ + double _brian_clip(const double value, + const double a_min, + const double a_max) + { + if (value < a_min) + return a_min; + if (value > a_max) + return a_max; + return value; + } + template < typename T1, typename T2 > struct _higher_type; + template < > struct _higher_type { typedef int type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < typename T1, typename T2 > + __host__ __device__ static inline typename _higher_type::type + _brian_mod(T1 x, T2 y) + {{ + return x-y*floor(1.0*x/y); + }} + template < typename T1, typename T2 > + __host__ __device__ static inline typename _higher_type::type + _brian_floordiv(T1 x, T2 y) + {{ + return floor(1.0*x/y); + }} + #ifdef _MSC_VER + #define _brian_pow(x, y) (pow((double)(x), (y))) + #else + #define _brian_pow(x, y) (pow((x), (y))) + #endif + inline __device__ int _brian_atomicAdd(int* address, int val) + { + // hardware implementation + return atomicAdd(address, val); + } + inline __device__ float _brian_atomicAdd(float* address, float val) + { + // hardware implementation + return atomicAdd(address, val); + } + inline __device__ double _brian_atomicAdd(double* address, double val) + { + #if (__CUDA_ARCH__ >= 600) + // hardware implementation + return atomicAdd(address, val); + #else + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val + + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + #endif + } + inline __device__ int _brian_atomicMul(int* address, int val) + { + // software implementation + int old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, val * assumed); + } while (assumed != old); + return old; + } + inline __device__ 
float _brian_atomicMul(float* address, float val) + { + // software implementation + int* address_as_int = (int*)address; + int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __float_as_int(val * + __int_as_float(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __int_as_float(old); + } + inline __device__ double _brian_atomicMul(double* address, double val) + { + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val * + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + } + inline __device__ int _brian_atomicDiv(int* address, int val) + { + // software implementation + int old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, val / assumed); + } while (assumed != old); + return old; + } + inline __device__ float _brian_atomicDiv(float* address, float val) + { + // software implementation + int* address_as_int = (int*)address; + int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __float_as_int(val / + __int_as_float(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __int_as_float(old); + } + inline __device__ double _brian_atomicDiv(double* address, double val) + { + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val / + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + } + + + // Implement dummy functions such that the host compiled code of binomial + // functions works. Hacky, hacky ... 
+ double _host_rand(const int _vectorisation_idx) + { + printf("ERROR: Called dummy function `_host_rand` in %s:%d\n", __FILE__, + __LINE__); + exit(EXIT_FAILURE); + } + double _host_randn(const int _vectorisation_idx) + { + printf("ERROR: Called dummy function `_host_rand` in %s:%d\n", __FILE__, + __LINE__); + exit(EXIT_FAILURE); + } + int32_t _host_poisson(double _lambda, const int _vectorisation_idx) + { + printf("ERROR: Called dummy function `_host_poisson` in %s:%d\n", __FILE__, + __LINE__); + exit(EXIT_FAILURE); + } +} + +////// hashdefine_lines /////// + + + + +__global__ void +_run_kernel_synapses_1_post_codeobject( + int _N, + int bid_offset, + int timestep, + int THREADS_PER_BLOCK, + int threads_per_bundle, + int32_t* eventspace, + int neurongroup_size, + ///// KERNEL_PARAMETERS ///// + double* _ptr_array_synapses_1_Apost, + const int _numApost, + double* _ptr_array_synapses_1_Apre, + const int _numApre, + int32_t* _ptr_array_synapses_1_N, + int32_t* _ptr_array_synapses_1__synaptic_pre, + const int _num_synaptic_pre, + double* _ptr_array_synapses_1_g_raw, + const int _numg_raw, + double* _ptr_array_synapses_1_lastupdate, + const int _numlastupdate, + const double _value_array_defaultclock_t + ) +{ + using namespace brian; + + assert(THREADS_PER_BLOCK == blockDim.x); + + int tid = threadIdx.x; + int bid = blockIdx.x + bid_offset; + //TODO: do we need _idx here? if no, get also rid of scoping after scalar code + // scalar_code can depend on _idx (e.g. if the state update depends on a + // subexpression that is the same for all synapses, ?) + int _idx = bid * THREADS_PER_BLOCK + tid; + int _vectorisation_idx = _idx; + + ///// KERNEL_CONSTANTS ///// + const int _numN = 1; + + ///// kernel_lines ///// + + const double* _ptr_array_defaultclock_t = &_value_array_defaultclock_t; + + + + ///// scalar_code ///// + + const double _lio_1 = 1.0f*1.0/0.01; + const double _lio_2 = 1.0f*1.0/0.01; + + + { // _idx is defined in outer and inner scope (for `scalar_code`) + if (synapses_1_post.no_or_const_delay_mode) + { + // TODO: pass as kernel parameter instead? 
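+            // Constant-delay path: the eventspace that is exactly `delay` time
+            // steps old can be read directly, so no spike queue lookup is needed.
+            // Each CUDA block handles one post-neuron block of the connectivity
+            // matrix; its threads loop over all synapses of every spiking source
+            // neuron and apply the vector code (decay of Apre/Apost since
+            // `lastupdate`, Apost decrement, clipped update of g_raw).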
+ int num_parallel_blocks = synapses_1_post.queue->num_blocks; + int32_t spikes_start = synapses_1_post.spikes_start; + int32_t spikes_stop = synapses_1_post.spikes_stop; + + // for the first delay timesteps the eventspace is not yet filled + // note that num_queues is the number of eventspaces, num_queues-1 the delay in timesteps + if (timestep >= synapses_1_post.queue->num_queues - 1) + { + // `spiking_neuron_idx` runs through the eventspace + // `post_block_idx` runs through the post neuron blocks of the connectivity matrix + int post_block_idx = bid; + // loop through neurons in eventspace (indices of event neurons, rest -1) + for(int spiking_neuron_idx = 0; + spiking_neuron_idx < neurongroup_size; + spiking_neuron_idx++) + { + + // spiking_neuron is index in NeuronGroup + int32_t spiking_neuron = eventspace[spiking_neuron_idx]; + + if(spiking_neuron == -1) // end of spiking neurons + { + assert(spiking_neuron_idx == eventspace[neurongroup_size]); + return; + } + + // apply effects if event neuron is in sources of current SynapticPathway + if(spikes_start <= spiking_neuron && spiking_neuron < spikes_stop) + { + int pre_post_block_id = (spiking_neuron - spikes_start) * num_parallel_blocks + post_block_idx; + int num_synapses = synapses_1_post_num_synapses_by_pre[pre_post_block_id]; + int32_t* propagating_synapses = synapses_1_post_synapse_ids_by_pre[pre_post_block_id]; + for(int j = tid; j < num_synapses; j+=THREADS_PER_BLOCK) + { + // _idx is the synapse id + int32_t _idx = propagating_synapses[j]; + _vectorisation_idx = j; + + ///// vector_code ///// + + double lastupdate = _ptr_array_synapses_1_lastupdate[_idx]; + const double t = _ptr_array_defaultclock_t[0]; + double Apost = _ptr_array_synapses_1_Apost[_idx]; + double g_raw = _ptr_array_synapses_1_g_raw[_idx]; + double Apre = _ptr_array_synapses_1_Apre[_idx]; + const double _Apost = Apost * _brian_exp(_lio_1 * (- (t - lastupdate))); + const double _Apre = Apre * _brian_exp(_lio_2 * (- (t - lastupdate))); + Apost = _Apost; + Apre = _Apre; + Apost += (- 1.0000000000000002e-10); + g_raw = _brian_clip(g_raw + Apre, 0.0, 3.7500000000000005e-09); + lastupdate = t; + _ptr_array_synapses_1_lastupdate[_idx] = lastupdate; + _ptr_array_synapses_1_Apre[_idx] = Apre; + _ptr_array_synapses_1_Apost[_idx] = Apost; + _ptr_array_synapses_1_g_raw[_idx] = g_raw; + + } + } + + __syncthreads(); + } + } + } + else // heterogeneous delay mode + { + cudaVector* synapses_queue; + synapses_1_post.queue->peek(&synapses_queue); + + int queue_size = synapses_queue[bid].size(); + + // use a fixed number of threads per bundle, i runs through all those threads of all bundles + // for threads_per_bundle == 1, we have one thread per bundle (parallel) + for (int i = tid; i < queue_size*threads_per_bundle; i+=THREADS_PER_BLOCK) + { + // bundle_idx runs through all bundles + int bundle_idx = i / threads_per_bundle; + // syn_in_bundle_idx runs through all threads in a single bundle + int syn_in_bundle_idx = i % threads_per_bundle; + + int bundle_id = synapses_queue[bid].at(bundle_idx); + int bundle_size = synapses_1_post_num_synapses_by_bundle[bundle_id]; + int synapses_offset = synapses_1_post_synapses_offset_by_bundle[bundle_id]; + int32_t* synapse_ids = synapses_1_post_synapse_ids; + int32_t* synapse_bundle = synapse_ids + synapses_offset; + + // loop through synapses of this bundle with all available threads_per_bundle + // if threads_per_bundle == 1, this is serial + for (int j = syn_in_bundle_idx; j < bundle_size; j+=threads_per_bundle) + { + int32_t _idx = 
synapse_bundle[j]; + + + ///// vector_code ///// + + double lastupdate = _ptr_array_synapses_1_lastupdate[_idx]; + const double t = _ptr_array_defaultclock_t[0]; + double Apost = _ptr_array_synapses_1_Apost[_idx]; + double g_raw = _ptr_array_synapses_1_g_raw[_idx]; + double Apre = _ptr_array_synapses_1_Apre[_idx]; + const double _Apost = Apost * _brian_exp(_lio_1 * (- (t - lastupdate))); + const double _Apre = Apre * _brian_exp(_lio_2 * (- (t - lastupdate))); + Apost = _Apost; + Apre = _Apre; + Apost += (- 1.0000000000000002e-10); + g_raw = _brian_clip(g_raw + Apre, 0.0, 3.7500000000000005e-09); + lastupdate = t; + _ptr_array_synapses_1_lastupdate[_idx] = lastupdate; + _ptr_array_synapses_1_Apre[_idx] = Apre; + _ptr_array_synapses_1_Apost[_idx] = Apost; + _ptr_array_synapses_1_g_raw[_idx] = g_raw; + + } + } + } + } + } + + + +void _run_synapses_1_post_codeobject() +{ + using namespace brian; + + + const int _N = _array_synapses_1_N[0]; + + ///// HOST_CONSTANTS /////////// + double* const dev_array_synapses_1_Apost = thrust::raw_pointer_cast(&dev_dynamic_array_synapses_1_Apost[0]); + const int _numApost = dev_dynamic_array_synapses_1_Apost.size(); + double* const dev_array_synapses_1_Apre = thrust::raw_pointer_cast(&dev_dynamic_array_synapses_1_Apre[0]); + const int _numApre = dev_dynamic_array_synapses_1_Apre.size(); + const int _numN = 1; + int32_t* const dev_array_synapses_1__synaptic_pre = thrust::raw_pointer_cast(&dev_dynamic_array_synapses_1__synaptic_pre[0]); + const int _num_synaptic_pre = dev_dynamic_array_synapses_1__synaptic_pre.size(); + double* const dev_array_synapses_1_g_raw = thrust::raw_pointer_cast(&dev_dynamic_array_synapses_1_g_raw[0]); + const int _numg_raw = dev_dynamic_array_synapses_1_g_raw.size(); + double* const dev_array_synapses_1_lastupdate = thrust::raw_pointer_cast(&dev_dynamic_array_synapses_1_lastupdate[0]); + const int _numlastupdate = dev_dynamic_array_synapses_1_lastupdate.size(); + +static int num_threads_per_bundle; +static int num_loops; + + static int num_threads, num_blocks; + static size_t needed_shared_memory = 0; + static bool first_run = true; + if (first_run) + { +// Synaptic effects modify only synapse variables. +num_blocks = num_parallel_blocks; +num_threads = max_threads_per_block; +// TODO: effect of mean instead of max? +num_threads_per_bundle = synapses_1_post_max_bundle_size; +num_loops = 1; + + + + // calculate theoretical occupancy + int max_active_blocks; + CUDA_SAFE_CALL( + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, + _run_kernel_synapses_1_post_codeobject, num_threads, needed_shared_memory) + ); + + float occupancy = (max_active_blocks * num_threads / num_threads_per_warp) / + (float)(max_threads_per_sm / num_threads_per_warp); + + + // check if we have enough ressources to call kernel with given number + // of blocks and threads (can only occur for the else case above as for the + // first max. occupancy) + struct cudaFuncAttributes funcAttrib; + CUDA_SAFE_CALL( + cudaFuncGetAttributes(&funcAttrib, _run_kernel_synapses_1_post_codeobject) + ); + if (num_threads > funcAttrib.maxThreadsPerBlock) + { + // use the max num_threads before launch failure + num_threads = funcAttrib.maxThreadsPerBlock; + printf("WARNING Not enough ressources available to call " + "_run_kernel_synapses_1_post_codeobject " + "with maximum possible threads per block (%u). " + "Reducing num_threads to %u. 
(Kernel needs %i " + "registers per block, %i bytes of " + "statically-allocated shared memory per block, %i " + "bytes of local memory per thread and a total of %i " + "bytes of user-allocated constant memory)\n", + max_threads_per_block, num_threads, funcAttrib.numRegs, + funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, + funcAttrib.constSizeBytes); + + // calculate theoretical occupancy for new num_threads + CUDA_SAFE_CALL( + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, + _run_kernel_synapses_1_post_codeobject, num_threads, needed_shared_memory) + ); + + occupancy = (max_active_blocks * num_threads / num_threads_per_warp) / + (float)(max_threads_per_sm / num_threads_per_warp); + } + +else if (synapses_1_post_max_size <= 0) +{ + printf("INFO there are no synapses in the synapses_1_post pathway. Skipping synapses_push and synapses kernels.\n"); +} + + else + { + printf("INFO _run_kernel_synapses_1_post_codeobject\n" + "\t%u blocks\n" + "\t%u threads\n" + "\t%i registers per block\n" + "\t%i bytes statically-allocated shared memory per block\n" + "\t%i bytes local memory per thread\n" + "\t%i bytes user-allocated constant memory\n" + "\t%.3f theoretical occupancy\n", + num_blocks, num_threads, funcAttrib.numRegs, + funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, + funcAttrib.constSizeBytes, occupancy); + } + first_run = false; + } + + +// only call kernel if we have synapses (otherwise we skipped the push kernel) +if (synapses_1_post_max_size > 0) +{ + for(int bid_offset = 0; bid_offset < num_loops; bid_offset++) + { + _run_kernel_synapses_1_post_codeobject<<>>( + _N, + bid_offset, + defaultclock.timestep[0], + num_threads, + num_threads_per_bundle, + dev_array_neurongroup_1__spikespace[synapses_1_post_eventspace_idx], + _num__array_neurongroup_1__spikespace-1, + ///// HOST_PARAMETERS ///// + dev_array_synapses_1_Apost, + _numApost, + dev_array_synapses_1_Apre, + _numApre, + dev_array_synapses_1_N, + dev_array_synapses_1__synaptic_pre, + _num_synaptic_pre, + dev_array_synapses_1_g_raw, + _numg_raw, + dev_array_synapses_1_lastupdate, + _numlastupdate, + _array_defaultclock_t[0] + ); + } + + CUDA_CHECK_ERROR("_run_kernel_synapses_1_post_codeobject"); + CUDA_SAFE_CALL(cudaDeviceSynchronize()); +} + + +} + +void _debugmsg_synapses_1_post_codeobject() +{ + using namespace brian; + std::cout << "Number of synapses: " << _array_synapses_1_N[0] << endl; +} + diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_1_post_codeobject.h b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_1_post_codeobject.h new file mode 100644 index 00000000..7a5cc9fa --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_1_post_codeobject.h @@ -0,0 +1,8 @@ +#ifndef _INCLUDED_synapses_1_post_codeobject +#define _INCLUDED_synapses_1_post_codeobject + +void _run_synapses_1_post_codeobject(); + +void _debugmsg_synapses_1_post_codeobject(); + +#endif diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_1_post_push_spikes.cu b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_1_post_push_spikes.cu new file mode 100644 index 00000000..d24a700e --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_1_post_push_spikes.cu @@ -0,0 +1,374 @@ +#include 
"code_objects/synapses_1_post_push_spikes.h" +#include "objects.h" +#include "brianlib/common_math.h" +#include "brianlib/cuda_utils.h" +#include "brianlib/stdint_compat.h" +#include +#include +#include +#include + + + +////// SUPPORT CODE /////// +namespace { + double _host_rand(const int _vectorisation_idx); + double _host_randn(const int _vectorisation_idx); + int32_t _host_poisson(double _lambda, const int _vectorisation_idx); + + ///// block extra_device_helper ///// +__global__ void _advance_kernel_synapses_1_post_push_spikes() +{ + using namespace brian; + int tid = threadIdx.x; + synapses_1_post.queue->advance( + tid); +} + + ///// support_code_lines ///// + + template < typename T1, typename T2 > struct _higher_type; + template < > struct _higher_type { typedef int type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < typename T1, typename T2 > + __host__ __device__ static inline typename _higher_type::type + _brian_mod(T1 x, T2 y) + {{ + return x-y*floor(1.0*x/y); + }} + template < typename T1, typename T2 > + __host__ __device__ static inline typename _higher_type::type + _brian_floordiv(T1 x, T2 y) + {{ + return floor(1.0*x/y); + }} + #ifdef _MSC_VER + #define _brian_pow(x, y) (pow((double)(x), (y))) + #else + #define _brian_pow(x, y) (pow((x), (y))) + #endif + inline __device__ int _brian_atomicAdd(int* address, int val) + { + // hardware implementation + return atomicAdd(address, val); + } + inline __device__ float _brian_atomicAdd(float* address, float val) + { + // hardware implementation + return atomicAdd(address, val); + } + inline __device__ double _brian_atomicAdd(double* address, double val) + { + #if (__CUDA_ARCH__ >= 600) + // hardware implementation + return atomicAdd(address, val); + #else + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val + + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != 
NaN) + } while (assumed != old); + return __longlong_as_double(old); + #endif + } + inline __device__ int _brian_atomicMul(int* address, int val) + { + // software implementation + int old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, val * assumed); + } while (assumed != old); + return old; + } + inline __device__ float _brian_atomicMul(float* address, float val) + { + // software implementation + int* address_as_int = (int*)address; + int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __float_as_int(val * + __int_as_float(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __int_as_float(old); + } + inline __device__ double _brian_atomicMul(double* address, double val) + { + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val * + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + } + inline __device__ int _brian_atomicDiv(int* address, int val) + { + // software implementation + int old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, val / assumed); + } while (assumed != old); + return old; + } + inline __device__ float _brian_atomicDiv(float* address, float val) + { + // software implementation + int* address_as_int = (int*)address; + int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __float_as_int(val / + __int_as_float(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __int_as_float(old); + } + inline __device__ double _brian_atomicDiv(double* address, double val) + { + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val / + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + } + + + // Implement dummy functions such that the host compiled code of binomial + // functions works. Hacky, hacky ... 
+ double _host_rand(const int _vectorisation_idx) + { + printf("ERROR: Called dummy function `_host_rand` in %s:%d\n", __FILE__, + __LINE__); + exit(EXIT_FAILURE); + } + double _host_randn(const int _vectorisation_idx) + { + printf("ERROR: Called dummy function `_host_rand` in %s:%d\n", __FILE__, + __LINE__); + exit(EXIT_FAILURE); + } + int32_t _host_poisson(double _lambda, const int _vectorisation_idx) + { + printf("ERROR: Called dummy function `_host_poisson` in %s:%d\n", __FILE__, + __LINE__); + exit(EXIT_FAILURE); + } +} + +////// hashdefine_lines /////// + + + +__global__ void +_run_kernel_synapses_1_post_push_spikes( + int num_parallel_blocks, + int _num_blocks, + int _num_threads, + int32_t* _eventspace) +{ + // apperently this is not always true and that is why _num_threads is passed as function argument + // if this assert never fails, we could remove the _num_threads form the argument list + assert(blockDim.x == _num_threads); + + using namespace brian; + + int bid = blockIdx.x; + int tid = threadIdx.x; + + int post_neuron_bid = bid % num_parallel_blocks; + int pre_neuron_idx = bid / num_parallel_blocks; + + int32_t spiking_neuron = _eventspace[pre_neuron_idx]; + assert(spiking_neuron != -1); + + // push to spikequeue if spiking_neuron is in sources of current SynapticPathway + if(synapses_1_post.spikes_start <= spiking_neuron && spiking_neuron < synapses_1_post.spikes_stop) + { + synapses_1_post.queue->push_bundles( + post_neuron_bid, + tid, + _num_threads, + spiking_neuron - synapses_1_post.spikes_start); + } +} + + +void _run_synapses_1_post_push_spikes() +{ + using namespace brian; + + + + ///// HOST_CONSTANTS /////////// + const int _numN = 1; + const int _num_spikespace = 101; + double* const _array_synapses_1_delay_1 = thrust::raw_pointer_cast(&_dynamic_array_synapses_1_delay_1[0]); + const int _numdelay = _dynamic_array_synapses_1_delay_1.size(); + double* const dev_array_synapses_1_delay_1 = thrust::raw_pointer_cast(&dev_dynamic_array_synapses_1_delay_1[0]); + + if (synapses_1_post_scalar_delay) + { + int num_eventspaces = dev_array_neurongroup_1__spikespace.size(); + synapses_1_post_eventspace_idx = (current_idx_array_neurongroup_1__spikespace - synapses_1_post_delay + num_eventspaces) % num_eventspaces; + + ////////////////////////////////////////////// + //// No pushing in no_or_const_delay_mode //// + ////////////////////////////////////////////// + } + else if (synapses_1_post_max_size > 0) + { + + // get the number of spiking neurons + int32_t num_spiking_neurons; + CUDA_SAFE_CALL( + cudaMemcpyAsync(&num_spiking_neurons, + dev_array_neurongroup_1__spikespace[current_idx_array_neurongroup_1__spikespace] + _num_spikespace - 1, + sizeof(int32_t), cudaMemcpyDeviceToHost,stream1) + ); + + // advance spike queues + _advance_kernel_synapses_1_post_push_spikes<<<1, num_parallel_blocks,0,stream1>>>(); + + CUDA_CHECK_ERROR("_advance_kernel_synapses_1_post_push_spikes"); + + + static int num_threads, num_blocks; + static size_t needed_shared_memory = 0; + static bool first_run = true; + if (first_run) + { + needed_shared_memory = 0; + + // We don't need more then max(num_synapses) threads per block. 
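+            // Launch configuration for the push kernel: no more threads per block
+            // than synapses_1_post_max_size (extra threads would have nothing to
+            // push), capped at max_threads_per_block. The block count is not fixed
+            // here because it is num_parallel_blocks * num_spiking_neurons and the
+            // number of spiking neurons changes every time step (see the launch at
+            // the end of this function).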
+ num_threads = synapses_1_post_max_size; + if (num_threads > max_threads_per_block) + { + num_threads = max_threads_per_block; + } + // num_blocks depends on num_spiking_neurons, which changes each time step + + + // calculate theoretical occupancy + int max_active_blocks; + CUDA_SAFE_CALL( + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, + _run_kernel_synapses_1_post_push_spikes, num_threads, needed_shared_memory) + ); + + float occupancy = (max_active_blocks * num_threads / num_threads_per_warp) / + (float)(max_threads_per_sm / num_threads_per_warp); + + + // check if we have enough ressources to call kernel with given number + // of blocks and threads (can only occur for the else case above as for the + // first max. occupancy) + struct cudaFuncAttributes funcAttrib; + CUDA_SAFE_CALL( + cudaFuncGetAttributes(&funcAttrib, _run_kernel_synapses_1_post_push_spikes) + ); + if (num_threads > funcAttrib.maxThreadsPerBlock) + { + // use the max num_threads before launch failure + num_threads = funcAttrib.maxThreadsPerBlock; + printf("WARNING Not enough ressources available to call " + "_run_kernel_synapses_1_post_push_spikes " + "with maximum possible threads per block (%u). " + "Reducing num_threads to %u. (Kernel needs %i " + "registers per block, %i bytes of " + "statically-allocated shared memory per block, %i " + "bytes of local memory per thread and a total of %i " + "bytes of user-allocated constant memory)\n", + max_threads_per_block, num_threads, funcAttrib.numRegs, + funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, + funcAttrib.constSizeBytes); + + // calculate theoretical occupancy for new num_threads + CUDA_SAFE_CALL( + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, + _run_kernel_synapses_1_post_push_spikes, num_threads, needed_shared_memory) + ); + + occupancy = (max_active_blocks * num_threads / num_threads_per_warp) / + (float)(max_threads_per_sm / num_threads_per_warp); + } + + + else + { + printf("INFO _run_kernel_synapses_1_post_push_spikes\n" + "\t%u blocks\n" + "\t%u threads\n" + "\t%i registers per block\n" + "\t%i bytes statically-allocated shared memory per block\n" + "\t%i bytes local memory per thread\n" + "\t%i bytes user-allocated constant memory\n" + "\t%.3f theoretical occupancy\n", + num_blocks, num_threads, funcAttrib.numRegs, + funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, + funcAttrib.constSizeBytes, occupancy); + } + first_run = false; + } + + + if (num_spiking_neurons > 0) + { + num_blocks = num_parallel_blocks * num_spiking_neurons; + + _run_kernel_synapses_1_post_push_spikes<<>>( + num_parallel_blocks, + num_blocks, + num_threads, + dev_array_neurongroup_1__spikespace[current_idx_array_neurongroup_1__spikespace]); + + CUDA_CHECK_ERROR("_run_kernel_synapses_1_post_push_spikes"); + } + } // end else if (synapses_1_post_max_size > 0) + +} + + diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_1_post_push_spikes.h b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_1_post_push_spikes.h new file mode 100644 index 00000000..b135add8 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_1_post_push_spikes.h @@ -0,0 +1,7 @@ +#ifndef _INCLUDED_synapses_1_post_push_spikes +#define _INCLUDED_synapses_1_post_push_spikes + +void _run_synapses_1_post_push_spikes(); + + +#endif diff --git 
a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_1_pre_codeobject.cu b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_1_pre_codeobject.cu new file mode 100644 index 00000000..df8ce424 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_1_pre_codeobject.cu @@ -0,0 +1,588 @@ +#include "code_objects/synapses_1_pre_codeobject.h" +#include "objects.h" +#include "brianlib/common_math.h" +#include "brianlib/cuda_utils.h" +#include "brianlib/stdint_compat.h" +#include +#include +#include +#include + +#include +#include "synapses_classes.h" + + +////// SUPPORT CODE /////// +namespace { + double _host_rand(const int _vectorisation_idx); + double _host_randn(const int _vectorisation_idx); + int32_t _host_poisson(double _lambda, const int _vectorisation_idx); + + ///// block extra_device_helper ///// + + ///// support_code_lines ///// + + template + __host__ __device__ + double _brian_exp(T value) + { + #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 0)) + return exp((double)value); + #else + return exp(value); + #endif + } + inline __host__ __device__ + float _brian_exp(float value) + { + return exp(value); + } + inline __host__ __device__ + double _brian_clip(const double value, + const double a_min, + const double a_max) + { + if (value < a_min) + return a_min; + if (value > a_max) + return a_max; + return value; + } + template < typename T1, typename T2 > struct _higher_type; + template < > struct _higher_type { typedef int type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < typename T1, typename T2 > + __host__ __device__ static inline typename _higher_type::type + _brian_mod(T1 x, T2 y) + {{ + return x-y*floor(1.0*x/y); + }} + template < typename T1, typename T2 > + __host__ __device__ static inline typename _higher_type::type + _brian_floordiv(T1 x, T2 y) + {{ + return floor(1.0*x/y); + }} + #ifdef _MSC_VER + #define _brian_pow(x, y) (pow((double)(x), (y))) + #else + #define _brian_pow(x, y) (pow((x), (y))) + #endif + inline 
__device__ int _brian_atomicAdd(int* address, int val) + { + // hardware implementation + return atomicAdd(address, val); + } + inline __device__ float _brian_atomicAdd(float* address, float val) + { + // hardware implementation + return atomicAdd(address, val); + } + inline __device__ double _brian_atomicAdd(double* address, double val) + { + #if (__CUDA_ARCH__ >= 600) + // hardware implementation + return atomicAdd(address, val); + #else + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val + + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + #endif + } + inline __device__ int _brian_atomicMul(int* address, int val) + { + // software implementation + int old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, val * assumed); + } while (assumed != old); + return old; + } + inline __device__ float _brian_atomicMul(float* address, float val) + { + // software implementation + int* address_as_int = (int*)address; + int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __float_as_int(val * + __int_as_float(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __int_as_float(old); + } + inline __device__ double _brian_atomicMul(double* address, double val) + { + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val * + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + } + inline __device__ int _brian_atomicDiv(int* address, int val) + { + // software implementation + int old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, val / assumed); + } while (assumed != old); + return old; + } + inline __device__ float _brian_atomicDiv(float* address, float val) + { + // software implementation + int* address_as_int = (int*)address; + int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __float_as_int(val / + __int_as_float(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __int_as_float(old); + } + inline __device__ double _brian_atomicDiv(double* address, double val) + { + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val / + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + } + + + // Implement dummy functions such that the host compiled code of binomial + // functions works. Hacky, hacky ... 
+ double _host_rand(const int _vectorisation_idx) + { + printf("ERROR: Called dummy function `_host_rand` in %s:%d\n", __FILE__, + __LINE__); + exit(EXIT_FAILURE); + } + double _host_randn(const int _vectorisation_idx) + { + printf("ERROR: Called dummy function `_host_rand` in %s:%d\n", __FILE__, + __LINE__); + exit(EXIT_FAILURE); + } + int32_t _host_poisson(double _lambda, const int _vectorisation_idx) + { + printf("ERROR: Called dummy function `_host_poisson` in %s:%d\n", __FILE__, + __LINE__); + exit(EXIT_FAILURE); + } +} + +////// hashdefine_lines /////// + + + + +__global__ void +_run_kernel_synapses_1_pre_codeobject( + int _N, + int bid_offset, + int timestep, + int THREADS_PER_BLOCK, + int threads_per_bundle, + int32_t* eventspace, + int num_spiking_neurons, + ///// KERNEL_PARAMETERS ///// + double* _ptr_array_synapses_1_Apost, + const int _numApost, + double* _ptr_array_synapses_1_Apre, + const int _numApre, + int32_t* _ptr_array_synapses_1_N, + int32_t* _ptr_array_synapses_1__synaptic_post, + const int _num_postsynaptic_idx, + int32_t* _ptr_array_synapses_1__synaptic_pre, + const int _num_synaptic_pre, + double* _ptr_array_neurongroup_1_g_iKC_eKC, + double* _ptr_array_synapses_1_g_raw, + const int _numg_raw, + double* _ptr_array_synapses_1_lastupdate, + const int _numlastupdate, + const double _value_array_defaultclock_t + ) +{ + using namespace brian; + + assert(THREADS_PER_BLOCK == blockDim.x); + + int tid = threadIdx.x; + int bid = blockIdx.x + bid_offset; + //TODO: do we need _idx here? if no, get also rid of scoping after scalar code + // scalar_code can depend on _idx (e.g. if the state update depends on a + // subexpression that is the same for all synapses, ?) + int _idx = bid * THREADS_PER_BLOCK + tid; + int _vectorisation_idx = _idx; + + ///// KERNEL_CONSTANTS ///// + const int _numN = 1; + const int _numg_iKC_eKC = 100; + + ///// kernel_lines ///// + + const double* _ptr_array_defaultclock_t = &_value_array_defaultclock_t; + + + + ///// scalar_code ///// + + const double _lio_1 = 1.0f*1.0/0.01; + const double _lio_2 = 1.0f*1.0/0.01; + + + { // _idx is defined in outer and inner scope (for `scalar_code`) + if (synapses_1_pre.no_or_const_delay_mode) + { + // TODO: pass as kernel parameter instead? 
+ int num_parallel_blocks = synapses_1_pre.queue->num_blocks; + int32_t spikes_start = synapses_1_pre.spikes_start; + int32_t spikes_stop = synapses_1_pre.spikes_stop; + + // for the first delay timesteps the eventspace is not yet filled + // note that num_queues is the number of eventspaces, num_queues-1 the delay in timesteps + if (timestep >= synapses_1_pre.queue->num_queues - 1) + { + // `spiking_neuron_idx` runs through the eventspace + // `post_block_idx` runs through the post neuron blocks of the connectivity matrix + int spiking_neuron_idx = bid / num_parallel_blocks; + int post_block_idx = bid % num_parallel_blocks; + { + + // spiking_neuron is index in NeuronGroup + int32_t spiking_neuron = eventspace[spiking_neuron_idx]; + + assert(spiking_neuron != -1); + + // apply effects if event neuron is in sources of current SynapticPathway + if(spikes_start <= spiking_neuron && spiking_neuron < spikes_stop) + { + int pre_post_block_id = (spiking_neuron - spikes_start) * num_parallel_blocks + post_block_idx; + int num_synapses = synapses_1_pre_num_synapses_by_pre[pre_post_block_id]; + int32_t* propagating_synapses = synapses_1_pre_synapse_ids_by_pre[pre_post_block_id]; + for(int j = tid; j < num_synapses; j+=THREADS_PER_BLOCK) + { + // _idx is the synapse id + int32_t _idx = propagating_synapses[j]; + _vectorisation_idx = j; + + ///// vector_code ///// + + // Abstract code: _Apost := Apost * exp(_lio_1 * (- (t - lastupdate))) + // Abstract code: _Apre := Apre * exp(_lio_2 * (- (t - lastupdate))) + // Abstract code: Apost = _Apost + // Abstract code: Apre = _Apre + // Abstract code: g_iKC_eKC += g_raw + // Abstract code: Apre += 1.0000000000000002e-10 + // Abstract code: g_raw = clip(g_raw + Apost, 0.0, 3.7500000000000005e-09) + // Abstract code: lastupdate = t + const int32_t _postsynaptic_idx = _ptr_array_synapses_1__synaptic_post[_idx]; + double lastupdate = _ptr_array_synapses_1_lastupdate[_idx]; + double Apost = _ptr_array_synapses_1_Apost[_idx]; + double g_raw = _ptr_array_synapses_1_g_raw[_idx]; + const double t = _ptr_array_defaultclock_t[0]; + double Apre = _ptr_array_synapses_1_Apre[_idx]; + const double _Apost = Apost * _brian_exp(_lio_1 * (- (t - lastupdate))); + const double _Apre = Apre * _brian_exp(_lio_2 * (- (t - lastupdate))); + Apost = _Apost; + Apre = _Apre; + _brian_atomicAdd(&_ptr_array_neurongroup_1_g_iKC_eKC[_postsynaptic_idx], (double)(g_raw)); + Apre += 1.0000000000000002e-10; + g_raw = _brian_clip(g_raw + Apost, 0.0, 3.7500000000000005e-09); + lastupdate = t; + _ptr_array_synapses_1_Apre[_idx] = Apre; + _ptr_array_synapses_1_lastupdate[_idx] = lastupdate; + _ptr_array_synapses_1_Apost[_idx] = Apost; + _ptr_array_synapses_1_g_raw[_idx] = g_raw; + + } + } + + __syncthreads(); + } + } + } + else // heterogeneous delay mode + { + cudaVector* synapses_queue; + synapses_1_pre.queue->peek(&synapses_queue); + + int queue_size = synapses_queue[bid].size(); + + // use a fixed number of threads per bundle, i runs through all those threads of all bundles + // for threads_per_bundle == 1, we have one thread per bundle (parallel) + for (int i = tid; i < queue_size*threads_per_bundle; i+=THREADS_PER_BLOCK) + { + // bundle_idx runs through all bundles + int bundle_idx = i / threads_per_bundle; + // syn_in_bundle_idx runs through all threads in a single bundle + int syn_in_bundle_idx = i % threads_per_bundle; + + int bundle_id = synapses_queue[bid].at(bundle_idx); + int bundle_size = synapses_1_pre_num_synapses_by_bundle[bundle_id]; + int synapses_offset = 
synapses_1_pre_synapses_offset_by_bundle[bundle_id]; + int32_t* synapse_ids = synapses_1_pre_synapse_ids; + int32_t* synapse_bundle = synapse_ids + synapses_offset; + + // loop through synapses of this bundle with all available threads_per_bundle + // if threads_per_bundle == 1, this is serial + for (int j = syn_in_bundle_idx; j < bundle_size; j+=threads_per_bundle) + { + int32_t _idx = synapse_bundle[j]; + + + ///// vector_code ///// + + // Abstract code: _Apost := Apost * exp(_lio_1 * (- (t - lastupdate))) + // Abstract code: _Apre := Apre * exp(_lio_2 * (- (t - lastupdate))) + // Abstract code: Apost = _Apost + // Abstract code: Apre = _Apre + // Abstract code: g_iKC_eKC += g_raw + // Abstract code: Apre += 1.0000000000000002e-10 + // Abstract code: g_raw = clip(g_raw + Apost, 0.0, 3.7500000000000005e-09) + // Abstract code: lastupdate = t + const int32_t _postsynaptic_idx = _ptr_array_synapses_1__synaptic_post[_idx]; + double lastupdate = _ptr_array_synapses_1_lastupdate[_idx]; + double Apost = _ptr_array_synapses_1_Apost[_idx]; + double g_raw = _ptr_array_synapses_1_g_raw[_idx]; + const double t = _ptr_array_defaultclock_t[0]; + double Apre = _ptr_array_synapses_1_Apre[_idx]; + const double _Apost = Apost * _brian_exp(_lio_1 * (- (t - lastupdate))); + const double _Apre = Apre * _brian_exp(_lio_2 * (- (t - lastupdate))); + Apost = _Apost; + Apre = _Apre; + _brian_atomicAdd(&_ptr_array_neurongroup_1_g_iKC_eKC[_postsynaptic_idx], (double)(g_raw)); + Apre += 1.0000000000000002e-10; + g_raw = _brian_clip(g_raw + Apost, 0.0, 3.7500000000000005e-09); + lastupdate = t; + _ptr_array_synapses_1_Apre[_idx] = Apre; + _ptr_array_synapses_1_lastupdate[_idx] = lastupdate; + _ptr_array_synapses_1_Apost[_idx] = Apost; + _ptr_array_synapses_1_g_raw[_idx] = g_raw; + + } + } + } + } + } + + + +void _run_synapses_1_pre_codeobject() +{ + using namespace brian; + + + const int _N = _array_synapses_1_N[0]; + + ///// HOST_CONSTANTS /////////// + double* const dev_array_synapses_1_Apost = thrust::raw_pointer_cast(&dev_dynamic_array_synapses_1_Apost[0]); + const int _numApost = dev_dynamic_array_synapses_1_Apost.size(); + double* const dev_array_synapses_1_Apre = thrust::raw_pointer_cast(&dev_dynamic_array_synapses_1_Apre[0]); + const int _numApre = dev_dynamic_array_synapses_1_Apre.size(); + const int _numN = 1; + int32_t* const dev_array_synapses_1__synaptic_post = thrust::raw_pointer_cast(&dev_dynamic_array_synapses_1__synaptic_post[0]); + const int _num_postsynaptic_idx = dev_dynamic_array_synapses_1__synaptic_post.size(); + int32_t* const dev_array_synapses_1__synaptic_pre = thrust::raw_pointer_cast(&dev_dynamic_array_synapses_1__synaptic_pre[0]); + const int _num_synaptic_pre = dev_dynamic_array_synapses_1__synaptic_pre.size(); + const int _numg_iKC_eKC = 100; + double* const dev_array_synapses_1_g_raw = thrust::raw_pointer_cast(&dev_dynamic_array_synapses_1_g_raw[0]); + const int _numg_raw = dev_dynamic_array_synapses_1_g_raw.size(); + double* const dev_array_synapses_1_lastupdate = thrust::raw_pointer_cast(&dev_dynamic_array_synapses_1_lastupdate[0]); + const int _numlastupdate = dev_dynamic_array_synapses_1_lastupdate.size(); + +static int num_threads_per_bundle; +static int num_loops; + + static int num_threads, num_blocks; + static size_t needed_shared_memory = 0; + static bool first_run = true; + if (first_run) + { +// We are using atomics, we can fully parallelise. +num_blocks = num_parallel_blocks; +num_threads = max_threads_per_block; +// TODO: effect of mean instead of max? 
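// Illustrative worked example (hypothetical numbers, not generated code): in the
// heterogeneous-delay kernel above, thread i maps to
//     bundle_idx        = i / threads_per_bundle
//     syn_in_bundle_idx = i % threads_per_bundle
// and each bundle is then traversed with stride threads_per_bundle. With bundle sizes
// {2, 5, 40} and threads_per_bundle = max = 40, the size-40 bundle is handled in a single
// pass, but 38 of the 40 threads assigned to the size-2 bundle idle in the inner j-loop.
// Choosing the mean (~16) instead would idle fewer threads on small bundles, at the cost
// of serialising the largest bundle over ceil(40/16) = 3 iterations.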
+num_threads_per_bundle = synapses_1_pre_max_bundle_size; +num_loops = 1; + + + + // calculate theoretical occupancy + int max_active_blocks; + CUDA_SAFE_CALL( + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, + _run_kernel_synapses_1_pre_codeobject, num_threads, needed_shared_memory) + ); + + float occupancy = (max_active_blocks * num_threads / num_threads_per_warp) / + (float)(max_threads_per_sm / num_threads_per_warp); + + + // check if we have enough ressources to call kernel with given number + // of blocks and threads (can only occur for the else case above as for the + // first max. occupancy) + struct cudaFuncAttributes funcAttrib; + CUDA_SAFE_CALL( + cudaFuncGetAttributes(&funcAttrib, _run_kernel_synapses_1_pre_codeobject) + ); + if (num_threads > funcAttrib.maxThreadsPerBlock) + { + // use the max num_threads before launch failure + num_threads = funcAttrib.maxThreadsPerBlock; + printf("WARNING Not enough ressources available to call " + "_run_kernel_synapses_1_pre_codeobject " + "with maximum possible threads per block (%u). " + "Reducing num_threads to %u. (Kernel needs %i " + "registers per block, %i bytes of " + "statically-allocated shared memory per block, %i " + "bytes of local memory per thread and a total of %i " + "bytes of user-allocated constant memory)\n", + max_threads_per_block, num_threads, funcAttrib.numRegs, + funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, + funcAttrib.constSizeBytes); + + // calculate theoretical occupancy for new num_threads + CUDA_SAFE_CALL( + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, + _run_kernel_synapses_1_pre_codeobject, num_threads, needed_shared_memory) + ); + + occupancy = (max_active_blocks * num_threads / num_threads_per_warp) / + (float)(max_threads_per_sm / num_threads_per_warp); + } + +else if (synapses_1_pre_max_size <= 0) +{ + printf("INFO there are no synapses in the synapses_1_pre pathway. 
Skipping synapses_push and synapses kernels.\n"); +} + + else + { + printf("INFO _run_kernel_synapses_1_pre_codeobject\n" + "\t%u blocks\n" + "\t%u threads\n" + "\t%i registers per block\n" + "\t%i bytes statically-allocated shared memory per block\n" + "\t%i bytes local memory per thread\n" + "\t%i bytes user-allocated constant memory\n" + "\t%.3f theoretical occupancy\n", + num_blocks, num_threads, funcAttrib.numRegs, + funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, + funcAttrib.constSizeBytes, occupancy); + } + first_run = false; + } + + +// only call kernel if we have synapses (otherwise we skipped the push kernel) +if (synapses_1_pre_max_size > 0) +{ + int32_t num_spiking_neurons; + // we only need the number of spiking neurons if we parallelise effect + // application over spiking neurons in homogeneous delay mode + if (synapses_1_pre_scalar_delay) + { + if (defaultclock.timestep[0] >= synapses_1_pre_delay) + { + cudaMemcpyAsync(&num_spiking_neurons, + &dev_array_neurongroup__spikespace[synapses_1_pre_eventspace_idx][_num__array_neurongroup__spikespace - 1], + sizeof(int32_t), cudaMemcpyDeviceToHost,stream1); + num_blocks = num_parallel_blocks * num_spiking_neurons; + //TODO collect info abt mean, std of num spiking neurons per time + //step and print INFO at end of simulation + } + } + // only call kernel if neurons spiked (else num_blocks is zero) + if (num_blocks != 0) { + for(int bid_offset = 0; bid_offset < num_loops; bid_offset++) + { + _run_kernel_synapses_1_pre_codeobject<<>>( + _N, + bid_offset, + defaultclock.timestep[0], + num_threads, + num_threads_per_bundle, + dev_array_neurongroup__spikespace[synapses_1_pre_eventspace_idx], + num_spiking_neurons, + ///// HOST_PARAMETERS ///// + dev_array_synapses_1_Apost, + _numApost, + dev_array_synapses_1_Apre, + _numApre, + dev_array_synapses_1_N, + dev_array_synapses_1__synaptic_post, + _num_postsynaptic_idx, + dev_array_synapses_1__synaptic_pre, + _num_synaptic_pre, + dev_array_neurongroup_1_g_iKC_eKC, + dev_array_synapses_1_g_raw, + _numg_raw, + dev_array_synapses_1_lastupdate, + _numlastupdate, + _array_defaultclock_t[0] + ); + } + } + + CUDA_CHECK_ERROR("_run_kernel_synapses_1_pre_codeobject"); +} + + +} + +void _debugmsg_synapses_1_pre_codeobject() +{ + using namespace brian; + std::cout << "Number of synapses: " << _array_synapses_1_N[0] << endl; +} + diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_1_pre_codeobject.h b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_1_pre_codeobject.h new file mode 100644 index 00000000..edd7d62b --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_1_pre_codeobject.h @@ -0,0 +1,8 @@ +#ifndef _INCLUDED_synapses_1_pre_codeobject +#define _INCLUDED_synapses_1_pre_codeobject + +void _run_synapses_1_pre_codeobject(); + +void _debugmsg_synapses_1_pre_codeobject(); + +#endif diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_1_pre_push_spikes.cu b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_1_pre_push_spikes.cu new file mode 100644 index 00000000..d915dd89 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_1_pre_push_spikes.cu @@ -0,0 +1,374 @@ +#include "code_objects/synapses_1_pre_push_spikes.h" +#include "objects.h" +#include "brianlib/common_math.h" +#include 
"brianlib/cuda_utils.h" +#include "brianlib/stdint_compat.h" +#include +#include +#include +#include + + + +////// SUPPORT CODE /////// +namespace { + double _host_rand(const int _vectorisation_idx); + double _host_randn(const int _vectorisation_idx); + int32_t _host_poisson(double _lambda, const int _vectorisation_idx); + + ///// block extra_device_helper ///// +__global__ void _advance_kernel_synapses_1_pre_push_spikes() +{ + using namespace brian; + int tid = threadIdx.x; + synapses_1_pre.queue->advance( + tid); +} + + ///// support_code_lines ///// + + template < typename T1, typename T2 > struct _higher_type; + template < > struct _higher_type { typedef int type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < typename T1, typename T2 > + __host__ __device__ static inline typename _higher_type::type + _brian_mod(T1 x, T2 y) + {{ + return x-y*floor(1.0*x/y); + }} + template < typename T1, typename T2 > + __host__ __device__ static inline typename _higher_type::type + _brian_floordiv(T1 x, T2 y) + {{ + return floor(1.0*x/y); + }} + #ifdef _MSC_VER + #define _brian_pow(x, y) (pow((double)(x), (y))) + #else + #define _brian_pow(x, y) (pow((x), (y))) + #endif + inline __device__ int _brian_atomicAdd(int* address, int val) + { + // hardware implementation + return atomicAdd(address, val); + } + inline __device__ float _brian_atomicAdd(float* address, float val) + { + // hardware implementation + return atomicAdd(address, val); + } + inline __device__ double _brian_atomicAdd(double* address, double val) + { + #if (__CUDA_ARCH__ >= 600) + // hardware implementation + return atomicAdd(address, val); + #else + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val + + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + #endif + } + inline __device__ int 
_brian_atomicMul(int* address, int val) + { + // software implementation + int old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, val * assumed); + } while (assumed != old); + return old; + } + inline __device__ float _brian_atomicMul(float* address, float val) + { + // software implementation + int* address_as_int = (int*)address; + int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __float_as_int(val * + __int_as_float(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __int_as_float(old); + } + inline __device__ double _brian_atomicMul(double* address, double val) + { + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val * + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + } + inline __device__ int _brian_atomicDiv(int* address, int val) + { + // software implementation + int old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, val / assumed); + } while (assumed != old); + return old; + } + inline __device__ float _brian_atomicDiv(float* address, float val) + { + // software implementation + int* address_as_int = (int*)address; + int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __float_as_int(val / + __int_as_float(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __int_as_float(old); + } + inline __device__ double _brian_atomicDiv(double* address, double val) + { + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val / + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + } + + + // Implement dummy functions such that the host compiled code of binomial + // functions works. Hacky, hacky ... 
+ double _host_rand(const int _vectorisation_idx) + { + printf("ERROR: Called dummy function `_host_rand` in %s:%d\n", __FILE__, + __LINE__); + exit(EXIT_FAILURE); + } + double _host_randn(const int _vectorisation_idx) + { + printf("ERROR: Called dummy function `_host_randn` in %s:%d\n", __FILE__, + __LINE__); + exit(EXIT_FAILURE); + } + int32_t _host_poisson(double _lambda, const int _vectorisation_idx) + { + printf("ERROR: Called dummy function `_host_poisson` in %s:%d\n", __FILE__, + __LINE__); + exit(EXIT_FAILURE); + } +} + +////// hashdefine_lines /////// + + + +__global__ void +_run_kernel_synapses_1_pre_push_spikes( + int num_parallel_blocks, + int _num_blocks, + int _num_threads, + int32_t* _eventspace) +{ + // apparently this is not always true and that is why _num_threads is passed as function argument + // if this assert never fails, we could remove the _num_threads from the argument list + assert(blockDim.x == _num_threads); + + using namespace brian; + + int bid = blockIdx.x; + int tid = threadIdx.x; + + int post_neuron_bid = bid % num_parallel_blocks; + int pre_neuron_idx = bid / num_parallel_blocks; + + int32_t spiking_neuron = _eventspace[pre_neuron_idx]; + assert(spiking_neuron != -1); + + // push to spikequeue if spiking_neuron is in sources of current SynapticPathway + if(synapses_1_pre.spikes_start <= spiking_neuron && spiking_neuron < synapses_1_pre.spikes_stop) + { + synapses_1_pre.queue->push_bundles( + post_neuron_bid, + tid, + _num_threads, + spiking_neuron - synapses_1_pre.spikes_start); + } +} + + +void _run_synapses_1_pre_push_spikes() +{ + using namespace brian; + + + + ///// HOST_CONSTANTS /////////// + const int _numN = 1; + const int _num_spikespace = 2501; + double* const _array_synapses_1_delay = thrust::raw_pointer_cast(&_dynamic_array_synapses_1_delay[0]); + const int _numdelay = _dynamic_array_synapses_1_delay.size(); + double* const dev_array_synapses_1_delay = thrust::raw_pointer_cast(&dev_dynamic_array_synapses_1_delay[0]); + + if (synapses_1_pre_scalar_delay) + { + int num_eventspaces = dev_array_neurongroup__spikespace.size(); + synapses_1_pre_eventspace_idx = (current_idx_array_neurongroup__spikespace - synapses_1_pre_delay + num_eventspaces) % num_eventspaces; + + ////////////////////////////////////////////// + //// No pushing in no_or_const_delay_mode //// + ////////////////////////////////////////////// + } + else if (synapses_1_pre_max_size > 0) + { + + // get the number of spiking neurons + int32_t num_spiking_neurons; + CUDA_SAFE_CALL( + cudaMemcpyAsync(&num_spiking_neurons, + dev_array_neurongroup__spikespace[current_idx_array_neurongroup__spikespace] + _num_spikespace - 1, + sizeof(int32_t), cudaMemcpyDeviceToHost,stream1) + ); + + // advance spike queues + _advance_kernel_synapses_1_pre_push_spikes<<<1, num_parallel_blocks,0,stream1>>>(); + + CUDA_CHECK_ERROR("_advance_kernel_synapses_1_pre_push_spikes"); + + + static int num_threads, num_blocks; + static size_t needed_shared_memory = 0; + static bool first_run = true; + if (first_run) + { + needed_shared_memory = 0; + + // We don't need more than max(num_synapses) threads per block. 
+ num_threads = synapses_1_pre_max_size; + if (num_threads > max_threads_per_block) + { + num_threads = max_threads_per_block; + } + // num_blocks depends on num_spiking_neurons, which changes each time step + + + // calculate theoretical occupancy + int max_active_blocks; + CUDA_SAFE_CALL( + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, + _run_kernel_synapses_1_pre_push_spikes, num_threads, needed_shared_memory) + ); + + float occupancy = (max_active_blocks * num_threads / num_threads_per_warp) / + (float)(max_threads_per_sm / num_threads_per_warp); + + + // check if we have enough ressources to call kernel with given number + // of blocks and threads (can only occur for the else case above as for the + // first max. occupancy) + struct cudaFuncAttributes funcAttrib; + CUDA_SAFE_CALL( + cudaFuncGetAttributes(&funcAttrib, _run_kernel_synapses_1_pre_push_spikes) + ); + if (num_threads > funcAttrib.maxThreadsPerBlock) + { + // use the max num_threads before launch failure + num_threads = funcAttrib.maxThreadsPerBlock; + printf("WARNING Not enough ressources available to call " + "_run_kernel_synapses_1_pre_push_spikes " + "with maximum possible threads per block (%u). " + "Reducing num_threads to %u. (Kernel needs %i " + "registers per block, %i bytes of " + "statically-allocated shared memory per block, %i " + "bytes of local memory per thread and a total of %i " + "bytes of user-allocated constant memory)\n", + max_threads_per_block, num_threads, funcAttrib.numRegs, + funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, + funcAttrib.constSizeBytes); + + // calculate theoretical occupancy for new num_threads + CUDA_SAFE_CALL( + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, + _run_kernel_synapses_1_pre_push_spikes, num_threads, needed_shared_memory) + ); + + occupancy = (max_active_blocks * num_threads / num_threads_per_warp) / + (float)(max_threads_per_sm / num_threads_per_warp); + } + + + else + { + printf("INFO _run_kernel_synapses_1_pre_push_spikes\n" + "\t%u blocks\n" + "\t%u threads\n" + "\t%i registers per block\n" + "\t%i bytes statically-allocated shared memory per block\n" + "\t%i bytes local memory per thread\n" + "\t%i bytes user-allocated constant memory\n" + "\t%.3f theoretical occupancy\n", + num_blocks, num_threads, funcAttrib.numRegs, + funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, + funcAttrib.constSizeBytes, occupancy); + } + first_run = false; + } + + + if (num_spiking_neurons > 0) + { + num_blocks = num_parallel_blocks * num_spiking_neurons; + + _run_kernel_synapses_1_pre_push_spikes<<>>( + num_parallel_blocks, + num_blocks, + num_threads, + dev_array_neurongroup__spikespace[current_idx_array_neurongroup__spikespace]); + + CUDA_CHECK_ERROR("_run_kernel_synapses_1_pre_push_spikes"); + } + } // end else if (synapses_1_pre_max_size > 0) + +} + + diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_1_pre_push_spikes.h b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_1_pre_push_spikes.h new file mode 100644 index 00000000..2ece2ada --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_1_pre_push_spikes.h @@ -0,0 +1,7 @@ +#ifndef _INCLUDED_synapses_1_pre_push_spikes +#define _INCLUDED_synapses_1_pre_push_spikes + +void _run_synapses_1_pre_push_spikes(); + + +#endif diff --git 
a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_1_synapses_create_generator_codeobject.cu b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_1_synapses_create_generator_codeobject.cu new file mode 100644 index 00000000..fd7e7eb6 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_1_synapses_create_generator_codeobject.cu @@ -0,0 +1,506 @@ +#include "code_objects/synapses_1_synapses_create_generator_codeobject.h" +#include "objects.h" +#include "brianlib/common_math.h" +#include "brianlib/cuda_utils.h" +#include "brianlib/stdint_compat.h" +#include +#include +#include +#include + +#include +#include "synapses_classes.h" + +#include +#include +#include +#include "brianlib/cuda_utils.h" +#include + + +////// SUPPORT CODE /////// +namespace { + double _host_rand(const int _vectorisation_idx); + double _host_randn(const int _vectorisation_idx); + int32_t _host_poisson(double _lambda, const int _vectorisation_idx); + + ///// block extra_device_helper ///// + + ///// support_code_lines ///// + + #define _rand(vectorisation_idx) (_ptr_array_synapses_1_synapses_create_generator_codeobject_rand[vectorisation_idx]) + template < typename T1, typename T2 > struct _higher_type; + template < > struct _higher_type { typedef int type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < typename T1, typename T2 > + __host__ __device__ static inline typename _higher_type::type + _brian_mod(T1 x, T2 y) + {{ + return x-y*floor(1.0*x/y); + }} + template < typename T1, typename T2 > + __host__ __device__ static inline typename _higher_type::type + _brian_floordiv(T1 x, T2 y) + {{ + return floor(1.0*x/y); + }} + #ifdef _MSC_VER + #define _brian_pow(x, y) (pow((double)(x), (y))) + #else + #define _brian_pow(x, y) (pow((x), (y))) + #endif + inline __device__ int _brian_atomicAdd(int* address, int val) + { + // hardware implementation + return atomicAdd(address, val); + } + inline __device__ float _brian_atomicAdd(float* address, float val) + { + // hardware 
implementation + return atomicAdd(address, val); + } + inline __device__ double _brian_atomicAdd(double* address, double val) + { + #if (__CUDA_ARCH__ >= 600) + // hardware implementation + return atomicAdd(address, val); + #else + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val + + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + #endif + } + inline __device__ int _brian_atomicMul(int* address, int val) + { + // software implementation + int old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, val * assumed); + } while (assumed != old); + return old; + } + inline __device__ float _brian_atomicMul(float* address, float val) + { + // software implementation + int* address_as_int = (int*)address; + int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __float_as_int(val * + __int_as_float(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __int_as_float(old); + } + inline __device__ double _brian_atomicMul(double* address, double val) + { + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val * + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + } + inline __device__ int _brian_atomicDiv(int* address, int val) + { + // software implementation + int old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, val / assumed); + } while (assumed != old); + return old; + } + inline __device__ float _brian_atomicDiv(float* address, float val) + { + // software implementation + int* address_as_int = (int*)address; + int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __float_as_int(val / + __int_as_float(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __int_as_float(old); + } + inline __device__ double _brian_atomicDiv(double* address, double val) + { + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val / + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + } + + +// NOTE: _ptr_array_synapses_1_synapses_create_generator_codeobject_rand is NOT an array +// but an instance of CurandBuffer, which overloads the operator[], which then just +// returns the next random number in the buffer, ignoring the argument passed to operator[] +// NOTE: Put buffers into anonymous namespace such that _host_rand/n and rand/n +// in main code have access to it. 
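// Illustrative sketch (hypothetical type, not the real CurandBuffer API): the essence of
// the operator[] trick described above — the index argument is ignored and the next
// buffered number is returned, so generated code can keep using array syntax such as
// _rand(_vectorisation_idx).
struct _illustrative_rand_buffer
{
    double data[1024];  // host-side batch of pre-generated random numbers
    int next = 0;
    double operator[](int /* index is ignored */)
    {
        // a real buffer would refill from the cuRAND generator when the batch is used up
        return data[next++ % 1024];
    }
};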
+// NOTE: _host_rand/n is used in the host compiled implementation of binomial +// functions. Here, it just returns the next element from the CurandBuffer. +CurandBuffer _ptr_array_synapses_1_synapses_create_generator_codeobject_rand(&brian::curand_generator, RAND); +randomNumber_t _host_rand(const int _vectorisation_idx) +{ + return _ptr_array_synapses_1_synapses_create_generator_codeobject_rand[_vectorisation_idx]; +} + +CurandBuffer _ptr_array_synapses_1_synapses_create_generator_codeobject_randn(&brian::curand_generator, RANDN); +randomNumber_t _host_randn(const int _vectorisation_idx) +{ + return _ptr_array_synapses_1_synapses_create_generator_codeobject_randn[_vectorisation_idx]; +} + +// This is the C++ Standalone implementation of the poisson function, which we use +double _loggam(double x) { + double x0, x2, xp, gl, gl0; + int32_t k, n; + + static double a[10] = {8.333333333333333e-02, -2.777777777777778e-03, + 7.936507936507937e-04, -5.952380952380952e-04, + 8.417508417508418e-04, -1.917526917526918e-03, + 6.410256410256410e-03, -2.955065359477124e-02, + 1.796443723688307e-01, -1.39243221690590e+00}; + x0 = x; + n = 0; + if ((x == 1.0) || (x == 2.0)) + return 0.0; + else if (x <= 7.0) { + n = (int32_t)(7 - x); + x0 = x + n; + } + x2 = 1.0 / (x0 * x0); + xp = 2 * M_PI; + gl0 = a[9]; + for (k=8; k>=0; k--) { + gl0 *= x2; + gl0 += a[k]; + } + gl = gl0 / x0 + 0.5 * log(xp) + (x0 - 0.5) * log(x0) - x0; + if (x <= 7.0) { + for (k=1; k<=n; k++) { + gl -= log(x0 - 1.0); + x0 -= 1.0; + } + } + return gl; +} + +int32_t _poisson_mult(double lam, int _vectorisation_idx) { + int32_t X; + double prod, U, enlam; + + enlam = exp(-lam); + X = 0; + prod = 1.0; + while (1) { + U = _rand(_vectorisation_idx); + prod *= U; + if (prod > enlam) + X += 1; + else + return X; + } +} + +int32_t _poisson_ptrs(double lam, int _vectorisation_idx) { + int32_t k; + double U, V, slam, loglam, a, b, invalpha, vr, us; + + slam = sqrt(lam); + loglam = log(lam); + b = 0.931 + 2.53 * slam; + a = -0.059 + 0.02483 * b; + invalpha = 1.1239 + 1.1328 / (b - 3.4); + vr = 0.9277 - 3.6224 / (b - 2); + + while (1) { + U = _rand(_vectorisation_idx) - 0.5; + V = _rand(_vectorisation_idx); + us = 0.5 - abs(U); + k = (int32_t)floor((2 * a / us + b) * U + lam + 0.43); + if ((us >= 0.07) && (V <= vr)) + return k; + if ((k < 0) || ((us < 0.013) && (V > us))) + continue; + if ((log(V) + log(invalpha) - log(a / (us * us) + b)) <= + (-lam + k * loglam - _loggam(k + 1))) + return k; + } +} + +int32_t _host_poisson(double lam, int32_t _idx) { + if (lam >= 10) + return _poisson_ptrs(lam, _idx); + else if (lam == 0) + return 0; + else + return _poisson_mult(lam, _idx); +} +} + +////// hashdefine_lines /////// + + + + + +void _run_synapses_1_synapses_create_generator_codeobject() +{ + using namespace brian; + +std::clock_t start_timer = std::clock(); + +CUDA_CHECK_MEMORY(); +size_t used_device_memory_start = used_device_memory; + + + ///// HOST_CONSTANTS /////////// + const int _numN = 1; + int32_t* const _array_synapses_1_N_incoming = thrust::raw_pointer_cast(&_dynamic_array_synapses_1_N_incoming[0]); + const int _numN_incoming = _dynamic_array_synapses_1_N_incoming.size(); + int32_t* const _array_synapses_1_N_outgoing = thrust::raw_pointer_cast(&_dynamic_array_synapses_1_N_outgoing[0]); + const int _numN_outgoing = _dynamic_array_synapses_1_N_outgoing.size(); + int32_t* const _array_synapses_1__synaptic_post = thrust::raw_pointer_cast(&_dynamic_array_synapses_1__synaptic_post[0]); + const int _num_synaptic_post = 
_dynamic_array_synapses_1__synaptic_post.size(); + int32_t* const _array_synapses_1__synaptic_pre = thrust::raw_pointer_cast(&_dynamic_array_synapses_1__synaptic_pre[0]); + const int _num_synaptic_pre = _dynamic_array_synapses_1__synaptic_pre.size(); + + + ///// pointers_lines ///// + + int32_t* __restrict _ptr_array_synapses_1_N_outgoing = _array_synapses_1_N_outgoing; + int32_t* __restrict _ptr_array_synapses_1_N_incoming = _array_synapses_1_N_incoming; + int32_t* _ptr_array_synapses_1_N = _array_synapses_1_N; + int32_t* __restrict _ptr_array_synapses_1__synaptic_pre = _array_synapses_1__synaptic_pre; + int32_t* __restrict _ptr_array_synapses_1__synaptic_post = _array_synapses_1__synaptic_post; + + + const int _N_pre = 2500; + const int _N_post = 100; + _dynamic_array_synapses_1_N_incoming.resize(_N_post + 0); + _dynamic_array_synapses_1_N_outgoing.resize(_N_pre + 0); + + int _raw_pre_idx, _raw_post_idx; + const int _vectorisation_idx = -1; + ///// scalar_code['setup_iterator'] ///// + + + ///// scalar_code['create_j'] ///// + + + ///// scalar_code['create_cond'] ///// + + + ///// scalar_code['update_post'] ///// + + + + for(int _i = 0; _i < _N_pre; _i++) + { + + bool __cond, _cond; + _raw_pre_idx = _i + 0; + { + ///// vector_code['create_cond'] ///// + + const char _cond = true; + + __cond = _cond; + } + _cond = __cond; + if(!_cond) continue; + // Some explanation of this hackery. The problem is that we have multiple code blocks. + // Each code block is generated independently of the others, and they declare variables + // at the beginning if necessary (including declaring them as const if their values don't + // change). However, if two code blocks follow each other in the same C++ scope then + // that causes a redeclaration error. So we solve it by putting each block inside a + // pair of braces to create a new scope specific to each code block. However, that brings + // up another problem: we need the values from these code blocks. I don't have a general + // solution to this problem, but in the case of this particular template, we know which + // values we need from them so we simply create outer scoped variables to copy the value + // into. Later on we have a slightly more complicated problem because the original name + // _j has to be used, so we create two variables __j, _j at the outer scope, copy + // _j to __j in the inner scope (using the inner scope version of _j), and then + // __j to _j in the outer scope (to the outer scope version of _j). This outer scope + // version of _j will then be used in subsequent blocks. 
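// Illustrative skeleton (comments only, hypothetical values) of the scoping pattern
// described above: each generated block gets its own braces so it may re-declare
// (possibly const) names, and values that later blocks need are copied out through
// double-underscore variables in the enclosing scope:
//
//     long __j, _j;            // outer-scope holders
//     {
//         const long _j = 7;   // inner-scope (re)declaration inside one code block
//         __j = _j;            // copy the inner value out
//     }
//     _j = __j;                // outer-scope _j, visible to the subsequent blocks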
+ long _uiter_low; + long _uiter_high; + long _uiter_step; + { + ///// vector_code['setup_iterator'] ///// + + const int32_t _iter_low = 0; + const int32_t _iter_high = 100; + const int32_t _iter_step = 1; + + _uiter_low = _iter_low; + _uiter_high = _iter_high; + _uiter_step = _iter_step; + } + for(int _k=_uiter_low; _k<_uiter_high; _k+=_uiter_step) + { + long __j, _j, _pre_idx, __pre_idx; + { + ///// vector_code['create_j'] ///// + + const int32_t _pre_idx = _raw_pre_idx; + const int32_t _j = _k; + + __j = _j; // pick up the locally scoped _j and store in __j + __pre_idx = _pre_idx; + } + _j = __j; // make the previously locally scoped _j available + _pre_idx = __pre_idx; + _raw_post_idx = _j + 0; + + + if(_j<0 || _j>=_N_post) + { + cout << "Error: tried to create synapse to neuron j=" << _j << " outside range 0 to " << + _N_post-1 << endl; + exit(1); + } + + ///// vector_code['update_post'] ///// + + const int32_t _post_idx = _raw_post_idx; + const int32_t _n = 1; + + + for (int _repetition=0; _repetition<_n; _repetition++) { + _dynamic_array_synapses_1_N_outgoing[_pre_idx] += 1; + _dynamic_array_synapses_1_N_incoming[_post_idx] += 1; + _dynamic_array_synapses_1__synaptic_pre.push_back(_pre_idx); + _dynamic_array_synapses_1__synaptic_post.push_back(_post_idx); + } + } + } + + // now we need to resize all registered variables + const int32_t newsize = _dynamic_array_synapses_1__synaptic_pre.size(); + THRUST_CHECK_ERROR( + dev_dynamic_array_synapses_1__synaptic_post.resize(newsize) + ); + _dynamic_array_synapses_1__synaptic_post.resize(newsize); + THRUST_CHECK_ERROR( + dev_dynamic_array_synapses_1__synaptic_pre.resize(newsize) + ); + _dynamic_array_synapses_1__synaptic_pre.resize(newsize); + THRUST_CHECK_ERROR( + dev_dynamic_array_synapses_1_Apost.resize(newsize) + ); + _dynamic_array_synapses_1_Apost.resize(newsize); + THRUST_CHECK_ERROR( + dev_dynamic_array_synapses_1_Apre.resize(newsize) + ); + _dynamic_array_synapses_1_Apre.resize(newsize); + THRUST_CHECK_ERROR( + dev_dynamic_array_synapses_1_delay_1.resize(newsize) + ); + _dynamic_array_synapses_1_delay_1.resize(newsize); + THRUST_CHECK_ERROR( + dev_dynamic_array_synapses_1_g_raw.resize(newsize) + ); + _dynamic_array_synapses_1_g_raw.resize(newsize); + THRUST_CHECK_ERROR( + dev_dynamic_array_synapses_1_lastupdate.resize(newsize) + ); + _dynamic_array_synapses_1_lastupdate.resize(newsize); + + // update the total number of synapses + _ptr_array_synapses_1_N[0] = newsize; + + // Check for occurrence of multiple source-target pairs in synapses ("synapse number") + std::map, int32_t> source_target_count; + for (int _i=0; _i source_target = std::pair(_dynamic_array_synapses_1__synaptic_pre[_i], _dynamic_array_synapses_1__synaptic_post[_i]); + source_target_count[source_target]++; + //printf("source target count = %i\n", source_target_count[source_target]); + if (source_target_count[source_target] > 1) + { + synapses_1_multiple_pre_post = true; + break; + } + } + + // copy changed host data to device + dev_dynamic_array_synapses_1_N_incoming = _dynamic_array_synapses_1_N_incoming; + dev_dynamic_array_synapses_1_N_outgoing = _dynamic_array_synapses_1_N_outgoing; + dev_dynamic_array_synapses_1__synaptic_pre = _dynamic_array_synapses_1__synaptic_pre; + dev_dynamic_array_synapses_1__synaptic_post = _dynamic_array_synapses_1__synaptic_post; + CUDA_SAFE_CALL( + cudaMemcpyAsync(dev_array_synapses_1_N, + _array_synapses_1_N, + sizeof(int32_t), + cudaMemcpyHostToDevice,stream1) + ); + + + + +// free memory in CurandBuffers 
+_ptr_array_synapses_1_synapses_create_generator_codeobject_rand.free_memory(); +_ptr_array_synapses_1_synapses_create_generator_codeobject_randn.free_memory(); + +CUDA_CHECK_MEMORY(); +const double to_MB = 1.0 / (1024.0 * 1024.0); +double tot_memory_MB = (used_device_memory - used_device_memory_start) * to_MB; +double time_passed = (double)(std::clock() - start_timer) / CLOCKS_PER_SEC; +std::cout << "INFO: synapses_1 creation took " << time_passed << "s"; +if (tot_memory_MB > 0) + std::cout << " and used " << tot_memory_MB << "MB of memory."; +std::cout << std::endl; +} + + diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_1_synapses_create_generator_codeobject.h b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_1_synapses_create_generator_codeobject.h new file mode 100644 index 00000000..912c2be7 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_1_synapses_create_generator_codeobject.h @@ -0,0 +1,7 @@ +#ifndef _INCLUDED_synapses_1_synapses_create_generator_codeobject +#define _INCLUDED_synapses_1_synapses_create_generator_codeobject + +void _run_synapses_1_synapses_create_generator_codeobject(); + + +#endif diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_2_pre_codeobject.cu b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_2_pre_codeobject.cu new file mode 100644 index 00000000..3c40cdea --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_2_pre_codeobject.cu @@ -0,0 +1,488 @@ +#include "code_objects/synapses_2_pre_codeobject.h" +#include "objects.h" +#include "brianlib/common_math.h" +#include "brianlib/cuda_utils.h" +#include "brianlib/stdint_compat.h" +#include +#include +#include +#include + +#include +#include "synapses_classes.h" + + +////// SUPPORT CODE /////// +namespace { + double _host_rand(const int _vectorisation_idx); + double _host_randn(const int _vectorisation_idx); + int32_t _host_poisson(double _lambda, const int _vectorisation_idx); + + ///// block extra_device_helper ///// + + ///// support_code_lines ///// + + template < typename T1, typename T2 > struct _higher_type; + template < > struct _higher_type { typedef int type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { 
typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < typename T1, typename T2 > + __host__ __device__ static inline typename _higher_type::type + _brian_mod(T1 x, T2 y) + {{ + return x-y*floor(1.0*x/y); + }} + template < typename T1, typename T2 > + __host__ __device__ static inline typename _higher_type::type + _brian_floordiv(T1 x, T2 y) + {{ + return floor(1.0*x/y); + }} + #ifdef _MSC_VER + #define _brian_pow(x, y) (pow((double)(x), (y))) + #else + #define _brian_pow(x, y) (pow((x), (y))) + #endif + inline __device__ int _brian_atomicAdd(int* address, int val) + { + // hardware implementation + return atomicAdd(address, val); + } + inline __device__ float _brian_atomicAdd(float* address, float val) + { + // hardware implementation + return atomicAdd(address, val); + } + inline __device__ double _brian_atomicAdd(double* address, double val) + { + #if (__CUDA_ARCH__ >= 600) + // hardware implementation + return atomicAdd(address, val); + #else + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val + + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + #endif + } + inline __device__ int _brian_atomicMul(int* address, int val) + { + // software implementation + int old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, val * assumed); + } while (assumed != old); + return old; + } + inline __device__ float _brian_atomicMul(float* address, float val) + { + // software implementation + int* address_as_int = (int*)address; + int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __float_as_int(val * + __int_as_float(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __int_as_float(old); + } + inline __device__ double _brian_atomicMul(double* address, double val) + { + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val * + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + } + inline __device__ int _brian_atomicDiv(int* address, int val) + { + // software implementation + int old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, val / assumed); + } while (assumed != old); + return old; + } + inline __device__ float _brian_atomicDiv(float* address, float val) + { + // software implementation + int* address_as_int = (int*)address; + int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __float_as_int(val / + __int_as_float(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } 
while (assumed != old); + return __int_as_float(old); + } + inline __device__ double _brian_atomicDiv(double* address, double val) + { + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val / + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + } + + + // Implement dummy functions such that the host compiled code of binomial + // functions works. Hacky, hacky ... + double _host_rand(const int _vectorisation_idx) + { + printf("ERROR: Called dummy function `_host_rand` in %s:%d\n", __FILE__, + __LINE__); + exit(EXIT_FAILURE); + } + double _host_randn(const int _vectorisation_idx) + { + printf("ERROR: Called dummy function `_host_rand` in %s:%d\n", __FILE__, + __LINE__); + exit(EXIT_FAILURE); + } + int32_t _host_poisson(double _lambda, const int _vectorisation_idx) + { + printf("ERROR: Called dummy function `_host_poisson` in %s:%d\n", __FILE__, + __LINE__); + exit(EXIT_FAILURE); + } +} + +////// hashdefine_lines /////// + + + + +__global__ void +_run_kernel_synapses_2_pre_codeobject( + int _N, + int bid_offset, + int timestep, + int THREADS_PER_BLOCK, + int threads_per_bundle, + int32_t* eventspace, + int num_spiking_neurons, + ///// KERNEL_PARAMETERS ///// + int32_t* _ptr_array_synapses_2_N, + int32_t* _ptr_array_synapses_2__synaptic_post, + const int _num_postsynaptic_idx, + int32_t* _ptr_array_synapses_2__synaptic_pre, + const int _num_synaptic_pre, + double* _ptr_array_neurongroup_1_g_eKC_eKC + ) +{ + using namespace brian; + + assert(THREADS_PER_BLOCK == blockDim.x); + + int tid = threadIdx.x; + int bid = blockIdx.x + bid_offset; + //TODO: do we need _idx here? if no, get also rid of scoping after scalar code + // scalar_code can depend on _idx (e.g. if the state update depends on a + // subexpression that is the same for all synapses, ?) + int _idx = bid * THREADS_PER_BLOCK + tid; + int _vectorisation_idx = _idx; + + ///// KERNEL_CONSTANTS ///// + const int _numN = 1; + const int _numg_eKC_eKC = 100; + + ///// kernel_lines ///// + + + + + ///// scalar_code ///// + + const double _lio_1 = 0.675 * 7.500000000000001e-08; + + + { // _idx is defined in outer and inner scope (for `scalar_code`) + if (synapses_2_pre.no_or_const_delay_mode) + { + // TODO: pass as kernel parameter instead? 
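+            // Homogeneous (no/const) delay mode: each CUDA block handles one
+            // (spiking neuron, post-neuron block) pair, decoded from bid below.
+            // Effects are applied via atomicAdd, so all blocks can run independently.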
+ int num_parallel_blocks = synapses_2_pre.queue->num_blocks; + int32_t spikes_start = synapses_2_pre.spikes_start; + int32_t spikes_stop = synapses_2_pre.spikes_stop; + + // for the first delay timesteps the eventspace is not yet filled + // note that num_queues is the number of eventspaces, num_queues-1 the delay in timesteps + if (timestep >= synapses_2_pre.queue->num_queues - 1) + { + // `spiking_neuron_idx` runs through the eventspace + // `post_block_idx` runs through the post neuron blocks of the connectivity matrix + int spiking_neuron_idx = bid / num_parallel_blocks; + int post_block_idx = bid % num_parallel_blocks; + { + + // spiking_neuron is index in NeuronGroup + int32_t spiking_neuron = eventspace[spiking_neuron_idx]; + + assert(spiking_neuron != -1); + + // apply effects if event neuron is in sources of current SynapticPathway + if(spikes_start <= spiking_neuron && spiking_neuron < spikes_stop) + { + int pre_post_block_id = (spiking_neuron - spikes_start) * num_parallel_blocks + post_block_idx; + int num_synapses = synapses_2_pre_num_synapses_by_pre[pre_post_block_id]; + int32_t* propagating_synapses = synapses_2_pre_synapse_ids_by_pre[pre_post_block_id]; + for(int j = tid; j < num_synapses; j+=THREADS_PER_BLOCK) + { + // _idx is the synapse id + int32_t _idx = propagating_synapses[j]; + _vectorisation_idx = j; + + ///// vector_code ///// + + // Abstract code: g_eKC_eKC += _lio_1 + const int32_t _postsynaptic_idx = _ptr_array_synapses_2__synaptic_post[_idx]; + _brian_atomicAdd(&_ptr_array_neurongroup_1_g_eKC_eKC[_postsynaptic_idx], (double)(_lio_1)); + + } + } + + __syncthreads(); + } + } + } + else // heterogeneous delay mode + { + cudaVector* synapses_queue; + synapses_2_pre.queue->peek(&synapses_queue); + + int queue_size = synapses_queue[bid].size(); + + // use a fixed number of threads per bundle, i runs through all those threads of all bundles + // for threads_per_bundle == 1, we have one thread per bundle (parallel) + for (int i = tid; i < queue_size*threads_per_bundle; i+=THREADS_PER_BLOCK) + { + // bundle_idx runs through all bundles + int bundle_idx = i / threads_per_bundle; + // syn_in_bundle_idx runs through all threads in a single bundle + int syn_in_bundle_idx = i % threads_per_bundle; + + int bundle_id = synapses_queue[bid].at(bundle_idx); + int bundle_size = synapses_2_pre_num_synapses_by_bundle[bundle_id]; + int synapses_offset = synapses_2_pre_synapses_offset_by_bundle[bundle_id]; + int32_t* synapse_ids = synapses_2_pre_synapse_ids; + int32_t* synapse_bundle = synapse_ids + synapses_offset; + + // loop through synapses of this bundle with all available threads_per_bundle + // if threads_per_bundle == 1, this is serial + for (int j = syn_in_bundle_idx; j < bundle_size; j+=threads_per_bundle) + { + int32_t _idx = synapse_bundle[j]; + + + ///// vector_code ///// + + // Abstract code: g_eKC_eKC += _lio_1 + const int32_t _postsynaptic_idx = _ptr_array_synapses_2__synaptic_post[_idx]; + _brian_atomicAdd(&_ptr_array_neurongroup_1_g_eKC_eKC[_postsynaptic_idx], (double)(_lio_1)); + + } + } + } + } + } + + + +void _run_synapses_2_pre_codeobject() +{ + using namespace brian; + + + const int _N = _array_synapses_2_N[0]; + + ///// HOST_CONSTANTS /////////// + const int _numN = 1; + int32_t* const dev_array_synapses_2__synaptic_post = thrust::raw_pointer_cast(&dev_dynamic_array_synapses_2__synaptic_post[0]); + const int _num_postsynaptic_idx = dev_dynamic_array_synapses_2__synaptic_post.size(); + int32_t* const dev_array_synapses_2__synaptic_pre = 
thrust::raw_pointer_cast(&dev_dynamic_array_synapses_2__synaptic_pre[0]); + const int _num_synaptic_pre = dev_dynamic_array_synapses_2__synaptic_pre.size(); + const int _numg_eKC_eKC = 100; + +static int num_threads_per_bundle; +static int num_loops; + + static int num_threads, num_blocks; + static size_t needed_shared_memory = 0; + static bool first_run = true; + if (first_run) + { +// We are using atomics, we can fully parallelise. +num_blocks = num_parallel_blocks; +num_threads = max_threads_per_block; +// TODO: effect of mean instead of max? +num_threads_per_bundle = synapses_2_pre_max_bundle_size; +num_loops = 1; + + + + // calculate theoretical occupancy + int max_active_blocks; + CUDA_SAFE_CALL( + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, + _run_kernel_synapses_2_pre_codeobject, num_threads, needed_shared_memory) + ); + + float occupancy = (max_active_blocks * num_threads / num_threads_per_warp) / + (float)(max_threads_per_sm / num_threads_per_warp); + + + // check if we have enough ressources to call kernel with given number + // of blocks and threads (can only occur for the else case above as for the + // first max. occupancy) + struct cudaFuncAttributes funcAttrib; + CUDA_SAFE_CALL( + cudaFuncGetAttributes(&funcAttrib, _run_kernel_synapses_2_pre_codeobject) + ); + if (num_threads > funcAttrib.maxThreadsPerBlock) + { + // use the max num_threads before launch failure + num_threads = funcAttrib.maxThreadsPerBlock; + printf("WARNING Not enough ressources available to call " + "_run_kernel_synapses_2_pre_codeobject " + "with maximum possible threads per block (%u). " + "Reducing num_threads to %u. (Kernel needs %i " + "registers per block, %i bytes of " + "statically-allocated shared memory per block, %i " + "bytes of local memory per thread and a total of %i " + "bytes of user-allocated constant memory)\n", + max_threads_per_block, num_threads, funcAttrib.numRegs, + funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, + funcAttrib.constSizeBytes); + + // calculate theoretical occupancy for new num_threads + CUDA_SAFE_CALL( + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, + _run_kernel_synapses_2_pre_codeobject, num_threads, needed_shared_memory) + ); + + occupancy = (max_active_blocks * num_threads / num_threads_per_warp) / + (float)(max_threads_per_sm / num_threads_per_warp); + } + +else if (synapses_2_pre_max_size <= 0) +{ + printf("INFO there are no synapses in the synapses_2_pre pathway. 
Skipping synapses_push and synapses kernels.\n"); +} + + else + { + printf("INFO _run_kernel_synapses_2_pre_codeobject\n" + "\t%u blocks\n" + "\t%u threads\n" + "\t%i registers per block\n" + "\t%i bytes statically-allocated shared memory per block\n" + "\t%i bytes local memory per thread\n" + "\t%i bytes user-allocated constant memory\n" + "\t%.3f theoretical occupancy\n", + num_blocks, num_threads, funcAttrib.numRegs, + funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, + funcAttrib.constSizeBytes, occupancy); + } + first_run = false; + } + + +// only call kernel if we have synapses (otherwise we skipped the push kernel) +if (synapses_2_pre_max_size > 0) +{ + int32_t num_spiking_neurons; + // we only need the number of spiking neurons if we parallelise effect + // application over spiking neurons in homogeneous delay mode + if (synapses_2_pre_scalar_delay) + { + if (defaultclock.timestep[0] >= synapses_2_pre_delay) + { + cudaMemcpyAsync(&num_spiking_neurons, + &dev_array_neurongroup_1__spikespace[synapses_2_pre_eventspace_idx][_num__array_neurongroup_1__spikespace - 1], + sizeof(int32_t), cudaMemcpyDeviceToHost,stream2); + num_blocks = num_parallel_blocks * num_spiking_neurons; + //TODO collect info abt mean, std of num spiking neurons per time + //step and print INFO at end of simulation + } + } + // only call kernel if neurons spiked (else num_blocks is zero) + if (num_blocks != 0) { + for(int bid_offset = 0; bid_offset < num_loops; bid_offset++) + { + _run_kernel_synapses_2_pre_codeobject<<>>( + _N, + bid_offset, + defaultclock.timestep[0], + num_threads, + num_threads_per_bundle, + dev_array_neurongroup_1__spikespace[synapses_2_pre_eventspace_idx], + num_spiking_neurons, + ///// HOST_PARAMETERS ///// + dev_array_synapses_2_N, + dev_array_synapses_2__synaptic_post, + _num_postsynaptic_idx, + dev_array_synapses_2__synaptic_pre, + _num_synaptic_pre, + dev_array_neurongroup_1_g_eKC_eKC + ); + } + } + + CUDA_CHECK_ERROR("_run_kernel_synapses_2_pre_codeobject"); +} + + +} + +void _debugmsg_synapses_2_pre_codeobject() +{ + using namespace brian; + std::cout << "Number of synapses: " << _array_synapses_2_N[0] << endl; +} + diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_2_pre_codeobject.h b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_2_pre_codeobject.h new file mode 100644 index 00000000..f8cc950c --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_2_pre_codeobject.h @@ -0,0 +1,8 @@ +#ifndef _INCLUDED_synapses_2_pre_codeobject +#define _INCLUDED_synapses_2_pre_codeobject + +void _run_synapses_2_pre_codeobject(); + +void _debugmsg_synapses_2_pre_codeobject(); + +#endif diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_2_pre_push_spikes.cu b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_2_pre_push_spikes.cu new file mode 100644 index 00000000..0ce86057 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_2_pre_push_spikes.cu @@ -0,0 +1,374 @@ +#include "code_objects/synapses_2_pre_push_spikes.h" +#include "objects.h" +#include "brianlib/common_math.h" +#include "brianlib/cuda_utils.h" +#include "brianlib/stdint_compat.h" +#include +#include +#include +#include + + + +////// SUPPORT CODE /////// +namespace { + double _host_rand(const int _vectorisation_idx); + 
double _host_randn(const int _vectorisation_idx); + int32_t _host_poisson(double _lambda, const int _vectorisation_idx); + + ///// block extra_device_helper ///// +__global__ void _advance_kernel_synapses_2_pre_push_spikes() +{ + using namespace brian; + int tid = threadIdx.x; + synapses_2_pre.queue->advance( + tid); +} + + ///// support_code_lines ///// + + template < typename T1, typename T2 > struct _higher_type; + template < > struct _higher_type { typedef int type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < typename T1, typename T2 > + __host__ __device__ static inline typename _higher_type::type + _brian_mod(T1 x, T2 y) + {{ + return x-y*floor(1.0*x/y); + }} + template < typename T1, typename T2 > + __host__ __device__ static inline typename _higher_type::type + _brian_floordiv(T1 x, T2 y) + {{ + return floor(1.0*x/y); + }} + #ifdef _MSC_VER + #define _brian_pow(x, y) (pow((double)(x), (y))) + #else + #define _brian_pow(x, y) (pow((x), (y))) + #endif + inline __device__ int _brian_atomicAdd(int* address, int val) + { + // hardware implementation + return atomicAdd(address, val); + } + inline __device__ float _brian_atomicAdd(float* address, float val) + { + // hardware implementation + return atomicAdd(address, val); + } + inline __device__ double _brian_atomicAdd(double* address, double val) + { + #if (__CUDA_ARCH__ >= 600) + // hardware implementation + return atomicAdd(address, val); + #else + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val + + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + #endif + } + inline __device__ int _brian_atomicMul(int* address, int val) + { + // software implementation + int old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, val * assumed); + } while (assumed != old); + 
return old; + } + inline __device__ float _brian_atomicMul(float* address, float val) + { + // software implementation + int* address_as_int = (int*)address; + int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __float_as_int(val * + __int_as_float(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __int_as_float(old); + } + inline __device__ double _brian_atomicMul(double* address, double val) + { + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val * + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + } + inline __device__ int _brian_atomicDiv(int* address, int val) + { + // software implementation + int old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, val / assumed); + } while (assumed != old); + return old; + } + inline __device__ float _brian_atomicDiv(float* address, float val) + { + // software implementation + int* address_as_int = (int*)address; + int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __float_as_int(val / + __int_as_float(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __int_as_float(old); + } + inline __device__ double _brian_atomicDiv(double* address, double val) + { + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val / + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + } + + + // Implement dummy functions such that the host compiled code of binomial + // functions works. Hacky, hacky ... 
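+    // These host stubs are never meant to run; they abort with an error if called.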
+ double _host_rand(const int _vectorisation_idx) + { + printf("ERROR: Called dummy function `_host_rand` in %s:%d\n", __FILE__, + __LINE__); + exit(EXIT_FAILURE); + } + double _host_randn(const int _vectorisation_idx) + { + printf("ERROR: Called dummy function `_host_rand` in %s:%d\n", __FILE__, + __LINE__); + exit(EXIT_FAILURE); + } + int32_t _host_poisson(double _lambda, const int _vectorisation_idx) + { + printf("ERROR: Called dummy function `_host_poisson` in %s:%d\n", __FILE__, + __LINE__); + exit(EXIT_FAILURE); + } +} + +////// hashdefine_lines /////// + + + +__global__ void +_run_kernel_synapses_2_pre_push_spikes( + int num_parallel_blocks, + int _num_blocks, + int _num_threads, + int32_t* _eventspace) +{ + // apperently this is not always true and that is why _num_threads is passed as function argument + // if this assert never fails, we could remove the _num_threads form the argument list + assert(blockDim.x == _num_threads); + + using namespace brian; + + int bid = blockIdx.x; + int tid = threadIdx.x; + + int post_neuron_bid = bid % num_parallel_blocks; + int pre_neuron_idx = bid / num_parallel_blocks; + + int32_t spiking_neuron = _eventspace[pre_neuron_idx]; + assert(spiking_neuron != -1); + + // push to spikequeue if spiking_neuron is in sources of current SynapticPathway + if(synapses_2_pre.spikes_start <= spiking_neuron && spiking_neuron < synapses_2_pre.spikes_stop) + { + synapses_2_pre.queue->push_bundles( + post_neuron_bid, + tid, + _num_threads, + spiking_neuron - synapses_2_pre.spikes_start); + } +} + + +void _run_synapses_2_pre_push_spikes() +{ + using namespace brian; + + + + ///// HOST_CONSTANTS /////////// + const int _numN = 1; + const int _num_spikespace = 101; + double* const _array_synapses_2_delay = thrust::raw_pointer_cast(&_dynamic_array_synapses_2_delay[0]); + const int _numdelay = _dynamic_array_synapses_2_delay.size(); + double* const dev_array_synapses_2_delay = thrust::raw_pointer_cast(&dev_dynamic_array_synapses_2_delay[0]); + + if (synapses_2_pre_scalar_delay) + { + int num_eventspaces = dev_array_neurongroup_1__spikespace.size(); + synapses_2_pre_eventspace_idx = (current_idx_array_neurongroup_1__spikespace - synapses_2_pre_delay + num_eventspaces) % num_eventspaces; + + ////////////////////////////////////////////// + //// No pushing in no_or_const_delay_mode //// + ////////////////////////////////////////////// + } + else if (synapses_2_pre_max_size > 0) + { + + // get the number of spiking neurons + int32_t num_spiking_neurons; + CUDA_SAFE_CALL( + cudaMemcpyAsync(&num_spiking_neurons, + dev_array_neurongroup_1__spikespace[current_idx_array_neurongroup_1__spikespace] + _num_spikespace - 1, + sizeof(int32_t), cudaMemcpyDeviceToHost, stream2) + ); + + // advance spike queues + _advance_kernel_synapses_2_pre_push_spikes<<<1, num_parallel_blocks,0,stream2>>>(); + + CUDA_CHECK_ERROR("_advance_kernel_synapses_2_pre_push_spikes"); + + + static int num_threads, num_blocks; + static size_t needed_shared_memory = 0; + static bool first_run = true; + if (first_run) + { + needed_shared_memory = 0; + + // We don't need more then max(num_synapses) threads per block. 
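+        // num_threads starts at the largest pathway size (synapses_2_pre_max_size) and is
+        // capped at max_threads_per_block below; num_blocks is recomputed every time step
+        // as num_parallel_blocks * num_spiking_neurons.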
+ num_threads = synapses_2_pre_max_size; + if (num_threads > max_threads_per_block) + { + num_threads = max_threads_per_block; + } + // num_blocks depends on num_spiking_neurons, which changes each time step + + + // calculate theoretical occupancy + int max_active_blocks; + CUDA_SAFE_CALL( + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, + _run_kernel_synapses_2_pre_push_spikes, num_threads, needed_shared_memory) + ); + + float occupancy = (max_active_blocks * num_threads / num_threads_per_warp) / + (float)(max_threads_per_sm / num_threads_per_warp); + + + // check if we have enough ressources to call kernel with given number + // of blocks and threads (can only occur for the else case above as for the + // first max. occupancy) + struct cudaFuncAttributes funcAttrib; + CUDA_SAFE_CALL( + cudaFuncGetAttributes(&funcAttrib, _run_kernel_synapses_2_pre_push_spikes) + ); + if (num_threads > funcAttrib.maxThreadsPerBlock) + { + // use the max num_threads before launch failure + num_threads = funcAttrib.maxThreadsPerBlock; + printf("WARNING Not enough ressources available to call " + "_run_kernel_synapses_2_pre_push_spikes " + "with maximum possible threads per block (%u). " + "Reducing num_threads to %u. (Kernel needs %i " + "registers per block, %i bytes of " + "statically-allocated shared memory per block, %i " + "bytes of local memory per thread and a total of %i " + "bytes of user-allocated constant memory)\n", + max_threads_per_block, num_threads, funcAttrib.numRegs, + funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, + funcAttrib.constSizeBytes); + + // calculate theoretical occupancy for new num_threads + CUDA_SAFE_CALL( + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, + _run_kernel_synapses_2_pre_push_spikes, num_threads, needed_shared_memory) + ); + + occupancy = (max_active_blocks * num_threads / num_threads_per_warp) / + (float)(max_threads_per_sm / num_threads_per_warp); + } + + + else + { + printf("INFO _run_kernel_synapses_2_pre_push_spikes\n" + "\t%u blocks\n" + "\t%u threads\n" + "\t%i registers per block\n" + "\t%i bytes statically-allocated shared memory per block\n" + "\t%i bytes local memory per thread\n" + "\t%i bytes user-allocated constant memory\n" + "\t%.3f theoretical occupancy\n", + num_blocks, num_threads, funcAttrib.numRegs, + funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, + funcAttrib.constSizeBytes, occupancy); + } + first_run = false; + } + + + if (num_spiking_neurons > 0) + { + num_blocks = num_parallel_blocks * num_spiking_neurons; + + _run_kernel_synapses_2_pre_push_spikes<<>>( + num_parallel_blocks, + num_blocks, + num_threads, + dev_array_neurongroup_1__spikespace[current_idx_array_neurongroup_1__spikespace]); + + CUDA_CHECK_ERROR("_run_kernel_synapses_2_pre_push_spikes"); + } + } // end else if (synapses_2_pre_max_size > 0) + +} + + diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_2_pre_push_spikes.h b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_2_pre_push_spikes.h new file mode 100644 index 00000000..52d331c4 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_2_pre_push_spikes.h @@ -0,0 +1,7 @@ +#ifndef _INCLUDED_synapses_2_pre_push_spikes +#define _INCLUDED_synapses_2_pre_push_spikes + +void _run_synapses_2_pre_push_spikes(); + + +#endif diff --git 
a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_2_synapses_create_generator_codeobject.cu b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_2_synapses_create_generator_codeobject.cu new file mode 100644 index 00000000..fcded747 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_2_synapses_create_generator_codeobject.cu @@ -0,0 +1,486 @@ +#include "code_objects/synapses_2_synapses_create_generator_codeobject.h" +#include "objects.h" +#include "brianlib/common_math.h" +#include "brianlib/cuda_utils.h" +#include "brianlib/stdint_compat.h" +#include +#include +#include +#include + +#include +#include "synapses_classes.h" + +#include +#include +#include +#include "brianlib/cuda_utils.h" +#include + + +////// SUPPORT CODE /////// +namespace { + double _host_rand(const int _vectorisation_idx); + double _host_randn(const int _vectorisation_idx); + int32_t _host_poisson(double _lambda, const int _vectorisation_idx); + + ///// block extra_device_helper ///// + + ///// support_code_lines ///// + + #define _rand(vectorisation_idx) (_ptr_array_synapses_2_synapses_create_generator_codeobject_rand[vectorisation_idx]) + template < typename T1, typename T2 > struct _higher_type; + template < > struct _higher_type { typedef int type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < typename T1, typename T2 > + __host__ __device__ static inline typename _higher_type::type + _brian_mod(T1 x, T2 y) + {{ + return x-y*floor(1.0*x/y); + }} + template < typename T1, typename T2 > + __host__ __device__ static inline typename _higher_type::type + _brian_floordiv(T1 x, T2 y) + {{ + return floor(1.0*x/y); + }} + #ifdef _MSC_VER + #define _brian_pow(x, y) (pow((double)(x), (y))) + #else + #define _brian_pow(x, y) (pow((x), (y))) + #endif + inline __device__ int _brian_atomicAdd(int* address, int val) + { + // hardware implementation + return atomicAdd(address, val); + } + inline __device__ float _brian_atomicAdd(float* address, float val) + { + // hardware 
implementation + return atomicAdd(address, val); + } + inline __device__ double _brian_atomicAdd(double* address, double val) + { + #if (__CUDA_ARCH__ >= 600) + // hardware implementation + return atomicAdd(address, val); + #else + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val + + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + #endif + } + inline __device__ int _brian_atomicMul(int* address, int val) + { + // software implementation + int old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, val * assumed); + } while (assumed != old); + return old; + } + inline __device__ float _brian_atomicMul(float* address, float val) + { + // software implementation + int* address_as_int = (int*)address; + int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __float_as_int(val * + __int_as_float(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __int_as_float(old); + } + inline __device__ double _brian_atomicMul(double* address, double val) + { + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val * + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + } + inline __device__ int _brian_atomicDiv(int* address, int val) + { + // software implementation + int old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, val / assumed); + } while (assumed != old); + return old; + } + inline __device__ float _brian_atomicDiv(float* address, float val) + { + // software implementation + int* address_as_int = (int*)address; + int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __float_as_int(val / + __int_as_float(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __int_as_float(old); + } + inline __device__ double _brian_atomicDiv(double* address, double val) + { + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val / + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + } + + +// NOTE: _ptr_array_synapses_2_synapses_create_generator_codeobject_rand is NOT an array +// but an instance of CurandBuffer, which overloads the operator[], which then just +// returns the next random number in the buffer, ignoring the argument passed to operator[] +// NOTE: Put buffers into anonymous namespace such that _host_rand/n and rand/n +// in main code have access to it. 
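+// NOTE: Both buffers are released again via free_memory() at the end of
+// _run_synapses_2_synapses_create_generator_codeobject().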
+// NOTE: _host_rand/n is used in the host compiled implementation of binomial +// functions. Here, it just returns the next element from the CurandBuffer. +CurandBuffer _ptr_array_synapses_2_synapses_create_generator_codeobject_rand(&brian::curand_generator, RAND); +randomNumber_t _host_rand(const int _vectorisation_idx) +{ + return _ptr_array_synapses_2_synapses_create_generator_codeobject_rand[_vectorisation_idx]; +} + +CurandBuffer _ptr_array_synapses_2_synapses_create_generator_codeobject_randn(&brian::curand_generator, RANDN); +randomNumber_t _host_randn(const int _vectorisation_idx) +{ + return _ptr_array_synapses_2_synapses_create_generator_codeobject_randn[_vectorisation_idx]; +} + +// This is the C++ Standalone implementation of the poisson function, which we use +double _loggam(double x) { + double x0, x2, xp, gl, gl0; + int32_t k, n; + + static double a[10] = {8.333333333333333e-02, -2.777777777777778e-03, + 7.936507936507937e-04, -5.952380952380952e-04, + 8.417508417508418e-04, -1.917526917526918e-03, + 6.410256410256410e-03, -2.955065359477124e-02, + 1.796443723688307e-01, -1.39243221690590e+00}; + x0 = x; + n = 0; + if ((x == 1.0) || (x == 2.0)) + return 0.0; + else if (x <= 7.0) { + n = (int32_t)(7 - x); + x0 = x + n; + } + x2 = 1.0 / (x0 * x0); + xp = 2 * M_PI; + gl0 = a[9]; + for (k=8; k>=0; k--) { + gl0 *= x2; + gl0 += a[k]; + } + gl = gl0 / x0 + 0.5 * log(xp) + (x0 - 0.5) * log(x0) - x0; + if (x <= 7.0) { + for (k=1; k<=n; k++) { + gl -= log(x0 - 1.0); + x0 -= 1.0; + } + } + return gl; +} + +int32_t _poisson_mult(double lam, int _vectorisation_idx) { + int32_t X; + double prod, U, enlam; + + enlam = exp(-lam); + X = 0; + prod = 1.0; + while (1) { + U = _rand(_vectorisation_idx); + prod *= U; + if (prod > enlam) + X += 1; + else + return X; + } +} + +int32_t _poisson_ptrs(double lam, int _vectorisation_idx) { + int32_t k; + double U, V, slam, loglam, a, b, invalpha, vr, us; + + slam = sqrt(lam); + loglam = log(lam); + b = 0.931 + 2.53 * slam; + a = -0.059 + 0.02483 * b; + invalpha = 1.1239 + 1.1328 / (b - 3.4); + vr = 0.9277 - 3.6224 / (b - 2); + + while (1) { + U = _rand(_vectorisation_idx) - 0.5; + V = _rand(_vectorisation_idx); + us = 0.5 - abs(U); + k = (int32_t)floor((2 * a / us + b) * U + lam + 0.43); + if ((us >= 0.07) && (V <= vr)) + return k; + if ((k < 0) || ((us < 0.013) && (V > us))) + continue; + if ((log(V) + log(invalpha) - log(a / (us * us) + b)) <= + (-lam + k * loglam - _loggam(k + 1))) + return k; + } +} + +int32_t _host_poisson(double lam, int32_t _idx) { + if (lam >= 10) + return _poisson_ptrs(lam, _idx); + else if (lam == 0) + return 0; + else + return _poisson_mult(lam, _idx); +} +} + +////// hashdefine_lines /////// + + + + + +void _run_synapses_2_synapses_create_generator_codeobject() +{ + using namespace brian; + +std::clock_t start_timer = std::clock(); + +CUDA_CHECK_MEMORY(); +size_t used_device_memory_start = used_device_memory; + + + ///// HOST_CONSTANTS /////////// + const int _numN = 1; + int32_t* const _array_synapses_2_N_incoming = thrust::raw_pointer_cast(&_dynamic_array_synapses_2_N_incoming[0]); + const int _numN_incoming = _dynamic_array_synapses_2_N_incoming.size(); + int32_t* const _array_synapses_2_N_outgoing = thrust::raw_pointer_cast(&_dynamic_array_synapses_2_N_outgoing[0]); + const int _numN_outgoing = _dynamic_array_synapses_2_N_outgoing.size(); + int32_t* const _array_synapses_2__synaptic_post = thrust::raw_pointer_cast(&_dynamic_array_synapses_2__synaptic_post[0]); + const int _num_synaptic_post = 
_dynamic_array_synapses_2__synaptic_post.size(); + int32_t* const _array_synapses_2__synaptic_pre = thrust::raw_pointer_cast(&_dynamic_array_synapses_2__synaptic_pre[0]); + const int _num_synaptic_pre = _dynamic_array_synapses_2__synaptic_pre.size(); + + + ///// pointers_lines ///// + + int32_t* __restrict _ptr_array_synapses_2_N_outgoing = _array_synapses_2_N_outgoing; + int32_t* __restrict _ptr_array_synapses_2_N_incoming = _array_synapses_2_N_incoming; + int32_t* _ptr_array_synapses_2_N = _array_synapses_2_N; + int32_t* __restrict _ptr_array_synapses_2__synaptic_pre = _array_synapses_2__synaptic_pre; + int32_t* __restrict _ptr_array_synapses_2__synaptic_post = _array_synapses_2__synaptic_post; + + + const int _N_pre = 100; + const int _N_post = 100; + _dynamic_array_synapses_2_N_incoming.resize(_N_post + 0); + _dynamic_array_synapses_2_N_outgoing.resize(_N_pre + 0); + + int _raw_pre_idx, _raw_post_idx; + const int _vectorisation_idx = -1; + ///// scalar_code['setup_iterator'] ///// + + + ///// scalar_code['create_j'] ///// + + + ///// scalar_code['create_cond'] ///// + + + ///// scalar_code['update_post'] ///// + + + + for(int _i = 0; _i < _N_pre; _i++) + { + + bool __cond, _cond; + _raw_pre_idx = _i + 0; + { + ///// vector_code['create_cond'] ///// + + const char _cond = true; + + __cond = _cond; + } + _cond = __cond; + if(!_cond) continue; + // Some explanation of this hackery. The problem is that we have multiple code blocks. + // Each code block is generated independently of the others, and they declare variables + // at the beginning if necessary (including declaring them as const if their values don't + // change). However, if two code blocks follow each other in the same C++ scope then + // that causes a redeclaration error. So we solve it by putting each block inside a + // pair of braces to create a new scope specific to each code block. However, that brings + // up another problem: we need the values from these code blocks. I don't have a general + // solution to this problem, but in the case of this particular template, we know which + // values we need from them so we simply create outer scoped variables to copy the value + // into. Later on we have a slightly more complicated problem because the original name + // _j has to be used, so we create two variables __j, _j at the outer scope, copy + // _j to __j in the inner scope (using the inner scope version of _j), and then + // __j to _j in the outer scope (to the outer scope version of _j). This outer scope + // version of _j will then be used in subsequent blocks. 
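+    // In short: each braced block stores its result in a double-underscored temporary
+    // (e.g. __j, __pre_idx), which is then copied back to the outer-scope variable
+    // (_j, _pre_idx) so that later blocks can use it.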
+ long _uiter_low; + long _uiter_high; + long _uiter_step; + { + ///// vector_code['setup_iterator'] ///// + + const int32_t _iter_low = 0; + const int32_t _iter_high = 100; + const int32_t _iter_step = 1; + + _uiter_low = _iter_low; + _uiter_high = _iter_high; + _uiter_step = _iter_step; + } + for(int _k=_uiter_low; _k<_uiter_high; _k+=_uiter_step) + { + long __j, _j, _pre_idx, __pre_idx; + { + ///// vector_code['create_j'] ///// + + const int32_t _pre_idx = _raw_pre_idx; + const int32_t _j = _k; + + __j = _j; // pick up the locally scoped _j and store in __j + __pre_idx = _pre_idx; + } + _j = __j; // make the previously locally scoped _j available + _pre_idx = __pre_idx; + _raw_post_idx = _j + 0; + + + if(_j<0 || _j>=_N_post) + { + cout << "Error: tried to create synapse to neuron j=" << _j << " outside range 0 to " << + _N_post-1 << endl; + exit(1); + } + + ///// vector_code['update_post'] ///// + + const int32_t _post_idx = _raw_post_idx; + const int32_t _n = 1; + + + for (int _repetition=0; _repetition<_n; _repetition++) { + _dynamic_array_synapses_2_N_outgoing[_pre_idx] += 1; + _dynamic_array_synapses_2_N_incoming[_post_idx] += 1; + _dynamic_array_synapses_2__synaptic_pre.push_back(_pre_idx); + _dynamic_array_synapses_2__synaptic_post.push_back(_post_idx); + } + } + } + + // now we need to resize all registered variables + const int32_t newsize = _dynamic_array_synapses_2__synaptic_pre.size(); + THRUST_CHECK_ERROR( + dev_dynamic_array_synapses_2__synaptic_post.resize(newsize) + ); + _dynamic_array_synapses_2__synaptic_post.resize(newsize); + THRUST_CHECK_ERROR( + dev_dynamic_array_synapses_2__synaptic_pre.resize(newsize) + ); + _dynamic_array_synapses_2__synaptic_pre.resize(newsize); + + // update the total number of synapses + _ptr_array_synapses_2_N[0] = newsize; + + // Check for occurrence of multiple source-target pairs in synapses ("synapse number") + std::map, int32_t> source_target_count; + for (int _i=0; _i source_target = std::pair(_dynamic_array_synapses_2__synaptic_pre[_i], _dynamic_array_synapses_2__synaptic_post[_i]); + source_target_count[source_target]++; + //printf("source target count = %i\n", source_target_count[source_target]); + if (source_target_count[source_target] > 1) + { + synapses_2_multiple_pre_post = true; + break; + } + } + + // copy changed host data to device + dev_dynamic_array_synapses_2_N_incoming = _dynamic_array_synapses_2_N_incoming; + dev_dynamic_array_synapses_2_N_outgoing = _dynamic_array_synapses_2_N_outgoing; + dev_dynamic_array_synapses_2__synaptic_pre = _dynamic_array_synapses_2__synaptic_pre; + dev_dynamic_array_synapses_2__synaptic_post = _dynamic_array_synapses_2__synaptic_post; + CUDA_SAFE_CALL( + cudaMemcpyAsync(dev_array_synapses_2_N, + _array_synapses_2_N, + sizeof(int32_t), + cudaMemcpyHostToDevice, stream2) + ); + + + + +// free memory in CurandBuffers +_ptr_array_synapses_2_synapses_create_generator_codeobject_rand.free_memory(); +_ptr_array_synapses_2_synapses_create_generator_codeobject_randn.free_memory(); + +CUDA_CHECK_MEMORY(); +const double to_MB = 1.0 / (1024.0 * 1024.0); +double tot_memory_MB = (used_device_memory - used_device_memory_start) * to_MB; +double time_passed = (double)(std::clock() - start_timer) / CLOCKS_PER_SEC; +std::cout << "INFO: synapses_2 creation took " << time_passed << "s"; +if (tot_memory_MB > 0) + std::cout << " and used " << tot_memory_MB << "MB of memory."; +std::cout << std::endl; +} + + diff --git 
a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_2_synapses_create_generator_codeobject.h b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_2_synapses_create_generator_codeobject.h new file mode 100644 index 00000000..86b7880a --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_2_synapses_create_generator_codeobject.h @@ -0,0 +1,7 @@ +#ifndef _INCLUDED_synapses_2_synapses_create_generator_codeobject +#define _INCLUDED_synapses_2_synapses_create_generator_codeobject + +void _run_synapses_2_synapses_create_generator_codeobject(); + + +#endif diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_group_variable_set_conditional_codeobject.cu b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_group_variable_set_conditional_codeobject.cu new file mode 100644 index 00000000..fb25b1e3 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_group_variable_set_conditional_codeobject.cu @@ -0,0 +1,397 @@ +#include "code_objects/synapses_group_variable_set_conditional_codeobject.h" +#include "objects.h" +#include "brianlib/common_math.h" +#include "brianlib/cuda_utils.h" +#include "brianlib/stdint_compat.h" +#include +#include +#include +#include + + + +////// SUPPORT CODE /////// +namespace { + double _host_rand(const int _vectorisation_idx); + double _host_randn(const int _vectorisation_idx); + int32_t _host_poisson(double _lambda, const int _vectorisation_idx); + + ///// block extra_device_helper ///// + + ///// support_code_lines ///// + + #define _randn(vectorisation_idx) (_ptr_array_synapses_group_variable_set_conditional_codeobject_randn[vectorisation_idx]) + template < typename T1, typename T2 > struct _higher_type; + template < > struct _higher_type { typedef int type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < typename T1, typename T2 > + __host__ __device__ static inline 
typename _higher_type::type + _brian_mod(T1 x, T2 y) + {{ + return x-y*floor(1.0*x/y); + }} + template < typename T1, typename T2 > + __host__ __device__ static inline typename _higher_type::type + _brian_floordiv(T1 x, T2 y) + {{ + return floor(1.0*x/y); + }} + #ifdef _MSC_VER + #define _brian_pow(x, y) (pow((double)(x), (y))) + #else + #define _brian_pow(x, y) (pow((x), (y))) + #endif + inline __device__ int _brian_atomicAdd(int* address, int val) + { + // hardware implementation + return atomicAdd(address, val); + } + inline __device__ float _brian_atomicAdd(float* address, float val) + { + // hardware implementation + return atomicAdd(address, val); + } + inline __device__ double _brian_atomicAdd(double* address, double val) + { + #if (__CUDA_ARCH__ >= 600) + // hardware implementation + return atomicAdd(address, val); + #else + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val + + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + #endif + } + inline __device__ int _brian_atomicMul(int* address, int val) + { + // software implementation + int old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, val * assumed); + } while (assumed != old); + return old; + } + inline __device__ float _brian_atomicMul(float* address, float val) + { + // software implementation + int* address_as_int = (int*)address; + int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __float_as_int(val * + __int_as_float(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __int_as_float(old); + } + inline __device__ double _brian_atomicMul(double* address, double val) + { + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val * + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + } + inline __device__ int _brian_atomicDiv(int* address, int val) + { + // software implementation + int old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, val / assumed); + } while (assumed != old); + return old; + } + inline __device__ float _brian_atomicDiv(float* address, float val) + { + // software implementation + int* address_as_int = (int*)address; + int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __float_as_int(val / + __int_as_float(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __int_as_float(old); + } + inline __device__ double _brian_atomicDiv(double* address, double val) + { + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val / + 
__longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + } + + + // Implement dummy functions such that the host compiled code of binomial + // functions works. Hacky, hacky ... + double _host_rand(const int _vectorisation_idx) + { + printf("ERROR: Called dummy function `_host_rand` in %s:%d\n", __FILE__, + __LINE__); + exit(EXIT_FAILURE); + } + double _host_randn(const int _vectorisation_idx) + { + printf("ERROR: Called dummy function `_host_rand` in %s:%d\n", __FILE__, + __LINE__); + exit(EXIT_FAILURE); + } + int32_t _host_poisson(double _lambda, const int _vectorisation_idx) + { + printf("ERROR: Called dummy function `_host_poisson` in %s:%d\n", __FILE__, + __LINE__); + exit(EXIT_FAILURE); + } +} + +////// hashdefine_lines /////// + + + +__global__ void +_run_kernel_synapses_group_variable_set_conditional_codeobject( + int _N, + int THREADS_PER_BLOCK, + ///// KERNEL_PARAMETERS ///// + int32_t* _ptr_array_synapses_N, + double* _ptr_array_synapses_group_variable_set_conditional_codeobject_randn, + double* _ptr_array_synapses_weight, + const int _numweight + ) +{ + using namespace brian; + + int tid = threadIdx.x; + int bid = blockIdx.x; + int _idx = bid * THREADS_PER_BLOCK + tid; + int _vectorisation_idx = _idx; + + ///// KERNEL_CONSTANTS ///// + const int _numN = 1; + + ///// kernel_lines ///// + + + + assert(THREADS_PER_BLOCK == blockDim.x); + + + if(_idx >= _N) + { + return; + } + + ///// block kernel_maincode ///// + + ///// scalar_code['condition'] ///// + + + + ///// scalar_code['statement'] ///// + + const double _lio_statement_1 = 10.0 * 1e-09; + const double _lio_statement_2 = 1.25 * 1e-09; + + + ///// vector_code['condition'] ///// + + const char _cond = true; + + + if (_cond) + { + ///// vector_code['statement'] ///// + + double weight; + weight = _lio_statement_1 + (_lio_statement_2 * _randn(_vectorisation_idx + 0 * _N)); + _ptr_array_synapses_weight[_idx] = weight; + + } + + ///// endblock kernel_maincode ///// +} + + +void _run_synapses_group_variable_set_conditional_codeobject() +{ + using namespace brian; + + + const int _N = _array_synapses_N[0]; + + ///// HOST_CONSTANTS /////////// + const int _numN = 1; + double* const dev_array_synapses_weight = thrust::raw_pointer_cast(&dev_dynamic_array_synapses_weight[0]); + const int _numweight = dev_dynamic_array_synapses_weight.size(); + + // Genenerate an array of random numbers on the device + // Make sure we generate an even number of random numbers + int32_t _randn_N = (_N % 2 == 0) ? 
_N : _N + 1; + double* dev_array_randn; + CUDA_SAFE_CALL( + cudaMalloc( + (void**)&dev_array_randn, + sizeof(double)*_randn_N*1 + ) + ); + CUDA_SAFE_CALL( + curandGenerateNormalDouble( + curand_generator, + dev_array_randn, + _randn_N*1, + 0, // mean + 1 // stddev + ) + ); + + + + static int num_threads, num_blocks; + static size_t needed_shared_memory = 0; + static bool first_run = true; + if (first_run) + { + // get number of blocks and threads + int min_num_threads; // The minimum grid size needed to achieve the + // maximum occupancy for a full device launch + + CUDA_SAFE_CALL( + cudaOccupancyMaxPotentialBlockSize(&min_num_threads, &num_threads, + _run_kernel_synapses_group_variable_set_conditional_codeobject, 0, 0) // last args: dynamicSMemSize, blockSizeLimit + ); + + // Round up according to array size + num_blocks = (_N + num_threads - 1) / num_threads; + + + + + + // calculate theoretical occupancy + int max_active_blocks; + CUDA_SAFE_CALL( + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, + _run_kernel_synapses_group_variable_set_conditional_codeobject, num_threads, needed_shared_memory) + ); + + float occupancy = (max_active_blocks * num_threads / num_threads_per_warp) / + (float)(max_threads_per_sm / num_threads_per_warp); + + + // check if we have enough ressources to call kernel with given number + // of blocks and threads (can only occur for the else case above as for the + // first max. occupancy) + struct cudaFuncAttributes funcAttrib; + CUDA_SAFE_CALL( + cudaFuncGetAttributes(&funcAttrib, _run_kernel_synapses_group_variable_set_conditional_codeobject) + ); + if (num_threads > funcAttrib.maxThreadsPerBlock) + { + // use the max num_threads before launch failure + num_threads = funcAttrib.maxThreadsPerBlock; + printf("WARNING Not enough ressources available to call " + "_run_kernel_synapses_group_variable_set_conditional_codeobject " + "with maximum possible threads per block (%u). " + "Reducing num_threads to %u. 
(Kernel needs %i " + "registers per block, %i bytes of " + "statically-allocated shared memory per block, %i " + "bytes of local memory per thread and a total of %i " + "bytes of user-allocated constant memory)\n", + max_threads_per_block, num_threads, funcAttrib.numRegs, + funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, + funcAttrib.constSizeBytes); + + // calculate theoretical occupancy for new num_threads + CUDA_SAFE_CALL( + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, + _run_kernel_synapses_group_variable_set_conditional_codeobject, num_threads, needed_shared_memory) + ); + + occupancy = (max_active_blocks * num_threads / num_threads_per_warp) / + (float)(max_threads_per_sm / num_threads_per_warp); + } + + + else + { + printf("INFO _run_kernel_synapses_group_variable_set_conditional_codeobject\n" + "\t%u blocks\n" + "\t%u threads\n" + "\t%i registers per block\n" + "\t%i bytes statically-allocated shared memory per block\n" + "\t%i bytes local memory per thread\n" + "\t%i bytes user-allocated constant memory\n" + "\t%.3f theoretical occupancy\n", + num_blocks, num_threads, funcAttrib.numRegs, + funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, + funcAttrib.constSizeBytes, occupancy); + } + first_run = false; + } + + + _run_kernel_synapses_group_variable_set_conditional_codeobject<<>>( + _N, + num_threads, + ///// HOST_PARAMETERS ///// + dev_array_synapses_N, + dev_array_randn, + dev_array_synapses_weight, + _numweight + ); + + CUDA_CHECK_ERROR("_run_kernel_synapses_group_variable_set_conditional_codeobject"); + + +} + + diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_group_variable_set_conditional_codeobject.h b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_group_variable_set_conditional_codeobject.h new file mode 100644 index 00000000..d59aae0b --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_group_variable_set_conditional_codeobject.h @@ -0,0 +1,7 @@ +#ifndef _INCLUDED_synapses_group_variable_set_conditional_codeobject +#define _INCLUDED_synapses_group_variable_set_conditional_codeobject + +void _run_synapses_group_variable_set_conditional_codeobject(); + + +#endif diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_pre_codeobject.cu b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_pre_codeobject.cu new file mode 100644 index 00000000..91e9721d --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_pre_codeobject.cu @@ -0,0 +1,495 @@ +#include "code_objects/synapses_pre_codeobject.h" +#include "objects.h" +#include "brianlib/common_math.h" +#include "brianlib/cuda_utils.h" +#include "brianlib/stdint_compat.h" +#include +#include +#include +#include + +#include +#include "synapses_classes.h" + + +////// SUPPORT CODE /////// +namespace { + double _host_rand(const int _vectorisation_idx); + double _host_randn(const int _vectorisation_idx); + int32_t _host_poisson(double _lambda, const int _vectorisation_idx); + + ///// block extra_device_helper ///// + + ///// support_code_lines ///// + + template < typename T1, typename T2 > struct _higher_type; + template < > struct _higher_type { typedef int type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + 
template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < typename T1, typename T2 > + __host__ __device__ static inline typename _higher_type::type + _brian_mod(T1 x, T2 y) + {{ + return x-y*floor(1.0*x/y); + }} + template < typename T1, typename T2 > + __host__ __device__ static inline typename _higher_type::type + _brian_floordiv(T1 x, T2 y) + {{ + return floor(1.0*x/y); + }} + #ifdef _MSC_VER + #define _brian_pow(x, y) (pow((double)(x), (y))) + #else + #define _brian_pow(x, y) (pow((x), (y))) + #endif + inline __device__ int _brian_atomicAdd(int* address, int val) + { + // hardware implementation + return atomicAdd(address, val); + } + inline __device__ float _brian_atomicAdd(float* address, float val) + { + // hardware implementation + return atomicAdd(address, val); + } + inline __device__ double _brian_atomicAdd(double* address, double val) + { + #if (__CUDA_ARCH__ >= 600) + // hardware implementation + return atomicAdd(address, val); + #else + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val + + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + #endif + } + inline __device__ int _brian_atomicMul(int* address, int val) + { + // software implementation + int old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, val * assumed); + } while (assumed != old); + return old; + } + inline __device__ float _brian_atomicMul(float* address, float val) + { + // software implementation + int* address_as_int = (int*)address; + int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __float_as_int(val * + __int_as_float(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __int_as_float(old); + } + inline __device__ double _brian_atomicMul(double* address, double val) + { + // software implementation + unsigned long long int* address_as_int 
= (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val * + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + } + inline __device__ int _brian_atomicDiv(int* address, int val) + { + // software implementation + int old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, val / assumed); + } while (assumed != old); + return old; + } + inline __device__ float _brian_atomicDiv(float* address, float val) + { + // software implementation + int* address_as_int = (int*)address; + int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __float_as_int(val / + __int_as_float(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __int_as_float(old); + } + inline __device__ double _brian_atomicDiv(double* address, double val) + { + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val / + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + } + + + // Implement dummy functions such that the host compiled code of binomial + // functions works. Hacky, hacky ... + double _host_rand(const int _vectorisation_idx) + { + printf("ERROR: Called dummy function `_host_rand` in %s:%d\n", __FILE__, + __LINE__); + exit(EXIT_FAILURE); + } + double _host_randn(const int _vectorisation_idx) + { + printf("ERROR: Called dummy function `_host_rand` in %s:%d\n", __FILE__, + __LINE__); + exit(EXIT_FAILURE); + } + int32_t _host_poisson(double _lambda, const int _vectorisation_idx) + { + printf("ERROR: Called dummy function `_host_poisson` in %s:%d\n", __FILE__, + __LINE__); + exit(EXIT_FAILURE); + } +} + +////// hashdefine_lines /////// + + + + +__global__ void +_run_kernel_synapses_pre_codeobject( + int _N, + int bid_offset, + int timestep, + int THREADS_PER_BLOCK, + int threads_per_bundle, + int32_t* eventspace, + int num_spiking_neurons, + ///// KERNEL_PARAMETERS ///// + int32_t* _ptr_array_synapses_N, + int32_t* _ptr_array_synapses__synaptic_post, + const int _num_postsynaptic_idx, + int32_t* _ptr_array_synapses__synaptic_pre, + const int _num_synaptic_pre, + double* _ptr_array_neurongroup_g_PN_iKC, + double* _ptr_array_synapses_weight, + const int _numweight + ) +{ + using namespace brian; + + assert(THREADS_PER_BLOCK == blockDim.x); + + int tid = threadIdx.x; + int bid = blockIdx.x + bid_offset; + //TODO: do we need _idx here? if no, get also rid of scoping after scalar code + // scalar_code can depend on _idx (e.g. if the state update depends on a + // subexpression that is the same for all synapses, ?) 
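+    // Illustrative note (not generated by the template): the launch is one-dimensional,
+    // so each thread derives a flat work-item index as blockIdx.x * blockDim.x + threadIdx.x,
+    // with bid already shifted by bid_offset when the host loops over several launches.
+    // _vectorisation_idx mirrors this index so the generated vector code can address
+    // per-item buffers consistently.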
+ int _idx = bid * THREADS_PER_BLOCK + tid; + int _vectorisation_idx = _idx; + + ///// KERNEL_CONSTANTS ///// + const int _numN = 1; + const int _numg_PN_iKC = 2500; + + ///// kernel_lines ///// + + + + + ///// scalar_code ///// + + + + { // _idx is defined in outer and inner scope (for `scalar_code`) + if (synapses_pre.no_or_const_delay_mode) + { + // TODO: pass as kernel parameter instead? + int num_parallel_blocks = synapses_pre.queue->num_blocks; + int32_t spikes_start = synapses_pre.spikes_start; + int32_t spikes_stop = synapses_pre.spikes_stop; + + // for the first delay timesteps the eventspace is not yet filled + // note that num_queues is the number of eventspaces, num_queues-1 the delay in timesteps + if (timestep >= synapses_pre.queue->num_queues - 1) + { + // `spiking_neuron_idx` runs through the eventspace + // `post_block_idx` runs through the post neuron blocks of the connectivity matrix + int spiking_neuron_idx = bid / num_parallel_blocks; + int post_block_idx = bid % num_parallel_blocks; + { + + // spiking_neuron is index in NeuronGroup + int32_t spiking_neuron = eventspace[spiking_neuron_idx]; + + assert(spiking_neuron != -1); + + // apply effects if event neuron is in sources of current SynapticPathway + if(spikes_start <= spiking_neuron && spiking_neuron < spikes_stop) + { + int pre_post_block_id = (spiking_neuron - spikes_start) * num_parallel_blocks + post_block_idx; + int num_synapses = synapses_pre_num_synapses_by_pre[pre_post_block_id]; + int32_t* propagating_synapses = synapses_pre_synapse_ids_by_pre[pre_post_block_id]; + for(int j = tid; j < num_synapses; j+=THREADS_PER_BLOCK) + { + // _idx is the synapse id + int32_t _idx = propagating_synapses[j]; + _vectorisation_idx = j; + + ///// vector_code ///// + + // Abstract code: g_PN_iKC += 0.675 * weight + const int32_t _postsynaptic_idx = _ptr_array_synapses__synaptic_post[_idx]; + const double weight = _ptr_array_synapses_weight[_idx]; + _brian_atomicAdd(&_ptr_array_neurongroup_g_PN_iKC[_postsynaptic_idx], (double)(0.675 * weight)); + + } + } + + __syncthreads(); + } + } + } + else // heterogeneous delay mode + { + cudaVector* synapses_queue; + synapses_pre.queue->peek(&synapses_queue); + + int queue_size = synapses_queue[bid].size(); + + // use a fixed number of threads per bundle, i runs through all those threads of all bundles + // for threads_per_bundle == 1, we have one thread per bundle (parallel) + for (int i = tid; i < queue_size*threads_per_bundle; i+=THREADS_PER_BLOCK) + { + // bundle_idx runs through all bundles + int bundle_idx = i / threads_per_bundle; + // syn_in_bundle_idx runs through all threads in a single bundle + int syn_in_bundle_idx = i % threads_per_bundle; + + int bundle_id = synapses_queue[bid].at(bundle_idx); + int bundle_size = synapses_pre_num_synapses_by_bundle[bundle_id]; + int synapses_offset = synapses_pre_synapses_offset_by_bundle[bundle_id]; + int32_t* synapse_ids = synapses_pre_synapse_ids; + int32_t* synapse_bundle = synapse_ids + synapses_offset; + + // loop through synapses of this bundle with all available threads_per_bundle + // if threads_per_bundle == 1, this is serial + for (int j = syn_in_bundle_idx; j < bundle_size; j+=threads_per_bundle) + { + int32_t _idx = synapse_bundle[j]; + + + ///// vector_code ///// + + // Abstract code: g_PN_iKC += 0.675 * weight + const int32_t _postsynaptic_idx = _ptr_array_synapses__synaptic_post[_idx]; + const double weight = _ptr_array_synapses_weight[_idx]; + _brian_atomicAdd(&_ptr_array_neurongroup_g_PN_iKC[_postsynaptic_idx], 
(double)(0.675 * weight)); + + } + } + } + } + } + + + +void _run_synapses_pre_codeobject() +{ + using namespace brian; + + + const int _N = _array_synapses_N[0]; + + ///// HOST_CONSTANTS /////////// + const int _numN = 1; + int32_t* const dev_array_synapses__synaptic_post = thrust::raw_pointer_cast(&dev_dynamic_array_synapses__synaptic_post[0]); + const int _num_postsynaptic_idx = dev_dynamic_array_synapses__synaptic_post.size(); + int32_t* const dev_array_synapses__synaptic_pre = thrust::raw_pointer_cast(&dev_dynamic_array_synapses__synaptic_pre[0]); + const int _num_synaptic_pre = dev_dynamic_array_synapses__synaptic_pre.size(); + const int _numg_PN_iKC = 2500; + double* const dev_array_synapses_weight = thrust::raw_pointer_cast(&dev_dynamic_array_synapses_weight[0]); + const int _numweight = dev_dynamic_array_synapses_weight.size(); + +static int num_threads_per_bundle; +static int num_loops; + + static int num_threads, num_blocks; + static size_t needed_shared_memory = 0; + static bool first_run = true; + if (first_run) + { +// We are using atomics, we can fully parallelise. +num_blocks = num_parallel_blocks; +num_threads = max_threads_per_block; +// TODO: effect of mean instead of max? +num_threads_per_bundle = synapses_pre_max_bundle_size; +num_loops = 1; + + + + // calculate theoretical occupancy + int max_active_blocks; + CUDA_SAFE_CALL( + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, + _run_kernel_synapses_pre_codeobject, num_threads, needed_shared_memory) + ); + + float occupancy = (max_active_blocks * num_threads / num_threads_per_warp) / + (float)(max_threads_per_sm / num_threads_per_warp); + + + // check if we have enough ressources to call kernel with given number + // of blocks and threads (can only occur for the else case above as for the + // first max. occupancy) + struct cudaFuncAttributes funcAttrib; + CUDA_SAFE_CALL( + cudaFuncGetAttributes(&funcAttrib, _run_kernel_synapses_pre_codeobject) + ); + if (num_threads > funcAttrib.maxThreadsPerBlock) + { + // use the max num_threads before launch failure + num_threads = funcAttrib.maxThreadsPerBlock; + printf("WARNING Not enough ressources available to call " + "_run_kernel_synapses_pre_codeobject " + "with maximum possible threads per block (%u). " + "Reducing num_threads to %u. (Kernel needs %i " + "registers per block, %i bytes of " + "statically-allocated shared memory per block, %i " + "bytes of local memory per thread and a total of %i " + "bytes of user-allocated constant memory)\n", + max_threads_per_block, num_threads, funcAttrib.numRegs, + funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, + funcAttrib.constSizeBytes); + + // calculate theoretical occupancy for new num_threads + CUDA_SAFE_CALL( + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, + _run_kernel_synapses_pre_codeobject, num_threads, needed_shared_memory) + ); + + occupancy = (max_active_blocks * num_threads / num_threads_per_warp) / + (float)(max_threads_per_sm / num_threads_per_warp); + } + +else if (synapses_pre_max_size <= 0) +{ + printf("INFO there are no synapses in the synapses_pre pathway. 
Skipping synapses_push and synapses kernels.\n"); +} + + else + { + printf("INFO _run_kernel_synapses_pre_codeobject\n" + "\t%u blocks\n" + "\t%u threads\n" + "\t%i registers per block\n" + "\t%i bytes statically-allocated shared memory per block\n" + "\t%i bytes local memory per thread\n" + "\t%i bytes user-allocated constant memory\n" + "\t%.3f theoretical occupancy\n", + num_blocks, num_threads, funcAttrib.numRegs, + funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, + funcAttrib.constSizeBytes, occupancy); + } + first_run = false; + } + + +// only call kernel if we have synapses (otherwise we skipped the push kernel) +if (synapses_pre_max_size > 0) +{ + int32_t num_spiking_neurons; + // we only need the number of spiking neurons if we parallelise effect + // application over spiking neurons in homogeneous delay mode + if (synapses_pre_scalar_delay) + { + if (defaultclock.timestep[0] >= synapses_pre_delay) + { + cudaMemcpyAsync(&num_spiking_neurons, + &dev_array_spikegeneratorgroup__spikespace[synapses_pre_eventspace_idx][_num__array_spikegeneratorgroup__spikespace - 1], + sizeof(int32_t), cudaMemcpyDeviceToHost,stream); + num_blocks = num_parallel_blocks * num_spiking_neurons; + //TODO collect info abt mean, std of num spiking neurons per time + //step and print INFO at end of simulation + } + } + // only call kernel if neurons spiked (else num_blocks is zero) + if (num_blocks != 0) { + for(int bid_offset = 0; bid_offset < num_loops; bid_offset++) + { + _run_kernel_synapses_pre_codeobject<<>>( + _N, + bid_offset, + defaultclock.timestep[0], + num_threads, + num_threads_per_bundle, + dev_array_spikegeneratorgroup__spikespace[synapses_pre_eventspace_idx], + num_spiking_neurons, + ///// HOST_PARAMETERS ///// + dev_array_synapses_N, + dev_array_synapses__synaptic_post, + _num_postsynaptic_idx, + dev_array_synapses__synaptic_pre, + _num_synaptic_pre, + dev_array_neurongroup_g_PN_iKC, + dev_array_synapses_weight, + _numweight + ); + } + } + + CUDA_CHECK_ERROR("_run_kernel_synapses_pre_codeobject"); +} + + +} + +void _debugmsg_synapses_pre_codeobject() +{ + using namespace brian; + std::cout << "Number of synapses: " << _array_synapses_N[0] << endl; +} + diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_pre_codeobject.h b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_pre_codeobject.h new file mode 100644 index 00000000..d99be85b --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_pre_codeobject.h @@ -0,0 +1,8 @@ +#ifndef _INCLUDED_synapses_pre_codeobject +#define _INCLUDED_synapses_pre_codeobject + +void _run_synapses_pre_codeobject(); + +void _debugmsg_synapses_pre_codeobject(); + +#endif diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_pre_push_spikes.cu b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_pre_push_spikes.cu new file mode 100644 index 00000000..d064eef5 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_pre_push_spikes.cu @@ -0,0 +1,374 @@ +#include "code_objects/synapses_pre_push_spikes.h" +#include "objects.h" +#include "brianlib/common_math.h" +#include "brianlib/cuda_utils.h" +#include "brianlib/stdint_compat.h" +#include +#include +#include +#include + + + +////// SUPPORT CODE /////// +namespace { + double _host_rand(const int 
_vectorisation_idx); + double _host_randn(const int _vectorisation_idx); + int32_t _host_poisson(double _lambda, const int _vectorisation_idx); + + ///// block extra_device_helper ///// +__global__ void _advance_kernel_synapses_pre_push_spikes() +{ + using namespace brian; + int tid = threadIdx.x; + synapses_pre.queue->advance( + tid); +} + + ///// support_code_lines ///// + + template < typename T1, typename T2 > struct _higher_type; + template < > struct _higher_type { typedef int type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < typename T1, typename T2 > + __host__ __device__ static inline typename _higher_type::type + _brian_mod(T1 x, T2 y) + {{ + return x-y*floor(1.0*x/y); + }} + template < typename T1, typename T2 > + __host__ __device__ static inline typename _higher_type::type + _brian_floordiv(T1 x, T2 y) + {{ + return floor(1.0*x/y); + }} + #ifdef _MSC_VER + #define _brian_pow(x, y) (pow((double)(x), (y))) + #else + #define _brian_pow(x, y) (pow((x), (y))) + #endif + inline __device__ int _brian_atomicAdd(int* address, int val) + { + // hardware implementation + return atomicAdd(address, val); + } + inline __device__ float _brian_atomicAdd(float* address, float val) + { + // hardware implementation + return atomicAdd(address, val); + } + inline __device__ double _brian_atomicAdd(double* address, double val) + { + #if (__CUDA_ARCH__ >= 600) + // hardware implementation + return atomicAdd(address, val); + #else + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val + + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + #endif + } + inline __device__ int _brian_atomicMul(int* address, int val) + { + // software implementation + int old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, val * assumed); + } while 
(assumed != old); + return old; + } + inline __device__ float _brian_atomicMul(float* address, float val) + { + // software implementation + int* address_as_int = (int*)address; + int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __float_as_int(val * + __int_as_float(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __int_as_float(old); + } + inline __device__ double _brian_atomicMul(double* address, double val) + { + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val * + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + } + inline __device__ int _brian_atomicDiv(int* address, int val) + { + // software implementation + int old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, val / assumed); + } while (assumed != old); + return old; + } + inline __device__ float _brian_atomicDiv(float* address, float val) + { + // software implementation + int* address_as_int = (int*)address; + int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __float_as_int(val / + __int_as_float(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __int_as_float(old); + } + inline __device__ double _brian_atomicDiv(double* address, double val) + { + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val / + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + } + + + // Implement dummy functions such that the host compiled code of binomial + // functions works. Hacky, hacky ... 
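+    // Illustrative note: these host-side stubs exist only so that host-compiled support
+    // code links; random numbers for device code are assumed to come from buffers filled
+    // ahead of time with cuRAND, so any call into one of these stubs aborts the run.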
+ double _host_rand(const int _vectorisation_idx) + { + printf("ERROR: Called dummy function `_host_rand` in %s:%d\n", __FILE__, + __LINE__); + exit(EXIT_FAILURE); + } + double _host_randn(const int _vectorisation_idx) + { + printf("ERROR: Called dummy function `_host_rand` in %s:%d\n", __FILE__, + __LINE__); + exit(EXIT_FAILURE); + } + int32_t _host_poisson(double _lambda, const int _vectorisation_idx) + { + printf("ERROR: Called dummy function `_host_poisson` in %s:%d\n", __FILE__, + __LINE__); + exit(EXIT_FAILURE); + } +} + +////// hashdefine_lines /////// + + + +__global__ void +_run_kernel_synapses_pre_push_spikes( + int num_parallel_blocks, + int _num_blocks, + int _num_threads, + int32_t* _eventspace) +{ + // apperently this is not always true and that is why _num_threads is passed as function argument + // if this assert never fails, we could remove the _num_threads form the argument list + assert(blockDim.x == _num_threads); + + using namespace brian; + + int bid = blockIdx.x; + int tid = threadIdx.x; + + int post_neuron_bid = bid % num_parallel_blocks; + int pre_neuron_idx = bid / num_parallel_blocks; + + int32_t spiking_neuron = _eventspace[pre_neuron_idx]; + assert(spiking_neuron != -1); + + // push to spikequeue if spiking_neuron is in sources of current SynapticPathway + if(synapses_pre.spikes_start <= spiking_neuron && spiking_neuron < synapses_pre.spikes_stop) + { + synapses_pre.queue->push_bundles( + post_neuron_bid, + tid, + _num_threads, + spiking_neuron - synapses_pre.spikes_start); + } +} + + +void _run_synapses_pre_push_spikes() +{ + using namespace brian; + + + + ///// HOST_CONSTANTS /////////// + const int _numN = 1; + const int _num_spikespace = 101; + double* const _array_synapses_delay = thrust::raw_pointer_cast(&_dynamic_array_synapses_delay[0]); + const int _numdelay = _dynamic_array_synapses_delay.size(); + double* const dev_array_synapses_delay = thrust::raw_pointer_cast(&dev_dynamic_array_synapses_delay[0]); + + if (synapses_pre_scalar_delay) + { + int num_eventspaces = dev_array_spikegeneratorgroup__spikespace.size(); + synapses_pre_eventspace_idx = (current_idx_array_spikegeneratorgroup__spikespace - synapses_pre_delay + num_eventspaces) % num_eventspaces; + + ////////////////////////////////////////////// + //// No pushing in no_or_const_delay_mode //// + ////////////////////////////////////////////// + } + else if (synapses_pre_max_size > 0) + { + + // get the number of spiking neurons + int32_t num_spiking_neurons; + CUDA_SAFE_CALL( + cudaMemcpyAsync(&num_spiking_neurons, + dev_array_spikegeneratorgroup__spikespace[current_idx_array_spikegeneratorgroup__spikespace] + _num_spikespace - 1, + sizeof(int32_t), cudaMemcpyDeviceToHost,stream) + ); + + // advance spike queues + _advance_kernel_synapses_pre_push_spikes<<<1, num_parallel_blocks,0,stream>>>(); + + CUDA_CHECK_ERROR("_advance_kernel_synapses_pre_push_spikes"); + + + static int num_threads, num_blocks; + static size_t needed_shared_memory = 0; + static bool first_run = true; + if (first_run) + { + needed_shared_memory = 0; + + // We don't need more then max(num_synapses) threads per block. 
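+        // Illustrative note on the launch configuration chosen below: threads per block are
+        // limited to the largest number of synapses any one block has to push
+        // (synapses_pre_max_size), capped at max_threads_per_block, while the number of
+        // blocks is recomputed every time step as num_parallel_blocks * num_spiking_neurons.
+        // The "theoretical occupancy" reported further down is resident warps per SM divided
+        // by the maximum warps per SM, i.e.
+        //   (max_active_blocks * num_threads / num_threads_per_warp) /
+        //   (max_threads_per_sm / num_threads_per_warp).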
+ num_threads = synapses_pre_max_size; + if (num_threads > max_threads_per_block) + { + num_threads = max_threads_per_block; + } + // num_blocks depends on num_spiking_neurons, which changes each time step + + + // calculate theoretical occupancy + int max_active_blocks; + CUDA_SAFE_CALL( + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, + _run_kernel_synapses_pre_push_spikes, num_threads, needed_shared_memory) + ); + + float occupancy = (max_active_blocks * num_threads / num_threads_per_warp) / + (float)(max_threads_per_sm / num_threads_per_warp); + + + // check if we have enough ressources to call kernel with given number + // of blocks and threads (can only occur for the else case above as for the + // first max. occupancy) + struct cudaFuncAttributes funcAttrib; + CUDA_SAFE_CALL( + cudaFuncGetAttributes(&funcAttrib, _run_kernel_synapses_pre_push_spikes) + ); + if (num_threads > funcAttrib.maxThreadsPerBlock) + { + // use the max num_threads before launch failure + num_threads = funcAttrib.maxThreadsPerBlock; + printf("WARNING Not enough ressources available to call " + "_run_kernel_synapses_pre_push_spikes " + "with maximum possible threads per block (%u). " + "Reducing num_threads to %u. (Kernel needs %i " + "registers per block, %i bytes of " + "statically-allocated shared memory per block, %i " + "bytes of local memory per thread and a total of %i " + "bytes of user-allocated constant memory)\n", + max_threads_per_block, num_threads, funcAttrib.numRegs, + funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, + funcAttrib.constSizeBytes); + + // calculate theoretical occupancy for new num_threads + CUDA_SAFE_CALL( + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, + _run_kernel_synapses_pre_push_spikes, num_threads, needed_shared_memory) + ); + + occupancy = (max_active_blocks * num_threads / num_threads_per_warp) / + (float)(max_threads_per_sm / num_threads_per_warp); + } + + + else + { + printf("INFO _run_kernel_synapses_pre_push_spikes\n" + "\t%u blocks\n" + "\t%u threads\n" + "\t%i registers per block\n" + "\t%i bytes statically-allocated shared memory per block\n" + "\t%i bytes local memory per thread\n" + "\t%i bytes user-allocated constant memory\n" + "\t%.3f theoretical occupancy\n", + num_blocks, num_threads, funcAttrib.numRegs, + funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes, + funcAttrib.constSizeBytes, occupancy); + } + first_run = false; + } + + + if (num_spiking_neurons > 0) + { + num_blocks = num_parallel_blocks * num_spiking_neurons; + + _run_kernel_synapses_pre_push_spikes<<>>( + num_parallel_blocks, + num_blocks, + num_threads, + dev_array_spikegeneratorgroup__spikespace[current_idx_array_spikegeneratorgroup__spikespace]); + + CUDA_CHECK_ERROR("_run_kernel_synapses_pre_push_spikes"); + } + } // end else if (synapses_pre_max_size > 0) + +} + + diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_pre_push_spikes.h b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_pre_push_spikes.h new file mode 100644 index 00000000..0968700d --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_pre_push_spikes.h @@ -0,0 +1,7 @@ +#ifndef _INCLUDED_synapses_pre_push_spikes +#define _INCLUDED_synapses_pre_push_spikes + +void _run_synapses_pre_push_spikes(); + + +#endif diff --git 
a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_synapses_create_generator_codeobject.cu b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_synapses_create_generator_codeobject.cu new file mode 100644 index 00000000..aaa6727a --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_synapses_create_generator_codeobject.cu @@ -0,0 +1,514 @@ +#include "code_objects/synapses_synapses_create_generator_codeobject.h" +#include "objects.h" +#include "brianlib/common_math.h" +#include "brianlib/cuda_utils.h" +#include "brianlib/stdint_compat.h" +#include +#include +#include +#include + +#include +#include "synapses_classes.h" + +#include +#include +#include +#include "brianlib/cuda_utils.h" +#include + + +////// SUPPORT CODE /////// +namespace { + double _host_rand(const int _vectorisation_idx); + double _host_randn(const int _vectorisation_idx); + int32_t _host_poisson(double _lambda, const int _vectorisation_idx); + + ///// block extra_device_helper ///// + + ///// support_code_lines ///// + + #define _rand(vectorisation_idx) (_ptr_array_synapses_synapses_create_generator_codeobject_rand[vectorisation_idx]) + template < typename T1, typename T2 > struct _higher_type; + template < > struct _higher_type { typedef int type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < typename T1, typename T2 > + __host__ __device__ static inline typename _higher_type::type + _brian_mod(T1 x, T2 y) + {{ + return x-y*floor(1.0*x/y); + }} + template < typename T1, typename T2 > + __host__ __device__ static inline typename _higher_type::type + _brian_floordiv(T1 x, T2 y) + {{ + return floor(1.0*x/y); + }} + #ifdef _MSC_VER + #define _brian_pow(x, y) (pow((double)(x), (y))) + #else + #define _brian_pow(x, y) (pow((x), (y))) + #endif + inline __device__ int _brian_atomicAdd(int* address, int val) + { + // hardware implementation + return atomicAdd(address, val); + } + inline __device__ float _brian_atomicAdd(float* address, float val) + { + // hardware 
implementation + return atomicAdd(address, val); + } + inline __device__ double _brian_atomicAdd(double* address, double val) + { + #if (__CUDA_ARCH__ >= 600) + // hardware implementation + return atomicAdd(address, val); + #else + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val + + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + #endif + } + inline __device__ int _brian_atomicMul(int* address, int val) + { + // software implementation + int old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, val * assumed); + } while (assumed != old); + return old; + } + inline __device__ float _brian_atomicMul(float* address, float val) + { + // software implementation + int* address_as_int = (int*)address; + int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __float_as_int(val * + __int_as_float(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __int_as_float(old); + } + inline __device__ double _brian_atomicMul(double* address, double val) + { + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val * + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + } + inline __device__ int _brian_atomicDiv(int* address, int val) + { + // software implementation + int old = *address, assumed; + do { + assumed = old; + old = atomicCAS(address, assumed, val / assumed); + } while (assumed != old); + return old; + } + inline __device__ float _brian_atomicDiv(float* address, float val) + { + // software implementation + int* address_as_int = (int*)address; + int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __float_as_int(val / + __int_as_float(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __int_as_float(old); + } + inline __device__ double _brian_atomicDiv(double* address, double val) + { + // software implementation + unsigned long long int* address_as_int = (unsigned long long int*)address; + unsigned long long int old = *address_as_int, assumed; + do { + assumed = old; + old = atomicCAS(address_as_int, assumed, + __double_as_longlong(val / + __longlong_as_double(assumed))); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + return __longlong_as_double(old); + } + + +// NOTE: _ptr_array_synapses_synapses_create_generator_codeobject_rand is NOT an array +// but an instance of CurandBuffer, which overloads the operator[], which then just +// returns the next random number in the buffer, ignoring the argument passed to operator[] +// NOTE: Put buffers into anonymous namespace such that _host_rand/n and rand/n +// in main code have access to it. 
+// NOTE: _host_rand/n is used in the host compiled implementation of binomial +// functions. Here, it just returns the next element from the CurandBuffer. +CurandBuffer _ptr_array_synapses_synapses_create_generator_codeobject_rand(&brian::curand_generator, RAND); +randomNumber_t _host_rand(const int _vectorisation_idx) +{ + return _ptr_array_synapses_synapses_create_generator_codeobject_rand[_vectorisation_idx]; +} + +CurandBuffer _ptr_array_synapses_synapses_create_generator_codeobject_randn(&brian::curand_generator, RANDN); +randomNumber_t _host_randn(const int _vectorisation_idx) +{ + return _ptr_array_synapses_synapses_create_generator_codeobject_randn[_vectorisation_idx]; +} + +// This is the C++ Standalone implementation of the poisson function, which we use +double _loggam(double x) { + double x0, x2, xp, gl, gl0; + int32_t k, n; + + static double a[10] = {8.333333333333333e-02, -2.777777777777778e-03, + 7.936507936507937e-04, -5.952380952380952e-04, + 8.417508417508418e-04, -1.917526917526918e-03, + 6.410256410256410e-03, -2.955065359477124e-02, + 1.796443723688307e-01, -1.39243221690590e+00}; + x0 = x; + n = 0; + if ((x == 1.0) || (x == 2.0)) + return 0.0; + else if (x <= 7.0) { + n = (int32_t)(7 - x); + x0 = x + n; + } + x2 = 1.0 / (x0 * x0); + xp = 2 * M_PI; + gl0 = a[9]; + for (k=8; k>=0; k--) { + gl0 *= x2; + gl0 += a[k]; + } + gl = gl0 / x0 + 0.5 * log(xp) + (x0 - 0.5) * log(x0) - x0; + if (x <= 7.0) { + for (k=1; k<=n; k++) { + gl -= log(x0 - 1.0); + x0 -= 1.0; + } + } + return gl; +} + +int32_t _poisson_mult(double lam, int _vectorisation_idx) { + int32_t X; + double prod, U, enlam; + + enlam = exp(-lam); + X = 0; + prod = 1.0; + while (1) { + U = _rand(_vectorisation_idx); + prod *= U; + if (prod > enlam) + X += 1; + else + return X; + } +} + +int32_t _poisson_ptrs(double lam, int _vectorisation_idx) { + int32_t k; + double U, V, slam, loglam, a, b, invalpha, vr, us; + + slam = sqrt(lam); + loglam = log(lam); + b = 0.931 + 2.53 * slam; + a = -0.059 + 0.02483 * b; + invalpha = 1.1239 + 1.1328 / (b - 3.4); + vr = 0.9277 - 3.6224 / (b - 2); + + while (1) { + U = _rand(_vectorisation_idx) - 0.5; + V = _rand(_vectorisation_idx); + us = 0.5 - abs(U); + k = (int32_t)floor((2 * a / us + b) * U + lam + 0.43); + if ((us >= 0.07) && (V <= vr)) + return k; + if ((k < 0) || ((us < 0.013) && (V > us))) + continue; + if ((log(V) + log(invalpha) - log(a / (us * us) + b)) <= + (-lam + k * loglam - _loggam(k + 1))) + return k; + } +} + +int32_t _host_poisson(double lam, int32_t _idx) { + if (lam >= 10) + return _poisson_ptrs(lam, _idx); + else if (lam == 0) + return 0; + else + return _poisson_mult(lam, _idx); +} +} + +////// hashdefine_lines /////// + + + + + +void _run_synapses_synapses_create_generator_codeobject() +{ + using namespace brian; + +std::clock_t start_timer = std::clock(); + +CUDA_CHECK_MEMORY(); +size_t used_device_memory_start = used_device_memory; + + + ///// HOST_CONSTANTS /////////// + const int _numN = 1; + int32_t* const _array_synapses_N_incoming = thrust::raw_pointer_cast(&_dynamic_array_synapses_N_incoming[0]); + const int _numN_incoming = _dynamic_array_synapses_N_incoming.size(); + int32_t* const _array_synapses_N_outgoing = thrust::raw_pointer_cast(&_dynamic_array_synapses_N_outgoing[0]); + const int _numN_outgoing = _dynamic_array_synapses_N_outgoing.size(); + int32_t* const _array_synapses__synaptic_post = thrust::raw_pointer_cast(&_dynamic_array_synapses__synaptic_post[0]); + const int _num_synaptic_post = _dynamic_array_synapses__synaptic_post.size(); + 
int32_t* const _array_synapses__synaptic_pre = thrust::raw_pointer_cast(&_dynamic_array_synapses__synaptic_pre[0]); + const int _num_synaptic_pre = _dynamic_array_synapses__synaptic_pre.size(); + + + ///// pointers_lines ///// + + int32_t* __restrict _ptr_array_synapses_N_outgoing = _array_synapses_N_outgoing; + int32_t* __restrict _ptr_array_synapses_N_incoming = _array_synapses_N_incoming; + int32_t* _ptr_array_synapses_N = _array_synapses_N; + int32_t* __restrict _ptr_array_synapses__synaptic_pre = _array_synapses__synaptic_pre; + int32_t* __restrict _ptr_array_synapses__synaptic_post = _array_synapses__synaptic_post; + + + const int _N_pre = 100; + const int _N_post = 2500; + _dynamic_array_synapses_N_incoming.resize(_N_post + 0); + _dynamic_array_synapses_N_outgoing.resize(_N_pre + 0); + + int _raw_pre_idx, _raw_post_idx; + const int _vectorisation_idx = -1; + ///// scalar_code['setup_iterator'] ///// + + + ///// scalar_code['create_j'] ///// + + + ///// scalar_code['create_cond'] ///// + + + ///// scalar_code['update_post'] ///// + + + + for(int _i = 0; _i < _N_pre; _i++) + { + + bool __cond, _cond; + _raw_pre_idx = _i + 0; + { + ///// vector_code['create_cond'] ///// + + const char _cond = true; + + __cond = _cond; + } + _cond = __cond; + if(!_cond) continue; + // Some explanation of this hackery. The problem is that we have multiple code blocks. + // Each code block is generated independently of the others, and they declare variables + // at the beginning if necessary (including declaring them as const if their values don't + // change). However, if two code blocks follow each other in the same C++ scope then + // that causes a redeclaration error. So we solve it by putting each block inside a + // pair of braces to create a new scope specific to each code block. However, that brings + // up another problem: we need the values from these code blocks. I don't have a general + // solution to this problem, but in the case of this particular template, we know which + // values we need from them so we simply create outer scoped variables to copy the value + // into. Later on we have a slightly more complicated problem because the original name + // _j has to be used, so we create two variables __j, _j at the outer scope, copy + // _j to __j in the inner scope (using the inner scope version of _j), and then + // __j to _j in the outer scope (to the outer scope version of _j). This outer scope + // version of _j will then be used in subsequent blocks. 
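+    // Sketch of the scoping pattern described above, with hypothetical names:
+    //   long __v, _v;              // outer-scope carriers
+    //   {
+    //       const long _v = ...;   // block-local (possibly const) value from generated code
+    //       __v = _v;              // export it to the outer scope
+    //   }
+    //   _v = __v;                  // re-expose it under the original name for later blocks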
+ long _uiter_low; + long _uiter_high; + long _uiter_step; + double _uiter_p; + { + ///// vector_code['setup_iterator'] ///// + + const int32_t _iter_low = 0; + const int32_t _iter_high = 2500; + const int32_t _iter_step = 1; + const double _iter_p = 0.15; + + _uiter_low = _iter_low; + _uiter_high = _iter_high; + _uiter_step = _iter_step; + _uiter_p = _iter_p; + } + if(_uiter_p==0) continue; + const bool _jump_algo = _uiter_p<0.25; + double _log1p; + if(_jump_algo) + _log1p = log(1-_uiter_p); + else + _log1p = 1.0; // will be ignored + const double _pconst = 1.0/log(1-_uiter_p); + for(int _k=_uiter_low; _k<_uiter_high; _k++) + { + if(_jump_algo) { + const double _r = _rand(_vectorisation_idx); + if(_r==0.0) break; + const int _jump = floor(log(_r)*_pconst)*_uiter_step; + _k += _jump; + if(_k>=_uiter_high) continue; + } else { + if(_rand(_vectorisation_idx)>=_uiter_p) continue; + } + long __j, _j, _pre_idx, __pre_idx; + { + ///// vector_code['create_j'] ///// + + const int32_t _pre_idx = _raw_pre_idx; + const int32_t _j = _k; + + __j = _j; // pick up the locally scoped _j and store in __j + __pre_idx = _pre_idx; + } + _j = __j; // make the previously locally scoped _j available + _pre_idx = __pre_idx; + _raw_post_idx = _j + 0; + + + if(_j<0 || _j>=_N_post) + { + cout << "Error: tried to create synapse to neuron j=" << _j << " outside range 0 to " << + _N_post-1 << endl; + exit(1); + } + + ///// vector_code['update_post'] ///// + + const int32_t _post_idx = _raw_post_idx; + const int32_t _n = 1; + + + for (int _repetition=0; _repetition<_n; _repetition++) { + _dynamic_array_synapses_N_outgoing[_pre_idx] += 1; + _dynamic_array_synapses_N_incoming[_post_idx] += 1; + _dynamic_array_synapses__synaptic_pre.push_back(_pre_idx); + _dynamic_array_synapses__synaptic_post.push_back(_post_idx); + } + } + } + + // now we need to resize all registered variables + const int32_t newsize = _dynamic_array_synapses__synaptic_pre.size(); + THRUST_CHECK_ERROR( + dev_dynamic_array_synapses__synaptic_post.resize(newsize) + ); + _dynamic_array_synapses__synaptic_post.resize(newsize); + THRUST_CHECK_ERROR( + dev_dynamic_array_synapses__synaptic_pre.resize(newsize) + ); + _dynamic_array_synapses__synaptic_pre.resize(newsize); + THRUST_CHECK_ERROR( + dev_dynamic_array_synapses_delay.resize(newsize) + ); + _dynamic_array_synapses_delay.resize(newsize); + THRUST_CHECK_ERROR( + dev_dynamic_array_synapses_weight.resize(newsize) + ); + _dynamic_array_synapses_weight.resize(newsize); + + // update the total number of synapses + _ptr_array_synapses_N[0] = newsize; + + // Check for occurrence of multiple source-target pairs in synapses ("synapse number") + std::map, int32_t> source_target_count; + for (int _i=0; _i source_target = std::pair(_dynamic_array_synapses__synaptic_pre[_i], _dynamic_array_synapses__synaptic_post[_i]); + source_target_count[source_target]++; + //printf("source target count = %i\n", source_target_count[source_target]); + if (source_target_count[source_target] > 1) + { + synapses_multiple_pre_post = true; + break; + } + } + + // copy changed host data to device + dev_dynamic_array_synapses_N_incoming = _dynamic_array_synapses_N_incoming; + dev_dynamic_array_synapses_N_outgoing = _dynamic_array_synapses_N_outgoing; + dev_dynamic_array_synapses__synaptic_pre = _dynamic_array_synapses__synaptic_pre; + dev_dynamic_array_synapses__synaptic_post = _dynamic_array_synapses__synaptic_post; + CUDA_SAFE_CALL( + cudaMemcpyAsync(dev_array_synapses_N, + _array_synapses_N, + sizeof(int32_t), + 
cudaMemcpyHostToDevice,stream) + ); + + + + +// free memory in CurandBuffers +_ptr_array_synapses_synapses_create_generator_codeobject_rand.free_memory(); +_ptr_array_synapses_synapses_create_generator_codeobject_randn.free_memory(); + +CUDA_CHECK_MEMORY(); +const double to_MB = 1.0 / (1024.0 * 1024.0); +double tot_memory_MB = (used_device_memory - used_device_memory_start) * to_MB; +double time_passed = (double)(std::clock() - start_timer) / CLOCKS_PER_SEC; +std::cout << "INFO: synapses creation took " << time_passed << "s"; +if (tot_memory_MB > 0) + std::cout << " and used " << tot_memory_MB << "MB of memory."; +std::cout << std::endl; +} + + diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_synapses_create_generator_codeobject.h b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_synapses_create_generator_codeobject.h new file mode 100644 index 00000000..88c4628e --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/code_objects/synapses_synapses_create_generator_codeobject.h @@ -0,0 +1,7 @@ +#ifndef _INCLUDED_synapses_synapses_create_generator_codeobject +#define _INCLUDED_synapses_synapses_create_generator_codeobject + +void _run_synapses_synapses_create_generator_codeobject(); + + +#endif diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/main.cu b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/main.cu new file mode 100644 index 00000000..615aab6a --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/main.cu @@ -0,0 +1,528 @@ +#include +#include "objects.h" +#include +#include +#include "run.h" +#include "brianlib/common_math.h" +#include "brianlib/cuda_utils.h" +#include "rand.h" + +#include "code_objects/synapses_synapses_create_generator_codeobject.h" +#include "code_objects/synapses_1_synapses_create_generator_codeobject.h" +#include "code_objects/synapses_2_synapses_create_generator_codeobject.h" +#include "code_objects/synapses_group_variable_set_conditional_codeobject.h" +#include "code_objects/synapses_1_group_variable_set_conditional_codeobject.h" +#include "code_objects/synapses_1_group_variable_set_conditional_codeobject_1.h" +#include "code_objects/neurongroup_1_stateupdater_codeobject.h" +#include "code_objects/neurongroup_stateupdater_codeobject.h" +#include "code_objects/neurongroup_1_thresholder_codeobject.h" +#include "code_objects/neurongroup_thresholder_codeobject.h" +#include "code_objects/spikegeneratorgroup_codeobject.h" +#include "code_objects/spikemonitor_codeobject.h" +#include "code_objects/spikemonitor_1_codeobject.h" +#include "code_objects/spikemonitor_2_codeobject.h" +#include "code_objects/synapses_1_pre_push_spikes.h" +#include "code_objects/before_run_synapses_1_pre_push_spikes.h" +#include "code_objects/synapses_1_pre_codeobject.h" +#include "code_objects/synapses_2_pre_push_spikes.h" +#include "code_objects/before_run_synapses_2_pre_push_spikes.h" +#include "code_objects/synapses_2_pre_codeobject.h" +#include "code_objects/synapses_pre_push_spikes.h" +#include "code_objects/before_run_synapses_pre_push_spikes.h" +#include "code_objects/synapses_pre_codeobject.h" +#include "code_objects/synapses_1_post_push_spikes.h" +#include "code_objects/before_run_synapses_1_post_push_spikes.h" +#include "code_objects/synapses_1_post_codeobject.h" + + +#include +#include +#include +#include "cuda_profiler_api.h" + + + + +int 
main(int argc, char **argv) +{ + + + + // seed variable set in Python through brian2.seed() calls can use this + // variable (see device.py CUDAStandaloneDevice.generate_main_source()) + unsigned long long seed; + + const std::clock_t _start_time = std::clock(); + + CUDA_SAFE_CALL( + cudaSetDevice(0) + ); + + cudaDeviceProp props; + CUDA_SAFE_CALL( + cudaGetDeviceProperties(&props, 0) + ); + size_t limit = 128 * 1024 * 1024; + CUDA_SAFE_CALL( + cudaDeviceSetLimit(cudaLimitMallocHeapSize, limit) + ); + CUDA_SAFE_CALL( + cudaDeviceSynchronize() + ); + + const double _run_time2 = (double)(std::clock() -_start_time)/CLOCKS_PER_SEC; + printf("INFO: setting cudaDevice stuff took %f seconds\n", _run_time2); + + brian_start(); + + + + + const std::clock_t _start_time3 = std::clock(); + { + using namespace brian; + + + for(int i=0; i<_num__array_spikegeneratorgroup__spikespace; i++) + { + _array_spikegeneratorgroup__spikespace[i] = - 1; + } + CUDA_SAFE_CALL( + cudaMemcpy( + dev_array_spikegeneratorgroup__spikespace[current_idx_array_spikegeneratorgroup__spikespace], + &_array_spikegeneratorgroup__spikespace[0], + sizeof(_array_spikegeneratorgroup__spikespace[0])*_num__array_spikegeneratorgroup__spikespace, + cudaMemcpyHostToDevice + ) + ); + _array_spikegeneratorgroup__spikespace[_num__array_spikegeneratorgroup__spikespace - 1] = 0; + CUDA_SAFE_CALL( + cudaMemcpy( + dev_array_spikegeneratorgroup__spikespace[current_idx_array_spikegeneratorgroup__spikespace] + _num__array_spikegeneratorgroup__spikespace - 1, + &_array_spikegeneratorgroup__spikespace[_num__array_spikegeneratorgroup__spikespace - 1], + sizeof(_array_spikegeneratorgroup__spikespace[_num__array_spikegeneratorgroup__spikespace - 1]), + cudaMemcpyHostToDevice + ) + ); + for(int i=0; i<_num__array_neurongroup__spikespace; i++) + { + _array_neurongroup__spikespace[i] = - 1; + } + CUDA_SAFE_CALL( + cudaMemcpy( + dev_array_neurongroup__spikespace[current_idx_array_neurongroup__spikespace], + &_array_neurongroup__spikespace[0], + sizeof(_array_neurongroup__spikespace[0])*_num__array_neurongroup__spikespace, + cudaMemcpyHostToDevice + ) + ); + _array_neurongroup__spikespace[_num__array_neurongroup__spikespace - 1] = 0; + CUDA_SAFE_CALL( + cudaMemcpy( + dev_array_neurongroup__spikespace[current_idx_array_neurongroup__spikespace] + _num__array_neurongroup__spikespace - 1, + &_array_neurongroup__spikespace[_num__array_neurongroup__spikespace - 1], + sizeof(_array_neurongroup__spikespace[_num__array_neurongroup__spikespace - 1]), + cudaMemcpyHostToDevice + ) + ); + for(int i=0; i<_num__array_neurongroup_1__spikespace; i++) + { + _array_neurongroup_1__spikespace[i] = - 1; + } + CUDA_SAFE_CALL( + cudaMemcpy( + dev_array_neurongroup_1__spikespace[current_idx_array_neurongroup_1__spikespace], + &_array_neurongroup_1__spikespace[0], + sizeof(_array_neurongroup_1__spikespace[0])*_num__array_neurongroup_1__spikespace, + cudaMemcpyHostToDevice + ) + ); + _array_neurongroup_1__spikespace[_num__array_neurongroup_1__spikespace - 1] = 0; + CUDA_SAFE_CALL( + cudaMemcpy( + dev_array_neurongroup_1__spikespace[current_idx_array_neurongroup_1__spikespace] + _num__array_neurongroup_1__spikespace - 1, + &_array_neurongroup_1__spikespace[_num__array_neurongroup_1__spikespace - 1], + sizeof(_array_neurongroup_1__spikespace[_num__array_neurongroup_1__spikespace - 1]), + cudaMemcpyHostToDevice + ) + ); + _array_defaultclock_dt[0] = 0.0001; + CUDA_SAFE_CALL( + cudaMemcpy( + dev_array_defaultclock_dt + 0, + &_array_defaultclock_dt[0], + sizeof(_array_defaultclock_dt[0]), + 
cudaMemcpyHostToDevice + ) + ); + _array_defaultclock_dt[0] = 0.0001; + CUDA_SAFE_CALL( + cudaMemcpy( + dev_array_defaultclock_dt + 0, + &_array_defaultclock_dt[0], + sizeof(_array_defaultclock_dt[0]), + cudaMemcpyHostToDevice + ) + ); + _array_defaultclock_dt[0] = 0.0001; + CUDA_SAFE_CALL( + cudaMemcpy( + dev_array_defaultclock_dt + 0, + &_array_defaultclock_dt[0], + sizeof(_array_defaultclock_dt[0]), + cudaMemcpyHostToDevice + ) + ); + _dynamic_array_spikegeneratorgroup_spike_number.resize(19676); + THRUST_CHECK_ERROR(dev_dynamic_array_spikegeneratorgroup_spike_number.resize(19676)); + for(int i=0; i<_num__static_array__dynamic_array_spikegeneratorgroup_spike_number; i++) + { + _dynamic_array_spikegeneratorgroup_spike_number[i] = _static_array__dynamic_array_spikegeneratorgroup_spike_number[i]; + } + CUDA_SAFE_CALL( + cudaMemcpy( + thrust::raw_pointer_cast(&dev_dynamic_array_spikegeneratorgroup_spike_number[0]), + &_dynamic_array_spikegeneratorgroup_spike_number[0], + sizeof(_dynamic_array_spikegeneratorgroup_spike_number[0])*_dynamic_array_spikegeneratorgroup_spike_number.size(), + cudaMemcpyHostToDevice + ) + ); + _dynamic_array_spikegeneratorgroup_neuron_index.resize(19676); + THRUST_CHECK_ERROR(dev_dynamic_array_spikegeneratorgroup_neuron_index.resize(19676)); + for(int i=0; i<_num__static_array__dynamic_array_spikegeneratorgroup_neuron_index; i++) + { + _dynamic_array_spikegeneratorgroup_neuron_index[i] = _static_array__dynamic_array_spikegeneratorgroup_neuron_index[i]; + } + CUDA_SAFE_CALL( + cudaMemcpy( + thrust::raw_pointer_cast(&dev_dynamic_array_spikegeneratorgroup_neuron_index[0]), + &_dynamic_array_spikegeneratorgroup_neuron_index[0], + sizeof(_dynamic_array_spikegeneratorgroup_neuron_index[0])*_dynamic_array_spikegeneratorgroup_neuron_index.size(), + cudaMemcpyHostToDevice + ) + ); + _dynamic_array_spikegeneratorgroup_spike_time.resize(19676); + THRUST_CHECK_ERROR(dev_dynamic_array_spikegeneratorgroup_spike_time.resize(19676)); + for(int i=0; i<_num__static_array__dynamic_array_spikegeneratorgroup_spike_time; i++) + { + _dynamic_array_spikegeneratorgroup_spike_time[i] = _static_array__dynamic_array_spikegeneratorgroup_spike_time[i]; + } + CUDA_SAFE_CALL( + cudaMemcpy( + thrust::raw_pointer_cast(&dev_dynamic_array_spikegeneratorgroup_spike_time[0]), + &_dynamic_array_spikegeneratorgroup_spike_time[0], + sizeof(_dynamic_array_spikegeneratorgroup_spike_time[0])*_dynamic_array_spikegeneratorgroup_spike_time.size(), + cudaMemcpyHostToDevice + ) + ); + _dynamic_array_spikegeneratorgroup__timebins.resize(19676); + THRUST_CHECK_ERROR(dev_dynamic_array_spikegeneratorgroup__timebins.resize(19676)); + _array_spikegeneratorgroup__lastindex[0] = 0; + CUDA_SAFE_CALL( + cudaMemcpy( + dev_array_spikegeneratorgroup__lastindex + 0, + &_array_spikegeneratorgroup__lastindex[0], + sizeof(_array_spikegeneratorgroup__lastindex[0]), + cudaMemcpyHostToDevice + ) + ); + _array_spikegeneratorgroup_period[0] = 0.0; + CUDA_SAFE_CALL( + cudaMemcpy( + dev_array_spikegeneratorgroup_period + 0, + &_array_spikegeneratorgroup_period[0], + sizeof(_array_spikegeneratorgroup_period[0]), + cudaMemcpyHostToDevice + ) + ); + for(int i=0; i<_num__array_neurongroup_lastspike; i++) + { + _array_neurongroup_lastspike[i] = - 10000.0; + } + CUDA_SAFE_CALL( + cudaMemcpy( + dev_array_neurongroup_lastspike, + &_array_neurongroup_lastspike[0], + sizeof(_array_neurongroup_lastspike[0])*_num__array_neurongroup_lastspike, + cudaMemcpyHostToDevice + ) + ); + for(int i=0; i<_num__array_neurongroup_not_refractory; i++) + { + 
_array_neurongroup_not_refractory[i] = true; + } + CUDA_SAFE_CALL( + cudaMemcpy( + dev_array_neurongroup_not_refractory, + &_array_neurongroup_not_refractory[0], + sizeof(_array_neurongroup_not_refractory[0])*_num__array_neurongroup_not_refractory, + cudaMemcpyHostToDevice + ) + ); + for(int i=0; i<_num__array_neurongroup_1_lastspike; i++) + { + _array_neurongroup_1_lastspike[i] = - 10000.0; + } + CUDA_SAFE_CALL( + cudaMemcpy( + dev_array_neurongroup_1_lastspike, + &_array_neurongroup_1_lastspike[0], + sizeof(_array_neurongroup_1_lastspike[0])*_num__array_neurongroup_1_lastspike, + cudaMemcpyHostToDevice + ) + ); + for(int i=0; i<_num__array_neurongroup_1_not_refractory; i++) + { + _array_neurongroup_1_not_refractory[i] = true; + } + CUDA_SAFE_CALL( + cudaMemcpy( + dev_array_neurongroup_1_not_refractory, + &_array_neurongroup_1_not_refractory[0], + sizeof(_array_neurongroup_1_not_refractory[0])*_num__array_neurongroup_1_not_refractory, + cudaMemcpyHostToDevice + ) + ); + _dynamic_array_synapses_1_delay.resize(1); + THRUST_CHECK_ERROR(dev_dynamic_array_synapses_1_delay.resize(1)); + _dynamic_array_synapses_1_delay.resize(1); + THRUST_CHECK_ERROR(dev_dynamic_array_synapses_1_delay.resize(1)); + _dynamic_array_synapses_1_delay[0] = 0.0; + CUDA_SAFE_CALL( + cudaMemcpy( + thrust::raw_pointer_cast(&dev_dynamic_array_synapses_1_delay[0]) + 0, + &_dynamic_array_synapses_1_delay[0], + sizeof(_dynamic_array_synapses_1_delay[0]), + cudaMemcpyHostToDevice + ) + ); + _dynamic_array_synapses_2_delay.resize(1); + THRUST_CHECK_ERROR(dev_dynamic_array_synapses_2_delay.resize(1)); + _dynamic_array_synapses_2_delay.resize(1); + THRUST_CHECK_ERROR(dev_dynamic_array_synapses_2_delay.resize(1)); + _dynamic_array_synapses_2_delay[0] = 0.0; + CUDA_SAFE_CALL( + cudaMemcpy( + thrust::raw_pointer_cast(&dev_dynamic_array_synapses_2_delay[0]) + 0, + &_dynamic_array_synapses_2_delay[0], + sizeof(_dynamic_array_synapses_2_delay[0]), + cudaMemcpyHostToDevice + ) + ); + _run_synapses_synapses_create_generator_codeobject(); + _run_synapses_1_synapses_create_generator_codeobject(); + _run_synapses_2_synapses_create_generator_codeobject(); + _run_synapses_group_variable_set_conditional_codeobject(); + _run_synapses_1_group_variable_set_conditional_codeobject(); + _run_synapses_1_group_variable_set_conditional_codeobject_1(); + for(int i=0; i<_num__array_neurongroup_V; i++) + { + _array_neurongroup_V[i] = - 0.06356; + } + CUDA_SAFE_CALL( + cudaMemcpy( + dev_array_neurongroup_V, + &_array_neurongroup_V[0], + sizeof(_array_neurongroup_V[0])*_num__array_neurongroup_V, + cudaMemcpyHostToDevice + ) + ); + for(int i=0; i<_num__array_neurongroup_h; i++) + { + _array_neurongroup_h[i] = 1; + } + CUDA_SAFE_CALL( + cudaMemcpy( + dev_array_neurongroup_h, + &_array_neurongroup_h[0], + sizeof(_array_neurongroup_h[0])*_num__array_neurongroup_h, + cudaMemcpyHostToDevice + ) + ); + for(int i=0; i<_num__array_neurongroup_m; i++) + { + _array_neurongroup_m[i] = 0; + } + CUDA_SAFE_CALL( + cudaMemcpy( + dev_array_neurongroup_m, + &_array_neurongroup_m[0], + sizeof(_array_neurongroup_m[0])*_num__array_neurongroup_m, + cudaMemcpyHostToDevice + ) + ); + for(int i=0; i<_num__array_neurongroup_n; i++) + { + _array_neurongroup_n[i] = 0.5; + } + CUDA_SAFE_CALL( + cudaMemcpy( + dev_array_neurongroup_n, + &_array_neurongroup_n[0], + sizeof(_array_neurongroup_n[0])*_num__array_neurongroup_n, + cudaMemcpyHostToDevice + ) + ); + for(int i=0; i<_num__array_neurongroup_1_V; i++) + { + _array_neurongroup_1_V[i] = - 0.06356; + } + CUDA_SAFE_CALL( + cudaMemcpy( + 
dev_array_neurongroup_1_V, + &_array_neurongroup_1_V[0], + sizeof(_array_neurongroup_1_V[0])*_num__array_neurongroup_1_V, + cudaMemcpyHostToDevice + ) + ); + for(int i=0; i<_num__array_neurongroup_1_h; i++) + { + _array_neurongroup_1_h[i] = 1; + } + CUDA_SAFE_CALL( + cudaMemcpy( + dev_array_neurongroup_1_h, + &_array_neurongroup_1_h[0], + sizeof(_array_neurongroup_1_h[0])*_num__array_neurongroup_1_h, + cudaMemcpyHostToDevice + ) + ); + for(int i=0; i<_num__array_neurongroup_1_m; i++) + { + _array_neurongroup_1_m[i] = 0; + } + CUDA_SAFE_CALL( + cudaMemcpy( + dev_array_neurongroup_1_m, + &_array_neurongroup_1_m[0], + sizeof(_array_neurongroup_1_m[0])*_num__array_neurongroup_1_m, + cudaMemcpyHostToDevice + ) + ); + for(int i=0; i<_num__array_neurongroup_1_n; i++) + { + _array_neurongroup_1_n[i] = 0.5; + } + CUDA_SAFE_CALL( + cudaMemcpy( + dev_array_neurongroup_1_n, + &_array_neurongroup_1_n[0], + sizeof(_array_neurongroup_1_n[0])*_num__array_neurongroup_1_n, + cudaMemcpyHostToDevice + ) + ); + _array_defaultclock_timestep[0] = 0; + CUDA_SAFE_CALL( + cudaMemcpy( + dev_array_defaultclock_timestep + 0, + &_array_defaultclock_timestep[0], + sizeof(_array_defaultclock_timestep[0]), + cudaMemcpyHostToDevice + ) + ); + _array_defaultclock_t[0] = 0.0; + CUDA_SAFE_CALL( + cudaMemcpy( + dev_array_defaultclock_t + 0, + &_array_defaultclock_t[0], + sizeof(_array_defaultclock_t[0]), + cudaMemcpyHostToDevice + ) + ); + _array_spikegeneratorgroup__lastindex[0] = 0; + CUDA_SAFE_CALL( + cudaMemcpy( + dev_array_spikegeneratorgroup__lastindex + 0, + &_array_spikegeneratorgroup__lastindex[0], + sizeof(_array_spikegeneratorgroup__lastindex[0]), + cudaMemcpyHostToDevice + ) + ); + for(int i=0; i<_num__static_array__dynamic_array_spikegeneratorgroup__timebins; i++) + { + _dynamic_array_spikegeneratorgroup__timebins[i] = _static_array__dynamic_array_spikegeneratorgroup__timebins[i]; + } + CUDA_SAFE_CALL( + cudaMemcpy( + thrust::raw_pointer_cast(&dev_dynamic_array_spikegeneratorgroup__timebins[0]), + &_dynamic_array_spikegeneratorgroup__timebins[0], + sizeof(_dynamic_array_spikegeneratorgroup__timebins[0])*_dynamic_array_spikegeneratorgroup__timebins.size(), + cudaMemcpyHostToDevice + ) + ); + _array_spikegeneratorgroup__period_bins[0] = 0.0; + CUDA_SAFE_CALL( + cudaMemcpy( + dev_array_spikegeneratorgroup__period_bins + 0, + &_array_spikegeneratorgroup__period_bins[0], + sizeof(_array_spikegeneratorgroup__period_bins[0]), + cudaMemcpyHostToDevice + ) + ); + _before_run_synapses_1_pre_push_spikes(); + _before_run_synapses_2_pre_push_spikes(); + _before_run_synapses_pre_push_spikes(); + _before_run_synapses_1_post_push_spikes(); + dev_dynamic_array_synapses_1__synaptic_pre.clear(); + dev_dynamic_array_synapses_1__synaptic_pre.shrink_to_fit(); + dev_dynamic_array_synapses_2__synaptic_pre.clear(); + dev_dynamic_array_synapses_2__synaptic_pre.shrink_to_fit(); + dev_dynamic_array_synapses__synaptic_pre.clear(); + dev_dynamic_array_synapses__synaptic_pre.shrink_to_fit(); + magicnetwork.clear(); + magicnetwork.add(&defaultclock, _run_random_number_buffer); + magicnetwork.add(&defaultclock, _run_neurongroup_1_stateupdater_codeobject); + magicnetwork.add(&defaultclock, _run_neurongroup_stateupdater_codeobject); + magicnetwork.add(&defaultclock, _run_neurongroup_1_thresholder_codeobject); + magicnetwork.add(&defaultclock, _run_neurongroup_thresholder_codeobject); + magicnetwork.add(&defaultclock, _run_spikegeneratorgroup_codeobject); + magicnetwork.add(&defaultclock, _run_spikemonitor_codeobject); + 
magicnetwork.add(&defaultclock, _run_spikemonitor_1_codeobject); + magicnetwork.add(&defaultclock, _run_spikemonitor_2_codeobject); + magicnetwork.add(&defaultclock, _run_synapses_1_pre_push_spikes); + magicnetwork.add(&defaultclock, _run_synapses_1_pre_codeobject); + magicnetwork.add(&defaultclock, _run_synapses_2_pre_push_spikes); + magicnetwork.add(&defaultclock, _run_synapses_2_pre_codeobject); + magicnetwork.add(&defaultclock, _run_synapses_pre_push_spikes); + magicnetwork.add(&defaultclock, _run_synapses_pre_codeobject); + magicnetwork.add(&defaultclock, _run_synapses_1_post_push_spikes); + magicnetwork.add(&defaultclock, _run_synapses_1_post_codeobject); + CUDA_SAFE_CALL(cudaProfilerStart()); + magicnetwork.run(0.01, NULL, 10.0); + random_number_buffer.run_finished(); + CUDA_SAFE_CALL(cudaDeviceSynchronize()); + CUDA_SAFE_CALL(cudaProfilerStop()); + _copyToHost_spikemonitor_codeobject(); + _debugmsg_spikemonitor_codeobject(); + + _copyToHost_spikemonitor_1_codeobject(); + _debugmsg_spikemonitor_1_codeobject(); + + _copyToHost_spikemonitor_2_codeobject(); + _debugmsg_spikemonitor_2_codeobject(); + + _debugmsg_synapses_1_pre_codeobject(); + + _debugmsg_synapses_2_pre_codeobject(); + + _debugmsg_synapses_pre_codeobject(); + + _debugmsg_synapses_1_post_codeobject(); + + } + + const double _run_time3 = (double)(std::clock() -_start_time3)/CLOCKS_PER_SEC; + printf("INFO: main_lines took %f seconds\n", _run_time3); + + + + brian_end(); + + + + // Profiling + const double _run_time = (double)(std::clock() -_start_time)/CLOCKS_PER_SEC; + printf("INFO: main function took %f seconds\n", _run_time); + + return 0; +} \ No newline at end of file diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/makefile b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/makefile new file mode 100644 index 00000000..672482e5 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/makefile @@ -0,0 +1,23 @@ +PROGRAM = main + +SRCS = code_objects/before_run_synapses_1_post_push_spikes.cu code_objects/before_run_synapses_1_pre_push_spikes.cu code_objects/before_run_synapses_2_pre_push_spikes.cu code_objects/before_run_synapses_pre_push_spikes.cu code_objects/neurongroup_1_stateupdater_codeobject.cu code_objects/neurongroup_1_thresholder_codeobject.cu code_objects/neurongroup_stateupdater_codeobject.cu code_objects/neurongroup_thresholder_codeobject.cu code_objects/spikegeneratorgroup_codeobject.cu code_objects/spikemonitor_1_codeobject.cu code_objects/spikemonitor_2_codeobject.cu code_objects/spikemonitor_codeobject.cu code_objects/synapses_1_group_variable_set_conditional_codeobject.cu code_objects/synapses_1_group_variable_set_conditional_codeobject_1.cu code_objects/synapses_1_post_codeobject.cu code_objects/synapses_1_post_push_spikes.cu code_objects/synapses_1_pre_codeobject.cu code_objects/synapses_1_pre_push_spikes.cu code_objects/synapses_1_synapses_create_generator_codeobject.cu code_objects/synapses_2_pre_codeobject.cu code_objects/synapses_2_pre_push_spikes.cu code_objects/synapses_2_synapses_create_generator_codeobject.cu code_objects/synapses_group_variable_set_conditional_codeobject.cu code_objects/synapses_pre_codeobject.cu code_objects/synapses_pre_push_spikes.cu code_objects/synapses_synapses_create_generator_codeobject.cu main.cu network.cu objects.cu rand.cu run.cu synapses_classes.cu +H_SRCS = brianlib/clocks.h brianlib/common_math.h brianlib/cudaVector.h brianlib/cuda_utils.h brianlib/curand_buffer.h 
brianlib/dynamic_array.h brianlib/spikequeue.h brianlib/stdint_compat.h code_objects/before_run_synapses_1_post_push_spikes.h code_objects/before_run_synapses_1_pre_push_spikes.h code_objects/before_run_synapses_2_pre_push_spikes.h code_objects/before_run_synapses_pre_push_spikes.h code_objects/neurongroup_1_stateupdater_codeobject.h code_objects/neurongroup_1_thresholder_codeobject.h code_objects/neurongroup_stateupdater_codeobject.h code_objects/neurongroup_thresholder_codeobject.h code_objects/spikegeneratorgroup_codeobject.h code_objects/spikemonitor_1_codeobject.h code_objects/spikemonitor_2_codeobject.h code_objects/spikemonitor_codeobject.h code_objects/synapses_1_group_variable_set_conditional_codeobject.h code_objects/synapses_1_group_variable_set_conditional_codeobject_1.h code_objects/synapses_1_post_codeobject.h code_objects/synapses_1_post_push_spikes.h code_objects/synapses_1_pre_codeobject.h code_objects/synapses_1_pre_push_spikes.h code_objects/synapses_1_synapses_create_generator_codeobject.h code_objects/synapses_2_pre_codeobject.h code_objects/synapses_2_pre_push_spikes.h code_objects/synapses_2_synapses_create_generator_codeobject.h code_objects/synapses_group_variable_set_conditional_codeobject.h code_objects/synapses_pre_codeobject.h code_objects/synapses_pre_push_spikes.h code_objects/synapses_synapses_create_generator_codeobject.h network.h objects.h rand.h run.h synapses_classes.h +OBJS = ${SRCS:.cu=.o} +OBJS := ${OBJS:.cpp=.o} +OBJS := ${OBJS:.c=.o} +NVCC = @/cognition/home/local/cuda/cuda-11.2/bin/nvcc +NVCCFLAGS = -I. -std=c++11 -arch=sm_61 -w -use_fast_math -g -DDEBUG -G -DTHRUST_DEBUG -Xcompiler "-I/cognition/home/subora/miniconda3/envs/b2c/include -w -O3 -ffast-math -fno-finite-math-only -march=native -std=c++11" +LFLAGS = -lcurand -I. 
-arch=sm_61 -L/cognition/home/subora/miniconda3/envs/b2c/lib -Xlinker -R/cognition/home/subora/miniconda3/envs/b2c/lib -w -g -G + +all: $(PROGRAM) + +.PHONY: all clean + +$(PROGRAM): $(OBJS) + $(NVCC) $(LFLAGS) $(OBJS) -o $(PROGRAM) + +clean: + rm $(OBJS) $(PROGRAM) $(DEPS) + +%.o : %.cu + $(NVCC) $(NVCCFLAGS) -dc $< -o $@ diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/network.cu b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/network.cu new file mode 100644 index 00000000..ef5b5143 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/network.cu @@ -0,0 +1,153 @@ + +#include "brianlib/cuda_utils.h" +#include "objects.h" +#include "network.h" +#include <stdlib.h> +#include <iostream> +#include <ctime> +#include <utility> +#include <set> +#include <vector> + +#define Clock_epsilon 1e-14 + +double Network::_last_run_time = 0.0; +double Network::_last_run_completed_fraction = 0.0; + +Network::Network() +{ + t = 0.0; +} + +void Network::clear() +{ + objects.clear(); +} + +void Network::add(Clock *clock, codeobj_func func) +{ +#if defined(_MSC_VER) && (_MSC_VER>=1700) + objects.push_back(std::make_pair(std::move(clock), std::move(func))); +#else + objects.push_back(std::make_pair(clock, func)); +#endif +} + +void Network::run(const double duration, void (*report_func)(const double, const double, const double, const double), const double report_period) +{ + std::clock_t start, current; + const double t_start = t; + const double t_end = t + duration; + double next_report_time = report_period; + // compute the set of clocks + compute_clocks(); + // set interval for all clocks + + for(std::set<Clock*>::iterator i=clocks.begin(); i!=clocks.end(); i++) + (*i)->set_interval(t, t_end); + + start = std::clock(); + if (report_func) + { + report_func(0.0, 0.0, t_start, duration); + } + + Clock* clock = next_clocks(); + double elapsed_realtime; + bool did_break_early = false; + + while(clock && clock->running()) + { + t = clock->t[0]; + + for(int i=0; i<objects.size(); i++) + { + if (report_func) + { + current = std::clock(); + const double elapsed = (double)(current - start) / CLOCKS_PER_SEC; + if (elapsed > next_report_time) + { + report_func(elapsed, (clock->t[0]-t_start)/duration, t_start, duration); + next_report_time += report_period; + } + } + Clock *obj_clock = objects[i].first; + // Only execute the object if it uses the right clock for this step + if (curclocks.find(obj_clock) != curclocks.end()) + { + codeobj_func func = objects[i].second; + if (func) // code objects can be NULL in cases where we store just the clock + { + func(); + } + } + } + for(std::set<Clock*>::iterator i=curclocks.begin(); i!=curclocks.end(); i++) + (*i)->tick(); + clock = next_clocks(); + + // Advance index for circular eventspace vector (for no_or_const_delay_mode) + brian::current_idx_array_neurongroup_1__spikespace = (brian::current_idx_array_neurongroup_1__spikespace + 1) % brian::dev_array_neurongroup_1__spikespace.size(); + brian::current_idx_array_neurongroup__spikespace = (brian::current_idx_array_neurongroup__spikespace + 1) % brian::dev_array_neurongroup__spikespace.size(); + brian::previous_idx_array_spikegeneratorgroup__spikespace = brian::current_idx_array_spikegeneratorgroup__spikespace; + brian::current_idx_array_spikegeneratorgroup__spikespace = (brian::current_idx_array_spikegeneratorgroup__spikespace + 1) % brian::dev_array_spikegeneratorgroup__spikespace.size(); + + current = std::clock(); + elapsed_realtime = (double)(current - start)/CLOCKS_PER_SEC; + + + } + + if(!did_break_early) t = t_end; + + _last_run_time = elapsed_realtime; + if(duration>0) + { + _last_run_completed_fraction = (t-t_start)/duration; + } else {
_last_run_completed_fraction = 1.0; + } + if (report_func) + { + report_func(elapsed_realtime, 1.0, t_start, duration); + } +} + +void Network::compute_clocks() +{ + clocks.clear(); + for(int i=0; i<objects.size(); i++) + { + Clock *clock = objects[i].first; + clocks.insert(clock); + } +} + +Clock* Network::next_clocks() +{ + // find the clock with the smallest t value + Clock *minclock = *clocks.begin(); + if (!minclock) // empty set of clocks + return NULL; + + for(std::set<Clock*>::iterator i=clocks.begin(); i!=clocks.end(); i++) + { + Clock *clock = *i; + if(clock->t[0] < minclock->t[0]) + minclock = clock; + } + // find set of equal clocks + curclocks.clear(); + + double t = minclock->t[0]; + for(std::set<Clock*>::iterator i=clocks.begin(); i!=clocks.end(); i++) + { + Clock *clock = *i; + double s = clock->t[0]; + if(s==t || fabs(s-t)<=Clock_epsilon) + curclocks.insert(clock); + } + return minclock; +} + diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/network.h b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/network.h new file mode 100644 index 00000000..92f037b9 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/network.h @@ -0,0 +1,31 @@ + +#ifndef _BRIAN_NETWORK_H +#define _BRIAN_NETWORK_H + +#include <vector> +#include <utility> +#include <set> +#include <string> +#include "brianlib/clocks.h" + +typedef void (*codeobj_func)(); + +class Network +{ + std::set<Clock*> clocks, curclocks; + void compute_clocks(); + Clock* next_clocks(); +public: + std::vector< std::pair< Clock*, codeobj_func > > objects; + double t; + static double _last_run_time; + static double _last_run_completed_fraction; + + Network(); + void clear(); + void add(Clock *clock, codeobj_func func); + void run(const double duration, void (*report_func)(const double, const double, const double, const double), const double report_period); +}; + +#endif + diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/objects.cu b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/objects.cu new file mode 100644 index 00000000..3f290d2f --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/objects.cu @@ -0,0 +1,2518 @@ + +#include "objects.h" +#include "synapses_classes.h" +#include "brianlib/clocks.h" +#include "brianlib/cuda_utils.h" +#include "network.h" +#include "rand.h" +#include <stdint.h> +#include <iostream> +#include <fstream> +#include <ctime> +#include <utility> + +#include <thrust/host_vector.h> +#include <thrust/device_vector.h> +#include <curand.h> +#include <curand_kernel.h> +#include <cuda_runtime.h> + +size_t brian::used_device_memory = 0; + +//////////////// clocks /////////////////// +Clock brian::defaultclock; + +//////////////// networks ///////////////// +Network brian::magicnetwork; + +//////////////// arrays /////////////////// +double * brian::_array_defaultclock_dt; +double * brian::dev_array_defaultclock_dt; +__device__ double * brian::d_array_defaultclock_dt; +const int brian::_num__array_defaultclock_dt = 1; + +double * brian::_array_defaultclock_t; +double * brian::dev_array_defaultclock_t; +__device__ double * brian::d_array_defaultclock_t; +const int brian::_num__array_defaultclock_t = 1; + +int64_t * brian::_array_defaultclock_timestep; +int64_t * brian::dev_array_defaultclock_timestep; +__device__ int64_t * brian::d_array_defaultclock_timestep; +const int brian::_num__array_defaultclock_timestep = 1; + +double * brian::_array_neurongroup_1_g_eKC_eKC; +double * brian::dev_array_neurongroup_1_g_eKC_eKC; +__device__ double * brian::d_array_neurongroup_1_g_eKC_eKC; +const int brian::_num__array_neurongroup_1_g_eKC_eKC = 100; + +double * brian::_array_neurongroup_1_g_iKC_eKC; +double * brian::dev_array_neurongroup_1_g_iKC_eKC; +__device__ double * brian::d_array_neurongroup_1_g_iKC_eKC; +const int brian::_num__array_neurongroup_1_g_iKC_eKC = 100; + +double * brian::_array_neurongroup_1_h; +double *
brian::dev_array_neurongroup_1_h; +__device__ double * brian::d_array_neurongroup_1_h; +const int brian::_num__array_neurongroup_1_h = 100; + +int32_t * brian::_array_neurongroup_1_i; +int32_t * brian::dev_array_neurongroup_1_i; +__device__ int32_t * brian::d_array_neurongroup_1_i; +const int brian::_num__array_neurongroup_1_i = 100; + +double * brian::_array_neurongroup_1_lastspike; +double * brian::dev_array_neurongroup_1_lastspike; +__device__ double * brian::d_array_neurongroup_1_lastspike; +const int brian::_num__array_neurongroup_1_lastspike = 100; + +double * brian::_array_neurongroup_1_m; +double * brian::dev_array_neurongroup_1_m; +__device__ double * brian::d_array_neurongroup_1_m; +const int brian::_num__array_neurongroup_1_m = 100; + +double * brian::_array_neurongroup_1_n; +double * brian::dev_array_neurongroup_1_n; +__device__ double * brian::d_array_neurongroup_1_n; +const int brian::_num__array_neurongroup_1_n = 100; + +char * brian::_array_neurongroup_1_not_refractory; +char * brian::dev_array_neurongroup_1_not_refractory; +__device__ char * brian::d_array_neurongroup_1_not_refractory; +const int brian::_num__array_neurongroup_1_not_refractory = 100; + +double * brian::_array_neurongroup_1_V; +double * brian::dev_array_neurongroup_1_V; +__device__ double * brian::d_array_neurongroup_1_V; +const int brian::_num__array_neurongroup_1_V = 100; + +double * brian::_array_neurongroup_g_PN_iKC; +double * brian::dev_array_neurongroup_g_PN_iKC; +__device__ double * brian::d_array_neurongroup_g_PN_iKC; +const int brian::_num__array_neurongroup_g_PN_iKC = 2500; + +double * brian::_array_neurongroup_h; +double * brian::dev_array_neurongroup_h; +__device__ double * brian::d_array_neurongroup_h; +const int brian::_num__array_neurongroup_h = 2500; + +int32_t * brian::_array_neurongroup_i; +int32_t * brian::dev_array_neurongroup_i; +__device__ int32_t * brian::d_array_neurongroup_i; +const int brian::_num__array_neurongroup_i = 2500; + +double * brian::_array_neurongroup_lastspike; +double * brian::dev_array_neurongroup_lastspike; +__device__ double * brian::d_array_neurongroup_lastspike; +const int brian::_num__array_neurongroup_lastspike = 2500; + +double * brian::_array_neurongroup_m; +double * brian::dev_array_neurongroup_m; +__device__ double * brian::d_array_neurongroup_m; +const int brian::_num__array_neurongroup_m = 2500; + +double * brian::_array_neurongroup_n; +double * brian::dev_array_neurongroup_n; +__device__ double * brian::d_array_neurongroup_n; +const int brian::_num__array_neurongroup_n = 2500; + +char * brian::_array_neurongroup_not_refractory; +char * brian::dev_array_neurongroup_not_refractory; +__device__ char * brian::d_array_neurongroup_not_refractory; +const int brian::_num__array_neurongroup_not_refractory = 2500; + +double * brian::_array_neurongroup_V; +double * brian::dev_array_neurongroup_V; +__device__ double * brian::d_array_neurongroup_V; +const int brian::_num__array_neurongroup_V = 2500; + +int32_t * brian::_array_spikegeneratorgroup__lastindex; +int32_t * brian::dev_array_spikegeneratorgroup__lastindex; +__device__ int32_t * brian::d_array_spikegeneratorgroup__lastindex; +const int brian::_num__array_spikegeneratorgroup__lastindex = 1; + +int32_t * brian::_array_spikegeneratorgroup__period_bins; +int32_t * brian::dev_array_spikegeneratorgroup__period_bins; +__device__ int32_t * brian::d_array_spikegeneratorgroup__period_bins; +const int brian::_num__array_spikegeneratorgroup__period_bins = 1; + +int32_t * brian::_array_spikegeneratorgroup_i; +int32_t * 
brian::dev_array_spikegeneratorgroup_i; +__device__ int32_t * brian::d_array_spikegeneratorgroup_i; +const int brian::_num__array_spikegeneratorgroup_i = 100; + +double * brian::_array_spikegeneratorgroup_period; +double * brian::dev_array_spikegeneratorgroup_period; +__device__ double * brian::d_array_spikegeneratorgroup_period; +const int brian::_num__array_spikegeneratorgroup_period = 1; + +int32_t * brian::_array_spikemonitor_1__source_idx; +int32_t * brian::dev_array_spikemonitor_1__source_idx; +__device__ int32_t * brian::d_array_spikemonitor_1__source_idx; +const int brian::_num__array_spikemonitor_1__source_idx = 2500; + +int32_t * brian::_array_spikemonitor_1_count; +int32_t * brian::dev_array_spikemonitor_1_count; +__device__ int32_t * brian::d_array_spikemonitor_1_count; +const int brian::_num__array_spikemonitor_1_count = 2500; + +int32_t * brian::_array_spikemonitor_1_N; +int32_t * brian::dev_array_spikemonitor_1_N; +__device__ int32_t * brian::d_array_spikemonitor_1_N; +const int brian::_num__array_spikemonitor_1_N = 1; + +int32_t * brian::_array_spikemonitor_2__source_idx; +int32_t * brian::dev_array_spikemonitor_2__source_idx; +__device__ int32_t * brian::d_array_spikemonitor_2__source_idx; +const int brian::_num__array_spikemonitor_2__source_idx = 100; + +int32_t * brian::_array_spikemonitor_2_count; +int32_t * brian::dev_array_spikemonitor_2_count; +__device__ int32_t * brian::d_array_spikemonitor_2_count; +const int brian::_num__array_spikemonitor_2_count = 100; + +int32_t * brian::_array_spikemonitor_2_N; +int32_t * brian::dev_array_spikemonitor_2_N; +__device__ int32_t * brian::d_array_spikemonitor_2_N; +const int brian::_num__array_spikemonitor_2_N = 1; + +int32_t * brian::_array_spikemonitor__source_idx; +int32_t * brian::dev_array_spikemonitor__source_idx; +__device__ int32_t * brian::d_array_spikemonitor__source_idx; +const int brian::_num__array_spikemonitor__source_idx = 100; + +int32_t * brian::_array_spikemonitor_count; +int32_t * brian::dev_array_spikemonitor_count; +__device__ int32_t * brian::d_array_spikemonitor_count; +const int brian::_num__array_spikemonitor_count = 100; + +int32_t * brian::_array_spikemonitor_N; +int32_t * brian::dev_array_spikemonitor_N; +__device__ int32_t * brian::d_array_spikemonitor_N; +const int brian::_num__array_spikemonitor_N = 1; + +int32_t * brian::_array_synapses_1_N; +int32_t * brian::dev_array_synapses_1_N; +__device__ int32_t * brian::d_array_synapses_1_N; +const int brian::_num__array_synapses_1_N = 1; + +int32_t * brian::_array_synapses_2_N; +int32_t * brian::dev_array_synapses_2_N; +__device__ int32_t * brian::d_array_synapses_2_N; +const int brian::_num__array_synapses_2_N = 1; + +int32_t * brian::_array_synapses_N; +int32_t * brian::dev_array_synapses_N; +__device__ int32_t * brian::d_array_synapses_N; +const int brian::_num__array_synapses_N = 1; + + +//////////////// eventspaces /////////////// +// we dynamically create multiple eventspaces in no_or_const_delay_mode +// for initiating the first spikespace, we need a host pointer +// for choosing the right spikespace, we need a global index variable +int32_t * brian::_array_neurongroup_1__spikespace; +const int brian::_num__array_neurongroup_1__spikespace = 101; +thrust::host_vector brian::dev_array_neurongroup_1__spikespace(1); +int brian::current_idx_array_neurongroup_1__spikespace = 0; +int32_t * brian::_array_neurongroup__spikespace; +const int brian::_num__array_neurongroup__spikespace = 2501; +thrust::host_vector brian::dev_array_neurongroup__spikespace(1); 
+int brian::current_idx_array_neurongroup__spikespace = 0; +int32_t * brian::_array_spikegeneratorgroup__spikespace; +const int brian::_num__array_spikegeneratorgroup__spikespace = 101; +thrust::host_vector brian::dev_array_spikegeneratorgroup__spikespace(1); +int brian::current_idx_array_spikegeneratorgroup__spikespace = 0; +int brian::previous_idx_array_spikegeneratorgroup__spikespace; + +//////////////// dynamic arrays 1d ///////// +thrust::host_vector brian::_dynamic_array_spikegeneratorgroup__timebins; +thrust::device_vector brian::dev_dynamic_array_spikegeneratorgroup__timebins; +thrust::host_vector brian::_dynamic_array_spikegeneratorgroup_neuron_index; +thrust::device_vector brian::dev_dynamic_array_spikegeneratorgroup_neuron_index; +thrust::host_vector brian::_dynamic_array_spikegeneratorgroup_spike_number; +thrust::device_vector brian::dev_dynamic_array_spikegeneratorgroup_spike_number; +thrust::host_vector brian::_dynamic_array_spikegeneratorgroup_spike_time; +thrust::device_vector brian::dev_dynamic_array_spikegeneratorgroup_spike_time; +thrust::host_vector brian::_dynamic_array_spikemonitor_1_i; +thrust::device_vector brian::dev_dynamic_array_spikemonitor_1_i; +thrust::host_vector brian::_dynamic_array_spikemonitor_1_t; +thrust::device_vector brian::dev_dynamic_array_spikemonitor_1_t; +thrust::host_vector brian::_dynamic_array_spikemonitor_2_i; +thrust::device_vector brian::dev_dynamic_array_spikemonitor_2_i; +thrust::host_vector brian::_dynamic_array_spikemonitor_2_t; +thrust::device_vector brian::dev_dynamic_array_spikemonitor_2_t; +thrust::host_vector brian::_dynamic_array_spikemonitor_i; +thrust::device_vector brian::dev_dynamic_array_spikemonitor_i; +thrust::host_vector brian::_dynamic_array_spikemonitor_t; +thrust::device_vector brian::dev_dynamic_array_spikemonitor_t; +thrust::host_vector brian::_dynamic_array_synapses_1__synaptic_post; +thrust::device_vector brian::dev_dynamic_array_synapses_1__synaptic_post; +thrust::host_vector brian::_dynamic_array_synapses_1__synaptic_pre; +thrust::device_vector brian::dev_dynamic_array_synapses_1__synaptic_pre; +thrust::host_vector brian::_dynamic_array_synapses_1_Apost; +thrust::device_vector brian::dev_dynamic_array_synapses_1_Apost; +thrust::host_vector brian::_dynamic_array_synapses_1_Apre; +thrust::device_vector brian::dev_dynamic_array_synapses_1_Apre; +thrust::host_vector brian::_dynamic_array_synapses_1_delay; +thrust::device_vector brian::dev_dynamic_array_synapses_1_delay; +thrust::host_vector brian::_dynamic_array_synapses_1_delay_1; +thrust::device_vector brian::dev_dynamic_array_synapses_1_delay_1; +thrust::host_vector brian::_dynamic_array_synapses_1_g_raw; +thrust::device_vector brian::dev_dynamic_array_synapses_1_g_raw; +thrust::host_vector brian::_dynamic_array_synapses_1_lastupdate; +thrust::device_vector brian::dev_dynamic_array_synapses_1_lastupdate; +thrust::host_vector brian::_dynamic_array_synapses_1_N_incoming; +thrust::device_vector brian::dev_dynamic_array_synapses_1_N_incoming; +thrust::host_vector brian::_dynamic_array_synapses_1_N_outgoing; +thrust::device_vector brian::dev_dynamic_array_synapses_1_N_outgoing; +thrust::host_vector brian::_dynamic_array_synapses_2__synaptic_post; +thrust::device_vector brian::dev_dynamic_array_synapses_2__synaptic_post; +thrust::host_vector brian::_dynamic_array_synapses_2__synaptic_pre; +thrust::device_vector brian::dev_dynamic_array_synapses_2__synaptic_pre; +thrust::host_vector brian::_dynamic_array_synapses_2_delay; +thrust::device_vector 
brian::dev_dynamic_array_synapses_2_delay; +thrust::host_vector brian::_dynamic_array_synapses_2_N_incoming; +thrust::device_vector brian::dev_dynamic_array_synapses_2_N_incoming; +thrust::host_vector brian::_dynamic_array_synapses_2_N_outgoing; +thrust::device_vector brian::dev_dynamic_array_synapses_2_N_outgoing; +thrust::host_vector brian::_dynamic_array_synapses__synaptic_post; +thrust::device_vector brian::dev_dynamic_array_synapses__synaptic_post; +thrust::host_vector brian::_dynamic_array_synapses__synaptic_pre; +thrust::device_vector brian::dev_dynamic_array_synapses__synaptic_pre; +thrust::host_vector brian::_dynamic_array_synapses_delay; +thrust::device_vector brian::dev_dynamic_array_synapses_delay; +thrust::host_vector brian::_dynamic_array_synapses_N_incoming; +thrust::device_vector brian::dev_dynamic_array_synapses_N_incoming; +thrust::host_vector brian::_dynamic_array_synapses_N_outgoing; +thrust::device_vector brian::dev_dynamic_array_synapses_N_outgoing; +thrust::host_vector brian::_dynamic_array_synapses_weight; +thrust::device_vector brian::dev_dynamic_array_synapses_weight; + +//////////////// dynamic arrays 2d ///////// + +/////////////// static arrays ///////////// +int32_t * brian::_static_array__dynamic_array_spikegeneratorgroup__timebins; +int32_t * brian::dev_static_array__dynamic_array_spikegeneratorgroup__timebins; +__device__ int32_t * brian::d_static_array__dynamic_array_spikegeneratorgroup__timebins; +const int brian::_num__static_array__dynamic_array_spikegeneratorgroup__timebins = 19676; +int64_t * brian::_static_array__dynamic_array_spikegeneratorgroup_neuron_index; +int64_t * brian::dev_static_array__dynamic_array_spikegeneratorgroup_neuron_index; +__device__ int64_t * brian::d_static_array__dynamic_array_spikegeneratorgroup_neuron_index; +const int brian::_num__static_array__dynamic_array_spikegeneratorgroup_neuron_index = 19676; +int64_t * brian::_static_array__dynamic_array_spikegeneratorgroup_spike_number; +int64_t * brian::dev_static_array__dynamic_array_spikegeneratorgroup_spike_number; +__device__ int64_t * brian::d_static_array__dynamic_array_spikegeneratorgroup_spike_number; +const int brian::_num__static_array__dynamic_array_spikegeneratorgroup_spike_number = 19676; +double * brian::_static_array__dynamic_array_spikegeneratorgroup_spike_time; +double * brian::dev_static_array__dynamic_array_spikegeneratorgroup_spike_time; +__device__ double * brian::d_static_array__dynamic_array_spikegeneratorgroup_spike_time; +const int brian::_num__static_array__dynamic_array_spikegeneratorgroup_spike_time = 19676; + +//////////////// synapses ///////////////// +// synapses +int32_t synapses_source_start_index; +int32_t synapses_source_stop_index; +bool brian::synapses_multiple_pre_post = false; +// synapses_pre +__device__ int* brian::synapses_pre_num_synapses_by_pre; +__device__ int* brian::synapses_pre_num_synapses_by_bundle; +__device__ int* brian::synapses_pre_unique_delays; +__device__ int* brian::synapses_pre_synapses_offset_by_bundle; +__device__ int* brian::synapses_pre_global_bundle_id_start_by_pre; +int brian::synapses_pre_max_bundle_size = 0; +int brian::synapses_pre_mean_bundle_size = 0; +int brian::synapses_pre_max_size = 0; +__device__ int* brian::synapses_pre_num_unique_delays_by_pre; +int brian::synapses_pre_max_num_unique_delays = 0; +__device__ int32_t** brian::synapses_pre_synapse_ids_by_pre; +__device__ int32_t* brian::synapses_pre_synapse_ids; +__device__ int* brian::synapses_pre_unique_delay_start_idcs; +__device__ int* 
brian::synapses_pre_unique_delays_offset_by_pre; +__device__ SynapticPathway brian::synapses_pre; +int brian::synapses_pre_eventspace_idx = 0; +int brian::synapses_pre_delay; +bool brian::synapses_pre_scalar_delay; +// synapses_1 +int32_t synapses_1_source_start_index; +int32_t synapses_1_source_stop_index; +bool brian::synapses_1_multiple_pre_post = false; +// synapses_1_post +__device__ int* brian::synapses_1_post_num_synapses_by_pre; +__device__ int* brian::synapses_1_post_num_synapses_by_bundle; +__device__ int* brian::synapses_1_post_unique_delays; +__device__ int* brian::synapses_1_post_synapses_offset_by_bundle; +__device__ int* brian::synapses_1_post_global_bundle_id_start_by_pre; +int brian::synapses_1_post_max_bundle_size = 0; +int brian::synapses_1_post_mean_bundle_size = 0; +int brian::synapses_1_post_max_size = 0; +__device__ int* brian::synapses_1_post_num_unique_delays_by_pre; +int brian::synapses_1_post_max_num_unique_delays = 0; +__device__ int32_t** brian::synapses_1_post_synapse_ids_by_pre; +__device__ int32_t* brian::synapses_1_post_synapse_ids; +__device__ int* brian::synapses_1_post_unique_delay_start_idcs; +__device__ int* brian::synapses_1_post_unique_delays_offset_by_pre; +__device__ SynapticPathway brian::synapses_1_post; +int brian::synapses_1_post_eventspace_idx = 0; +int brian::synapses_1_post_delay; +bool brian::synapses_1_post_scalar_delay; +// synapses_1_pre +__device__ int* brian::synapses_1_pre_num_synapses_by_pre; +__device__ int* brian::synapses_1_pre_num_synapses_by_bundle; +__device__ int* brian::synapses_1_pre_unique_delays; +__device__ int* brian::synapses_1_pre_synapses_offset_by_bundle; +__device__ int* brian::synapses_1_pre_global_bundle_id_start_by_pre; +int brian::synapses_1_pre_max_bundle_size = 0; +int brian::synapses_1_pre_mean_bundle_size = 0; +int brian::synapses_1_pre_max_size = 0; +__device__ int* brian::synapses_1_pre_num_unique_delays_by_pre; +int brian::synapses_1_pre_max_num_unique_delays = 0; +__device__ int32_t** brian::synapses_1_pre_synapse_ids_by_pre; +__device__ int32_t* brian::synapses_1_pre_synapse_ids; +__device__ int* brian::synapses_1_pre_unique_delay_start_idcs; +__device__ int* brian::synapses_1_pre_unique_delays_offset_by_pre; +__device__ SynapticPathway brian::synapses_1_pre; +int brian::synapses_1_pre_eventspace_idx = 0; +int brian::synapses_1_pre_delay; +bool brian::synapses_1_pre_scalar_delay; +// synapses_2 +int32_t synapses_2_source_start_index; +int32_t synapses_2_source_stop_index; +bool brian::synapses_2_multiple_pre_post = false; +// synapses_2_pre +__device__ int* brian::synapses_2_pre_num_synapses_by_pre; +__device__ int* brian::synapses_2_pre_num_synapses_by_bundle; +__device__ int* brian::synapses_2_pre_unique_delays; +__device__ int* brian::synapses_2_pre_synapses_offset_by_bundle; +__device__ int* brian::synapses_2_pre_global_bundle_id_start_by_pre; +int brian::synapses_2_pre_max_bundle_size = 0; +int brian::synapses_2_pre_mean_bundle_size = 0; +int brian::synapses_2_pre_max_size = 0; +__device__ int* brian::synapses_2_pre_num_unique_delays_by_pre; +int brian::synapses_2_pre_max_num_unique_delays = 0; +__device__ int32_t** brian::synapses_2_pre_synapse_ids_by_pre; +__device__ int32_t* brian::synapses_2_pre_synapse_ids; +__device__ int* brian::synapses_2_pre_unique_delay_start_idcs; +__device__ int* brian::synapses_2_pre_unique_delays_offset_by_pre; +__device__ SynapticPathway brian::synapses_2_pre; +int brian::synapses_2_pre_eventspace_idx = 0; +int brian::synapses_2_pre_delay; +bool 
brian::synapses_2_pre_scalar_delay; + +int brian::num_parallel_blocks; +int brian::max_threads_per_block; +int brian::max_threads_per_sm; +int brian::max_shared_mem_size; +int brian::num_threads_per_warp; + +__global__ void synapses_pre_init( + int32_t* sources, + int32_t* targets, + double dt, + int32_t source_start, + int32_t source_stop + ) +{ + using namespace brian; + + synapses_pre.init( + sources, + targets, + dt, + // TODO: called source here, spikes in SynapticPathway (use same name) + source_start, + source_stop); +} +__global__ void synapses_1_post_init( + int32_t* sources, + int32_t* targets, + double dt, + int32_t source_start, + int32_t source_stop + ) +{ + using namespace brian; + + synapses_1_post.init( + sources, + targets, + dt, + // TODO: called source here, spikes in SynapticPathway (use same name) + source_start, + source_stop); +} +__global__ void synapses_1_pre_init( + int32_t* sources, + int32_t* targets, + double dt, + int32_t source_start, + int32_t source_stop + ) +{ + using namespace brian; + + synapses_1_pre.init( + sources, + targets, + dt, + // TODO: called source here, spikes in SynapticPathway (use same name) + source_start, + source_stop); +} +__global__ void synapses_2_pre_init( + int32_t* sources, + int32_t* targets, + double dt, + int32_t source_start, + int32_t source_stop + ) +{ + using namespace brian; + + synapses_2_pre.init( + sources, + targets, + dt, + // TODO: called source here, spikes in SynapticPathway (use same name) + source_start, + source_stop); +} + +// Profiling information for each code object + +//////////////random numbers////////////////// +curandGenerator_t brian::curand_generator; +__device__ unsigned long long* brian::d_curand_seed; +unsigned long long* brian::dev_curand_seed; +// dev_{co.name}_{rng_type}_allocator +// pointer to start of generated random numbers array +// at each generation cycle this array is refilled +// dev_{co.name}_{rng_type} +// pointer moving through generated random number array +// until it is regenerated at the next generation cycle +curandState* brian::dev_curand_states; +cudaStream_t brian::stream; +cudaStream_t brian::stream1; +cudaStream_t brian::stream2; +cudaStream_t brian::neurongroup_stream1; +cudaStream_t brian::neurongroup_stream; +cudaStream_t brian::spikegenerator_stream; +cudaStream_t brian::spikemonitor_stream1; +cudaStream_t brian::spikemonitor_stream2; +cudaStream_t brian::spikemonitor_stream; + +__device__ curandState* brian::d_curand_states; +RandomNumberBuffer brian::random_number_buffer; + +void _init_arrays() +{ + using namespace brian; + + std::clock_t start_timer = std::clock(); + + CUDA_CHECK_MEMORY(); + size_t used_device_memory_start = used_device_memory; + + cudaDeviceProp props; + CUDA_SAFE_CALL( + cudaGetDeviceProperties(&props, 0) + ); + + num_parallel_blocks = props.multiProcessorCount * 1; + printf("objects cu num par blocks %d\n", num_parallel_blocks); + max_threads_per_block = props.maxThreadsPerBlock; + max_threads_per_sm = props.maxThreadsPerMultiProcessor; + max_shared_mem_size = props.sharedMemPerBlock; + num_threads_per_warp = props.warpSize; + + // Random seeds might be overwritten in main.cu + unsigned long long seed = time(0); + + CUDA_SAFE_CALL( + cudaMalloc((void**)&dev_curand_seed, + sizeof(unsigned long long)) + ); + + CUDA_SAFE_CALL( + cudaMemcpyToSymbol(d_curand_seed, &dev_curand_seed, + sizeof(unsigned long long*)) + ); + + CUDA_SAFE_CALL( + curandCreateGenerator(&curand_generator, CURAND_RNG_PSEUDO_DEFAULT) + ); + + + // this sets seed for host and
device api RNG + random_number_buffer.set_seed(seed); + // initialise neurongroups + CUDA_SAFE_CALL(cudaStreamCreate(&neurongroup_stream1)); + CUDA_SAFE_CALL(cudaStreamCreate(&neurongroup_stream)); + + //spike generator + CUDA_SAFE_CALL(cudaStreamCreate(&spikegenerator_stream)); + + //spike monitor + CUDA_SAFE_CALL(cudaStreamCreate(&spikemonitor_stream1)); + CUDA_SAFE_CALL(cudaStreamCreate(&spikemonitor_stream)); + CUDA_SAFE_CALL(cudaStreamCreate(&spikemonitor_stream2)); + + + CUDA_SAFE_CALL(cudaStreamCreate(&stream)); + + synapses_pre_init<<<1,1>>>( + thrust::raw_pointer_cast(&dev_dynamic_array_synapses__synaptic_pre[0]), + thrust::raw_pointer_cast(&dev_dynamic_array_synapses__synaptic_post[0]), + 0, //was dt, maybe irrelevant? + 0, + 100 + ); + CUDA_CHECK_ERROR("synapses_pre_init"); + synapses_1_post_init<<<1,1>>>( + thrust::raw_pointer_cast(&dev_dynamic_array_synapses_1__synaptic_post[0]), + thrust::raw_pointer_cast(&dev_dynamic_array_synapses_1__synaptic_pre[0]), + 0, //was dt, maybe irrelevant? + 0, + 100 + ); + CUDA_CHECK_ERROR("synapses_1_post_init"); + CUDA_SAFE_CALL(cudaStreamCreate(&stream1)); + synapses_1_pre_init<<<1,1>>>( + thrust::raw_pointer_cast(&dev_dynamic_array_synapses_1__synaptic_pre[0]), + thrust::raw_pointer_cast(&dev_dynamic_array_synapses_1__synaptic_post[0]), + 0, //was dt, maybe irrelevant? + 0, + 2500 + ); + CUDA_CHECK_ERROR("synapses_1_pre_init"); + CUDA_SAFE_CALL(cudaStreamCreate(&stream2)); + synapses_2_pre_init<<<1,1>>>( + thrust::raw_pointer_cast(&dev_dynamic_array_synapses_2__synaptic_pre[0]), + thrust::raw_pointer_cast(&dev_dynamic_array_synapses_2__synaptic_post[0]), + 0, //was dt, maybe irrelevant? + 0, + 100 + ); + CUDA_CHECK_ERROR("synapses_2_pre_init"); + + // Arrays initialized to 0 + _array_defaultclock_dt = new double[1]; + for(int i=0; i<1; i++) _array_defaultclock_dt[i] = 0; + CUDA_SAFE_CALL( + cudaMalloc((void**)&dev_array_defaultclock_dt, sizeof(double)*_num__array_defaultclock_dt) + ); + CUDA_SAFE_CALL( + cudaMemcpy(dev_array_defaultclock_dt, _array_defaultclock_dt, sizeof(double)*_num__array_defaultclock_dt, cudaMemcpyHostToDevice) + ); + _array_defaultclock_t = new double[1]; + for(int i=0; i<1; i++) _array_defaultclock_t[i] = 0; + CUDA_SAFE_CALL( + cudaMalloc((void**)&dev_array_defaultclock_t, sizeof(double)*_num__array_defaultclock_t) + ); + CUDA_SAFE_CALL( + cudaMemcpy(dev_array_defaultclock_t, _array_defaultclock_t, sizeof(double)*_num__array_defaultclock_t, cudaMemcpyHostToDevice) + ); + _array_defaultclock_timestep = new int64_t[1]; + for(int i=0; i<1; i++) _array_defaultclock_timestep[i] = 0; + CUDA_SAFE_CALL( + cudaMalloc((void**)&dev_array_defaultclock_timestep, sizeof(int64_t)*_num__array_defaultclock_timestep) + ); + CUDA_SAFE_CALL( + cudaMemcpy(dev_array_defaultclock_timestep, _array_defaultclock_timestep, sizeof(int64_t)*_num__array_defaultclock_timestep, cudaMemcpyHostToDevice) + ); + _array_neurongroup_1_g_eKC_eKC = new double[100]; + for(int i=0; i<100; i++) _array_neurongroup_1_g_eKC_eKC[i] = 0; + CUDA_SAFE_CALL( + cudaMalloc((void**)&dev_array_neurongroup_1_g_eKC_eKC, sizeof(double)*_num__array_neurongroup_1_g_eKC_eKC) + ); + CUDA_SAFE_CALL( + cudaMemcpy(dev_array_neurongroup_1_g_eKC_eKC, _array_neurongroup_1_g_eKC_eKC, sizeof(double)*_num__array_neurongroup_1_g_eKC_eKC, cudaMemcpyHostToDevice) + ); + _array_neurongroup_1_g_iKC_eKC = new double[100]; + for(int i=0; i<100; i++) _array_neurongroup_1_g_iKC_eKC[i] = 0; + CUDA_SAFE_CALL( + cudaMalloc((void**)&dev_array_neurongroup_1_g_iKC_eKC, 
sizeof(double)*_num__array_neurongroup_1_g_iKC_eKC) + ); + CUDA_SAFE_CALL( + cudaMemcpy(dev_array_neurongroup_1_g_iKC_eKC, _array_neurongroup_1_g_iKC_eKC, sizeof(double)*_num__array_neurongroup_1_g_iKC_eKC, cudaMemcpyHostToDevice) + ); + _array_neurongroup_1_h = new double[100]; + for(int i=0; i<100; i++) _array_neurongroup_1_h[i] = 0; + CUDA_SAFE_CALL( + cudaMalloc((void**)&dev_array_neurongroup_1_h, sizeof(double)*_num__array_neurongroup_1_h) + ); + CUDA_SAFE_CALL( + cudaMemcpy(dev_array_neurongroup_1_h, _array_neurongroup_1_h, sizeof(double)*_num__array_neurongroup_1_h, cudaMemcpyHostToDevice) + ); + _array_neurongroup_1_i = new int32_t[100]; + for(int i=0; i<100; i++) _array_neurongroup_1_i[i] = 0; + CUDA_SAFE_CALL( + cudaMalloc((void**)&dev_array_neurongroup_1_i, sizeof(int32_t)*_num__array_neurongroup_1_i) + ); + CUDA_SAFE_CALL( + cudaMemcpy(dev_array_neurongroup_1_i, _array_neurongroup_1_i, sizeof(int32_t)*_num__array_neurongroup_1_i, cudaMemcpyHostToDevice) + ); + _array_neurongroup_1_lastspike = new double[100]; + for(int i=0; i<100; i++) _array_neurongroup_1_lastspike[i] = 0; + CUDA_SAFE_CALL( + cudaMalloc((void**)&dev_array_neurongroup_1_lastspike, sizeof(double)*_num__array_neurongroup_1_lastspike) + ); + CUDA_SAFE_CALL( + cudaMemcpy(dev_array_neurongroup_1_lastspike, _array_neurongroup_1_lastspike, sizeof(double)*_num__array_neurongroup_1_lastspike, cudaMemcpyHostToDevice) + ); + _array_neurongroup_1_m = new double[100]; + for(int i=0; i<100; i++) _array_neurongroup_1_m[i] = 0; + CUDA_SAFE_CALL( + cudaMalloc((void**)&dev_array_neurongroup_1_m, sizeof(double)*_num__array_neurongroup_1_m) + ); + CUDA_SAFE_CALL( + cudaMemcpy(dev_array_neurongroup_1_m, _array_neurongroup_1_m, sizeof(double)*_num__array_neurongroup_1_m, cudaMemcpyHostToDevice) + ); + _array_neurongroup_1_n = new double[100]; + for(int i=0; i<100; i++) _array_neurongroup_1_n[i] = 0; + CUDA_SAFE_CALL( + cudaMalloc((void**)&dev_array_neurongroup_1_n, sizeof(double)*_num__array_neurongroup_1_n) + ); + CUDA_SAFE_CALL( + cudaMemcpy(dev_array_neurongroup_1_n, _array_neurongroup_1_n, sizeof(double)*_num__array_neurongroup_1_n, cudaMemcpyHostToDevice) + ); + _array_neurongroup_1_not_refractory = new char[100]; + for(int i=0; i<100; i++) _array_neurongroup_1_not_refractory[i] = 0; + CUDA_SAFE_CALL( + cudaMalloc((void**)&dev_array_neurongroup_1_not_refractory, sizeof(char)*_num__array_neurongroup_1_not_refractory) + ); + CUDA_SAFE_CALL( + cudaMemcpy(dev_array_neurongroup_1_not_refractory, _array_neurongroup_1_not_refractory, sizeof(char)*_num__array_neurongroup_1_not_refractory, cudaMemcpyHostToDevice) + ); + _array_neurongroup_1_V = new double[100]; + for(int i=0; i<100; i++) _array_neurongroup_1_V[i] = 0; + CUDA_SAFE_CALL( + cudaMalloc((void**)&dev_array_neurongroup_1_V, sizeof(double)*_num__array_neurongroup_1_V) + ); + CUDA_SAFE_CALL( + cudaMemcpy(dev_array_neurongroup_1_V, _array_neurongroup_1_V, sizeof(double)*_num__array_neurongroup_1_V, cudaMemcpyHostToDevice) + ); + _array_neurongroup_g_PN_iKC = new double[2500]; + for(int i=0; i<2500; i++) _array_neurongroup_g_PN_iKC[i] = 0; + CUDA_SAFE_CALL( + cudaMalloc((void**)&dev_array_neurongroup_g_PN_iKC, sizeof(double)*_num__array_neurongroup_g_PN_iKC) + ); + CUDA_SAFE_CALL( + cudaMemcpy(dev_array_neurongroup_g_PN_iKC, _array_neurongroup_g_PN_iKC, sizeof(double)*_num__array_neurongroup_g_PN_iKC, cudaMemcpyHostToDevice) + ); + _array_neurongroup_h = new double[2500]; + for(int i=0; i<2500; i++) _array_neurongroup_h[i] = 0; + CUDA_SAFE_CALL( + 
cudaMalloc((void**)&dev_array_neurongroup_h, sizeof(double)*_num__array_neurongroup_h) + ); + CUDA_SAFE_CALL( + cudaMemcpy(dev_array_neurongroup_h, _array_neurongroup_h, sizeof(double)*_num__array_neurongroup_h, cudaMemcpyHostToDevice) + ); + _array_neurongroup_i = new int32_t[2500]; + for(int i=0; i<2500; i++) _array_neurongroup_i[i] = 0; + CUDA_SAFE_CALL( + cudaMalloc((void**)&dev_array_neurongroup_i, sizeof(int32_t)*_num__array_neurongroup_i) + ); + CUDA_SAFE_CALL( + cudaMemcpy(dev_array_neurongroup_i, _array_neurongroup_i, sizeof(int32_t)*_num__array_neurongroup_i, cudaMemcpyHostToDevice) + ); + _array_neurongroup_lastspike = new double[2500]; + for(int i=0; i<2500; i++) _array_neurongroup_lastspike[i] = 0; + CUDA_SAFE_CALL( + cudaMalloc((void**)&dev_array_neurongroup_lastspike, sizeof(double)*_num__array_neurongroup_lastspike) + ); + CUDA_SAFE_CALL( + cudaMemcpy(dev_array_neurongroup_lastspike, _array_neurongroup_lastspike, sizeof(double)*_num__array_neurongroup_lastspike, cudaMemcpyHostToDevice) + ); + _array_neurongroup_m = new double[2500]; + for(int i=0; i<2500; i++) _array_neurongroup_m[i] = 0; + CUDA_SAFE_CALL( + cudaMalloc((void**)&dev_array_neurongroup_m, sizeof(double)*_num__array_neurongroup_m) + ); + CUDA_SAFE_CALL( + cudaMemcpy(dev_array_neurongroup_m, _array_neurongroup_m, sizeof(double)*_num__array_neurongroup_m, cudaMemcpyHostToDevice) + ); + _array_neurongroup_n = new double[2500]; + for(int i=0; i<2500; i++) _array_neurongroup_n[i] = 0; + CUDA_SAFE_CALL( + cudaMalloc((void**)&dev_array_neurongroup_n, sizeof(double)*_num__array_neurongroup_n) + ); + CUDA_SAFE_CALL( + cudaMemcpy(dev_array_neurongroup_n, _array_neurongroup_n, sizeof(double)*_num__array_neurongroup_n, cudaMemcpyHostToDevice) + ); + _array_neurongroup_not_refractory = new char[2500]; + for(int i=0; i<2500; i++) _array_neurongroup_not_refractory[i] = 0; + CUDA_SAFE_CALL( + cudaMalloc((void**)&dev_array_neurongroup_not_refractory, sizeof(char)*_num__array_neurongroup_not_refractory) + ); + CUDA_SAFE_CALL( + cudaMemcpy(dev_array_neurongroup_not_refractory, _array_neurongroup_not_refractory, sizeof(char)*_num__array_neurongroup_not_refractory, cudaMemcpyHostToDevice) + ); + _array_neurongroup_V = new double[2500]; + for(int i=0; i<2500; i++) _array_neurongroup_V[i] = 0; + CUDA_SAFE_CALL( + cudaMalloc((void**)&dev_array_neurongroup_V, sizeof(double)*_num__array_neurongroup_V) + ); + CUDA_SAFE_CALL( + cudaMemcpy(dev_array_neurongroup_V, _array_neurongroup_V, sizeof(double)*_num__array_neurongroup_V, cudaMemcpyHostToDevice) + ); + _array_spikegeneratorgroup__lastindex = new int32_t[1]; + for(int i=0; i<1; i++) _array_spikegeneratorgroup__lastindex[i] = 0; + CUDA_SAFE_CALL( + cudaMalloc((void**)&dev_array_spikegeneratorgroup__lastindex, sizeof(int32_t)*_num__array_spikegeneratorgroup__lastindex) + ); + CUDA_SAFE_CALL( + cudaMemcpy(dev_array_spikegeneratorgroup__lastindex, _array_spikegeneratorgroup__lastindex, sizeof(int32_t)*_num__array_spikegeneratorgroup__lastindex, cudaMemcpyHostToDevice) + ); + _array_spikegeneratorgroup__period_bins = new int32_t[1]; + for(int i=0; i<1; i++) _array_spikegeneratorgroup__period_bins[i] = 0; + CUDA_SAFE_CALL( + cudaMalloc((void**)&dev_array_spikegeneratorgroup__period_bins, sizeof(int32_t)*_num__array_spikegeneratorgroup__period_bins) + ); + CUDA_SAFE_CALL( + cudaMemcpy(dev_array_spikegeneratorgroup__period_bins, _array_spikegeneratorgroup__period_bins, sizeof(int32_t)*_num__array_spikegeneratorgroup__period_bins, cudaMemcpyHostToDevice) + ); + _array_spikegeneratorgroup_i = 
new int32_t[100]; + for(int i=0; i<100; i++) _array_spikegeneratorgroup_i[i] = 0; + CUDA_SAFE_CALL( + cudaMalloc((void**)&dev_array_spikegeneratorgroup_i, sizeof(int32_t)*_num__array_spikegeneratorgroup_i) + ); + CUDA_SAFE_CALL( + cudaMemcpy(dev_array_spikegeneratorgroup_i, _array_spikegeneratorgroup_i, sizeof(int32_t)*_num__array_spikegeneratorgroup_i, cudaMemcpyHostToDevice) + ); + _array_spikegeneratorgroup_period = new double[1]; + for(int i=0; i<1; i++) _array_spikegeneratorgroup_period[i] = 0; + CUDA_SAFE_CALL( + cudaMalloc((void**)&dev_array_spikegeneratorgroup_period, sizeof(double)*_num__array_spikegeneratorgroup_period) + ); + CUDA_SAFE_CALL( + cudaMemcpy(dev_array_spikegeneratorgroup_period, _array_spikegeneratorgroup_period, sizeof(double)*_num__array_spikegeneratorgroup_period, cudaMemcpyHostToDevice) + ); + _array_spikemonitor_1__source_idx = new int32_t[2500]; + for(int i=0; i<2500; i++) _array_spikemonitor_1__source_idx[i] = 0; + CUDA_SAFE_CALL( + cudaMalloc((void**)&dev_array_spikemonitor_1__source_idx, sizeof(int32_t)*_num__array_spikemonitor_1__source_idx) + ); + CUDA_SAFE_CALL( + cudaMemcpy(dev_array_spikemonitor_1__source_idx, _array_spikemonitor_1__source_idx, sizeof(int32_t)*_num__array_spikemonitor_1__source_idx, cudaMemcpyHostToDevice) + ); + _array_spikemonitor_1_count = new int32_t[2500]; + for(int i=0; i<2500; i++) _array_spikemonitor_1_count[i] = 0; + CUDA_SAFE_CALL( + cudaMalloc((void**)&dev_array_spikemonitor_1_count, sizeof(int32_t)*_num__array_spikemonitor_1_count) + ); + CUDA_SAFE_CALL( + cudaMemcpy(dev_array_spikemonitor_1_count, _array_spikemonitor_1_count, sizeof(int32_t)*_num__array_spikemonitor_1_count, cudaMemcpyHostToDevice) + ); + _array_spikemonitor_1_N = new int32_t[1]; + for(int i=0; i<1; i++) _array_spikemonitor_1_N[i] = 0; + CUDA_SAFE_CALL( + cudaMalloc((void**)&dev_array_spikemonitor_1_N, sizeof(int32_t)*_num__array_spikemonitor_1_N) + ); + CUDA_SAFE_CALL( + cudaMemcpy(dev_array_spikemonitor_1_N, _array_spikemonitor_1_N, sizeof(int32_t)*_num__array_spikemonitor_1_N, cudaMemcpyHostToDevice) + ); + _array_spikemonitor_2__source_idx = new int32_t[100]; + for(int i=0; i<100; i++) _array_spikemonitor_2__source_idx[i] = 0; + CUDA_SAFE_CALL( + cudaMalloc((void**)&dev_array_spikemonitor_2__source_idx, sizeof(int32_t)*_num__array_spikemonitor_2__source_idx) + ); + CUDA_SAFE_CALL( + cudaMemcpy(dev_array_spikemonitor_2__source_idx, _array_spikemonitor_2__source_idx, sizeof(int32_t)*_num__array_spikemonitor_2__source_idx, cudaMemcpyHostToDevice) + ); + _array_spikemonitor_2_count = new int32_t[100]; + for(int i=0; i<100; i++) _array_spikemonitor_2_count[i] = 0; + CUDA_SAFE_CALL( + cudaMalloc((void**)&dev_array_spikemonitor_2_count, sizeof(int32_t)*_num__array_spikemonitor_2_count) + ); + CUDA_SAFE_CALL( + cudaMemcpy(dev_array_spikemonitor_2_count, _array_spikemonitor_2_count, sizeof(int32_t)*_num__array_spikemonitor_2_count, cudaMemcpyHostToDevice) + ); + _array_spikemonitor_2_N = new int32_t[1]; + for(int i=0; i<1; i++) _array_spikemonitor_2_N[i] = 0; + CUDA_SAFE_CALL( + cudaMalloc((void**)&dev_array_spikemonitor_2_N, sizeof(int32_t)*_num__array_spikemonitor_2_N) + ); + CUDA_SAFE_CALL( + cudaMemcpy(dev_array_spikemonitor_2_N, _array_spikemonitor_2_N, sizeof(int32_t)*_num__array_spikemonitor_2_N, cudaMemcpyHostToDevice) + ); + _array_spikemonitor__source_idx = new int32_t[100]; + for(int i=0; i<100; i++) _array_spikemonitor__source_idx[i] = 0; + CUDA_SAFE_CALL( + cudaMalloc((void**)&dev_array_spikemonitor__source_idx, 
sizeof(int32_t)*_num__array_spikemonitor__source_idx) + ); + CUDA_SAFE_CALL( + cudaMemcpy(dev_array_spikemonitor__source_idx, _array_spikemonitor__source_idx, sizeof(int32_t)*_num__array_spikemonitor__source_idx, cudaMemcpyHostToDevice) + ); + _array_spikemonitor_count = new int32_t[100]; + for(int i=0; i<100; i++) _array_spikemonitor_count[i] = 0; + CUDA_SAFE_CALL( + cudaMalloc((void**)&dev_array_spikemonitor_count, sizeof(int32_t)*_num__array_spikemonitor_count) + ); + CUDA_SAFE_CALL( + cudaMemcpy(dev_array_spikemonitor_count, _array_spikemonitor_count, sizeof(int32_t)*_num__array_spikemonitor_count, cudaMemcpyHostToDevice) + ); + _array_spikemonitor_N = new int32_t[1]; + for(int i=0; i<1; i++) _array_spikemonitor_N[i] = 0; + CUDA_SAFE_CALL( + cudaMalloc((void**)&dev_array_spikemonitor_N, sizeof(int32_t)*_num__array_spikemonitor_N) + ); + CUDA_SAFE_CALL( + cudaMemcpy(dev_array_spikemonitor_N, _array_spikemonitor_N, sizeof(int32_t)*_num__array_spikemonitor_N, cudaMemcpyHostToDevice) + ); + _array_synapses_1_N = new int32_t[1]; + for(int i=0; i<1; i++) _array_synapses_1_N[i] = 0; + CUDA_SAFE_CALL( + cudaMalloc((void**)&dev_array_synapses_1_N, sizeof(int32_t)*_num__array_synapses_1_N) + ); + CUDA_SAFE_CALL( + cudaMemcpy(dev_array_synapses_1_N, _array_synapses_1_N, sizeof(int32_t)*_num__array_synapses_1_N, cudaMemcpyHostToDevice) + ); + _array_synapses_2_N = new int32_t[1]; + for(int i=0; i<1; i++) _array_synapses_2_N[i] = 0; + CUDA_SAFE_CALL( + cudaMalloc((void**)&dev_array_synapses_2_N, sizeof(int32_t)*_num__array_synapses_2_N) + ); + CUDA_SAFE_CALL( + cudaMemcpy(dev_array_synapses_2_N, _array_synapses_2_N, sizeof(int32_t)*_num__array_synapses_2_N, cudaMemcpyHostToDevice) + ); + _array_synapses_N = new int32_t[1]; + for(int i=0; i<1; i++) _array_synapses_N[i] = 0; + CUDA_SAFE_CALL( + cudaMalloc((void**)&dev_array_synapses_N, sizeof(int32_t)*_num__array_synapses_N) + ); + CUDA_SAFE_CALL( + cudaMemcpy(dev_array_synapses_N, _array_synapses_N, sizeof(int32_t)*_num__array_synapses_N, cudaMemcpyHostToDevice) + ); + _dynamic_array_spikegeneratorgroup__timebins.resize(19676); + THRUST_CHECK_ERROR(dev_dynamic_array_spikegeneratorgroup__timebins.resize(19676)); + for(int i=0; i<19676; i++) + { + _dynamic_array_spikegeneratorgroup__timebins[i] = 0; + dev_dynamic_array_spikegeneratorgroup__timebins[i] = 0; + } + _dynamic_array_synapses_1_delay.resize(1); + THRUST_CHECK_ERROR(dev_dynamic_array_synapses_1_delay.resize(1)); + for(int i=0; i<1; i++) + { + _dynamic_array_synapses_1_delay[i] = 0; + dev_dynamic_array_synapses_1_delay[i] = 0; + } + _dynamic_array_synapses_2_delay.resize(1); + THRUST_CHECK_ERROR(dev_dynamic_array_synapses_2_delay.resize(1)); + for(int i=0; i<1; i++) + { + _dynamic_array_synapses_2_delay[i] = 0; + dev_dynamic_array_synapses_2_delay[i] = 0; + } + + // Arrays initialized to an "arange" + _array_neurongroup_1_i = new int32_t[100]; + for(int i=0; i<100; i++) _array_neurongroup_1_i[i] = 0 + i; + CUDA_SAFE_CALL( + cudaMalloc((void**)&dev_array_neurongroup_1_i, sizeof(int32_t)*_num__array_neurongroup_1_i) + ); + + CUDA_SAFE_CALL( + cudaMemcpy(dev_array_neurongroup_1_i, _array_neurongroup_1_i, sizeof(int32_t)*_num__array_neurongroup_1_i, cudaMemcpyHostToDevice) + ); + _array_neurongroup_i = new int32_t[2500]; + for(int i=0; i<2500; i++) _array_neurongroup_i[i] = 0 + i; + CUDA_SAFE_CALL( + cudaMalloc((void**)&dev_array_neurongroup_i, sizeof(int32_t)*_num__array_neurongroup_i) + ); + + CUDA_SAFE_CALL( + cudaMemcpy(dev_array_neurongroup_i, _array_neurongroup_i, 
sizeof(int32_t)*_num__array_neurongroup_i, cudaMemcpyHostToDevice) + ); + _array_spikegeneratorgroup_i = new int32_t[100]; + for(int i=0; i<100; i++) _array_spikegeneratorgroup_i[i] = 0 + i; + CUDA_SAFE_CALL( + cudaMalloc((void**)&dev_array_spikegeneratorgroup_i, sizeof(int32_t)*_num__array_spikegeneratorgroup_i) + ); + + CUDA_SAFE_CALL( + cudaMemcpy(dev_array_spikegeneratorgroup_i, _array_spikegeneratorgroup_i, sizeof(int32_t)*_num__array_spikegeneratorgroup_i, cudaMemcpyHostToDevice) + ); + _array_spikemonitor_1__source_idx = new int32_t[2500]; + for(int i=0; i<2500; i++) _array_spikemonitor_1__source_idx[i] = 0 + i; + CUDA_SAFE_CALL( + cudaMalloc((void**)&dev_array_spikemonitor_1__source_idx, sizeof(int32_t)*_num__array_spikemonitor_1__source_idx) + ); + + CUDA_SAFE_CALL( + cudaMemcpy(dev_array_spikemonitor_1__source_idx, _array_spikemonitor_1__source_idx, sizeof(int32_t)*_num__array_spikemonitor_1__source_idx, cudaMemcpyHostToDevice) + ); + _array_spikemonitor_2__source_idx = new int32_t[100]; + for(int i=0; i<100; i++) _array_spikemonitor_2__source_idx[i] = 0 + i; + CUDA_SAFE_CALL( + cudaMalloc((void**)&dev_array_spikemonitor_2__source_idx, sizeof(int32_t)*_num__array_spikemonitor_2__source_idx) + ); + + CUDA_SAFE_CALL( + cudaMemcpy(dev_array_spikemonitor_2__source_idx, _array_spikemonitor_2__source_idx, sizeof(int32_t)*_num__array_spikemonitor_2__source_idx, cudaMemcpyHostToDevice) + ); + _array_spikemonitor__source_idx = new int32_t[100]; + for(int i=0; i<100; i++) _array_spikemonitor__source_idx[i] = 0 + i; + CUDA_SAFE_CALL( + cudaMalloc((void**)&dev_array_spikemonitor__source_idx, sizeof(int32_t)*_num__array_spikemonitor__source_idx) + ); + + CUDA_SAFE_CALL( + cudaMemcpy(dev_array_spikemonitor__source_idx, _array_spikemonitor__source_idx, sizeof(int32_t)*_num__array_spikemonitor__source_idx, cudaMemcpyHostToDevice) + ); + + // static arrays + _static_array__dynamic_array_spikegeneratorgroup__timebins = new int32_t[19676]; + CUDA_SAFE_CALL( + cudaMalloc((void**)&dev_static_array__dynamic_array_spikegeneratorgroup__timebins, sizeof(int32_t)*19676) + ); + CUDA_SAFE_CALL( + cudaMemcpyToSymbol(d_static_array__dynamic_array_spikegeneratorgroup__timebins, &dev_static_array__dynamic_array_spikegeneratorgroup__timebins, sizeof(int32_t*)) + ); + _static_array__dynamic_array_spikegeneratorgroup_neuron_index = new int64_t[19676]; + CUDA_SAFE_CALL( + cudaMalloc((void**)&dev_static_array__dynamic_array_spikegeneratorgroup_neuron_index, sizeof(int64_t)*19676) + ); + CUDA_SAFE_CALL( + cudaMemcpyToSymbol(d_static_array__dynamic_array_spikegeneratorgroup_neuron_index, &dev_static_array__dynamic_array_spikegeneratorgroup_neuron_index, sizeof(int64_t*)) + ); + _static_array__dynamic_array_spikegeneratorgroup_spike_number = new int64_t[19676]; + CUDA_SAFE_CALL( + cudaMalloc((void**)&dev_static_array__dynamic_array_spikegeneratorgroup_spike_number, sizeof(int64_t)*19676) + ); + CUDA_SAFE_CALL( + cudaMemcpyToSymbol(d_static_array__dynamic_array_spikegeneratorgroup_spike_number, &dev_static_array__dynamic_array_spikegeneratorgroup_spike_number, sizeof(int64_t*)) + ); + _static_array__dynamic_array_spikegeneratorgroup_spike_time = new double[19676]; + CUDA_SAFE_CALL( + cudaMalloc((void**)&dev_static_array__dynamic_array_spikegeneratorgroup_spike_time, sizeof(double)*19676) + ); + CUDA_SAFE_CALL( + cudaMemcpyToSymbol(d_static_array__dynamic_array_spikegeneratorgroup_spike_time, &dev_static_array__dynamic_array_spikegeneratorgroup_spike_time, sizeof(double*)) + ); + + + // eventspace_arrays + 
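+ // The block below cudaMallocs the first device eventspace of each spike source into slot 0 of the corresponding dev_*__spikespace vector of device pointers and allocates the matching host arrays with new; in no_or_const_delay_mode further eventspaces are appended to these vectors at runtime (see the eventspace declarations above).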
CUDA_SAFE_CALL( + cudaMalloc((void**)&dev_array_neurongroup_1__spikespace[0], sizeof(int32_t)*_num__array_neurongroup_1__spikespace) + ); + _array_neurongroup_1__spikespace = new int32_t[101]; + CUDA_SAFE_CALL( + cudaMalloc((void**)&dev_array_neurongroup__spikespace[0], sizeof(int32_t)*_num__array_neurongroup__spikespace) + ); + _array_neurongroup__spikespace = new int32_t[2501]; + CUDA_SAFE_CALL( + cudaMalloc((void**)&dev_array_spikegeneratorgroup__spikespace[0], sizeof(int32_t)*_num__array_spikegeneratorgroup__spikespace) + ); + _array_spikegeneratorgroup__spikespace = new int32_t[101]; + + CUDA_CHECK_MEMORY(); + const double to_MB = 1.0 / (1024.0 * 1024.0); + double tot_memory_MB = (used_device_memory - used_device_memory_start) * to_MB; + double time_passed = (double)(std::clock() - start_timer) / CLOCKS_PER_SEC; + std::cout << "INFO: _init_arrays() took " << time_passed << "s"; + if (tot_memory_MB > 0) + std::cout << " and used " << tot_memory_MB << "MB of device memory."; + std::cout << std::endl; +} + +void _load_arrays() +{ + using namespace brian; + + ifstream f_static_array__dynamic_array_spikegeneratorgroup__timebins; + f_static_array__dynamic_array_spikegeneratorgroup__timebins.open("static_arrays/_static_array__dynamic_array_spikegeneratorgroup__timebins", ios::in | ios::binary); + if(f_static_array__dynamic_array_spikegeneratorgroup__timebins.is_open()) + { + f_static_array__dynamic_array_spikegeneratorgroup__timebins.read(reinterpret_cast(_static_array__dynamic_array_spikegeneratorgroup__timebins), 19676*sizeof(int32_t)); + } else + { + std::cout << "Error opening static array _static_array__dynamic_array_spikegeneratorgroup__timebins." << endl; + } + CUDA_SAFE_CALL( + cudaMemcpy(dev_static_array__dynamic_array_spikegeneratorgroup__timebins, _static_array__dynamic_array_spikegeneratorgroup__timebins, sizeof(int32_t)*19676, cudaMemcpyHostToDevice) + ); + ifstream f_static_array__dynamic_array_spikegeneratorgroup_neuron_index; + f_static_array__dynamic_array_spikegeneratorgroup_neuron_index.open("static_arrays/_static_array__dynamic_array_spikegeneratorgroup_neuron_index", ios::in | ios::binary); + if(f_static_array__dynamic_array_spikegeneratorgroup_neuron_index.is_open()) + { + f_static_array__dynamic_array_spikegeneratorgroup_neuron_index.read(reinterpret_cast(_static_array__dynamic_array_spikegeneratorgroup_neuron_index), 19676*sizeof(int64_t)); + } else + { + std::cout << "Error opening static array _static_array__dynamic_array_spikegeneratorgroup_neuron_index." << endl; + } + CUDA_SAFE_CALL( + cudaMemcpy(dev_static_array__dynamic_array_spikegeneratorgroup_neuron_index, _static_array__dynamic_array_spikegeneratorgroup_neuron_index, sizeof(int64_t)*19676, cudaMemcpyHostToDevice) + ); + ifstream f_static_array__dynamic_array_spikegeneratorgroup_spike_number; + f_static_array__dynamic_array_spikegeneratorgroup_spike_number.open("static_arrays/_static_array__dynamic_array_spikegeneratorgroup_spike_number", ios::in | ios::binary); + if(f_static_array__dynamic_array_spikegeneratorgroup_spike_number.is_open()) + { + f_static_array__dynamic_array_spikegeneratorgroup_spike_number.read(reinterpret_cast(_static_array__dynamic_array_spikegeneratorgroup_spike_number), 19676*sizeof(int64_t)); + } else + { + std::cout << "Error opening static array _static_array__dynamic_array_spikegeneratorgroup_spike_number." 
<< endl; + } + CUDA_SAFE_CALL( + cudaMemcpy(dev_static_array__dynamic_array_spikegeneratorgroup_spike_number, _static_array__dynamic_array_spikegeneratorgroup_spike_number, sizeof(int64_t)*19676, cudaMemcpyHostToDevice) + ); + ifstream f_static_array__dynamic_array_spikegeneratorgroup_spike_time; + f_static_array__dynamic_array_spikegeneratorgroup_spike_time.open("static_arrays/_static_array__dynamic_array_spikegeneratorgroup_spike_time", ios::in | ios::binary); + if(f_static_array__dynamic_array_spikegeneratorgroup_spike_time.is_open()) + { + f_static_array__dynamic_array_spikegeneratorgroup_spike_time.read(reinterpret_cast(_static_array__dynamic_array_spikegeneratorgroup_spike_time), 19676*sizeof(double)); + } else + { + std::cout << "Error opening static array _static_array__dynamic_array_spikegeneratorgroup_spike_time." << endl; + } + CUDA_SAFE_CALL( + cudaMemcpy(dev_static_array__dynamic_array_spikegeneratorgroup_spike_time, _static_array__dynamic_array_spikegeneratorgroup_spike_time, sizeof(double)*19676, cudaMemcpyHostToDevice) + ); +} + +void _write_arrays() +{ + using namespace brian; + + CUDA_SAFE_CALL( + cudaMemcpy(_array_defaultclock_dt, dev_array_defaultclock_dt, sizeof(double)*_num__array_defaultclock_dt, cudaMemcpyDeviceToHost) + ); + ofstream outfile__array_defaultclock_dt; + outfile__array_defaultclock_dt.open("results/_array_defaultclock_dt_-847410599827917468", ios::binary | ios::out); + if(outfile__array_defaultclock_dt.is_open()) + { + outfile__array_defaultclock_dt.write(reinterpret_cast(_array_defaultclock_dt), 1*sizeof(double)); + outfile__array_defaultclock_dt.close(); + } else + { + std::cout << "Error writing output file for _array_defaultclock_dt." << endl; + } + CUDA_SAFE_CALL( + cudaMemcpy(_array_defaultclock_t, dev_array_defaultclock_t, sizeof(double)*_num__array_defaultclock_t, cudaMemcpyDeviceToHost) + ); + ofstream outfile__array_defaultclock_t; + outfile__array_defaultclock_t.open("results/_array_defaultclock_t_8322660633589888012", ios::binary | ios::out); + if(outfile__array_defaultclock_t.is_open()) + { + outfile__array_defaultclock_t.write(reinterpret_cast(_array_defaultclock_t), 1*sizeof(double)); + outfile__array_defaultclock_t.close(); + } else + { + std::cout << "Error writing output file for _array_defaultclock_t." << endl; + } + CUDA_SAFE_CALL( + cudaMemcpy(_array_defaultclock_timestep, dev_array_defaultclock_timestep, sizeof(int64_t)*_num__array_defaultclock_timestep, cudaMemcpyDeviceToHost) + ); + ofstream outfile__array_defaultclock_timestep; + outfile__array_defaultclock_timestep.open("results/_array_defaultclock_timestep_1352370266667125095", ios::binary | ios::out); + if(outfile__array_defaultclock_timestep.is_open()) + { + outfile__array_defaultclock_timestep.write(reinterpret_cast(_array_defaultclock_timestep), 1*sizeof(int64_t)); + outfile__array_defaultclock_timestep.close(); + } else + { + std::cout << "Error writing output file for _array_defaultclock_timestep." 
<< endl; + } + CUDA_SAFE_CALL( + cudaMemcpy(_array_neurongroup_1_g_eKC_eKC, dev_array_neurongroup_1_g_eKC_eKC, sizeof(double)*_num__array_neurongroup_1_g_eKC_eKC, cudaMemcpyDeviceToHost) + ); + ofstream outfile__array_neurongroup_1_g_eKC_eKC; + outfile__array_neurongroup_1_g_eKC_eKC.open("results/_array_neurongroup_1_g_eKC_eKC_-2719670425652398549", ios::binary | ios::out); + if(outfile__array_neurongroup_1_g_eKC_eKC.is_open()) + { + outfile__array_neurongroup_1_g_eKC_eKC.write(reinterpret_cast(_array_neurongroup_1_g_eKC_eKC), 100*sizeof(double)); + outfile__array_neurongroup_1_g_eKC_eKC.close(); + } else + { + std::cout << "Error writing output file for _array_neurongroup_1_g_eKC_eKC." << endl; + } + CUDA_SAFE_CALL( + cudaMemcpy(_array_neurongroup_1_g_iKC_eKC, dev_array_neurongroup_1_g_iKC_eKC, sizeof(double)*_num__array_neurongroup_1_g_iKC_eKC, cudaMemcpyDeviceToHost) + ); + ofstream outfile__array_neurongroup_1_g_iKC_eKC; + outfile__array_neurongroup_1_g_iKC_eKC.open("results/_array_neurongroup_1_g_iKC_eKC_-6839007311668324058", ios::binary | ios::out); + if(outfile__array_neurongroup_1_g_iKC_eKC.is_open()) + { + outfile__array_neurongroup_1_g_iKC_eKC.write(reinterpret_cast(_array_neurongroup_1_g_iKC_eKC), 100*sizeof(double)); + outfile__array_neurongroup_1_g_iKC_eKC.close(); + } else + { + std::cout << "Error writing output file for _array_neurongroup_1_g_iKC_eKC." << endl; + } + CUDA_SAFE_CALL( + cudaMemcpy(_array_neurongroup_1_h, dev_array_neurongroup_1_h, sizeof(double)*_num__array_neurongroup_1_h, cudaMemcpyDeviceToHost) + ); + ofstream outfile__array_neurongroup_1_h; + outfile__array_neurongroup_1_h.open("results/_array_neurongroup_1_h_1075921236281676937", ios::binary | ios::out); + if(outfile__array_neurongroup_1_h.is_open()) + { + outfile__array_neurongroup_1_h.write(reinterpret_cast(_array_neurongroup_1_h), 100*sizeof(double)); + outfile__array_neurongroup_1_h.close(); + } else + { + std::cout << "Error writing output file for _array_neurongroup_1_h." << endl; + } + CUDA_SAFE_CALL( + cudaMemcpy(_array_neurongroup_1_i, dev_array_neurongroup_1_i, sizeof(int32_t)*_num__array_neurongroup_1_i, cudaMemcpyDeviceToHost) + ); + ofstream outfile__array_neurongroup_1_i; + outfile__array_neurongroup_1_i.open("results/_array_neurongroup_1_i_8994940115406199838", ios::binary | ios::out); + if(outfile__array_neurongroup_1_i.is_open()) + { + outfile__array_neurongroup_1_i.write(reinterpret_cast(_array_neurongroup_1_i), 100*sizeof(int32_t)); + outfile__array_neurongroup_1_i.close(); + } else + { + std::cout << "Error writing output file for _array_neurongroup_1_i." << endl; + } + CUDA_SAFE_CALL( + cudaMemcpy(_array_neurongroup_1_lastspike, dev_array_neurongroup_1_lastspike, sizeof(double)*_num__array_neurongroup_1_lastspike, cudaMemcpyDeviceToHost) + ); + ofstream outfile__array_neurongroup_1_lastspike; + outfile__array_neurongroup_1_lastspike.open("results/_array_neurongroup_1_lastspike_-8689292283566925331", ios::binary | ios::out); + if(outfile__array_neurongroup_1_lastspike.is_open()) + { + outfile__array_neurongroup_1_lastspike.write(reinterpret_cast(_array_neurongroup_1_lastspike), 100*sizeof(double)); + outfile__array_neurongroup_1_lastspike.close(); + } else + { + std::cout << "Error writing output file for _array_neurongroup_1_lastspike." 
<< endl; + } + CUDA_SAFE_CALL( + cudaMemcpy(_array_neurongroup_1_m, dev_array_neurongroup_1_m, sizeof(double)*_num__array_neurongroup_1_m, cudaMemcpyDeviceToHost) + ); + ofstream outfile__array_neurongroup_1_m; + outfile__array_neurongroup_1_m.open("results/_array_neurongroup_1_m_7921550157009594959", ios::binary | ios::out); + if(outfile__array_neurongroup_1_m.is_open()) + { + outfile__array_neurongroup_1_m.write(reinterpret_cast(_array_neurongroup_1_m), 100*sizeof(double)); + outfile__array_neurongroup_1_m.close(); + } else + { + std::cout << "Error writing output file for _array_neurongroup_1_m." << endl; + } + CUDA_SAFE_CALL( + cudaMemcpy(_array_neurongroup_1_n, dev_array_neurongroup_1_n, sizeof(double)*_num__array_neurongroup_1_n, cudaMemcpyDeviceToHost) + ); + ofstream outfile__array_neurongroup_1_n; + outfile__array_neurongroup_1_n.open("results/_array_neurongroup_1_n_-5628489820633515426", ios::binary | ios::out); + if(outfile__array_neurongroup_1_n.is_open()) + { + outfile__array_neurongroup_1_n.write(reinterpret_cast(_array_neurongroup_1_n), 100*sizeof(double)); + outfile__array_neurongroup_1_n.close(); + } else + { + std::cout << "Error writing output file for _array_neurongroup_1_n." << endl; + } + CUDA_SAFE_CALL( + cudaMemcpy(_array_neurongroup_1_not_refractory, dev_array_neurongroup_1_not_refractory, sizeof(char)*_num__array_neurongroup_1_not_refractory, cudaMemcpyDeviceToHost) + ); + ofstream outfile__array_neurongroup_1_not_refractory; + outfile__array_neurongroup_1_not_refractory.open("results/_array_neurongroup_1_not_refractory_-6252862397328651189", ios::binary | ios::out); + if(outfile__array_neurongroup_1_not_refractory.is_open()) + { + outfile__array_neurongroup_1_not_refractory.write(reinterpret_cast(_array_neurongroup_1_not_refractory), 100*sizeof(char)); + outfile__array_neurongroup_1_not_refractory.close(); + } else + { + std::cout << "Error writing output file for _array_neurongroup_1_not_refractory." << endl; + } + CUDA_SAFE_CALL( + cudaMemcpy(_array_neurongroup_1_V, dev_array_neurongroup_1_V, sizeof(double)*_num__array_neurongroup_1_V, cudaMemcpyDeviceToHost) + ); + ofstream outfile__array_neurongroup_1_V; + outfile__array_neurongroup_1_V.open("results/_array_neurongroup_1_V_-1395569865091706992", ios::binary | ios::out); + if(outfile__array_neurongroup_1_V.is_open()) + { + outfile__array_neurongroup_1_V.write(reinterpret_cast(_array_neurongroup_1_V), 100*sizeof(double)); + outfile__array_neurongroup_1_V.close(); + } else + { + std::cout << "Error writing output file for _array_neurongroup_1_V." << endl; + } + CUDA_SAFE_CALL( + cudaMemcpy(_array_neurongroup_g_PN_iKC, dev_array_neurongroup_g_PN_iKC, sizeof(double)*_num__array_neurongroup_g_PN_iKC, cudaMemcpyDeviceToHost) + ); + ofstream outfile__array_neurongroup_g_PN_iKC; + outfile__array_neurongroup_g_PN_iKC.open("results/_array_neurongroup_g_PN_iKC_-4808752820085404947", ios::binary | ios::out); + if(outfile__array_neurongroup_g_PN_iKC.is_open()) + { + outfile__array_neurongroup_g_PN_iKC.write(reinterpret_cast(_array_neurongroup_g_PN_iKC), 2500*sizeof(double)); + outfile__array_neurongroup_g_PN_iKC.close(); + } else + { + std::cout << "Error writing output file for _array_neurongroup_g_PN_iKC." 
<< endl; + } + CUDA_SAFE_CALL( + cudaMemcpy(_array_neurongroup_h, dev_array_neurongroup_h, sizeof(double)*_num__array_neurongroup_h, cudaMemcpyDeviceToHost) + ); + ofstream outfile__array_neurongroup_h; + outfile__array_neurongroup_h.open("results/_array_neurongroup_h_8698551290289247068", ios::binary | ios::out); + if(outfile__array_neurongroup_h.is_open()) + { + outfile__array_neurongroup_h.write(reinterpret_cast(_array_neurongroup_h), 2500*sizeof(double)); + outfile__array_neurongroup_h.close(); + } else + { + std::cout << "Error writing output file for _array_neurongroup_h." << endl; + } + CUDA_SAFE_CALL( + cudaMemcpy(_array_neurongroup_i, dev_array_neurongroup_i, sizeof(int32_t)*_num__array_neurongroup_i, cudaMemcpyDeviceToHost) + ); + ofstream outfile__array_neurongroup_i; + outfile__array_neurongroup_i.open("results/_array_neurongroup_i_8335793832464323850", ios::binary | ios::out); + if(outfile__array_neurongroup_i.is_open()) + { + outfile__array_neurongroup_i.write(reinterpret_cast(_array_neurongroup_i), 2500*sizeof(int32_t)); + outfile__array_neurongroup_i.close(); + } else + { + std::cout << "Error writing output file for _array_neurongroup_i." << endl; + } + CUDA_SAFE_CALL( + cudaMemcpy(_array_neurongroup_lastspike, dev_array_neurongroup_lastspike, sizeof(double)*_num__array_neurongroup_lastspike, cudaMemcpyDeviceToHost) + ); + ofstream outfile__array_neurongroup_lastspike; + outfile__array_neurongroup_lastspike.open("results/_array_neurongroup_lastspike_6427935437904044193", ios::binary | ios::out); + if(outfile__array_neurongroup_lastspike.is_open()) + { + outfile__array_neurongroup_lastspike.write(reinterpret_cast(_array_neurongroup_lastspike), 2500*sizeof(double)); + outfile__array_neurongroup_lastspike.close(); + } else + { + std::cout << "Error writing output file for _array_neurongroup_lastspike." << endl; + } + CUDA_SAFE_CALL( + cudaMemcpy(_array_neurongroup_m, dev_array_neurongroup_m, sizeof(double)*_num__array_neurongroup_m, cudaMemcpyDeviceToHost) + ); + ofstream outfile__array_neurongroup_m; + outfile__array_neurongroup_m.open("results/_array_neurongroup_m_-5621447401784989625", ios::binary | ios::out); + if(outfile__array_neurongroup_m.is_open()) + { + outfile__array_neurongroup_m.write(reinterpret_cast(_array_neurongroup_m), 2500*sizeof(double)); + outfile__array_neurongroup_m.close(); + } else + { + std::cout << "Error writing output file for _array_neurongroup_m." << endl; + } + CUDA_SAFE_CALL( + cudaMemcpy(_array_neurongroup_n, dev_array_neurongroup_n, sizeof(double)*_num__array_neurongroup_n, cudaMemcpyDeviceToHost) + ); + ofstream outfile__array_neurongroup_n; + outfile__array_neurongroup_n.open("results/_array_neurongroup_n_-2546797609979266637", ios::binary | ios::out); + if(outfile__array_neurongroup_n.is_open()) + { + outfile__array_neurongroup_n.write(reinterpret_cast(_array_neurongroup_n), 2500*sizeof(double)); + outfile__array_neurongroup_n.close(); + } else + { + std::cout << "Error writing output file for _array_neurongroup_n." 
<< endl; + } + CUDA_SAFE_CALL( + cudaMemcpy(_array_neurongroup_not_refractory, dev_array_neurongroup_not_refractory, sizeof(char)*_num__array_neurongroup_not_refractory, cudaMemcpyDeviceToHost) + ); + ofstream outfile__array_neurongroup_not_refractory; + outfile__array_neurongroup_not_refractory.open("results/_array_neurongroup_not_refractory_5726736962615233645", ios::binary | ios::out); + if(outfile__array_neurongroup_not_refractory.is_open()) + { + outfile__array_neurongroup_not_refractory.write(reinterpret_cast(_array_neurongroup_not_refractory), 2500*sizeof(char)); + outfile__array_neurongroup_not_refractory.close(); + } else + { + std::cout << "Error writing output file for _array_neurongroup_not_refractory." << endl; + } + CUDA_SAFE_CALL( + cudaMemcpy(_array_neurongroup_V, dev_array_neurongroup_V, sizeof(double)*_num__array_neurongroup_V, cudaMemcpyDeviceToHost) + ); + ofstream outfile__array_neurongroup_V; + outfile__array_neurongroup_V.open("results/_array_neurongroup_V_2686151377283509651", ios::binary | ios::out); + if(outfile__array_neurongroup_V.is_open()) + { + outfile__array_neurongroup_V.write(reinterpret_cast(_array_neurongroup_V), 2500*sizeof(double)); + outfile__array_neurongroup_V.close(); + } else + { + std::cout << "Error writing output file for _array_neurongroup_V." << endl; + } + CUDA_SAFE_CALL( + cudaMemcpy(_array_spikegeneratorgroup__lastindex, dev_array_spikegeneratorgroup__lastindex, sizeof(int32_t)*_num__array_spikegeneratorgroup__lastindex, cudaMemcpyDeviceToHost) + ); + ofstream outfile__array_spikegeneratorgroup__lastindex; + outfile__array_spikegeneratorgroup__lastindex.open("results/_array_spikegeneratorgroup__lastindex_1821964835846880533", ios::binary | ios::out); + if(outfile__array_spikegeneratorgroup__lastindex.is_open()) + { + outfile__array_spikegeneratorgroup__lastindex.write(reinterpret_cast(_array_spikegeneratorgroup__lastindex), 1*sizeof(int32_t)); + outfile__array_spikegeneratorgroup__lastindex.close(); + } else + { + std::cout << "Error writing output file for _array_spikegeneratorgroup__lastindex." << endl; + } + CUDA_SAFE_CALL( + cudaMemcpy(_array_spikegeneratorgroup__period_bins, dev_array_spikegeneratorgroup__period_bins, sizeof(int32_t)*_num__array_spikegeneratorgroup__period_bins, cudaMemcpyDeviceToHost) + ); + ofstream outfile__array_spikegeneratorgroup__period_bins; + outfile__array_spikegeneratorgroup__period_bins.open("results/_array_spikegeneratorgroup__period_bins_-7971398493031931846", ios::binary | ios::out); + if(outfile__array_spikegeneratorgroup__period_bins.is_open()) + { + outfile__array_spikegeneratorgroup__period_bins.write(reinterpret_cast(_array_spikegeneratorgroup__period_bins), 1*sizeof(int32_t)); + outfile__array_spikegeneratorgroup__period_bins.close(); + } else + { + std::cout << "Error writing output file for _array_spikegeneratorgroup__period_bins." << endl; + } + CUDA_SAFE_CALL( + cudaMemcpy(_array_spikegeneratorgroup_i, dev_array_spikegeneratorgroup_i, sizeof(int32_t)*_num__array_spikegeneratorgroup_i, cudaMemcpyDeviceToHost) + ); + ofstream outfile__array_spikegeneratorgroup_i; + outfile__array_spikegeneratorgroup_i.open("results/_array_spikegeneratorgroup_i_-1292482055040653574", ios::binary | ios::out); + if(outfile__array_spikegeneratorgroup_i.is_open()) + { + outfile__array_spikegeneratorgroup_i.write(reinterpret_cast(_array_spikegeneratorgroup_i), 100*sizeof(int32_t)); + outfile__array_spikegeneratorgroup_i.close(); + } else + { + std::cout << "Error writing output file for _array_spikegeneratorgroup_i." 
<< endl; + } + CUDA_SAFE_CALL( + cudaMemcpy(_array_spikegeneratorgroup_period, dev_array_spikegeneratorgroup_period, sizeof(double)*_num__array_spikegeneratorgroup_period, cudaMemcpyDeviceToHost) + ); + ofstream outfile__array_spikegeneratorgroup_period; + outfile__array_spikegeneratorgroup_period.open("results/_array_spikegeneratorgroup_period_-353366131269823746", ios::binary | ios::out); + if(outfile__array_spikegeneratorgroup_period.is_open()) + { + outfile__array_spikegeneratorgroup_period.write(reinterpret_cast(_array_spikegeneratorgroup_period), 1*sizeof(double)); + outfile__array_spikegeneratorgroup_period.close(); + } else + { + std::cout << "Error writing output file for _array_spikegeneratorgroup_period." << endl; + } + CUDA_SAFE_CALL( + cudaMemcpy(_array_spikemonitor_1__source_idx, dev_array_spikemonitor_1__source_idx, sizeof(int32_t)*_num__array_spikemonitor_1__source_idx, cudaMemcpyDeviceToHost) + ); + ofstream outfile__array_spikemonitor_1__source_idx; + outfile__array_spikemonitor_1__source_idx.open("results/_array_spikemonitor_1__source_idx_-50543664629489326", ios::binary | ios::out); + if(outfile__array_spikemonitor_1__source_idx.is_open()) + { + outfile__array_spikemonitor_1__source_idx.write(reinterpret_cast(_array_spikemonitor_1__source_idx), 2500*sizeof(int32_t)); + outfile__array_spikemonitor_1__source_idx.close(); + } else + { + std::cout << "Error writing output file for _array_spikemonitor_1__source_idx." << endl; + } + CUDA_SAFE_CALL( + cudaMemcpy(_array_spikemonitor_1_count, dev_array_spikemonitor_1_count, sizeof(int32_t)*_num__array_spikemonitor_1_count, cudaMemcpyDeviceToHost) + ); + ofstream outfile__array_spikemonitor_1_count; + outfile__array_spikemonitor_1_count.open("results/_array_spikemonitor_1_count_6013008031212298333", ios::binary | ios::out); + if(outfile__array_spikemonitor_1_count.is_open()) + { + outfile__array_spikemonitor_1_count.write(reinterpret_cast(_array_spikemonitor_1_count), 2500*sizeof(int32_t)); + outfile__array_spikemonitor_1_count.close(); + } else + { + std::cout << "Error writing output file for _array_spikemonitor_1_count." << endl; + } + CUDA_SAFE_CALL( + cudaMemcpy(_array_spikemonitor_1_N, dev_array_spikemonitor_1_N, sizeof(int32_t)*_num__array_spikemonitor_1_N, cudaMemcpyDeviceToHost) + ); + ofstream outfile__array_spikemonitor_1_N; + outfile__array_spikemonitor_1_N.open("results/_array_spikemonitor_1_N_3169190033621949867", ios::binary | ios::out); + if(outfile__array_spikemonitor_1_N.is_open()) + { + outfile__array_spikemonitor_1_N.write(reinterpret_cast(_array_spikemonitor_1_N), 1*sizeof(int32_t)); + outfile__array_spikemonitor_1_N.close(); + } else + { + std::cout << "Error writing output file for _array_spikemonitor_1_N." << endl; + } + CUDA_SAFE_CALL( + cudaMemcpy(_array_spikemonitor_2__source_idx, dev_array_spikemonitor_2__source_idx, sizeof(int32_t)*_num__array_spikemonitor_2__source_idx, cudaMemcpyDeviceToHost) + ); + ofstream outfile__array_spikemonitor_2__source_idx; + outfile__array_spikemonitor_2__source_idx.open("results/_array_spikemonitor_2__source_idx_-7925017314742328674", ios::binary | ios::out); + if(outfile__array_spikemonitor_2__source_idx.is_open()) + { + outfile__array_spikemonitor_2__source_idx.write(reinterpret_cast(_array_spikemonitor_2__source_idx), 100*sizeof(int32_t)); + outfile__array_spikemonitor_2__source_idx.close(); + } else + { + std::cout << "Error writing output file for _array_spikemonitor_2__source_idx." 
<< endl; + } + CUDA_SAFE_CALL( + cudaMemcpy(_array_spikemonitor_2_count, dev_array_spikemonitor_2_count, sizeof(int32_t)*_num__array_spikemonitor_2_count, cudaMemcpyDeviceToHost) + ); + ofstream outfile__array_spikemonitor_2_count; + outfile__array_spikemonitor_2_count.open("results/_array_spikemonitor_2_count_7670286378054215486", ios::binary | ios::out); + if(outfile__array_spikemonitor_2_count.is_open()) + { + outfile__array_spikemonitor_2_count.write(reinterpret_cast(_array_spikemonitor_2_count), 100*sizeof(int32_t)); + outfile__array_spikemonitor_2_count.close(); + } else + { + std::cout << "Error writing output file for _array_spikemonitor_2_count." << endl; + } + CUDA_SAFE_CALL( + cudaMemcpy(_array_spikemonitor_2_N, dev_array_spikemonitor_2_N, sizeof(int32_t)*_num__array_spikemonitor_2_N, cudaMemcpyDeviceToHost) + ); + ofstream outfile__array_spikemonitor_2_N; + outfile__array_spikemonitor_2_N.open("results/_array_spikemonitor_2_N_6693733537479841813", ios::binary | ios::out); + if(outfile__array_spikemonitor_2_N.is_open()) + { + outfile__array_spikemonitor_2_N.write(reinterpret_cast(_array_spikemonitor_2_N), 1*sizeof(int32_t)); + outfile__array_spikemonitor_2_N.close(); + } else + { + std::cout << "Error writing output file for _array_spikemonitor_2_N." << endl; + } + CUDA_SAFE_CALL( + cudaMemcpy(_array_spikemonitor__source_idx, dev_array_spikemonitor__source_idx, sizeof(int32_t)*_num__array_spikemonitor__source_idx, cudaMemcpyDeviceToHost) + ); + ofstream outfile__array_spikemonitor__source_idx; + outfile__array_spikemonitor__source_idx.open("results/_array_spikemonitor__source_idx_-8117872864355079535", ios::binary | ios::out); + if(outfile__array_spikemonitor__source_idx.is_open()) + { + outfile__array_spikemonitor__source_idx.write(reinterpret_cast(_array_spikemonitor__source_idx), 100*sizeof(int32_t)); + outfile__array_spikemonitor__source_idx.close(); + } else + { + std::cout << "Error writing output file for _array_spikemonitor__source_idx." << endl; + } + CUDA_SAFE_CALL( + cudaMemcpy(_array_spikemonitor_count, dev_array_spikemonitor_count, sizeof(int32_t)*_num__array_spikemonitor_count, cudaMemcpyDeviceToHost) + ); + ofstream outfile__array_spikemonitor_count; + outfile__array_spikemonitor_count.open("results/_array_spikemonitor_count_2626824674132290633", ios::binary | ios::out); + if(outfile__array_spikemonitor_count.is_open()) + { + outfile__array_spikemonitor_count.write(reinterpret_cast(_array_spikemonitor_count), 100*sizeof(int32_t)); + outfile__array_spikemonitor_count.close(); + } else + { + std::cout << "Error writing output file for _array_spikemonitor_count." << endl; + } + CUDA_SAFE_CALL( + cudaMemcpy(_array_spikemonitor_N, dev_array_spikemonitor_N, sizeof(int32_t)*_num__array_spikemonitor_N, cudaMemcpyDeviceToHost) + ); + ofstream outfile__array_spikemonitor_N; + outfile__array_spikemonitor_N.open("results/_array_spikemonitor_N_6263166261093207124", ios::binary | ios::out); + if(outfile__array_spikemonitor_N.is_open()) + { + outfile__array_spikemonitor_N.write(reinterpret_cast(_array_spikemonitor_N), 1*sizeof(int32_t)); + outfile__array_spikemonitor_N.close(); + } else + { + std::cout << "Error writing output file for _array_spikemonitor_N." 
<< endl; + } + CUDA_SAFE_CALL( + cudaMemcpy(_array_synapses_1_N, dev_array_synapses_1_N, sizeof(int32_t)*_num__array_synapses_1_N, cudaMemcpyDeviceToHost) + ); + ofstream outfile__array_synapses_1_N; + outfile__array_synapses_1_N.open("results/_array_synapses_1_N_-5388579170602877692", ios::binary | ios::out); + if(outfile__array_synapses_1_N.is_open()) + { + outfile__array_synapses_1_N.write(reinterpret_cast(_array_synapses_1_N), 1*sizeof(int32_t)); + outfile__array_synapses_1_N.close(); + } else + { + std::cout << "Error writing output file for _array_synapses_1_N." << endl; + } + CUDA_SAFE_CALL( + cudaMemcpy(_array_synapses_2_N, dev_array_synapses_2_N, sizeof(int32_t)*_num__array_synapses_2_N, cudaMemcpyDeviceToHost) + ); + ofstream outfile__array_synapses_2_N; + outfile__array_synapses_2_N.open("results/_array_synapses_2_N_5269920966642024342", ios::binary | ios::out); + if(outfile__array_synapses_2_N.is_open()) + { + outfile__array_synapses_2_N.write(reinterpret_cast(_array_synapses_2_N), 1*sizeof(int32_t)); + outfile__array_synapses_2_N.close(); + } else + { + std::cout << "Error writing output file for _array_synapses_2_N." << endl; + } + CUDA_SAFE_CALL( + cudaMemcpy(_array_synapses_N, dev_array_synapses_N, sizeof(int32_t)*_num__array_synapses_N, cudaMemcpyDeviceToHost) + ); + ofstream outfile__array_synapses_N; + outfile__array_synapses_N.open("results/_array_synapses_N_-2482695578908200934", ios::binary | ios::out); + if(outfile__array_synapses_N.is_open()) + { + outfile__array_synapses_N.write(reinterpret_cast(_array_synapses_N), 1*sizeof(int32_t)); + outfile__array_synapses_N.close(); + } else + { + std::cout << "Error writing output file for _array_synapses_N." << endl; + } + + _dynamic_array_spikegeneratorgroup__timebins = dev_dynamic_array_spikegeneratorgroup__timebins; + ofstream outfile__dynamic_array_spikegeneratorgroup__timebins; + outfile__dynamic_array_spikegeneratorgroup__timebins.open("results/_dynamic_array_spikegeneratorgroup__timebins_8131810897310887393", ios::binary | ios::out); + if(outfile__dynamic_array_spikegeneratorgroup__timebins.is_open()) + { + outfile__dynamic_array_spikegeneratorgroup__timebins.write(reinterpret_cast(thrust::raw_pointer_cast(&_dynamic_array_spikegeneratorgroup__timebins[0])), _dynamic_array_spikegeneratorgroup__timebins.size()*sizeof(int32_t)); + outfile__dynamic_array_spikegeneratorgroup__timebins.close(); + } else + { + std::cout << "Error writing output file for _dynamic_array_spikegeneratorgroup__timebins." << endl; + } + _dynamic_array_spikegeneratorgroup_neuron_index = dev_dynamic_array_spikegeneratorgroup_neuron_index; + ofstream outfile__dynamic_array_spikegeneratorgroup_neuron_index; + outfile__dynamic_array_spikegeneratorgroup_neuron_index.open("results/_dynamic_array_spikegeneratorgroup_neuron_index_-7594505304508306195", ios::binary | ios::out); + if(outfile__dynamic_array_spikegeneratorgroup_neuron_index.is_open()) + { + outfile__dynamic_array_spikegeneratorgroup_neuron_index.write(reinterpret_cast(thrust::raw_pointer_cast(&_dynamic_array_spikegeneratorgroup_neuron_index[0])), _dynamic_array_spikegeneratorgroup_neuron_index.size()*sizeof(int32_t)); + outfile__dynamic_array_spikegeneratorgroup_neuron_index.close(); + } else + { + std::cout << "Error writing output file for _dynamic_array_spikegeneratorgroup_neuron_index." 
<< endl; + } + _dynamic_array_spikegeneratorgroup_spike_number = dev_dynamic_array_spikegeneratorgroup_spike_number; + ofstream outfile__dynamic_array_spikegeneratorgroup_spike_number; + outfile__dynamic_array_spikegeneratorgroup_spike_number.open("results/_dynamic_array_spikegeneratorgroup_spike_number_-4815301131874600719", ios::binary | ios::out); + if(outfile__dynamic_array_spikegeneratorgroup_spike_number.is_open()) + { + outfile__dynamic_array_spikegeneratorgroup_spike_number.write(reinterpret_cast(thrust::raw_pointer_cast(&_dynamic_array_spikegeneratorgroup_spike_number[0])), _dynamic_array_spikegeneratorgroup_spike_number.size()*sizeof(int32_t)); + outfile__dynamic_array_spikegeneratorgroup_spike_number.close(); + } else + { + std::cout << "Error writing output file for _dynamic_array_spikegeneratorgroup_spike_number." << endl; + } + _dynamic_array_spikegeneratorgroup_spike_time = dev_dynamic_array_spikegeneratorgroup_spike_time; + ofstream outfile__dynamic_array_spikegeneratorgroup_spike_time; + outfile__dynamic_array_spikegeneratorgroup_spike_time.open("results/_dynamic_array_spikegeneratorgroup_spike_time_6567911360708844700", ios::binary | ios::out); + if(outfile__dynamic_array_spikegeneratorgroup_spike_time.is_open()) + { + outfile__dynamic_array_spikegeneratorgroup_spike_time.write(reinterpret_cast(thrust::raw_pointer_cast(&_dynamic_array_spikegeneratorgroup_spike_time[0])), _dynamic_array_spikegeneratorgroup_spike_time.size()*sizeof(double)); + outfile__dynamic_array_spikegeneratorgroup_spike_time.close(); + } else + { + std::cout << "Error writing output file for _dynamic_array_spikegeneratorgroup_spike_time." << endl; + } + _dynamic_array_spikemonitor_1_i = dev_dynamic_array_spikemonitor_1_i; + ofstream outfile__dynamic_array_spikemonitor_1_i; + outfile__dynamic_array_spikemonitor_1_i.open("results/_dynamic_array_spikemonitor_1_i_-2190502851196353835", ios::binary | ios::out); + if(outfile__dynamic_array_spikemonitor_1_i.is_open()) + { + outfile__dynamic_array_spikemonitor_1_i.write(reinterpret_cast(thrust::raw_pointer_cast(&_dynamic_array_spikemonitor_1_i[0])), _dynamic_array_spikemonitor_1_i.size()*sizeof(int32_t)); + outfile__dynamic_array_spikemonitor_1_i.close(); + } else + { + std::cout << "Error writing output file for _dynamic_array_spikemonitor_1_i." << endl; + } + _dynamic_array_spikemonitor_1_t = dev_dynamic_array_spikemonitor_1_t; + ofstream outfile__dynamic_array_spikemonitor_1_t; + outfile__dynamic_array_spikemonitor_1_t.open("results/_dynamic_array_spikemonitor_1_t_-841006843677588084", ios::binary | ios::out); + if(outfile__dynamic_array_spikemonitor_1_t.is_open()) + { + outfile__dynamic_array_spikemonitor_1_t.write(reinterpret_cast(thrust::raw_pointer_cast(&_dynamic_array_spikemonitor_1_t[0])), _dynamic_array_spikemonitor_1_t.size()*sizeof(double)); + outfile__dynamic_array_spikemonitor_1_t.close(); + } else + { + std::cout << "Error writing output file for _dynamic_array_spikemonitor_1_t." 
<< endl; + } + _dynamic_array_spikemonitor_2_i = dev_dynamic_array_spikemonitor_2_i; + ofstream outfile__dynamic_array_spikemonitor_2_i; + outfile__dynamic_array_spikemonitor_2_i.open("results/_dynamic_array_spikemonitor_2_i_-7452697810678630303", ios::binary | ios::out); + if(outfile__dynamic_array_spikemonitor_2_i.is_open()) + { + outfile__dynamic_array_spikemonitor_2_i.write(reinterpret_cast(thrust::raw_pointer_cast(&_dynamic_array_spikemonitor_2_i[0])), _dynamic_array_spikemonitor_2_i.size()*sizeof(int32_t)); + outfile__dynamic_array_spikemonitor_2_i.close(); + } else + { + std::cout << "Error writing output file for _dynamic_array_spikemonitor_2_i." << endl; + } + _dynamic_array_spikemonitor_2_t = dev_dynamic_array_spikemonitor_2_t; + ofstream outfile__dynamic_array_spikemonitor_2_t; + outfile__dynamic_array_spikemonitor_2_t.open("results/_dynamic_array_spikemonitor_2_t_-2066051122613997313", ios::binary | ios::out); + if(outfile__dynamic_array_spikemonitor_2_t.is_open()) + { + outfile__dynamic_array_spikemonitor_2_t.write(reinterpret_cast(thrust::raw_pointer_cast(&_dynamic_array_spikemonitor_2_t[0])), _dynamic_array_spikemonitor_2_t.size()*sizeof(double)); + outfile__dynamic_array_spikemonitor_2_t.close(); + } else + { + std::cout << "Error writing output file for _dynamic_array_spikemonitor_2_t." << endl; + } + _dynamic_array_spikemonitor_i = dev_dynamic_array_spikemonitor_i; + ofstream outfile__dynamic_array_spikemonitor_i; + outfile__dynamic_array_spikemonitor_i.open("results/_dynamic_array_spikemonitor_i_2878104665717261157", ios::binary | ios::out); + if(outfile__dynamic_array_spikemonitor_i.is_open()) + { + outfile__dynamic_array_spikemonitor_i.write(reinterpret_cast(thrust::raw_pointer_cast(&_dynamic_array_spikemonitor_i[0])), _dynamic_array_spikemonitor_i.size()*sizeof(int32_t)); + outfile__dynamic_array_spikemonitor_i.close(); + } else + { + std::cout << "Error writing output file for _dynamic_array_spikemonitor_i." << endl; + } + _dynamic_array_spikemonitor_t = dev_dynamic_array_spikemonitor_t; + ofstream outfile__dynamic_array_spikemonitor_t; + outfile__dynamic_array_spikemonitor_t.open("results/_dynamic_array_spikemonitor_t_7865095316440674513", ios::binary | ios::out); + if(outfile__dynamic_array_spikemonitor_t.is_open()) + { + outfile__dynamic_array_spikemonitor_t.write(reinterpret_cast(thrust::raw_pointer_cast(&_dynamic_array_spikemonitor_t[0])), _dynamic_array_spikemonitor_t.size()*sizeof(double)); + outfile__dynamic_array_spikemonitor_t.close(); + } else + { + std::cout << "Error writing output file for _dynamic_array_spikemonitor_t." << endl; + } + ofstream outfile__dynamic_array_synapses_1__synaptic_post; + outfile__dynamic_array_synapses_1__synaptic_post.open("results/_dynamic_array_synapses_1__synaptic_post_-7537747434503640794", ios::binary | ios::out); + if(outfile__dynamic_array_synapses_1__synaptic_post.is_open()) + { + outfile__dynamic_array_synapses_1__synaptic_post.write(reinterpret_cast(thrust::raw_pointer_cast(&_dynamic_array_synapses_1__synaptic_post[0])), _dynamic_array_synapses_1__synaptic_post.size()*sizeof(int32_t)); + outfile__dynamic_array_synapses_1__synaptic_post.close(); + } else + { + std::cout << "Error writing output file for _dynamic_array_synapses_1__synaptic_post." 
<< endl; + } + ofstream outfile__dynamic_array_synapses_1__synaptic_pre; + outfile__dynamic_array_synapses_1__synaptic_pre.open("results/_dynamic_array_synapses_1__synaptic_pre_-8170898951251124790", ios::binary | ios::out); + if(outfile__dynamic_array_synapses_1__synaptic_pre.is_open()) + { + outfile__dynamic_array_synapses_1__synaptic_pre.write(reinterpret_cast(thrust::raw_pointer_cast(&_dynamic_array_synapses_1__synaptic_pre[0])), _dynamic_array_synapses_1__synaptic_pre.size()*sizeof(int32_t)); + outfile__dynamic_array_synapses_1__synaptic_pre.close(); + } else + { + std::cout << "Error writing output file for _dynamic_array_synapses_1__synaptic_pre." << endl; + } + _dynamic_array_synapses_1_Apost = dev_dynamic_array_synapses_1_Apost; + ofstream outfile__dynamic_array_synapses_1_Apost; + outfile__dynamic_array_synapses_1_Apost.open("results/_dynamic_array_synapses_1_Apost_6485379228718605548", ios::binary | ios::out); + if(outfile__dynamic_array_synapses_1_Apost.is_open()) + { + outfile__dynamic_array_synapses_1_Apost.write(reinterpret_cast(thrust::raw_pointer_cast(&_dynamic_array_synapses_1_Apost[0])), _dynamic_array_synapses_1_Apost.size()*sizeof(double)); + outfile__dynamic_array_synapses_1_Apost.close(); + } else + { + std::cout << "Error writing output file for _dynamic_array_synapses_1_Apost." << endl; + } + _dynamic_array_synapses_1_Apre = dev_dynamic_array_synapses_1_Apre; + ofstream outfile__dynamic_array_synapses_1_Apre; + outfile__dynamic_array_synapses_1_Apre.open("results/_dynamic_array_synapses_1_Apre_1158801600114762896", ios::binary | ios::out); + if(outfile__dynamic_array_synapses_1_Apre.is_open()) + { + outfile__dynamic_array_synapses_1_Apre.write(reinterpret_cast(thrust::raw_pointer_cast(&_dynamic_array_synapses_1_Apre[0])), _dynamic_array_synapses_1_Apre.size()*sizeof(double)); + outfile__dynamic_array_synapses_1_Apre.close(); + } else + { + std::cout << "Error writing output file for _dynamic_array_synapses_1_Apre." << endl; + } + ofstream outfile__dynamic_array_synapses_1_delay; + outfile__dynamic_array_synapses_1_delay.open("results/_dynamic_array_synapses_1_delay_-2566178675962201282", ios::binary | ios::out); + if(outfile__dynamic_array_synapses_1_delay.is_open()) + { + outfile__dynamic_array_synapses_1_delay.write(reinterpret_cast(thrust::raw_pointer_cast(&_dynamic_array_synapses_1_delay[0])), _dynamic_array_synapses_1_delay.size()*sizeof(double)); + outfile__dynamic_array_synapses_1_delay.close(); + } else + { + std::cout << "Error writing output file for _dynamic_array_synapses_1_delay." << endl; + } + ofstream outfile__dynamic_array_synapses_1_delay_1; + outfile__dynamic_array_synapses_1_delay_1.open("results/_dynamic_array_synapses_1_delay_1_-2293552668042484320", ios::binary | ios::out); + if(outfile__dynamic_array_synapses_1_delay_1.is_open()) + { + outfile__dynamic_array_synapses_1_delay_1.write(reinterpret_cast(thrust::raw_pointer_cast(&_dynamic_array_synapses_1_delay_1[0])), _dynamic_array_synapses_1_delay_1.size()*sizeof(double)); + outfile__dynamic_array_synapses_1_delay_1.close(); + } else + { + std::cout << "Error writing output file for _dynamic_array_synapses_1_delay_1." 
<< endl; + } + _dynamic_array_synapses_1_g_raw = dev_dynamic_array_synapses_1_g_raw; + ofstream outfile__dynamic_array_synapses_1_g_raw; + outfile__dynamic_array_synapses_1_g_raw.open("results/_dynamic_array_synapses_1_g_raw_-296211884898250956", ios::binary | ios::out); + if(outfile__dynamic_array_synapses_1_g_raw.is_open()) + { + outfile__dynamic_array_synapses_1_g_raw.write(reinterpret_cast(thrust::raw_pointer_cast(&_dynamic_array_synapses_1_g_raw[0])), _dynamic_array_synapses_1_g_raw.size()*sizeof(double)); + outfile__dynamic_array_synapses_1_g_raw.close(); + } else + { + std::cout << "Error writing output file for _dynamic_array_synapses_1_g_raw." << endl; + } + _dynamic_array_synapses_1_lastupdate = dev_dynamic_array_synapses_1_lastupdate; + ofstream outfile__dynamic_array_synapses_1_lastupdate; + outfile__dynamic_array_synapses_1_lastupdate.open("results/_dynamic_array_synapses_1_lastupdate_-4620983009986066308", ios::binary | ios::out); + if(outfile__dynamic_array_synapses_1_lastupdate.is_open()) + { + outfile__dynamic_array_synapses_1_lastupdate.write(reinterpret_cast(thrust::raw_pointer_cast(&_dynamic_array_synapses_1_lastupdate[0])), _dynamic_array_synapses_1_lastupdate.size()*sizeof(double)); + outfile__dynamic_array_synapses_1_lastupdate.close(); + } else + { + std::cout << "Error writing output file for _dynamic_array_synapses_1_lastupdate." << endl; + } + _dynamic_array_synapses_1_N_incoming = dev_dynamic_array_synapses_1_N_incoming; + ofstream outfile__dynamic_array_synapses_1_N_incoming; + outfile__dynamic_array_synapses_1_N_incoming.open("results/_dynamic_array_synapses_1_N_incoming_-5416286353695559554", ios::binary | ios::out); + if(outfile__dynamic_array_synapses_1_N_incoming.is_open()) + { + outfile__dynamic_array_synapses_1_N_incoming.write(reinterpret_cast(thrust::raw_pointer_cast(&_dynamic_array_synapses_1_N_incoming[0])), _dynamic_array_synapses_1_N_incoming.size()*sizeof(int32_t)); + outfile__dynamic_array_synapses_1_N_incoming.close(); + } else + { + std::cout << "Error writing output file for _dynamic_array_synapses_1_N_incoming." << endl; + } + _dynamic_array_synapses_1_N_outgoing = dev_dynamic_array_synapses_1_N_outgoing; + ofstream outfile__dynamic_array_synapses_1_N_outgoing; + outfile__dynamic_array_synapses_1_N_outgoing.open("results/_dynamic_array_synapses_1_N_outgoing_5769272226699040095", ios::binary | ios::out); + if(outfile__dynamic_array_synapses_1_N_outgoing.is_open()) + { + outfile__dynamic_array_synapses_1_N_outgoing.write(reinterpret_cast(thrust::raw_pointer_cast(&_dynamic_array_synapses_1_N_outgoing[0])), _dynamic_array_synapses_1_N_outgoing.size()*sizeof(int32_t)); + outfile__dynamic_array_synapses_1_N_outgoing.close(); + } else + { + std::cout << "Error writing output file for _dynamic_array_synapses_1_N_outgoing." << endl; + } + ofstream outfile__dynamic_array_synapses_2__synaptic_post; + outfile__dynamic_array_synapses_2__synaptic_post.open("results/_dynamic_array_synapses_2__synaptic_post_-8504964520201554399", ios::binary | ios::out); + if(outfile__dynamic_array_synapses_2__synaptic_post.is_open()) + { + outfile__dynamic_array_synapses_2__synaptic_post.write(reinterpret_cast(thrust::raw_pointer_cast(&_dynamic_array_synapses_2__synaptic_post[0])), _dynamic_array_synapses_2__synaptic_post.size()*sizeof(int32_t)); + outfile__dynamic_array_synapses_2__synaptic_post.close(); + } else + { + std::cout << "Error writing output file for _dynamic_array_synapses_2__synaptic_post." 
<< endl; + } + ofstream outfile__dynamic_array_synapses_2__synaptic_pre; + outfile__dynamic_array_synapses_2__synaptic_pre.open("results/_dynamic_array_synapses_2__synaptic_pre_-5492879376519788356", ios::binary | ios::out); + if(outfile__dynamic_array_synapses_2__synaptic_pre.is_open()) + { + outfile__dynamic_array_synapses_2__synaptic_pre.write(reinterpret_cast(thrust::raw_pointer_cast(&_dynamic_array_synapses_2__synaptic_pre[0])), _dynamic_array_synapses_2__synaptic_pre.size()*sizeof(int32_t)); + outfile__dynamic_array_synapses_2__synaptic_pre.close(); + } else + { + std::cout << "Error writing output file for _dynamic_array_synapses_2__synaptic_pre." << endl; + } + ofstream outfile__dynamic_array_synapses_2_delay; + outfile__dynamic_array_synapses_2_delay.open("results/_dynamic_array_synapses_2_delay_-785530481191211215", ios::binary | ios::out); + if(outfile__dynamic_array_synapses_2_delay.is_open()) + { + outfile__dynamic_array_synapses_2_delay.write(reinterpret_cast(thrust::raw_pointer_cast(&_dynamic_array_synapses_2_delay[0])), _dynamic_array_synapses_2_delay.size()*sizeof(double)); + outfile__dynamic_array_synapses_2_delay.close(); + } else + { + std::cout << "Error writing output file for _dynamic_array_synapses_2_delay." << endl; + } + _dynamic_array_synapses_2_N_incoming = dev_dynamic_array_synapses_2_N_incoming; + ofstream outfile__dynamic_array_synapses_2_N_incoming; + outfile__dynamic_array_synapses_2_N_incoming.open("results/_dynamic_array_synapses_2_N_incoming_-2633956166385116811", ios::binary | ios::out); + if(outfile__dynamic_array_synapses_2_N_incoming.is_open()) + { + outfile__dynamic_array_synapses_2_N_incoming.write(reinterpret_cast(thrust::raw_pointer_cast(&_dynamic_array_synapses_2_N_incoming[0])), _dynamic_array_synapses_2_N_incoming.size()*sizeof(int32_t)); + outfile__dynamic_array_synapses_2_N_incoming.close(); + } else + { + std::cout << "Error writing output file for _dynamic_array_synapses_2_N_incoming." << endl; + } + _dynamic_array_synapses_2_N_outgoing = dev_dynamic_array_synapses_2_N_outgoing; + ofstream outfile__dynamic_array_synapses_2_N_outgoing; + outfile__dynamic_array_synapses_2_N_outgoing.open("results/_dynamic_array_synapses_2_N_outgoing_-8330418898748964037", ios::binary | ios::out); + if(outfile__dynamic_array_synapses_2_N_outgoing.is_open()) + { + outfile__dynamic_array_synapses_2_N_outgoing.write(reinterpret_cast(thrust::raw_pointer_cast(&_dynamic_array_synapses_2_N_outgoing[0])), _dynamic_array_synapses_2_N_outgoing.size()*sizeof(int32_t)); + outfile__dynamic_array_synapses_2_N_outgoing.close(); + } else + { + std::cout << "Error writing output file for _dynamic_array_synapses_2_N_outgoing." << endl; + } + ofstream outfile__dynamic_array_synapses__synaptic_post; + outfile__dynamic_array_synapses__synaptic_post.open("results/_dynamic_array_synapses__synaptic_post_6330116830759336919", ios::binary | ios::out); + if(outfile__dynamic_array_synapses__synaptic_post.is_open()) + { + outfile__dynamic_array_synapses__synaptic_post.write(reinterpret_cast(thrust::raw_pointer_cast(&_dynamic_array_synapses__synaptic_post[0])), _dynamic_array_synapses__synaptic_post.size()*sizeof(int32_t)); + outfile__dynamic_array_synapses__synaptic_post.close(); + } else + { + std::cout << "Error writing output file for _dynamic_array_synapses__synaptic_post." 
<< endl; + } + ofstream outfile__dynamic_array_synapses__synaptic_pre; + outfile__dynamic_array_synapses__synaptic_pre.open("results/_dynamic_array_synapses__synaptic_pre_2137649452266235309", ios::binary | ios::out); + if(outfile__dynamic_array_synapses__synaptic_pre.is_open()) + { + outfile__dynamic_array_synapses__synaptic_pre.write(reinterpret_cast(thrust::raw_pointer_cast(&_dynamic_array_synapses__synaptic_pre[0])), _dynamic_array_synapses__synaptic_pre.size()*sizeof(int32_t)); + outfile__dynamic_array_synapses__synaptic_pre.close(); + } else + { + std::cout << "Error writing output file for _dynamic_array_synapses__synaptic_pre." << endl; + } + ofstream outfile__dynamic_array_synapses_delay; + outfile__dynamic_array_synapses_delay.open("results/_dynamic_array_synapses_delay_6546873993127671381", ios::binary | ios::out); + if(outfile__dynamic_array_synapses_delay.is_open()) + { + outfile__dynamic_array_synapses_delay.write(reinterpret_cast(thrust::raw_pointer_cast(&_dynamic_array_synapses_delay[0])), _dynamic_array_synapses_delay.size()*sizeof(double)); + outfile__dynamic_array_synapses_delay.close(); + } else + { + std::cout << "Error writing output file for _dynamic_array_synapses_delay." << endl; + } + _dynamic_array_synapses_N_incoming = dev_dynamic_array_synapses_N_incoming; + ofstream outfile__dynamic_array_synapses_N_incoming; + outfile__dynamic_array_synapses_N_incoming.open("results/_dynamic_array_synapses_N_incoming_2854242842403593343", ios::binary | ios::out); + if(outfile__dynamic_array_synapses_N_incoming.is_open()) + { + outfile__dynamic_array_synapses_N_incoming.write(reinterpret_cast(thrust::raw_pointer_cast(&_dynamic_array_synapses_N_incoming[0])), _dynamic_array_synapses_N_incoming.size()*sizeof(int32_t)); + outfile__dynamic_array_synapses_N_incoming.close(); + } else + { + std::cout << "Error writing output file for _dynamic_array_synapses_N_incoming." << endl; + } + _dynamic_array_synapses_N_outgoing = dev_dynamic_array_synapses_N_outgoing; + ofstream outfile__dynamic_array_synapses_N_outgoing; + outfile__dynamic_array_synapses_N_outgoing.open("results/_dynamic_array_synapses_N_outgoing_-6705529799763348580", ios::binary | ios::out); + if(outfile__dynamic_array_synapses_N_outgoing.is_open()) + { + outfile__dynamic_array_synapses_N_outgoing.write(reinterpret_cast(thrust::raw_pointer_cast(&_dynamic_array_synapses_N_outgoing[0])), _dynamic_array_synapses_N_outgoing.size()*sizeof(int32_t)); + outfile__dynamic_array_synapses_N_outgoing.close(); + } else + { + std::cout << "Error writing output file for _dynamic_array_synapses_N_outgoing." << endl; + } + _dynamic_array_synapses_weight = dev_dynamic_array_synapses_weight; + ofstream outfile__dynamic_array_synapses_weight; + outfile__dynamic_array_synapses_weight.open("results/_dynamic_array_synapses_weight_-4970804317082307398", ios::binary | ios::out); + if(outfile__dynamic_array_synapses_weight.is_open()) + { + outfile__dynamic_array_synapses_weight.write(reinterpret_cast(thrust::raw_pointer_cast(&_dynamic_array_synapses_weight[0])), _dynamic_array_synapses_weight.size()*sizeof(double)); + outfile__dynamic_array_synapses_weight.close(); + } else + { + std::cout << "Error writing output file for _dynamic_array_synapses_weight." 
<< endl; + } + + + // Write last run info to disk + ofstream outfile_last_run_info; + outfile_last_run_info.open("results/last_run_info.txt", ios::out); + if(outfile_last_run_info.is_open()) + { + outfile_last_run_info << (Network::_last_run_time) << " " << (Network::_last_run_completed_fraction) << std::endl; + outfile_last_run_info.close(); + } else + { + std::cout << "Error writing last run info to file." << std::endl; + } +} + +__global__ void synapses_pre_destroy() +{ + using namespace brian; + + synapses_pre.destroy(); +} +__global__ void synapses_1_post_destroy() +{ + using namespace brian; + + synapses_1_post.destroy(); +} +__global__ void synapses_1_pre_destroy() +{ + using namespace brian; + + synapses_1_pre.destroy(); +} +__global__ void synapses_2_pre_destroy() +{ + using namespace brian; + + synapses_2_pre.destroy(); +} + +void _dealloc_arrays() +{ + using namespace brian; + + + CUDA_SAFE_CALL( + curandDestroyGenerator(curand_generator) + ); + + synapses_pre_destroy<<<1,1>>>(); + CUDA_CHECK_ERROR("synapses_pre_destroy"); + synapses_1_post_destroy<<<1,1>>>(); + CUDA_CHECK_ERROR("synapses_1_post_destroy"); + synapses_1_pre_destroy<<<1,1>>>(); + CUDA_CHECK_ERROR("synapses_1_pre_destroy"); + synapses_2_pre_destroy<<<1,1>>>(); + CUDA_CHECK_ERROR("synapses_2_pre_destroy"); + + dev_dynamic_array_spikegeneratorgroup__timebins.clear(); + thrust::device_vector().swap(dev_dynamic_array_spikegeneratorgroup__timebins); + _dynamic_array_spikegeneratorgroup__timebins.clear(); + thrust::host_vector().swap(_dynamic_array_spikegeneratorgroup__timebins); + dev_dynamic_array_spikegeneratorgroup_neuron_index.clear(); + thrust::device_vector().swap(dev_dynamic_array_spikegeneratorgroup_neuron_index); + _dynamic_array_spikegeneratorgroup_neuron_index.clear(); + thrust::host_vector().swap(_dynamic_array_spikegeneratorgroup_neuron_index); + dev_dynamic_array_spikegeneratorgroup_spike_number.clear(); + thrust::device_vector().swap(dev_dynamic_array_spikegeneratorgroup_spike_number); + _dynamic_array_spikegeneratorgroup_spike_number.clear(); + thrust::host_vector().swap(_dynamic_array_spikegeneratorgroup_spike_number); + dev_dynamic_array_spikegeneratorgroup_spike_time.clear(); + thrust::device_vector().swap(dev_dynamic_array_spikegeneratorgroup_spike_time); + _dynamic_array_spikegeneratorgroup_spike_time.clear(); + thrust::host_vector().swap(_dynamic_array_spikegeneratorgroup_spike_time); + dev_dynamic_array_spikemonitor_1_i.clear(); + thrust::device_vector().swap(dev_dynamic_array_spikemonitor_1_i); + _dynamic_array_spikemonitor_1_i.clear(); + thrust::host_vector().swap(_dynamic_array_spikemonitor_1_i); + dev_dynamic_array_spikemonitor_1_t.clear(); + thrust::device_vector().swap(dev_dynamic_array_spikemonitor_1_t); + _dynamic_array_spikemonitor_1_t.clear(); + thrust::host_vector().swap(_dynamic_array_spikemonitor_1_t); + dev_dynamic_array_spikemonitor_2_i.clear(); + thrust::device_vector().swap(dev_dynamic_array_spikemonitor_2_i); + _dynamic_array_spikemonitor_2_i.clear(); + thrust::host_vector().swap(_dynamic_array_spikemonitor_2_i); + dev_dynamic_array_spikemonitor_2_t.clear(); + thrust::device_vector().swap(dev_dynamic_array_spikemonitor_2_t); + _dynamic_array_spikemonitor_2_t.clear(); + thrust::host_vector().swap(_dynamic_array_spikemonitor_2_t); + dev_dynamic_array_spikemonitor_i.clear(); + thrust::device_vector().swap(dev_dynamic_array_spikemonitor_i); + _dynamic_array_spikemonitor_i.clear(); + thrust::host_vector().swap(_dynamic_array_spikemonitor_i); + dev_dynamic_array_spikemonitor_t.clear(); 
+ thrust::device_vector<double>().swap(dev_dynamic_array_spikemonitor_t); + _dynamic_array_spikemonitor_t.clear(); + thrust::host_vector<double>().swap(_dynamic_array_spikemonitor_t); + dev_dynamic_array_synapses_1__synaptic_post.clear(); + thrust::device_vector<int32_t>().swap(dev_dynamic_array_synapses_1__synaptic_post); + _dynamic_array_synapses_1__synaptic_post.clear(); + thrust::host_vector<int32_t>().swap(_dynamic_array_synapses_1__synaptic_post); + dev_dynamic_array_synapses_1__synaptic_pre.clear(); + thrust::device_vector<int32_t>().swap(dev_dynamic_array_synapses_1__synaptic_pre); + _dynamic_array_synapses_1__synaptic_pre.clear(); + thrust::host_vector<int32_t>().swap(_dynamic_array_synapses_1__synaptic_pre); + dev_dynamic_array_synapses_1_Apost.clear(); + thrust::device_vector<double>().swap(dev_dynamic_array_synapses_1_Apost); + _dynamic_array_synapses_1_Apost.clear(); + thrust::host_vector<double>().swap(_dynamic_array_synapses_1_Apost); + dev_dynamic_array_synapses_1_Apre.clear(); + thrust::device_vector<double>().swap(dev_dynamic_array_synapses_1_Apre); + _dynamic_array_synapses_1_Apre.clear(); + thrust::host_vector<double>().swap(_dynamic_array_synapses_1_Apre); + dev_dynamic_array_synapses_1_delay.clear(); + thrust::device_vector<double>().swap(dev_dynamic_array_synapses_1_delay); + _dynamic_array_synapses_1_delay.clear(); + thrust::host_vector<double>().swap(_dynamic_array_synapses_1_delay); + dev_dynamic_array_synapses_1_delay_1.clear(); + thrust::device_vector<double>().swap(dev_dynamic_array_synapses_1_delay_1); + _dynamic_array_synapses_1_delay_1.clear(); + thrust::host_vector<double>().swap(_dynamic_array_synapses_1_delay_1); + dev_dynamic_array_synapses_1_g_raw.clear(); + thrust::device_vector<double>().swap(dev_dynamic_array_synapses_1_g_raw); + _dynamic_array_synapses_1_g_raw.clear(); + thrust::host_vector<double>().swap(_dynamic_array_synapses_1_g_raw); + dev_dynamic_array_synapses_1_lastupdate.clear(); + thrust::device_vector<double>().swap(dev_dynamic_array_synapses_1_lastupdate); + _dynamic_array_synapses_1_lastupdate.clear(); + thrust::host_vector<double>().swap(_dynamic_array_synapses_1_lastupdate); + dev_dynamic_array_synapses_1_N_incoming.clear(); + thrust::device_vector<int32_t>().swap(dev_dynamic_array_synapses_1_N_incoming); + _dynamic_array_synapses_1_N_incoming.clear(); + thrust::host_vector<int32_t>().swap(_dynamic_array_synapses_1_N_incoming); + dev_dynamic_array_synapses_1_N_outgoing.clear(); + thrust::device_vector<int32_t>().swap(dev_dynamic_array_synapses_1_N_outgoing); + _dynamic_array_synapses_1_N_outgoing.clear(); + thrust::host_vector<int32_t>().swap(_dynamic_array_synapses_1_N_outgoing); + dev_dynamic_array_synapses_2__synaptic_post.clear(); + thrust::device_vector<int32_t>().swap(dev_dynamic_array_synapses_2__synaptic_post); + _dynamic_array_synapses_2__synaptic_post.clear(); + thrust::host_vector<int32_t>().swap(_dynamic_array_synapses_2__synaptic_post); + dev_dynamic_array_synapses_2__synaptic_pre.clear(); + thrust::device_vector<int32_t>().swap(dev_dynamic_array_synapses_2__synaptic_pre); + _dynamic_array_synapses_2__synaptic_pre.clear(); + thrust::host_vector<int32_t>().swap(_dynamic_array_synapses_2__synaptic_pre); + dev_dynamic_array_synapses_2_delay.clear(); + thrust::device_vector<double>().swap(dev_dynamic_array_synapses_2_delay); + _dynamic_array_synapses_2_delay.clear(); + thrust::host_vector<double>().swap(_dynamic_array_synapses_2_delay); + dev_dynamic_array_synapses_2_N_incoming.clear(); + thrust::device_vector<int32_t>().swap(dev_dynamic_array_synapses_2_N_incoming); + _dynamic_array_synapses_2_N_incoming.clear(); + thrust::host_vector<int32_t>().swap(_dynamic_array_synapses_2_N_incoming); + dev_dynamic_array_synapses_2_N_outgoing.clear(); + 
thrust::device_vector().swap(dev_dynamic_array_synapses_2_N_outgoing); + _dynamic_array_synapses_2_N_outgoing.clear(); + thrust::host_vector().swap(_dynamic_array_synapses_2_N_outgoing); + dev_dynamic_array_synapses__synaptic_post.clear(); + thrust::device_vector().swap(dev_dynamic_array_synapses__synaptic_post); + _dynamic_array_synapses__synaptic_post.clear(); + thrust::host_vector().swap(_dynamic_array_synapses__synaptic_post); + dev_dynamic_array_synapses__synaptic_pre.clear(); + thrust::device_vector().swap(dev_dynamic_array_synapses__synaptic_pre); + _dynamic_array_synapses__synaptic_pre.clear(); + thrust::host_vector().swap(_dynamic_array_synapses__synaptic_pre); + dev_dynamic_array_synapses_delay.clear(); + thrust::device_vector().swap(dev_dynamic_array_synapses_delay); + _dynamic_array_synapses_delay.clear(); + thrust::host_vector().swap(_dynamic_array_synapses_delay); + dev_dynamic_array_synapses_N_incoming.clear(); + thrust::device_vector().swap(dev_dynamic_array_synapses_N_incoming); + _dynamic_array_synapses_N_incoming.clear(); + thrust::host_vector().swap(_dynamic_array_synapses_N_incoming); + dev_dynamic_array_synapses_N_outgoing.clear(); + thrust::device_vector().swap(dev_dynamic_array_synapses_N_outgoing); + _dynamic_array_synapses_N_outgoing.clear(); + thrust::host_vector().swap(_dynamic_array_synapses_N_outgoing); + dev_dynamic_array_synapses_weight.clear(); + thrust::device_vector().swap(dev_dynamic_array_synapses_weight); + _dynamic_array_synapses_weight.clear(); + thrust::host_vector().swap(_dynamic_array_synapses_weight); + + if(_array_defaultclock_dt!=0) + { + delete [] _array_defaultclock_dt; + _array_defaultclock_dt = 0; + } + if(dev_array_defaultclock_dt!=0) + { + CUDA_SAFE_CALL( + cudaFree(dev_array_defaultclock_dt) + ); + dev_array_defaultclock_dt = 0; + } + if(_array_defaultclock_t!=0) + { + delete [] _array_defaultclock_t; + _array_defaultclock_t = 0; + } + if(dev_array_defaultclock_t!=0) + { + CUDA_SAFE_CALL( + cudaFree(dev_array_defaultclock_t) + ); + dev_array_defaultclock_t = 0; + } + if(_array_defaultclock_timestep!=0) + { + delete [] _array_defaultclock_timestep; + _array_defaultclock_timestep = 0; + } + if(dev_array_defaultclock_timestep!=0) + { + CUDA_SAFE_CALL( + cudaFree(dev_array_defaultclock_timestep) + ); + dev_array_defaultclock_timestep = 0; + } + if(_array_neurongroup_1_g_eKC_eKC!=0) + { + delete [] _array_neurongroup_1_g_eKC_eKC; + _array_neurongroup_1_g_eKC_eKC = 0; + } + if(dev_array_neurongroup_1_g_eKC_eKC!=0) + { + CUDA_SAFE_CALL( + cudaFree(dev_array_neurongroup_1_g_eKC_eKC) + ); + dev_array_neurongroup_1_g_eKC_eKC = 0; + } + if(_array_neurongroup_1_g_iKC_eKC!=0) + { + delete [] _array_neurongroup_1_g_iKC_eKC; + _array_neurongroup_1_g_iKC_eKC = 0; + } + if(dev_array_neurongroup_1_g_iKC_eKC!=0) + { + CUDA_SAFE_CALL( + cudaFree(dev_array_neurongroup_1_g_iKC_eKC) + ); + dev_array_neurongroup_1_g_iKC_eKC = 0; + } + if(_array_neurongroup_1_h!=0) + { + delete [] _array_neurongroup_1_h; + _array_neurongroup_1_h = 0; + } + if(dev_array_neurongroup_1_h!=0) + { + CUDA_SAFE_CALL( + cudaFree(dev_array_neurongroup_1_h) + ); + dev_array_neurongroup_1_h = 0; + } + if(_array_neurongroup_1_i!=0) + { + delete [] _array_neurongroup_1_i; + _array_neurongroup_1_i = 0; + } + if(dev_array_neurongroup_1_i!=0) + { + CUDA_SAFE_CALL( + cudaFree(dev_array_neurongroup_1_i) + ); + dev_array_neurongroup_1_i = 0; + } + if(_array_neurongroup_1_lastspike!=0) + { + delete [] _array_neurongroup_1_lastspike; + _array_neurongroup_1_lastspike = 0; + } + 
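+ // The remaining statically sized buffers are released with the same guarded
+ // pattern: delete[] the host copy, cudaFree the device copy, and reset both
+ // pointers to 0 so a repeated call to _dealloc_arrays() is safe.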
if(dev_array_neurongroup_1_lastspike!=0) + { + CUDA_SAFE_CALL( + cudaFree(dev_array_neurongroup_1_lastspike) + ); + dev_array_neurongroup_1_lastspike = 0; + } + if(_array_neurongroup_1_m!=0) + { + delete [] _array_neurongroup_1_m; + _array_neurongroup_1_m = 0; + } + if(dev_array_neurongroup_1_m!=0) + { + CUDA_SAFE_CALL( + cudaFree(dev_array_neurongroup_1_m) + ); + dev_array_neurongroup_1_m = 0; + } + if(_array_neurongroup_1_n!=0) + { + delete [] _array_neurongroup_1_n; + _array_neurongroup_1_n = 0; + } + if(dev_array_neurongroup_1_n!=0) + { + CUDA_SAFE_CALL( + cudaFree(dev_array_neurongroup_1_n) + ); + dev_array_neurongroup_1_n = 0; + } + if(_array_neurongroup_1_not_refractory!=0) + { + delete [] _array_neurongroup_1_not_refractory; + _array_neurongroup_1_not_refractory = 0; + } + if(dev_array_neurongroup_1_not_refractory!=0) + { + CUDA_SAFE_CALL( + cudaFree(dev_array_neurongroup_1_not_refractory) + ); + dev_array_neurongroup_1_not_refractory = 0; + } + if(_array_neurongroup_1_V!=0) + { + delete [] _array_neurongroup_1_V; + _array_neurongroup_1_V = 0; + } + if(dev_array_neurongroup_1_V!=0) + { + CUDA_SAFE_CALL( + cudaFree(dev_array_neurongroup_1_V) + ); + dev_array_neurongroup_1_V = 0; + } + if(_array_neurongroup_g_PN_iKC!=0) + { + delete [] _array_neurongroup_g_PN_iKC; + _array_neurongroup_g_PN_iKC = 0; + } + if(dev_array_neurongroup_g_PN_iKC!=0) + { + CUDA_SAFE_CALL( + cudaFree(dev_array_neurongroup_g_PN_iKC) + ); + dev_array_neurongroup_g_PN_iKC = 0; + } + if(_array_neurongroup_h!=0) + { + delete [] _array_neurongroup_h; + _array_neurongroup_h = 0; + } + if(dev_array_neurongroup_h!=0) + { + CUDA_SAFE_CALL( + cudaFree(dev_array_neurongroup_h) + ); + dev_array_neurongroup_h = 0; + } + if(_array_neurongroup_i!=0) + { + delete [] _array_neurongroup_i; + _array_neurongroup_i = 0; + } + if(dev_array_neurongroup_i!=0) + { + CUDA_SAFE_CALL( + cudaFree(dev_array_neurongroup_i) + ); + dev_array_neurongroup_i = 0; + } + if(_array_neurongroup_lastspike!=0) + { + delete [] _array_neurongroup_lastspike; + _array_neurongroup_lastspike = 0; + } + if(dev_array_neurongroup_lastspike!=0) + { + CUDA_SAFE_CALL( + cudaFree(dev_array_neurongroup_lastspike) + ); + dev_array_neurongroup_lastspike = 0; + } + if(_array_neurongroup_m!=0) + { + delete [] _array_neurongroup_m; + _array_neurongroup_m = 0; + } + if(dev_array_neurongroup_m!=0) + { + CUDA_SAFE_CALL( + cudaFree(dev_array_neurongroup_m) + ); + dev_array_neurongroup_m = 0; + } + if(_array_neurongroup_n!=0) + { + delete [] _array_neurongroup_n; + _array_neurongroup_n = 0; + } + if(dev_array_neurongroup_n!=0) + { + CUDA_SAFE_CALL( + cudaFree(dev_array_neurongroup_n) + ); + dev_array_neurongroup_n = 0; + } + if(_array_neurongroup_not_refractory!=0) + { + delete [] _array_neurongroup_not_refractory; + _array_neurongroup_not_refractory = 0; + } + if(dev_array_neurongroup_not_refractory!=0) + { + CUDA_SAFE_CALL( + cudaFree(dev_array_neurongroup_not_refractory) + ); + dev_array_neurongroup_not_refractory = 0; + } + if(_array_neurongroup_V!=0) + { + delete [] _array_neurongroup_V; + _array_neurongroup_V = 0; + } + if(dev_array_neurongroup_V!=0) + { + CUDA_SAFE_CALL( + cudaFree(dev_array_neurongroup_V) + ); + dev_array_neurongroup_V = 0; + } + if(_array_spikegeneratorgroup__lastindex!=0) + { + delete [] _array_spikegeneratorgroup__lastindex; + _array_spikegeneratorgroup__lastindex = 0; + } + if(dev_array_spikegeneratorgroup__lastindex!=0) + { + CUDA_SAFE_CALL( + cudaFree(dev_array_spikegeneratorgroup__lastindex) + ); + dev_array_spikegeneratorgroup__lastindex = 0; 
+ } + if(_array_spikegeneratorgroup__period_bins!=0) + { + delete [] _array_spikegeneratorgroup__period_bins; + _array_spikegeneratorgroup__period_bins = 0; + } + if(dev_array_spikegeneratorgroup__period_bins!=0) + { + CUDA_SAFE_CALL( + cudaFree(dev_array_spikegeneratorgroup__period_bins) + ); + dev_array_spikegeneratorgroup__period_bins = 0; + } + if(_array_spikegeneratorgroup_i!=0) + { + delete [] _array_spikegeneratorgroup_i; + _array_spikegeneratorgroup_i = 0; + } + if(dev_array_spikegeneratorgroup_i!=0) + { + CUDA_SAFE_CALL( + cudaFree(dev_array_spikegeneratorgroup_i) + ); + dev_array_spikegeneratorgroup_i = 0; + } + if(_array_spikegeneratorgroup_period!=0) + { + delete [] _array_spikegeneratorgroup_period; + _array_spikegeneratorgroup_period = 0; + } + if(dev_array_spikegeneratorgroup_period!=0) + { + CUDA_SAFE_CALL( + cudaFree(dev_array_spikegeneratorgroup_period) + ); + dev_array_spikegeneratorgroup_period = 0; + } + if(_array_spikemonitor_1__source_idx!=0) + { + delete [] _array_spikemonitor_1__source_idx; + _array_spikemonitor_1__source_idx = 0; + } + if(dev_array_spikemonitor_1__source_idx!=0) + { + CUDA_SAFE_CALL( + cudaFree(dev_array_spikemonitor_1__source_idx) + ); + dev_array_spikemonitor_1__source_idx = 0; + } + if(_array_spikemonitor_1_count!=0) + { + delete [] _array_spikemonitor_1_count; + _array_spikemonitor_1_count = 0; + } + if(dev_array_spikemonitor_1_count!=0) + { + CUDA_SAFE_CALL( + cudaFree(dev_array_spikemonitor_1_count) + ); + dev_array_spikemonitor_1_count = 0; + } + if(_array_spikemonitor_1_N!=0) + { + delete [] _array_spikemonitor_1_N; + _array_spikemonitor_1_N = 0; + } + if(dev_array_spikemonitor_1_N!=0) + { + CUDA_SAFE_CALL( + cudaFree(dev_array_spikemonitor_1_N) + ); + dev_array_spikemonitor_1_N = 0; + } + if(_array_spikemonitor_2__source_idx!=0) + { + delete [] _array_spikemonitor_2__source_idx; + _array_spikemonitor_2__source_idx = 0; + } + if(dev_array_spikemonitor_2__source_idx!=0) + { + CUDA_SAFE_CALL( + cudaFree(dev_array_spikemonitor_2__source_idx) + ); + dev_array_spikemonitor_2__source_idx = 0; + } + if(_array_spikemonitor_2_count!=0) + { + delete [] _array_spikemonitor_2_count; + _array_spikemonitor_2_count = 0; + } + if(dev_array_spikemonitor_2_count!=0) + { + CUDA_SAFE_CALL( + cudaFree(dev_array_spikemonitor_2_count) + ); + dev_array_spikemonitor_2_count = 0; + } + if(_array_spikemonitor_2_N!=0) + { + delete [] _array_spikemonitor_2_N; + _array_spikemonitor_2_N = 0; + } + if(dev_array_spikemonitor_2_N!=0) + { + CUDA_SAFE_CALL( + cudaFree(dev_array_spikemonitor_2_N) + ); + dev_array_spikemonitor_2_N = 0; + } + if(_array_spikemonitor__source_idx!=0) + { + delete [] _array_spikemonitor__source_idx; + _array_spikemonitor__source_idx = 0; + } + if(dev_array_spikemonitor__source_idx!=0) + { + CUDA_SAFE_CALL( + cudaFree(dev_array_spikemonitor__source_idx) + ); + dev_array_spikemonitor__source_idx = 0; + } + if(_array_spikemonitor_count!=0) + { + delete [] _array_spikemonitor_count; + _array_spikemonitor_count = 0; + } + if(dev_array_spikemonitor_count!=0) + { + CUDA_SAFE_CALL( + cudaFree(dev_array_spikemonitor_count) + ); + dev_array_spikemonitor_count = 0; + } + if(_array_spikemonitor_N!=0) + { + delete [] _array_spikemonitor_N; + _array_spikemonitor_N = 0; + } + if(dev_array_spikemonitor_N!=0) + { + CUDA_SAFE_CALL( + cudaFree(dev_array_spikemonitor_N) + ); + dev_array_spikemonitor_N = 0; + } + if(_array_synapses_1_N!=0) + { + delete [] _array_synapses_1_N; + _array_synapses_1_N = 0; + } + if(dev_array_synapses_1_N!=0) + { + CUDA_SAFE_CALL( + 
cudaFree(dev_array_synapses_1_N) + ); + dev_array_synapses_1_N = 0; + } + if(_array_synapses_2_N!=0) + { + delete [] _array_synapses_2_N; + _array_synapses_2_N = 0; + } + if(dev_array_synapses_2_N!=0) + { + CUDA_SAFE_CALL( + cudaFree(dev_array_synapses_2_N) + ); + dev_array_synapses_2_N = 0; + } + if(_array_synapses_N!=0) + { + delete [] _array_synapses_N; + _array_synapses_N = 0; + } + if(dev_array_synapses_N!=0) + { + CUDA_SAFE_CALL( + cudaFree(dev_array_synapses_N) + ); + dev_array_synapses_N = 0; + } + + + // static arrays + if(_static_array__dynamic_array_spikegeneratorgroup__timebins!=0) + { + delete [] _static_array__dynamic_array_spikegeneratorgroup__timebins; + _static_array__dynamic_array_spikegeneratorgroup__timebins = 0; + } + if(_static_array__dynamic_array_spikegeneratorgroup_neuron_index!=0) + { + delete [] _static_array__dynamic_array_spikegeneratorgroup_neuron_index; + _static_array__dynamic_array_spikegeneratorgroup_neuron_index = 0; + } + if(_static_array__dynamic_array_spikegeneratorgroup_spike_number!=0) + { + delete [] _static_array__dynamic_array_spikegeneratorgroup_spike_number; + _static_array__dynamic_array_spikegeneratorgroup_spike_number = 0; + } + if(_static_array__dynamic_array_spikegeneratorgroup_spike_time!=0) + { + delete [] _static_array__dynamic_array_spikegeneratorgroup_spike_time; + _static_array__dynamic_array_spikegeneratorgroup_spike_time = 0; + } + +} + diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/objects.h b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/objects.h new file mode 100644 index 00000000..de71235b --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/objects.h @@ -0,0 +1,397 @@ +#include +// typedefs need to be outside the include guards to +// be visible to all files including objects.h +typedef double randomNumber_t; // random number type + +#ifndef _BRIAN_OBJECTS_H +#define _BRIAN_OBJECTS_H + +#include +#include +#include "synapses_classes.h" +#include "brianlib/clocks.h" +#include "network.h" +#include "rand.h" + +#include +#include +#include + +namespace brian { + +extern size_t used_device_memory; + +//////////////// clocks /////////////////// +extern Clock defaultclock; + +//////////////// networks ///////////////// +extern Network magicnetwork; + +//////////////// dynamic arrays /////////// +extern thrust::host_vector _dynamic_array_spikegeneratorgroup__timebins; +extern thrust::device_vector dev_dynamic_array_spikegeneratorgroup__timebins; +extern thrust::host_vector _dynamic_array_spikegeneratorgroup_neuron_index; +extern thrust::device_vector dev_dynamic_array_spikegeneratorgroup_neuron_index; +extern thrust::host_vector _dynamic_array_spikegeneratorgroup_spike_number; +extern thrust::device_vector dev_dynamic_array_spikegeneratorgroup_spike_number; +extern thrust::host_vector _dynamic_array_spikegeneratorgroup_spike_time; +extern thrust::device_vector dev_dynamic_array_spikegeneratorgroup_spike_time; +extern thrust::host_vector _dynamic_array_spikemonitor_1_i; +extern thrust::device_vector dev_dynamic_array_spikemonitor_1_i; +extern thrust::host_vector _dynamic_array_spikemonitor_1_t; +extern thrust::device_vector dev_dynamic_array_spikemonitor_1_t; +extern thrust::host_vector _dynamic_array_spikemonitor_2_i; +extern thrust::device_vector dev_dynamic_array_spikemonitor_2_i; +extern thrust::host_vector _dynamic_array_spikemonitor_2_t; +extern thrust::device_vector dev_dynamic_array_spikemonitor_2_t; +extern 
thrust::host_vector _dynamic_array_spikemonitor_i; +extern thrust::device_vector dev_dynamic_array_spikemonitor_i; +extern thrust::host_vector _dynamic_array_spikemonitor_t; +extern thrust::device_vector dev_dynamic_array_spikemonitor_t; +extern thrust::host_vector _dynamic_array_synapses_1__synaptic_post; +extern thrust::device_vector dev_dynamic_array_synapses_1__synaptic_post; +extern thrust::host_vector _dynamic_array_synapses_1__synaptic_pre; +extern thrust::device_vector dev_dynamic_array_synapses_1__synaptic_pre; +extern thrust::host_vector _dynamic_array_synapses_1_Apost; +extern thrust::device_vector dev_dynamic_array_synapses_1_Apost; +extern thrust::host_vector _dynamic_array_synapses_1_Apre; +extern thrust::device_vector dev_dynamic_array_synapses_1_Apre; +extern thrust::host_vector _dynamic_array_synapses_1_delay; +extern thrust::device_vector dev_dynamic_array_synapses_1_delay; +extern thrust::host_vector _dynamic_array_synapses_1_delay_1; +extern thrust::device_vector dev_dynamic_array_synapses_1_delay_1; +extern thrust::host_vector _dynamic_array_synapses_1_g_raw; +extern thrust::device_vector dev_dynamic_array_synapses_1_g_raw; +extern thrust::host_vector _dynamic_array_synapses_1_lastupdate; +extern thrust::device_vector dev_dynamic_array_synapses_1_lastupdate; +extern thrust::host_vector _dynamic_array_synapses_1_N_incoming; +extern thrust::device_vector dev_dynamic_array_synapses_1_N_incoming; +extern thrust::host_vector _dynamic_array_synapses_1_N_outgoing; +extern thrust::device_vector dev_dynamic_array_synapses_1_N_outgoing; +extern thrust::host_vector _dynamic_array_synapses_2__synaptic_post; +extern thrust::device_vector dev_dynamic_array_synapses_2__synaptic_post; +extern thrust::host_vector _dynamic_array_synapses_2__synaptic_pre; +extern thrust::device_vector dev_dynamic_array_synapses_2__synaptic_pre; +extern thrust::host_vector _dynamic_array_synapses_2_delay; +extern thrust::device_vector dev_dynamic_array_synapses_2_delay; +extern thrust::host_vector _dynamic_array_synapses_2_N_incoming; +extern thrust::device_vector dev_dynamic_array_synapses_2_N_incoming; +extern thrust::host_vector _dynamic_array_synapses_2_N_outgoing; +extern thrust::device_vector dev_dynamic_array_synapses_2_N_outgoing; +extern thrust::host_vector _dynamic_array_synapses__synaptic_post; +extern thrust::device_vector dev_dynamic_array_synapses__synaptic_post; +extern thrust::host_vector _dynamic_array_synapses__synaptic_pre; +extern thrust::device_vector dev_dynamic_array_synapses__synaptic_pre; +extern thrust::host_vector _dynamic_array_synapses_delay; +extern thrust::device_vector dev_dynamic_array_synapses_delay; +extern thrust::host_vector _dynamic_array_synapses_N_incoming; +extern thrust::device_vector dev_dynamic_array_synapses_N_incoming; +extern thrust::host_vector _dynamic_array_synapses_N_outgoing; +extern thrust::device_vector dev_dynamic_array_synapses_N_outgoing; +extern thrust::host_vector _dynamic_array_synapses_weight; +extern thrust::device_vector dev_dynamic_array_synapses_weight; + +//////////////// arrays /////////////////// +extern double * _array_defaultclock_dt; +extern double * dev_array_defaultclock_dt; +extern __device__ double *d_array_defaultclock_dt; +extern const int _num__array_defaultclock_dt; +extern double * _array_defaultclock_t; +extern double * dev_array_defaultclock_t; +extern __device__ double *d_array_defaultclock_t; +extern const int _num__array_defaultclock_t; +extern int64_t * _array_defaultclock_timestep; +extern int64_t * 
dev_array_defaultclock_timestep; +extern __device__ int64_t *d_array_defaultclock_timestep; +extern const int _num__array_defaultclock_timestep; +extern double * _array_neurongroup_1_g_eKC_eKC; +extern double * dev_array_neurongroup_1_g_eKC_eKC; +extern __device__ double *d_array_neurongroup_1_g_eKC_eKC; +extern const int _num__array_neurongroup_1_g_eKC_eKC; +extern double * _array_neurongroup_1_g_iKC_eKC; +extern double * dev_array_neurongroup_1_g_iKC_eKC; +extern __device__ double *d_array_neurongroup_1_g_iKC_eKC; +extern const int _num__array_neurongroup_1_g_iKC_eKC; +extern double * _array_neurongroup_1_h; +extern double * dev_array_neurongroup_1_h; +extern __device__ double *d_array_neurongroup_1_h; +extern const int _num__array_neurongroup_1_h; +extern int32_t * _array_neurongroup_1_i; +extern int32_t * dev_array_neurongroup_1_i; +extern __device__ int32_t *d_array_neurongroup_1_i; +extern const int _num__array_neurongroup_1_i; +extern double * _array_neurongroup_1_lastspike; +extern double * dev_array_neurongroup_1_lastspike; +extern __device__ double *d_array_neurongroup_1_lastspike; +extern const int _num__array_neurongroup_1_lastspike; +extern double * _array_neurongroup_1_m; +extern double * dev_array_neurongroup_1_m; +extern __device__ double *d_array_neurongroup_1_m; +extern const int _num__array_neurongroup_1_m; +extern double * _array_neurongroup_1_n; +extern double * dev_array_neurongroup_1_n; +extern __device__ double *d_array_neurongroup_1_n; +extern const int _num__array_neurongroup_1_n; +extern char * _array_neurongroup_1_not_refractory; +extern char * dev_array_neurongroup_1_not_refractory; +extern __device__ char *d_array_neurongroup_1_not_refractory; +extern const int _num__array_neurongroup_1_not_refractory; +extern double * _array_neurongroup_1_V; +extern double * dev_array_neurongroup_1_V; +extern __device__ double *d_array_neurongroup_1_V; +extern const int _num__array_neurongroup_1_V; +extern double * _array_neurongroup_g_PN_iKC; +extern double * dev_array_neurongroup_g_PN_iKC; +extern __device__ double *d_array_neurongroup_g_PN_iKC; +extern const int _num__array_neurongroup_g_PN_iKC; +extern double * _array_neurongroup_h; +extern double * dev_array_neurongroup_h; +extern __device__ double *d_array_neurongroup_h; +extern const int _num__array_neurongroup_h; +extern int32_t * _array_neurongroup_i; +extern int32_t * dev_array_neurongroup_i; +extern __device__ int32_t *d_array_neurongroup_i; +extern const int _num__array_neurongroup_i; +extern double * _array_neurongroup_lastspike; +extern double * dev_array_neurongroup_lastspike; +extern __device__ double *d_array_neurongroup_lastspike; +extern const int _num__array_neurongroup_lastspike; +extern double * _array_neurongroup_m; +extern double * dev_array_neurongroup_m; +extern __device__ double *d_array_neurongroup_m; +extern const int _num__array_neurongroup_m; +extern double * _array_neurongroup_n; +extern double * dev_array_neurongroup_n; +extern __device__ double *d_array_neurongroup_n; +extern const int _num__array_neurongroup_n; +extern char * _array_neurongroup_not_refractory; +extern char * dev_array_neurongroup_not_refractory; +extern __device__ char *d_array_neurongroup_not_refractory; +extern const int _num__array_neurongroup_not_refractory; +extern double * _array_neurongroup_V; +extern double * dev_array_neurongroup_V; +extern __device__ double *d_array_neurongroup_V; +extern const int _num__array_neurongroup_V; +extern int32_t * _array_spikegeneratorgroup__lastindex; +extern int32_t * 
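Note on the declarations above: each array comes as a triple — a host copy (_array_*), a device pointer held on the host (dev_array_*), and a __device__ pointer symbol (d_array_*) that kernels can dereference without receiving the pointer as an argument. A hedged sketch of how such a triple is typically wired up (the example names are placeholders, not taken from the generated objects.cu):

#include <cuda_runtime.h>

double *_array_example;              // host copy
double *dev_array_example;           // device allocation, host-side handle
__device__ double *d_array_example;  // device-side global aliasing the same buffer

void init_example_array(int n)
{
    _array_example = new double[n]();                        // zero-initialised host copy
    cudaMalloc(&dev_array_example, n * sizeof(double));      // device allocation
    cudaMemcpy(dev_array_example, _array_example,
               n * sizeof(double), cudaMemcpyHostToDevice);  // mirror the host contents
    // Publish the device pointer so kernels can use d_array_example directly.
    cudaMemcpyToSymbol(d_array_example, &dev_array_example, sizeof(double *));
}

__global__ void scale_example(int n, double factor)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        d_array_example[i] *= factor;  // no pointer argument needed
}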
dev_array_spikegeneratorgroup__lastindex; +extern __device__ int32_t *d_array_spikegeneratorgroup__lastindex; +extern const int _num__array_spikegeneratorgroup__lastindex; +extern int32_t * _array_spikegeneratorgroup__period_bins; +extern int32_t * dev_array_spikegeneratorgroup__period_bins; +extern __device__ int32_t *d_array_spikegeneratorgroup__period_bins; +extern const int _num__array_spikegeneratorgroup__period_bins; +extern int32_t * _array_spikegeneratorgroup_i; +extern int32_t * dev_array_spikegeneratorgroup_i; +extern __device__ int32_t *d_array_spikegeneratorgroup_i; +extern const int _num__array_spikegeneratorgroup_i; +extern double * _array_spikegeneratorgroup_period; +extern double * dev_array_spikegeneratorgroup_period; +extern __device__ double *d_array_spikegeneratorgroup_period; +extern const int _num__array_spikegeneratorgroup_period; +extern int32_t * _array_spikemonitor_1__source_idx; +extern int32_t * dev_array_spikemonitor_1__source_idx; +extern __device__ int32_t *d_array_spikemonitor_1__source_idx; +extern const int _num__array_spikemonitor_1__source_idx; +extern int32_t * _array_spikemonitor_1_count; +extern int32_t * dev_array_spikemonitor_1_count; +extern __device__ int32_t *d_array_spikemonitor_1_count; +extern const int _num__array_spikemonitor_1_count; +extern int32_t * _array_spikemonitor_1_N; +extern int32_t * dev_array_spikemonitor_1_N; +extern __device__ int32_t *d_array_spikemonitor_1_N; +extern const int _num__array_spikemonitor_1_N; +extern int32_t * _array_spikemonitor_2__source_idx; +extern int32_t * dev_array_spikemonitor_2__source_idx; +extern __device__ int32_t *d_array_spikemonitor_2__source_idx; +extern const int _num__array_spikemonitor_2__source_idx; +extern int32_t * _array_spikemonitor_2_count; +extern int32_t * dev_array_spikemonitor_2_count; +extern __device__ int32_t *d_array_spikemonitor_2_count; +extern const int _num__array_spikemonitor_2_count; +extern int32_t * _array_spikemonitor_2_N; +extern int32_t * dev_array_spikemonitor_2_N; +extern __device__ int32_t *d_array_spikemonitor_2_N; +extern const int _num__array_spikemonitor_2_N; +extern int32_t * _array_spikemonitor__source_idx; +extern int32_t * dev_array_spikemonitor__source_idx; +extern __device__ int32_t *d_array_spikemonitor__source_idx; +extern const int _num__array_spikemonitor__source_idx; +extern int32_t * _array_spikemonitor_count; +extern int32_t * dev_array_spikemonitor_count; +extern __device__ int32_t *d_array_spikemonitor_count; +extern const int _num__array_spikemonitor_count; +extern int32_t * _array_spikemonitor_N; +extern int32_t * dev_array_spikemonitor_N; +extern __device__ int32_t *d_array_spikemonitor_N; +extern const int _num__array_spikemonitor_N; +extern int32_t * _array_synapses_1_N; +extern int32_t * dev_array_synapses_1_N; +extern __device__ int32_t *d_array_synapses_1_N; +extern const int _num__array_synapses_1_N; +extern int32_t * _array_synapses_2_N; +extern int32_t * dev_array_synapses_2_N; +extern __device__ int32_t *d_array_synapses_2_N; +extern const int _num__array_synapses_2_N; +extern int32_t * _array_synapses_N; +extern int32_t * dev_array_synapses_N; +extern __device__ int32_t *d_array_synapses_N; +extern const int _num__array_synapses_N; + +//////////////// eventspaces /////////////// +extern int32_t * _array_neurongroup_1__spikespace; +extern thrust::host_vector dev_array_neurongroup_1__spikespace; +extern const int _num__array_neurongroup_1__spikespace; +extern int current_idx_array_neurongroup_1__spikespace; +extern int32_t * 
_array_neurongroup__spikespace; +extern thrust::host_vector dev_array_neurongroup__spikespace; +extern const int _num__array_neurongroup__spikespace; +extern int current_idx_array_neurongroup__spikespace; +extern int32_t * _array_spikegeneratorgroup__spikespace; +extern thrust::host_vector dev_array_spikegeneratorgroup__spikespace; +extern const int _num__array_spikegeneratorgroup__spikespace; +extern int current_idx_array_spikegeneratorgroup__spikespace; +extern int previous_idx_array_spikegeneratorgroup__spikespace; + +//////////////// dynamic arrays 2d ///////// + +/////////////// static arrays ///////////// +extern int32_t *_static_array__dynamic_array_spikegeneratorgroup__timebins; +extern int32_t *dev_static_array__dynamic_array_spikegeneratorgroup__timebins; +extern __device__ int32_t *d_static_array__dynamic_array_spikegeneratorgroup__timebins; +extern const int _num__static_array__dynamic_array_spikegeneratorgroup__timebins; +extern int64_t *_static_array__dynamic_array_spikegeneratorgroup_neuron_index; +extern int64_t *dev_static_array__dynamic_array_spikegeneratorgroup_neuron_index; +extern __device__ int64_t *d_static_array__dynamic_array_spikegeneratorgroup_neuron_index; +extern const int _num__static_array__dynamic_array_spikegeneratorgroup_neuron_index; +extern int64_t *_static_array__dynamic_array_spikegeneratorgroup_spike_number; +extern int64_t *dev_static_array__dynamic_array_spikegeneratorgroup_spike_number; +extern __device__ int64_t *d_static_array__dynamic_array_spikegeneratorgroup_spike_number; +extern const int _num__static_array__dynamic_array_spikegeneratorgroup_spike_number; +extern double *_static_array__dynamic_array_spikegeneratorgroup_spike_time; +extern double *dev_static_array__dynamic_array_spikegeneratorgroup_spike_time; +extern __device__ double *d_static_array__dynamic_array_spikegeneratorgroup_spike_time; +extern const int _num__static_array__dynamic_array_spikegeneratorgroup_spike_time; + +//////////////// synapses ///////////////// +// synapses +extern cudaStream_t stream; +extern bool synapses_multiple_pre_post; +extern __device__ int* synapses_pre_num_synapses_by_pre; +extern __device__ int* synapses_pre_num_synapses_by_bundle; +extern __device__ int* synapses_pre_unique_delays; +extern __device__ int* synapses_pre_synapses_offset_by_bundle; +extern __device__ int* synapses_pre_global_bundle_id_start_by_pre; +extern int synapses_pre_max_bundle_size; +extern int synapses_pre_mean_bundle_size; +extern int synapses_pre_max_size; +extern __device__ int* synapses_pre_num_unique_delays_by_pre; +extern int synapses_pre_max_num_unique_delays; +extern __device__ int32_t** synapses_pre_synapse_ids_by_pre; +extern __device__ int32_t* synapses_pre_synapse_ids; +extern __device__ int* synapses_pre_unique_delay_start_idcs; +extern __device__ int* synapses_pre_unique_delays_offset_by_pre; +extern __device__ SynapticPathway synapses_pre; +extern int synapses_pre_eventspace_idx; +extern int synapses_pre_delay; +extern bool synapses_pre_scalar_delay; +// synapses_1 +extern cudaStream_t stream1; +extern bool synapses_1_multiple_pre_post; +extern __device__ int* synapses_1_post_num_synapses_by_pre; +extern __device__ int* synapses_1_post_num_synapses_by_bundle; +extern __device__ int* synapses_1_post_unique_delays; +extern __device__ int* synapses_1_post_synapses_offset_by_bundle; +extern __device__ int* synapses_1_post_global_bundle_id_start_by_pre; +extern int synapses_1_post_max_bundle_size; +extern int synapses_1_post_mean_bundle_size; +extern int 
synapses_1_post_max_size; +extern __device__ int* synapses_1_post_num_unique_delays_by_pre; +extern int synapses_1_post_max_num_unique_delays; +extern __device__ int32_t** synapses_1_post_synapse_ids_by_pre; +extern __device__ int32_t* synapses_1_post_synapse_ids; +extern __device__ int* synapses_1_post_unique_delay_start_idcs; +extern __device__ int* synapses_1_post_unique_delays_offset_by_pre; +extern __device__ SynapticPathway synapses_1_post; +extern int synapses_1_post_eventspace_idx; +extern int synapses_1_post_delay; +extern bool synapses_1_post_scalar_delay; +extern __device__ int* synapses_1_pre_num_synapses_by_pre; +extern __device__ int* synapses_1_pre_num_synapses_by_bundle; +extern __device__ int* synapses_1_pre_unique_delays; +extern __device__ int* synapses_1_pre_synapses_offset_by_bundle; +extern __device__ int* synapses_1_pre_global_bundle_id_start_by_pre; +extern int synapses_1_pre_max_bundle_size; +extern int synapses_1_pre_mean_bundle_size; +extern int synapses_1_pre_max_size; +extern __device__ int* synapses_1_pre_num_unique_delays_by_pre; +extern int synapses_1_pre_max_num_unique_delays; +extern __device__ int32_t** synapses_1_pre_synapse_ids_by_pre; +extern __device__ int32_t* synapses_1_pre_synapse_ids; +extern __device__ int* synapses_1_pre_unique_delay_start_idcs; +extern __device__ int* synapses_1_pre_unique_delays_offset_by_pre; +extern __device__ SynapticPathway synapses_1_pre; +extern int synapses_1_pre_eventspace_idx; +extern int synapses_1_pre_delay; +extern bool synapses_1_pre_scalar_delay; +// synapses_2 +extern cudaStream_t stream2; +extern bool synapses_2_multiple_pre_post; +extern __device__ int* synapses_2_pre_num_synapses_by_pre; +extern __device__ int* synapses_2_pre_num_synapses_by_bundle; +extern __device__ int* synapses_2_pre_unique_delays; +extern __device__ int* synapses_2_pre_synapses_offset_by_bundle; +extern __device__ int* synapses_2_pre_global_bundle_id_start_by_pre; +extern int synapses_2_pre_max_bundle_size; +extern int synapses_2_pre_mean_bundle_size; +extern int synapses_2_pre_max_size; +extern __device__ int* synapses_2_pre_num_unique_delays_by_pre; +extern int synapses_2_pre_max_num_unique_delays; +extern __device__ int32_t** synapses_2_pre_synapse_ids_by_pre; +extern __device__ int32_t* synapses_2_pre_synapse_ids; +extern __device__ int* synapses_2_pre_unique_delay_start_idcs; +extern __device__ int* synapses_2_pre_unique_delays_offset_by_pre; +extern __device__ SynapticPathway synapses_2_pre; +extern int synapses_2_pre_eventspace_idx; +extern int synapses_2_pre_delay; +extern bool synapses_2_pre_scalar_delay; + +// neurongroup +extern cudaStream_t neurongroup_stream1; +extern cudaStream_t neurongroup_stream; + +//spike generator +extern cudaStream_t spikegenerator_stream; + +// spike monitor +extern cudaStream_t spikemonitor_stream1; +extern cudaStream_t spikemonitor_stream; +extern cudaStream_t spikemonitor_stream2; + +// Profiling information for each code object + +//////////////// random numbers ///////////////// +extern curandGenerator_t curand_generator; +extern unsigned long long* dev_curand_seed; +extern __device__ unsigned long long* d_curand_seed; + +extern curandState* dev_curand_states; +extern __device__ curandState* d_curand_states; +extern RandomNumberBuffer random_number_buffer; + +//CUDA +extern int num_parallel_blocks; +extern int max_threads_per_block; +extern int max_threads_per_sm; +extern int max_shared_mem_size; +extern int num_threads_per_warp; + +} + +void _init_arrays(); +void _load_arrays(); +void 
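Note on the stream declarations above: besides the connectivity structures, this header now declares one cudaStream_t per code-object group (stream, stream1, stream2 for the synapses, plus the neurongroup_*, spikegenerator_* and spikemonitor_* streams). Kernels launched on distinct non-default streams may overlap on the device when they have no data dependency, which is what the per-object stream assignment is meant to exploit. A minimal sketch with placeholder kernels and sizes rather than the generated ones:

#include <cuda_runtime.h>

__global__ void update_group_a(float *x, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) x[i] += 1.0f;
}

__global__ void update_group_b(float *y, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) y[i] *= 2.0f;
}

void run_one_timestep(float *d_x, float *d_y, int n)
{
    static cudaStream_t stream_a = 0, stream_b = 0;
    if (stream_a == 0) cudaStreamCreate(&stream_a);
    if (stream_b == 0) cudaStreamCreate(&stream_b);

    int threads = 256;
    int blocks = (n + threads - 1) / threads;

    // Independent objects go on different streams; both launches return
    // immediately and may execute concurrently on the GPU.
    update_group_a<<<blocks, threads, 0, stream_a>>>(d_x, n);
    update_group_b<<<blocks, threads, 0, stream_b>>>(d_y, n);

    // Synchronise before anything that needs both results (e.g. a monitor).
    cudaStreamSynchronize(stream_a);
    cudaStreamSynchronize(stream_b);
}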
_write_arrays(); +void _dealloc_arrays(); + +#endif + + diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/output.txt b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/output.txt new file mode 100644 index 00000000..f42215d9 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/output.txt @@ -0,0 +1,207 @@ +INFO: setting cudaDevice stuff took 0.866146 seconds +objects cu num par blocks 20 +INFO: _init_arrays() took 0.212342s +INFO: synapses creation took 0.015249s +INFO: synapses_1 creation took 0.123005s +INFO: synapses_2 creation took 0.004719s +INFO _run_kernel_synapses_group_variable_set_conditional_codeobject + 37 blocks + 1024 threads + 28 registers per block + 0 bytes statically-allocated shared memory per block + 0 bytes local memory per thread + 2232 bytes user-allocated constant memory + 1.000 theoretical occupancy +INFO _run_kernel_synapses_1_group_variable_set_conditional_codeobject + 245 blocks + 1024 threads + 28 registers per block + 0 bytes statically-allocated shared memory per block + 0 bytes local memory per thread + 2232 bytes user-allocated constant memory + 1.000 theoretical occupancy +INFO _run_kernel_synapses_1_group_variable_set_conditional_codeobject_1 + 245 blocks + 1024 threads + 30 registers per block + 0 bytes statically-allocated shared memory per block + 0 bytes local memory per thread + 2232 bytes user-allocated constant memory + 1.000 theoretical occupancy +INFO connectivity matrix has size 250000, number of (pre neuron ID, post neuron block) pairs is 50000 +INFO: synapse statistics and memory usage for synapses_1_pre: + number of synapses: 250000 + number of pre/post blocks: 50000 + number of synapses over all pre/post blocks: + mean: 5.0 std: 0.0 + + memory usage: TOTAL: 1.5 MB (~6.4 byte per synapse) + 62.5% 0.954 MB synapse IDs [250000] + 25.0% 0.381 MB pointers to synapse IDs [50000] + 12.5% 0.191 MB number of synapses per pre/post block [50000] +INFO _before_run_kernel_synapses_1_pre_push_spikes + 1 blocks + 1 threads + 94 registers per block + 0 bytes statically-allocated shared memory per block + 336 bytes local memory per thread + 2232 bytes user-allocated constant memory +INFO: synapses_1_pre initialisation took 0.214s +INFO connectivity matrix has size 10000, number of (pre neuron ID, post neuron block) pairs is 2000 +INFO: synapse statistics and memory usage for synapses_2_pre: + number of synapses: 10000 + number of pre/post blocks: 2000 + number of synapses over all pre/post blocks: + mean: 5.0 std: 0.0 + + memory usage: TOTAL: 0.1 MB (~6.4 byte per synapse) + 62.5% 0.038 MB synapse IDs [10000] + 25.0% 0.015 MB pointers to synapse IDs [2000] + 12.5% 0.008 MB number of synapses per pre/post block [2000] +INFO _before_run_kernel_synapses_2_pre_push_spikes + 1 blocks + 1 threads + 94 registers per block + 0 bytes statically-allocated shared memory per block + 336 bytes local memory per thread + 2232 bytes user-allocated constant memory +INFO: synapses_2_pre initialisation took 0.009s +INFO connectivity matrix has size 37454, number of (pre neuron ID, post neuron block) pairs is 2000 +INFO: synapse statistics and memory usage for synapses_pre: + number of synapses: 37454 + number of bundles: 0 + number of pre/post blocks: 2000 + number of synapses over all pre/post blocks: + mean: 18.7 std: 3.9 + number of unique delays over all pre/post blocks: + mean: 0.0 std: nan + bundle size over all bundles: + mean: 0.0 std: nan + + memory usage: TOTAL: 0.2 MB 
(~4.6 byte per synapse) + 86.2% 0.143 MB synapse IDs [37454] + 9.2% 0.015 MB pointers to synapse IDs [2000] + 4.6% 0.008 MB number of synapses per pre/post block [2000] +INFO _before_run_kernel_synapses_pre_push_spikes + 1 blocks + 1 threads + 94 registers per block + 0 bytes statically-allocated shared memory per block + 336 bytes local memory per thread + 2232 bytes user-allocated constant memory +INFO: synapses_pre initialisation took 0.010s +INFO connectivity matrix has size 250000, number of (pre neuron ID, post neuron block) pairs is 2000 +INFO: synapse statistics and memory usage for synapses_1_post: + number of synapses: 250000 + number of bundles: 0 + number of pre/post blocks: 2000 + number of synapses over all pre/post blocks: + mean: 125.0 std: 0.0 + number of unique delays over all pre/post blocks: + mean: 0.0 std: nan + bundle size over all bundles: + mean: 0.0 std: nan + + memory usage: TOTAL: 1.0 MB (~4.1 byte per synapse) + 97.7% 0.954 MB synapse IDs [250000] + 1.6% 0.015 MB pointers to synapse IDs [2000] + 0.8% 0.008 MB number of synapses per pre/post block [2000] +INFO _before_run_kernel_synapses_1_post_push_spikes + 1 blocks + 1 threads + 94 registers per block + 0 bytes statically-allocated shared memory per block + 336 bytes local memory per thread + 2232 bytes user-allocated constant memory +INFO: synapses_1_post initialisation took 0.017s +INFO _run_kernel_neurongroup_1_stateupdater_codeobject + 1 blocks + 256 threads + 174 registers per block + 0 bytes statically-allocated shared memory per block + 40 bytes local memory per thread + 2232 bytes user-allocated constant memory + 0.125 theoretical occupancy +INFO _run_kernel_neurongroup_stateupdater_codeobject + 7 blocks + 384 threads + 160 registers per block + 0 bytes statically-allocated shared memory per block + 40 bytes local memory per thread + 2232 bytes user-allocated constant memory + 0.188 theoretical occupancy +INFO _run_kernel_neurongroup_1_thresholder_codeobject + 1 blocks + 1024 threads + 30 registers per block + 0 bytes statically-allocated shared memory per block + 24 bytes local memory per thread + 2232 bytes user-allocated constant memory + 1.000 theoretical occupancy +INFO _run_kernel_neurongroup_thresholder_codeobject + 3 blocks + 1024 threads + 30 registers per block + 0 bytes statically-allocated shared memory per block + 24 bytes local memory per thread + 2232 bytes user-allocated constant memory + 1.000 theoretical occupancy +INFO _run_kernel_spikegeneratorgroup_codeobject + 1 blocks + 576 threads + 54 registers per block + 0 bytes statically-allocated shared memory per block + 24 bytes local memory per thread + 2232 bytes user-allocated constant memory + 0.562 theoretical occupancy +INFO _run_kernel_spikemonitor_codeobject + 1 blocks + 1 threads + 54 registers per block + 0 bytes statically-allocated shared memory per block + 64 bytes local memory per thread + 2232 bytes user-allocated constant memory + 0.016 theoretical occupancy +INFO _run_kernel_spikemonitor_1_codeobject + 1 blocks + 1 threads + 54 registers per block + 0 bytes statically-allocated shared memory per block + 64 bytes local memory per thread + 2232 bytes user-allocated constant memory + 0.016 theoretical occupancy +INFO _run_kernel_spikemonitor_2_codeobject + 1 blocks + 1 threads + 54 registers per block + 0 bytes statically-allocated shared memory per block + 64 bytes local memory per thread + 2232 bytes user-allocated constant memory + 0.016 theoretical occupancy +WARNING Not enough ressources available to call 
_run_kernel_synapses_1_pre_codeobject with maximum possible threads per block (1024). Reducing num_threads to 512. (Kernel needs 112 registers per block, 0 bytes of statically-allocated shared memory per block, 48 bytes of local memory per thread and a total of 2232 bytes of user-allocated constant memory) +INFO _run_kernel_synapses_2_pre_codeobject + 20 blocks + 1024 threads + 54 registers per block + 0 bytes statically-allocated shared memory per block + 40 bytes local memory per thread + 2232 bytes user-allocated constant memory + 0.500 theoretical occupancy +INFO _run_kernel_synapses_pre_codeobject + 20 blocks + 1024 threads + 64 registers per block + 0 bytes statically-allocated shared memory per block + 40 bytes local memory per thread + 2232 bytes user-allocated constant memory + 0.500 theoretical occupancy +WARNING Not enough ressources available to call _run_kernel_synapses_1_post_codeobject with maximum possible threads per block (1024). Reducing num_threads to 512. (Kernel needs 112 registers per block, 0 bytes of statically-allocated shared memory per block, 40 bytes of local memory per thread and a total of 2232 bytes of user-allocated constant memory) +Number of spikes: 3936 +Number of spikes: 143462 +Number of synapses: 250000 +Number of synapses: 10000 +Number of synapses: 37454 +Number of synapses: 250000 +INFO: main_lines took 46.566814 seconds +Number of spikes: 10791 +INFO: main function took 47.753639 seconds diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/rand.cu b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/rand.cu new file mode 100644 index 00000000..526004dc --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/rand.cu @@ -0,0 +1,299 @@ + +#include "objects.h" +#include "rand.h" +#include "synapses_classes.h" +#include "brianlib/clocks.h" +#include "brianlib/cuda_utils.h" +#include "network.h" +#include +#include +#include + +// XXX: for some documentation on random number generation, check out our wiki: +// https://github.com/brian-team/brian2cuda/wiki/Random-number-generation + +using namespace brian; + +// TODO make this a class member function +// TODO don't call one kernel per codeobject but instead on kernel which takes +// care of all codeobjects, preferably called with as many threads/blocks +// as necessary for all states and initializing in parallel with warp +// level divergence [needs changing set_curand_device_api_states()] +namespace { + + __global__ void init_curand_states(int N, int sequence_offset) + { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < N) + { + // Each thread gets the same seed, a different sequence number and + // no offset + // TODO: different seed and 0 sequence number is much faster, with + // less security for independent sequences, add option as + // preference! 
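Note on the curand_init call below: every thread receives the same seed but a distinct sequence number (sequence_offset + idx), which is cuRAND's recommended way of obtaining statistically independent per-thread streams from the device API; the offset argument stays 0. A self-contained sketch of the same pattern, assuming a curandState array already allocated with cudaMalloc (all names here are placeholders):

#include <curand_kernel.h>

__global__ void init_states(curandState *d_states, int n,
                            unsigned long long seed, int sequence_offset)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n)
    {
        // Same seed, unique sequence per thread; sequences drawn from one
        // seed are guaranteed to be statistically independent.
        curand_init(seed, sequence_offset + idx, 0 /* offset */, &d_states[idx]);
    }
}

__global__ void draw_uniform(curandState *d_states, float *out, int n)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n)
    {
        curandState local = d_states[idx];  // work on a register copy
        out[idx] = curand_uniform(&local);  // one U(0,1] sample
        d_states[idx] = local;              // write the advanced state back
    }
}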
+ //curand_init(curand_seed + idx, 0, 0, + curand_init( + *d_curand_seed, // seed + sequence_offset + idx, // sequence number + 0, // offset + &d_curand_states[idx]); + } + } +} + + +// need a function pointer for Network::add(), can't pass a pointer to a class +// method, which is of different type +void _run_random_number_buffer() +{ + // random_number_buffer is a RandomNumberBuffer instance, declared in objects.cu + random_number_buffer.next_time_step(); +} + + +void RandomNumberBuffer::init() +{ + // check that we have enough memory available + size_t free_byte; + size_t total_byte; + CUDA_SAFE_CALL( + cudaMemGetInfo(&free_byte, &total_byte) + ); + // TODO: This assumes all random number have randomNumber_t type, but poisson + // objects have different type + size_t num_free_floats = free_byte / sizeof(randomNumber_t); + + if (run_counter == 0) + { + // number of time steps each codeobject is executed during current Network::run() call + // XXX: we are assuming here that this function is only run in the first time step of a Network::run() + + + // now check if the total number of generated floats fit into available memory + int total_num_generated_floats = 0; + if (num_free_floats < total_num_generated_floats) + { + // TODO: find a way to deal with this? E.g. looping over buffers sorted + // by buffer size and reducing them until it fits. + printf("MEMORY ERROR: Trying to generate more random numbers than fit " + "into available memory. Please report this as an issue on " + "GitHub: https://github.com/brian-team/brian2cuda/issues/new"); + _dealloc_arrays(); + exit(1); + } + + } // if (run_counter == 0) + + // init curand states only in first run + if (run_counter == 0) + { + + // Update curand device api states once before anything is run. At this + // point all N's (also from probabilistically generated synapses) are + // known. This might update the number of needed curand states. + ensure_enough_curand_states(); + } + +} + + +void RandomNumberBuffer::allocate_device_curand_states() +{ + // allocate globabl memory for curand device api states + CUDA_SAFE_CALL( + cudaMalloc((void**)&dev_curand_states, + sizeof(curandState) * num_curand_states) + ); + CUDA_SAFE_CALL( + cudaMemcpyToSymbol(d_curand_states, + &dev_curand_states, sizeof(curandState*)) + ); +} + + + +void RandomNumberBuffer::update_needed_number_curand_states() +{ + // Find the maximum number of threads generating random numbers in parallel + // using the cuRAND device API. For synapses objects, the number of + // synapses might not be known yet. This is the case when the first random + // seed is set and for any seed() call before the synapses creation. + num_threads_curand_init = max_threads_per_block; + num_blocks_curand_init = num_curand_states / max_threads_per_block + 1; + if (num_curand_states < num_threads_curand_init) + num_threads_curand_init = num_curand_states; +} + + +void RandomNumberBuffer::set_curand_device_api_states(bool reset_seed) +{ + int sequence_offset = 0; + int num_curand_states_old = num_curand_states; + // Whenever curand states are set, check if enough states where + // initialized. This will generate states the first time the seed is set. + // But it can be that the seed is set before all network objects' N are + // available (e.g. synapses not created yet) and before the network is + // run. In such a case, once the network is run, missing curand states are + // generated here. 
If the seed was not reset inbetween, the pervious states + // should not be reinitialized (achieved by the `sequence_offset` + // parameter). If the seed was reset, then all states should be + // reinitialized. + update_needed_number_curand_states(); + + // number of curand states that need to be initialized + int num_curand_states_to_init; + + if (reset_seed) + { + // initialize all curand states + num_curand_states_to_init = num_curand_states; + sequence_offset = 0; + } + else + { + // don't initialize existing curand states, only the new ones + num_curand_states_to_init = num_curand_states - num_curand_states_old; + sequence_offset = num_curand_states_old; + } + + if (num_curand_states_old < num_curand_states) + { + // copy curand states to new array of updated size + curandState* dev_curand_states_old = dev_curand_states; + // allocate memory for new number of curand states + allocate_device_curand_states(); + + if ((!reset_seed) && (num_curand_states_old > 0)) + { + // copy old states to new memory address on device + CUDA_SAFE_CALL( + cudaMemcpy(dev_curand_states, dev_curand_states_old, + sizeof(curandState) * num_curand_states_old, + cudaMemcpyDeviceToDevice) + ); + } + } + + if (num_curand_states_to_init > 0) + { + init_curand_states<<>>( + num_curand_states_to_init, + sequence_offset); + } +} + + +void RandomNumberBuffer::ensure_enough_curand_states() +{ + // Separate public function needed for synapses codeobjects that are run + // only once before the network + // The N of synapses will not be known when setting the seed and needs to + // be updated before using random numbers per synapse. This occurs e.g. + // when initializing synaptic variables (synapses_group_conditional_....) + bool reset_seed = false; + set_curand_device_api_states(reset_seed); +} + + +void RandomNumberBuffer::run_finished() +{ + needs_init = true; + run_counter += 1; +} + + +void RandomNumberBuffer::set_seed(unsigned long long seed) +{ + CUDA_SAFE_CALL( + curandSetPseudoRandomGeneratorSeed(curand_generator, seed) + ); + + // generator offset needs to be reset to its default (=0) + CUDA_SAFE_CALL( + curandSetGeneratorOffset(curand_generator, 0ULL) + ); + + // set seed for curand device api calls + // don't set the same seed for host api and device api random states, just in case + unsigned long long curand_seed = seed + 1; + CUDA_SAFE_CALL( + cudaMemcpy(dev_curand_seed, &curand_seed, + sizeof(unsigned long long), cudaMemcpyHostToDevice) + ); + + bool reset_seed = true; + set_curand_device_api_states(reset_seed); + // We set all device api states for codeobjects run outside the network + // since we don't know when they will be used. + //set_curand_device_api_states_for_separate_calls(); + // Curand device api states for binomials during network runs will be set + // only for the current run in init(), once the network starts. 
+} + + +void RandomNumberBuffer::refill_uniform_numbers( + randomNumber_t* dev_rand_allocator, + randomNumber_t* &dev_rand, + int num_per_gen_rand, + int &idx_rand) +{ + // generate uniform distributed random numbers and reset buffer index + + curandGenerateUniformDouble(curand_generator, dev_rand_allocator, num_per_gen_rand); + // before: XXX dev_rand = &dev_rand_allocator[0]; + dev_rand = dev_rand_allocator; + idx_rand = 1; +} + + +void RandomNumberBuffer::refill_normal_numbers( + randomNumber_t* dev_randn_allocator, + randomNumber_t* &dev_randn, + int num_per_gen_randn, + int &idx_randn) +{ + // generate normal distributed random numbers and reset buffer index + + curandGenerateNormalDouble(curand_generator, dev_randn_allocator, num_per_gen_randn, 0, 1); + // before: XXX dev_randn = &dev_randn_allocator[0]; + dev_randn = dev_randn_allocator; + idx_randn = 1; +} + + +void RandomNumberBuffer::refill_poisson_numbers( + double lambda, + unsigned int* dev_poisson_allocator, + unsigned int* &dev_poisson, + int num_per_gen_poisson, + int &idx_poisson) +{ + // generate poisson distributed random numbers and reset buffer index + + printf("num_per_gen_poisson %d, lambda %f\n", num_per_gen_poisson, lambda); + CUDA_SAFE_CALL( + curandGeneratePoisson(curand_generator, dev_poisson_allocator, num_per_gen_poisson, lambda) + ); + dev_poisson = dev_poisson_allocator; + idx_poisson = 1; +} + +void RandomNumberBuffer::next_time_step() +{ + // init buffers at fist time step of each run call + if (needs_init) + { + // free device memory for random numbers used during last run call + if (run_counter > 0) + { + } + + // init random number buffers + init(); + needs_init = false; + } + + if (run_counter == 0) + { + }// run_counter == 0 +} diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/rand.h b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/rand.h new file mode 100644 index 00000000..213e1f99 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/rand.h @@ -0,0 +1,70 @@ + +#ifndef _BRIAN_RAND_H +#define _BRIAN_RAND_H + +#include + +void _run_random_number_buffer(); + +class RandomNumberBuffer +{ + // TODO let all random number pointers be class members of this class -> + // check which ones are needed as global variables, maybe have both, + // global and member variables? or change parameters in codeobjects? + + // before each run, buffers need to be reinitialized + bool needs_init = true; + // how many 'run' calls have finished + int run_counter = 0; + // number of needed cuRAND states + int num_curand_states = 0; + // number of threads and blocks to set curand states + int num_threads_curand_init, num_blocks_curand_init; + + // how many random numbers we want to create at once (tradeoff memory usage <-> generation overhead) + double mb_per_obj = 50; // MB per codeobject and rand / randn + // TODO: This assumes all random number have randomNumber_t type, but poisson + // objects have different type + int floats_per_obj = (mb_per_obj * 1024.0 * 1024.0) / sizeof(randomNumber_t); + + // The number of needed random numbers per clock cycle, the generation interval, and the number generated per curand call. 
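Note on the buffer sizing above: with mb_per_obj = 50 and randomNumber_t being double (8 bytes), floats_per_obj evaluates to 50 * 1024 * 1024 / 8 = 6,553,600 numbers buffered per code object and per distribution. A quick check of that arithmetic:

#include <cstdio>

int main()
{
    const double mb_per_obj = 50;                 // buffer budget per code object, in MB
    const int bytes_per_number = sizeof(double);  // randomNumber_t is double here
    const int floats_per_obj =
        static_cast<int>((mb_per_obj * 1024.0 * 1024.0) / bytes_per_number);
    printf("%d numbers per object (%.1f MB)\n", floats_per_obj,
           floats_per_obj * bytes_per_number / (1024.0 * 1024.0));
    // prints: 6553600 numbers per object (50.0 MB)
    return 0;
}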
+ // + // needed random numbers per clock cycle + // int num_per_cycle_rand_{}; + // + // number of time steps after which buffer needs to be refilled + // int rand_interval_{}; + // + // buffer size + // int num_per_gen_rand_{}; + // + // number of time steps since last buffer refill + // int idx_rand_{}; + // + // maximum number of random numbers fitting given allocated memory + // int rand_floats_per_obj_{}; + + // For each call of brians `run`, a new set of codeobjects (with different + // suffixes) is generated. The following are variables for all codeobjects + // for all runs that need random numbers. + + ////// run 0 + + + void init(); + void allocate_device_curand_states(); + void update_needed_number_curand_states(); + void set_curand_device_api_states(bool); + void refill_uniform_numbers(randomNumber_t*, randomNumber_t*&, int, int&); + void refill_normal_numbers(randomNumber_t*, randomNumber_t*&, int, int&); + void refill_poisson_numbers(double lambda, unsigned int*, unsigned int*&, int, int&); + +public: + void next_time_step(); + void set_seed(unsigned long long); + void run_finished(); + void ensure_enough_curand_states(); +}; + +#endif + diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/run.cu b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/run.cu new file mode 100644 index 00000000..6e073db5 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/run.cu @@ -0,0 +1,48 @@ +#include +#include "brianlib/cuda_utils.h" +#include "objects.h" +#include + +#include "code_objects/neurongroup_1_stateupdater_codeobject.h" +#include "code_objects/neurongroup_1_thresholder_codeobject.h" +#include "code_objects/neurongroup_stateupdater_codeobject.h" +#include "code_objects/neurongroup_thresholder_codeobject.h" +#include "code_objects/spikegeneratorgroup_codeobject.h" +#include "code_objects/spikemonitor_1_codeobject.h" +#include "code_objects/spikemonitor_2_codeobject.h" +#include "code_objects/spikemonitor_codeobject.h" +#include "code_objects/synapses_1_group_variable_set_conditional_codeobject.h" +#include "code_objects/synapses_1_group_variable_set_conditional_codeobject_1.h" +#include "code_objects/synapses_1_post_codeobject.h" +#include "code_objects/synapses_1_post_push_spikes.h" +#include "code_objects/synapses_1_pre_codeobject.h" +#include "code_objects/synapses_1_pre_push_spikes.h" +#include "code_objects/synapses_1_synapses_create_generator_codeobject.h" +#include "code_objects/synapses_2_pre_codeobject.h" +#include "code_objects/synapses_2_pre_push_spikes.h" +#include "code_objects/synapses_2_synapses_create_generator_codeobject.h" +#include "code_objects/synapses_group_variable_set_conditional_codeobject.h" +#include "code_objects/synapses_pre_codeobject.h" +#include "code_objects/synapses_pre_push_spikes.h" +#include "code_objects/synapses_synapses_create_generator_codeobject.h" + + +void brian_start() +{ + _init_arrays(); + _load_arrays(); + srand(time(NULL)); + + // Initialize clocks (link timestep and dt to the respective arrays) + brian::defaultclock.timestep = brian::_array_defaultclock_timestep; + brian::defaultclock.dt = brian::_array_defaultclock_dt; + brian::defaultclock.t = brian::_array_defaultclock_t; +} + +void brian_end() +{ + _write_arrays(); + _dealloc_arrays(); +} + + diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/run.h b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/run.h new file mode 100644 index 
00000000..0317d02c --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/run.h @@ -0,0 +1,5 @@ + +void brian_start(); +void brian_end(); + + diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/static_arrays/_static_array__dynamic_array_spikegeneratorgroup__timebins b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/static_arrays/_static_array__dynamic_array_spikegeneratorgroup__timebins new file mode 100644 index 00000000..8aa5afba Binary files /dev/null and b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/static_arrays/_static_array__dynamic_array_spikegeneratorgroup__timebins differ diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/static_arrays/_static_array__dynamic_array_spikegeneratorgroup_neuron_index b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/static_arrays/_static_array__dynamic_array_spikegeneratorgroup_neuron_index new file mode 100644 index 00000000..cfab3527 Binary files /dev/null and b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/static_arrays/_static_array__dynamic_array_spikegeneratorgroup_neuron_index differ diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/static_arrays/_static_array__dynamic_array_spikegeneratorgroup_spike_number b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/static_arrays/_static_array__dynamic_array_spikegeneratorgroup_spike_number new file mode 100644 index 00000000..0464ec40 Binary files /dev/null and b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/static_arrays/_static_array__dynamic_array_spikegeneratorgroup_spike_number differ diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/static_arrays/_static_array__dynamic_array_spikegeneratorgroup_spike_time b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/static_arrays/_static_array__dynamic_array_spikegeneratorgroup_spike_time new file mode 100644 index 00000000..2e516d16 Binary files /dev/null and b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/static_arrays/_static_array__dynamic_array_spikegeneratorgroup_spike_time differ diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/synapses_classes.cu b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/synapses_classes.cu new file mode 100644 index 00000000..e69de29b diff --git a/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/synapses_classes.h b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/synapses_classes.h new file mode 100644 index 00000000..87da7f79 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/cuda_standalone/synapses_classes.h @@ -0,0 +1,49 @@ + +#ifndef _BRIAN_SYNAPSES_H +#define _BRIAN_SYNAPSES_H + +#include +#include + +#include "brianlib/spikequeue.h" + +class SynapticPathway +{ +public: + int32_t* dev_sources; + int32_t* dev_targets; + + // first and last index in source NeuronGroup corresponding to Subgroup in SynapticPathway + // important for Subgroups created with syntax: NeuronGroup(N=4000,...)[:3200] + int32_t spikes_start; + int32_t spikes_stop; + + double dt; + CudaSpikeQueue* queue; + bool no_or_const_delay_mode; + + //our real constructor + __device__ void init( + int32_t* _sources, + int32_t* _targets, + double _dt, + int32_t 
_spikes_start, + int32_t _spikes_stop) + { + dev_sources = _sources; + dev_targets = _targets; + dt = _dt; + spikes_start = _spikes_start; + spikes_stop = _spikes_stop; + queue = new CudaSpikeQueue; + }; + + //our real destructor + __device__ void destroy() + { + queue->destroy(); + delete queue; + } +}; +#endif + diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/Makefile b/parallel_execution/parallel_execution/code/MushroomBody/genn/Makefile new file mode 100644 index 00000000..b03196b8 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/Makefile @@ -0,0 +1,13 @@ +GENERATED_CODE_DIR :=magicnetwork_model_CODE +CXXFLAGS +=-std=c++11 -Wno-write-strings -I. -Ibrianlib/randomkit -w -O3 -ffast-math -fno-finite-math-only -march=native -std=c++11 +LDFLAGS +=-L$(GENERATED_CODE_DIR) -lrunner -Wl,-rpath $(GENERATED_CODE_DIR) + +.PHONY: all clean generated_code + +all: main + +main: main.cpp code_objects/spikemonitor_codeobject.cpp code_objects/spikegeneratorgroup_codeobject.cpp code_objects/synapses_synapses_create_generator_codeobject.cpp code_objects/spikemonitor_2_codeobject.cpp code_objects/synapses_1_group_variable_set_conditional_codeobject_1.cpp code_objects/synapses_1_synapses_create_generator_codeobject.cpp code_objects/synapses_pre_push_spikes.cpp code_objects/synapses_1_post_push_spikes.cpp code_objects/synapses_1_pre_push_spikes.cpp code_objects/synapses_group_variable_set_conditional_codeobject.cpp code_objects/synapses_2_synapses_create_generator_codeobject.cpp code_objects/synapses_2_pre_push_spikes.cpp code_objects/spikemonitor_1_codeobject.cpp objects.cpp code_objects/synapses_1_group_variable_set_conditional_codeobject.cpp brianlib/randomkit/randomkit.cc generated_code + $(CXX) $(CXXFLAGS) main.cpp code_objects/spikemonitor_codeobject.cpp code_objects/spikegeneratorgroup_codeobject.cpp code_objects/synapses_synapses_create_generator_codeobject.cpp code_objects/spikemonitor_2_codeobject.cpp code_objects/synapses_1_group_variable_set_conditional_codeobject_1.cpp code_objects/synapses_1_synapses_create_generator_codeobject.cpp code_objects/synapses_pre_push_spikes.cpp code_objects/synapses_1_post_push_spikes.cpp code_objects/synapses_1_pre_push_spikes.cpp code_objects/synapses_group_variable_set_conditional_codeobject.cpp code_objects/synapses_2_synapses_create_generator_codeobject.cpp code_objects/synapses_2_pre_push_spikes.cpp code_objects/spikemonitor_1_codeobject.cpp objects.cpp code_objects/synapses_1_group_variable_set_conditional_codeobject.cpp brianlib/randomkit/randomkit.cc -o main $(LDFLAGS) + +generated_code: + $(MAKE) -C $(GENERATED_CODE_DIR) \ No newline at end of file diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/b2glib/convert_synapses.h b/parallel_execution/parallel_execution/code/MushroomBody/genn/b2glib/convert_synapses.h new file mode 100644 index 00000000..a3c59257 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/b2glib/convert_synapses.h @@ -0,0 +1,143 @@ +#pragma once + +// scalar can be any scalar type such as float, double +#include +#include +#include +#include +#include + +template +void convert_dynamic_arrays_2_dense_matrix(vector &source, vector &target, vector &gvector, scalar *g, int srcNN, int trgNN) +{ + assert(source.size() == target.size()); + assert(source.size() == gvector.size()); + unsigned int size= source.size(); + for (int s= 0; s < srcNN; s++) { + for (int t= 0; t < trgNN; t++) { + g[s*trgNN+t]= (scalar)NAN; + } + } + + for (int i= 
0; i < size; i++) { + assert(source[i] < srcNN); + assert(target[i] < trgNN); + // Check for duplicate entries + if (! std::isnan(g[source[i]*trgNN+target[i]])) { + std::cerr << "*****" << std::endl; + std::cerr << "ERROR Cannot run GeNN simulation: More than one synapse for pair " << source[i] << " - " << target[i] << " and DENSE connectivity used." << std::endl; + std::cerr << "*****" << std::endl; + exit(222); + } + g[source[i]*trgNN+target[i]]= gvector[i]; + } + for (int s= 0; s < srcNN; s++) { + for (int t= 0; t < trgNN; t++) { + if (std::isnan(g[s*trgNN+t])) + g[s*trgNN+t] = 0.0; + } + } +} + +namespace b2g { + unsigned int FULL_MONTY= 0; + unsigned int COPY_ONLY= 1; +}; + +void initialize_sparse_synapses(const vector &source, const vector &target, + unsigned int *rowLength, unsigned int *ind, unsigned int maxRowLength, + int srcNN, int trgNN, + vector &indices) +{ + // Initially zero row lengths + std::fill_n(rowLength, srcNN, 0); + + const size_t size = source.size(); + + // Reserve indices + indices.clear(); + indices.reserve(size); + + // Loop through input arrays + for (size_t i= 0; i < size; i++) { + assert(source[i] < srcNN); + assert(target[i] < trgNN); + + // Calculate index of synapse in ragged structure + const size_t index = (source[i] * maxRowLength) + rowLength[source[i]]; + + // Add index to vector and insert postsynaptic index into correct location + // **TODO** insert in correct position to keep sorted + indices.push_back(index); + ind[index] = target[i]; + + // Increment row length + rowLength[source[i]]++; + } +} + + +template +void convert_dynamic_arrays_2_sparse_synapses(const vector &gvector, const vector &indices, + scalar *gv, int srcNN, int trgNN) +{ + const size_t size = indices.size(); + for (size_t i= 0; i < size; i++) { + // Insert postsynaptic index in correct location + gv[indices[i]] = gvector[i]; + } +} + + +template +void convert_dense_matrix_2_dynamic_arrays(scalar *g, int srcNN, int trgNN, vector &source, vector &target, vector &gvector) +{ + assert(source.size() == target.size()); + assert(source.size() == gvector.size()); + unsigned int size= source.size(); + for (int i= 0; i < size; i++) { + assert(source[i] < srcNN); + assert(target[i] < trgNN); + gvector[i]= g[source[i]*trgNN+target[i]]; + } +} + +template +void convert_sparse_synapses_2_dynamic_arrays(unsigned int *rowLength, unsigned int *ind, unsigned int maxRowLength, + scalar *gv, int srcNN, int trgNN, vector &source, vector &target, vector &gvector, unsigned int mode) +{ +// note: this does not preserve the original order of entries in the brian arrays - is that a problem? 
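// GeNN's ragged connectivity stores row i in slots
// [i*maxRowLength, i*maxRowLength + rowLength[i]) of both ind and gv, so the
// b2g::FULL_MONTY branch below rebuilds the full (source, target, weight)
// triplets from that layout, while the other branch copies back weights only.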
+ if (mode == b2g::FULL_MONTY) { + assert(source.size() == target.size()); + assert(source.size() == gvector.size()); + size_t cnt= 0; + for (int i= 0; i < srcNN; i++) { + for (int j= 0; j < rowLength[i]; j++) { + source[cnt]= i; + target[cnt]= ind[(i * maxRowLength) + j]; + gvector[cnt]= gv[(i * maxRowLength) + j]; + cnt++; + } + } + } + else { + size_t cnt= 0; + for (int i= 0; i < srcNN; i++) { + for (int j= 0; j < rowLength[i]; j++) { + gvector[cnt++]= gv[(i * maxRowLength) + j]; + } + } + } +} + +void create_hidden_weightmatrix(vector &source, vector &target, char* hwm, int srcNN, int trgNN) +{ + for (int s= 0; s < srcNN; s++) { + for (int t= 0; t < trgNN; t++) { + hwm[s*trgNN+t]= 0; + } + } + for (int i= 0; i < source.size(); i++) { + hwm[source[i]*trgNN+target[i]]= 1; + } +} diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/brianlib/clocks.h b/parallel_execution/parallel_execution/code/MushroomBody/genn/brianlib/clocks.h new file mode 100644 index 00000000..b3316830 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/brianlib/clocks.h @@ -0,0 +1,51 @@ +#ifndef _BRIAN_CLOCKS_H +#define _BRIAN_CLOCKS_H +#include +#include +#include +#include + +namespace { + inline int fround(double x) + { + return (int)(x+0.5); + }; +}; + +class Clock +{ +public: + double epsilon; + double *dt; + int64_t *timestep; + double *t; + Clock(double _epsilon=1e-14) : epsilon(_epsilon) { i_end = 0;}; + inline void tick() + { + timestep[0] += 1; + t[0] = timestep[0] * dt[0]; + } + inline bool running() { return timestep[0] +#include + +#ifdef _MSC_VER +#define INFINITY (std::numeric_limits::infinity()) +#define NAN (std::numeric_limits::quiet_NaN()) +#define M_PI 3.14159265358979323846 +#endif + +#endif diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/brianlib/dynamic_array.h b/parallel_execution/parallel_execution/code/MushroomBody/genn/brianlib/dynamic_array.h new file mode 100644 index 00000000..2cafb176 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/brianlib/dynamic_array.h @@ -0,0 +1,87 @@ +#ifndef _BRIAN_DYNAMIC_ARRAY_H +#define _BRIAN_DYNAMIC_ARRAY_H + +#include + +/* + * 2D Dynamic array class + * + * Efficiency note: if you are regularly resizing, make sure it is the first dimension that + * is resized, not the second one. 
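 * (Each row is a separately allocated std::vector, so growing the first
 * dimension only adds or removes row pointers, while changing the second
 * dimension forces every existing row to be resized.)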
+ * + */ +template +class DynamicArray2D +{ + int old_n, old_m; + std::vector< std::vector* > data; +public: + int n, m; + DynamicArray2D(int _n=0, int _m=0) + { + old_n = 0; + old_m = 0; + resize(_n, _m); + }; + ~DynamicArray2D() + { + resize(0, 0); // handles deallocation + } + void resize() + { + if(old_n!=n) + { + if(nold_n) + { + for(int i=old_n; i; + } + } + if(old_m!=m) + { + for(int i=0; iresize(m); + } else if(n>old_n) + { + for(int i=old_n; iresize(m); + } + } else if(old_m!=m) + { + for(int i=0; iresize(m); + } + } + old_n = n; + old_m = m; + }; + void resize(int _n, int _m) + { + n = _n; + m = _m; + resize(); + } + // We cannot simply use T& as the return type here, since we don't + // get a bool& out of a std::vector + inline typename std::vector::reference operator()(int i, int j) + { + return (*data[i])[j]; + } + inline std::vector& operator()(int i) + { + return (*data[i]); + } +}; + +#endif diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/brianlib/randomkit/randomkit.cc b/parallel_execution/parallel_execution/code/MushroomBody/genn/brianlib/randomkit/randomkit.cc new file mode 100644 index 00000000..169c288d --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/brianlib/randomkit/randomkit.cc @@ -0,0 +1,402 @@ +/* Random kit 1.3 */ + +/* + * Copyright (c) 2003-2005, Jean-Sebastien Roy (js@jeannot.org) + * + * The rk_random and rk_seed functions algorithms and the original design of + * the Mersenne Twister RNG: + * + * Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura, + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. The names of its contributors may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Original algorithm for the implementation of rk_interval function from + * Richard J. Wagner's implementation of the Mersenne Twister RNG, optimised by + * Magnus Jonsson. + * + * Constants used in the rk_double implementation by Isaku Wada. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/* static char const rcsid[] = + "@(#) $Jeannot: randomkit.c,v 1.28 2005/07/21 22:14:09 js Exp $"; */ +#include +#include +#include +#include +#include +#include + +#ifdef _WIN32 +/* + * Windows + * XXX: we have to use this ugly defined(__GNUC__) because it is not easy to + * detect the compiler used in distutils itself + */ +#if (defined(__GNUC__) && defined(NPY_NEEDS_MINGW_TIME_WORKAROUND)) + +/* + * FIXME: ideally, we should set this to the real version of MSVCRT. We need + * something higher than 0x601 to enable _ftime64 and co + */ +#define __MSVCRT_VERSION__ 0x0700 +#include +#include + +/* + * mingw msvcr lib import wrongly export _ftime, which does not exist in the + * actual msvc runtime for version >= 8; we make it an alias to _ftime64, which + * is available in those versions of the runtime + */ +#define _FTIME(x) _ftime64((x)) +#else +#include +#include +#define _FTIME(x) _ftime((x)) +#endif + +#ifndef RK_NO_WINCRYPT +/* Windows crypto */ +#ifndef _WIN32_WINNT +#define _WIN32_WINNT 0x0400 +#endif +#include +#include +#endif + +#else +/* Unix */ +#include +#include +#include +#endif + +#include "randomkit.h" + +#ifndef RK_DEV_URANDOM +#define RK_DEV_URANDOM "/dev/urandom" +#endif + +#ifndef RK_DEV_RANDOM +#define RK_DEV_RANDOM "/dev/random" +#endif + +char *rk_strerror[RK_ERR_MAX] = +{ + "no error", + "random device unvavailable" +}; + +/* static functions */ +static unsigned long rk_hash(unsigned long key); + +void +rk_seed(unsigned long seed, rk_state *state) +{ + int pos; + seed &= 0xffffffffUL; + + /* Knuth's PRNG as used in the Mersenne Twister reference implementation */ + for (pos = 0; pos < RK_STATE_LEN; pos++) { + state->key[pos] = seed; + seed = (1812433253UL * (seed ^ (seed >> 30)) + pos + 1) & 0xffffffffUL; + } + state->pos = RK_STATE_LEN; + state->gauss = 0; + state->has_gauss = 0; + state->has_binomial = 0; +} + +/* Thomas Wang 32 bits integer hash function */ +unsigned long +rk_hash(unsigned long key) +{ + key += ~(key << 15); + key ^= (key >> 10); + key += (key << 3); + key ^= (key >> 6); + key += ~(key << 11); + key ^= (key >> 16); + return key; +} + +rk_error +rk_randomseed(rk_state *state) +{ +#ifndef _WIN32 + struct timeval tv; +#else + struct _timeb tv; +#endif + int i; + + if (rk_devfill(state->key, sizeof(state->key), 0) == RK_NOERR) { + /* ensures non-zero key */ + state->key[0] |= 0x80000000UL; + state->pos = RK_STATE_LEN; + state->gauss = 0; + state->has_gauss = 0; + 
state->has_binomial = 0; + + for (i = 0; i < 624; i++) { + state->key[i] &= 0xffffffffUL; + } + return RK_NOERR; + } + +#ifndef _WIN32 + gettimeofday(&tv, NULL); + rk_seed(rk_hash(getpid()) ^ rk_hash(tv.tv_sec) ^ rk_hash(tv.tv_usec) + ^ rk_hash(clock()), state); +#else + _FTIME(&tv); + rk_seed(rk_hash(tv.time) ^ rk_hash(tv.millitm) ^ rk_hash(clock()), state); +#endif + + return RK_ENODEV; +} + +/* Magic Mersenne Twister constants */ +#define N 624 +#define M 397 +#define MATRIX_A 0x9908b0dfUL +#define UPPER_MASK 0x80000000UL +#define LOWER_MASK 0x7fffffffUL + +/* Slightly optimised reference implementation of the Mersenne Twister */ +unsigned long +rk_random(rk_state *state) +{ + unsigned long y; + + if (state->pos == RK_STATE_LEN) { + int i; + + for (i = 0; i < N - M; i++) { + y = (state->key[i] & UPPER_MASK) | (state->key[i+1] & LOWER_MASK); + state->key[i] = state->key[i+M] ^ (y>>1) ^ (-(y & 1) & MATRIX_A); + } + for (; i < N - 1; i++) { + y = (state->key[i] & UPPER_MASK) | (state->key[i+1] & LOWER_MASK); + state->key[i] = state->key[i+(M-N)] ^ (y>>1) ^ (-(y & 1) & MATRIX_A); + } + y = (state->key[N - 1] & UPPER_MASK) | (state->key[0] & LOWER_MASK); + state->key[N - 1] = state->key[M - 1] ^ (y >> 1) ^ (-(y & 1) & MATRIX_A); + + state->pos = 0; + } + y = state->key[state->pos++]; + + /* Tempering */ + y ^= (y >> 11); + y ^= (y << 7) & 0x9d2c5680UL; + y ^= (y << 15) & 0xefc60000UL; + y ^= (y >> 18); + + return y; +} + +long +rk_long(rk_state *state) +{ + return rk_ulong(state) >> 1; +} + +unsigned long +rk_ulong(rk_state *state) +{ +#if ULONG_MAX <= 0xffffffffUL + return rk_random(state); +#else + return (rk_random(state) << 32) | (rk_random(state)); +#endif +} + +unsigned long +rk_interval(unsigned long max, rk_state *state) +{ + unsigned long mask = max, value; + + if (max == 0) { + return 0; + } + /* Smallest bit mask >= max */ + mask |= mask >> 1; + mask |= mask >> 2; + mask |= mask >> 4; + mask |= mask >> 8; + mask |= mask >> 16; +#if ULONG_MAX > 0xffffffffUL + mask |= mask >> 32; +#endif + + /* Search a random value in [0..mask] <= max */ +#if ULONG_MAX > 0xffffffffUL + if (max <= 0xffffffffUL) { + while ((value = (rk_random(state) & mask)) > max); + } + else { + while ((value = (rk_ulong(state) & mask)) > max); + } +#else + while ((value = (rk_ulong(state) & mask)) > max); +#endif + return value; +} + +double +rk_double(rk_state *state) +{ + /* shifts : 67108864 = 0x4000000, 9007199254740992 = 0x20000000000000 */ + long a = rk_random(state) >> 5, b = rk_random(state) >> 6; + return (a * 67108864.0 + b) / 9007199254740992.0; +} + +void +rk_fill(void *buffer, size_t size, rk_state *state) +{ + unsigned long r; + unsigned char *buf = (unsigned char *)buffer; + + for (; size >= 4; size -= 4) { + r = rk_random(state); + *(buf++) = r & 0xFF; + *(buf++) = (r >> 8) & 0xFF; + *(buf++) = (r >> 16) & 0xFF; + *(buf++) = (r >> 24) & 0xFF; + } + + if (!size) { + return; + } + r = rk_random(state); + for (; size; r >>= 8, size --) { + *(buf++) = (unsigned char)(r & 0xFF); + } +} + +rk_error +rk_devfill(void *buffer, size_t size, int strong) +{ +#ifndef _WIN32 + FILE *rfile; + int done; + + if (strong) { + rfile = fopen(RK_DEV_RANDOM, "rb"); + } + else { + rfile = fopen(RK_DEV_URANDOM, "rb"); + } + if (rfile == NULL) { + return RK_ENODEV; + } + done = fread(buffer, size, 1, rfile); + fclose(rfile); + if (done) { + return RK_NOERR; + } +#else + +#ifndef RK_NO_WINCRYPT + HCRYPTPROV hCryptProv; + BOOL done; + + if (!CryptAcquireContext(&hCryptProv, NULL, NULL, PROV_RSA_FULL, + CRYPT_VERIFYCONTEXT) || 
!hCryptProv) { + return RK_ENODEV; + } + done = CryptGenRandom(hCryptProv, size, (unsigned char *)buffer); + CryptReleaseContext(hCryptProv, 0); + if (done) { + return RK_NOERR; + } +#endif + +#endif + return RK_ENODEV; +} + +rk_error +rk_altfill(void *buffer, size_t size, int strong, rk_state *state) +{ + rk_error err; + + err = rk_devfill(buffer, size, strong); + if (err) { + rk_fill(buffer, size, state); + } + return err; +} + +double +rk_gauss(rk_state *state) +{ + if (state->has_gauss) { + const double tmp = state->gauss; + state->gauss = 0; + state->has_gauss = 0; + return tmp; + } + else { + double f, x1, x2, r2; + + do { + x1 = 2.0*rk_double(state) - 1.0; + x2 = 2.0*rk_double(state) - 1.0; + r2 = x1*x1 + x2*x2; + } + while (r2 >= 1.0 || r2 == 0.0); + + /* Box-Muller transform */ + f = sqrt(-2.0*log(r2)/r2); + /* Keep for next call */ + state->gauss = f*x1; + state->has_gauss = 1; + return f*x2; + } +} diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/brianlib/randomkit/randomkit.h b/parallel_execution/parallel_execution/code/MushroomBody/genn/brianlib/randomkit/randomkit.h new file mode 100644 index 00000000..e049488e --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/brianlib/randomkit/randomkit.h @@ -0,0 +1,189 @@ +/* Random kit 1.3 */ + +/* + * Copyright (c) 2003-2005, Jean-Sebastien Roy (js@jeannot.org) + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/* @(#) $Jeannot: randomkit.h,v 1.24 2005/07/21 22:14:09 js Exp $ */ + +/* + * Typical use: + * + * { + * rk_state state; + * unsigned long seed = 1, random_value; + * + * rk_seed(seed, &state); // Initialize the RNG + * ... + * random_value = rk_random(&state); // Generate random values in [0..RK_MAX] + * } + * + * Instead of rk_seed, you can use rk_randomseed which will get a random seed + * from /dev/urandom (or the clock, if /dev/urandom is unavailable): + * + * { + * rk_state state; + * unsigned long random_value; + * + * rk_randomseed(&state); // Initialize the RNG with a random seed + * ... + * random_value = rk_random(&state); // Generate random values in [0..RK_MAX] + * } + */ + +/* + * Useful macro: + * RK_DEV_RANDOM: the device used for random seeding. 
+ * defaults to "/dev/urandom" + */ + +#include + +#ifndef _RANDOMKIT_ +#define _RANDOMKIT_ + +#define RK_STATE_LEN 624 + +typedef struct rk_state_ +{ + unsigned long key[RK_STATE_LEN]; + int pos; + int has_gauss; /* !=0: gauss contains a gaussian deviate */ + double gauss; + + /* The rk_state structure has been extended to store the following + * information for the binomial generator. If the input values of n or p + * are different than nsave and psave, then the other parameters will be + * recomputed. RTK 2005-09-02 */ + + int has_binomial; /* !=0: following parameters initialized for + binomial */ + double psave; + long nsave; + double r; + double q; + double fm; + long m; + double p1; + double xm; + double xl; + double xr; + double c; + double laml; + double lamr; + double p2; + double p3; + double p4; + +} +rk_state; + +typedef enum { + RK_NOERR = 0, /* no error */ + RK_ENODEV = 1, /* no RK_DEV_RANDOM device */ + RK_ERR_MAX = 2 +} rk_error; + +/* error strings */ +extern char *rk_strerror[RK_ERR_MAX]; + +/* Maximum generated random value */ +#define RK_MAX 0xFFFFFFFFUL + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Initialize the RNG state using the given seed. + */ +extern void rk_seed(unsigned long seed, rk_state *state); + +/* + * Initialize the RNG state using a random seed. + * Uses /dev/random or, when unavailable, the clock (see randomkit.c). + * Returns RK_NOERR when no errors occurs. + * Returns RK_ENODEV when the use of RK_DEV_RANDOM failed (for example because + * there is no such device). In this case, the RNG was initialized using the + * clock. + */ +extern rk_error rk_randomseed(rk_state *state); + +/* + * Returns a random unsigned long between 0 and RK_MAX inclusive + */ +extern unsigned long rk_random(rk_state *state); + +/* + * Returns a random long between 0 and LONG_MAX inclusive + */ +extern long rk_long(rk_state *state); + +/* + * Returns a random unsigned long between 0 and ULONG_MAX inclusive + */ +extern unsigned long rk_ulong(rk_state *state); + +/* + * Returns a random unsigned long between 0 and max inclusive. + */ +extern unsigned long rk_interval(unsigned long max, rk_state *state); + +/* + * Returns a random double between 0.0 and 1.0, 1.0 excluded. + */ +extern double rk_double(rk_state *state); + +/* + * fill the buffer with size random bytes + */ +extern void rk_fill(void *buffer, size_t size, rk_state *state); + +/* + * fill the buffer with randombytes from the random device + * Returns RK_ENODEV if the device is unavailable, or RK_NOERR if it is + * On Unix, if strong is defined, RK_DEV_RANDOM is used. If not, RK_DEV_URANDOM + * is used instead. This parameter has no effect on Windows. + * Warning: on most unixes RK_DEV_RANDOM will wait for enough entropy to answer + * which can take a very long time on quiet systems. + */ +extern rk_error rk_devfill(void *buffer, size_t size, int strong); + +/* + * fill the buffer using rk_devfill if the random device is available and using + * rk_fill if is is not + * parameters have the same meaning as rk_fill and rk_devfill + * Returns RK_ENODEV if the device is unavailable, or RK_NOERR if it is + */ +extern rk_error rk_altfill(void *buffer, size_t size, int strong, + rk_state *state); + +/* + * return a random gaussian deviate with variance unity and zero mean. 
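 * Uses the polar Box-Muller method; the second deviate of each generated
 * pair is cached in rk_state and returned by the following call.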
+ */ +extern double rk_gauss(rk_state *state); + +#ifdef __cplusplus +} +#endif + +#endif /* _RANDOMKIT_ */ diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/brianlib/spikequeue.h b/parallel_execution/parallel_execution/code/MushroomBody/genn/brianlib/spikequeue.h new file mode 100644 index 00000000..10a84c9f --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/brianlib/spikequeue.h @@ -0,0 +1,204 @@ +#include +#include +#include +#include +#include +#include"stdint_compat.h" +#include +using namespace std; + +//TODO: The data type for indices is currently fixed (int), all floating point +// variables (delays, dt) are assumed to use the same data type +class CSpikeQueue +{ +public: + vector< vector > queue; // queue[(offset+i)%queue.size()] is delay i relative to current time + double dt; + int offset; + bool scalar_delay; + int *delays; + int32_t source_start; + int32_t source_end; + int openmp_padding; + vector< vector > synapses; + // data structures for the store/restore mechanism + + CSpikeQueue(int _source_start, int _source_end) + : source_start(_source_start), source_end(_source_end) + { + queue.resize(1); + offset = 0; + dt = 0.0; + delays = NULL; + openmp_padding = 0; + scalar_delay = 0; + }; + + ~CSpikeQueue() + { + if (delays) + { + delete[] delays; + delays = NULL; + } + } + + template void prepare(scalar *real_delays, int n_delays, + int32_t *sources, int n_synapses, + double _dt) + { + + assert(n_delays == 1 || n_delays == n_synapses); + + if (delays) + delete [] delays; + + if (dt != 0.0 && dt != _dt) + { + // dt changed, we have to get the old spikes out of the queue and + // reinsert them at the correct positions + vector< vector > queue_copy = queue; // does a real copy + const double conversion_factor = dt / _dt; + const size_t oldsize = queue.size(); + const size_t newsize = (int)(oldsize * conversion_factor) + 1; + queue.clear(); + queue.resize(newsize); + for (size_t i=0; i spikes = queue_copy[(i + offset) % oldsize]; + queue[(int)(i * conversion_factor + 0.5)] = spikes; + } + offset = 0; + } + + delays = new int[n_delays]; + synapses.clear(); + synapses.resize(source_end - source_start); + + // Note that n_synapses and n_delays do not have to be identical + // (homogeneous delays are stored as a single scalar), we therefore + // use two independent loops to initialize the delays and the synapses + // array + scalar first_delay = n_delays > 0 ? 
real_delays[0] : 0.0; + int min_delay = (int)(first_delay / _dt + 0.5); + int max_delay = min_delay; + for (int i=0; i max_delay) + max_delay = delays[i]; + else if (delays[i] < min_delay) + min_delay = delays[i]; + } + for (int i=0; i > > _full_state() + { + pair > > state(offset, queue); + return state; + } + + void _clear() + { + } + + void _restore_from_full_state(const pair > > state) + { + int stored_offset = state.first; + vector< vector > stored_queue = state.second; + size_t size = stored_queue.size(); + queue.clear(); + if (size == 0) // the queue did not exist at the time of the store call + size = 1; + queue.resize(size); + for (size_t i=0; i= (int)queue.size()) + { + expand(delay+1); + } + }; + + void push(int32_t *spikes, int nspikes) + { + if(nspikes==0) return; + const int start = static_cast(distance(spikes, lower_bound(spikes, spikes+nspikes, source_start))); + const int stop = static_cast(distance(spikes, upper_bound(spikes, spikes+nspikes, source_end-1))); + const int32_t * __restrict rspikes = spikes; + if(scalar_delay) + { + vector &homog_queue = queue[(offset+delays[0])%queue.size()]; + for(int idx_spike=start; idx_spike* peek() + { + return &queue[offset]; + }; + + void advance() + { + // empty the current queue, note that for most compilers this shouldn't deallocate the memory, + // although VC<=7.1 will, so it will be less efficient with that compiler + queue[offset].clear(); + // and advance to the next offset + offset = (offset+1)%queue.size(); + }; +}; diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/brianlib/stdint_compat.h b/parallel_execution/parallel_execution/code/MushroomBody/genn/brianlib/stdint_compat.h new file mode 100644 index 00000000..40e577e7 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/brianlib/stdint_compat.h @@ -0,0 +1,20 @@ +#ifndef _BRIAN_STDINT_COMPAT_H +#define _BRIAN_STDINT_COMPAT_H +// Work around the fact that older MSVC versions don't have stdint.h +#ifdef _MSC_VER +typedef __int32 int32_t; +typedef __int64 int64_t; +typedef unsigned __int64 uint64_t; +#else +#include +#endif +// Implement the int_ function here so that it can also be used from Cython +template inline int int_(T value) +{ + return (int)value; +} +template<> inline int int_(bool value) +{ + return value ? 
1 : 0; +} +#endif diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/spikegeneratorgroup_codeobject.cpp b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/spikegeneratorgroup_codeobject.cpp new file mode 100644 index 00000000..c3a59953 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/spikegeneratorgroup_codeobject.cpp @@ -0,0 +1,132 @@ +#include "objects.h" +#include "code_objects/spikegeneratorgroup_codeobject.h" +#include "objects.h" +#include "brianlib/common_math.h" +#include "brianlib/stdint_compat.h" +#include +#include +#include +#include +#include +#include "magicnetwork_model_CODE/definitions.h" + +////// SUPPORT CODE /////// +namespace spikegeneratorgroup_codeobject { + + template < typename T1, typename T2 > struct _higher_type; + template < > struct _higher_type { typedef int type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < typename T1, typename T2 > + static inline typename _higher_type::type + _brian_mod(T1 x, T2 y) + {{ + return x-y*floor(1.0*x/y); + }} + template < typename T1, typename T2 > + static inline typename _higher_type::type + _brian_floordiv(T1 x, T2 y) + {{ + return floor(1.0*x/y); + }} + #ifdef _MSC_VER + #define _brian_pow(x, y) (pow((double)(x), (y))) + #else + #define _brian_pow(x, y) (pow((x), (y))) + #endif + +} + +////// HASH 
DEFINES /////// + + + +void _run_spikegeneratorgroup_codeobject() +{ + using namespace brian; +using namespace spikegeneratorgroup_codeobject; + + + ///// CONSTANTS /////////// + const int _num_period_bins = 1; +int32_t* const _array_spikegeneratorgroup__timebins = &_dynamic_array_spikegeneratorgroup__timebins[0]; +const int _num_timebins = _dynamic_array_spikegeneratorgroup__timebins.size(); +const int _num_lastindex = 1; +const int _num_spikespace = 101; +int32_t* const _array_spikegeneratorgroup_neuron_index = &_dynamic_array_spikegeneratorgroup_neuron_index[0]; +const int _numneuron_index = _dynamic_array_spikegeneratorgroup_neuron_index.size(); +int32_t* const _array_spikegeneratorgroup_spike_number = &_dynamic_array_spikegeneratorgroup_spike_number[0]; +const int _numspike_number = _dynamic_array_spikegeneratorgroup_spike_number.size(); + ///// POINTERS //////////// + + int32_t* _ptr_array_spikegeneratorgroup__period_bins = _array_spikegeneratorgroup__period_bins; + int32_t* __restrict _ptr_array_spikegeneratorgroup__timebins = _array_spikegeneratorgroup__timebins; + int32_t* _ptr_array_spikegeneratorgroup__lastindex = _array_spikegeneratorgroup__lastindex; + int32_t* __restrict _ptr_array_spikegeneratorgroup__spikespace = _array_spikegeneratorgroup__spikespace; + int32_t* __restrict _ptr_array_spikegeneratorgroup_neuron_index = _array_spikegeneratorgroup_neuron_index; + int32_t* __restrict _ptr_array_spikegeneratorgroup_spike_number = _array_spikegeneratorgroup_spike_number; + + + + const int32_t _the_period = _ptr_array_spikegeneratorgroup__period_bins[0]; + int32_t _timebin = iT; + + if (_the_period > 0) { + _timebin %= _the_period; + // If there is a periodicity in the SpikeGenerator, we need to reset the + // lastindex when the period has passed + if (_ptr_array_spikegeneratorgroup__lastindex[0] > 0 && _ptr_array_spikegeneratorgroup__timebins[_ptr_array_spikegeneratorgroup__lastindex[0] - 1] >= _timebin) + _ptr_array_spikegeneratorgroup__lastindex[0] = 0; + } + + int32_t _cpp_numspikes = 0; + + for(int _idx=_ptr_array_spikegeneratorgroup__lastindex[0]; _idx < _num_timebins; _idx++) + { + if (_ptr_array_spikegeneratorgroup__timebins[_idx] > _timebin) + break; + + spike_spikegeneratorgroup[_cpp_numspikes++] = _ptr_array_spikegeneratorgroup_neuron_index[_idx]; + } + + spikeCount_spikegeneratorgroup = _cpp_numspikes; + + _ptr_array_spikegeneratorgroup__lastindex[0] += _cpp_numspikes; + + +} + + diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/spikegeneratorgroup_codeobject.h b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/spikegeneratorgroup_codeobject.h new file mode 100644 index 00000000..5afcfe58 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/spikegeneratorgroup_codeobject.h @@ -0,0 +1,7 @@ +#ifndef _INCLUDED_spikegeneratorgroup_codeobject +#define _INCLUDED_spikegeneratorgroup_codeobject + +void _run_spikegeneratorgroup_codeobject(); + + +#endif diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/spikemonitor_1_codeobject.cpp b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/spikemonitor_1_codeobject.cpp new file mode 100644 index 00000000..d000e4c5 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/spikemonitor_1_codeobject.cpp @@ -0,0 +1,131 @@ +#include "objects.h" +#include "code_objects/spikemonitor_1_codeobject.h" +#include "objects.h" +#include 
"brianlib/common_math.h" +#include "brianlib/stdint_compat.h" +#include +#include +#include +#include +#include +#include "magicnetwork_model_CODE/definitions.h" + +////// SUPPORT CODE /////// +namespace spikemonitor_1_codeobject { + + template < typename T1, typename T2 > struct _higher_type; + template < > struct _higher_type { typedef int type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < typename T1, typename T2 > + static inline typename _higher_type::type + _brian_mod(T1 x, T2 y) + {{ + return x-y*floor(1.0*x/y); + }} + template < typename T1, typename T2 > + static inline typename _higher_type::type + _brian_floordiv(T1 x, T2 y) + {{ + return floor(1.0*x/y); + }} + #ifdef _MSC_VER + #define _brian_pow(x, y) (pow((double)(x), (y))) + #else + #define _brian_pow(x, y) (pow((x), (y))) + #endif + +} + +////// HASH DEFINES /////// + + + +void _run_spikemonitor_1_codeobject() +{ + using namespace brian; +using namespace spikemonitor_1_codeobject; + + + ///// CONSTANTS /////////// + int32_t* const _array_spikemonitor_1_i = &_dynamic_array_spikemonitor_1_i[0]; +const int _numi = _dynamic_array_spikemonitor_1_i.size(); +const int _numN = 1; +const int _numcount = 2500; +const int _num_clock_t = 1; +double* const _array_spikemonitor_1_t = &_dynamic_array_spikemonitor_1_t[0]; +const int _numt = _dynamic_array_spikemonitor_1_t.size(); +const int _num_spikespace = 2501; 
+const int _num_source_i = 2500; +const int _num_source_t = 1; +const int _num_source_idx = 2500; + ///// POINTERS //////////// + + int32_t* __restrict _ptr_array_spikemonitor_1_i = _array_spikemonitor_1_i; + int32_t* _ptr_array_spikemonitor_1_N = _array_spikemonitor_1_N; + int32_t* __restrict _ptr_array_spikemonitor_1_count = _array_spikemonitor_1_count; + double* _ptr_array_defaultclock_t = _array_defaultclock_t; + double* __restrict _ptr_array_spikemonitor_1_t = _array_spikemonitor_1_t; + int32_t* __restrict _ptr_array_neurongroup__spikespace = _array_neurongroup__spikespace; + int32_t* __restrict _ptr_array_neurongroup_i = _array_neurongroup_i; + int32_t* __restrict _ptr_array_spikemonitor_1__source_idx = _array_spikemonitor_1__source_idx; + + + //// MAIN CODE //////////// + int32_t _num_events = spikeCount_neurongroup; + + + + if (_num_events > 0) + { + unsigned int _true_events= 0; + for(int _j=0; _j<_num_events; _j++) + { + const int _idx = spike_neurongroup[_j]; + if ((_idx >= 0) && (_idx < 2500)) { + _dynamic_array_spikemonitor_1_i.push_back(_idx - 0); + _dynamic_array_spikemonitor_1_t.push_back(t); + _ptr_array_spikemonitor_1_count[_idx-0]++; + _true_events++; + } + } + _ptr_array_spikemonitor_1_N[0] += _true_events; + } + + +} + + diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/spikemonitor_1_codeobject.h b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/spikemonitor_1_codeobject.h new file mode 100644 index 00000000..951ae2d8 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/spikemonitor_1_codeobject.h @@ -0,0 +1,7 @@ +#ifndef _INCLUDED_spikemonitor_1_codeobject +#define _INCLUDED_spikemonitor_1_codeobject + +void _run_spikemonitor_1_codeobject(); + + +#endif diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/spikemonitor_2_codeobject.cpp b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/spikemonitor_2_codeobject.cpp new file mode 100644 index 00000000..f692cc15 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/spikemonitor_2_codeobject.cpp @@ -0,0 +1,131 @@ +#include "objects.h" +#include "code_objects/spikemonitor_2_codeobject.h" +#include "objects.h" +#include "brianlib/common_math.h" +#include "brianlib/stdint_compat.h" +#include +#include +#include +#include +#include +#include "magicnetwork_model_CODE/definitions.h" + +////// SUPPORT CODE /////// +namespace spikemonitor_2_codeobject { + + template < typename T1, typename T2 > struct _higher_type; + template < > struct _higher_type { typedef int type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + 
template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < typename T1, typename T2 > + static inline typename _higher_type::type + _brian_mod(T1 x, T2 y) + {{ + return x-y*floor(1.0*x/y); + }} + template < typename T1, typename T2 > + static inline typename _higher_type::type + _brian_floordiv(T1 x, T2 y) + {{ + return floor(1.0*x/y); + }} + #ifdef _MSC_VER + #define _brian_pow(x, y) (pow((double)(x), (y))) + #else + #define _brian_pow(x, y) (pow((x), (y))) + #endif + +} + +////// HASH DEFINES /////// + + + +void _run_spikemonitor_2_codeobject() +{ + using namespace brian; +using namespace spikemonitor_2_codeobject; + + + ///// CONSTANTS /////////// + int32_t* const _array_spikemonitor_2_i = &_dynamic_array_spikemonitor_2_i[0]; +const int _numi = _dynamic_array_spikemonitor_2_i.size(); +const int _numN = 1; +const int _numcount = 100; +const int _num_clock_t = 1; +double* const _array_spikemonitor_2_t = &_dynamic_array_spikemonitor_2_t[0]; +const int _numt = _dynamic_array_spikemonitor_2_t.size(); +const int _num_spikespace = 101; +const int _num_source_i = 100; +const int _num_source_t = 1; +const int _num_source_idx = 100; + ///// POINTERS //////////// + + int32_t* __restrict _ptr_array_spikemonitor_2_i = _array_spikemonitor_2_i; + int32_t* _ptr_array_spikemonitor_2_N = _array_spikemonitor_2_N; + int32_t* __restrict _ptr_array_spikemonitor_2_count = _array_spikemonitor_2_count; + double* _ptr_array_defaultclock_t = _array_defaultclock_t; + double* __restrict _ptr_array_spikemonitor_2_t = _array_spikemonitor_2_t; + int32_t* __restrict _ptr_array_neurongroup_1__spikespace = _array_neurongroup_1__spikespace; + int32_t* __restrict _ptr_array_neurongroup_1_i = _array_neurongroup_1_i; + int32_t* __restrict _ptr_array_spikemonitor_2__source_idx = _array_spikemonitor_2__source_idx; + + + //// MAIN CODE //////////// + int32_t _num_events = spikeCount_neurongroup_1; + + + + if (_num_events > 0) + { + unsigned int _true_events= 0; + for(int _j=0; _j<_num_events; _j++) + { + const int _idx = spike_neurongroup_1[_j]; + if ((_idx >= 0) && (_idx < 100)) { + _dynamic_array_spikemonitor_2_i.push_back(_idx - 0); + _dynamic_array_spikemonitor_2_t.push_back(t); + _ptr_array_spikemonitor_2_count[_idx-0]++; + _true_events++; + } + } + 
_ptr_array_spikemonitor_2_N[0] += _true_events; + } + + +} + + diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/spikemonitor_2_codeobject.h b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/spikemonitor_2_codeobject.h new file mode 100644 index 00000000..a60d58cd --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/spikemonitor_2_codeobject.h @@ -0,0 +1,7 @@ +#ifndef _INCLUDED_spikemonitor_2_codeobject +#define _INCLUDED_spikemonitor_2_codeobject + +void _run_spikemonitor_2_codeobject(); + + +#endif diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/spikemonitor_codeobject.cpp b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/spikemonitor_codeobject.cpp new file mode 100644 index 00000000..c1a8a214 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/spikemonitor_codeobject.cpp @@ -0,0 +1,131 @@ +#include "objects.h" +#include "code_objects/spikemonitor_codeobject.h" +#include "objects.h" +#include "brianlib/common_math.h" +#include "brianlib/stdint_compat.h" +#include +#include +#include +#include +#include +#include "magicnetwork_model_CODE/definitions.h" + +////// SUPPORT CODE /////// +namespace spikemonitor_codeobject { + + template < typename T1, typename T2 > struct _higher_type; + template < > struct _higher_type { typedef int type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct 
_higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < typename T1, typename T2 > + static inline typename _higher_type::type + _brian_mod(T1 x, T2 y) + {{ + return x-y*floor(1.0*x/y); + }} + template < typename T1, typename T2 > + static inline typename _higher_type::type + _brian_floordiv(T1 x, T2 y) + {{ + return floor(1.0*x/y); + }} + #ifdef _MSC_VER + #define _brian_pow(x, y) (pow((double)(x), (y))) + #else + #define _brian_pow(x, y) (pow((x), (y))) + #endif + +} + +////// HASH DEFINES /////// + + + +void _run_spikemonitor_codeobject() +{ + using namespace brian; +using namespace spikemonitor_codeobject; + + + ///// CONSTANTS /////////// + int32_t* const _array_spikemonitor_i = &_dynamic_array_spikemonitor_i[0]; +const int _numi = _dynamic_array_spikemonitor_i.size(); +const int _numN = 1; +const int _numcount = 100; +const int _num_clock_t = 1; +double* const _array_spikemonitor_t = &_dynamic_array_spikemonitor_t[0]; +const int _numt = _dynamic_array_spikemonitor_t.size(); +const int _num_spikespace = 101; +const int _num_source_i = 100; +const int _num_source_t = 1; +const int _num_source_idx = 100; + ///// POINTERS //////////// + + int32_t* __restrict _ptr_array_spikemonitor_i = _array_spikemonitor_i; + int32_t* _ptr_array_spikemonitor_N = _array_spikemonitor_N; + int32_t* __restrict _ptr_array_spikemonitor_count = _array_spikemonitor_count; + double* _ptr_array_defaultclock_t = _array_defaultclock_t; + double* __restrict _ptr_array_spikemonitor_t = _array_spikemonitor_t; + int32_t* __restrict _ptr_array_spikegeneratorgroup__spikespace = _array_spikegeneratorgroup__spikespace; + int32_t* __restrict _ptr_array_spikegeneratorgroup_i = _array_spikegeneratorgroup_i; + int32_t* __restrict _ptr_array_spikemonitor__source_idx = _array_spikemonitor__source_idx; + + + //// MAIN CODE //////////// + int32_t _num_events = spikeCount_spikegeneratorgroup; + + + + if (_num_events > 0) + { + unsigned int _true_events= 0; + for(int _j=0; _j<_num_events; _j++) + { + const int _idx = spike_spikegeneratorgroup[_j]; + if ((_idx >= 0) && (_idx < 100)) { + _dynamic_array_spikemonitor_i.push_back(_idx - 0); + _dynamic_array_spikemonitor_t.push_back(t); + _ptr_array_spikemonitor_count[_idx-0]++; + _true_events++; + } + } + _ptr_array_spikemonitor_N[0] += _true_events; + } + + +} + + diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/spikemonitor_codeobject.h b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/spikemonitor_codeobject.h new file mode 100644 index 00000000..d7af5793 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/spikemonitor_codeobject.h @@ -0,0 +1,7 @@ +#ifndef _INCLUDED_spikemonitor_codeobject +#define _INCLUDED_spikemonitor_codeobject + +void _run_spikemonitor_codeobject(); + + +#endif diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_1_group_variable_set_conditional_codeobject.cpp b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_1_group_variable_set_conditional_codeobject.cpp new file mode 100644 index 00000000..42cec67a --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_1_group_variable_set_conditional_codeobject.cpp @@ -0,0 +1,125 @@ +#include "objects.h" +#include "code_objects/synapses_1_group_variable_set_conditional_codeobject.h" +#include "objects.h" +#include 
"brianlib/common_math.h" +#include "brianlib/stdint_compat.h" +#include +#include +#include +#include +#include + +////// SUPPORT CODE /////// +namespace synapses_1_group_variable_set_conditional_codeobject { + + double _rand(const int _vectorisation_idx) { + return rk_double(brian::_mersenne_twister_states[0]); + } + template < typename T1, typename T2 > struct _higher_type; + template < > struct _higher_type { typedef int type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < typename T1, typename T2 > + static inline typename _higher_type::type + _brian_mod(T1 x, T2 y) + {{ + return x-y*floor(1.0*x/y); + }} + template < typename T1, typename T2 > + static inline typename _higher_type::type + _brian_floordiv(T1 x, T2 y) + {{ + return floor(1.0*x/y); + }} + #ifdef _MSC_VER + #define _brian_pow(x, y) (pow((double)(x), (y))) + #else + #define _brian_pow(x, y) (pow((x), (y))) + #endif + +} + +////// HASH DEFINES /////// + + + +void _run_synapses_1_group_variable_set_conditional_codeobject() +{ + using namespace brian; +using namespace synapses_1_group_variable_set_conditional_codeobject; + + + ///// CONSTANTS /////////// + const int _numN = 1; +double* const _array_synapses_1_g_raw = &_dynamic_array_synapses_1_g_raw[0]; +const int _numg_raw = _dynamic_array_synapses_1_g_raw.size(); + ///// POINTERS //////////// + + int32_t* _ptr_array_synapses_1_N = _array_synapses_1_N; + 
double* __restrict _ptr_array_synapses_1_g_raw = _array_synapses_1_g_raw; + + +//// MAIN CODE //////////// +// scalar code +const size_t _vectorisation_idx = -1; + + + + + +const int _N = _array_synapses_1_N[0]; + + +for(int _idx=0; _idx<_N; _idx++) +{ + // vector code + const size_t _vectorisation_idx = _idx; + + const char _cond = true; + + if (_cond) + { + + double g_raw; + g_raw = 1.0f*(1.0f*(_rand(_vectorisation_idx) * 3.7500000000000005e-09)/10)/1.0; + _ptr_array_synapses_1_g_raw[_idx] = g_raw; + + } +} + +} + + diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_1_group_variable_set_conditional_codeobject.h b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_1_group_variable_set_conditional_codeobject.h new file mode 100644 index 00000000..d3ba3171 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_1_group_variable_set_conditional_codeobject.h @@ -0,0 +1,7 @@ +#ifndef _INCLUDED_synapses_1_group_variable_set_conditional_codeobject +#define _INCLUDED_synapses_1_group_variable_set_conditional_codeobject + +void _run_synapses_1_group_variable_set_conditional_codeobject(); + + +#endif diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_1_group_variable_set_conditional_codeobject_1.cpp b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_1_group_variable_set_conditional_codeobject_1.cpp new file mode 100644 index 00000000..2c368d65 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_1_group_variable_set_conditional_codeobject_1.cpp @@ -0,0 +1,128 @@ +#include "objects.h" +#include "code_objects/synapses_1_group_variable_set_conditional_codeobject_1.h" +#include "objects.h" +#include "brianlib/common_math.h" +#include "brianlib/stdint_compat.h" +#include +#include +#include +#include +#include + +////// SUPPORT CODE /////// +namespace synapses_1_group_variable_set_conditional_codeobject_1 { + + double _rand(const int _vectorisation_idx) { + return rk_double(brian::_mersenne_twister_states[0]); + } + double _randn(const int _vectorisation_idx) { + return rk_gauss(brian::_mersenne_twister_states[0]); + } + template < typename T1, typename T2 > struct _higher_type; + template < > struct _higher_type { typedef int type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct 
_higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < typename T1, typename T2 > + static inline typename _higher_type::type + _brian_mod(T1 x, T2 y) + {{ + return x-y*floor(1.0*x/y); + }} + template < typename T1, typename T2 > + static inline typename _higher_type::type + _brian_floordiv(T1 x, T2 y) + {{ + return floor(1.0*x/y); + }} + #ifdef _MSC_VER + #define _brian_pow(x, y) (pow((double)(x), (y))) + #else + #define _brian_pow(x, y) (pow((x), (y))) + #endif + +} + +////// HASH DEFINES /////// + + + +void _run_synapses_1_group_variable_set_conditional_codeobject_1() +{ + using namespace brian; +using namespace synapses_1_group_variable_set_conditional_codeobject_1; + + + ///// CONSTANTS /////////// + const int _numN = 1; +double* const _array_synapses_1_g_raw = &_dynamic_array_synapses_1_g_raw[0]; +const int _numg_raw = _dynamic_array_synapses_1_g_raw.size(); + ///// POINTERS //////////// + + int32_t* _ptr_array_synapses_1_N = _array_synapses_1_N; + double* __restrict _ptr_array_synapses_1_g_raw = _array_synapses_1_g_raw; + + +//// MAIN CODE //////////// +// scalar code +const size_t _vectorisation_idx = -1; + + + + + +const int _N = _array_synapses_1_N[0]; + + +for(int _idx=0; _idx<_N; _idx++) +{ + // vector code + const size_t _vectorisation_idx = _idx; + + const char _cond = _rand(_vectorisation_idx) < 0.2; + + if (_cond) + { + + double g_raw; + g_raw = 1.0f*((2.5 * 1e-09) + ((0.5 * 1e-09) * _randn(_vectorisation_idx)))/1.0; + _ptr_array_synapses_1_g_raw[_idx] = g_raw; + + } +} + +} + + diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_1_group_variable_set_conditional_codeobject_1.h b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_1_group_variable_set_conditional_codeobject_1.h new file mode 100644 index 00000000..b45c23fe --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_1_group_variable_set_conditional_codeobject_1.h @@ -0,0 +1,7 @@ +#ifndef _INCLUDED_synapses_1_group_variable_set_conditional_codeobject_1 +#define _INCLUDED_synapses_1_group_variable_set_conditional_codeobject_1 + +void _run_synapses_1_group_variable_set_conditional_codeobject_1(); + + +#endif diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_1_max_row_length.cpp b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_1_max_row_length.cpp new file mode 100644 index 00000000..883b22d9 --- /dev/null +++ 
b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_1_max_row_length.cpp @@ -0,0 +1,191 @@ + +#include "brianlib/common_math.h" +#include "brianlib/stdint_compat.h" +#include +#include +#include +#include +#include +#include "brianlib/stdint_compat.h" +#include "synapses_classes.h" + +////// SUPPORT CODE /////// +namespace synapses_1_max_row_length_generator { + + double _rand(const int _vectorisation_idx) { + return rk_double(brian::_mersenne_twister_states[0]); + } + template < typename T1, typename T2 > struct _higher_type; + template < > struct _higher_type { typedef int type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < typename T1, typename T2 > + static inline typename _higher_type::type + _brian_mod(T1 x, T2 y) + {{ + return x-y*floor(1.0*x/y); + }} + template < typename T1, typename T2 > + static inline typename _higher_type::type + _brian_floordiv(T1 x, T2 y) + {{ + return floor(1.0*x/y); + }} + #ifdef _MSC_VER + #define _brian_pow(x, y) (pow((double)(x), (y))) + #else + #define _brian_pow(x, y) (pow((x), (y))) + #endif + +} + +////// HASH DEFINES /////// + + + +void _run_synapses_1_max_row_length() +{ + using namespace brian; + using namespace synapses_1_max_row_length_generator; + + ///// CONSTANTS /////////// + const int _numN = 1; +const int _numN_incoming = _dynamic_array_synapses_1_N_incoming.size(); +const int 
_numN_outgoing = _dynamic_array_synapses_1_N_outgoing.size(); +const int _num_synaptic_pre = _dynamic_array_synapses_1__synaptic_pre.size(); +int32_t* const _array_synapses_1__synaptic_post = &_dynamic_array_synapses_1__synaptic_post[0]; +int32_t* const _array_synapses_1_N_incoming = &_dynamic_array_synapses_1_N_incoming[0]; +int32_t* const _array_synapses_1_N_outgoing = &_dynamic_array_synapses_1_N_outgoing[0]; +const int _num_synaptic_post = _dynamic_array_synapses_1__synaptic_post.size(); +int32_t* const _array_synapses_1__synaptic_pre = &_dynamic_array_synapses_1__synaptic_pre[0]; + + ///// POINTERS //////////// + + int32_t* _ptr_array_synapses_1_N = _array_synapses_1_N; + int32_t* __restrict _ptr_array_synapses_1_N_outgoing = _array_synapses_1_N_outgoing; + int32_t* __restrict _ptr_array_synapses_1_N_incoming = _array_synapses_1_N_incoming; + int32_t* __restrict _ptr_array_synapses_1__synaptic_post = _array_synapses_1__synaptic_post; + int32_t* __restrict _ptr_array_synapses_1__synaptic_pre = _array_synapses_1__synaptic_pre; + + + const size_t _N_pre = 2500; + const size_t _N_post = 100; + _dynamic_array_synapses_1_N_incoming.resize(_N_post + 0); + _dynamic_array_synapses_1_N_outgoing.resize(_N_pre + 0); + size_t _raw_pre_idx, _raw_post_idx; + // scalar code + const int _vectorisation_idx = -1; + + + + + + + + + for(size_t _i=0; _i<_N_pre; _i++) + { + bool __cond, _cond; + _raw_pre_idx = _i + 0; + { + + const char _cond = true; + + __cond = _cond; + } + _cond = __cond; + if(!_cond) continue; + // Some explanation of this hackery. The problem is that we have multiple code blocks. + // Each code block is generated independently of the others, and they declare variables + // at the beginning if necessary (including declaring them as const if their values don't + // change). However, if two code blocks follow each other in the same C++ scope then + // that causes a redeclaration error. So we solve it by putting each block inside a + // pair of braces to create a new scope specific to each code block. However, that brings + // up another problem: we need the values from these code blocks. I don't have a general + // solution to this problem, but in the case of this particular template, we know which + // values we need from them so we simply create outer scoped variables to copy the value + // into. Later on we have a slightly more complicated problem because the original name + // _j has to be used, so we create two variables __j, _j at the outer scope, copy + // _j to __j in the inner scope (using the inner scope version of _j), and then + // __j to _j in the outer scope (to the outer scope version of _j). This outer scope + // version of _j will then be used in subsequent blocks. 
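// Illustrative sketch of the scoping pattern described in the comment above
// (hypothetical names `j` / `j_exported`, standing in for the generated
// `_j` / `__j`; this snippet is not part of the generated file). Each
// independently generated block may declare its own const locals, so every
// block is wrapped in braces and the values other blocks need are handed out
// through outer-scope copies:
#include <iostream>

int main() {
    long j, j_exported;          // outer-scope copies shared between blocks
    {                            // block 1: declares its own const `j` (shadows the outer one)
        const int j = 3;
        j_exported = j;          // export the inner value
    }
    j = j_exported;              // make it available to later blocks under the original name
    {                            // block 2: can read the outer `j` and declare further const locals
        const long post_idx = j;
        std::cout << post_idx << "\n";
    }
    return 0;
}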
+ long _uiter_low; + long _uiter_high; + long _uiter_step; + { + + const int32_t _iter_low = 0; + const int32_t _iter_high = 100; + const int32_t _iter_step = 1; + + _uiter_low = _iter_low; + _uiter_high = _iter_high; + _uiter_step = _iter_step; + } + for(long _k=_uiter_low; _k<_uiter_high; _k+=_uiter_step) + { + long __j, _j, _pre_idx, __pre_idx; + { + + const int32_t _pre_idx = _raw_pre_idx; + const int32_t _j = _k; + + __j = _j; // pick up the locally scoped _j and store in __j + __pre_idx = _pre_idx; + } + _j = __j; // make the previously locally scoped _j available + _pre_idx = __pre_idx; + _raw_post_idx = _j + 0; + + if(_j<0 || _j>=_N_post) + { + cout << "Error: tried to create synapse to neuron j=" << _j << " outside range 0 to " << + _N_post-1 << endl; + exit(1); + } + + const int32_t _post_idx = _raw_post_idx; + const int32_t _n = 1; + + + for (size_t _repetition=0; _repetition<_n; _repetition++) { + _dynamic_array_synapses_1_N_outgoing[_pre_idx] += 1; + _dynamic_array_synapses_1_N_incoming[_post_idx] += 1; + } + } + } + +} \ No newline at end of file diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_1_post_push_spikes.cpp b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_1_post_push_spikes.cpp new file mode 100644 index 00000000..2e0ea12d --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_1_post_push_spikes.cpp @@ -0,0 +1,105 @@ +#include "objects.h" +#include "code_objects/synapses_1_post_push_spikes.h" +#include "objects.h" +#include "brianlib/common_math.h" +#include "brianlib/stdint_compat.h" +#include +#include +#include +#include +#include + +////// SUPPORT CODE /////// +namespace synapses_1_post_push_spikes { + + template < typename T1, typename T2 > struct _higher_type; + template < > struct _higher_type { typedef int type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { 
typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < typename T1, typename T2 > + static inline typename _higher_type::type + _brian_mod(T1 x, T2 y) + {{ + return x-y*floor(1.0*x/y); + }} + template < typename T1, typename T2 > + static inline typename _higher_type::type + _brian_floordiv(T1 x, T2 y) + {{ + return floor(1.0*x/y); + }} + #ifdef _MSC_VER + #define _brian_pow(x, y) (pow((double)(x), (y))) + #else + #define _brian_pow(x, y) (pow((x), (y))) + #endif + +} + +////// HASH DEFINES /////// + + + +void _run_synapses_1_post_push_spikes() +{ + using namespace brian; +using namespace synapses_1_post_push_spikes; + + + ///// CONSTANTS /////////// + double* const _array_synapses_1_delay_1 = &_dynamic_array_synapses_1_delay_1[0]; +const int _numdelay = _dynamic_array_synapses_1_delay_1.size(); +const int _num_spikespace = 101; +const int _num_source_dt = 1; + ///// POINTERS //////////// + + double* __restrict _ptr_array_synapses_1_delay_1 = _array_synapses_1_delay_1; + int32_t* __restrict _ptr_array_neurongroup_1__spikespace = _array_neurongroup_1__spikespace; + double* _ptr_array_defaultclock_dt = _array_defaultclock_dt; + + + // we do advance at the beginning rather than at the end because it saves us making + // a copy of the current spiking synapses + + { + synapses_1_post.advance(); + synapses_1_post.push(_ptr_array_neurongroup_1__spikespace, _ptr_array_neurongroup_1__spikespace[_num_spikespace-1]); + } + + +} + + diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_1_post_push_spikes.h b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_1_post_push_spikes.h new file mode 100644 index 00000000..b135add8 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_1_post_push_spikes.h @@ -0,0 +1,7 @@ +#ifndef _INCLUDED_synapses_1_post_push_spikes +#define _INCLUDED_synapses_1_post_push_spikes + +void _run_synapses_1_post_push_spikes(); + + +#endif diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_1_pre_push_spikes.cpp b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_1_pre_push_spikes.cpp new file mode 100644 index 00000000..e4db3dff --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_1_pre_push_spikes.cpp @@ -0,0 +1,105 @@ +#include "objects.h" +#include "code_objects/synapses_1_pre_push_spikes.h" +#include "objects.h" +#include "brianlib/common_math.h" +#include "brianlib/stdint_compat.h" +#include +#include +#include +#include +#include + +////// SUPPORT CODE /////// +namespace synapses_1_pre_push_spikes { + + template < typename T1, typename T2 > struct _higher_type; + template < > struct _higher_type { typedef int type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { 
typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < typename T1, typename T2 > + static inline typename _higher_type::type + _brian_mod(T1 x, T2 y) + {{ + return x-y*floor(1.0*x/y); + }} + template < typename T1, typename T2 > + static inline typename _higher_type::type + _brian_floordiv(T1 x, T2 y) + {{ + return floor(1.0*x/y); + }} + #ifdef _MSC_VER + #define _brian_pow(x, y) (pow((double)(x), (y))) + #else + #define _brian_pow(x, y) (pow((x), (y))) + #endif + +} + +////// HASH DEFINES /////// + + + +void _run_synapses_1_pre_push_spikes() +{ + using namespace brian; +using namespace synapses_1_pre_push_spikes; + + + ///// CONSTANTS /////////// + double* const _array_synapses_1_delay = &_dynamic_array_synapses_1_delay[0]; +const int _numdelay = _dynamic_array_synapses_1_delay.size(); +const int _num_spikespace = 2501; +const int _num_source_dt = 1; + ///// POINTERS //////////// + + double* _ptr_array_synapses_1_delay = _array_synapses_1_delay; + int32_t* __restrict _ptr_array_neurongroup__spikespace = _array_neurongroup__spikespace; + double* _ptr_array_defaultclock_dt = _array_defaultclock_dt; + + + // we do advance at the beginning rather than at the end because it saves us making + // a copy of the current spiking synapses + + { + synapses_1_pre.advance(); + synapses_1_pre.push(_ptr_array_neurongroup__spikespace, _ptr_array_neurongroup__spikespace[_num_spikespace-1]); + } + + +} + + diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_1_pre_push_spikes.h 
b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_1_pre_push_spikes.h new file mode 100644 index 00000000..2ece2ada --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_1_pre_push_spikes.h @@ -0,0 +1,7 @@ +#ifndef _INCLUDED_synapses_1_pre_push_spikes +#define _INCLUDED_synapses_1_pre_push_spikes + +void _run_synapses_1_pre_push_spikes(); + + +#endif diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_1_synapses_create_generator_codeobject.cpp b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_1_synapses_create_generator_codeobject.cpp new file mode 100644 index 00000000..2d4eb02c --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_1_synapses_create_generator_codeobject.cpp @@ -0,0 +1,212 @@ +#include "objects.h" +#include "code_objects/synapses_1_synapses_create_generator_codeobject.h" +#include "objects.h" +#include "brianlib/common_math.h" +#include "brianlib/stdint_compat.h" +#include +#include +#include +#include +#include +#include "brianlib/stdint_compat.h" +#include "synapses_classes.h" + +////// SUPPORT CODE /////// +namespace synapses_1_synapses_create_generator_codeobject { + + double _rand(const int _vectorisation_idx) { + return rk_double(brian::_mersenne_twister_states[0]); + } + template < typename T1, typename T2 > struct _higher_type; + template < > struct _higher_type { typedef int type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > 
struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < typename T1, typename T2 > + static inline typename _higher_type::type + _brian_mod(T1 x, T2 y) + {{ + return x-y*floor(1.0*x/y); + }} + template < typename T1, typename T2 > + static inline typename _higher_type::type + _brian_floordiv(T1 x, T2 y) + {{ + return floor(1.0*x/y); + }} + #ifdef _MSC_VER + #define _brian_pow(x, y) (pow((double)(x), (y))) + #else + #define _brian_pow(x, y) (pow((x), (y))) + #endif + +} + +////// HASH DEFINES /////// + + + +void _run_synapses_1_synapses_create_generator_codeobject() +{ + using namespace brian; +using namespace synapses_1_synapses_create_generator_codeobject; + + + ///// CONSTANTS /////////// + const int _numN = 1; +int32_t* const _array_synapses_1_N_outgoing = &_dynamic_array_synapses_1_N_outgoing[0]; +const int _numN_outgoing = _dynamic_array_synapses_1_N_outgoing.size(); +int32_t* const _array_synapses_1_N_incoming = &_dynamic_array_synapses_1_N_incoming[0]; +const int _numN_incoming = _dynamic_array_synapses_1_N_incoming.size(); +int32_t* const _array_synapses_1__synaptic_post = &_dynamic_array_synapses_1__synaptic_post[0]; +const int _num_synaptic_post = _dynamic_array_synapses_1__synaptic_post.size(); +int32_t* const _array_synapses_1__synaptic_pre = &_dynamic_array_synapses_1__synaptic_pre[0]; +const int _num_synaptic_pre = _dynamic_array_synapses_1__synaptic_pre.size(); + ///// POINTERS //////////// + + int32_t* _ptr_array_synapses_1_N = _array_synapses_1_N; + int32_t* __restrict _ptr_array_synapses_1_N_outgoing = _array_synapses_1_N_outgoing; + int32_t* __restrict _ptr_array_synapses_1_N_incoming = _array_synapses_1_N_incoming; + int32_t* __restrict _ptr_array_synapses_1__synaptic_post = _array_synapses_1__synaptic_post; + int32_t* __restrict _ptr_array_synapses_1__synaptic_pre = _array_synapses_1__synaptic_pre; + + + #include + + const size_t _N_pre = 2500; + const size_t _N_post = 100; + _dynamic_array_synapses_1_N_incoming.resize(_N_post + 0); + _dynamic_array_synapses_1_N_outgoing.resize(_N_pre + 0); + size_t _raw_pre_idx, _raw_post_idx; + // scalar code + const size_t _vectorisation_idx = -1; + + + + + + + + + for(size_t _i=0; _i<_N_pre; _i++) + { + bool __cond, _cond; + _raw_pre_idx = _i + 0; + { + + const char _cond = true; + + __cond = _cond; + } + _cond = __cond; + if(!_cond) continue; + // Some explanation of this hackery. The problem is that we have multiple code blocks. + // Each code block is generated independently of the others, and they declare variables + // at the beginning if necessary (including declaring them as const if their values don't + // change). However, if two code blocks follow each other in the same C++ scope then + // that causes a redeclaration error. So we solve it by putting each block inside a + // pair of braces to create a new scope specific to each code block. However, that brings + // up another problem: we need the values from these code blocks. I don't have a general + // solution to this problem, but in the case of this particular template, we know which + // values we need from them so we simply create outer scoped variables to copy the value + // into. 
Later on we have a slightly more complicated problem because the original name + // _j has to be used, so we create two variables __j, _j at the outer scope, copy + // _j to __j in the inner scope (using the inner scope version of _j), and then + // __j to _j in the outer scope (to the outer scope version of _j). This outer scope + // version of _j will then be used in subsequent blocks. + long _uiter_low; + long _uiter_high; + long _uiter_step; + { + + const int32_t _iter_low = 0; + const int32_t _iter_high = 100; + const int32_t _iter_step = 1; + + _uiter_low = _iter_low; + _uiter_high = _iter_high; + _uiter_step = _iter_step; + } + for(long _k=_uiter_low; _k<_uiter_high; _k+=_uiter_step) + { + long __j, _j, _pre_idx, __pre_idx; + { + + const int32_t _pre_idx = _raw_pre_idx; + const int32_t _j = _k; + + __j = _j; // pick up the locally scoped _j and store in __j + __pre_idx = _pre_idx; + } + _j = __j; // make the previously locally scoped _j available + _pre_idx = __pre_idx; + _raw_post_idx = _j + 0; + + if(_j<0 || _j>=_N_post) + { + cout << "Error: tried to create synapse to neuron j=" << _j << " outside range 0 to " << + _N_post-1 << endl; + exit(1); + } + + const int32_t _post_idx = _raw_post_idx; + const int32_t _n = 1; + + + for (size_t _repetition=0; _repetition<_n; _repetition++) { + _dynamic_array_synapses_1_N_outgoing[_pre_idx] += 1; + _dynamic_array_synapses_1_N_incoming[_post_idx] += 1; + _dynamic_array_synapses_1__synaptic_pre.push_back(_pre_idx); + _dynamic_array_synapses_1__synaptic_post.push_back(_post_idx); + } + } + } + + // now we need to resize all registered variables + const int32_t newsize = _dynamic_array_synapses_1__synaptic_pre.size(); + _dynamic_array_synapses_1__synaptic_post.resize(newsize); + _dynamic_array_synapses_1__synaptic_pre.resize(newsize); + _dynamic_array_synapses_1_Apost.resize(newsize); + _dynamic_array_synapses_1_Apre.resize(newsize); + _dynamic_array_synapses_1_delay_1.resize(newsize); + _dynamic_array_synapses_1_g_raw.resize(newsize); + _dynamic_array_synapses_1_lastupdate.resize(newsize); + // Also update the total number of synapses + _ptr_array_synapses_1_N[0] = newsize; + + +} + + diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_1_synapses_create_generator_codeobject.h b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_1_synapses_create_generator_codeobject.h new file mode 100644 index 00000000..912c2be7 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_1_synapses_create_generator_codeobject.h @@ -0,0 +1,7 @@ +#ifndef _INCLUDED_synapses_1_synapses_create_generator_codeobject +#define _INCLUDED_synapses_1_synapses_create_generator_codeobject + +void _run_synapses_1_synapses_create_generator_codeobject(); + + +#endif diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_2_max_row_length.cpp b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_2_max_row_length.cpp new file mode 100644 index 00000000..b004f920 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_2_max_row_length.cpp @@ -0,0 +1,191 @@ + +#include "brianlib/common_math.h" +#include "brianlib/stdint_compat.h" +#include +#include +#include +#include +#include +#include "brianlib/stdint_compat.h" +#include "synapses_classes.h" + +////// SUPPORT CODE /////// +namespace synapses_2_max_row_length_generator { + + double _rand(const 
int _vectorisation_idx) { + return rk_double(brian::_mersenne_twister_states[0]); + } + template < typename T1, typename T2 > struct _higher_type; + template < > struct _higher_type { typedef int type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < typename T1, typename T2 > + static inline typename _higher_type::type + _brian_mod(T1 x, T2 y) + {{ + return x-y*floor(1.0*x/y); + }} + template < typename T1, typename T2 > + static inline typename _higher_type::type + _brian_floordiv(T1 x, T2 y) + {{ + return floor(1.0*x/y); + }} + #ifdef _MSC_VER + #define _brian_pow(x, y) (pow((double)(x), (y))) + #else + #define _brian_pow(x, y) (pow((x), (y))) + #endif + +} + +////// HASH DEFINES /////// + + + +void _run_synapses_2_max_row_length() +{ + using namespace brian; + using namespace synapses_2_max_row_length_generator; + + ///// CONSTANTS /////////// + const int _num_synaptic_pre = _dynamic_array_synapses_2__synaptic_pre.size(); +const int _numN = 1; +const int _numN_outgoing = _dynamic_array_synapses_2_N_outgoing.size(); +int32_t* const _array_synapses_2_N_outgoing = &_dynamic_array_synapses_2_N_outgoing[0]; +const int _numN_incoming = _dynamic_array_synapses_2_N_incoming.size(); +int32_t* const _array_synapses_2__synaptic_post = &_dynamic_array_synapses_2__synaptic_post[0]; +int32_t* const _array_synapses_2_N_incoming = &_dynamic_array_synapses_2_N_incoming[0]; +const int 
_num_synaptic_post = _dynamic_array_synapses_2__synaptic_post.size(); +int32_t* const _array_synapses_2__synaptic_pre = &_dynamic_array_synapses_2__synaptic_pre[0]; + + ///// POINTERS //////////// + + int32_t* _ptr_array_synapses_2_N = _array_synapses_2_N; + int32_t* __restrict _ptr_array_synapses_2_N_outgoing = _array_synapses_2_N_outgoing; + int32_t* __restrict _ptr_array_synapses_2_N_incoming = _array_synapses_2_N_incoming; + int32_t* __restrict _ptr_array_synapses_2__synaptic_post = _array_synapses_2__synaptic_post; + int32_t* __restrict _ptr_array_synapses_2__synaptic_pre = _array_synapses_2__synaptic_pre; + + + const size_t _N_pre = 100; + const size_t _N_post = 100; + _dynamic_array_synapses_2_N_incoming.resize(_N_post + 0); + _dynamic_array_synapses_2_N_outgoing.resize(_N_pre + 0); + size_t _raw_pre_idx, _raw_post_idx; + // scalar code + const int _vectorisation_idx = -1; + + + + + + + + + for(size_t _i=0; _i<_N_pre; _i++) + { + bool __cond, _cond; + _raw_pre_idx = _i + 0; + { + + const char _cond = true; + + __cond = _cond; + } + _cond = __cond; + if(!_cond) continue; + // Some explanation of this hackery. The problem is that we have multiple code blocks. + // Each code block is generated independently of the others, and they declare variables + // at the beginning if necessary (including declaring them as const if their values don't + // change). However, if two code blocks follow each other in the same C++ scope then + // that causes a redeclaration error. So we solve it by putting each block inside a + // pair of braces to create a new scope specific to each code block. However, that brings + // up another problem: we need the values from these code blocks. I don't have a general + // solution to this problem, but in the case of this particular template, we know which + // values we need from them so we simply create outer scoped variables to copy the value + // into. Later on we have a slightly more complicated problem because the original name + // _j has to be used, so we create two variables __j, _j at the outer scope, copy + // _j to __j in the inner scope (using the inner scope version of _j), and then + // __j to _j in the outer scope (to the outer scope version of _j). This outer scope + // version of _j will then be used in subsequent blocks. 
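// Sketch under stated assumptions (not generated code): the loop below only
// accumulates per-neuron degree counters (N_outgoing per presynaptic neuron,
// N_incoming per postsynaptic neuron). A backend that sizes its sparse
// connectivity by the maximum row length could then reduce N_outgoing like
// this; the helper name is hypothetical.
#include <algorithm>
#include <cstdint>
#include <vector>

static int32_t max_row_length(const std::vector<int32_t>& N_outgoing) {
    if (N_outgoing.empty()) return 0;
    return *std::max_element(N_outgoing.begin(), N_outgoing.end());
}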
+ long _uiter_low; + long _uiter_high; + long _uiter_step; + { + + const int32_t _iter_low = 0; + const int32_t _iter_high = 100; + const int32_t _iter_step = 1; + + _uiter_low = _iter_low; + _uiter_high = _iter_high; + _uiter_step = _iter_step; + } + for(long _k=_uiter_low; _k<_uiter_high; _k+=_uiter_step) + { + long __j, _j, _pre_idx, __pre_idx; + { + + const int32_t _pre_idx = _raw_pre_idx; + const int32_t _j = _k; + + __j = _j; // pick up the locally scoped _j and store in __j + __pre_idx = _pre_idx; + } + _j = __j; // make the previously locally scoped _j available + _pre_idx = __pre_idx; + _raw_post_idx = _j + 0; + + if(_j<0 || _j>=_N_post) + { + cout << "Error: tried to create synapse to neuron j=" << _j << " outside range 0 to " << + _N_post-1 << endl; + exit(1); + } + + const int32_t _post_idx = _raw_post_idx; + const int32_t _n = 1; + + + for (size_t _repetition=0; _repetition<_n; _repetition++) { + _dynamic_array_synapses_2_N_outgoing[_pre_idx] += 1; + _dynamic_array_synapses_2_N_incoming[_post_idx] += 1; + } + } + } + +} \ No newline at end of file diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_2_pre_push_spikes.cpp b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_2_pre_push_spikes.cpp new file mode 100644 index 00000000..8c329724 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_2_pre_push_spikes.cpp @@ -0,0 +1,105 @@ +#include "objects.h" +#include "code_objects/synapses_2_pre_push_spikes.h" +#include "objects.h" +#include "brianlib/common_math.h" +#include "brianlib/stdint_compat.h" +#include +#include +#include +#include +#include + +////// SUPPORT CODE /////// +namespace synapses_2_pre_push_spikes { + + template < typename T1, typename T2 > struct _higher_type; + template < > struct _higher_type { typedef int type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { 
typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < typename T1, typename T2 > + static inline typename _higher_type::type + _brian_mod(T1 x, T2 y) + {{ + return x-y*floor(1.0*x/y); + }} + template < typename T1, typename T2 > + static inline typename _higher_type::type + _brian_floordiv(T1 x, T2 y) + {{ + return floor(1.0*x/y); + }} + #ifdef _MSC_VER + #define _brian_pow(x, y) (pow((double)(x), (y))) + #else + #define _brian_pow(x, y) (pow((x), (y))) + #endif + +} + +////// HASH DEFINES /////// + + + +void _run_synapses_2_pre_push_spikes() +{ + using namespace brian; +using namespace synapses_2_pre_push_spikes; + + + ///// CONSTANTS /////////// + double* const _array_synapses_2_delay = &_dynamic_array_synapses_2_delay[0]; +const int _numdelay = _dynamic_array_synapses_2_delay.size(); +const int _num_spikespace = 101; +const int _num_source_dt = 1; + ///// POINTERS //////////// + + double* _ptr_array_synapses_2_delay = _array_synapses_2_delay; + int32_t* __restrict _ptr_array_neurongroup_1__spikespace = _array_neurongroup_1__spikespace; + double* _ptr_array_defaultclock_dt = _array_defaultclock_dt; + + + // we do advance at the beginning rather than at the end because it saves us making + // a copy of the current spiking synapses + + { + synapses_2_pre.advance(); + synapses_2_pre.push(_ptr_array_neurongroup_1__spikespace, _ptr_array_neurongroup_1__spikespace[_num_spikespace-1]); + } + + +} + + diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_2_pre_push_spikes.h b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_2_pre_push_spikes.h new file mode 100644 index 00000000..52d331c4 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_2_pre_push_spikes.h @@ -0,0 +1,7 @@ +#ifndef _INCLUDED_synapses_2_pre_push_spikes +#define _INCLUDED_synapses_2_pre_push_spikes + +void _run_synapses_2_pre_push_spikes(); + + +#endif diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_2_synapses_create_generator_codeobject.cpp b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_2_synapses_create_generator_codeobject.cpp new file mode 100644 index 00000000..b75fbeff --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_2_synapses_create_generator_codeobject.cpp @@ -0,0 +1,207 @@ +#include "objects.h" +#include "code_objects/synapses_2_synapses_create_generator_codeobject.h" +#include "objects.h" +#include "brianlib/common_math.h" +#include "brianlib/stdint_compat.h" +#include +#include +#include +#include +#include +#include "brianlib/stdint_compat.h" +#include "synapses_classes.h" + +////// SUPPORT CODE /////// +namespace synapses_2_synapses_create_generator_codeobject { + + double _rand(const int _vectorisation_idx) { + return rk_double(brian::_mersenne_twister_states[0]); + } + template < typename T1, typename T2 > struct _higher_type; + template < > struct 
_higher_type { typedef int type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < typename T1, typename T2 > + static inline typename _higher_type::type + _brian_mod(T1 x, T2 y) + {{ + return x-y*floor(1.0*x/y); + }} + template < typename T1, typename T2 > + static inline typename _higher_type::type + _brian_floordiv(T1 x, T2 y) + {{ + return floor(1.0*x/y); + }} + #ifdef _MSC_VER + #define _brian_pow(x, y) (pow((double)(x), (y))) + #else + #define _brian_pow(x, y) (pow((x), (y))) + #endif + +} + +////// HASH DEFINES /////// + + + +void _run_synapses_2_synapses_create_generator_codeobject() +{ + using namespace brian; +using namespace synapses_2_synapses_create_generator_codeobject; + + + ///// CONSTANTS /////////// + const int _numN = 1; +int32_t* const _array_synapses_2_N_outgoing = &_dynamic_array_synapses_2_N_outgoing[0]; +const int _numN_outgoing = _dynamic_array_synapses_2_N_outgoing.size(); +int32_t* const _array_synapses_2_N_incoming = &_dynamic_array_synapses_2_N_incoming[0]; +const int _numN_incoming = _dynamic_array_synapses_2_N_incoming.size(); +int32_t* const _array_synapses_2__synaptic_post = &_dynamic_array_synapses_2__synaptic_post[0]; +const int _num_synaptic_post = _dynamic_array_synapses_2__synaptic_post.size(); +int32_t* const _array_synapses_2__synaptic_pre = &_dynamic_array_synapses_2__synaptic_pre[0]; +const int _num_synaptic_pre = 
_dynamic_array_synapses_2__synaptic_pre.size(); + ///// POINTERS //////////// + + int32_t* _ptr_array_synapses_2_N = _array_synapses_2_N; + int32_t* __restrict _ptr_array_synapses_2_N_outgoing = _array_synapses_2_N_outgoing; + int32_t* __restrict _ptr_array_synapses_2_N_incoming = _array_synapses_2_N_incoming; + int32_t* __restrict _ptr_array_synapses_2__synaptic_post = _array_synapses_2__synaptic_post; + int32_t* __restrict _ptr_array_synapses_2__synaptic_pre = _array_synapses_2__synaptic_pre; + + + #include + + const size_t _N_pre = 100; + const size_t _N_post = 100; + _dynamic_array_synapses_2_N_incoming.resize(_N_post + 0); + _dynamic_array_synapses_2_N_outgoing.resize(_N_pre + 0); + size_t _raw_pre_idx, _raw_post_idx; + // scalar code + const size_t _vectorisation_idx = -1; + + + + + + + + + for(size_t _i=0; _i<_N_pre; _i++) + { + bool __cond, _cond; + _raw_pre_idx = _i + 0; + { + + const char _cond = true; + + __cond = _cond; + } + _cond = __cond; + if(!_cond) continue; + // Some explanation of this hackery. The problem is that we have multiple code blocks. + // Each code block is generated independently of the others, and they declare variables + // at the beginning if necessary (including declaring them as const if their values don't + // change). However, if two code blocks follow each other in the same C++ scope then + // that causes a redeclaration error. So we solve it by putting each block inside a + // pair of braces to create a new scope specific to each code block. However, that brings + // up another problem: we need the values from these code blocks. I don't have a general + // solution to this problem, but in the case of this particular template, we know which + // values we need from them so we simply create outer scoped variables to copy the value + // into. Later on we have a slightly more complicated problem because the original name + // _j has to be used, so we create two variables __j, _j at the outer scope, copy + // _j to __j in the inner scope (using the inner scope version of _j), and then + // __j to _j in the outer scope (to the outer scope version of _j). This outer scope + // version of _j will then be used in subsequent blocks. 
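// Sketch (not generated code): with `_cond == true`, `_iter_low == 0`,
// `_iter_high == 100` and `_iter_step == 1`, the generator loop below is
// equivalent to building an all-to-all (100 x 100) connectivity in COO form,
// i.e. parallel pre/post index arrays. Hypothetical helper for illustration:
#include <cstdint>
#include <vector>

static void all_to_all(std::vector<int32_t>& pre, std::vector<int32_t>& post,
                       size_t N_pre, size_t N_post) {
    for (size_t i = 0; i < N_pre; i++)
        for (size_t j = 0; j < N_post; j++) {
            pre.push_back(static_cast<int32_t>(i));   // corresponds to _synaptic_pre
            post.push_back(static_cast<int32_t>(j));  // corresponds to _synaptic_post
        }
}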
+ long _uiter_low; + long _uiter_high; + long _uiter_step; + { + + const int32_t _iter_low = 0; + const int32_t _iter_high = 100; + const int32_t _iter_step = 1; + + _uiter_low = _iter_low; + _uiter_high = _iter_high; + _uiter_step = _iter_step; + } + for(long _k=_uiter_low; _k<_uiter_high; _k+=_uiter_step) + { + long __j, _j, _pre_idx, __pre_idx; + { + + const int32_t _pre_idx = _raw_pre_idx; + const int32_t _j = _k; + + __j = _j; // pick up the locally scoped _j and store in __j + __pre_idx = _pre_idx; + } + _j = __j; // make the previously locally scoped _j available + _pre_idx = __pre_idx; + _raw_post_idx = _j + 0; + + if(_j<0 || _j>=_N_post) + { + cout << "Error: tried to create synapse to neuron j=" << _j << " outside range 0 to " << + _N_post-1 << endl; + exit(1); + } + + const int32_t _post_idx = _raw_post_idx; + const int32_t _n = 1; + + + for (size_t _repetition=0; _repetition<_n; _repetition++) { + _dynamic_array_synapses_2_N_outgoing[_pre_idx] += 1; + _dynamic_array_synapses_2_N_incoming[_post_idx] += 1; + _dynamic_array_synapses_2__synaptic_pre.push_back(_pre_idx); + _dynamic_array_synapses_2__synaptic_post.push_back(_post_idx); + } + } + } + + // now we need to resize all registered variables + const int32_t newsize = _dynamic_array_synapses_2__synaptic_pre.size(); + _dynamic_array_synapses_2__synaptic_post.resize(newsize); + _dynamic_array_synapses_2__synaptic_pre.resize(newsize); + // Also update the total number of synapses + _ptr_array_synapses_2_N[0] = newsize; + + +} + + diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_2_synapses_create_generator_codeobject.h b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_2_synapses_create_generator_codeobject.h new file mode 100644 index 00000000..86b7880a --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_2_synapses_create_generator_codeobject.h @@ -0,0 +1,7 @@ +#ifndef _INCLUDED_synapses_2_synapses_create_generator_codeobject +#define _INCLUDED_synapses_2_synapses_create_generator_codeobject + +void _run_synapses_2_synapses_create_generator_codeobject(); + + +#endif diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_group_variable_set_conditional_codeobject.cpp b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_group_variable_set_conditional_codeobject.cpp new file mode 100644 index 00000000..aa85a2af --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_group_variable_set_conditional_codeobject.cpp @@ -0,0 +1,125 @@ +#include "objects.h" +#include "code_objects/synapses_group_variable_set_conditional_codeobject.h" +#include "objects.h" +#include "brianlib/common_math.h" +#include "brianlib/stdint_compat.h" +#include +#include +#include +#include +#include + +////// SUPPORT CODE /////// +namespace synapses_group_variable_set_conditional_codeobject { + + double _randn(const int _vectorisation_idx) { + return rk_gauss(brian::_mersenne_twister_states[0]); + } + template < typename T1, typename T2 > struct _higher_type; + template < > struct _higher_type { typedef int type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; 
}; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < typename T1, typename T2 > + static inline typename _higher_type::type + _brian_mod(T1 x, T2 y) + {{ + return x-y*floor(1.0*x/y); + }} + template < typename T1, typename T2 > + static inline typename _higher_type::type + _brian_floordiv(T1 x, T2 y) + {{ + return floor(1.0*x/y); + }} + #ifdef _MSC_VER + #define _brian_pow(x, y) (pow((double)(x), (y))) + #else + #define _brian_pow(x, y) (pow((x), (y))) + #endif + +} + +////// HASH DEFINES /////// + + + +void _run_synapses_group_variable_set_conditional_codeobject() +{ + using namespace brian; +using namespace synapses_group_variable_set_conditional_codeobject; + + + ///// CONSTANTS /////////// + double* const _array_synapses_weight = &_dynamic_array_synapses_weight[0]; +const int _numweight = _dynamic_array_synapses_weight.size(); +const int _numN = 1; + ///// POINTERS //////////// + + double* __restrict _ptr_array_synapses_weight = _array_synapses_weight; + int32_t* _ptr_array_synapses_N = _array_synapses_N; + + +//// MAIN CODE //////////// +// scalar code +const size_t _vectorisation_idx = -1; + + + + + +const int _N = _array_synapses_N[0]; + + +for(int _idx=0; _idx<_N; _idx++) +{ + // vector code + const size_t _vectorisation_idx = _idx; + + const char _cond = true; + + if (_cond) + { + + double weight; + weight = (10 * 1e-09) + ((1.25 * 1e-09) * _randn(_vectorisation_idx)); + _ptr_array_synapses_weight[_idx] = weight; + + } +} + +} + + diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_group_variable_set_conditional_codeobject.h 
b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_group_variable_set_conditional_codeobject.h new file mode 100644 index 00000000..d59aae0b --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_group_variable_set_conditional_codeobject.h @@ -0,0 +1,7 @@ +#ifndef _INCLUDED_synapses_group_variable_set_conditional_codeobject +#define _INCLUDED_synapses_group_variable_set_conditional_codeobject + +void _run_synapses_group_variable_set_conditional_codeobject(); + + +#endif diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_max_row_length.cpp b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_max_row_length.cpp new file mode 100644 index 00000000..fe4d02b9 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_max_row_length.cpp @@ -0,0 +1,211 @@ + +#include "brianlib/common_math.h" +#include "brianlib/stdint_compat.h" +#include +#include +#include +#include +#include +#include "brianlib/stdint_compat.h" +#include "synapses_classes.h" + +////// SUPPORT CODE /////// +namespace synapses_max_row_length_generator { + + double _rand(const int _vectorisation_idx) { + return rk_double(brian::_mersenne_twister_states[0]); + } + template < typename T1, typename T2 > struct _higher_type; + template < > struct _higher_type { typedef int type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type 
{ typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < typename T1, typename T2 > + static inline typename _higher_type::type + _brian_mod(T1 x, T2 y) + {{ + return x-y*floor(1.0*x/y); + }} + template < typename T1, typename T2 > + static inline typename _higher_type::type + _brian_floordiv(T1 x, T2 y) + {{ + return floor(1.0*x/y); + }} + #ifdef _MSC_VER + #define _brian_pow(x, y) (pow((double)(x), (y))) + #else + #define _brian_pow(x, y) (pow((x), (y))) + #endif + +} + +////// HASH DEFINES /////// + + + +void _run_synapses_max_row_length() +{ + using namespace brian; + using namespace synapses_max_row_length_generator; + + ///// CONSTANTS /////////// + const int _numN = 1; +const int _num_synaptic_post = _dynamic_array_synapses__synaptic_post.size(); +const int _numN_outgoing = _dynamic_array_synapses_N_outgoing.size(); +int32_t* const _array_synapses__synaptic_pre = &_dynamic_array_synapses__synaptic_pre[0]; +int32_t* const _array_synapses__synaptic_post = &_dynamic_array_synapses__synaptic_post[0]; +int32_t* const _array_synapses_N_incoming = &_dynamic_array_synapses_N_incoming[0]; +int32_t* const _array_synapses_N_outgoing = &_dynamic_array_synapses_N_outgoing[0]; +const int _num_synaptic_pre = _dynamic_array_synapses__synaptic_pre.size(); +const int _numN_incoming = _dynamic_array_synapses_N_incoming.size(); + + ///// POINTERS //////////// + + int32_t* _ptr_array_synapses_N = _array_synapses_N; + int32_t* __restrict _ptr_array_synapses_N_outgoing = _array_synapses_N_outgoing; + int32_t* __restrict _ptr_array_synapses_N_incoming = _array_synapses_N_incoming; + int32_t* __restrict _ptr_array_synapses__synaptic_post = _array_synapses__synaptic_post; + int32_t* __restrict _ptr_array_synapses__synaptic_pre = _array_synapses__synaptic_pre; + + + const size_t _N_pre = 100; + const size_t _N_post = 2500; + _dynamic_array_synapses_N_incoming.resize(_N_post + 0); + _dynamic_array_synapses_N_outgoing.resize(_N_pre + 0); + size_t _raw_pre_idx, _raw_post_idx; + // scalar code + const int _vectorisation_idx = -1; + + + + + + + + + for(size_t _i=0; _i<_N_pre; _i++) + { + bool __cond, _cond; + _raw_pre_idx = _i + 0; + { + + const char _cond = true; + + __cond = _cond; + } + _cond = __cond; + if(!_cond) continue; + // Some explanation of this hackery. The problem is that we have multiple code blocks. + // Each code block is generated independently of the others, and they declare variables + // at the beginning if necessary (including declaring them as const if their values don't + // change). However, if two code blocks follow each other in the same C++ scope then + // that causes a redeclaration error. So we solve it by putting each block inside a + // pair of braces to create a new scope specific to each code block. However, that brings + // up another problem: we need the values from these code blocks. I don't have a general + // solution to this problem, but in the case of this particular template, we know which + // values we need from them so we simply create outer scoped variables to copy the value + // into. Later on we have a slightly more complicated problem because the original name + // _j has to be used, so we create two variables __j, _j at the outer scope, copy + // _j to __j in the inner scope (using the inner scope version of _j), and then + // __j to _j in the outer scope (to the outer scope version of _j). This outer scope + // version of _j will then be used in subsequent blocks. 
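The comment above describes the brace-scoping trick used throughout the generated connection-building code. A minimal standalone sketch of that pattern follows; the variable names are illustrative only (j_outer plays the role of the generated __j / _j pair), not taken from the generated files.

// Minimal sketch of the brace-scoping pattern described in the comment above:
// each generated block lives in its own braces so that per-block (often const)
// declarations cannot collide, and any value a later block needs is copied out
// into a variable declared in the enclosing scope.
#include <iostream>

int main()
{
    long j_outer;                // outer-scope copy shared between blocks
    {                            // block 1: declares its own const local
        const long _j = 42;
        j_outer = _j;            // export the value before the scope closes
    }
    const long _j = j_outer;     // re-expose the value for subsequent blocks
    {                            // block 2: may declare fresh const locals again
        const long _k = _j + 1;
        std::cout << _k << "\n";
    }
    return 0;
}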
+ long _uiter_low; + long _uiter_high; + long _uiter_step; + double _uiter_p; + { + + const int32_t _iter_low = 0; + const int32_t _iter_high = 2500; + const int32_t _iter_step = 1; + const double _iter_p = 0.15; + + _uiter_low = _iter_low; + _uiter_high = _iter_high; + _uiter_step = _iter_step; + _uiter_p = _iter_p; + } + if(_uiter_p==0) continue; + const bool _jump_algo = _uiter_p<0.25; + double _log1p; + if(_jump_algo) + _log1p = log(1-_uiter_p); + else + _log1p = 1.0; // will be ignored + const double _pconst = 1.0/log(1-_uiter_p); + for(long _k=_uiter_low; _k<_uiter_high; _k++) + { + if(_jump_algo) { + const double _r = _rand(_vectorisation_idx); + if(_r==0.0) break; + const int _jump = floor(log(_r)*_pconst)*_uiter_step; + _k += _jump; + if(_k>=_uiter_high) continue; + } else { + if(_rand(_vectorisation_idx)>=_uiter_p) continue; + } + long __j, _j, _pre_idx, __pre_idx; + { + + const int32_t _pre_idx = _raw_pre_idx; + const int32_t _j = _k; + + __j = _j; // pick up the locally scoped _j and store in __j + __pre_idx = _pre_idx; + } + _j = __j; // make the previously locally scoped _j available + _pre_idx = __pre_idx; + _raw_post_idx = _j + 0; + + if(_j<0 || _j>=_N_post) + { + cout << "Error: tried to create synapse to neuron j=" << _j << " outside range 0 to " << + _N_post-1 << endl; + exit(1); + } + + const int32_t _post_idx = _raw_post_idx; + const int32_t _n = 1; + + + for (size_t _repetition=0; _repetition<_n; _repetition++) { + _dynamic_array_synapses_N_outgoing[_pre_idx] += 1; + _dynamic_array_synapses_N_incoming[_post_idx] += 1; + } + } + } + +} \ No newline at end of file diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_pre_push_spikes.cpp b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_pre_push_spikes.cpp new file mode 100644 index 00000000..c4efc2c8 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_pre_push_spikes.cpp @@ -0,0 +1,105 @@ +#include "objects.h" +#include "code_objects/synapses_pre_push_spikes.h" +#include "objects.h" +#include "brianlib/common_math.h" +#include "brianlib/stdint_compat.h" +#include +#include +#include +#include +#include + +////// SUPPORT CODE /////// +namespace synapses_pre_push_spikes { + + template < typename T1, typename T2 > struct _higher_type; + template < > struct _higher_type { typedef int type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef float type; }; + 
template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < typename T1, typename T2 > + static inline typename _higher_type::type + _brian_mod(T1 x, T2 y) + {{ + return x-y*floor(1.0*x/y); + }} + template < typename T1, typename T2 > + static inline typename _higher_type::type + _brian_floordiv(T1 x, T2 y) + {{ + return floor(1.0*x/y); + }} + #ifdef _MSC_VER + #define _brian_pow(x, y) (pow((double)(x), (y))) + #else + #define _brian_pow(x, y) (pow((x), (y))) + #endif + +} + +////// HASH DEFINES /////// + + + +void _run_synapses_pre_push_spikes() +{ + using namespace brian; +using namespace synapses_pre_push_spikes; + + + ///// CONSTANTS /////////// + double* const _array_synapses_delay = &_dynamic_array_synapses_delay[0]; +const int _numdelay = _dynamic_array_synapses_delay.size(); +const int _num_spikespace = 101; +const int _num_source_dt = 1; + ///// POINTERS //////////// + + double* __restrict _ptr_array_synapses_delay = _array_synapses_delay; + int32_t* __restrict _ptr_array_spikegeneratorgroup__spikespace = _array_spikegeneratorgroup__spikespace; + double* _ptr_array_defaultclock_dt = _array_defaultclock_dt; + + + // we do advance at the beginning rather than at the end because it saves us making + // a copy of the current spiking synapses + + { + synapses_pre.advance(); + synapses_pre.push(_ptr_array_spikegeneratorgroup__spikespace, _ptr_array_spikegeneratorgroup__spikespace[_num_spikespace-1]); + } + + +} + + diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_pre_push_spikes.h b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_pre_push_spikes.h new file mode 100644 index 00000000..0968700d --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_pre_push_spikes.h @@ -0,0 +1,7 @@ +#ifndef _INCLUDED_synapses_pre_push_spikes +#define _INCLUDED_synapses_pre_push_spikes + +void _run_synapses_pre_push_spikes(); + + +#endif diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_synapses_create_generator_codeobject.cpp b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_synapses_create_generator_codeobject.cpp new file mode 100644 index 00000000..4db626cd --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_synapses_create_generator_codeobject.cpp @@ -0,0 +1,229 @@ +#include "objects.h" +#include 
"code_objects/synapses_synapses_create_generator_codeobject.h" +#include "objects.h" +#include "brianlib/common_math.h" +#include "brianlib/stdint_compat.h" +#include +#include +#include +#include +#include +#include "brianlib/stdint_compat.h" +#include "synapses_classes.h" + +////// SUPPORT CODE /////// +namespace synapses_synapses_create_generator_codeobject { + + double _rand(const int _vectorisation_idx) { + return rk_double(brian::_mersenne_twister_states[0]); + } + template < typename T1, typename T2 > struct _higher_type; + template < > struct _higher_type { typedef int type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef long long type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef float type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < > struct _higher_type { typedef long double type; }; + template < typename T1, typename T2 > + static inline typename _higher_type::type + _brian_mod(T1 x, T2 y) + {{ + return x-y*floor(1.0*x/y); + }} + template < typename T1, typename T2 > + static inline typename _higher_type::type + _brian_floordiv(T1 x, T2 y) + {{ + return floor(1.0*x/y); + }} + #ifdef _MSC_VER + #define _brian_pow(x, y) (pow((double)(x), (y))) + #else + #define _brian_pow(x, y) (pow((x), (y))) + #endif + +} + +////// HASH DEFINES /////// + + + +void _run_synapses_synapses_create_generator_codeobject() +{ + using namespace brian; +using namespace synapses_synapses_create_generator_codeobject; + + + ///// CONSTANTS /////////// + const int _numN = 1; +int32_t* const _array_synapses_N_outgoing = &_dynamic_array_synapses_N_outgoing[0]; +const 
int _numN_outgoing = _dynamic_array_synapses_N_outgoing.size(); +int32_t* const _array_synapses_N_incoming = &_dynamic_array_synapses_N_incoming[0]; +const int _numN_incoming = _dynamic_array_synapses_N_incoming.size(); +int32_t* const _array_synapses__synaptic_post = &_dynamic_array_synapses__synaptic_post[0]; +const int _num_synaptic_post = _dynamic_array_synapses__synaptic_post.size(); +int32_t* const _array_synapses__synaptic_pre = &_dynamic_array_synapses__synaptic_pre[0]; +const int _num_synaptic_pre = _dynamic_array_synapses__synaptic_pre.size(); + ///// POINTERS //////////// + + int32_t* _ptr_array_synapses_N = _array_synapses_N; + int32_t* __restrict _ptr_array_synapses_N_outgoing = _array_synapses_N_outgoing; + int32_t* __restrict _ptr_array_synapses_N_incoming = _array_synapses_N_incoming; + int32_t* __restrict _ptr_array_synapses__synaptic_post = _array_synapses__synaptic_post; + int32_t* __restrict _ptr_array_synapses__synaptic_pre = _array_synapses__synaptic_pre; + + + #include + + const size_t _N_pre = 100; + const size_t _N_post = 2500; + _dynamic_array_synapses_N_incoming.resize(_N_post + 0); + _dynamic_array_synapses_N_outgoing.resize(_N_pre + 0); + size_t _raw_pre_idx, _raw_post_idx; + // scalar code + const size_t _vectorisation_idx = -1; + + + + + + + + + for(size_t _i=0; _i<_N_pre; _i++) + { + bool __cond, _cond; + _raw_pre_idx = _i + 0; + { + + const char _cond = true; + + __cond = _cond; + } + _cond = __cond; + if(!_cond) continue; + // Some explanation of this hackery. The problem is that we have multiple code blocks. + // Each code block is generated independently of the others, and they declare variables + // at the beginning if necessary (including declaring them as const if their values don't + // change). However, if two code blocks follow each other in the same C++ scope then + // that causes a redeclaration error. So we solve it by putting each block inside a + // pair of braces to create a new scope specific to each code block. However, that brings + // up another problem: we need the values from these code blocks. I don't have a general + // solution to this problem, but in the case of this particular template, we know which + // values we need from them so we simply create outer scoped variables to copy the value + // into. Later on we have a slightly more complicated problem because the original name + // _j has to be used, so we create two variables __j, _j at the outer scope, copy + // _j to __j in the inner scope (using the inner scope version of _j), and then + // __j to _j in the outer scope (to the outer scope version of _j). This outer scope + // version of _j will then be used in subsequent blocks. 
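As in synapses_max_row_length above, the generator loop that follows samples each candidate (i, j) pair with a fixed probability (0.15 here) and, because p < 0.25, skips ahead by a geometrically distributed number of candidates instead of drawing one uniform number per candidate. A self-contained sketch of that sampling trick is below; the RNG, p and n are illustrative stand-ins for the generated _rand, _uiter_p and _uiter_high.

// Standalone sketch of the "jump" Bernoulli sampling used by the generated
// connection loops: skipping ahead by floor(log(r)/log(1-p)) candidates yields
// the same distribution of accepted indices as testing every candidate with
// probability p, but draws far fewer random numbers when p is small.
#include <cmath>
#include <cstdio>
#include <random>

int main()
{
    const double p = 0.15;                 // acceptance probability (as in the diff)
    const long   n = 2500;                 // number of candidate indices
    std::mt19937 rng(12345);
    std::uniform_real_distribution<double> uni(0.0, 1.0);

    const double pconst = 1.0 / std::log(1.0 - p);   // negative constant
    long accepted = 0;
    for (long k = 0; k < n; ++k) {
        const double r = uni(rng);
        if (r == 0.0) break;               // guard against log(0)
        k += static_cast<long>(std::floor(std::log(r) * pconst));  // geometric jump
        if (k >= n) break;
        ++accepted;                        // candidate k is accepted
    }
    std::printf("accepted %ld of %ld candidates (expected about %.0f)\n",
                accepted, n, p * n);
    return 0;
}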
+ long _uiter_low; + long _uiter_high; + long _uiter_step; + double _uiter_p; + { + + const int32_t _iter_low = 0; + const int32_t _iter_high = 2500; + const int32_t _iter_step = 1; + const double _iter_p = 0.15; + + _uiter_low = _iter_low; + _uiter_high = _iter_high; + _uiter_step = _iter_step; + _uiter_p = _iter_p; + } + if(_uiter_p==0) continue; + const bool _jump_algo = _uiter_p<0.25; + double _log1p; + if(_jump_algo) + _log1p = log(1-_uiter_p); + else + _log1p = 1.0; // will be ignored + const double _pconst = 1.0/log(1-_uiter_p); + for(long _k=_uiter_low; _k<_uiter_high; _k++) + { + if(_jump_algo) { + const double _r = _rand(_vectorisation_idx); + if(_r==0.0) break; + const int _jump = floor(log(_r)*_pconst)*_uiter_step; + _k += _jump; + if(_k>=_uiter_high) continue; + } else { + if(_rand(_vectorisation_idx)>=_uiter_p) continue; + } + long __j, _j, _pre_idx, __pre_idx; + { + + const int32_t _pre_idx = _raw_pre_idx; + const int32_t _j = _k; + + __j = _j; // pick up the locally scoped _j and store in __j + __pre_idx = _pre_idx; + } + _j = __j; // make the previously locally scoped _j available + _pre_idx = __pre_idx; + _raw_post_idx = _j + 0; + + if(_j<0 || _j>=_N_post) + { + cout << "Error: tried to create synapse to neuron j=" << _j << " outside range 0 to " << + _N_post-1 << endl; + exit(1); + } + + const int32_t _post_idx = _raw_post_idx; + const int32_t _n = 1; + + + for (size_t _repetition=0; _repetition<_n; _repetition++) { + _dynamic_array_synapses_N_outgoing[_pre_idx] += 1; + _dynamic_array_synapses_N_incoming[_post_idx] += 1; + _dynamic_array_synapses__synaptic_pre.push_back(_pre_idx); + _dynamic_array_synapses__synaptic_post.push_back(_post_idx); + } + } + } + + // now we need to resize all registered variables + const int32_t newsize = _dynamic_array_synapses__synaptic_pre.size(); + _dynamic_array_synapses__synaptic_post.resize(newsize); + _dynamic_array_synapses__synaptic_pre.resize(newsize); + _dynamic_array_synapses_delay.resize(newsize); + _dynamic_array_synapses_weight.resize(newsize); + // Also update the total number of synapses + _ptr_array_synapses_N[0] = newsize; + + +} + + diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_synapses_create_generator_codeobject.h b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_synapses_create_generator_codeobject.h new file mode 100644 index 00000000..88c4628e --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/code_objects/synapses_synapses_create_generator_codeobject.h @@ -0,0 +1,7 @@ +#ifndef _INCLUDED_synapses_synapses_create_generator_codeobject +#define _INCLUDED_synapses_synapses_create_generator_codeobject + +void _run_synapses_synapses_create_generator_codeobject(); + + +#endif diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/engine.cpp b/parallel_execution/parallel_execution/code/MushroomBody/genn/engine.cpp new file mode 100644 index 00000000..f16382ec --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/engine.cpp @@ -0,0 +1,100 @@ +#ifndef _ENGINE_CC_ +#define _ENGINE_CC_ + +//-------------------------------------------------------------------------- +/*! \file engine.cc +\brief Implementation of the engine class. 
+*/ +//-------------------------------------------------------------------------- + +#include "engine.h" +#include "network.h" + +engine::engine() +{ + allocateMem(); + initialize(); + Network::_last_run_time= 0.0; + Network::_last_run_completed_fraction= 0.0; +} + +//-------------------------------------------------------------------------- +//-------------------------------------------------------------------------- + +engine::~engine() +{ +} + + +//-------------------------------------------------------------------------- +/*! \brief Method for simulating the model for a given period of time + */ +//-------------------------------------------------------------------------- + +void engine::run(double duration) //!< Duration of time to run the model for +{ + std::clock_t start, current; + const double t_start = t; + + start = std::clock(); + int riT= (int) (duration/DT+1e-2); + double elapsed_realtime; + + for (int i= 0; i < riT; i++) { + // The StateMonitor and run_regularly operations are ordered by their "order" value + stepTime(); + // The stepTimeGPU function already updated everything for the next time step + iT--; + t = iT*DT; + _run_spikegeneratorgroup_codeobject(); + pushspikegeneratorgroupSpikesToDevice(); + pullneurongroup_1CurrentSpikesFromDevice(); + pullneurongroupCurrentSpikesFromDevice(); + // report state + // report spikes + _run_spikemonitor_2_codeobject(); + _run_spikemonitor_1_codeobject(); + _run_spikemonitor_codeobject(); + // Bring the time step back to the value for the next loop iteration + iT++; + t = iT*DT; + } + current= std::clock(); + elapsed_realtime= (double) (current - start)/CLOCKS_PER_SEC; + Network::_last_run_time = elapsed_realtime; + if (duration > 0.0) + { + Network::_last_run_completed_fraction = (t-t_start)/duration; + } else { + Network::_last_run_completed_fraction = 1.0; + } +} + +//-------------------------------------------------------------------------- +/*! \brief Method for copying all variables of the last time step from the GPU + + This is a simple wrapper for the convenience function copyStateFromDevice() which is provided by GeNN. +*/ +//-------------------------------------------------------------------------- + +void engine::getStateFromGPU() +{ + copyStateFromDevice(); +} + +//-------------------------------------------------------------------------- +/*! \brief Method for copying all spikes of the last time step from the GPU + + This is a simple wrapper for the convenience function copySpikesFromDevice() which is provided by GeNN. +*/ +//-------------------------------------------------------------------------- + +void engine::getSpikesFromGPU() +{ + copyCurrentSpikesFromDevice(); +} + + + +#endif + diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/engine.h b/parallel_execution/parallel_execution/code/MushroomBody/genn/engine.h new file mode 100644 index 00000000..8a2c5ee8 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/engine.h @@ -0,0 +1,37 @@ + +#ifndef ENGINE_H +#define ENGINE_H + +//-------------------------------------------------------------------------- +/*! \file engine.h + +\brief Header file containing the class definition for the engine to conveniently run a model in GeNN +*/ +//-------------------------------------------------------------------------- + +//-------------------------------------------------------------------------- +/*! \brief This class contains the methods for running the model. 
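engine::run() above turns the requested duration into an integer number of DT-sized steps (the 1e-2 offset absorbs floating-point round-off), times the loop with std::clock, and stores the completed fraction. A stripped-down sketch of just that bookkeeping follows; DT and do_one_step() are placeholders standing in for GeNN's DT and stepTime() machinery.

// Minimal sketch of the run-loop bookkeeping in engine::run(): convert a
// duration to a step count, time the loop with std::clock, and record the
// fraction of the requested duration that was actually simulated.
#include <cstdio>
#include <ctime>

static const double DT = 0.0001;                    // 0.1 ms timestep (illustrative)
static void do_one_step(double /*t*/) { /* advance the model by one DT */ }

int main()
{
    const double duration = 0.05;                   // seconds to simulate
    const int n_steps = (int)(duration / DT + 1e-2);// epsilon guards against 499.999...
    double t = 0.0;
    const double t_start = t;

    const std::clock_t start = std::clock();
    for (int i = 0; i < n_steps; ++i) {
        do_one_step(t);
        t = (i + 1) * DT;                           // recompute t from the step index
    }
    const double elapsed = double(std::clock() - start) / CLOCKS_PER_SEC;
    const double completed = duration > 0.0 ? (t - t_start) / duration : 1.0;

    std::printf("simulated %g s in %g s CPU time (%.0f%% completed)\n",
                t - t_start, elapsed, 100.0 * completed);
    return 0;
}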
+ */ +//-------------------------------------------------------------------------- + +#include +#include "magicnetwork_model_CODE/definitions.h" +#include "network.h" + +double Network::_last_run_time = 0.0; +double Network::_last_run_completed_fraction = 0.0; + +class engine +{ + public: + // end of data fields + + engine(); + ~engine(); + void free_device_mem(); + void run(double); + void getStateFromGPU(); + void getSpikesFromGPU(); +}; + +#endif diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/generator b/parallel_execution/parallel_execution/code/MushroomBody/genn/generator new file mode 100755 index 00000000..5ba5a35f Binary files /dev/null and b/parallel_execution/parallel_execution/code/MushroomBody/genn/generator differ diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/generator.d b/parallel_execution/parallel_execution/code/MushroomBody/genn/generator.d new file mode 100644 index 00000000..a8d031ea --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/generator.d @@ -0,0 +1,290 @@ +/cognition/home/subora/Documents/github_repository/brian2cuda/parallel_execution/code/MushroomBody/genn/generator: \ + generator.cc \ + /cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/third_party/plog/Appenders/ConsoleAppender.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/third_party/plog/Appenders/IAppender.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/third_party/plog/Record.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/third_party/plog/Severity.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/third_party/plog/Util.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/third_party/plog/WinApi.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/third_party/path.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/logging.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/third_party/plog/Log.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/third_party/plog/Logger.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/third_party/plog/Init.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/third_party/plog/Formatters/CsvFormatter.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/third_party/plog/Formatters/TxtFormatter.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/third_party/plog/Appenders/RollingFileAppender.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/third_party/plog/Converters/UTF8Converter.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/third_party/plog/Converters/NativeEOLConverter.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/gennExport.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/modelSpecInternal.h \ + 
/cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/modelSpec.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/currentSourceInternal.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/currentSource.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/currentSourceModels.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/models.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/snippet.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/gennUtils.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/third_party/sha1.hpp \ + /cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/initVarSnippet.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/varAccess.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/variableMode.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/customUpdateInternal.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/customUpdate.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/customUpdateModels.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/neuronGroupInternal.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/neuronGroup.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/neuronModels.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/synapseGroupInternal.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/synapseGroup.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/initSparseConnectivitySnippet.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/binomial.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/postsynapticModels.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/weightUpdateModels.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/synapseMatrixType.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/code_generator/generateAll.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/code_generator/backendBase.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/code_generator/codeStream.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/code_generator/generateMakefile.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/code_generator/generateMSBuild.h \ + 
/cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/backends/cuda/optimiser.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/backendExport.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/backends/cuda/backend.h \ + /cognition/home/local/cuda/cuda-11.2/include/cuda.h \ + /cognition/home/local/cuda/cuda-11.2/include/cuda_runtime.h \ + /cognition/home/local/cuda/cuda-11.2/include/crt/host_config.h \ + /cognition/home/local/cuda/cuda-11.2/include/builtin_types.h \ + /cognition/home/local/cuda/cuda-11.2/include/device_types.h \ + /cognition/home/local/cuda/cuda-11.2/include/crt/host_defines.h \ + /cognition/home/local/cuda/cuda-11.2/include/driver_types.h \ + /cognition/home/local/cuda/cuda-11.2/include/vector_types.h \ + /cognition/home/local/cuda/cuda-11.2/include/surface_types.h \ + /cognition/home/local/cuda/cuda-11.2/include/texture_types.h \ + /cognition/home/local/cuda/cuda-11.2/include/library_types.h \ + /cognition/home/local/cuda/cuda-11.2/include/channel_descriptor.h \ + /cognition/home/local/cuda/cuda-11.2/include/cuda_runtime_api.h \ + /cognition/home/local/cuda/cuda-11.2/include/cuda_device_runtime_api.h \ + /cognition/home/local/cuda/cuda-11.2/include/driver_functions.h \ + /cognition/home/local/cuda/cuda-11.2/include/vector_functions.h \ + /cognition/home/local/cuda/cuda-11.2/include/vector_functions.hpp \ + /cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/code_generator/backendSIMT.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/code_generator/presynapticUpdateStrategySIMT.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/code_generator/substitutions.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/parallel_execution/code/MushroomBody/genn/magicnetwork_model.cpp \ + /cognition/home/subora/Documents/github_repository/brian2cuda/parallel_execution/code/MushroomBody/genn/brianlib/randomkit/randomkit.cc \ + /cognition/home/subora/Documents/github_repository/brian2cuda/parallel_execution/code/MushroomBody/genn/brianlib/randomkit/randomkit.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/parallel_execution/code/MushroomBody/genn/objects.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/parallel_execution/code/MushroomBody/genn/synapses_classes.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/parallel_execution/code/MushroomBody/genn/brianlib/spikequeue.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/parallel_execution/code/MushroomBody/genn/brianlib/stdint_compat.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/parallel_execution/code/MushroomBody/genn/brianlib/clocks.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/parallel_execution/code/MushroomBody/genn/brianlib/stdint_compat.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/parallel_execution/code/MushroomBody/genn/brianlib/dynamic_array.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/parallel_execution/code/MushroomBody/genn/brianlib/stdint_compat.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/parallel_execution/code/MushroomBody/genn/network.h \ + 
/cognition/home/subora/Documents/github_repository/brian2cuda/parallel_execution/code/MushroomBody/genn/brianlib/randomkit/randomkit.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/parallel_execution/code/MushroomBody/genn/objects.cpp \ + /cognition/home/subora/Documents/github_repository/brian2cuda/parallel_execution/code/MushroomBody/genn/code_objects/synapses_group_variable_set_conditional_codeobject.cpp \ + /cognition/home/subora/Documents/github_repository/brian2cuda/parallel_execution/code/MushroomBody/genn/objects.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/parallel_execution/code/MushroomBody/genn/code_objects/synapses_group_variable_set_conditional_codeobject.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/parallel_execution/code/MushroomBody/genn/brianlib/common_math.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/parallel_execution/code/MushroomBody/genn/code_objects/synapses_1_group_variable_set_conditional_codeobject.cpp \ + /cognition/home/subora/Documents/github_repository/brian2cuda/parallel_execution/code/MushroomBody/genn/code_objects/synapses_1_group_variable_set_conditional_codeobject.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/parallel_execution/code/MushroomBody/genn/code_objects/synapses_1_group_variable_set_conditional_codeobject_1.cpp \ + /cognition/home/subora/Documents/github_repository/brian2cuda/parallel_execution/code/MushroomBody/genn/code_objects/synapses_1_group_variable_set_conditional_codeobject_1.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/parallel_execution/code/MushroomBody/genn/code_objects/synapses_max_row_length.cpp \ + /cognition/home/subora/Documents/github_repository/brian2cuda/parallel_execution/code/MushroomBody/genn/synapses_classes.h \ + /cognition/home/subora/Documents/github_repository/brian2cuda/parallel_execution/code/MushroomBody/genn/code_objects/synapses_1_max_row_length.cpp \ + /cognition/home/subora/Documents/github_repository/brian2cuda/parallel_execution/code/MushroomBody/genn/code_objects/synapses_2_max_row_length.cpp + +/cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/third_party/plog/Appenders/ConsoleAppender.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/third_party/plog/Appenders/IAppender.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/third_party/plog/Record.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/third_party/plog/Severity.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/third_party/plog/Util.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/third_party/plog/WinApi.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/third_party/path.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/logging.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/third_party/plog/Log.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/third_party/plog/Logger.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/third_party/plog/Init.h: + 
+/cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/third_party/plog/Formatters/CsvFormatter.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/third_party/plog/Formatters/TxtFormatter.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/third_party/plog/Appenders/RollingFileAppender.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/third_party/plog/Converters/UTF8Converter.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/third_party/plog/Converters/NativeEOLConverter.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/gennExport.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/modelSpecInternal.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/modelSpec.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/currentSourceInternal.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/currentSource.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/currentSourceModels.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/models.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/snippet.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/gennUtils.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/third_party/sha1.hpp: + +/cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/initVarSnippet.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/varAccess.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/variableMode.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/customUpdateInternal.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/customUpdate.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/customUpdateModels.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/neuronGroupInternal.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/neuronGroup.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/neuronModels.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/synapseGroupInternal.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/synapseGroup.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/initSparseConnectivitySnippet.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/binomial.h: + 
+/cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/postsynapticModels.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/weightUpdateModels.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/synapseMatrixType.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/code_generator/generateAll.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/code_generator/backendBase.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/code_generator/codeStream.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/code_generator/generateMakefile.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/code_generator/generateMSBuild.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/backends/cuda/optimiser.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/backendExport.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/backends/cuda/backend.h: + +/cognition/home/local/cuda/cuda-11.2/include/cuda.h: + +/cognition/home/local/cuda/cuda-11.2/include/cuda_runtime.h: + +/cognition/home/local/cuda/cuda-11.2/include/crt/host_config.h: + +/cognition/home/local/cuda/cuda-11.2/include/builtin_types.h: + +/cognition/home/local/cuda/cuda-11.2/include/device_types.h: + +/cognition/home/local/cuda/cuda-11.2/include/crt/host_defines.h: + +/cognition/home/local/cuda/cuda-11.2/include/driver_types.h: + +/cognition/home/local/cuda/cuda-11.2/include/vector_types.h: + +/cognition/home/local/cuda/cuda-11.2/include/surface_types.h: + +/cognition/home/local/cuda/cuda-11.2/include/texture_types.h: + +/cognition/home/local/cuda/cuda-11.2/include/library_types.h: + +/cognition/home/local/cuda/cuda-11.2/include/channel_descriptor.h: + +/cognition/home/local/cuda/cuda-11.2/include/cuda_runtime_api.h: + +/cognition/home/local/cuda/cuda-11.2/include/cuda_device_runtime_api.h: + +/cognition/home/local/cuda/cuda-11.2/include/driver_functions.h: + +/cognition/home/local/cuda/cuda-11.2/include/vector_functions.h: + +/cognition/home/local/cuda/cuda-11.2/include/vector_functions.hpp: + +/cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/code_generator/backendSIMT.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/code_generator/presynapticUpdateStrategySIMT.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/frozen_repos/genn/include/genn/genn/code_generator/substitutions.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/parallel_execution/code/MushroomBody/genn/magicnetwork_model.cpp: + +/cognition/home/subora/Documents/github_repository/brian2cuda/parallel_execution/code/MushroomBody/genn/brianlib/randomkit/randomkit.cc: + +/cognition/home/subora/Documents/github_repository/brian2cuda/parallel_execution/code/MushroomBody/genn/brianlib/randomkit/randomkit.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/parallel_execution/code/MushroomBody/genn/objects.h: + 
+/cognition/home/subora/Documents/github_repository/brian2cuda/parallel_execution/code/MushroomBody/genn/synapses_classes.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/parallel_execution/code/MushroomBody/genn/brianlib/spikequeue.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/parallel_execution/code/MushroomBody/genn/brianlib/stdint_compat.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/parallel_execution/code/MushroomBody/genn/brianlib/clocks.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/parallel_execution/code/MushroomBody/genn/brianlib/stdint_compat.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/parallel_execution/code/MushroomBody/genn/brianlib/dynamic_array.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/parallel_execution/code/MushroomBody/genn/brianlib/stdint_compat.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/parallel_execution/code/MushroomBody/genn/network.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/parallel_execution/code/MushroomBody/genn/brianlib/randomkit/randomkit.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/parallel_execution/code/MushroomBody/genn/objects.cpp: + +/cognition/home/subora/Documents/github_repository/brian2cuda/parallel_execution/code/MushroomBody/genn/code_objects/synapses_group_variable_set_conditional_codeobject.cpp: + +/cognition/home/subora/Documents/github_repository/brian2cuda/parallel_execution/code/MushroomBody/genn/objects.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/parallel_execution/code/MushroomBody/genn/code_objects/synapses_group_variable_set_conditional_codeobject.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/parallel_execution/code/MushroomBody/genn/brianlib/common_math.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/parallel_execution/code/MushroomBody/genn/code_objects/synapses_1_group_variable_set_conditional_codeobject.cpp: + +/cognition/home/subora/Documents/github_repository/brian2cuda/parallel_execution/code/MushroomBody/genn/code_objects/synapses_1_group_variable_set_conditional_codeobject.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/parallel_execution/code/MushroomBody/genn/code_objects/synapses_1_group_variable_set_conditional_codeobject_1.cpp: + +/cognition/home/subora/Documents/github_repository/brian2cuda/parallel_execution/code/MushroomBody/genn/code_objects/synapses_1_group_variable_set_conditional_codeobject_1.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/parallel_execution/code/MushroomBody/genn/code_objects/synapses_max_row_length.cpp: + +/cognition/home/subora/Documents/github_repository/brian2cuda/parallel_execution/code/MushroomBody/genn/synapses_classes.h: + +/cognition/home/subora/Documents/github_repository/brian2cuda/parallel_execution/code/MushroomBody/genn/code_objects/synapses_1_max_row_length.cpp: + +/cognition/home/subora/Documents/github_repository/brian2cuda/parallel_execution/code/MushroomBody/genn/code_objects/synapses_2_max_row_length.cpp: diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model.cpp b/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model.cpp new file mode 100644 index 00000000..22f01d6c --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model.cpp @@ -0,0 +1,728 @@ +// define the time step 
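The SET_SIM_CODE strings in the model definition that follows look daunting, but each state variable is updated with the closed-form solution of a linear ODE dx/dt = A·x + B over one timestep (the other variables being held constant over the step): the generated _BA_* terms are B/A and the new value is −B/A + (x + B/A)·exp(A·DT). A minimal numerical sketch of that update rule is given below; A, B and DT are illustrative values, not taken from the model.

// Minimal sketch of the update rule behind the generated _BA_* expressions:
// for dx/dt = A*x + B (A != 0, other variables frozen over the step) the exact
// one-step solution is  x(t+DT) = -B/A + (x(t) + B/A) * exp(A*DT).
#include <cmath>
#include <cstdio>

int main()
{
    const double A  = -50.0;   // decay rate in 1/s (illustrative)
    const double B  = 10.0;    // constant drive (illustrative)
    const double DT = 1e-4;    // timestep in s (illustrative)

    const double BA = B / A;   // the generated code calls this _BA_x
    double x = 0.0;
    for (int i = 0; i < 1000; ++i)
        x = -BA + (x + BA) * std::exp(A * DT);   // exact update over one DT

    std::printf("x after 0.1 s = %g (steady state -B/A = %g)\n", x, -B / A);
    return 0;
}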
+ +#include +#include "modelSpec.h" +#include "brianlib/randomkit/randomkit.cc" + +#include "objects.h" +#include "objects.cpp" +// We need these to compile objects.cpp, but they are only used in _write_arrays which we never call. +double Network::_last_run_time = 0.0; +double Network::_last_run_completed_fraction = 0.0; + +#include "code_objects/synapses_group_variable_set_conditional_codeobject.cpp" +#include "code_objects/synapses_1_group_variable_set_conditional_codeobject.cpp" +#include "code_objects/synapses_1_group_variable_set_conditional_codeobject_1.cpp" + +#include "code_objects/synapses_max_row_length.cpp" +#include "code_objects/synapses_1_max_row_length.cpp" +#include "code_objects/synapses_2_max_row_length.cpp" + +//-------------------------------------------------------------------------- +/*! \brief This function defines the Brian2GeNN_model +*/ +//-------------------------------------------------------------------------- + +// +// define the neuron model classes + +class neurongroupNEURON : public NeuronModels::Base +{ +public: + DECLARE_MODEL(neurongroupNEURON, 12, 8); + + SET_SIM_CODE("// Update \"constant over DT\" subexpressions (if any)\n\ +\n\ +\n\ +\n\ +// PoissonInputs targetting this group (if any)\n\ +\n\ +\n\ +\n\ +// Update state variables and the threshold condition\n\ +\n\ +$(not_refractory) = $(not_refractory) || (! ($(V) > (0 * $(mV))));\n\ +double _BA_V = 1.0f*((((1.0f*(((1.0 * $(E_K)) * $(g_K)) * (_brian_pow($(n), 4.0)))/$(C)) + (1.0f*((((1.0 * $(E_Na)) * $(g_Na)) * $(h)) * (_brian_pow($(m), 3.0)))/$(C))) + (1.0f*((1.0 * $(E_e)) * $(g_PN_iKC))/$(C))) + (1.0f*((1.0 * $(E_leak)) * $(g_leak))/$(C)))/((((1.0f*(((- 1.0) * $(g_K)) * (_brian_pow($(n), 4.0)))/$(C)) - (1.0f*(((1.0 * $(g_Na)) * $(h)) * (_brian_pow($(m), 3.0)))/$(C))) - (1.0f*(1.0 * $(g_PN_iKC))/$(C))) - (1.0f*(1.0 * $(g_leak))/$(C)));\n\ +double _V = (- _BA_V) + (($(V) + _BA_V) * exp(DT * ((((1.0f*(((- 1.0) * $(g_K)) * (_brian_pow($(n), 4.0)))/$(C)) - (1.0f*(((1.0 * $(g_Na)) * $(h)) * (_brian_pow($(m), 3.0)))/$(C))) - (1.0f*(1.0 * $(g_PN_iKC))/$(C))) - (1.0f*(1.0 * $(g_leak))/$(C)))));\n\ +double _g_PN_iKC = $(g_PN_iKC) * exp(1.0f*(- DT)/$(tau_PN_iKC));\n\ +double _BA_h = 1.0f*((0.329137207652868 * exp(1.0f*((- 0.0555555555555556) * $(V))/$(mV))) * exp(1.0f*(0.0555555555555556 * $(VT))/$(mV)))/($(ms) * ((1.0f*(- 4.0)/($(ms) + (((2980.95798704173 * $(ms)) * exp(1.0f*((- 0.2) * $(V))/$(mV))) * exp(1.0f*(0.2 * $(VT))/$(mV))))) - (1.0f*((0.329137207652868 * exp(1.0f*((- 0.0555555555555556) * $(V))/$(mV))) * exp(1.0f*(0.0555555555555556 * $(VT))/$(mV)))/$(ms))));\n\ +double _h = (- _BA_h) + ((_BA_h + $(h)) * exp(DT * ((1.0f*(- 4.0)/($(ms) + (((2980.95798704173 * $(ms)) * exp(1.0f*((- 0.2) * $(V))/$(mV))) * exp(1.0f*(0.2 * $(VT))/$(mV))))) - (1.0f*((0.329137207652868 * exp(1.0f*((- 0.0555555555555556) * $(V))/$(mV))) * exp(1.0f*(0.0555555555555556 * $(VT))/$(mV)))/$(ms)))));\n\ +double _BA_m = 1.0f*(((1.0f*((- 0.32) * $(V))/((((- 1.0) * (_brian_pow($(mV), 1.0))) * $(ms)) + ((((25.7903399171931 * (_brian_pow($(mV), 1.0))) * $(ms)) * exp(1.0f*((- 0.25) * $(V))/$(mV))) * exp(1.0f*(0.25 * $(VT))/$(mV))))) + (1.0f*(0.32 * $(VT))/((((- 1.0) * (_brian_pow($(mV), 1.0))) * $(ms)) + ((((25.7903399171931 * (_brian_pow($(mV), 1.0))) * $(ms)) * exp(1.0f*((- 0.25) * $(V))/$(mV))) * exp(1.0f*(0.25 * $(VT))/$(mV)))))) + (1.0f*(4.16 * $(mV))/((((- 1.0) * (_brian_pow($(mV), 1.0))) * $(ms)) + ((((25.7903399171931 * (_brian_pow($(mV), 1.0))) * $(ms)) * exp(1.0f*((- 0.25) * $(V))/$(mV))) * exp(1.0f*(0.25 * 
$(VT))/$(mV))))))/((((((1.0f*((- 0.28) * $(V))/(((((0.000335462627902512 * (_brian_pow($(mV), 1.0))) * $(ms)) * exp(1.0f*(0.2 * $(V))/$(mV))) * exp(1.0f*((- 0.2) * $(VT))/$(mV))) - ((1.0 * (_brian_pow($(mV), 1.0))) * $(ms)))) + (1.0f*(0.32 * $(V))/((((- 1.0) * (_brian_pow($(mV), 1.0))) * $(ms)) + ((((25.7903399171931 * (_brian_pow($(mV), 1.0))) * $(ms)) * exp(1.0f*((- 0.25) * $(V))/$(mV))) * exp(1.0f*(0.25 * $(VT))/$(mV)))))) + (1.0f*(0.28 * $(VT))/(((((0.000335462627902512 * (_brian_pow($(mV), 1.0))) * $(ms)) * exp(1.0f*(0.2 * $(V))/$(mV))) * exp(1.0f*((- 0.2) * $(VT))/$(mV))) - ((1.0 * (_brian_pow($(mV), 1.0))) * $(ms))))) - (1.0f*(0.32 * $(VT))/((((- 1.0) * (_brian_pow($(mV), 1.0))) * $(ms)) + ((((25.7903399171931 * (_brian_pow($(mV), 1.0))) * $(ms)) * exp(1.0f*((- 0.25) * $(V))/$(mV))) * exp(1.0f*(0.25 * $(VT))/$(mV)))))) + (1.0f*(11.2 * $(mV))/(((((0.000335462627902512 * (_brian_pow($(mV), 1.0))) * $(ms)) * exp(1.0f*(0.2 * $(V))/$(mV))) * exp(1.0f*((- 0.2) * $(VT))/$(mV))) - ((1.0 * (_brian_pow($(mV), 1.0))) * $(ms))))) - (1.0f*(4.16 * $(mV))/((((- 1.0) * (_brian_pow($(mV), 1.0))) * $(ms)) + ((((25.7903399171931 * (_brian_pow($(mV), 1.0))) * $(ms)) * exp(1.0f*((- 0.25) * $(V))/$(mV))) * exp(1.0f*(0.25 * $(VT))/$(mV))))));\n\ +double _m = (- _BA_m) + ((_BA_m + $(m)) * exp(DT * ((((((1.0f*((- 0.28) * $(V))/(((((0.000335462627902512 * (_brian_pow($(mV), 1.0))) * $(ms)) * exp(1.0f*(0.2 * $(V))/$(mV))) * exp(1.0f*((- 0.2) * $(VT))/$(mV))) - ((1.0 * (_brian_pow($(mV), 1.0))) * $(ms)))) + (1.0f*(0.32 * $(V))/((((- 1.0) * (_brian_pow($(mV), 1.0))) * $(ms)) + ((((25.7903399171931 * (_brian_pow($(mV), 1.0))) * $(ms)) * exp(1.0f*((- 0.25) * $(V))/$(mV))) * exp(1.0f*(0.25 * $(VT))/$(mV)))))) + (1.0f*(0.28 * $(VT))/(((((0.000335462627902512 * (_brian_pow($(mV), 1.0))) * $(ms)) * exp(1.0f*(0.2 * $(V))/$(mV))) * exp(1.0f*((- 0.2) * $(VT))/$(mV))) - ((1.0 * (_brian_pow($(mV), 1.0))) * $(ms))))) - (1.0f*(0.32 * $(VT))/((((- 1.0) * (_brian_pow($(mV), 1.0))) * $(ms)) + ((((25.7903399171931 * (_brian_pow($(mV), 1.0))) * $(ms)) * exp(1.0f*((- 0.25) * $(V))/$(mV))) * exp(1.0f*(0.25 * $(VT))/$(mV)))))) + (1.0f*(11.2 * $(mV))/(((((0.000335462627902512 * (_brian_pow($(mV), 1.0))) * $(ms)) * exp(1.0f*(0.2 * $(V))/$(mV))) * exp(1.0f*((- 0.2) * $(VT))/$(mV))) - ((1.0 * (_brian_pow($(mV), 1.0))) * $(ms))))) - (1.0f*(4.16 * $(mV))/((((- 1.0) * (_brian_pow($(mV), 1.0))) * $(ms)) + ((((25.7903399171931 * (_brian_pow($(mV), 1.0))) * $(ms)) * exp(1.0f*((- 0.25) * $(V))/$(mV))) * exp(1.0f*(0.25 * $(VT))/$(mV))))))));\n\ +double _BA_n = 1.0f*(((1.0f*((- 0.032) * $(V))/((((- 1.0) * $(mV)) * $(ms)) + ((((20.0855369231877 * $(mV)) * $(ms)) * exp(1.0f*((- 0.2) * $(V))/$(mV))) * exp(1.0f*(0.2 * $(VT))/$(mV))))) + (1.0f*(0.032 * $(VT))/((((- 1.0) * $(mV)) * $(ms)) + ((((20.0855369231877 * $(mV)) * $(ms)) * exp(1.0f*((- 0.2) * $(V))/$(mV))) * exp(1.0f*(0.2 * $(VT))/$(mV)))))) + (1.0f*(0.48 * $(mV))/((((- 1.0) * $(mV)) * $(ms)) + ((((20.0855369231877 * $(mV)) * $(ms)) * exp(1.0f*((- 0.2) * $(V))/$(mV))) * exp(1.0f*(0.2 * $(VT))/$(mV))))))/((((1.0f*(0.032 * $(V))/((((- 1.0) * $(mV)) * $(ms)) + ((((20.0855369231877 * $(mV)) * $(ms)) * exp(1.0f*((- 0.2) * $(V))/$(mV))) * exp(1.0f*(0.2 * $(VT))/$(mV))))) - (1.0f*(0.032 * $(VT))/((((- 1.0) * $(mV)) * $(ms)) + ((((20.0855369231877 * $(mV)) * $(ms)) * exp(1.0f*((- 0.2) * $(V))/$(mV))) * exp(1.0f*(0.2 * $(VT))/$(mV)))))) - (1.0f*(0.48 * $(mV))/((((- 1.0) * $(mV)) * $(ms)) + ((((20.0855369231877 * $(mV)) * $(ms)) * exp(1.0f*((- 0.2) * $(V))/$(mV))) * exp(1.0f*(0.2 * $(VT))/$(mV)))))) - 
(1.0f*((0.642012708343871 * exp(1.0f*((- 0.025) * $(V))/$(mV))) * exp(1.0f*(0.025 * $(VT))/$(mV)))/$(ms)));\n\ +double _n = (- _BA_n) + ((_BA_n + $(n)) * exp(DT * ((((1.0f*(0.032 * $(V))/((((- 1.0) * $(mV)) * $(ms)) + ((((20.0855369231877 * $(mV)) * $(ms)) * exp(1.0f*((- 0.2) * $(V))/$(mV))) * exp(1.0f*(0.2 * $(VT))/$(mV))))) - (1.0f*(0.032 * $(VT))/((((- 1.0) * $(mV)) * $(ms)) + ((((20.0855369231877 * $(mV)) * $(ms)) * exp(1.0f*((- 0.2) * $(V))/$(mV))) * exp(1.0f*(0.2 * $(VT))/$(mV)))))) - (1.0f*(0.48 * $(mV))/((((- 1.0) * $(mV)) * $(ms)) + ((((20.0855369231877 * $(mV)) * $(ms)) * exp(1.0f*((- 0.2) * $(V))/$(mV))) * exp(1.0f*(0.2 * $(VT))/$(mV)))))) - (1.0f*((0.642012708343871 * exp(1.0f*((- 0.025) * $(V))/$(mV))) * exp(1.0f*(0.025 * $(VT))/$(mV)))/$(ms)))));\n\ +$(V) = _V;\n\ +$(g_PN_iKC) = _g_PN_iKC;\n\ +$(h) = _h;\n\ +$(m) = _m;\n\ +$(n) = _n;\n\ +char _cond = ($(V) > (0 * $(mV))) && $(not_refractory);"); + SET_THRESHOLD_CONDITION_CODE("_cond"); + SET_RESET_CODE("$(lastspike) = t;\n\ +$(not_refractory) = false;"); + + SET_SUPPORT_CODE("\n\ +template < typename T1, typename T2 > struct _higher_type;\n\ +template < > struct _higher_type { typedef int type; };\n\ +template < > struct _higher_type { typedef long type; };\n\ +template < > struct _higher_type { typedef float type; };\n\ +template < > struct _higher_type { typedef double type; };\n\ +template < > struct _higher_type { typedef long type; };\n\ +template < > struct _higher_type { typedef long type; };\n\ +template < > struct _higher_type { typedef float type; };\n\ +template < > struct _higher_type { typedef double type; };\n\ +template < > struct _higher_type { typedef float type; };\n\ +template < > struct _higher_type { typedef float type; };\n\ +template < > struct _higher_type { typedef float type; };\n\ +template < > struct _higher_type { typedef double type; };\n\ +template < > struct _higher_type { typedef double type; };\n\ +template < > struct _higher_type { typedef double type; };\n\ +template < > struct _higher_type { typedef double type; };\n\ +template < > struct _higher_type { typedef double type; };\n\ +template < typename T1, typename T2 >\n\ +SUPPORT_CODE_FUNC typename _higher_type::type \n\ +_brian_mod(T1 x, T2 y)\n\ +{{\n\ + return x-y*floor(1.0*x/y);\n\ +}}\n\ +template < typename T1, typename T2 >\n\ +SUPPORT_CODE_FUNC typename _higher_type::type\n\ +_brian_floordiv(T1 x, T2 y)\n\ +{{\n\ + return floor(1.0*x/y);\n\ +}}\n\ +#ifdef _MSC_VER\n\ +#define _brian_pow(x, y) (pow((double)(x), (y)))\n\ +#else\n\ +#define _brian_pow(x, y) (pow((x), (y)))\n\ +#endif\n\ +\n\ +\n\ +\n\ +"); + + SET_PARAM_NAMES({ + "VT", "E_K", "E_e", "ms", "g_Na", "g_leak", "E_leak", "tau_PN_iKC", "C", "E_Na", "g_K", "mV" }); + SET_VARS({ + {"i", "int32_t"}, {"V", "double"}, {"g_PN_iKC", "double"}, {"h", "double"}, {"m", "double"}, {"n", "double"}, {"lastspike", "double"}, {"not_refractory", "char"} }); + SET_EXTRA_GLOBAL_PARAMS({ + }); + SET_NEEDS_AUTO_REFRACTORY(false); +}; +IMPLEMENT_MODEL(neurongroupNEURON); +class neurongroup_1NEURON : public NeuronModels::Base +{ +public: + DECLARE_MODEL(neurongroup_1NEURON, 14, 9); + + SET_SIM_CODE("// Update \"constant over DT\" subexpressions (if any)\n\ +\n\ +\n\ +\n\ +// PoissonInputs targetting this group (if any)\n\ +\n\ +\n\ +\n\ +// Update state variables and the threshold condition\n\ +\n\ +$(not_refractory) = $(not_refractory) || (! 
($(V) > (0.0 * $(mV))));\n\ +double _BA_V = 1.0f*(((((1.0f*(((1.0 * $(E_K)) * $(g_K)) * (_brian_pow($(n), 4.0)))/$(C)) + (1.0f*((((1.0 * $(E_Na)) * $(g_Na)) * $(h)) * (_brian_pow($(m), 3.0)))/$(C))) + (1.0f*((1.0 * $(E_e)) * $(g_iKC_eKC))/$(C))) + (1.0f*((1.0 * $(E_i)) * $(g_eKC_eKC))/$(C))) + (1.0f*((1.0 * $(E_leak)) * $(g_leak))/$(C)))/(((((1.0f*(((- 1.0) * $(g_K)) * (_brian_pow($(n), 4.0)))/$(C)) - (1.0f*(((1.0 * $(g_Na)) * $(h)) * (_brian_pow($(m), 3.0)))/$(C))) - (1.0f*(1.0 * $(g_eKC_eKC))/$(C))) - (1.0f*(1.0 * $(g_iKC_eKC))/$(C))) - (1.0f*(1.0 * $(g_leak))/$(C)));\n\ +double _V = (- _BA_V) + (($(V) + _BA_V) * exp(DT * (((((1.0f*(((- 1.0) * $(g_K)) * (_brian_pow($(n), 4.0)))/$(C)) - (1.0f*(((1.0 * $(g_Na)) * $(h)) * (_brian_pow($(m), 3.0)))/$(C))) - (1.0f*(1.0 * $(g_eKC_eKC))/$(C))) - (1.0f*(1.0 * $(g_iKC_eKC))/$(C))) - (1.0f*(1.0 * $(g_leak))/$(C)))));\n\ +double _g_eKC_eKC = $(g_eKC_eKC) * exp(1.0f*(- DT)/$(tau_eKC_eKC));\n\ +double _g_iKC_eKC = $(g_iKC_eKC) * exp(1.0f*(- DT)/$(tau_iKC_eKC));\n\ +double _BA_h = 1.0f*((0.329137207652868 * exp(1.0f*((- 0.0555555555555556) * $(V))/$(mV))) * exp(1.0f*(0.0555555555555556 * $(VT))/$(mV)))/($(ms) * ((1.0f*(- 4.0)/($(ms) + (((2980.95798704173 * $(ms)) * exp(1.0f*((- 0.2) * $(V))/$(mV))) * exp(1.0f*(0.2 * $(VT))/$(mV))))) - (1.0f*((0.329137207652868 * exp(1.0f*((- 0.0555555555555556) * $(V))/$(mV))) * exp(1.0f*(0.0555555555555556 * $(VT))/$(mV)))/$(ms))));\n\ +double _h = (- _BA_h) + ((_BA_h + $(h)) * exp(DT * ((1.0f*(- 4.0)/($(ms) + (((2980.95798704173 * $(ms)) * exp(1.0f*((- 0.2) * $(V))/$(mV))) * exp(1.0f*(0.2 * $(VT))/$(mV))))) - (1.0f*((0.329137207652868 * exp(1.0f*((- 0.0555555555555556) * $(V))/$(mV))) * exp(1.0f*(0.0555555555555556 * $(VT))/$(mV)))/$(ms)))));\n\ +double _BA_m = 1.0f*(((1.0f*((- 0.32) * $(V))/((((- 1.0) * (_brian_pow($(mV), 1.0))) * $(ms)) + ((((25.7903399171931 * (_brian_pow($(mV), 1.0))) * $(ms)) * exp(1.0f*((- 0.25) * $(V))/$(mV))) * exp(1.0f*(0.25 * $(VT))/$(mV))))) + (1.0f*(0.32 * $(VT))/((((- 1.0) * (_brian_pow($(mV), 1.0))) * $(ms)) + ((((25.7903399171931 * (_brian_pow($(mV), 1.0))) * $(ms)) * exp(1.0f*((- 0.25) * $(V))/$(mV))) * exp(1.0f*(0.25 * $(VT))/$(mV)))))) + (1.0f*(4.16 * $(mV))/((((- 1.0) * (_brian_pow($(mV), 1.0))) * $(ms)) + ((((25.7903399171931 * (_brian_pow($(mV), 1.0))) * $(ms)) * exp(1.0f*((- 0.25) * $(V))/$(mV))) * exp(1.0f*(0.25 * $(VT))/$(mV))))))/((((((1.0f*((- 0.28) * $(V))/(((((0.000335462627902512 * (_brian_pow($(mV), 1.0))) * $(ms)) * exp(1.0f*(0.2 * $(V))/$(mV))) * exp(1.0f*((- 0.2) * $(VT))/$(mV))) - ((1.0 * (_brian_pow($(mV), 1.0))) * $(ms)))) + (1.0f*(0.32 * $(V))/((((- 1.0) * (_brian_pow($(mV), 1.0))) * $(ms)) + ((((25.7903399171931 * (_brian_pow($(mV), 1.0))) * $(ms)) * exp(1.0f*((- 0.25) * $(V))/$(mV))) * exp(1.0f*(0.25 * $(VT))/$(mV)))))) + (1.0f*(0.28 * $(VT))/(((((0.000335462627902512 * (_brian_pow($(mV), 1.0))) * $(ms)) * exp(1.0f*(0.2 * $(V))/$(mV))) * exp(1.0f*((- 0.2) * $(VT))/$(mV))) - ((1.0 * (_brian_pow($(mV), 1.0))) * $(ms))))) - (1.0f*(0.32 * $(VT))/((((- 1.0) * (_brian_pow($(mV), 1.0))) * $(ms)) + ((((25.7903399171931 * (_brian_pow($(mV), 1.0))) * $(ms)) * exp(1.0f*((- 0.25) * $(V))/$(mV))) * exp(1.0f*(0.25 * $(VT))/$(mV)))))) + (1.0f*(11.2 * $(mV))/(((((0.000335462627902512 * (_brian_pow($(mV), 1.0))) * $(ms)) * exp(1.0f*(0.2 * $(V))/$(mV))) * exp(1.0f*((- 0.2) * $(VT))/$(mV))) - ((1.0 * (_brian_pow($(mV), 1.0))) * $(ms))))) - (1.0f*(4.16 * $(mV))/((((- 1.0) * (_brian_pow($(mV), 1.0))) * $(ms)) + ((((25.7903399171931 * (_brian_pow($(mV), 1.0))) * $(ms)) * exp(1.0f*((- 
0.25) * $(V))/$(mV))) * exp(1.0f*(0.25 * $(VT))/$(mV))))));\n\ +double _m = (- _BA_m) + ((_BA_m + $(m)) * exp(DT * ((((((1.0f*((- 0.28) * $(V))/(((((0.000335462627902512 * (_brian_pow($(mV), 1.0))) * $(ms)) * exp(1.0f*(0.2 * $(V))/$(mV))) * exp(1.0f*((- 0.2) * $(VT))/$(mV))) - ((1.0 * (_brian_pow($(mV), 1.0))) * $(ms)))) + (1.0f*(0.32 * $(V))/((((- 1.0) * (_brian_pow($(mV), 1.0))) * $(ms)) + ((((25.7903399171931 * (_brian_pow($(mV), 1.0))) * $(ms)) * exp(1.0f*((- 0.25) * $(V))/$(mV))) * exp(1.0f*(0.25 * $(VT))/$(mV)))))) + (1.0f*(0.28 * $(VT))/(((((0.000335462627902512 * (_brian_pow($(mV), 1.0))) * $(ms)) * exp(1.0f*(0.2 * $(V))/$(mV))) * exp(1.0f*((- 0.2) * $(VT))/$(mV))) - ((1.0 * (_brian_pow($(mV), 1.0))) * $(ms))))) - (1.0f*(0.32 * $(VT))/((((- 1.0) * (_brian_pow($(mV), 1.0))) * $(ms)) + ((((25.7903399171931 * (_brian_pow($(mV), 1.0))) * $(ms)) * exp(1.0f*((- 0.25) * $(V))/$(mV))) * exp(1.0f*(0.25 * $(VT))/$(mV)))))) + (1.0f*(11.2 * $(mV))/(((((0.000335462627902512 * (_brian_pow($(mV), 1.0))) * $(ms)) * exp(1.0f*(0.2 * $(V))/$(mV))) * exp(1.0f*((- 0.2) * $(VT))/$(mV))) - ((1.0 * (_brian_pow($(mV), 1.0))) * $(ms))))) - (1.0f*(4.16 * $(mV))/((((- 1.0) * (_brian_pow($(mV), 1.0))) * $(ms)) + ((((25.7903399171931 * (_brian_pow($(mV), 1.0))) * $(ms)) * exp(1.0f*((- 0.25) * $(V))/$(mV))) * exp(1.0f*(0.25 * $(VT))/$(mV))))))));\n\ +double _BA_n = 1.0f*(((1.0f*((- 0.032) * $(V))/((((- 1.0) * $(mV)) * $(ms)) + ((((20.0855369231877 * $(mV)) * $(ms)) * exp(1.0f*((- 0.2) * $(V))/$(mV))) * exp(1.0f*(0.2 * $(VT))/$(mV))))) + (1.0f*(0.032 * $(VT))/((((- 1.0) * $(mV)) * $(ms)) + ((((20.0855369231877 * $(mV)) * $(ms)) * exp(1.0f*((- 0.2) * $(V))/$(mV))) * exp(1.0f*(0.2 * $(VT))/$(mV)))))) + (1.0f*(0.48 * $(mV))/((((- 1.0) * $(mV)) * $(ms)) + ((((20.0855369231877 * $(mV)) * $(ms)) * exp(1.0f*((- 0.2) * $(V))/$(mV))) * exp(1.0f*(0.2 * $(VT))/$(mV))))))/((((1.0f*(0.032 * $(V))/((((- 1.0) * $(mV)) * $(ms)) + ((((20.0855369231877 * $(mV)) * $(ms)) * exp(1.0f*((- 0.2) * $(V))/$(mV))) * exp(1.0f*(0.2 * $(VT))/$(mV))))) - (1.0f*(0.032 * $(VT))/((((- 1.0) * $(mV)) * $(ms)) + ((((20.0855369231877 * $(mV)) * $(ms)) * exp(1.0f*((- 0.2) * $(V))/$(mV))) * exp(1.0f*(0.2 * $(VT))/$(mV)))))) - (1.0f*(0.48 * $(mV))/((((- 1.0) * $(mV)) * $(ms)) + ((((20.0855369231877 * $(mV)) * $(ms)) * exp(1.0f*((- 0.2) * $(V))/$(mV))) * exp(1.0f*(0.2 * $(VT))/$(mV)))))) - (1.0f*((0.642012708343871 * exp(1.0f*((- 0.025) * $(V))/$(mV))) * exp(1.0f*(0.025 * $(VT))/$(mV)))/$(ms)));\n\ +double _n = (- _BA_n) + ((_BA_n + $(n)) * exp(DT * ((((1.0f*(0.032 * $(V))/((((- 1.0) * $(mV)) * $(ms)) + ((((20.0855369231877 * $(mV)) * $(ms)) * exp(1.0f*((- 0.2) * $(V))/$(mV))) * exp(1.0f*(0.2 * $(VT))/$(mV))))) - (1.0f*(0.032 * $(VT))/((((- 1.0) * $(mV)) * $(ms)) + ((((20.0855369231877 * $(mV)) * $(ms)) * exp(1.0f*((- 0.2) * $(V))/$(mV))) * exp(1.0f*(0.2 * $(VT))/$(mV)))))) - (1.0f*(0.48 * $(mV))/((((- 1.0) * $(mV)) * $(ms)) + ((((20.0855369231877 * $(mV)) * $(ms)) * exp(1.0f*((- 0.2) * $(V))/$(mV))) * exp(1.0f*(0.2 * $(VT))/$(mV)))))) - (1.0f*((0.642012708343871 * exp(1.0f*((- 0.025) * $(V))/$(mV))) * exp(1.0f*(0.025 * $(VT))/$(mV)))/$(ms)))));\n\ +$(V) = _V;\n\ +$(g_eKC_eKC) = _g_eKC_eKC;\n\ +$(g_iKC_eKC) = _g_iKC_eKC;\n\ +$(h) = _h;\n\ +$(m) = _m;\n\ +$(n) = _n;\n\ +char _cond = ($(V) > (0.0 * $(mV))) && $(not_refractory);"); + SET_THRESHOLD_CONDITION_CODE("_cond"); + SET_RESET_CODE("$(lastspike) = t;\n\ +$(not_refractory) = false;"); + + SET_SUPPORT_CODE("\n\ +template < typename T1, typename T2 > struct _higher_type;\n\ +template < > struct 
_higher_type { typedef int type; };\n\ +template < > struct _higher_type { typedef long type; };\n\ +template < > struct _higher_type { typedef float type; };\n\ +template < > struct _higher_type { typedef double type; };\n\ +template < > struct _higher_type { typedef long type; };\n\ +template < > struct _higher_type { typedef long type; };\n\ +template < > struct _higher_type { typedef float type; };\n\ +template < > struct _higher_type { typedef double type; };\n\ +template < > struct _higher_type { typedef float type; };\n\ +template < > struct _higher_type { typedef float type; };\n\ +template < > struct _higher_type { typedef float type; };\n\ +template < > struct _higher_type { typedef double type; };\n\ +template < > struct _higher_type { typedef double type; };\n\ +template < > struct _higher_type { typedef double type; };\n\ +template < > struct _higher_type { typedef double type; };\n\ +template < > struct _higher_type { typedef double type; };\n\ +template < typename T1, typename T2 >\n\ +SUPPORT_CODE_FUNC typename _higher_type::type \n\ +_brian_mod(T1 x, T2 y)\n\ +{{\n\ + return x-y*floor(1.0*x/y);\n\ +}}\n\ +template < typename T1, typename T2 >\n\ +SUPPORT_CODE_FUNC typename _higher_type::type\n\ +_brian_floordiv(T1 x, T2 y)\n\ +{{\n\ + return floor(1.0*x/y);\n\ +}}\n\ +#ifdef _MSC_VER\n\ +#define _brian_pow(x, y) (pow((double)(x), (y)))\n\ +#else\n\ +#define _brian_pow(x, y) (pow((x), (y)))\n\ +#endif\n\ +\n\ +\n\ +\n\ +"); + + SET_PARAM_NAMES({ + "E_i", "VT", "E_K", "E_e", "ms", "g_Na", "tau_eKC_eKC", "g_leak", "E_leak", "C", "E_Na", "tau_iKC_eKC", "g_K", "mV" }); + SET_VARS({ + {"i", "int32_t"}, {"V", "double"}, {"g_eKC_eKC", "double"}, {"g_iKC_eKC", "double"}, {"h", "double"}, {"m", "double"}, {"n", "double"}, {"lastspike", "double"}, {"not_refractory", "char"} }); + SET_EXTRA_GLOBAL_PARAMS({ + }); + SET_NEEDS_AUTO_REFRACTORY(false); +}; +IMPLEMENT_MODEL(neurongroup_1NEURON); + +// +// define the synapse model classes +class synapsesWEIGHTUPDATE : public WeightUpdateModels::Base +{ +public: + DECLARE_MODEL(synapsesWEIGHTUPDATE, 1, 1); + + SET_SIM_CODE("$(addToInSyn,($(scale) * $(weight)));"); + SET_LEARN_POST_CODE(""); + SET_SYNAPSE_DYNAMICS_CODE(""); + + SET_SIM_SUPPORT_CODE("\n\ +template < typename T1, typename T2 > struct _higher_type;\n\ +template < > struct _higher_type { typedef int type; };\n\ +template < > struct _higher_type { typedef long type; };\n\ +template < > struct _higher_type { typedef float type; };\n\ +template < > struct _higher_type { typedef double type; };\n\ +template < > struct _higher_type { typedef long type; };\n\ +template < > struct _higher_type { typedef long type; };\n\ +template < > struct _higher_type { typedef float type; };\n\ +template < > struct _higher_type { typedef double type; };\n\ +template < > struct _higher_type { typedef float type; };\n\ +template < > struct _higher_type { typedef float type; };\n\ +template < > struct _higher_type { typedef float type; };\n\ +template < > struct _higher_type { typedef double type; };\n\ +template < > struct _higher_type { typedef double type; };\n\ +template < > struct _higher_type { typedef double type; };\n\ +template < > struct _higher_type { typedef double type; };\n\ +template < > struct _higher_type { typedef double type; };\n\ +template < typename T1, typename T2 >\n\ +SUPPORT_CODE_FUNC typename _higher_type::type \n\ +_brian_mod(T1 x, T2 y)\n\ +{{\n\ + return x-y*floor(1.0*x/y);\n\ +}}\n\ +template < typename T1, typename T2 >\n\ +SUPPORT_CODE_FUNC typename _higher_type::type\n\ 
+_brian_floordiv(T1 x, T2 y)\n\ +{{\n\ + return floor(1.0*x/y);\n\ +}}\n\ +#ifdef _MSC_VER\n\ +#define _brian_pow(x, y) (pow((double)(x), (y)))\n\ +#else\n\ +#define _brian_pow(x, y) (pow((x), (y)))\n\ +#endif\n\ +\n\ +\n\ +\n\ +"); + SET_LEARN_POST_SUPPORT_CODE(""); + SET_SYNAPSE_DYNAMICS_SUPPORT_CODE(""); + + SET_PARAM_NAMES({ + "scale" }); + + SET_VARS({ + {"weight", "double"} }); + + SET_EXTRA_GLOBAL_PARAMS({ + }); + + //SET_NEEDS_PRE_SPIKE_TIME(true); + //SET_NEEDS_POST_SPIKE_TIME(true); + +}; + +IMPLEMENT_MODEL(synapsesWEIGHTUPDATE); + +class synapsesPOSTSYN : public PostsynapticModels::Base +{ +public: + DECLARE_MODEL(synapsesPOSTSYN, 0, 0); + + SET_APPLY_INPUT_CODE("$(Isyn) += 0; $(g_PN_iKC) += $(inSyn); $(inSyn)= 0;"); +}; +IMPLEMENT_MODEL(synapsesPOSTSYN); +class synapses_2WEIGHTUPDATE : public WeightUpdateModels::Base +{ +public: + DECLARE_MODEL(synapses_2WEIGHTUPDATE, 2, 0); + + SET_SIM_CODE("$(addToInSyn,($(scale) * $(w_eKC_eKC)));"); + SET_LEARN_POST_CODE(""); + SET_SYNAPSE_DYNAMICS_CODE(""); + + SET_SIM_SUPPORT_CODE("\n\ +template < typename T1, typename T2 > struct _higher_type;\n\ +template < > struct _higher_type { typedef int type; };\n\ +template < > struct _higher_type { typedef long type; };\n\ +template < > struct _higher_type { typedef float type; };\n\ +template < > struct _higher_type { typedef double type; };\n\ +template < > struct _higher_type { typedef long type; };\n\ +template < > struct _higher_type { typedef long type; };\n\ +template < > struct _higher_type { typedef float type; };\n\ +template < > struct _higher_type { typedef double type; };\n\ +template < > struct _higher_type { typedef float type; };\n\ +template < > struct _higher_type { typedef float type; };\n\ +template < > struct _higher_type { typedef float type; };\n\ +template < > struct _higher_type { typedef double type; };\n\ +template < > struct _higher_type { typedef double type; };\n\ +template < > struct _higher_type { typedef double type; };\n\ +template < > struct _higher_type { typedef double type; };\n\ +template < > struct _higher_type { typedef double type; };\n\ +template < typename T1, typename T2 >\n\ +SUPPORT_CODE_FUNC typename _higher_type::type \n\ +_brian_mod(T1 x, T2 y)\n\ +{{\n\ + return x-y*floor(1.0*x/y);\n\ +}}\n\ +template < typename T1, typename T2 >\n\ +SUPPORT_CODE_FUNC typename _higher_type::type\n\ +_brian_floordiv(T1 x, T2 y)\n\ +{{\n\ + return floor(1.0*x/y);\n\ +}}\n\ +#ifdef _MSC_VER\n\ +#define _brian_pow(x, y) (pow((double)(x), (y)))\n\ +#else\n\ +#define _brian_pow(x, y) (pow((x), (y)))\n\ +#endif\n\ +\n\ +\n\ +\n\ +"); + SET_LEARN_POST_SUPPORT_CODE(""); + SET_SYNAPSE_DYNAMICS_SUPPORT_CODE(""); + + SET_PARAM_NAMES({ + "w_eKC_eKC", "scale" }); + + SET_VARS({ + }); + + SET_EXTRA_GLOBAL_PARAMS({ + }); + + //SET_NEEDS_PRE_SPIKE_TIME(true); + //SET_NEEDS_POST_SPIKE_TIME(true); + +}; + +IMPLEMENT_MODEL(synapses_2WEIGHTUPDATE); + +class synapses_2POSTSYN : public PostsynapticModels::Base +{ +public: + DECLARE_MODEL(synapses_2POSTSYN, 0, 0); + + SET_APPLY_INPUT_CODE("$(Isyn) += 0; $(g_eKC_eKC) += $(inSyn); $(inSyn)= 0;"); +}; +IMPLEMENT_MODEL(synapses_2POSTSYN); +class synapses_1WEIGHTUPDATE : public WeightUpdateModels::Base +{ +public: + DECLARE_MODEL(synapses_1WEIGHTUPDATE, 6, 4); + + SET_SIM_CODE("double _Apost = $(Apost) * exp(1.0f*(- (t - $(lastupdate)))/$(tau_post));\n\ +double _Apre = $(Apre) * exp(1.0f*(- (t - $(lastupdate)))/$(tau_pre));\n\ +$(Apost) = _Apost;\n\ +$(Apre) = _Apre;\n\ +$(addToInSyn,$(g_raw));\n\ +$(Apre) += $(dApre);\n\ +$(g_raw) = 
_clip($(g_raw) + $(Apost), 0 * $(siemens), $(g_max));\n\ +$(lastupdate) = t;"); + SET_LEARN_POST_CODE("double _Apost = $(Apost) * exp(1.0f*(- (t - $(lastupdate)))/$(tau_post));\n\ +double _Apre = $(Apre) * exp(1.0f*(- (t - $(lastupdate)))/$(tau_pre));\n\ +$(Apost) = _Apost;\n\ +$(Apre) = _Apre;\n\ +$(Apost) += $(dApost);\n\ +$(g_raw) = _clip($(g_raw) + $(Apre), 0 * $(siemens), $(g_max));\n\ +$(lastupdate) = t;"); + SET_SYNAPSE_DYNAMICS_CODE(""); + + SET_SIM_SUPPORT_CODE("\n\ +SUPPORT_CODE_FUNC double _clip(const float value, const float a_min, const float a_max)\n\ +{\n\ + if (value < a_min)\n\ + return a_min;\n\ + if (value > a_max)\n\ + return a_max;\n\ + return value;\n\ +}\n\ +template < typename T1, typename T2 > struct _higher_type;\n\ +template < > struct _higher_type { typedef int type; };\n\ +template < > struct _higher_type { typedef long type; };\n\ +template < > struct _higher_type { typedef float type; };\n\ +template < > struct _higher_type { typedef double type; };\n\ +template < > struct _higher_type { typedef long type; };\n\ +template < > struct _higher_type { typedef long type; };\n\ +template < > struct _higher_type { typedef float type; };\n\ +template < > struct _higher_type { typedef double type; };\n\ +template < > struct _higher_type { typedef float type; };\n\ +template < > struct _higher_type { typedef float type; };\n\ +template < > struct _higher_type { typedef float type; };\n\ +template < > struct _higher_type { typedef double type; };\n\ +template < > struct _higher_type { typedef double type; };\n\ +template < > struct _higher_type { typedef double type; };\n\ +template < > struct _higher_type { typedef double type; };\n\ +template < > struct _higher_type { typedef double type; };\n\ +template < typename T1, typename T2 >\n\ +SUPPORT_CODE_FUNC typename _higher_type::type \n\ +_brian_mod(T1 x, T2 y)\n\ +{{\n\ + return x-y*floor(1.0*x/y);\n\ +}}\n\ +template < typename T1, typename T2 >\n\ +SUPPORT_CODE_FUNC typename _higher_type::type\n\ +_brian_floordiv(T1 x, T2 y)\n\ +{{\n\ + return floor(1.0*x/y);\n\ +}}\n\ +#ifdef _MSC_VER\n\ +#define _brian_pow(x, y) (pow((double)(x), (y)))\n\ +#else\n\ +#define _brian_pow(x, y) (pow((x), (y)))\n\ +#endif\n\ +\n\ +\n\ +\n\ +"); + SET_LEARN_POST_SUPPORT_CODE("\n\ +SUPPORT_CODE_FUNC double _clip(const float value, const float a_min, const float a_max)\n\ +{\n\ + if (value < a_min)\n\ + return a_min;\n\ + if (value > a_max)\n\ + return a_max;\n\ + return value;\n\ +}\n\ +template < typename T1, typename T2 > struct _higher_type;\n\ +template < > struct _higher_type { typedef int type; };\n\ +template < > struct _higher_type { typedef long type; };\n\ +template < > struct _higher_type { typedef float type; };\n\ +template < > struct _higher_type { typedef double type; };\n\ +template < > struct _higher_type { typedef long type; };\n\ +template < > struct _higher_type { typedef long type; };\n\ +template < > struct _higher_type { typedef float type; };\n\ +template < > struct _higher_type { typedef double type; };\n\ +template < > struct _higher_type { typedef float type; };\n\ +template < > struct _higher_type { typedef float type; };\n\ +template < > struct _higher_type { typedef float type; };\n\ +template < > struct _higher_type { typedef double type; };\n\ +template < > struct _higher_type { typedef double type; };\n\ +template < > struct _higher_type { typedef double type; };\n\ +template < > struct _higher_type { typedef double type; };\n\ +template < > struct _higher_type { typedef double type; };\n\ +template < 
typename T1, typename T2 >\n\ +SUPPORT_CODE_FUNC typename _higher_type::type \n\ +_brian_mod(T1 x, T2 y)\n\ +{{\n\ + return x-y*floor(1.0*x/y);\n\ +}}\n\ +template < typename T1, typename T2 >\n\ +SUPPORT_CODE_FUNC typename _higher_type::type\n\ +_brian_floordiv(T1 x, T2 y)\n\ +{{\n\ + return floor(1.0*x/y);\n\ +}}\n\ +#ifdef _MSC_VER\n\ +#define _brian_pow(x, y) (pow((double)(x), (y)))\n\ +#else\n\ +#define _brian_pow(x, y) (pow((x), (y)))\n\ +#endif\n\ +\n\ +\n\ +\n\ +"); + SET_SYNAPSE_DYNAMICS_SUPPORT_CODE(""); + + SET_PARAM_NAMES({ + "tau_pre", "dApre", "g_max", "siemens", "tau_post", "dApost" }); + + SET_VARS({ + {"lastupdate", "double"}, {"Apost", "double"}, {"g_raw", "double"}, {"Apre", "double"} }); + + SET_EXTRA_GLOBAL_PARAMS({ + }); + + //SET_NEEDS_PRE_SPIKE_TIME(true); + //SET_NEEDS_POST_SPIKE_TIME(true); + +}; + +IMPLEMENT_MODEL(synapses_1WEIGHTUPDATE); + +class synapses_1POSTSYN : public PostsynapticModels::Base +{ +public: + DECLARE_MODEL(synapses_1POSTSYN, 0, 0); + + SET_APPLY_INPUT_CODE("$(Isyn) += 0; $(g_iKC_eKC) += $(inSyn); $(inSyn)= 0;"); +}; +IMPLEMENT_MODEL(synapses_1POSTSYN); + +// parameter values +// neurons +neurongroupNEURON::ParamValues neurongroup_p +( + - 0.063, - 0.095, 0.0, 0.001, 7.15e-06, 2.67e-08, - 0.06356, 0.002, 3e-10, 0.05, 1.4299999999999999e-06, 0.001); +neurongroup_1NEURON::ParamValues neurongroup_1_p +( + - 0.092, - 0.063, - 0.095, 0.0, 0.001, 7.15e-06, 0.005, 2.67e-08, - 0.06356, 3e-10, 0.05, 0.01, 1.4299999999999999e-06, 0.001); + +// synapses +synapsesWEIGHTUPDATE::ParamValues synapses_p +( + 0.675); +synapses_2WEIGHTUPDATE::ParamValues synapses_2_p +( + 7.500000000000001e-08, 0.675); +synapses_1WEIGHTUPDATE::ParamValues synapses_1_p +( + 0.01, 1.0000000000000002e-10, 3.7500000000000005e-09, 1.0, 0.01, - 1.0000000000000002e-10); + +// initial variables (neurons) +neurongroupNEURON::VarValues neurongroup_ini +( + uninitialisedVar(), uninitialisedVar(), uninitialisedVar(), uninitialisedVar(), uninitialisedVar(), uninitialisedVar(), uninitialisedVar(), uninitialisedVar()); +neurongroup_1NEURON::VarValues neurongroup_1_ini +( + uninitialisedVar(), uninitialisedVar(), uninitialisedVar(), uninitialisedVar(), uninitialisedVar(), uninitialisedVar(), uninitialisedVar(), uninitialisedVar(), uninitialisedVar()); + +// initial variables (synapses) +// one additional initial variable for hidden_weightmatrix +synapsesWEIGHTUPDATE::VarValues synapses_ini +( + uninitialisedVar()); +synapses_2WEIGHTUPDATE::VarValues synapses_2_ini +; +synapses_1WEIGHTUPDATE::VarValues synapses_1_ini +( + uninitialisedVar(), uninitialisedVar(), uninitialisedVar(), uninitialisedVar()); + + +void modelDefinition(NNmodel &model) +{ + _init_arrays(); + _load_arrays(); + + + rk_randomseed(brian::_mersenne_twister_states[0]); + + + { + using namespace brian; + + _array_defaultclock_dt[0] = 0.0001; + _array_defaultclock_dt[0] = 0.0001; + _array_defaultclock_dt[0] = 0.0001; + _dynamic_array_spikegeneratorgroup_spike_number.resize(19676); + + for(int i=0; i<_dynamic_array_spikegeneratorgroup_spike_number.size(); i++) + { + _dynamic_array_spikegeneratorgroup_spike_number[i] = _static_array__dynamic_array_spikegeneratorgroup_spike_number[i]; + } + + _dynamic_array_spikegeneratorgroup_neuron_index.resize(19676); + + for(int i=0; i<_dynamic_array_spikegeneratorgroup_neuron_index.size(); i++) + { + _dynamic_array_spikegeneratorgroup_neuron_index[i] = _static_array__dynamic_array_spikegeneratorgroup_neuron_index[i]; + } + + _dynamic_array_spikegeneratorgroup_spike_time.resize(19676); + + 
for(int i=0; i<_dynamic_array_spikegeneratorgroup_spike_time.size(); i++) + { + _dynamic_array_spikegeneratorgroup_spike_time[i] = _static_array__dynamic_array_spikegeneratorgroup_spike_time[i]; + } + + _dynamic_array_spikegeneratorgroup__timebins.resize(19676); + _array_spikegeneratorgroup__lastindex[0] = 0; + _array_spikegeneratorgroup_period[0] = 0.0; + + for(int i=0; i<_num__array_neurongroup_lastspike; i++) + { + _array_neurongroup_lastspike[i] = - 10000.0; + } + + + for(int i=0; i<_num__array_neurongroup_not_refractory; i++) + { + _array_neurongroup_not_refractory[i] = true; + } + + + for(int i=0; i<_num__array_neurongroup_1_lastspike; i++) + { + _array_neurongroup_1_lastspike[i] = - 10000.0; + } + + + for(int i=0; i<_num__array_neurongroup_1_not_refractory; i++) + { + _array_neurongroup_1_not_refractory[i] = true; + } + + _dynamic_array_synapses_1_delay.resize(1); + _dynamic_array_synapses_1_delay.resize(1); + _dynamic_array_synapses_1_delay[0] = 0.0; + _dynamic_array_synapses_2_delay.resize(1); + _dynamic_array_synapses_2_delay.resize(1); + _dynamic_array_synapses_2_delay[0] = 0.0; + _run_synapses_group_variable_set_conditional_codeobject(); + _run_synapses_1_group_variable_set_conditional_codeobject(); + _run_synapses_1_group_variable_set_conditional_codeobject_1(); + + for(int i=0; i<_num__array_neurongroup_V; i++) + { + _array_neurongroup_V[i] = - 0.06356; + } + + + for(int i=0; i<_num__array_neurongroup_h; i++) + { + _array_neurongroup_h[i] = 1; + } + + + for(int i=0; i<_num__array_neurongroup_m; i++) + { + _array_neurongroup_m[i] = 0; + } + + + for(int i=0; i<_num__array_neurongroup_n; i++) + { + _array_neurongroup_n[i] = 0.5; + } + + + for(int i=0; i<_num__array_neurongroup_1_V; i++) + { + _array_neurongroup_1_V[i] = - 0.06356; + } + + + for(int i=0; i<_num__array_neurongroup_1_h; i++) + { + _array_neurongroup_1_h[i] = 1; + } + + + for(int i=0; i<_num__array_neurongroup_1_m; i++) + { + _array_neurongroup_1_m[i] = 0; + } + + + for(int i=0; i<_num__array_neurongroup_1_n; i++) + { + _array_neurongroup_1_n[i] = 0.5; + } + + _array_defaultclock_timestep[0] = 0; + _array_defaultclock_t[0] = 0.0; + _array_spikegeneratorgroup__lastindex[0] = 0; + + for(int i=0; i<_dynamic_array_spikegeneratorgroup__timebins.size(); i++) + { + _dynamic_array_spikegeneratorgroup__timebins[i] = _static_array__dynamic_array_spikegeneratorgroup__timebins[i]; + } + + _array_spikegeneratorgroup__period_bins[0] = 0.0; + + } + + _run_synapses_max_row_length(); + _run_synapses_1_max_row_length(); + _run_synapses_2_max_row_length(); + + const long maxRowsynapses= std::max(*std::max_element(brian::_dynamic_array_synapses_N_outgoing.begin(),brian::_dynamic_array_synapses_N_outgoing.end()),1); + const long maxColsynapses= std::max(*std::max_element(brian::_dynamic_array_synapses_N_incoming.begin(),brian::_dynamic_array_synapses_N_incoming.end()),1); + const long maxRowsynapses_1= std::max(*std::max_element(brian::_dynamic_array_synapses_1_N_outgoing.begin(),brian::_dynamic_array_synapses_1_N_outgoing.end()),1); + const long maxColsynapses_1= std::max(*std::max_element(brian::_dynamic_array_synapses_1_N_incoming.begin(),brian::_dynamic_array_synapses_1_N_incoming.end()),1); + const long maxRowsynapses_2= std::max(*std::max_element(brian::_dynamic_array_synapses_2_N_outgoing.begin(),brian::_dynamic_array_synapses_2_N_outgoing.end()),1); + const long maxColsynapses_2= std::max(*std::max_element(brian::_dynamic_array_synapses_2_N_incoming.begin(),brian::_dynamic_array_synapses_2_N_incoming.end()),1); + + // 
GENN_PREFERENCES set in brian2genn + GENN_PREFERENCES.deviceSelectMethod = DeviceSelect::OPTIMAL; + GENN_PREFERENCES.blockSizeSelectMethod = BlockSizeSelect::OCCUPANCY; + + GENN_PREFERENCES.userNvccFlags = ""; + + model.setDT(0.0001); + + model.setName("magicnetwork_model"); + model.setPrecision(GENN_DOUBLE); + model.addNeuronPopulation("neurongroup", 2500, neurongroup_p, neurongroup_ini); + model.addNeuronPopulation("neurongroup_1", 100, neurongroup_1_p, neurongroup_1_ini); + model.addNeuronPopulation("spikegeneratorgroup", 100, {}, {}); + { + // TODO: Consider flexible use of DENSE and SPARSE (but beware of difficulty of judging which to use at compile time) + const unsigned int delaySteps = NO_DELAY; + auto *syn = model.addSynapsePopulation( + "synapses", SynapseMatrixType::SPARSE_INDIVIDUALG, delaySteps, + "spikegeneratorgroup", "neurongroup", + synapses_p, synapses_ini, + {}, {}); + syn->setSpanType(SynapseGroup::SpanType::POSTSYNAPTIC); + syn->setMaxConnections(maxRowsynapses); + syn->setMaxSourceConnections(maxColsynapses); + } + { + // TODO: Consider flexible use of DENSE and SPARSE (but beware of difficulty of judging which to use at compile time) + const unsigned int delaySteps = NO_DELAY; + auto *syn = model.addSynapsePopulation( + "synapses_2", SynapseMatrixType::SPARSE_INDIVIDUALG, delaySteps, + "neurongroup_1", "neurongroup_1", + synapses_2_p, synapses_2_ini, + {}, {}); + syn->setSpanType(SynapseGroup::SpanType::POSTSYNAPTIC); + syn->setMaxConnections(maxRowsynapses_2); + syn->setMaxSourceConnections(maxColsynapses_2); + } + { + // TODO: Consider flexible use of DENSE and SPARSE (but beware of difficulty of judging which to use at compile time) + const unsigned int delaySteps = NO_DELAY; + auto *syn = model.addSynapsePopulation( + "synapses_1", SynapseMatrixType::SPARSE_INDIVIDUALG, delaySteps, + "neurongroup", "neurongroup_1", + synapses_1_p, synapses_1_ini, + {}, {}); + syn->setSpanType(SynapseGroup::SpanType::POSTSYNAPTIC); + syn->setMaxConnections(maxRowsynapses_1); + syn->setMaxSourceConnections(maxColsynapses_1); + } +} diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/Makefile b/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/Makefile new file mode 100644 index 00000000..87104bb6 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/Makefile @@ -0,0 +1,27 @@ +OBJECTS := customUpdate.o neuronUpdate.o synapseUpdate.o init.o runner.o +CUDA_PATH ?=/usr/local/cuda +NVCC := $(CUDA_PATH)/bin/nvcc +NVCCFLAGS := -x cu -arch sm_61 -std=c++11 --compiler-options "-fPIC -Wno-return-type-c-linkage" -Xcudafe "--diag_suppress=extern_entity_treated_as_static" +LINKFLAGS := --shared -arch sm_61 + +DEPS := $(OBJECTS:.o=.d) + +.PHONY: all clean + +all: librunner.so + +librunner.so: $(OBJECTS) + @$(NVCC) $(LINKFLAGS) -o $@ $(OBJECTS) + +-include $(DEPS) + +%.d: %.cc + @$(NVCC) -M $(NVCCFLAGS) $< 1> $@ + +%.o: %.cc %.d + @$(NVCC) -dc $(NVCCFLAGS) $< + +%.d: ; + +clean: + @rm -f $(OBJECTS) $(DEPS) librunner.so diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/customUpdate.cc b/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/customUpdate.cc new file mode 100644 index 00000000..ce6414ec --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/customUpdate.cc @@ -0,0 +1,2 @@ +#include "definitionsInternal.h" + diff --git 
a/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/customUpdate.d b/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/customUpdate.d new file mode 100644 index 00000000..0b0c1b8a --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/customUpdate.d @@ -0,0 +1,263 @@ +customUpdate.o : customUpdate.cc \ + /usr/include/stdc-predef.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/cuda_runtime.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/host_config.h \ + /usr/include/features.h \ + /usr/include/x86_64-linux-gnu/sys/cdefs.h \ + /usr/include/x86_64-linux-gnu/bits/wordsize.h \ + /usr/include/x86_64-linux-gnu/bits/long-double.h \ + /usr/include/x86_64-linux-gnu/gnu/stubs.h \ + /usr/include/x86_64-linux-gnu/gnu/stubs-64.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/builtin_types.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/device_types.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/host_defines.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/driver_types.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/vector_types.h \ + /usr/lib/gcc/x86_64-linux-gnu/9/include/limits.h \ + /usr/lib/gcc/x86_64-linux-gnu/9/include/syslimits.h \ + /usr/include/limits.h \ + /usr/include/x86_64-linux-gnu/bits/libc-header-start.h \ + /usr/include/x86_64-linux-gnu/bits/posix1_lim.h \ + /usr/include/x86_64-linux-gnu/bits/local_lim.h \ + /usr/include/linux/limits.h \ + /usr/include/x86_64-linux-gnu/bits/posix2_lim.h \ + /usr/include/x86_64-linux-gnu/bits/xopen_lim.h \ + /usr/include/x86_64-linux-gnu/bits/uio_lim.h \ + /usr/lib/gcc/x86_64-linux-gnu/9/include/stddef.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/surface_types.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/texture_types.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/library_types.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/channel_descriptor.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/cuda_runtime_api.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/cuda_device_runtime_api.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/driver_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/vector_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/vector_functions.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/common_functions.h \ + /usr/include/string.h \ + /usr/include/x86_64-linux-gnu/bits/types/locale_t.h \ + /usr/include/x86_64-linux-gnu/bits/types/__locale_t.h \ + /usr/include/strings.h \ + /usr/include/time.h \ + /usr/include/x86_64-linux-gnu/bits/time.h \ + /usr/include/x86_64-linux-gnu/bits/types.h \ + /usr/include/x86_64-linux-gnu/bits/timesize.h \ + /usr/include/x86_64-linux-gnu/bits/typesizes.h \ + /usr/include/x86_64-linux-gnu/bits/time64.h \ + /usr/include/x86_64-linux-gnu/bits/timex.h \ + /usr/include/x86_64-linux-gnu/bits/types/struct_timeval.h \ + /usr/include/x86_64-linux-gnu/bits/types/clock_t.h \ + /usr/include/x86_64-linux-gnu/bits/types/time_t.h \ + 
/usr/include/x86_64-linux-gnu/bits/types/struct_tm.h \ + /usr/include/x86_64-linux-gnu/bits/types/struct_timespec.h \ + /usr/include/x86_64-linux-gnu/bits/endian.h \ + /usr/include/x86_64-linux-gnu/bits/endianness.h \ + /usr/include/x86_64-linux-gnu/bits/types/clockid_t.h \ + /usr/include/x86_64-linux-gnu/bits/types/timer_t.h \ + /usr/include/x86_64-linux-gnu/bits/types/struct_itimerspec.h \ + /usr/include/c++/9/new \ + /usr/include/x86_64-linux-gnu/c++/9/bits/c++config.h \ + /usr/include/x86_64-linux-gnu/c++/9/bits/os_defines.h \ + /usr/include/x86_64-linux-gnu/c++/9/bits/cpu_defines.h \ + /usr/include/c++/9/exception \ + /usr/include/c++/9/bits/exception.h \ + /usr/include/c++/9/bits/exception_ptr.h \ + /usr/include/c++/9/bits/exception_defines.h \ + /usr/include/c++/9/bits/cxxabi_init_exception.h \ + /usr/include/c++/9/typeinfo \ + /usr/include/c++/9/bits/hash_bytes.h \ + /usr/include/c++/9/bits/nested_exception.h \ + /usr/include/c++/9/bits/move.h \ + /usr/include/c++/9/bits/concept_check.h \ + /usr/include/c++/9/type_traits \ + /usr/include/stdio.h \ + /usr/lib/gcc/x86_64-linux-gnu/9/include/stdarg.h \ + /usr/include/x86_64-linux-gnu/bits/types/__fpos_t.h \ + /usr/include/x86_64-linux-gnu/bits/types/__mbstate_t.h \ + /usr/include/x86_64-linux-gnu/bits/types/__fpos64_t.h \ + /usr/include/x86_64-linux-gnu/bits/types/__FILE.h \ + /usr/include/x86_64-linux-gnu/bits/types/FILE.h \ + /usr/include/x86_64-linux-gnu/bits/types/struct_FILE.h \ + /usr/include/x86_64-linux-gnu/bits/types/cookie_io_functions_t.h \ + /usr/include/x86_64-linux-gnu/bits/stdio_lim.h \ + /usr/include/x86_64-linux-gnu/bits/sys_errlist.h \ + /usr/include/c++/9/stdlib.h \ + /usr/include/c++/9/cstdlib \ + /usr/include/stdlib.h \ + /usr/include/x86_64-linux-gnu/bits/waitflags.h \ + /usr/include/x86_64-linux-gnu/bits/waitstatus.h \ + /usr/include/x86_64-linux-gnu/bits/floatn.h \ + /usr/include/x86_64-linux-gnu/bits/floatn-common.h \ + /usr/include/x86_64-linux-gnu/sys/types.h \ + /usr/include/x86_64-linux-gnu/bits/stdint-intn.h \ + /usr/include/endian.h \ + /usr/include/x86_64-linux-gnu/bits/byteswap.h \ + /usr/include/x86_64-linux-gnu/bits/uintn-identity.h \ + /usr/include/x86_64-linux-gnu/sys/select.h \ + /usr/include/x86_64-linux-gnu/bits/select.h \ + /usr/include/x86_64-linux-gnu/bits/types/sigset_t.h \ + /usr/include/x86_64-linux-gnu/bits/types/__sigset_t.h \ + /usr/include/x86_64-linux-gnu/bits/pthreadtypes.h \ + /usr/include/x86_64-linux-gnu/bits/thread-shared-types.h \ + /usr/include/x86_64-linux-gnu/bits/pthreadtypes-arch.h \ + /usr/include/x86_64-linux-gnu/bits/struct_mutex.h \ + /usr/include/x86_64-linux-gnu/bits/struct_rwlock.h \ + /usr/include/alloca.h \ + /usr/include/x86_64-linux-gnu/bits/stdlib-float.h \ + /usr/include/c++/9/bits/std_abs.h \ + /usr/include/assert.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/math_functions.h \ + /usr/include/c++/9/math.h \ + /usr/include/c++/9/cmath \ + /usr/include/c++/9/bits/cpp_type_traits.h \ + /usr/include/c++/9/ext/type_traits.h \ + /usr/include/math.h \ + /usr/include/x86_64-linux-gnu/bits/math-vector.h \ + /usr/include/x86_64-linux-gnu/bits/libm-simd-decl-stubs.h \ + /usr/include/x86_64-linux-gnu/bits/flt-eval-method.h \ + /usr/include/x86_64-linux-gnu/bits/fp-logb.h \ + /usr/include/x86_64-linux-gnu/bits/fp-fast.h \ + /usr/include/x86_64-linux-gnu/bits/mathcalls-helper-functions.h \ + /usr/include/x86_64-linux-gnu/bits/mathcalls.h \ + /usr/include/x86_64-linux-gnu/bits/mathcalls-narrow.h \ + 
/usr/include/x86_64-linux-gnu/bits/iscanonical.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/math_functions.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/cuda_surface_types.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/cuda_texture_types.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/device_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/device_functions.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/device_atomic_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/device_atomic_functions.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/device_double_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/device_double_functions.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_20_atomic_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_20_atomic_functions.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_32_atomic_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_32_atomic_functions.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_35_atomic_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_60_atomic_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_60_atomic_functions.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_20_intrinsics.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_20_intrinsics.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_30_intrinsics.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_30_intrinsics.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_32_intrinsics.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_32_intrinsics.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_35_intrinsics.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_61_intrinsics.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_61_intrinsics.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/sm_70_rt.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/sm_70_rt.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/sm_80_rt.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/sm_80_rt.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/surface_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/texture_fetch_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/texture_indirect_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/surface_indirect_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/device_launch_parameters.h \ + definitionsInternal.h \ + definitions.h \ + /usr/include/c++/9/random \ + /usr/include/c++/9/string 
\ + /usr/include/c++/9/bits/stringfwd.h \ + /usr/include/c++/9/bits/memoryfwd.h \ + /usr/include/c++/9/bits/char_traits.h \ + /usr/include/c++/9/bits/stl_algobase.h \ + /usr/include/c++/9/bits/functexcept.h \ + /usr/include/c++/9/ext/numeric_traits.h \ + /usr/include/c++/9/bits/stl_pair.h \ + /usr/include/c++/9/bits/stl_iterator_base_types.h \ + /usr/include/c++/9/bits/stl_iterator_base_funcs.h \ + /usr/include/c++/9/debug/assertions.h \ + /usr/include/c++/9/bits/stl_iterator.h \ + /usr/include/c++/9/bits/ptr_traits.h \ + /usr/include/c++/9/debug/debug.h \ + /usr/include/c++/9/bits/predefined_ops.h \ + /usr/include/c++/9/bits/postypes.h \ + /usr/include/c++/9/cwchar \ + /usr/include/wchar.h \ + /usr/include/x86_64-linux-gnu/bits/wchar.h \ + /usr/include/x86_64-linux-gnu/bits/types/wint_t.h \ + /usr/include/x86_64-linux-gnu/bits/types/mbstate_t.h \ + /usr/include/c++/9/cstdint \ + /usr/lib/gcc/x86_64-linux-gnu/9/include/stdint.h \ + /usr/include/stdint.h \ + /usr/include/x86_64-linux-gnu/bits/stdint-uintn.h \ + /usr/include/c++/9/bits/allocator.h \ + /usr/include/x86_64-linux-gnu/c++/9/bits/c++allocator.h \ + /usr/include/c++/9/ext/new_allocator.h \ + /usr/include/c++/9/bits/localefwd.h \ + /usr/include/x86_64-linux-gnu/c++/9/bits/c++locale.h \ + /usr/include/c++/9/clocale \ + /usr/include/locale.h \ + /usr/include/x86_64-linux-gnu/bits/locale.h \ + /usr/include/c++/9/iosfwd \ + /usr/include/c++/9/cctype \ + /usr/include/ctype.h \ + /usr/include/c++/9/bits/ostream_insert.h \ + /usr/include/c++/9/bits/cxxabi_forced.h \ + /usr/include/c++/9/bits/stl_function.h \ + /usr/include/c++/9/backward/binders.h \ + /usr/include/c++/9/bits/range_access.h \ + /usr/include/c++/9/initializer_list \ + /usr/include/c++/9/bits/basic_string.h \ + /usr/include/c++/9/ext/atomicity.h \ + /usr/include/x86_64-linux-gnu/c++/9/bits/gthr.h \ + /usr/include/x86_64-linux-gnu/c++/9/bits/gthr-default.h \ + /usr/include/pthread.h \ + /usr/include/sched.h \ + /usr/include/x86_64-linux-gnu/bits/sched.h \ + /usr/include/x86_64-linux-gnu/bits/types/struct_sched_param.h \ + /usr/include/x86_64-linux-gnu/bits/cpu-set.h \ + /usr/include/x86_64-linux-gnu/bits/setjmp.h \ + /usr/include/x86_64-linux-gnu/c++/9/bits/atomic_word.h \ + /usr/include/c++/9/ext/alloc_traits.h \ + /usr/include/c++/9/bits/alloc_traits.h \ + /usr/include/c++/9/ext/string_conversions.h \ + /usr/include/c++/9/cstdio \ + /usr/include/c++/9/cerrno \ + /usr/include/errno.h \ + /usr/include/x86_64-linux-gnu/bits/errno.h \ + /usr/include/linux/errno.h \ + /usr/include/x86_64-linux-gnu/asm/errno.h \ + /usr/include/asm-generic/errno.h \ + /usr/include/asm-generic/errno-base.h \ + /usr/include/x86_64-linux-gnu/bits/types/error_t.h \ + /usr/include/c++/9/bits/functional_hash.h \ + /usr/include/c++/9/bits/basic_string.tcc \ + /usr/include/c++/9/limits \ + /usr/include/c++/9/bits/random.h \ + /usr/include/c++/9/vector \ + /usr/include/c++/9/bits/stl_construct.h \ + /usr/include/c++/9/bits/stl_uninitialized.h \ + /usr/include/c++/9/bits/stl_vector.h \ + /usr/include/c++/9/bits/stl_bvector.h \ + /usr/include/c++/9/bits/vector.tcc \ + /usr/include/c++/9/bits/uniform_int_dist.h \ + /usr/include/x86_64-linux-gnu/c++/9/bits/opt_random.h \ + /usr/include/c++/9/bits/random.tcc \ + /usr/include/c++/9/numeric \ + /usr/include/c++/9/bits/stl_numeric.h \ + /usr/include/c++/9/stdexcept \ + /usr/include/c++/9/cassert \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_kernel.h \ + 
/cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_discrete.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_precalc.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_mrg32k3a.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_mtgp32_kernel.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/cuda.h \ + /usr/include/memory.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_mtgp32.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_philox4x32_x.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_globals.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_uniform.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_normal.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_normal_static.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_lognormal.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_poisson.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_discrete2.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/cuda_fp16.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/cuda_fp16.hpp \ + /usr/include/c++/9/utility \ + /usr/include/c++/9/bits/stl_relops.h diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/customUpdateCUDA0.sha b/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/customUpdateCUDA0.sha new file mode 100644 index 00000000..7b055d6d --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/customUpdateCUDA0.sha @@ -0,0 +1,3 @@ +96970b38 7ae5b6f9 20e58f9c e0a15161 593bf28b +0 0 +0 0 diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/customUpdateCUDA1.sha b/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/customUpdateCUDA1.sha new file mode 100644 index 00000000..7b055d6d --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/customUpdateCUDA1.sha @@ -0,0 +1,3 @@ +96970b38 7ae5b6f9 20e58f9c e0a15161 593bf28b +0 0 +0 0 diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/definitions.h b/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/definitions.h new file mode 100644 index 00000000..aa10079d --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/definitions.h @@ -0,0 +1,306 @@ +#pragma once +#define EXPORT_VAR extern +#define EXPORT_FUNC +// Standard C++ includes +#include +#include +#include + +// Standard C includes +#include +#include +#define DT 1.00000000000000005e-04 +typedef double scalar; +#define SCALAR_MIN 2.22507385850720138e-308 +#define SCALAR_MAX 1.79769313486231571e+308 + +#define TIME_MIN 2.22507385850720138e-308 +#define TIME_MAX 1.79769313486231571e+308 + +// ------------------------------------------------------------------------ +// bit tool macros +#define B(x,i) ((x) & 
(0x80000000 >> (i))) //!< Extract the bit at the specified position i from x +#define setB(x,i) x= ((x) | (0x80000000 >> (i))) //!< Set the bit at the specified position i in x to 1 +#define delB(x,i) x= ((x) & (~(0x80000000 >> (i)))) //!< Set the bit at the specified position i in x to 0 + +extern "C" { +// ------------------------------------------------------------------------ +// global variables +// ------------------------------------------------------------------------ +EXPORT_VAR unsigned long long iT; +EXPORT_VAR double t; + +// ------------------------------------------------------------------------ +// timers +// ------------------------------------------------------------------------ +EXPORT_VAR double initTime; +EXPORT_VAR double initSparseTime; +EXPORT_VAR double neuronUpdateTime; +EXPORT_VAR double presynapticUpdateTime; +EXPORT_VAR double postsynapticUpdateTime; +EXPORT_VAR double synapseDynamicsTime; +// ------------------------------------------------------------------------ +// local neuron groups +// ------------------------------------------------------------------------ +#define spikeCount_neurongroup glbSpkCntneurongroup[0] +#define spike_neurongroup glbSpkneurongroup +#define glbSpkShiftneurongroup 0 + +EXPORT_VAR unsigned int* glbSpkCntneurongroup; +EXPORT_VAR unsigned int* d_glbSpkCntneurongroup; +EXPORT_VAR unsigned int* glbSpkneurongroup; +EXPORT_VAR unsigned int* d_glbSpkneurongroup; +EXPORT_VAR int32_t* ineurongroup; +EXPORT_VAR int32_t* d_ineurongroup; +EXPORT_VAR double* Vneurongroup; +EXPORT_VAR double* d_Vneurongroup; +EXPORT_VAR double* g_PN_iKCneurongroup; +EXPORT_VAR double* d_g_PN_iKCneurongroup; +EXPORT_VAR double* hneurongroup; +EXPORT_VAR double* d_hneurongroup; +EXPORT_VAR double* mneurongroup; +EXPORT_VAR double* d_mneurongroup; +EXPORT_VAR double* nneurongroup; +EXPORT_VAR double* d_nneurongroup; +EXPORT_VAR double* lastspikeneurongroup; +EXPORT_VAR double* d_lastspikeneurongroup; +EXPORT_VAR char* not_refractoryneurongroup; +EXPORT_VAR char* d_not_refractoryneurongroup; +#define spikeCount_neurongroup_1 glbSpkCntneurongroup_1[0] +#define spike_neurongroup_1 glbSpkneurongroup_1 +#define glbSpkShiftneurongroup_1 0 + +EXPORT_VAR unsigned int* glbSpkCntneurongroup_1; +EXPORT_VAR unsigned int* d_glbSpkCntneurongroup_1; +EXPORT_VAR unsigned int* glbSpkneurongroup_1; +EXPORT_VAR unsigned int* d_glbSpkneurongroup_1; +EXPORT_VAR int32_t* ineurongroup_1; +EXPORT_VAR int32_t* d_ineurongroup_1; +EXPORT_VAR double* Vneurongroup_1; +EXPORT_VAR double* d_Vneurongroup_1; +EXPORT_VAR double* g_eKC_eKCneurongroup_1; +EXPORT_VAR double* d_g_eKC_eKCneurongroup_1; +EXPORT_VAR double* g_iKC_eKCneurongroup_1; +EXPORT_VAR double* d_g_iKC_eKCneurongroup_1; +EXPORT_VAR double* hneurongroup_1; +EXPORT_VAR double* d_hneurongroup_1; +EXPORT_VAR double* mneurongroup_1; +EXPORT_VAR double* d_mneurongroup_1; +EXPORT_VAR double* nneurongroup_1; +EXPORT_VAR double* d_nneurongroup_1; +EXPORT_VAR double* lastspikeneurongroup_1; +EXPORT_VAR double* d_lastspikeneurongroup_1; +EXPORT_VAR char* not_refractoryneurongroup_1; +EXPORT_VAR char* d_not_refractoryneurongroup_1; +#define spikeCount_spikegeneratorgroup glbSpkCntspikegeneratorgroup[0] +#define spike_spikegeneratorgroup glbSpkspikegeneratorgroup +#define glbSpkShiftspikegeneratorgroup 0 + +EXPORT_VAR unsigned int* glbSpkCntspikegeneratorgroup; +EXPORT_VAR unsigned int* d_glbSpkCntspikegeneratorgroup; +EXPORT_VAR unsigned int* glbSpkspikegeneratorgroup; +EXPORT_VAR unsigned int* d_glbSpkspikegeneratorgroup; + +// 
------------------------------------------------------------------------ +// custom update variables +// ------------------------------------------------------------------------ + +// ------------------------------------------------------------------------ +// postsynaptic variables +// ------------------------------------------------------------------------ +EXPORT_VAR double* inSynsynapses; +EXPORT_VAR double* d_inSynsynapses; +EXPORT_VAR double* inSynsynapses_1; +EXPORT_VAR double* d_inSynsynapses_1; +EXPORT_VAR double* inSynsynapses_2; +EXPORT_VAR double* d_inSynsynapses_2; + +// ------------------------------------------------------------------------ +// synapse connectivity +// ------------------------------------------------------------------------ +EXPORT_VAR const unsigned int maxRowLengthsynapses; +EXPORT_VAR unsigned int* rowLengthsynapses; +EXPORT_VAR unsigned int* d_rowLengthsynapses; +EXPORT_VAR uint32_t* indsynapses; +EXPORT_VAR uint32_t* d_indsynapses; +EXPORT_VAR const unsigned int maxRowLengthsynapses_1; +EXPORT_VAR unsigned int* rowLengthsynapses_1; +EXPORT_VAR unsigned int* d_rowLengthsynapses_1; +EXPORT_VAR uint32_t* indsynapses_1; +EXPORT_VAR uint32_t* d_indsynapses_1; +EXPORT_VAR unsigned int* d_colLengthsynapses_1; +EXPORT_VAR unsigned int* d_remapsynapses_1; +EXPORT_VAR const unsigned int maxRowLengthsynapses_2; +EXPORT_VAR unsigned int* rowLengthsynapses_2; +EXPORT_VAR unsigned int* d_rowLengthsynapses_2; +EXPORT_VAR uint32_t* indsynapses_2; +EXPORT_VAR uint32_t* d_indsynapses_2; + +// ------------------------------------------------------------------------ +// synapse variables +// ------------------------------------------------------------------------ +EXPORT_VAR double* weightsynapses; +EXPORT_VAR double* d_weightsynapses; +EXPORT_VAR double* lastupdatesynapses_1; +EXPORT_VAR double* d_lastupdatesynapses_1; +EXPORT_VAR double* Apostsynapses_1; +EXPORT_VAR double* d_Apostsynapses_1; +EXPORT_VAR double* g_rawsynapses_1; +EXPORT_VAR double* d_g_rawsynapses_1; +EXPORT_VAR double* Apresynapses_1; +EXPORT_VAR double* d_Apresynapses_1; + +EXPORT_FUNC void pushneurongroupSpikesToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void pullneurongroupSpikesFromDevice(); +EXPORT_FUNC void pushneurongroupCurrentSpikesToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void pullneurongroupCurrentSpikesFromDevice(); +EXPORT_FUNC unsigned int* getneurongroupCurrentSpikes(unsigned int batch = 0); +EXPORT_FUNC unsigned int& getneurongroupCurrentSpikeCount(unsigned int batch = 0); +EXPORT_FUNC void pushineurongroupToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void pullineurongroupFromDevice(); +EXPORT_FUNC void pushCurrentineurongroupToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void pullCurrentineurongroupFromDevice(); +EXPORT_FUNC int32_t* getCurrentineurongroup(unsigned int batch = 0); +EXPORT_FUNC void pushVneurongroupToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void pullVneurongroupFromDevice(); +EXPORT_FUNC void pushCurrentVneurongroupToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void pullCurrentVneurongroupFromDevice(); +EXPORT_FUNC double* getCurrentVneurongroup(unsigned int batch = 0); +EXPORT_FUNC void pushg_PN_iKCneurongroupToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void pullg_PN_iKCneurongroupFromDevice(); +EXPORT_FUNC void pushCurrentg_PN_iKCneurongroupToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void pullCurrentg_PN_iKCneurongroupFromDevice(); +EXPORT_FUNC double* 
getCurrentg_PN_iKCneurongroup(unsigned int batch = 0); +EXPORT_FUNC void pushhneurongroupToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void pullhneurongroupFromDevice(); +EXPORT_FUNC void pushCurrenthneurongroupToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void pullCurrenthneurongroupFromDevice(); +EXPORT_FUNC double* getCurrenthneurongroup(unsigned int batch = 0); +EXPORT_FUNC void pushmneurongroupToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void pullmneurongroupFromDevice(); +EXPORT_FUNC void pushCurrentmneurongroupToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void pullCurrentmneurongroupFromDevice(); +EXPORT_FUNC double* getCurrentmneurongroup(unsigned int batch = 0); +EXPORT_FUNC void pushnneurongroupToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void pullnneurongroupFromDevice(); +EXPORT_FUNC void pushCurrentnneurongroupToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void pullCurrentnneurongroupFromDevice(); +EXPORT_FUNC double* getCurrentnneurongroup(unsigned int batch = 0); +EXPORT_FUNC void pushlastspikeneurongroupToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void pulllastspikeneurongroupFromDevice(); +EXPORT_FUNC void pushCurrentlastspikeneurongroupToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void pullCurrentlastspikeneurongroupFromDevice(); +EXPORT_FUNC double* getCurrentlastspikeneurongroup(unsigned int batch = 0); +EXPORT_FUNC void pushnot_refractoryneurongroupToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void pullnot_refractoryneurongroupFromDevice(); +EXPORT_FUNC void pushCurrentnot_refractoryneurongroupToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void pullCurrentnot_refractoryneurongroupFromDevice(); +EXPORT_FUNC char* getCurrentnot_refractoryneurongroup(unsigned int batch = 0); +EXPORT_FUNC void pushneurongroupStateToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void pullneurongroupStateFromDevice(); +EXPORT_FUNC void pushneurongroup_1SpikesToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void pullneurongroup_1SpikesFromDevice(); +EXPORT_FUNC void pushneurongroup_1CurrentSpikesToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void pullneurongroup_1CurrentSpikesFromDevice(); +EXPORT_FUNC unsigned int* getneurongroup_1CurrentSpikes(unsigned int batch = 0); +EXPORT_FUNC unsigned int& getneurongroup_1CurrentSpikeCount(unsigned int batch = 0); +EXPORT_FUNC void pushineurongroup_1ToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void pullineurongroup_1FromDevice(); +EXPORT_FUNC void pushCurrentineurongroup_1ToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void pullCurrentineurongroup_1FromDevice(); +EXPORT_FUNC int32_t* getCurrentineurongroup_1(unsigned int batch = 0); +EXPORT_FUNC void pushVneurongroup_1ToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void pullVneurongroup_1FromDevice(); +EXPORT_FUNC void pushCurrentVneurongroup_1ToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void pullCurrentVneurongroup_1FromDevice(); +EXPORT_FUNC double* getCurrentVneurongroup_1(unsigned int batch = 0); +EXPORT_FUNC void pushg_eKC_eKCneurongroup_1ToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void pullg_eKC_eKCneurongroup_1FromDevice(); +EXPORT_FUNC void pushCurrentg_eKC_eKCneurongroup_1ToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void pullCurrentg_eKC_eKCneurongroup_1FromDevice(); +EXPORT_FUNC double* getCurrentg_eKC_eKCneurongroup_1(unsigned int batch = 0); +EXPORT_FUNC void pushg_iKC_eKCneurongroup_1ToDevice(bool uninitialisedOnly = false); 
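// Every state variable X of a group G follows the same generated pattern seen above:
// pushXGToDevice()/pullXGFromDevice() transfer the whole array, the *Current* variants
// transfer only the current time step's slice, and getCurrentXG() returns a host pointer.
// For example, reading the eKC membrane potentials after a call to stepTime() could look like:
//     pullVneurongroup_1FromDevice();
//     double* V_eKC = getCurrentVneurongroup_1();   // 100 values, one per neurongroup_1 neuron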
+EXPORT_FUNC void pullg_iKC_eKCneurongroup_1FromDevice(); +EXPORT_FUNC void pushCurrentg_iKC_eKCneurongroup_1ToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void pullCurrentg_iKC_eKCneurongroup_1FromDevice(); +EXPORT_FUNC double* getCurrentg_iKC_eKCneurongroup_1(unsigned int batch = 0); +EXPORT_FUNC void pushhneurongroup_1ToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void pullhneurongroup_1FromDevice(); +EXPORT_FUNC void pushCurrenthneurongroup_1ToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void pullCurrenthneurongroup_1FromDevice(); +EXPORT_FUNC double* getCurrenthneurongroup_1(unsigned int batch = 0); +EXPORT_FUNC void pushmneurongroup_1ToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void pullmneurongroup_1FromDevice(); +EXPORT_FUNC void pushCurrentmneurongroup_1ToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void pullCurrentmneurongroup_1FromDevice(); +EXPORT_FUNC double* getCurrentmneurongroup_1(unsigned int batch = 0); +EXPORT_FUNC void pushnneurongroup_1ToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void pullnneurongroup_1FromDevice(); +EXPORT_FUNC void pushCurrentnneurongroup_1ToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void pullCurrentnneurongroup_1FromDevice(); +EXPORT_FUNC double* getCurrentnneurongroup_1(unsigned int batch = 0); +EXPORT_FUNC void pushlastspikeneurongroup_1ToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void pulllastspikeneurongroup_1FromDevice(); +EXPORT_FUNC void pushCurrentlastspikeneurongroup_1ToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void pullCurrentlastspikeneurongroup_1FromDevice(); +EXPORT_FUNC double* getCurrentlastspikeneurongroup_1(unsigned int batch = 0); +EXPORT_FUNC void pushnot_refractoryneurongroup_1ToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void pullnot_refractoryneurongroup_1FromDevice(); +EXPORT_FUNC void pushCurrentnot_refractoryneurongroup_1ToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void pullCurrentnot_refractoryneurongroup_1FromDevice(); +EXPORT_FUNC char* getCurrentnot_refractoryneurongroup_1(unsigned int batch = 0); +EXPORT_FUNC void pushneurongroup_1StateToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void pullneurongroup_1StateFromDevice(); +EXPORT_FUNC void pushspikegeneratorgroupSpikesToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void pullspikegeneratorgroupSpikesFromDevice(); +EXPORT_FUNC void pushspikegeneratorgroupCurrentSpikesToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void pullspikegeneratorgroupCurrentSpikesFromDevice(); +EXPORT_FUNC unsigned int* getspikegeneratorgroupCurrentSpikes(unsigned int batch = 0); +EXPORT_FUNC unsigned int& getspikegeneratorgroupCurrentSpikeCount(unsigned int batch = 0); +EXPORT_FUNC void pushspikegeneratorgroupStateToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void pullspikegeneratorgroupStateFromDevice(); +EXPORT_FUNC void pushsynapsesConnectivityToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void pullsynapsesConnectivityFromDevice(); +EXPORT_FUNC void pushsynapses_1ConnectivityToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void pullsynapses_1ConnectivityFromDevice(); +EXPORT_FUNC void pushsynapses_2ConnectivityToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void pullsynapses_2ConnectivityFromDevice(); +EXPORT_FUNC void pushweightsynapsesToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void pullweightsynapsesFromDevice(); +EXPORT_FUNC void pushinSynsynapsesToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void 
pullinSynsynapsesFromDevice(); +EXPORT_FUNC void pushsynapsesStateToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void pullsynapsesStateFromDevice(); +EXPORT_FUNC void pushlastupdatesynapses_1ToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void pulllastupdatesynapses_1FromDevice(); +EXPORT_FUNC void pushApostsynapses_1ToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void pullApostsynapses_1FromDevice(); +EXPORT_FUNC void pushg_rawsynapses_1ToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void pullg_rawsynapses_1FromDevice(); +EXPORT_FUNC void pushApresynapses_1ToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void pullApresynapses_1FromDevice(); +EXPORT_FUNC void pushinSynsynapses_1ToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void pullinSynsynapses_1FromDevice(); +EXPORT_FUNC void pushsynapses_1StateToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void pullsynapses_1StateFromDevice(); +EXPORT_FUNC void pushinSynsynapses_2ToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void pullinSynsynapses_2FromDevice(); +EXPORT_FUNC void pushsynapses_2StateToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void pullsynapses_2StateFromDevice(); +// Runner functions +EXPORT_FUNC void copyStateToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void copyConnectivityToDevice(bool uninitialisedOnly = false); +EXPORT_FUNC void copyStateFromDevice(); +EXPORT_FUNC void copyCurrentSpikesFromDevice(); +EXPORT_FUNC void copyCurrentSpikeEventsFromDevice(); +EXPORT_FUNC void allocateMem(); +EXPORT_FUNC void freeMem(); +EXPORT_FUNC size_t getFreeDeviceMemBytes(); +EXPORT_FUNC void stepTime(); + +// Functions generated by backend +EXPORT_FUNC void updateNeurons(double t); +EXPORT_FUNC void updateSynapses(double t); +EXPORT_FUNC void initialize(); +EXPORT_FUNC void initializeSparse(); +} // extern "C" diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/definitionsInternal.h b/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/definitionsInternal.h new file mode 100644 index 00000000..49ff7112 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/definitionsInternal.h @@ -0,0 +1,173 @@ +#pragma once +#include "definitions.h" + +// CUDA includes +#include +#include + +// ------------------------------------------------------------------------ +// Helper macro for error-checking CUDA calls +#define CHECK_CUDA_ERRORS(call) {\ + cudaError_t error = call;\ + if (error != cudaSuccess) {\ + throw std::runtime_error(__FILE__": " + std::to_string(__LINE__) + ": cuda error " + std::to_string(error) + ": " + cudaGetErrorString(error));\ + }\ +} + +#define SUPPORT_CODE_FUNC __device__ __host__ inline + + +template<typename RNG> +__device__ inline float exponentialDistFloat(RNG *rng) { + while (true) { + const float u = curand_uniform(rng); + if (u != 0.0f) { + return -logf(u); + } + } +} + +template<typename RNG> +__device__ inline double exponentialDistDouble(RNG *rng) { + while (true) { + const double u = curand_uniform_double(rng); + if (u != 0.0) { + return -log(u); + } + } +} + +template<typename RNG> +__device__ inline float gammaDistFloatInternal(RNG *rng, float c, float d) + { + float x, v, u; + while (true) { + do { + x = curand_normal(rng); + v = 1.0f + c*x; + } + while (v <= 0.0f); + + v = v*v*v; + do { + u = curand_uniform(rng); + } + while (u == 1.0f); + + if (u < 1.0f - 0.0331f*x*x*x*x) break; + if (logf(u) < 0.5f*x*x + d*(1.0f - v + logf(v))) break; + } + + return d*v;
+} + +template<typename RNG> +__device__ inline float gammaDistFloat(RNG *rng, float a) + { + if (a > 1) + { + const float u = curand_uniform (rng); + const float d = (1.0f + a) - 1.0f / 3.0f; + const float c = (1.0f / 3.0f) / sqrtf(d); + return gammaDistFloatInternal (rng, c, d) * powf(u, 1.0f / a); + } + else + { + const float d = a - 1.0f / 3.0f; + const float c = (1.0f / 3.0f) / sqrtf(d); + return gammaDistFloatInternal(rng, c, d); + } +} + +template<typename RNG> +__device__ inline float gammaDistDoubleInternal(RNG *rng, double c, double d) + { + double x, v, u; + while (true) { + do { + x = curand_normal_double(rng); + v = 1.0 + c*x; + } + while (v <= 0.0); + + v = v*v*v; + do { + u = curand_uniform_double(rng); + } + while (u == 1.0); + + if (u < 1.0 - 0.0331*x*x*x*x) break; + if (log(u) < 0.5*x*x + d*(1.0 - v + log(v))) break; + } + + return d*v; +} + +template<typename RNG> +__device__ inline float gammaDistDouble(RNG *rng, double a) + { + if (a > 1.0) + { + const double u = curand_uniform (rng); + const double d = (1.0 + a) - 1.0 / 3.0; + const double c = (1.0 / 3.0) / sqrt(d); + return gammaDistDoubleInternal (rng, c, d) * pow(u, 1.0 / a); + } + else + { + const float d = a - 1.0 / 3.0; + const float c = (1.0 / 3.0) / sqrt(d); + return gammaDistDoubleInternal(rng, c, d); + } +} + +// ------------------------------------------------------------------------ +// merged group structures +// ------------------------------------------------------------------------ +extern "C" { +// ------------------------------------------------------------------------ +// global variables +// ------------------------------------------------------------------------ + +// ------------------------------------------------------------------------ +// timers +// ------------------------------------------------------------------------ +// ------------------------------------------------------------------------ +// merged group arrays for host initialisation +// ------------------------------------------------------------------------ +// ------------------------------------------------------------------------ +// local neuron groups +// ------------------------------------------------------------------------ + +// ------------------------------------------------------------------------ +// custom update variables +// ------------------------------------------------------------------------ + +// ------------------------------------------------------------------------ +// postsynaptic variables +// ------------------------------------------------------------------------ + +// ------------------------------------------------------------------------ +// synapse connectivity +// ------------------------------------------------------------------------ + +// ------------------------------------------------------------------------ +// synapse variables +// ------------------------------------------------------------------------ + +// ------------------------------------------------------------------------ +// copying merged group structures to device +// ------------------------------------------------------------------------ +EXPORT_FUNC void pushMergedNeuronInitGroup0ToDevice(unsigned int idx, unsigned int* spkCnt, unsigned int* spk, unsigned int numNeurons); +EXPORT_FUNC void pushMergedNeuronInitGroup1ToDevice(unsigned int idx, unsigned int* spkCnt, unsigned int* spk, double* inSynInSyn0, double* inSynInSyn1, unsigned int numNeurons); +EXPORT_FUNC void pushMergedNeuronInitGroup2ToDevice(unsigned int idx, unsigned int* spkCnt, unsigned int* spk, double*
inSynInSyn0, unsigned int numNeurons); +EXPORT_FUNC void pushMergedSynapseSparseInitGroup0ToDevice(unsigned int idx, unsigned int* rowLength, uint32_t* ind, unsigned int* colLength, unsigned int* remap, unsigned int rowStride, unsigned int colStride, unsigned int numSrcNeurons, unsigned int numTrgNeurons); +EXPORT_FUNC void pushMergedNeuronUpdateGroup0ToDevice(unsigned int idx, unsigned int* spkCnt, unsigned int* spk, unsigned int numNeurons); +EXPORT_FUNC void pushMergedNeuronUpdateGroup1ToDevice(unsigned int idx, unsigned int* spkCnt, unsigned int* spk, int32_t* i, double* V, double* g_eKC_eKC, double* g_iKC_eKC, double* h, double* m, double* n, double* lastspike, char* not_refractory, double* inSynInSyn0, double* inSynInSyn1, unsigned int numNeurons); +EXPORT_FUNC void pushMergedNeuronUpdateGroup2ToDevice(unsigned int idx, unsigned int* spkCnt, unsigned int* spk, int32_t* i, double* V, double* g_PN_iKC, double* h, double* m, double* n, double* lastspike, char* not_refractory, double* inSynInSyn0, unsigned int numNeurons); +EXPORT_FUNC void pushMergedPresynapticUpdateGroup0ToDevice(unsigned int idx, double* inSyn, unsigned int* srcSpkCnt, unsigned int* srcSpk, unsigned int* rowLength, uint32_t* ind, unsigned int rowStride, unsigned int numSrcNeurons, unsigned int numTrgNeurons); +EXPORT_FUNC void pushMergedPresynapticUpdateGroup1ToDevice(unsigned int idx, double* inSyn, unsigned int* srcSpkCnt, unsigned int* srcSpk, unsigned int* rowLength, uint32_t* ind, double* lastupdate, double* Apost, double* g_raw, double* Apre, unsigned int rowStride, unsigned int numSrcNeurons, unsigned int numTrgNeurons); +EXPORT_FUNC void pushMergedPresynapticUpdateGroup2ToDevice(unsigned int idx, double* inSyn, unsigned int* srcSpkCnt, unsigned int* srcSpk, unsigned int* rowLength, uint32_t* ind, double* weight, unsigned int rowStride, unsigned int numSrcNeurons, unsigned int numTrgNeurons); +EXPORT_FUNC void pushMergedPostsynapticUpdateGroup0ToDevice(unsigned int idx, unsigned int* trgSpkCnt, unsigned int* trgSpk, unsigned int* rowLength, uint32_t* ind, unsigned int* colLength, unsigned int* remap, double* lastupdate, double* Apost, double* g_raw, double* Apre, unsigned int rowStride, unsigned int colStride, unsigned int numSrcNeurons, unsigned int numTrgNeurons); +EXPORT_FUNC void pushMergedNeuronSpikeQueueUpdateGroup0ToDevice(unsigned int idx, unsigned int* spkCnt); +} // extern "C" diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/init.cc b/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/init.cc new file mode 100644 index 00000000..697eaa88 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/init.cc @@ -0,0 +1,194 @@ +#include "definitionsInternal.h" +#include +#include +#include + +struct MergedNeuronInitGroup0 + { + unsigned int* spkCnt; + unsigned int* spk; + unsigned int numNeurons; + +} +; +struct MergedNeuronInitGroup1 + { + unsigned int* spkCnt; + unsigned int* spk; + double* inSynInSyn0; + double* inSynInSyn1; + unsigned int numNeurons; + +} +; +struct MergedNeuronInitGroup2 + { + unsigned int* spkCnt; + unsigned int* spk; + double* inSynInSyn0; + unsigned int numNeurons; + +} +; +struct MergedSynapseSparseInitGroup0 + { + unsigned int* rowLength; + uint32_t* ind; + unsigned int* colLength; + unsigned int* remap; + unsigned int rowStride; + unsigned int colStride; + unsigned int numSrcNeurons; + unsigned int numTrgNeurons; + +} +; +__device__ __constant__ 
MergedNeuronInitGroup0 d_mergedNeuronInitGroup0[1]; +void pushMergedNeuronInitGroup0ToDevice(unsigned int idx, unsigned int* spkCnt, unsigned int* spk, unsigned int numNeurons) { + MergedNeuronInitGroup0 group = {spkCnt, spk, numNeurons, }; + CHECK_CUDA_ERRORS(cudaMemcpyToSymbolAsync(d_mergedNeuronInitGroup0, &group, sizeof(MergedNeuronInitGroup0), idx * sizeof(MergedNeuronInitGroup0))); +} +__device__ __constant__ MergedNeuronInitGroup1 d_mergedNeuronInitGroup1[1]; +void pushMergedNeuronInitGroup1ToDevice(unsigned int idx, unsigned int* spkCnt, unsigned int* spk, double* inSynInSyn0, double* inSynInSyn1, unsigned int numNeurons) { + MergedNeuronInitGroup1 group = {spkCnt, spk, inSynInSyn0, inSynInSyn1, numNeurons, }; + CHECK_CUDA_ERRORS(cudaMemcpyToSymbolAsync(d_mergedNeuronInitGroup1, &group, sizeof(MergedNeuronInitGroup1), idx * sizeof(MergedNeuronInitGroup1))); +} +__device__ __constant__ MergedNeuronInitGroup2 d_mergedNeuronInitGroup2[1]; +void pushMergedNeuronInitGroup2ToDevice(unsigned int idx, unsigned int* spkCnt, unsigned int* spk, double* inSynInSyn0, unsigned int numNeurons) { + MergedNeuronInitGroup2 group = {spkCnt, spk, inSynInSyn0, numNeurons, }; + CHECK_CUDA_ERRORS(cudaMemcpyToSymbolAsync(d_mergedNeuronInitGroup2, &group, sizeof(MergedNeuronInitGroup2), idx * sizeof(MergedNeuronInitGroup2))); +} +__device__ __constant__ MergedSynapseSparseInitGroup0 d_mergedSynapseSparseInitGroup0[1]; +void pushMergedSynapseSparseInitGroup0ToDevice(unsigned int idx, unsigned int* rowLength, uint32_t* ind, unsigned int* colLength, unsigned int* remap, unsigned int rowStride, unsigned int colStride, unsigned int numSrcNeurons, unsigned int numTrgNeurons) { + MergedSynapseSparseInitGroup0 group = {rowLength, ind, colLength, remap, rowStride, colStride, numSrcNeurons, numTrgNeurons, }; + CHECK_CUDA_ERRORS(cudaMemcpyToSymbolAsync(d_mergedSynapseSparseInitGroup0, &group, sizeof(MergedSynapseSparseInitGroup0), idx * sizeof(MergedSynapseSparseInitGroup0))); +} +// ------------------------------------------------------------------------ +// merged extra global parameter functions +// ------------------------------------------------------------------------ +// ------------------------------------------------------------------------ +// merged extra global parameter functions +// ------------------------------------------------------------------------ +__device__ unsigned int d_mergedNeuronInitGroupStartID0[] = {0, }; +__device__ unsigned int d_mergedNeuronInitGroupStartID1[] = {128, }; +__device__ unsigned int d_mergedNeuronInitGroupStartID2[] = {256, }; +__device__ unsigned int d_mergedSynapseSparseInitGroupStartID0[] = {0, }; + +extern "C" __global__ void initializeKernel(unsigned long long deviceRNGSeed) { + const unsigned int id = 32 * blockIdx.x + threadIdx.x; + // ------------------------------------------------------------------------ + // Local neuron groups + // merged0 + if(id < 128) { + struct MergedNeuronInitGroup0 *group = &d_mergedNeuronInitGroup0[0]; + const unsigned int lid = id - 0; + // only do this for existing neurons + if(lid < group->numNeurons) { + if(lid == 0) { + group->spkCnt[0] = 0; + } + group->spk[lid] = 0; + // current source variables + } + } + // merged1 + if(id >= 128 && id < 256) { + struct MergedNeuronInitGroup1 *group = &d_mergedNeuronInitGroup1[0]; + const unsigned int lid = id - 128; + // only do this for existing neurons + if(lid < group->numNeurons) { + if(lid == 0) { + group->spkCnt[0] = 0; + } + group->spk[lid] = 0; + { + group->inSynInSyn0[lid] = 0.000000; 
+ } + { + group->inSynInSyn1[lid] = 0.000000; + } + // current source variables + } + } + // merged2 + if(id >= 256 && id < 2784) { + struct MergedNeuronInitGroup2 *group = &d_mergedNeuronInitGroup2[0]; + const unsigned int lid = id - 256; + // only do this for existing neurons + if(lid < group->numNeurons) { + if(lid == 0) { + group->spkCnt[0] = 0; + } + group->spk[lid] = 0; + { + group->inSynInSyn0[lid] = 0.000000; + } + // current source variables + } + } + + // ------------------------------------------------------------------------ + // Custom update groups + + // ------------------------------------------------------------------------ + // Custom WU update groups with dense connectivity + + // ------------------------------------------------------------------------ + // Synapse groups with dense connectivity + + // ------------------------------------------------------------------------ + // Synapse groups with sparse connectivity + +} +extern "C" __global__ void initializeSparseKernel() { + const unsigned int id = 32 * blockIdx.x + threadIdx.x; + __shared__ unsigned int shRowLength[32]; + // merged0 + if(id < 128) { + struct MergedSynapseSparseInitGroup0 *group = &d_mergedSynapseSparseInitGroup0[0]; + const unsigned int lid = id - 0; + const unsigned int numBlocks = (group->numSrcNeurons + 32 - 1) / 32; + unsigned int idx = lid; + for(unsigned int r = 0; r < numBlocks; r++) { + const unsigned numRowsInBlock = (r == (numBlocks - 1)) ? ((group->numSrcNeurons - 1) % 32) + 1 : 32; + __syncthreads(); + if (threadIdx.x < numRowsInBlock) { + shRowLength[threadIdx.x] = group->rowLength[(r * 32) + threadIdx.x]; + } + __syncthreads(); + for(unsigned int i = 0; i < numRowsInBlock; i++) { + if(lid < shRowLength[i]) { + { + const unsigned int postIndex = group->ind[idx]; + const unsigned int colLocation = atomicAdd(&group->colLength[postIndex], 1); + const unsigned int colMajorIndex = (postIndex * group->colStride) + colLocation; + group->remap[colMajorIndex] = idx; + } + } + idx += group->rowStride; + } + } + } +} +void initialize() { + unsigned long long deviceRNGSeed = 0; + CHECK_CUDA_ERRORS(cudaMemset(d_colLengthsynapses_1, 0, 100 * sizeof(unsigned int))); + { + const dim3 threads(32, 1); + const dim3 grid(87, 1); + initializeKernel<<<grid, threads>>>(deviceRNGSeed); + CHECK_CUDA_ERRORS(cudaPeekAtLastError()); + } +} + +void initializeSparse() { + copyStateToDevice(true); + copyConnectivityToDevice(true); + + { + const dim3 threads(32, 1); + const dim3 grid(4, 1); + initializeSparseKernel<<<grid, threads>>>(); + CHECK_CUDA_ERRORS(cudaPeekAtLastError()); + } +} diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/init.d b/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/init.d new file mode 100644 index 00000000..6ced26f2 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/init.d @@ -0,0 +1,286 @@ +init.o : init.cc \ + /usr/include/stdc-predef.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/cuda_runtime.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/host_config.h \ + /usr/include/features.h \ + /usr/include/x86_64-linux-gnu/sys/cdefs.h \ + /usr/include/x86_64-linux-gnu/bits/wordsize.h \ + /usr/include/x86_64-linux-gnu/bits/long-double.h \ + /usr/include/x86_64-linux-gnu/gnu/stubs.h \ + /usr/include/x86_64-linux-gnu/gnu/stubs-64.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/builtin_types.h \ +
/cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/device_types.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/host_defines.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/driver_types.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/vector_types.h \ + /usr/lib/gcc/x86_64-linux-gnu/9/include/limits.h \ + /usr/lib/gcc/x86_64-linux-gnu/9/include/syslimits.h \ + /usr/include/limits.h \ + /usr/include/x86_64-linux-gnu/bits/libc-header-start.h \ + /usr/include/x86_64-linux-gnu/bits/posix1_lim.h \ + /usr/include/x86_64-linux-gnu/bits/local_lim.h \ + /usr/include/linux/limits.h \ + /usr/include/x86_64-linux-gnu/bits/posix2_lim.h \ + /usr/include/x86_64-linux-gnu/bits/xopen_lim.h \ + /usr/include/x86_64-linux-gnu/bits/uio_lim.h \ + /usr/lib/gcc/x86_64-linux-gnu/9/include/stddef.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/surface_types.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/texture_types.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/library_types.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/channel_descriptor.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/cuda_runtime_api.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/cuda_device_runtime_api.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/driver_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/vector_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/vector_functions.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/common_functions.h \ + /usr/include/string.h \ + /usr/include/x86_64-linux-gnu/bits/types/locale_t.h \ + /usr/include/x86_64-linux-gnu/bits/types/__locale_t.h \ + /usr/include/strings.h \ + /usr/include/time.h \ + /usr/include/x86_64-linux-gnu/bits/time.h \ + /usr/include/x86_64-linux-gnu/bits/types.h \ + /usr/include/x86_64-linux-gnu/bits/timesize.h \ + /usr/include/x86_64-linux-gnu/bits/typesizes.h \ + /usr/include/x86_64-linux-gnu/bits/time64.h \ + /usr/include/x86_64-linux-gnu/bits/timex.h \ + /usr/include/x86_64-linux-gnu/bits/types/struct_timeval.h \ + /usr/include/x86_64-linux-gnu/bits/types/clock_t.h \ + /usr/include/x86_64-linux-gnu/bits/types/time_t.h \ + /usr/include/x86_64-linux-gnu/bits/types/struct_tm.h \ + /usr/include/x86_64-linux-gnu/bits/types/struct_timespec.h \ + /usr/include/x86_64-linux-gnu/bits/endian.h \ + /usr/include/x86_64-linux-gnu/bits/endianness.h \ + /usr/include/x86_64-linux-gnu/bits/types/clockid_t.h \ + /usr/include/x86_64-linux-gnu/bits/types/timer_t.h \ + /usr/include/x86_64-linux-gnu/bits/types/struct_itimerspec.h \ + /usr/include/c++/9/new \ + /usr/include/x86_64-linux-gnu/c++/9/bits/c++config.h \ + /usr/include/x86_64-linux-gnu/c++/9/bits/os_defines.h \ + /usr/include/x86_64-linux-gnu/c++/9/bits/cpu_defines.h \ + /usr/include/c++/9/exception \ + /usr/include/c++/9/bits/exception.h \ + /usr/include/c++/9/bits/exception_ptr.h \ + /usr/include/c++/9/bits/exception_defines.h \ + /usr/include/c++/9/bits/cxxabi_init_exception.h \ + /usr/include/c++/9/typeinfo \ + /usr/include/c++/9/bits/hash_bytes.h \ + /usr/include/c++/9/bits/nested_exception.h \ + /usr/include/c++/9/bits/move.h \ + /usr/include/c++/9/bits/concept_check.h \ + 
/usr/include/c++/9/type_traits \ + /usr/include/stdio.h \ + /usr/lib/gcc/x86_64-linux-gnu/9/include/stdarg.h \ + /usr/include/x86_64-linux-gnu/bits/types/__fpos_t.h \ + /usr/include/x86_64-linux-gnu/bits/types/__mbstate_t.h \ + /usr/include/x86_64-linux-gnu/bits/types/__fpos64_t.h \ + /usr/include/x86_64-linux-gnu/bits/types/__FILE.h \ + /usr/include/x86_64-linux-gnu/bits/types/FILE.h \ + /usr/include/x86_64-linux-gnu/bits/types/struct_FILE.h \ + /usr/include/x86_64-linux-gnu/bits/types/cookie_io_functions_t.h \ + /usr/include/x86_64-linux-gnu/bits/stdio_lim.h \ + /usr/include/x86_64-linux-gnu/bits/sys_errlist.h \ + /usr/include/c++/9/stdlib.h \ + /usr/include/c++/9/cstdlib \ + /usr/include/stdlib.h \ + /usr/include/x86_64-linux-gnu/bits/waitflags.h \ + /usr/include/x86_64-linux-gnu/bits/waitstatus.h \ + /usr/include/x86_64-linux-gnu/bits/floatn.h \ + /usr/include/x86_64-linux-gnu/bits/floatn-common.h \ + /usr/include/x86_64-linux-gnu/sys/types.h \ + /usr/include/x86_64-linux-gnu/bits/stdint-intn.h \ + /usr/include/endian.h \ + /usr/include/x86_64-linux-gnu/bits/byteswap.h \ + /usr/include/x86_64-linux-gnu/bits/uintn-identity.h \ + /usr/include/x86_64-linux-gnu/sys/select.h \ + /usr/include/x86_64-linux-gnu/bits/select.h \ + /usr/include/x86_64-linux-gnu/bits/types/sigset_t.h \ + /usr/include/x86_64-linux-gnu/bits/types/__sigset_t.h \ + /usr/include/x86_64-linux-gnu/bits/pthreadtypes.h \ + /usr/include/x86_64-linux-gnu/bits/thread-shared-types.h \ + /usr/include/x86_64-linux-gnu/bits/pthreadtypes-arch.h \ + /usr/include/x86_64-linux-gnu/bits/struct_mutex.h \ + /usr/include/x86_64-linux-gnu/bits/struct_rwlock.h \ + /usr/include/alloca.h \ + /usr/include/x86_64-linux-gnu/bits/stdlib-float.h \ + /usr/include/c++/9/bits/std_abs.h \ + /usr/include/assert.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/math_functions.h \ + /usr/include/c++/9/math.h \ + /usr/include/c++/9/cmath \ + /usr/include/c++/9/bits/cpp_type_traits.h \ + /usr/include/c++/9/ext/type_traits.h \ + /usr/include/math.h \ + /usr/include/x86_64-linux-gnu/bits/math-vector.h \ + /usr/include/x86_64-linux-gnu/bits/libm-simd-decl-stubs.h \ + /usr/include/x86_64-linux-gnu/bits/flt-eval-method.h \ + /usr/include/x86_64-linux-gnu/bits/fp-logb.h \ + /usr/include/x86_64-linux-gnu/bits/fp-fast.h \ + /usr/include/x86_64-linux-gnu/bits/mathcalls-helper-functions.h \ + /usr/include/x86_64-linux-gnu/bits/mathcalls.h \ + /usr/include/x86_64-linux-gnu/bits/mathcalls-narrow.h \ + /usr/include/x86_64-linux-gnu/bits/iscanonical.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/math_functions.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/cuda_surface_types.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/cuda_texture_types.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/device_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/device_functions.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/device_atomic_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/device_atomic_functions.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/device_double_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/device_double_functions.hpp \ + 
/cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_20_atomic_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_20_atomic_functions.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_32_atomic_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_32_atomic_functions.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_35_atomic_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_60_atomic_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_60_atomic_functions.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_20_intrinsics.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_20_intrinsics.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_30_intrinsics.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_30_intrinsics.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_32_intrinsics.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_32_intrinsics.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_35_intrinsics.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_61_intrinsics.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_61_intrinsics.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/sm_70_rt.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/sm_70_rt.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/sm_80_rt.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/sm_80_rt.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/surface_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/texture_fetch_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/texture_indirect_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/surface_indirect_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/device_launch_parameters.h \ + definitionsInternal.h \ + definitions.h \ + /usr/include/c++/9/random \ + /usr/include/c++/9/string \ + /usr/include/c++/9/bits/stringfwd.h \ + /usr/include/c++/9/bits/memoryfwd.h \ + /usr/include/c++/9/bits/char_traits.h \ + /usr/include/c++/9/bits/stl_algobase.h \ + /usr/include/c++/9/bits/functexcept.h \ + /usr/include/c++/9/ext/numeric_traits.h \ + /usr/include/c++/9/bits/stl_pair.h \ + /usr/include/c++/9/bits/stl_iterator_base_types.h \ + /usr/include/c++/9/bits/stl_iterator_base_funcs.h \ + /usr/include/c++/9/debug/assertions.h \ + /usr/include/c++/9/bits/stl_iterator.h \ + /usr/include/c++/9/bits/ptr_traits.h \ + /usr/include/c++/9/debug/debug.h \ + /usr/include/c++/9/bits/predefined_ops.h \ + /usr/include/c++/9/bits/postypes.h \ + /usr/include/c++/9/cwchar \ + /usr/include/wchar.h \ + /usr/include/x86_64-linux-gnu/bits/wchar.h \ + /usr/include/x86_64-linux-gnu/bits/types/wint_t.h \ + /usr/include/x86_64-linux-gnu/bits/types/mbstate_t.h \ + /usr/include/c++/9/cstdint \ + /usr/lib/gcc/x86_64-linux-gnu/9/include/stdint.h \ + /usr/include/stdint.h \ + 
/usr/include/x86_64-linux-gnu/bits/stdint-uintn.h \ + /usr/include/c++/9/bits/allocator.h \ + /usr/include/x86_64-linux-gnu/c++/9/bits/c++allocator.h \ + /usr/include/c++/9/ext/new_allocator.h \ + /usr/include/c++/9/bits/localefwd.h \ + /usr/include/x86_64-linux-gnu/c++/9/bits/c++locale.h \ + /usr/include/c++/9/clocale \ + /usr/include/locale.h \ + /usr/include/x86_64-linux-gnu/bits/locale.h \ + /usr/include/c++/9/iosfwd \ + /usr/include/c++/9/cctype \ + /usr/include/ctype.h \ + /usr/include/c++/9/bits/ostream_insert.h \ + /usr/include/c++/9/bits/cxxabi_forced.h \ + /usr/include/c++/9/bits/stl_function.h \ + /usr/include/c++/9/backward/binders.h \ + /usr/include/c++/9/bits/range_access.h \ + /usr/include/c++/9/initializer_list \ + /usr/include/c++/9/bits/basic_string.h \ + /usr/include/c++/9/ext/atomicity.h \ + /usr/include/x86_64-linux-gnu/c++/9/bits/gthr.h \ + /usr/include/x86_64-linux-gnu/c++/9/bits/gthr-default.h \ + /usr/include/pthread.h \ + /usr/include/sched.h \ + /usr/include/x86_64-linux-gnu/bits/sched.h \ + /usr/include/x86_64-linux-gnu/bits/types/struct_sched_param.h \ + /usr/include/x86_64-linux-gnu/bits/cpu-set.h \ + /usr/include/x86_64-linux-gnu/bits/setjmp.h \ + /usr/include/x86_64-linux-gnu/c++/9/bits/atomic_word.h \ + /usr/include/c++/9/ext/alloc_traits.h \ + /usr/include/c++/9/bits/alloc_traits.h \ + /usr/include/c++/9/ext/string_conversions.h \ + /usr/include/c++/9/cstdio \ + /usr/include/c++/9/cerrno \ + /usr/include/errno.h \ + /usr/include/x86_64-linux-gnu/bits/errno.h \ + /usr/include/linux/errno.h \ + /usr/include/x86_64-linux-gnu/asm/errno.h \ + /usr/include/asm-generic/errno.h \ + /usr/include/asm-generic/errno-base.h \ + /usr/include/x86_64-linux-gnu/bits/types/error_t.h \ + /usr/include/c++/9/bits/functional_hash.h \ + /usr/include/c++/9/bits/basic_string.tcc \ + /usr/include/c++/9/limits \ + /usr/include/c++/9/bits/random.h \ + /usr/include/c++/9/vector \ + /usr/include/c++/9/bits/stl_construct.h \ + /usr/include/c++/9/bits/stl_uninitialized.h \ + /usr/include/c++/9/bits/stl_vector.h \ + /usr/include/c++/9/bits/stl_bvector.h \ + /usr/include/c++/9/bits/vector.tcc \ + /usr/include/c++/9/bits/uniform_int_dist.h \ + /usr/include/x86_64-linux-gnu/c++/9/bits/opt_random.h \ + /usr/include/c++/9/bits/random.tcc \ + /usr/include/c++/9/numeric \ + /usr/include/c++/9/bits/stl_numeric.h \ + /usr/include/c++/9/stdexcept \ + /usr/include/c++/9/cassert \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_kernel.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_discrete.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_precalc.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_mrg32k3a.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_mtgp32_kernel.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/cuda.h \ + /usr/include/memory.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_mtgp32.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_philox4x32_x.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_globals.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_uniform.h \ + 
/cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_normal.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_normal_static.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_lognormal.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_poisson.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_discrete2.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/cuda_fp16.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/cuda_fp16.hpp \ + /usr/include/c++/9/utility \ + /usr/include/c++/9/bits/stl_relops.h \ + /usr/include/c++/9/iostream \ + /usr/include/c++/9/ostream \ + /usr/include/c++/9/ios \ + /usr/include/c++/9/bits/ios_base.h \ + /usr/include/c++/9/bits/locale_classes.h \ + /usr/include/c++/9/bits/locale_classes.tcc \ + /usr/include/c++/9/system_error \ + /usr/include/x86_64-linux-gnu/c++/9/bits/error_constants.h \ + /usr/include/c++/9/streambuf \ + /usr/include/c++/9/bits/streambuf.tcc \ + /usr/include/c++/9/bits/basic_ios.h \ + /usr/include/c++/9/bits/locale_facets.h \ + /usr/include/c++/9/cwctype \ + /usr/include/wctype.h \ + /usr/include/x86_64-linux-gnu/bits/wctype-wchar.h \ + /usr/include/x86_64-linux-gnu/c++/9/bits/ctype_base.h \ + /usr/include/c++/9/bits/streambuf_iterator.h \ + /usr/include/x86_64-linux-gnu/c++/9/bits/ctype_inline.h \ + /usr/include/c++/9/bits/locale_facets.tcc \ + /usr/include/c++/9/bits/basic_ios.tcc \ + /usr/include/c++/9/bits/ostream.tcc \ + /usr/include/c++/9/istream \ + /usr/include/c++/9/bits/istream.tcc diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/initCUDA0.sha b/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/initCUDA0.sha new file mode 100644 index 00000000..bdbfced5 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/initCUDA0.sha @@ -0,0 +1,3 @@ +a606bd0 6d82f349 e91e4e38 d9fc4530 e31cc567 +0 12 +128 24 diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/initCUDA1.sha b/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/initCUDA1.sha new file mode 100644 index 00000000..a8cd7286 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/initCUDA1.sha @@ -0,0 +1,3 @@ +a606bd0 6d82f349 e91e4e38 d9fc4530 e31cc567 +0 12 +256 24 diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/model.sha b/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/model.sha new file mode 100644 index 00000000..525dc969 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/model.sha @@ -0,0 +1,2 @@ +be5feab8 8f368d01 eaac20f1 3510ebf c8e4831c +9727012 10727412 0 diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/neuronUpdate.cc b/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/neuronUpdate.cc new file mode 100644 index 00000000..b56bf02e --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/neuronUpdate.cc @@ -0,0 +1,318 @@ +#include "definitionsInternal.h" +#include "supportCode.h" + +struct MergedNeuronUpdateGroup0 + { + unsigned int* 
spkCnt; + unsigned int* spk; + unsigned int numNeurons; + +} +; +struct MergedNeuronUpdateGroup1 + { + unsigned int* spkCnt; + unsigned int* spk; + int32_t* i; + double* V; + double* g_eKC_eKC; + double* g_iKC_eKC; + double* h; + double* m; + double* n; + double* lastspike; + char* not_refractory; + double* inSynInSyn0; + double* inSynInSyn1; + unsigned int numNeurons; + +} +; +struct MergedNeuronUpdateGroup2 + { + unsigned int* spkCnt; + unsigned int* spk; + int32_t* i; + double* V; + double* g_PN_iKC; + double* h; + double* m; + double* n; + double* lastspike; + char* not_refractory; + double* inSynInSyn0; + unsigned int numNeurons; + +} +; +struct MergedNeuronSpikeQueueUpdateGroup0 + { + unsigned int* spkCnt; + +} +; +__device__ __constant__ MergedNeuronSpikeQueueUpdateGroup0 d_mergedNeuronSpikeQueueUpdateGroup0[3]; +void pushMergedNeuronSpikeQueueUpdateGroup0ToDevice(unsigned int idx, unsigned int* spkCnt) { + MergedNeuronSpikeQueueUpdateGroup0 group = {spkCnt, }; + CHECK_CUDA_ERRORS(cudaMemcpyToSymbolAsync(d_mergedNeuronSpikeQueueUpdateGroup0, &group, sizeof(MergedNeuronSpikeQueueUpdateGroup0), idx * sizeof(MergedNeuronSpikeQueueUpdateGroup0))); +} +__device__ __constant__ MergedNeuronUpdateGroup0 d_mergedNeuronUpdateGroup0[1]; +void pushMergedNeuronUpdateGroup0ToDevice(unsigned int idx, unsigned int* spkCnt, unsigned int* spk, unsigned int numNeurons) { + MergedNeuronUpdateGroup0 group = {spkCnt, spk, numNeurons, }; + CHECK_CUDA_ERRORS(cudaMemcpyToSymbolAsync(d_mergedNeuronUpdateGroup0, &group, sizeof(MergedNeuronUpdateGroup0), idx * sizeof(MergedNeuronUpdateGroup0))); +} +__device__ __constant__ MergedNeuronUpdateGroup1 d_mergedNeuronUpdateGroup1[1]; +void pushMergedNeuronUpdateGroup1ToDevice(unsigned int idx, unsigned int* spkCnt, unsigned int* spk, int32_t* i, double* V, double* g_eKC_eKC, double* g_iKC_eKC, double* h, double* m, double* n, double* lastspike, char* not_refractory, double* inSynInSyn0, double* inSynInSyn1, unsigned int numNeurons) { + MergedNeuronUpdateGroup1 group = {spkCnt, spk, i, V, g_eKC_eKC, g_iKC_eKC, h, m, n, lastspike, not_refractory, inSynInSyn0, inSynInSyn1, numNeurons, }; + CHECK_CUDA_ERRORS(cudaMemcpyToSymbolAsync(d_mergedNeuronUpdateGroup1, &group, sizeof(MergedNeuronUpdateGroup1), idx * sizeof(MergedNeuronUpdateGroup1))); +} +__device__ __constant__ MergedNeuronUpdateGroup2 d_mergedNeuronUpdateGroup2[1]; +void pushMergedNeuronUpdateGroup2ToDevice(unsigned int idx, unsigned int* spkCnt, unsigned int* spk, int32_t* i, double* V, double* g_PN_iKC, double* h, double* m, double* n, double* lastspike, char* not_refractory, double* inSynInSyn0, unsigned int numNeurons) { + MergedNeuronUpdateGroup2 group = {spkCnt, spk, i, V, g_PN_iKC, h, m, n, lastspike, not_refractory, inSynInSyn0, numNeurons, }; + CHECK_CUDA_ERRORS(cudaMemcpyToSymbolAsync(d_mergedNeuronUpdateGroup2, &group, sizeof(MergedNeuronUpdateGroup2), idx * sizeof(MergedNeuronUpdateGroup2))); +} +// ------------------------------------------------------------------------ +// merged extra global parameter functions +// ------------------------------------------------------------------------ +// ------------------------------------------------------------------------ +// merged extra global parameter functions +// ------------------------------------------------------------------------ +__device__ __constant__ unsigned int d_mergedNeuronUpdateGroupStartID0[] = {0, }; +__device__ __constant__ unsigned int d_mergedNeuronUpdateGroupStartID1[] = {128, }; +__device__ __constant__ unsigned int 
d_mergedNeuronUpdateGroupStartID2[] = {256, }; + +extern "C" __global__ void neuronSpikeQueueUpdateKernel() { + const unsigned int id = 32 * blockIdx.x + threadIdx.x; + if(id < 3) { + struct MergedNeuronSpikeQueueUpdateGroup0 *group = &d_mergedNeuronSpikeQueueUpdateGroup0[id - 0]; + group->spkCnt[0] = 0; + } +} + +extern "C" __global__ void updateNeuronsKernel(double t) + { + const unsigned int id = 32 * blockIdx.x + threadIdx.x; + __shared__ unsigned int shSpk[32]; + __shared__ unsigned int shPosSpk; + __shared__ unsigned int shSpkCount; + if (threadIdx.x == 0) { + shSpkCount = 0; + } + + __syncthreads(); + // merged0 + if(id < 128) { + struct MergedNeuronUpdateGroup0 *group = &d_mergedNeuronUpdateGroup0[0]; + const unsigned int lid = id - 0; + + if(lid < group->numNeurons) { + + // test whether spike condition was fulfilled previously + // calculate membrane potential + + // test for and register a true spike + if (0) { + const unsigned int spkIdx = atomicAdd(&shSpkCount, 1); + shSpk[spkIdx] = lid; + } + } + __syncthreads(); + if(threadIdx.x == 0) { + if (shSpkCount > 0) { + shPosSpk = atomicAdd(&group->spkCnt[0], shSpkCount); + } + } + __syncthreads(); + if(threadIdx.x < shSpkCount) { + const unsigned int n = shSpk[threadIdx.x]; + group->spk[shPosSpk + threadIdx.x] = n; + } + } + // merged1 + if(id >= 128 && id < 256) { + struct MergedNeuronUpdateGroup1 *group = &d_mergedNeuronUpdateGroup1[0]; + const unsigned int lid = id - 128; + + if(lid < group->numNeurons) { + int32_t li = group->i[lid]; + double lV = group->V[lid]; + double lg_eKC_eKC = group->g_eKC_eKC[lid]; + double lg_iKC_eKC = group->g_iKC_eKC[lid]; + double lh = group->h[lid]; + double lm = group->m[lid]; + double ln = group->n[lid]; + double llastspike = group->lastspike[lid]; + char lnot_refractory = group->not_refractory[lid]; + + double Isyn = 0; + { + // pull inSyn values in a coalesced access + double linSyn = group->inSynInSyn0[lid]; + Isyn += 0; lg_eKC_eKC += linSyn; linSyn= 0; + + group->inSynInSyn0[lid] = linSyn; + } + { + // pull inSyn values in a coalesced access + double linSyn = group->inSynInSyn1[lid]; + Isyn += 0; lg_iKC_eKC += linSyn; linSyn= 0; + + group->inSynInSyn1[lid] = linSyn; + } + using namespace NeuronUpdateSupportCode0; + // test whether spike condition was fulfilled previously + // calculate membrane potential + // Update "constant over DT" subexpressions (if any) + + + + // PoissonInputs targetting this group (if any) + + + + // Update state variables and the threshold condition + + lnot_refractory = lnot_refractory || (! 
(lV > (0.0 * (1.00000000000000002e-03)))); + double _BA_V = 1.0*(((((1.0*(((1.0 * (-9.50000000000000011e-02)) * (1.42999999999999986e-06)) * (_brian_pow(ln, 4.0)))/(2.99999999999999998e-10)) + (1.0*((((1.0 * (5.00000000000000028e-02)) * (7.15000000000000015e-06)) * lh) * (_brian_pow(lm, 3.0)))/(2.99999999999999998e-10))) + (1.0*((1.0 * (0.00000000000000000e+00)) * lg_iKC_eKC)/(2.99999999999999998e-10))) + (1.0*((1.0 * (-9.19999999999999984e-02)) * lg_eKC_eKC)/(2.99999999999999998e-10))) + (1.0*((1.0 * (-6.35600000000000054e-02)) * (2.67000000000000009e-08))/(2.99999999999999998e-10)))/(((((1.0*(((- 1.0) * (1.42999999999999986e-06)) * (_brian_pow(ln, 4.0)))/(2.99999999999999998e-10)) - (1.0*(((1.0 * (7.15000000000000015e-06)) * lh) * (_brian_pow(lm, 3.0)))/(2.99999999999999998e-10))) - (1.0*(1.0 * lg_eKC_eKC)/(2.99999999999999998e-10))) - (1.0*(1.0 * lg_iKC_eKC)/(2.99999999999999998e-10))) - (1.0*(1.0 * (2.67000000000000009e-08))/(2.99999999999999998e-10))); + double _V = (- _BA_V) + ((lV + _BA_V) * exp(DT * (((((1.0*(((- 1.0) * (1.42999999999999986e-06)) * (_brian_pow(ln, 4.0)))/(2.99999999999999998e-10)) - (1.0*(((1.0 * (7.15000000000000015e-06)) * lh) * (_brian_pow(lm, 3.0)))/(2.99999999999999998e-10))) - (1.0*(1.0 * lg_eKC_eKC)/(2.99999999999999998e-10))) - (1.0*(1.0 * lg_iKC_eKC)/(2.99999999999999998e-10))) - (1.0*(1.0 * (2.67000000000000009e-08))/(2.99999999999999998e-10))))); + double _g_eKC_eKC = lg_eKC_eKC * exp(1.0*(- DT)/(5.00000000000000010e-03)); + double _g_iKC_eKC = lg_iKC_eKC * exp(1.0*(- DT)/(1.00000000000000002e-02)); + double _BA_h = 1.0*((0.329137207652868 * exp(1.0*((- 0.0555555555555556) * lV)/(1.00000000000000002e-03))) * exp(1.0*(0.0555555555555556 * (-6.30000000000000004e-02))/(1.00000000000000002e-03)))/((1.00000000000000002e-03) * ((1.0*(- 4.0)/((1.00000000000000002e-03) + (((2980.95798704173 * (1.00000000000000002e-03)) * exp(1.0*((- 0.2) * lV)/(1.00000000000000002e-03))) * exp(1.0*(0.2 * (-6.30000000000000004e-02))/(1.00000000000000002e-03))))) - (1.0*((0.329137207652868 * exp(1.0*((- 0.0555555555555556) * lV)/(1.00000000000000002e-03))) * exp(1.0*(0.0555555555555556 * (-6.30000000000000004e-02))/(1.00000000000000002e-03)))/(1.00000000000000002e-03)))); + double _h = (- _BA_h) + ((_BA_h + lh) * exp(DT * ((1.0*(- 4.0)/((1.00000000000000002e-03) + (((2980.95798704173 * (1.00000000000000002e-03)) * exp(1.0*((- 0.2) * lV)/(1.00000000000000002e-03))) * exp(1.0*(0.2 * (-6.30000000000000004e-02))/(1.00000000000000002e-03))))) - (1.0*((0.329137207652868 * exp(1.0*((- 0.0555555555555556) * lV)/(1.00000000000000002e-03))) * exp(1.0*(0.0555555555555556 * (-6.30000000000000004e-02))/(1.00000000000000002e-03)))/(1.00000000000000002e-03))))); + double _BA_m = 1.0*(((1.0*((- 0.32) * lV)/((((- 1.0) * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03)) + ((((25.7903399171931 * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03)) * exp(1.0*((- 0.25) * lV)/(1.00000000000000002e-03))) * exp(1.0*(0.25 * (-6.30000000000000004e-02))/(1.00000000000000002e-03))))) + (1.0*(0.32 * (-6.30000000000000004e-02))/((((- 1.0) * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03)) + ((((25.7903399171931 * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03)) * exp(1.0*((- 0.25) * lV)/(1.00000000000000002e-03))) * exp(1.0*(0.25 * (-6.30000000000000004e-02))/(1.00000000000000002e-03)))))) + (1.0*(4.16 * (1.00000000000000002e-03))/((((- 1.0) * (_brian_pow((1.00000000000000002e-03), 1.0))) * 
(1.00000000000000002e-03)) + ((((25.7903399171931 * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03)) * exp(1.0*((- 0.25) * lV)/(1.00000000000000002e-03))) * exp(1.0*(0.25 * (-6.30000000000000004e-02))/(1.00000000000000002e-03))))))/((((((1.0*((- 0.28) * lV)/(((((0.000335462627902512 * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03)) * exp(1.0*(0.2 * lV)/(1.00000000000000002e-03))) * exp(1.0*((- 0.2) * (-6.30000000000000004e-02))/(1.00000000000000002e-03))) - ((1.0 * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03)))) + (1.0*(0.32 * lV)/((((- 1.0) * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03)) + ((((25.7903399171931 * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03)) * exp(1.0*((- 0.25) * lV)/(1.00000000000000002e-03))) * exp(1.0*(0.25 * (-6.30000000000000004e-02))/(1.00000000000000002e-03)))))) + (1.0*(0.28 * (-6.30000000000000004e-02))/(((((0.000335462627902512 * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03)) * exp(1.0*(0.2 * lV)/(1.00000000000000002e-03))) * exp(1.0*((- 0.2) * (-6.30000000000000004e-02))/(1.00000000000000002e-03))) - ((1.0 * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03))))) - (1.0*(0.32 * (-6.30000000000000004e-02))/((((- 1.0) * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03)) + ((((25.7903399171931 * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03)) * exp(1.0*((- 0.25) * lV)/(1.00000000000000002e-03))) * exp(1.0*(0.25 * (-6.30000000000000004e-02))/(1.00000000000000002e-03)))))) + (1.0*(11.2 * (1.00000000000000002e-03))/(((((0.000335462627902512 * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03)) * exp(1.0*(0.2 * lV)/(1.00000000000000002e-03))) * exp(1.0*((- 0.2) * (-6.30000000000000004e-02))/(1.00000000000000002e-03))) - ((1.0 * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03))))) - (1.0*(4.16 * (1.00000000000000002e-03))/((((- 1.0) * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03)) + ((((25.7903399171931 * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03)) * exp(1.0*((- 0.25) * lV)/(1.00000000000000002e-03))) * exp(1.0*(0.25 * (-6.30000000000000004e-02))/(1.00000000000000002e-03)))))); + double _m = (- _BA_m) + ((_BA_m + lm) * exp(DT * ((((((1.0*((- 0.28) * lV)/(((((0.000335462627902512 * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03)) * exp(1.0*(0.2 * lV)/(1.00000000000000002e-03))) * exp(1.0*((- 0.2) * (-6.30000000000000004e-02))/(1.00000000000000002e-03))) - ((1.0 * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03)))) + (1.0*(0.32 * lV)/((((- 1.0) * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03)) + ((((25.7903399171931 * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03)) * exp(1.0*((- 0.25) * lV)/(1.00000000000000002e-03))) * exp(1.0*(0.25 * (-6.30000000000000004e-02))/(1.00000000000000002e-03)))))) + (1.0*(0.28 * (-6.30000000000000004e-02))/(((((0.000335462627902512 * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03)) * exp(1.0*(0.2 * lV)/(1.00000000000000002e-03))) * exp(1.0*((- 0.2) * (-6.30000000000000004e-02))/(1.00000000000000002e-03))) - ((1.0 * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03))))) - (1.0*(0.32 * (-6.30000000000000004e-02))/((((- 1.0) * 
(_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03)) + ((((25.7903399171931 * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03)) * exp(1.0*((- 0.25) * lV)/(1.00000000000000002e-03))) * exp(1.0*(0.25 * (-6.30000000000000004e-02))/(1.00000000000000002e-03)))))) + (1.0*(11.2 * (1.00000000000000002e-03))/(((((0.000335462627902512 * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03)) * exp(1.0*(0.2 * lV)/(1.00000000000000002e-03))) * exp(1.0*((- 0.2) * (-6.30000000000000004e-02))/(1.00000000000000002e-03))) - ((1.0 * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03))))) - (1.0*(4.16 * (1.00000000000000002e-03))/((((- 1.0) * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03)) + ((((25.7903399171931 * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03)) * exp(1.0*((- 0.25) * lV)/(1.00000000000000002e-03))) * exp(1.0*(0.25 * (-6.30000000000000004e-02))/(1.00000000000000002e-03)))))))); + double _BA_n = 1.0*(((1.0*((- 0.032) * lV)/((((- 1.0) * (1.00000000000000002e-03)) * (1.00000000000000002e-03)) + ((((20.0855369231877 * (1.00000000000000002e-03)) * (1.00000000000000002e-03)) * exp(1.0*((- 0.2) * lV)/(1.00000000000000002e-03))) * exp(1.0*(0.2 * (-6.30000000000000004e-02))/(1.00000000000000002e-03))))) + (1.0*(0.032 * (-6.30000000000000004e-02))/((((- 1.0) * (1.00000000000000002e-03)) * (1.00000000000000002e-03)) + ((((20.0855369231877 * (1.00000000000000002e-03)) * (1.00000000000000002e-03)) * exp(1.0*((- 0.2) * lV)/(1.00000000000000002e-03))) * exp(1.0*(0.2 * (-6.30000000000000004e-02))/(1.00000000000000002e-03)))))) + (1.0*(0.48 * (1.00000000000000002e-03))/((((- 1.0) * (1.00000000000000002e-03)) * (1.00000000000000002e-03)) + ((((20.0855369231877 * (1.00000000000000002e-03)) * (1.00000000000000002e-03)) * exp(1.0*((- 0.2) * lV)/(1.00000000000000002e-03))) * exp(1.0*(0.2 * (-6.30000000000000004e-02))/(1.00000000000000002e-03))))))/((((1.0*(0.032 * lV)/((((- 1.0) * (1.00000000000000002e-03)) * (1.00000000000000002e-03)) + ((((20.0855369231877 * (1.00000000000000002e-03)) * (1.00000000000000002e-03)) * exp(1.0*((- 0.2) * lV)/(1.00000000000000002e-03))) * exp(1.0*(0.2 * (-6.30000000000000004e-02))/(1.00000000000000002e-03))))) - (1.0*(0.032 * (-6.30000000000000004e-02))/((((- 1.0) * (1.00000000000000002e-03)) * (1.00000000000000002e-03)) + ((((20.0855369231877 * (1.00000000000000002e-03)) * (1.00000000000000002e-03)) * exp(1.0*((- 0.2) * lV)/(1.00000000000000002e-03))) * exp(1.0*(0.2 * (-6.30000000000000004e-02))/(1.00000000000000002e-03)))))) - (1.0*(0.48 * (1.00000000000000002e-03))/((((- 1.0) * (1.00000000000000002e-03)) * (1.00000000000000002e-03)) + ((((20.0855369231877 * (1.00000000000000002e-03)) * (1.00000000000000002e-03)) * exp(1.0*((- 0.2) * lV)/(1.00000000000000002e-03))) * exp(1.0*(0.2 * (-6.30000000000000004e-02))/(1.00000000000000002e-03)))))) - (1.0*((0.642012708343871 * exp(1.0*((- 0.025) * lV)/(1.00000000000000002e-03))) * exp(1.0*(0.025 * (-6.30000000000000004e-02))/(1.00000000000000002e-03)))/(1.00000000000000002e-03))); + double _n = (- _BA_n) + ((_BA_n + ln) * exp(DT * ((((1.0*(0.032 * lV)/((((- 1.0) * (1.00000000000000002e-03)) * (1.00000000000000002e-03)) + ((((20.0855369231877 * (1.00000000000000002e-03)) * (1.00000000000000002e-03)) * exp(1.0*((- 0.2) * lV)/(1.00000000000000002e-03))) * exp(1.0*(0.2 * (-6.30000000000000004e-02))/(1.00000000000000002e-03))))) - (1.0*(0.032 * (-6.30000000000000004e-02))/((((- 1.0) * 
(1.00000000000000002e-03)) * (1.00000000000000002e-03)) + ((((20.0855369231877 * (1.00000000000000002e-03)) * (1.00000000000000002e-03)) * exp(1.0*((- 0.2) * lV)/(1.00000000000000002e-03))) * exp(1.0*(0.2 * (-6.30000000000000004e-02))/(1.00000000000000002e-03)))))) - (1.0*(0.48 * (1.00000000000000002e-03))/((((- 1.0) * (1.00000000000000002e-03)) * (1.00000000000000002e-03)) + ((((20.0855369231877 * (1.00000000000000002e-03)) * (1.00000000000000002e-03)) * exp(1.0*((- 0.2) * lV)/(1.00000000000000002e-03))) * exp(1.0*(0.2 * (-6.30000000000000004e-02))/(1.00000000000000002e-03)))))) - (1.0*((0.642012708343871 * exp(1.0*((- 0.025) * lV)/(1.00000000000000002e-03))) * exp(1.0*(0.025 * (-6.30000000000000004e-02))/(1.00000000000000002e-03)))/(1.00000000000000002e-03))))); + lV = _V; + lg_eKC_eKC = _g_eKC_eKC; + lg_iKC_eKC = _g_iKC_eKC; + lh = _h; + lm = _m; + ln = _n; + char _cond = (lV > (0.0 * (1.00000000000000002e-03))) && lnot_refractory; + // test for and register a true spike + if (_cond) { + const unsigned int spkIdx = atomicAdd(&shSpkCount, 1); + shSpk[spkIdx] = lid; + // spike reset code + llastspike = t; + lnot_refractory = false; + } + group->i[lid] = li; + group->V[lid] = lV; + group->g_eKC_eKC[lid] = lg_eKC_eKC; + group->g_iKC_eKC[lid] = lg_iKC_eKC; + group->h[lid] = lh; + group->m[lid] = lm; + group->n[lid] = ln; + group->lastspike[lid] = llastspike; + group->not_refractory[lid] = lnot_refractory; + } + __syncthreads(); + if(threadIdx.x == 0) { + if (shSpkCount > 0) { + shPosSpk = atomicAdd(&group->spkCnt[0], shSpkCount); + } + } + __syncthreads(); + if(threadIdx.x < shSpkCount) { + const unsigned int n = shSpk[threadIdx.x]; + group->spk[shPosSpk + threadIdx.x] = n; + } + } + // merged2 + if(id >= 256 && id < 2784) { + struct MergedNeuronUpdateGroup2 *group = &d_mergedNeuronUpdateGroup2[0]; + const unsigned int lid = id - 256; + + if(lid < group->numNeurons) { + int32_t li = group->i[lid]; + double lV = group->V[lid]; + double lg_PN_iKC = group->g_PN_iKC[lid]; + double lh = group->h[lid]; + double lm = group->m[lid]; + double ln = group->n[lid]; + double llastspike = group->lastspike[lid]; + char lnot_refractory = group->not_refractory[lid]; + + double Isyn = 0; + { + // pull inSyn values in a coalesced access + double linSyn = group->inSynInSyn0[lid]; + Isyn += 0; lg_PN_iKC += linSyn; linSyn= 0; + + group->inSynInSyn0[lid] = linSyn; + } + using namespace NeuronUpdateSupportCode0; + // test whether spike condition was fulfilled previously + // calculate membrane potential + // Update "constant over DT" subexpressions (if any) + + + + // PoissonInputs targetting this group (if any) + + + + // Update state variables and the threshold condition + + lnot_refractory = lnot_refractory || (! 
(lV > (0 * (1.00000000000000002e-03)))); + double _BA_V = 1.0*((((1.0*(((1.0 * (-9.50000000000000011e-02)) * (1.42999999999999986e-06)) * (_brian_pow(ln, 4.0)))/(2.99999999999999998e-10)) + (1.0*((((1.0 * (5.00000000000000028e-02)) * (7.15000000000000015e-06)) * lh) * (_brian_pow(lm, 3.0)))/(2.99999999999999998e-10))) + (1.0*((1.0 * (0.00000000000000000e+00)) * lg_PN_iKC)/(2.99999999999999998e-10))) + (1.0*((1.0 * (-6.35600000000000054e-02)) * (2.67000000000000009e-08))/(2.99999999999999998e-10)))/((((1.0*(((- 1.0) * (1.42999999999999986e-06)) * (_brian_pow(ln, 4.0)))/(2.99999999999999998e-10)) - (1.0*(((1.0 * (7.15000000000000015e-06)) * lh) * (_brian_pow(lm, 3.0)))/(2.99999999999999998e-10))) - (1.0*(1.0 * lg_PN_iKC)/(2.99999999999999998e-10))) - (1.0*(1.0 * (2.67000000000000009e-08))/(2.99999999999999998e-10))); + double _V = (- _BA_V) + ((lV + _BA_V) * exp(DT * ((((1.0*(((- 1.0) * (1.42999999999999986e-06)) * (_brian_pow(ln, 4.0)))/(2.99999999999999998e-10)) - (1.0*(((1.0 * (7.15000000000000015e-06)) * lh) * (_brian_pow(lm, 3.0)))/(2.99999999999999998e-10))) - (1.0*(1.0 * lg_PN_iKC)/(2.99999999999999998e-10))) - (1.0*(1.0 * (2.67000000000000009e-08))/(2.99999999999999998e-10))))); + double _g_PN_iKC = lg_PN_iKC * exp(1.0*(- DT)/(2.00000000000000004e-03)); + double _BA_h = 1.0*((0.329137207652868 * exp(1.0*((- 0.0555555555555556) * lV)/(1.00000000000000002e-03))) * exp(1.0*(0.0555555555555556 * (-6.30000000000000004e-02))/(1.00000000000000002e-03)))/((1.00000000000000002e-03) * ((1.0*(- 4.0)/((1.00000000000000002e-03) + (((2980.95798704173 * (1.00000000000000002e-03)) * exp(1.0*((- 0.2) * lV)/(1.00000000000000002e-03))) * exp(1.0*(0.2 * (-6.30000000000000004e-02))/(1.00000000000000002e-03))))) - (1.0*((0.329137207652868 * exp(1.0*((- 0.0555555555555556) * lV)/(1.00000000000000002e-03))) * exp(1.0*(0.0555555555555556 * (-6.30000000000000004e-02))/(1.00000000000000002e-03)))/(1.00000000000000002e-03)))); + double _h = (- _BA_h) + ((_BA_h + lh) * exp(DT * ((1.0*(- 4.0)/((1.00000000000000002e-03) + (((2980.95798704173 * (1.00000000000000002e-03)) * exp(1.0*((- 0.2) * lV)/(1.00000000000000002e-03))) * exp(1.0*(0.2 * (-6.30000000000000004e-02))/(1.00000000000000002e-03))))) - (1.0*((0.329137207652868 * exp(1.0*((- 0.0555555555555556) * lV)/(1.00000000000000002e-03))) * exp(1.0*(0.0555555555555556 * (-6.30000000000000004e-02))/(1.00000000000000002e-03)))/(1.00000000000000002e-03))))); + double _BA_m = 1.0*(((1.0*((- 0.32) * lV)/((((- 1.0) * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03)) + ((((25.7903399171931 * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03)) * exp(1.0*((- 0.25) * lV)/(1.00000000000000002e-03))) * exp(1.0*(0.25 * (-6.30000000000000004e-02))/(1.00000000000000002e-03))))) + (1.0*(0.32 * (-6.30000000000000004e-02))/((((- 1.0) * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03)) + ((((25.7903399171931 * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03)) * exp(1.0*((- 0.25) * lV)/(1.00000000000000002e-03))) * exp(1.0*(0.25 * (-6.30000000000000004e-02))/(1.00000000000000002e-03)))))) + (1.0*(4.16 * (1.00000000000000002e-03))/((((- 1.0) * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03)) + ((((25.7903399171931 * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03)) * exp(1.0*((- 0.25) * lV)/(1.00000000000000002e-03))) * exp(1.0*(0.25 * (-6.30000000000000004e-02))/(1.00000000000000002e-03))))))/((((((1.0*((- 0.28) * 
lV)/(((((0.000335462627902512 * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03)) * exp(1.0*(0.2 * lV)/(1.00000000000000002e-03))) * exp(1.0*((- 0.2) * (-6.30000000000000004e-02))/(1.00000000000000002e-03))) - ((1.0 * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03)))) + (1.0*(0.32 * lV)/((((- 1.0) * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03)) + ((((25.7903399171931 * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03)) * exp(1.0*((- 0.25) * lV)/(1.00000000000000002e-03))) * exp(1.0*(0.25 * (-6.30000000000000004e-02))/(1.00000000000000002e-03)))))) + (1.0*(0.28 * (-6.30000000000000004e-02))/(((((0.000335462627902512 * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03)) * exp(1.0*(0.2 * lV)/(1.00000000000000002e-03))) * exp(1.0*((- 0.2) * (-6.30000000000000004e-02))/(1.00000000000000002e-03))) - ((1.0 * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03))))) - (1.0*(0.32 * (-6.30000000000000004e-02))/((((- 1.0) * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03)) + ((((25.7903399171931 * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03)) * exp(1.0*((- 0.25) * lV)/(1.00000000000000002e-03))) * exp(1.0*(0.25 * (-6.30000000000000004e-02))/(1.00000000000000002e-03)))))) + (1.0*(11.2 * (1.00000000000000002e-03))/(((((0.000335462627902512 * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03)) * exp(1.0*(0.2 * lV)/(1.00000000000000002e-03))) * exp(1.0*((- 0.2) * (-6.30000000000000004e-02))/(1.00000000000000002e-03))) - ((1.0 * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03))))) - (1.0*(4.16 * (1.00000000000000002e-03))/((((- 1.0) * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03)) + ((((25.7903399171931 * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03)) * exp(1.0*((- 0.25) * lV)/(1.00000000000000002e-03))) * exp(1.0*(0.25 * (-6.30000000000000004e-02))/(1.00000000000000002e-03)))))); + double _m = (- _BA_m) + ((_BA_m + lm) * exp(DT * ((((((1.0*((- 0.28) * lV)/(((((0.000335462627902512 * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03)) * exp(1.0*(0.2 * lV)/(1.00000000000000002e-03))) * exp(1.0*((- 0.2) * (-6.30000000000000004e-02))/(1.00000000000000002e-03))) - ((1.0 * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03)))) + (1.0*(0.32 * lV)/((((- 1.0) * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03)) + ((((25.7903399171931 * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03)) * exp(1.0*((- 0.25) * lV)/(1.00000000000000002e-03))) * exp(1.0*(0.25 * (-6.30000000000000004e-02))/(1.00000000000000002e-03)))))) + (1.0*(0.28 * (-6.30000000000000004e-02))/(((((0.000335462627902512 * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03)) * exp(1.0*(0.2 * lV)/(1.00000000000000002e-03))) * exp(1.0*((- 0.2) * (-6.30000000000000004e-02))/(1.00000000000000002e-03))) - ((1.0 * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03))))) - (1.0*(0.32 * (-6.30000000000000004e-02))/((((- 1.0) * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03)) + ((((25.7903399171931 * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03)) * exp(1.0*((- 0.25) * lV)/(1.00000000000000002e-03))) * exp(1.0*(0.25 * (-6.30000000000000004e-02))/(1.00000000000000002e-03)))))) 
+ (1.0*(11.2 * (1.00000000000000002e-03))/(((((0.000335462627902512 * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03)) * exp(1.0*(0.2 * lV)/(1.00000000000000002e-03))) * exp(1.0*((- 0.2) * (-6.30000000000000004e-02))/(1.00000000000000002e-03))) - ((1.0 * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03))))) - (1.0*(4.16 * (1.00000000000000002e-03))/((((- 1.0) * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03)) + ((((25.7903399171931 * (_brian_pow((1.00000000000000002e-03), 1.0))) * (1.00000000000000002e-03)) * exp(1.0*((- 0.25) * lV)/(1.00000000000000002e-03))) * exp(1.0*(0.25 * (-6.30000000000000004e-02))/(1.00000000000000002e-03)))))))); + double _BA_n = 1.0*(((1.0*((- 0.032) * lV)/((((- 1.0) * (1.00000000000000002e-03)) * (1.00000000000000002e-03)) + ((((20.0855369231877 * (1.00000000000000002e-03)) * (1.00000000000000002e-03)) * exp(1.0*((- 0.2) * lV)/(1.00000000000000002e-03))) * exp(1.0*(0.2 * (-6.30000000000000004e-02))/(1.00000000000000002e-03))))) + (1.0*(0.032 * (-6.30000000000000004e-02))/((((- 1.0) * (1.00000000000000002e-03)) * (1.00000000000000002e-03)) + ((((20.0855369231877 * (1.00000000000000002e-03)) * (1.00000000000000002e-03)) * exp(1.0*((- 0.2) * lV)/(1.00000000000000002e-03))) * exp(1.0*(0.2 * (-6.30000000000000004e-02))/(1.00000000000000002e-03)))))) + (1.0*(0.48 * (1.00000000000000002e-03))/((((- 1.0) * (1.00000000000000002e-03)) * (1.00000000000000002e-03)) + ((((20.0855369231877 * (1.00000000000000002e-03)) * (1.00000000000000002e-03)) * exp(1.0*((- 0.2) * lV)/(1.00000000000000002e-03))) * exp(1.0*(0.2 * (-6.30000000000000004e-02))/(1.00000000000000002e-03))))))/((((1.0*(0.032 * lV)/((((- 1.0) * (1.00000000000000002e-03)) * (1.00000000000000002e-03)) + ((((20.0855369231877 * (1.00000000000000002e-03)) * (1.00000000000000002e-03)) * exp(1.0*((- 0.2) * lV)/(1.00000000000000002e-03))) * exp(1.0*(0.2 * (-6.30000000000000004e-02))/(1.00000000000000002e-03))))) - (1.0*(0.032 * (-6.30000000000000004e-02))/((((- 1.0) * (1.00000000000000002e-03)) * (1.00000000000000002e-03)) + ((((20.0855369231877 * (1.00000000000000002e-03)) * (1.00000000000000002e-03)) * exp(1.0*((- 0.2) * lV)/(1.00000000000000002e-03))) * exp(1.0*(0.2 * (-6.30000000000000004e-02))/(1.00000000000000002e-03)))))) - (1.0*(0.48 * (1.00000000000000002e-03))/((((- 1.0) * (1.00000000000000002e-03)) * (1.00000000000000002e-03)) + ((((20.0855369231877 * (1.00000000000000002e-03)) * (1.00000000000000002e-03)) * exp(1.0*((- 0.2) * lV)/(1.00000000000000002e-03))) * exp(1.0*(0.2 * (-6.30000000000000004e-02))/(1.00000000000000002e-03)))))) - (1.0*((0.642012708343871 * exp(1.0*((- 0.025) * lV)/(1.00000000000000002e-03))) * exp(1.0*(0.025 * (-6.30000000000000004e-02))/(1.00000000000000002e-03)))/(1.00000000000000002e-03))); + double _n = (- _BA_n) + ((_BA_n + ln) * exp(DT * ((((1.0*(0.032 * lV)/((((- 1.0) * (1.00000000000000002e-03)) * (1.00000000000000002e-03)) + ((((20.0855369231877 * (1.00000000000000002e-03)) * (1.00000000000000002e-03)) * exp(1.0*((- 0.2) * lV)/(1.00000000000000002e-03))) * exp(1.0*(0.2 * (-6.30000000000000004e-02))/(1.00000000000000002e-03))))) - (1.0*(0.032 * (-6.30000000000000004e-02))/((((- 1.0) * (1.00000000000000002e-03)) * (1.00000000000000002e-03)) + ((((20.0855369231877 * (1.00000000000000002e-03)) * (1.00000000000000002e-03)) * exp(1.0*((- 0.2) * lV)/(1.00000000000000002e-03))) * exp(1.0*(0.2 * (-6.30000000000000004e-02))/(1.00000000000000002e-03)))))) - (1.0*(0.48 * (1.00000000000000002e-03))/((((- 1.0) * 
(1.00000000000000002e-03)) * (1.00000000000000002e-03)) + ((((20.0855369231877 * (1.00000000000000002e-03)) * (1.00000000000000002e-03)) * exp(1.0*((- 0.2) * lV)/(1.00000000000000002e-03))) * exp(1.0*(0.2 * (-6.30000000000000004e-02))/(1.00000000000000002e-03)))))) - (1.0*((0.642012708343871 * exp(1.0*((- 0.025) * lV)/(1.00000000000000002e-03))) * exp(1.0*(0.025 * (-6.30000000000000004e-02))/(1.00000000000000002e-03)))/(1.00000000000000002e-03))))); + lV = _V; + lg_PN_iKC = _g_PN_iKC; + lh = _h; + lm = _m; + ln = _n; + char _cond = (lV > (0 * (1.00000000000000002e-03))) && lnot_refractory; + // test for and register a true spike + if (_cond) { + const unsigned int spkIdx = atomicAdd(&shSpkCount, 1); + shSpk[spkIdx] = lid; + // spike reset code + llastspike = t; + lnot_refractory = false; + } + group->i[lid] = li; + group->V[lid] = lV; + group->g_PN_iKC[lid] = lg_PN_iKC; + group->h[lid] = lh; + group->m[lid] = lm; + group->n[lid] = ln; + group->lastspike[lid] = llastspike; + group->not_refractory[lid] = lnot_refractory; + } + __syncthreads(); + if(threadIdx.x == 0) { + if (shSpkCount > 0) { + shPosSpk = atomicAdd(&group->spkCnt[0], shSpkCount); + } + } + __syncthreads(); + if(threadIdx.x < shSpkCount) { + const unsigned int n = shSpk[threadIdx.x]; + group->spk[shPosSpk + threadIdx.x] = n; + } + } +} +void updateNeurons(double t) { + { + const dim3 threads(32, 1); + const dim3 grid(1, 1); + neuronSpikeQueueUpdateKernel<<<grid, threads>>>(); + CHECK_CUDA_ERRORS(cudaPeekAtLastError()); + } + { + const dim3 threads(32, 1); + const dim3 grid(87, 1); + updateNeuronsKernel<<<grid, threads>>>(t); + CHECK_CUDA_ERRORS(cudaPeekAtLastError()); + } +} diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/neuronUpdate.d b/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/neuronUpdate.d new file mode 100644 index 00000000..f68fed67 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/neuronUpdate.d @@ -0,0 +1,264 @@ +neuronUpdate.o : neuronUpdate.cc \ + /usr/include/stdc-predef.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/cuda_runtime.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/host_config.h \ + /usr/include/features.h \ + /usr/include/x86_64-linux-gnu/sys/cdefs.h \ + /usr/include/x86_64-linux-gnu/bits/wordsize.h \ + /usr/include/x86_64-linux-gnu/bits/long-double.h \ + /usr/include/x86_64-linux-gnu/gnu/stubs.h \ + /usr/include/x86_64-linux-gnu/gnu/stubs-64.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/builtin_types.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/device_types.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/host_defines.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/driver_types.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/vector_types.h \ + /usr/lib/gcc/x86_64-linux-gnu/9/include/limits.h \ + /usr/lib/gcc/x86_64-linux-gnu/9/include/syslimits.h \ + /usr/include/limits.h \ + /usr/include/x86_64-linux-gnu/bits/libc-header-start.h \ + /usr/include/x86_64-linux-gnu/bits/posix1_lim.h \ + /usr/include/x86_64-linux-gnu/bits/local_lim.h \ + /usr/include/linux/limits.h \ + /usr/include/x86_64-linux-gnu/bits/posix2_lim.h \ + /usr/include/x86_64-linux-gnu/bits/xopen_lim.h \ + /usr/include/x86_64-linux-gnu/bits/uio_lim.h \ + 
/usr/lib/gcc/x86_64-linux-gnu/9/include/stddef.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/surface_types.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/texture_types.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/library_types.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/channel_descriptor.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/cuda_runtime_api.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/cuda_device_runtime_api.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/driver_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/vector_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/vector_functions.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/common_functions.h \ + /usr/include/string.h \ + /usr/include/x86_64-linux-gnu/bits/types/locale_t.h \ + /usr/include/x86_64-linux-gnu/bits/types/__locale_t.h \ + /usr/include/strings.h \ + /usr/include/time.h \ + /usr/include/x86_64-linux-gnu/bits/time.h \ + /usr/include/x86_64-linux-gnu/bits/types.h \ + /usr/include/x86_64-linux-gnu/bits/timesize.h \ + /usr/include/x86_64-linux-gnu/bits/typesizes.h \ + /usr/include/x86_64-linux-gnu/bits/time64.h \ + /usr/include/x86_64-linux-gnu/bits/timex.h \ + /usr/include/x86_64-linux-gnu/bits/types/struct_timeval.h \ + /usr/include/x86_64-linux-gnu/bits/types/clock_t.h \ + /usr/include/x86_64-linux-gnu/bits/types/time_t.h \ + /usr/include/x86_64-linux-gnu/bits/types/struct_tm.h \ + /usr/include/x86_64-linux-gnu/bits/types/struct_timespec.h \ + /usr/include/x86_64-linux-gnu/bits/endian.h \ + /usr/include/x86_64-linux-gnu/bits/endianness.h \ + /usr/include/x86_64-linux-gnu/bits/types/clockid_t.h \ + /usr/include/x86_64-linux-gnu/bits/types/timer_t.h \ + /usr/include/x86_64-linux-gnu/bits/types/struct_itimerspec.h \ + /usr/include/c++/9/new \ + /usr/include/x86_64-linux-gnu/c++/9/bits/c++config.h \ + /usr/include/x86_64-linux-gnu/c++/9/bits/os_defines.h \ + /usr/include/x86_64-linux-gnu/c++/9/bits/cpu_defines.h \ + /usr/include/c++/9/exception \ + /usr/include/c++/9/bits/exception.h \ + /usr/include/c++/9/bits/exception_ptr.h \ + /usr/include/c++/9/bits/exception_defines.h \ + /usr/include/c++/9/bits/cxxabi_init_exception.h \ + /usr/include/c++/9/typeinfo \ + /usr/include/c++/9/bits/hash_bytes.h \ + /usr/include/c++/9/bits/nested_exception.h \ + /usr/include/c++/9/bits/move.h \ + /usr/include/c++/9/bits/concept_check.h \ + /usr/include/c++/9/type_traits \ + /usr/include/stdio.h \ + /usr/lib/gcc/x86_64-linux-gnu/9/include/stdarg.h \ + /usr/include/x86_64-linux-gnu/bits/types/__fpos_t.h \ + /usr/include/x86_64-linux-gnu/bits/types/__mbstate_t.h \ + /usr/include/x86_64-linux-gnu/bits/types/__fpos64_t.h \ + /usr/include/x86_64-linux-gnu/bits/types/__FILE.h \ + /usr/include/x86_64-linux-gnu/bits/types/FILE.h \ + /usr/include/x86_64-linux-gnu/bits/types/struct_FILE.h \ + /usr/include/x86_64-linux-gnu/bits/types/cookie_io_functions_t.h \ + /usr/include/x86_64-linux-gnu/bits/stdio_lim.h \ + /usr/include/x86_64-linux-gnu/bits/sys_errlist.h \ + /usr/include/c++/9/stdlib.h \ + /usr/include/c++/9/cstdlib \ + /usr/include/stdlib.h \ + /usr/include/x86_64-linux-gnu/bits/waitflags.h \ + /usr/include/x86_64-linux-gnu/bits/waitstatus.h \ + /usr/include/x86_64-linux-gnu/bits/floatn.h \ + 
/usr/include/x86_64-linux-gnu/bits/floatn-common.h \ + /usr/include/x86_64-linux-gnu/sys/types.h \ + /usr/include/x86_64-linux-gnu/bits/stdint-intn.h \ + /usr/include/endian.h \ + /usr/include/x86_64-linux-gnu/bits/byteswap.h \ + /usr/include/x86_64-linux-gnu/bits/uintn-identity.h \ + /usr/include/x86_64-linux-gnu/sys/select.h \ + /usr/include/x86_64-linux-gnu/bits/select.h \ + /usr/include/x86_64-linux-gnu/bits/types/sigset_t.h \ + /usr/include/x86_64-linux-gnu/bits/types/__sigset_t.h \ + /usr/include/x86_64-linux-gnu/bits/pthreadtypes.h \ + /usr/include/x86_64-linux-gnu/bits/thread-shared-types.h \ + /usr/include/x86_64-linux-gnu/bits/pthreadtypes-arch.h \ + /usr/include/x86_64-linux-gnu/bits/struct_mutex.h \ + /usr/include/x86_64-linux-gnu/bits/struct_rwlock.h \ + /usr/include/alloca.h \ + /usr/include/x86_64-linux-gnu/bits/stdlib-float.h \ + /usr/include/c++/9/bits/std_abs.h \ + /usr/include/assert.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/math_functions.h \ + /usr/include/c++/9/math.h \ + /usr/include/c++/9/cmath \ + /usr/include/c++/9/bits/cpp_type_traits.h \ + /usr/include/c++/9/ext/type_traits.h \ + /usr/include/math.h \ + /usr/include/x86_64-linux-gnu/bits/math-vector.h \ + /usr/include/x86_64-linux-gnu/bits/libm-simd-decl-stubs.h \ + /usr/include/x86_64-linux-gnu/bits/flt-eval-method.h \ + /usr/include/x86_64-linux-gnu/bits/fp-logb.h \ + /usr/include/x86_64-linux-gnu/bits/fp-fast.h \ + /usr/include/x86_64-linux-gnu/bits/mathcalls-helper-functions.h \ + /usr/include/x86_64-linux-gnu/bits/mathcalls.h \ + /usr/include/x86_64-linux-gnu/bits/mathcalls-narrow.h \ + /usr/include/x86_64-linux-gnu/bits/iscanonical.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/math_functions.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/cuda_surface_types.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/cuda_texture_types.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/device_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/device_functions.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/device_atomic_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/device_atomic_functions.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/device_double_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/device_double_functions.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_20_atomic_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_20_atomic_functions.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_32_atomic_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_32_atomic_functions.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_35_atomic_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_60_atomic_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_60_atomic_functions.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_20_intrinsics.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_20_intrinsics.hpp \ + 
/cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_30_intrinsics.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_30_intrinsics.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_32_intrinsics.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_32_intrinsics.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_35_intrinsics.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_61_intrinsics.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_61_intrinsics.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/sm_70_rt.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/sm_70_rt.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/sm_80_rt.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/sm_80_rt.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/surface_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/texture_fetch_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/texture_indirect_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/surface_indirect_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/device_launch_parameters.h \ + definitionsInternal.h \ + definitions.h \ + /usr/include/c++/9/random \ + /usr/include/c++/9/string \ + /usr/include/c++/9/bits/stringfwd.h \ + /usr/include/c++/9/bits/memoryfwd.h \ + /usr/include/c++/9/bits/char_traits.h \ + /usr/include/c++/9/bits/stl_algobase.h \ + /usr/include/c++/9/bits/functexcept.h \ + /usr/include/c++/9/ext/numeric_traits.h \ + /usr/include/c++/9/bits/stl_pair.h \ + /usr/include/c++/9/bits/stl_iterator_base_types.h \ + /usr/include/c++/9/bits/stl_iterator_base_funcs.h \ + /usr/include/c++/9/debug/assertions.h \ + /usr/include/c++/9/bits/stl_iterator.h \ + /usr/include/c++/9/bits/ptr_traits.h \ + /usr/include/c++/9/debug/debug.h \ + /usr/include/c++/9/bits/predefined_ops.h \ + /usr/include/c++/9/bits/postypes.h \ + /usr/include/c++/9/cwchar \ + /usr/include/wchar.h \ + /usr/include/x86_64-linux-gnu/bits/wchar.h \ + /usr/include/x86_64-linux-gnu/bits/types/wint_t.h \ + /usr/include/x86_64-linux-gnu/bits/types/mbstate_t.h \ + /usr/include/c++/9/cstdint \ + /usr/lib/gcc/x86_64-linux-gnu/9/include/stdint.h \ + /usr/include/stdint.h \ + /usr/include/x86_64-linux-gnu/bits/stdint-uintn.h \ + /usr/include/c++/9/bits/allocator.h \ + /usr/include/x86_64-linux-gnu/c++/9/bits/c++allocator.h \ + /usr/include/c++/9/ext/new_allocator.h \ + /usr/include/c++/9/bits/localefwd.h \ + /usr/include/x86_64-linux-gnu/c++/9/bits/c++locale.h \ + /usr/include/c++/9/clocale \ + /usr/include/locale.h \ + /usr/include/x86_64-linux-gnu/bits/locale.h \ + /usr/include/c++/9/iosfwd \ + /usr/include/c++/9/cctype \ + /usr/include/ctype.h \ + /usr/include/c++/9/bits/ostream_insert.h \ + /usr/include/c++/9/bits/cxxabi_forced.h \ + /usr/include/c++/9/bits/stl_function.h \ + /usr/include/c++/9/backward/binders.h \ + /usr/include/c++/9/bits/range_access.h \ + /usr/include/c++/9/initializer_list \ + /usr/include/c++/9/bits/basic_string.h \ + /usr/include/c++/9/ext/atomicity.h \ + /usr/include/x86_64-linux-gnu/c++/9/bits/gthr.h \ + 
/usr/include/x86_64-linux-gnu/c++/9/bits/gthr-default.h \ + /usr/include/pthread.h \ + /usr/include/sched.h \ + /usr/include/x86_64-linux-gnu/bits/sched.h \ + /usr/include/x86_64-linux-gnu/bits/types/struct_sched_param.h \ + /usr/include/x86_64-linux-gnu/bits/cpu-set.h \ + /usr/include/x86_64-linux-gnu/bits/setjmp.h \ + /usr/include/x86_64-linux-gnu/c++/9/bits/atomic_word.h \ + /usr/include/c++/9/ext/alloc_traits.h \ + /usr/include/c++/9/bits/alloc_traits.h \ + /usr/include/c++/9/ext/string_conversions.h \ + /usr/include/c++/9/cstdio \ + /usr/include/c++/9/cerrno \ + /usr/include/errno.h \ + /usr/include/x86_64-linux-gnu/bits/errno.h \ + /usr/include/linux/errno.h \ + /usr/include/x86_64-linux-gnu/asm/errno.h \ + /usr/include/asm-generic/errno.h \ + /usr/include/asm-generic/errno-base.h \ + /usr/include/x86_64-linux-gnu/bits/types/error_t.h \ + /usr/include/c++/9/bits/functional_hash.h \ + /usr/include/c++/9/bits/basic_string.tcc \ + /usr/include/c++/9/limits \ + /usr/include/c++/9/bits/random.h \ + /usr/include/c++/9/vector \ + /usr/include/c++/9/bits/stl_construct.h \ + /usr/include/c++/9/bits/stl_uninitialized.h \ + /usr/include/c++/9/bits/stl_vector.h \ + /usr/include/c++/9/bits/stl_bvector.h \ + /usr/include/c++/9/bits/vector.tcc \ + /usr/include/c++/9/bits/uniform_int_dist.h \ + /usr/include/x86_64-linux-gnu/c++/9/bits/opt_random.h \ + /usr/include/c++/9/bits/random.tcc \ + /usr/include/c++/9/numeric \ + /usr/include/c++/9/bits/stl_numeric.h \ + /usr/include/c++/9/stdexcept \ + /usr/include/c++/9/cassert \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_kernel.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_discrete.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_precalc.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_mrg32k3a.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_mtgp32_kernel.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/cuda.h \ + /usr/include/memory.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_mtgp32.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_philox4x32_x.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_globals.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_uniform.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_normal.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_normal_static.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_lognormal.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_poisson.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_discrete2.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/cuda_fp16.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/cuda_fp16.hpp \ + /usr/include/c++/9/utility \ + /usr/include/c++/9/bits/stl_relops.h \ + supportCode.h diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/neuronUpdateCUDA0.sha 
b/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/neuronUpdateCUDA0.sha new file mode 100644 index 00000000..75b5951f --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/neuronUpdateCUDA0.sha @@ -0,0 +1,4 @@ +f15309d7 f71d7124 931c6fc7 53bfa0e0 d1522f48 +0 4 +0 0 +136 69 diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/neuronUpdateCUDA1.sha b/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/neuronUpdateCUDA1.sha new file mode 100644 index 00000000..861c41c3 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/neuronUpdateCUDA1.sha @@ -0,0 +1,4 @@ +f15309d7 f71d7124 931c6fc7 53bfa0e0 d1522f48 +0 4 +0 0 +264 69 diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/runner.cc b/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/runner.cc new file mode 100644 index 00000000..bf6436a1 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/runner.cc @@ -0,0 +1,1036 @@ +#include "definitionsInternal.h" + +extern "C" { +// ------------------------------------------------------------------------ +// global variables +// ------------------------------------------------------------------------ +unsigned long long iT; +double t; + +// ------------------------------------------------------------------------ +// timers +// ------------------------------------------------------------------------ +double initTime = 0.0; +double initSparseTime = 0.0; +double neuronUpdateTime = 0.0; +double presynapticUpdateTime = 0.0; +double postsynapticUpdateTime = 0.0; +double synapseDynamicsTime = 0.0; +// ------------------------------------------------------------------------ +// merged group arrays +// ------------------------------------------------------------------------ +// ------------------------------------------------------------------------ +// local neuron groups +// ------------------------------------------------------------------------ +unsigned int* glbSpkCntneurongroup; +unsigned int* d_glbSpkCntneurongroup; +unsigned int* glbSpkneurongroup; +unsigned int* d_glbSpkneurongroup; +int32_t* ineurongroup; +int32_t* d_ineurongroup; +double* Vneurongroup; +double* d_Vneurongroup; +double* g_PN_iKCneurongroup; +double* d_g_PN_iKCneurongroup; +double* hneurongroup; +double* d_hneurongroup; +double* mneurongroup; +double* d_mneurongroup; +double* nneurongroup; +double* d_nneurongroup; +double* lastspikeneurongroup; +double* d_lastspikeneurongroup; +char* not_refractoryneurongroup; +char* d_not_refractoryneurongroup; +unsigned int* glbSpkCntneurongroup_1; +unsigned int* d_glbSpkCntneurongroup_1; +unsigned int* glbSpkneurongroup_1; +unsigned int* d_glbSpkneurongroup_1; +int32_t* ineurongroup_1; +int32_t* d_ineurongroup_1; +double* Vneurongroup_1; +double* d_Vneurongroup_1; +double* g_eKC_eKCneurongroup_1; +double* d_g_eKC_eKCneurongroup_1; +double* g_iKC_eKCneurongroup_1; +double* d_g_iKC_eKCneurongroup_1; +double* hneurongroup_1; +double* d_hneurongroup_1; +double* mneurongroup_1; +double* d_mneurongroup_1; +double* nneurongroup_1; +double* d_nneurongroup_1; +double* lastspikeneurongroup_1; +double* d_lastspikeneurongroup_1; +char* not_refractoryneurongroup_1; +char* d_not_refractoryneurongroup_1; +unsigned int* glbSpkCntspikegeneratorgroup; +unsigned int* 
d_glbSpkCntspikegeneratorgroup; +unsigned int* glbSpkspikegeneratorgroup; +unsigned int* d_glbSpkspikegeneratorgroup; + +// ------------------------------------------------------------------------ +// custom update variables +// ------------------------------------------------------------------------ + +// ------------------------------------------------------------------------ +// postsynaptic variables +// ------------------------------------------------------------------------ +double* inSynsynapses; +double* d_inSynsynapses; +double* inSynsynapses_1; +double* d_inSynsynapses_1; +double* inSynsynapses_2; +double* d_inSynsynapses_2; + +// ------------------------------------------------------------------------ +// synapse connectivity +// ------------------------------------------------------------------------ +const unsigned int maxRowLengthsynapses = 421; +unsigned int* rowLengthsynapses; +unsigned int* d_rowLengthsynapses; +uint32_t* indsynapses; +uint32_t* d_indsynapses; +const unsigned int maxRowLengthsynapses_1 = 100; +unsigned int* rowLengthsynapses_1; +unsigned int* d_rowLengthsynapses_1; +uint32_t* indsynapses_1; +uint32_t* d_indsynapses_1; +unsigned int* d_colLengthsynapses_1; +unsigned int* d_remapsynapses_1; +const unsigned int maxRowLengthsynapses_2 = 100; +unsigned int* rowLengthsynapses_2; +unsigned int* d_rowLengthsynapses_2; +uint32_t* indsynapses_2; +uint32_t* d_indsynapses_2; + +// ------------------------------------------------------------------------ +// synapse variables +// ------------------------------------------------------------------------ +double* weightsynapses; +double* d_weightsynapses; +double* lastupdatesynapses_1; +double* d_lastupdatesynapses_1; +double* Apostsynapses_1; +double* d_Apostsynapses_1; +double* g_rawsynapses_1; +double* d_g_rawsynapses_1; +double* Apresynapses_1; +double* d_Apresynapses_1; + +} // extern "C" +// ------------------------------------------------------------------------ +// extra global params +// ------------------------------------------------------------------------ + +// ------------------------------------------------------------------------ +// copying things to device +// ------------------------------------------------------------------------ +void pushneurongroupSpikesToDevice(bool uninitialisedOnly) { + if(!uninitialisedOnly) { + CHECK_CUDA_ERRORS(cudaMemcpy(d_glbSpkCntneurongroup, glbSpkCntneurongroup, 1 * sizeof(unsigned int), cudaMemcpyHostToDevice)); + } + if(!uninitialisedOnly) { + CHECK_CUDA_ERRORS(cudaMemcpy(d_glbSpkneurongroup, glbSpkneurongroup, 2500 * sizeof(unsigned int), cudaMemcpyHostToDevice)); + } +} + +void pushneurongroupCurrentSpikesToDevice(bool uninitialisedOnly) { + CHECK_CUDA_ERRORS(cudaMemcpy(d_glbSpkCntneurongroup, glbSpkCntneurongroup, 1 * sizeof(unsigned int), cudaMemcpyHostToDevice)); + CHECK_CUDA_ERRORS(cudaMemcpy(d_glbSpkneurongroup, glbSpkneurongroup, glbSpkCntneurongroup[0] * sizeof(unsigned int), cudaMemcpyHostToDevice)); +} + +void pushineurongroupToDevice(bool uninitialisedOnly) { + CHECK_CUDA_ERRORS(cudaMemcpy(d_ineurongroup, ineurongroup, 2500 * sizeof(int32_t), cudaMemcpyHostToDevice)); +} + +void pushCurrentineurongroupToDevice(bool uninitialisedOnly) { + CHECK_CUDA_ERRORS(cudaMemcpy(d_ineurongroup, ineurongroup, 2500 * sizeof(int32_t), cudaMemcpyHostToDevice)); +} + +void pushVneurongroupToDevice(bool uninitialisedOnly) { + CHECK_CUDA_ERRORS(cudaMemcpy(d_Vneurongroup, Vneurongroup, 2500 * sizeof(double), cudaMemcpyHostToDevice)); +} + +void 
pushCurrentVneurongroupToDevice(bool uninitialisedOnly) { + CHECK_CUDA_ERRORS(cudaMemcpy(d_Vneurongroup, Vneurongroup, 2500 * sizeof(double), cudaMemcpyHostToDevice)); +} + +void pushg_PN_iKCneurongroupToDevice(bool uninitialisedOnly) { + CHECK_CUDA_ERRORS(cudaMemcpy(d_g_PN_iKCneurongroup, g_PN_iKCneurongroup, 2500 * sizeof(double), cudaMemcpyHostToDevice)); +} + +void pushCurrentg_PN_iKCneurongroupToDevice(bool uninitialisedOnly) { + CHECK_CUDA_ERRORS(cudaMemcpy(d_g_PN_iKCneurongroup, g_PN_iKCneurongroup, 2500 * sizeof(double), cudaMemcpyHostToDevice)); +} + +void pushhneurongroupToDevice(bool uninitialisedOnly) { + CHECK_CUDA_ERRORS(cudaMemcpy(d_hneurongroup, hneurongroup, 2500 * sizeof(double), cudaMemcpyHostToDevice)); +} + +void pushCurrenthneurongroupToDevice(bool uninitialisedOnly) { + CHECK_CUDA_ERRORS(cudaMemcpy(d_hneurongroup, hneurongroup, 2500 * sizeof(double), cudaMemcpyHostToDevice)); +} + +void pushmneurongroupToDevice(bool uninitialisedOnly) { + CHECK_CUDA_ERRORS(cudaMemcpy(d_mneurongroup, mneurongroup, 2500 * sizeof(double), cudaMemcpyHostToDevice)); +} + +void pushCurrentmneurongroupToDevice(bool uninitialisedOnly) { + CHECK_CUDA_ERRORS(cudaMemcpy(d_mneurongroup, mneurongroup, 2500 * sizeof(double), cudaMemcpyHostToDevice)); +} + +void pushnneurongroupToDevice(bool uninitialisedOnly) { + CHECK_CUDA_ERRORS(cudaMemcpy(d_nneurongroup, nneurongroup, 2500 * sizeof(double), cudaMemcpyHostToDevice)); +} + +void pushCurrentnneurongroupToDevice(bool uninitialisedOnly) { + CHECK_CUDA_ERRORS(cudaMemcpy(d_nneurongroup, nneurongroup, 2500 * sizeof(double), cudaMemcpyHostToDevice)); +} + +void pushlastspikeneurongroupToDevice(bool uninitialisedOnly) { + CHECK_CUDA_ERRORS(cudaMemcpy(d_lastspikeneurongroup, lastspikeneurongroup, 2500 * sizeof(double), cudaMemcpyHostToDevice)); +} + +void pushCurrentlastspikeneurongroupToDevice(bool uninitialisedOnly) { + CHECK_CUDA_ERRORS(cudaMemcpy(d_lastspikeneurongroup, lastspikeneurongroup, 2500 * sizeof(double), cudaMemcpyHostToDevice)); +} + +void pushnot_refractoryneurongroupToDevice(bool uninitialisedOnly) { + CHECK_CUDA_ERRORS(cudaMemcpy(d_not_refractoryneurongroup, not_refractoryneurongroup, 2500 * sizeof(char), cudaMemcpyHostToDevice)); +} + +void pushCurrentnot_refractoryneurongroupToDevice(bool uninitialisedOnly) { + CHECK_CUDA_ERRORS(cudaMemcpy(d_not_refractoryneurongroup, not_refractoryneurongroup, 2500 * sizeof(char), cudaMemcpyHostToDevice)); +} + +void pushneurongroupStateToDevice(bool uninitialisedOnly) { + pushineurongroupToDevice(uninitialisedOnly); + pushVneurongroupToDevice(uninitialisedOnly); + pushg_PN_iKCneurongroupToDevice(uninitialisedOnly); + pushhneurongroupToDevice(uninitialisedOnly); + pushmneurongroupToDevice(uninitialisedOnly); + pushnneurongroupToDevice(uninitialisedOnly); + pushlastspikeneurongroupToDevice(uninitialisedOnly); + pushnot_refractoryneurongroupToDevice(uninitialisedOnly); +} + +void pushneurongroup_1SpikesToDevice(bool uninitialisedOnly) { + if(!uninitialisedOnly) { + CHECK_CUDA_ERRORS(cudaMemcpy(d_glbSpkCntneurongroup_1, glbSpkCntneurongroup_1, 1 * sizeof(unsigned int), cudaMemcpyHostToDevice)); + } + if(!uninitialisedOnly) { + CHECK_CUDA_ERRORS(cudaMemcpy(d_glbSpkneurongroup_1, glbSpkneurongroup_1, 100 * sizeof(unsigned int), cudaMemcpyHostToDevice)); + } +} + +void pushneurongroup_1CurrentSpikesToDevice(bool uninitialisedOnly) { + CHECK_CUDA_ERRORS(cudaMemcpy(d_glbSpkCntneurongroup_1, glbSpkCntneurongroup_1, 1 * sizeof(unsigned int), cudaMemcpyHostToDevice)); + 
CHECK_CUDA_ERRORS(cudaMemcpy(d_glbSpkneurongroup_1, glbSpkneurongroup_1, glbSpkCntneurongroup_1[0] * sizeof(unsigned int), cudaMemcpyHostToDevice)); +} + +void pushineurongroup_1ToDevice(bool uninitialisedOnly) { + CHECK_CUDA_ERRORS(cudaMemcpy(d_ineurongroup_1, ineurongroup_1, 100 * sizeof(int32_t), cudaMemcpyHostToDevice)); +} + +void pushCurrentineurongroup_1ToDevice(bool uninitialisedOnly) { + CHECK_CUDA_ERRORS(cudaMemcpy(d_ineurongroup_1, ineurongroup_1, 100 * sizeof(int32_t), cudaMemcpyHostToDevice)); +} + +void pushVneurongroup_1ToDevice(bool uninitialisedOnly) { + CHECK_CUDA_ERRORS(cudaMemcpy(d_Vneurongroup_1, Vneurongroup_1, 100 * sizeof(double), cudaMemcpyHostToDevice)); +} + +void pushCurrentVneurongroup_1ToDevice(bool uninitialisedOnly) { + CHECK_CUDA_ERRORS(cudaMemcpy(d_Vneurongroup_1, Vneurongroup_1, 100 * sizeof(double), cudaMemcpyHostToDevice)); +} + +void pushg_eKC_eKCneurongroup_1ToDevice(bool uninitialisedOnly) { + CHECK_CUDA_ERRORS(cudaMemcpy(d_g_eKC_eKCneurongroup_1, g_eKC_eKCneurongroup_1, 100 * sizeof(double), cudaMemcpyHostToDevice)); +} + +void pushCurrentg_eKC_eKCneurongroup_1ToDevice(bool uninitialisedOnly) { + CHECK_CUDA_ERRORS(cudaMemcpy(d_g_eKC_eKCneurongroup_1, g_eKC_eKCneurongroup_1, 100 * sizeof(double), cudaMemcpyHostToDevice)); +} + +void pushg_iKC_eKCneurongroup_1ToDevice(bool uninitialisedOnly) { + CHECK_CUDA_ERRORS(cudaMemcpy(d_g_iKC_eKCneurongroup_1, g_iKC_eKCneurongroup_1, 100 * sizeof(double), cudaMemcpyHostToDevice)); +} + +void pushCurrentg_iKC_eKCneurongroup_1ToDevice(bool uninitialisedOnly) { + CHECK_CUDA_ERRORS(cudaMemcpy(d_g_iKC_eKCneurongroup_1, g_iKC_eKCneurongroup_1, 100 * sizeof(double), cudaMemcpyHostToDevice)); +} + +void pushhneurongroup_1ToDevice(bool uninitialisedOnly) { + CHECK_CUDA_ERRORS(cudaMemcpy(d_hneurongroup_1, hneurongroup_1, 100 * sizeof(double), cudaMemcpyHostToDevice)); +} + +void pushCurrenthneurongroup_1ToDevice(bool uninitialisedOnly) { + CHECK_CUDA_ERRORS(cudaMemcpy(d_hneurongroup_1, hneurongroup_1, 100 * sizeof(double), cudaMemcpyHostToDevice)); +} + +void pushmneurongroup_1ToDevice(bool uninitialisedOnly) { + CHECK_CUDA_ERRORS(cudaMemcpy(d_mneurongroup_1, mneurongroup_1, 100 * sizeof(double), cudaMemcpyHostToDevice)); +} + +void pushCurrentmneurongroup_1ToDevice(bool uninitialisedOnly) { + CHECK_CUDA_ERRORS(cudaMemcpy(d_mneurongroup_1, mneurongroup_1, 100 * sizeof(double), cudaMemcpyHostToDevice)); +} + +void pushnneurongroup_1ToDevice(bool uninitialisedOnly) { + CHECK_CUDA_ERRORS(cudaMemcpy(d_nneurongroup_1, nneurongroup_1, 100 * sizeof(double), cudaMemcpyHostToDevice)); +} + +void pushCurrentnneurongroup_1ToDevice(bool uninitialisedOnly) { + CHECK_CUDA_ERRORS(cudaMemcpy(d_nneurongroup_1, nneurongroup_1, 100 * sizeof(double), cudaMemcpyHostToDevice)); +} + +void pushlastspikeneurongroup_1ToDevice(bool uninitialisedOnly) { + CHECK_CUDA_ERRORS(cudaMemcpy(d_lastspikeneurongroup_1, lastspikeneurongroup_1, 100 * sizeof(double), cudaMemcpyHostToDevice)); +} + +void pushCurrentlastspikeneurongroup_1ToDevice(bool uninitialisedOnly) { + CHECK_CUDA_ERRORS(cudaMemcpy(d_lastspikeneurongroup_1, lastspikeneurongroup_1, 100 * sizeof(double), cudaMemcpyHostToDevice)); +} + +void pushnot_refractoryneurongroup_1ToDevice(bool uninitialisedOnly) { + CHECK_CUDA_ERRORS(cudaMemcpy(d_not_refractoryneurongroup_1, not_refractoryneurongroup_1, 100 * sizeof(char), cudaMemcpyHostToDevice)); +} + +void pushCurrentnot_refractoryneurongroup_1ToDevice(bool uninitialisedOnly) { + CHECK_CUDA_ERRORS(cudaMemcpy(d_not_refractoryneurongroup_1, 
not_refractoryneurongroup_1, 100 * sizeof(char), cudaMemcpyHostToDevice)); +} + +void pushneurongroup_1StateToDevice(bool uninitialisedOnly) { + pushineurongroup_1ToDevice(uninitialisedOnly); + pushVneurongroup_1ToDevice(uninitialisedOnly); + pushg_eKC_eKCneurongroup_1ToDevice(uninitialisedOnly); + pushg_iKC_eKCneurongroup_1ToDevice(uninitialisedOnly); + pushhneurongroup_1ToDevice(uninitialisedOnly); + pushmneurongroup_1ToDevice(uninitialisedOnly); + pushnneurongroup_1ToDevice(uninitialisedOnly); + pushlastspikeneurongroup_1ToDevice(uninitialisedOnly); + pushnot_refractoryneurongroup_1ToDevice(uninitialisedOnly); +} + +void pushspikegeneratorgroupSpikesToDevice(bool uninitialisedOnly) { + if(!uninitialisedOnly) { + CHECK_CUDA_ERRORS(cudaMemcpy(d_glbSpkCntspikegeneratorgroup, glbSpkCntspikegeneratorgroup, 1 * sizeof(unsigned int), cudaMemcpyHostToDevice)); + } + if(!uninitialisedOnly) { + CHECK_CUDA_ERRORS(cudaMemcpy(d_glbSpkspikegeneratorgroup, glbSpkspikegeneratorgroup, 100 * sizeof(unsigned int), cudaMemcpyHostToDevice)); + } +} + +void pushspikegeneratorgroupCurrentSpikesToDevice(bool uninitialisedOnly) { + CHECK_CUDA_ERRORS(cudaMemcpy(d_glbSpkCntspikegeneratorgroup, glbSpkCntspikegeneratorgroup, 1 * sizeof(unsigned int), cudaMemcpyHostToDevice)); + CHECK_CUDA_ERRORS(cudaMemcpy(d_glbSpkspikegeneratorgroup, glbSpkspikegeneratorgroup, glbSpkCntspikegeneratorgroup[0] * sizeof(unsigned int), cudaMemcpyHostToDevice)); +} + +void pushspikegeneratorgroupStateToDevice(bool uninitialisedOnly) { +} + +void pushsynapsesConnectivityToDevice(bool uninitialisedOnly) { + CHECK_CUDA_ERRORS(cudaMemcpy(d_rowLengthsynapses, rowLengthsynapses, 100 * sizeof(unsigned int), cudaMemcpyHostToDevice)); + CHECK_CUDA_ERRORS(cudaMemcpy(d_indsynapses, indsynapses, 42100 * sizeof(unsigned int), cudaMemcpyHostToDevice)); +} + +void pushsynapses_1ConnectivityToDevice(bool uninitialisedOnly) { + CHECK_CUDA_ERRORS(cudaMemcpy(d_rowLengthsynapses_1, rowLengthsynapses_1, 2500 * sizeof(unsigned int), cudaMemcpyHostToDevice)); + CHECK_CUDA_ERRORS(cudaMemcpy(d_indsynapses_1, indsynapses_1, 250000 * sizeof(unsigned int), cudaMemcpyHostToDevice)); +} + +void pushsynapses_2ConnectivityToDevice(bool uninitialisedOnly) { + CHECK_CUDA_ERRORS(cudaMemcpy(d_rowLengthsynapses_2, rowLengthsynapses_2, 100 * sizeof(unsigned int), cudaMemcpyHostToDevice)); + CHECK_CUDA_ERRORS(cudaMemcpy(d_indsynapses_2, indsynapses_2, 10000 * sizeof(unsigned int), cudaMemcpyHostToDevice)); +} + +void pushweightsynapsesToDevice(bool uninitialisedOnly) { + CHECK_CUDA_ERRORS(cudaMemcpy(d_weightsynapses, weightsynapses, 42100 * sizeof(double), cudaMemcpyHostToDevice)); +} + +void pushinSynsynapsesToDevice(bool uninitialisedOnly) { + if(!uninitialisedOnly) { + CHECK_CUDA_ERRORS(cudaMemcpy(d_inSynsynapses, inSynsynapses, 2500 * sizeof(double), cudaMemcpyHostToDevice)); + } +} + +void pushsynapsesStateToDevice(bool uninitialisedOnly) { + pushweightsynapsesToDevice(uninitialisedOnly); + pushinSynsynapsesToDevice(uninitialisedOnly); +} + +void pushlastupdatesynapses_1ToDevice(bool uninitialisedOnly) { + CHECK_CUDA_ERRORS(cudaMemcpy(d_lastupdatesynapses_1, lastupdatesynapses_1, 250000 * sizeof(double), cudaMemcpyHostToDevice)); +} + +void pushApostsynapses_1ToDevice(bool uninitialisedOnly) { + CHECK_CUDA_ERRORS(cudaMemcpy(d_Apostsynapses_1, Apostsynapses_1, 250000 * sizeof(double), cudaMemcpyHostToDevice)); +} + +void pushg_rawsynapses_1ToDevice(bool uninitialisedOnly) { + CHECK_CUDA_ERRORS(cudaMemcpy(d_g_rawsynapses_1, g_rawsynapses_1, 250000 * sizeof(double), 
cudaMemcpyHostToDevice)); +} + +void pushApresynapses_1ToDevice(bool uninitialisedOnly) { + CHECK_CUDA_ERRORS(cudaMemcpy(d_Apresynapses_1, Apresynapses_1, 250000 * sizeof(double), cudaMemcpyHostToDevice)); +} + +void pushinSynsynapses_1ToDevice(bool uninitialisedOnly) { + if(!uninitialisedOnly) { + CHECK_CUDA_ERRORS(cudaMemcpy(d_inSynsynapses_1, inSynsynapses_1, 100 * sizeof(double), cudaMemcpyHostToDevice)); + } +} + +void pushsynapses_1StateToDevice(bool uninitialisedOnly) { + pushlastupdatesynapses_1ToDevice(uninitialisedOnly); + pushApostsynapses_1ToDevice(uninitialisedOnly); + pushg_rawsynapses_1ToDevice(uninitialisedOnly); + pushApresynapses_1ToDevice(uninitialisedOnly); + pushinSynsynapses_1ToDevice(uninitialisedOnly); +} + +void pushinSynsynapses_2ToDevice(bool uninitialisedOnly) { + if(!uninitialisedOnly) { + CHECK_CUDA_ERRORS(cudaMemcpy(d_inSynsynapses_2, inSynsynapses_2, 100 * sizeof(double), cudaMemcpyHostToDevice)); + } +} + +void pushsynapses_2StateToDevice(bool uninitialisedOnly) { + pushinSynsynapses_2ToDevice(uninitialisedOnly); +} + + +// ------------------------------------------------------------------------ +// copying things from device +// ------------------------------------------------------------------------ +void pullneurongroupSpikesFromDevice() { + CHECK_CUDA_ERRORS(cudaMemcpy(glbSpkCntneurongroup, d_glbSpkCntneurongroup, 1 * sizeof(unsigned int), cudaMemcpyDeviceToHost)); + CHECK_CUDA_ERRORS(cudaMemcpy(glbSpkneurongroup, d_glbSpkneurongroup, 2500 * sizeof(unsigned int), cudaMemcpyDeviceToHost)); +} + +void pullneurongroupCurrentSpikesFromDevice() { + CHECK_CUDA_ERRORS(cudaMemcpy(glbSpkCntneurongroup, d_glbSpkCntneurongroup, 1 * sizeof(unsigned int), cudaMemcpyDeviceToHost)); + CHECK_CUDA_ERRORS(cudaMemcpy(glbSpkneurongroup, d_glbSpkneurongroup, glbSpkCntneurongroup[0] * sizeof(unsigned int), cudaMemcpyDeviceToHost)); +} + +void pullineurongroupFromDevice() { + CHECK_CUDA_ERRORS(cudaMemcpy(ineurongroup, d_ineurongroup, 2500 * sizeof(int32_t), cudaMemcpyDeviceToHost)); +} + +void pullCurrentineurongroupFromDevice() { + CHECK_CUDA_ERRORS(cudaMemcpy(ineurongroup, d_ineurongroup, 2500 * sizeof(int32_t), cudaMemcpyDeviceToHost)); +} + +void pullVneurongroupFromDevice() { + CHECK_CUDA_ERRORS(cudaMemcpy(Vneurongroup, d_Vneurongroup, 2500 * sizeof(double), cudaMemcpyDeviceToHost)); +} + +void pullCurrentVneurongroupFromDevice() { + CHECK_CUDA_ERRORS(cudaMemcpy(Vneurongroup, d_Vneurongroup, 2500 * sizeof(double), cudaMemcpyDeviceToHost)); +} + +void pullg_PN_iKCneurongroupFromDevice() { + CHECK_CUDA_ERRORS(cudaMemcpy(g_PN_iKCneurongroup, d_g_PN_iKCneurongroup, 2500 * sizeof(double), cudaMemcpyDeviceToHost)); +} + +void pullCurrentg_PN_iKCneurongroupFromDevice() { + CHECK_CUDA_ERRORS(cudaMemcpy(g_PN_iKCneurongroup, d_g_PN_iKCneurongroup, 2500 * sizeof(double), cudaMemcpyDeviceToHost)); +} + +void pullhneurongroupFromDevice() { + CHECK_CUDA_ERRORS(cudaMemcpy(hneurongroup, d_hneurongroup, 2500 * sizeof(double), cudaMemcpyDeviceToHost)); +} + +void pullCurrenthneurongroupFromDevice() { + CHECK_CUDA_ERRORS(cudaMemcpy(hneurongroup, d_hneurongroup, 2500 * sizeof(double), cudaMemcpyDeviceToHost)); +} + +void pullmneurongroupFromDevice() { + CHECK_CUDA_ERRORS(cudaMemcpy(mneurongroup, d_mneurongroup, 2500 * sizeof(double), cudaMemcpyDeviceToHost)); +} + +void pullCurrentmneurongroupFromDevice() { + CHECK_CUDA_ERRORS(cudaMemcpy(mneurongroup, d_mneurongroup, 2500 * sizeof(double), cudaMemcpyDeviceToHost)); +} + +void pullnneurongroupFromDevice() { + 
CHECK_CUDA_ERRORS(cudaMemcpy(nneurongroup, d_nneurongroup, 2500 * sizeof(double), cudaMemcpyDeviceToHost)); +} + +void pullCurrentnneurongroupFromDevice() { + CHECK_CUDA_ERRORS(cudaMemcpy(nneurongroup, d_nneurongroup, 2500 * sizeof(double), cudaMemcpyDeviceToHost)); +} + +void pulllastspikeneurongroupFromDevice() { + CHECK_CUDA_ERRORS(cudaMemcpy(lastspikeneurongroup, d_lastspikeneurongroup, 2500 * sizeof(double), cudaMemcpyDeviceToHost)); +} + +void pullCurrentlastspikeneurongroupFromDevice() { + CHECK_CUDA_ERRORS(cudaMemcpy(lastspikeneurongroup, d_lastspikeneurongroup, 2500 * sizeof(double), cudaMemcpyDeviceToHost)); +} + +void pullnot_refractoryneurongroupFromDevice() { + CHECK_CUDA_ERRORS(cudaMemcpy(not_refractoryneurongroup, d_not_refractoryneurongroup, 2500 * sizeof(char), cudaMemcpyDeviceToHost)); +} + +void pullCurrentnot_refractoryneurongroupFromDevice() { + CHECK_CUDA_ERRORS(cudaMemcpy(not_refractoryneurongroup, d_not_refractoryneurongroup, 2500 * sizeof(char), cudaMemcpyDeviceToHost)); +} + +void pullneurongroupStateFromDevice() { + pullineurongroupFromDevice(); + pullVneurongroupFromDevice(); + pullg_PN_iKCneurongroupFromDevice(); + pullhneurongroupFromDevice(); + pullmneurongroupFromDevice(); + pullnneurongroupFromDevice(); + pulllastspikeneurongroupFromDevice(); + pullnot_refractoryneurongroupFromDevice(); +} + +void pullneurongroup_1SpikesFromDevice() { + CHECK_CUDA_ERRORS(cudaMemcpy(glbSpkCntneurongroup_1, d_glbSpkCntneurongroup_1, 1 * sizeof(unsigned int), cudaMemcpyDeviceToHost)); + CHECK_CUDA_ERRORS(cudaMemcpy(glbSpkneurongroup_1, d_glbSpkneurongroup_1, 100 * sizeof(unsigned int), cudaMemcpyDeviceToHost)); +} + +void pullneurongroup_1CurrentSpikesFromDevice() { + CHECK_CUDA_ERRORS(cudaMemcpy(glbSpkCntneurongroup_1, d_glbSpkCntneurongroup_1, 1 * sizeof(unsigned int), cudaMemcpyDeviceToHost)); + CHECK_CUDA_ERRORS(cudaMemcpy(glbSpkneurongroup_1, d_glbSpkneurongroup_1, glbSpkCntneurongroup_1[0] * sizeof(unsigned int), cudaMemcpyDeviceToHost)); +} + +void pullineurongroup_1FromDevice() { + CHECK_CUDA_ERRORS(cudaMemcpy(ineurongroup_1, d_ineurongroup_1, 100 * sizeof(int32_t), cudaMemcpyDeviceToHost)); +} + +void pullCurrentineurongroup_1FromDevice() { + CHECK_CUDA_ERRORS(cudaMemcpy(ineurongroup_1, d_ineurongroup_1, 100 * sizeof(int32_t), cudaMemcpyDeviceToHost)); +} + +void pullVneurongroup_1FromDevice() { + CHECK_CUDA_ERRORS(cudaMemcpy(Vneurongroup_1, d_Vneurongroup_1, 100 * sizeof(double), cudaMemcpyDeviceToHost)); +} + +void pullCurrentVneurongroup_1FromDevice() { + CHECK_CUDA_ERRORS(cudaMemcpy(Vneurongroup_1, d_Vneurongroup_1, 100 * sizeof(double), cudaMemcpyDeviceToHost)); +} + +void pullg_eKC_eKCneurongroup_1FromDevice() { + CHECK_CUDA_ERRORS(cudaMemcpy(g_eKC_eKCneurongroup_1, d_g_eKC_eKCneurongroup_1, 100 * sizeof(double), cudaMemcpyDeviceToHost)); +} + +void pullCurrentg_eKC_eKCneurongroup_1FromDevice() { + CHECK_CUDA_ERRORS(cudaMemcpy(g_eKC_eKCneurongroup_1, d_g_eKC_eKCneurongroup_1, 100 * sizeof(double), cudaMemcpyDeviceToHost)); +} + +void pullg_iKC_eKCneurongroup_1FromDevice() { + CHECK_CUDA_ERRORS(cudaMemcpy(g_iKC_eKCneurongroup_1, d_g_iKC_eKCneurongroup_1, 100 * sizeof(double), cudaMemcpyDeviceToHost)); +} + +void pullCurrentg_iKC_eKCneurongroup_1FromDevice() { + CHECK_CUDA_ERRORS(cudaMemcpy(g_iKC_eKCneurongroup_1, d_g_iKC_eKCneurongroup_1, 100 * sizeof(double), cudaMemcpyDeviceToHost)); +} + +void pullhneurongroup_1FromDevice() { + CHECK_CUDA_ERRORS(cudaMemcpy(hneurongroup_1, d_hneurongroup_1, 100 * sizeof(double), cudaMemcpyDeviceToHost)); +} + +void 
pullCurrenthneurongroup_1FromDevice() { + CHECK_CUDA_ERRORS(cudaMemcpy(hneurongroup_1, d_hneurongroup_1, 100 * sizeof(double), cudaMemcpyDeviceToHost)); +} + +void pullmneurongroup_1FromDevice() { + CHECK_CUDA_ERRORS(cudaMemcpy(mneurongroup_1, d_mneurongroup_1, 100 * sizeof(double), cudaMemcpyDeviceToHost)); +} + +void pullCurrentmneurongroup_1FromDevice() { + CHECK_CUDA_ERRORS(cudaMemcpy(mneurongroup_1, d_mneurongroup_1, 100 * sizeof(double), cudaMemcpyDeviceToHost)); +} + +void pullnneurongroup_1FromDevice() { + CHECK_CUDA_ERRORS(cudaMemcpy(nneurongroup_1, d_nneurongroup_1, 100 * sizeof(double), cudaMemcpyDeviceToHost)); +} + +void pullCurrentnneurongroup_1FromDevice() { + CHECK_CUDA_ERRORS(cudaMemcpy(nneurongroup_1, d_nneurongroup_1, 100 * sizeof(double), cudaMemcpyDeviceToHost)); +} + +void pulllastspikeneurongroup_1FromDevice() { + CHECK_CUDA_ERRORS(cudaMemcpy(lastspikeneurongroup_1, d_lastspikeneurongroup_1, 100 * sizeof(double), cudaMemcpyDeviceToHost)); +} + +void pullCurrentlastspikeneurongroup_1FromDevice() { + CHECK_CUDA_ERRORS(cudaMemcpy(lastspikeneurongroup_1, d_lastspikeneurongroup_1, 100 * sizeof(double), cudaMemcpyDeviceToHost)); +} + +void pullnot_refractoryneurongroup_1FromDevice() { + CHECK_CUDA_ERRORS(cudaMemcpy(not_refractoryneurongroup_1, d_not_refractoryneurongroup_1, 100 * sizeof(char), cudaMemcpyDeviceToHost)); +} + +void pullCurrentnot_refractoryneurongroup_1FromDevice() { + CHECK_CUDA_ERRORS(cudaMemcpy(not_refractoryneurongroup_1, d_not_refractoryneurongroup_1, 100 * sizeof(char), cudaMemcpyDeviceToHost)); +} + +void pullneurongroup_1StateFromDevice() { + pullineurongroup_1FromDevice(); + pullVneurongroup_1FromDevice(); + pullg_eKC_eKCneurongroup_1FromDevice(); + pullg_iKC_eKCneurongroup_1FromDevice(); + pullhneurongroup_1FromDevice(); + pullmneurongroup_1FromDevice(); + pullnneurongroup_1FromDevice(); + pulllastspikeneurongroup_1FromDevice(); + pullnot_refractoryneurongroup_1FromDevice(); +} + +void pullspikegeneratorgroupSpikesFromDevice() { + CHECK_CUDA_ERRORS(cudaMemcpy(glbSpkCntspikegeneratorgroup, d_glbSpkCntspikegeneratorgroup, 1 * sizeof(unsigned int), cudaMemcpyDeviceToHost)); + CHECK_CUDA_ERRORS(cudaMemcpy(glbSpkspikegeneratorgroup, d_glbSpkspikegeneratorgroup, 100 * sizeof(unsigned int), cudaMemcpyDeviceToHost)); +} + +void pullspikegeneratorgroupCurrentSpikesFromDevice() { + CHECK_CUDA_ERRORS(cudaMemcpy(glbSpkCntspikegeneratorgroup, d_glbSpkCntspikegeneratorgroup, 1 * sizeof(unsigned int), cudaMemcpyDeviceToHost)); + CHECK_CUDA_ERRORS(cudaMemcpy(glbSpkspikegeneratorgroup, d_glbSpkspikegeneratorgroup, glbSpkCntspikegeneratorgroup[0] * sizeof(unsigned int), cudaMemcpyDeviceToHost)); +} + +void pullspikegeneratorgroupStateFromDevice() { +} + +void pullsynapsesConnectivityFromDevice() { + CHECK_CUDA_ERRORS(cudaMemcpy(rowLengthsynapses, d_rowLengthsynapses, 100 * sizeof(unsigned int), cudaMemcpyDeviceToHost)); + CHECK_CUDA_ERRORS(cudaMemcpy(indsynapses, d_indsynapses, 42100 * sizeof(unsigned int), cudaMemcpyDeviceToHost)); +} + +void pullsynapses_1ConnectivityFromDevice() { + CHECK_CUDA_ERRORS(cudaMemcpy(rowLengthsynapses_1, d_rowLengthsynapses_1, 2500 * sizeof(unsigned int), cudaMemcpyDeviceToHost)); + CHECK_CUDA_ERRORS(cudaMemcpy(indsynapses_1, d_indsynapses_1, 250000 * sizeof(unsigned int), cudaMemcpyDeviceToHost)); +} + +void pullsynapses_2ConnectivityFromDevice() { + CHECK_CUDA_ERRORS(cudaMemcpy(rowLengthsynapses_2, d_rowLengthsynapses_2, 100 * sizeof(unsigned int), cudaMemcpyDeviceToHost)); + CHECK_CUDA_ERRORS(cudaMemcpy(indsynapses_2, 
d_indsynapses_2, 10000 * sizeof(unsigned int), cudaMemcpyDeviceToHost)); +} + +void pullweightsynapsesFromDevice() { + CHECK_CUDA_ERRORS(cudaMemcpy(weightsynapses, d_weightsynapses, 42100 * sizeof(double), cudaMemcpyDeviceToHost)); +} + +void pullinSynsynapsesFromDevice() { + CHECK_CUDA_ERRORS(cudaMemcpy(inSynsynapses, d_inSynsynapses, 2500 * sizeof(double), cudaMemcpyDeviceToHost)); +} + +void pullsynapsesStateFromDevice() { + pullweightsynapsesFromDevice(); + pullinSynsynapsesFromDevice(); +} + +void pulllastupdatesynapses_1FromDevice() { + CHECK_CUDA_ERRORS(cudaMemcpy(lastupdatesynapses_1, d_lastupdatesynapses_1, 250000 * sizeof(double), cudaMemcpyDeviceToHost)); +} + +void pullApostsynapses_1FromDevice() { + CHECK_CUDA_ERRORS(cudaMemcpy(Apostsynapses_1, d_Apostsynapses_1, 250000 * sizeof(double), cudaMemcpyDeviceToHost)); +} + +void pullg_rawsynapses_1FromDevice() { + CHECK_CUDA_ERRORS(cudaMemcpy(g_rawsynapses_1, d_g_rawsynapses_1, 250000 * sizeof(double), cudaMemcpyDeviceToHost)); +} + +void pullApresynapses_1FromDevice() { + CHECK_CUDA_ERRORS(cudaMemcpy(Apresynapses_1, d_Apresynapses_1, 250000 * sizeof(double), cudaMemcpyDeviceToHost)); +} + +void pullinSynsynapses_1FromDevice() { + CHECK_CUDA_ERRORS(cudaMemcpy(inSynsynapses_1, d_inSynsynapses_1, 100 * sizeof(double), cudaMemcpyDeviceToHost)); +} + +void pullsynapses_1StateFromDevice() { + pulllastupdatesynapses_1FromDevice(); + pullApostsynapses_1FromDevice(); + pullg_rawsynapses_1FromDevice(); + pullApresynapses_1FromDevice(); + pullinSynsynapses_1FromDevice(); +} + +void pullinSynsynapses_2FromDevice() { + CHECK_CUDA_ERRORS(cudaMemcpy(inSynsynapses_2, d_inSynsynapses_2, 100 * sizeof(double), cudaMemcpyDeviceToHost)); +} + +void pullsynapses_2StateFromDevice() { + pullinSynsynapses_2FromDevice(); +} + + +// ------------------------------------------------------------------------ +// helper getter functions +// ------------------------------------------------------------------------ +unsigned int* getneurongroupCurrentSpikes(unsigned int batch) { + return (glbSpkneurongroup); +} + +unsigned int& getneurongroupCurrentSpikeCount(unsigned int batch) { + return glbSpkCntneurongroup[0]; +} + +int32_t* getCurrentineurongroup(unsigned int batch) { + return ineurongroup; +} + +double* getCurrentVneurongroup(unsigned int batch) { + return Vneurongroup; +} + +double* getCurrentg_PN_iKCneurongroup(unsigned int batch) { + return g_PN_iKCneurongroup; +} + +double* getCurrenthneurongroup(unsigned int batch) { + return hneurongroup; +} + +double* getCurrentmneurongroup(unsigned int batch) { + return mneurongroup; +} + +double* getCurrentnneurongroup(unsigned int batch) { + return nneurongroup; +} + +double* getCurrentlastspikeneurongroup(unsigned int batch) { + return lastspikeneurongroup; +} + +char* getCurrentnot_refractoryneurongroup(unsigned int batch) { + return not_refractoryneurongroup; +} + +unsigned int* getneurongroup_1CurrentSpikes(unsigned int batch) { + return (glbSpkneurongroup_1); +} + +unsigned int& getneurongroup_1CurrentSpikeCount(unsigned int batch) { + return glbSpkCntneurongroup_1[0]; +} + +int32_t* getCurrentineurongroup_1(unsigned int batch) { + return ineurongroup_1; +} + +double* getCurrentVneurongroup_1(unsigned int batch) { + return Vneurongroup_1; +} + +double* getCurrentg_eKC_eKCneurongroup_1(unsigned int batch) { + return g_eKC_eKCneurongroup_1; +} + +double* getCurrentg_iKC_eKCneurongroup_1(unsigned int batch) { + return g_iKC_eKCneurongroup_1; +} + +double* getCurrenthneurongroup_1(unsigned int batch) { + return 
hneurongroup_1; +} + +double* getCurrentmneurongroup_1(unsigned int batch) { + return mneurongroup_1; +} + +double* getCurrentnneurongroup_1(unsigned int batch) { + return nneurongroup_1; +} + +double* getCurrentlastspikeneurongroup_1(unsigned int batch) { + return lastspikeneurongroup_1; +} + +char* getCurrentnot_refractoryneurongroup_1(unsigned int batch) { + return not_refractoryneurongroup_1; +} + +unsigned int* getspikegeneratorgroupCurrentSpikes(unsigned int batch) { + return (glbSpkspikegeneratorgroup); +} + +unsigned int& getspikegeneratorgroupCurrentSpikeCount(unsigned int batch) { + return glbSpkCntspikegeneratorgroup[0]; +} + + +void copyStateToDevice(bool uninitialisedOnly) { + pushneurongroupStateToDevice(uninitialisedOnly); + pushneurongroup_1StateToDevice(uninitialisedOnly); + pushspikegeneratorgroupStateToDevice(uninitialisedOnly); + pushsynapsesStateToDevice(uninitialisedOnly); + pushsynapses_1StateToDevice(uninitialisedOnly); + pushsynapses_2StateToDevice(uninitialisedOnly); +} + +void copyConnectivityToDevice(bool uninitialisedOnly) { + pushsynapsesConnectivityToDevice(uninitialisedOnly); + pushsynapses_1ConnectivityToDevice(uninitialisedOnly); + pushsynapses_2ConnectivityToDevice(uninitialisedOnly); +} + +void copyStateFromDevice() { + pullneurongroupStateFromDevice(); + pullneurongroup_1StateFromDevice(); + pullspikegeneratorgroupStateFromDevice(); + pullsynapsesStateFromDevice(); + pullsynapses_1StateFromDevice(); + pullsynapses_2StateFromDevice(); +} + +void copyCurrentSpikesFromDevice() { + pullneurongroupCurrentSpikesFromDevice(); + pullneurongroup_1CurrentSpikesFromDevice(); + pullspikegeneratorgroupCurrentSpikesFromDevice(); +} + +void copyCurrentSpikeEventsFromDevice() { +} + +void allocateMem() { + int deviceID; + CHECK_CUDA_ERRORS(cudaDeviceGetByPCIBusId(&deviceID, "0000:3B:00.0")); + CHECK_CUDA_ERRORS(cudaSetDevice(deviceID)); + + // ------------------------------------------------------------------------ + // global variables + // ------------------------------------------------------------------------ + + // ------------------------------------------------------------------------ + // timers + // ------------------------------------------------------------------------ + // ------------------------------------------------------------------------ + // local neuron groups + // ------------------------------------------------------------------------ + CHECK_CUDA_ERRORS(cudaHostAlloc(&glbSpkCntneurongroup, 1 * sizeof(unsigned int), cudaHostAllocPortable)); + CHECK_CUDA_ERRORS(cudaMalloc(&d_glbSpkCntneurongroup, 1 * sizeof(unsigned int))); + CHECK_CUDA_ERRORS(cudaHostAlloc(&glbSpkneurongroup, 2500 * sizeof(unsigned int), cudaHostAllocPortable)); + CHECK_CUDA_ERRORS(cudaMalloc(&d_glbSpkneurongroup, 2500 * sizeof(unsigned int))); + CHECK_CUDA_ERRORS(cudaHostAlloc(&ineurongroup, 2500 * sizeof(int32_t), cudaHostAllocPortable)); + CHECK_CUDA_ERRORS(cudaMalloc(&d_ineurongroup, 2500 * sizeof(int32_t))); + CHECK_CUDA_ERRORS(cudaHostAlloc(&Vneurongroup, 2500 * sizeof(double), cudaHostAllocPortable)); + CHECK_CUDA_ERRORS(cudaMalloc(&d_Vneurongroup, 2500 * sizeof(double))); + CHECK_CUDA_ERRORS(cudaHostAlloc(&g_PN_iKCneurongroup, 2500 * sizeof(double), cudaHostAllocPortable)); + CHECK_CUDA_ERRORS(cudaMalloc(&d_g_PN_iKCneurongroup, 2500 * sizeof(double))); + CHECK_CUDA_ERRORS(cudaHostAlloc(&hneurongroup, 2500 * sizeof(double), cudaHostAllocPortable)); + CHECK_CUDA_ERRORS(cudaMalloc(&d_hneurongroup, 2500 * sizeof(double))); + CHECK_CUDA_ERRORS(cudaHostAlloc(&mneurongroup, 2500 
* sizeof(double), cudaHostAllocPortable)); + CHECK_CUDA_ERRORS(cudaMalloc(&d_mneurongroup, 2500 * sizeof(double))); + CHECK_CUDA_ERRORS(cudaHostAlloc(&nneurongroup, 2500 * sizeof(double), cudaHostAllocPortable)); + CHECK_CUDA_ERRORS(cudaMalloc(&d_nneurongroup, 2500 * sizeof(double))); + CHECK_CUDA_ERRORS(cudaHostAlloc(&lastspikeneurongroup, 2500 * sizeof(double), cudaHostAllocPortable)); + CHECK_CUDA_ERRORS(cudaMalloc(&d_lastspikeneurongroup, 2500 * sizeof(double))); + CHECK_CUDA_ERRORS(cudaHostAlloc(&not_refractoryneurongroup, 2500 * sizeof(char), cudaHostAllocPortable)); + CHECK_CUDA_ERRORS(cudaMalloc(&d_not_refractoryneurongroup, 2500 * sizeof(char))); + CHECK_CUDA_ERRORS(cudaHostAlloc(&glbSpkCntneurongroup_1, 1 * sizeof(unsigned int), cudaHostAllocPortable)); + CHECK_CUDA_ERRORS(cudaMalloc(&d_glbSpkCntneurongroup_1, 1 * sizeof(unsigned int))); + CHECK_CUDA_ERRORS(cudaHostAlloc(&glbSpkneurongroup_1, 100 * sizeof(unsigned int), cudaHostAllocPortable)); + CHECK_CUDA_ERRORS(cudaMalloc(&d_glbSpkneurongroup_1, 100 * sizeof(unsigned int))); + CHECK_CUDA_ERRORS(cudaHostAlloc(&ineurongroup_1, 100 * sizeof(int32_t), cudaHostAllocPortable)); + CHECK_CUDA_ERRORS(cudaMalloc(&d_ineurongroup_1, 100 * sizeof(int32_t))); + CHECK_CUDA_ERRORS(cudaHostAlloc(&Vneurongroup_1, 100 * sizeof(double), cudaHostAllocPortable)); + CHECK_CUDA_ERRORS(cudaMalloc(&d_Vneurongroup_1, 100 * sizeof(double))); + CHECK_CUDA_ERRORS(cudaHostAlloc(&g_eKC_eKCneurongroup_1, 100 * sizeof(double), cudaHostAllocPortable)); + CHECK_CUDA_ERRORS(cudaMalloc(&d_g_eKC_eKCneurongroup_1, 100 * sizeof(double))); + CHECK_CUDA_ERRORS(cudaHostAlloc(&g_iKC_eKCneurongroup_1, 100 * sizeof(double), cudaHostAllocPortable)); + CHECK_CUDA_ERRORS(cudaMalloc(&d_g_iKC_eKCneurongroup_1, 100 * sizeof(double))); + CHECK_CUDA_ERRORS(cudaHostAlloc(&hneurongroup_1, 100 * sizeof(double), cudaHostAllocPortable)); + CHECK_CUDA_ERRORS(cudaMalloc(&d_hneurongroup_1, 100 * sizeof(double))); + CHECK_CUDA_ERRORS(cudaHostAlloc(&mneurongroup_1, 100 * sizeof(double), cudaHostAllocPortable)); + CHECK_CUDA_ERRORS(cudaMalloc(&d_mneurongroup_1, 100 * sizeof(double))); + CHECK_CUDA_ERRORS(cudaHostAlloc(&nneurongroup_1, 100 * sizeof(double), cudaHostAllocPortable)); + CHECK_CUDA_ERRORS(cudaMalloc(&d_nneurongroup_1, 100 * sizeof(double))); + CHECK_CUDA_ERRORS(cudaHostAlloc(&lastspikeneurongroup_1, 100 * sizeof(double), cudaHostAllocPortable)); + CHECK_CUDA_ERRORS(cudaMalloc(&d_lastspikeneurongroup_1, 100 * sizeof(double))); + CHECK_CUDA_ERRORS(cudaHostAlloc(&not_refractoryneurongroup_1, 100 * sizeof(char), cudaHostAllocPortable)); + CHECK_CUDA_ERRORS(cudaMalloc(&d_not_refractoryneurongroup_1, 100 * sizeof(char))); + CHECK_CUDA_ERRORS(cudaHostAlloc(&glbSpkCntspikegeneratorgroup, 1 * sizeof(unsigned int), cudaHostAllocPortable)); + CHECK_CUDA_ERRORS(cudaMalloc(&d_glbSpkCntspikegeneratorgroup, 1 * sizeof(unsigned int))); + CHECK_CUDA_ERRORS(cudaHostAlloc(&glbSpkspikegeneratorgroup, 100 * sizeof(unsigned int), cudaHostAllocPortable)); + CHECK_CUDA_ERRORS(cudaMalloc(&d_glbSpkspikegeneratorgroup, 100 * sizeof(unsigned int))); + + // ------------------------------------------------------------------------ + // custom update variables + // ------------------------------------------------------------------------ + + // ------------------------------------------------------------------------ + // postsynaptic variables + // ------------------------------------------------------------------------ + CHECK_CUDA_ERRORS(cudaHostAlloc(&inSynsynapses, 2500 * sizeof(double), cudaHostAllocPortable)); + 
CHECK_CUDA_ERRORS(cudaMalloc(&d_inSynsynapses, 2500 * sizeof(double))); + CHECK_CUDA_ERRORS(cudaHostAlloc(&inSynsynapses_1, 100 * sizeof(double), cudaHostAllocPortable)); + CHECK_CUDA_ERRORS(cudaMalloc(&d_inSynsynapses_1, 100 * sizeof(double))); + CHECK_CUDA_ERRORS(cudaHostAlloc(&inSynsynapses_2, 100 * sizeof(double), cudaHostAllocPortable)); + CHECK_CUDA_ERRORS(cudaMalloc(&d_inSynsynapses_2, 100 * sizeof(double))); + + // ------------------------------------------------------------------------ + // synapse connectivity + // ------------------------------------------------------------------------ + CHECK_CUDA_ERRORS(cudaHostAlloc(&rowLengthsynapses, 100 * sizeof(unsigned int), cudaHostAllocPortable)); + CHECK_CUDA_ERRORS(cudaMalloc(&d_rowLengthsynapses, 100 * sizeof(unsigned int))); + CHECK_CUDA_ERRORS(cudaHostAlloc(&indsynapses, 42100 * sizeof(uint32_t), cudaHostAllocPortable)); + CHECK_CUDA_ERRORS(cudaMalloc(&d_indsynapses, 42100 * sizeof(uint32_t))); + CHECK_CUDA_ERRORS(cudaHostAlloc(&rowLengthsynapses_1, 2500 * sizeof(unsigned int), cudaHostAllocPortable)); + CHECK_CUDA_ERRORS(cudaMalloc(&d_rowLengthsynapses_1, 2500 * sizeof(unsigned int))); + CHECK_CUDA_ERRORS(cudaHostAlloc(&indsynapses_1, 250000 * sizeof(uint32_t), cudaHostAllocPortable)); + CHECK_CUDA_ERRORS(cudaMalloc(&d_indsynapses_1, 250000 * sizeof(uint32_t))); + CHECK_CUDA_ERRORS(cudaMalloc(&d_colLengthsynapses_1, 100 * sizeof(unsigned int))); + CHECK_CUDA_ERRORS(cudaMalloc(&d_remapsynapses_1, 250000 * sizeof(unsigned int))); + CHECK_CUDA_ERRORS(cudaHostAlloc(&rowLengthsynapses_2, 100 * sizeof(unsigned int), cudaHostAllocPortable)); + CHECK_CUDA_ERRORS(cudaMalloc(&d_rowLengthsynapses_2, 100 * sizeof(unsigned int))); + CHECK_CUDA_ERRORS(cudaHostAlloc(&indsynapses_2, 10000 * sizeof(uint32_t), cudaHostAllocPortable)); + CHECK_CUDA_ERRORS(cudaMalloc(&d_indsynapses_2, 10000 * sizeof(uint32_t))); + + // ------------------------------------------------------------------------ + // synapse variables + // ------------------------------------------------------------------------ + CHECK_CUDA_ERRORS(cudaHostAlloc(&weightsynapses, 42100 * sizeof(double), cudaHostAllocPortable)); + CHECK_CUDA_ERRORS(cudaMalloc(&d_weightsynapses, 42100 * sizeof(double))); + CHECK_CUDA_ERRORS(cudaHostAlloc(&lastupdatesynapses_1, 250000 * sizeof(double), cudaHostAllocPortable)); + CHECK_CUDA_ERRORS(cudaMalloc(&d_lastupdatesynapses_1, 250000 * sizeof(double))); + CHECK_CUDA_ERRORS(cudaHostAlloc(&Apostsynapses_1, 250000 * sizeof(double), cudaHostAllocPortable)); + CHECK_CUDA_ERRORS(cudaMalloc(&d_Apostsynapses_1, 250000 * sizeof(double))); + CHECK_CUDA_ERRORS(cudaHostAlloc(&g_rawsynapses_1, 250000 * sizeof(double), cudaHostAllocPortable)); + CHECK_CUDA_ERRORS(cudaMalloc(&d_g_rawsynapses_1, 250000 * sizeof(double))); + CHECK_CUDA_ERRORS(cudaHostAlloc(&Apresynapses_1, 250000 * sizeof(double), cudaHostAllocPortable)); + CHECK_CUDA_ERRORS(cudaMalloc(&d_Apresynapses_1, 250000 * sizeof(double))); + + pushMergedNeuronInitGroup0ToDevice(0, d_glbSpkCntspikegeneratorgroup, d_glbSpkspikegeneratorgroup, 100); + pushMergedNeuronInitGroup1ToDevice(0, d_glbSpkCntneurongroup_1, d_glbSpkneurongroup_1, d_inSynsynapses_1, d_inSynsynapses_2, 100); + pushMergedNeuronInitGroup2ToDevice(0, d_glbSpkCntneurongroup, d_glbSpkneurongroup, d_inSynsynapses, 2500); + pushMergedSynapseSparseInitGroup0ToDevice(0, d_rowLengthsynapses_1, d_indsynapses_1, d_colLengthsynapses_1, d_remapsynapses_1, 100, 2500, 2500, 100); + pushMergedNeuronUpdateGroup0ToDevice(0, d_glbSpkCntspikegeneratorgroup, 
d_glbSpkspikegeneratorgroup, 100); + pushMergedNeuronUpdateGroup1ToDevice(0, d_glbSpkCntneurongroup_1, d_glbSpkneurongroup_1, d_ineurongroup_1, d_Vneurongroup_1, d_g_eKC_eKCneurongroup_1, d_g_iKC_eKCneurongroup_1, d_hneurongroup_1, d_mneurongroup_1, d_nneurongroup_1, d_lastspikeneurongroup_1, d_not_refractoryneurongroup_1, d_inSynsynapses_2, d_inSynsynapses_1, 100); + pushMergedNeuronUpdateGroup2ToDevice(0, d_glbSpkCntneurongroup, d_glbSpkneurongroup, d_ineurongroup, d_Vneurongroup, d_g_PN_iKCneurongroup, d_hneurongroup, d_mneurongroup, d_nneurongroup, d_lastspikeneurongroup, d_not_refractoryneurongroup, d_inSynsynapses, 2500); + pushMergedPresynapticUpdateGroup0ToDevice(0, d_inSynsynapses_2, d_glbSpkCntneurongroup_1, d_glbSpkneurongroup_1, d_rowLengthsynapses_2, d_indsynapses_2, 100, 100, 100); + pushMergedPresynapticUpdateGroup1ToDevice(0, d_inSynsynapses_1, d_glbSpkCntneurongroup, d_glbSpkneurongroup, d_rowLengthsynapses_1, d_indsynapses_1, d_lastupdatesynapses_1, d_Apostsynapses_1, d_g_rawsynapses_1, d_Apresynapses_1, 100, 2500, 100); + pushMergedPresynapticUpdateGroup2ToDevice(0, d_inSynsynapses, d_glbSpkCntspikegeneratorgroup, d_glbSpkspikegeneratorgroup, d_rowLengthsynapses, d_indsynapses, d_weightsynapses, 421, 100, 2500); + pushMergedPostsynapticUpdateGroup0ToDevice(0, d_glbSpkCntneurongroup_1, d_glbSpkneurongroup_1, d_rowLengthsynapses_1, d_indsynapses_1, d_colLengthsynapses_1, d_remapsynapses_1, d_lastupdatesynapses_1, d_Apostsynapses_1, d_g_rawsynapses_1, d_Apresynapses_1, 100, 2500, 2500, 100); + pushMergedNeuronSpikeQueueUpdateGroup0ToDevice(0, d_glbSpkCntneurongroup); + pushMergedNeuronSpikeQueueUpdateGroup0ToDevice(1, d_glbSpkCntneurongroup_1); + pushMergedNeuronSpikeQueueUpdateGroup0ToDevice(2, d_glbSpkCntspikegeneratorgroup); +} + +void freeMem() { + // ------------------------------------------------------------------------ + // global variables + // ------------------------------------------------------------------------ + + // ------------------------------------------------------------------------ + // timers + // ------------------------------------------------------------------------ + // ------------------------------------------------------------------------ + // local neuron groups + // ------------------------------------------------------------------------ + CHECK_CUDA_ERRORS(cudaFreeHost(glbSpkCntneurongroup)); + CHECK_CUDA_ERRORS(cudaFree(d_glbSpkCntneurongroup)); + CHECK_CUDA_ERRORS(cudaFreeHost(glbSpkneurongroup)); + CHECK_CUDA_ERRORS(cudaFree(d_glbSpkneurongroup)); + CHECK_CUDA_ERRORS(cudaFreeHost(ineurongroup)); + CHECK_CUDA_ERRORS(cudaFree(d_ineurongroup)); + CHECK_CUDA_ERRORS(cudaFreeHost(Vneurongroup)); + CHECK_CUDA_ERRORS(cudaFree(d_Vneurongroup)); + CHECK_CUDA_ERRORS(cudaFreeHost(g_PN_iKCneurongroup)); + CHECK_CUDA_ERRORS(cudaFree(d_g_PN_iKCneurongroup)); + CHECK_CUDA_ERRORS(cudaFreeHost(hneurongroup)); + CHECK_CUDA_ERRORS(cudaFree(d_hneurongroup)); + CHECK_CUDA_ERRORS(cudaFreeHost(mneurongroup)); + CHECK_CUDA_ERRORS(cudaFree(d_mneurongroup)); + CHECK_CUDA_ERRORS(cudaFreeHost(nneurongroup)); + CHECK_CUDA_ERRORS(cudaFree(d_nneurongroup)); + CHECK_CUDA_ERRORS(cudaFreeHost(lastspikeneurongroup)); + CHECK_CUDA_ERRORS(cudaFree(d_lastspikeneurongroup)); + CHECK_CUDA_ERRORS(cudaFreeHost(not_refractoryneurongroup)); + CHECK_CUDA_ERRORS(cudaFree(d_not_refractoryneurongroup)); + CHECK_CUDA_ERRORS(cudaFreeHost(glbSpkCntneurongroup_1)); + CHECK_CUDA_ERRORS(cudaFree(d_glbSpkCntneurongroup_1)); + CHECK_CUDA_ERRORS(cudaFreeHost(glbSpkneurongroup_1)); + 
CHECK_CUDA_ERRORS(cudaFree(d_glbSpkneurongroup_1)); + CHECK_CUDA_ERRORS(cudaFreeHost(ineurongroup_1)); + CHECK_CUDA_ERRORS(cudaFree(d_ineurongroup_1)); + CHECK_CUDA_ERRORS(cudaFreeHost(Vneurongroup_1)); + CHECK_CUDA_ERRORS(cudaFree(d_Vneurongroup_1)); + CHECK_CUDA_ERRORS(cudaFreeHost(g_eKC_eKCneurongroup_1)); + CHECK_CUDA_ERRORS(cudaFree(d_g_eKC_eKCneurongroup_1)); + CHECK_CUDA_ERRORS(cudaFreeHost(g_iKC_eKCneurongroup_1)); + CHECK_CUDA_ERRORS(cudaFree(d_g_iKC_eKCneurongroup_1)); + CHECK_CUDA_ERRORS(cudaFreeHost(hneurongroup_1)); + CHECK_CUDA_ERRORS(cudaFree(d_hneurongroup_1)); + CHECK_CUDA_ERRORS(cudaFreeHost(mneurongroup_1)); + CHECK_CUDA_ERRORS(cudaFree(d_mneurongroup_1)); + CHECK_CUDA_ERRORS(cudaFreeHost(nneurongroup_1)); + CHECK_CUDA_ERRORS(cudaFree(d_nneurongroup_1)); + CHECK_CUDA_ERRORS(cudaFreeHost(lastspikeneurongroup_1)); + CHECK_CUDA_ERRORS(cudaFree(d_lastspikeneurongroup_1)); + CHECK_CUDA_ERRORS(cudaFreeHost(not_refractoryneurongroup_1)); + CHECK_CUDA_ERRORS(cudaFree(d_not_refractoryneurongroup_1)); + CHECK_CUDA_ERRORS(cudaFreeHost(glbSpkCntspikegeneratorgroup)); + CHECK_CUDA_ERRORS(cudaFree(d_glbSpkCntspikegeneratorgroup)); + CHECK_CUDA_ERRORS(cudaFreeHost(glbSpkspikegeneratorgroup)); + CHECK_CUDA_ERRORS(cudaFree(d_glbSpkspikegeneratorgroup)); + + // ------------------------------------------------------------------------ + // custom update variables + // ------------------------------------------------------------------------ + + // ------------------------------------------------------------------------ + // postsynaptic variables + // ------------------------------------------------------------------------ + CHECK_CUDA_ERRORS(cudaFreeHost(inSynsynapses)); + CHECK_CUDA_ERRORS(cudaFree(d_inSynsynapses)); + CHECK_CUDA_ERRORS(cudaFreeHost(inSynsynapses_1)); + CHECK_CUDA_ERRORS(cudaFree(d_inSynsynapses_1)); + CHECK_CUDA_ERRORS(cudaFreeHost(inSynsynapses_2)); + CHECK_CUDA_ERRORS(cudaFree(d_inSynsynapses_2)); + + // ------------------------------------------------------------------------ + // synapse connectivity + // ------------------------------------------------------------------------ + CHECK_CUDA_ERRORS(cudaFreeHost(rowLengthsynapses)); + CHECK_CUDA_ERRORS(cudaFree(d_rowLengthsynapses)); + CHECK_CUDA_ERRORS(cudaFreeHost(indsynapses)); + CHECK_CUDA_ERRORS(cudaFree(d_indsynapses)); + CHECK_CUDA_ERRORS(cudaFreeHost(rowLengthsynapses_1)); + CHECK_CUDA_ERRORS(cudaFree(d_rowLengthsynapses_1)); + CHECK_CUDA_ERRORS(cudaFreeHost(indsynapses_1)); + CHECK_CUDA_ERRORS(cudaFree(d_indsynapses_1)); + CHECK_CUDA_ERRORS(cudaFree(d_colLengthsynapses_1)); + CHECK_CUDA_ERRORS(cudaFree(d_remapsynapses_1)); + CHECK_CUDA_ERRORS(cudaFreeHost(rowLengthsynapses_2)); + CHECK_CUDA_ERRORS(cudaFree(d_rowLengthsynapses_2)); + CHECK_CUDA_ERRORS(cudaFreeHost(indsynapses_2)); + CHECK_CUDA_ERRORS(cudaFree(d_indsynapses_2)); + + // ------------------------------------------------------------------------ + // synapse variables + // ------------------------------------------------------------------------ + CHECK_CUDA_ERRORS(cudaFreeHost(weightsynapses)); + CHECK_CUDA_ERRORS(cudaFree(d_weightsynapses)); + CHECK_CUDA_ERRORS(cudaFreeHost(lastupdatesynapses_1)); + CHECK_CUDA_ERRORS(cudaFree(d_lastupdatesynapses_1)); + CHECK_CUDA_ERRORS(cudaFreeHost(Apostsynapses_1)); + CHECK_CUDA_ERRORS(cudaFree(d_Apostsynapses_1)); + CHECK_CUDA_ERRORS(cudaFreeHost(g_rawsynapses_1)); + CHECK_CUDA_ERRORS(cudaFree(d_g_rawsynapses_1)); + CHECK_CUDA_ERRORS(cudaFreeHost(Apresynapses_1)); + CHECK_CUDA_ERRORS(cudaFree(d_Apresynapses_1)); + 
+} + +size_t getFreeDeviceMemBytes() { + size_t free; + size_t total; + CHECK_CUDA_ERRORS(cudaMemGetInfo(&free, &total)); + return free; +} + +void stepTime() { + updateSynapses(t); + updateNeurons(t); + iT++; + t = iT*DT; +} + diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/runner.d b/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/runner.d new file mode 100644 index 00000000..c70ccb7c --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/runner.d @@ -0,0 +1,263 @@ +runner.o : runner.cc \ + /usr/include/stdc-predef.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/cuda_runtime.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/host_config.h \ + /usr/include/features.h \ + /usr/include/x86_64-linux-gnu/sys/cdefs.h \ + /usr/include/x86_64-linux-gnu/bits/wordsize.h \ + /usr/include/x86_64-linux-gnu/bits/long-double.h \ + /usr/include/x86_64-linux-gnu/gnu/stubs.h \ + /usr/include/x86_64-linux-gnu/gnu/stubs-64.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/builtin_types.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/device_types.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/host_defines.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/driver_types.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/vector_types.h \ + /usr/lib/gcc/x86_64-linux-gnu/9/include/limits.h \ + /usr/lib/gcc/x86_64-linux-gnu/9/include/syslimits.h \ + /usr/include/limits.h \ + /usr/include/x86_64-linux-gnu/bits/libc-header-start.h \ + /usr/include/x86_64-linux-gnu/bits/posix1_lim.h \ + /usr/include/x86_64-linux-gnu/bits/local_lim.h \ + /usr/include/linux/limits.h \ + /usr/include/x86_64-linux-gnu/bits/posix2_lim.h \ + /usr/include/x86_64-linux-gnu/bits/xopen_lim.h \ + /usr/include/x86_64-linux-gnu/bits/uio_lim.h \ + /usr/lib/gcc/x86_64-linux-gnu/9/include/stddef.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/surface_types.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/texture_types.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/library_types.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/channel_descriptor.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/cuda_runtime_api.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/cuda_device_runtime_api.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/driver_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/vector_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/vector_functions.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/common_functions.h \ + /usr/include/string.h \ + /usr/include/x86_64-linux-gnu/bits/types/locale_t.h \ + /usr/include/x86_64-linux-gnu/bits/types/__locale_t.h \ + /usr/include/strings.h \ + /usr/include/time.h \ + /usr/include/x86_64-linux-gnu/bits/time.h \ + /usr/include/x86_64-linux-gnu/bits/types.h \ + /usr/include/x86_64-linux-gnu/bits/timesize.h \ + /usr/include/x86_64-linux-gnu/bits/typesizes.h \ + /usr/include/x86_64-linux-gnu/bits/time64.h \ + 
/usr/include/x86_64-linux-gnu/bits/timex.h \ + /usr/include/x86_64-linux-gnu/bits/types/struct_timeval.h \ + /usr/include/x86_64-linux-gnu/bits/types/clock_t.h \ + /usr/include/x86_64-linux-gnu/bits/types/time_t.h \ + /usr/include/x86_64-linux-gnu/bits/types/struct_tm.h \ + /usr/include/x86_64-linux-gnu/bits/types/struct_timespec.h \ + /usr/include/x86_64-linux-gnu/bits/endian.h \ + /usr/include/x86_64-linux-gnu/bits/endianness.h \ + /usr/include/x86_64-linux-gnu/bits/types/clockid_t.h \ + /usr/include/x86_64-linux-gnu/bits/types/timer_t.h \ + /usr/include/x86_64-linux-gnu/bits/types/struct_itimerspec.h \ + /usr/include/c++/9/new \ + /usr/include/x86_64-linux-gnu/c++/9/bits/c++config.h \ + /usr/include/x86_64-linux-gnu/c++/9/bits/os_defines.h \ + /usr/include/x86_64-linux-gnu/c++/9/bits/cpu_defines.h \ + /usr/include/c++/9/exception \ + /usr/include/c++/9/bits/exception.h \ + /usr/include/c++/9/bits/exception_ptr.h \ + /usr/include/c++/9/bits/exception_defines.h \ + /usr/include/c++/9/bits/cxxabi_init_exception.h \ + /usr/include/c++/9/typeinfo \ + /usr/include/c++/9/bits/hash_bytes.h \ + /usr/include/c++/9/bits/nested_exception.h \ + /usr/include/c++/9/bits/move.h \ + /usr/include/c++/9/bits/concept_check.h \ + /usr/include/c++/9/type_traits \ + /usr/include/stdio.h \ + /usr/lib/gcc/x86_64-linux-gnu/9/include/stdarg.h \ + /usr/include/x86_64-linux-gnu/bits/types/__fpos_t.h \ + /usr/include/x86_64-linux-gnu/bits/types/__mbstate_t.h \ + /usr/include/x86_64-linux-gnu/bits/types/__fpos64_t.h \ + /usr/include/x86_64-linux-gnu/bits/types/__FILE.h \ + /usr/include/x86_64-linux-gnu/bits/types/FILE.h \ + /usr/include/x86_64-linux-gnu/bits/types/struct_FILE.h \ + /usr/include/x86_64-linux-gnu/bits/types/cookie_io_functions_t.h \ + /usr/include/x86_64-linux-gnu/bits/stdio_lim.h \ + /usr/include/x86_64-linux-gnu/bits/sys_errlist.h \ + /usr/include/c++/9/stdlib.h \ + /usr/include/c++/9/cstdlib \ + /usr/include/stdlib.h \ + /usr/include/x86_64-linux-gnu/bits/waitflags.h \ + /usr/include/x86_64-linux-gnu/bits/waitstatus.h \ + /usr/include/x86_64-linux-gnu/bits/floatn.h \ + /usr/include/x86_64-linux-gnu/bits/floatn-common.h \ + /usr/include/x86_64-linux-gnu/sys/types.h \ + /usr/include/x86_64-linux-gnu/bits/stdint-intn.h \ + /usr/include/endian.h \ + /usr/include/x86_64-linux-gnu/bits/byteswap.h \ + /usr/include/x86_64-linux-gnu/bits/uintn-identity.h \ + /usr/include/x86_64-linux-gnu/sys/select.h \ + /usr/include/x86_64-linux-gnu/bits/select.h \ + /usr/include/x86_64-linux-gnu/bits/types/sigset_t.h \ + /usr/include/x86_64-linux-gnu/bits/types/__sigset_t.h \ + /usr/include/x86_64-linux-gnu/bits/pthreadtypes.h \ + /usr/include/x86_64-linux-gnu/bits/thread-shared-types.h \ + /usr/include/x86_64-linux-gnu/bits/pthreadtypes-arch.h \ + /usr/include/x86_64-linux-gnu/bits/struct_mutex.h \ + /usr/include/x86_64-linux-gnu/bits/struct_rwlock.h \ + /usr/include/alloca.h \ + /usr/include/x86_64-linux-gnu/bits/stdlib-float.h \ + /usr/include/c++/9/bits/std_abs.h \ + /usr/include/assert.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/math_functions.h \ + /usr/include/c++/9/math.h \ + /usr/include/c++/9/cmath \ + /usr/include/c++/9/bits/cpp_type_traits.h \ + /usr/include/c++/9/ext/type_traits.h \ + /usr/include/math.h \ + /usr/include/x86_64-linux-gnu/bits/math-vector.h \ + /usr/include/x86_64-linux-gnu/bits/libm-simd-decl-stubs.h \ + /usr/include/x86_64-linux-gnu/bits/flt-eval-method.h \ + /usr/include/x86_64-linux-gnu/bits/fp-logb.h \ + /usr/include/x86_64-linux-gnu/bits/fp-fast.h \ 
+ /usr/include/x86_64-linux-gnu/bits/mathcalls-helper-functions.h \ + /usr/include/x86_64-linux-gnu/bits/mathcalls.h \ + /usr/include/x86_64-linux-gnu/bits/mathcalls-narrow.h \ + /usr/include/x86_64-linux-gnu/bits/iscanonical.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/math_functions.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/cuda_surface_types.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/cuda_texture_types.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/device_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/device_functions.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/device_atomic_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/device_atomic_functions.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/device_double_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/device_double_functions.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_20_atomic_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_20_atomic_functions.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_32_atomic_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_32_atomic_functions.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_35_atomic_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_60_atomic_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_60_atomic_functions.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_20_intrinsics.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_20_intrinsics.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_30_intrinsics.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_30_intrinsics.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_32_intrinsics.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_32_intrinsics.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_35_intrinsics.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_61_intrinsics.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_61_intrinsics.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/sm_70_rt.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/sm_70_rt.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/sm_80_rt.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/sm_80_rt.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/surface_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/texture_fetch_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/texture_indirect_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/surface_indirect_functions.h \ + 
/cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/device_launch_parameters.h \ + definitionsInternal.h \ + definitions.h \ + /usr/include/c++/9/random \ + /usr/include/c++/9/string \ + /usr/include/c++/9/bits/stringfwd.h \ + /usr/include/c++/9/bits/memoryfwd.h \ + /usr/include/c++/9/bits/char_traits.h \ + /usr/include/c++/9/bits/stl_algobase.h \ + /usr/include/c++/9/bits/functexcept.h \ + /usr/include/c++/9/ext/numeric_traits.h \ + /usr/include/c++/9/bits/stl_pair.h \ + /usr/include/c++/9/bits/stl_iterator_base_types.h \ + /usr/include/c++/9/bits/stl_iterator_base_funcs.h \ + /usr/include/c++/9/debug/assertions.h \ + /usr/include/c++/9/bits/stl_iterator.h \ + /usr/include/c++/9/bits/ptr_traits.h \ + /usr/include/c++/9/debug/debug.h \ + /usr/include/c++/9/bits/predefined_ops.h \ + /usr/include/c++/9/bits/postypes.h \ + /usr/include/c++/9/cwchar \ + /usr/include/wchar.h \ + /usr/include/x86_64-linux-gnu/bits/wchar.h \ + /usr/include/x86_64-linux-gnu/bits/types/wint_t.h \ + /usr/include/x86_64-linux-gnu/bits/types/mbstate_t.h \ + /usr/include/c++/9/cstdint \ + /usr/lib/gcc/x86_64-linux-gnu/9/include/stdint.h \ + /usr/include/stdint.h \ + /usr/include/x86_64-linux-gnu/bits/stdint-uintn.h \ + /usr/include/c++/9/bits/allocator.h \ + /usr/include/x86_64-linux-gnu/c++/9/bits/c++allocator.h \ + /usr/include/c++/9/ext/new_allocator.h \ + /usr/include/c++/9/bits/localefwd.h \ + /usr/include/x86_64-linux-gnu/c++/9/bits/c++locale.h \ + /usr/include/c++/9/clocale \ + /usr/include/locale.h \ + /usr/include/x86_64-linux-gnu/bits/locale.h \ + /usr/include/c++/9/iosfwd \ + /usr/include/c++/9/cctype \ + /usr/include/ctype.h \ + /usr/include/c++/9/bits/ostream_insert.h \ + /usr/include/c++/9/bits/cxxabi_forced.h \ + /usr/include/c++/9/bits/stl_function.h \ + /usr/include/c++/9/backward/binders.h \ + /usr/include/c++/9/bits/range_access.h \ + /usr/include/c++/9/initializer_list \ + /usr/include/c++/9/bits/basic_string.h \ + /usr/include/c++/9/ext/atomicity.h \ + /usr/include/x86_64-linux-gnu/c++/9/bits/gthr.h \ + /usr/include/x86_64-linux-gnu/c++/9/bits/gthr-default.h \ + /usr/include/pthread.h \ + /usr/include/sched.h \ + /usr/include/x86_64-linux-gnu/bits/sched.h \ + /usr/include/x86_64-linux-gnu/bits/types/struct_sched_param.h \ + /usr/include/x86_64-linux-gnu/bits/cpu-set.h \ + /usr/include/x86_64-linux-gnu/bits/setjmp.h \ + /usr/include/x86_64-linux-gnu/c++/9/bits/atomic_word.h \ + /usr/include/c++/9/ext/alloc_traits.h \ + /usr/include/c++/9/bits/alloc_traits.h \ + /usr/include/c++/9/ext/string_conversions.h \ + /usr/include/c++/9/cstdio \ + /usr/include/c++/9/cerrno \ + /usr/include/errno.h \ + /usr/include/x86_64-linux-gnu/bits/errno.h \ + /usr/include/linux/errno.h \ + /usr/include/x86_64-linux-gnu/asm/errno.h \ + /usr/include/asm-generic/errno.h \ + /usr/include/asm-generic/errno-base.h \ + /usr/include/x86_64-linux-gnu/bits/types/error_t.h \ + /usr/include/c++/9/bits/functional_hash.h \ + /usr/include/c++/9/bits/basic_string.tcc \ + /usr/include/c++/9/limits \ + /usr/include/c++/9/bits/random.h \ + /usr/include/c++/9/vector \ + /usr/include/c++/9/bits/stl_construct.h \ + /usr/include/c++/9/bits/stl_uninitialized.h \ + /usr/include/c++/9/bits/stl_vector.h \ + /usr/include/c++/9/bits/stl_bvector.h \ + /usr/include/c++/9/bits/vector.tcc \ + /usr/include/c++/9/bits/uniform_int_dist.h \ + /usr/include/x86_64-linux-gnu/c++/9/bits/opt_random.h \ + /usr/include/c++/9/bits/random.tcc \ + /usr/include/c++/9/numeric \ + /usr/include/c++/9/bits/stl_numeric.h \ + 
/usr/include/c++/9/stdexcept \ + /usr/include/c++/9/cassert \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_kernel.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_discrete.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_precalc.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_mrg32k3a.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_mtgp32_kernel.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/cuda.h \ + /usr/include/memory.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_mtgp32.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_philox4x32_x.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_globals.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_uniform.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_normal.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_normal_static.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_lognormal.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_poisson.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_discrete2.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/cuda_fp16.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/cuda_fp16.hpp \ + /usr/include/c++/9/utility \ + /usr/include/c++/9/bits/stl_relops.h diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/supportCode.h b/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/supportCode.h new file mode 100644 index 00000000..defc291e --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/supportCode.h @@ -0,0 +1,196 @@ +#pragma once + +// support code for neuron update groups +namespace NeuronUpdateSupportCode0 { + + template < typename T1, typename T2 > struct _higher_type; + template < > struct _higher_type<int,int> { typedef int type; }; + template < > struct _higher_type<int,long> { typedef long type; }; + template < > struct _higher_type<int,float> { typedef float type; }; + template < > struct _higher_type<int,double> { typedef double type; }; + template < > struct _higher_type<long,int> { typedef long type; }; + template < > struct _higher_type<long,long> { typedef long type; }; + template < > struct _higher_type<long,float> { typedef float type; }; + template < > struct _higher_type<long,double> { typedef double type; }; + template < > struct _higher_type<float,int> { typedef float type; }; + template < > struct _higher_type<float,long> { typedef float type; }; + template < > struct _higher_type<float,float> { typedef float type; }; + template < > struct _higher_type<float,double> { typedef double type; }; + template < > struct _higher_type<double,int> { typedef double type; }; + template < > struct _higher_type<double,long> { typedef double type; }; + template < > struct _higher_type<double,float> { typedef double type; }; + template < > struct _higher_type<double,double> { typedef double type; }; + template < typename T1, typename T2 > + SUPPORT_CODE_FUNC typename _higher_type<T1,T2>::type + _brian_mod(T1 x, T2 y) + {{ + return x-y*floor(1.0*x/y); + }} + template < typename T1, typename T2 > + SUPPORT_CODE_FUNC typename _higher_type<T1,T2>::type + _brian_floordiv(T1 x, T2 y) + {{ + return floor(1.0*x/y); + }} + #ifdef _MSC_VER + #define _brian_pow(x, y) (pow((double)(x), (y))) + #else + #define _brian_pow(x, y) (pow((x), (y))) + #endif + + + + +} + + +// support code for postsynaptic dynamics + +// support code for presynaptic update +namespace PresynapticUpdateSupportCode1 { + + SUPPORT_CODE_FUNC double _clip(const float value, const float a_min, const float a_max) + { + if (value < a_min) + return a_min; + if (value > a_max) + return a_max; + return value; + } + template < typename T1, typename T2 > struct _higher_type; + template < > struct _higher_type<int,int> { typedef int type; }; + template < > struct _higher_type<int,long> { typedef long type; }; + template < > struct _higher_type<int,float> { typedef float type; }; + template < > struct _higher_type<int,double> { typedef double type; }; + template < > struct _higher_type<long,int> { typedef long type; }; + template < > struct _higher_type<long,long> { typedef long type; }; + template < > struct _higher_type<long,float> { typedef float type; }; + template < > struct _higher_type<long,double> { typedef double type; }; + template < > struct _higher_type<float,int> { typedef float type; }; + template < > struct _higher_type<float,long> { typedef float type; }; + template < > struct _higher_type<float,float> { typedef float type; }; + template < > struct _higher_type<float,double> { typedef double type; }; + template < > struct _higher_type<double,int> { typedef double type; }; + template < > struct _higher_type<double,long> { typedef double type; }; + template < > struct _higher_type<double,float> { typedef double type; }; + template < > struct _higher_type<double,double> { typedef double type; }; + template < typename T1, typename T2 > + SUPPORT_CODE_FUNC typename _higher_type<T1,T2>::type + _brian_mod(T1 x, T2 y) + {{ + return x-y*floor(1.0*x/y); + }} + template < typename T1, typename T2 > + SUPPORT_CODE_FUNC typename _higher_type<T1,T2>::type + _brian_floordiv(T1 x, T2 y) + {{ + return floor(1.0*x/y); + }} + #ifdef _MSC_VER + #define _brian_pow(x, y) (pow((double)(x), (y))) + #else + #define _brian_pow(x, y) (pow((x), (y))) + #endif + + + + +} + +namespace PresynapticUpdateSupportCode0 { + + template < typename T1, typename T2 > struct _higher_type; + template < > struct _higher_type<int,int> { typedef int type; }; + template < > struct _higher_type<int,long> { typedef long type; }; + template < > struct _higher_type<int,float> { typedef float type; }; + template < > struct _higher_type<int,double> { typedef double type; }; + template < > struct _higher_type<long,int> { typedef long type; }; + template < > struct _higher_type<long,long> { typedef long type; }; + template < > struct _higher_type<long,float> { typedef float type; }; + template < > struct _higher_type<long,double> { typedef double type; }; + template < > struct _higher_type<float,int> { typedef float type; }; + template < > struct _higher_type<float,long> { typedef float type; }; + template < > struct _higher_type<float,float> { typedef float type; }; + template < > struct _higher_type<float,double> { typedef double type; }; + template < > struct _higher_type<double,int> { typedef double type; }; + template < > struct _higher_type<double,long> { typedef double type; }; + template < > struct _higher_type<double,float> { typedef double type; }; + template < > struct _higher_type<double,double> { typedef double type; }; + template < typename T1, typename T2 > + SUPPORT_CODE_FUNC typename _higher_type<T1,T2>::type + _brian_mod(T1 x, T2 y) + {{ + return x-y*floor(1.0*x/y); + }} + template < typename T1, typename T2 > + SUPPORT_CODE_FUNC typename _higher_type<T1,T2>::type + _brian_floordiv(T1 x, T2 y) + {{ + return floor(1.0*x/y); + }} + #ifdef _MSC_VER + #define _brian_pow(x, y) (pow((double)(x), (y))) + #else + #define _brian_pow(x, y) (pow((x), (y))) + #endif + + + + +} + + +// support code for postsynaptic update groups +namespace PostsynapticUpdateSupportCode0 { + + SUPPORT_CODE_FUNC double _clip(const float value, const float a_min, const float a_max) + { + if (value < a_min) + return a_min; + if (value > a_max) + return a_max; + return value; + } + template < typename T1, typename T2 > struct _higher_type; + template < > struct _higher_type<int,int> { typedef int type; }; + template < > struct _higher_type<int,long> { typedef long type; }; + template < > struct _higher_type<int,float> { typedef float type; }; + template < > struct _higher_type<int,double> { typedef double type; }; + template < > struct _higher_type<long,int> { typedef long type; }; + template < > struct _higher_type<long,long> { typedef long type; }; + template < > struct _higher_type<long,float> { typedef float type; }; + template < > struct _higher_type<long,double> { typedef double type; }; + template < > struct _higher_type<float,int> { typedef float type; }; + template < > struct _higher_type<float,long> { typedef float type; }; + template < > struct _higher_type<float,float> { typedef float type; }; + template < > struct _higher_type<float,double> { typedef double type; }; + template < > struct _higher_type<double,int> { typedef double type; }; + template < > struct _higher_type<double,long> { typedef double type; }; + template < > struct _higher_type<double,float> { typedef double type; }; + template < > struct _higher_type<double,double> { typedef double type; }; + template < typename T1, typename T2 > + SUPPORT_CODE_FUNC typename _higher_type<T1,T2>::type + _brian_mod(T1 x, T2 y) + {{ + return x-y*floor(1.0*x/y); + }} + template < typename T1, typename T2 > + SUPPORT_CODE_FUNC typename _higher_type<T1,T2>::type + _brian_floordiv(T1 x, T2 y) + {{ + return floor(1.0*x/y); + }} + #ifdef _MSC_VER + #define _brian_pow(x, y) (pow((double)(x), (y))) + #else + #define _brian_pow(x, y) (pow((x), (y))) + #endif + + + + +} + + +// support code for synapse dynamics update groups + diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/synapseUpdate.cc b/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/synapseUpdate.cc new file mode 100644 index 00000000..c8e073a2 --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/synapseUpdate.cc @@ -0,0 +1,258 @@ +#include "definitionsInternal.h" +#include "supportCode.h" + +struct MergedPresynapticUpdateGroup0 + { + double* inSyn; + unsigned int* srcSpkCnt; + unsigned int* srcSpk; + unsigned int* rowLength; + uint32_t* ind; + unsigned int rowStride; + unsigned int numSrcNeurons; + unsigned int numTrgNeurons; + +} +; +struct MergedPresynapticUpdateGroup1 + { + double* inSyn; + unsigned int* srcSpkCnt; + unsigned int* srcSpk; + unsigned int* rowLength; + uint32_t* ind; + double* lastupdate; + double* Apost; + double* g_raw; + double* Apre; + unsigned int rowStride; + unsigned int numSrcNeurons; + unsigned int numTrgNeurons; + +} +; +struct MergedPresynapticUpdateGroup2 + { + double* inSyn; + unsigned int* srcSpkCnt; + unsigned int* srcSpk; + unsigned int* rowLength; + uint32_t* ind; + double* weight; + unsigned int rowStride; + unsigned int numSrcNeurons; + unsigned int numTrgNeurons; + +} +; +struct MergedPostsynapticUpdateGroup0 + { + unsigned int* trgSpkCnt; + unsigned int* trgSpk; + unsigned int* rowLength; + uint32_t* ind; + unsigned int* colLength; + unsigned int* remap; + double* lastupdate; + double* Apost; + double* g_raw; + double* Apre; + unsigned int rowStride; + unsigned int colStride; + unsigned int numSrcNeurons; + unsigned int numTrgNeurons; + +} +; +__device__ __constant__ MergedPresynapticUpdateGroup0 
d_mergedPresynapticUpdateGroup0[1]; +void pushMergedPresynapticUpdateGroup0ToDevice(unsigned int idx, double* inSyn, unsigned int* srcSpkCnt, unsigned int* srcSpk, unsigned int* rowLength, uint32_t* ind, unsigned int rowStride, unsigned int numSrcNeurons, unsigned int numTrgNeurons) { + MergedPresynapticUpdateGroup0 group = {inSyn, srcSpkCnt, srcSpk, rowLength, ind, rowStride, numSrcNeurons, numTrgNeurons, }; + CHECK_CUDA_ERRORS(cudaMemcpyToSymbolAsync(d_mergedPresynapticUpdateGroup0, &group, sizeof(MergedPresynapticUpdateGroup0), idx * sizeof(MergedPresynapticUpdateGroup0))); +} +__device__ __constant__ MergedPresynapticUpdateGroup1 d_mergedPresynapticUpdateGroup1[1]; +void pushMergedPresynapticUpdateGroup1ToDevice(unsigned int idx, double* inSyn, unsigned int* srcSpkCnt, unsigned int* srcSpk, unsigned int* rowLength, uint32_t* ind, double* lastupdate, double* Apost, double* g_raw, double* Apre, unsigned int rowStride, unsigned int numSrcNeurons, unsigned int numTrgNeurons) { + MergedPresynapticUpdateGroup1 group = {inSyn, srcSpkCnt, srcSpk, rowLength, ind, lastupdate, Apost, g_raw, Apre, rowStride, numSrcNeurons, numTrgNeurons, }; + CHECK_CUDA_ERRORS(cudaMemcpyToSymbolAsync(d_mergedPresynapticUpdateGroup1, &group, sizeof(MergedPresynapticUpdateGroup1), idx * sizeof(MergedPresynapticUpdateGroup1))); +} +__device__ __constant__ MergedPresynapticUpdateGroup2 d_mergedPresynapticUpdateGroup2[1]; +void pushMergedPresynapticUpdateGroup2ToDevice(unsigned int idx, double* inSyn, unsigned int* srcSpkCnt, unsigned int* srcSpk, unsigned int* rowLength, uint32_t* ind, double* weight, unsigned int rowStride, unsigned int numSrcNeurons, unsigned int numTrgNeurons) { + MergedPresynapticUpdateGroup2 group = {inSyn, srcSpkCnt, srcSpk, rowLength, ind, weight, rowStride, numSrcNeurons, numTrgNeurons, }; + CHECK_CUDA_ERRORS(cudaMemcpyToSymbolAsync(d_mergedPresynapticUpdateGroup2, &group, sizeof(MergedPresynapticUpdateGroup2), idx * sizeof(MergedPresynapticUpdateGroup2))); +} +__device__ __constant__ MergedPostsynapticUpdateGroup0 d_mergedPostsynapticUpdateGroup0[1]; +void pushMergedPostsynapticUpdateGroup0ToDevice(unsigned int idx, unsigned int* trgSpkCnt, unsigned int* trgSpk, unsigned int* rowLength, uint32_t* ind, unsigned int* colLength, unsigned int* remap, double* lastupdate, double* Apost, double* g_raw, double* Apre, unsigned int rowStride, unsigned int colStride, unsigned int numSrcNeurons, unsigned int numTrgNeurons) { + MergedPostsynapticUpdateGroup0 group = {trgSpkCnt, trgSpk, rowLength, ind, colLength, remap, lastupdate, Apost, g_raw, Apre, rowStride, colStride, numSrcNeurons, numTrgNeurons, }; + CHECK_CUDA_ERRORS(cudaMemcpyToSymbolAsync(d_mergedPostsynapticUpdateGroup0, &group, sizeof(MergedPostsynapticUpdateGroup0), idx * sizeof(MergedPostsynapticUpdateGroup0))); +} +// ------------------------------------------------------------------------ +// merged extra global parameter functions +// ------------------------------------------------------------------------ +// ------------------------------------------------------------------------ +// merged extra global parameter functions +// ------------------------------------------------------------------------ +__device__ __constant__ unsigned int d_mergedPresynapticUpdateGroupStartID0[] = {0, }; +__device__ __constant__ unsigned int d_mergedPresynapticUpdateGroupStartID1[] = {128, }; +__device__ __constant__ unsigned int d_mergedPresynapticUpdateGroupStartID2[] = {256, }; +__device__ __constant__ unsigned int 
d_mergedPostsynapticUpdateGroupStartID0[] = {0, }; +extern "C" __global__ void updatePresynapticKernel(double t) + { + const unsigned int id = 32 * blockIdx.x + threadIdx.x; + __shared__ unsigned int shRowLength[32]; + __shared__ unsigned int shSpk[32]; + // merged0 + if(id < 128) { + struct MergedPresynapticUpdateGroup0 *group = &d_mergedPresynapticUpdateGroup0[0]; + const unsigned int lid = id - 0; + { + const unsigned int numSpikes = group->srcSpkCnt[0]; + const unsigned int numSpikeBlocks = (numSpikes + 32 - 1) / 32; + for (unsigned int r = 0; r < numSpikeBlocks; r++) { + const unsigned int numSpikesInBlock = (r == numSpikeBlocks - 1) ? ((numSpikes - 1) % 32) + 1 : 32; + __syncthreads(); + if (threadIdx.x < numSpikesInBlock) { + const unsigned int spk = group->srcSpk[(r * 32) + threadIdx.x]; + shSpk[threadIdx.x] = spk; + shRowLength[threadIdx.x] = group->rowLength[spk]; + } + __syncthreads(); + // loop through all incoming spikes + for (unsigned int j = 0; j < numSpikesInBlock; j++) { + // only work on existing neurons + if (lid < group->rowStride) { + using namespace PresynapticUpdateSupportCode0; + const unsigned int synAddress = (shSpk[j] * group->rowStride) + lid; + const unsigned int npost = shRowLength[j]; + if (lid < npost) { + const unsigned int ipost = group->ind[synAddress]; + atomicAdd(&group->inSyn[ipost], ((6.75000000000000044e-01) * (7.50000000000000098e-08)));} + } + } + } + } + + } + // merged1 + if(id >= 128 && id < 256) { + struct MergedPresynapticUpdateGroup1 *group = &d_mergedPresynapticUpdateGroup1[0]; + const unsigned int lid = id - 128; + { + const unsigned int numSpikes = group->srcSpkCnt[0]; + const unsigned int numSpikeBlocks = (numSpikes + 32 - 1) / 32; + for (unsigned int r = 0; r < numSpikeBlocks; r++) { + const unsigned int numSpikesInBlock = (r == numSpikeBlocks - 1) ? ((numSpikes - 1) % 32) + 1 : 32; + __syncthreads(); + if (threadIdx.x < numSpikesInBlock) { + const unsigned int spk = group->srcSpk[(r * 32) + threadIdx.x]; + shSpk[threadIdx.x] = spk; + shRowLength[threadIdx.x] = group->rowLength[spk]; + } + __syncthreads(); + // loop through all incoming spikes + for (unsigned int j = 0; j < numSpikesInBlock; j++) { + // only work on existing neurons + if (lid < group->rowStride) { + using namespace PresynapticUpdateSupportCode1; + const unsigned int synAddress = (shSpk[j] * group->rowStride) + lid; + const unsigned int npost = shRowLength[j]; + if (lid < npost) { + const unsigned int ipost = group->ind[synAddress]; + double _Apost = group->Apost[synAddress] * exp(1.0*(- (t - group->lastupdate[synAddress]))/(1.00000000000000002e-02)); + double _Apre = group->Apre[synAddress] * exp(1.0*(- (t - group->lastupdate[synAddress]))/(1.00000000000000002e-02)); + group->Apost[synAddress] = _Apost; + group->Apre[synAddress] = _Apre; + atomicAdd(&group->inSyn[ipost], group->g_raw[synAddress]); + group->Apre[synAddress] += (1.00000000000000017e-10); + group->g_raw[synAddress] = _clip(group->g_raw[synAddress] + group->Apost[synAddress], 0 * (1.00000000000000000e+00), (3.75000000000000049e-09)); + group->lastupdate[synAddress] = t;} + } + } + } + } + + } + // merged2 + if(id >= 256 && id < 704) { + struct MergedPresynapticUpdateGroup2 *group = &d_mergedPresynapticUpdateGroup2[0]; + const unsigned int lid = id - 256; + { + const unsigned int numSpikes = group->srcSpkCnt[0]; + const unsigned int numSpikeBlocks = (numSpikes + 32 - 1) / 32; + for (unsigned int r = 0; r < numSpikeBlocks; r++) { + const unsigned int numSpikesInBlock = (r == numSpikeBlocks - 1) ? 
((numSpikes - 1) % 32) + 1 : 32; + __syncthreads(); + if (threadIdx.x < numSpikesInBlock) { + const unsigned int spk = group->srcSpk[(r * 32) + threadIdx.x]; + shSpk[threadIdx.x] = spk; + shRowLength[threadIdx.x] = group->rowLength[spk]; + } + __syncthreads(); + // loop through all incoming spikes + for (unsigned int j = 0; j < numSpikesInBlock; j++) { + // only work on existing neurons + if (lid < group->rowStride) { + using namespace PresynapticUpdateSupportCode0; + const unsigned int synAddress = (shSpk[j] * group->rowStride) + lid; + const unsigned int npost = shRowLength[j]; + if (lid < npost) { + const unsigned int ipost = group->ind[synAddress]; + atomicAdd(&group->inSyn[ipost], ((6.75000000000000044e-01) * group->weight[synAddress]));} + } + } + } + } + + } +} +extern "C" __global__ void updatePostsynapticKernel(double t) + { + const unsigned int id = 32 * blockIdx.x + threadIdx.x; + __shared__ unsigned int shSpk[32]; + __shared__ unsigned int shColLength[32]; + // merged0 + if(id < 2528) { + struct MergedPostsynapticUpdateGroup0 *group = &d_mergedPostsynapticUpdateGroup0[0]; + const unsigned int lid = id - 0; + const unsigned int numSpikes = group->trgSpkCnt[0]; + const unsigned int numSpikeBlocks = (numSpikes + 31) / 32; + for (unsigned int r = 0; r < numSpikeBlocks; r++) { + const unsigned int numSpikesInBlock = (r == numSpikeBlocks - 1) ? ((numSpikes - 1) % 32) + 1 : 32; + if (threadIdx.x < numSpikesInBlock) { + const unsigned int spk = group->trgSpk[(r * 32) + threadIdx.x]; + shSpk[threadIdx.x] = spk; + shColLength[threadIdx.x] = group->colLength[spk]; + } + __syncthreads(); + // only work on existing neurons + if (lid < group->colStride) { + // loop through all incoming spikes for learning + for (unsigned int j = 0; j < numSpikesInBlock; j++) { + if (lid < shColLength[j]) { + const unsigned int synAddress = group->remap[(shSpk[j] * group->colStride) + lid]; + const unsigned int ipre = synAddress / group->rowStride; + using namespace PostsynapticUpdateSupportCode0; + double _Apost = group->Apost[synAddress] * exp(1.0*(- (t - group->lastupdate[synAddress]))/(1.00000000000000002e-02)); + double _Apre = group->Apre[synAddress] * exp(1.0*(- (t - group->lastupdate[synAddress]))/(1.00000000000000002e-02)); + group->Apost[synAddress] = _Apost; + group->Apre[synAddress] = _Apre; + group->Apost[synAddress] += (-1.00000000000000017e-10); + group->g_raw[synAddress] = _clip(group->g_raw[synAddress] + group->Apre[synAddress], 0 * (1.00000000000000000e+00), (3.75000000000000049e-09)); + group->lastupdate[synAddress] = t;} + } + } + } + } +} +void updateSynapses(double t) { + { + const dim3 threads(32, 1); + const dim3 grid(22, 1); + updatePresynapticKernel<<<grid, threads>>>(t); + CHECK_CUDA_ERRORS(cudaPeekAtLastError()); + } + { + const dim3 threads(32, 1); + const dim3 grid(79, 1); + updatePostsynapticKernel<<<grid, threads>>>(t); + CHECK_CUDA_ERRORS(cudaPeekAtLastError()); + } +} diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/synapseUpdate.d b/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/synapseUpdate.d new file mode 100644 index 00000000..46be32ae --- /dev/null +++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/synapseUpdate.d @@ -0,0 +1,264 @@ +synapseUpdate.o : synapseUpdate.cc \ + /usr/include/stdc-predef.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/cuda_runtime.h \ + 
/cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/host_config.h \ + /usr/include/features.h \ + /usr/include/x86_64-linux-gnu/sys/cdefs.h \ + /usr/include/x86_64-linux-gnu/bits/wordsize.h \ + /usr/include/x86_64-linux-gnu/bits/long-double.h \ + /usr/include/x86_64-linux-gnu/gnu/stubs.h \ + /usr/include/x86_64-linux-gnu/gnu/stubs-64.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/builtin_types.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/device_types.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/host_defines.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/driver_types.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/vector_types.h \ + /usr/lib/gcc/x86_64-linux-gnu/9/include/limits.h \ + /usr/lib/gcc/x86_64-linux-gnu/9/include/syslimits.h \ + /usr/include/limits.h \ + /usr/include/x86_64-linux-gnu/bits/libc-header-start.h \ + /usr/include/x86_64-linux-gnu/bits/posix1_lim.h \ + /usr/include/x86_64-linux-gnu/bits/local_lim.h \ + /usr/include/linux/limits.h \ + /usr/include/x86_64-linux-gnu/bits/posix2_lim.h \ + /usr/include/x86_64-linux-gnu/bits/xopen_lim.h \ + /usr/include/x86_64-linux-gnu/bits/uio_lim.h \ + /usr/lib/gcc/x86_64-linux-gnu/9/include/stddef.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/surface_types.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/texture_types.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/library_types.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/channel_descriptor.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/cuda_runtime_api.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/cuda_device_runtime_api.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/driver_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/vector_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/vector_functions.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/common_functions.h \ + /usr/include/string.h \ + /usr/include/x86_64-linux-gnu/bits/types/locale_t.h \ + /usr/include/x86_64-linux-gnu/bits/types/__locale_t.h \ + /usr/include/strings.h \ + /usr/include/time.h \ + /usr/include/x86_64-linux-gnu/bits/time.h \ + /usr/include/x86_64-linux-gnu/bits/types.h \ + /usr/include/x86_64-linux-gnu/bits/timesize.h \ + /usr/include/x86_64-linux-gnu/bits/typesizes.h \ + /usr/include/x86_64-linux-gnu/bits/time64.h \ + /usr/include/x86_64-linux-gnu/bits/timex.h \ + /usr/include/x86_64-linux-gnu/bits/types/struct_timeval.h \ + /usr/include/x86_64-linux-gnu/bits/types/clock_t.h \ + /usr/include/x86_64-linux-gnu/bits/types/time_t.h \ + /usr/include/x86_64-linux-gnu/bits/types/struct_tm.h \ + /usr/include/x86_64-linux-gnu/bits/types/struct_timespec.h \ + /usr/include/x86_64-linux-gnu/bits/endian.h \ + /usr/include/x86_64-linux-gnu/bits/endianness.h \ + /usr/include/x86_64-linux-gnu/bits/types/clockid_t.h \ + /usr/include/x86_64-linux-gnu/bits/types/timer_t.h \ + /usr/include/x86_64-linux-gnu/bits/types/struct_itimerspec.h \ + /usr/include/c++/9/new \ + /usr/include/x86_64-linux-gnu/c++/9/bits/c++config.h \ + /usr/include/x86_64-linux-gnu/c++/9/bits/os_defines.h \ + 
/usr/include/x86_64-linux-gnu/c++/9/bits/cpu_defines.h \ + /usr/include/c++/9/exception \ + /usr/include/c++/9/bits/exception.h \ + /usr/include/c++/9/bits/exception_ptr.h \ + /usr/include/c++/9/bits/exception_defines.h \ + /usr/include/c++/9/bits/cxxabi_init_exception.h \ + /usr/include/c++/9/typeinfo \ + /usr/include/c++/9/bits/hash_bytes.h \ + /usr/include/c++/9/bits/nested_exception.h \ + /usr/include/c++/9/bits/move.h \ + /usr/include/c++/9/bits/concept_check.h \ + /usr/include/c++/9/type_traits \ + /usr/include/stdio.h \ + /usr/lib/gcc/x86_64-linux-gnu/9/include/stdarg.h \ + /usr/include/x86_64-linux-gnu/bits/types/__fpos_t.h \ + /usr/include/x86_64-linux-gnu/bits/types/__mbstate_t.h \ + /usr/include/x86_64-linux-gnu/bits/types/__fpos64_t.h \ + /usr/include/x86_64-linux-gnu/bits/types/__FILE.h \ + /usr/include/x86_64-linux-gnu/bits/types/FILE.h \ + /usr/include/x86_64-linux-gnu/bits/types/struct_FILE.h \ + /usr/include/x86_64-linux-gnu/bits/types/cookie_io_functions_t.h \ + /usr/include/x86_64-linux-gnu/bits/stdio_lim.h \ + /usr/include/x86_64-linux-gnu/bits/sys_errlist.h \ + /usr/include/c++/9/stdlib.h \ + /usr/include/c++/9/cstdlib \ + /usr/include/stdlib.h \ + /usr/include/x86_64-linux-gnu/bits/waitflags.h \ + /usr/include/x86_64-linux-gnu/bits/waitstatus.h \ + /usr/include/x86_64-linux-gnu/bits/floatn.h \ + /usr/include/x86_64-linux-gnu/bits/floatn-common.h \ + /usr/include/x86_64-linux-gnu/sys/types.h \ + /usr/include/x86_64-linux-gnu/bits/stdint-intn.h \ + /usr/include/endian.h \ + /usr/include/x86_64-linux-gnu/bits/byteswap.h \ + /usr/include/x86_64-linux-gnu/bits/uintn-identity.h \ + /usr/include/x86_64-linux-gnu/sys/select.h \ + /usr/include/x86_64-linux-gnu/bits/select.h \ + /usr/include/x86_64-linux-gnu/bits/types/sigset_t.h \ + /usr/include/x86_64-linux-gnu/bits/types/__sigset_t.h \ + /usr/include/x86_64-linux-gnu/bits/pthreadtypes.h \ + /usr/include/x86_64-linux-gnu/bits/thread-shared-types.h \ + /usr/include/x86_64-linux-gnu/bits/pthreadtypes-arch.h \ + /usr/include/x86_64-linux-gnu/bits/struct_mutex.h \ + /usr/include/x86_64-linux-gnu/bits/struct_rwlock.h \ + /usr/include/alloca.h \ + /usr/include/x86_64-linux-gnu/bits/stdlib-float.h \ + /usr/include/c++/9/bits/std_abs.h \ + /usr/include/assert.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/math_functions.h \ + /usr/include/c++/9/math.h \ + /usr/include/c++/9/cmath \ + /usr/include/c++/9/bits/cpp_type_traits.h \ + /usr/include/c++/9/ext/type_traits.h \ + /usr/include/math.h \ + /usr/include/x86_64-linux-gnu/bits/math-vector.h \ + /usr/include/x86_64-linux-gnu/bits/libm-simd-decl-stubs.h \ + /usr/include/x86_64-linux-gnu/bits/flt-eval-method.h \ + /usr/include/x86_64-linux-gnu/bits/fp-logb.h \ + /usr/include/x86_64-linux-gnu/bits/fp-fast.h \ + /usr/include/x86_64-linux-gnu/bits/mathcalls-helper-functions.h \ + /usr/include/x86_64-linux-gnu/bits/mathcalls.h \ + /usr/include/x86_64-linux-gnu/bits/mathcalls-narrow.h \ + /usr/include/x86_64-linux-gnu/bits/iscanonical.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/math_functions.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/cuda_surface_types.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/cuda_texture_types.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/device_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/device_functions.hpp \ + 
/cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/device_atomic_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/device_atomic_functions.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/device_double_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/device_double_functions.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_20_atomic_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_20_atomic_functions.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_32_atomic_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_32_atomic_functions.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_35_atomic_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_60_atomic_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_60_atomic_functions.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_20_intrinsics.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_20_intrinsics.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_30_intrinsics.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_30_intrinsics.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_32_intrinsics.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_32_intrinsics.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_35_intrinsics.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_61_intrinsics.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/sm_61_intrinsics.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/sm_70_rt.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/sm_70_rt.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/sm_80_rt.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/crt/sm_80_rt.hpp \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/surface_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/texture_fetch_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/texture_indirect_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/surface_indirect_functions.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/device_launch_parameters.h \ + definitionsInternal.h \ + definitions.h \ + /usr/include/c++/9/random \ + /usr/include/c++/9/string \ + /usr/include/c++/9/bits/stringfwd.h \ + /usr/include/c++/9/bits/memoryfwd.h \ + /usr/include/c++/9/bits/char_traits.h \ + /usr/include/c++/9/bits/stl_algobase.h \ + /usr/include/c++/9/bits/functexcept.h \ + /usr/include/c++/9/ext/numeric_traits.h \ + /usr/include/c++/9/bits/stl_pair.h \ + /usr/include/c++/9/bits/stl_iterator_base_types.h \ + /usr/include/c++/9/bits/stl_iterator_base_funcs.h \ + /usr/include/c++/9/debug/assertions.h \ + /usr/include/c++/9/bits/stl_iterator.h \ + /usr/include/c++/9/bits/ptr_traits.h \ + 
/usr/include/c++/9/debug/debug.h \ + /usr/include/c++/9/bits/predefined_ops.h \ + /usr/include/c++/9/bits/postypes.h \ + /usr/include/c++/9/cwchar \ + /usr/include/wchar.h \ + /usr/include/x86_64-linux-gnu/bits/wchar.h \ + /usr/include/x86_64-linux-gnu/bits/types/wint_t.h \ + /usr/include/x86_64-linux-gnu/bits/types/mbstate_t.h \ + /usr/include/c++/9/cstdint \ + /usr/lib/gcc/x86_64-linux-gnu/9/include/stdint.h \ + /usr/include/stdint.h \ + /usr/include/x86_64-linux-gnu/bits/stdint-uintn.h \ + /usr/include/c++/9/bits/allocator.h \ + /usr/include/x86_64-linux-gnu/c++/9/bits/c++allocator.h \ + /usr/include/c++/9/ext/new_allocator.h \ + /usr/include/c++/9/bits/localefwd.h \ + /usr/include/x86_64-linux-gnu/c++/9/bits/c++locale.h \ + /usr/include/c++/9/clocale \ + /usr/include/locale.h \ + /usr/include/x86_64-linux-gnu/bits/locale.h \ + /usr/include/c++/9/iosfwd \ + /usr/include/c++/9/cctype \ + /usr/include/ctype.h \ + /usr/include/c++/9/bits/ostream_insert.h \ + /usr/include/c++/9/bits/cxxabi_forced.h \ + /usr/include/c++/9/bits/stl_function.h \ + /usr/include/c++/9/backward/binders.h \ + /usr/include/c++/9/bits/range_access.h \ + /usr/include/c++/9/initializer_list \ + /usr/include/c++/9/bits/basic_string.h \ + /usr/include/c++/9/ext/atomicity.h \ + /usr/include/x86_64-linux-gnu/c++/9/bits/gthr.h \ + /usr/include/x86_64-linux-gnu/c++/9/bits/gthr-default.h \ + /usr/include/pthread.h \ + /usr/include/sched.h \ + /usr/include/x86_64-linux-gnu/bits/sched.h \ + /usr/include/x86_64-linux-gnu/bits/types/struct_sched_param.h \ + /usr/include/x86_64-linux-gnu/bits/cpu-set.h \ + /usr/include/x86_64-linux-gnu/bits/setjmp.h \ + /usr/include/x86_64-linux-gnu/c++/9/bits/atomic_word.h \ + /usr/include/c++/9/ext/alloc_traits.h \ + /usr/include/c++/9/bits/alloc_traits.h \ + /usr/include/c++/9/ext/string_conversions.h \ + /usr/include/c++/9/cstdio \ + /usr/include/c++/9/cerrno \ + /usr/include/errno.h \ + /usr/include/x86_64-linux-gnu/bits/errno.h \ + /usr/include/linux/errno.h \ + /usr/include/x86_64-linux-gnu/asm/errno.h \ + /usr/include/asm-generic/errno.h \ + /usr/include/asm-generic/errno-base.h \ + /usr/include/x86_64-linux-gnu/bits/types/error_t.h \ + /usr/include/c++/9/bits/functional_hash.h \ + /usr/include/c++/9/bits/basic_string.tcc \ + /usr/include/c++/9/limits \ + /usr/include/c++/9/bits/random.h \ + /usr/include/c++/9/vector \ + /usr/include/c++/9/bits/stl_construct.h \ + /usr/include/c++/9/bits/stl_uninitialized.h \ + /usr/include/c++/9/bits/stl_vector.h \ + /usr/include/c++/9/bits/stl_bvector.h \ + /usr/include/c++/9/bits/vector.tcc \ + /usr/include/c++/9/bits/uniform_int_dist.h \ + /usr/include/x86_64-linux-gnu/c++/9/bits/opt_random.h \ + /usr/include/c++/9/bits/random.tcc \ + /usr/include/c++/9/numeric \ + /usr/include/c++/9/bits/stl_numeric.h \ + /usr/include/c++/9/stdexcept \ + /usr/include/c++/9/cassert \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_kernel.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_discrete.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_precalc.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_mrg32k3a.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_mtgp32_kernel.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/cuda.h \ + /usr/include/memory.h \ + 
/cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_mtgp32.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_philox4x32_x.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_globals.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_uniform.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_normal.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_normal_static.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_lognormal.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_poisson.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/curand_discrete2.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/cuda_fp16.h \ + /cognition/home/local/cuda/cuda-11.2/bin/../targets/x86_64-linux/include/cuda_fp16.hpp \ + /usr/include/c++/9/utility \ + /usr/include/c++/9/bits/stl_relops.h \ + supportCode.h
diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/synapseUpdateCUDA0.sha b/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/synapseUpdateCUDA0.sha
new file mode 100644
index 00000000..63cfce0c
--- /dev/null
+++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/synapseUpdateCUDA0.sha
@@ -0,0 +1,5 @@
+9f8d190 4d18f275 f4fd9f4d b18d03ad 41fad55e
+0 0
+256 32
+256 30
+0 0
diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/synapseUpdateCUDA1.sha b/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/synapseUpdateCUDA1.sha
new file mode 100644
index 00000000..83423121
--- /dev/null
+++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/magicnetwork_model_CODE/synapseUpdateCUDA1.sha
@@ -0,0 +1,5 @@
+9f8d190 4d18f275 f4fd9f4d b18d03ad 41fad55e
+0 0
+512 32
+512 30
+0 0
diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/main b/parallel_execution/parallel_execution/code/MushroomBody/genn/main
new file mode 100755
index 00000000..408a5481
Binary files /dev/null and b/parallel_execution/parallel_execution/code/MushroomBody/genn/main differ
diff --git a/parallel_execution/parallel_execution/code/MushroomBody/genn/main.cpp b/parallel_execution/parallel_execution/code/MushroomBody/genn/main.cpp
new file mode 100644
index 00000000..c4e095d2
--- /dev/null
+++ b/parallel_execution/parallel_execution/code/MushroomBody/genn/main.cpp
@@ -0,0 +1,325 @@
+//--------------------------------------------------------------------------
+/*! \file main.cu
+
+\brief Main entry point for running a model simulation.
+*/
+//--------------------------------------------------------------------------
+
+#include "main.h"
+#include "magicnetwork_model_CODE/definitions.h"
+
+#include "b2glib/convert_synapses.h"
+#include "code_objects/spikegeneratorgroup_codeobject.h"
+#include "code_objects/spikemonitor_1_codeobject.h"
+#include "code_objects/spikemonitor_2_codeobject.h"
+#include "code_objects/spikemonitor_codeobject.h"
+#include "code_objects/synapses_1_group_variable_set_conditional_codeobject.h"
+#include "code_objects/synapses_1_group_variable_set_conditional_codeobject_1.h"
+#include "code_objects/synapses_1_post_push_spikes.h"
+#include "code_objects/synapses_1_pre_push_spikes.h"
+#include "code_objects/synapses_1_synapses_create_generator_codeobject.h"
+#include "code_objects/synapses_2_pre_push_spikes.h"
+#include "code_objects/synapses_2_synapses_create_generator_codeobject.h"
+#include "code_objects/synapses_group_variable_set_conditional_codeobject.h"
+#include "code_objects/synapses_pre_push_spikes.h"
+#include "code_objects/synapses_synapses_create_generator_codeobject.h"
+#include "network.h"
+#include "objects.h"
+
+#include "engine.cpp"
+
+
+
+//--------------------------------------------------------------------------
+/*! \brief This function is the entry point for running the simulation of the MBody1 model network.
+*/
+//--------------------------------------------------------------------------
+int main(int argc, char *argv[])
+{
+    if (argc != 3)
+    {
+        fprintf(stderr, "usage: main