Skip to content

Commit

Permalink
improvements and compilation fixes
Browse files Browse the repository at this point in the history
- don't measure time waiting for timer as overhead
- avoid cyclical dependency between time_point (core)
  and timers (device)
- fix DPCPP timers
- remove unnecessary scoping
  • Loading branch information
upsj committed Mar 30, 2023
1 parent 673a642 commit d10f27c
Show file tree
Hide file tree
Showing 10 changed files with 145 additions and 98 deletions.
9 changes: 5 additions & 4 deletions core/base/timer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,11 +45,9 @@ time_point& time_point::operator=(time_point&& other)
}


time_point CpuTimer::create_time_point()
void CpuTimer::init_time_point(time_point& time)
{
time_point result;
result.type_ = time_point::type::cpu;
return result;
time.type_ = time_point::type::cpu;
}


Expand All @@ -60,6 +58,9 @@ void CpuTimer::record(time_point& time)
}


void CpuTimer::wait(const time_point& time) {}


int64 CpuTimer::difference(const time_point& start, const time_point& stop)
{
return std::chrono::duration_cast<std::chrono::nanoseconds, int64>(
Expand Down
5 changes: 4 additions & 1 deletion core/device_hooks/cuda_hooks.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -170,12 +170,15 @@ CudaTimer::CudaTimer(std::shared_ptr<const CudaExecutor> exec)
GKO_NOT_COMPILED(cuda);


time_point CudaTimer::create_time_point() GKO_NOT_COMPILED(cuda);
void CudaTimer::init_time_point(time_point& time) GKO_NOT_COMPILED(cuda);


void CudaTimer::record(time_point&) GKO_NOT_COMPILED(cuda);


void CudaTimer::wait(const time_point& time) GKO_NOT_COMPILED(cuda);


int64 CudaTimer::difference(const time_point& start, const time_point& stop)
GKO_NOT_COMPILED(cuda);

Expand Down
5 changes: 4 additions & 1 deletion core/device_hooks/dpcpp_hooks.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -160,12 +160,15 @@ DpcppTimer::DpcppTimer(std::shared_ptr<const DpcppExecutor> exec)
GKO_NOT_COMPILED(dpcpp);


time_point DpcppTimer::create_time_point() GKO_NOT_COMPILED(dpcpp);
void DpcppTimer::init_time_point(time_point&) GKO_NOT_COMPILED(dpcpp);


void DpcppTimer::record(time_point&) GKO_NOT_COMPILED(dpcpp);


void DpcppTimer::wait(const time_point& time) GKO_NOT_COMPILED(dpcpp);


int64 DpcppTimer::difference(const time_point& start, const time_point& stop)
GKO_NOT_COMPILED(dpcpp);

Expand Down
5 changes: 4 additions & 1 deletion core/device_hooks/hip_hooks.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -171,12 +171,15 @@ HipTimer::HipTimer(std::shared_ptr<const HipExecutor> exec)
GKO_NOT_COMPILED(hip);


time_point HipTimer::create_time_point() GKO_NOT_COMPILED(hip);
void HipTimer::init_time_point(time_point& time) GKO_NOT_COMPILED(hip);


void HipTimer::record(time_point&) GKO_NOT_COMPILED(hip);


void HipTimer::wait(const time_point& time) GKO_NOT_COMPILED(hip);


int64 HipTimer::difference(const time_point& start, const time_point& stop)
GKO_NOT_COMPILED(hip);

Expand Down
136 changes: 70 additions & 66 deletions core/log/profiler_hook_summary.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -143,16 +143,22 @@ struct summary {
summary(std::shared_ptr<Timer> timer) : timer{std::move(timer)}
{
push("total");
// preallocate 5 nested levels of timers
for (int i = 0; i < 10; i++) {
free_list.emplace_back();
timer->init_time_point(free_list.back());
}
}

time_point get_current_time_point()
{
gko::time_point time;
if (free_list.empty()) {
auto time = timer->create_time_point();
timer->init_time_point(time);
timer->record(time);
return time;
} else {
auto time = std::move(free_list.back());
time = std::move(free_list.back());
free_list.pop_back();
timer->record(time);
return time;
Expand All @@ -170,20 +176,17 @@ struct summary {
return;
}
const auto cpu_now = cpu_clock::now();
{
// scope the time_point to capture its destruction in overhead
auto now = get_current_time_point();
std::lock_guard<std::mutex> guard{mutex};
auto it = name_map.find(name);
if (it == name_map.end()) {
const auto new_id = static_cast<int64>(entries.size());
it = name_map.emplace_hint(it, name, new_id);
entries.emplace_back();
entries.back().name = name;
}
const auto id = it->second;
stack.emplace_back(id, std::move(now));
std::lock_guard<std::mutex> guard{mutex};
auto now = get_current_time_point();
auto it = name_map.find(name);
if (it == name_map.end()) {
const auto new_id = static_cast<int64>(entries.size());
it = name_map.emplace_hint(it, name, new_id);
entries.emplace_back();
entries.back().name = name;
}
const auto id = it->second;
stack.emplace_back(id, std::move(now));
overhead_ns +=
std::chrono::duration_cast<std::chrono::nanoseconds, int64>(
cpu_clock::now() - cpu_now)
Expand All @@ -193,31 +196,33 @@ struct summary {
void pop(const char* name, bool allow_pop_root = false)
{
const auto cpu_now = cpu_clock::now();
{
// scope the time_point to capture its destruction in overhead
auto now = get_current_time_point();
std::lock_guard<std::mutex> guard{mutex};
if (!check_pop_status(*this, name, allow_pop_root)) {
return;
}
const auto id = stack.back().first;
auto partial_entry = std::move(stack.back());
stack.pop_back();
auto& entry = entries[id];
const auto elapsed_ns =
timer->difference(partial_entry.second, now);
release_time_point(std::move(partial_entry.second));
release_time_point(std::move(now));
entry.count++;
entry.inclusive_ns += elapsed_ns;
entry.exclusive_ns += elapsed_ns;
if (!stack.empty()) {
entries[stack.back().first].exclusive_ns -= elapsed_ns;
}
std::lock_guard<std::mutex> guard{mutex};
auto now = get_current_time_point();
if (!check_pop_status(*this, name, allow_pop_root)) {
return;
}
const auto id = stack.back().first;
auto partial_entry = std::move(stack.back());
stack.pop_back();
auto& entry = entries[id];
const auto cpu_now2 = cpu_clock::now();
// we need to exclude the wait for the timer from the overhead
// measurement
timer->wait(now);
const auto cpu_now3 = cpu_clock::now();
const auto elapsed_ns = timer->difference(partial_entry.second, now);
release_time_point(std::move(partial_entry.second));
release_time_point(std::move(now));
entry.count++;
entry.inclusive_ns += elapsed_ns;
entry.exclusive_ns += elapsed_ns;
if (!stack.empty()) {
entries[stack.back().first].exclusive_ns -= elapsed_ns;
}
const auto cpu_now4 = cpu_clock::now();
overhead_ns +=
std::chrono::duration_cast<std::chrono::nanoseconds, int64>(
cpu_clock::now() - cpu_now)
(cpu_now4 - cpu_now3) + (cpu_now2 - cpu_now))
.count();
}

Expand Down Expand Up @@ -277,12 +282,13 @@ struct nested_summary {

time_point get_current_time_point()
{
gko::time_point time;
if (free_list.empty()) {
auto time = timer->create_time_point();
timer->init_time_point(time);
timer->record(time);
return time;
} else {
auto time = std::move(free_list.back());
time = std::move(free_list.back());
free_list.pop_back();
timer->record(time);
return time;
Expand Down Expand Up @@ -331,14 +337,11 @@ struct nested_summary {
return;
}
const auto cpu_now = cpu_clock::now();
{
// scope the time_point to capture its destruction in overhead
auto now = get_current_time_point();
std::lock_guard<std::mutex> guard{mutex};
const auto name_id = get_or_add_name_id(name);
const auto node_id = get_or_add_node_id(name_id);
stack.emplace_back(name_id, node_id, std::move(now));
}
std::lock_guard<std::mutex> guard{mutex};
auto now = get_current_time_point();
const auto name_id = get_or_add_name_id(name);
const auto node_id = get_or_add_node_id(name_id);
stack.emplace_back(name_id, node_id, std::move(now));
overhead_ns +=
std::chrono::duration_cast<std::chrono::nanoseconds, int64>(
cpu_clock::now() - cpu_now)
Expand All @@ -348,28 +351,29 @@ struct nested_summary {
void pop(const char* name, bool allow_pop_root = false)
{
const auto cpu_now = cpu_clock::now();
{
// scope the time_point to capture its destruction in overhead
auto now = get_current_time_point();
std::lock_guard<std::mutex> guard{mutex};
if (!check_pop_status(*this, name, allow_pop_root)) {
return;
}
auto partial_entry = std::move(stack.back());
const auto name_id = partial_entry.name_id;
stack.pop_back();
const auto node_id =
node_map.at(std::make_pair(name_id, get_parent_id()));
auto& node = nodes[node_id];
const auto elapsed_ns = timer->difference(partial_entry.start, now);
release_time_point(std::move(partial_entry.start));
release_time_point(std::move(now));
node.count++;
node.elapsed_ns += elapsed_ns;
std::lock_guard<std::mutex> guard{mutex};
auto now = get_current_time_point();
if (!check_pop_status(*this, name, allow_pop_root)) {
return;
}
auto partial_entry = std::move(stack.back());
const auto name_id = partial_entry.name_id;
stack.pop_back();
const auto node_id =
node_map.at(std::make_pair(name_id, get_parent_id()));
auto& node = nodes[node_id];
const auto cpu_now2 = cpu_clock::now();
timer->wait(now);
const auto cpu_now3 = cpu_clock::now();
const auto elapsed_ns = timer->difference(partial_entry.start, now);
release_time_point(std::move(partial_entry.start));
release_time_point(std::move(now));
node.count++;
node.elapsed_ns += elapsed_ns;
const auto cpu_now4 = cpu_clock::now();
overhead_ns +=
std::chrono::duration_cast<std::chrono::nanoseconds, int64>(
cpu_clock::now() - cpu_now)
(cpu_now4 - cpu_now3) + (cpu_now2 - cpu_now))
.count();
}

Expand Down
15 changes: 10 additions & 5 deletions cuda/base/timer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,10 @@ CudaTimer::CudaTimer(std::shared_ptr<const CudaExecutor> exec)
{}


time_point CudaTimer::create_time_point()
void CudaTimer::create_time_point(time_point& time)
{
time_point result;
result.type_ = time_point::type::cuda;
GKO_ASSERT_NO_CUDA_ERRORS(cudaEventCreate(&result.data_.cuda_event));
return result;
time.type_ = time_point::type::cuda;
GKO_ASSERT_NO_CUDA_ERRORS(cudaEventCreate(&time.data_.cuda_event));
}


Expand All @@ -31,6 +29,13 @@ void CudaTimer::record(time_point& time)
}


void CudaTimer::wait(const time_point& time)
{
GKO_ASSERT(time.type_ == time_point::type::cuda);
GKO_ASSERT_NO_HIP_ERRORS(cudaEventSynchronize(time.data_.cuda_event));
}


int64 CudaTimer::difference(const time_point& start, const time_point& stop)
{
GKO_ASSERT(start.type_ == time_point::type::cuda);
Expand Down
21 changes: 14 additions & 7 deletions dpcpp/base/timer.dp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,10 @@ DpcppTimer::DpcppTimer(std::shared_ptr<const DpcppExecutor> exec)
}


time_point DpcppTimer::create_time_point()
void DpcppTimer::init_time_point(time_point& time)
{
time_point result;
result.type_ = time_point::type::dpcpp;
result.data_.dpcpp_event = new sycl::event{};
return result;
time.type_ = time_point::type::dpcpp;
time.data_.dpcpp_event = new sycl::event{};
}


Expand All @@ -40,15 +38,24 @@ void DpcppTimer::record(time_point& time)
}


void DpcppTimer::wait(const time_point& time)
{
GKO_ASSERT(time.type_ == time_point::type::dpcpp);
time.data_.dpcpp_event->wait_and_throw();
}


int64 DpcppTimer::difference(const time_point& start, const time_point& stop)
{
GKO_ASSERT(start.type_ == time_point::type::dpcpp);
GKO_ASSERT(stop.type_ == time_point::type::dpcpp);
stop.data_.dpcpp_event->wait_and_throw();
auto stop_time =
stop.get_profiling_info<sycl::info::event_profiling::command_start>();
stop.data_.dpcpp_event
->get_profiling_info<sycl::info::event_profiling::command_start>();
auto start_time =
start.get_profiling_info<sycl::info::event_profiling::command_end>();
stop.data_.dpcpp_event
->get_profiling_info<sycl::info::event_profiling::command_end>();
return static_cast<int64>(stop_time - start_time);
}

Expand Down
14 changes: 10 additions & 4 deletions hip/base/timer.hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,10 @@ HipTimer::HipTimer(std::shared_ptr<const HipExecutor> exec)
{}


time_point HipTimer::create_time_point()
void HipTimer::init_time_point(time_point& time)
{
time_point result;
result.type_ = time_point::type::hip;
GKO_ASSERT_NO_HIP_ERRORS(hipEventCreate(&result.data_.hip_event));
time.type_ = time_point::type::hip;
GKO_ASSERT_NO_HIP_ERRORS(hipEventCreate(&time.data_.hip_event));
}


Expand All @@ -31,6 +30,13 @@ void HipTimer::record(time_point& time)
}


void HipTimer::wait(const time_point& time)
{
GKO_ASSERT(time.type_ == time_point::type::hip);
GKO_ASSERT_NO_HIP_ERRORS(hipEventSynchronize(time.data_.hip_event));
}


int64 HipTimer::difference(const time_point& start, const time_point& stop)
{
GKO_ASSERT(start.type_ == time_point::type::hip);
Expand Down
Loading

0 comments on commit d10f27c

Please sign in to comment.