In addition to my previous response, I realize that it may be impossible to perfectly time a multi-threaded function.
Although clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t) excludes scheduling delays, it only reports the CPU time of the calling thread, and we will always call it from the master thread. If the master thread finishes before the other threads, the CPU time recorded for the master thread may be shorter than the CPU time consumed by the longest-running thread in the execution. Ideally, as shown in the figure, we would want to capture the CPU time consumed by the longest-running thread, which is the child thread in this case, but that is simply not possible with a clock that is read in the master thread.
          Child Start - - - - - - - - - - - - - - - - - - - - - - - - - Child Complete
        /                                                                              \
Clock() - Master Start - Master Complete - (Wait for Child, does not consume CPU time) - Clock()
#include <bits/stdc++.h>
#include <omp.h>
using namespace std;
// Reads the CLOCK_THREAD_CPUTIME_ID clock (the per-thread CPU-time clock of
// the thread that makes the call) and returns its value in seconds as a
// double. The return status of clock_gettime is not inspected.
double myclock() {
    timespec now;
    clock_gettime(CLOCK_THREAD_CPUTIME_ID, &now);
    // Combine the whole-second and nanosecond fields into fractional seconds.
    const double whole_seconds = now.tv_sec;
    const double fractional_seconds = now.tv_nsec / 1e9;
    return whole_seconds + fractional_seconds;
}
// Demo: compares the CPU time measured inside a worker thread with the CPU
// time measured by the thread that runs main() around the whole parallel loop.
// Prints, in order: the OpenMP max thread count, then (for each iteration run
// by a thread whose id is not 0) that thread's id and its measured busy-loop
// time, and finally the time measured around the loop by the calling thread.
int main() {
    std::cout << omp_get_max_threads() << std::endl;

    const double outer_begin = myclock();
#pragma omp parallel for
    for (int iter = 0; iter < 2; ++iter) {
        // Thread 0 does no work, so it reaches the end of the loop quickly.
        if (omp_get_thread_num() == 0) {
            continue;
        }

        const double worker_begin = myclock();
        // simulate some work
        for (unsigned int spin = 0; spin < 1000000000; ++spin) {
        }
        const double worker_end = myclock();

        std::cout << omp_get_thread_num() << std::endl;
        std::cout << (worker_end - worker_begin) << std::endl;
    }
    const double outer_end = myclock();

    std::cout << (outer_end - outer_begin) << std::endl;
}
$ g++ 1.cpp -fopenmp; ./a.out # run without optimization so the long loop will not be optimized out
256
1
0.653028
0.0163417