Hi, thank you for your response. I've made some updates to my implementation by including your suggested time calculations along with both wtime() and clock() functions. This should help provide a clearer picture of the performance. I would greatly appreciate your feedback on any potential issues or improvements. Thank you for your ongoing support.
Test Script: openmp_101.c
#include<stdlib.h> // Needed for atoi
// Value of omp_get_max_threads() stored by main(); calculateSums() reads it
// when printing how many threads are in use.
int max_threads;
// Returns the reading of CLOCK_THREAD_CPUTIME_ID, converted to seconds as a
// double. The clock is queried from whichever thread calls this function.
// NOTE(review): the closing brace of this function is not present in this
// listing.
double myclock() {
struct timespec t;
// The return value of clock_gettime() is not checked.
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t);
// Whole seconds plus the nanosecond part scaled to seconds.
return t.tv_sec + t.tv_nsec / 1e9;
// Runs a parallel reduction over `loops` iterations, timing it three ways
// (omp_get_wtime(), myclock() and clock()), prints the elapsed times together
// with both sums, and returns rand_sum.
// NOTE(review): the closing braces (and presumably the opening brace of the
// parallel region) are not present in this listing.
int calculateSums(long loops) {
double start, end;
clock_t clock_start, clock_end;
double my_start, my_end;
// All three start timestamps are taken before the parallel region begins.
// myclock() is called here, outside the parallel region, so it reads the
// clock of the thread executing calculateSums().
start = omp_get_wtime();
my_start = myclock();
clock_start = clock();
// NOTE(review): both accumulators are int. The sample runs in this post print
// "Sum: 1410065408" and a negative Rand_sum for 10000000000 loops, which looks
// like int wrap-around; a wider type is probably needed.
int sum = 0;
int rand_sum = 0;
#pragma omp parallel
// If this is the master thread, print the number of threads used
if (omp_get_thread_num() == 0) {
printf("Using %d thredas of max available %d threads.\n\n", omp_get_num_threads(), max_threads);
// Use a for loop with a reduction operation to sum the values
#pragma omp for reduction(+: sum) reduction(+: rand_sum)
// NOTE(review): i is int while loops is long; with the 10000000000 passed by
// main() the counter cannot reach the bound without overflowing.
for (int i = 0; i < loops; i++) {
sum += 1;
// Generate a random number between 1 and 3
// NOTE(review): rand() is called from every thread. The C standard does not
// require it to be thread-safe, and an implementation that serializes access
// to its state could limit scaling -- confirm for the target libc.
rand_sum += (rand() % 3) + 1;
// End timestamps, taken in the same order as the start timestamps.
end = omp_get_wtime();
my_end = myclock();
clock_end = clock();
// Print the time taken and the sums calculated
printf("Time taken (wtime): %.3f seconds.\nTime taken (clock_gettime): %.3f seconds.\nTime taken (clock_t): %.3f seconds.\nLoops %ld \nSum: %d \nRand_sum: %d\n", (end - start), (my_end - my_start), ((double) (clock_end - clock_start) / CLOCKS_PER_SEC), loops, sum, rand_sum);
return rand_sum;
// Program entry: seeds rand(), stores omp_get_max_threads() in max_threads and
// inspects an optional thread-count argument (argv[1]).
// NOTE(review): the opening brace, the bodies of both branches below and the
// call described by the last comment are not present in this listing.
int main(int argc, char *argv[])
srand(time(NULL)); // Seed the random number generator
// NOTE(review): assumes long is at least 64 bits wide; 10000000000 does not
// fit in a 32-bit long.
long loops = 10000000000; // 10 billion
// start and end are not used in the lines visible here.
double start, end;
max_threads = omp_get_max_threads();
// Check if the number of threads was specified as a command line argument
if (argc >= 2) {
// atoi() gives no error indication for non-numeric input.
int requested_threads = atoi(argv[1]);
// Set number of threads based on max availability
if (requested_threads <= max_threads) {
} else {
// Calculate the sum of 1 and the sum of random numbers between 1 and 3
return 0;
Batch Scripts:
File: openmp_101-a.sh
# Slurm batch script: requests one task with 28 CPUs on a single node,
# compiles openmp_101.c with OpenMP enabled and launches it through srun.
# This variant does not set OMP_NUM_THREADS.
# NOTE(review): no interpreter line (e.g. #!/bin/bash) is present in this listing.
#SBATCH --account=courses0101
#SBATCH --partition=work
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=28
#SBATCH --mem=100G
#SBATCH --time=00:30:00
#SBATCH --qos=high
#SBATCH --mail-type=END,FAIL
#SBATCH [email protected]
# Build with OpenMP support.
cc -fopenmp -o openmp_101 ./openmp_101.c
# NOTE(review): on recent Slurm releases srun may not inherit --cpus-per-task
# from sbatch; if the step ends up with a single core, consider
# `srun -c ${SLURM_CPUS_PER_TASK} ...` -- confirm against the site documentation.
srun ./openmp_101
File: openmp_101-b.sh
# Slurm batch script: requests one task with 28 CPUs on a single node,
# compiles openmp_101.c with OpenMP enabled and launches it through srun.
# Unlike openmp_101-a.sh, this variant exports OMP_NUM_THREADS so the OpenMP
# runtime uses as many threads as CPUs were allocated to the task.
# NOTE(review): no interpreter line (e.g. #!/bin/bash) is present in this listing.
#SBATCH --account=courses0101
#SBATCH --partition=work
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=28
#SBATCH --mem=100G
#SBATCH --time=00:30:00
#SBATCH --qos=high
#SBATCH --mail-type=END,FAIL
#SBATCH [email protected]
# The one line that distinguishes this script from openmp_101-a.sh.
export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK}
# Build with OpenMP support.
cc -fopenmp -o openmp_101 ./openmp_101.c
# NOTE(review): on recent Slurm releases srun may not inherit --cpus-per-task
# from sbatch; if all threads end up sharing one core, consider
# `srun -c ${SLURM_CPUS_PER_TASK} ...` -- confirm against the site documentation.
srun ./openmp_101
On local mac:
% ./openmp_101
Using 10 thredas of max available 10 threads.
Time taken (wtime): 2.172 seconds.
Time taken (clock_gettime): 1.905 seconds.
Time taken (clock_t): 19.139 seconds.
Loops 10000000000
Sum: 1410065408
Rand_sum: -1474884569
% ./openmp_101 1
Using 1 thredas of max available 10 threads.
Time taken (wtime): 9.230 seconds.
Time taken (clock_gettime): 9.228 seconds.
Time taken (clock_t): 9.228 seconds.
Loops 10000000000
Sum: 1410065408
Rand_sum: -1474822776
On Setonix server:
I ran jobs using both batch scripts. The only difference is that openmp_101-b.sh has one additional line: export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK}
> sbatch openmp_101-a.sh
Submitted batch job 15274010
> sbatch openmp_101-b.sh
Submitted batch job 15274012
> tail slurm-15274010.out
Using 1 thredas of max available 1 threads.
Time taken (wtime): 10.454 seconds.
Time taken (clock_gettime): 10.453 seconds.
Time taken (clock_t): 10.453 seconds.
Loops 10000000000
Sum: 1410065408
Rand_sum: -1474854915
> tail slurm-15274012.out
Using 56 thredas of max available 56 threads.
Time taken (wtime): 12.586 seconds.
Time taken (clock_gettime): 0.226 seconds.
Time taken (clock_t): 12.584 seconds.
Loops 10000000000
Sum: 1410065408
Rand_sum: -1474884167
According to all available documentation, omp_get_wtime() is the recommended way to measure the performance of parallel OpenMP programs. On my local machine I observe a clear difference when running the program with different thread counts. However, that difference does not appear when I run the same program on Setonix.
In addition to my previous response, I realize that it may be impossible to perfectly time a multi-threaded function.
Although clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t) excludes scheduling delays, it reports only the CPU time of the calling thread, and we always call it from the master thread. If the master thread finishes before the other threads, the CPU time recorded for the master thread may be shorter than the CPU time consumed by the longest-running thread. Ideally, as shown in the figure below, we would want to capture the CPU time consumed by the longest-running thread (the child thread in this case), but that is simply not possible from the master thread.
Child Start - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Child Complete
/ \
Clock() - Master Start - Master Complete - (Wait for Child, does not consume CPU time) - Clock()
#include <bits/stdc++.h>
#include <omp.h>
using namespace std;
// Returns the reading of CLOCK_THREAD_CPUTIME_ID, converted to seconds as a
// double. The clock is queried from whichever thread calls this function.
// NOTE(review): the closing brace of this function is not present in this
// listing.
double myclock() {
struct timespec t;
// The return value of clock_gettime() is not checked.
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t);
// Whole seconds plus the nanosecond part scaled to seconds.
return t.tv_sec + t.tv_nsec / 1e9;
// Demonstration: takes a myclock() reading before and after a two-iteration
// parallel loop in which only threads other than thread 0 run the busy loop,
// then prints the difference between the two outer readings.
// NOTE(review): closing braces are not present in this listing.
int main() {
cout << omp_get_max_threads() << endl;
// Reading taken before the parallel loop.
double a = myclock();
#pragma omp parallel for
for (int i = 0; i < 2; ++i) {
// Thread 0 skips the work below.
if (omp_get_thread_num() != 0) {
double c = myclock();
// simulate some work
// NOTE(review): the terminator/body of this loop is not present in this
// listing (presumably an empty statement) -- confirm.
for (uint j = 0; j < 1000000000; ++j)
double d = myclock();
cout << omp_get_thread_num() << endl;
// Difference between the two readings taken inside this branch.
cout << (d - c) << endl;
double b = myclock();
// Difference between readings b and a.
cout << (b - a) << endl;
$ g++ 1.cpp -fopenmp; ./a.out # run without optimization so the long loop will not be optimized out