// Patch summary (review commentary placed ahead of the first diff header; everything from
// "diff --git" onward is the unmodified patch text).
//
// Purpose: instrument cluster() in src/cluster/cluster.cpp with NVTX push/pop ranges and add
// the matching linker flag in src/make.inc.
//
// src/cluster/cluster.cpp:
//   * adds `#include <nvtx3/nvToolsExt.h>` after the `<string>` include;
//   * pushes the range "Set up starts" right after `t_start` is taken and pops it just before
//     the "do the first iteration" comment;
//   * wraps the single cluster_jxi488_cycle(jxi488, ...) call in "First iteration starts";
//   * pushes "Parallel loop starts" before `#pragma omp parallel` and pops it after the line
//     commented "closes pragma omp parallel";
//   * pushes "Output concatenation starts" before the following `#ifdef _OPENMP` section and
//     pops it after the matching `#endif`, before `tppoanp->close()`.
//
// src/make.inc:
//   * appends `-lnvToolsExt` to MAGMA_LDFLAGS, inside the section closed by the endif that is
//     commented "for USE_MAGMA".
//
// NOTE(review): the new #include and the nvtx* calls show no preprocessor guard in these hunks,
//   while the link flag is only added in the USE_MAGMA section; builds without MAGMA/CUDA may
//   fail to find the header or to link -- confirm, and consider a dedicated build switch.
// NOTE(review): the nvtx3/ headers are presumably the header-only NVTX v3 API, in which case
//   -lnvToolsExt may not be required -- confirm against the NVTX documentation.
// NOTE(review): each push/pop pair is balanced on the straight-line path shown here; whether
//   any early return or exception path between a push and its pop exists is not visible in
//   these hunks -- confirm.
// NOTE(review): the context line containing "well-defiled" is presumably a typo for
//   "well-defined"; it is diff context, so it has to be corrected in the source file itself.
diff --git a/src/cluster/cluster.cpp b/src/cluster/cluster.cpp index 8c4790564598e5d4529c9b18dcb6b480d4fe738c..db96ecb6f77225fa45a317f61de40c02e1dfbf2c 100644 --- a/src/cluster/cluster.cpp +++ b/src/cluster/cluster.cpp @@ -22,6 +22,7 @@ #include <exception> #include <fstream> #include <string> +#include <nvtx3/nvToolsExt.h> #ifdef _OPENMP #include <omp.h> #endif @@ -84,6 +85,7 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf */ void cluster(const string& config_file, const string& data_file, const string& output_path, const mixMPI *mpidata) { chrono::time_point<chrono::high_resolution_clock> t_start = chrono::high_resolution_clock::now(); + nvtxRangePush("Set up starts"); chrono::duration<double> elapsed; string message; string timing_name = output_path + "/c_timing_mpi"+ to_string(mpidata->rank) +".log"; @@ -219,10 +221,13 @@ void cluster(const string& config_file, const string& data_file, const string& o fprintf(output, " VK=%15.7lE, XI IS SCALE FACTOR FOR LENGTHS\n", cid->vk); fprintf(output, " \n"); } + nvtxRangePop(); // do the first iteration on jxi488 separately, since it seems to be different from the others int jxi488 = 1; chrono::time_point<chrono::high_resolution_clock> start_iter_1 = chrono::high_resolution_clock::now(); + nvtxRangePush("First iteration starts"); int jer = cluster_jxi488_cycle(jxi488, sconf, gconf, p_scattering_angles, cid, output, output_path, tppoan); + nvtxRangePop(); chrono::time_point<chrono::high_resolution_clock> end_iter_1 = chrono::high_resolution_clock::now(); elapsed = start_iter_1 - t_start; string message = "INFO: Calculation setup took " + to_string(elapsed.count()) + "s.\n"; @@ -258,6 +263,7 @@ void cluster(const string& config_file, const string& data_file, const string& o // Create this variable and initialise it with a default here, so that it is defined anyway, with or without OpenMP support enabled int ompnumthreads = 1; + nvtxRangePush("Parallel loop starts"); #pragma omp parallel 
{ // Create and initialise this variable here, so that if OpenMP is enabled it is local to the thread, and if OpenMP is not enabled it has a well-defiled value anyway @@ -309,7 +315,9 @@ void cluster(const string& config_file, const string& data_file, const string& o logger->log(message); } } // closes pragma omp parallel + nvtxRangePop(); + nvtxRangePush("Output concatenation starts"); #ifdef _OPENMP #pragma omp barrier { @@ -383,6 +391,7 @@ void cluster(const string& config_file, const string& data_file, const string& o } } #endif + nvtxRangePop(); tppoanp->close(); delete tppoanp; } else { // In case TPPOAN could not be opened. Should never happen. diff --git a/src/make.inc b/src/make.inc index d6acdf1adc670531ae582b82bc29a7a854fd3b69..70a9cb6398cf742bbc31e2aa410eaefc7ac8bd5d 100644 --- a/src/make.inc +++ b/src/make.inc @@ -102,7 +102,7 @@ endif ifdef CUDA_HOME override MAGMA_LDFLAGS+= -L$(CUDA_HOME)/lib64 endif -override MAGMA_LDFLAGS+= -lmagma -lcudart +override MAGMA_LDFLAGS+= -lmagma -lcudart -lnvToolsExt #the next endif is for USE_MAGMA endif