diff --git a/src/cluster/cluster.cpp b/src/cluster/cluster.cpp index db96ecb6f77225fa45a317f61de40c02e1dfbf2c..870a0150732dc98e700fdad70a256582b12842e8 100644 --- a/src/cluster/cluster.cpp +++ b/src/cluster/cluster.cpp @@ -85,7 +85,6 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf */ void cluster(const string& config_file, const string& data_file, const string& output_path, const mixMPI *mpidata) { chrono::time_point<chrono::high_resolution_clock> t_start = chrono::high_resolution_clock::now(); - nvtxRangePush("Set up starts"); chrono::duration<double> elapsed; string message; string timing_name = output_path + "/c_timing_mpi"+ to_string(mpidata->rank) +".log"; @@ -109,6 +108,7 @@ void cluster(const string& config_file, const string& data_file, const string& o #endif // the following only happens on MPI process 0 if (mpidata->rank == 0) { + nvtxRangePush("Set up"); logger->log("INFO: making legacy configuration...", LOG_INFO); ScattererConfiguration *sconf = NULL; try { @@ -139,6 +139,7 @@ void cluster(const string& config_file, const string& data_file, const string& o return; } logger->log(" done.\n", LOG_INFO); + nvtxRangePop(); int s_nsph = sconf->number_of_spheres; int nsph = gconf->number_of_spheres; if (s_nsph == nsph) { @@ -221,11 +222,10 @@ void cluster(const string& config_file, const string& data_file, const string& o fprintf(output, " VK=%15.7lE, XI IS SCALE FACTOR FOR LENGTHS\n", cid->vk); fprintf(output, " \n"); } - nvtxRangePop(); // do the first iteration on jxi488 separately, since it seems to be different from the others int jxi488 = 1; chrono::time_point<chrono::high_resolution_clock> start_iter_1 = chrono::high_resolution_clock::now(); - nvtxRangePush("First iteration starts"); + nvtxRangePush("First iteration"); int jer = cluster_jxi488_cycle(jxi488, sconf, gconf, p_scattering_angles, cid, output, output_path, tppoan); nvtxRangePop(); chrono::time_point<chrono::high_resolution_clock> end_iter_1 = 
chrono::high_resolution_clock::now(); @@ -263,7 +263,7 @@ void cluster(const string& config_file, const string& data_file, const string& o // Create this variable and initialise it with a default here, so that it is defined anyway, with or without OpenMP support enabled int ompnumthreads = 1; - nvtxRangePush("Parallel loop starts"); + nvtxRangePush("Parallel loop"); #pragma omp parallel { // Create and initialise this variable here, so that if OpenMP is enabled it is local to the thread, and if OpenMP is not enabled it has a well-defined value anyway @@ -317,7 +317,7 @@ void cluster(const string& config_file, const string& data_file, const string& o } // closes pragma omp parallel nvtxRangePop(); - nvtxRangePush("Output concatenation starts"); + nvtxRangePush("Output concatenation"); #ifdef _OPENMP #pragma omp barrier { @@ -597,7 +597,8 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf int jwtm = gconf->jwtm; np_int ndit = 2 * nsph * cid->c4->nlim; int isq, ibf; - + + nvtxRangePush("Prepare matrix calculation"); fprintf(output, "========== JXI =%3d ====================\n", jxi488); double xi = sconf->get_scale(jxi488 - 1); double exdc = sconf->exdc; @@ -654,14 +655,19 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf //break; } } // i132 loop + nvtxRangePop(); interval_start = chrono::high_resolution_clock::now(); + nvtxRangePush("Calculate inverted matrix"); cms(cid->am, cid->c1, cid->c1ao, cid->c4, cid->c6); + nvtxRangePop(); interval_end = chrono::high_resolution_clock::now(); elapsed = interval_end - interval_start; message = "INFO: matrix calculation for scale " + to_string(jxi488) + " took " + to_string(elapsed.count()) + "s.\n"; logger->log(message); interval_start = chrono::high_resolution_clock::now(); + nvtxRangePush("Invert the matrix"); invert_matrix(cid->am, ndit, jer, mxndm); + nvtxRangePop(); interval_end = chrono::high_resolution_clock::now(); elapsed = interval_end - 
interval_start; message = "INFO: matrix inversion for scale " + to_string(jxi488) + " took " + to_string(elapsed.count()) + "s.\n"; @@ -673,6 +679,7 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf // break; // jxi488 loop: goes to memory clean } interval_start = chrono::high_resolution_clock::now(); + nvtxRangePush("Average calculation"); ztm(cid->am, cid->c1, cid->c1ao, cid->c4, cid->c6, cid->c9); if (idfc >= 0) { if (jxi488 == jwtm) { @@ -740,11 +747,13 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf tppoan.write(reinterpret_cast<char *>(&(cid->vk)), sizeof(double)); pcrsm0(cid->vk, exri, inpol, cid->c1, cid->c1ao, cid->c4); apcra(cid->zpv, cid->c4->le, cid->c1ao->am0m, inpol, sqk, cid->gapm, cid->gappm); + nvtxRangePop(); interval_end = chrono::high_resolution_clock::now(); elapsed = interval_end - interval_start; message = "INFO: average calculation for scale " + to_string(jxi488) + " took " + to_string(elapsed.count()) + "s.\n"; logger->log(message); interval_start = chrono::high_resolution_clock::now(); + nvtxRangePush("Angle loop"); double th = sa->th; for (int jth486 = 1; jth486 <= sa->nth; jth486++) { // OpenMP portable? double ph = sa->ph; @@ -1229,6 +1238,7 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf } // jph484 loop th += sa->thstp; } // jth486 loop + nvtxRangePop(); interval_end = chrono::high_resolution_clock::now(); elapsed = interval_end - interval_start; message = "INFO: angle loop for scale " + to_string(jxi488) + " took " + to_string(elapsed.count()) + "s.\n";