diff --git a/src/cluster/cluster.cpp b/src/cluster/cluster.cpp index 870a0150732dc98e700fdad70a256582b12842e8..b35fe1d9b996816382e413089321f30548e6880c 100644 --- a/src/cluster/cluster.cpp +++ b/src/cluster/cluster.cpp @@ -22,7 +22,6 @@ #include <exception> #include <fstream> #include <string> -#include <nvtx3/nvToolsExt.h> #ifdef _OPENMP #include <omp.h> #endif @@ -31,6 +30,9 @@ #include <mpi.h> #endif #endif +#ifdef USE_NVTX +#include <nvtx3/nvToolsExt.h> +#endif #ifdef USE_MAGMA #include <cuda_runtime.h> #endif @@ -108,7 +110,9 @@ void cluster(const string& config_file, const string& data_file, const string& o #endif // the following only happens on MPI process 0 if (mpidata->rank == 0) { +#ifdef USE_NVTX nvtxRangePush("Set up"); +#endif logger->log("INFO: making legacy configuration...", LOG_INFO); ScattererConfiguration *sconf = NULL; try { @@ -139,7 +143,9 @@ void cluster(const string& config_file, const string& data_file, const string& o return; } logger->log(" done.\n", LOG_INFO); +#ifdef USE_NVTX nvtxRangePop(); +#endif int s_nsph = sconf->number_of_spheres; int nsph = gconf->number_of_spheres; if (s_nsph == nsph) { @@ -225,9 +231,13 @@ void cluster(const string& config_file, const string& data_file, const string& o // do the first iteration on jxi488 separately, since it seems to be different from the others int jxi488 = 1; chrono::time_point<chrono::high_resolution_clock> start_iter_1 = chrono::high_resolution_clock::now(); +#ifdef USE_NVTX nvtxRangePush("First iteration"); +#endif int jer = cluster_jxi488_cycle(jxi488, sconf, gconf, p_scattering_angles, cid, output, output_path, tppoan); +#ifdef USE_NVTX nvtxRangePop(); +#endif chrono::time_point<chrono::high_resolution_clock> end_iter_1 = chrono::high_resolution_clock::now(); elapsed = start_iter_1 - t_start; string message = "INFO: Calculation setup took " + to_string(elapsed.count()) + "s.\n"; @@ -263,7 +273,9 @@ void cluster(const string& config_file, const string& data_file, const string& o // Create this variable and initialise it with a default here, so that it is defined anyway, with or without OpenMP support enabled int ompnumthreads = 1; +#ifdef USE_NVTX nvtxRangePush("Parallel loop"); +#endif #pragma omp parallel { // Create and initialise this variable here, so that if OpenMP is enabled it is local to the thread, and if OpenMP is not enabled it has a well-defiled value anyway @@ -315,9 +327,11 @@ void cluster(const string& config_file, const string& data_file, const string& o logger->log(message); } } // closes pragma omp parallel +#ifdef USE_NVTX nvtxRangePop(); nvtxRangePush("Output concatenation"); +#endif #ifdef _OPENMP #pragma omp barrier { @@ -391,7 +405,9 @@ void cluster(const string& config_file, const string& data_file, const string& o } } #endif +#ifdef USE_NVTX nvtxRangePop(); +#endif tppoanp->close(); delete tppoanp; } else { // In case TPPOAN could not be opened. Should never happen. @@ -598,7 +614,9 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf np_int ndit = 2 * nsph * cid->c4->nlim; int isq, ibf; +#ifdef USE_NVTX nvtxRangePush("Prepare matrix calculation"); +#endif fprintf(output, "========== JXI =%3d ====================\n", jxi488); double xi = sconf->get_scale(jxi488 - 1); double exdc = sconf->exdc; @@ -655,19 +673,29 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf //break; } } // i132 loop +#ifdef USE_NVTX nvtxRangePop(); +#endif interval_start = chrono::high_resolution_clock::now(); +#ifdef USE_NVTX nvtxRangePush("Calculate inverted matrix"); +#endif cms(cid->am, cid->c1, cid->c1ao, cid->c4, cid->c6); +#ifdef USE_NVTX nvtxRangePop(); +#endif interval_end = chrono::high_resolution_clock::now(); elapsed = interval_end - interval_start; message = "INFO: matrix calculation for scale " + to_string(jxi488) + " took " + to_string(elapsed.count()) + "s.\n"; logger->log(message); interval_start = chrono::high_resolution_clock::now(); +#ifdef USE_NVTX nvtxRangePush("Invert the matrix"); +#endif invert_matrix(cid->am, ndit, jer, mxndm); +#ifdef USE_NVTX nvtxRangePop(); +#endif interval_end = chrono::high_resolution_clock::now(); elapsed = interval_end - interval_start; message = "INFO: matrix inversion for scale " + to_string(jxi488) + " took " + to_string(elapsed.count()) + "s.\n"; @@ -679,7 +707,9 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf // break; // jxi488 loop: goes to memory clean } interval_start = chrono::high_resolution_clock::now(); +#ifdef USE_NVTX nvtxRangePush("Average calculation"); +#endif ztm(cid->am, cid->c1, cid->c1ao, cid->c4, cid->c6, cid->c9); if (idfc >= 0) { if (jxi488 == jwtm) { @@ -747,13 +777,17 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf tppoan.write(reinterpret_cast<char *>(&(cid->vk)), sizeof(double)); pcrsm0(cid->vk, exri, inpol, cid->c1, cid->c1ao, cid->c4); apcra(cid->zpv, cid->c4->le, cid->c1ao->am0m, inpol, sqk, cid->gapm, cid->gappm); +#ifdef USE_NVTX nvtxRangePop(); +#endif interval_end = chrono::high_resolution_clock::now(); elapsed = interval_end - interval_start; message = "INFO: average calculation for scale " + to_string(jxi488) + " took " + to_string(elapsed.count()) + "s.\n"; logger->log(message); interval_start = chrono::high_resolution_clock::now(); +#ifdef USE_NVTX nvtxRangePush("Angle loop"); +#endif double th = sa->th; for (int jth486 = 1; jth486 <= sa->nth; jth486++) { // OpenMP portable? double ph = sa->ph; @@ -1238,7 +1272,9 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf } // jph484 loop th += sa->thstp; } // jth486 loop +#ifdef USE_NVTX nvtxRangePop(); +#endif interval_end = chrono::high_resolution_clock::now(); elapsed = interval_end - interval_start; message = "INFO: angle loop for scale " + to_string(jxi488) + " took " + to_string(elapsed.count()) + "s.\n"; diff --git a/src/make.inc b/src/make.inc index 70a9cb6398cf742bbc31e2aa410eaefc7ac8bd5d..4754af13194d4a7eeb833c7029cab213addbb4ec 100644 --- a/src/make.inc +++ b/src/make.inc @@ -102,13 +102,24 @@ endif ifdef CUDA_HOME override MAGMA_LDFLAGS+= -L$(CUDA_HOME)/lib64 endif -override MAGMA_LDFLAGS+= -lmagma -lcudart -lnvToolsExt +#override MAGMA_LDFLAGS+= -lmagma -lcudart -lnvToolsExt +override MAGMA_LDFLAGS+= -lmagma -lcudart #the next endif is for USE_MAGMA endif +# define (outside) USE_NVTX to enable NVIDIA profiling +ifdef USE_NVTX +override NVTX_FLAGS=-DUSE_NVTX +ifdef CUDA_HOME +override NVTX_FLAGS+= -I$(CUDA_HOME)/include +# closes CUDA_HOME +endif +# closes USE_NVTX +endif + # CXXFLAGS defines the default compilation options for the C++ compiler ifndef CXXFLAGS -override CXXFLAGS=-O3 -ggdb -pg -coverage -I$(HDF5_INCLUDE) $(MPI_CXXFLAGS) +override CXXFLAGS=-O3 -ggdb -pg -coverage -I$(HDF5_INCLUDE) $(MPI_CXXFLAGS) $(NVTX_FLAGS) ifdef USE_OPENMP override CXXFLAGS+= -fopenmp # closes USE_OPENMP