Skip to content
Snippets Groups Projects
Commit 4edaf082 authored by Giovanni La Mura
Browse files

Use compiler flags to enable NVIDIA profiling

parent ee43703f
No related branches found
No related tags found
No related merge requests found
......@@ -22,7 +22,6 @@
#include <exception>
#include <fstream>
#include <string>
#include <nvtx3/nvToolsExt.h>
#ifdef _OPENMP
#include <omp.h>
#endif
......@@ -31,6 +30,9 @@
#include <mpi.h>
#endif
#endif
#ifdef USE_NVTX
#include <nvtx3/nvToolsExt.h>
#endif
#ifdef USE_MAGMA
#include <cuda_runtime.h>
#endif
......@@ -108,7 +110,9 @@ void cluster(const string& config_file, const string& data_file, const string& o
#endif
// the following only happens on MPI process 0
if (mpidata->rank == 0) {
#ifdef USE_NVTX
nvtxRangePush("Set up");
#endif
logger->log("INFO: making legacy configuration...", LOG_INFO);
ScattererConfiguration *sconf = NULL;
try {
......@@ -139,7 +143,9 @@ void cluster(const string& config_file, const string& data_file, const string& o
return;
}
logger->log(" done.\n", LOG_INFO);
#ifdef USE_NVTX
nvtxRangePop();
#endif
int s_nsph = sconf->number_of_spheres;
int nsph = gconf->number_of_spheres;
if (s_nsph == nsph) {
......@@ -225,9 +231,13 @@ void cluster(const string& config_file, const string& data_file, const string& o
// do the first iteration on jxi488 separately, since it seems to be different from the others
int jxi488 = 1;
chrono::time_point<chrono::high_resolution_clock> start_iter_1 = chrono::high_resolution_clock::now();
#ifdef USE_NVTX
nvtxRangePush("First iteration");
#endif
int jer = cluster_jxi488_cycle(jxi488, sconf, gconf, p_scattering_angles, cid, output, output_path, tppoan);
#ifdef USE_NVTX
nvtxRangePop();
#endif
chrono::time_point<chrono::high_resolution_clock> end_iter_1 = chrono::high_resolution_clock::now();
elapsed = start_iter_1 - t_start;
string message = "INFO: Calculation setup took " + to_string(elapsed.count()) + "s.\n";
......@@ -263,7 +273,9 @@ void cluster(const string& config_file, const string& data_file, const string& o
// Create this variable and initialise it with a default here, so that it is defined anyway, with or without OpenMP support enabled
int ompnumthreads = 1;
#ifdef USE_NVTX
nvtxRangePush("Parallel loop");
#endif
#pragma omp parallel
{
// Create and initialise this variable here, so that if OpenMP is enabled it is local to the thread, and if OpenMP is not enabled it has a well-defined value anyway
......@@ -315,9 +327,11 @@ void cluster(const string& config_file, const string& data_file, const string& o
logger->log(message);
}
} // closes pragma omp parallel
#ifdef USE_NVTX
nvtxRangePop();
nvtxRangePush("Output concatenation");
#endif
#ifdef _OPENMP
#pragma omp barrier
{
......@@ -391,7 +405,9 @@ void cluster(const string& config_file, const string& data_file, const string& o
}
}
#endif
#ifdef USE_NVTX
nvtxRangePop();
#endif
tppoanp->close();
delete tppoanp;
} else { // In case TPPOAN could not be opened. Should never happen.
......@@ -598,7 +614,9 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf
np_int ndit = 2 * nsph * cid->c4->nlim;
int isq, ibf;
#ifdef USE_NVTX
nvtxRangePush("Prepare matrix calculation");
#endif
fprintf(output, "========== JXI =%3d ====================\n", jxi488);
double xi = sconf->get_scale(jxi488 - 1);
double exdc = sconf->exdc;
......@@ -655,19 +673,29 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf
//break;
}
} // i132 loop
#ifdef USE_NVTX
nvtxRangePop();
#endif
interval_start = chrono::high_resolution_clock::now();
#ifdef USE_NVTX
nvtxRangePush("Calculate inverted matrix");
#endif
cms(cid->am, cid->c1, cid->c1ao, cid->c4, cid->c6);
#ifdef USE_NVTX
nvtxRangePop();
#endif
interval_end = chrono::high_resolution_clock::now();
elapsed = interval_end - interval_start;
message = "INFO: matrix calculation for scale " + to_string(jxi488) + " took " + to_string(elapsed.count()) + "s.\n";
logger->log(message);
interval_start = chrono::high_resolution_clock::now();
#ifdef USE_NVTX
nvtxRangePush("Invert the matrix");
#endif
invert_matrix(cid->am, ndit, jer, mxndm);
#ifdef USE_NVTX
nvtxRangePop();
#endif
interval_end = chrono::high_resolution_clock::now();
elapsed = interval_end - interval_start;
message = "INFO: matrix inversion for scale " + to_string(jxi488) + " took " + to_string(elapsed.count()) + "s.\n";
......@@ -679,7 +707,9 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf
// break; // jxi488 loop: goes to memory clean
}
interval_start = chrono::high_resolution_clock::now();
#ifdef USE_NVTX
nvtxRangePush("Average calculation");
#endif
ztm(cid->am, cid->c1, cid->c1ao, cid->c4, cid->c6, cid->c9);
if (idfc >= 0) {
if (jxi488 == jwtm) {
......@@ -747,13 +777,17 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf
tppoan.write(reinterpret_cast<char *>(&(cid->vk)), sizeof(double));
pcrsm0(cid->vk, exri, inpol, cid->c1, cid->c1ao, cid->c4);
apcra(cid->zpv, cid->c4->le, cid->c1ao->am0m, inpol, sqk, cid->gapm, cid->gappm);
#ifdef USE_NVTX
nvtxRangePop();
#endif
interval_end = chrono::high_resolution_clock::now();
elapsed = interval_end - interval_start;
message = "INFO: average calculation for scale " + to_string(jxi488) + " took " + to_string(elapsed.count()) + "s.\n";
logger->log(message);
interval_start = chrono::high_resolution_clock::now();
#ifdef USE_NVTX
nvtxRangePush("Angle loop");
#endif
double th = sa->th;
for (int jth486 = 1; jth486 <= sa->nth; jth486++) { // OpenMP portable?
double ph = sa->ph;
......@@ -1238,7 +1272,9 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf
} // jph484 loop
th += sa->thstp;
} // jth486 loop
#ifdef USE_NVTX
nvtxRangePop();
#endif
interval_end = chrono::high_resolution_clock::now();
elapsed = interval_end - interval_start;
message = "INFO: angle loop for scale " + to_string(jxi488) + " took " + to_string(elapsed.count()) + "s.\n";
......
......@@ -102,13 +102,24 @@ endif
ifdef CUDA_HOME
override MAGMA_LDFLAGS+= -L$(CUDA_HOME)/lib64
endif
override MAGMA_LDFLAGS+= -lmagma -lcudart -lnvToolsExt
#override MAGMA_LDFLAGS+= -lmagma -lcudart -lnvToolsExt
override MAGMA_LDFLAGS+= -lmagma -lcudart
#the next endif is for USE_MAGMA
endif
# Define USE_NVTX outside this file (e.g. `make USE_NVTX=1`) to enable NVIDIA profiling (adds -DUSE_NVTX to NVTX_FLAGS)
ifdef USE_NVTX
override NVTX_FLAGS=-DUSE_NVTX
ifdef CUDA_HOME
override NVTX_FLAGS+= -I$(CUDA_HOME)/include
# closes CUDA_HOME
endif
# closes USE_NVTX
endif
# CXXFLAGS defines the default compilation options for the C++ compiler
ifndef CXXFLAGS
override CXXFLAGS=-O3 -ggdb -pg -coverage -I$(HDF5_INCLUDE) $(MPI_CXXFLAGS)
override CXXFLAGS=-O3 -ggdb -pg -coverage -I$(HDF5_INCLUDE) $(MPI_CXXFLAGS) $(NVTX_FLAGS)
ifdef USE_OPENMP
override CXXFLAGS+= -fopenmp
# closes USE_OPENMP
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment