diff --git a/src/cluster/cluster.cpp b/src/cluster/cluster.cpp
index 870a0150732dc98e700fdad70a256582b12842e8..b35fe1d9b996816382e413089321f30548e6880c 100644
--- a/src/cluster/cluster.cpp
+++ b/src/cluster/cluster.cpp
@@ -22,7 +22,6 @@
 #include <exception>
 #include <fstream>
 #include <string>
-#include <nvtx3/nvToolsExt.h>
 #ifdef _OPENMP
 #include <omp.h>
 #endif
@@ -31,6 +30,9 @@
 #include <mpi.h>
 #endif
 #endif
+#ifdef USE_NVTX
+#include <nvtx3/nvToolsExt.h>
+#endif
 #ifdef USE_MAGMA
 #include <cuda_runtime.h>
 #endif
@@ -108,7 +110,9 @@ void cluster(const string& config_file, const string& data_file, const string& o
 #endif
   // the following only happens on MPI process 0
   if (mpidata->rank == 0) {
+#ifdef USE_NVTX
     nvtxRangePush("Set up");
+#endif
     logger->log("INFO: making legacy configuration...", LOG_INFO);
     ScattererConfiguration *sconf = NULL;
     try {
@@ -139,7 +143,9 @@ void cluster(const string& config_file, const string& data_file, const string& o
       return;
     }
     logger->log(" done.\n", LOG_INFO);
+#ifdef USE_NVTX
     nvtxRangePop();
+#endif
     int s_nsph = sconf->number_of_spheres;
     int nsph = gconf->number_of_spheres;
     if (s_nsph == nsph) {
@@ -225,9 +231,13 @@ void cluster(const string& config_file, const string& data_file, const string& o
 	// do the first iteration on jxi488 separately, since it seems to be different from the others
 	int jxi488 = 1;
 	chrono::time_point<chrono::high_resolution_clock> start_iter_1 = chrono::high_resolution_clock::now();
+#ifdef USE_NVTX
 	nvtxRangePush("First iteration");
+#endif
 	int jer = cluster_jxi488_cycle(jxi488, sconf, gconf, p_scattering_angles, cid, output, output_path, tppoan);
+#ifdef USE_NVTX
 	nvtxRangePop();
+#endif
 	chrono::time_point<chrono::high_resolution_clock> end_iter_1 = chrono::high_resolution_clock::now();
 	elapsed = start_iter_1 - t_start;
 	string message = "INFO: Calculation setup took " + to_string(elapsed.count()) + "s.\n";
@@ -263,7 +273,9 @@ void cluster(const string& config_file, const string& data_file, const string& o
 	// Create this variable and initialise it with a default here, so that it is defined anyway, with or without OpenMP support enabled
 	int ompnumthreads = 1;
 
+#ifdef USE_NVTX
 	nvtxRangePush("Parallel loop");
+#endif
 #pragma omp parallel
 	{
 	  // Create and initialise this variable here, so that if OpenMP is enabled it is local to the thread, and if OpenMP is not enabled it has a well-defiled value anyway
@@ -315,9 +327,11 @@ void cluster(const string& config_file, const string& data_file, const string& o
 	    logger->log(message);
 	  }
 	} // closes pragma omp parallel
+#ifdef USE_NVTX
 	nvtxRangePop();
 
 	nvtxRangePush("Output concatenation");
+#endif
 #ifdef _OPENMP
 #pragma omp barrier
 	{
@@ -391,7 +405,9 @@ void cluster(const string& config_file, const string& data_file, const string& o
 	  }
 	}
 #endif
+#ifdef USE_NVTX
 	nvtxRangePop();
+#endif
 	tppoanp->close();
 	delete tppoanp;
       } else { // In case TPPOAN could not be opened. Should never happen.
@@ -598,7 +614,9 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf
   np_int ndit = 2 * nsph * cid->c4->nlim;
   int isq, ibf;
 
+#ifdef USE_NVTX
   nvtxRangePush("Prepare matrix calculation");
+#endif
   fprintf(output, "========== JXI =%3d ====================\n", jxi488);
   double xi = sconf->get_scale(jxi488 - 1);
   double exdc = sconf->exdc;
@@ -655,19 +673,29 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf
       //break;
     }
   } // i132 loop
+#ifdef USE_NVTX
   nvtxRangePop();
+#endif
   interval_start = chrono::high_resolution_clock::now();
+#ifdef USE_NVTX
   nvtxRangePush("Calculate inverted matrix");
+#endif
   cms(cid->am, cid->c1, cid->c1ao, cid->c4, cid->c6);
+#ifdef USE_NVTX
   nvtxRangePop();
+#endif
   interval_end = chrono::high_resolution_clock::now();
   elapsed = interval_end - interval_start;
   message = "INFO: matrix calculation for scale " + to_string(jxi488) + " took " + to_string(elapsed.count()) + "s.\n";
   logger->log(message);
   interval_start = chrono::high_resolution_clock::now();
+#ifdef USE_NVTX
   nvtxRangePush("Invert the matrix");
+#endif
   invert_matrix(cid->am, ndit, jer, mxndm);
+#ifdef USE_NVTX
   nvtxRangePop();
+#endif
   interval_end = chrono::high_resolution_clock::now();
   elapsed = interval_end - interval_start;
   message = "INFO: matrix inversion for scale " + to_string(jxi488) + " took " + to_string(elapsed.count()) + "s.\n";
@@ -679,7 +707,9 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf
     // break; // jxi488 loop: goes to memory clean
   }
   interval_start = chrono::high_resolution_clock::now();
+#ifdef USE_NVTX
   nvtxRangePush("Average calculation");
+#endif
   ztm(cid->am, cid->c1, cid->c1ao, cid->c4, cid->c6, cid->c9);
   if (idfc >= 0) {
     if (jxi488 == jwtm) {
@@ -747,13 +777,17 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf
   tppoan.write(reinterpret_cast<char *>(&(cid->vk)), sizeof(double));
   pcrsm0(cid->vk, exri, inpol, cid->c1, cid->c1ao, cid->c4);
   apcra(cid->zpv, cid->c4->le, cid->c1ao->am0m, inpol, sqk, cid->gapm, cid->gappm);
+#ifdef USE_NVTX
   nvtxRangePop();
+#endif
   interval_end = chrono::high_resolution_clock::now();
   elapsed = interval_end - interval_start;
   message = "INFO: average calculation for scale " + to_string(jxi488) + " took " + to_string(elapsed.count()) + "s.\n";
   logger->log(message);
   interval_start = chrono::high_resolution_clock::now();
+#ifdef USE_NVTX
   nvtxRangePush("Angle loop");
+#endif
   double th = sa->th;
   for (int jth486 = 1; jth486 <= sa->nth; jth486++) { // OpenMP portable?
     double ph = sa->ph;
@@ -1238,7 +1272,9 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf
     } // jph484 loop
     th += sa->thstp;
   } // jth486 loop
+#ifdef USE_NVTX
   nvtxRangePop();
+#endif
   interval_end = chrono::high_resolution_clock::now();
   elapsed = interval_end - interval_start;
   message = "INFO: angle loop for scale " + to_string(jxi488) + " took " + to_string(elapsed.count()) + "s.\n";
diff --git a/src/make.inc b/src/make.inc
index 70a9cb6398cf742bbc31e2aa410eaefc7ac8bd5d..4754af13194d4a7eeb833c7029cab213addbb4ec 100644
--- a/src/make.inc
+++ b/src/make.inc
@@ -102,13 +102,24 @@ endif
 ifdef CUDA_HOME
 override MAGMA_LDFLAGS+= -L$(CUDA_HOME)/lib64
 endif
-override MAGMA_LDFLAGS+= -lmagma -lcudart -lnvToolsExt
+# -lnvToolsExt is no longer linked with MAGMA: NVTX profiling is now opt-in through USE_NVTX (see below)
+override MAGMA_LDFLAGS+= -lmagma -lcudart
 #the next endif is for USE_MAGMA
 endif
 
+# Define USE_NVTX outside this file (make command line or environment) to compile in the NVIDIA NVTX profiling ranges
+ifdef USE_NVTX
+override NVTX_FLAGS=-DUSE_NVTX
+ifdef CUDA_HOME
+override NVTX_FLAGS+= -I$(CUDA_HOME)/include
+# closes CUDA_HOME
+endif
+# closes USE_NVTX
+endif
+
 # CXXFLAGS defines the default compilation options for the C++ compiler
 ifndef CXXFLAGS
-override CXXFLAGS=-O3 -ggdb -pg -coverage -I$(HDF5_INCLUDE) $(MPI_CXXFLAGS)
+override CXXFLAGS=-O3 -ggdb -pg -coverage -I$(HDF5_INCLUDE) $(MPI_CXXFLAGS) $(NVTX_FLAGS)
 ifdef USE_OPENMP
 override CXXFLAGS+= -fopenmp
 # closes USE_OPENMP