diff --git a/src/cluster/cluster.cpp b/src/cluster/cluster.cpp
index 8c4790564598e5d4529c9b18dcb6b480d4fe738c..db96ecb6f77225fa45a317f61de40c02e1dfbf2c 100644
--- a/src/cluster/cluster.cpp
+++ b/src/cluster/cluster.cpp
@@ -22,6 +22,7 @@
 #include <exception>
 #include <fstream>
 #include <string>
+#include <nvtx3/nvToolsExt.h>
 #ifdef _OPENMP
 #include <omp.h>
 #endif
@@ -84,6 +85,7 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf
  */
 void cluster(const string& config_file, const string& data_file, const string& output_path, const mixMPI *mpidata) {
   chrono::time_point<chrono::high_resolution_clock> t_start = chrono::high_resolution_clock::now();
+  nvtxRangePush("Set up starts");
   chrono::duration<double> elapsed;
   string message;
   string timing_name = output_path + "/c_timing_mpi"+ to_string(mpidata->rank) +".log";
@@ -219,10 +221,13 @@ void cluster(const string& config_file, const string& data_file, const string& o
 	  fprintf(output, "  VK=%15.7lE, XI IS SCALE FACTOR FOR LENGTHS\n", cid->vk);
 	  fprintf(output, " \n");
 	}
+	nvtxRangePop();
 	// do the first iteration on jxi488 separately, since it seems to be different from the others
 	int jxi488 = 1;
 	chrono::time_point<chrono::high_resolution_clock> start_iter_1 = chrono::high_resolution_clock::now();
+	nvtxRangePush("First iteration starts");
 	int jer = cluster_jxi488_cycle(jxi488, sconf, gconf, p_scattering_angles, cid, output, output_path, tppoan);
+	nvtxRangePop();
 	chrono::time_point<chrono::high_resolution_clock> end_iter_1 = chrono::high_resolution_clock::now();
 	elapsed = start_iter_1 - t_start;
 	string message = "INFO: Calculation setup took " + to_string(elapsed.count()) + "s.\n";
@@ -258,6 +263,7 @@ void cluster(const string& config_file, const string& data_file, const string& o
 	// Create this variable and initialise it with a default here, so that it is defined anyway, with or without OpenMP support enabled
 	int ompnumthreads = 1;
 
+	nvtxRangePush("Parallel loop starts");
 #pragma omp parallel
 	{
 	  // Create and initialise this variable here, so that if OpenMP is enabled it is local to the thread, and if OpenMP is not enabled it has a well-defiled value anyway
@@ -309,7 +315,9 @@ void cluster(const string& config_file, const string& data_file, const string& o
 	    logger->log(message);
 	  }
 	} // closes pragma omp parallel
+	nvtxRangePop();
 
+	nvtxRangePush("Output concatenation starts");
 #ifdef _OPENMP
 #pragma omp barrier
 	{
@@ -383,6 +391,7 @@ void cluster(const string& config_file, const string& data_file, const string& o
 	  }
 	}
 #endif
+	nvtxRangePop();
 	tppoanp->close();
 	delete tppoanp;
       } else { // In case TPPOAN could not be opened. Should never happen.
diff --git a/src/make.inc b/src/make.inc
index d6acdf1adc670531ae582b82bc29a7a854fd3b69..70a9cb6398cf742bbc31e2aa410eaefc7ac8bd5d 100644
--- a/src/make.inc
+++ b/src/make.inc
@@ -102,7 +102,7 @@ endif
 ifdef CUDA_HOME
 override MAGMA_LDFLAGS+= -L$(CUDA_HOME)/lib64
 endif
-override MAGMA_LDFLAGS+= -lmagma -lcudart
+override MAGMA_LDFLAGS+= -lmagma -lcudart -lnvToolsExt
 #the next endif is for USE_MAGMA
 endif