diff --git a/src/cluster/cluster.cpp b/src/cluster/cluster.cpp
index db96ecb6f77225fa45a317f61de40c02e1dfbf2c..870a0150732dc98e700fdad70a256582b12842e8 100644
--- a/src/cluster/cluster.cpp
+++ b/src/cluster/cluster.cpp
@@ -85,7 +85,6 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf
  */
 void cluster(const string& config_file, const string& data_file, const string& output_path, const mixMPI *mpidata) {
   chrono::time_point<chrono::high_resolution_clock> t_start = chrono::high_resolution_clock::now();
-  nvtxRangePush("Set up starts");
   chrono::duration<double> elapsed;
   string message;
   string timing_name = output_path + "/c_timing_mpi"+ to_string(mpidata->rank) +".log";
@@ -109,6 +108,7 @@ void cluster(const string& config_file, const string& data_file, const string& o
 #endif
   // the following only happens on MPI process 0
   if (mpidata->rank == 0) {
+    nvtxRangePush("Set up");
     logger->log("INFO: making legacy configuration...", LOG_INFO);
     ScattererConfiguration *sconf = NULL;
     try {
@@ -139,6 +139,7 @@ void cluster(const string& config_file, const string& data_file, const string& o
       return;
     }
     logger->log(" done.\n", LOG_INFO);
+    nvtxRangePop();
     int s_nsph = sconf->number_of_spheres;
     int nsph = gconf->number_of_spheres;
     if (s_nsph == nsph) {
@@ -221,11 +222,10 @@ void cluster(const string& config_file, const string& data_file, const string& o
 	  fprintf(output, "  VK=%15.7lE, XI IS SCALE FACTOR FOR LENGTHS\n", cid->vk);
 	  fprintf(output, " \n");
 	}
-	nvtxRangePop();
 	// do the first iteration on jxi488 separately, since it seems to be different from the others
 	int jxi488 = 1;
 	chrono::time_point<chrono::high_resolution_clock> start_iter_1 = chrono::high_resolution_clock::now();
-	nvtxRangePush("First iteration starts");
+	nvtxRangePush("First iteration");
 	int jer = cluster_jxi488_cycle(jxi488, sconf, gconf, p_scattering_angles, cid, output, output_path, tppoan);
 	nvtxRangePop();
 	chrono::time_point<chrono::high_resolution_clock> end_iter_1 = chrono::high_resolution_clock::now();
@@ -263,7 +263,7 @@ void cluster(const string& config_file, const string& data_file, const string& o
 	// Create this variable and initialise it with a default here, so that it is defined anyway, with or without OpenMP support enabled
 	int ompnumthreads = 1;
 
-	nvtxRangePush("Parallel loop starts");
+	nvtxRangePush("Parallel loop");
 #pragma omp parallel
 	{
 	  // Create and initialise this variable here, so that if OpenMP is enabled it is local to the thread, and if OpenMP is not enabled it has a well-defiled value anyway
@@ -317,7 +317,7 @@ void cluster(const string& config_file, const string& data_file, const string& o
 	} // closes pragma omp parallel
 	nvtxRangePop();
 
-	nvtxRangePush("Output concatenation starts");
+	nvtxRangePush("Output concatenation");
 #ifdef _OPENMP
 #pragma omp barrier
 	{
@@ -597,7 +597,8 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf
   int jwtm = gconf->jwtm;
   np_int ndit = 2 * nsph * cid->c4->nlim;
   int isq, ibf;
-  
+
+  nvtxRangePush("Prepare matrix calculation");
   fprintf(output, "========== JXI =%3d ====================\n", jxi488);
   double xi = sconf->get_scale(jxi488 - 1);
   double exdc = sconf->exdc;
@@ -654,14 +655,19 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf
       //break;
     }
   } // i132 loop
+  nvtxRangePop();
   interval_start = chrono::high_resolution_clock::now();
+  nvtxRangePush("Calculate inverted matrix");
   cms(cid->am, cid->c1, cid->c1ao, cid->c4, cid->c6);
+  nvtxRangePop();
   interval_end = chrono::high_resolution_clock::now();
   elapsed = interval_end - interval_start;
   message = "INFO: matrix calculation for scale " + to_string(jxi488) + " took " + to_string(elapsed.count()) + "s.\n";
   logger->log(message);
   interval_start = chrono::high_resolution_clock::now();
+  nvtxRangePush("Invert the matrix");
   invert_matrix(cid->am, ndit, jer, mxndm);
+  nvtxRangePop();
   interval_end = chrono::high_resolution_clock::now();
   elapsed = interval_end - interval_start;
   message = "INFO: matrix inversion for scale " + to_string(jxi488) + " took " + to_string(elapsed.count()) + "s.\n";
@@ -673,6 +679,7 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf
     // break; // jxi488 loop: goes to memory clean
   }
   interval_start = chrono::high_resolution_clock::now();
+  nvtxRangePush("Average calculation");
   ztm(cid->am, cid->c1, cid->c1ao, cid->c4, cid->c6, cid->c9);
   if (idfc >= 0) {
     if (jxi488 == jwtm) {
@@ -740,11 +747,13 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf
   tppoan.write(reinterpret_cast<char *>(&(cid->vk)), sizeof(double));
   pcrsm0(cid->vk, exri, inpol, cid->c1, cid->c1ao, cid->c4);
   apcra(cid->zpv, cid->c4->le, cid->c1ao->am0m, inpol, sqk, cid->gapm, cid->gappm);
+  nvtxRangePop();
   interval_end = chrono::high_resolution_clock::now();
   elapsed = interval_end - interval_start;
   message = "INFO: average calculation for scale " + to_string(jxi488) + " took " + to_string(elapsed.count()) + "s.\n";
   logger->log(message);
   interval_start = chrono::high_resolution_clock::now();
+  nvtxRangePush("Angle loop");
   double th = sa->th;
   for (int jth486 = 1; jth486 <= sa->nth; jth486++) { // OpenMP portable?
     double ph = sa->ph;
@@ -1229,6 +1238,7 @@ int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConf
     } // jph484 loop
     th += sa->thstp;
   } // jth486 loop
+  nvtxRangePop();
   interval_end = chrono::high_resolution_clock::now();
   elapsed = interval_end - interval_start;
   message = "INFO: angle loop for scale " + to_string(jxi488) + " took " + to_string(elapsed.count()) + "s.\n";