diff --git a/src/include/IterationData.h b/src/include/IterationData.h
index 6ab981db1616318f194dfb3d614c3a9efca4d265..a8d82dbf9e2d5a02d583fc94cf3f5f80442437dd 100644
--- a/src/include/IterationData.h
+++ b/src/include/IterationData.h
@@ -313,6 +313,8 @@ public:
   int proc_device;
-  //! \brief Refinement mode selction flag.
+  //! \brief Refinement mode selection flag.
   int refinemode;
+  //! \brief Flag marking the first iteration on a scale, used to trigger initialisation.
+  bool is_first_scale;
   //! \brief Maximum number of refinement iterations.
   int maxrefiters;
   //! \brief Required accuracy level.
@@ -484,6 +486,8 @@ public:
   double **tqss;
   //! \brief Scattering coefficients tensor.
   double ****zpv;
+  //! \brief Flag marking the first iteration on a scale, used to trigger first-time initialisation.
+  bool is_first_scale;
   
   /*! \brief `SphereIterationData` default instance constructor.
    *
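
Both iteration-data classes gain a per-object is_first_scale flag that replaces the old (jxi488 == 1) test. The following is a minimal, self-contained sketch of the pattern, with hypothetical names (IterationState, scale_cycle) that are not part of this patch: the constructor marks the first use, the per-scale routine binds the flag by reference, and clearing it at the end of the cycle persists on the object so later calls skip the one-time initialisation.

#include <cstdio>

struct IterationState {          // stand-in for InclusionIterationData / SphereIterationData
  bool is_first_scale = true;    // the constructor marks the first use of the object
};

void scale_cycle(int jxi, IterationState *state) {
  bool &is_first_scale = state->is_first_scale;  // reference, not a copy
  if (is_first_scale) {
    std::printf("scale %d: performing first-time initialisation\n", jxi);
  }
  // ... per-scale work would go here ...
  is_first_scale = false;  // persists on the object, so later calls skip the initialisation
}

int main() {
  IterationState state;
  for (int jxi = 1; jxi <= 3; ++jxi) scale_cycle(jxi, &state);
  return 0;
}
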
diff --git a/src/inclusion/inclusion.cpp b/src/inclusion/inclusion.cpp
index 7d6d4ae38b3403862747b80dc9ac3e310e05d101..b70306c044b424f3d719d8df501b98f605964d44 100644
--- a/src/inclusion/inclusion.cpp
+++ b/src/inclusion/inclusion.cpp
@@ -278,50 +278,50 @@ void inclusion(const string& config_file, const string& data_file, const string&
       }
       
-      // do the first iteration on jxi488 separately, since it seems to be different from the others
+      // the first iteration is no longer run separately; first-time initialisation is driven by cid->is_first_scale
-      int jxi488 = 1;
+      int jxi488;
       int initialmaxrefiters = cid->maxrefiters;
       
-      chrono::time_point<chrono::high_resolution_clock> start_iter_1 = chrono::high_resolution_clock::now();
-#ifdef USE_NVTX
-      nvtxRangePush("First iteration");
-#endif
+//       chrono::time_point<chrono::high_resolution_clock> start_iter_1 = chrono::high_resolution_clock::now();
+// #ifdef USE_NVTX
+//       nvtxRangePush("First iteration");
+// #endif
       // use these pragmas, which should have no effect on parallelism, just to push OMP nested levels at the same level also in the first wavelength iteration
       int jer = 0;
-#pragma omp parallel
-      {
-#pragma omp single
-	{
-	  jer = inclusion_jxi488_cycle(jxi488, sconf, gconf, p_scattering_angles, cid, p_output, output_path, vtppoanp);
-	}
-      }
-#ifdef USE_NVTX
-      nvtxRangePop();
-#endif
-      chrono::time_point<chrono::high_resolution_clock> end_iter_1 = chrono::high_resolution_clock::now();
-      elapsed = start_iter_1 - t_start;
-      string message = "INFO: Calculation setup took " + to_string(elapsed.count()) + "s.\n";
-      logger->log(message);
-      time_logger->log(message);
-      elapsed = end_iter_1 - start_iter_1;
-      message = "INFO: First iteration took " + to_string(elapsed.count()) + "s.\n";
-      logger->log(message);
-      time_logger->log(message);
+// #pragma omp parallel
+//       {
+// #pragma omp single
+// 	{
+// 	  jer = inclusion_jxi488_cycle(jxi488, sconf, gconf, p_scattering_angles, cid, p_output, output_path, vtppoanp);
+// 	}
+//       }
+// #ifdef USE_NVTX
+//       nvtxRangePop();
+// #endif
+//       chrono::time_point<chrono::high_resolution_clock> end_iter_1 = chrono::high_resolution_clock::now();
+//       elapsed = start_iter_1 - t_start;
+//       string message = "INFO: Calculation setup took " + to_string(elapsed.count()) + "s.\n";
+//       logger->log(message);
+//       time_logger->log(message);
+//       elapsed = end_iter_1 - start_iter_1;
+//       message = "INFO: First iteration took " + to_string(elapsed.count()) + "s.\n";
+//       logger->log(message);
+//       time_logger->log(message);
       /* for the next iterations, just always do maxiter iterations, assuming the accuracy is good enough */
-      cid->refinemode = 0;
-      /* add an extra iteration for margin, if this does not exceed initialmaxrefiters */
-      // if (cid->maxrefiters < initialmaxrefiters) cid->maxrefiters++;
-      if (jer != 0) {
-	// First loop failed. Halt the calculation.
-	fclose(timing_file);
-	delete time_logger;
-	delete p_output;
-	delete p_scattering_angles;
-	delete cid;
-	delete logger;
-	delete sconf;
-	delete gconf;
-	return;
-      }
+      // cid->refinemode = 0;
+      // /* add an extra iteration for margin, if this does not exceed initialmaxrefiters */
+      // // if (cid->maxrefiters < initialmaxrefiters) cid->maxrefiters++;
+      // if (jer != 0) {
+      // 	// First loop failed. Halt the calculation.
+      // 	fclose(timing_file);
+      // 	delete time_logger;
+      // 	delete p_output;
+      // 	delete p_scattering_angles;
+      // 	delete cid;
+      // 	delete logger;
+      // 	delete sconf;
+      // 	delete gconf;
+      // 	return;
+      // }
 
       //==================================================
       // do the first outputs here, so that I open here the new files, afterwards I only append
@@ -415,7 +415,7 @@ void inclusion(const string& config_file, const string& data_file, const string&
 	}
 #pragma omp barrier
 	// ok, now I can actually start the parallel calculations
-	for (int ixi488=2; ixi488<=cid_2->number_of_scales; ixi488 +=myMPIstride) {
+	for (int ixi488=1; ixi488<=cid_2->number_of_scales; ixi488 +=myMPIstride) {
 	  // the parallel loop over MPI processes covers a different set of indices for each thread
 #pragma omp barrier
 	  int myjxi488 = ixi488+myompthread;
@@ -595,7 +595,7 @@ void inclusion(const string& config_file, const string& data_file, const string&
       // make sure all threads align here: I don't want the following loop to accidentally start for thread 0, possibly modifying some variables before they are copied by all other threads
 #pragma omp barrier
       // ok, now I can actually start the parallel calculations
-      for (int ixi488=2; ixi488<=cid_2->number_of_scales; ixi488 +=myMPIstride) {
+      for (int ixi488=1; ixi488<=cid_2->number_of_scales; ixi488 +=myMPIstride) {
 	// the parallel loop over MPI processes covers a different set of indices for each thread
 #pragma omp barrier
 	int myjxi488 = ixi488 + myjxi488startoffset + myompthread;
@@ -685,7 +685,9 @@ int inclusion_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryCo
   int jer = 0;
   int lcalc = 0;
   int jaw = 1;
-  bool is_first_scale = (jxi488 == 1);
+  // is_first_scale is now part of cid; bind it by reference so that updates persist across calls
+  bool &is_first_scale = cid->is_first_scale;
+  //bool is_first_scale = (jxi488 == 1);
   // int li = cid->c1->li;
   // int le = cid->c1->le;
   // int lm = cid->c1->lm;
@@ -1414,6 +1416,8 @@ int inclusion_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryCo
     } // jph484 loop
     th += sa->thstp;
   } // jth486 loop
+  // at the end of this iteration, clear is_first_scale; the next iteration will reinitialise only if the order changes
+  is_first_scale = false;
 #ifdef USE_NVTX
   nvtxRangePop();
 #endif
@@ -1534,6 +1538,8 @@ InclusionIterationData::InclusionIterationData(GeometryConfiguration *gconf, Sca
   proc_device = 0;
 #endif
 
+  // the first use of the object counts as a "first iteration", so that data structures are properly initialised
+  is_first_scale = true;
   // In the first iteration, if refinement is enabled, determine the number of refinement iterations required to arrive at the target accuracy (if achievable in a reasonable number of iterations)
   refinemode = 2;
   // maxrefiters and accuracygoal should be configurable and preferably set somewhere else
@@ -1685,6 +1691,7 @@ InclusionIterationData::InclusionIterationData(const InclusionIterationData& rhs
   extr = rhs.extr;
   
   proc_device = rhs.proc_device;
+  is_first_scale = rhs.is_first_scale;
   refinemode = rhs.refinemode;
   maxrefiters = rhs.maxrefiters;
   accuracygoal = rhs.accuracygoal;
@@ -1823,6 +1830,7 @@ InclusionIterationData::InclusionIterationData(const mixMPI *mpidata, const int
   proc_device = 0;
 #endif
   MPI_Bcast(&refinemode, 1, MPI_INT, 0, MPI_COMM_WORLD);
+  MPI_Bcast(&is_first_scale, 1, MPI_C_BOOL, 0, MPI_COMM_WORLD);
   MPI_Bcast(&maxrefiters, 1, MPI_INT, 0, MPI_COMM_WORLD);
   MPI_Bcast(&accuracygoal, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
 }
@@ -1888,6 +1896,7 @@ void InclusionIterationData::mpibcast(const mixMPI *mpidata) {
   MPI_Bcast(&nimd, 1, MPI_INT, 0, MPI_COMM_WORLD);
   MPI_Bcast(&extr, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
   MPI_Bcast(&refinemode, 1, MPI_INT, 0, MPI_COMM_WORLD);
+  MPI_Bcast(&is_first_scale, 1, MPI_C_BOOL, 0, MPI_COMM_WORLD);
   MPI_Bcast(&maxrefiters, 1, MPI_INT, 0, MPI_COMM_WORLD);
   MPI_Bcast(&accuracygoal, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
 }
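
The flag is also broadcast so that worker ranks reconstructed from the MPI payload agree with rank 0. Below is a reduced sketch of that step, assuming an MPI 2.2+ implementation and reusing the hypothetical IterationState name; note that MPI_C_BOOL formally matches C's _Bool and is commonly used for C++ bool when the two types have the same size (MPI_CXX_BOOL would be the strict C++ match).

#include <mpi.h>
#include <cstdio>

struct IterationState {
  bool is_first_scale;
};

int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);
  int rank = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  IterationState state;
  state.is_first_scale = (rank == 0);  // only rank 0 holds the authoritative value
  // broadcast so that every rank enters the scale loop with the same flag
  MPI_Bcast(&state.is_first_scale, 1, MPI_C_BOOL, 0, MPI_COMM_WORLD);
  std::printf("rank %d: is_first_scale = %d\n", rank, (int)state.is_first_scale);

  MPI_Finalize();
  return 0;
}
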
diff --git a/src/sphere/sphere.cpp b/src/sphere/sphere.cpp
index d123b0ff3927e89da061c1c946190575d19afa9c..7d26091f21762d4f1edf166e1a65cf6dd7c92163 100644
--- a/src/sphere/sphere.cpp
+++ b/src/sphere/sphere.cpp
@@ -201,25 +201,25 @@ void sphere(const string& config_file, const string& data_file, const string& ou
       }
 
       // Do the first wavelength iteration
-      int jxi488 = 1;
+      // int jxi488 = 1;
       // Use pragmas to put OMP parallelism to second level.
       int jer = 0;
-#pragma omp parallel
-      {
-#pragma omp single
-	{
-	  jer = sphere_jxi488_cycle(jxi488 - 1, sconf, gconf, p_sa, sid, p_output, output_path, vtppoanp);
-	} // OMP single
-      } // OMP parallel
-      if (jer != 0) { // First iteration failed. Halt the calculation.
-	delete p_output;
-	delete p_sa;
-	delete sid;
-	delete logger;
-	delete sconf;
-	delete gconf;
-	return;
-      }
+// #pragma omp parallel
+//       {
+// #pragma omp single
+// 	{
+// 	  jer = sphere_jxi488_cycle(jxi488 - 1, sconf, gconf, p_sa, sid, p_output, output_path, vtppoanp);
+// 	} // OMP single
+//       } // OMP parallel
+//       if (jer != 0) { // First iteration failed. Halt the calculation.
+// 	delete p_output;
+// 	delete p_sa;
+// 	delete sid;
+// 	delete logger;
+// 	delete sconf;
+// 	delete gconf;
+// 	return;
+//       }
 
       //==================================================
       // do the first outputs here, so that I open here the new files, afterwards I only append
@@ -310,7 +310,7 @@ void sphere(const string& config_file, const string& data_file, const string& ou
 	}
 #pragma omp barrier
 	// ok, now I can actually start the parallel calculations
-	for (int ixi488=2; ixi488<=sid_2->number_of_scales; ixi488 +=myMPIstride) {
+	for (int ixi488=1; ixi488<=sid_2->number_of_scales; ixi488 +=myMPIstride) {
 	  // the parallel loop over MPI processes covers a different set of indices for each thread
 #pragma omp barrier
 	  int myjxi488 = ixi488+myompthread;
@@ -463,7 +463,7 @@ void sphere(const string& config_file, const string& data_file, const string& ou
       // make sure all threads align here: I don't want the following loop to accidentally start for thread 0, possibly modifying some variables before they are copied by all other threads
 #pragma omp barrier
       // ok, now I can actually start the parallel calculations
-      for (int ixi488=2; ixi488<=sid_2->number_of_scales; ixi488 +=myMPIstride) {
+      for (int ixi488=1; ixi488<=sid_2->number_of_scales; ixi488 +=myMPIstride) {
 	// the parallel loop over MPI processes covers a different set of indices for each thread
 #pragma omp barrier
 	int myjxi488 = ixi488 + myjxi488startoffset + myompthread;
@@ -546,7 +546,9 @@ int sphere_jxi488_cycle(
   int oindex = 0;
   int jxi = jxi488 + 1;
   int jxindex = jxi - oi->first_xi;
-  bool is_first_scale = (jxi == 1);
+  // is_first_scale is now part of sid; bind it by reference so that updates persist across calls
+  bool &is_first_scale = sid->is_first_scale;
+  //bool is_first_scale = (jxi == 1);
   int nsph = gconf->number_of_spheres;
   //int l_max = gconf->l_max;
   int in_pol = gconf->in_pol;
@@ -889,6 +891,8 @@ int sphere_jxi488_cycle(
     } // jph484 loop on elevation
     th += sa->thstp;
   } // jth486 loop on azimuth
+  // at the end of this iteration, clear is_first_scale; the next iteration will reinitialise only if the order changes
+  is_first_scale = false;
   oi->vec_jxi[jxindex] = jxi;
   logger->log("INFO: finished scale iteration " + to_string(jxi) + " of " + to_string(nxi) + ".\n");
   delete logger;
@@ -977,6 +981,7 @@ SphereIterationData::SphereIterationData(
       zpv[zi][zj][1] = (vec_zpv + vec_index + 2);
     }
   }
+  is_first_scale = true;
 }
 
 SphereIterationData::SphereIterationData(const SphereIterationData &rhs) {
@@ -1082,6 +1087,7 @@ SphereIterationData::SphereIterationData(const SphereIterationData &rhs) {
       zpv[zi][zj][1] = (vec_zpv + vec_index + 2);
     }
   }
+  is_first_scale = rhs.is_first_scale;
 }
 
 SphereIterationData::~SphereIterationData() {
@@ -1207,6 +1213,9 @@ SphereIterationData::SphereIterationData(const mixMPI *mpidata, const int device
   // Collect vectors whose size depend on LM
   vec_zpv = new double[12 * _lm];
   MPI_Bcast(vec_zpv, 12 * _lm, MPI_DOUBLE, 0, MPI_COMM_WORLD);
+
+  MPI_Bcast(&is_first_scale, 1, MPI_C_BOOL, 0, MPI_COMM_WORLD);
+
 }
 
 int SphereIterationData::mpibcast(const mixMPI *mpidata) {
@@ -1269,7 +1278,9 @@ int SphereIterationData::mpibcast(const mixMPI *mpidata) {
 
   // Broadcast vectors whose size depend on LM
   MPI_Bcast(vec_zpv, 12 * _lm, MPI_DOUBLE, 0, MPI_COMM_WORLD);
-  
+
+  MPI_Bcast(&is_first_scale, 1, MPI_C_BOOL, 0, MPI_COMM_WORLD);
+
   return 0;
 }
 #endif // MPI_VERSION
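
With the separate first iteration removed, the scale loops in both solvers start at ixi488 = 1 and every index goes through the same stride-distributed schedule. The sketch below illustrates that index distribution under assumed sizes; the offset variable stands in for myjxi488startoffset and the names are illustrative only.

#include <cstdio>

int main() {
  const int number_of_scales = 10;                 // assumed problem size
  const int mpi_size = 2, omp_threads = 2;         // assumed parallel layout
  const int myMPIstride = mpi_size * omp_threads;  // one full block of workers

  for (int rank = 0; rank < mpi_size; ++rank) {
    for (int thread = 0; thread < omp_threads; ++thread) {
      const int offset = rank * omp_threads;       // stands in for myjxi488startoffset
      // the loop now starts at 1: the first scale is no longer treated separately
      for (int ixi488 = 1; ixi488 <= number_of_scales; ixi488 += myMPIstride) {
        const int myjxi488 = ixi488 + offset + thread;
        if (myjxi488 <= number_of_scales)
          std::printf("rank %d, thread %d -> scale %d\n", rank, thread, myjxi488);
      }
    }
  }
  return 0;
}
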