/* Copyright (C) 2024   INAF - Osservatorio Astronomico di Cagliari

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.
   
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.
   
   A copy of the GNU General Public License is distributed along with
   this program in the COPYING file. If not, see: <https://www.gnu.org/licenses/>.
 */

/*! \file sphere.cpp
 *
 * \brief Implementation of the single sphere calculation.
 */
#include <cstdio>
#include <exception>
#include <fstream>
#include <hdf5.h>
#include <string>

#ifdef _OPENMP
#include <omp.h>
#endif

#ifdef USE_MPI
#ifndef MPI_VERSION
#include <mpi.h>
#endif
#endif

#ifndef INCLUDE_TYPES_H_
#include "../include/types.h"
#endif

#ifndef INCLUDE_ERRORS_H_
#include "../include/errors.h"
#endif

#ifndef INCLUDE_LOGGING_H_
#include "../include/logging.h"
#endif

#ifndef INCLUDE_CONFIGURATION_H_
#include "../include/Configuration.h"
#endif

#ifndef INCLUDE_COMMONS_H_
#include "../include/Commons.h"
#endif

#ifndef INCLUDE_SPH_SUBS_H_
#include "../include/sph_subs.h"
#endif

#ifndef INCLUDE_TRANSITIONMATRIX_H_
#include "../include/TransitionMatrix.h"
#endif

#ifndef INCLUDE_LIST_H_
#include "../include/List.h"
#endif

#ifndef INCLUDE_FILE_IO_H_
#include "../include/file_io.h"
#endif

#ifndef INCLUDE_OUTPUTS_H_
#include "../include/outputs.h"
#endif

#ifndef INCLUDE_ITERATION_DATA_H_
#include "../include/IterationData.h"
#endif

using namespace std;

/*! \brief Perform a single wavelength (scale) iteration of the main calculation loop.
 *
 *  \param jxi488: `int` Wavelength loop index.
 *  \param sconf: `ScattererConfiguration *` Pointer to a `ScattererConfiguration` object.
 *  \param gconf: `GeometryConfiguration *` Pointer to a `GeometryConfiguration` object.
 *  \param sa: `ScatteringAngles *` Pointer to a `ScatteringAngles` object.
 *  \param sid: `SphereIterationData *` Pointer to a `SphereIterationData` object.
 *  \param oi: `SphereOutputInfo *` Pointer to a `SphereOutputInfo` object.
 *  \param output_path: `const string &` Path to the output directory.
 *  \param vtppoanp: `VirtualBinaryFile *` Pointer to a `VirtualBinaryFile` object.
 *  \return `int` Error code (0 on success).
 */
int sphere_jxi488_cycle(
  int jxi488, ScattererConfiguration *sconf, GeometryConfiguration *gconf,
  ScatteringAngles *sa, SphereIterationData *sid, SphereOutputInfo *oi,
  const string& output_path, VirtualBinaryFile *vtppoanp
);

/*! \brief C++ implementation of SPH
 *
 *  \param config_file: `string` Name of the configuration file.
 *  \param data_file: `string` Name of the input data file.
 *  \param output_path: `string` Directory to write the output files in.
 *  \param mpidata: `const mixMPI *` Pointer to a mixMPI data structure.
 */
void sphere(const string& config_file, const string& data_file, const string& output_path, const mixMPI *mpidata) {
  Logger *logger = new Logger(LOG_INFO);
  int device_count = 0;

  //===========================
  // the following only happens on MPI process 0
  //===========================
  if (mpidata->rank == 0) {
    logger->log("INFO: making legacy configuration...");
    ScattererConfiguration *sconf = NULL;
    try {
      sconf = ScattererConfiguration::from_dedfb(config_file);
    } catch(const OpenConfigurationFileException &ex) {
      logger->err("\nERROR: failed to open scatterer configuration file.\n");
      string message = ex.what();
      logger->err("FILE: " + message + "\n");
      delete logger;
      return;
    }
    sconf->write_formatted(output_path + "/c_OEDFB");
    sconf->write_binary(output_path + "/c_TEDF");
    sconf->write_binary(output_path + "/c_TEDF.hd5", "HDF5");
    GeometryConfiguration *gconf = NULL;
    try {
      gconf = GeometryConfiguration::from_legacy(data_file);
    } catch(const OpenConfigurationFileException &ex) {
      logger->err("\nERROR: failed to open geometry configuration file.\n");
      string message = ex.what();
      logger->err("FILE: " + message + "\n");
      if (sconf != NULL) delete sconf;
      delete logger;
      return;
    }
    int s_nsph = sconf->number_of_spheres;
    int nsph = gconf->number_of_spheres;
    int configurations = sconf->configurations;
    logger->log(" done.\n");
    // Sanity check: the numbers of spheres in the geometry and scatterer configurations must always match.
    if (s_nsph == nsph) {
      ScatteringAngles *p_sa = new ScatteringAngles(gconf);
      SphereIterationData *sid = new SphereIterationData(gconf, sconf, mpidata, 0);
      SphereOutputInfo *p_output = new SphereOutputInfo(sconf, gconf, mpidata);
      // FILE *output = fopen((output_path + "/c_OSPH").c_str(), "w");
      const double half_pi = acos(0.0);
      const double pi = 2.0 * half_pi;
      sid->c1->gcs = 0.0;
      for (int i116 = 0; i116 < nsph; i116++) {
	int i = i116 + 1;
	int iogi = sid->c1->iog[i116];
	if (iogi >= i) {
	  double gcss = pi * sid->c1->ros[i116] * sid->c1->ros[i116];
	  sid->c1->gcsv[i116] = gcss;
	  int nsh = sid->c1->nshl[i116];
	  for (int j115 = 0; j115 < nsh; j115++) {
	    sid->c1->rc[i116][j115] = sconf->get_rcf(i116, j115) * sid->c1->ros[i116];
	  }
	}
	sid->c1->gcs += sid->c1->gcsv[iogi - 1];
      }
      thdps(gconf->l_max, sid->zpv);
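      // exdc is (presumably) the dielectric constant of the external medium; its
      // square root, exri, is the external refractive index used throughout the run.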
      double exdc = sconf->exdc;
      double exri = sqrt(exdc);

      // Create empty virtual binary file
      VirtualBinaryFile *vtppoanp = new VirtualBinaryFile();
      string tppoan_name = output_path + "/c_TPPOAN";
      int imode = 10, tmpvalue;

      //========================
      // write a block of info to virtual binary file
      //========================
      vtppoanp->append_line(VirtualBinaryLine(imode));
      tmpvalue = gconf->isam;
      vtppoanp->append_line(VirtualBinaryLine(tmpvalue));
      tmpvalue = gconf->in_pol;
      vtppoanp->append_line(VirtualBinaryLine(tmpvalue));
      vtppoanp->append_line(VirtualBinaryLine(s_nsph));
      tmpvalue = p_sa->nth;
      vtppoanp->append_line(VirtualBinaryLine(tmpvalue));
      tmpvalue = p_sa->nph;
      vtppoanp->append_line(VirtualBinaryLine(tmpvalue));
      tmpvalue = p_sa->nths;
      vtppoanp->append_line(VirtualBinaryLine(tmpvalue));
      tmpvalue = p_sa->nphs;
      vtppoanp->append_line(VirtualBinaryLine(tmpvalue));
      vtppoanp->append_line(VirtualBinaryLine(nsph));
      for (int nsi = 0; nsi < nsph; nsi++) {
	tmpvalue = sid->c1->iog[nsi];
	vtppoanp->append_line(VirtualBinaryLine(tmpvalue));
      }

      if (sconf->idfc < 0) {
	sid->vk = sid->xip * sid->wn;
	p_output->vec_vk[0] = sid->vk;
      }

      // Do the first wavelength iteration
      int jxi488 = 1;
      // Run the first iteration inside an OpenMP parallel/single block, so that any parallelism inside the cycle is pushed to the second (nested) level.
      int jer = 0;
#pragma omp parallel
      {
#pragma omp single
	{
	  jer = sphere_jxi488_cycle(jxi488 - 1, sconf, gconf, p_sa, sid, p_output, output_path, vtppoanp);
	} // OMP single
      } // OMP parallel
      if (jer != 0) { // First iteration failed. Halt the calculation.
	delete p_output;
	delete p_sa;
	delete sid;
	delete logger;
	delete sconf;
	delete gconf;
	return;
      }

      //==================================================
      // do the first outputs here, so that the new files are created now; afterwards we only append
      //==================================================
      vtppoanp->write_to_disk(output_path + "/c_TPPOAN");
      delete vtppoanp;

      // if MPI is actually in use, broadcast the data structures from process 0 so that they are duplicated on all other MPI processes
#ifdef MPI_VERSION
      if (mpidata->mpirunning) {
	gconf->mpibcast(mpidata);
	sconf->mpibcast(mpidata);	    
	sid->mpibcast(mpidata);
	p_sa->mpibcast(mpidata);
      }	
#endif
      // Create this variable and initialise it with a default here, so that it is defined anyway, with or without OpenMP support enabled
      int ompnumthreads = 1;
      // this is for MPI process 0 (or even if we are not using MPI at all)
      int myjxi488startoffset = 0;
      int myMPIstride = ompnumthreads;
      int myMPIblock = ompnumthreads;
      // Define here shared arrays of output structures and virtual binary files, so that thread 0 will be able to access them all later
      SphereOutputInfo **p_outarray = NULL;
      VirtualBinaryFile **vtppoanarray = NULL;
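      // Work distribution scheme: MPI process 0 covers the first myMPIblock
      // (= ompnumthreads) scales of every pass, each other process starts at the
      // offset sent to it below, and the outer wavelength loop then advances in
      // steps of myMPIstride scales per pass.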

      //===========================================
      // open the OpenMP parallel context, so that each thread can initialise its own data
      //===========================================
#pragma omp parallel
      {
	// Create and initialise this variable here, so that if OpenMP is enabled it is local to the thread, and if OpenMP is not enabled it has a well-defined value anyway
	int myompthread = 0;
	
#ifdef _OPENMP
	// If OpenMP is enabled, give actual values to myompthread and ompnumthreads, and open thread-local output files
	myompthread = omp_get_thread_num();
	if (myompthread == 0) ompnumthreads = omp_get_num_threads();
#endif

	if (myompthread == 0) {
	  // Initialise some shared variables only on thread 0
	  p_outarray = new SphereOutputInfo*[ompnumthreads];
	  vtppoanarray = new VirtualBinaryFile*[ompnumthreads];
	  myMPIblock = ompnumthreads;
	  myMPIstride = myMPIblock;
	}

#ifdef MPI_VERSION
	if (myompthread == 0) {
	  if (mpidata->mpirunning) {
	    // only go through this if MPI has been actually used
	    for (int rr=1; rr<mpidata->nprocs; rr++) {
	      // individually send each remote process its starting offset: it starts immediately after the scales covered by the processes handled so far
	      int remotejxi488startoffset = myMPIstride;
	      MPI_Send(&remotejxi488startoffset, 1, MPI_INT, rr, 3, MPI_COMM_WORLD);
	      int remoteMPIblock;
	      MPI_Recv(&remoteMPIblock, 1, MPI_INT, rr, 3, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
	      // update myMPIstride to include the ones due to MPI process rr
	      myMPIstride += remoteMPIblock;
	    }
	    // now that the total myMPIstride is known, broadcast it to all processes
	    MPI_Bcast(&myMPIstride, 1, MPI_INT, 0, MPI_COMM_WORLD);
	  }
	}
#endif
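	// After the handshake above, myMPIstride equals the total number of OpenMP
	// threads summed over all MPI processes (or just ompnumthreads when MPI is
	// not in use), i.e. the number of scales consumed per pass of the strided loop below.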
	// add an omp barrier to make sure that the global variables defined by thread 0 are known to all threads below this
#pragma omp barrier

	// To test parallelism, feed this function with "clean" copies of the parameters, so that they are not changed by previous iterations and each iteration behaves like the first one.
	// Declare all (empty) pointers here, so they have the correct scope; they then get different definitions depending on the thread number.
	SphereIterationData *sid_2 = NULL;
	SphereOutputInfo *p_output_2 = NULL;
	VirtualBinaryFile *vtppoanp_2 = NULL;
	// for threads other than 0, create distinct copies of all relevant data, while for thread 0 just define new references / pointers to the original ones
	if (myompthread == 0) {
	  sid_2 = sid;
	  // OMP thread 0 of MPI process 0 holds the pointer to the full output structure
	  p_output_2 = p_output;
	  p_outarray[0] = p_output_2;
	} else {
	  // this is not thread 0, so do create fresh copies of all local variables
	  sid_2 = new SphereIterationData(*sid);
	}
	// make sure all threads align here: I don't want the following loop to accidentally start for thread 0, possibly modifying some variables before they are copied by all other threads
	if (myompthread==0) {
	  logger->log("Syncing OpenMP threads and starting the loop on wavelengths\n");
	  // Thread 0 of process 0 has already allocated all necessary output memory
	}
#pragma omp barrier
	// ok, now I can actually start the parallel calculations
	for (int ixi488=2; ixi488<=sid_2->number_of_scales; ixi488 +=myMPIstride) {
	  // the parallel loop over MPI processes covers a different set of indices for each thread
#pragma omp barrier
	  int myjxi488 = ixi488+myompthread;
	  // each thread opens new virtual files and stores their pointers in the shared array
	  vtppoanp_2 = new VirtualBinaryFile();
	  // each thread puts a copy of the pointers to its virtual files in the shared arrays
	  vtppoanarray[myompthread] = vtppoanp_2;
#pragma omp barrier

	  // each MPI process handles a number of contiguous scales corresponding to its number of OMP threads at this omp level of parallelism
	  if (myjxi488 <= sid_2->number_of_scales) {
	    if (myompthread > 0) {
	      // UPDATE: non-0 threads need to allocate memory for one scale at a time.
	      p_output_2 = new SphereOutputInfo(sconf, gconf, mpidata, myjxi488, 1);
	      p_outarray[myompthread] = p_output_2;
	    }
	    int jer = sphere_jxi488_cycle(myjxi488 - 1, sconf, gconf, p_sa, sid_2, p_output_2, output_path, vtppoanp_2);
	  } else {
	    if (myompthread > 0) {
	      // If there is no input for this thread, mark to skip.
	      p_outarray[myompthread] = new SphereOutputInfo(1);
	    }
	  }
#pragma omp barrier
	  // threads different from 0 append their virtual files to the one of thread 0, and delete them
	  if (myompthread == 0) {
	    for (int ti=1; ti<ompnumthreads; ti++) {
	      p_outarray[0]->insert(*(p_outarray[ti]));
	      delete p_outarray[ti];
	      p_outarray[ti] = NULL;
	      vtppoanarray[0]->append(*(vtppoanarray[ti]));
	      delete vtppoanarray[ti];
	    }
	  }
#pragma omp barrier
	  //==============================================
	  // Collect all virtual files on thread 0 of MPI process 0, and append them to disk
	  //==============================================
	  if (myompthread == 0) {
	    // thread 0 writes its virtual files, now including contributions from all threads, to disk, and deletes them
	    // p_outarray[0]->append_to_disk(output_path + "/c_OCLU");
	    // delete p_outarray[0];
	    vtppoanarray[0]->append_to_disk(output_path + "/c_TPPOAN");
	    delete vtppoanarray[0];

#ifdef MPI_VERSION
	    if (mpidata->mpirunning) {
	      // only go through this if MPI has been actually used
	      for (int rr=1; rr<mpidata->nprocs; rr++) {
		// get the data from process rr by receiving it into the total memory structure
		p_outarray[0]->mpireceive(mpidata, rr);
		// get the data from process rr, creating a new virtual ascii file
		// VirtualAsciiFile *p_output = new VirtualAsciiFile(mpidata, rr);
		// append to disk and delete virtual ascii file
		// p_output->append_to_disk(output_path + "/c_OCLU");
		// delete p_output;
		
		// get the data from process rr, creating a new virtual binary file
		VirtualBinaryFile *vtppoanp = new VirtualBinaryFile(mpidata, rr);
		// append to disk and delete virtual binary file
		vtppoanp->append_to_disk(output_path + "/c_TPPOAN");
		delete vtppoanp;
		int test = MPI_Barrier(MPI_COMM_WORLD);
	      }
	    }
#endif
	  }
	  // end block writing to disk
#pragma omp barrier

	} // ixi488 strided MPI loop
#pragma omp barrier
	if (myompthread == 0) {
	  delete[] p_outarray;
	  delete[] vtppoanarray;
	}
	{
	  string message = "INFO: Closing thread-local output files of thread " + to_string(myompthread) + " and syncing threads.\n";
	  logger->log(message);
	}
	delete sid_2;
      } // OMP parallel
      delete p_sa;
      p_output->write(output_path + "/c_OSPH.hd5", "HDF5");
      p_output->write(output_path + "/c_OSPH", "LEGACY");
      delete p_output;
      logger->log("Finished. Output written to " + output_path + "/c_OSPH.\n");
    } else { // NSPH mismatch between geometry and scatterer configurations.
      throw UnrecognizedConfigurationException(
        "Inconsistent geometry and scatterer configurations."
      );
    }
    delete sconf;
    delete gconf;
  } // end of instruction block for MPI process 0
  
  //===============================
  // instruction block for MPI processes different from 0
  //===============================
#ifdef MPI_VERSION
  else { // Instruction block for MPI processes other than 0.
    // here goes the code for MPI processes other than 0
    // copy gconf, sconf, sid and p_sa from MPI process 0
    GeometryConfiguration *gconf = new GeometryConfiguration(mpidata);
    ScattererConfiguration *sconf = new ScattererConfiguration(mpidata);
    SphereIterationData *sid = new SphereIterationData(mpidata, device_count);
    ScatteringAngles *p_sa = new ScatteringAngles(mpidata);
    
    // Create this variable and initialise it with a default here, so that it is defined anyway, with or without OpenMP support enabled
    int ompnumthreads = 1;
    SphereOutputInfo **p_outarray = NULL;
    VirtualBinaryFile **vtppoanarray = NULL;
    int myjxi488startoffset;
    int myMPIstride = ompnumthreads;
    int myMPIblock = ompnumthreads;

#pragma omp parallel
    {
      // Create and initialise this variable here, so that if OpenMP is enabled it is local to the thread, and if OpenMP is not enabled it has a well-defined value anyway
      int myompthread = 0;
#ifdef _OPENMP
      // If OpenMP is enabled, give actual values to myompthread and ompnumthreads, and open thread-local output files
      myompthread = omp_get_thread_num();
      if (myompthread == 0) ompnumthreads = omp_get_num_threads();
#endif
      if (myompthread == 0) {
	// receive the start parameter from MPI process 0
	MPI_Recv(&myjxi488startoffset, 1, MPI_INT, 0, 3, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
	// send my number of omp threads to process 0
	MPI_Send(&ompnumthreads, 1, MPI_INT, 0, 3, MPI_COMM_WORLD);
	// receive myMPIstride sent by MPI process 0 to all processes
	MPI_Bcast(&myMPIstride, 1, MPI_INT, 0, MPI_COMM_WORLD);
	// allocate virtual files for each thread
	p_outarray = new SphereOutputInfo*[ompnumthreads];
	vtppoanarray = new VirtualBinaryFile*[ompnumthreads];
      }
#pragma omp barrier
      // To test parallelism, feed this function with "clean" copies of the parameters, so that they are not changed by previous iterations and each iteration behaves like the first one.
      // Declare all (empty) pointers here, so they have the correct scope; they then get different definitions depending on the thread number.
      SphereIterationData *sid_2 = NULL;
      SphereOutputInfo *p_output_2 = NULL;
      VirtualBinaryFile *vtppoanp_2 = NULL;
      // PLACEHOLDER
      // for threads other than 0, create distinct copies of all relevant data, while for thread 0 just define new references / pointers to the original ones
      if (myompthread == 0) {
	sid_2 = sid;
      } else {
	// this is not thread 0, so do create fresh copies of all local variables
	sid_2 = new SphereIterationData(*sid);
      }
      // make sure all threads align here: I don't want the following loop to accidentally start for thread 0, possibly modifying some variables before they are copied by all other threads
#pragma omp barrier
      // ok, now I can actually start the parallel calculations
      for (int ixi488=2; ixi488<=sid_2->number_of_scales; ixi488 +=myMPIstride) {
	// the parallel loop over MPI processes covers a different set of indices for each thread
#pragma omp barrier
	int myjxi488 = ixi488 + myjxi488startoffset + myompthread;
	// each thread opens new virtual files and stores their pointers in the shared array
	vtppoanp_2 = new VirtualBinaryFile();
	// each thread puts a copy of the pointers to its virtual files in the shared arrays
	vtppoanarray[myompthread] = vtppoanp_2;
#pragma omp barrier
	if (myompthread==0) logger->log("Syncing OpenMP threads and starting the loop on wavelengths\n");
	// ok, now I can actually start the parallel calculations
	// each MPI process handles a number of contiguous scales corresponding to its number of OMP threads at this omp level of parallelism
	if (myjxi488 <= sid_2->number_of_scales) {
	  if (myompthread > 0) {
	    // UPDATE: non-0 threads need to allocate memory for one scale at a time.
	    p_output_2 = new SphereOutputInfo(sconf, gconf, mpidata, myjxi488, 1);
	    p_outarray[myompthread] = p_output_2;
	  } else {
	    // Thread 0 of non-zero MPI processes needs to allocate memory for the
	    // output of all threads _doing something_.
	    int iterstodo = sid_2->number_of_scales - myjxi488 + 1;
	    if (iterstodo > ompnumthreads) iterstodo = ompnumthreads;
	    p_output_2 = new SphereOutputInfo(sconf, gconf, mpidata, myjxi488, iterstodo);
	    p_outarray[0] = p_output_2;
	  }
	  int jer = sphere_jxi488_cycle(myjxi488 - 1, sconf, gconf, p_sa, sid_2, p_output_2, output_path, vtppoanp_2);
	} else {
	  p_outarray[myompthread] = new SphereOutputInfo(1);
	}

#pragma omp barrier
	// threads different from 0 append their virtual files to the one of thread 0, and delete them
	if (myompthread == 0) {
	  for (int ti=1; ti<ompnumthreads; ti++) {
	    p_outarray[0]->insert(*(p_outarray[ti]));
	    delete p_outarray[ti];
	    p_outarray[ti] = NULL;
	    vtppoanarray[0]->append(*(vtppoanarray[ti]));
	    delete vtppoanarray[ti];
	  }
	  // thread 0 sends the collected virtual files to thread 0 of MPI process 0, then deletes them
	  for (int rr=1; rr<mpidata->nprocs; rr++) {
	    if (rr == mpidata->rank) {
	      p_outarray[0]->mpisend(mpidata);
	      delete p_outarray[0];
	      p_outarray[0] = NULL;
	      vtppoanarray[0]->mpisend(mpidata);
	      delete vtppoanarray[0];
	    }
	    int test = MPI_Barrier(MPI_COMM_WORLD);
	  }
	}
      } // ixi488: close strided loop running on MPI processes
      // Clean memory
#pragma omp barrier
      if (myompthread == 0) {
	delete[] p_outarray;
	delete[] vtppoanarray;
      }
      delete sid_2;
    } // OMP parallel
    delete p_sa;
    delete sconf;
    delete gconf;
  } // End instructions block for MPI non-0 processes.
#endif // MPI_VERSION
  delete logger;
} // sphere()

int sphere_jxi488_cycle(
  int jxi488, ScattererConfiguration *sconf, GeometryConfiguration *gconf,
  ScatteringAngles *sa, SphereIterationData *sid, SphereOutputInfo *oi,
  const string& output_path, VirtualBinaryFile *vtppoanp
) {
  const dcomplex cc0 = 0.0 + I * 0.0;
  const double half_pi = acos(0.0);
  const double pi = 2.0 * half_pi;
  int jer = 0;
  Logger *logger = new Logger(LOG_INFO);
  int oindex = 0;
  int jxi = jxi488 + 1;
  int jxindex = jxi - oi->first_xi;
  int nsph = gconf->number_of_spheres;
  int l_max = gconf->l_max;
  int in_pol = gconf->in_pol;
  int npnt = gconf->npnt;
  int npntts = gconf->npntts;
  int jwtm = gconf->jwtm;
  double wn = sconf->wp / 3.0e8;
  double sqsfi = 1.0;
  int idfc = sconf->idfc;
  int nxi = sconf->number_of_scales;
  int configurations = sconf->configurations;
  int ndirs = sa->nkks;
  int lcalc;
  int jw, isq, ibf;
  logger->log("INFO: running scale iteration " + to_string(jxi) + " of " + to_string(nxi) + ".\n");
  double vk, vkarg;
  double xi = sconf->get_scale(jxi488);
  double exdc = sconf->exdc;
  double exri = sqrt(exdc);
  if (idfc >= 0) {
    vk = xi * wn;
    vkarg = vk;
    oi->vec_vk[jxindex] = vk;
    oi->vec_xi[jxindex] = xi;
  } else { // IDFC < 0
    vk = sconf->xip * wn;
    vkarg = xi * vk;
    sqsfi = 1.0 / (xi * xi);
    oi->vec_vk[jxindex] = vk;
    oi->vec_xi[jxindex] = xi;
  }
  // Adaptive definition of L_MAX
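  // The truncation order computed below appears to follow the Wiscombe convergence
  // criterion, N = x + 4.05 * x^(1/3) + 2, with x the size parameter of the sphere;
  // the configured l_max is only ever lowered to N, never raised.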
  double wavelength = 2.0 * pi / vk;
  double size_param = 2.0 * pi * sconf->get_radius(0) / wavelength;
  int N = int(size_param + 4.05 * pow(size_param, 1.0 / 3.0)) + 2;
  if (N < l_max) l_max = N;
  // End of adaptive definition of L_MAX
  vtppoanp->append_line(VirtualBinaryLine(vk));
  double thsca = (gconf->isam > 1) ? sa->ths - sa->th : 0.0;
  for (int i132 = 0; i132 < nsph; i132++) {
    int i = i132 + 1;
    int iogi = sid->c1->iog[i132];
    if (iogi != i) {
      for (int l123 = 0; l123 < l_max; l123++) {
	sid->c1->rmi[l123][i132] = sid->c1->rmi[l123][iogi - 1];
	sid->c1->rei[l123][i132] = sid->c1->rei[l123][iogi - 1];
      }
      continue; // i132
    }
    // label 125
    int nsh = sid->c1->nshl[i132];
    int ici = (nsh + 1) / 2;
    if (idfc == 0) {
      for (int ic = 0; ic < ici; ic++)
	sid->c1->dc0[ic] = sconf->get_dielectric_constant(ic, i132, jxi488); // WARNING: IDFC=0 is not tested!
    } else { // IDFC != 0
      if (jxi == 1) {
	for (int ic = 0; ic < ici; ic++) {
	  sid->c1->dc0[ic] = sconf->get_dielectric_constant(ic, i132, jxi488);
	}
      }
    }
    if (nsh % 2 == 0) sid->c1->dc0[ici] = exdc;
    dme(l_max, i, npnt, npntts, vkarg, exdc, exri, sid->c1, jer, lcalc, sid->arg);
    if (jer != 0) {
      oi->vec_ier[jxindex] = 1;
      oi->lcalc = lcalc;
      delete logger;
      return jer;
    }
  } // i132
  if (idfc >= 0 and nsph == 1 and jxi == jwtm) {
    // This is the condition that writes the transition matrix to output.
    string ttms_name = output_path + "/c_TTMS.hd5";
    TransitionMatrix::write_binary(
      ttms_name, l_max, vk, exri, sid->c1->rmi, sid->c1->rei,
      sconf->get_radius(0), "HDF5"
    );
    ttms_name = output_path + "/c_TTMS";
    TransitionMatrix::write_binary(
      ttms_name, l_max, vk, exri, sid->c1->rmi, sid->c1->rei,
      sconf->get_radius(0)
    );
  }
  double cs0 = 0.25 * vk * vk * vk / half_pi;
  sscr0(sid->tfsas, nsph, l_max, vk, exri, sid->c1);
  double sqk = vk * vk * exdc;
  aps(sid->zpv, l_max, nsph, sid->c1, sqk, sid->gaps);
  rabas(in_pol, l_max, nsph, sid->c1, sid->tqse, sid->tqspe, sid->tqss, sid->tqsps);
  int last_configuration = 0;
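  // Per-configuration output vectors are stored as contiguous per-scale blocks,
  // so that oindex = jxindex * configurations + (configuration index).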
  for (int i170 = 0; i170 < nsph; i170++) {
    int i = i170 + 1;
    if (sid->c1->iog[i170] >= i) {
      last_configuration++;
      oindex = jxindex * configurations + last_configuration - 1;
      double albeds = sid->c1->sscs[i170] / sid->c1->sexs[i170];
      sid->c1->sqscs[i170] *= sqsfi;
      sid->c1->sqabs[i170] *= sqsfi;
      sid->c1->sqexs[i170] *= sqsfi;
      if (sid->c1->nshl[i170] != 1) {
	oi->vec_sphere_ref_indices[oindex] = cc0;
	oi->vec_sphere_sizes[oindex] = sid->c1->vsz[i170];
      } else {
	oi->vec_sphere_ref_indices[oindex] = sid->c1->vkt[i170];
	oi->vec_sphere_sizes[oindex] = sid->c1->vsz[i170];
      }
      oi->vec_scs[oindex] = sid->c1->sscs[i170];
      oi->vec_abs[oindex] = sid->c1->sabs[i170];
      oi->vec_exs[oindex] = sid->c1->sexs[i170];
      oi->vec_albeds[oindex] = albeds;
      oi->vec_scsrt[oindex] = sid->c1->sqscs[i170];
      oi->vec_absrt[oindex] = sid->c1->sqabs[i170];
      oi->vec_exsrt[oindex] = sid->c1->sqexs[i170];
      oi->vec_fsas[oindex] = sid->c1->fsas[i170];
      double csch = 2.0 * vk * sqsfi / sid->c1->gcsv[i170];
      dcomplex s0 = sid->c1->fsas[i170] * exri;
      double qschu = csch * imag(s0);
      double pschu = csch * real(s0);
      double s0mag = cs0 * cabs(s0);
      oi->vec_qschu[oindex] = qschu;
      oi->vec_pschu[oindex] = pschu;
      oi->vec_s0mag[oindex] = s0mag;
      double rapr = sid->c1->sexs[i170] - sid->gaps[i170];
      double cosav = sid->gaps[i170] / sid->c1->sscs[i170];
      oi->vec_cosav[oindex] = cosav;
      oi->vec_raprs[oindex] = rapr;
      oi->vec_tqek1[oindex] = sid->tqse[0][i170];
      oi->vec_tqsk1[oindex] = sid->tqss[0][i170];
      oi->vec_tqek2[oindex] = sid->tqse[1][i170];
      oi->vec_tqsk2[oindex] = sid->tqss[1][i170];
      double value = sid->tqse[0][i170];
      vtppoanp->append_line(VirtualBinaryLine(value));
      value = sid->tqss[0][i170];
      vtppoanp->append_line(VirtualBinaryLine(value));
      value = real(sid->tqspe[0][i170]);
      vtppoanp->append_line(VirtualBinaryLine(value));
      value = imag(sid->tqspe[0][i170]);
      vtppoanp->append_line(VirtualBinaryLine(value));
      value = real(sid->tqsps[0][i170]);
      vtppoanp->append_line(VirtualBinaryLine(value));
      value = imag(sid->tqsps[0][i170]);
      vtppoanp->append_line(VirtualBinaryLine(value));
      value = sid->tqse[1][i170];
      vtppoanp->append_line(VirtualBinaryLine(value));
      value = sid->tqss[1][i170];
      vtppoanp->append_line(VirtualBinaryLine(value));
      value = real(sid->tqspe[1][i170]);
      vtppoanp->append_line(VirtualBinaryLine(value));
      value = imag(sid->tqspe[1][i170]);
      vtppoanp->append_line(VirtualBinaryLine(value));
      value = real(sid->tqsps[1][i170]);
      vtppoanp->append_line(VirtualBinaryLine(value));
      value = imag(sid->tqsps[1][i170]);
      vtppoanp->append_line(VirtualBinaryLine(value));
    } // End if iog[i170] >= i
  } // i170 loop
  if (nsph != 1) {
    oi->vec_fsat[jxindex] = sid->tfsas;
    double csch = 2.0 * vk * sqsfi / sid->c1->gcs;
    dcomplex s0 = sid->tfsas * exri;
    double qschu = csch * imag(s0);
    double pschu = csch * real(s0);
    double s0mag = cs0 * cabs(s0);
    oi->vec_qschut[jxindex] = qschu;
    oi->vec_pschut[jxindex] = pschu;
    oi->vec_s0magt[jxindex] = s0mag;
  }
  double th = sa->th;
  int done_dirs = 0;
  int nks = sa->nths * sa->nphs;
  int nkks = sa->nth * sa->nph * nks;
  int nth = sa->nth;
  int nph = sa->nph;
  double frx, fry, frz;
  for (int jth486 = 0; jth486 < sa->nth; jth486++) { // OpenMP parallelizable section
    int jth = jth486 + 1;
    double ph = sa->ph;
    for (int jph484 = 0; jph484 < sa->nph; jph484++) {
      int jph = jph484 + 1;
      bool goto182 = (sa->nk == 1) && (jxi > 1);
      if (!goto182) {
	upvmp(th, ph, 0, sid->cost, sid->sint, sid->cosp, sid->sinp, sid->u, sid->upmp, sid->unmp);
      }
      if (gconf->isam >= 0) {
	wmamp(0, sid->cost, sid->sint, sid->cosp, sid->sinp, in_pol, l_max, 0, nsph, sid->argi, sid->u, sid->upmp, sid->unmp, sid->c1);
	for (int i183 = 0; i183 < nsph; i183++) {
	  double rapr = sid->c1->sexs[i183] - sid->gaps[i183];
	  frx = rapr * sid->u[0];
	  fry = rapr * sid->u[1];
	  frz = rapr * sid->u[2];
	}
	jw = 1;
      }
      double thsl = sa->ths;
      double phsph = 0.0;
      for (int jths482 = 0; jths482 < sa->nths; jths482++) {
	int jths = jths482 + 1;
	double ths = thsl;
	int icspnv = 0;
	if (gconf->isam > 1) ths = th + thsca;
	if (gconf->isam >= 1) {
	  phsph = 0.0;
	  if ((ths < 0.0) || (ths > 180.0)) phsph = 180.0;
	  if (ths < 0.0) ths *= -1.0;
	  if (ths > 180.0) ths = 360.0 - ths;
	  if (phsph != 0.0) icspnv = 1;
	}
	double phs = sa->phs;
	for (int jphs480 = 0; jphs480 < sa->nphs; jphs480++) {
	  int jphs = jphs480 + 1;
	  if (gconf->isam >= 1) {
	    phs = ph + phsph;
	    if (phs >= 360.0) phs -= 360.0;
	  }
	  bool goto190 = (nks == 1) && ((jxi > 1) || (jth > 1) || (jph > 1));
	  if (!goto190) {
	    upvmp(ths, phs, icspnv, sid->costs, sid->sints, sid->cosps, sid->sinps, sid->us, sid->upsmp, sid->unsmp);
	    if (gconf->isam >= 0) {
	      wmamp(2, sid->costs, sid->sints, sid->cosps, sid->sinps, in_pol, l_max, 0, nsph, sid->args, sid->us, sid->upsmp, sid->unsmp, sid->c1);
	    }
	  }
	  if (nkks != 0 || jxi == 1) {
	    upvsp(
	      sid->u, sid->upmp, sid->unmp, sid->us, sid->upsmp, sid->unsmp,
	      sid->up, sid->un, sid->ups, sid->uns, sid->duk, isq, ibf,
	      sid->scan, sid->cfmp, sid->sfmp, sid->cfsp, sid->sfsp
	    );
	    if (gconf->isam < 0) {
	      wmasp(
		sid->cost, sid->sint, sid->cosp, sid->sinp, sid->costs,
		sid->sints, sid->cosps, sid->sinps, sid->u, sid->up,
		sid->un, sid->us, sid->ups, sid->uns, isq, ibf, in_pol,
		l_max, 0, nsph, sid->argi, sid->args, sid->c1
	      );
	    }
	    for (int i193 = 0; i193 < 3; i193++) {
	      sid->un[i193] = sid->unmp[i193];
	      sid->uns[i193] = sid->unsmp[i193];
	    }
	  }
	  if (gconf->isam < 0) jw = 1;
	  vtppoanp->append_line(VirtualBinaryLine(th));
	  vtppoanp->append_line(VirtualBinaryLine(ph));
	  vtppoanp->append_line(VirtualBinaryLine(ths));
	  vtppoanp->append_line(VirtualBinaryLine(phs));
	  double value = sid->scan;
	  vtppoanp->append_line(VirtualBinaryLine(value));
	  if (jw != 0) {
	    jw = 0;
	    value = sid->u[0];
	    vtppoanp->append_line(VirtualBinaryLine(value));
	    value = sid->u[1];
	    vtppoanp->append_line(VirtualBinaryLine(value));
	    value = sid->u[2];
	    vtppoanp->append_line(VirtualBinaryLine(value));
	  }
	  oi->vec_dir_scand[done_dirs] = sid->scan;
	  oi->vec_dir_cfmp[done_dirs] = sid->cfmp;
	  oi->vec_dir_cfsp[done_dirs] = sid->cfsp;
	  oi->vec_dir_sfmp[done_dirs] = sid->sfmp;
	  oi->vec_dir_sfsp[done_dirs] = sid->sfsp;
	  if (gconf->isam >= 0) {
	    oi->vec_dir_un[3 * done_dirs] = sid->un[0];
	    oi->vec_dir_un[3 * done_dirs + 1] = sid->un[1];
	    oi->vec_dir_un[3 * done_dirs + 2] = sid->un[2];
	    oi->vec_dir_uns[3 * done_dirs] = sid->uns[0];
	    oi->vec_dir_uns[3 * done_dirs + 1] = sid->uns[1];
	    oi->vec_dir_uns[3 * done_dirs + 2] = sid->uns[2];
	  } else {
	    oi->vec_dir_un[3 * done_dirs] = sid->un[0];
	    oi->vec_dir_un[3 * done_dirs + 1] = sid->un[1];
	    oi->vec_dir_un[3 * done_dirs + 2] = sid->un[2];
	    oi->vec_dir_uns[3 * done_dirs] = 0.0;
	    oi->vec_dir_uns[3 * done_dirs + 1] = 0.0;
	    oi->vec_dir_uns[3 * done_dirs + 2] = 0.0;
	  }
	  sscr2(nsph, l_max, vk, exri, sid->c1);
	  last_configuration = 0;
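	  // Directional per-sphere outputs are flattened as [scale][direction][sphere]
	  // (oindex below); the force components use an analogous [scale][theta][phi][sphere] layout.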
	  for (int ns226 = 0; ns226 < nsph; ns226++) {
	    int ns = ns226 + 1;
	    oindex = jxindex * nsph * ndirs + nsph * done_dirs + ns226;
	    oi->vec_dir_sas11[oindex] = sid->c1->sas[ns226][0][0];
	    oi->vec_dir_sas21[oindex] = sid->c1->sas[ns226][1][0];
	    oi->vec_dir_sas12[oindex] = sid->c1->sas[ns226][0][1];
	    oi->vec_dir_sas22[oindex] = sid->c1->sas[ns226][1][1];
	    oi->vec_dir_fx[jxindex * nsph * nth * nph + nsph * nph * (jth - 1) + nsph * (jph - 1) + ns226] = frx;
	    oi->vec_dir_fy[jxindex * nsph * nth * nph + nsph * nph * (jth - 1) + nsph * (jph - 1) + ns226] = fry;
	    oi->vec_dir_fz[jxindex * nsph * nth * nph + nsph * nph * (jth - 1) + nsph * (jph - 1) + ns226] = frz;
	    for (int i225 = 0; i225 < 16; i225++) sid->c1->vint[i225] = sid->c1->vints[ns226][i225];
	    mmulc(sid->c1->vint, sid->cmullr, sid->cmul);
	    for (int imul = 0; imul < 4; imul++) {
	      int muls_index = 16 * jxindex * nsph * ndirs + 16 * nsph * done_dirs + 4 * imul;
	      for (int jmul = 0; jmul < 4; jmul++) {
		oi->vec_dir_muls[muls_index + jmul] = sid->cmul[imul][jmul];
	      }
	    }
	    for (int imul = 0; imul < 4; imul++) {
	      int muls_index = 16 * jxindex * nsph * ndirs + 16 * nsph * done_dirs + 4 * imul;
	      for (int jmul = 0; jmul < 4; jmul++) {
		oi->vec_dir_mulslr[muls_index + jmul] = sid->cmullr[imul][jmul];
	      }
	    }
	    for (int vi = 0; vi < 16; vi++) {
	      value = real(sid->c1->vint[vi]);
	      vtppoanp->append_line(VirtualBinaryLine(value));
	      value = imag(sid->c1->vint[vi]);
	      vtppoanp->append_line(VirtualBinaryLine(value));
	    }
	    for (int imul = 0; imul < 4; imul++) {
	      for (int jmul = 0; jmul < 4; jmul++) {
		value = sid->cmul[imul][jmul];
		vtppoanp->append_line(VirtualBinaryLine(value));
	      }
	    }
	  } // ns226 loop
	  if (gconf->isam < 1) phs += sa->phsstp;
	  done_dirs++;
	} // jphs480 loop
	if (gconf->isam <= 1) thsl += sa->thsstp;
      } // jths482 loop
      ph += sa->phstp;
    } // jph484 loop on azimuth
    th += sa->thstp;
  } // jth486 loop on elevation
  oi->vec_jxi[jxindex] = jxi;
  logger->log("INFO: finished scale iteration " + to_string(jxi) + " of " + to_string(nxi) + ".\n");
  delete logger;
  return jer;
}

// >>> IMPLEMENTATION OF SphereIterationData CLASS <<<
SphereIterationData::SphereIterationData(
  GeometryConfiguration *gconf, ScattererConfiguration *sconf,
  const mixMPI *mpidata, const int device_count
) {
  const dcomplex cc0 = 0.0 + I * 0.0;
  _nsph = gconf->number_of_spheres;
  _lm = gconf->l_max;
  arg = cc0;
  s0 = cc0;
  tfsas = cc0;
  c1 = new ParticleDescriptorSphere(gconf, sconf);
  argi = new double[1]();
  args = new double[1]();
  cost = 0.0;
  sint = 0.0;
  cosp = 0.0;
  sinp = 0.0;
  costs = 0.0;
  sints = 0.0;
  cosps = 0.0;
  sinps = 0.0;
  scan = 0.0;
  cfmp = 0.0;
  cfsp = 0.0;
  sfmp = 0.0;
  sfsp = 0.0;
  wn = sconf->wp / 3.0e8;
  xip = sconf->xip;
  vk = 0.0;
  // Scale block initialization
  number_of_scales = sconf->number_of_scales;
  xiblock = (int) ceil(((double) (sconf->number_of_scales-1))/((double) mpidata->nprocs));
  lastxi = ((mpidata->rank+1) * xiblock)+1;
  firstxi = lastxi-xiblock+1;
  if (lastxi > sconf->number_of_scales) lastxi = sconf->number_of_scales;
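  // The first scale is handled separately by MPI process 0, so the remaining
  // number_of_scales - 1 scales are split into contiguous blocks of xiblock scales;
  // firstxi and lastxi nominally delimit this rank's block in the 1-based scale
  // numbering used by the wavelength loop.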
  // End of scale block initialization
  gaps = new double[_nsph]();
  duk = new double[3]();
  u = new double[3]();
  us = new double[3]();
  un = new double[3]();
  uns = new double[3]();
  up = new double[3]();
  ups = new double[3]();
  upmp = new double[3]();
  upsmp = new double[3]();
  unmp = new double[3]();
  unsmp = new double[3]();
  vec_cmul = new double[16]();
  vec_cmullr = new double[16]();
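  // cmul and cmullr are 4x4 row-major views over the flat 16-element buffers,
  // i.e. cmul[i][j] aliases vec_cmul[4 * i + j].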
  cmul = new double*[4];
  cmullr = new double*[4];
  for (int ci = 0; ci < 4; ci++) {
    cmul[ci] = (vec_cmul + 4 * ci);
    cmullr[ci] = (vec_cmullr + 4 * ci);
  }
  vec_tqspe = new dcomplex[2 * _nsph]();
  vec_tqsps = new dcomplex[2 * _nsph]();
  vec_tqse = new double[2 * _nsph]();
  vec_tqss = new double[2 * _nsph]();
  tqspe = new dcomplex*[2];
  tqsps = new dcomplex*[2];
  tqse = new double*[2];
  tqss = new double*[2];
  for (int ti = 0; ti < 2; ti++) {
    tqspe[ti] = (vec_tqspe + _nsph * ti);
    tqsps[ti] = (vec_tqsps + _nsph * ti);
    tqse[ti] = (vec_tqse + _nsph * ti);
    tqss[ti] = (vec_tqss + _nsph * ti);
  }
  vec_zpv = new double[_lm * 12]();
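  // zpv is an [_lm][3][2][2] view over vec_zpv: zpv[l][j][k] points at
  // vec_zpv + 12 * l + 4 * j + 2 * k, so each innermost pointer addresses
  // two consecutive doubles.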
  zpv = new double***[_lm];
  for (int zi = 0; zi < _lm; zi++) {
    zpv[zi] = new double**[3];
    for (int zj = 0; zj < 3; zj++) {
      int vec_index = 12 * zi + 4 * zj;
      zpv[zi][zj] = new double*[2];
      zpv[zi][zj][0] = (vec_zpv + vec_index);
      zpv[zi][zj][1] = (vec_zpv + vec_index + 2);
    }
  }
}

SphereIterationData::SphereIterationData(const SphereIterationData &rhs) {
  _nsph = rhs._nsph;
  _lm = rhs._lm;
  arg = rhs.arg;
  s0 = rhs.s0;
  tfsas = rhs.tfsas;
  c1 = new ParticleDescriptorSphere(reinterpret_cast<ParticleDescriptorSphere &>(*(rhs.c1)));
  argi = new double[1];
  args = new double[1];
  argi[0] = rhs.argi[0];
  args[0] = rhs.args[0];
  cost = rhs.cost;
  sint = rhs.sint;
  cosp = rhs.cosp;
  sinp = rhs.sinp;
  costs = rhs.costs;
  sints = rhs.sints;
  cosps = rhs.cosps;
  sinps = rhs.sinps;
  scan = rhs.scan;
  cfmp = rhs.cfmp;
  cfsp = rhs.cfsp;
  sfmp = rhs.sfmp;
  sfsp = rhs.sfsp;
  wn = rhs.wn;
  xip = rhs.xip;
  vk = rhs.vk;
  // Scale block initialization
  number_of_scales = rhs.number_of_scales;
  xiblock = rhs.xiblock;
  lastxi = rhs.lastxi;
  firstxi = rhs.firstxi;
  // End of scale block initialization
  gaps = new double[_nsph];
  for (int si = 0; si < _nsph; si++) gaps[si] = rhs.gaps[si];
  duk = new double[3];
  u = new double[3];
  us = new double[3];
  un = new double[3];
  uns = new double[3];
  up = new double[3];
  ups = new double[3];
  upmp = new double[3];
  upsmp = new double[3];
  unmp = new double[3];
  unsmp = new double[3];
  for (int ui = 0; ui < 3; ui++) {
    duk[ui] = rhs.duk[ui];
    u[ui] = rhs.u[ui];
    us[ui] = rhs.us[ui];
    un[ui] = rhs.un[ui];
    uns[ui] = rhs.uns[ui];
    up[ui] = rhs.up[ui];
    ups[ui] = rhs.ups[ui];
    upmp[ui] = rhs.upmp[ui];
    upsmp[ui] = rhs.upsmp[ui];
    unmp[ui] = rhs.unmp[ui];
    unsmp[ui] = rhs.unsmp[ui];
  }
  vec_cmul = new double[16];
  vec_cmullr = new double[16];
  for (int mi = 0; mi < 16; mi++) {
    vec_cmul[mi] = rhs.vec_cmul[mi];
    vec_cmullr[mi] = rhs.vec_cmullr[mi];
  }
  cmul = new double*[4];
  cmullr = new double*[4];
  for (int ci = 0; ci < 4; ci++) {
    cmul[ci] = (vec_cmul + 4 * ci);
    cmullr[ci] = (vec_cmullr + 4 * ci);
  }
  vec_tqspe = new dcomplex[2 * _nsph]();
  vec_tqsps = new dcomplex[2 * _nsph]();
  vec_tqse = new double[2 * _nsph]();
  vec_tqss = new double[2 * _nsph]();
  for (int ti = 0; ti < 2 * _nsph; ti++) {
    vec_tqspe[ti] = rhs.vec_tqspe[ti];
    vec_tqsps[ti] = rhs.vec_tqsps[ti];
    vec_tqse[ti] = rhs.vec_tqse[ti];
    vec_tqss[ti] = rhs.vec_tqss[ti];
  }
  tqspe = new dcomplex*[2];
  tqsps = new dcomplex*[2];
  tqse = new double*[2];
  tqss = new double*[2];
  for (int ti = 0; ti < 2; ti++) {
    tqspe[ti] = (vec_tqspe + _nsph * ti);
    tqsps[ti] = (vec_tqsps + _nsph * ti);
    tqse[ti] = (vec_tqse + _nsph * ti);
    tqss[ti] = (vec_tqss + _nsph * ti);
  }
  vec_zpv = new double[_lm * 12];
  for (int zi = 0; zi < _lm * 12; zi++) vec_zpv[zi] = rhs.vec_zpv[zi];
  zpv = new double***[_lm];
  for (int zi = 0; zi < _lm; zi++) {
    zpv[zi] = new double**[3];
    for (int zj = 0; zj < 3; zj++) {
      int vec_index = 12 * zi + 4 * zj;
      zpv[zi][zj] = new double*[2];
      zpv[zi][zj][0] = (vec_zpv + vec_index);
      zpv[zi][zj][1] = (vec_zpv + vec_index + 2);
    }
  }
}

SphereIterationData::~SphereIterationData() {
  int lm = c1->li;
  delete c1;
  delete[] argi;
  delete[] args;
  delete[] gaps;
  delete[] duk;
  delete[] u;
  delete[] us;
  delete[] un;
  delete[] uns;
  delete[] up;
  delete[] ups;
  delete[] upmp;
  delete[] upsmp;
  delete[] unmp;
  delete[] unsmp;
  delete[] vec_cmul;
  delete[] vec_cmullr;
  delete[] cmul;
  delete[] cmullr;
  delete[] vec_tqspe;
  delete[] vec_tqsps;
  delete[] vec_tqse;
  delete[] vec_tqss;
  delete[] tqspe;
  delete[] tqsps;
  delete[] tqse;
  delete[] tqss;
  delete[] vec_zpv;
  for (int zi = 0; zi < lm; zi++) {
    for (int zj = 0; zj < 3; zj++) {
      delete[] zpv[zi][zj];
    }
    delete[] zpv[zi];
  }
  delete[] zpv;
}

#ifdef MPI_VERSION
SphereIterationData::SphereIterationData(const mixMPI *mpidata, const int device_count) {
  // Build child object
  c1 = new ParticleDescriptorSphere(mpidata);

  // Collect the scalar values
  MPI_Bcast(&_nsph, 1, MPI_INT32_T, 0, MPI_COMM_WORLD);
  MPI_Bcast(&_lm, 1, MPI_INT32_T, 0, MPI_COMM_WORLD);
  MPI_Bcast(&arg, 1, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);
  MPI_Bcast(&s0, 1, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);
  MPI_Bcast(&tfsas, 1, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);
  MPI_Bcast(&cost, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(&sint, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(&cosp, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(&sinp, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(&costs, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(&sints, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(&cosps, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(&sinps, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(&scan, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(&cfmp, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(&cfsp, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(&sfmp, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(&sfsp, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(&wn, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(&xip, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(&vk, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(&xiblock, 1, MPI_INT, 0, MPI_COMM_WORLD);
  MPI_Bcast(&number_of_scales, 1, MPI_INT, 0, MPI_COMM_WORLD);
  lastxi = ((mpidata->rank+1) * xiblock)+1;
  firstxi = lastxi-xiblock+1;
  if (lastxi > number_of_scales) lastxi = number_of_scales;

  // Collect length-1 vectors
  argi = new double[1];
  args = new double[1];
  MPI_Bcast(argi, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(args, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  
  // Collect vectors whose sizes depend on NSPH
  gaps = new double[_nsph];
  vec_tqspe = new dcomplex[2 * _nsph];
  vec_tqsps = new dcomplex[2 * _nsph];
  vec_tqse = new double[2 * _nsph];
  vec_tqss = new double[2 * _nsph];
  MPI_Bcast(gaps, _nsph, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(vec_tqspe, 2 * _nsph, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);
  MPI_Bcast(vec_tqsps, 2 * _nsph, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);
  MPI_Bcast(vec_tqse, 2 * _nsph, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(vec_tqss, 2 * _nsph, MPI_DOUBLE, 0, MPI_COMM_WORLD);

  // Collect length-3 vectors
  duk = new double[3];
  u = new double[3];
  us = new double[3];
  un = new double[3];
  uns = new double[3];
  up = new double[3];
  ups = new double[3];
  upmp = new double[3];
  upsmp = new double[3];
  unmp = new double[3];
  unsmp = new double[3];
  MPI_Bcast(duk, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(u, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(us, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(un, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(uns, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(up, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(ups, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(upmp, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(upsmp, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(unmp, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(unsmp, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  
  // Collect length-16 vectors
  vec_cmul = new double[16];
  vec_cmullr = new double[16];
  MPI_Bcast(vec_cmul, 16, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(vec_cmullr, 16, MPI_DOUBLE, 0, MPI_COMM_WORLD);

  // Collect vectors whose sizes depend on LM
  vec_zpv = new double[12 * _lm];
  MPI_Bcast(vec_zpv, 12 * _lm, MPI_DOUBLE, 0, MPI_COMM_WORLD);
}

int SphereIterationData::mpibcast(const mixMPI *mpidata) {
  int result = 0;
  // Broadcast child object
  c1->mpibcast(mpidata);
  
  // Broadcast scalar values
  MPI_Bcast(&_nsph, 1, MPI_INT32_T, 0, MPI_COMM_WORLD);
  MPI_Bcast(&_lm, 1, MPI_INT32_T, 0, MPI_COMM_WORLD);
  MPI_Bcast(&arg, 1, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);
  MPI_Bcast(&s0, 1, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);
  MPI_Bcast(&tfsas, 1, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);
  MPI_Bcast(&cost, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(&sint, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(&cosp, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(&sinp, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(&costs, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(&sints, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(&cosps, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(&sinps, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(&scan, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(&cfmp, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(&cfsp, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(&sfmp, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(&sfsp, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(&wn, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(&xip, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(&vk, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(&xiblock, 1, MPI_INT, 0, MPI_COMM_WORLD);
  MPI_Bcast(&number_of_scales, 1, MPI_INT, 0, MPI_COMM_WORLD);

  // Broadcast length-1 vectors
  MPI_Bcast(argi, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(args, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);

  // Broadcast vectors whose sizes depend on NSPH
  MPI_Bcast(gaps, _nsph, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(vec_tqspe, 2 * _nsph, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);
  MPI_Bcast(vec_tqsps, 2 * _nsph, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD);
  MPI_Bcast(vec_tqse, 2 * _nsph, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(vec_tqss, 2 * _nsph, MPI_DOUBLE, 0, MPI_COMM_WORLD);

  // Broadcast length-3 vectors
  MPI_Bcast(duk, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(u, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(us, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(un, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(uns, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(up, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(ups, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(upmp, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(upsmp, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(unmp, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(unsmp, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  
  // Broadcast length-16 vectors
  MPI_Bcast(vec_cmul, 16, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(vec_cmullr, 16, MPI_DOUBLE, 0, MPI_COMM_WORLD);

  // Broadcast vectors whose sizes depend on LM
  MPI_Bcast(vec_zpv, 12 * _lm, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  
  return 0;
}
#endif // MPI_VERSION
// >>> END OF SphereIterationData CLASS IMPLEMENTATION <<<