diff --git a/src/include/IterationData.h b/src/include/IterationData.h index 46ebcee36ab9666511cd005795e035c32f22036d..92e21b0826996eff727c701862404eab060be315 100644 --- a/src/include/IterationData.h +++ b/src/include/IterationData.h @@ -179,6 +179,174 @@ public: */ ~ClusterIterationData(); }; -// >>> END OF DEFINITION OF ClusterIterationData CLASS <<< +// >>> END OF ClusterIterationData CLASS DEFINITION <<< + +// >>> DEFINITION OF InclusionIterationData CLASS <<< +/*! \brief A data structure representing the information used for a single scale + * of the INCLUSION case. + */ +class InclusionIterationData { +protected: + //! \brief Vectorized geometric asymmetry parameter components. + double *vec_zpv; + +public: + //! \brief External layer index. + int nimd; + //! \brief External layer radius. + double extr; + + //! \brief Pointer to a ParticleDescriptor structure. + ParticleDescriptor *c1; + //! \brief Vector of geometric asymmetry factors. + double *gaps; + //! \brief Components of extinction contribution to radiation torque on a single sphere along k. + double **tqse; + //! \brief Components of polarized extinction contribution to radiation torque on a single sphere along k. + dcomplex **tqspe; + //! \brief Components of scattering contribution to radiation torque on a single sphere along k. + double **tqss; + //! \brief Components of polarized scattering contribution to radiation torque on a single sphere along k. + dcomplex **tqsps; + //! \brief L-dependent coefficients of the geometric asymmetry parameter. + double ****zpv; + //! \brief Mean geometric asymmetry parameters. + double **gapm; + //! \brief Mean geometric asymmetry parameters referred to polarization plane. + dcomplex **gappm; + //! \brief Imaginary part of the harmonic functions argument. + double *argi; + //! \brief Argument of the harmonic functions referred to the scattering plane. + double *args; + //! \brief Geometric asymmetry parameters. + double **gap; + //! \brief Geometric asymmetry parameters referred to polarization plane. + dcomplex **gapp; + //! \brief Components of extinction contribution to radiation torque on the cluster along k. + double **tqce; + //! \brief Components of extinction contribution to radiation torque on the cluster along k referred to polarization plane. + dcomplex **tqcpe; + //! \brief Components of scattering contribution to radiation torque on the cluster along k. + double **tqcs; + //! \brief Components of scattering contribution to radiation torque on the cluster along k referred to polarization plane. + dcomplex **tqcps; + //! \brief Variation of unitary radiation vector. QUESTION: correct? + double *duk; + //! \brief Cluster extinction cross-section components referred to scattering plane. + double **cextlr; + //! \brief Cluster extinction cross-section components referred to meridional plane. + double **cext; + //! \brief Cluster Mueller Transformation Matrix components referred to scattering plane. + double **cmullr; + //! \brief Cluster Mueller Transformation Matrix components referred to meridional plane. + double **cmul; + //! \brief Geometric asymmetry parameter components. + double *gapv; + //! \brief Radiation extinction torque components. + double *tqev; + //! \brief Radiation scattering torque components. + double *tqsv; + //! \brief Incident unitary vector components. + double *u; + //! \brief Scattered unitary vector components. + double *us; + //! \brief Normal unitary vector components. + double *un; + //! \brief Normal scattered unitary vector components. 
+ double *uns;
+ //! \brief Incident unitary vector components on polarization plane.
+ double *up;
+ //! \brief Scattered unitary vector components on polarization plane.
+ double *ups;
+ //! \brief Mean unitary vector components normal to polarization plane.
+ double *unmp;
+ //! \brief Mean scattered unitary vector components normal to polarization plane.
+ double *unsmp;
+ //! \brief Mean incident unitary vector components on polarization plane.
+ double *upmp;
+ //! \brief Mean scattered unitary vector components on polarization plane.
+ double *upsmp;
+ //! \brief Scattering angle.
+ double scan;
+ //! \brief Control parameter on incidence direction referred to meridional plane.
+ double cfmp;
+ //! \brief Control parameter on scattering direction referred to meridional plane.
+ double sfmp;
+ //! \brief Control parameter on incidence direction referred to scattering plane.
+ double cfsp;
+ //! \brief Control parameter on scattering direction referred to scattering plane.
+ double sfsp;
+ //! \brief SQSFI = XI^-2
+ double sqsfi;
+ //! \brief Vectorized scattering coefficient matrix.
+ dcomplex *am_vector;
+ //! \brief Scattering coefficient matrix.
+ dcomplex **am;
+ //! \brief Argument of harmonic functions. QUESTION: correct?
+ dcomplex arg;
+ //! \brief Vacuum magnitude of wave vector.
+ double vk;
+ //! \brief Wave number.
+ double wn;
+ //! \brief Normalization scale. QUESTION: correct?
+ double xip;
+ //! \brief Number of scales (wavelengths) to be computed.
+ int number_of_scales;
+ //! \brief Size of the block of scales handled by the current process.
+ int xiblock;
+ //! \brief Index of the first scale handled by the current process.
+ int firstxi;
+ //! \brief Index of the last scale handled by the current process.
+ int lastxi;
+ //! \brief ID of the GPU used by one MPI process.
+ int proc_device;
+ //! \brief Refinement mode selection flag.
+ int refinemode;
+ //! \brief Maximum number of refinement iterations.
+ int maxrefiters;
+ //! \brief Required accuracy level.
+ double accuracygoal;
+
+ /*! \brief `InclusionIterationData` default instance constructor.
+ *
+ * \param gconf: `GeometryConfiguration *` Pointer to a `GeometryConfiguration` object.
+ * \param sconf: `ScattererConfiguration *` Pointer to a `ScattererConfiguration` object.
+ * \param mpidata: `mixMPI *` Pointer to a `mixMPI` object.
+ * \param device_count: `const int` Number of offload devices available on the system.
+ */
+ InclusionIterationData(GeometryConfiguration *gconf, ScattererConfiguration *sconf, const mixMPI *mpidata, const int device_count);
+
+ /*! \brief `InclusionIterationData` copy constructor.
+ *
+ * \param rhs: `const InclusionIterationData &` Reference to the `InclusionIterationData` object to be copied.
+ */
+ InclusionIterationData(const InclusionIterationData& rhs);
+
+#ifdef MPI_VERSION
+ /*! \brief `InclusionIterationData` MPI constructor.
+ *
+ * \param mpidata: `const mixMPI *` Pointer to a `mixMPI` instance.
+ * \param device_count: `const int` Number of offload devices available on the system.
+ */
+ InclusionIterationData(const mixMPI *mpidata, const int device_count);
+
+ /*! \brief Broadcast over MPI the InclusionIterationData instance from MPI process 0 to all others.
+ *
+ * When using MPI, the initial InclusionIterationData instance created by MPI process 0
+ * needs to be replicated on all other processes. This function sends it using
+ * MPI broadcast calls. The MPI broadcast calls in this function must match those
+ * in the constructor using the mixMPI pointer.
+ * + * \param mpidata: `mixMPI *` Pointer to the mpi structure used to do the MPI broadcast. + */ + void mpibcast(const mixMPI *mpidata); +#endif + + /*! \brief `InclusionIterationData` instance destroyer. + */ + ~InclusionIterationData(); +}; +// >>> END OF InclusionIterationData CLASS DEFINITION <<< // + #endif // INCLUDE_ITERATION_DATA_H_ diff --git a/src/inclusion/inclusion.cpp b/src/inclusion/inclusion.cpp index 25b6880f7e69e83fb1101863d91bd95f85df8c58..86c73b469a07f4a89e9fac7b3420a62bc1119339 100644 --- a/src/inclusion/inclusion.cpp +++ b/src/inclusion/inclusion.cpp @@ -24,17 +24,21 @@ #include <fstream> #include <hdf5.h> #include <string> + #ifdef _OPENMP #include <omp.h> #endif + #ifdef USE_MPI #ifndef MPI_VERSION #include <mpi.h> #endif #endif + #ifdef USE_NVTX #include <nvtx3/nvToolsExt.h> #endif + #ifdef USE_MAGMA #include <cuda_runtime.h> #endif @@ -95,1243 +99,537 @@ #include "../include/outputs.h" #endif -using namespace std; - -// >>> InclusionIterationData header <<< // -/*! \brief A data structure representing the information used for a single scale - * of the INCLUSION case. - */ -class InclusionIterationData { -protected: - //! \brief Vectorized geometric asymmetry parameter components. - double *vec_zpv; - -public: - //! \brief External layer index. - int nimd; - //! \brief External layer radius. - double extr; - - //! \brief Pointer to a ParticleDescriptor structure. - ParticleDescriptor *c1; - //! \brief Vector of geometric asymmetry factors. - double *gaps; - //! \brief Components of extinction contribution to radiation torque on a single sphere along k. - double **tqse; - //! \brief Components of polarized extinction contribution to radiation torque on a single sphere along k. - dcomplex **tqspe; - //! \brief Components of scattering contribution to radiation torque on a single sphere along k. - double **tqss; - //! \brief Components of polarized scattering contribution to radiation torque on a single sphere along k. - dcomplex **tqsps; - //! \brief L-dependent coefficients of the geometric asymmetry parameter. - double ****zpv; - //! \brief Mean geometric asymmetry parameters. - double **gapm; - //! \brief Mean geometric asymmetry parameters referred to polarization plane. - dcomplex **gappm; - //! \brief Imaginary part of the harmonic functions argument. - double *argi; - //! \brief Argument of the harmonic functions referred to the scattering plane. - double *args; - //! \brief Geometric asymmetry parameters. - double **gap; - //! \brief Geometric asymmetry parameters referred to polarization plane. - dcomplex **gapp; - //! \brief Components of extinction contribution to radiation torque on the cluster along k. - double **tqce; - //! \brief Components of extinction contribution to radiation torque on the cluster along k referred to polarization plane. - dcomplex **tqcpe; - //! \brief Components of scattering contribution to radiation torque on the cluster along k. - double **tqcs; - //! \brief Components of scattering contribution to radiation torque on the cluster along k referred to polarization plane. - dcomplex **tqcps; - //! \brief Variation of unitary radiation vector. QUESTION: correct? - double *duk; - //! \brief Cluster extinction cross-section components referred to scattering plane. - double **cextlr; - //! \brief Cluster extinction cross-section components referred to meridional plane. - double **cext; - //! \brief Cluster Mueller Transformation Matrix components referred to scattering plane. - double **cmullr; - //! 
\brief Cluster Mueller Transformation Matrix components referred to meridional plane. - double **cmul; - //! \brief Geometric asymmetry parameter components. - double *gapv; - //! \brief Radiation extinction torque components. - double *tqev; - //! \brief Radiation scattering torque components. - double *tqsv; - //! \brief Incident unitary vector components. - double *u; - //! \brief Scattered unitary vector components. - double *us; - //! \brief Normal unitary vector components. - double *un; - //! \brief Normal scattered unitary vector components. - double *uns; - //! \brief Incident unitary vector components on polarization plane. - double *up; - //! \brief Scattered unitary vector components on polarization plane. - double *ups; - //! \brief Mean unitary vector components normal to polarization plane. - double *unmp; - //! \brief Mean scattered unitary vector components normal to polarization plane. - double *unsmp; - //! \brief Mean incident unitary vector components on polarization plane. - double *upmp; - //! \brief Mean scattered unitary vector components on polarization plane. - double *upsmp; - //! \brief Scattering angle. - double scan; - //! \brief Control parameter on incidence direction referred to meridional plane. - double cfmp; - //! \brief Control parameter on scattering direction referred to meridional plane. - double sfmp; - //! \brief Control parameter on incidence direction referred to scattering plane. - double cfsp; - //! \brief Control parameter on scattering direction referred to scattering plane. - double sfsp; - //! \brief SQSFI = XI^-2 - double sqsfi; - //! \brief Vectorized scattering coefficient matrix. - dcomplex *am_vector; - //! \brief Scattering coefficient matrix. - dcomplex **am; - //! \brief Argument of harmonic functions. QUESTION: correct? - dcomplex arg; - //! \brief Vacuum magnitude of wave vector. - double vk; - //! \brief Wave number. - double wn; - //! \brief Normalization scale. QUESTION: correct? - double xip; - //! \brief Number of scales (wavelengths) to be computed. - int number_of_scales; - //! \brief Size of the block of scales handled by the current process. - int xiblock; - //! \brief Index of the first scale handled by the current process. - int firstxi; - //! \brief Index of the last scale handled by the current process. - int lastxi; - //! \brief ID of the GPU used by one MPI process. - int proc_device; - //! \brief Refinement mode selction flag. - int refinemode; - //! \brief Maximum number of refinement iterations. - int maxrefiters; - //! \brief Required accuracy level. - double accuracygoal; - - /*! \brief `InclusionIterationData` default instance constructor. - * - * \param gconf: `GeometryConfiguration *` Pointer to a `GeometryConfiguration` object. - * \param sconf: `ScattererConfiguration *` Pointer to a `ScattererConfiguration` object. - * \param mpidata: `mixMPI *` Pointer to a `mixMPI` object. - * \param device_count: `const int` Number of offload devices available on the system. - */ - InclusionIterationData(GeometryConfiguration *gconf, ScattererConfiguration *sconf, const mixMPI *mpidata, const int device_count); - - /*! \brief `InclusionIterationData` copy constructor. - * - * \param rhs: `const InclusionIterationData &` Reference to the `InclusionIterationData` object to be copied. - */ - InclusionIterationData(const InclusionIterationData& rhs); - -#ifdef MPI_VERSION - /*! \brief `InclusionIterationData` MPI constructor. - * - * \param mpidata: `const mixMPI *` Pointer to a `mixMPI` instance. 
- * \param device_count: `const int` Number of offload devices available on the system.
- */
- InclusionIterationData(const mixMPI *mpidata, const int device_count);
-
- /*! \brief Broadcast over MPI the InclusionIterationData instance from MPI process 0 to all others.
- *
- * When using MPI, the initial InclusionIterationData instance created by MPI process 0
- * needs to be replicated on all other processes. This function sends it using
- * MPI broadcast calls. The MPI broadcast calls in this function must match those
- * in the constructor using the mixMPI pointer.
- *
- * \param mpidata: `mixMPI *` Pointer to the mpi structure used to do the MPI broadcast.
- */
- void mpibcast(const mixMPI *mpidata);
+#ifndef INCLUDE_ITERATION_DATA_H_
+#include "../include/IterationData.h"
 #endif
- /*! \brief `InclusionIterationData` instance destroyer.
- */
- ~InclusionIterationData();
-};
+using namespace std;
-// >>> End of InclusionIterationData header <<< //
+/*! \brief Main calculation loop.
+ *
+ * The solution of the scattering problem for different wavelengths is an
+ * embarrassingly parallel task. This function, therefore, collects all the
+ * operations that can be independently executed by different processes,
+ * after the configuration stage and the first calculation loop have been
+ * executed.
+ *
+ * \param jxi488: `int` Wavelength loop index.
+ * \param sconf: `ScattererConfiguration *` Pointer to a `ScattererConfiguration` object.
+ * \param gconf: `GeometryConfiguration *` Pointer to a `GeometryConfiguration` object.
+ * \param sa: `ScatteringAngles *` Pointer to a `ScatteringAngles` object.
+ * \param cid: `InclusionIterationData *` Pointer to an `InclusionIterationData` object.
+ * \param output: `InclusionOutputInfo *` Pointer to an `InclusionOutputInfo` object.
+ * \param output_path: `const string &` Path to the output directory.
+ * \param vtppoanp: `VirtualBinaryFile *` Pointer to a `VirtualBinaryFile` object.
+ */ +int inclusion_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConfiguration *gconf, ScatteringAngles *sa, InclusionIterationData *cid, InclusionOutputInfo *output, const string& output_path, VirtualBinaryFile *vtppoanp); -// >>> InclusionIterationData implementation <<< // -InclusionIterationData::InclusionIterationData(GeometryConfiguration *gconf, ScattererConfiguration *sconf, const mixMPI *mpidata, const int device_count) { - c1 = new ParticleDescriptorInclusion(gconf, sconf); - const int ndi = c1->nsph * c1->nlim; - const np_int ndit = 2 * ndi; - gaps = new double[c1->nsph](); - tqev = new double[3](); - tqsv = new double[3](); - tqse = new double*[2]; - tqspe = new dcomplex*[2]; - tqss = new double*[2]; - tqsps = new dcomplex*[2]; - tqce = new double*[2]; - tqcpe = new dcomplex*[2]; - tqcs = new double*[2]; - tqcps = new dcomplex*[2]; - for (int ti = 0; ti < 2; ti++) { - tqse[ti] = new double[c1->nsph](); - tqspe[ti] = new dcomplex[c1->nsph](); - tqss[ti] = new double[c1->nsph](); - tqsps[ti] = new dcomplex[c1->nsph](); - tqce[ti] = new double[3](); - tqcpe[ti] = new dcomplex[3](); - tqcs[ti] = new double[3](); - tqcps[ti] = new dcomplex[3](); - } - gapv = new double[3](); - gapp = new dcomplex*[3]; - gappm = new dcomplex*[3]; - gap = new double*[3]; - gapm = new double*[3]; - for (int gi = 0; gi < 3; gi++) { - gapp[gi] = new dcomplex[2](); - gappm[gi] = new dcomplex[2](); - gap[gi] = new double[2](); - gapm[gi] = new double[2](); - } - u = new double[3](); - us = new double[3](); - un = new double[3](); - uns = new double[3](); - up = new double[3](); - ups = new double[3](); - unmp = new double[3](); - unsmp = new double[3](); - upmp = new double[3](); - upsmp = new double[3](); - argi = new double[1](); - args = new double[1](); - duk = new double[3](); - cextlr = new double*[4]; - cext = new double*[4]; - cmullr = new double*[4];; - cmul = new double*[4]; - for (int ci = 0; ci < 4; ci++) { - cextlr[ci] = new double[4](); - cext[ci] = new double[4](); - cmullr[ci] = new double[4](); - cmul[ci] = new double[4](); +/*! \brief C++ implementation of INCLU + * + * \param config_file: `string` Name of the configuration file. + * \param data_file: `string` Name of the input data file. + * \param output_path: `string` Directory to write the output files in. + * \param mpidata: `mixMPI *` Pointer to an instance of MPI data settings. 
+ */ +void inclusion(const string& config_file, const string& data_file, const string& output_path, const mixMPI *mpidata) { + chrono::time_point<chrono::high_resolution_clock> t_start = chrono::high_resolution_clock::now(); + chrono::duration<double> elapsed; + string message; + string timing_name; + FILE *timing_file; + Logger *time_logger; + if (mpidata->rank == 0) { + timing_name = output_path + "/c_timing_mpi"+ to_string(mpidata->rank) +".log"; + timing_file = fopen(timing_name.c_str(), "w"); + time_logger = new Logger(LOG_DEBG, timing_file); } - vec_zpv = new double[c1->lm * 12](); - zpv = new double***[c1->lm]; - for (int zi = 0; zi < c1->lm; zi++) { - zpv[zi] = new double**[12]; - for (int zj = 0; zj < 3; zj++) { - zpv[zi][zj] = new double*[4]; - zpv[zi][zj][0] = vec_zpv + (zi * 12) + (zj * 4); - zpv[zi][zj][1] = vec_zpv + (zi * 12) + (zj * 4) + 2; + Logger *logger = new Logger(LOG_DEBG); + int device_count = 0; + //=========== + // Initialise MAGMA + //=========== +#ifdef USE_MAGMA + const magma_int_t d_array_max_size = 32; // TEMPORARY: can become configurable parameter + magma_device_t *device_array = new magma_device_t[d_array_max_size]; + magma_int_t num_devices; + magma_getdevices(device_array, d_array_max_size, &num_devices); + device_count = (int)num_devices; + delete[] device_array; + message = "DEBUG: Proc-" + to_string(mpidata->rank) + " found " + to_string(device_count) + " GPU "; + if (device_count > 1) message += "devices.\n"; + else message += "device.\n"; + logger->log(message, LOG_DEBG); + logger->log("INFO: Process " + to_string(mpidata->rank) + " initializes MAGMA.\n"); + magma_int_t magma_result = magma_init(); + if (magma_result != MAGMA_SUCCESS) { + logger->err("ERROR: Process " + to_string(mpidata->rank) + " failed to initilize MAGMA.\n"); + logger->err("PROC-" + to_string(mpidata->rank) + ": MAGMA error code " + to_string(magma_result) + "\n"); + if (mpidata->rank == 0) { + fclose(timing_file); + delete time_logger; } + delete logger; + return; } - am_vector = new dcomplex[c1->ndm * c1->ndm](); - am = new dcomplex*[c1->ndm]; - for (int ai = 0; ai < c1->ndm; ai++) { - am[ai] = (am_vector + ai * c1->ndm); - } +#endif // end MAGMA initialisation - arg = 0.0 + 0.0 * I; - // These are suspect initializations - scan = 0.0; - cfmp = 0.0; - sfmp = 0.0; - cfsp = 0.0; - sfsp = 0.0; - // End of suspect initializations - wn = sconf->wp / 3.0e8; - xip = sconf->xip; - sqsfi = 1.0; - vk = 0.0; - number_of_scales = sconf->number_of_scales; - xiblock = (int) ceil(((double) (sconf->number_of_scales-1))/((double) mpidata->nprocs)); - lastxi = ((mpidata->rank+1) * xiblock)+1; - firstxi = lastxi-xiblock+1; - if (lastxi > sconf->number_of_scales) lastxi = sconf->number_of_scales; + //=========================== + // the following only happens on MPI process 0 + //=========================== + if (mpidata->rank == 0) { +#ifdef USE_NVTX + nvtxRangePush("Set up"); +#endif + //======================= + // Initialise sconf from configuration file + //======================= + logger->log("INFO: making legacy configuration...", LOG_INFO); + ScattererConfiguration *sconf = NULL; + try { + sconf = ScattererConfiguration::from_dedfb(config_file); + } catch(const OpenConfigurationFileException &ex) { + logger->err("\nERROR: failed to open scatterer configuration file.\n"); + string message = "FILE: " + string(ex.what()) + "\n"; + logger->err(message); + fclose(timing_file); + delete time_logger; + delete logger; + return; + } + sconf->write_formatted(output_path + "/c_OEDFB"); + 
sconf->write_binary(output_path + "/c_TEDF"); + sconf->write_binary(output_path + "/c_TEDF.hd5", "HDF5"); + // end scatterer initialisation - nimd = c1->nshl[0] + 1; - c1->rc[0][nimd - 1] = c1->ros[0] * sconf->get_rcf(0, nimd - 1); - extr = c1->rc[0][nimd - 1]; - const double pig = acos(0.0) * 2.0; - c1->gcs = pig * extr * extr; - + //======================== + // Initialise gconf from configuration files + //======================== + GeometryConfiguration *gconf = NULL; + try { + gconf = GeometryConfiguration::from_legacy(data_file); + } catch (const OpenConfigurationFileException &ex) { + logger->err("\nERROR: failed to open geometry configuration file.\n"); + string message = "FILE: " + string(ex.what()) + "\n"; + logger->err(message); + if (sconf) delete sconf; + fclose(timing_file); + delete time_logger; + delete logger; + return; + } + logger->log(" done.\n", LOG_INFO); + //end gconf initialisation + +#ifdef USE_NVTX + nvtxRangePop(); +#endif + int s_nsph = sconf->number_of_spheres; + int nsph = gconf->number_of_spheres; + // Sanity check on number of sphere consistency, should always be verified + if (s_nsph == nsph) { + // Shortcuts to variables stored in configuration objects + ScatteringAngles *p_scattering_angles = new ScatteringAngles(gconf); + double wp = sconf->wp; + // Open empty virtual ascii file for output + InclusionOutputInfo *p_output = new InclusionOutputInfo(sconf, gconf, mpidata); + InclusionIterationData *cid = new InclusionIterationData(gconf, sconf, mpidata, device_count); + const np_int ndi = cid->c1->nsph * cid->c1->nlim; + const np_int ndit = 2 * ndi; + logger->log("INFO: Size of matrices to invert: " + to_string((int64_t)cid->c1->ndm) + " x " + to_string((int64_t)cid->c1->ndm) +".\n"); + time_logger->log("INFO: Size of matrices to invert: " + to_string((int64_t)cid->c1->ndm) + " x " + to_string((int64_t)cid->c1->ndm) +".\n"); + + instr(sconf, cid->c1); + thdps(cid->c1->lm, cid->zpv); + double exdc = sconf->exdc; + double exri = sqrt(exdc); + + // Create an empty bynary file + VirtualBinaryFile *vtppoanp = new VirtualBinaryFile(); + string tppoan_name = output_path + "/c_TPPOAN"; #ifdef USE_MAGMA - proc_device = mpidata->rank % device_count; + logger->log("INFO: using MAGMA calls.\n", LOG_INFO); +#elif defined USE_LAPACK + logger->log("INFO: using LAPACK calls.\n", LOG_INFO); #else - proc_device = 0; + logger->log("INFO: using fall-back lucin() calls.\n", LOG_INFO); #endif + int iavm = gconf->iavm; + int isam = gconf->isam; + int inpol = gconf->in_pol; + int nxi = sconf->number_of_scales; + int nth = p_scattering_angles->nth; + int nths = p_scattering_angles->nths; + int nph = p_scattering_angles->nph; + int nphs = p_scattering_angles->nphs; + + //======================== + // write a block of info to virtual binary file + //======================== + vtppoanp->append_line(VirtualBinaryLine(iavm)); + vtppoanp->append_line(VirtualBinaryLine(isam)); + vtppoanp->append_line(VirtualBinaryLine(inpol)); + vtppoanp->append_line(VirtualBinaryLine(nxi)); + vtppoanp->append_line(VirtualBinaryLine(nth)); + vtppoanp->append_line(VirtualBinaryLine(nph)); + vtppoanp->append_line(VirtualBinaryLine(nths)); + vtppoanp->append_line(VirtualBinaryLine(nphs)); + if (sconf->idfc < 0) { + cid->vk = cid->xip * cid->wn; + p_output->vec_vk[0] = cid->vk; + } + + // do the first iteration on jxi488 separately, since it seems to be different from the others + int jxi488 = 1; + int initialmaxrefiters = cid->maxrefiters; + + chrono::time_point<chrono::high_resolution_clock> start_iter_1 = 
chrono::high_resolution_clock::now(); +#ifdef USE_NVTX + nvtxRangePush("First iteration"); +#endif + // use these pragmas, which should have no effect on parallelism, just to push OMP nested levels at the same level also in the first wavelength iteration + int jer = 0; +#pragma omp parallel + { +#pragma omp single + { + jer = inclusion_jxi488_cycle(jxi488, sconf, gconf, p_scattering_angles, cid, p_output, output_path, vtppoanp); + } + } +#ifdef USE_NVTX + nvtxRangePop(); +#endif + chrono::time_point<chrono::high_resolution_clock> end_iter_1 = chrono::high_resolution_clock::now(); + elapsed = start_iter_1 - t_start; + string message = "INFO: Calculation setup took " + to_string(elapsed.count()) + "s.\n"; + logger->log(message); + time_logger->log(message); + elapsed = end_iter_1 - start_iter_1; + message = "INFO: First iteration took " + to_string(elapsed.count()) + "s.\n"; + logger->log(message); + time_logger->log(message); + /* for the next iterations, just always do maxiter iterations, assuming the accuracy is good enough */ + cid->refinemode = 0; + /* add an extra iteration for margin, if this does not exceed initialmaxrefiters */ + // if (cid->maxrefiters < initialmaxrefiters) cid->maxrefiters++; + if (jer != 0) { + // First loop failed. Halt the calculation. + fclose(timing_file); + delete time_logger; + delete p_output; + delete p_scattering_angles; + delete cid; + delete logger; + delete sconf; + delete gconf; + return; + } - // In the first iteration, if refinement is enabled, determine the number of refinement iterations required to arrive at the target accuracy (if achievable in a reasonable number of iterations) - refinemode = 2; - // maxrefiters and accuracygoal should be configurable and preferably set somewhere else - maxrefiters = 20; - accuracygoal = 1e-6; -} - -InclusionIterationData::InclusionIterationData(const InclusionIterationData& rhs) { - c1 = new ParticleDescriptorInclusion(reinterpret_cast<ParticleDescriptorInclusion &>(*(rhs.c1))); - const int ndi = c1->nsph * c1->nlim; - const np_int ndit = 2 * ndi; - gaps = new double[c1->nsph](); - for (int gi = 0; gi < c1->nsph; gi++) gaps[gi] = rhs.gaps[gi]; - tqev = new double[3](); - tqsv = new double[3](); - for (int ti = 0; ti < 3; ti++) { - tqev[ti] = rhs.tqev[ti]; - tqsv[ti] = rhs.tqsv[ti]; - } - tqse = new double*[2]; - tqspe = new dcomplex*[2]; - tqss = new double*[2]; - tqsps = new dcomplex*[2]; - tqce = new double*[2]; - tqcpe = new dcomplex*[2]; - tqcs = new double*[2]; - tqcps = new dcomplex*[2]; - for (int ti = 0; ti < 2; ti++) { - tqse[ti] = new double[c1->nsph](); - tqspe[ti] = new dcomplex[c1->nsph](); - tqss[ti] = new double[c1->nsph](); - tqsps[ti] = new dcomplex[c1->nsph](); - for (int tj = 0; tj < c1->nsph; tj++) { - tqse[ti][tj] = rhs.tqse[ti][tj]; - tqspe[ti][tj] = rhs.tqspe[ti][tj]; - tqss[ti][tj] = rhs.tqss[ti][tj]; - tqsps[ti][tj] = rhs.tqsps[ti][tj]; - } - tqce[ti] = new double[3](); - tqcpe[ti] = new dcomplex[3](); - tqcs[ti] = new double[3](); - tqcps[ti] = new dcomplex[3](); - for (int tj = 0; tj < 3; tj++) { - tqce[ti][tj] = rhs.tqce[ti][tj]; - tqcpe[ti][tj] = rhs.tqcpe[ti][tj]; - tqcs[ti][tj] = rhs.tqcs[ti][tj]; - tqcps[ti][tj] = rhs.tqcps[ti][tj]; - } - } - gapv = new double[3](); - gapp = new dcomplex*[3]; - gappm = new dcomplex*[3]; - gap = new double*[3]; - gapm = new double*[3]; - for (int gi = 0; gi < 3; gi++) { - gapv[gi] = rhs.gapv[gi]; - gapp[gi] = new dcomplex[2](); - gappm[gi] = new dcomplex[2](); - gap[gi] = new double[2](); - gapm[gi] = new double[2](); - for (int gj = 0; gj < 2; 
gj++) { - gapp[gi][gj] = rhs.gapp[gi][gj]; - gappm[gi][gj] = rhs.gappm[gi][gj]; - gap[gi][gj] = rhs.gap[gi][gj]; - gapm[gi][gj] = rhs.gapm[gi][gj]; - } - } - u = new double[3](); - us = new double[3](); - un = new double[3](); - uns = new double[3](); - up = new double[3](); - ups = new double[3](); - unmp = new double[3](); - unsmp = new double[3](); - upmp = new double[3](); - upsmp = new double[3](); - duk = new double[3](); - for (int ui = 0; ui < 3; ui++) { - u[ui] = rhs.u[ui]; - us[ui] = rhs.us[ui]; - un[ui] = rhs.un[ui]; - uns[ui] = rhs.uns[ui]; - up[ui] = rhs.up[ui]; - ups[ui] = rhs.ups[ui]; - unmp[ui] = rhs.unmp[ui]; - unsmp[ui] = rhs.unsmp[ui]; - upmp[ui] = rhs.upmp[ui]; - upsmp[ui] = rhs.upsmp[ui]; - duk[ui] = rhs.duk[ui]; - } - argi = new double[1](); - args = new double[1](); - argi[0] = rhs.argi[0]; - args[0] = rhs.args[0]; - cextlr = new double*[4]; - cext = new double*[4]; - cmullr = new double*[4];; - cmul = new double*[4]; - for (int ci = 0; ci < 4; ci++) { - cextlr[ci] = new double[4](); - cext[ci] = new double[4](); - cmullr[ci] = new double[4](); - cmul[ci] = new double[4](); - for (int cj = 0; cj < 4; cj++) { - cextlr[ci][cj] = rhs.cextlr[ci][cj]; - cext[ci][cj] = rhs.cext[ci][cj]; - cmullr[ci][cj] = rhs.cmullr[ci][cj]; - cmul[ci][cj] = rhs.cmul[ci][cj]; - } - } - vec_zpv = new double[c1->lm * 12]; - zpv = new double***[c1->lm]; - for (int zi = 0; zi < c1->lm; zi++) { - zpv[zi] = new double **[12]; - for (int zj = 0; zj < 3; zj++) { - zpv[zi][zj] = new double*[4]; - zpv[zi][zj][0] = vec_zpv + (zi * 12) + (zj * 4); - zpv[zi][zj][1] = vec_zpv + (zi * 12) + (zj * 4) + 2; - zpv[zi][zj][0][0] = rhs.zpv[zi][zj][0][0]; - zpv[zi][zj][0][1] = rhs.zpv[zi][zj][0][1]; - zpv[zi][zj][1][0] = rhs.zpv[zi][zj][1][0]; - zpv[zi][zj][1][1] = rhs.zpv[zi][zj][1][1]; - } - } - am_vector = new dcomplex[c1->ndm * c1->ndm]; - for (int ai = 0; ai < c1->ndm * c1->ndm; ai++) am_vector[ai] = rhs.am_vector[ai]; - am = new dcomplex*[c1->ndm]; - for (int ai = 0; ai < c1->ndm; ai++) { - am[ai] = (am_vector + ai * c1->ndm); - } - - arg = rhs.arg; - // These are suspect initializations - scan = rhs.scan; - cfmp = rhs.cfmp; - sfmp = rhs.sfmp; - cfsp = rhs.cfsp; - sfsp = rhs.sfsp; - // End of suspect initializations - wn = rhs.wn; - xip = rhs.xip; - sqsfi = rhs.sqsfi; - vk = rhs.vk; - firstxi = rhs.firstxi; - lastxi = rhs.lastxi; - xiblock = rhs.xiblock; - number_of_scales = rhs.number_of_scales; + //================================================== + // do the first outputs here, so that I open here the new files, afterwards I only append + //================================================== + vtppoanp->write_to_disk(output_path + "/c_TPPOAN"); + delete vtppoanp; + + // here go the calls that send data to be duplicated on other MPI processes from process 0 to others, using MPI broadcasts, but only if MPI is actually used +#ifdef MPI_VERSION + if (mpidata->mpirunning) { + gconf->mpibcast(mpidata); + sconf->mpibcast(mpidata); + cid->mpibcast(mpidata); + p_scattering_angles->mpibcast(mpidata); + } +#endif + // Create this variable and initialise it with a default here, so that it is defined anyway, with or without OpenMP support enabled + int ompnumthreads = 1; + // this is for MPI process 0 (or even if we are not using MPI at all) + int myjxi488startoffset = 0; + int myMPIstride = ompnumthreads; + int myMPIblock = ompnumthreads; + // Define here shared arrays of virtual ascii and binary files, so that thread 0 will be able to access them all later + InclusionOutputInfo **p_outarray = NULL; + 
VirtualBinaryFile **vtppoanarray = NULL; - nimd = rhs.nimd; - extr = rhs.extr; - - proc_device = rhs.proc_device; - refinemode = rhs.refinemode; - maxrefiters = rhs.maxrefiters; - accuracygoal = rhs.accuracygoal; -} +#ifdef USE_NVTX + nvtxRangePush("Parallel loop"); +#endif -#ifdef MPI_VERSION -InclusionIterationData::InclusionIterationData(const mixMPI *mpidata, const int device_count) { - c1 = new ParticleDescriptorInclusion(mpidata); - const int ndi = c1->nsph * c1->nlim; - const np_int ndit = 2 * ndi; - gaps = new double[c1->nsph](); - MPI_Bcast(gaps, c1->nsph, MPI_DOUBLE, 0, MPI_COMM_WORLD); - tqev = new double[3](); - tqsv = new double[3](); - MPI_Bcast(tqev, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(tqsv, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); - tqse = new double*[2]; - tqspe = new dcomplex*[2]; - tqss = new double*[2]; - tqsps = new dcomplex*[2]; - tqce = new double*[2]; - tqcpe = new dcomplex*[2]; - tqcs = new double*[2]; - tqcps = new dcomplex*[2]; - for (int ti = 0; ti < 2; ti++) { - tqse[ti] = new double[c1->nsph](); - tqspe[ti] = new dcomplex[c1->nsph](); - tqss[ti] = new double[c1->nsph](); - tqsps[ti] = new dcomplex[c1->nsph](); - MPI_Bcast(tqse[ti], c1->nsph, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(tqspe[ti], c1->nsph, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); - MPI_Bcast(tqss[ti], c1->nsph, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(tqsps[ti], c1->nsph, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); - tqce[ti] = new double[3](); - tqcpe[ti] = new dcomplex[3](); - tqcs[ti] = new double[3](); - tqcps[ti] = new dcomplex[3](); - MPI_Bcast(tqce[ti], 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(tqcpe[ti], 3, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); - MPI_Bcast(tqcs[ti], 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(tqcps[ti], 3, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); - } - gapv = new double[3](); - gapp = new dcomplex*[3]; - gappm = new dcomplex*[3]; - gap = new double*[3]; - gapm = new double*[3]; - MPI_Bcast(gapv, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); - for (int gi = 0; gi < 3; gi++) { - gapp[gi] = new dcomplex[2](); - gappm[gi] = new dcomplex[2](); - gap[gi] = new double[2](); - gapm[gi] = new double[2](); - MPI_Bcast(gapp[gi], 2, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); - MPI_Bcast(gappm[gi], 2, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); - MPI_Bcast(gap[gi], 2, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(gapm[gi], 2, MPI_DOUBLE, 0, MPI_COMM_WORLD); - } - u = new double[3](); - us = new double[3](); - un = new double[3](); - uns = new double[3](); - up = new double[3](); - ups = new double[3](); - unmp = new double[3](); - unsmp = new double[3](); - upmp = new double[3](); - upsmp = new double[3](); - duk = new double[3](); - MPI_Bcast(u, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(us, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(un, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(uns, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(up, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(ups, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(unmp, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(unsmp, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(upmp, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(upsmp, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(duk, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); - argi = new double[1](); - args = new double[1](); - MPI_Bcast(argi, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(args, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); - cextlr = new double*[4]; - cext = new double*[4]; - cmullr = new double*[4];; - cmul = new double*[4]; - for (int ci = 0; ci < 4; 
ci++) { - cextlr[ci] = new double[4](); - cext[ci] = new double[4](); - cmullr[ci] = new double[4](); - cmul[ci] = new double[4](); - MPI_Bcast(cextlr[ci], 4, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(cext[ci], 4, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(cmullr[ci], 4, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(cmul[ci], 4, MPI_DOUBLE, 0, MPI_COMM_WORLD); - } - vec_zpv = new double[c1->lm * 12]; - MPI_Bcast(vec_zpv, c1->lm * 12, MPI_DOUBLE, 0, MPI_COMM_WORLD); - zpv = new double***[c1->lm]; - for (int zi = 0; zi < c1->lm; zi++) { - zpv[zi] = new double **[12]; - for (int zj = 0; zj < 3; zj++) { - zpv[zi][zj] = new double*[4]; - zpv[zi][zj][0] = vec_zpv + (zi * 12) + (zj * 4); - zpv[zi][zj][1] = vec_zpv + (zi * 12) + (zj * 4) + 2; - } - } - am_vector = new dcomplex[c1->ndm * c1->ndm]; - am = new dcomplex*[c1->ndm]; - for (int ai = 0; ai < c1->ndm; ai++) { - am[ai] = (am_vector + ai * c1->ndm); - MPI_Bcast(am[ai], c1->ndm, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); - } - MPI_Bcast(&arg, 1, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); - MPI_Bcast(&scan, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(&cfmp, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(&sfmp, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(&cfsp, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(&sfsp, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(&wn, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(&xip, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(&sqsfi, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(&vk, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(&xiblock, 1, MPI_INT, 0, MPI_COMM_WORLD); - MPI_Bcast(&number_of_scales, 1, MPI_INT, 0, MPI_COMM_WORLD); - lastxi = ((mpidata->rank+1) * xiblock)+1; - firstxi = lastxi-xiblock+1; - if (lastxi > number_of_scales) lastxi = number_of_scales; + //=========================================== + // open the OpenMP parallel context, so each thread can initialise its stuff + //=========================================== +#pragma omp parallel + { + // Create and initialise this variable here, so that if OpenMP is enabled it is local to the thread, and if OpenMP is not enabled it has a well-defiled value anyway + int myompthread = 0; - MPI_Bcast(&nimd, 1, MPI_INT, 0, MPI_COMM_WORLD); - MPI_Bcast(&extr, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); - -#ifdef USE_MAGMA - proc_device = mpidata->rank % device_count; -#else - proc_device = 0; +#ifdef _OPENMP + // If OpenMP is enabled, give actual values to myompthread and ompnumthreads, and open thread-local output files + myompthread = omp_get_thread_num(); + if (myompthread == 0) ompnumthreads = omp_get_num_threads(); #endif - MPI_Bcast(&refinemode, 1, MPI_INT, 0, MPI_COMM_WORLD); - MPI_Bcast(&maxrefiters, 1, MPI_INT, 0, MPI_COMM_WORLD); - MPI_Bcast(&accuracygoal, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); -} -void InclusionIterationData::mpibcast(const mixMPI *mpidata) { - c1->mpibcast(mpidata); - const int ndi = c1->nsph * c1->nlim; - const np_int ndit = 2 * ndi; - MPI_Bcast(gaps, c1->nsph, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(tqev, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(tqsv, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); - for (int ti = 0; ti < 2; ti++) { - MPI_Bcast(tqse[ti], c1->nsph, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(tqspe[ti], c1->nsph, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); - MPI_Bcast(tqss[ti], c1->nsph, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(tqsps[ti], c1->nsph, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); - MPI_Bcast(tqce[ti], 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(tqcpe[ti], 3, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); - 
MPI_Bcast(tqcs[ti], 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(tqcps[ti], 3, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); - } - MPI_Bcast(gapv, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); - for (int gi = 0; gi < 3; gi++) { - MPI_Bcast(gapp[gi], 2, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); - MPI_Bcast(gappm[gi], 2, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); - MPI_Bcast(gap[gi], 2, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(gapm[gi], 2, MPI_DOUBLE, 0, MPI_COMM_WORLD); - } - MPI_Bcast(u, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(us, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(un, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(uns, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(up, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(ups, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(unmp, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(unsmp, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(upmp, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(upsmp, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(duk, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(argi, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(args, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); - for (int ci = 0; ci < 4; ci++) { - MPI_Bcast(cextlr[ci], 4, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(cext[ci], 4, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(cmullr[ci], 4, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(cmul[ci], 4, MPI_DOUBLE, 0, MPI_COMM_WORLD); - } - MPI_Bcast(vec_zpv, c1->lm * 12, MPI_DOUBLE, 0, MPI_COMM_WORLD); - // since MPI expects an int argument for the number of elements to transfer in one go, transfer am one row at a time - for (int ai = 0; ai < c1->ndm; ai++) { - MPI_Bcast(am[ai], c1->ndm, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); - } - MPI_Bcast(&arg, 1, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); - MPI_Bcast(&scan, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(&cfmp, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(&sfmp, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(&cfsp, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(&sfsp, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(&wn, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(&xip, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(&sqsfi, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(&vk, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(&xiblock, 1, MPI_INT, 0, MPI_COMM_WORLD); - MPI_Bcast(&number_of_scales, 1, MPI_INT, 0, MPI_COMM_WORLD); - MPI_Bcast(&nimd, 1, MPI_INT, 0, MPI_COMM_WORLD); - MPI_Bcast(&extr, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); - MPI_Bcast(&refinemode, 1, MPI_INT, 0, MPI_COMM_WORLD); - MPI_Bcast(&maxrefiters, 1, MPI_INT, 0, MPI_COMM_WORLD); - MPI_Bcast(&accuracygoal, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); -} + if (myompthread == 0) { + // Initialise some shared variables only on thread 0 + p_outarray = new InclusionOutputInfo*[ompnumthreads]; + vtppoanarray = new VirtualBinaryFile*[ompnumthreads]; + myMPIblock = ompnumthreads; + myMPIstride = myMPIblock; + } + +#ifdef MPI_VERSION + if (myompthread == 0) { + if (mpidata->mpirunning) { + // only go through this if MPI has been actually used + for (int rr=1; rr<mpidata->nprocs; rr++) { + // individually send their respective starting points to other MPI processes: they start immediately after the frequencies computed by previous processes so far + int remotejxi488startoffset = myMPIstride; + MPI_Send(&remotejxi488startoffset, 1, MPI_INT, rr, 3, MPI_COMM_WORLD); + int remoteMPIblock; + MPI_Recv(&remoteMPIblock, 1, MPI_INT, rr, 3, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + // update myMPIstride to include the ones due to MPI process rr + 
myMPIstride += remoteMPIblock; + } + // now I know the total myMPIstride, I can send it to all processes + MPI_Bcast(&myMPIstride, 1, MPI_INT, 0, MPI_COMM_WORLD); + } + } #endif + // add an omp barrier to make sure that the global variables defined by thread 0 are known to all threads below this +#pragma omp barrier -InclusionIterationData::~InclusionIterationData() { - const int nsph = c1->nsph; - delete[] am_vector; - delete[] am; - for (int zi = 0; zi < c1->lm; zi++) { - for (int zj = 0; zj < 3; zj++) { - delete[] zpv[zi][zj]; - } - delete[] zpv[zi]; - } - delete[] zpv; - delete[] vec_zpv; - delete c1; - delete[] gaps; - for (int ti = 1; ti > -1; ti--) { - delete[] tqse[ti]; - delete[] tqss[ti]; - delete[] tqspe[ti]; - delete[] tqsps[ti]; - delete[] tqce[ti]; - delete[] tqcpe[ti]; - delete[] tqcs[ti]; - delete[] tqcps[ti]; - } - delete[] tqse; - delete[] tqss; - delete[] tqspe; - delete[] tqsps; - delete[] tqce; - delete[] tqcpe; - delete[] tqcs; - delete[] tqcps; - delete[] tqev; - delete[] tqsv; - for (int gi = 2; gi > -1; gi--) { - delete[] gapp[gi]; - delete[] gappm[gi]; - delete[] gap[gi]; - delete[] gapm[gi]; - } - delete[] gapp; - delete[] gappm; - delete[] gap; - delete[] gapm; - delete[] gapv; - delete[] u; - delete[] us; - delete[] un; - delete[] uns; - delete[] up; - delete[] ups; - delete[] unmp; - delete[] unsmp; - delete[] upmp; - delete[] upsmp; - delete[] argi; - delete[] args; - delete[] duk; - for (int ci = 3; ci > -1; ci--) { - delete[] cextlr[ci]; - delete[] cext[ci]; - delete[] cmullr[ci]; - delete[] cmul[ci]; - } - delete[] cextlr; - delete[] cext; - delete[] cmullr; - delete[] cmul; -} -// >>> End of InclusionIterationData implementation <<< // + // To test parallelism, I will now start feeding this function with "clean" copies of the parameters, so that they will not be changed by previous iterations, and each one will behave as the first one. Define all (empty) variables here, so they have the correct scope, then they get different definitions depending on thread number + InclusionIterationData *cid_2 = NULL; + InclusionOutputInfo *p_output_2 = NULL; + VirtualBinaryFile *vtppoanp_2 = NULL; + // for threads other than the 0, create distinct copies of all relevant data, while for thread 0 just define new references / pointers to the original ones + if (myompthread == 0) { + cid_2 = cid; + // OMP thread 0 of MPI process 0 holds the pointer to the full output structure + p_output_2 = p_output; + p_outarray[0] = p_output_2; + } else { + // this is not thread 0, so do create fresh copies of all local variables + cid_2 = new InclusionIterationData(*cid); + } + // make sure all threads align here: I don't want the following loop to accidentally start for thread 0, possibly modifying some variables before they are copied by all other threads + if (myompthread==0) { + logger->log("Syncing OpenMP threads and starting the loop on wavelengths\n"); + } +#pragma omp barrier + // ok, now I can actually start the parallel calculations + for (int ixi488=2; ixi488<=cid_2->number_of_scales; ixi488 +=myMPIstride) { + // the parallel loop over MPI processes covers a different set of indices for each thread +#pragma omp barrier + int myjxi488 = ixi488+myompthread; + // each thread opens new virtual files and stores their pointers in the shared array + vtppoanp_2 = new VirtualBinaryFile(); + // each thread puts a copy of the pointers to its virtual files in the shared arrays + vtppoanarray[myompthread] = vtppoanp_2; +#pragma omp barrier -/*! \brief Main calculation loop. 
- * - * The solution of the scattering problem for different wavelengths is an - * embarrasingly parallel task. This function, therefore, collects all the - * operations that can be independently executed by different processes, - * after the configuration stage and the first calculation loop have been - * executed. - * - * \param jxi488: `int` Wavelength loop index. - * \param sconf: `ScattererConfiguration *` Pointer to a `ScattererConfiguration` object. - * \param gconf: `GeometryConfiguration *` Pointer to a `GeometryConfiguration` object. - * \param sa: `ScatteringAngles *` Pointer to a `ScatteringAngles` object. - * \param cid: `InclusionIterationData *` Pointer to an `InclusionIterationData` object. - * \param oi: `InclusionOutputInfo *` Pointer to an `InclusionOutputInfo` object. - * \param output_path: `const string &` Path to the output directory. - * \param vtppoanp: `VirtualBinaryFile *` Pointer to a `VirtualBinaryFile` object. - */ -int inclusion_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConfiguration *gconf, ScatteringAngles *sa, InclusionIterationData *cid, InclusionOutputInfo *output, const string& output_path, VirtualBinaryFile *vtppoanp); + // each MPI process handles a number of contiguous scales corresponding to its number of OMP threads at this omp level of parallelism + if (myjxi488 <= cid_2->number_of_scales) { + if (myompthread > 0) { + // UPDATE: non-0 threads need to allocate memory for one scale at a time. + p_output_2 = new InclusionOutputInfo(sconf, gconf, mpidata, myjxi488, 1); + p_outarray[myompthread] = p_output_2; + } + int jer = inclusion_jxi488_cycle(myjxi488, sconf, gconf, p_scattering_angles, cid_2, p_output_2, output_path, vtppoanp_2); + } else { + if (myompthread > 0) { + // If there is no input for this thread, set output pointer to NULL. + p_outarray[myompthread] = NULL; + } + } +#pragma omp barrier -/*! \brief C++ implementation of INCLU - * - * \param config_file: `string` Name of the configuration file. - * \param data_file: `string` Name of the input data file. - * \param output_path: `string` Directory to write the output files in. - * \param mpidata: `mixMPI *` Pointer to an instance of MPI data settings. 
- */ -void inclusion(const string& config_file, const string& data_file, const string& output_path, const mixMPI *mpidata) { - chrono::time_point<chrono::high_resolution_clock> t_start = chrono::high_resolution_clock::now(); - chrono::duration<double> elapsed; - string message; - string timing_name; - FILE *timing_file; - Logger *time_logger; - if (mpidata->rank == 0) { - timing_name = output_path + "/c_timing_mpi"+ to_string(mpidata->rank) +".log"; - timing_file = fopen(timing_name.c_str(), "w"); - time_logger = new Logger(LOG_DEBG, timing_file); - } - Logger *logger = new Logger(LOG_DEBG); - int device_count = 0; - //=========== - // Initialise MAGMA - //=========== -#ifdef USE_MAGMA - const magma_int_t d_array_max_size = 32; // TEMPORARY: can become configurable parameter - magma_device_t *device_array = new magma_device_t[d_array_max_size]; - magma_int_t num_devices; - magma_getdevices(device_array, d_array_max_size, &num_devices); - device_count = (int)num_devices; - delete[] device_array; - message = "DEBUG: Proc-" + to_string(mpidata->rank) + " found " + to_string(device_count) + " GPU "; - if (device_count > 1) message += "devices.\n"; - else message += "device.\n"; - logger->log(message, LOG_DEBG); - logger->log("INFO: Process " + to_string(mpidata->rank) + " initializes MAGMA.\n"); - magma_int_t magma_result = magma_init(); - if (magma_result != MAGMA_SUCCESS) { - logger->err("ERROR: Process " + to_string(mpidata->rank) + " failed to initilize MAGMA.\n"); - logger->err("PROC-" + to_string(mpidata->rank) + ": MAGMA error code " + to_string(magma_result) + "\n"); - if (mpidata->rank == 0) { - fclose(timing_file); - delete time_logger; - } - delete logger; - return; - } -#endif // end MAGMA initialisation - - //=========================== - // the following only happens on MPI process 0 - //=========================== - if (mpidata->rank == 0) { #ifdef USE_NVTX - nvtxRangePush("Set up"); + nvtxRangePush("Output concatenation"); #endif - //======================= - // Initialise sconf from configuration file - //======================= - logger->log("INFO: making legacy configuration...", LOG_INFO); - ScattererConfiguration *sconf = NULL; - try { - sconf = ScattererConfiguration::from_dedfb(config_file); - } catch(const OpenConfigurationFileException &ex) { - logger->err("\nERROR: failed to open scatterer configuration file.\n"); - string message = "FILE: " + string(ex.what()) + "\n"; - logger->err(message); - fclose(timing_file); - delete time_logger; - delete logger; - return; - } - sconf->write_formatted(output_path + "/c_OEDFB"); - sconf->write_binary(output_path + "/c_TEDF"); - sconf->write_binary(output_path + "/c_TEDF.hd5", "HDF5"); - // end scatterer initialisation - - //======================== - // Initialise gconf from configuration files - //======================== - GeometryConfiguration *gconf = NULL; - try { - gconf = GeometryConfiguration::from_legacy(data_file); - } catch (const OpenConfigurationFileException &ex) { - logger->err("\nERROR: failed to open geometry configuration file.\n"); - string message = "FILE: " + string(ex.what()) + "\n"; - logger->err(message); - if (sconf) delete sconf; - fclose(timing_file); - delete time_logger; - delete logger; - return; - } - logger->log(" done.\n", LOG_INFO); - //end gconf initialisation +#pragma omp barrier + // threads different from 0 append their virtual files to the one of thread 0, and delete them + if (myompthread == 0) { + for (int ti=1; ti<ompnumthreads; ti++) { + if (p_outarray[ti] != NULL) { + 
p_outarray[0]->insert(*(p_outarray[ti])); + delete p_outarray[ti]; + p_outarray[ti] = NULL; + } + vtppoanarray[0]->append(*(vtppoanarray[ti])); + delete vtppoanarray[ti]; + } + } +#pragma omp barrier + //============================================== + // Collect all virtual files on thread 0 of MPI process 0, and append them to disk + //============================================== + if (myompthread == 0) { + // thread 0 writes its virtual files, now including contributions from all threads, to disk, and deletes them + // p_outarray[0]->append_to_disk(output_path + "/c_OINCLU"); + // delete p_outarray[0]; + vtppoanarray[0]->append_to_disk(output_path + "/c_TPPOAN"); + delete vtppoanarray[0]; +#ifdef MPI_VERSION + if (mpidata->mpirunning) { + // only go through this if MPI has been actually used + for (int rr=1; rr<mpidata->nprocs; rr++) { + // get the data from process rr by receiving it in total memory structure + p_outarray[0]->mpireceive(mpidata, rr); + // get the data from process rr, creating a new virtual ascii file + // VirtualAsciiFile *p_output = new VirtualAsciiFile(mpidata, rr); + // append to disk and delete virtual ascii file + // p_output->append_to_disk(output_path + "/c_OINCLU"); + // delete p_output; + + // get the data from process rr, creating a new virtual binary file + VirtualBinaryFile *vtppoanp = new VirtualBinaryFile(mpidata, rr); + // append to disk and delete virtual binary file + vtppoanp->append_to_disk(output_path + "/c_TPPOAN"); + delete vtppoanp; + int test = MPI_Barrier(MPI_COMM_WORLD); + } + } +#endif + } + // end block writing to disk #ifdef USE_NVTX - nvtxRangePop(); + nvtxRangePop(); #endif - int s_nsph = sconf->number_of_spheres; - int nsph = gconf->number_of_spheres; - // Sanity check on number of sphere consistency, should always be verified - if (s_nsph == nsph) { - // Shortcuts to variables stored in configuration objects - ScatteringAngles *p_scattering_angles = new ScatteringAngles(gconf); - double wp = sconf->wp; - // Open empty virtual ascii file for output - InclusionOutputInfo *p_output = new InclusionOutputInfo(sconf, gconf, mpidata); - InclusionIterationData *cid = new InclusionIterationData(gconf, sconf, mpidata, device_count); - const np_int ndi = cid->c1->nsph * cid->c1->nlim; - const np_int ndit = 2 * ndi; - logger->log("INFO: Size of matrices to invert: " + to_string((int64_t)cid->c1->ndm) + " x " + to_string((int64_t)cid->c1->ndm) +".\n"); - time_logger->log("INFO: Size of matrices to invert: " + to_string((int64_t)cid->c1->ndm) + " x " + to_string((int64_t)cid->c1->ndm) +".\n"); - - instr(sconf, cid->c1); - thdps(cid->c1->lm, cid->zpv); - double exdc = sconf->exdc; - double exri = sqrt(exdc); +#pragma omp barrier - // Create an empty bynary file - VirtualBinaryFile *vtppoanp = new VirtualBinaryFile(); - string tppoan_name = output_path + "/c_TPPOAN"; -#ifdef USE_MAGMA - logger->log("INFO: using MAGMA calls.\n", LOG_INFO); -#elif defined USE_LAPACK - logger->log("INFO: using LAPACK calls.\n", LOG_INFO); -#else - logger->log("INFO: using fall-back lucin() calls.\n", LOG_INFO); -#endif - int iavm = gconf->iavm; - int isam = gconf->isam; - int inpol = gconf->in_pol; - int nxi = sconf->number_of_scales; - int nth = p_scattering_angles->nth; - int nths = p_scattering_angles->nths; - int nph = p_scattering_angles->nph; - int nphs = p_scattering_angles->nphs; - - //======================== - // write a block of info to virtual binary file - //======================== - vtppoanp->append_line(VirtualBinaryLine(iavm)); - 
vtppoanp->append_line(VirtualBinaryLine(isam)); - vtppoanp->append_line(VirtualBinaryLine(inpol)); - vtppoanp->append_line(VirtualBinaryLine(nxi)); - vtppoanp->append_line(VirtualBinaryLine(nth)); - vtppoanp->append_line(VirtualBinaryLine(nph)); - vtppoanp->append_line(VirtualBinaryLine(nths)); - vtppoanp->append_line(VirtualBinaryLine(nphs)); - if (sconf->idfc < 0) { - cid->vk = cid->xip * cid->wn; - p_output->vec_vk[0] = cid->vk; - } - - // do the first iteration on jxi488 separately, since it seems to be different from the others - int jxi488 = 1; - int initialmaxrefiters = cid->maxrefiters; - - chrono::time_point<chrono::high_resolution_clock> start_iter_1 = chrono::high_resolution_clock::now(); -#ifdef USE_NVTX - nvtxRangePush("First iteration"); -#endif - // use these pragmas, which should have no effect on parallelism, just to push OMP nested levels at the same level also in the first wavelength iteration - int jer = 0; -#pragma omp parallel - { -#pragma omp single + } // close strided loop running on MPI processes, ixi488 loop + // delete the shared arrays I used to make available to thread 0 the virtual files of other threads +#pragma omp barrier + if (myompthread == 0) { + delete[] p_outarray; + delete[] vtppoanarray; + } { - jer = inclusion_jxi488_cycle(jxi488, sconf, gconf, p_scattering_angles, cid, p_output, output_path, vtppoanp); + string message = "INFO: Closing thread-local output files of thread " + to_string(myompthread) + " and syncing threads.\n"; + logger->log(message); } - } #ifdef USE_NVTX - nvtxRangePop(); + nvtxRangePop(); #endif - chrono::time_point<chrono::high_resolution_clock> end_iter_1 = chrono::high_resolution_clock::now(); - elapsed = start_iter_1 - t_start; - string message = "INFO: Calculation setup took " + to_string(elapsed.count()) + "s.\n"; - logger->log(message); - time_logger->log(message); - elapsed = end_iter_1 - start_iter_1; - message = "INFO: First iteration took " + to_string(elapsed.count()) + "s.\n"; - logger->log(message); - time_logger->log(message); - /* for the next iterations, just always do maxiter iterations, assuming the accuracy is good enough */ - cid->refinemode = 0; - /* add an extra iteration for margin, if this does not exceed initialmaxrefiters */ - // if (cid->maxrefiters < initialmaxrefiters) cid->maxrefiters++; - if (jer != 0) { - // First loop failed. Halt the calculation. - fclose(timing_file); - delete time_logger; - delete p_output; - delete p_scattering_angles; - delete cid; - delete logger; - delete sconf; - delete gconf; - return; + delete cid_2; } + delete p_scattering_angles; + p_output->write(output_path + "/c_OINCLU.hd5", "HDF5"); + p_output->write(output_path + "/c_OINCLU", "LEGACY"); + delete p_output; + } // closes s_nsph == nsph check + + else { // Sphere number inconsistency error. + throw UnrecognizedConfigurationException( + "Inconsistent geometry and scatterer configurations." 
+ );
+ }
- //==================================================
- // do the first outputs here, so that I open here the new files, afterwards I only append
- //==================================================
- vtppoanp->write_to_disk(output_path + "/c_TPPOAN");
- delete vtppoanp;
-
- // here go the calls that send data to be duplicated on other MPI processes from process 0 to others, using MPI broadcasts, but only if MPI is actually used
-#ifdef MPI_VERSION
- if (mpidata->mpirunning) {
- gconf->mpibcast(mpidata);
- sconf->mpibcast(mpidata);
- cid->mpibcast(mpidata);
- p_scattering_angles->mpibcast(mpidata);
- }
-#endif
- // Create this variable and initialise it with a default here, so that it is defined anyway, with or without OpenMP support enabled
- int ompnumthreads = 1;
- // this is for MPI process 0 (or even if we are not using MPI at all)
- int myjxi488startoffset = 0;
- int myMPIstride = ompnumthreads;
- int myMPIblock = ompnumthreads;
- // Define here shared arrays of virtual ascii and binary files, so that thread 0 will be able to access them all later
- InclusionOutputInfo **p_outarray = NULL;
- VirtualBinaryFile **vtppoanarray = NULL;
-
-#ifdef USE_NVTX
- nvtxRangePush("Parallel loop");
+ delete sconf;
+ delete gconf;
+#ifdef USE_MAGMA
+ logger->log("INFO: Process " + to_string(mpidata->rank) + " finalizes MAGMA.\n");
+ magma_finalize();
#endif
+ chrono::time_point<chrono::high_resolution_clock> t_end = chrono::high_resolution_clock::now();
+ elapsed = t_end - t_start;
+ string message = "INFO: Calculation lasted " + to_string(elapsed.count()) + "s.\n";
+ logger->log(message);
+ logger->log("Finished: output written to " + output_path + "/c_OINCLU\n");
+ time_logger->log(message);
+ fclose(timing_file);
+ delete time_logger;
+ } // end instructions block of MPI process 0
+
+ //===============================
+ // instruction block for MPI processes different from 0
+ //===============================
+#ifdef MPI_VERSION
+ else {
+ // here goes the code for MPI processes other than 0
+ // copy gconf, sconf, cid and p_scattering_angles from MPI process 0
+ GeometryConfiguration *gconf = new GeometryConfiguration(mpidata);
+ ScattererConfiguration *sconf = new ScattererConfiguration(mpidata);
+ InclusionIterationData *cid = new InclusionIterationData(mpidata, device_count);
+ ScatteringAngles *p_scattering_angles = new ScatteringAngles(mpidata);
- //===========================================
- // open the OpenMP parallel context, so each thread can initialise its stuff
- //===========================================
+ // Create this variable and initialise it with a default here, so that it is defined anyway, with or without OpenMP support enabled
+ int ompnumthreads = 1;
+ InclusionOutputInfo **p_outarray = NULL;
+ VirtualBinaryFile **vtppoanarray = NULL;
+ int myjxi488startoffset;
+ int myMPIstride = ompnumthreads;
+ int myMPIblock = ompnumthreads;
+
#pragma omp parallel
- {
- // Create and initialise this variable here, so that if OpenMP is enabled it is local to the thread, and if OpenMP is not enabled it has a well-defiled value anyway
- int myompthread = 0;
-
+ {
+ // Create and initialise this variable here, so that if OpenMP is enabled it is local to the thread, and if OpenMP is not enabled it has a well-defined value anyway
+ int myompthread = 0;
#ifdef _OPENMP
- // If OpenMP is enabled, give actual values to myompthread and ompnumthreads, and open thread-local output files
- myompthread = omp_get_thread_num();
- if (myompthread == 0) ompnumthreads =
omp_get_num_threads(); + // If OpenMP is enabled, give actual values to myompthread and ompnumthreads, and open thread-local output files + myompthread = omp_get_thread_num(); + if (myompthread == 0) ompnumthreads = omp_get_num_threads(); #endif - - if (myompthread == 0) { - // Initialise some shared variables only on thread 0 - p_outarray = new InclusionOutputInfo*[ompnumthreads]; - vtppoanarray = new VirtualBinaryFile*[ompnumthreads]; - myMPIblock = ompnumthreads; - myMPIstride = myMPIblock; + if (myompthread == 0) { + // receive the start parameter from MPI process 0 + MPI_Recv(&myjxi488startoffset, 1, MPI_INT, 0, 3, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + // send my number of omp threads to process 0 + MPI_Send(&ompnumthreads, 1, MPI_INT, 0, 3, MPI_COMM_WORLD); + // receive myMPIstride sent by MPI process 0 to all processes + MPI_Bcast(&myMPIstride, 1, MPI_INT, 0, MPI_COMM_WORLD); + // allocate virtual files for each thread + p_outarray = new InclusionOutputInfo*[ompnumthreads]; + vtppoanarray = new VirtualBinaryFile*[ompnumthreads]; + } +#pragma omp barrier + // To test parallelism, I will now start feeding this function with "clean" copies of the parameters, so that they will not be changed by previous iterations, and each one will behave as the first one. Define all (empty) variables here, so they have the correct scope, then they get different definitions depending on thread number + InclusionIterationData *cid_2 = NULL; + InclusionOutputInfo *p_output_2 = NULL; + VirtualBinaryFile *vtppoanp_2 = NULL; + // PLACEHOLDER + // for threads other than the 0, create distinct copies of all relevant data, while for thread 0 just define new references / pointers to the original ones + if (myompthread == 0) { + cid_2 = cid; + } else { + // this is not thread 0, so do create fresh copies of all local variables + cid_2 = new InclusionIterationData(*cid); + } + // make sure all threads align here: I don't want the following loop to accidentally start for thread 0, possibly modifying some variables before they are copied by all other threads +#pragma omp barrier + // ok, now I can actually start the parallel calculations + for (int ixi488=2; ixi488<=cid_2->number_of_scales; ixi488 +=myMPIstride) { + // the parallel loop over MPI processes covers a different set of indices for each thread +#pragma omp barrier + int myjxi488 = ixi488 + myjxi488startoffset + myompthread; + // each thread opens new virtual files and stores their pointers in the shared array + vtppoanp_2 = new VirtualBinaryFile(); + // each thread puts a copy of the pointers to its virtual files in the shared arrays + vtppoanarray[myompthread] = vtppoanp_2; +#pragma omp barrier + if (myompthread==0) logger->log("Syncing OpenMP threads and starting the loop on wavelengths\n"); + // ok, now I can actually start the parallel calculations + // each MPI process handles a number of contiguous scales corresponding to its number of OMP threads at this omp level of parallelism + if (myjxi488 <= cid_2->number_of_scales) { + if (myompthread > 0) { + // UPDATE: non-0 threads need to allocate memory for one scale at a time. + p_output_2 = new InclusionOutputInfo(sconf, gconf, mpidata, myjxi488, 1); + p_outarray[myompthread] = p_output_2; + } else { + // Thread 0 of non-zero MPI processes needs to allocate memory for the + // output of all threads. 
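+ // In practice, thread 0's InclusionOutputInfo block covers the ompnumthreads
+ // consecutive scales starting at myjxi488, and the single-scale blocks of the
+ // other threads are merged into it via insert() before MPI process 0 collects
+ // it (see the mpireceive() calls above). As a purely illustrative example,
+ // assuming 2 MPI processes with 4 OpenMP threads each (so myMPIstride = 8 and
+ // myjxi488startoffset = 4 on this process), the first pass of the loop
+ // (ixi488 = 2) assigns scales 2-5 to process 0 and 6-9 to this process, the
+ // next pass (ixi488 = 10) assigns 10-13 and 14-17, and so on; scale 1 was
+ // already handled by the separate first iteration on MPI process 0.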
+ p_output_2 = new InclusionOutputInfo(sconf, gconf, mpidata, myjxi488, ompnumthreads); + p_outarray[0] = p_output_2; + } + int jer = inclusion_jxi488_cycle(myjxi488, sconf, gconf, p_scattering_angles, cid_2, p_output_2, output_path, vtppoanp_2); + } else { + if (myompthread > 0) { + // If there is no input for this thread, set the output pointer to NULL. + p_outarray[myompthread] = NULL; + } } -#ifdef MPI_VERSION - if (myompthread == 0) { - if (mpidata->mpirunning) { - // only go through this if MPI has been actually used - for (int rr=1; rr<mpidata->nprocs; rr++) { - // individually send their respective starting points to other MPI processes: they start immediately after the frequencies computed by previous processes so far - int remotejxi488startoffset = myMPIstride; - MPI_Send(&remotejxi488startoffset, 1, MPI_INT, rr, 3, MPI_COMM_WORLD); - int remoteMPIblock; - MPI_Recv(&remoteMPIblock, 1, MPI_INT, rr, 3, MPI_COMM_WORLD, MPI_STATUS_IGNORE); - // update myMPIstride to include the ones due to MPI process rr - myMPIstride += remoteMPIblock; - } - // now I know the total myMPIstride, I can send it to all processes - MPI_Bcast(&myMPIstride, 1, MPI_INT, 0, MPI_COMM_WORLD); - } - } -#endif - // add an omp barrier to make sure that the global variables defined by thread 0 are known to all threads below this -#pragma omp barrier - - // To test parallelism, I will now start feeding this function with "clean" copies of the parameters, so that they will not be changed by previous iterations, and each one will behave as the first one. Define all (empty) variables here, so they have the correct scope, then they get different definitions depending on thread number - InclusionIterationData *cid_2 = NULL; - InclusionOutputInfo *p_output_2 = NULL; - VirtualBinaryFile *vtppoanp_2 = NULL; - // for threads other than the 0, create distinct copies of all relevant data, while for thread 0 just define new references / pointers to the original ones - if (myompthread == 0) { - cid_2 = cid; - // OMP thread 0 of MPI process 0 holds the pointer to the full output structure - p_output_2 = p_output; - p_outarray[0] = p_output_2; - } else { - // this is not thread 0, so do create fresh copies of all local variables - cid_2 = new InclusionIterationData(*cid); - } - // make sure all threads align here: I don't want the following loop to accidentally start for thread 0, possibly modifying some variables before they are copied by all other threads - if (myompthread==0) { - logger->log("Syncing OpenMP threads and starting the loop on wavelengths\n"); - } -#pragma omp barrier - // ok, now I can actually start the parallel calculations - for (int ixi488=2; ixi488<=cid_2->number_of_scales; ixi488 +=myMPIstride) { - // the parallel loop over MPI processes covers a different set of indices for each thread -#pragma omp barrier - int myjxi488 = ixi488+myompthread; - // each thread opens new virtual files and stores their pointers in the shared array - vtppoanp_2 = new VirtualBinaryFile(); - // each thread puts a copy of the pointers to its virtual files in the shared arrays - vtppoanarray[myompthread] = vtppoanp_2; -#pragma omp barrier - - // each MPI process handles a number of contiguous scales corresponding to its number of OMP threads at this omp level of parallelism - if (myjxi488 <= cid_2->number_of_scales) { - if (myompthread > 0) { - // UPDATE: non-0 threads need to allocate memory for one scale at a time. 
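// The collection step above (thread 0 of MPI process 0 appending its own virtual
// binary file and then the one received from every other rank) follows a simple
// gather-and-append pattern. The following self-contained sketch shows the same
// pattern with plain MPI and stdio only; it is an illustration of the idea, not
// the VirtualBinaryFile implementation used here, and the name gather_and_append
// is purely hypothetical.
#include <cstdio>
#include <vector>
#include <mpi.h>

// Append a byte buffer to a file in binary mode.
static void append_buffer(const char *path, const std::vector<char> &buf) {
  std::FILE *f = std::fopen(path, "ab");
  if (f) {
    std::fwrite(buf.data(), 1, buf.size(), f);
    std::fclose(f);
  }
}

// Rank 0 appends its own chunk first, then receives and appends the chunk of
// every other rank in rank order; the other ranks just send their chunk.
void gather_and_append(const char *path, const std::vector<char> &mine,
                       int rank, int nprocs) {
  if (rank == 0) {
    append_buffer(path, mine);
    for (int rr = 1; rr < nprocs; rr++) {
      long size = 0;  // ask rank rr how large its chunk is
      MPI_Recv(&size, 1, MPI_LONG, rr, 10, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
      std::vector<char> chunk(size);
      MPI_Recv(chunk.data(), (int)size, MPI_CHAR, rr, 10, MPI_COMM_WORLD,
               MPI_STATUS_IGNORE);
      append_buffer(path, chunk);  // chunks land on disk in rank order
    }
  } else {
    long size = (long)mine.size();
    MPI_Send(&size, 1, MPI_LONG, 0, 10, MPI_COMM_WORLD);
    MPI_Send((void *)mine.data(), (int)size, MPI_CHAR, 0, 10, MPI_COMM_WORLD);
  }
  MPI_Barrier(MPI_COMM_WORLD);  // keep all ranks on the same stride pass
}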
- p_output_2 = new InclusionOutputInfo(sconf, gconf, mpidata, myjxi488, 1); - p_outarray[myompthread] = p_output_2; - } - int jer = inclusion_jxi488_cycle(myjxi488, sconf, gconf, p_scattering_angles, cid_2, p_output_2, output_path, vtppoanp_2); - } else { - if (myompthread > 0) { - // If there is no input for this thread, set output pointer to NULL. - p_outarray[myompthread] = NULL; - } - } -#pragma omp barrier - -#ifdef USE_NVTX - nvtxRangePush("Output concatenation"); -#endif -#pragma omp barrier - // threads different from 0 append their virtual files to the one of thread 0, and delete them - if (myompthread == 0) { - for (int ti=1; ti<ompnumthreads; ti++) { - if (p_outarray[ti] != NULL) { - p_outarray[0]->insert(*(p_outarray[ti])); - delete p_outarray[ti]; - p_outarray[ti] = NULL; - } - vtppoanarray[0]->append(*(vtppoanarray[ti])); - delete vtppoanarray[ti]; - } - } -#pragma omp barrier - //============================================== - // Collect all virtual files on thread 0 of MPI process 0, and append them to disk - //============================================== - if (myompthread == 0) { - // thread 0 writes its virtual files, now including contributions from all threads, to disk, and deletes them - // p_outarray[0]->append_to_disk(output_path + "/c_OINCLU"); - // delete p_outarray[0]; - vtppoanarray[0]->append_to_disk(output_path + "/c_TPPOAN"); - delete vtppoanarray[0]; - -#ifdef MPI_VERSION - if (mpidata->mpirunning) { - // only go through this if MPI has been actually used - for (int rr=1; rr<mpidata->nprocs; rr++) { - // get the data from process rr by receiving it in total memory structure - p_outarray[0]->mpireceive(mpidata, rr); - // get the data from process rr, creating a new virtual ascii file - // VirtualAsciiFile *p_output = new VirtualAsciiFile(mpidata, rr); - // append to disk and delete virtual ascii file - // p_output->append_to_disk(output_path + "/c_OINCLU"); - // delete p_output; - - // get the data from process rr, creating a new virtual binary file - VirtualBinaryFile *vtppoanp = new VirtualBinaryFile(mpidata, rr); - // append to disk and delete virtual binary file - vtppoanp->append_to_disk(output_path + "/c_TPPOAN"); - delete vtppoanp; - int test = MPI_Barrier(MPI_COMM_WORLD); - } - } -#endif - } - // end block writing to disk -#ifdef USE_NVTX - nvtxRangePop(); -#endif -#pragma omp barrier - - } // close strided loop running on MPI processes, ixi488 loop - // delete the shared arrays I used to make available to thread 0 the virtual files of other threads -#pragma omp barrier - if (myompthread == 0) { - delete[] p_outarray; - delete[] vtppoanarray; - } - { - string message = "INFO: Closing thread-local output files of thread " + to_string(myompthread) + " and syncing threads.\n"; - logger->log(message); - } -#ifdef USE_NVTX - nvtxRangePop(); -#endif - delete cid_2; - } - delete p_scattering_angles; - p_output->write(output_path + "/c_OINCLU.hd5", "HDF5"); - p_output->write(output_path + "/c_OINCLU", "LEGACY"); - delete p_output; - } // closes s_nsph == nsph check - - else { // Sphere number inconsistency error. - throw UnrecognizedConfigurationException( - "Inconsistent geometry and scatterer configurations." 
- ); - } - - delete sconf; - delete gconf; -#ifdef USE_MAGMA - logger->log("INFO: Process " + to_string(mpidata->rank) + " finalizes MAGMA.\n"); - magma_finalize(); -#endif - chrono::time_point<chrono::high_resolution_clock> t_end = chrono::high_resolution_clock::now(); - elapsed = t_end - t_start; - string message = "INFO: Calculation lasted " + to_string(elapsed.count()) + "s.\n"; - logger->log(message); - logger->log("Finished: output written to " + output_path + "/c_OINCLU\n"); - time_logger->log(message); - fclose(timing_file); - delete time_logger; - } // end instructions block of MPI process 0 - - //=============================== - // instruction block for MPI processes different from 0 - //=============================== -#ifdef MPI_VERSION - else { - // here go the code for MPI processes other than 0 - // copy gconf, sconf, cid and p_scattering_angles from MPI process 0 - GeometryConfiguration *gconf = new GeometryConfiguration(mpidata); - ScattererConfiguration *sconf = new ScattererConfiguration(mpidata); - InclusionIterationData *cid = new InclusionIterationData(mpidata, device_count); - ScatteringAngles *p_scattering_angles = new ScatteringAngles(mpidata); - - // Create this variable and initialise it with a default here, so that it is defined anyway, with or without OpenMP support enabled - int ompnumthreads = 1; - InclusionOutputInfo **p_outarray = NULL; - VirtualBinaryFile **vtppoanarray = NULL; - int myjxi488startoffset; - int myMPIstride = ompnumthreads; - int myMPIblock = ompnumthreads; - -#pragma omp parallel - { - // Create and initialise this variable here, so that if OpenMP is enabled it is local to the thread, and if OpenMP is not enabled it has a well-defiled value anyway - int myompthread = 0; -#ifdef _OPENMP - // If OpenMP is enabled, give actual values to myompthread and ompnumthreads, and open thread-local output files - myompthread = omp_get_thread_num(); - if (myompthread == 0) ompnumthreads = omp_get_num_threads(); -#endif - if (myompthread == 0) { - // receive the start parameter from MPI process 0 - MPI_Recv(&myjxi488startoffset, 1, MPI_INT, 0, 3, MPI_COMM_WORLD, MPI_STATUS_IGNORE); - // send my number of omp threads to process 0 - MPI_Send(&ompnumthreads, 1, MPI_INT, 0, 3, MPI_COMM_WORLD); - // receive myMPIstride sent by MPI process 0 to all processes - MPI_Bcast(&myMPIstride, 1, MPI_INT, 0, MPI_COMM_WORLD); - // allocate virtual files for each thread - p_outarray = new InclusionOutputInfo*[ompnumthreads]; - vtppoanarray = new VirtualBinaryFile*[ompnumthreads]; - } -#pragma omp barrier - // To test parallelism, I will now start feeding this function with "clean" copies of the parameters, so that they will not be changed by previous iterations, and each one will behave as the first one. 
Define all (empty) variables here, so they have the correct scope, then they get different definitions depending on thread number - InclusionIterationData *cid_2 = NULL; - InclusionOutputInfo *p_output_2 = NULL; - VirtualBinaryFile *vtppoanp_2 = NULL; - // PLACEHOLDER - // for threads other than the 0, create distinct copies of all relevant data, while for thread 0 just define new references / pointers to the original ones - if (myompthread == 0) { - cid_2 = cid; - } else { - // this is not thread 0, so do create fresh copies of all local variables - cid_2 = new InclusionIterationData(*cid); - } - // make sure all threads align here: I don't want the following loop to accidentally start for thread 0, possibly modifying some variables before they are copied by all other threads -#pragma omp barrier - // ok, now I can actually start the parallel calculations - for (int ixi488=2; ixi488<=cid_2->number_of_scales; ixi488 +=myMPIstride) { - // the parallel loop over MPI processes covers a different set of indices for each thread -#pragma omp barrier - int myjxi488 = ixi488 + myjxi488startoffset + myompthread; - // each thread opens new virtual files and stores their pointers in the shared array - vtppoanp_2 = new VirtualBinaryFile(); - // each thread puts a copy of the pointers to its virtual files in the shared arrays - vtppoanarray[myompthread] = vtppoanp_2; -#pragma omp barrier - if (myompthread==0) logger->log("Syncing OpenMP threads and starting the loop on wavelengths\n"); - // ok, now I can actually start the parallel calculations - // each MPI process handles a number of contiguous scales corresponding to its number of OMP threads at this omp level of parallelism - if (myjxi488 <= cid_2->number_of_scales) { - if (myompthread > 0) { - // UPDATE: non-0 threads need to allocate memory for one scale at a time. - p_output_2 = new InclusionOutputInfo(sconf, gconf, mpidata, myjxi488, 1); - p_outarray[myompthread] = p_output_2; - } else { - // Thread 0 of non-zero MPI processes needs to allocate memory for the - // output of all threads. - p_output_2 = new InclusionOutputInfo(sconf, gconf, mpidata, myjxi488, ompnumthreads); - p_outarray[0] = p_output_2; - } - int jer = inclusion_jxi488_cycle(myjxi488, sconf, gconf, p_scattering_angles, cid_2, p_output_2, output_path, vtppoanp_2); - } else { - if (myompthread > 0) { - // If there is no input for this thread, set the output pointer to NULL. 
- p_outarray[myompthread] = NULL; - } - } - -#pragma omp barrier - // threads different from 0 append their virtual files to the one of thread 0, and delete them +#pragma omp barrier + // threads different from 0 append their virtual files to the one of thread 0, and delete them if (myompthread == 0) { for (int ti=1; ti<ompnumthreads; ti++) { if (p_outarray[ti] != NULL) { @@ -2086,9 +1384,551 @@ int inclusion_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryCo message = "INFO: angle loop for scale " + to_string(jxi488) + " took " + to_string(elapsed.count()) + "s.\n"; logger->log(message); - logger->log("INFO: finished scale iteration " + to_string(jxi488) + " of " + to_string(nxi) + ".\n"); + logger->log("INFO: finished scale iteration " + to_string(jxi488) + " of " + to_string(nxi) + ".\n"); + + delete logger; + + return jer; +} + +// >>> IMPLEMENTATION OF InclusionIterationData CLASS <<< // +InclusionIterationData::InclusionIterationData(GeometryConfiguration *gconf, ScattererConfiguration *sconf, const mixMPI *mpidata, const int device_count) { + c1 = new ParticleDescriptorInclusion(gconf, sconf); + const int ndi = c1->nsph * c1->nlim; + const np_int ndit = 2 * ndi; + gaps = new double[c1->nsph](); + tqev = new double[3](); + tqsv = new double[3](); + tqse = new double*[2]; + tqspe = new dcomplex*[2]; + tqss = new double*[2]; + tqsps = new dcomplex*[2]; + tqce = new double*[2]; + tqcpe = new dcomplex*[2]; + tqcs = new double*[2]; + tqcps = new dcomplex*[2]; + for (int ti = 0; ti < 2; ti++) { + tqse[ti] = new double[c1->nsph](); + tqspe[ti] = new dcomplex[c1->nsph](); + tqss[ti] = new double[c1->nsph](); + tqsps[ti] = new dcomplex[c1->nsph](); + tqce[ti] = new double[3](); + tqcpe[ti] = new dcomplex[3](); + tqcs[ti] = new double[3](); + tqcps[ti] = new dcomplex[3](); + } + gapv = new double[3](); + gapp = new dcomplex*[3]; + gappm = new dcomplex*[3]; + gap = new double*[3]; + gapm = new double*[3]; + for (int gi = 0; gi < 3; gi++) { + gapp[gi] = new dcomplex[2](); + gappm[gi] = new dcomplex[2](); + gap[gi] = new double[2](); + gapm[gi] = new double[2](); + } + u = new double[3](); + us = new double[3](); + un = new double[3](); + uns = new double[3](); + up = new double[3](); + ups = new double[3](); + unmp = new double[3](); + unsmp = new double[3](); + upmp = new double[3](); + upsmp = new double[3](); + argi = new double[1](); + args = new double[1](); + duk = new double[3](); + cextlr = new double*[4]; + cext = new double*[4]; + cmullr = new double*[4];; + cmul = new double*[4]; + for (int ci = 0; ci < 4; ci++) { + cextlr[ci] = new double[4](); + cext[ci] = new double[4](); + cmullr[ci] = new double[4](); + cmul[ci] = new double[4](); + } + vec_zpv = new double[c1->lm * 12](); + zpv = new double***[c1->lm]; + for (int zi = 0; zi < c1->lm; zi++) { + zpv[zi] = new double**[12]; + for (int zj = 0; zj < 3; zj++) { + zpv[zi][zj] = new double*[4]; + zpv[zi][zj][0] = vec_zpv + (zi * 12) + (zj * 4); + zpv[zi][zj][1] = vec_zpv + (zi * 12) + (zj * 4) + 2; + } + } + am_vector = new dcomplex[c1->ndm * c1->ndm](); + am = new dcomplex*[c1->ndm]; + for (int ai = 0; ai < c1->ndm; ai++) { + am[ai] = (am_vector + ai * c1->ndm); + } + + arg = 0.0 + 0.0 * I; + // These are suspect initializations + scan = 0.0; + cfmp = 0.0; + sfmp = 0.0; + cfsp = 0.0; + sfsp = 0.0; + // End of suspect initializations + wn = sconf->wp / 3.0e8; + xip = sconf->xip; + sqsfi = 1.0; + vk = 0.0; + number_of_scales = sconf->number_of_scales; + xiblock = (int) ceil(((double) (sconf->number_of_scales-1))/((double) 
mpidata->nprocs)); + lastxi = ((mpidata->rank+1) * xiblock)+1; + firstxi = lastxi-xiblock+1; + if (lastxi > sconf->number_of_scales) lastxi = sconf->number_of_scales; + + nimd = c1->nshl[0] + 1; + c1->rc[0][nimd - 1] = c1->ros[0] * sconf->get_rcf(0, nimd - 1); + extr = c1->rc[0][nimd - 1]; + const double pig = acos(0.0) * 2.0; + c1->gcs = pig * extr * extr; + +#ifdef USE_MAGMA + proc_device = mpidata->rank % device_count; +#else + proc_device = 0; +#endif + + // In the first iteration, if refinement is enabled, determine the number of refinement iterations required to arrive at the target accuracy (if achievable in a reasonable number of iterations) + refinemode = 2; + // maxrefiters and accuracygoal should be configurable and preferably set somewhere else + maxrefiters = 20; + accuracygoal = 1e-6; +} + +InclusionIterationData::InclusionIterationData(const InclusionIterationData& rhs) { + c1 = new ParticleDescriptorInclusion(reinterpret_cast<ParticleDescriptorInclusion &>(*(rhs.c1))); + const int ndi = c1->nsph * c1->nlim; + const np_int ndit = 2 * ndi; + gaps = new double[c1->nsph](); + for (int gi = 0; gi < c1->nsph; gi++) gaps[gi] = rhs.gaps[gi]; + tqev = new double[3](); + tqsv = new double[3](); + for (int ti = 0; ti < 3; ti++) { + tqev[ti] = rhs.tqev[ti]; + tqsv[ti] = rhs.tqsv[ti]; + } + tqse = new double*[2]; + tqspe = new dcomplex*[2]; + tqss = new double*[2]; + tqsps = new dcomplex*[2]; + tqce = new double*[2]; + tqcpe = new dcomplex*[2]; + tqcs = new double*[2]; + tqcps = new dcomplex*[2]; + for (int ti = 0; ti < 2; ti++) { + tqse[ti] = new double[c1->nsph](); + tqspe[ti] = new dcomplex[c1->nsph](); + tqss[ti] = new double[c1->nsph](); + tqsps[ti] = new dcomplex[c1->nsph](); + for (int tj = 0; tj < c1->nsph; tj++) { + tqse[ti][tj] = rhs.tqse[ti][tj]; + tqspe[ti][tj] = rhs.tqspe[ti][tj]; + tqss[ti][tj] = rhs.tqss[ti][tj]; + tqsps[ti][tj] = rhs.tqsps[ti][tj]; + } + tqce[ti] = new double[3](); + tqcpe[ti] = new dcomplex[3](); + tqcs[ti] = new double[3](); + tqcps[ti] = new dcomplex[3](); + for (int tj = 0; tj < 3; tj++) { + tqce[ti][tj] = rhs.tqce[ti][tj]; + tqcpe[ti][tj] = rhs.tqcpe[ti][tj]; + tqcs[ti][tj] = rhs.tqcs[ti][tj]; + tqcps[ti][tj] = rhs.tqcps[ti][tj]; + } + } + gapv = new double[3](); + gapp = new dcomplex*[3]; + gappm = new dcomplex*[3]; + gap = new double*[3]; + gapm = new double*[3]; + for (int gi = 0; gi < 3; gi++) { + gapv[gi] = rhs.gapv[gi]; + gapp[gi] = new dcomplex[2](); + gappm[gi] = new dcomplex[2](); + gap[gi] = new double[2](); + gapm[gi] = new double[2](); + for (int gj = 0; gj < 2; gj++) { + gapp[gi][gj] = rhs.gapp[gi][gj]; + gappm[gi][gj] = rhs.gappm[gi][gj]; + gap[gi][gj] = rhs.gap[gi][gj]; + gapm[gi][gj] = rhs.gapm[gi][gj]; + } + } + u = new double[3](); + us = new double[3](); + un = new double[3](); + uns = new double[3](); + up = new double[3](); + ups = new double[3](); + unmp = new double[3](); + unsmp = new double[3](); + upmp = new double[3](); + upsmp = new double[3](); + duk = new double[3](); + for (int ui = 0; ui < 3; ui++) { + u[ui] = rhs.u[ui]; + us[ui] = rhs.us[ui]; + un[ui] = rhs.un[ui]; + uns[ui] = rhs.uns[ui]; + up[ui] = rhs.up[ui]; + ups[ui] = rhs.ups[ui]; + unmp[ui] = rhs.unmp[ui]; + unsmp[ui] = rhs.unsmp[ui]; + upmp[ui] = rhs.upmp[ui]; + upsmp[ui] = rhs.upsmp[ui]; + duk[ui] = rhs.duk[ui]; + } + argi = new double[1](); + args = new double[1](); + argi[0] = rhs.argi[0]; + args[0] = rhs.args[0]; + cextlr = new double*[4]; + cext = new double*[4]; + cmullr = new double*[4];; + cmul = new double*[4]; + for (int ci = 0; ci < 4; ci++) { + 
cextlr[ci] = new double[4](); + cext[ci] = new double[4](); + cmullr[ci] = new double[4](); + cmul[ci] = new double[4](); + for (int cj = 0; cj < 4; cj++) { + cextlr[ci][cj] = rhs.cextlr[ci][cj]; + cext[ci][cj] = rhs.cext[ci][cj]; + cmullr[ci][cj] = rhs.cmullr[ci][cj]; + cmul[ci][cj] = rhs.cmul[ci][cj]; + } + } + vec_zpv = new double[c1->lm * 12]; + zpv = new double***[c1->lm]; + for (int zi = 0; zi < c1->lm; zi++) { + zpv[zi] = new double **[12]; + for (int zj = 0; zj < 3; zj++) { + zpv[zi][zj] = new double*[4]; + zpv[zi][zj][0] = vec_zpv + (zi * 12) + (zj * 4); + zpv[zi][zj][1] = vec_zpv + (zi * 12) + (zj * 4) + 2; + zpv[zi][zj][0][0] = rhs.zpv[zi][zj][0][0]; + zpv[zi][zj][0][1] = rhs.zpv[zi][zj][0][1]; + zpv[zi][zj][1][0] = rhs.zpv[zi][zj][1][0]; + zpv[zi][zj][1][1] = rhs.zpv[zi][zj][1][1]; + } + } + am_vector = new dcomplex[c1->ndm * c1->ndm]; + for (int ai = 0; ai < c1->ndm * c1->ndm; ai++) am_vector[ai] = rhs.am_vector[ai]; + am = new dcomplex*[c1->ndm]; + for (int ai = 0; ai < c1->ndm; ai++) { + am[ai] = (am_vector + ai * c1->ndm); + } + + arg = rhs.arg; + // These are suspect initializations + scan = rhs.scan; + cfmp = rhs.cfmp; + sfmp = rhs.sfmp; + cfsp = rhs.cfsp; + sfsp = rhs.sfsp; + // End of suspect initializations + wn = rhs.wn; + xip = rhs.xip; + sqsfi = rhs.sqsfi; + vk = rhs.vk; + firstxi = rhs.firstxi; + lastxi = rhs.lastxi; + xiblock = rhs.xiblock; + number_of_scales = rhs.number_of_scales; + + nimd = rhs.nimd; + extr = rhs.extr; + + proc_device = rhs.proc_device; + refinemode = rhs.refinemode; + maxrefiters = rhs.maxrefiters; + accuracygoal = rhs.accuracygoal; +} + +#ifdef MPI_VERSION +InclusionIterationData::InclusionIterationData(const mixMPI *mpidata, const int device_count) { + c1 = new ParticleDescriptorInclusion(mpidata); + const int ndi = c1->nsph * c1->nlim; + const np_int ndit = 2 * ndi; + gaps = new double[c1->nsph](); + MPI_Bcast(gaps, c1->nsph, MPI_DOUBLE, 0, MPI_COMM_WORLD); + tqev = new double[3](); + tqsv = new double[3](); + MPI_Bcast(tqev, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(tqsv, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); + tqse = new double*[2]; + tqspe = new dcomplex*[2]; + tqss = new double*[2]; + tqsps = new dcomplex*[2]; + tqce = new double*[2]; + tqcpe = new dcomplex*[2]; + tqcs = new double*[2]; + tqcps = new dcomplex*[2]; + for (int ti = 0; ti < 2; ti++) { + tqse[ti] = new double[c1->nsph](); + tqspe[ti] = new dcomplex[c1->nsph](); + tqss[ti] = new double[c1->nsph](); + tqsps[ti] = new dcomplex[c1->nsph](); + MPI_Bcast(tqse[ti], c1->nsph, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(tqspe[ti], c1->nsph, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); + MPI_Bcast(tqss[ti], c1->nsph, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(tqsps[ti], c1->nsph, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); + tqce[ti] = new double[3](); + tqcpe[ti] = new dcomplex[3](); + tqcs[ti] = new double[3](); + tqcps[ti] = new dcomplex[3](); + MPI_Bcast(tqce[ti], 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(tqcpe[ti], 3, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); + MPI_Bcast(tqcs[ti], 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(tqcps[ti], 3, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); + } + gapv = new double[3](); + gapp = new dcomplex*[3]; + gappm = new dcomplex*[3]; + gap = new double*[3]; + gapm = new double*[3]; + MPI_Bcast(gapv, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); + for (int gi = 0; gi < 3; gi++) { + gapp[gi] = new dcomplex[2](); + gappm[gi] = new dcomplex[2](); + gap[gi] = new double[2](); + gapm[gi] = new double[2](); + MPI_Bcast(gapp[gi], 2, MPI_C_DOUBLE_COMPLEX, 0, 
MPI_COMM_WORLD); + MPI_Bcast(gappm[gi], 2, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); + MPI_Bcast(gap[gi], 2, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(gapm[gi], 2, MPI_DOUBLE, 0, MPI_COMM_WORLD); + } + u = new double[3](); + us = new double[3](); + un = new double[3](); + uns = new double[3](); + up = new double[3](); + ups = new double[3](); + unmp = new double[3](); + unsmp = new double[3](); + upmp = new double[3](); + upsmp = new double[3](); + duk = new double[3](); + MPI_Bcast(u, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(us, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(un, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(uns, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(up, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(ups, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(unmp, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(unsmp, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(upmp, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(upsmp, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(duk, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); + argi = new double[1](); + args = new double[1](); + MPI_Bcast(argi, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(args, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); + cextlr = new double*[4]; + cext = new double*[4]; + cmullr = new double*[4];; + cmul = new double*[4]; + for (int ci = 0; ci < 4; ci++) { + cextlr[ci] = new double[4](); + cext[ci] = new double[4](); + cmullr[ci] = new double[4](); + cmul[ci] = new double[4](); + MPI_Bcast(cextlr[ci], 4, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(cext[ci], 4, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(cmullr[ci], 4, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(cmul[ci], 4, MPI_DOUBLE, 0, MPI_COMM_WORLD); + } + vec_zpv = new double[c1->lm * 12]; + MPI_Bcast(vec_zpv, c1->lm * 12, MPI_DOUBLE, 0, MPI_COMM_WORLD); + zpv = new double***[c1->lm]; + for (int zi = 0; zi < c1->lm; zi++) { + zpv[zi] = new double **[12]; + for (int zj = 0; zj < 3; zj++) { + zpv[zi][zj] = new double*[4]; + zpv[zi][zj][0] = vec_zpv + (zi * 12) + (zj * 4); + zpv[zi][zj][1] = vec_zpv + (zi * 12) + (zj * 4) + 2; + } + } + am_vector = new dcomplex[c1->ndm * c1->ndm]; + am = new dcomplex*[c1->ndm]; + for (int ai = 0; ai < c1->ndm; ai++) { + am[ai] = (am_vector + ai * c1->ndm); + MPI_Bcast(am[ai], c1->ndm, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); + } + MPI_Bcast(&arg, 1, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); + MPI_Bcast(&scan, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(&cfmp, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(&sfmp, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(&cfsp, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(&sfsp, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(&wn, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(&xip, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(&sqsfi, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(&vk, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(&xiblock, 1, MPI_INT, 0, MPI_COMM_WORLD); + MPI_Bcast(&number_of_scales, 1, MPI_INT, 0, MPI_COMM_WORLD); + lastxi = ((mpidata->rank+1) * xiblock)+1; + firstxi = lastxi-xiblock+1; + if (lastxi > number_of_scales) lastxi = number_of_scales; + + MPI_Bcast(&nimd, 1, MPI_INT, 0, MPI_COMM_WORLD); + MPI_Bcast(&extr, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); + +#ifdef USE_MAGMA + proc_device = mpidata->rank % device_count; +#else + proc_device = 0; +#endif + MPI_Bcast(&refinemode, 1, MPI_INT, 0, MPI_COMM_WORLD); + MPI_Bcast(&maxrefiters, 1, MPI_INT, 0, MPI_COMM_WORLD); + MPI_Bcast(&accuracygoal, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); +} - delete logger; +void 
InclusionIterationData::mpibcast(const mixMPI *mpidata) { + c1->mpibcast(mpidata); + const int ndi = c1->nsph * c1->nlim; + const np_int ndit = 2 * ndi; + MPI_Bcast(gaps, c1->nsph, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(tqev, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(tqsv, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); + for (int ti = 0; ti < 2; ti++) { + MPI_Bcast(tqse[ti], c1->nsph, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(tqspe[ti], c1->nsph, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); + MPI_Bcast(tqss[ti], c1->nsph, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(tqsps[ti], c1->nsph, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); + MPI_Bcast(tqce[ti], 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(tqcpe[ti], 3, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); + MPI_Bcast(tqcs[ti], 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(tqcps[ti], 3, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); + } + MPI_Bcast(gapv, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); + for (int gi = 0; gi < 3; gi++) { + MPI_Bcast(gapp[gi], 2, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); + MPI_Bcast(gappm[gi], 2, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); + MPI_Bcast(gap[gi], 2, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(gapm[gi], 2, MPI_DOUBLE, 0, MPI_COMM_WORLD); + } + MPI_Bcast(u, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(us, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(un, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(uns, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(up, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(ups, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(unmp, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(unsmp, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(upmp, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(upsmp, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(duk, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(argi, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(args, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); + for (int ci = 0; ci < 4; ci++) { + MPI_Bcast(cextlr[ci], 4, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(cext[ci], 4, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(cmullr[ci], 4, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(cmul[ci], 4, MPI_DOUBLE, 0, MPI_COMM_WORLD); + } + MPI_Bcast(vec_zpv, c1->lm * 12, MPI_DOUBLE, 0, MPI_COMM_WORLD); + // since MPI expects an int argument for the number of elements to transfer in one go, transfer am one row at a time + for (int ai = 0; ai < c1->ndm; ai++) { + MPI_Bcast(am[ai], c1->ndm, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); + } + MPI_Bcast(&arg, 1, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); + MPI_Bcast(&scan, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(&cfmp, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(&sfmp, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(&cfsp, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(&sfsp, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(&wn, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(&xip, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(&sqsfi, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(&vk, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(&xiblock, 1, MPI_INT, 0, MPI_COMM_WORLD); + MPI_Bcast(&number_of_scales, 1, MPI_INT, 0, MPI_COMM_WORLD); + MPI_Bcast(&nimd, 1, MPI_INT, 0, MPI_COMM_WORLD); + MPI_Bcast(&extr, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(&refinemode, 1, MPI_INT, 0, MPI_COMM_WORLD); + MPI_Bcast(&maxrefiters, 1, MPI_INT, 0, MPI_COMM_WORLD); + MPI_Bcast(&accuracygoal, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); +} +#endif - return jer; +InclusionIterationData::~InclusionIterationData() { + const int nsph = c1->nsph; + delete[] am_vector; + 
delete[] am; + for (int zi = 0; zi < c1->lm; zi++) { + for (int zj = 0; zj < 3; zj++) { + delete[] zpv[zi][zj]; + } + delete[] zpv[zi]; + } + delete[] zpv; + delete[] vec_zpv; + delete c1; + delete[] gaps; + for (int ti = 1; ti > -1; ti--) { + delete[] tqse[ti]; + delete[] tqss[ti]; + delete[] tqspe[ti]; + delete[] tqsps[ti]; + delete[] tqce[ti]; + delete[] tqcpe[ti]; + delete[] tqcs[ti]; + delete[] tqcps[ti]; + } + delete[] tqse; + delete[] tqss; + delete[] tqspe; + delete[] tqsps; + delete[] tqce; + delete[] tqcpe; + delete[] tqcs; + delete[] tqcps; + delete[] tqev; + delete[] tqsv; + for (int gi = 2; gi > -1; gi--) { + delete[] gapp[gi]; + delete[] gappm[gi]; + delete[] gap[gi]; + delete[] gapm[gi]; + } + delete[] gapp; + delete[] gappm; + delete[] gap; + delete[] gapm; + delete[] gapv; + delete[] u; + delete[] us; + delete[] un; + delete[] uns; + delete[] up; + delete[] ups; + delete[] unmp; + delete[] unsmp; + delete[] upmp; + delete[] upsmp; + delete[] argi; + delete[] args; + delete[] duk; + for (int ci = 3; ci > -1; ci--) { + delete[] cextlr[ci]; + delete[] cext[ci]; + delete[] cmullr[ci]; + delete[] cmul[ci]; + } + delete[] cextlr; + delete[] cext; + delete[] cmullr; + delete[] cmul; } +// >>> END OF InclusionIterationData CLASS IMPLEMENTATION <<< diff --git a/src/libnptm/outputs.cpp b/src/libnptm/outputs.cpp index 68e0e62b26a9c0eb87ef43b05d73456206264a64..71cf7b781222e17c42da9f7129da9e4a511949f5 100644 --- a/src/libnptm/outputs.cpp +++ b/src/libnptm/outputs.cpp @@ -475,11 +475,11 @@ ClusterOutputInfo::ClusterOutputInfo(const std::string &hdf5_name) { status = hdf_file->read("VEC_FKC1", str_type, vec_fkc1); vec_fkc2 = new double[xi_block_size]; status = hdf_file->read("VEC_FKC2", str_type, vec_fkc2); + // Initialize directions (they are scale-independent) vec_dir_tidg = new double[_num_theta]; vec_dir_pidg = new double[_num_phi]; vec_dir_tsdg = new double[_num_thetas]; vec_dir_psdg = new double[_num_phis]; - // Initialize directions (they are scale-independent) double cti = th, cpi = ph, cts = ths, cps = phs; for (int di = 0; di < _num_theta; di++) { vec_dir_tidg[di] = cti; @@ -4850,27 +4850,11 @@ SphereOutputInfo::SphereOutputInfo( vec_pschut = new double[xi_block_size](); vec_s0magt = new double[xi_block_size](); } - vec_dir_tidg = new double[_num_theta](); - vec_dir_pidg = new double[_num_phi](); - vec_dir_tsdg = new double[_num_thetas](); - vec_dir_psdg = new double[_num_phis](); - vec_dir_scand = new double[ndirs](); - vec_dir_cfmp = new double[ndirs](); - vec_dir_cfsp = new double[ndirs](); - vec_dir_sfmp = new double[ndirs](); - vec_dir_sfsp = new double[ndirs](); - vec_dir_un = new double[3 * ndirs](); - vec_dir_uns = new double[3 * ndirs](); - vec_dir_sas11 = new dcomplex[nsph * ndirs * xi_block_size](); - vec_dir_sas21 = new dcomplex[nsph * ndirs * xi_block_size](); - vec_dir_sas12 = new dcomplex[nsph * ndirs * xi_block_size](); - vec_dir_sas22 = new dcomplex[nsph * ndirs * xi_block_size](); - vec_dir_fx = new double[nsph * _num_theta * _num_phi * xi_block_size](); - vec_dir_fy = new double[nsph * _num_theta * _num_phi * xi_block_size](); - vec_dir_fz = new double[nsph * _num_theta * _num_phi * xi_block_size](); - vec_dir_muls = new double[16 * nsph * ndirs * xi_block_size](); - vec_dir_mulslr = new double[16 * nsph * ndirs * xi_block_size](); // Initialize directions (they are scale-independent) + vec_dir_tidg = new double[_num_theta]; + vec_dir_pidg = new double[_num_phi]; + vec_dir_tsdg = new double[_num_thetas]; + vec_dir_psdg = new double[_num_phis]; double cti = 
th, cpi = ph, cts = ths, cps = phs; for (int di = 0; di < _num_theta; di++) { vec_dir_tidg[di] = cti; @@ -4888,10 +4872,25 @@ SphereOutputInfo::SphereOutputInfo( vec_dir_psdg[di] = cps; cps += phsstp; } + vec_dir_scand = new double[ndirs](); + vec_dir_cfmp = new double[ndirs](); + vec_dir_cfsp = new double[ndirs](); + vec_dir_sfmp = new double[ndirs](); + vec_dir_sfsp = new double[ndirs](); + vec_dir_un = new double[3 * ndirs](); + vec_dir_uns = new double[3 * ndirs](); + vec_dir_sas11 = new dcomplex[nsph * ndirs * xi_block_size](); + vec_dir_sas21 = new dcomplex[nsph * ndirs * xi_block_size](); + vec_dir_sas12 = new dcomplex[nsph * ndirs * xi_block_size](); + vec_dir_sas22 = new dcomplex[nsph * ndirs * xi_block_size](); + vec_dir_fx = new double[nsph * _num_theta * _num_phi * xi_block_size](); + vec_dir_fy = new double[nsph * _num_theta * _num_phi * xi_block_size](); + vec_dir_fz = new double[nsph * _num_theta * _num_phi * xi_block_size](); + vec_dir_muls = new double[16 * nsph * ndirs * xi_block_size](); + vec_dir_mulslr = new double[16 * nsph * ndirs * xi_block_size](); } SphereOutputInfo::SphereOutputInfo(const std::string &hdf5_name) { - // DA QUI ; TODO: AGGIUSTARE TUTTE LE DIMENSIONI LEGATE A configurations unsigned int flags = H5F_ACC_RDONLY; HDFFile *hdf_file = new HDFFile(hdf5_name, flags); herr_t status = hdf_file->get_status(); @@ -5001,10 +5000,6 @@ SphereOutputInfo::SphereOutputInfo(const std::string &hdf5_name) { vec_s0magt = NULL; } // Initialize directions (they are scale-independent) - _num_theta = (thstp == 0.0) ? 1 : 1 + (int)((thlst - th) / thstp); - _num_thetas = (thsstp == 0.0) ? 1 : 1 + (int)((thslst - ths) / thsstp); - _num_phi = (phstp == 0.0) ? 1 : 1 + (int)((phlst - ph) / phstp); - _num_phis = (phsstp == 0.0) ? 1 : 1 + (int)((phslst - phs) / phsstp); vec_dir_tidg = new double[_num_theta]; vec_dir_tsdg = new double[_num_thetas]; vec_dir_pidg = new double[_num_phi];