diff --git a/src/include/Commons.h b/src/include/Commons.h index 7e4d2520719a82fbebcee2198797bb11ad335562..3369a9c5056716364b68beb53c53325994f99d94 100644 --- a/src/include/Commons.h +++ b/src/include/Commons.h @@ -55,8 +55,6 @@ class mixMPI; */ class C1 { protected: - //! \brief Number of spheres. - int nsph; //! \brief Maximum order of field expansion. int lm; //! \brief Contiguous RMI vector. @@ -69,6 +67,8 @@ protected: dcomplex *vec_vints; public: + //! \brief Number of spheres. + int nsph; //! \brief NLMMT = 2 * LM * (LM + 2). int nlmmt; //! \brief Number of configurations @@ -453,6 +453,7 @@ public: }; + /*! \brief Representation of the FORTRAN C9 blocks. */ class C9 { @@ -461,12 +462,12 @@ protected: int gis_size_0; //! \brief Number of rows in the SAM matrix int sam_size_0; + +public: //! \brief QUESTION: definition? int nlem; //! \brief QUESTION: definition? int nlemt; - -public: //! \brief QUESTION: definition? dcomplex **gis; //! \brief QUESTION: definition? diff --git a/src/include/clu_subs.h b/src/include/clu_subs.h index 3a94928ff5002170ddfa4447d7df74396c6c6fa8..4c6b151797b9401a05d0d19229785f679d75ee85 100644 --- a/src/include/clu_subs.h +++ b/src/include/clu_subs.h @@ -119,6 +119,28 @@ void cms(dcomplex **am, C1 *c1, C1_AddOns *c1ao, C4 *c4, C6 *c6); */ void crsm1(double vk, double exri, C1 *c1, C1_AddOns *c1ao, C4 *c4, C6 *c6); +/*! \brief Compute the transfer vector from N2 to N1. + * + * This function computes the transfer vector going from N2 to N1, using either + * Hankel, Bessel or Bessel from origin functions. + * + * \param ihi: `int` + * \param ipamo: `int` + * \param nbl: `int` + * \param l1: `int` + * \param m1: `int` + * \param l2: `int` + * \param m2: `int` + * \param c1: `C1 *` + * \param c1ao: `C1_AddOns *` + * \param c4: `C4 *` + * \param rac3j: `double *` + */ +dcomplex ghit_d( + int ihi, int ipamo, int nbl, int l1, int m1, int l2, int m2, C1 *c1, + C1_AddOns *c1ao, C4 *c4, double *rac3j +); + /*! \brief Compute the transfer vector from N2 to N1. * * This function computes the transfer vector going from N2 to N1, using either @@ -257,6 +279,19 @@ void r3j000(int j2, int j3, C6 *c6); */ void r3jjr(int j2, int j3, int m2, int m3, C6 *c6); +/*! \brief Compute the 3j symbol for Clebsch-Gordan coefficients for JJ transitions. + * + * This function calculates the 3j(J,J2,J3;-M2-M3,M2,M3) symbol for the Clebsch-Gordan + * coefficients. See Appendix a.3.1 in Borghese, Denti & Saija (2007). + * + * \param j2: `int` + * \param j3: `int` + * \param m2: `int` + * \param m3: `int` + * \param rac3j: `double *` + */ +void r3jjr_d(int j2, int j3, int m2, int m3, double *rac3j); + /*! \brief Compute the 3j symbol for Clebsch-Gordan coefficients for JM transitions. * * This function calculates the 3j(J,J2,J3;M1,M,-M1-M) symbol for the Clebsch-Gordan diff --git a/src/libnptm/Commons.cpp b/src/libnptm/Commons.cpp index 1410bbea6305e166786d05c0fb7b5f68df753812..de9539d77362053209a9bed8b2143911335c594b 100644 --- a/src/libnptm/Commons.cpp +++ b/src/libnptm/Commons.cpp @@ -837,12 +837,16 @@ C9::C9(int ndi, int _nlem, int ndit, int _nlemt) { nlemt = _nlemt; gis = new dcomplex*[ndi]; gls = new dcomplex*[ndi]; - for (int gi = 0; gi < ndi; gi++) { - gis[gi] = new dcomplex[nlem](); - gls[gi] = new dcomplex[nlem](); + // allocate these arrays to be contiguous, C-style + gis[0] = new dcomplex[ndi*nlem](); + gls[0] = new dcomplex[ndi*nlem](); + for (int gi = 1; gi < ndi; gi++) { + gis[gi] = gis[0] + gi*nlem; + gls[gi] = gls[0] + gi*nlem; } sam = new dcomplex*[ndit]; - for (int si = 0; si < ndit; si++) sam[si] = new dcomplex[nlemt](); + sam[0] = new dcomplex[ndit*nlemt](); + for (int si = 1; si < ndit; si++) sam[si] = sam[0] + si*nlemt; } C9::C9(const C9& rhs) { @@ -852,29 +856,29 @@ C9::C9(const C9& rhs) { nlemt = rhs.nlemt; gis = new dcomplex*[gis_size_0]; gls = new dcomplex*[gis_size_0]; - for (int gi = 0; gi < gis_size_0; gi++) { - gis[gi] = new dcomplex[nlem](); - gls[gi] = new dcomplex[nlem](); - for (int gj=0; gj<nlem; gj++) { - gis[gi][gj] = rhs.gis[gi][gj]; - gls[gi][gj] = rhs.gls[gi][gj]; - } + // allocate these arrays to be contiguous, C-style + gis[0] = new dcomplex[gis_size_0*nlem](); + gls[0] = new dcomplex[gis_size_0*nlem](); + memcpy(gis[0], rhs.gis[0], gis_size_0*nlem*sizeof(dcomplex)); + memcpy(gls[0], rhs.gls[0], gis_size_0*nlem*sizeof(dcomplex)); + for (int gi = 1; gi < gis_size_0; gi++) { + gis[gi] = gis[0] + gi*nlem; + gls[gi] = gls[0] + gi*nlem; } sam = new dcomplex*[sam_size_0]; - for (int si = 0; si < sam_size_0; si++) { - sam[si] = new dcomplex[nlemt](); - for (int sj=0; sj<nlemt; sj++) sam[si][sj] = rhs.sam[si][sj]; + sam[0] = new dcomplex[sam_size_0*nlemt](); + memcpy(sam[0], rhs.sam[0], sam_size_0*nlemt*sizeof(dcomplex)); + for (int si = 1; si < sam_size_0; si++) { + sam[si] = sam[0]+si*nlemt; } } C9::~C9() { - for (int gi = gis_size_0 - 1; gi > -1; gi--) { - delete[] gis[gi]; - delete[] gls[gi]; - } + delete[] gis[0]; + delete[] gls[0]; delete[] gis; delete[] gls; - for (int si = sam_size_0 - 1; si > -1; si--) delete[] sam[si]; + delete[] sam[0]; delete[] sam; } @@ -886,16 +890,19 @@ C9::C9(const mixMPI *mpidata) { MPI_Bcast(&nlemt, 1, MPI_INT, 0, MPI_COMM_WORLD); gis = new dcomplex*[gis_size_0]; gls = new dcomplex*[gis_size_0]; - for (int gi = 0; gi < gis_size_0; gi++) { - gis[gi] = new dcomplex[nlem](); - gls[gi] = new dcomplex[nlem](); - MPI_Bcast(gis[gi], nlem, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); - MPI_Bcast(gls[gi], nlem, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); + gis[0] = new dcomplex[gis_size_0*nlem](); + gls[0] = new dcomplex[gis_size_0*nlem](); + MPI_Bcast(gis[0], gis_size_0*nlem, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); + MPI_Bcast(gls[0], gis_size_0*nlem, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); + for (int gi = 1; gi < gis_size_0; gi++) { + gis[gi] = gis[0] + gi*nlem; + gls[gi] = gls[0] + gi*nlem; } sam = new dcomplex*[sam_size_0]; - for (int si = 0; si < sam_size_0; si++) { - sam[si] = new dcomplex[nlemt](); - MPI_Bcast(sam[si], nlemt, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); + sam[0] = new dcomplex[sam_size_0*nlemt](); + MPI_Bcast(sam[0], sam_size_0*nlemt, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); + for (int si = 1; si < sam_size_0; si++) { + sam[si] = sam[0]+si*nlemt; } } @@ -904,13 +911,9 @@ void C9::mpibcast(const mixMPI *mpidata) { MPI_Bcast(&sam_size_0, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(&nlem, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(&nlemt, 1, MPI_INT, 0, MPI_COMM_WORLD); - for (int gi = 0; gi < gis_size_0; gi++) { - MPI_Bcast(gis[gi], nlem, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); - MPI_Bcast(gls[gi], nlem, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); - } - for (int si = 0; si < sam_size_0; si++) { - MPI_Bcast(sam[si], nlemt, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); - } + MPI_Bcast(gis[0], gis_size_0*nlem, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); + MPI_Bcast(gls[0], gis_size_0*nlem, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); + MPI_Bcast(sam[0], sam_size_0*nlemt, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); } #endif diff --git a/src/libnptm/clu_subs.cpp b/src/libnptm/clu_subs.cpp index 5a7e3db5c1b79121277e112d3e3a25feb0fd7d52..3760cd8c39b13b653d04a9016f8efad2da766cc8 100644 --- a/src/libnptm/clu_subs.cpp +++ b/src/libnptm/clu_subs.cpp @@ -38,6 +38,12 @@ #include "../include/clu_subs.h" #endif +#ifdef USE_NVTX +#include <nvtx3/nvToolsExt.h> +#endif + +#pragma omp requires unified_shared_memory + using namespace std; void apc( @@ -394,6 +400,7 @@ dcomplex cdtp(dcomplex z, dcomplex **am, int i, int jf, int k, int nj) { return result; } +# pragma omp begin declare target device_type(any) double cgev(int ipamo, int mu, int l, int m) { double result = 0.0; double xd = 0.0, xn = 0.0; @@ -427,6 +434,7 @@ double cgev(int ipamo, int mu, int l, int m) { } return result; } +# pragma omp end declare target void cms(dcomplex **am, C1 *c1, C1_AddOns *c1ao, C4 *c4, C6 *c6) { dcomplex dm, de, cgh, cgk; @@ -630,6 +638,220 @@ void crsm1(double vk, double exri, C1 *c1, C1_AddOns *c1ao, C4 *c4, C6 *c6) { delete[] svs; } +# pragma omp begin declare target device_type(any) +dcomplex ghit_d( + int ihi, int ipamo, int nbl, int l1, int m1, int l2, int m2, C1 *c1, + C1_AddOns *c1ao, C4 *c4, double *rac3j +) { + /* NBL identifies transfer vector going from N2 to N1; + * IHI=0 for Hankel, IHI=1 for Bessel, IHI=2 for Bessel from origin; + * depending on IHI, IPAM=0 gives H or I, IPAM= 1 gives K or L. */ + const dcomplex cc0 = 0.0 + 0.0 * I; + const dcomplex uim = 0.0 + 1.0 * I; + dcomplex csum = cc0, cfun = cc0; + dcomplex result = cc0; + + if (ihi == 2) { + if (c1->rxx[nbl - 1] == 0.0 && c1->ryy[nbl - 1] == 0.0 && c1->rzz[nbl - 1] == 0.0) { + if (ipamo == 0) { + if (l1 == l2 && m1 == m2) result = 1.0 + 0.0 * I; + } + return result; + } + } + // label 10 + int l1mp = l1 - ipamo; + int l1po = l1 + 1; + int m1mm2 = m1 - m2; + int m1mm2m = (m1mm2 > 0) ? m1mm2 + 1 : 1 - m1mm2; + int lminpo = (l2 - l1mp > 0) ? l2 - l1mp + 1 : l1mp - l2 + 1; + int lmaxpo = l2 + l1mp + 1; + int i3j0in = c1ao->ind3j[l1mp][l2 - 1]; + int ilin = -1; + if (m1mm2m > lminpo && (m1mm2m - lminpo) % 2 != 0) ilin = 0; + int isn = 1; + if (m1 % 2 != 0) isn *= -1; + if (lminpo % 2 == 0) { + isn *= -1; + if (l2 > l1mp) isn *= -1; + } + // label 12 + int nblmo = nbl - 1; + if (ihi != 2) { + int nbhj = nblmo * c4->litpo; + int nby = nblmo * c4->litpos; + if (ihi != 1) { + for (int jm24 = 1; jm24 <= 3; jm24++) { + csum = cc0; + int mu = jm24 - 2; + int mupm1 = mu + m1; + int mupm2 = mu + m2; + if (mupm1 >= -l1mp && mupm1 <= l1mp && mupm2 >= - l2 && mupm2 <= l2) { + int jsn = -isn; + if (mu == 0) jsn = isn; + double cr = cgev(ipamo, mu, l1, m1) * cgev(0, mu, l2, m2); + int i3j0 = i3j0in; + if (mupm1 == 0 && mupm2 == 0) { + int lt14 = lminpo; + while (lt14 <= lmaxpo) { + i3j0++; + int l3 = lt14 - 1; + int ny = l3 * l3 + lt14; + double aors = 1.0 * (l3 + lt14); + double f3j = (c1ao->v3j0[i3j0 - 1] * c1ao->v3j0[i3j0 - 1] * sqrt(aors)) * jsn; + cfun = (c1ao->vh[nbhj + lt14 - 1] * c1ao->vyhj[nby + ny - 1]) * f3j; + csum += cfun; + jsn *= -1; + lt14 += 2; + } + // goes to 22 + } else { // label 16 + r3jjr_d(l1mp, l2, -mupm1, mupm2, rac3j); + int il = ilin; + int lt20 = lminpo; + while (lt20 <= lmaxpo) { + i3j0++; + if (m1mm2m <= lt20) { + il += 2; + int l3 = lt20 - 1; + int ny = l3 * l3 + lt20 + m1mm2; + double aors = 1.0 * (l3 + lt20); + double f3j = (rac3j[il - 1] * c1ao->v3j0[i3j0 - 1] * sqrt(aors)) * jsn; + cfun = (c1ao->vh[nbhj + lt20 - 1] * c1ao->vyhj[nby + ny - 1]) * f3j; + csum += cfun; + } + // label 20 + jsn *= -1; + lt20 += 2; + } + } + // label 22 + csum *= cr; + result += csum; + } + // Otherwise there is nothing to add + } // jm24 loop. Should go to 70 + } else { // label 30, IHI == 1 + for (int jm44 = 1; jm44 <= 3; jm44++) { + csum = cc0; + int mu = jm44 - 2; + int mupm1 = mu + m1; + int mupm2 = mu + m2; + if (mupm1 >= -l1mp && mupm1 <= l1mp && mupm2 >= - l2 && mupm2 <= l2) { + int jsn = - isn; + if (mu == 0) jsn = isn; + double cr = cgev(ipamo, mu, l1, m1) * cgev(0, mu, l2, m2); + int i3j0 = i3j0in; + if (mupm1 == 0 && mupm2 == 0) { + int lt34 = lminpo; + while (lt34 <= lmaxpo) { + i3j0++; + int l3 = lt34 - 1; + int ny = l3 * l3 + lt34; + double aors = 1.0 * (l3 + lt34); + double f3j = (c1ao->v3j0[i3j0 - 1] * c1ao->v3j0[i3j0 - 1] * sqrt(aors)) * jsn; + cfun = (c1ao->vh[nbhj + lt34 - 1] * c1ao->vyhj[nby + ny - 1]) * f3j; + csum += cfun; + jsn *= -1; + lt34 += 2; + } + // goes to 42 + } else { // label 36 + r3jjr_d(l1mp, l2, -mupm1, mupm2, rac3j); + int il = ilin; + int lt40 = lminpo; + while (lt40 <= lmaxpo) { + i3j0++; + if (m1mm2m <= lt40) { + il += 2; + int l3 = lt40 - 1; + int ny = l3 * l3 + lt40 + m1mm2; + double aors = 1.0 * (l3 + lt40); + double f3j = (rac3j[il - 1] * c1ao->v3j0[i3j0 - 1] * sqrt(aors)) * jsn; + cfun = (c1ao->vh[nbhj + lt40 - 1] * c1ao->vyhj[nby + ny - 1]) * f3j; + csum += cfun; + } + // label 40 + jsn *= -1; + lt40 += 2; + } + } + // label 42 + csum *= cr; + result += csum; + } + // Otherwise there is nothing to add + } // jm44 loop. Should go to 70 + } + // goes to 70 + } else { // label 50, IHI == 2 + int nbhj = nblmo * c4->lmtpo; + int nby = nblmo * c4->lmtpos; + for (int jm64 = 1; jm64 <= 3; jm64++) { + csum = cc0; + int mu = jm64 - 2; + int mupm1 = mu + m1; + int mupm2 = mu + m2; + if (mupm1 >= -l1mp && mupm1 <= l1mp && mupm2 >= - l2 && mupm2 <= l2) { + int jsn = -isn; + if (mu == 0) jsn = isn; + double cr = cgev(ipamo, mu, l1, m1) * cgev(0, mu, l2, m2); + int i3j0 = i3j0in; + if (mupm1 == 0 && mupm2 == 0) { + int lt54 = lminpo; + while (lt54 <= lmaxpo) { + i3j0++; + int l3 = lt54 - 1; + int ny = l3 * l3 + lt54; + double aors = 1.0 * (l3 + lt54); + double f3j = (c1ao->v3j0[i3j0 - 1] * c1ao->v3j0[i3j0 - 1] * sqrt(aors)) * jsn; + cfun = (c1ao->vj0[nbhj + lt54 - 1] * c1ao->vyj0[nby + ny - 1]) * f3j; + csum += cfun; + jsn *= -1; + lt54 += 2; + } + // goes to 62 + } else { // label 56 + r3jjr_d(l1mp, l2, -mupm1, mupm2, rac3j); + int il = ilin; + int lt60 = lminpo; + while (lt60 <= lmaxpo) { + i3j0++; + if (m1mm2m <= lt60) { + il += 2; + int l3 = lt60 - 1; + int ny = l3 * l3 + lt60 + m1mm2; + double aors = 1.0 * (l3 + lt60); + double f3j = (rac3j[il - 1] * c1ao->v3j0[i3j0 - 1] * sqrt(aors)) * jsn; + cfun = (c1ao->vj0[nbhj + lt60 - 1] * c1ao->vyj0[nby + ny - 1]) * f3j; + csum += cfun; + } + // label 60 + jsn *= -1; + lt60 += 2; + } + } + // label 62 + csum *= cr; + result += csum; + } + // Otherwise there is nothing to add + } // jm64 loop. Should go to 70 + } + // label 70 + const double four_pi = acos(0.0) * 8.0; + if (ipamo != 1) { + double cr = sqrt(four_pi * (l1 + l1po) * (l2 + l2 + 1)); + result *= cr; + } else { + double cr = sqrt(four_pi * (l1 + l1mp) * (l1 + l1po) * (l2 + l2 + 1) / l1po); + result *= (cr * uim); + } + return result; +} +#pragma omp end declare target + +# pragma omp begin declare target device_type(any) dcomplex ghit( int ihi, int ipamo, int nbl, int l1, int m1, int l2, int m2, C1 *c1, C1_AddOns *c1ao, C4 *c4, C6 *c6 @@ -840,6 +1062,7 @@ dcomplex ghit( } return result; } +#pragma omp end declare target void hjv( double exri, double vk, int &jer, int &lcalc, dcomplex &arg, @@ -1342,6 +1565,7 @@ void r3j000(int j2, int j3, C6 *c6) { } } +#pragma omp begin declare target device_type(any) void r3jjr(int j2, int j3, int m2, int m3, C6 *c6) { int jmx = j3 + j2; int jdf = j3 - j2; @@ -1459,6 +1683,127 @@ void r3jjr(int j2, int j3, int m2, int m3, C6 *c6) { } } } +# pragma omp end declare target + +#pragma omp begin declare target device_type(any) +void r3jjr_d(int j2, int j3, int m2, int m3, double *rac3j) { + int jmx = j3 + j2; + int jdf = j3 - j2; + int m1 = -m2 - m3; + int abs_jdf = (jdf >= 0) ? jdf : -jdf; + int abs_m1 = (m1 >= 0) ? m1 : -m1; + int jmn = (abs_jdf > abs_m1) ? abs_jdf : abs_m1; + int njmo = jmx - jmn; + int jf = jmx + jmx + 1; + int isn = 1; + if ((jdf + m1) % 2 != 0) isn = -1; + if (njmo <= 0) { + double sj = 1.0 * jf; + double cnr = (1.0 / sqrt(sj)) * isn; + rac3j[0] = cnr; + } else { // label 15 + double sjt = 1.0; + double sjr = 1.0 * jf; + int jsmpos = (jmx + 1) * (jmx + 1); + int jdfs = jdf * jdf; + int m1s = m1 * m1; + int mdf = m3 - m2; + int idjc = m1 * (j3 * (j3 + 1) - j2 * (j2 +1)); + int j1 = jmx; + int j1s = j1 * j1; + int j1po = j1 + 1; + double ccj = 1.0 * (j1s - jdfs) * (j1s - m1s); + double cj = sqrt(ccj * (jsmpos - j1s)); + double dj = 1.0 * jf * (j1 * j1po * mdf + idjc); + if (njmo <= 1) { + rac3j[0] = -dj / (cj * j1po); + double sj = sjr + (rac3j[0] * rac3j[0]) * (jf - 2); + double cnr = (1.0 / sqrt(sj)) * isn; + rac3j[1] = cnr; + rac3j[0] *= cnr; + } else { // label 20 + double cjp = 0.0; + int nj = njmo + 1; + int nmat = (nj + 1) / 2; + rac3j[nj - 1] = 1.0; + rac3j[njmo - 1] = -dj / (cj * j1po); + if (nmat != njmo) { + int nbr = njmo - nmat; + for (int ibr45 = 1; ibr45 <= nbr; ibr45++) { + int irr = nj - ibr45; + jf -= 2; + j1--; + j1s = j1 * j1; + j1po = j1 + 1; + cjp = cj; + ccj = 1.0 * (j1s - jdfs) * (j1s - m1s); + cj = sqrt(ccj * (jsmpos - j1s)); + sjt = rac3j[irr - 1] * rac3j[irr - 1]; + dj = 1.0 * jf * (j1 * j1po * mdf + idjc); + rac3j[irr - 2] = -(rac3j[irr - 1] * dj + + rac3j[irr] * cjp * j1) / (cj * j1po); + sjr += (sjt * jf); + } // ibr45 loop + } + // label 50 + double osjt = sjt; + sjt = rac3j[nmat - 1] * rac3j[nmat - 1]; + if (sjt >= osjt) { + sjr += (sjt * (jf - 2)); + } else { // label 55 + nmat++; + } + // label 60 + double racmat = rac3j[nmat - 1]; + rac3j[0] = 1.0; + jf = jmn + jmn + 1; + double sjl = 1.0 * jf; + j1 = jmn; + if (j1 != 0) { + j1po = j1 + 1; + int j1pos = j1po * j1po; + double ccjp = 1.0 * (j1pos - jdfs) * (j1pos - m1s); + cjp = sqrt(ccjp * (jsmpos - j1pos)); + dj = 1.0 * jf * (j1 * j1po * mdf + idjc); + rac3j[1] = - dj / (cjp * j1); + } else { // label 62 + cjp = sqrt(1.0 * (jsmpos - 1)); + dj = 1.0 * mdf; + rac3j[1] = -dj / cjp; + } + // label 63 + int nmatmo = nmat - 1; + if (nmatmo >= 2) { + for (int irl70 = 2; irl70 <= nmatmo; irl70++) { + jf += 2; + j1++; + j1po = j1 + 1; + int j1pos = j1po * j1po; + cj = cjp; + double ccjp = 1.0 * (j1pos - jdfs) * (j1pos - m1s); + cjp = sqrt(ccjp * (jsmpos - j1pos)); + sjt = rac3j[irl70 - 1] * rac3j[irl70 - 1]; + dj = 1.0 * jf * (j1 * j1po * mdf + idjc); + rac3j[irl70] = -( + rac3j[irl70 - 1] * dj + + rac3j[irl70 - 2] * cj * j1po + ) / (cjp * j1); + sjl += (sjt * jf); + } + } + // label 75 + double ratrac = racmat / rac3j[nmat - 1]; + double rats = ratrac * ratrac; + double sj = sjr + sjl * rats; + rac3j[nmat - 1] = racmat; + double cnr = (1.0 / sqrt(sj)) * isn; + for (int irr80 = nmat; irr80 <= nj; irr80++) rac3j[irr80 - 1] *= cnr; + double cnl = cnr * ratrac; + for (int irl85 = 1; irl85 <= nmatmo; irl85++) rac3j[irl85 - 1] *= cnl; + } + } +} +# pragma omp end declare target void r3jmr(int j1, int j2, int j3, int m1, C6 *c6) { int mmx = (j2 < j3 - m1) ? j2 : j3 - m1; @@ -1752,6 +2097,9 @@ void scr2( double vk, double vkarg, double exri, double *duk, C1 *c1, C1_AddOns *c1ao, C3 *c3, C4 *c4 ) { +#ifdef USE_NVTX + nvtxRangePush("scr2 starts"); +#endif const dcomplex cc0 = 0.0 + 0.0 * I; const dcomplex uim = 0.0 + 1.0 * I; dcomplex s11, s21, s12, s22, rm, re, csam, cph, phas, cc; @@ -1765,29 +2113,77 @@ void scr2( c3->tsas[1][0] = cc0; c3->tsas[0][1] = cc0; c3->tsas[1][1] = cc0; + dcomplex *vec_rmi = c1->rmi[0]; + dcomplex *vec_rei = c1->rei[0]; + dcomplex *vec_w = c1->w[0]; +#ifdef USE_NVTX + nvtxRangePush("scr2 outer loop 1"); +#endif +#pragma omp parallel for for (int i14 = 1; i14 <= c4->nsph; i14++) { int i = i14 - 1; int iogi = c1->iog[i14 - 1]; if (iogi >= i14) { - int k = 0; - s11 = cc0; - s21 = cc0; - s12 = cc0; - s22 = cc0; - for (int l10 = 1; l10 <= ls; l10++) { - int l = l10 - 1; - rm = 1.0 / c1->rmi[l][i]; - re = 1.0 / c1->rei[l][i]; - int ltpo = l10 + l10 + 1; - for (int im10 = 1; im10 <= ltpo; im10++) { - k++; - int ke = k + c4->nlem; - s11 -= (c1->w[k - 1][2] * c1->w[k - 1][0] * rm + c1->w[ke - 1][2] * c1->w[ke - 1][0] * re); - s21 -= (c1->w[k - 1][3] * c1->w[k - 1][0] * rm + c1->w[ke - 1][3] * c1->w[ke - 1][0] * re); - s12 -= (c1->w[k - 1][2] * c1->w[k - 1][1] * rm + c1->w[ke - 1][2] * c1->w[ke - 1][1] * re); - s22 -= (c1->w[k - 1][3] * c1->w[k - 1][1] * rm + c1->w[ke - 1][3] * c1->w[ke - 1][1] * re); - } // im10 loop - } // l10 loop + // int k = 0; + dcomplex s11 = cc0; + dcomplex s21 = cc0; + dcomplex s12 = cc0; + dcomplex s22 = cc0; + // To parallelise, I run a linearised loop directly over k + // working out the algebra, it turns out that + // k = l10*l10-1+im10 + // we invert this to find + // l10 = (int) sqrt(k+1) and im10 = k - l10*10+1 + // but if it results im = 0, then we set l10 = l10-1 and im10 = 2*l10+1 + // furthermore if it results im10 > 2*l10+1, then we set + // im10 = im10 -(2*l10+1) and l10 = l10+1 (there was a rounding error in a nearly exact root) + int kmax = (ls+1)*(ls+1)-1; +#ifdef USE_NVTX + nvtxRangePush("scr2 inner loop 1"); +#endif +#pragma omp target teams distribute parallel for simd reduction(-:s11, s21, s12, s22) + for (int k = 1; k<=kmax; k++) { + int l10 = (int) sqrt(k+1); + int im10 = k - (l10*l10) + 1; + if (im10 == 0) { + l10--; + im10 = 2*l10+1; + } + else if (im10 > 2*l10 + 1) { + im10 -= 2*l10 + 1; + l10++; + } + // I have all the indices in my linearised loop + int l = l10 - 1; + int ke = k + c4->nlem; + int km1t4 = (k - 1)*4; + int kem1t4 = (ke - 1)*4; + dcomplex auxrm0 = vec_w[km1t4] / vec_rmi[l*c1->nsph+i]; + dcomplex auxrm1 = vec_w[km1t4+1] / vec_rmi[l*c1->nsph+i]; + dcomplex auxre0 = vec_w[kem1t4] / vec_rei[l*c1->nsph+i]; + dcomplex auxre1 = vec_w[kem1t4+1] / vec_rei[l*c1->nsph+i]; + s11 -= vec_w[km1t4+2] * auxrm0 + vec_w[kem1t4+2] * auxre0; + s21 -= vec_w[km1t4+3] * auxrm0 + vec_w[kem1t4+3] * auxre0; + s12 -= vec_w[km1t4+2] * auxrm1 + vec_w[kem1t4+2] * auxre1; + s22 -= vec_w[km1t4+3] * auxrm1 + vec_w[kem1t4+3] * auxre1; + } +#ifdef USE_NVTX + nvtxRangePop(); +#endif + // for (int l10 = 1; l10 <= ls; l10++) { + // int l = l10 - 1; + // rm = 1.0 / c1->rmi[l][i]; + // re = 1.0 / c1->rei[l][i]; + // int ltpo = l10 + l10 + 1; + // for (int im10 = 1; im10 <= ltpo; im10++) { + // k++; + // int ke = k + c4->nlem; + // s11 -= (c1->w[k - 1][2] * c1->w[k - 1][0] * rm + c1->w[ke - 1][2] * c1->w[ke - 1][0] * re); + // s21 -= (c1->w[k - 1][3] * c1->w[k - 1][0] * rm + c1->w[ke - 1][3] * c1->w[ke - 1][0] * re); + // s12 -= (c1->w[k - 1][2] * c1->w[k - 1][1] * rm + c1->w[ke - 1][2] * c1->w[ke - 1][1] * re); + // s22 -= (c1->w[k - 1][3] * c1->w[k - 1][1] * rm + c1->w[ke - 1][3] * c1->w[ke - 1][1] * re); + // } // im10 loop + // } // l10 loop c1->sas[i][0][0] = s11 * csam; c1->sas[i][1][0] = s21 * csam; c1->sas[i][0][1] = s12 * csam; @@ -1800,35 +2196,65 @@ void scr2( c3->tsas[0][1] += (c1->sas[iogi - 1][0][1] * phas); c3->tsas[1][1] += (c1->sas[iogi - 1][1][1] * phas); } // i14 loop +#ifdef USE_NVTX + nvtxRangePop(); +#endif + dcomplex *vec_vints = c1->vints[0]; +#ifdef USE_NVTX + nvtxRangePush("scr2 outer loop 2"); +#endif +#pragma omp parallel for for (int i24 = 1; i24 <= c4->nsph; i24++) { int iogi = c1->iog[i24 - 1]; if (iogi >= i24) { - int j = 0; + // int j = 0; +#ifdef USE_NVTX + nvtxRangePush("scr2 inner loop 2"); +#endif +#pragma omp target parallel for collapse(4) for (int ipo1 = 1; ipo1 <=2; ipo1++) { for (int jpo1 = 1; jpo1 <= 2; jpo1++) { - cc = dconjg(c1->sas[i24 - 1][jpo1 - 1][ipo1 - 1]); + // cc = dconjg(c1->sas[i24 - 1][jpo1 - 1][ipo1 - 1]); for (int ipo2 = 1; ipo2 <= 2; ipo2++) { for (int jpo2 = 1; jpo2 <= 2; jpo2++) { - j++; - c1->vints[i24 - 1][j - 1] = c1->sas[i24 - 1][jpo2 - 1][ipo2 - 1] * cc * cfsq; + int j = jpo2 + (ipo2-1)*2 + (jpo1-1)*4 + (ipo1-1)*8; + // j++; + vec_vints[(i24 - 1)*16+j - 1] = c1->sas[i24 - 1][jpo2 - 1][ipo2 - 1] * dconjg(c1->sas[i24 - 1][jpo1 - 1][ipo1 - 1]) * cfsq; } // jpo2 loop } // ipo2 loop } // jpo1 loop } // ipo1 loop +#ifdef USE_NVTX + nvtxRangePop(); +#endif } } // i24 loop - int j = 0; +#ifdef USE_NVTX + nvtxRangePop(); +#endif + // int j = 0; +#ifdef USE_NVTX + nvtxRangePush("scr2 loop 3"); +#endif +#pragma omp target parallel for collapse(4) for (int ipo1 = 1; ipo1 <=2; ipo1++) { for (int jpo1 = 1; jpo1 <= 2; jpo1++) { - cc = dconjg(c3->tsas[jpo1 - 1][ipo1 - 1]); + // cc = dconjg(c3->tsas[jpo1 - 1][ipo1 - 1]); for (int ipo2 = 1; ipo2 <= 2; ipo2++) { for (int jpo2 = 1; jpo2 <= 2; jpo2++) { - j++; - c1ao->vintt[j - 1] = c3->tsas[jpo2 - 1][ipo2 - 1] * cc * cfsq; + // j++; + int j = jpo2-1 + (ipo2-1)*2 + (jpo1-1)*4 + (ipo1-1)*8; + c1ao->vintt[j] = c3->tsas[jpo2 - 1][ipo2 - 1] * dconjg(c3->tsas[jpo1 - 1][ipo1 - 1]) * cfsq; } // jpo2 loop } // ipo2 loop } // jpo1 loop } // ipo1 loop +#ifdef USE_NVTX + nvtxRangePop(); +#endif +#ifdef USE_NVTX + nvtxRangePop(); +#endif } void str(ScattererConfiguration *sconf, C1 *c1, C1_AddOns *c1ao, C3 *c3, C4 *c4, C6 *c6) { @@ -1922,79 +2348,133 @@ void tqr( void ztm(dcomplex **am, C1 *c1, C1_AddOns *c1ao, C4 *c4, C6 *c6, C9 * c9) { dcomplex gie, gle, a1, a2, a3, a4, sum1, sum2, sum3, sum4; const dcomplex cc0 = 0.0 + 0.0 * I; - int ndi = c4->nsph * c4->nlim; - int i2 = 0; - for (int n2 = 1; n2 <= c4->nsph; n2++) { // GPU portable? - for (int l2 = 1; l2 <= c4->li; l2++) { - int l2tpo = l2 + l2 + 1; - int m2 = -l2 - 1; - for (int im2 = 1; im2 <= l2tpo; im2++) { - m2++; - i2++; - int i3 = 0; + const int ndi = c4->nsph * c4->nlim; + np_int ndit = 2*ndi; + // int i2 = 0; // old implementation +#ifdef USE_NVTX + nvtxRangePush("ZTM starts"); +#endif +#ifdef USE_NVTX + nvtxRangePush("ZTM parallel loop 1"); +#endif + // C9 *c9_para = new C9(*c9); + dcomplex *gis_v = c9->gis[0]; + dcomplex *gls_v = c9->gls[0]; +#pragma omp target parallel + { + double *rac3j_local = (double *) malloc(c6->lmtpo*sizeof(double)); +#pragma omp for collapse(3) + for (int n2 = 1; n2 <= c4->nsph; n2++) { // GPU portable? + for (int l2 = 1; l2 <= c4->li; l2++) { for (int l3 = 1; l3 <= c4->le; l3++) { + int l2tpo = l2 + l2 + 1; int l3tpo = l3 + l3 + 1; - int m3 = -l3 - 1; - for (int im3 = 1; im3 <= l3tpo; im3++) { - m3++; - i3++; - c9->gis[i2 - 1][i3 - 1] = ghit(2, 0, n2, l2, m2, l3, m3, c1, c1ao, c4, c6); - c9->gls[i2 - 1][i3 - 1] = ghit(2, 1, n2, l2, m2, l3, m3, c1, c1ao, c4, c6); - } // im3 loop - } // l3 loop - } // im2 loop - } // l2 loop - } // n2 loop + for (int im2 = 1; im2 <= l2tpo; im2++) { + int i2 = (n2-1) * c4->li * (c4->li + 2) + l2 * l2 + im2 - 1; + // int vecindex0 = (i2 - 1)*c9->nlem; + int m2 = -l2 - 1 + im2; + // i2++; // old implementation + // int i3 = 0; // old implementation + //#pragma omp for + for (int im3 = 1; im3 <= l3tpo; im3++) { + int i3 = l3 * l3 + im3 - 1; + int m3 = -l3 - 1 + im3; + int vecindex = (i2 - 1)*c9->nlem + i3 - 1; + // i3++; // old implementation + gis_v[vecindex] = ghit_d(2, 0, n2, l2, m2, l3, m3, c1, c1ao, c4, rac3j_local); + gls_v[vecindex] = ghit_d(2, 1, n2, l2, m2, l3, m3, c1, c1ao, c4, rac3j_local); + // c9->gis[i2 - 1][i3 - 1] = ghit(2, 0, n2, l2, m2, l3, m3, c1, c1ao, c4, c6_local); + // c9->gls[i2 - 1][i3 - 1] = ghit(2, 1, n2, l2, m2, l3, m3, c1, c1ao, c4, c6_local); + } // im3 loop + } // l3 loop + } // im2 loop + } // l2 loop + } // n2 loop + free(rac3j_local); + } // pragma omp parallel +#ifdef USE_NVTX + nvtxRangePop(); +#endif +#ifdef USE_NVTX + nvtxRangePush("ZTM loop 2"); +#endif + dcomplex *am_v = am[0]; + dcomplex *sam_v = c9->sam[0]; +# pragma omp target teams distribute parallel for collapse(2) for (int i1 = 1; i1 <= ndi; i1++) { // GPU portable? - int i1e = i1 + ndi; for (int i3 = 1; i3 <= c4->nlem; i3++) { + dcomplex sum1 = cc0; + dcomplex sum2 = cc0; + dcomplex sum3 = cc0; + dcomplex sum4 = cc0; + int i1e = i1 + ndi; int i3e = i3 + c4->nlem; - sum1 = cc0; - sum2 = cc0; - sum3 = cc0; - sum4 = cc0; for (int i2 = 1; i2 <= ndi; i2++) { int i2e = i2 + ndi; - gie = c9->gis[i2 - 1][i3 - 1]; - gle = c9->gls[i2 - 1][i3 - 1]; - a1 = am[i1 - 1][i2 - 1]; - a2 = am[i1 - 1][i2e - 1]; - a3 = am[i1e - 1][i2 - 1]; - a4 = am[i1e - 1][i2e - 1]; + int vecindg_23 = (i2 - 1)*c9->nlem + i3 - 1; + dcomplex gie = gis_v[vecindg_23]; + dcomplex gle = gls_v[vecindg_23]; + np_int vecinda_1 = (i1 - 1)*ndit; + np_int vecinda_1e = (i1 - 1 + ndi)*ndit; + dcomplex a1 = am_v[vecinda_1 + i2 - 1]; + dcomplex a2 = am_v[vecinda_1 + i2e - 1]; + dcomplex a3 = am_v[vecinda_1e + i2 - 1]; + dcomplex a4 = am_v[vecinda_1e + i2e - 1]; sum1 += (a1 * gie + a2 * gle); sum2 += (a1 * gle + a2 * gie); sum3 += (a3 * gie + a4 * gle); sum4 += (a3 * gle + a4 * gie); } // i2 loop - c9->sam[i1 - 1][i3 - 1] = sum1; - c9->sam[i1 - 1][i3e - 1] = sum2; - c9->sam[i1e - 1][i3 - 1] = sum3; - c9->sam[i1e - 1][i3e - 1] = sum4; + int vecind1 = (i1 - 1)*c9->nlemt; + int vecind1e = (i1e - 1)*c9->nlemt; + sam_v[vecind1 + i3 - 1] = sum1; + sam_v[vecind1 + i3e - 1] = sum2; + sam_v[vecind1e + i3 - 1] = sum3; + sam_v[vecind1e + i3e - 1] = sum4; } // i3 loop } // i1 loop +# pragma omp parallel for collapse(2) for (int i1 = 1; i1 <= ndi; i1++) { for (int i0 = 1; i0 <= c4->nlem; i0++) { - c9->gis[i1 - 1][i0 - 1] = dconjg(c9->gis[i1 - 1][i0 - 1]); - c9->gls[i1 - 1][i0 - 1] = dconjg(c9->gls[i1 - 1][i0 - 1]); + int vecindex = (i1 - 1)*c9->nlem + i0 - 1; + gis_v[vecindex] = dconjg(gis_v[vecindex]); + gls_v[vecindex] = dconjg(gls_v[vecindex]); + // c9->gis[i1 - 1][i0 - 1] = dconjg(c9->gis[i1 - 1][i0 - 1]); + // c9->gls[i1 - 1][i0 - 1] = dconjg(c9->gls[i1 - 1][i0 - 1]); } // i0 loop } // i1 loop int nlemt = c4->nlem + c4->nlem; + dcomplex *am0m_v = c1ao->am0m[0]; +# pragma omp target parallel for collapse(2) for (int i0 = 1; i0 <= c4->nlem; i0++) { - int i0e = i0 + c4->nlem; for (int i3 = 1; i3 <= nlemt; i3++) { - sum1 = cc0; - sum2 = cc0; + int i0e = i0 + c4->nlem; + dcomplex sum1 = cc0; + dcomplex sum2 = cc0; for (int i1 = 1; i1 <= ndi; i1 ++) { int i1e = i1 + ndi; - a1 = c9->sam[i1 - 1][i3 - 1]; - a2 = c9->sam[i1e - 1][i3 - 1]; - gie = c9->gis[i1 - 1][i0 - 1]; - gle = c9->gls[i1 - 1][i0 - 1]; + int vecind1 = (i1 - 1)*c9->nlemt; + int vecind1e = (i1e - 1)*c9->nlemt; + a1 = sam_v[vecind1 + i3 - 1]; + a2 = sam_v[vecind1e + i3 - 1]; + int vecindex = (i1 - 1)*c9->nlem + i0 - 1; + gie = gis_v[vecindex]; + gle = gls_v[vecindex]; sum1 += (a1 * gie + a2 * gle); sum2 += (a1 * gle + a2 * gie); } // i1 loop - c1ao->am0m[i0 - 1][i3 - 1] = -sum1; - c1ao->am0m[i0e - 1][i3 - 1] = -sum2; + int vecind0 = (i0 - 1)*nlemt; + int vecind0e = (i0e - 1)*nlemt; + am0m_v[vecind0 + i3 - 1] = -sum1; + am0m_v[vecind0e + i3 - 1] = -sum2; + // c1ao->am0m[i0 - 1][i3 - 1] = -sum1; + // c1ao->am0m[i0e - 1][i3 - 1] = -sum2; } // i3 loop } // i0 loop +#ifdef USE_NVTX + nvtxRangePop(); +#endif +#ifdef USE_NVTX + nvtxRangePop(); +#endif } diff --git a/src/libnptm/sph_subs.cpp b/src/libnptm/sph_subs.cpp index 208f4bd3d4013c5a6ee44f811ab94164912be4d4..267a7c4573f6546a9ea3b16e435308958fd9ed47 100644 --- a/src/libnptm/sph_subs.cpp +++ b/src/libnptm/sph_subs.cpp @@ -196,11 +196,13 @@ double cg1(int lmpml, int mu, int l, int m) { return result; } +# pragma omp begin declare target device_type(any) dcomplex dconjg(dcomplex z) { double zreal = real(z); double zimag = imag(z); return (zreal - zimag * I); } +# pragma omp end declare target void diel(int npntmo, int ns, int i, int ic, double vk, C1 *c1, C2 *c2) { const double dif = c1->rc[i - 1][ns] - c1->rc[i - 1][ns - 1]; diff --git a/src/libnptm/types.cpp b/src/libnptm/types.cpp index 2f17bc9627125eae563ffea7131f381ed5b80083..d79f10b2858014928a8831a52b24b5325c7ba0ca 100644 --- a/src/libnptm/types.cpp +++ b/src/libnptm/types.cpp @@ -21,6 +21,10 @@ #include "../include/types.h" #endif +# pragma omp begin declare target device_type(any) double real(dcomplex z) { return __real__ z; } +# pragma omp end declare target +# pragma omp begin declare target device_type(any) double imag(dcomplex z) { return __imag__ z; } +# pragma omp end declare target diff --git a/src/make.inc b/src/make.inc index 57499cbead9efdd7acd6ad3658982013e8b69df9..6b9c27f19f04a3ada4914cb5c10ab51a29122c28 100644 --- a/src/make.inc +++ b/src/make.inc @@ -124,7 +124,8 @@ endif # CXXFLAGS defines the default compilation options for the C++ compiler ifndef CXXFLAGS - override CXXFLAGS=-O3 -ggdb -pg -coverage -Wno-format-contains-nul -I$(HDF5_INCLUDE) $(MPI_CXXFLAGS) $(NVTX_CXXFLAGS) +# override CXXFLAGS=-O3 -ggdb -pg -coverage -Wno-format-contains-nul -I$(HDF5_INCLUDE) $(MPI_CXXFLAGS) $(NVTX_CXXFLAGS) + override CXXFLAGS=-O3 -ggdb -Wno-format-contains-nul -foffload=default -foffload=nvptx-none="-O3 -ggdb -fopt-info -lm -latomic -mgomp" -I$(HDF5_INCLUDE) $(MPI_CXXFLAGS) $(NVTX_CXXFLAGS) ifdef USE_OPENMP override CXXFLAGS+= -fopenmp # closes USE_OPENMP