diff --git a/src/cluster/cluster.cpp b/src/cluster/cluster.cpp index 17694a5068372b8921e94fdb75474fe232e730a1..883d31be834a6ee678e523a8b62cff051ee7859b 100644 --- a/src/cluster/cluster.cpp +++ b/src/cluster/cluster.cpp @@ -298,7 +298,15 @@ void cluster(const string& config_file, const string& data_file, const string& o #ifdef USE_NVTX nvtxRangePush("First iteration"); #endif - int jer = cluster_jxi488_cycle(jxi488, sconf, gconf, p_scattering_angles, cid, p_output, output_path, vtppoanp); + // use these pragmas, which should have no effect on parallelism, just to push OMP nested levels at the same level also in the first wavelength iteration + int jer = 0; +#pragma omp parallel + { +#pragma omp single + { + jer = cluster_jxi488_cycle(jxi488, sconf, gconf, p_scattering_angles, cid, p_output, output_path, vtppoanp); + } + } #ifdef USE_NVTX nvtxRangePop(); #endif diff --git a/src/libnptm/Commons.cpp b/src/libnptm/Commons.cpp index de9539d77362053209a9bed8b2143911335c594b..bc410a205991fa030bcb026df8a14e1ff3c1121b 100644 --- a/src/libnptm/Commons.cpp +++ b/src/libnptm/Commons.cpp @@ -95,10 +95,13 @@ C1::C1(GeometryConfiguration *gconf, ScattererConfiguration *sconf) { } sas = new dcomplex**[nsph]; - for (int si = 0; si < nsph; si++) { - sas[si] = new dcomplex*[2]; - sas[si][0] = new dcomplex[2](); - sas[si][1] = new dcomplex[2](); + sas[0] = new dcomplex*[nsph*2]; + sas[0][0] = new dcomplex[nsph*2*2]; + sas[0][1] = sas[0][0]+2; + for (int si = 1; si < nsph; si++) { + sas[si] = sas[0]+2*si; + sas[si][0] = sas[0][0]+4*si; + sas[si][1] = sas[si][0]+2; } } @@ -162,10 +165,16 @@ C1::C1(const C1& rhs) { ros = new double[nsph](); sas = new dcomplex**[nsph]; + sas[0] = new dcomplex*[nsph*2]; + sas[0][0] = new dcomplex[nsph*2*2]; + sas[0][1] = sas[0][0]+2; + for (int si = 1; si < nsph; si++) { + sas[si] = sas[0]+2*si; + sas[si][0] = sas[0][0]+4*si; + sas[si][1] = sas[si][0]+2; + } for (int si = 0; si < nsph; si++) { - sas[si] = new dcomplex*[2]; for (int sj=0; sj<2; sj++) { - 
sas[si][sj] = new dcomplex[2](); for (int sk=0; sk<2; sk++) sas[si][sj][sk] = rhs.sas[si][sj][sk]; } fsas[si] = rhs.fsas[si]; @@ -252,10 +261,16 @@ C1::C1(const mixMPI *mpidata) { MPI_Bcast(rzz, nsph, MPI_DOUBLE, 0, MPI_COMM_WORLD); MPI_Bcast(ros, configurations, MPI_DOUBLE, 0, MPI_COMM_WORLD); sas = new dcomplex**[nsph]; + sas[0] = new dcomplex*[nsph*2]; + sas[0][0] = new dcomplex[nsph*2*2]; + sas[0][1] = sas[0][0]+2; + for (int si = 1; si < nsph; si++) { + sas[si] = sas[0]+2*si; + sas[si][0] = sas[0][0]+4*si; + sas[si][1] = sas[si][0]+2; + } for (int si = 0; si < nsph; si++) { - sas[si] = new dcomplex*[2]; for (int sj=0; sj<2; sj++) { - sas[si][sj] = new dcomplex[2](); MPI_Bcast(sas[si][sj], 2, MPI_C_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD); } } @@ -311,11 +326,8 @@ C1::~C1() { delete[] vint; delete[] vec_vints; delete[] vints; - for (int si = nsph - 1; si > -1; si--) { - delete[] sas[si][1]; - delete[] sas[si][0]; - delete[] sas[si]; - } + delete[] sas[0][0]; + delete[] sas[0]; delete[] sas; delete[] fsas; delete[] sscs; diff --git a/src/libnptm/clu_subs.cpp b/src/libnptm/clu_subs.cpp index de8a411166078c670236fbe35940380bcf0179d6..7d66bb6e3e962b9f2c2c003c113834a22489dcd8 100644 --- a/src/libnptm/clu_subs.cpp +++ b/src/libnptm/clu_subs.cpp @@ -42,6 +42,10 @@ #include <nvtx3/nvToolsExt.h> #endif +#ifdef _OPENMP +#include <omp.h> +#endif + #pragma omp requires unified_shared_memory using namespace std; @@ -1905,7 +1909,7 @@ void raba( dcomplex **tqcpe, double **tqcs, dcomplex **tqcps ) { dcomplex **a, **ctqce, **ctqcs; - dcomplex acw, acwp, aca, acap, c1, c2, c3; + // dcomplex c3; const dcomplex cc0 = 0.0 + 0.0 * I; const dcomplex uim = 0.0 + 1.0 * I; const double sq2i = 1.0 / sqrt(2.0); @@ -1914,27 +1918,37 @@ void raba( a = new dcomplex*[nlemt]; ctqce = new dcomplex*[2]; ctqcs = new dcomplex*[2]; - for (int ai = 0; ai < nlemt; ai++) a[ai] = new dcomplex[2](); - for (int ci = 0; ci < 2; ci++) { - ctqce[ci] = new dcomplex[3](); - ctqcs[ci] = new dcomplex[3](); - } + 
a[0] = new dcomplex[nlemt*2](); + dcomplex *vec_a = a[0]; + ctqce[0] = new dcomplex[6](); + ctqcs[0] = new dcomplex[6](); + for (int ai = 1; ai < nlemt; ai++) a[ai] = a[0]+ai*2; + ctqce[1] = ctqce[0]+3; + ctqcs[1] = ctqcs[0]+3; + dcomplex *vec_am0m = am0m[0]; +#pragma omp parallel for for (int i20 = 1; i20 <= nlemt; i20++) { int i = i20 - 1; - c1 = cc0; - c2 = cc0; + dcomplex c1 = cc0; + dcomplex c2 = cc0; +#pragma omp target teams distribute parallel for simd reduction(+:c1, c2) for (int j10 = 1; j10 <= nlemt; j10++) { int j = j10 - 1; - c1 += (am0m[i][j] * w[j][0]); - c2 += (am0m[i][j] * w[j][1]); + c1 += (vec_am0m[i*nlemt+j] * w[j][0]); + c2 += (vec_am0m[i*nlemt+j] * w[j][1]); + // c1 += (am0m[i][j] * w[j][0]); + // c2 += (am0m[i][j] * w[j][1]); } // j10 loop - a[i][0] = c1; - a[i][1] = c2; + vec_a[2*i] = c1; + vec_a[2*i+1] = c2; + // a[i][0] = c1; + // a[i][1] = c2; } //i20 loop int jpo = 2; for (int ipo70 = 1; ipo70 <= 2; ipo70++) { if (ipo70 == 2) jpo = 1; int ipo = ipo70 - 1; + int jpo = 1 - ipo; ctqce[ipo][0] = cc0; ctqce[ipo][1] = cc0; ctqce[ipo][2] = cc0; @@ -1947,66 +1961,94 @@ void raba( tqcps[ipo][0] = cc0; tqcps[ipo][1] = cc0; tqcps[ipo][2] = cc0; - for (int l60 = 1; l60 <= le; l60 ++) { + dcomplex &ctqce0 = ctqce[ipo][0]; + dcomplex &ctqce1 = ctqce[ipo][1]; + dcomplex &ctqce2 = ctqce[ipo][2]; + dcomplex &tqcpe0 = tqcpe[ipo][0]; + dcomplex &tqcpe1 = tqcpe[ipo][1]; + dcomplex &tqcpe2 = tqcpe[ipo][2]; + dcomplex &ctqcs0 = ctqcs[ipo][0]; + dcomplex &ctqcs1 = ctqcs[ipo][1]; + dcomplex &ctqcs2 = ctqcs[ipo][2]; + dcomplex &tqcps0 = tqcps[ipo][0]; + dcomplex &tqcps1 = tqcps[ipo][1]; + dcomplex &tqcps2 = tqcps[ipo][2]; + int kmax = le*(le+2); + // for efficiency I should also linearise array w, but I cannot easily since I do not know for sure its major dimension (changes to containing class needed) +#pragma omp target teams distribute parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2) 
+ for (int k = 1; k<=kmax; k++) { + int l60 = (int) sqrt(k+1); + int im60 = k - (l60*l60) + 1; + if (im60 == 0) { + l60--; + im60 = 2*l60+1; + } + else if (im60 > 2*l60 + 1) { + im60 -= 2*l60 + 1; + l60++; + } int lpo = l60 + 1; int il = l60 * lpo; - int ltpo = l60 + lpo; - for (int im60 = 1; im60 <= ltpo; im60++) { - int m = im60 - lpo; - int i = m + il; - int ie = i + nlem; - int mmmu = m + 1; - int mmmmu = (mmmu > 0) ? mmmu : -mmmu; - double rmu = 0.0; + // int ltpo = l60 + lpo; + int m = im60 - lpo; + int i = m + il; + int ie = i + nlem; + int mmmu = m + 1; + int mmmmu = (mmmu > 0) ? mmmu : -mmmu; + double rmu = 0.0; + dcomplex acw; + dcomplex acwp; + dcomplex aca; + dcomplex acap; if (mmmmu <= l60) { int immu = mmmu + il; int immue = immu + nlem; rmu = -sqrt(1.0 * (l60 + mmmu) * (l60 - m)) * sq2i; - acw = dconjg(a[i - 1][ipo]) * w[immu - 1][ipo] + dconjg(a[ie - 1][ipo]) * w[immue - 1][ipo]; - acwp = dconjg(a[i - 1][ipo]) * w[immu - 1][jpo - 1] + dconjg(a[ie - 1][ipo]) * w[immue - 1][jpo - 1]; - aca = dconjg(a[i - 1][ipo]) * a[immu - 1][ipo] + dconjg(a[ie - 1][ipo]) * a[immue - 1][ipo]; - acap = dconjg(a[i - 1][ipo]) * a[immu - 1][jpo - 1] + dconjg(a[ie - 1][ipo]) * a[immue - 1][jpo - 1]; - ctqce[ipo][0] += (acw * rmu); - tqcpe[ipo][0] += (acwp * rmu); - ctqcs[ipo][0] += (aca * rmu); - tqcps[ipo][0] += (acap * rmu); + acw = dconjg(vec_a[2*(i - 1)+ipo]) * w[immu - 1][ipo] + dconjg(vec_a[2*(ie - 1)+ipo]) * w[immue - 1][ipo]; + acwp = dconjg(vec_a[2*(i - 1)+ipo]) * w[immu - 1][jpo] + dconjg(vec_a[2*(ie - 1)+ipo]) * w[immue - 1][jpo]; + aca = dconjg(vec_a[2*(i - 1)+ipo]) * vec_a[2*(immu - 1)+ipo] + dconjg(vec_a[2*(ie - 1)+ipo]) * vec_a[2*(immue - 1)+ipo]; + acap = dconjg(vec_a[2*(i - 1)+ipo]) * vec_a[2*(immu - 1)+jpo] + dconjg(vec_a[2*(ie - 1)+ipo]) * vec_a[2*(immue - 1)+jpo]; + ctqce0 += (acw * rmu); + tqcpe0 += (acwp * rmu); + ctqcs0 += (aca * rmu); + tqcps0 += (acap * rmu); } // label 30 rmu = -1.0 * m; - acw = dconjg(a[i - 1][ipo]) * w[i - 1][ipo] + dconjg(a[ie 
- 1][ipo]) * w[ie - 1][ipo]; - acwp = dconjg(a[i - 1][ipo]) * w[i - 1][jpo - 1] + dconjg(a[ie - 1][ipo]) * w[ie - 1][jpo - 1]; - aca = dconjg(a[i - 1][ipo]) * a[i - 1][ipo] + dconjg(a[ie - 1][ipo]) * a[ie - 1][ipo]; - acap = dconjg(a[i - 1][ipo]) * a[i - 1][jpo - 1] + dconjg(a[ie - 1][ipo]) * a[ie - 1][jpo - 1]; - ctqce[ipo][1] += (acw * rmu); - tqcpe[ipo][1] += (acwp * rmu); - ctqcs[ipo][1] += (aca * rmu); - tqcps[ipo][1] += (acap * rmu); + acw = dconjg(vec_a[2*(i - 1)+ipo]) * w[i - 1][ipo] + dconjg(vec_a[2*(ie - 1)+ipo]) * w[ie - 1][ipo]; + acwp = dconjg(vec_a[2*(i - 1)+ipo]) * w[i - 1][jpo] + dconjg(vec_a[2*(ie - 1)+ipo]) * w[ie - 1][jpo]; + aca = dconjg(vec_a[2*(i - 1)+ipo]) * vec_a[2*(i - 1)+ipo] + dconjg(vec_a[2*(ie - 1)+ipo]) * vec_a[2*(ie - 1)+ipo]; + acap = dconjg(vec_a[2*(i - 1)+ipo]) * vec_a[2*(i - 1)+jpo] + dconjg(vec_a[2*(ie - 1)+ipo]) * vec_a[2*(ie - 1)+jpo]; + ctqce1 += (acw * rmu); + tqcpe1 += (acwp * rmu); + ctqcs1 += (aca * rmu); + tqcps1 += (acap * rmu); + mmmu = m - 1; + mmmmu = (mmmu > 0) ?
mmmu : -mmmu; if (mmmmu <= l60) { int immu = mmmu + il; int immue = immu + nlem; rmu = sqrt(1.0 * (l60 - mmmu) * (l60 + m)) * sq2i; - acw = dconjg(a[i - 1][ipo]) * w[immu - 1][ipo] + dconjg(a[ie - 1][ipo]) * w[immue - 1][ipo]; - acwp = dconjg(a[i - 1][ipo]) * w[immu - 1][jpo - 1] + dconjg(a[ie - 1][ipo]) * w[immue - 1][jpo - 1]; - aca = dconjg(a[i - 1][ipo]) * a[immu - 1][ipo] + dconjg(a[ie - 1][ipo]) * a[immue - 1][ipo]; - acap = dconjg(a[i - 1][ipo]) * a[immu - 1][jpo - 1] + dconjg(a[ie - 1][ipo]) * a[immue - 1][jpo - 1]; - ctqce[ipo][2] += (acw * rmu); - tqcpe[ipo][2] += (acwp * rmu); - ctqcs[ipo][2] += (aca * rmu); - tqcps[ipo][2] += (acap * rmu); - } // ends im60 loop - } // im60 loop - } // l60 loop + acw = dconjg(vec_a[2*(i - 1)+ipo]) * w[immu - 1][ipo] + dconjg(vec_a[2*(ie - 1)+ipo]) * w[immue - 1][ipo]; + acwp = dconjg(vec_a[2*(i - 1)+ipo]) * w[immu - 1][jpo] + dconjg(vec_a[2*(ie - 1)+ipo]) * w[immue - 1][jpo]; + aca = dconjg(vec_a[2*(i - 1)+ipo]) * vec_a[2*(immu - 1)+ipo] + dconjg(vec_a[2*(ie - 1)+ipo]) * vec_a[2*(immue - 1)+ipo]; + acap = dconjg(vec_a[2*(i - 1)+ipo]) * vec_a[2*(immu - 1)+jpo] + dconjg(vec_a[2*(ie - 1)+ipo]) * vec_a[2*(immue - 1)+jpo]; + ctqce2 += (acw * rmu); + tqcpe2 += (acwp * rmu); + ctqcs2 += (aca * rmu); + tqcps2 += (acap * rmu); + } // ends if clause + } // k loop (previously the l60 and im60 loops } // ipo70 loop +#pragma omp teams distribute parallel for simd for (int ipo78 = 1; ipo78 <= 2; ipo78++) { int ipo = ipo78 - 1; tqce[ipo][0] = real(ctqce[ipo][0] - ctqce[ipo][2]) * sq2i; tqce[ipo][1] = real((ctqce[ipo][0] + ctqce[ipo][2]) * uim) * sq2i; tqce[ipo][2] = real(ctqce[ipo][1]); - c1 = tqcpe[ipo][0]; - c2 = tqcpe[ipo][1]; - c3 = tqcpe[ipo][2]; + dcomplex c1 = tqcpe[ipo][0]; + dcomplex c2 = tqcpe[ipo][1]; + dcomplex c3 = tqcpe[ipo][2]; tqcpe[ipo][0] = (c1 - c3) * sq2i; tqcpe[ipo][1] = (c1 + c3) * (uim * sq2i); tqcpe[ipo][2] = c2; @@ -2020,17 +2062,296 @@ void raba( tqcps[ipo][1] = -(c1 + c3) * (uim * sq2i); tqcps[ipo][2] = -c2; 
} // ipo78 loop - // Clean memory - for (int ai = 0; ai < nlemt; ai++) delete[] a[ai]; - for (int ci = 0; ci < 2; ci++) { - delete[] ctqce[ci]; - delete[] ctqcs[ci]; - } + delete[] a[0]; + delete[] ctqce[0]; + delete[] ctqcs[0]; delete[] a; delete[] ctqce; delete[] ctqcs; } +// void raba_new( +// int le, dcomplex **am0m, dcomplex **w, double **tqce, +// dcomplex **tqcpe, double **tqcs, dcomplex **tqcps +// ) { +// dcomplex **a, **ctqce, **ctqcs; +// dcomplex acw, acwp, aca, acap, c1, c2, c3; +// const dcomplex cc0 = 0.0 + 0.0 * I; +// const dcomplex uim = 0.0 + 1.0 * I; +// const double sq2i = 1.0 / sqrt(2.0); +// int nlem = le * (le + 2); +// const int nlemt = nlem + nlem; +// a = new dcomplex*[nlemt]; +// ctqce = new dcomplex*[2]; +// ctqcs = new dcomplex*[2]; +// a[0] = new dcomplex[nlemt*2](); +// dcomplex *vec_a = a[0]; +// ctqce[0] = new dcomplex[6](); +// ctqcs[0] = new dcomplex[6](); +// for (int ai = 1; ai < nlemt; ai++) a[ai] = a[0]+ai*2; +// ctqce[1] = ctqce[0]+3; +// ctqcs[1] = ctqcs[0]+3; +// dcomplex *vec_am0m = am0m[0]; +// // I cannot vectorise easily the access to w, since its size can follow either li or le, and I don't know here +// #pragma omp parallel for +// for (int i20 = 1; i20 <= nlemt; i20++) { +// int i = i20 - 1; +// dcomplex c1 = cc0; +// dcomplex c2 = cc0; +// #pragma omp target teams distribute parallel for simd reduction(+:c1, c2) +// for (int j10 = 1; j10 <= nlemt; j10++) { +// int j = j10 - 1; +// // this is actually a matrix multiplication a = am0m x w, I could substitute the whole nested loop with a single BLAS level 3 call +// c1 += (vec_am0m[i*nlemt+j] * w[j][0]); +// c2 += (vec_am0m[i*nlemt+j] * w[j][1]); +// } // j10 loop +// vec_a[2*i] = c1; +// vec_a[2*i+1] = c2; +// } //i20 loop +// //int jpo = 2; +// // for (int ipo70 = 1; ipo70 <= 2; ipo70++) { +// // if (ipo70 == 2) jpo = 1; +// // int ipo = ipo70 - 1; +// // these are 2 x 3 arrays, in principle there should be a double loop over their two indices, but the second was 
explicitly unrolled here. This is senseless, either unroll both indices for speed, or unroll none for clarity, doing it halfway achieves neither +// // ctqce[ipo][0] = cc0; +// // ctqce[ipo][1] = cc0; +// // ctqce[ipo][2] = cc0; +// // tqcpe[ipo][0] = cc0; +// // tqcpe[ipo][1] = cc0; +// // tqcpe[ipo][2] = cc0; +// // ctqcs[ipo][0] = cc0; +// // ctqcs[ipo][1] = cc0; +// // ctqcs[ipo][2] = cc0; +// // tqcps[ipo][0] = cc0; +// // tqcps[ipo][1] = cc0; +// // tqcps[ipo][2] = cc0; +// // so let's go for it all the way, unroll both dimensions, use auxiliary scalar variables for the reduction, to declare them in the omp pragma +// dcomplex ctqce00 = cc0; +// dcomplex ctqce01 = cc0; +// dcomplex ctqce02 = cc0; +// dcomplex ctqce10 = cc0; +// dcomplex ctqce11 = cc0; +// dcomplex ctqce12 = cc0; +// dcomplex tqcpe00 = cc0; +// dcomplex tqcpe01 = cc0; +// dcomplex tqcpe02 = cc0; +// dcomplex tqcpe10 = cc0; +// dcomplex tqcpe11 = cc0; +// dcomplex tqcpe12 = cc0; +// dcomplex ctqcs00 = cc0; +// dcomplex ctqcs01 = cc0; +// dcomplex ctqcs02 = cc0; +// dcomplex ctqcs10 = cc0; +// dcomplex ctqcs11 = cc0; +// dcomplex ctqcs12 = cc0; +// dcomplex tqcps00 = cc0; +// dcomplex tqcps01 = cc0; +// dcomplex tqcps02 = cc0; +// dcomplex tqcps10 = cc0; +// dcomplex tqcps11 = cc0; +// dcomplex tqcps12 = cc0; +// // To parallelise, I run a linearised loop directly over k +// // working out the algebra, it turns out that +// // k = l60*l60-1+im60 +// // we invert this to find +// // l60 = (int) sqrt(k+1) and im60 = k - l60*60+1 +// // but if it results im60 = 0, then we set l60 = l60-1 and im60 = 2*l60+1 +// // furthermore if it results im60 > 2*l60+1, then we set +// // im60 = im60 -(2*l60+1) and l60 = l60+1 (there was a rounding error in a nearly exact root) +// // with the following kmax, l60 goes from 1 to le, and im60 from 1 to 2*l60+1 +// int kmax = le*(le+2); +// //#pragma omp target teams distribute parallel for simd reduction(+:ctqce00, ctqce01, ctqce02, ctqce10, ctqce11, ctqce12, 
ctqcs00, ctqcs01, ctqcs02, ctqcs10, ctqcs11, ctqcs12, tqcpe00, tqcpe01, tqcpe02, tqcpe10, tqcpe11, tqcpe12, tqcps00, tqcps01, tqcps02, tqcps10, tqcps11, tqcps12) +// for (int k = 1; k<=kmax; k++) { +// int l60 = (int) sqrt(k+1); +// int im60 = k - (l60*l60) + 1; +// if (im60 == 0) { +// l60--; +// im60 = 2*l60+1; +// } +// else if (im60 > 2*l60 + 1) { +// im60 -= 2*l60 + 1; +// l60++; +// } +// // I have all the indices in my linearised loop +// int lpo = l60 + 1; +// int il = l60 * lpo; +// int m = im60 - lpo; +// int i = m + il; +// int ie = i + nlem; +// int mmmu = m + 1; +// int mmmmu = (mmmu > 0) ? mmmu : -mmmu; +// int im1p2 = (i - 1)*2; +// int iem1p2 = (ie - 1)*2; +// double rmu; +// dcomplex acw0; +// dcomplex acw1; +// dcomplex acwp0; +// dcomplex acwp1; +// dcomplex aca0; +// dcomplex aca1; +// dcomplex acap0; +// dcomplex acap1; +// if (mmmmu <= l60) { +// int immu = mmmu + il; +// int immue = immu + nlem; +// int immum1 = immu-1; +// int vecimmu = (immum1)*2; +// int immuem1 = immue-1; +// int vecimmue = (immuem1)*2; +// dcomplex *vec_w = w[immum1]; +// dcomplex *vec_we = w[immuem1]; +// rmu = -sqrt(1.0 * (l60 + mmmu) * (l60 - m)) * sq2i; +// acw0 = dconjg(a[i - 1][0]) * w[immu - 1][0] + dconjg(a[ie - 1][0]) * w[immue - 1][0]; +// acw1 = dconjg(a[i - 1][1]) * w[immu - 1][1] + dconjg(a[ie - 1][1]) * w[immue - 1][1]; +// acwp0 = dconjg(a[i - 1][0]) * w[immu - 1][1] + dconjg(a[ie - 1][0]) * w[immue - 1][1]; +// acwp1 = dconjg(a[i - 1][1]) * w[immu - 1][0] + dconjg(a[ie - 1][1]) * w[immue - 1][0]; +// aca0 = dconjg(a[i - 1][0]) * a[immu - 1][0] + dconjg(a[ie - 1][0]) * a[immue - 1][0]; +// aca1 = dconjg(a[i - 1][1]) * a[immu - 1][1] + dconjg(a[ie - 1][1]) * a[immue - 1][1]; +// acap0 = dconjg(a[i - 1][0]) * a[immu - 1][1] + dconjg(a[ie - 1][0]) * a[immue - 1][1]; +// acap1 = dconjg(a[i - 1][1]) * a[immu - 1][0] + dconjg(a[ie - 1][1]) * a[immue - 1][0]; +// // acw0 = dconjg(vec_a[im1p2]) * vec_w[0] + dconjg(vec_a[iem1p2]) * vec_we[0]; +// // acw1 = 
dconjg(vec_a[im1p2+1]) * vec_w[1] + dconjg(vec_a[iem1p2+1]) * vec_we[1]; +// // acwp0 = dconjg(vec_a[im1p2]) * vec_w[1] + dconjg(vec_a[iem1p2]) * vec_we[1]; +// // acwp1 = dconjg(vec_a[im1p2+1]) * vec_w[0] + dconjg(vec_a[iem1p2+1]) * vec_we[0]; +// // aca0 = dconjg(vec_a[im1p2]) * vec_a[vecimmu] + dconjg(vec_a[iem1p2]) * vec_a[vecimmue]; +// // aca1 = dconjg(vec_a[im1p2+1]) * vec_a[vecimmu+1] + dconjg(vec_a[iem1p2+1]) * vec_a[vecimmue+1]; +// // acap0 = dconjg(vec_a[im1p2]) * vec_a[vecimmu+1] + dconjg(vec_a[iem1p2]) * vec_a[vecimmue+1]; +// // acap1 = dconjg(vec_a[im1p2+1]) * vec_a[vecimmu] + dconjg(vec_a[iem1p2+1]) * vec_a[vecimmue]; +// ctqce00 += (acw0 * rmu); +// ctqce10 += (acw1 * rmu); +// tqcpe00 += (acwp0 * rmu); +// tqcpe10 += (acwp1 * rmu); +// ctqcs00 += (aca0 * rmu); +// ctqcs10 += (aca1 * rmu); +// tqcps00 += (acap0 * rmu); +// tqcps10 += (acap1 * rmu); +// } +// // label 30 +// dcomplex *vec_w = w[i-1]; +// dcomplex *vec_we = w[ie-1]; +// rmu = -1.0 * m; +// acw0 = dconjg(a[i - 1][0]) * w[i - 1][0] + dconjg(a[ie - 1][0]) * w[ie - 1][0]; +// acw1 = dconjg(a[i - 1][1]) * w[i - 1][1] + dconjg(a[ie - 1][1]) * w[ie - 1][1]; +// acwp0 = dconjg(a[i - 1][0]) * w[i - 1][1] + dconjg(a[ie - 1][0]) * w[ie - 1][1]; +// acwp1 = dconjg(a[i - 1][1]) * w[i - 1][0] + dconjg(a[ie - 1][1]) * w[ie - 1][0]; +// aca0 = dconjg(a[i - 1][0]) * a[i - 1][0] + dconjg(a[ie - 1][0]) * a[ie - 1][0]; +// aca1 = dconjg(a[i - 1][1]) * a[i - 1][1] + dconjg(a[ie - 1][1]) * a[ie - 1][1]; +// acap0 = dconjg(a[i - 1][0]) * a[i - 1][1] + dconjg(a[ie - 1][0]) * a[ie - 1][1]; +// acap1 = dconjg(a[i - 1][1]) * a[i - 1][0] + dconjg(a[ie - 1][1]) * a[ie - 1][0]; +// // acw0 = dconjg(vec_a[im1p2]) * vec_w[0] + dconjg(vec_a[iem1p2]) * vec_we[0]; +// // acw1 = dconjg(vec_a[im1p2 + 1]) * vec_w[1] + dconjg(vec_a[iem1p2 + 1]) * vec_we[1]; +// // acwp0 = dconjg(vec_a[im1p2]) * vec_w[1] + dconjg(vec_a[iem1p2]) * vec_we[1]; +// // acwp1 = dconjg(vec_a[im1p2 + 1]) * vec_w[0] + dconjg(vec_a[iem1p2 + 1]) *
vec_we[0]; +// // aca0 = dconjg(vec_a[im1p2]) * vec_a[im1p2] + dconjg(vec_a[iem1p2]) * vec_a[iem1p2]; +// // aca1 = dconjg(vec_a[im1p2 + 1]) * vec_a[im1p2 + 1] + dconjg(vec_a[iem1p2 + 1]) * vec_a[iem1p2 + 1]; +// // acap0 = dconjg(vec_a[im1p2]) * vec_a[im1p2 + 1] + dconjg(vec_a[iem1p2]) * vec_a[iem1p2 + 1]; +// // acap1 = dconjg(vec_a[im1p2 + 1]) * vec_a[im1p2] + dconjg(vec_a[iem1p2 + 1]) * vec_a[iem1p2]; +// ctqce01 += (acw0 * rmu); +// ctqce11 += (acw1 * rmu); +// tqcpe01 += (acwp0 * rmu); +// tqcpe11 += (acwp1 * rmu); +// ctqcs01 += (aca0 * rmu); +// ctqcs11 += (aca1 * rmu); +// tqcps01 += (acap0 * rmu); +// tqcps11 += (acap1 * rmu); +// mmmu = m - 1; +// mmmmu = (mmmu > 0) ? mmmu : -mmmu; +// if (mmmmu <= l60) { +// int immu = mmmu + il; +// int immue = immu + nlem; +// int immum1 = immu-1; +// int vecimmu = (immum1)*2; +// int immuem1 = immue-1; +// int vecimmue = (immuem1)*2; +// dcomplex *vec_w = w[immum1]; +// dcomplex *vec_we = w[immuem1]; +// rmu = sqrt(1.0 * (l60 - mmmu) * (l60 + m)) * sq2i; +// acw0 = dconjg(a[i - 1][0]) * w[immu - 1][0] + dconjg(a[ie - 1][0]) * w[immue - 1][0]; +// acw1 = dconjg(a[i - 1][1]) * w[immu - 1][1] + dconjg(a[ie - 1][1]) * w[immue - 1][1]; +// acwp0 = dconjg(a[i - 1][0]) * w[immu - 1][1] + dconjg(a[ie - 1][0]) * w[immue - 1][1]; +// acwp1 = dconjg(a[i - 1][1]) * w[immu - 1][0] + dconjg(a[ie - 1][1]) * w[immue - 1][0]; +// aca0 = dconjg(a[i - 1][0]) * a[immu - 1][0] + dconjg(a[ie - 1][0]) * a[immue - 1][0]; +// aca1 = dconjg(a[i - 1][1]) * a[immu - 1][1] + dconjg(a[ie - 1][1]) * a[immue - 1][1]; +// acap0 = dconjg(a[i - 1][0]) * a[immu - 1][1] + dconjg(a[ie - 1][0]) * a[immue - 1][1]; +// acap1 = dconjg(a[i - 1][1]) * a[immu - 1][0] + dconjg(a[ie - 1][1]) * a[immue - 1][0]; +// // acw0 = dconjg(vec_a[im1p2]) * vec_w[0] + dconjg(vec_a[iem1p2]) * vec_we[0]; +// // acw1 = dconjg(vec_a[im1p2 + 1]) * vec_w[1] + dconjg(vec_a[iem1p2 + 1]) * vec_we[1]; +// // acwp0 = dconjg(vec_a[im1p2]) * vec_w[1] + dconjg(vec_a[iem1p2]) * vec_we[1]; 
+// // acwp1 = dconjg(vec_a[im1p2 + 1]) * vec_w[0] + dconjg(vec_a[iem1p2 + 1]) * vec_we[0]; +// // aca0 = dconjg(vec_a[im1p2]) * vec_a[vecimmu] + dconjg(vec_a[iem1p2]) * vec_a[vecimmue]; +// // aca1 = dconjg(vec_a[im1p2 + 1]) * vec_a[vecimmu + 1] + dconjg(vec_a[iem1p2 + 1]) * vec_a[vecimmue + 1]; +// // acap0 = dconjg(vec_a[im1p2]) * vec_a[vecimmu + 1] + dconjg(vec_a[iem1p2]) * vec_a[vecimmue + 1]; +// // acap1 = dconjg(vec_a[im1p2 + 1]) * vec_a[vecimmu] + dconjg(vec_a[iem1p2 + 1]) * vec_a[vecimmue]; +// ctqce02 += (acw0 * rmu); +// ctqce12 += (acw1 * rmu); +// tqcpe02 += (acwp0 * rmu); +// tqcpe12 += (acwp1 * rmu); +// ctqcs02 += (aca0 * rmu); +// ctqcs12 += (aca1 * rmu); +// tqcps02 += (acap0 * rmu); +// tqcps12 += (acap1 * rmu); +// } // ends if clause +// } // k loop (previously the l60 and im60 loops +// ctqce[0][0] = ctqce00; +// ctqce[0][1] = ctqce01; +// ctqce[0][2] = ctqce02; +// ctqce[1][0] = ctqce10; +// ctqce[1][1] = ctqce11; +// ctqce[1][2] = ctqce12; + +// ctqcs[0][0] = ctqcs00; +// ctqcs[0][1] = ctqcs01; +// ctqcs[0][2] = ctqcs02; +// ctqcs[1][0] = ctqcs10; +// ctqcs[1][1] = ctqcs11; +// ctqcs[1][2] = ctqcs12; + +// tqcpe[0][0] = tqcpe00; +// tqcpe[0][1] = tqcpe01; +// tqcpe[0][2] = tqcpe02; +// tqcpe[1][0] = tqcpe10; +// tqcpe[1][1] = tqcpe11; +// tqcpe[1][2] = tqcpe12; + +// tqcps[0][0] = tqcps00; +// tqcps[0][1] = tqcps01; +// tqcps[0][2] = tqcps02; +// tqcps[1][0] = tqcps10; +// tqcps[1][1] = tqcps11; +// tqcps[1][2] = tqcps12; + +// // this is a ridiculously small loop (3 iterations!) of assignments. 
Would be not worth parallelising, but since it's easy do it on the cpus +// //#pragma omp teams distribute parallel for simd +// for (int ipo78 = 1; ipo78 <= 2; ipo78++) { +// int ipo = ipo78 - 1; +// tqce[ipo][0] = real(ctqce[ipo][0] - ctqce[ipo][2]) * sq2i; +// tqce[ipo][1] = real((ctqce[ipo][0] + ctqce[ipo][2]) * uim) * sq2i; +// tqce[ipo][2] = real(ctqce[ipo][1]); +// c1 = tqcpe[ipo][0]; +// c2 = tqcpe[ipo][1]; +// c3 = tqcpe[ipo][2]; +// tqcpe[ipo][0] = (c1 - c3) * sq2i; +// tqcpe[ipo][1] = (c1 + c3) * (uim * sq2i); +// tqcpe[ipo][2] = c2; +// tqcs[ipo][0] = -sq2i * real(ctqcs[ipo][0] - ctqcs[ipo][2]); +// tqcs[ipo][1] = -sq2i * real((ctqcs[ipo][0] + ctqcs[ipo][2]) * uim); +// tqcs[ipo][2] = -1.0 * real(ctqcs[ipo][1]); +// c1 = tqcps[ipo][0]; +// c2 = tqcps[ipo][1]; +// c3 = tqcps[ipo][2]; +// tqcps[ipo][0] = -(c1 - c3) * sq2i; +// tqcps[ipo][1] = -(c1 + c3) * (uim * sq2i); +// tqcps[ipo][2] = -c2; +// } // ipo78 loop +// // Clean memory +// delete[] a[0]; +// delete[] ctqce[0]; +// delete[] ctqcs[0]; +// delete[] a; +// delete[] ctqce; +// delete[] ctqcs; +// } + void rftr( double *u, double *up, double *un, double *gapv, double extins, double scatts, double &rapr, double &cosav, double &fp, double &fn, double &fk, double &fx, @@ -2052,22 +2373,35 @@ void scr0(double vk, double exri, C1 *c1, C1_AddOns *c1ao, C3 *c3, C4 * c4) { double exdc = exri * exri; double ccs = 4.0 * acos(0.0) / (vk * vk); double cccs = ccs / exdc; - dcomplex sum21, rm, re, csam; - csam = -(ccs / (exri * vk)) * 0.5 * I; + dcomplex csam = -(ccs / (exri * vk)) * 0.5 * I; //double scs = 0.0, ecs = 0.0, acs = 0.0; - c3->scs = 0.0; - c3->ecs = 0.0; - c3->acs = 0.0; - c3->tfsas = cc0; + double scs = 0.0; + double ecs = 0.0; + double acs = 0.0; + dcomplex tfsas = cc0; + dcomplex *vec_rmi = c1->rmi[0]; + dcomplex *vec_rei = c1->rei[0]; +#ifdef USE_NVTX + nvtxRangePush("scr0 outer loop"); +#endif + +#pragma omp parallel for reduction(+:scs, ecs, acs, tfsas) for (int i14 = 1; i14 <= c4->nsph; 
i14++) { int iogi = c1->iog[i14 - 1]; if (iogi >= i14) { double sums = 0.0; - sum21 = cc0; + dcomplex sum21 = cc0; +#ifdef USE_NVTX + nvtxRangePush("scr0 inner loop"); +#endif +#pragma omp target teams distribute parallel for simd reduction(+:sums, sum21) for (int l10 = 1; l10 <= c4->li; l10++) { double fl = 1.0 * (l10 + l10 + 1); - rm = 1.0 / c1->rmi[l10 - 1][i14 - 1]; - re = 1.0 / c1->rei[l10 - 1][i14 - 1]; + // dcomplex rm = 1.0 / c1->rmi[l10 - 1][i14 - 1]; + // dcomplex re = 1.0 / c1->rei[l10 - 1][i14 - 1]; + int vecindex = (l10 - 1)*c1->nsph + i14 - 1; + dcomplex rm = 1.0 / vec_rmi[vecindex]; + dcomplex re = 1.0 / vec_rei[vecindex]; double rvalue = real(dconjg(rm) * rm + dconjg(re) * re) * fl; sums += rvalue; sum21 += ((rm + re) * fl); @@ -2086,11 +2420,18 @@ void scr0(double vk, double exri, C1 *c1, C1_AddOns *c1ao, C3 *c3, C4 * c4) { c1->fsas[i14 - 1] = sum21 * csam; } // label 12 - c3->scs += c1->sscs[iogi - 1]; - c3->ecs += c1->sexs[iogi - 1]; - c3->acs += c1->sabs[iogi - 1]; - c3->tfsas += c1->fsas[iogi - 1]; + scs += c1->sscs[iogi - 1]; + ecs += c1->sexs[iogi - 1]; + acs += c1->sabs[iogi - 1]; + tfsas += c1->fsas[iogi - 1]; } // i14 loop + c3->scs = scs; + c3->ecs = ecs; + c3->acs = acs; + c3->tfsas = tfsas; +#ifdef USE_NVTX + nvtxRangePop(); +#endif } void scr2( @@ -2109,18 +2450,15 @@ void scr2( double cfsq = 4.0 / (pi4sq * ccs * ccs); cph = uim * exri * vkarg; int ls = (c4->li < c4->le) ? 
c4->li : c4->le; - int kmax = (ls+1)*(ls+1)-1; + int kmax = ls*(ls+2); dcomplex tsas00 = cc0; dcomplex tsas10 = cc0; dcomplex tsas01 = cc0; dcomplex tsas11 = cc0; - // c3->tsas[0][0] = cc0; - // c3->tsas[1][0] = cc0; - // c3->tsas[0][1] = cc0; - // c3->tsas[1][1] = cc0; dcomplex *vec_rmi = c1->rmi[0]; dcomplex *vec_rei = c1->rei[0]; dcomplex *vec_w = c1->w[0]; + dcomplex *vec_sas = c1->sas[0][0]; #ifdef USE_NVTX nvtxRangePush("scr2 outer loop 1"); #endif @@ -2139,7 +2477,7 @@ void scr2( // k = l10*l10-1+im10 // we invert this to find // l10 = (int) sqrt(k+1) and im10 = k - l10*10+1 - // but if it results im = 0, then we set l10 = l10-1 and im10 = 2*l10+1 + // but if it results im10 = 0, then we set l10 = l10-1 and im10 = 2*l10+1 // furthermore if it results im10 > 2*l10+1, then we set // im10 = im10 -(2*l10+1) and l10 = l10+1 (there was a rounding error in a nearly exact root) #ifdef USE_NVTX @@ -2172,26 +2510,17 @@ void scr2( s22 -= vec_w[km1t4+3] * auxrm1 + vec_w[kem1t4+3] * auxre1; } #ifdef USE_NVTX - nvtxRangePop(); + nvtxRangePop(); #endif - // for (int l10 = 1; l10 <= ls; l10++) { - // int l = l10 - 1; - // rm = 1.0 / c1->rmi[l][i]; - // re = 1.0 / c1->rei[l][i]; - // int ltpo = l10 + l10 + 1; - // for (int im10 = 1; im10 <= ltpo; im10++) { - // k++; - // int ke = k + c4->nlem; - // s11 -= (c1->w[k - 1][2] * c1->w[k - 1][0] * rm + c1->w[ke - 1][2] * c1->w[ke - 1][0] * re); - // s21 -= (c1->w[k - 1][3] * c1->w[k - 1][0] * rm + c1->w[ke - 1][3] * c1->w[ke - 1][0] * re); - // s12 -= (c1->w[k - 1][2] * c1->w[k - 1][1] * rm + c1->w[ke - 1][2] * c1->w[ke - 1][1] * re); - // s22 -= (c1->w[k - 1][3] * c1->w[k - 1][1] * rm + c1->w[ke - 1][3] * c1->w[ke - 1][1] * re); - // } // im10 loop - // } // l10 loop - c1->sas[i][0][0] = s11 * csam; - c1->sas[i][1][0] = s21 * csam; - c1->sas[i][0][1] = s12 * csam; - c1->sas[i][1][1] = s22 * csam; + int vecindex = i*4; + vec_sas[vecindex] = s11 * csam; + vec_sas[vecindex+2] = s21 * csam; + vec_sas[vecindex+1] = s12 * csam; + 
vec_sas[vecindex+3] = s22 * csam; + // c1->sas[i][0][0] = s11 * csam; + // c1->sas[i][1][0] = s21 * csam; + // c1->sas[i][0][1] = s12 * csam; + // c1->sas[i][1][1] = s22 * csam; } // label 12 dcomplex phas = cexp(cph * (duk[0] * c1->rxx[i] + duk[1] * c1->ryy[i] + duk[2] * c1->rzz[i]));