diff --git a/src/libnptm/clu_subs.cpp b/src/libnptm/clu_subs.cpp index 7d66bb6e3e962b9f2c2c003c113834a22489dcd8..b299911750c1c0b9aa8ee12a6ec843d3d6a285d0 100644 --- a/src/libnptm/clu_subs.cpp +++ b/src/libnptm/clu_subs.cpp @@ -1926,28 +1926,38 @@ void raba( ctqce[1] = ctqce[0]+3; ctqcs[1] = ctqcs[0]+3; dcomplex *vec_am0m = am0m[0]; + dcomplex *vec_w = w[0]; +#ifdef USE_NVTX + nvtxRangePush("raba outer loop 1"); +#endif #pragma omp parallel for for (int i20 = 1; i20 <= nlemt; i20++) { int i = i20 - 1; dcomplex c1 = cc0; dcomplex c2 = cc0; +#ifdef USE_NVTX + nvtxRangePush("raba inner loop 1"); +#endif #pragma omp target teams distribute parallel for simd reduction(+:c1, c2) for (int j10 = 1; j10 <= nlemt; j10++) { int j = j10 - 1; - c1 += (vec_am0m[i*nlemt+j] * w[j][0]); - c2 += (vec_am0m[i*nlemt+j] * w[j][1]); - // c1 += (am0m[i][j] * w[j][0]); - // c2 += (am0m[i][j] * w[j][1]); + c1 += (vec_am0m[i*nlemt+j] * vec_w[4*j]); + c2 += (vec_am0m[i*nlemt+j] * vec_w[4*j+1]); } // j10 loop +#ifdef USE_NVTX + nvtxRangePop(); +#endif vec_a[2*i] = c1; vec_a[2*i+1] = c2; - // a[i][0] = c1; - // a[i][1] = c2; } //i20 loop - int jpo = 2; - for (int ipo70 = 1; ipo70 <= 2; ipo70++) { - if (ipo70 == 2) jpo = 1; - int ipo = ipo70 - 1; +#ifdef USE_NVTX + nvtxRangePop(); +#endif +#ifdef USE_NVTX + nvtxRangePush("raba outer loop 2"); +#endif +#pragma omp teams distribute parallel for + for (int ipo = 0; ipo < 2; ipo++) { int jpo = 1 - ipo; ctqce[ipo][0] = cc0; ctqce[ipo][1] = cc0; @@ -1975,6 +1985,9 @@ void raba( dcomplex &tqcps2 = tqcps[ipo][2]; int kmax = le*(le+2); // for efficiency I should also linearise array w, but I cannot easily since I do not know for sure its major dimension (changes to containing class needed) +#ifdef USE_NVTX + nvtxRangePush("raba inner loop 2"); +#endif #pragma omp target teams distribute parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2) for (int k = 1; k<=kmax; k++) { int l60 = (int) sqrt(k+1); @@ -1989,9 +2002,8 @@ void raba( } int lpo = l60 + 1; int il = l60 * lpo; - // int ltpo = l60 + lpo; int m = im60 - lpo; - int i = m + il; + int i = m + il - 1; int ie = i + nlem; int mmmu = m + 1; int mmmmu = (mmmu > 0) ? mmmu : -mmmu; @@ -2001,13 +2013,13 @@ void raba( dcomplex aca; dcomplex acap; if (mmmmu <= l60) { - int immu = mmmu + il; + int immu = mmmu + il - 1; int immue = immu + nlem; rmu = -sqrt(1.0 * (l60 + mmmu) * (l60 - m)) * sq2i; - acw = dconjg(vec_a[2*(i - 1)+ipo]) * w[immu - 1][ipo] + dconjg(vec_a[2*(ie - 1)+ipo]) * w[immue - 1][ipo]; - acwp = dconjg(vec_a[2*(i - 1)+ipo]) * w[immu - 1][jpo] + dconjg(vec_a[2*(ie - 1)+ipo]) * w[immue - 1][jpo]; - aca = dconjg(vec_a[2*(i - 1)+ipo]) * vec_a[2*(immu - 1)+ipo] + dconjg(vec_a[2*(ie - 1)+ipo]) * vec_a[2*(immue - 1)+ipo]; - acap = dconjg(vec_a[2*(i - 1)+ipo]) * vec_a[2*(immu - 1)+jpo] + dconjg(vec_a[2*(ie - 1)+ipo]) * vec_a[2*(immue - 1)+jpo]; + acw = dconjg(vec_a[2*i+ipo]) * vec_w[4*immu+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*immue+ipo]; + acwp = dconjg(vec_a[2*i+ipo]) * vec_w[4*immu+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*immue+jpo]; + aca = dconjg(vec_a[2*i+ipo]) * vec_a[2*immu+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*immue+ipo]; + acap = dconjg(vec_a[2*i+ipo]) * vec_a[2*immu+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*immue+jpo]; ctqce0 += (acw * rmu); tqcpe0 += (acwp * rmu); ctqcs0 += (aca * rmu); @@ -2015,10 +2027,10 @@ void raba( } // label 30 rmu = -1.0 * m; - acw = dconjg(vec_a[2*(i - 1)+ipo]) * w[i - 1][ipo] + dconjg(vec_a[2*(ie - 1)+ipo]) * w[ie - 1][ipo]; - acwp = dconjg(vec_a[2*(i - 1)+ipo]) * w[i - 1][jpo] + dconjg(vec_a[2*(ie - 1)+ipo]) * w[ie - 1][jpo]; - aca = dconjg(vec_a[2*(i - 1)+ipo]) * vec_a[2*(i - 1)+ipo] + dconjg(vec_a[2*(ie - 1)+ipo]) * vec_a[2*(ie - 1)+ipo]; - acap = dconjg(vec_a[1*(i - 1)+ipo]) * vec_a[2*(i - 1)+jpo] + dconjg(vec_a[2*(ie - 1)+ipo]) * vec_a[2*(ie - 1)+jpo]; + acw = dconjg(vec_a[2*i+ipo]) * vec_w[4*i+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*ie+ipo]; + acwp = dconjg(vec_a[2*i+ipo]) * vec_w[4*i+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*ie+jpo]; + aca = dconjg(vec_a[2*i+ipo]) * vec_a[2*i+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*ie+ipo]; + acap = dconjg(vec_a[2*i+ipo]) * vec_a[2*i+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*ie+jpo]; ctqce1 += (acw * rmu); tqcpe1 += (acwp * rmu); ctqcs1 += (aca * rmu); @@ -2026,20 +2038,29 @@ void raba( mmmu = m - 1; mmmmu = (mmmu > 0) ? mmmu : -mmmu; if (mmmmu <= l60) { - int immu = mmmu + il; + int immu = mmmu + il - 1; int immue = immu + nlem; rmu = sqrt(1.0 * (l60 - mmmu) * (l60 + m)) * sq2i; - acw = dconjg(vec_a[2*(i - 1)+ipo]) * w[immu - 1][ipo] + dconjg(vec_a[2*(ie - 1)+ipo]) * w[immue - 1][ipo]; - acwp = dconjg(vec_a[2*(i - 1)+ipo]) * w[immu - 1][jpo] + dconjg(vec_a[2*(ie - 1)+ipo]) * w[immue - 1][jpo]; - aca = dconjg(vec_a[2*(i - 1)+ipo]) * vec_a[2*(immu - 1)+ipo] + dconjg(vec_a[2*(ie - 1)+ipo]) * vec_a[2*(immue - 1)+ipo]; - acap = dconjg(vec_a[2*(i - 1)+ipo]) * vec_a[2*(immu - 1)+jpo] + dconjg(vec_a[2*(ie - 1)+ipo]) * vec_a[2*(immue - 1)+jpo]; + acw = dconjg(vec_a[2*i+ipo]) * vec_w[4*immu+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*immue+ipo]; + acwp = dconjg(vec_a[2*i+ipo]) * vec_w[4*immu+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*immue+jpo]; + aca = dconjg(vec_a[2*i+ipo]) * vec_a[2*immu+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*immue+ipo]; + acap = dconjg(vec_a[2*i+ipo]) * vec_a[2*immu+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*immue+jpo]; ctqce2 += (acw * rmu); tqcpe2 += (acwp * rmu); ctqcs2 += (aca * rmu); tqcps2 += (acap * rmu); } // ends if clause } // k loop (previously the l60 and im60 loops +#ifdef USE_NVTX + nvtxRangePop(); +#endif } // ipo70 loop +#ifdef USE_NVTX + nvtxRangePop(); +#endif +#ifdef USE_NVTX + nvtxRangePush("raba loop 3"); +#endif #pragma omp teams distribute parallel for simd for (int ipo78 = 1; ipo78 <= 2; ipo78++) { int ipo = ipo78 - 1; @@ -2062,6 +2083,9 @@ void raba( tqcps[ipo][1] = -(c1 + c3) * (uim * sq2i); tqcps[ipo][2] = -c2; } // ipo78 loop +#ifdef USE_NVTX + nvtxRangePop(); +#endif delete[] a[0]; delete[] ctqce[0]; delete[] ctqcs[0]; @@ -2070,288 +2094,6 @@ void raba( delete[] ctqcs; } -// void raba_new( -// int le, dcomplex **am0m, dcomplex **w, double **tqce, -// dcomplex **tqcpe, double **tqcs, dcomplex **tqcps -// ) { -// dcomplex **a, **ctqce, **ctqcs; -// dcomplex acw, acwp, aca, acap, c1, c2, c3; -// const dcomplex cc0 = 0.0 + 0.0 * I; -// const dcomplex uim = 0.0 + 1.0 * I; -// const double sq2i = 1.0 / sqrt(2.0); -// int nlem = le * (le + 2); -// const int nlemt = nlem + nlem; -// a = new dcomplex*[nlemt]; -// ctqce = new dcomplex*[2]; -// ctqcs = new dcomplex*[2]; -// a[0] = new dcomplex[nlemt*2](); -// dcomplex *vec_a = a[0]; -// ctqce[0] = new dcomplex[6](); -// ctqcs[0] = new dcomplex[6](); -// for (int ai = 1; ai < nlemt; ai++) a[ai] = a[0]+ai*2; -// ctqce[1] = ctqce[0]+3; -// ctqcs[1] = ctqcs[0]+3; -// dcomplex *vec_am0m = am0m[0]; -// // I cannot vectorise easily the access to w, since its size can follow either li or le, and I don't know here -// #pragma omp parallel for -// for (int i20 = 1; i20 <= nlemt; i20++) { -// int i = i20 - 1; -// dcomplex c1 = cc0; -// dcomplex c2 = cc0; -// #pragma omp target teams distribute parallel for simd reduction(+:c1, c2) -// for (int j10 = 1; j10 <= nlemt; j10++) { -// int j = j10 - 1; -// // this is actually a matrix multiplication a = am0m x w, I could substitute the whole nested loop with a single BLAS level 3 call -// c1 += (vec_am0m[i*nlemt+j] * w[j][0]); -// c2 += (vec_am0m[i*nlemt+j] * w[j][1]); -// } // j10 loop -// vec_a[2*i] = c1; -// vec_a[2*i+1] = c2; -// } //i20 loop -// //int jpo = 2; -// // for (int ipo70 = 1; ipo70 <= 2; ipo70++) { -// // if (ipo70 == 2) jpo = 1; -// // int ipo = ipo70 - 1; -// // these are 2 x 3 arrays, in principle there should be a double loop over their two indices, but the second was explicitly unrolled here. This is senseless, either unroll both indices for speed, or unroll none for clarity, doing it halfway achieves neither -// // ctqce[ipo][0] = cc0; -// // ctqce[ipo][1] = cc0; -// // ctqce[ipo][2] = cc0; -// // tqcpe[ipo][0] = cc0; -// // tqcpe[ipo][1] = cc0; -// // tqcpe[ipo][2] = cc0; -// // ctqcs[ipo][0] = cc0; -// // ctqcs[ipo][1] = cc0; -// // ctqcs[ipo][2] = cc0; -// // tqcps[ipo][0] = cc0; -// // tqcps[ipo][1] = cc0; -// // tqcps[ipo][2] = cc0; -// // so let's go for it all the way, unroll both dimensions, use auxiliary scalar variables for the reduction, to declare them in the omp pragma -// dcomplex ctqce00 = cc0; -// dcomplex ctqce01 = cc0; -// dcomplex ctqce02 = cc0; -// dcomplex ctqce10 = cc0; -// dcomplex ctqce11 = cc0; -// dcomplex ctqce12 = cc0; -// dcomplex tqcpe00 = cc0; -// dcomplex tqcpe01 = cc0; -// dcomplex tqcpe02 = cc0; -// dcomplex tqcpe10 = cc0; -// dcomplex tqcpe11 = cc0; -// dcomplex tqcpe12 = cc0; -// dcomplex ctqcs00 = cc0; -// dcomplex ctqcs01 = cc0; -// dcomplex ctqcs02 = cc0; -// dcomplex ctqcs10 = cc0; -// dcomplex ctqcs11 = cc0; -// dcomplex ctqcs12 = cc0; -// dcomplex tqcps00 = cc0; -// dcomplex tqcps01 = cc0; -// dcomplex tqcps02 = cc0; -// dcomplex tqcps10 = cc0; -// dcomplex tqcps11 = cc0; -// dcomplex tqcps12 = cc0; -// // To parallelise, I run a linearised loop directly over k -// // working out the algebra, it turns out that -// // k = l60*l60-1+im60 -// // we invert this to find -// // l60 = (int) sqrt(k+1) and im60 = k - l60*60+1 -// // but if it results im60 = 0, then we set l60 = l60-1 and im60 = 2*l60+1 -// // furthermore if it results im60 > 2*l60+1, then we set -// // im60 = im60 -(2*l60+1) and l60 = l60+1 (there was a rounding error in a nearly exact root) -// // with the following kmax, l60 goes from 1 to le, and im60 from 1 to 2*l60+1 -// int kmax = le*(le+2); -// //#pragma omp target teams distribute parallel for simd reduction(+:ctqce00, ctqce01, ctqce02, ctqce10, ctqce11, ctqce12, ctqcs00, ctqcs01, ctqcs02, ctqcs10, ctqcs11, ctqcs12, tqcpe00, tqcpe01, tqcpe02, tqcpe10, tqcpe11, tqcpe12, tqcps00, tqcps01, tqcps02, tqcps10, tqcps11, tqcps12) -// for (int k = 1; k<=kmax; k++) { -// int l60 = (int) sqrt(k+1); -// int im60 = k - (l60*l60) + 1; -// if (im60 == 0) { -// l60--; -// im60 = 2*l60+1; -// } -// else if (im60 > 2*l60 + 1) { -// im60 -= 2*l60 + 1; -// l60++; -// } -// // I have all the indices in my linearised loop -// int lpo = l60 + 1; -// int il = l60 * lpo; -// int m = im60 - lpo; -// int i = m + il; -// int ie = i + nlem; -// int mmmu = m + 1; -// int mmmmu = (mmmu > 0) ? mmmu : -mmmu; -// int im1p2 = (i - 1)*2; -// int iem1p2 = (ie - 1)*2; -// double rmu; -// dcomplex acw0; -// dcomplex acw1; -// dcomplex acwp0; -// dcomplex acwp1; -// dcomplex aca0; -// dcomplex aca1; -// dcomplex acap0; -// dcomplex acap1; -// if (mmmmu <= l60) { -// int immu = mmmu + il; -// int immue = immu + nlem; -// int immum1 = immu-1; -// int vecimmu = (immum1)*2; -// int immuem1 = immue-1; -// int vecimmue = (immuem1)*2; -// dcomplex *vec_w = w[immum1]; -// dcomplex *vec_we = w[immuem1]; -// rmu = -sqrt(1.0 * (l60 + mmmu) * (l60 - m)) * sq2i; -// acw0 = dconjg(a[i - 1][0]) * w[immu - 1][0] + dconjg(a[ie - 1][0]) * w[immue - 1][0]; -// acw1 = dconjg(a[i - 1][1]) * w[immu - 1][1] + dconjg(a[ie - 1][1]) * w[immue - 1][1]; -// acwp0 = dconjg(a[i - 1][0]) * w[immu - 1][1] + dconjg(a[ie - 1][0]) * w[immue - 1][1]; -// acwp1 = dconjg(a[i - 1][1]) * w[immu - 1][0] + dconjg(a[ie - 1][1]) * w[immue - 1][0]; -// aca0 = dconjg(a[i - 1][0]) * a[immu - 1][0] + dconjg(a[ie - 1][0]) * a[immue - 1][0]; -// aca1 = dconjg(a[i - 1][1]) * a[immu - 1][1] + dconjg(a[ie - 1][1]) * a[immue - 1][1]; -// acap0 = dconjg(a[i - 1][0]) * a[immu - 1][1] + dconjg(a[ie - 1][0]) * a[immue - 1][1]; -// acap1 = dconjg(a[i - 1][1]) * a[immu - 1][0] + dconjg(a[ie - 1][1]) * a[immue - 1][0]; -// // acw0 = dconjg(vec_a[im1p2]) * vec_w[0] + dconjg(vec_a[iem1p2]) * vec_we[0]; -// // acw1 = dconjg(vec_a[im1p2+1]) * vec_w[1] + dconjg(vec_a[iem1p2+1]) * vec_we[1]; -// // acwp0 = dconjg(vec_a[im1p2]) * vec_w[1] + dconjg(vec_a[iem1p2]) * vec_we[1]; -// // acwp1 = dconjg(vec_a[im1p2+1]) * vec_w[0] + dconjg(vec_a[im1p2+1]) * vec_we[0]; -// // aca0 = dconjg(vec_a[im1p2]) * vec_a[vecimmu] + dconjg(vec_a[iem1p2]) * vec_a[vecimmue]; -// // aca1 = dconjg(vec_a[im1p2+1]) * vec_a[vecimmu+1] + dconjg(vec_a[iem1p2+1]) * vec_a[vecimmue+1]; -// // acap0 = dconjg(vec_a[im1p2]) * vec_a[vecimmu+1] + dconjg(vec_a[iem1p2]) * vec_a[vecimmue+1]; -// // acap1 = dconjg(vec_a[im1p2+1]) * vec_a[vecimmu] + dconjg(vec_a[iem1p2+1]) * vec_a[vecimmue]; -// ctqce00 += (acw0 * rmu); -// ctqce10 += (acw0 * rmu); -// tqcpe00 += (acwp0 * rmu); -// tqcpe10 += (acwp1 * rmu); -// ctqcs00 += (aca0 * rmu); -// ctqcs10 += (aca1 * rmu); -// tqcps00 += (acap0 * rmu); -// tqcps10 += (acap1 * rmu); -// } -// // label 30 -// dcomplex *vec_w = w[i-1]; -// dcomplex *vec_we = w[ie-1]; -// rmu = -1.0 * m; -// acw0 = dconjg(a[i - 1][0]) * w[i - 1][0] + dconjg(a[ie - 1][0]) * w[ie - 1][0]; -// acw1 = dconjg(a[i - 1][1]) * w[i - 1][1] + dconjg(a[ie - 1][1]) * w[ie - 1][1]; -// acwp0 = dconjg(a[i - 1][0]) * w[i - 1][1] + dconjg(a[ie - 1][0]) * w[ie - 1][1]; -// acwp1 = dconjg(a[i - 1][1]) * w[i - 1][0] + dconjg(a[ie - 1][1]) * w[ie - 1][0]; -// aca0 = dconjg(a[i - 1][0]) * a[i - 1][0] + dconjg(a[ie - 1][0]) * a[ie - 1][0]; -// aca1 = dconjg(a[i - 1][1]) * a[i - 1][1] + dconjg(a[ie - 1][1]) * a[ie - 1][1]; -// acap0 = dconjg(a[i - 1][0]) * a[i - 1][1] + dconjg(a[ie - 1][0]) * a[ie - 1][1]; -// acap1 = dconjg(a[i - 1][1]) * a[i - 1][0] + dconjg(a[ie - 1][1]) * a[ie - 1][0]; -// // acw0 = dconjg(vec_a[im1p2]) * vec_w[0] + dconjg(vec_a[iem1p2]) * vec_we[0]; -// // acw1 = dconjg(vec_a[im1p2 + 1]) * vec_w[1] + dconjg(vec_a[iem1p2 + 1]) * vec_we[1]; -// // acwp0 = dconjg(vec_a[im1p2]) * vec_w[1] + dconjg(vec_a[iem1p2]) * vec_we[1]; -// // acwp1 = dconjg(vec_a[im1p2 + 1]) * vec_w[0] + dconjg(vec_a[iem1p2 + 1]) * vec_we[0]; -// // aca0 = dconjg(vec_a[im1p2]) * vec_a[im1p2] + dconjg(vec_a[iem1p2]) * vec_a[iem1p2]; -// // aca1 = dconjg(vec_a[im1p2 + 1]) * vec_a[im1p2 + 1] + dconjg(vec_a[iem1p2 + 1]) * vec_a[iem1p2 + 1]; -// // acap0 = dconjg(vec_a[im1p2]) * vec_a[im1p2 + 1] + dconjg(vec_a[iem1p2]) * vec_a[iem1p2 + 1]; -// // acap1 = dconjg(vec_a[im1p2 + 1]) * vec_a[im1p2] + dconjg(vec_a[iem1p2 + 1]) * vec_a[iem1p2]; -// ctqce01 += (acw0 * rmu); -// ctqce11 += (acw1 * rmu); -// tqcpe01 += (acwp0 * rmu); -// tqcpe11 += (acwp1 * rmu); -// ctqcs01 += (aca0 * rmu); -// ctqcs11 += (aca1 * rmu); -// tqcps01 += (acap0 * rmu); -// tqcps11 += (acap1 * rmu); -// mmmu = m - 1; -// mmmmu = (mmmu > 0) ? mmmu : -mmmu; -// if (mmmmu <= l60) { -// int immu = mmmu + il; -// int immue = immu + nlem; -// int immum1 = immu-1; -// int vecimmu = (immum1)*2; -// int immuem1 = immue-1; -// int vecimmue = (immuem1)*2; -// dcomplex *vec_w = w[immum1]; -// dcomplex *vec_we = w[immuem1]; -// rmu = sqrt(1.0 * (l60 - mmmu) * (l60 + m)) * sq2i; -// acw0 = dconjg(a[i - 1][0]) * w[immu - 1][0] + dconjg(a[ie - 1][0]) * w[immue - 1][0]; -// acw1 = dconjg(a[i - 1][1]) * w[immu - 1][1] + dconjg(a[ie - 1][1]) * w[immue - 1][1]; -// acwp0 = dconjg(a[i - 1][0]) * w[immu - 1][1] + dconjg(a[ie - 1][0]) * w[immue - 1][1]; -// acwp1 = dconjg(a[i - 1][1]) * w[immu - 1][0] + dconjg(a[ie - 1][1]) * w[immue - 1][0]; -// aca0 = dconjg(a[i - 1][0]) * a[immu - 1][0] + dconjg(a[ie - 1][0]) * a[immue - 1][0]; -// aca1 = dconjg(a[i - 1][1]) * a[immu - 1][1] + dconjg(a[ie - 1][1]) * a[immue - 1][1]; -// acap0 = dconjg(a[i - 1][0]) * a[immu - 1][1] + dconjg(a[ie - 1][0]) * a[immue - 1][1]; -// acap1 = dconjg(a[i - 1][1]) * a[immu - 1][0] + dconjg(a[ie - 1][1]) * a[immue - 1][0]; -// // acw0 = dconjg(vec_a[im1p2]) * vec_w[0] + dconjg(vec_a[iem1p2]) * vec_we[0]; -// // acw1 = dconjg(vec_a[im1p2 + 1]) * vec_w[1] + dconjg(vec_a[iem1p2 + 1]) * vec_we[1]; -// // acwp0 = dconjg(vec_a[im1p2]) * vec_w[1] + dconjg(vec_a[iem1p2]) * vec_we[1]; -// // acwp1 = dconjg(vec_a[im1p2 + 1]) * vec_w[0] + dconjg(vec_a[iem1p2 + 1]) * vec_we[0]; -// // aca0 = dconjg(vec_a[im1p2]) * vec_a[vecimmu] + dconjg(vec_a[iem1p2]) * vec_a[vecimmue]; -// // aca1 = dconjg(vec_a[im1p2 + 1]) * vec_a[vecimmu + 1] + dconjg(vec_a[iem1p2 + 1]) * vec_a[vecimmue + 1]; -// // acap0 = dconjg(vec_a[im1p2]) * vec_a[vecimmu + 1] + dconjg(vec_a[iem1p2]) * vec_a[vecimmue + 1]; -// // acap1 = dconjg(vec_a[im1p2 + 1]) * vec_a[vecimmu] + dconjg(vec_a[iem1p2 + 1]) * vec_a[vecimmue]; -// ctqce02 += (acw0 * rmu); -// ctqce12 += (acw1 * rmu); -// tqcpe02 += (acwp0 * rmu); -// tqcpe12 += (acwp1 * rmu); -// ctqcs02 += (aca0 * rmu); -// ctqcs12 += (aca1 * rmu); -// tqcps02 += (acap0 * rmu); -// tqcps12 += (acap1 * rmu); -// } // ends if clause -// } // k loop (previously the l60 and im60 loops -// ctqce[0][0] = ctqce00; -// ctqce[0][1] = ctqce01; -// ctqce[0][2] = ctqce02; -// ctqce[1][0] = ctqce10; -// ctqce[1][1] = ctqce11; -// ctqce[1][2] = ctqce12; - -// ctqcs[0][0] = ctqcs00; -// ctqcs[0][1] = ctqcs01; -// ctqcs[0][2] = ctqcs02; -// ctqcs[1][0] = ctqcs10; -// ctqcs[1][1] = ctqcs11; -// ctqcs[1][2] = ctqcs12; - -// tqcpe[0][0] = tqcpe00; -// tqcpe[0][1] = tqcpe01; -// tqcpe[0][2] = tqcpe02; -// tqcpe[1][0] = tqcpe10; -// tqcpe[1][1] = tqcpe11; -// tqcpe[1][2] = tqcpe12; - -// tqcps[0][0] = tqcps00; -// tqcps[0][1] = tqcps01; -// tqcps[0][2] = tqcps02; -// tqcps[1][0] = tqcps10; -// tqcps[1][1] = tqcps11; -// tqcps[1][2] = tqcps12; - -// // this is a ridiculously small loop (3 iterations!) of assignments. Would be not worth parallelising, but since it's easy do it on the cpus -// //#pragma omp teams distribute parallel for simd -// for (int ipo78 = 1; ipo78 <= 2; ipo78++) { -// int ipo = ipo78 - 1; -// tqce[ipo][0] = real(ctqce[ipo][0] - ctqce[ipo][2]) * sq2i; -// tqce[ipo][1] = real((ctqce[ipo][0] + ctqce[ipo][2]) * uim) * sq2i; -// tqce[ipo][2] = real(ctqce[ipo][1]); -// c1 = tqcpe[ipo][0]; -// c2 = tqcpe[ipo][1]; -// c3 = tqcpe[ipo][2]; -// tqcpe[ipo][0] = (c1 - c3) * sq2i; -// tqcpe[ipo][1] = (c1 + c3) * (uim * sq2i); -// tqcpe[ipo][2] = c2; -// tqcs[ipo][0] = -sq2i * real(ctqcs[ipo][0] - ctqcs[ipo][2]); -// tqcs[ipo][1] = -sq2i * real((ctqcs[ipo][0] + ctqcs[ipo][2]) * uim); -// tqcs[ipo][2] = -1.0 * real(ctqcs[ipo][1]); -// c1 = tqcps[ipo][0]; -// c2 = tqcps[ipo][1]; -// c3 = tqcps[ipo][2]; -// tqcps[ipo][0] = -(c1 - c3) * sq2i; -// tqcps[ipo][1] = -(c1 + c3) * (uim * sq2i); -// tqcps[ipo][2] = -c2; -// } // ipo78 loop -// // Clean memory -// delete[] a[0]; -// delete[] ctqce[0]; -// delete[] ctqcs[0]; -// delete[] a; -// delete[] ctqce; -// delete[] ctqcs; -// } - void rftr( double *u, double *up, double *un, double *gapv, double extins, double scatts, double &rapr, double &cosav, double &fp, double &fn, double &fk, double &fx,