diff --git a/src/libnptm/clu_subs.cpp b/src/libnptm/clu_subs.cpp index cb2757eae471741fa28c1ecda123abac1dfe29b6..d83384a7441d61e0e3d23ab02506985be1009709 100644 --- a/src/libnptm/clu_subs.cpp +++ b/src/libnptm/clu_subs.cpp @@ -1293,6 +1293,9 @@ void mextc(double vk, double exri, dcomplex **fsac, double **cextlr, double **ce } void pcros(double vk, double exri, C1 *c1, C1_AddOns *c1ao, C4 *c4) { +#ifdef USE_NVTX + nvtxRangePush("whole pcros"); +#endif const dcomplex cc0 = 0.0 + 0.0 * I; dcomplex sump, sum1, sum2, sum3, sum4, am, amp, cc, csam; const double exdc = exri * exri; @@ -1303,8 +1306,13 @@ void pcros(double vk, double exri, C1 *c1, C1_AddOns *c1ao, C4 *c4) { double cfsq = 4.0 / (pi4sq *ccs * ccs); const int nlemt = c4->nlem + c4->nlem; int jpo = 2; - for (int ipo18 = 1; ipo18 <= 2; ipo18++) { - if (ipo18 == 2) jpo = 1; + dcomplex *vec_am0m = c1ao->am0m[0]; + dcomplex *vec_w = c1->w[0]; +#ifdef USE_NVTX + nvtxRangePush("pcros outer loop 1"); +#endif + for (int ipo18 = 0; ipo18 < 2; ipo18++) { + int jpo = 1-ipo18; int ipopt = ipo18 + 2; int jpopt = jpo + 2; double sum = 0.0; @@ -1313,32 +1321,44 @@ void pcros(double vk, double exri, C1 *c1, C1_AddOns *c1ao, C4 *c4) { sum2 = cc0; sum3 = cc0; sum4 = cc0; - for (int i12 = 1; i12 <= nlemt; i12++) { - int i = i12 - 1; - am = cc0; - amp = cc0; - for (int j10 = 1; j10 <= nlemt; j10++) { - int j = j10 - 1; - am += (c1ao->am0m[i][j] * c1->w[j][ipo18 - 1]); - amp += (c1ao->am0m[i][j] * c1->w[j][jpo - 1]); +#ifdef USE_NVTX + nvtxRangePush("pcros intermediate loop 1"); +#endif +#pragma omp target teams distribute parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4) + for (int i12 = 0; i12 < nlemt; i12++) { + // int i = i12 - 1; + dcomplex am = cc0; + dcomplex amp = cc0; + //#pragma omp target teams distribute parallel for simd reduction(+:am,amp) + for (int j10 = 0; j10 < nlemt; j10++) { + // int j = j10 - 1; + am += (vec_am0m[nlemt*i12+j10] * vec_w[4*j10+ipo18]); + amp += (vec_am0m[nlemt*i12+j10] * vec_w[4*j10+jpo]); } // j10 loop sum += real(dconjg(am) * am); sump += (dconjg(amp) * am); - sum1 += (dconjg(c1->w[i][ipo18 - 1]) * am); - sum2 += (dconjg(c1->w[i][jpo - 1]) * am); - sum3 += (c1->w[i][ipopt - 1] * am); - sum4 += (c1->w[i][jpopt - 1] * am); + sum1 += (dconjg(vec_w[4*i12+ipo18]) * am); + sum2 += (dconjg(vec_w[4*i12+jpo]) * am); + sum3 += (vec_w[4*i12+ipopt] * am); + sum4 += (vec_w[4*i12+jpopt] * am); } // i12 loop - c1ao->scsc[ipo18 - 1] = cccs * sum; - c1ao->scscp[ipo18 - 1] = cccs * sump; - c1ao->ecsc[ipo18 - 1] = -cccs * real(sum1); - c1ao->ecscp[ipo18 - 1] = -cccs * sum2; - c1ao->fsac[ipo18 - 1][ipo18 - 1] = csam * sum1; - c1ao->fsac[jpo - 1][ipo18 - 1] = csam * sum2; - c1ao->sac[ipo18 - 1][ipo18 - 1] = csam * sum3; - c1ao->sac[jpo - 1][ipo18 - 1] = csam * sum4; +#ifdef USE_NVTX + nvtxRangePop(); +#endif + c1ao->scsc[ipo18] = cccs * sum; + c1ao->scscp[ipo18] = cccs * sump; + c1ao->ecsc[ipo18] = -cccs * real(sum1); + c1ao->ecscp[ipo18] = -cccs * sum2; + c1ao->fsac[ipo18][ipo18] = csam * sum1; + c1ao->fsac[jpo][ipo18] = csam * sum2; + c1ao->sac[ipo18][ipo18] = csam * sum3; + c1ao->sac[jpo][ipo18] = csam * sum4; } // ipo18 loop int i = 0; + dcomplex * &vint = c1->vint; +#ifdef USE_NVTX + nvtxRangePush("pcros loop 2"); +#endif for (int ipo1 = 1; ipo1 <= 2; ipo1++) { for (int jpo1 = 1; jpo1 <= 2; jpo1++) { cc = dconjg(c1ao->sac[jpo1 - 1][ipo1 - 1]); @@ -1349,38 +1369,47 @@ void pcros(double vk, double exri, C1 *c1, C1_AddOns *c1ao, C4 *c4) { } // ipo2 loop } // jpo1 loop } // ipo1 loop +#ifdef USE_NVTX + nvtxRangePop(); +#endif +#ifdef USE_NVTX + nvtxRangePop(); +#endif } void pcrsm0(double vk, double exri, int inpol, C1 *c1, C1_AddOns *c1ao, C4 *c4) { const dcomplex cc0 = 0.0 + 0.0 * I; const dcomplex uim = 0.0 + 1.0 * I; - dcomplex sum1, sum2, sum3, sum4, sumpd; + dcomplex sum1, sum2, sum3, sum4; dcomplex sums1, sums2, sums3, sums4, csam; double exdc = exri * exri; double ccs = 4.0 * acos(0.0) / (vk * vk); double cccs = ccs / exdc; + int nlemt = c4->nlem + c4->nlem; + dcomplex *vec_am0m = c1ao->am0m[0]; csam = -(ccs / (exri * vk)) * 0.5 * I; sum2 = cc0; sum3 = cc0; - for (int i14 = 1; i14 <= c4->nlem; i14++) { // GPU portable? +#pragma omp target teams distribute parallel for simd reduction(+:sum2,sum3) + for (int i14 = 0; i14 < c4->nlem; i14++) { int ie = i14 + c4->nlem; - sum2 += (c1ao->am0m[i14 - 1][i14 - 1] + c1ao->am0m[ie - 1][ie - 1]); - sum3 += (c1ao->am0m[i14 - 1][ie - 1] + c1ao->am0m[ie - 1][i14 - 1]); + sum2 += (vec_am0m[nlemt*i14 + i14] + vec_am0m[nlemt*ie + ie]); + sum3 += (vec_am0m[nlemt*i14 + ie] + vec_am0m[nlemt*ie + i14]); } // i14 loop double sumpi = 0.0; - sumpd = cc0; - int nlemt = c4->nlem + c4->nlem; - for (int i16 = 1; i16 <= nlemt; i16++) { - for (int j16 = 1; j16 <= c4->nlem; j16++) { + dcomplex sumpd = cc0; +#pragma omp target teams distribute parallel for simd collapse(2) reduction(+:sumpi,sumpd) + for (int i16 = 0; i16 < nlemt; i16++) { + for (int j16 = 0; j16 < c4->nlem; j16++) { int je = j16 + c4->nlem; double rvalue = real( - dconjg(c1ao->am0m[i16 - 1][j16 - 1]) * c1ao->am0m[i16 - 1][j16 - 1] - + dconjg(c1ao->am0m[i16 - 1][je - 1]) * c1ao->am0m[i16 - 1][je - 1] + dconjg(vec_am0m[nlemt*i16 + j16]) * vec_am0m[nlemt*i16 + j16] + + dconjg(vec_am0m[nlemt*i16 + je]) * vec_am0m[nlemt*i16 + je] ); sumpi += rvalue; sumpd += ( - dconjg(c1ao->am0m[i16 - 1][j16 - 1]) * c1ao->am0m[i16 - 1][je - 1] - + dconjg(c1ao->am0m[i16 - 1][je - 1]) * c1ao->am0m[i16 - 1][j16 - 1] + dconjg(vec_am0m[nlemt*i16 + j16]) * vec_am0m[nlemt*i16 + je] + + dconjg(vec_am0m[nlemt*i16 + je]) * vec_am0m[nlemt*i16 + j16] ); } // j16 loop } // i16 loop