diff --git a/src/libnptm/clu_subs.cpp b/src/libnptm/clu_subs.cpp index 0d7a26fdc0d224513bc9573c5908957b2b4936d8..8733d94180a79326923fc87418a91fc9d9226603 100644 --- a/src/libnptm/clu_subs.cpp +++ b/src/libnptm/clu_subs.cpp @@ -2146,24 +2146,19 @@ void scr0(double vk, double exri, C1 *c1, C1_AddOns *c1ao, C3 *c3, C4 * c4) { double cccs = ccs / exdc; dcomplex csam = -(ccs / (exri * vk)) * 0.5 * I; //double scs = 0.0, ecs = 0.0, acs = 0.0; - double scs = 0.0; - double ecs = 0.0; - double acs = 0.0; - dcomplex tfsas = cc0; dcomplex *vec_rmi = c1->rmi[0]; dcomplex *vec_rei = c1->rei[0]; #ifdef USE_NVTX - nvtxRangePush("scr0 outer loop"); + nvtxRangePush("scr0 outer loop 1"); #endif - - //#pragma omp parallel for reduction(+:scs, ecs, acs, tfsas) +#pragma omp parallel for for (int i14 = 1; i14 <= c4->nsph; i14++) { int iogi = c1->iog[i14 - 1]; if (iogi >= i14) { double sums = 0.0; dcomplex sum21 = cc0; #ifdef USE_NVTX - nvtxRangePush("scr0 inner loop"); + nvtxRangePush("scr0 inner loop 1"); #endif #pragma omp target teams distribute parallel for simd reduction(+:sums, sum21) for (int l10 = 1; l10 <= c4->li; l10++) { @@ -2177,6 +2172,9 @@ void scr0(double vk, double exri, C1 *c1, C1_AddOns *c1ao, C3 *c3, C4 * c4) { sums += rvalue; sum21 += ((rm + re) * fl); } // l10 loop +#ifdef USE_NVTX + nvtxRangePop(); +#endif sum21 *= -1.0; double scasec = cccs * sums; double extsec = -cccs * real(sum21); @@ -2191,11 +2189,29 @@ void scr0(double vk, double exri, C1 *c1, C1_AddOns *c1ao, C3 *c3, C4 * c4) { c1->fsas[i14 - 1] = sum21 * csam; } // label 12 + // scs += c1->sscs[iogi - 1]; + // ecs += c1->sexs[iogi - 1]; + // acs += c1->sabs[iogi - 1]; + // tfsas += c1->fsas[iogi - 1]; + } // i14 loop +#ifdef USE_NVTX + nvtxRangePop(); +#endif + double scs = 0.0; + double ecs = 0.0; + double acs = 0.0; + dcomplex tfsas = cc0; +#ifdef USE_NVTX + nvtxRangePush("scr0 loop 2"); +#endif +#pragma omp target teams distribute parallel for simd reduction(+:scs, ecs, acs, tfsas) + for (int i14 = 1; i14 <= c4->nsph; i14++) { + int iogi = c1->iog[i14 - 1]; scs += c1->sscs[iogi - 1]; ecs += c1->sexs[iogi - 1]; acs += c1->sabs[iogi - 1]; tfsas += c1->fsas[iogi - 1]; - } // i14 loop + } c3->scs = scs; c3->ecs = ecs; c3->acs = acs;