From e53392615f309de18e9191e15eb870019ac7b3b6 Mon Sep 17 00:00:00 2001 From: "Mulas, Giacomo" <gmulas@oa-cagliari.inaf.it> Date: Wed, 12 Jun 2024 12:19:57 +0200 Subject: [PATCH] Fix omp parallel loop in scr2() --- src/libnptm/clu_subs.cpp | 44 +++++++++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/src/libnptm/clu_subs.cpp b/src/libnptm/clu_subs.cpp index cf643b1a..0d7a26fd 100644 --- a/src/libnptm/clu_subs.cpp +++ b/src/libnptm/clu_subs.cpp @@ -2222,10 +2222,6 @@ void scr2( cph = uim * exri * vkarg; int ls = (c4->li < c4->le) ? c4->li : c4->le; int kmax = ls*(ls+2); - dcomplex tsas00 = cc0; - dcomplex tsas10 = cc0; - dcomplex tsas01 = cc0; - dcomplex tsas11 = cc0; dcomplex *vec_rmi = c1->rmi[0]; dcomplex *vec_rei = c1->rei[0]; dcomplex *vec_w = c1->w[0]; @@ -2233,7 +2229,7 @@ void scr2( #ifdef USE_NVTX nvtxRangePush("scr2 outer loop 1"); #endif - //#pragma omp parallel for reduction(+:tsas00, tsas10, tsas01, tsas11) +#pragma omp parallel for for (int i14 = 1; i14 <= c4->nsph; i14++) { int i = i14 - 1; int iogi = c1->iog[i14 - 1]; @@ -2290,27 +2286,43 @@ void scr2( vec_sas[vecindex+3] = s22 * csam; } // label 12 + // dcomplex phas = cexp(cph * (duk[0] * c1->rxx[i] + duk[1] * c1->ryy[i] + duk[2] * c1->rzz[i])); + // tsas00 += (c1->sas[iogi - 1][0][0] * phas); + // tsas10 += (c1->sas[iogi - 1][1][0] * phas); + // tsas01 += (c1->sas[iogi - 1][0][1] * phas); + // tsas11 += (c1->sas[iogi - 1][1][1] * phas); + } // i14 loop +#ifdef USE_NVTX + nvtxRangePop(); +#endif + dcomplex tsas00 = cc0; + dcomplex tsas10 = cc0; + dcomplex tsas01 = cc0; + dcomplex tsas11 = cc0; +#ifdef USE_NVTX + nvtxRangePush("scr2 loop 2"); +#endif + #pragma omp target teams distribute parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11) + for (int i14 = 1; i14 <= c4->nsph; i14++) { + int i = i14 - 1; + int iogi = c1->iog[i14 - 1]; + // label 12 dcomplex phas = cexp(cph * (duk[0] * c1->rxx[i] + duk[1] * c1->ryy[i] + duk[2] * c1->rzz[i])); tsas00 += (c1->sas[iogi - 1][0][0] * phas); tsas10 += (c1->sas[iogi - 1][1][0] * phas); tsas01 += (c1->sas[iogi - 1][0][1] * phas); tsas11 += (c1->sas[iogi - 1][1][1] * phas); } // i14 loop -#ifdef USE_NVTX - nvtxRangePop(); -#endif - - - - - c3->tsas[0][0] = tsas00; c3->tsas[1][0] = tsas10; c3->tsas[0][1] = tsas01; c3->tsas[1][1] = tsas11; +#ifdef USE_NVTX + nvtxRangePop(); +#endif dcomplex *vec_vints = c1->vints[0]; #ifdef USE_NVTX - nvtxRangePush("scr2 outer loop 2"); + nvtxRangePush("scr2 outer loop 3"); #endif #pragma omp parallel for for (int i24 = 1; i24 <= c4->nsph; i24++) { @@ -2318,7 +2330,7 @@ void scr2( if (iogi >= i24) { // int j = 0; #ifdef USE_NVTX - nvtxRangePush("scr2 inner loop 2"); + nvtxRangePush("scr2 inner loop 3"); #endif #pragma omp target teams distribute parallel for simd collapse(4) for (int ipo1 = 1; ipo1 <=2; ipo1++) { @@ -2341,7 +2353,7 @@ void scr2( #endif // int j = 0; #ifdef USE_NVTX - nvtxRangePush("scr2 loop 3"); + nvtxRangePush("scr2 loop 4"); #endif #pragma omp target parallel for collapse(4) for (int ipo1 = 1; ipo1 <=2; ipo1++) { -- GitLab