From e53392615f309de18e9191e15eb870019ac7b3b6 Mon Sep 17 00:00:00 2001
From: "Mulas, Giacomo" <gmulas@oa-cagliari.inaf.it>
Date: Wed, 12 Jun 2024 12:19:57 +0200
Subject: [PATCH] Fix omp parallel loop in scr2()

---
 src/libnptm/clu_subs.cpp | 44 +++++++++++++++++++++++++---------------
 1 file changed, 28 insertions(+), 16 deletions(-)

diff --git a/src/libnptm/clu_subs.cpp b/src/libnptm/clu_subs.cpp
index cf643b1a..0d7a26fd 100644
--- a/src/libnptm/clu_subs.cpp
+++ b/src/libnptm/clu_subs.cpp
@@ -2222,10 +2222,6 @@ void scr2(
   cph = uim * exri * vkarg;
   int ls = (c4->li < c4->le) ? c4->li : c4->le;
   int kmax = ls*(ls+2);
-  dcomplex tsas00 = cc0;
-  dcomplex tsas10 = cc0;
-  dcomplex tsas01 = cc0;
-  dcomplex tsas11 = cc0;
   dcomplex *vec_rmi = c1->rmi[0];
   dcomplex *vec_rei = c1->rei[0];
   dcomplex *vec_w = c1->w[0];
@@ -2233,7 +2229,7 @@ void scr2(
 #ifdef USE_NVTX
   nvtxRangePush("scr2 outer loop 1");
 #endif
-  //#pragma omp parallel for reduction(+:tsas00, tsas10, tsas01, tsas11)
+#pragma omp parallel for
   for (int i14 = 1; i14 <= c4->nsph; i14++) {
     int i = i14 - 1;
     int iogi = c1->iog[i14 - 1];
@@ -2290,27 +2286,43 @@ void scr2(
       vec_sas[vecindex+3] = s22 * csam;
     }
     // label 12
+    // dcomplex phas = cexp(cph * (duk[0] * c1->rxx[i] + duk[1] * c1->ryy[i] + duk[2] * c1->rzz[i]));
+    // tsas00 += (c1->sas[iogi - 1][0][0] * phas);
+    // tsas10 += (c1->sas[iogi - 1][1][0] * phas);
+    // tsas01 += (c1->sas[iogi - 1][0][1] * phas);
+    // tsas11 += (c1->sas[iogi - 1][1][1] * phas);
+  } // i14 loop
+#ifdef USE_NVTX
+  nvtxRangePop();
+#endif
+  dcomplex tsas00 = cc0;
+  dcomplex tsas10 = cc0;
+  dcomplex tsas01 = cc0;
+  dcomplex tsas11 = cc0;
+#ifdef USE_NVTX
+  nvtxRangePush("scr2 loop 2");
+#endif
+  #pragma omp target teams distribute parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11)
+  for (int i14 = 1; i14 <= c4->nsph; i14++) {
+    int i = i14 - 1;
+    int iogi = c1->iog[i14 - 1];
+    // label 12
     dcomplex phas = cexp(cph * (duk[0] * c1->rxx[i] + duk[1] * c1->ryy[i] + duk[2] * c1->rzz[i]));
     tsas00 += (c1->sas[iogi - 1][0][0] * phas);
     tsas10 += (c1->sas[iogi - 1][1][0] * phas);
     tsas01 += (c1->sas[iogi - 1][0][1] * phas);
     tsas11 += (c1->sas[iogi - 1][1][1] * phas);
   } // i14 loop
-#ifdef USE_NVTX
-  nvtxRangePop();
-#endif
-
-  
-
-
-
   c3->tsas[0][0] = tsas00;
   c3->tsas[1][0] = tsas10;
   c3->tsas[0][1] = tsas01;
   c3->tsas[1][1] = tsas11;
+#ifdef USE_NVTX
+  nvtxRangePop();
+#endif
   dcomplex *vec_vints = c1->vints[0];
 #ifdef USE_NVTX
-  nvtxRangePush("scr2 outer loop 2");
+  nvtxRangePush("scr2 outer loop 3");
 #endif
 #pragma omp parallel for
   for (int i24 = 1; i24 <= c4->nsph; i24++) {
@@ -2318,7 +2330,7 @@ void scr2(
     if (iogi >= i24) {
       // int j = 0;
 #ifdef USE_NVTX
-  nvtxRangePush("scr2 inner loop 2");
+  nvtxRangePush("scr2 inner loop 3");
 #endif
 #pragma omp target teams distribute parallel for simd collapse(4)
       for (int ipo1 = 1; ipo1 <=2; ipo1++) {
@@ -2341,7 +2353,7 @@ void scr2(
 #endif
   // int j = 0;
 #ifdef USE_NVTX
-  nvtxRangePush("scr2 loop 3");
+  nvtxRangePush("scr2 loop 4");
 #endif
 #pragma omp target parallel for collapse(4)
   for (int ipo1 = 1; ipo1 <=2; ipo1++) {
-- 
GitLab