From 704e038dcff6995f34142c2eda5758abb7690c37 Mon Sep 17 00:00:00 2001
From: "Mulas, Giacomo" <gmulas@oa-cagliari.inaf.it>
Date: Tue, 10 Jun 2025 19:21:54 +0200
Subject: [PATCH] linearise loops in cfrfme.cpp, to prepare for omp target
 offload

---
 src/trapping/cfrfme.cpp | 124 ++++++++++++++++++++++++++--------------
 1 file changed, 80 insertions(+), 44 deletions(-)

diff --git a/src/trapping/cfrfme.cpp b/src/trapping/cfrfme.cpp
index 4c8010d..28c91cb 100644
--- a/src/trapping/cfrfme.cpp
+++ b/src/trapping/cfrfme.cpp
@@ -61,6 +61,14 @@
 #include <nvtx3/nvToolsExt.h>
 #endif
 
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#ifdef USE_TARGET_OFFLOAD
+#pragma omp requires unified_shared_memory
+#endif
+
 using namespace std;
 
 /*! \brief C++ implementation of FRFME
@@ -391,57 +399,84 @@ void frfme(string data_file, string output_path) {
 #ifdef USE_NVTX
 	  nvtxRangePush("j80 loop");
 #endif
-#pragma omp parallel for
-	  for (int j80 = jlmf; j80 <= jlml; j80++) {
-	    dcomplex *vec_w = new dcomplex[nkv * nkv]();
-	    dcomplex **w = new dcomplex*[nkv];
+	  dcomplex *vec_wsum = tfrfme->wsum[0];
+	  double *vec_vkzm = vkzm[0];
+#ifdef USE_TARGET_OFFLOAD
+#pragma omp target teams distribute parallel for simd
+#endif
+// #pragma omp parallel for
+	  for (int j80 = jlmf-1; j80 < jlml; j80++) {
+	    int nkvs = nkv * nkv;
+	    dcomplex *vec_w = (dcomplex *) calloc(nkvs, sizeof(dcomplex));
+	    dcomplex **w = (dcomplex **) calloc(nkv, sizeof(dcomplex *));
 	    // dcomplex *wk_local = new dcomplex[nlmmt]();
 	    for (int wi = 0; wi < nkv; wi++) w[wi] = vec_w + wi * nkv;
 	    dcomplex wk_value;
 	    int wk_index = 0;
-	    int nkvs = nkv * nkv;
-	    for (int jy50 = 0; jy50 < nkv; jy50++) {
-	      for (int jx50 = 0; jx50 < nkv; jx50++) {
+	    // for (int jy50 = 0; jy50 < nkv; jy50++) {
+	    //   for (int jx50 = 0; jx50 < nkv; jx50++) {
+// #ifdef USE_TARGET_OFFLOAD
+// #pragma omp target teams distribute parallel for simd
+// #endif
+// #pragma omp parallel for
+	    for (int jxy50 = 0; jxy50 < nkvs; jxy50++) {
 		// for (int wi = 0; wi < nlmmt; wi++) wk_local[wi] = tt1->wk[wk_index++];
-		// w[jx50][jy50] = wk_local[j80 - 1];
-		wk_value = tt1->wk[wk_index + j80 - 1];
-		wk_index += nlmmt;
-		w[jx50][jy50] = wk_value;
-	      } // jx50
-	    } // jy50 loop
+		// w[jx50][jy50] = wk_local[j80];
+		wk_index = nlmmt*jxy50;
+		wk_value = tt1->wk[wk_index + j80];
+		// wk_index += nlmmt;
+		int jy50 = jxy50 / nkv;
+		int jx50 = jxy50 % nkv;
+		vec_w[(nkv*jx50) + jy50] =  wk_value;
+		// w[jx50][jy50] = wk_value;
+	    } // jxy50 loop
+	    //   } // jx50
+	    // } // jy50 loop
 	    int ixyz = 0;
-	    for (int wj = 0; wj < nrvc; wj++) tfrfme->wsum[j80 - 1][wj] = cc0;
-	    for (int iz75 = 0; iz75 < nzv; iz75++) {
+	    for (int wj = 0; wj < nrvc; wj++) vec_wsum[(j80*nrvc)+wj] = cc0;
+	    int nvtot = nxv*nyv*nzv;
+	    int nvxy = nxv *nyv;
+// #ifdef USE_TARGET_OFFLOAD
+// #pragma omp target teams distribute parallel for
+// #endif
+// #pragma omp parallel for
+	    for (int ixyz = 0; ixyz < nvtot; ixyz++) {
+	      int iz75 = ixyz / nvxy;
+	      int iy70 = (ixyz % nvxy) / nxv;
+	      int ix65 = ixyz % nxv;
+	    // for (int iz75 = 0; iz75 < nzv; iz75++) {
 	      double z = _zv[iz75] + frsh;
-	      for (int iy70 = 0; iy70 < nyv; iy70++) {
-		double y = _yv[iy70];
-		for (int ix65 = 0; ix65 < nxv; ix65++) {
-		  double x = _xv[ix65];
-		  ixyz++;
-		  dcomplex sumy = cc0;
-		  for (int jy60 = 0; jy60 < nkv; jy60++) {
-		    double vky = vkv[jy60];
-		    double vkx = vkv[nkv - 1];
-		    double vkzf = vkzm[0][jy60];
-		    dcomplex phasf = cexp(uim * (-vkx * x + vky * y + vkzf * z));
-		    double vkzl = vkzm[nkv - 1][jy60];
-		    dcomplex phasl = cexp(uim * (vkx * x + vky * y + vkzl * z));
-		    dcomplex sumx = 0.5 * (w[0][jy60] * phasf + w[nkv - 1][jy60] * phasl);
-		    for (int jx55 = 2; jx55 <= nks; jx55++) {
-		      vkx = vkv[jx55 - 1];
-		      double vkz = vkzm[jx55 - 1][jy60];
-		      dcomplex phas = cexp(uim * (vkx * x + vky * y + vkz * z));
-		      sumx += (w[jx55 - 1][jy60] * phas);
-		    } // jx55 loop
-		    if (jy60 == 0 || jy60 == nkv - 1) sumx *= 0.5;
-		    sumy += sumx;
-		  } // jy60 loop
-		  tfrfme->wsum[j80 - 1][ixyz - 1] = sumy * delks;
-		} // ix65 loop
-	      } // iy70 loop
-	    } // iz75 loop
-	    delete[] vec_w;
-	    delete[] w;
+	      // for (int iy70 = 0; iy70 < nyv; iy70++) {
+	      double y = _yv[iy70];
+		// for (int ix65 = 0; ix65 < nxv; ix65++) {
+	      double x = _xv[ix65];
+		  // ixyz++;
+	      dcomplex sumy = cc0;
+// #ifdef USE_TARGET_OFFLOAD
+// #pragma omp target parallel for simd reduction(+:sumy)
+// #endif
+	      for (int jy60x55 = 0; jy60x55 < nkvs ; jy60x55++) {
+		int jy60 = jy60x55 / nkv;
+		int jx55 = jy60x55 % nkv;
+		double vky = vkv[jy60];
+		double vkx = (jx55==0) ? vkv[nkv - 1] : vkv[jx55];
+		double vkz = vec_vkzm[(jx55*nkv)+jy60];
+		dcomplex phas = (jx55==0) ?
+		  cexp(uim * (-vkx * x + vky * y + vkz * z)):
+		  cexp(uim * (vkx * x + vky * y + vkz * z));
+		dcomplex sumx = vec_w[(jx55*nkv)+jy60] * phas;
+		double factor1 = ((jx55==0)||(jx55==(nkv-1))) ? 0.5 : 1.0;
+		double factor2 = ((jy60==0)||(jy60==(nkv-1))) ? 0.5 : 1.0;
+		sumx *= factor1*factor2;
+		sumy += sumx;
+	      } // jy60x55 loop
+	      vec_wsum[((j80)*nrvc)+ixyz] = sumy * delks;
+	    // 	} // ix65 loop
+	    //   } // iy70 loop
+	    // } // iz75 loop
+	    } // ixyz loop
+	    free(vec_w);
+	    free(w);
 	    // delete[] wk_local;
 	  } // j80 loop
 #ifdef USE_NVTX
@@ -499,3 +534,4 @@ void frfme(string data_file, string output_path) {
   nvtxRangePop();
 #endif
 }
+
-- 
GitLab