diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 39e36dad06329deff757d046ef360ef211c575e1..a77e21b6db9cb73ffab0a0709391b0f9849ef7b6 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -78,13 +78,13 @@ compatibility_stage:
     - CXX=g++-14 FC=gfortran-14 ./configure
     - make wipe
     - make -j
-    - echo "Running make with refinement with gnu compilers version 14..."
+    - echo "Running make with gnu compilers version 14..."
     - cd ..
     - rm -rf build_gnu14
     - mkdir build_gnu14_refine
     - cd build_gnu14_refine
     - cp -r ../build/* .
-    - CXX=g++-14 FC=gfortran-14 ./configure --enable-refinement
+    - CXX=g++-14 FC=gfortran-14 ./configure
     - make wipe
     - make -j
     #- echo "Running make with flang version 16 and clang version 16..."
@@ -173,7 +173,7 @@ building_stage:
     - cat /etc/os-release
     - cd build
     - echo "Configuring with default compilers (MAGMA disabled)..."
-    - ./configure --without-magma --without-cublas --disable-offload --enable-refinement --enable-shared
+    - ./configure --without-magma --without-cublas --disable-offload --enable-shared
     - make wipe
     - echo "Building the default configuration..."
     - make -j
diff --git a/build/configure.sh b/build/configure.sh
index f38981c6661323397f698177066d76e14c2f0902..5f2a7b6e6bf82a3889d01d92ce9c0d973a213727 100755
--- a/build/configure.sh
+++ b/build/configure.sh
@@ -814,7 +814,7 @@ EOF
     fi
     if [ "x$result" = "x0" ]; then
         echo "yes"
-        OFFLOADFLAGS="-DUSE_TARGET_OFFLOAD -fcf-protection=check -foffload=nvptx-none=\"-O${CXX_OPT}${CXX_DBG} -fcf-protection=check -fopt-info -lm -latomic -lgomp\""
+        OFFLOADFLAGS=" -DUSE_TARGET_OFFLOAD -fcf-protection=check -foffload=nvptx-none=\"-O${CXX_OPT}${CXX_DBG} -fcf-protection=check -fopt-info -lm -latomic -lgomp\""
         if [ "x${OMPFLAGS}" = "x" ]; then
             OFFLOADFLAGS="${OFFLOADFLAGS} -fopenmp"
         fi
diff --git a/src/libnptm/clu_subs.cpp b/src/libnptm/clu_subs.cpp
index 779c2ecd4afdfe2eba45809b9ff4e80e43883b2e..7bd73b106914ffd80ef5ab02a57f5cde17b6c938 100644
--- a/src/libnptm/clu_subs.cpp
+++ b/src/libnptm/clu_subs.cpp
@@ -1341,6 +1341,8 @@ void pcros(double vk, double exri, ParticleDescriptor *c1) {
 #endif
 #ifdef USE_TARGET_OFFLOAD
 #pragma omp target teams distribute parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4)
+#else
+#pragma omp parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4)
 #endif
   for (int i12 = 0; i12 < nlemt; i12++) {
     // int i = i12 - 1;
@@ -1408,6 +1410,8 @@ void pcrsm0(double vk, double exri, int inpol, ParticleDescriptor *c1) {
   sum3 = cc0;
 #ifdef USE_TARGET_OFFLOAD
 #pragma omp target teams distribute parallel for simd reduction(+:sum2,sum3)
+#else
+#pragma omp parallel for simd reduction(+:sum2,sum3)
 #endif
   for (int i14 = 0; i14 < c1->nlem; i14++) {
     int ie = i14 + c1->nlem;
@@ -1418,6 +1422,8 @@ void pcrsm0(double vk, double exri, int inpol, ParticleDescriptor *c1) {
   dcomplex sumpd = cc0;
 #ifdef USE_TARGET_OFFLOAD
 #pragma omp target teams distribute parallel for simd collapse(2) reduction(+:sumpi,sumpd)
+#else
+#pragma omp parallel for simd collapse(2) reduction(+:sumpi,sumpd)
 #endif
   for (int i16 = 0; i16 < nlemt; i16++) {
     for (int j16 = 0; j16 < c1->nlem; j16++) {
@@ -2001,6 +2007,8 @@ void raba(
 #endif
 #ifdef USE_TARGET_OFFLOAD
 #pragma omp target teams distribute parallel for simd reduction(+:c1, c2)
+#else
+#pragma omp parallel for simd reduction(+:c1, c2)
 #endif
   for (int j10 = 1; j10 <= nlemt; j10++) {
     int j = j10 - 1;
@@ -2021,6 +2029,8 @@ void raba(
 #endif
 #ifdef USE_TARGET_OFFLOAD
 #pragma omp teams distribute parallel for
+#else
+#pragma omp parallel for
 #endif
   for (int ipo = 0; ipo < 2; ipo++) {
     int jpo = 1 - ipo;
@@ -2051,12 +2061,14 @@ void raba(
   int kmax = le*(le+2);
   // for efficiency I should also linearise array w, but I cannot easily since I do not know for sure its major dimension (changes to containing class needed)
 #ifdef USE_NVTX
-  nvtxRangePush("raba inner loop 2");
+  nvtxRangePush("raba inner loop 2");
 #endif
 #ifdef USE_TARGET_OFFLOAD
 #pragma omp target teams distribute parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2)
+#else
+#pragma omp parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2)
 #endif
-  for (int k = 1; k<=kmax; k++) {
+  for (int k = 1; k<=kmax; k++) {
     int l60 = (int) sqrt(k+1);
     int im60 = k - (l60*l60) + 1;
     if (im60 == 0) {
@@ -2079,44 +2091,44 @@ void raba(
       dcomplex acwp;
       dcomplex aca;
      dcomplex acap;
-      if (mmmmu <= l60) {
-        int immu = mmmu + il - 1;
-        int immue = immu + nlem;
-        rmu = -sqrt(1.0 * (l60 + mmmu) * (l60 - m)) * sq2i;
-        acw = dconjg(vec_a[2*i+ipo]) * vec_w[4*immu+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*immue+ipo];
-        acwp = dconjg(vec_a[2*i+ipo]) * vec_w[4*immu+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*immue+jpo];
-        aca = dconjg(vec_a[2*i+ipo]) * vec_a[2*immu+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*immue+ipo];
-        acap = dconjg(vec_a[2*i+ipo]) * vec_a[2*immu+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*immue+jpo];
-        ctqce0 += (acw * rmu);
-        tqcpe0 += (acwp * rmu);
-        ctqcs0 += (aca * rmu);
-        tqcps0 += (acap * rmu);
-      }
-      // label 30
-      rmu = -1.0 * m;
-      acw = dconjg(vec_a[2*i+ipo]) * vec_w[4*i+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*ie+ipo];
-      acwp = dconjg(vec_a[2*i+ipo]) * vec_w[4*i+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*ie+jpo];
-      aca = dconjg(vec_a[2*i+ipo]) * vec_a[2*i+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*ie+ipo];
-      acap = dconjg(vec_a[2*i+ipo]) * vec_a[2*i+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*ie+jpo];
-      ctqce1 += (acw * rmu);
-      tqcpe1 += (acwp * rmu);
-      ctqcs1 += (aca * rmu);
-      tqcps1 += (acap * rmu);
-      mmmu = m - 1;
-      mmmmu = (mmmu > 0) ? mmmu : -mmmu;
-      if (mmmmu <= l60) {
-        int immu = mmmu + il - 1;
-        int immue = immu + nlem;
-        rmu = sqrt(1.0 * (l60 - mmmu) * (l60 + m)) * sq2i;
-        acw = dconjg(vec_a[2*i+ipo]) * vec_w[4*immu+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*immue+ipo];
-        acwp = dconjg(vec_a[2*i+ipo]) * vec_w[4*immu+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*immue+jpo];
-        aca = dconjg(vec_a[2*i+ipo]) * vec_a[2*immu+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*immue+ipo];
-        acap = dconjg(vec_a[2*i+ipo]) * vec_a[2*immu+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*immue+jpo];
-        ctqce2 += (acw * rmu);
-        tqcpe2 += (acwp * rmu);
-        ctqcs2 += (aca * rmu);
-        tqcps2 += (acap * rmu);
-      } // ends if clause
+      if (mmmmu <= l60) {
+        int immu = mmmu + il - 1;
+        int immue = immu + nlem;
+        rmu = -sqrt(1.0 * (l60 + mmmu) * (l60 - m)) * sq2i;
+        acw = dconjg(vec_a[2*i+ipo]) * vec_w[4*immu+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*immue+ipo];
+        acwp = dconjg(vec_a[2*i+ipo]) * vec_w[4*immu+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*immue+jpo];
+        aca = dconjg(vec_a[2*i+ipo]) * vec_a[2*immu+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*immue+ipo];
+        acap = dconjg(vec_a[2*i+ipo]) * vec_a[2*immu+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*immue+jpo];
+        ctqce0 += (acw * rmu);
+        tqcpe0 += (acwp * rmu);
+        ctqcs0 += (aca * rmu);
+        tqcps0 += (acap * rmu);
+      }
+      // label 30
+      rmu = -1.0 * m;
+      acw = dconjg(vec_a[2*i+ipo]) * vec_w[4*i+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*ie+ipo];
+      acwp = dconjg(vec_a[2*i+ipo]) * vec_w[4*i+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*ie+jpo];
+      aca = dconjg(vec_a[2*i+ipo]) * vec_a[2*i+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*ie+ipo];
+      acap = dconjg(vec_a[2*i+ipo]) * vec_a[2*i+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*ie+jpo];
+      ctqce1 += (acw * rmu);
+      tqcpe1 += (acwp * rmu);
+      ctqcs1 += (aca * rmu);
+      tqcps1 += (acap * rmu);
+      mmmu = m - 1;
+      mmmmu = (mmmu > 0) ? mmmu : -mmmu;
+      if (mmmmu <= l60) {
+        int immu = mmmu + il - 1;
+        int immue = immu + nlem;
+        rmu = sqrt(1.0 * (l60 - mmmu) * (l60 + m)) * sq2i;
+        acw = dconjg(vec_a[2*i+ipo]) * vec_w[4*immu+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*immue+ipo];
+        acwp = dconjg(vec_a[2*i+ipo]) * vec_w[4*immu+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*immue+jpo];
+        aca = dconjg(vec_a[2*i+ipo]) * vec_a[2*immu+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*immue+ipo];
+        acap = dconjg(vec_a[2*i+ipo]) * vec_a[2*immu+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*immue+jpo];
+        ctqce2 += (acw * rmu);
+        tqcpe2 += (acwp * rmu);
+        ctqcs2 += (aca * rmu);
+        tqcps2 += (acap * rmu);
+      } // ends if clause
   } // k loop (previously the l60 and im60 loops
 #ifdef USE_NVTX
   nvtxRangePop();
 #endif
@@ -2129,7 +2141,9 @@ void raba(
   nvtxRangePush("raba loop 3");
 #endif
 #ifdef USE_TARGET_OFFLOAD
-#pragma omp teams distribute parallel for simd
+#pragma omp target teams distribute parallel for simd
+#else
+#pragma omp parallel for simd
 #endif
   for (int ipo78 = 1; ipo78 <= 2; ipo78++) {
     int ipo = ipo78 - 1;
@@ -2202,6 +2216,8 @@ void scr0(double vk, double exri, ParticleDescriptor *c1) {
 #endif
 #ifdef USE_TARGET_OFFLOAD
 #pragma omp target teams distribute parallel for simd reduction(+:sums, sum21)
+#else
+#pragma omp parallel for simd reduction(+:sums, sum21)
 #endif
   for (int l10 = 1; l10 <= c1->li; l10++) {
     double fl = 1.0 * (l10 + l10 + 1);
@@ -2248,6 +2264,8 @@ void scr0(double vk, double exri, ParticleDescriptor *c1) {
 #endif
 #ifdef USE_TARGET_OFFLOAD
 #pragma omp target teams distribute parallel for simd reduction(+:scs, ecs, acs, tfsas)
+#else
+#pragma omp parallel for simd reduction(+:scs, ecs, acs, tfsas)
 #endif
   for (int i14 = 1; i14 <= c1->nsph; i14++) {
     int iogi = c1->iog[i14 - 1];
@@ -2312,6 +2330,8 @@ void scr2(
 #endif
 #ifdef USE_TARGET_OFFLOAD
 #pragma omp target teams distribute parallel for simd reduction(-:s11, s21, s12, s22)
+#else
+#pragma omp parallel for simd reduction(-:s11, s21, s12, s22)
 #endif
   for (int k = 1; k<=kmax; k++) {
     int l10 = (int) sqrt(k+1);
@@ -2366,6 +2386,8 @@ void scr2(
 #endif
 #ifdef USE_TARGET_OFFLOAD
 #pragma omp target teams distribute parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11)
+#else
+#pragma omp parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11)
 #endif
   for (int i14 = 1; i14 <= c1->nsph; i14++) {
     int i = i14 - 1;
@@ -2398,6 +2420,8 @@ void scr2(
 #endif
 #ifdef USE_TARGET_OFFLOAD
 #pragma omp target teams distribute parallel for simd collapse(4)
+#else
+#pragma omp parallel for simd collapse(4)
 #endif
   for (int ipo1 = 1; ipo1 <=2; ipo1++) {
     for (int jpo1 = 1; jpo1 <= 2; jpo1++) {
@@ -2422,7 +2446,9 @@ void scr2(
   nvtxRangePush("scr2 loop 4");
 #endif
 #ifdef USE_TARGET_OFFLOAD
-#pragma omp target parallel for collapse(4)
+#pragma omp target teams distribute parallel for collapse(4)
+#else
+#pragma omp parallel for collapse(4)
 #endif
   for (int ipo1 = 1; ipo1 <=2; ipo1++) {
     for (int jpo1 = 1; jpo1 <= 2; jpo1++) {
@@ -2534,7 +2560,7 @@ void tqr(
 }
 
 void ztm(dcomplex **am, ParticleDescriptor *c1) {
-  dcomplex gie, gle, a1, a2, a3, a4, sum1, sum2, sum3, sum4;
+  // dcomplex gie, gle, a1, a2, a3, a4, sum1, sum2, sum3, sum4;
   const dcomplex cc0 = 0.0 + 0.0 * I;
   // int i2 = 0; // old implementation
 #ifdef USE_NVTX
@@ -2546,7 +2572,6 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) {
   // C9 *c9_para = new C9(*c9);
   dcomplex *gis_v = c1->gis[0];
   dcomplex *gls_v = c1->gls[0];
-  double *rac3j_local = (double *) malloc(c1->lmtpo*sizeof(double));
   int k2max = c1->li*(c1->li+2);
   int k3max = c1->le*(c1->le+2);
   // To parallelise, I run a linearised loop directly over k
@@ -2559,6 +2584,8 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) {
   // im = im -(2*l+1) and l = l+1 (there was a rounding error in a nearly exact root)
 #ifdef USE_TARGET_OFFLOAD
 #pragma omp target teams distribute parallel for simd collapse(3)
+#else
+#pragma omp parallel for simd collapse(3)
 #endif
   for (int n2 = 1; n2 <= c1->nsph; n2++) { // GPU portable?
     for (int k2 = 1; k2<=k2max; k2++) {
@@ -2590,12 +2617,13 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) {
         int i3 = l3 * l3 + im3 - 1;
         int m3 = -l3 - 1 + im3;
         int vecindex = (i2 - 1) * c1->nlem + i3 - 1;
+        double *rac3j_local = (double *) malloc(c1->lmtpo*sizeof(double));
         gis_v[vecindex] = ghit_d(2, 0, n2, l2, m2, l3, m3, c1, rac3j_local);
         gls_v[vecindex] = ghit_d(2, 1, n2, l2, m2, l3, m3, c1, rac3j_local);
+        free(rac3j_local);
       } // close k3 loop, former l3 + im3 loops
     } // close k2 loop, former l2 + im2 loops
   } // close n2 loop
-  free(rac3j_local);
 #ifdef USE_NVTX
   nvtxRangePop();
 #endif
@@ -2606,6 +2634,8 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) {
   dcomplex *sam_v = c1->sam[0];
 #ifdef USE_TARGET_OFFLOAD
 #pragma omp target teams distribute parallel for simd collapse(2)
+#else
+#pragma omp parallel for simd collapse(2)
 #endif
   for (int i1 = 1; i1 <= c1->ndi; i1++) { // GPU portable?
     for (int i3 = 1; i3 <= c1->nlem; i3++) {
@@ -2615,7 +2645,6 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) {
       dcomplex sum4 = cc0;
       int i1e = i1 + c1->ndi;
      int i3e = i3 + c1->nlem;
-#pragma parallel for simd reduction(+:sum1,sum2,sum3,sum4)
       for (int i2 = 1; i2 <= c1->ndi; i2++) {
         int i2e = i2 + c1->ndi;
         int vecindg_23 = (i2 - 1) * c1->nlem + i3 - 1;
@@ -2640,7 +2669,11 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) {
       sam_v[vecind1e + i3e - 1] = sum4;
     } // i3 loop
   } // i1 loop
-#pragma omp parallel for collapse(2)
+#ifdef USE_TARGET_OFFLOAD
+#pragma omp target teams distribute parallel for simd collapse(2)
+#else
+#pragma omp parallel for simd collapse(2)
+#endif
   for (int i1 = 1; i1 <= c1->ndi; i1++) {
     for (int i0 = 1; i0 <= c1->nlem; i0++) {
       int vecindex = (i1 - 1) * c1->nlem + i0 - 1;
@@ -2650,7 +2683,9 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) {
   } // i1 loop
   dcomplex *vec_am0m = c1->am0m[0];
 #ifdef USE_TARGET_OFFLOAD
-#pragma omp target parallel for collapse(2)
+#pragma omp target teams distribute parallel for simd collapse(2)
+#else
+#pragma omp parallel for simd collapse(2)
 #endif
   for (int i0 = 1; i0 <= c1->nlem; i0++) {
     for (int i3 = 1; i3 <= c1->nlemt; i3++) {
@@ -2661,11 +2696,11 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) {
       int i1e = i1 + c1->ndi;
       int vecind1 = (i1 - 1) * c1->nlemt;
       int vecind1e = (i1e - 1) * c1->nlemt;
-      a1 = sam_v[vecind1 + i3 - 1];
-      a2 = sam_v[vecind1e + i3 - 1];
+      dcomplex a1 = sam_v[vecind1 + i3 - 1];
+      dcomplex a2 = sam_v[vecind1e + i3 - 1];
       int vecindex = (i1 - 1) * c1->nlem + i0 - 1;
-      gie = gis_v[vecindex];
-      gle = gls_v[vecindex];
+      dcomplex gie = gis_v[vecindex];
+      dcomplex gle = gls_v[vecindex];
       sum1 += (a1 * gie + a2 * gle);
       sum2 += (a1 * gle + a2 * gie);
     } // i1 loop
diff --git a/src/scripts/model_maker.py b/src/scripts/model_maker.py
index 27e4352c34b2f2eafa7f013bd9c7791923c7f13b..f5cda8f15652e181d1ca935d7fc0fdd7a48c920d 100755
--- a/src/scripts/model_maker.py
+++ b/src/scripts/model_maker.py
@@ -328,6 +328,8 @@ def load_model(model_file):
         for j in range(expected_radii):
             sconf['rcf'][i][j] = float(model['particle_settings']['rad_frac'][i][j])
     # Create the gconf dict
+    use_refinement = False
+    dyn_orders = True
     try:
         use_refinement = False if model['system_settings']['refinement'] == "0" else True
     except KeyError:
@@ -335,7 +337,7 @@
     try:
         dyn_orders = False if model['radiation_settings']['dyn_orders'] == "0" else True
     except KeyError:
-        use_refinement = False
+        dyn_orders = True
     str_polar = model['radiation_settings']['polarization']
     if (str_polar not in ["LINEAR", "CIRCULAR"]):
         print("ERROR: %s is not a recognized polarization state."%str_polar)
@@ -875,10 +877,10 @@ def write_legacy_gconf(conf):
         output.write(str_line)
     str_line = " {0:d}\n0\n".format(conf['jwtm'])
     output.write(str_line)
-    if (gconf['use_refinement']):
+    if (conf['use_refinement']):
         str_line = "USE_REFINEMENT=1\n"
         output.write(str_line)
-    if (not gconf['dyn_orders']):
+    if (not conf['dyn_orders']):
         str_line = "USE_DYN_ORDER=0\n"
         output.write(str_line)
     output.close()
diff --git a/src/trapping/cfrfme.cpp b/src/trapping/cfrfme.cpp
index deba6ef5c322f6a715a6c184157036b07aff7520..b1b9985563d15756441d1c35f1fc39436d505a2b 100644
--- a/src/trapping/cfrfme.cpp
+++ b/src/trapping/cfrfme.cpp
@@ -61,6 +61,14 @@
 #include
 #endif
 
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#ifdef USE_TARGET_OFFLOAD
+#pragma omp requires unified_shared_memory
+#endif
+
 using namespace std;
 
 /*! \brief C++ implementation of FRFME
@@ -391,58 +399,64 @@ void frfme(string data_file, string output_path) {
 #ifdef USE_NVTX
   nvtxRangePush("j80 loop");
 #endif
-#pragma omp parallel for
-  for (int j80 = jlmf; j80 <= jlml; j80++) {
-    dcomplex *vec_w = new dcomplex[nkv * nkv]();
-    dcomplex **w = new dcomplex*[nkv];
+  dcomplex *vec_wsum = tfrfme->wsum[0];
+  double *vec_vkzm = vkzm[0];
+#ifdef USE_TARGET_OFFLOAD
+#pragma omp target teams distribute parallel for simd
+#else
+#pragma omp parallel for simd
+#endif
+  for (int j80 = jlmf-1; j80 < jlml; j80++) {
+    int nkvs = nkv * nkv;
+    dcomplex *vec_w = (dcomplex *) calloc(nkvs, sizeof(dcomplex));
+    dcomplex **w = (dcomplex **) calloc(nkv, sizeof(dcomplex *));
     for (int wi = 0; wi < nkv; wi++) w[wi] = vec_w + wi * nkv;
     dcomplex wk_value;
-    for (int jy50 = 0; jy50 < nkv; jy50++) {
-      for (int jx50 = 0; jx50 < nkv; jx50++) {
-        wk_value = tt1->wk[jy50 * nkv * nlmmt + jx50 * nlmmt + j80 - 1];
-        w[jx50][jy50] = wk_value;
-      } // jx50
-    } // jy50 loop
+    int wk_index = 0;
+    for (int jxy50 = 0; jxy50 < nkvs; jxy50++) {
+      wk_index = nlmmt*jxy50;
+      wk_value = tt1->wk[wk_index + j80];
+      int jy50 = jxy50 / nkv;
+      int jx50 = jxy50 % nkv;
+      vec_w[(nkv*jx50) + jy50] = wk_value;
+    } // jxy50 loop
     int ixyz = 0;
-    for (int iz75 = 0; iz75 < nzv; iz75++) {
+    for (int wj = 0; wj < nrvc; wj++) vec_wsum[(j80*nrvc)+wj] = cc0;
+    int nvtot = nxv*nyv*nzv;
+    int nvxy = nxv *nyv;
+    for (int ixyz = 0; ixyz < nvtot; ixyz++) {
+      int iz75 = ixyz / nvxy;
+      int iy70 = (ixyz % nvxy) / nxv;
+      int ix65 = ixyz % nxv;
       double z = _zv[iz75] + frsh;
-      for (int iy70 = 0; iy70 < nyv; iy70++) {
-        double y = _yv[iy70];
-        for (int ix65 = 0; ix65 < nxv; ix65++) {
-          double x = _xv[ix65];
-          ixyz++;
-          dcomplex sumy = cc0;
-          for (int jy60 = 0; jy60 < nkv; jy60++) {
-            double vky = vkv[jy60];
-            double vkx = vkv[nkv - 1];
-            double vkzf = vkzm[0][jy60];
-            dcomplex phasf = cexp(uim * (-vkx * x + vky * y + vkzf * z));
-            double vkzl = vkzm[nkv - 1][jy60];
-            dcomplex phasl = cexp(uim * (vkx * x + vky * y + vkzl * z));
-            dcomplex sumx = 0.5 * (w[0][jy60] * phasf + w[nkv - 1][jy60] * phasl);
-            for (int jx55 = 2; jx55 <= nks; jx55++) {
-              vkx = vkv[jx55 - 1];
-              double vkz = vkzm[jx55 - 1][jy60];
-              dcomplex phas = cexp(uim * (vkx * x + vky * y + vkz * z));
-              sumx += (w[jx55 - 1][jy60] * phas);
-            } // jx55 loop
-            if (jy60 == 0 || jy60 == nkv - 1) sumx *= 0.5;
-            sumy += sumx;
-          } // jy60 loop
-          tfrfme->wsum[j80 - 1][ixyz - 1] = sumy * delks;
-        } // ix65 loop
-      } // iy70 loop
-    } // iz75 loop
-    delete[] vec_w;
-    delete[] w;
+      double y = _yv[iy70];
+      double x = _xv[ix65];
+      dcomplex sumy = cc0;
+      for (int jy60x55 = 0; jy60x55 < nkvs ; jy60x55++) {
+        int jy60 = jy60x55 / nkv;
+        int jx55 = jy60x55 % nkv;
+        double vky = vkv[jy60];
+        double vkx = (jx55==0) ? vkv[nkv - 1] : vkv[jx55];
+        double vkz = vec_vkzm[(jx55*nkv)+jy60];
+        dcomplex phas = (jx55==0) ?
+          cexp(uim * (-vkx * x + vky * y + vkz * z)):
+          cexp(uim * (vkx * x + vky * y + vkz * z));
+        dcomplex sumx = vec_w[(jx55*nkv)+jy60] * phas;
+        double factor1 = ((jx55==0)||(jx55==(nkv-1))) ? 0.5 : 1.0;
+        double factor2 = ((jy60==0)||(jy60==(nkv-1))) ? 0.5 : 1.0;
+        sumx *= factor1*factor2;
+        sumy += sumx;
+      } // jy60x55 loop
+      vec_wsum[((j80)*nrvc)+ixyz] = sumy * delks;
+    } // ixyz loop
+    free(vec_w);
+    free(w);
   } // j80 loop
 #ifdef USE_NVTX
   nvtxRangePop();
-#endif
-  // label 88
-#ifdef USE_NVTX
   nvtxRangePush("Closing operations");
 #endif
+  // label 88
   tfrfme->write_binary(tfrfme_name, "HDF5");
   string output_name = output_path + "/c_OFRFME";
   FILE *output = fopen(output_name.c_str(), "w");
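
For reference, a minimal, self-contained sketch of the host/device OpenMP fallback pattern that the hunks above apply to each reduction loop: when USE_TARGET_OFFLOAD is defined, the loop is offloaded with "target teams distribute parallel for simd"; otherwise a plain host "parallel for simd" with the same reduction clause is compiled. This is illustrative only and not part of the patch; the file name, loop body, and variable names are assumptions.

// offload_fallback_sketch.cpp -- illustrative only, not code from the patch
#include <cstdio>

int main() {
  const int n = 1 << 20;
  double sum = 0.0;
#ifdef USE_TARGET_OFFLOAD
  // Device path: offload the reduction loop to the accelerator.
#pragma omp target teams distribute parallel for simd reduction(+:sum)
#else
  // Host fallback: same loop body, multithreaded on the CPU.
#pragma omp parallel for simd reduction(+:sum)
#endif
  for (int i = 0; i < n; i++) {
    sum += 1.0 / (1.0 + i);
  }
  std::printf("sum = %f\n", sum);
  return 0;
}

After preprocessing, only one of the two pragmas survives and binds to the loop that follows, so the same source builds for both the GPU-offload and the CPU-only configurations.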