diff --git a/src/libnptm/clu_subs.cpp b/src/libnptm/clu_subs.cpp
index 0d7a26fdc0d224513bc9573c5908957b2b4936d8..8733d94180a79326923fc87418a91fc9d9226603 100644
--- a/src/libnptm/clu_subs.cpp
+++ b/src/libnptm/clu_subs.cpp
@@ -2146,24 +2146,19 @@ void scr0(double vk, double exri, C1 *c1, C1_AddOns *c1ao, C3 *c3, C4 * c4) {
   double cccs = ccs / exdc;
   dcomplex csam = -(ccs / (exri * vk)) * 0.5 * I;
   //double scs = 0.0, ecs = 0.0, acs = 0.0;
-  double scs = 0.0;
-  double ecs = 0.0;
-  double acs = 0.0;
-  dcomplex tfsas = cc0;
   dcomplex *vec_rmi = c1->rmi[0];
   dcomplex *vec_rei = c1->rei[0];
 #ifdef USE_NVTX
-  nvtxRangePush("scr0 outer loop");
+  nvtxRangePush("scr0 outer loop 1");
 #endif
-
-  //#pragma omp parallel for reduction(+:scs, ecs, acs, tfsas)
+#pragma omp parallel for
   for (int i14 = 1; i14 <= c4->nsph; i14++) {
     int iogi = c1->iog[i14 - 1];
     if (iogi >= i14) {
       double sums = 0.0;
       dcomplex sum21 = cc0;
 #ifdef USE_NVTX
-      nvtxRangePush("scr0 inner loop");
+      nvtxRangePush("scr0 inner loop 1");
 #endif
 #pragma omp target teams distribute parallel for simd reduction(+:sums, sum21)
       for (int l10 = 1; l10 <= c4->li; l10++) {
@@ -2177,6 +2172,9 @@ void scr0(double vk, double exri, C1 *c1, C1_AddOns *c1ao, C3 *c3, C4 * c4) {
 	sums += rvalue;
 	sum21 += ((rm + re) * fl);
       } // l10 loop
+#ifdef USE_NVTX
+      nvtxRangePop();
+#endif
       sum21 *= -1.0;
       double scasec = cccs * sums;
       double extsec = -cccs * real(sum21);
@@ -2191,11 +2189,29 @@ void scr0(double vk, double exri, C1 *c1, C1_AddOns *c1ao, C3 *c3, C4 * c4) {
       c1->fsas[i14 - 1] = sum21 * csam;
     }
     // label 12
+    // scs += c1->sscs[iogi - 1];
+    // ecs += c1->sexs[iogi - 1];
+    // acs += c1->sabs[iogi - 1];
+    // tfsas += c1->fsas[iogi - 1];
+  } // i14 loop
+#ifdef USE_NVTX
+  nvtxRangePop();
+#endif
+  double scs = 0.0;
+  double ecs = 0.0;
+  double acs = 0.0;
+  dcomplex tfsas = cc0;
+#ifdef USE_NVTX
+      nvtxRangePush("scr0 loop 2");
+#endif
+#pragma omp target teams distribute parallel for simd reduction(+:scs, ecs, acs, tfsas)
+  for (int i14 = 1; i14 <= c4->nsph; i14++) {
+    int iogi = c1->iog[i14 - 1];
     scs += c1->sscs[iogi - 1];
     ecs += c1->sexs[iogi - 1];
     acs += c1->sabs[iogi - 1];
     tfsas += c1->fsas[iogi - 1];
-  } // i14 loop
+  }
   c3->scs = scs;
   c3->ecs = ecs;
   c3->acs = acs;