diff --git a/src/libnptm/clu_subs.cpp b/src/libnptm/clu_subs.cpp
index cb2757eae471741fa28c1ecda123abac1dfe29b6..d83384a7441d61e0e3d23ab02506985be1009709 100644
--- a/src/libnptm/clu_subs.cpp
+++ b/src/libnptm/clu_subs.cpp
@@ -1293,6 +1293,9 @@ void mextc(double vk, double exri, dcomplex **fsac, double **cextlr, double **ce
 }
 
 void pcros(double vk, double exri, C1 *c1, C1_AddOns *c1ao, C4 *c4) {
+#ifdef USE_NVTX
+  nvtxRangePush("whole pcros");
+#endif
   const dcomplex cc0 = 0.0 + 0.0 * I;
   dcomplex sump, sum1, sum2, sum3, sum4, am, amp, cc, csam;
   const double exdc = exri * exri;
@@ -1303,8 +1306,13 @@ void pcros(double vk, double exri, C1 *c1, C1_AddOns *c1ao, C4 *c4) {
   double cfsq = 4.0 / (pi4sq *ccs * ccs);
   const int nlemt = c4->nlem + c4->nlem;
   int jpo = 2;
-  for (int ipo18 = 1; ipo18 <= 2; ipo18++) {
-    if (ipo18 == 2) jpo = 1;
+  dcomplex *vec_am0m = c1ao->am0m[0];
+  dcomplex *vec_w = c1->w[0];
+#ifdef USE_NVTX
+  nvtxRangePush("pcros outer loop 1");
+#endif
+  for (int ipo18 = 0; ipo18 < 2; ipo18++) {
+    int jpo = 1-ipo18;
     int ipopt = ipo18 + 2;
     int jpopt = jpo + 2;
     double sum = 0.0;
@@ -1313,32 +1321,44 @@ void pcros(double vk, double exri, C1 *c1, C1_AddOns *c1ao, C4 *c4) {
     sum2 = cc0;
     sum3 = cc0;
     sum4 = cc0;
-    for (int i12 = 1; i12 <= nlemt; i12++) {
-      int i = i12 - 1;
-      am = cc0;
-      amp = cc0;
-      for (int j10 = 1; j10 <= nlemt; j10++) {
-	int j = j10 - 1;
-	am += (c1ao->am0m[i][j] * c1->w[j][ipo18 - 1]);
-	amp += (c1ao->am0m[i][j] * c1->w[j][jpo - 1]);
+#ifdef USE_NVTX
+  nvtxRangePush("pcros intermediate loop 1");
+#endif
+#pragma omp target teams distribute parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4)
+    for (int i12 = 0; i12 < nlemt; i12++) {
+      // int i = i12 - 1;
+      dcomplex am = cc0;
+      dcomplex amp = cc0;
+      //#pragma omp target teams distribute parallel for simd reduction(+:am,amp)
+      for (int j10 = 0; j10 < nlemt; j10++) {
+	// int j = j10 - 1;
+	am += (vec_am0m[nlemt*i12+j10] * vec_w[4*j10+ipo18]);
+	amp += (vec_am0m[nlemt*i12+j10] * vec_w[4*j10+jpo]);
       } // j10 loop
       sum += real(dconjg(am) * am);
       sump += (dconjg(amp) * am);
-      sum1 += (dconjg(c1->w[i][ipo18 - 1]) * am);
-      sum2 += (dconjg(c1->w[i][jpo - 1]) * am);
-      sum3 += (c1->w[i][ipopt - 1] * am);
-      sum4 += (c1->w[i][jpopt - 1] * am);
+      sum1 += (dconjg(vec_w[4*i12+ipo18]) * am);
+      sum2 += (dconjg(vec_w[4*i12+jpo]) * am);
+      sum3 += (vec_w[4*i12+ipopt] * am);
+      sum4 += (vec_w[4*i12+jpopt] * am);
     } // i12 loop
-    c1ao->scsc[ipo18 - 1] = cccs * sum;
-    c1ao->scscp[ipo18 - 1] = cccs * sump;
-    c1ao->ecsc[ipo18 - 1] = -cccs * real(sum1);
-    c1ao->ecscp[ipo18 - 1] = -cccs * sum2;
-    c1ao->fsac[ipo18 - 1][ipo18 - 1] = csam * sum1;
-    c1ao->fsac[jpo - 1][ipo18 - 1] = csam * sum2;
-    c1ao->sac[ipo18 - 1][ipo18 - 1] = csam * sum3;
-    c1ao->sac[jpo - 1][ipo18 - 1] = csam * sum4;
+#ifdef USE_NVTX
+  nvtxRangePop();
+#endif
+    c1ao->scsc[ipo18] = cccs * sum;
+    c1ao->scscp[ipo18] = cccs * sump;
+    c1ao->ecsc[ipo18] = -cccs * real(sum1);
+    c1ao->ecscp[ipo18] = -cccs * sum2;
+    c1ao->fsac[ipo18][ipo18] = csam * sum1;
+    c1ao->fsac[jpo][ipo18] = csam * sum2;
+    c1ao->sac[ipo18][ipo18] = csam * sum3;
+    c1ao->sac[jpo][ipo18] = csam * sum4;
   } // ipo18 loop
   int i = 0;
+  dcomplex * &vint =  c1->vint;
+#ifdef USE_NVTX
+  nvtxRangePush("pcros loop 2");
+#endif
   for (int ipo1 = 1; ipo1 <= 2; ipo1++) {
     for (int jpo1 = 1; jpo1 <= 2; jpo1++) {
       cc = dconjg(c1ao->sac[jpo1 - 1][ipo1 - 1]);
@@ -1349,38 +1369,47 @@ void pcros(double vk, double exri, C1 *c1, C1_AddOns *c1ao, C4 *c4) {
       } // ipo2 loop
     } // jpo1 loop
   } // ipo1 loop
+#ifdef USE_NVTX
+  nvtxRangePop();
+#endif
+#ifdef USE_NVTX
+  nvtxRangePop();
+#endif
 }
 
 void pcrsm0(double vk, double exri, int inpol, C1 *c1, C1_AddOns *c1ao, C4 *c4) {
   const dcomplex cc0 = 0.0 + 0.0 * I;
   const dcomplex uim = 0.0 + 1.0 * I;
-  dcomplex sum1, sum2, sum3, sum4, sumpd;
+  dcomplex sum1, sum2, sum3, sum4;
   dcomplex sums1, sums2, sums3, sums4, csam;
   double exdc = exri * exri;
   double ccs = 4.0 * acos(0.0) / (vk * vk);
   double cccs = ccs / exdc;
+  int nlemt = c4->nlem + c4->nlem;
+  dcomplex *vec_am0m = c1ao->am0m[0];
   csam = -(ccs / (exri * vk)) * 0.5 * I;
   sum2 = cc0;
   sum3 = cc0;
-  for (int i14 = 1; i14 <= c4->nlem; i14++) { // GPU portable?
+#pragma omp target teams distribute parallel for simd reduction(+:sum2,sum3)
+  for (int i14 = 0; i14 < c4->nlem; i14++) { 
     int ie = i14 + c4->nlem;
-    sum2 += (c1ao->am0m[i14 - 1][i14 - 1] + c1ao->am0m[ie - 1][ie - 1]);
-    sum3 += (c1ao->am0m[i14 - 1][ie - 1] + c1ao->am0m[ie - 1][i14 - 1]);
+    sum2 += (vec_am0m[nlemt*i14 + i14] + vec_am0m[nlemt*ie + ie]);
+    sum3 += (vec_am0m[nlemt*i14 + ie] + vec_am0m[nlemt*ie + i14]);
   } // i14 loop
   double sumpi = 0.0;
-  sumpd = cc0;
-  int nlemt = c4->nlem + c4->nlem;
-  for (int i16 = 1; i16 <= nlemt; i16++) {
-    for (int j16 = 1; j16 <= c4->nlem; j16++) {
+  dcomplex sumpd = cc0;
+#pragma omp target teams distribute parallel for simd collapse(2) reduction(+:sumpi,sumpd)
+  for (int i16 = 0; i16 < nlemt; i16++) {
+    for (int j16 = 0; j16 < c4->nlem; j16++) {
       int je = j16 + c4->nlem;
       double rvalue = real(
-			   dconjg(c1ao->am0m[i16 - 1][j16 - 1]) * c1ao->am0m[i16 - 1][j16 - 1]
-			   + dconjg(c1ao->am0m[i16 - 1][je - 1]) * c1ao->am0m[i16 - 1][je - 1]
+			   dconjg(vec_am0m[nlemt*i16 + j16]) * vec_am0m[nlemt*i16 + j16]
+			   + dconjg(vec_am0m[nlemt*i16 + je]) * vec_am0m[nlemt*i16 + je]
 			   );
       sumpi += rvalue;
       sumpd += (
-		dconjg(c1ao->am0m[i16 - 1][j16 - 1]) * c1ao->am0m[i16 - 1][je - 1]
-		+ dconjg(c1ao->am0m[i16 - 1][je - 1]) * c1ao->am0m[i16 - 1][j16 - 1]
+		dconjg(vec_am0m[nlemt*i16 + j16]) * vec_am0m[nlemt*i16 + je]
+		+ dconjg(vec_am0m[nlemt*i16 + je]) * vec_am0m[nlemt*i16 + j16]
 		);
     } // j16 loop
   } // i16 loop