diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 39e36dad06329deff757d046ef360ef211c575e1..a77e21b6db9cb73ffab0a0709391b0f9849ef7b6 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -78,13 +78,13 @@ compatibility_stage:
       - CXX=g++-14 FC=gfortran-14 ./configure
       - make wipe
       - make -j
-      - echo "Running make with refinement with gnu compilers version 14..."
+      - echo "Running make with gnu compilers version 14..."
       - cd ..
       - rm -rf build_gnu14
       - mkdir build_gnu14_refine
       - cd build_gnu14_refine
       - cp -r ../build/* .
-      - CXX=g++-14 FC=gfortran-14 ./configure --enable-refinement
+      - CXX=g++-14 FC=gfortran-14 ./configure
       - make wipe
       - make -j
       #- echo "Running make with flang version 16 and clang version 16..."
@@ -173,7 +173,7 @@ building_stage:
       - cat /etc/os-release
       - cd build
       - echo "Configuring with default compilers (MAGMA disabled)..."
-      - ./configure --without-magma --without-cublas --disable-offload --enable-refinement --enable-shared
+      - ./configure --without-magma --without-cublas --disable-offload --enable-shared
       - make wipe
       - echo "Building the default configuration..."
       - make -j
diff --git a/src/libnptm/clu_subs.cpp b/src/libnptm/clu_subs.cpp
index a575d469b276517c9531c0a212a6c0d84f1db507..7bd73b106914ffd80ef5ab02a57f5cde17b6c938 100644
--- a/src/libnptm/clu_subs.cpp
+++ b/src/libnptm/clu_subs.cpp
@@ -1341,6 +1341,8 @@ void pcros(double vk, double exri, ParticleDescriptor *c1) {
 #endif
 #ifdef USE_TARGET_OFFLOAD
 #pragma omp target teams distribute parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4)
+#else
+#pragma omp parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4)
 #endif
   for (int i12 = 0; i12 < nlemt; i12++) {
       // int i = i12 - 1;
@@ -1408,6 +1410,8 @@ void pcrsm0(double vk, double exri, int inpol, ParticleDescriptor *c1) {
   sum3 = cc0;
 #ifdef USE_TARGET_OFFLOAD
 #pragma omp target teams distribute parallel for simd reduction(+:sum2,sum3)
+#else
+#pragma omp parallel for simd reduction(+:sum2,sum3)
 #endif
   for (int i14 = 0; i14 < c1->nlem; i14++) { 
     int ie = i14 + c1->nlem;
@@ -1418,6 +1422,8 @@ void pcrsm0(double vk, double exri, int inpol, ParticleDescriptor *c1) {
   dcomplex sumpd = cc0;
 #ifdef USE_TARGET_OFFLOAD
 #pragma omp target teams distribute parallel for simd collapse(2) reduction(+:sumpi,sumpd)
+#else
+#pragma omp parallel for simd collapse(2) reduction(+:sumpi,sumpd)
 #endif
   for (int i16 = 0; i16 < nlemt; i16++) {
     for (int j16 = 0; j16 < c1->nlem; j16++) {
@@ -2001,6 +2007,8 @@ void raba(
 #endif
 #ifdef USE_TARGET_OFFLOAD
 #pragma omp target teams distribute parallel for simd reduction(+:c1, c2)
+#else
+#pragma omp parallel for simd reduction(+:c1, c2)
 #endif
   for (int j10 = 1; j10 <= nlemt; j10++) {
       int j = j10 - 1;
@@ -2021,6 +2029,8 @@ void raba(
 #endif
 #ifdef USE_TARGET_OFFLOAD
 #pragma omp teams distribute parallel for
+#else
+#pragma omp parallel for
 #endif
   for (int ipo = 0; ipo < 2; ipo++) {
     int jpo = 1 - ipo;
@@ -2051,12 +2061,14 @@ void raba(
     int kmax = le*(le+2);
     // for efficiency I should also linearise array w, but I cannot easily since I do not know for sure its major dimension (changes to containing class needed)
 #ifdef USE_NVTX
-  nvtxRangePush("raba inner loop 2");
+    nvtxRangePush("raba inner loop 2");
 #endif
 #ifdef USE_TARGET_OFFLOAD
 #pragma omp target teams distribute parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2)
+#else
+#pragma omp parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2)
 #endif
-  for (int k = 1; k<=kmax; k++) {
+    for (int k = 1; k<=kmax; k++) {
       int l60 = (int) sqrt(k+1);
       int im60 = k - (l60*l60) + 1;
       if (im60 == 0) {
@@ -2079,44 +2091,44 @@ void raba(
       dcomplex acwp;
       dcomplex aca;
       dcomplex acap;
-	if (mmmmu <= l60) {
-	  int immu = mmmu + il - 1;
-	  int immue = immu + nlem;
-	  rmu = -sqrt(1.0 * (l60 + mmmu) * (l60 - m)) * sq2i;
-	  acw = dconjg(vec_a[2*i+ipo]) * vec_w[4*immu+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*immue+ipo];
-	  acwp = dconjg(vec_a[2*i+ipo]) * vec_w[4*immu+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*immue+jpo];
-	  aca = dconjg(vec_a[2*i+ipo]) * vec_a[2*immu+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*immue+ipo];
-	  acap = dconjg(vec_a[2*i+ipo]) * vec_a[2*immu+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*immue+jpo];
-	  ctqce0 += (acw * rmu);
-	  tqcpe0 += (acwp * rmu);
-	  ctqcs0 += (aca * rmu);
-	  tqcps0 += (acap * rmu);
-	}
-	// label 30
-	rmu = -1.0 * m;
-	acw = dconjg(vec_a[2*i+ipo]) * vec_w[4*i+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*ie+ipo];
-	acwp = dconjg(vec_a[2*i+ipo]) * vec_w[4*i+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*ie+jpo];
-	aca = dconjg(vec_a[2*i+ipo]) * vec_a[2*i+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*ie+ipo];
-	acap = dconjg(vec_a[2*i+ipo]) * vec_a[2*i+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*ie+jpo];
-	ctqce1 += (acw * rmu);
-	tqcpe1 += (acwp * rmu);
-	ctqcs1 += (aca * rmu);
-	tqcps1 += (acap * rmu);
-	mmmu = m - 1;
-	mmmmu = (mmmu > 0) ? mmmu : -mmmu;
-	if (mmmmu <= l60) {
-	  int immu = mmmu + il - 1;
-	  int immue = immu + nlem;
-	  rmu = sqrt(1.0 * (l60 - mmmu) * (l60 + m)) * sq2i;
-	  acw = dconjg(vec_a[2*i+ipo]) * vec_w[4*immu+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*immue+ipo];
-	  acwp = dconjg(vec_a[2*i+ipo]) * vec_w[4*immu+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*immue+jpo];
-	  aca = dconjg(vec_a[2*i+ipo]) * vec_a[2*immu+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*immue+ipo];
-	  acap = dconjg(vec_a[2*i+ipo]) * vec_a[2*immu+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*immue+jpo];
-	  ctqce2 += (acw * rmu);
-	  tqcpe2 += (acwp * rmu);
-	  ctqcs2 += (aca * rmu);
-	  tqcps2 += (acap * rmu);
-	} // ends if clause
+      if (mmmmu <= l60) {
+	int immu = mmmu + il - 1;
+	int immue = immu + nlem;
+	rmu = -sqrt(1.0 * (l60 + mmmu) * (l60 - m)) * sq2i;
+	acw = dconjg(vec_a[2*i+ipo]) * vec_w[4*immu+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*immue+ipo];
+	acwp = dconjg(vec_a[2*i+ipo]) * vec_w[4*immu+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*immue+jpo];
+	aca = dconjg(vec_a[2*i+ipo]) * vec_a[2*immu+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*immue+ipo];
+	acap = dconjg(vec_a[2*i+ipo]) * vec_a[2*immu+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*immue+jpo];
+	ctqce0 += (acw * rmu);
+	tqcpe0 += (acwp * rmu);
+	ctqcs0 += (aca * rmu);
+	tqcps0 += (acap * rmu);
+      }
+      // label 30
+      rmu = -1.0 * m;
+      acw = dconjg(vec_a[2*i+ipo]) * vec_w[4*i+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*ie+ipo];
+      acwp = dconjg(vec_a[2*i+ipo]) * vec_w[4*i+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*ie+jpo];
+      aca = dconjg(vec_a[2*i+ipo]) * vec_a[2*i+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*ie+ipo];
+      acap = dconjg(vec_a[2*i+ipo]) * vec_a[2*i+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*ie+jpo];
+      ctqce1 += (acw * rmu);
+      tqcpe1 += (acwp * rmu);
+      ctqcs1 += (aca * rmu);
+      tqcps1 += (acap * rmu);
+      mmmu = m - 1;
+      mmmmu = (mmmu > 0) ? mmmu : -mmmu;
+      if (mmmmu <= l60) {
+	int immu = mmmu + il - 1;
+	int immue = immu + nlem;
+	rmu = sqrt(1.0 * (l60 - mmmu) * (l60 + m)) * sq2i;
+	acw = dconjg(vec_a[2*i+ipo]) * vec_w[4*immu+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*immue+ipo];
+	acwp = dconjg(vec_a[2*i+ipo]) * vec_w[4*immu+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*immue+jpo];
+	aca = dconjg(vec_a[2*i+ipo]) * vec_a[2*immu+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*immue+ipo];
+	acap = dconjg(vec_a[2*i+ipo]) * vec_a[2*immu+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*immue+jpo];
+	ctqce2 += (acw * rmu);
+	tqcpe2 += (acwp * rmu);
+	ctqcs2 += (aca * rmu);
+	tqcps2 += (acap * rmu);
+      } // ends if clause
     } // k loop (previously the l60 and im60 loops
 #ifdef USE_NVTX
   nvtxRangePop();
@@ -2129,7 +2141,9 @@ void raba(
   nvtxRangePush("raba loop 3");
 #endif
 #ifdef USE_TARGET_OFFLOAD
-#pragma omp teams distribute parallel for simd
+#pragma omp target teams distribute parallel for simd
+#else
+#pragma omp parallel for simd
 #endif
   for (int ipo78 = 1; ipo78 <= 2; ipo78++) {
     int ipo = ipo78 - 1;
@@ -2202,6 +2216,8 @@ void scr0(double vk, double exri, ParticleDescriptor *c1) {
 #endif
 #ifdef USE_TARGET_OFFLOAD
 #pragma omp target teams distribute parallel for simd reduction(+:sums, sum21)
+#else
+#pragma omp parallel for simd reduction(+:sums, sum21)
 #endif
       for (int l10 = 1; l10 <= c1->li; l10++) {
 	double fl = 1.0 * (l10 + l10 + 1);
@@ -2248,6 +2264,8 @@ void scr0(double vk, double exri, ParticleDescriptor *c1) {
 #endif
 #ifdef USE_TARGET_OFFLOAD
 #pragma omp target teams distribute parallel for simd reduction(+:scs, ecs, acs, tfsas)
+#else
+#pragma omp parallel for simd reduction(+:scs, ecs, acs, tfsas)
 #endif
   for (int i14 = 1; i14 <= c1->nsph; i14++) {
     int iogi = c1->iog[i14 - 1];
@@ -2312,6 +2330,8 @@ void scr2(
 #endif
 #ifdef USE_TARGET_OFFLOAD
 #pragma omp target teams distribute parallel for simd reduction(-:s11, s21, s12, s22)
+#else
+#pragma omp parallel for simd reduction(-:s11, s21, s12, s22)
 #endif
       for (int k = 1; k<=kmax; k++) {
 	int l10 = (int) sqrt(k+1);
@@ -2366,6 +2386,8 @@ void scr2(
 #endif
 #ifdef USE_TARGET_OFFLOAD
 #pragma omp target teams distribute parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11)
+#else
+#pragma omp parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11)
 #endif
   for (int i14 = 1; i14 <= c1->nsph; i14++) {
     int i = i14 - 1;
@@ -2398,6 +2420,8 @@ void scr2(
 #endif
 #ifdef USE_TARGET_OFFLOAD
 #pragma omp target teams distribute parallel for simd collapse(4)
+#else
+#pragma omp parallel for simd collapse(4)
 #endif
       for (int ipo1 = 1; ipo1 <=2; ipo1++) {
 	for (int jpo1 = 1; jpo1 <= 2; jpo1++) {
@@ -2422,7 +2446,9 @@ void scr2(
   nvtxRangePush("scr2 loop 4");
 #endif
 #ifdef USE_TARGET_OFFLOAD
-#pragma omp target parallel for collapse(4)
+#pragma omp target teams distribute parallel for collapse(4)
+#else
+#pragma omp parallel for collapse(4)
 #endif
   for (int ipo1 = 1; ipo1 <=2; ipo1++) {
     for (int jpo1 = 1; jpo1 <= 2; jpo1++) {