diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 39e36dad06329deff757d046ef360ef211c575e1..a77e21b6db9cb73ffab0a0709391b0f9849ef7b6 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -78,13 +78,13 @@ compatibility_stage:
     - CXX=g++-14 FC=gfortran-14 ./configure
     - make wipe
    - make -j
-    - echo "Running make with refinement with gnu compilers version 14..."
+    - echo "Running make with gnu compilers version 14..."
     - cd ..
     - rm -rf build_gnu14
     - mkdir build_gnu14_refine
     - cd build_gnu14_refine
     - cp -r ../build/* .
-    - CXX=g++-14 FC=gfortran-14 ./configure --enable-refinement
+    - CXX=g++-14 FC=gfortran-14 ./configure
     - make wipe
     - make -j
     #- echo "Running make with flang version 16 and clang version 16..."
@@ -173,7 +173,7 @@ building_stage:
     - cat /etc/os-release
     - cd build
     - echo "Configuring with default compilers (MAGMA disabled)..."
-    - ./configure --without-magma --without-cublas --disable-offload --enable-refinement --enable-shared
+    - ./configure --without-magma --without-cublas --disable-offload --enable-shared
     - make wipe
     - echo "Building the default configuration..."
     - make -j
diff --git a/src/libnptm/clu_subs.cpp b/src/libnptm/clu_subs.cpp
index a575d469b276517c9531c0a212a6c0d84f1db507..7bd73b106914ffd80ef5ab02a57f5cde17b6c938 100644
--- a/src/libnptm/clu_subs.cpp
+++ b/src/libnptm/clu_subs.cpp
@@ -1341,6 +1341,8 @@ void pcros(double vk, double exri, ParticleDescriptor *c1) {
 #endif
 #ifdef USE_TARGET_OFFLOAD
 #pragma omp target teams distribute parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4)
+#else
+#pragma omp parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4)
 #endif
   for (int i12 = 0; i12 < nlemt; i12++) {
     // int i = i12 - 1;
@@ -1408,6 +1410,8 @@ void pcrsm0(double vk, double exri, int inpol, ParticleDescriptor *c1) {
   sum3 = cc0;
 #ifdef USE_TARGET_OFFLOAD
 #pragma omp target teams distribute parallel for simd reduction(+:sum2,sum3)
+#else
+#pragma omp parallel for simd reduction(+:sum2,sum3)
 #endif
   for (int i14 = 0; i14 < c1->nlem; i14++) {
     int ie = i14 + c1->nlem;
@@ -1418,6 +1422,8 @@ void pcrsm0(double vk, double exri, int inpol, ParticleDescriptor *c1) {
   dcomplex sumpd = cc0;
 #ifdef USE_TARGET_OFFLOAD
 #pragma omp target teams distribute parallel for simd collapse(2) reduction(+:sumpi,sumpd)
+#else
+#pragma omp parallel for simd collapse(2) reduction(+:sumpi,sumpd)
 #endif
   for (int i16 = 0; i16 < nlemt; i16++) {
     for (int j16 = 0; j16 < c1->nlem; j16++) {
@@ -2001,6 +2007,8 @@ void raba(
 #endif
 #ifdef USE_TARGET_OFFLOAD
 #pragma omp target teams distribute parallel for simd reduction(+:c1, c2)
+#else
+#pragma omp parallel for simd reduction(+:c1, c2)
 #endif
   for (int j10 = 1; j10 <= nlemt; j10++) {
     int j = j10 - 1;
@@ -2021,6 +2029,8 @@ void raba(
 #endif
 #ifdef USE_TARGET_OFFLOAD
 #pragma omp teams distribute parallel for
+#else
+#pragma omp parallel for
 #endif
   for (int ipo = 0; ipo < 2; ipo++) {
     int jpo = 1 - ipo;
@@ -2051,12 +2061,14 @@ void raba(
   int kmax = le*(le+2);
   // for efficiency I should also linearise array w, but I cannot easily since I do not know for sure its major dimension (changes to containing class needed)
 #ifdef USE_NVTX
-    nvtxRangePush("raba inner loop 2");
+  nvtxRangePush("raba inner loop 2");
 #endif
 #ifdef USE_TARGET_OFFLOAD
 #pragma omp target teams distribute parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2)
+#else
+#pragma omp parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2)
 #endif
-    for (int k = 1; k<=kmax; k++) {
+  for (int k = 1; k<=kmax; k++) {
     int l60 = (int) sqrt(k+1);
     int im60 = k - (l60*l60) + 1;
     if (im60 == 0) {
@@ -2079,44 +2091,44 @@ void raba(
     dcomplex acwp;
     dcomplex aca;
     dcomplex acap;
-      if (mmmmu <= l60) {
-        int immu = mmmu + il - 1;
-        int immue = immu + nlem;
-        rmu = -sqrt(1.0 * (l60 + mmmu) * (l60 - m)) * sq2i;
-        acw = dconjg(vec_a[2*i+ipo]) * vec_w[4*immu+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*immue+ipo];
-        acwp = dconjg(vec_a[2*i+ipo]) * vec_w[4*immu+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*immue+jpo];
-        aca = dconjg(vec_a[2*i+ipo]) * vec_a[2*immu+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*immue+ipo];
-        acap = dconjg(vec_a[2*i+ipo]) * vec_a[2*immu+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*immue+jpo];
-        ctqce0 += (acw * rmu);
-        tqcpe0 += (acwp * rmu);
-        ctqcs0 += (aca * rmu);
-        tqcps0 += (acap * rmu);
-      }
-      // label 30
-      rmu = -1.0 * m;
-      acw = dconjg(vec_a[2*i+ipo]) * vec_w[4*i+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*ie+ipo];
-      acwp = dconjg(vec_a[2*i+ipo]) * vec_w[4*i+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*ie+jpo];
-      aca = dconjg(vec_a[2*i+ipo]) * vec_a[2*i+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*ie+ipo];
-      acap = dconjg(vec_a[2*i+ipo]) * vec_a[2*i+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*ie+jpo];
-      ctqce1 += (acw * rmu);
-      tqcpe1 += (acwp * rmu);
-      ctqcs1 += (aca * rmu);
-      tqcps1 += (acap * rmu);
-      mmmu = m - 1;
-      mmmmu = (mmmu > 0) ? mmmu : -mmmu;
-      if (mmmmu <= l60) {
-        int immu = mmmu + il - 1;
-        int immue = immu + nlem;
-        rmu = sqrt(1.0 * (l60 - mmmu) * (l60 + m)) * sq2i;
-        acw = dconjg(vec_a[2*i+ipo]) * vec_w[4*immu+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*immue+ipo];
-        acwp = dconjg(vec_a[2*i+ipo]) * vec_w[4*immu+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*immue+jpo];
-        aca = dconjg(vec_a[2*i+ipo]) * vec_a[2*immu+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*immue+ipo];
-        acap = dconjg(vec_a[2*i+ipo]) * vec_a[2*immu+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*immue+jpo];
-        ctqce2 += (acw * rmu);
-        tqcpe2 += (acwp * rmu);
-        ctqcs2 += (aca * rmu);
-        tqcps2 += (acap * rmu);
-      } // ends if clause
+    if (mmmmu <= l60) {
+      int immu = mmmu + il - 1;
+      int immue = immu + nlem;
+      rmu = -sqrt(1.0 * (l60 + mmmu) * (l60 - m)) * sq2i;
+      acw = dconjg(vec_a[2*i+ipo]) * vec_w[4*immu+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*immue+ipo];
+      acwp = dconjg(vec_a[2*i+ipo]) * vec_w[4*immu+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*immue+jpo];
+      aca = dconjg(vec_a[2*i+ipo]) * vec_a[2*immu+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*immue+ipo];
+      acap = dconjg(vec_a[2*i+ipo]) * vec_a[2*immu+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*immue+jpo];
+      ctqce0 += (acw * rmu);
+      tqcpe0 += (acwp * rmu);
+      ctqcs0 += (aca * rmu);
+      tqcps0 += (acap * rmu);
+    }
+    // label 30
+    rmu = -1.0 * m;
+    acw = dconjg(vec_a[2*i+ipo]) * vec_w[4*i+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*ie+ipo];
+    acwp = dconjg(vec_a[2*i+ipo]) * vec_w[4*i+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*ie+jpo];
+    aca = dconjg(vec_a[2*i+ipo]) * vec_a[2*i+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*ie+ipo];
+    acap = dconjg(vec_a[2*i+ipo]) * vec_a[2*i+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*ie+jpo];
+    ctqce1 += (acw * rmu);
+    tqcpe1 += (acwp * rmu);
+    ctqcs1 += (aca * rmu);
+    tqcps1 += (acap * rmu);
+    mmmu = m - 1;
+    mmmmu = (mmmu > 0) ? mmmu : -mmmu;
+    if (mmmmu <= l60) {
+      int immu = mmmu + il - 1;
+      int immue = immu + nlem;
+      rmu = sqrt(1.0 * (l60 - mmmu) * (l60 + m)) * sq2i;
+      acw = dconjg(vec_a[2*i+ipo]) * vec_w[4*immu+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*immue+ipo];
+      acwp = dconjg(vec_a[2*i+ipo]) * vec_w[4*immu+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*immue+jpo];
+      aca = dconjg(vec_a[2*i+ipo]) * vec_a[2*immu+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*immue+ipo];
+      acap = dconjg(vec_a[2*i+ipo]) * vec_a[2*immu+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*immue+jpo];
+      ctqce2 += (acw * rmu);
+      tqcpe2 += (acwp * rmu);
+      ctqcs2 += (aca * rmu);
+      tqcps2 += (acap * rmu);
+    } // ends if clause
   } // k loop (previously the l60 and im60 loops
 #ifdef USE_NVTX
   nvtxRangePop();
@@ -2129,7 +2141,9 @@ void raba(
   nvtxRangePush("raba loop 3");
 #endif
 #ifdef USE_TARGET_OFFLOAD
-#pragma omp teams distribute parallel for simd
+#pragma omp target teams distribute parallel for simd
+#else
+#pragma omp parallel for simd
 #endif
   for (int ipo78 = 1; ipo78 <= 2; ipo78++) {
     int ipo = ipo78 - 1;
@@ -2202,6 +2216,8 @@ void scr0(double vk, double exri, ParticleDescriptor *c1) {
 #endif
 #ifdef USE_TARGET_OFFLOAD
 #pragma omp target teams distribute parallel for simd reduction(+:sums, sum21)
+#else
+#pragma omp parallel for simd reduction(+:sums, sum21)
 #endif
   for (int l10 = 1; l10 <= c1->li; l10++) {
     double fl = 1.0 * (l10 + l10 + 1);
@@ -2248,6 +2264,8 @@ void scr0(double vk, double exri, ParticleDescriptor *c1) {
 #endif
 #ifdef USE_TARGET_OFFLOAD
 #pragma omp target teams distribute parallel for simd reduction(+:scs, ecs, acs, tfsas)
+#else
+#pragma omp parallel for simd reduction(+:scs, ecs, acs, tfsas)
 #endif
   for (int i14 = 1; i14 <= c1->nsph; i14++) {
     int iogi = c1->iog[i14 - 1];
@@ -2312,6 +2330,8 @@ void scr2(
 #endif
 #ifdef USE_TARGET_OFFLOAD
 #pragma omp target teams distribute parallel for simd reduction(-:s11, s21, s12, s22)
+#else
+#pragma omp parallel for simd reduction(-:s11, s21, s12, s22)
 #endif
   for (int k = 1; k<=kmax; k++) {
     int l10 = (int) sqrt(k+1);
@@ -2366,6 +2386,8 @@ void scr2(
 #endif
 #ifdef USE_TARGET_OFFLOAD
 #pragma omp target teams distribute parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11)
+#else
+#pragma omp parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11)
 #endif
   for (int i14 = 1; i14 <= c1->nsph; i14++) {
     int i = i14 - 1;
@@ -2398,6 +2420,8 @@ void scr2(
 #endif
 #ifdef USE_TARGET_OFFLOAD
 #pragma omp target teams distribute parallel for simd collapse(4)
+#else
+#pragma omp parallel for simd collapse(4)
 #endif
   for (int ipo1 = 1; ipo1 <=2; ipo1++) {
     for (int jpo1 = 1; jpo1 <= 2; jpo1++) {
@@ -2422,7 +2446,9 @@ void scr2(
   nvtxRangePush("scr2 loop 4");
 #endif
 #ifdef USE_TARGET_OFFLOAD
-#pragma omp target parallel for collapse(4)
+#pragma omp target teams distribute parallel for collapse(4)
+#else
+#pragma omp parallel for collapse(4)
 #endif
   for (int ipo1 = 1; ipo1 <=2; ipo1++) {
     for (int jpo1 = 1; jpo1 <= 2; jpo1++) {
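The recurring change in the clu_subs.cpp hunks is a host fallback for the offloaded loops: when USE_TARGET_OFFLOAD is not defined, each loop now gets a plain "parallel for simd" pragma instead of silently running serially. Below is a minimal, self-contained sketch of that pattern, not code from this repository: the buffer data, its length n, and the reduction variable total are illustrative placeholders, and the map clause is included only so the sketch stands alone.

// Minimal sketch of the USE_TARGET_OFFLOAD / host-fallback pattern used in the
// hunks above. All identifiers here (n, data, total) are hypothetical examples,
// not taken from clu_subs.cpp.
#include <cstdio>
#include <vector>

int main() {
  const int n = 1 << 20;
  std::vector<double> data(n, 0.5);   // placeholder input buffer
  double *p = data.data();
  double total = 0.0;                 // reduction target, analogous to sum/sump/... above
#ifdef USE_TARGET_OFFLOAD
  // Offload path: execute the reduction on the accelerator.
#pragma omp target teams distribute parallel for simd map(to: p[0:n]) reduction(+:total)
#else
  // Host path, mirroring the #else branches introduced above:
  // same loop body, multithreaded and vectorised on the CPU.
#pragma omp parallel for simd reduction(+:total)
#endif
  for (int i = 0; i < n; i++) {
    total += p[i];
  }
  std::printf("total = %f\n", total);
  return 0;
}

Built host-only (e.g. g++ -fopenmp) the #else branch is compiled; defining USE_TARGET_OFFLOAD with an offload-capable OpenMP toolchain selects the target pragma instead.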