diff --git a/src/libnptm/clu_subs.cpp b/src/libnptm/clu_subs.cpp index 9a09a5960e38096423c95e1f9f32d028db692c97..f10d3f788ac77b67adfc917f11d15895ace741a8 100644 --- a/src/libnptm/clu_subs.cpp +++ b/src/libnptm/clu_subs.cpp @@ -47,7 +47,9 @@ #include <omp.h> #endif +#ifdef USE_TARGET_OFFLOAD #pragma omp requires unified_shared_memory +#endif using namespace std; @@ -405,7 +407,9 @@ dcomplex cdtp(dcomplex z, dcomplex **am, int i, int jf, int k, int nj) { return result; } -# pragma omp begin declare target device_type(any) +#ifdef USE_TARGET_OFFLOAD +#pragma omp begin declare target device_type(any) +#endif double cgev(int ipamo, int mu, int l, int m) { double result = 0.0; double xd = 0.0, xn = 0.0; @@ -439,7 +443,9 @@ double cgev(int ipamo, int mu, int l, int m) { } return result; } -# pragma omp end declare target +#ifdef USE_TARGET_OFFLOAD +#pragma omp end declare target +#endif void cms(dcomplex **am, C1 *c1, C1_AddOns *c1ao, C4 *c4, C6 *c6) { dcomplex dm, de, cgh, cgk; @@ -643,7 +649,9 @@ void crsm1(double vk, double exri, C1 *c1, C1_AddOns *c1ao, C4 *c4, C6 *c6) { delete[] svs; } -# pragma omp begin declare target device_type(any) +#ifdef USE_TARGET_OFFLOAD +#pragma omp begin declare target device_type(any) +#endif dcomplex ghit_d( int ihi, int ipamo, int nbl, int l1, int m1, int l2, int m2, C1 *c1, C1_AddOns *c1ao, C4 *c4, double *rac3j @@ -854,9 +862,13 @@ dcomplex ghit_d( } return result; } +#ifdef USE_TARGET_OFFLOAD #pragma omp end declare target +#endif -# pragma omp begin declare target device_type(any) +#ifdef USE_TARGET_OFFLOAD +#pragma omp begin declare target device_type(any) +#endif dcomplex ghit( int ihi, int ipamo, int nbl, int l1, int m1, int l2, int m2, C1 *c1, C1_AddOns *c1ao, C4 *c4, C6 *c6 @@ -1067,7 +1079,9 @@ dcomplex ghit( } return result; } +#ifdef USE_TARGET_OFFLOAD #pragma omp end declare target +#endif void hjv( double exri, double vk, int &jer, int &lcalc, dcomplex &arg, @@ -1325,12 +1339,13 @@ void pcros(double vk, double exri, C1 *c1, C1_AddOns *c1ao, C4 *c4) { #ifdef USE_NVTX nvtxRangePush("pcros intermediate loop 1"); #endif +#ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4) - for (int i12 = 0; i12 < nlemt; i12++) { +#endif + for (int i12 = 0; i12 < nlemt; i12++) { // int i = i12 - 1; dcomplex am = cc0; dcomplex amp = cc0; - //#pragma omp target teams distribute parallel for simd reduction(+:am,amp) for (int j10 = 0; j10 < nlemt; j10++) { // int j = j10 - 1; am += (vec_am0m[nlemt*i12+j10] * vec_w[4*j10+ipo18]); @@ -1391,7 +1406,9 @@ void pcrsm0(double vk, double exri, int inpol, C1 *c1, C1_AddOns *c1ao, C4 *c4) csam = -(ccs / (exri * vk)) * 0.5 * I; sum2 = cc0; sum3 = cc0; +#ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:sum2,sum3) +#endif for (int i14 = 0; i14 < c4->nlem; i14++) { int ie = i14 + c4->nlem; sum2 += (vec_am0m[nlemt*i14 + i14] + vec_am0m[nlemt*ie + ie]); @@ -1399,7 +1416,9 @@ void pcrsm0(double vk, double exri, int inpol, C1 *c1, C1_AddOns *c1ao, C4 *c4) } // i14 loop double sumpi = 0.0; dcomplex sumpd = cc0; +#ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd collapse(2) reduction(+:sumpi,sumpd) +#endif for (int i16 = 0; i16 < nlemt; i16++) { for (int j16 = 0; j16 < c4->nlem; j16++) { int je = j16 + c4->nlem; @@ -1599,7 +1618,9 @@ void r3j000(int j2, int j3, C6 *c6) { } } +#ifdef USE_TARGET_OFFLOAD #pragma omp begin declare target device_type(any) +#endif void r3jjr(int j2, int j3, int m2, int m3, C6 *c6) { int jmx = j3 + j2; int jdf = j3 - j2; @@ -1717,9 +1738,13 @@ void r3jjr(int j2, int j3, int m2, int m3, C6 *c6) { } } } -# pragma omp end declare target +#ifdef USE_TARGET_OFFLOAD +#pragma omp end declare target +#endif +#ifdef USE_TARGET_OFFLOAD #pragma omp begin declare target device_type(any) +#endif void r3jjr_d(int j2, int j3, int m2, int m3, double *rac3j) { int jmx = j3 + j2; int jdf = j3 - j2; @@ -1837,7 +1862,9 @@ void r3jjr_d(int j2, int j3, int m2, int m3, double *rac3j) { } } } -# pragma omp end declare target +#ifdef USE_TARGET_OFFLOAD +#pragma omp end declare target +#endif void r3jmr(int j1, int j2, int j3, int m1, C6 *c6) { int mmx = (j2 < j3 - m1) ? j2 : j3 - m1; @@ -1968,8 +1995,10 @@ void raba( #ifdef USE_NVTX nvtxRangePush("raba inner loop 1"); #endif +#ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:c1, c2) - for (int j10 = 1; j10 <= nlemt; j10++) { +#endif + for (int j10 = 1; j10 <= nlemt; j10++) { int j = j10 - 1; c1 += (vec_am0m[i*nlemt+j] * vec_w[4*j]); c2 += (vec_am0m[i*nlemt+j] * vec_w[4*j+1]); @@ -2018,8 +2047,10 @@ void raba( #ifdef USE_NVTX nvtxRangePush("raba inner loop 2"); #endif +#ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2) - for (int k = 1; k<=kmax; k++) { +#endif + for (int k = 1; k<=kmax; k++) { int l60 = (int) sqrt(k+1); int im60 = k - (l60*l60) + 1; if (im60 == 0) { @@ -2091,7 +2122,9 @@ void raba( #ifdef USE_NVTX nvtxRangePush("raba loop 3"); #endif +#ifdef USE_TARGET_OFFLOAD #pragma omp teams distribute parallel for simd +#endif for (int ipo78 = 1; ipo78 <= 2; ipo78++) { int ipo = ipo78 - 1; tqce[ipo][0] = real(ctqce[ipo][0] - ctqce[ipo][2]) * sq2i; @@ -2161,7 +2194,9 @@ void scr0(double vk, double exri, C1 *c1, C1_AddOns *c1ao, C3 *c3, C4 * c4) { #ifdef USE_NVTX nvtxRangePush("scr0 inner loop 1"); #endif +#ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:sums, sum21) +#endif for (int l10 = 1; l10 <= c4->li; l10++) { double fl = 1.0 * (l10 + l10 + 1); // dcomplex rm = 1.0 / c1->rmi[l10 - 1][i14 - 1]; @@ -2203,9 +2238,11 @@ void scr0(double vk, double exri, C1 *c1, C1_AddOns *c1ao, C3 *c3, C4 * c4) { double acs = 0.0; dcomplex tfsas = cc0; #ifdef USE_NVTX - nvtxRangePush("scr0 loop 2"); + nvtxRangePush("scr0 loop 2"); #endif +#ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(+:scs, ecs, acs, tfsas) +#endif for (int i14 = 1; i14 <= c4->nsph; i14++) { int iogi = c1->iog[i14 - 1]; scs += c1->sscs[iogi - 1]; @@ -2265,9 +2302,11 @@ void scr2( // furthermore if it results im10 > 2*l10+1, then we set // im10 = im10 -(2*l10+1) and l10 = l10+1 (there was a rounding error in a nearly exact root) #ifdef USE_NVTX - nvtxRangePush("scr2 inner loop 1"); + nvtxRangePush("scr2 inner loop 1"); #endif +#ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd reduction(-:s11, s21, s12, s22) +#endif for (int k = 1; k<=kmax; k++) { int l10 = (int) sqrt(k+1); int im10 = k - (l10*l10) + 1; @@ -2319,7 +2358,9 @@ void scr2( #ifdef USE_NVTX nvtxRangePush("scr2 loop 2"); #endif - #pragma omp target teams distribute parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11) +#ifdef USE_TARGET_OFFLOAD +#pragma omp target teams distribute parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11) +#endif for (int i14 = 1; i14 <= c4->nsph; i14++) { int i = i14 - 1; int iogi = c1->iog[i14 - 1]; @@ -2349,8 +2390,10 @@ void scr2( #ifdef USE_NVTX nvtxRangePush("scr2 inner loop 3"); #endif +#ifdef USE_TARGET_OFFLOAD #pragma omp target teams distribute parallel for simd collapse(4) - for (int ipo1 = 1; ipo1 <=2; ipo1++) { +#endif + for (int ipo1 = 1; ipo1 <=2; ipo1++) { for (int jpo1 = 1; jpo1 <= 2; jpo1++) { for (int ipo2 = 1; ipo2 <= 2; ipo2++) { for (int jpo2 = 1; jpo2 <= 2; jpo2++) { @@ -2372,7 +2415,9 @@ void scr2( #ifdef USE_NVTX nvtxRangePush("scr2 loop 4"); #endif +#ifdef USE_TARGET_OFFLOAD #pragma omp target parallel for collapse(4) +#endif for (int ipo1 = 1; ipo1 <=2; ipo1++) { for (int jpo1 = 1; jpo1 <= 2; jpo1++) { for (int ipo2 = 1; ipo2 <= 2; ipo2++) { @@ -2505,7 +2550,9 @@ void ztm(dcomplex **am, C1 *c1, C1_AddOns *c1ao, C4 *c4, C6 *c6, C9 * c9) { // but if it results im = 0, then we set l = l-1 and im = 2*l+1 // furthermore if it results im > 2*l+1, then we set // im = im -(2*l+1) and l = l+1 (there was a rounding error in a nearly exact root) -# pragma omp target teams distribute parallel for simd collapse(3) +#ifdef USE_TARGET_OFFLOAD +#pragma omp target teams distribute parallel for simd collapse(3) +#endif for (int n2 = 1; n2 <= c4->nsph; n2++) { // GPU portable? for (int k2 = 1; k2<=k2max; k2++) { for (int k3 = 1; k3<=k3max; k3++) { @@ -2550,7 +2597,9 @@ void ztm(dcomplex **am, C1 *c1, C1_AddOns *c1ao, C4 *c4, C6 *c6, C9 * c9) { #endif dcomplex *am_v = am[0]; dcomplex *sam_v = c9->sam[0]; -# pragma omp target teams distribute parallel for simd collapse(2) +#ifdef USE_TARGET_OFFLOAD +#pragma omp target teams distribute parallel for simd collapse(2) +#endif for (int i1 = 1; i1 <= ndi; i1++) { // GPU portable? for (int i3 = 1; i3 <= c4->nlem; i3++) { dcomplex sum1 = cc0; @@ -2584,7 +2633,7 @@ void ztm(dcomplex **am, C1 *c1, C1_AddOns *c1ao, C4 *c4, C6 *c6, C9 * c9) { sam_v[vecind1e + i3e - 1] = sum4; } // i3 loop } // i1 loop -# pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) for (int i1 = 1; i1 <= ndi; i1++) { for (int i0 = 1; i0 <= c4->nlem; i0++) { int vecindex = (i1 - 1)*c9->nlem + i0 - 1; @@ -2596,7 +2645,9 @@ void ztm(dcomplex **am, C1 *c1, C1_AddOns *c1ao, C4 *c4, C6 *c6, C9 * c9) { } // i1 loop int nlemt = c4->nlem + c4->nlem; dcomplex *am0m_v = c1ao->am0m[0]; -# pragma omp target parallel for collapse(2) +#ifdef USE_TARGET_OFFLOAD +#pragma omp target parallel for collapse(2) +#endif for (int i0 = 1; i0 <= c4->nlem; i0++) { for (int i3 = 1; i3 <= nlemt; i3++) { int i0e = i0 + c4->nlem; diff --git a/src/libnptm/sph_subs.cpp b/src/libnptm/sph_subs.cpp index 6e1cde3681e548e01b395ecf2e8430599d3cdeaa..63685f97af4e18be3d10718b6495293a7509ea6d 100644 --- a/src/libnptm/sph_subs.cpp +++ b/src/libnptm/sph_subs.cpp @@ -197,13 +197,17 @@ double cg1(int lmpml, int mu, int l, int m) { return result; } -# pragma omp begin declare target device_type(any) +#ifdef USE_TARGET_OFFLOAD +#pragma omp begin declare target device_type(any) +#endif dcomplex dconjg(dcomplex z) { double zreal = real(z); double zimag = imag(z); return (zreal - zimag * I); } -# pragma omp end declare target +#ifdef USE_TARGET_OFFLOAD +#pragma omp end declare target +#endif void diel(int npntmo, int ns, int i, int ic, double vk, C1 *c1, C2 *c2) { const double dif = c1->rc[i - 1][ns] - c1->rc[i - 1][ns - 1]; diff --git a/src/libnptm/types.cpp b/src/libnptm/types.cpp index fcd571d05a07dbf017a741b0363b0eccd02a1dbb..4b2bd23372e899f382d25fed87bee2132b409f61 100644 --- a/src/libnptm/types.cpp +++ b/src/libnptm/types.cpp @@ -22,10 +22,18 @@ #include "../include/types.h" #endif +#ifdef USE_TARGET_OFFLOAD # pragma omp begin declare target device_type(any) +#endif double real(dcomplex z) { return __real__ z; } +#ifdef USE_TARGET_OFFLOAD # pragma omp end declare target +#endif +#ifdef USE_TARGET_OFFLOAD # pragma omp begin declare target device_type(any) +#endif double imag(dcomplex z) { return __imag__ z; } +#ifdef USE_TARGET_OFFLOAD # pragma omp end declare target +#endif