diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 39e36dad06329deff757d046ef360ef211c575e1..a77e21b6db9cb73ffab0a0709391b0f9849ef7b6 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -78,13 +78,13 @@ compatibility_stage:
     - CXX=g++-14 FC=gfortran-14 ./configure
     - make wipe
     - make -j
-    - echo "Running make with refinement with gnu compilers version 14..."
+    - echo "Running make with gnu compilers version 14..."
     - cd ..
     - rm -rf build_gnu14
     - mkdir build_gnu14_refine
     - cd build_gnu14_refine
     - cp -r ../build/* .
-    - CXX=g++-14 FC=gfortran-14 ./configure --enable-refinement
+    - CXX=g++-14 FC=gfortran-14 ./configure
     - make wipe
     - make -j
     #- echo "Running make with flang version 16 and clang version 16..."
@@ -173,7 +173,7 @@ building_stage:
     - cat /etc/os-release
     - cd build
     - echo "Configuring with default compilers (MAGMA disabled)..."
-    - ./configure --without-magma --without-cublas --disable-offload --enable-refinement --enable-shared
+    - ./configure --without-magma --without-cublas --disable-offload --enable-shared
     - make wipe
     - echo "Building the default configuration..."
     - make -j
diff --git a/build/configure.sh b/build/configure.sh
index f38981c6661323397f698177066d76e14c2f0902..5f2a7b6e6bf82a3889d01d92ce9c0d973a213727 100755
--- a/build/configure.sh
+++ b/build/configure.sh
@@ -814,7 +814,7 @@ EOF
     fi
     if [ "x$result" = "x0" ]; then
         echo "yes"
-        OFFLOADFLAGS="-DUSE_TARGET_OFFLOAD -fcf-protection=check -foffload=nvptx-none=\"-O${CXX_OPT}${CXX_DBG} -fcf-protection=check -fopt-info -lm -latomic -lgomp\""
+        OFFLOADFLAGS=" -DUSE_TARGET_OFFLOAD -fcf-protection=check -foffload=nvptx-none=\"-O${CXX_OPT}${CXX_DBG} -fcf-protection=check -fopt-info -lm -latomic -lgomp\""
         if [ "x${OMPFLAGS}" = "x" ]; then
             OFFLOADFLAGS="${OFFLOADFLAGS} -fopenmp"
         fi
diff --git a/src/libnptm/clu_subs.cpp b/src/libnptm/clu_subs.cpp
index 779c2ecd4afdfe2eba45809b9ff4e80e43883b2e..7bd73b106914ffd80ef5ab02a57f5cde17b6c938 100644
--- a/src/libnptm/clu_subs.cpp
+++ b/src/libnptm/clu_subs.cpp
@@ -1341,6 +1341,8 @@ void pcros(double vk, double exri, ParticleDescriptor *c1) {
 #endif
 #ifdef USE_TARGET_OFFLOAD
 #pragma omp target teams distribute parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4)
+#else
+#pragma omp parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4)
 #endif
   for (int i12 = 0; i12 < nlemt; i12++) {
     // int i = i12 - 1;
@@ -1408,6 +1410,8 @@ void pcrsm0(double vk, double exri, int inpol, ParticleDescriptor *c1) {
   sum3 = cc0;
 #ifdef USE_TARGET_OFFLOAD
 #pragma omp target teams distribute parallel for simd reduction(+:sum2,sum3)
+#else
+#pragma omp parallel for simd reduction(+:sum2,sum3)
 #endif
   for (int i14 = 0; i14 < c1->nlem; i14++) {
     int ie = i14 + c1->nlem;
@@ -1418,6 +1422,8 @@ void pcrsm0(double vk, double exri, int inpol, ParticleDescriptor *c1) {
   dcomplex sumpd = cc0;
 #ifdef USE_TARGET_OFFLOAD
 #pragma omp target teams distribute parallel for simd collapse(2) reduction(+:sumpi,sumpd)
+#else
+#pragma omp parallel for simd collapse(2) reduction(+:sumpi,sumpd)
 #endif
   for (int i16 = 0; i16 < nlemt; i16++) {
     for (int j16 = 0; j16 < c1->nlem; j16++) {
@@ -2001,6 +2007,8 @@ void raba(
 #endif
 #ifdef USE_TARGET_OFFLOAD
 #pragma omp target teams distribute parallel for simd reduction(+:c1, c2)
+#else
+#pragma omp parallel for simd reduction(+:c1, c2)
 #endif
   for (int j10 = 1; j10 <= nlemt; j10++) {
     int j = j10 - 1;
@@ -2021,6 +2029,8 @@ void raba(
 #endif
 #ifdef USE_TARGET_OFFLOAD
 #pragma omp teams distribute parallel for
+#else
+#pragma omp parallel for
 #endif
   for (int ipo = 0; ipo < 2; ipo++) {
     int jpo = 1 - ipo;
@@ -2051,12 +2061,14 @@ void raba(
   int kmax = le*(le+2);
   // for efficiency I should also linearise array w, but I cannot easily since I do not know for sure its major dimension (changes to containing class needed)
 #ifdef USE_NVTX
-  nvtxRangePush("raba inner loop 2");
+  nvtxRangePush("raba inner loop 2");
 #endif
 #ifdef USE_TARGET_OFFLOAD
 #pragma omp target teams distribute parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2)
+#else
+#pragma omp parallel for simd reduction(+:ctqce0, ctqce1, ctqce2, ctqcs0, ctqcs1, ctqcs2, tqcpe0, tqcpe1, tqcpe2, tqcps0, tqcps1, tqcps2)
 #endif
-  for (int k = 1; k<=kmax; k++) {
+  for (int k = 1; k<=kmax; k++) {
     int l60 = (int) sqrt(k+1);
     int im60 = k - (l60*l60) + 1;
     if (im60 == 0) {
@@ -2079,44 +2091,44 @@ void raba(
       dcomplex acwp;
       dcomplex aca;
      dcomplex acap;
-      if (mmmmu <= l60) {
-        int immu = mmmu + il - 1;
-        int immue = immu + nlem;
-        rmu = -sqrt(1.0 * (l60 + mmmu) * (l60 - m)) * sq2i;
-        acw = dconjg(vec_a[2*i+ipo]) * vec_w[4*immu+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*immue+ipo];
-        acwp = dconjg(vec_a[2*i+ipo]) * vec_w[4*immu+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*immue+jpo];
-        aca = dconjg(vec_a[2*i+ipo]) * vec_a[2*immu+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*immue+ipo];
-        acap = dconjg(vec_a[2*i+ipo]) * vec_a[2*immu+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*immue+jpo];
-        ctqce0 += (acw * rmu);
-        tqcpe0 += (acwp * rmu);
-        ctqcs0 += (aca * rmu);
-        tqcps0 += (acap * rmu);
-      }
-      // label 30
-      rmu = -1.0 * m;
-      acw = dconjg(vec_a[2*i+ipo]) * vec_w[4*i+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*ie+ipo];
-      acwp = dconjg(vec_a[2*i+ipo]) * vec_w[4*i+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*ie+jpo];
-      aca = dconjg(vec_a[2*i+ipo]) * vec_a[2*i+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*ie+ipo];
-      acap = dconjg(vec_a[2*i+ipo]) * vec_a[2*i+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*ie+jpo];
-      ctqce1 += (acw * rmu);
-      tqcpe1 += (acwp * rmu);
-      ctqcs1 += (aca * rmu);
-      tqcps1 += (acap * rmu);
-      mmmu = m - 1;
-      mmmmu = (mmmu > 0) ? mmmu : -mmmu;
-      if (mmmmu <= l60) {
-        int immu = mmmu + il - 1;
-        int immue = immu + nlem;
-        rmu = sqrt(1.0 * (l60 - mmmu) * (l60 + m)) * sq2i;
-        acw = dconjg(vec_a[2*i+ipo]) * vec_w[4*immu+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*immue+ipo];
-        acwp = dconjg(vec_a[2*i+ipo]) * vec_w[4*immu+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*immue+jpo];
-        aca = dconjg(vec_a[2*i+ipo]) * vec_a[2*immu+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*immue+ipo];
-        acap = dconjg(vec_a[2*i+ipo]) * vec_a[2*immu+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*immue+jpo];
-        ctqce2 += (acw * rmu);
-        tqcpe2 += (acwp * rmu);
-        ctqcs2 += (aca * rmu);
-        tqcps2 += (acap * rmu);
-      } // ends if clause
+      if (mmmmu <= l60) {
+        int immu = mmmu + il - 1;
+        int immue = immu + nlem;
+        rmu = -sqrt(1.0 * (l60 + mmmu) * (l60 - m)) * sq2i;
+        acw = dconjg(vec_a[2*i+ipo]) * vec_w[4*immu+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*immue+ipo];
+        acwp = dconjg(vec_a[2*i+ipo]) * vec_w[4*immu+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*immue+jpo];
+        aca = dconjg(vec_a[2*i+ipo]) * vec_a[2*immu+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*immue+ipo];
+        acap = dconjg(vec_a[2*i+ipo]) * vec_a[2*immu+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*immue+jpo];
+        ctqce0 += (acw * rmu);
+        tqcpe0 += (acwp * rmu);
+        ctqcs0 += (aca * rmu);
+        tqcps0 += (acap * rmu);
+      }
+      // label 30
+      rmu = -1.0 * m;
+      acw = dconjg(vec_a[2*i+ipo]) * vec_w[4*i+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*ie+ipo];
+      acwp = dconjg(vec_a[2*i+ipo]) * vec_w[4*i+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*ie+jpo];
+      aca = dconjg(vec_a[2*i+ipo]) * vec_a[2*i+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*ie+ipo];
+      acap = dconjg(vec_a[2*i+ipo]) * vec_a[2*i+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*ie+jpo];
+      ctqce1 += (acw * rmu);
+      tqcpe1 += (acwp * rmu);
+      ctqcs1 += (aca * rmu);
+      tqcps1 += (acap * rmu);
+      mmmu = m - 1;
+      mmmmu = (mmmu > 0) ? mmmu : -mmmu;
+      if (mmmmu <= l60) {
+        int immu = mmmu + il - 1;
+        int immue = immu + nlem;
+        rmu = sqrt(1.0 * (l60 - mmmu) * (l60 + m)) * sq2i;
+        acw = dconjg(vec_a[2*i+ipo]) * vec_w[4*immu+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*immue+ipo];
+        acwp = dconjg(vec_a[2*i+ipo]) * vec_w[4*immu+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_w[4*immue+jpo];
+        aca = dconjg(vec_a[2*i+ipo]) * vec_a[2*immu+ipo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*immue+ipo];
+        acap = dconjg(vec_a[2*i+ipo]) * vec_a[2*immu+jpo] + dconjg(vec_a[2*ie+ipo]) * vec_a[2*immue+jpo];
+        ctqce2 += (acw * rmu);
+        tqcpe2 += (acwp * rmu);
+        ctqcs2 += (aca * rmu);
+        tqcps2 += (acap * rmu);
+      } // ends if clause
   } // k loop (previously the l60 and im60 loops
 #ifdef USE_NVTX
   nvtxRangePop();
 #endif
@@ -2129,7 +2141,9 @@ void raba(
   nvtxRangePush("raba loop 3");
 #endif
 #ifdef USE_TARGET_OFFLOAD
-#pragma omp teams distribute parallel for simd
+#pragma omp target teams distribute parallel for simd
+#else
+#pragma omp parallel for simd
 #endif
   for (int ipo78 = 1; ipo78 <= 2; ipo78++) {
     int ipo = ipo78 - 1;
@@ -2202,6 +2216,8 @@ void scr0(double vk, double exri, ParticleDescriptor *c1) {
 #endif
 #ifdef USE_TARGET_OFFLOAD
 #pragma omp target teams distribute parallel for simd reduction(+:sums, sum21)
+#else
+#pragma omp parallel for simd reduction(+:sums, sum21)
 #endif
   for (int l10 = 1; l10 <= c1->li; l10++) {
     double fl = 1.0 * (l10 + l10 + 1);
@@ -2248,6 +2264,8 @@ void scr0(double vk, double exri, ParticleDescriptor *c1) {
 #endif
 #ifdef USE_TARGET_OFFLOAD
 #pragma omp target teams distribute parallel for simd reduction(+:scs, ecs, acs, tfsas)
+#else
+#pragma omp parallel for simd reduction(+:scs, ecs, acs, tfsas)
 #endif
   for (int i14 = 1; i14 <= c1->nsph; i14++) {
     int iogi = c1->iog[i14 - 1];
@@ -2312,6 +2330,8 @@ void scr2(
 #endif
 #ifdef USE_TARGET_OFFLOAD
 #pragma omp target teams distribute parallel for simd reduction(-:s11, s21, s12, s22)
+#else
+#pragma omp parallel for simd reduction(-:s11, s21, s12, s22)
 #endif
   for (int k = 1; k<=kmax; k++) {
     int l10 = (int) sqrt(k+1);
@@ -2366,6 +2386,8 @@ void scr2(
 #endif
 #ifdef USE_TARGET_OFFLOAD
 #pragma omp target teams distribute parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11)
+#else
+#pragma omp parallel for simd reduction(+:tsas00, tsas10, tsas01, tsas11)
 #endif
   for (int i14 = 1; i14 <= c1->nsph; i14++) {
     int i = i14 - 1;
@@ -2398,6 +2420,8 @@ void scr2(
 #endif
 #ifdef USE_TARGET_OFFLOAD
 #pragma omp target teams distribute parallel for simd collapse(4)
+#else
+#pragma omp parallel for simd collapse(4)
 #endif
   for (int ipo1 = 1; ipo1 <=2; ipo1++) {
     for (int jpo1 = 1; jpo1 <= 2; jpo1++) {
@@ -2422,7 +2446,9 @@ void scr2(
   nvtxRangePush("scr2 loop 4");
 #endif
 #ifdef USE_TARGET_OFFLOAD
-#pragma omp target parallel for collapse(4)
+#pragma omp target teams distribute parallel for collapse(4)
+#else
+#pragma omp parallel for collapse(4)
 #endif
   for (int ipo1 = 1; ipo1 <=2; ipo1++) {
     for (int jpo1 = 1; jpo1 <= 2; jpo1++) {
@@ -2534,7 +2560,7 @@ void tqr(
 }
 
 void ztm(dcomplex **am, ParticleDescriptor *c1) {
-  dcomplex gie, gle, a1, a2, a3, a4, sum1, sum2, sum3, sum4;
+  // dcomplex gie, gle, a1, a2, a3, a4, sum1, sum2, sum3, sum4;
   const dcomplex cc0 = 0.0 + 0.0 * I;
   // int i2 = 0; // old implementation
 #ifdef USE_NVTX
@@ -2546,7 +2572,6 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) {
   // C9 *c9_para = new C9(*c9);
   dcomplex *gis_v = c1->gis[0];
   dcomplex *gls_v = c1->gls[0];
-  double *rac3j_local = (double *) malloc(c1->lmtpo*sizeof(double));
   int k2max = c1->li*(c1->li+2);
   int k3max = c1->le*(c1->le+2);
   // To parallelise, I run a linearised loop directly over k
@@ -2559,6 +2584,8 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) {
   // im = im -(2*l+1) and l = l+1 (there was a rounding error in a nearly exact root)
 #ifdef USE_TARGET_OFFLOAD
 #pragma omp target teams distribute parallel for simd collapse(3)
+#else
+#pragma omp parallel for simd collapse(3)
 #endif
   for (int n2 = 1; n2 <= c1->nsph; n2++) { // GPU portable?
     for (int k2 = 1; k2<=k2max; k2++) {
@@ -2590,12 +2617,13 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) {
         int i3 = l3 * l3 + im3 - 1;
         int m3 = -l3 - 1 + im3;
         int vecindex = (i2 - 1) * c1->nlem + i3 - 1;
+        double *rac3j_local = (double *) malloc(c1->lmtpo*sizeof(double));
         gis_v[vecindex] = ghit_d(2, 0, n2, l2, m2, l3, m3, c1, rac3j_local);
         gls_v[vecindex] = ghit_d(2, 1, n2, l2, m2, l3, m3, c1, rac3j_local);
+        free(rac3j_local);
       } // close k3 loop, former l3 + im3 loops
     } // close k2 loop, former l2 + im2 loops
   } // close n2 loop
-  free(rac3j_local);
 #ifdef USE_NVTX
   nvtxRangePop();
 #endif
@@ -2606,6 +2634,8 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) {
   dcomplex *sam_v = c1->sam[0];
 #ifdef USE_TARGET_OFFLOAD
 #pragma omp target teams distribute parallel for simd collapse(2)
+#else
+#pragma omp parallel for simd collapse(2)
 #endif
   for (int i1 = 1; i1 <= c1->ndi; i1++) { // GPU portable?
     for (int i3 = 1; i3 <= c1->nlem; i3++) {
@@ -2615,7 +2645,6 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) {
       dcomplex sum4 = cc0;
       int i1e = i1 + c1->ndi;
      int i3e = i3 + c1->nlem;
-#pragma parallel for simd reduction(+:sum1,sum2,sum3,sum4)
       for (int i2 = 1; i2 <= c1->ndi; i2++) {
         int i2e = i2 + c1->ndi;
         int vecindg_23 = (i2 - 1) * c1->nlem + i3 - 1;
@@ -2640,7 +2669,11 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) {
       sam_v[vecind1e + i3e - 1] = sum4;
     } // i3 loop
   } // i1 loop
-#pragma omp parallel for collapse(2)
+#ifdef USE_TARGET_OFFLOAD
+#pragma omp target teams distribute parallel for simd collapse(2)
+#else
+#pragma omp parallel for simd collapse(2)
+#endif
   for (int i1 = 1; i1 <= c1->ndi; i1++) {
     for (int i0 = 1; i0 <= c1->nlem; i0++) {
       int vecindex = (i1 - 1) * c1->nlem + i0 - 1;
@@ -2650,7 +2683,9 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) {
   } // i1 loop
   dcomplex *vec_am0m = c1->am0m[0];
 #ifdef USE_TARGET_OFFLOAD
-#pragma omp target parallel for collapse(2)
+#pragma omp target teams distribute parallel for simd collapse(2)
+#else
+#pragma omp parallel for simd collapse(2)
 #endif
   for (int i0 = 1; i0 <= c1->nlem; i0++) {
     for (int i3 = 1; i3 <= c1->nlemt; i3++) {
@@ -2661,11 +2696,11 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) {
       int i1e = i1 + c1->ndi;
       int vecind1 = (i1 - 1) * c1->nlemt;
       int vecind1e = (i1e - 1) * c1->nlemt;
-      a1 = sam_v[vecind1 + i3 - 1];
-      a2 = sam_v[vecind1e + i3 - 1];
+      dcomplex a1 = sam_v[vecind1 + i3 - 1];
+      dcomplex a2 = sam_v[vecind1e + i3 - 1];
       int vecindex = (i1 - 1) * c1->nlem + i0 - 1;
-      gie = gis_v[vecindex];
-      gle = gls_v[vecindex];
+      dcomplex gie = gis_v[vecindex];
+      dcomplex gle = gls_v[vecindex];
       sum1 += (a1 * gie + a2 * gle);
       sum2 += (a1 * gle + a2 * gie);
     } // i1 loop
diff --git a/src/scripts/model_maker.py b/src/scripts/model_maker.py
index 27e4352c34b2f2eafa7f013bd9c7791923c7f13b..f5cda8f15652e181d1ca935d7fc0fdd7a48c920d 100755
--- a/src/scripts/model_maker.py
+++ b/src/scripts/model_maker.py
@@ -328,6 +328,8 @@ def load_model(model_file):
         for j in range(expected_radii):
             sconf['rcf'][i][j] = float(model['particle_settings']['rad_frac'][i][j])
     # Create the gconf dict
+    use_refinement = False
+    dyn_orders = True
     try:
         use_refinement = False if model['system_settings']['refinement'] == "0" else True
     except KeyError:
@@ -335,7 +337,7 @@
     try:
         dyn_orders = False if model['radiation_settings']['dyn_orders'] == "0" else True
     except KeyError:
-        use_refinement = False
+        dyn_orders = True
     str_polar = model['radiation_settings']['polarization']
     if (str_polar not in ["LINEAR", "CIRCULAR"]):
         print("ERROR: %s is not a recognized polarization state."%str_polar)
@@ -875,10 +877,10 @@ def write_legacy_gconf(conf):
         output.write(str_line)
     str_line = " {0:d}\n0\n".format(conf['jwtm'])
     output.write(str_line)
-    if (gconf['use_refinement']):
+    if (conf['use_refinement']):
         str_line = "USE_REFINEMENT=1\n"
         output.write(str_line)
-    if (not gconf['dyn_orders']):
+    if (not conf['dyn_orders']):
         str_line = "USE_DYN_ORDER=0\n"
         output.write(str_line)
     output.close()
diff --git a/src/trapping/cfrfme.cpp b/src/trapping/cfrfme.cpp
index deba6ef5c322f6a715a6c184157036b07aff7520..b1b9985563d15756441d1c35f1fc39436d505a2b 100644
--- a/src/trapping/cfrfme.cpp
+++ b/src/trapping/cfrfme.cpp
@@ -61,6 +61,14 @@
 #include
 #endif
 
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#ifdef USE_TARGET_OFFLOAD
+#pragma omp requires unified_shared_memory
+#endif
+
 using namespace std;
 
 /*! \brief C++ implementation of FRFME
@@ -391,58 +399,64 @@ void frfme(string data_file, string output_path) {
 #ifdef USE_NVTX
   nvtxRangePush("j80 loop");
 #endif
-#pragma omp parallel for
-  for (int j80 = jlmf; j80 <= jlml; j80++) {
-    dcomplex *vec_w = new dcomplex[nkv * nkv]();
-    dcomplex **w = new dcomplex*[nkv];
+  dcomplex *vec_wsum = tfrfme->wsum[0];
+  double *vec_vkzm = vkzm[0];
+#ifdef USE_TARGET_OFFLOAD
+#pragma omp target teams distribute parallel for simd
+#else
+#pragma omp parallel for simd
+#endif
+  for (int j80 = jlmf-1; j80 < jlml; j80++) {
+    int nkvs = nkv * nkv;
+    dcomplex *vec_w = (dcomplex *) calloc(nkvs, sizeof(dcomplex));
+    dcomplex **w = (dcomplex **) calloc(nkv, sizeof(dcomplex *));
     for (int wi = 0; wi < nkv; wi++) w[wi] = vec_w + wi * nkv;
     dcomplex wk_value;
-    for (int jy50 = 0; jy50 < nkv; jy50++) {
-      for (int jx50 = 0; jx50 < nkv; jx50++) {
-        wk_value = tt1->wk[jy50 * nkv * nlmmt + jx50 * nlmmt + j80 - 1];
-        w[jx50][jy50] = wk_value;
-      } // jx50
-    } // jy50 loop
+    int wk_index = 0;
+    for (int jxy50 = 0; jxy50 < nkvs; jxy50++) {
+      wk_index = nlmmt*jxy50;
+      wk_value = tt1->wk[wk_index + j80];
+      int jy50 = jxy50 / nkv;
+      int jx50 = jxy50 % nkv;
+      vec_w[(nkv*jx50) + jy50] = wk_value;
+    } // jxy50 loop
     int ixyz = 0;
-    for (int iz75 = 0; iz75 < nzv; iz75++) {
+    for (int wj = 0; wj < nrvc; wj++) vec_wsum[(j80*nrvc)+wj] = cc0;
+    int nvtot = nxv*nyv*nzv;
+    int nvxy = nxv *nyv;
+    for (int ixyz = 0; ixyz < nvtot; ixyz++) {
+      int iz75 = ixyz / nvxy;
+      int iy70 = (ixyz % nvxy) / nxv;
+      int ix65 = ixyz % nxv;
       double z = _zv[iz75] + frsh;
-      for (int iy70 = 0; iy70 < nyv; iy70++) {
-        double y = _yv[iy70];
-        for (int ix65 = 0; ix65 < nxv; ix65++) {
-          double x = _xv[ix65];
-          ixyz++;
-          dcomplex sumy = cc0;
-          for (int jy60 = 0; jy60 < nkv; jy60++) {
-            double vky = vkv[jy60];
-            double vkx = vkv[nkv - 1];
-            double vkzf = vkzm[0][jy60];
-            dcomplex phasf = cexp(uim * (-vkx * x + vky * y + vkzf * z));
-            double vkzl = vkzm[nkv - 1][jy60];
-            dcomplex phasl = cexp(uim * (vkx * x + vky * y + vkzl * z));
-            dcomplex sumx = 0.5 * (w[0][jy60] * phasf + w[nkv - 1][jy60] * phasl);
-            for (int jx55 = 2; jx55 <= nks; jx55++) {
-              vkx = vkv[jx55 - 1];
-              double vkz = vkzm[jx55 - 1][jy60];
-              dcomplex phas = cexp(uim * (vkx * x + vky * y + vkz * z));
-              sumx += (w[jx55 - 1][jy60] * phas);
-            } // jx55 loop
-            if (jy60 == 0 || jy60 == nkv - 1) sumx *= 0.5;
-            sumy += sumx;
-          } // jy60 loop
-          tfrfme->wsum[j80 - 1][ixyz - 1] = sumy * delks;
-        } // ix65 loop
-      } // iy70 loop
-    } // iz75 loop
-    delete[] vec_w;
-    delete[] w;
+      double y = _yv[iy70];
+      double x = _xv[ix65];
+      dcomplex sumy = cc0;
+      for (int jy60x55 = 0; jy60x55 < nkvs ; jy60x55++) {
+        int jy60 = jy60x55 / nkv;
+        int jx55 = jy60x55 % nkv;
+        double vky = vkv[jy60];
+        double vkx = (jx55==0) ? vkv[nkv - 1] : vkv[jx55];
+        double vkz = vec_vkzm[(jx55*nkv)+jy60];
+        dcomplex phas = (jx55==0) ?
+          cexp(uim * (-vkx * x + vky * y + vkz * z)):
+          cexp(uim * (vkx * x + vky * y + vkz * z));
+        dcomplex sumx = vec_w[(jx55*nkv)+jy60] * phas;
+        double factor1 = ((jx55==0)||(jx55==(nkv-1))) ? 0.5 : 1.0;
+        double factor2 = ((jy60==0)||(jy60==(nkv-1))) ? 0.5 : 1.0;
+        sumx *= factor1*factor2;
+        sumy += sumx;
+      } // jy60x55 loop
+      vec_wsum[((j80)*nrvc)+ixyz] = sumy * delks;
+    } // ixyz loop
+    free(vec_w);
+    free(w);
   } // j80 loop
 #ifdef USE_NVTX
   nvtxRangePop();
-#endif
-  // label 88
-#ifdef USE_NVTX
   nvtxRangePush("Closing operations");
 #endif
+  // label 88
   tfrfme->write_binary(tfrfme_name, "HDF5");
   string output_name = output_path + "/c_OFRFME";
   FILE *output = fopen(output_name.c_str(), "w");
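
For reference, a minimal, self-contained sketch of the host/device OpenMP fallback pattern that the hunks above apply to each reduction loop: when USE_TARGET_OFFLOAD is defined, the loop is offloaded with "target teams distribute parallel for simd"; otherwise a plain host "parallel for simd" with the same reduction clause is compiled. This is illustrative only and not part of the patch; the file name, loop body, and variable names are assumptions.

// offload_fallback_sketch.cpp -- illustrative only, not code from the patch
#include <cstdio>

int main() {
  const int n = 1 << 20;
  double sum = 0.0;
#ifdef USE_TARGET_OFFLOAD
  // Device path: offload the reduction loop to the accelerator.
#pragma omp target teams distribute parallel for simd reduction(+:sum)
#else
  // Host fallback: same loop body, multithreaded on the CPU.
#pragma omp parallel for simd reduction(+:sum)
#endif
  for (int i = 0; i < n; i++) {
    sum += 1.0 / (1.0 + i);
  }
  std::printf("sum = %f\n", sum);
  return 0;
}

After preprocessing, only one of the two pragmas survives and binds to the loop that follows, so the same source builds for both the GPU-offload and the CPU-only configurations.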