From 704e038dcff6995f34142c2eda5758abb7690c37 Mon Sep 17 00:00:00 2001 From: "Mulas, Giacomo" <gmulas@oa-cagliari.inaf.it> Date: Tue, 10 Jun 2025 19:21:54 +0200 Subject: [PATCH] linearise loops in cfrfme.cpp, to prepare for omp target offload --- src/trapping/cfrfme.cpp | 124 ++++++++++++++++++++++++++-------------- 1 file changed, 80 insertions(+), 44 deletions(-) diff --git a/src/trapping/cfrfme.cpp b/src/trapping/cfrfme.cpp index 4c8010d..28c91cb 100644 --- a/src/trapping/cfrfme.cpp +++ b/src/trapping/cfrfme.cpp @@ -61,6 +61,14 @@ #include <nvtx3/nvToolsExt.h> #endif +#ifdef _OPENMP +#include <omp.h> +#endif + +#ifdef USE_TARGET_OFFLOAD +#pragma omp requires unified_shared_memory +#endif + using namespace std; /*! \brief C++ implementation of FRFME @@ -391,57 +399,84 @@ void frfme(string data_file, string output_path) { #ifdef USE_NVTX nvtxRangePush("j80 loop"); #endif -#pragma omp parallel for - for (int j80 = jlmf; j80 <= jlml; j80++) { - dcomplex *vec_w = new dcomplex[nkv * nkv](); - dcomplex **w = new dcomplex*[nkv]; + dcomplex *vec_wsum = tfrfme->wsum[0]; + double *vec_vkzm = vkzm[0]; +#ifdef USE_TARGET_OFFLOAD +#pragma omp target teams distribute parallel for simd +#endif +// #pragma omp parallel for + for (int j80 = jlmf-1; j80 < jlml; j80++) { + int nkvs = nkv * nkv; + dcomplex *vec_w = (dcomplex *) calloc(nkvs, sizeof(dcomplex)); + dcomplex **w = (dcomplex **) calloc(nkv, sizeof(dcomplex *)); // dcomplex *wk_local = new dcomplex[nlmmt](); for (int wi = 0; wi < nkv; wi++) w[wi] = vec_w + wi * nkv; dcomplex wk_value; int wk_index = 0; - int nkvs = nkv * nkv; - for (int jy50 = 0; jy50 < nkv; jy50++) { - for (int jx50 = 0; jx50 < nkv; jx50++) { + // for (int jy50 = 0; jy50 < nkv; jy50++) { + // for (int jx50 = 0; jx50 < nkv; jx50++) { +// #ifdef USE_TARGET_OFFLOAD +// #pragma omp target teams distribute parallel for simd +// #endif +// #pragma omp parallel for + for (int jxy50 = 0; jxy50 < nkvs; jxy50++) { // for (int wi = 0; wi < nlmmt; wi++) wk_local[wi] = tt1->wk[wk_index++]; - // w[jx50][jy50] = wk_local[j80 - 1]; - wk_value = tt1->wk[wk_index + j80 - 1]; - wk_index += nlmmt; - w[jx50][jy50] = wk_value; - } // jx50 - } // jy50 loop + // w[jx50][jy50] = wk_local[j80]; + wk_index = nlmmt*jxy50; + wk_value = tt1->wk[wk_index + j80]; + // wk_index += nlmmt; + int jy50 = jxy50 / nkv; + int jx50 = jxy50 % nkv; + vec_w[(nkv*jx50) + jy50] = wk_value; + // w[jx50][jy50] = wk_value; + } // jxy50 loop + // } // jx50 + // } // jy50 loop int ixyz = 0; - for (int wj = 0; wj < nrvc; wj++) tfrfme->wsum[j80 - 1][wj] = cc0; - for (int iz75 = 0; iz75 < nzv; iz75++) { + for (int wj = 0; wj < nrvc; wj++) vec_wsum[(j80*nrvc)+wj] = cc0; + int nvtot = nxv*nyv*nzv; + int nvxy = nxv *nyv; +// #ifdef USE_TARGET_OFFLOAD +// #pragma omp target teams distribute parallel for +// #endif +// #pragma omp parallel for + for (int ixyz = 0; ixyz < nvtot; ixyz++) { + int iz75 = ixyz / nvxy; + int iy70 = (ixyz % nvxy) / nxv; + int ix65 = ixyz % nxv; + // for (int iz75 = 0; iz75 < nzv; iz75++) { double z = _zv[iz75] + frsh; - for (int iy70 = 0; iy70 < nyv; iy70++) { - double y = _yv[iy70]; - for (int ix65 = 0; ix65 < nxv; ix65++) { - double x = _xv[ix65]; - ixyz++; - dcomplex sumy = cc0; - for (int jy60 = 0; jy60 < nkv; jy60++) { - double vky = vkv[jy60]; - double vkx = vkv[nkv - 1]; - double vkzf = vkzm[0][jy60]; - dcomplex phasf = cexp(uim * (-vkx * x + vky * y + vkzf * z)); - double vkzl = vkzm[nkv - 1][jy60]; - dcomplex phasl = cexp(uim * (vkx * x + vky * y + vkzl * z)); - dcomplex sumx = 0.5 * (w[0][jy60] * phasf + w[nkv - 1][jy60] * phasl); - for (int jx55 = 2; jx55 <= nks; jx55++) { - vkx = vkv[jx55 - 1]; - double vkz = vkzm[jx55 - 1][jy60]; - dcomplex phas = cexp(uim * (vkx * x + vky * y + vkz * z)); - sumx += (w[jx55 - 1][jy60] * phas); - } // jx55 loop - if (jy60 == 0 || jy60 == nkv - 1) sumx *= 0.5; - sumy += sumx; - } // jy60 loop - tfrfme->wsum[j80 - 1][ixyz - 1] = sumy * delks; - } // ix65 loop - } // iy70 loop - } // iz75 loop - delete[] vec_w; - delete[] w; + // for (int iy70 = 0; iy70 < nyv; iy70++) { + double y = _yv[iy70]; + // for (int ix65 = 0; ix65 < nxv; ix65++) { + double x = _xv[ix65]; + // ixyz++; + dcomplex sumy = cc0; +// #ifdef USE_TARGET_OFFLOAD +// #pragma omp target parallel for simd reduction(+:sumy) +// #endif + for (int jy60x55 = 0; jy60x55 < nkvs ; jy60x55++) { + int jy60 = jy60x55 / nkv; + int jx55 = jy60x55 % nkv; + double vky = vkv[jy60]; + double vkx = (jx55==0) ? vkv[nkv - 1] : vkv[jx55]; + double vkz = vec_vkzm[(jx55*nkv)+jy60]; + dcomplex phas = (jx55==0) ? + cexp(uim * (-vkx * x + vky * y + vkz * z)): + cexp(uim * (vkx * x + vky * y + vkz * z)); + dcomplex sumx = vec_w[(jx55*nkv)+jy60] * phas; + double factor1 = ((jx55==0)||(jx55==(nkv-1))) ? 0.5 : 1.0; + double factor2 = ((jy60==0)||(jy60==(nkv-1))) ? 0.5 : 1.0; + sumx *= factor1*factor2; + sumy += sumx; + } // jy60x55 loop + vec_wsum[((j80)*nrvc)+ixyz] = sumy * delks; + // } // ix65 loop + // } // iy70 loop + // } // iz75 loop + } // ixyz loop + free(vec_w); + free(w); // delete[] wk_local; } // j80 loop #ifdef USE_NVTX @@ -499,3 +534,4 @@ void frfme(string data_file, string output_path) { nvtxRangePop(); #endif } + -- GitLab