Skip to content
Snippets Groups Projects
Commit 704e038d authored by Mulas, Giacomo's avatar Mulas, Giacomo
Browse files

linearise loops in cfrfme.cpp, to prepare for omp target offload

parent 4a8cfbd6
Branches
No related tags found
No related merge requests found
......@@ -61,6 +61,14 @@
#include <nvtx3/nvToolsExt.h>
#endif
#ifdef _OPENMP
#include <omp.h>
#endif
#ifdef USE_TARGET_OFFLOAD
#pragma omp requires unified_shared_memory
#endif
using namespace std;
/*! \brief C++ implementation of FRFME
......@@ -391,57 +399,84 @@ void frfme(string data_file, string output_path) {
#ifdef USE_NVTX
nvtxRangePush("j80 loop");
#endif
#pragma omp parallel for
for (int j80 = jlmf; j80 <= jlml; j80++) {
dcomplex *vec_w = new dcomplex[nkv * nkv]();
dcomplex **w = new dcomplex*[nkv];
dcomplex *vec_wsum = tfrfme->wsum[0];
double *vec_vkzm = vkzm[0];
#ifdef USE_TARGET_OFFLOAD
#pragma omp target teams distribute parallel for simd
#endif
// #pragma omp parallel for
for (int j80 = jlmf-1; j80 < jlml; j80++) {
int nkvs = nkv * nkv;
dcomplex *vec_w = (dcomplex *) calloc(nkvs, sizeof(dcomplex));
dcomplex **w = (dcomplex **) calloc(nkv, sizeof(dcomplex *));
// dcomplex *wk_local = new dcomplex[nlmmt]();
for (int wi = 0; wi < nkv; wi++) w[wi] = vec_w + wi * nkv;
dcomplex wk_value;
int wk_index = 0;
int nkvs = nkv * nkv;
for (int jy50 = 0; jy50 < nkv; jy50++) {
for (int jx50 = 0; jx50 < nkv; jx50++) {
// for (int jy50 = 0; jy50 < nkv; jy50++) {
// for (int jx50 = 0; jx50 < nkv; jx50++) {
// #ifdef USE_TARGET_OFFLOAD
// #pragma omp target teams distribute parallel for simd
// #endif
// #pragma omp parallel for
for (int jxy50 = 0; jxy50 < nkvs; jxy50++) {
// for (int wi = 0; wi < nlmmt; wi++) wk_local[wi] = tt1->wk[wk_index++];
// w[jx50][jy50] = wk_local[j80 - 1];
wk_value = tt1->wk[wk_index + j80 - 1];
wk_index += nlmmt;
w[jx50][jy50] = wk_value;
} // jx50
} // jy50 loop
// w[jx50][jy50] = wk_local[j80];
wk_index = nlmmt*jxy50;
wk_value = tt1->wk[wk_index + j80];
// wk_index += nlmmt;
int jy50 = jxy50 / nkv;
int jx50 = jxy50 % nkv;
vec_w[(nkv*jx50) + jy50] = wk_value;
// w[jx50][jy50] = wk_value;
} // jxy50 loop
// } // jx50
// } // jy50 loop
int ixyz = 0;
for (int wj = 0; wj < nrvc; wj++) tfrfme->wsum[j80 - 1][wj] = cc0;
for (int iz75 = 0; iz75 < nzv; iz75++) {
for (int wj = 0; wj < nrvc; wj++) vec_wsum[(j80*nrvc)+wj] = cc0;
int nvtot = nxv*nyv*nzv;
int nvxy = nxv *nyv;
// #ifdef USE_TARGET_OFFLOAD
// #pragma omp target teams distribute parallel for
// #endif
// #pragma omp parallel for
for (int ixyz = 0; ixyz < nvtot; ixyz++) {
int iz75 = ixyz / nvxy;
int iy70 = (ixyz % nvxy) / nxv;
int ix65 = ixyz % nxv;
// for (int iz75 = 0; iz75 < nzv; iz75++) {
double z = _zv[iz75] + frsh;
for (int iy70 = 0; iy70 < nyv; iy70++) {
// for (int iy70 = 0; iy70 < nyv; iy70++) {
double y = _yv[iy70];
for (int ix65 = 0; ix65 < nxv; ix65++) {
// for (int ix65 = 0; ix65 < nxv; ix65++) {
double x = _xv[ix65];
ixyz++;
// ixyz++;
dcomplex sumy = cc0;
for (int jy60 = 0; jy60 < nkv; jy60++) {
// #ifdef USE_TARGET_OFFLOAD
// #pragma omp target parallel for simd reduction(+:sumy)
// #endif
for (int jy60x55 = 0; jy60x55 < nkvs ; jy60x55++) {
int jy60 = jy60x55 / nkv;
int jx55 = jy60x55 % nkv;
double vky = vkv[jy60];
double vkx = vkv[nkv - 1];
double vkzf = vkzm[0][jy60];
dcomplex phasf = cexp(uim * (-vkx * x + vky * y + vkzf * z));
double vkzl = vkzm[nkv - 1][jy60];
dcomplex phasl = cexp(uim * (vkx * x + vky * y + vkzl * z));
dcomplex sumx = 0.5 * (w[0][jy60] * phasf + w[nkv - 1][jy60] * phasl);
for (int jx55 = 2; jx55 <= nks; jx55++) {
vkx = vkv[jx55 - 1];
double vkz = vkzm[jx55 - 1][jy60];
dcomplex phas = cexp(uim * (vkx * x + vky * y + vkz * z));
sumx += (w[jx55 - 1][jy60] * phas);
} // jx55 loop
if (jy60 == 0 || jy60 == nkv - 1) sumx *= 0.5;
double vkx = (jx55==0) ? vkv[nkv - 1] : vkv[jx55];
double vkz = vec_vkzm[(jx55*nkv)+jy60];
dcomplex phas = (jx55==0) ?
cexp(uim * (-vkx * x + vky * y + vkz * z)):
cexp(uim * (vkx * x + vky * y + vkz * z));
dcomplex sumx = vec_w[(jx55*nkv)+jy60] * phas;
double factor1 = ((jx55==0)||(jx55==(nkv-1))) ? 0.5 : 1.0;
double factor2 = ((jy60==0)||(jy60==(nkv-1))) ? 0.5 : 1.0;
sumx *= factor1*factor2;
sumy += sumx;
} // jy60 loop
tfrfme->wsum[j80 - 1][ixyz - 1] = sumy * delks;
} // ix65 loop
} // iy70 loop
} // iz75 loop
delete[] vec_w;
delete[] w;
} // jy60x55 loop
vec_wsum[((j80)*nrvc)+ixyz] = sumy * delks;
// } // ix65 loop
// } // iy70 loop
// } // iz75 loop
} // ixyz loop
free(vec_w);
free(w);
// delete[] wk_local;
} // j80 loop
#ifdef USE_NVTX
......@@ -499,3 +534,4 @@ void frfme(string data_file, string output_path) {
nvtxRangePop();
#endif
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment