From e279e64d81df084e2f83004af9e36241b20eb4db Mon Sep 17 00:00:00 2001 From: Giovanni La Mura Date: Fri, 11 Jul 2025 17:42:47 +0200 Subject: [PATCH] Prepare nested OpenMP parallelism in TRAPPING --- src/trapping/cfrfme.cpp | 44 +++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/src/trapping/cfrfme.cpp b/src/trapping/cfrfme.cpp index fbc9a935..b26e6d7d 100644 --- a/src/trapping/cfrfme.cpp +++ b/src/trapping/cfrfme.cpp @@ -398,7 +398,8 @@ void frfme(string data_file, string output_path) { dcomplex *vec_wsum = tfrfme->wsum[0]; int size_wsum = nlmmt * nrvc; double *vec_vkzm = vkzm[0]; - int size_vkzm = nkv * nkv; + int nkvs = nkv * nkv; + int size_vkzm = nkvs; const dcomplex *vec_tt1_wk = tt1->wk; int size_tt1_wk = nkv * nkv * nlmmt; int size_global_vec_w = nkv * nkv * (jlml - jlmf + 1); @@ -412,34 +413,34 @@ void frfme(string data_file, string output_path) { device_id = omp_get_default_device(); global_vec_w = (dcomplex *)omp_target_alloc(size_global_vec_w * sizeof(dcomplex), device_id); global_w = (dcomplex **)omp_target_alloc(size_global_w * sizeof(dcomplex), device_id); -#pragma omp target teams distribute parallel for simd map(tofrom: vec_wsum[0:size_wsum]) \ - map(to:vec_vkzm[0:size_vkzm], vkv[0:nkv], vec_tt1_wk[0:size_tt1_wk], _xv[0:nxv], _yv[0:nyv], _zv[0:nzv]) \ - map(to: global_vec_w, global_w) \ - firstprivate(jlmf, jlml, nkv, nlmmt, nrvc, nxv, nyv, nzv, frsh, uim, delks) +#pragma omp target teams distribute parallel for \ + map(tofrom: vec_wsum[0:size_wsum]) \ + map(to: vec_vkzm[0:size_vkzm], vkv[0:nkv], vec_tt1_wk[0:size_tt1_wk], _xv[0:nxv]) \ + map(to: _yv[0:nyv], _zv[0:nzv], global_vec_w, global_w) #else // Fall-back host work-space allocation global_vec_w = = new dcomplex[size_global_vec_w](); global_w = new dcomplex*[size_global_w]; -#pragma omp parallel for simd -#endif - for (int j80 = jlmf-1; j80 < jlml; j80++) { - int nkvs = nkv * nkv; +#pragma omp parallel for +#endif // USE_TARGET_OFFLOAD + for (int j80 = jlmf - 1; j80 < jlml; j80++) { dcomplex *vec_w = global_vec_w + nkvs * (j80 - jlmf + 1); dcomplex **w = global_w + nkv * (j80 - jlmf + 1); +#pragma omp parallel for simd for (int wi = 0; wi < nkv; wi++) w[wi] = vec_w + wi * nkv; - dcomplex wk_value; - int wk_index = 0; +#pragma omp parallel for simd for (int jxy50 = 0; jxy50 < nkvs; jxy50++) { - wk_index = nlmmt * jxy50; - wk_value = vec_tt1_wk[wk_index + j80]; + int wk_index = nlmmt * jxy50; + dcomplex wk_value = vec_tt1_wk[wk_index + j80]; int jy50 = jxy50 / nkv; int jx50 = jxy50 % nkv; - vec_w[(nkv*jx50) + jy50] = wk_value; + vec_w[(nkv * jx50) + jy50] = wk_value; } // jxy50 loop - int ixyz = 0; - for (int wj = 0; wj < nrvc; wj++) vec_wsum[(j80 * nrvc)+wj] = cc0; +#pragma omp parallel for simd + for (int wj = 0; wj < nrvc; wj++) vec_wsum[(j80 * nrvc) + wj] = cc0; int nvtot = nxv * nyv * nzv; int nvxy = nxv * nyv; +#pragma omp parallel for for (int ixyz = 0; ixyz < nvtot; ixyz++) { int iz75 = ixyz / nvxy; int iy70 = (ixyz % nvxy) / nxv; @@ -448,18 +449,19 @@ void frfme(string data_file, string output_path) { double y = _yv[iy70]; double x = _xv[ix65]; dcomplex sumy = cc0; +#pragma omp parallel for simd for (int jy60x55 = 0; jy60x55 < nkvs ; jy60x55++) { int jy60 = jy60x55 / nkv; int jx55 = jy60x55 % nkv; double vky = vkv[jy60]; - double vkx = (jx55==0) ? vkv[nkv - 1] : vkv[jx55]; + double vkx = (jx55 == 0) ? vkv[nkv - 1] : vkv[jx55]; double vkz = vec_vkzm[(jx55 * nkv) + jy60]; - dcomplex phas = (jx55==0) ? + dcomplex phas = (jx55 == 0) ? cexp(uim * (-vkx * x + vky * y + vkz * z)): cexp(uim * (vkx * x + vky * y + vkz * z)); - dcomplex sumx = vec_w[(jx55 * nkv)+jy60] * phas; - double factor1 = ((jx55==0)||(jx55==(nkv-1))) ? 0.5 : 1.0; - double factor2 = ((jy60==0)||(jy60==(nkv-1))) ? 0.5 : 1.0; + dcomplex sumx = vec_w[(jx55 * nkv) + jy60] * phas; + double factor1 = ((jx55 == 0) || (jx55 == (nkv - 1))) ? 0.5 : 1.0; + double factor2 = ((jy60 == 0) || (jy60 == (nkv - 1))) ? 0.5 : 1.0; sumx *= factor1*factor2; sumy += sumx; } // jy60x55 loop -- GitLab