Skip to content
Snippets Groups Projects
Commit 7dd24b46 authored by Giovanni La Mura
Browse files

Allocate global work-space directly on device, if using offload

parent 38e5b786
No related branches found
No related tags found
No related merge requests found
...@@ -401,16 +401,25 @@ void frfme(string data_file, string output_path) { ...@@ -401,16 +401,25 @@ void frfme(string data_file, string output_path) {
int size_vkzm = nkv * nkv; int size_vkzm = nkv * nkv;
const dcomplex *vec_tt1_wk = tt1->wk; const dcomplex *vec_tt1_wk = tt1->wk;
int size_tt1_wk = nkv * nkv * nlmmt; int size_tt1_wk = nkv * nkv * nlmmt;
dcomplex *global_vec_w = new dcomplex[nkv * nkv * (jlml - jlmf + 1)]();
dcomplex **global_w = new dcomplex*[nkv * (jlml - jlmf + 1)];
int size_global_vec_w = nkv * nkv * (jlml - jlmf + 1); int size_global_vec_w = nkv * nkv * (jlml - jlmf + 1);
int size_global_w = nkv * (jlml - jlmf + 1); int size_global_w = nkv * (jlml - jlmf + 1);
int device_id = 0;
// Work-space pointers for simultaneous threads
dcomplex *global_vec_w;
dcomplex **global_w;
#ifdef USE_TARGET_OFFLOAD #ifdef USE_TARGET_OFFLOAD
// Device-only work-space allocation
device_id = omp_get_default_device();
global_vec_w = (dcomplex *)omp_target_alloc(size_global_vec_w * sizeof(dcomplex), device_id);
global_w = (dcomplex **)omp_target_alloc(size_global_w * sizeof(dcomplex), device_id);
#pragma omp target teams distribute parallel for simd map(tofrom: vec_wsum[0:size_wsum]) \ #pragma omp target teams distribute parallel for simd map(tofrom: vec_wsum[0:size_wsum]) \
map(to:vec_vkzm[0:size_vkzm], vkv[0:nkv], vec_tt1_wk[0:size_tt1_wk], _xv[0:nxv], _yv[0:nyv], _zv[0:nzv]) \ map(to:vec_vkzm[0:size_vkzm], vkv[0:nkv], vec_tt1_wk[0:size_tt1_wk], _xv[0:nxv], _yv[0:nyv], _zv[0:nzv]) \
map(alloc:global_w[0:size_global_w], global_vec_w[0:size_global_vec_w]) \ map(to: global_vec_w, global_w) \
firstprivate(jlmf, jlml, nkv, nlmmt, nrvc, nxv, nyv, nzv, frsh, uim, delks) firstprivate(jlmf, jlml, nkv, nlmmt, nrvc, nxv, nyv, nzv, frsh, uim, delks)
#else #else
// Fall-back host work-space allocation
global_vec_w = = new dcomplex[size_global_vec_w]();
global_w = new dcomplex*[size_global_w];
#pragma omp parallel for simd #pragma omp parallel for simd
#endif #endif
for (int j80 = jlmf-1; j80 < jlml; j80++) { for (int j80 = jlmf-1; j80 < jlml; j80++) {
...@@ -457,8 +466,13 @@ void frfme(string data_file, string output_path) { ...@@ -457,8 +466,13 @@ void frfme(string data_file, string output_path) {
vec_wsum[((j80) * nrvc) + ixyz] = sumy * delks; vec_wsum[((j80) * nrvc) + ixyz] = sumy * delks;
} // ixyz loop } // ixyz loop
} // j80 loop } // j80 loop
#ifdef USE_TARGET_OFFLOAD
omp_target_free(global_w, device_id);
omp_target_free(global_vec_w, device_id);
#else
delete[] global_w; delete[] global_w;
delete[] global_vec_w; delete[] global_vec_w;
#endif // USE_TARGET_OFFLOAD
#ifdef USE_NVTX #ifdef USE_NVTX
nvtxRangePop(); nvtxRangePop();
nvtxRangePush("Closing operations"); nvtxRangePush("Closing operations");
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment