Skip to content
Snippets Groups Projects
Commit 93521048 authored by Mulas, Giacomo's avatar Mulas, Giacomo
Browse files

fix races in ztm

parent 704e038d
No related branches found
No related tags found
No related merge requests found
......@@ -2534,7 +2534,7 @@ void tqr(
}
void ztm(dcomplex **am, ParticleDescriptor *c1) {
dcomplex gie, gle, a1, a2, a3, a4, sum1, sum2, sum3, sum4;
// dcomplex gie, gle, a1, a2, a3, a4, sum1, sum2, sum3, sum4;
const dcomplex cc0 = 0.0 + 0.0 * I;
// int i2 = 0; // old implementation
#ifdef USE_NVTX
......@@ -2546,7 +2546,6 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) {
// C9 *c9_para = new C9(*c9);
dcomplex *gis_v = c1->gis[0];
dcomplex *gls_v = c1->gls[0];
double *rac3j_local = (double *) malloc(c1->lmtpo*sizeof(double));
int k2max = c1->li*(c1->li+2);
int k3max = c1->le*(c1->le+2);
// To parallelise, I run a linearised loop directly over k
......@@ -2559,6 +2558,8 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) {
// im = im -(2*l+1) and l = l+1 (there was a rounding error in a nearly exact root)
#ifdef USE_TARGET_OFFLOAD
#pragma omp target teams distribute parallel for simd collapse(3)
#else
#pragma omp parallel for simd collapse(3)
#endif
for (int n2 = 1; n2 <= c1->nsph; n2++) { // GPU portable?
for (int k2 = 1; k2<=k2max; k2++) {
......@@ -2590,12 +2591,13 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) {
int i3 = l3 * l3 + im3 - 1;
int m3 = -l3 - 1 + im3;
int vecindex = (i2 - 1) * c1->nlem + i3 - 1;
double *rac3j_local = (double *) malloc(c1->lmtpo*sizeof(double));
gis_v[vecindex] = ghit_d(2, 0, n2, l2, m2, l3, m3, c1, rac3j_local);
gls_v[vecindex] = ghit_d(2, 1, n2, l2, m2, l3, m3, c1, rac3j_local);
free(rac3j_local);
} // close k3 loop, former l3 + im3 loops
} // close k2 loop, former l2 + im2 loops
} // close n2 loop
free(rac3j_local);
#ifdef USE_NVTX
nvtxRangePop();
#endif
......@@ -2606,6 +2608,8 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) {
dcomplex *sam_v = c1->sam[0];
#ifdef USE_TARGET_OFFLOAD
#pragma omp target teams distribute parallel for simd collapse(2)
#else
#pragma omp parallel for simd collapse(2)
#endif
for (int i1 = 1; i1 <= c1->ndi; i1++) { // GPU portable?
for (int i3 = 1; i3 <= c1->nlem; i3++) {
......@@ -2615,7 +2619,6 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) {
dcomplex sum4 = cc0;
int i1e = i1 + c1->ndi;
int i3e = i3 + c1->nlem;
#pragma parallel for simd reduction(+:sum1,sum2,sum3,sum4)
for (int i2 = 1; i2 <= c1->ndi; i2++) {
int i2e = i2 + c1->ndi;
int vecindg_23 = (i2 - 1) * c1->nlem + i3 - 1;
......@@ -2640,7 +2643,11 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) {
sam_v[vecind1e + i3e - 1] = sum4;
} // i3 loop
} // i1 loop
#pragma omp parallel for collapse(2)
#ifdef USE_TARGET_OFFLOAD
#pragma omp target teams distribute parallel for simd collapse(2)
#else
#pragma omp parallel for simd collapse(2)
#endif
for (int i1 = 1; i1 <= c1->ndi; i1++) {
for (int i0 = 1; i0 <= c1->nlem; i0++) {
int vecindex = (i1 - 1) * c1->nlem + i0 - 1;
......@@ -2650,7 +2657,9 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) {
} // i1 loop
dcomplex *vec_am0m = c1->am0m[0];
#ifdef USE_TARGET_OFFLOAD
#pragma omp target parallel for collapse(2)
#pragma omp target teams distribute parallel for simd collapse(2)
#else
#pragma omp parallel for simd collapse(2)
#endif
for (int i0 = 1; i0 <= c1->nlem; i0++) {
for (int i3 = 1; i3 <= c1->nlemt; i3++) {
......@@ -2661,11 +2670,11 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) {
int i1e = i1 + c1->ndi;
int vecind1 = (i1 - 1) * c1->nlemt;
int vecind1e = (i1e - 1) * c1->nlemt;
a1 = sam_v[vecind1 + i3 - 1];
a2 = sam_v[vecind1e + i3 - 1];
dcomplex a1 = sam_v[vecind1 + i3 - 1];
dcomplex a2 = sam_v[vecind1e + i3 - 1];
int vecindex = (i1 - 1) * c1->nlem + i0 - 1;
gie = gis_v[vecindex];
gle = gls_v[vecindex];
dcomplex gie = gis_v[vecindex];
dcomplex gle = gls_v[vecindex];
sum1 += (a1 * gie + a2 * gle);
sum2 += (a1 * gle + a2 * gie);
} // i1 loop
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment