Giacomo Mulas / NP_TMcode / Commits

Commit 93521048
authored 2 months ago by Mulas, Giacomo
fix races in ztm
parent 704e038d

Showing 1 changed file with 19 additions and 10 deletions:

src/libnptm/clu_subs.cpp (+19 −10)
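The commit message is terse, but the diff below shows two distinct races, both of the same shape: state that should have been per-thread was declared once and shared. First, the function-scope temporaries gie, gle, a1..a4, sum1..sum4 were written concurrently by every thread of the OpenMP loops; second, a single rac3j_local scratch buffer was passed to ghit_d from all iterations at once. The fix moves each into the innermost scope that needs it. A minimal sketch of the pattern, illustrative only (plain double instead of libnptm's dcomplex, placeholder names):

// Illustrative only -- not repository code. Compile with: g++ -fopenmp ...
#include <cstdio>
#include <vector>

int main() {
  const int n = 1000;
  std::vector<double> out(n);

  double tmp; // function scope: shared by every thread -> data race
#pragma omp parallel for
  for (int i = 0; i < n; i++) {
    tmp = 0.5 * i;      // all threads write the same variable
    out[i] = tmp + 1.0; // may read a value another thread just wrote
  }

  // The commit's fix: declare temporaries at their point of use, which
  // makes them block-scope and therefore private to each iteration.
#pragma omp parallel for
  for (int i = 0; i < n; i++) {
    double t = 0.5 * i; // loop-local -> implicitly private
    out[i] = t + 1.0;
  }

  std::printf("%g\n", out[n - 1]);
  return 0;
}

The same diff also tightens the offload pragmas (host fallbacks, teams distribute), annotated hunk by hunk below.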
@@ -2534,7 +2534,7 @@ void tqr(
 }
 
 void ztm(dcomplex **am, ParticleDescriptor *c1) {
-  dcomplex gie, gle, a1, a2, a3, a4, sum1, sum2, sum3, sum4;
+  // dcomplex gie, gle, a1, a2, a3, a4, sum1, sum2, sum3, sum4;
   const dcomplex cc0 = 0.0 + 0.0 * I;
   // int i2 = 0; // old implementation
 #ifdef USE_NVTX
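The declarations are commented out rather than deleted, leaving a record of the old layout: under OpenMP's default data-sharing rules, a variable declared before a parallel construct is shared by all threads, which is exactly what made these temporaries racy. A defensive idiom that turns this class of bug into a compile error is default(none); a sketch, not something this file uses:

#include <cstddef>
#include <vector>

// Sketch: with default(none) every name used in the region needs an
// explicit data-sharing clause, so a stray function-scope temporary like
// the old gie/gle/sum* declarations would be rejected by the compiler.
void scale(std::vector<double>& v, double f) {
#pragma omp parallel for default(none) shared(v) firstprivate(f)
  for (std::size_t i = 0; i < v.size(); i++) {
    double t = v[i] * f; // declared inside the loop: implicitly private
    v[i] = t;
  }
}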
@@ -2546,7 +2546,6 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) {
   // C9 *c9_para = new C9(*c9);
   dcomplex *gis_v = c1->gis[0];
   dcomplex *gls_v = c1->gls[0];
-  double *rac3j_local = (double *) malloc(c1->lmtpo * sizeof(double));
   int k2max = c1->li * (c1->li + 2);
   int k3max = c1->le * (c1->le + 2);
   // To parallelise, I run a linearised loop directly over k
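This deletion removes the single rac3j_local workspace (3j-related, judging by the name and the lmtpo sizing) that every thread of the loops below used to pass to ghit_d. A routine that fills caller-provided scratch must never receive the same buffer from concurrent iterations. The hazard, reduced to a sketch with placeholder names:

#include <cstdlib>

// Sketch: fill_scratch plays the role ghit_d plays for rac3j_local.
static double fill_scratch(int i, double* work, int len) {
  for (int k = 0; k < len; k++) work[k] = i + k; // racy if 'work' is shared
  return work[len - 1];
}

void driver(double* out, int m, int len) {
#pragma omp parallel for
  for (int i = 0; i < m; i++) {
    // per-iteration buffer, as a later hunk of this commit does
    double* work = (double*) malloc(len * sizeof(double));
    out[i] = fill_scratch(i, work, len);
    free(work);
  }
}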
@@ -2559,6 +2558,8 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) {
   // im = im -(2*l+1) and l = l+1 (there was a rounding error in a nearly exact root)
 #ifdef USE_TARGET_OFFLOAD
 #pragma omp target teams distribute parallel for simd collapse(3)
+#else
+#pragma omp parallel for simd collapse(3)
+#endif
   for (int n2 = 1; n2 <= c1->nsph; n2++) { // GPU portable?
     for (int k2 = 1; k2 <= k2max; k2++) {
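The new #else branch matters for performance as well as symmetry: as reconstructed here, a build without USE_TARGET_OFFLOAD appears to have had no pragma on this loop nest at all, so it ran serially on the host. The pattern in isolation (the map clauses are an addition to keep the sketch self-contained; clu_subs.cpp manages its device data elsewhere):

// Sketch of the build-time switch used throughout this commit:
// one loop body, two parallelisation strategies.
void axpy(double* y, const double* x, double a, int n) {
#ifdef USE_TARGET_OFFLOAD
#pragma omp target teams distribute parallel for simd map(tofrom: y[0:n]) map(to: x[0:n])
#else
#pragma omp parallel for simd
#endif
  for (int i = 0; i < n; i++)
    y[i] += a * x[i];
}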
@@ -2590,12 +2591,13 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) {
         int i3 = l3 * l3 + im3 - 1;
         int m3 = -l3 - 1 + im3;
         int vecindex = (i2 - 1) * c1->nlem + i3 - 1;
+        double *rac3j_local = (double *) malloc(c1->lmtpo * sizeof(double));
         gis_v[vecindex] = ghit_d(2, 0, n2, l2, m2, l3, m3, c1, rac3j_local);
         gls_v[vecindex] = ghit_d(2, 1, n2, l2, m2, l3, m3, c1, rac3j_local);
+        free(rac3j_local);
       } // close k3 loop, former l3 + im3 loops
     } // close k2 loop, former l2 + im2 loops
   } // close n2 loop
-  free(rac3j_local);
 #ifdef USE_NVTX
   nvtxRangePop();
 #endif
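Here the scratch buffer reappears inside the innermost loop body, allocated and freed per iteration, with the now-dangling free after the loop removed. Per-iteration malloc is the simplest thread-safe choice and the one that works unchanged under target offload. On a host-only build, a per-thread buffer would amortise the allocator traffic; a sketch of that alternative (not what the commit does):

#include <cstdlib>

// Host-only alternative sketch: one scratch buffer per thread, reused
// across all iterations that thread executes, freed once at region end.
// This shape does not carry over to 'target teams distribute' regions.
void driver(double* out, int m, int len) {
#pragma omp parallel
  {
    double* work = (double*) malloc(len * sizeof(double));
#pragma omp for
    for (int i = 0; i < m; i++) {
      for (int k = 0; k < len; k++) work[k] = i + k;
      out[i] = work[len - 1];
    }
    free(work);
  }
}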
@@ -2606,6 +2608,8 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) {
   dcomplex *sam_v = c1->sam[0];
 #ifdef USE_TARGET_OFFLOAD
 #pragma omp target teams distribute parallel for simd collapse(2)
+#else
+#pragma omp parallel for simd collapse(2)
+#endif
   for (int i1 = 1; i1 <= c1->ndi; i1++) { // GPU portable?
     for (int i3 = 1; i3 <= c1->nlem; i3++) {
@@ -2615,7 +2619,6 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) {
       dcomplex sum4 = cc0;
       int i1e = i1 + c1->ndi;
       int i3e = i3 + c1->nlem;
-#pragma parallel for simd reduction(+:sum1,sum2,sum3,sum4)
       for (int i2 = 1; i2 <= c1->ndi; i2++) {
        int i2e = i2 + c1->ndi;
        int vecindg_23 = (i2 - 1) * c1->nlem + i3 - 1;
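The deleted line is a latent typo rather than a functional change: "#pragma parallel for simd ..." lacks the omp token, so compilers treat it as an unknown pragma and silently ignore it. Spelled correctly it would also have requested parallelism nested inside the already-parallel i1/i3 loops, so removing it is the cleaner fix. For reference, a well-formed reduction looks like this sketch (plain doubles for brevity; the real sums in ztm are dcomplex):

double dot(const double* a, const double* b, int n) {
  double sum = 0.0;
#pragma omp parallel for simd reduction(+ : sum)
  for (int i = 0; i < n; i++)
    sum += a[i] * b[i];
  return sum;
}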
@@ -2640,7 +2643,11 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) {
       sam_v[vecind1e + i3e - 1] = sum4;
     } // i3 loop
   } // i1 loop
-#pragma omp parallel for collapse(2)
+#ifdef USE_TARGET_OFFLOAD
+#pragma omp target teams distribute parallel for simd collapse(2)
+#else
+#pragma omp parallel for simd collapse(2)
+#endif
   for (int i1 = 1; i1 <= c1->ndi; i1++) {
     for (int i0 = 1; i0 <= c1->nlem; i0++) {
       int vecindex = (i1 - 1) * c1->nlem + i0 - 1;
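Replacing the bare "#pragma omp parallel for collapse(2)" with the ifdef pair also adds simd on the host path. collapse(2) fuses the i1/i0 nest into a single iteration space of ndi*nlem, which is legal only because the loops are perfectly nested; the flattened vecindex arithmetic mirrors that fusion. In isolation:

// Sketch: collapse(2) over a perfectly nested pair of 1-based loops,
// using the same row-major flattening as ztm's vecindex.
void clear(double* v, int ndi, int nlem) {
#pragma omp parallel for simd collapse(2)
  for (int i1 = 1; i1 <= ndi; i1++) {
    for (int i0 = 1; i0 <= nlem; i0++) {
      int vecindex = (i1 - 1) * nlem + i0 - 1;
      v[vecindex] = 0.0;
    }
  }
}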
@@ -2650,7 +2657,9 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) {
   } // i1 loop
   dcomplex *vec_am0m = c1->am0m[0];
 #ifdef USE_TARGET_OFFLOAD
-#pragma omp target parallel for collapse(2)
+#pragma omp target teams distribute parallel for simd collapse(2)
+#else
+#pragma omp parallel for simd collapse(2)
 #endif
   for (int i0 = 1; i0 <= c1->nlem; i0++) {
     for (int i3 = 1; i3 <= c1->nlemt; i3++) {
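This hunk upgrades "target parallel for" to "target teams distribute parallel for simd" and gives the host build its own branch. The distinction is occupancy: target parallel for runs the whole loop in a single team (roughly one GPU thread block), while teams distribute spreads the collapsed iteration space across a league of teams. A sketch (again with an illustrative map clause not taken from clu_subs.cpp):

// Sketch: league of teams on the device, plain threads on the host.
void fill_ones(double* v, int n) {
#ifdef USE_TARGET_OFFLOAD
#pragma omp target teams distribute parallel for simd map(from: v[0:n])
#else
#pragma omp parallel for simd
#endif
  for (int i = 0; i < n; i++)
    v[i] = 1.0;
}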
@@ -2661,11 +2670,11 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) {
       int i1e = i1 + c1->ndi;
       int vecind1 = (i1 - 1) * c1->nlemt;
       int vecind1e = (i1e - 1) * c1->nlemt;
-      a1 = sam_v[vecind1 + i3 - 1];
-      a2 = sam_v[vecind1e + i3 - 1];
+      dcomplex a1 = sam_v[vecind1 + i3 - 1];
+      dcomplex a2 = sam_v[vecind1e + i3 - 1];
       int vecindex = (i1 - 1) * c1->nlem + i0 - 1;
-      gie = gis_v[vecindex];
-      gle = gls_v[vecindex];
+      dcomplex gie = gis_v[vecindex];
+      dcomplex gle = gls_v[vecindex];
       sum1 += (a1 * gie + a2 * gle);
       sum2 += (a1 * gle + a2 * gie);
     } // i1 loop
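The final hunk is the core of the race fix: a1, a2, gie, and gle stop being function-wide shared variables and become block-scope, hence private wherever the enclosing loops are parallelised. The inner accumulation, lifted into a self-contained sketch (dcomplex is assumed to be C99 double _Complex, a compiler extension in C++, matching libnptm's usage of I; bounds and names follow the diff):

#include <complex.h>
typedef double _Complex dcomplex; // assumed to match libnptm's typedef

// Sketch of this hunk's inner i1 loop: every temporary is block-scope,
// so the loop is safe to run inside a parallel region.
void accumulate(const dcomplex* sam_v, const dcomplex* gis_v,
                const dcomplex* gls_v, dcomplex* sums,
                int ndi, int nlem, int nlemt, int i3, int i0) {
  dcomplex sum1 = 0.0, sum2 = 0.0;
  for (int i1 = 1; i1 <= ndi; i1++) {
    int i1e = i1 + ndi;
    int vecind1 = (i1 - 1) * nlemt;
    int vecind1e = (i1e - 1) * nlemt;
    dcomplex a1 = sam_v[vecind1 + i3 - 1];   // block-scope, private
    dcomplex a2 = sam_v[vecind1e + i3 - 1];
    int vecindex = (i1 - 1) * nlem + i0 - 1;
    dcomplex gie = gis_v[vecindex];
    dcomplex gle = gls_v[vecindex];
    sum1 += (a1 * gie + a2 * gle);
    sum2 += (a1 * gle + a2 * gie);
  }
  sums[0] = sum1;
  sums[1] = sum2;
}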