Skip to content
Snippets Groups Projects
Commit 95f8fcf8 authored by Mulas, Giacomo's avatar Mulas, Giacomo
Browse files

parallelisation in principle done, working but not very effective, profiling

needed
parent 8a4d015b
No related branches found
No related tags found
No related merge requests found
...@@ -1293,6 +1293,9 @@ void mextc(double vk, double exri, dcomplex **fsac, double **cextlr, double **ce ...@@ -1293,6 +1293,9 @@ void mextc(double vk, double exri, dcomplex **fsac, double **cextlr, double **ce
} }
void pcros(double vk, double exri, C1 *c1, C1_AddOns *c1ao, C4 *c4) { void pcros(double vk, double exri, C1 *c1, C1_AddOns *c1ao, C4 *c4) {
#ifdef USE_NVTX
nvtxRangePush("whole pcros");
#endif
const dcomplex cc0 = 0.0 + 0.0 * I; const dcomplex cc0 = 0.0 + 0.0 * I;
dcomplex sump, sum1, sum2, sum3, sum4, am, amp, cc, csam; dcomplex sump, sum1, sum2, sum3, sum4, am, amp, cc, csam;
const double exdc = exri * exri; const double exdc = exri * exri;
...@@ -1303,8 +1306,13 @@ void pcros(double vk, double exri, C1 *c1, C1_AddOns *c1ao, C4 *c4) { ...@@ -1303,8 +1306,13 @@ void pcros(double vk, double exri, C1 *c1, C1_AddOns *c1ao, C4 *c4) {
double cfsq = 4.0 / (pi4sq *ccs * ccs); double cfsq = 4.0 / (pi4sq *ccs * ccs);
const int nlemt = c4->nlem + c4->nlem; const int nlemt = c4->nlem + c4->nlem;
int jpo = 2; int jpo = 2;
for (int ipo18 = 1; ipo18 <= 2; ipo18++) { dcomplex *vec_am0m = c1ao->am0m[0];
if (ipo18 == 2) jpo = 1; dcomplex *vec_w = c1->w[0];
#ifdef USE_NVTX
nvtxRangePush("pcros outer loop 1");
#endif
for (int ipo18 = 0; ipo18 < 2; ipo18++) {
int jpo = 1-ipo18;
int ipopt = ipo18 + 2; int ipopt = ipo18 + 2;
int jpopt = jpo + 2; int jpopt = jpo + 2;
double sum = 0.0; double sum = 0.0;
...@@ -1313,32 +1321,44 @@ void pcros(double vk, double exri, C1 *c1, C1_AddOns *c1ao, C4 *c4) { ...@@ -1313,32 +1321,44 @@ void pcros(double vk, double exri, C1 *c1, C1_AddOns *c1ao, C4 *c4) {
sum2 = cc0; sum2 = cc0;
sum3 = cc0; sum3 = cc0;
sum4 = cc0; sum4 = cc0;
for (int i12 = 1; i12 <= nlemt; i12++) { #ifdef USE_NVTX
int i = i12 - 1; nvtxRangePush("pcros intermediate loop 1");
am = cc0; #endif
amp = cc0; #pragma omp target teams distribute parallel for simd reduction(+:sum, sump, sum1, sum2, sum3, sum4)
for (int j10 = 1; j10 <= nlemt; j10++) { for (int i12 = 0; i12 < nlemt; i12++) {
int j = j10 - 1; // int i = i12 - 1;
am += (c1ao->am0m[i][j] * c1->w[j][ipo18 - 1]); dcomplex am = cc0;
amp += (c1ao->am0m[i][j] * c1->w[j][jpo - 1]); dcomplex amp = cc0;
//#pragma omp target teams distribute parallel for simd reduction(+:am,amp)
for (int j10 = 0; j10 < nlemt; j10++) {
// int j = j10 - 1;
am += (vec_am0m[nlemt*i12+j10] * vec_w[4*j10+ipo18]);
amp += (vec_am0m[nlemt*i12+j10] * vec_w[4*j10+jpo]);
} // j10 loop } // j10 loop
sum += real(dconjg(am) * am); sum += real(dconjg(am) * am);
sump += (dconjg(amp) * am); sump += (dconjg(amp) * am);
sum1 += (dconjg(c1->w[i][ipo18 - 1]) * am); sum1 += (dconjg(vec_w[4*i12+ipo18]) * am);
sum2 += (dconjg(c1->w[i][jpo - 1]) * am); sum2 += (dconjg(vec_w[4*i12+jpo]) * am);
sum3 += (c1->w[i][ipopt - 1] * am); sum3 += (vec_w[4*i12+ipopt] * am);
sum4 += (c1->w[i][jpopt - 1] * am); sum4 += (vec_w[4*i12+jpopt] * am);
} // i12 loop } // i12 loop
c1ao->scsc[ipo18 - 1] = cccs * sum; #ifdef USE_NVTX
c1ao->scscp[ipo18 - 1] = cccs * sump; nvtxRangePop();
c1ao->ecsc[ipo18 - 1] = -cccs * real(sum1); #endif
c1ao->ecscp[ipo18 - 1] = -cccs * sum2; c1ao->scsc[ipo18] = cccs * sum;
c1ao->fsac[ipo18 - 1][ipo18 - 1] = csam * sum1; c1ao->scscp[ipo18] = cccs * sump;
c1ao->fsac[jpo - 1][ipo18 - 1] = csam * sum2; c1ao->ecsc[ipo18] = -cccs * real(sum1);
c1ao->sac[ipo18 - 1][ipo18 - 1] = csam * sum3; c1ao->ecscp[ipo18] = -cccs * sum2;
c1ao->sac[jpo - 1][ipo18 - 1] = csam * sum4; c1ao->fsac[ipo18][ipo18] = csam * sum1;
c1ao->fsac[jpo][ipo18] = csam * sum2;
c1ao->sac[ipo18][ipo18] = csam * sum3;
c1ao->sac[jpo][ipo18] = csam * sum4;
} // ipo18 loop } // ipo18 loop
int i = 0; int i = 0;
dcomplex * &vint = c1->vint;
#ifdef USE_NVTX
nvtxRangePush("pcros loop 2");
#endif
for (int ipo1 = 1; ipo1 <= 2; ipo1++) { for (int ipo1 = 1; ipo1 <= 2; ipo1++) {
for (int jpo1 = 1; jpo1 <= 2; jpo1++) { for (int jpo1 = 1; jpo1 <= 2; jpo1++) {
cc = dconjg(c1ao->sac[jpo1 - 1][ipo1 - 1]); cc = dconjg(c1ao->sac[jpo1 - 1][ipo1 - 1]);
...@@ -1349,38 +1369,47 @@ void pcros(double vk, double exri, C1 *c1, C1_AddOns *c1ao, C4 *c4) { ...@@ -1349,38 +1369,47 @@ void pcros(double vk, double exri, C1 *c1, C1_AddOns *c1ao, C4 *c4) {
} // ipo2 loop } // ipo2 loop
} // jpo1 loop } // jpo1 loop
} // ipo1 loop } // ipo1 loop
#ifdef USE_NVTX
nvtxRangePop();
#endif
#ifdef USE_NVTX
nvtxRangePop();
#endif
} }
void pcrsm0(double vk, double exri, int inpol, C1 *c1, C1_AddOns *c1ao, C4 *c4) { void pcrsm0(double vk, double exri, int inpol, C1 *c1, C1_AddOns *c1ao, C4 *c4) {
const dcomplex cc0 = 0.0 + 0.0 * I; const dcomplex cc0 = 0.0 + 0.0 * I;
const dcomplex uim = 0.0 + 1.0 * I; const dcomplex uim = 0.0 + 1.0 * I;
dcomplex sum1, sum2, sum3, sum4, sumpd; dcomplex sum1, sum2, sum3, sum4;
dcomplex sums1, sums2, sums3, sums4, csam; dcomplex sums1, sums2, sums3, sums4, csam;
double exdc = exri * exri; double exdc = exri * exri;
double ccs = 4.0 * acos(0.0) / (vk * vk); double ccs = 4.0 * acos(0.0) / (vk * vk);
double cccs = ccs / exdc; double cccs = ccs / exdc;
int nlemt = c4->nlem + c4->nlem;
dcomplex *vec_am0m = c1ao->am0m[0];
csam = -(ccs / (exri * vk)) * 0.5 * I; csam = -(ccs / (exri * vk)) * 0.5 * I;
sum2 = cc0; sum2 = cc0;
sum3 = cc0; sum3 = cc0;
for (int i14 = 1; i14 <= c4->nlem; i14++) { // GPU portable? #pragma omp target teams distribute parallel for simd reduction(+:sum2,sum3)
for (int i14 = 0; i14 < c4->nlem; i14++) {
int ie = i14 + c4->nlem; int ie = i14 + c4->nlem;
sum2 += (c1ao->am0m[i14 - 1][i14 - 1] + c1ao->am0m[ie - 1][ie - 1]); sum2 += (vec_am0m[nlemt*i14 + i14] + vec_am0m[nlemt*ie + ie]);
sum3 += (c1ao->am0m[i14 - 1][ie - 1] + c1ao->am0m[ie - 1][i14 - 1]); sum3 += (vec_am0m[nlemt*i14 + ie] + vec_am0m[nlemt*ie + i14]);
} // i14 loop } // i14 loop
double sumpi = 0.0; double sumpi = 0.0;
sumpd = cc0; dcomplex sumpd = cc0;
int nlemt = c4->nlem + c4->nlem; #pragma omp target teams distribute parallel for simd collapse(2) reduction(+:sumpi,sumpd)
for (int i16 = 1; i16 <= nlemt; i16++) { for (int i16 = 0; i16 < nlemt; i16++) {
for (int j16 = 1; j16 <= c4->nlem; j16++) { for (int j16 = 0; j16 < c4->nlem; j16++) {
int je = j16 + c4->nlem; int je = j16 + c4->nlem;
double rvalue = real( double rvalue = real(
dconjg(c1ao->am0m[i16 - 1][j16 - 1]) * c1ao->am0m[i16 - 1][j16 - 1] dconjg(vec_am0m[nlemt*i16 + j16]) * vec_am0m[nlemt*i16 + j16]
+ dconjg(c1ao->am0m[i16 - 1][je - 1]) * c1ao->am0m[i16 - 1][je - 1] + dconjg(vec_am0m[nlemt*i16 + je]) * vec_am0m[nlemt*i16 + je]
); );
sumpi += rvalue; sumpi += rvalue;
sumpd += ( sumpd += (
dconjg(c1ao->am0m[i16 - 1][j16 - 1]) * c1ao->am0m[i16 - 1][je - 1] dconjg(vec_am0m[nlemt*i16 + j16]) * vec_am0m[nlemt*i16 + je]
+ dconjg(c1ao->am0m[i16 - 1][je - 1]) * c1ao->am0m[i16 - 1][j16 - 1] + dconjg(vec_am0m[nlemt*i16 + je]) * vec_am0m[nlemt*i16 + j16]
); );
} // j16 loop } // j16 loop
} // i16 loop } // i16 loop
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment