diff --git a/src/libnptm/clu_subs.cpp b/src/libnptm/clu_subs.cpp
index b299911750c1c0b9aa8ee12a6ec843d3d6a285d0..cb2757eae471741fa28c1ecda123abac1dfe29b6 100644
--- a/src/libnptm/clu_subs.cpp
+++ b/src/libnptm/clu_subs.cpp
@@ -2259,10 +2259,6 @@ void scr2(
       vec_sas[vecindex+2] = s21 * csam;
       vec_sas[vecindex+1] = s12 * csam;
       vec_sas[vecindex+3] = s22 * csam;
-      // c1->sas[i][0][0] = s11 * csam;
-      // c1->sas[i][1][0] = s21 * csam;
-      // c1->sas[i][0][1] = s12 * csam;
-      // c1->sas[i][1][1] = s22 * csam;
     } // label 12
     dcomplex phas = cexp(cph * (duk[0] * c1->rxx[i] + duk[1] * c1->ryy[i] + duk[2] * c1->rzz[i]));
@@ -2270,10 +2266,6 @@ void scr2(
     tsas10 += (c1->sas[iogi - 1][1][0] * phas);
     tsas01 += (c1->sas[iogi - 1][0][1] * phas);
     tsas11 += (c1->sas[iogi - 1][1][1] * phas);
-    // c3->tsas[0][0] += (c1->sas[iogi - 1][0][0] * phas);
-    // c3->tsas[1][0] += (c1->sas[iogi - 1][1][0] * phas);
-    // c3->tsas[0][1] += (c1->sas[iogi - 1][0][1] * phas);
-    // c3->tsas[1][1] += (c1->sas[iogi - 1][1][1] * phas);
   } // i14 loop
 #ifdef USE_NVTX
   nvtxRangePop();
 #endif
@@ -2297,11 +2289,9 @@ void scr2(
 #pragma omp target teams distribute parallel for simd collapse(4)
   for (int ipo1 = 1; ipo1 <=2; ipo1++) {
     for (int jpo1 = 1; jpo1 <= 2; jpo1++) {
-      // cc = dconjg(c1->sas[i24 - 1][jpo1 - 1][ipo1 - 1]);
       for (int ipo2 = 1; ipo2 <= 2; ipo2++) {
         for (int jpo2 = 1; jpo2 <= 2; jpo2++) {
           int j = jpo2 + (ipo2-1)*2 + (jpo1-1)*4 + (ipo1-1)*8;
-          // j++;
           vec_vints[(i24 - 1)*16+j - 1] = c1->sas[i24 - 1][jpo2 - 1][ipo2 - 1] * dconjg(c1->sas[i24 - 1][jpo1 - 1][ipo1 - 1]) * cfsq;
         } // jpo2 loop
       } // ipo2 loop
@@ -2322,10 +2312,8 @@ void scr2(
 #pragma omp target parallel for collapse(4)
   for (int ipo1 = 1; ipo1 <=2; ipo1++) {
     for (int jpo1 = 1; jpo1 <= 2; jpo1++) {
-      // cc = dconjg(c3->tsas[jpo1 - 1][ipo1 - 1]);
       for (int ipo2 = 1; ipo2 <= 2; ipo2++) {
         for (int jpo2 = 1; jpo2 <= 2; jpo2++) {
-          // j++;
           int j = jpo2-1 + (ipo2-1)*2 + (jpo1-1)*4 + (ipo1-1)*8;
           c1ao->vintt[j] = c3->tsas[jpo2 - 1][ipo2 - 1] * dconjg(c3->tsas[jpo1 - 1][ipo1 - 1]) * cfsq;
         } // jpo2 loop
@@ -2443,38 +2431,54 @@ void ztm(dcomplex **am, C1 *c1, C1_AddOns *c1ao, C4 *c4, C6 *c6, C9 * c9) {
   // C9 *c9_para = new C9(*c9);
   dcomplex *gis_v = c9->gis[0];
   dcomplex *gls_v = c9->gls[0];
-#pragma omp target parallel
-  {
-    double *rac3j_local = (double *) malloc(c6->lmtpo*sizeof(double));
-#pragma omp for collapse(3)
-    for (int n2 = 1; n2 <= c4->nsph; n2++) { // GPU portable?
-      for (int l2 = 1; l2 <= c4->li; l2++) {
-        for (int l3 = 1; l3 <= c4->le; l3++) {
-          int l2tpo = l2 + l2 + 1;
-          int l3tpo = l3 + l3 + 1;
-          for (int im2 = 1; im2 <= l2tpo; im2++) {
-            int i2 = (n2-1) * c4->li * (c4->li + 2) + l2 * l2 + im2 - 1;
-            // int vecindex0 = (i2 - 1)*c9->nlem;
-            int m2 = -l2 - 1 + im2;
-            // i2++; // old implementation
-            // int i3 = 0; // old implementation
-            //#pragma omp for
-            for (int im3 = 1; im3 <= l3tpo; im3++) {
-              int i3 = l3 * l3 + im3 - 1;
-              int m3 = -l3 - 1 + im3;
-              int vecindex = (i2 - 1)*c9->nlem + i3 - 1;
-              // i3++; // old implementation
-              gis_v[vecindex] = ghit_d(2, 0, n2, l2, m2, l3, m3, c1, c1ao, c4, rac3j_local);
-              gls_v[vecindex] = ghit_d(2, 1, n2, l2, m2, l3, m3, c1, c1ao, c4, rac3j_local);
-              // c9->gis[i2 - 1][i3 - 1] = ghit(2, 0, n2, l2, m2, l3, m3, c1, c1ao, c4, c6_local);
-              // c9->gls[i2 - 1][i3 - 1] = ghit(2, 1, n2, l2, m2, l3, m3, c1, c1ao, c4, c6_local);
-            } // im3 loop
-          } // l3 loop
-        } // im2 loop
-      } // l2 loop
-    } // n2 loop
-    free(rac3j_local);
-  } // pragma omp parallel
+  double *rac3j_local = (double *) malloc(c6->lmtpo*sizeof(double));
+  int k2max = c4->li*(c4->li+2);
+  int k3max = c4->le*(c4->le+2);
+  // To parallelise, I run a linearised loop directly over k.
+  // Working out the algebra, it turns out that
+  //   k = l*l - 1 + im
+  // which we invert to find
+  //   l = (int) sqrt(k+1) and im = k - l*l + 1
+  // If this yields im = 0, we set l = l-1 and im = 2*l+1;
+  // if it yields im > 2*l+1 (a rounding error in a nearly exact root),
+  // we set im = im - (2*l+1) and l = l+1.
+# pragma omp target teams distribute parallel for simd collapse(3)
+  for (int n2 = 1; n2 <= c4->nsph; n2++) { // GPU portable?
+    for (int k2 = 1; k2<=k2max; k2++) {
+      for (int k3 = 1; k3<=k3max; k3++) {
+        int l2 = (int) sqrt(k2+1);
+        int im2 = k2 - (l2*l2) + 1;
+        if (im2 == 0) {
+          l2--;
+          im2 = 2*l2+1;
+        }
+        else if (im2 > 2*l2 + 1) {
+          im2 -= 2*l2 + 1;
+          l2++;
+        }
+        int l3 = (int) sqrt(k3+1);
+        int im3 = k3 - (l3*l3) + 1;
+        if (im3 == 0) {
+          l3--;
+          im3 = 2*l3+1;
+        }
+        else if (im3 > 2*l3 + 1) {
+          im3 -= 2*l3 + 1;
+          l3++;
+        }
+        // int l2tpo = l2 + l2 + 1;
+        // int l3tpo = l3 + l3 + 1;
+        int i2 = (n2-1) * c4->li * (c4->li + 2) + l2 * l2 + im2 - 1;
+        int m2 = -l2 - 1 + im2;
+        int i3 = l3 * l3 + im3 - 1;
+        int m3 = -l3 - 1 + im3;
+        int vecindex = (i2 - 1)*c9->nlem + i3 - 1;
+        gis_v[vecindex] = ghit_d(2, 0, n2, l2, m2, l3, m3, c1, c1ao, c4, rac3j_local);
+        gls_v[vecindex] = ghit_d(2, 1, n2, l2, m2, l3, m3, c1, c1ao, c4, rac3j_local);
+      } // close k3 loop, former l3 + im3 loops
+    } // close k2 loop, former l2 + im2 loops
+  } // close n2 loop
+  free(rac3j_local);
 #ifdef USE_NVTX
   nvtxRangePop();
 #endif
@@ -2483,7 +2487,7 @@ void ztm(dcomplex **am, C1 *c1, C1_AddOns *c1ao, C4 *c4, C6 *c6, C9 * c9) {
 #endif
   dcomplex *am_v = am[0];
   dcomplex *sam_v = c9->sam[0];
-# pragma omp target teams distribute parallel for collapse(2)
+# pragma omp target teams distribute parallel for simd collapse(2)
   for (int i1 = 1; i1 <= ndi; i1++) { // GPU portable?
     for (int i3 = 1; i3 <= c4->nlem; i3++) {
       dcomplex sum1 = cc0;
@@ -2492,6 +2496,7 @@
       dcomplex sum2 = cc0;
       dcomplex sum3 = cc0;
       dcomplex sum4 = cc0;
       int i1e = i1 + ndi;
       int i3e = i3 + c4->nlem;
+#pragma omp simd reduction(+:sum1,sum2,sum3,sum4) // 'omp' sentinel required, else the pragma is silently ignored
       for (int i2 = 1; i2 <= ndi; i2++) {
         int i2e = i2 + ndi;
         int vecindg_23 = (i2 - 1)*c9->nlem + i3 - 1;
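
Review note on the linearised ztm() loop above: the k -> (l, im) inversion can be exercised in isolation. A minimal standalone check, assuming nothing from the repository (LMAX is an arbitrary test bound; the patch itself runs k2 up to c4->li*(c4->li+2) and k3 up to c4->le*(c4->le+2)):

#include <cassert>
#include <cmath>
#include <cstdio>

int main() {
  const int LMAX = 1000; // hypothetical bound, stands in for c4->li / c4->le
  for (int l = 1; l <= LMAX; l++) {
    for (int im = 1; im <= 2 * l + 1; im++) {
      int k = l * l - 1 + im;                // forward map from the comment
      int lr = (int) sqrt((double) (k + 1)); // inverse map, as in the patch
      int imr = k - lr * lr + 1;
      if (imr == 0) {                // k + 1 was an exact square: step l back
        lr--;
        imr = 2 * lr + 1;
      } else if (imr > 2 * lr + 1) { // sqrt rounded just below an exact root
        imr -= 2 * lr + 1;
        lr++;
      }
      assert(lr == l && imr == im); // round trip must recover (l, im)
    }
  }
  printf("k -> (l, im) inversion verified for l <= %d\n", LMAX);
  return 0;
}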
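
On the last hunk: the added reduction pragma lacked its omp sentinel as submitted (an unknown pragma is silently ignored), so it is spelled #pragma omp simd reduction(...) above; OpenMP permits a nested simd construct inside a simd region, so keeping simd on the outer combined construct should remain conforming. A minimal sketch of the same pattern on toy data, assuming nothing from the repository (N, M and the arrays are illustrative):

#include <cstdio>
#include <vector>

int main() {
  const int N = 256, M = 512;
  std::vector<double> a(N * M, 1.0), out(N, 0.0);
  double *a_p = a.data(), *out_p = out.data();
  // Outer loop spread over teams/threads on the device (or the host
  // fallback); the inner accumulation loop carries a simd reduction.
#pragma omp target teams distribute parallel for \
    map(to: a_p[0:N*M]) map(from: out_p[0:N])
  for (int i = 0; i < N; i++) {
    double sum = 0.0;
#pragma omp simd reduction(+:sum) // 'omp' sentinel is mandatory here too
    for (int j = 0; j < M; j++) {
      sum += a_p[i * M + j];
    }
    out_p[i] = sum;
  }
  printf("out[0] = %f\n", out_p[0]); // expect 512
  return 0;
}

Built with e.g. g++ -O2 -fopenmp; without an offload-capable toolchain the target region simply executes on the host.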