diff --git a/src/vect.1b.c b/src/vect.1b.c index 167e21f89220eb9fc315392e7c6d52421a1f747b..757d5ce04f5cd2ae3023470fcf87d2a63af6b7ef 100644 --- a/src/vect.1b.c +++ b/src/vect.1b.c @@ -138,16 +138,14 @@ void process_with_structure( const P_t * restrict P, } - double m1m2 = Ptarget.mass * data[3]; - // 1D distances - // - double dx = Ptarget.pos[0] - data[0]; - double dy = Ptarget.pos[1] - data[1]; - double dz = Ptarget.pos[2] - data[2]; + double register m1m2 = Ptarget.mass * data[3]; + double register dx = Ptarget.pos[0] - data[0]; + double register dy = Ptarget.pos[1] - data[1]; + double register dz = Ptarget.pos[2] - data[2]; // square the distance // - double dist3 = (dx*dx + dy*dy) + dz*dz; + double register dist3 = (dx*dx + dy*dy) + dz*dz; dist3 = 1.0/(dist3*sqrt(dist3)); // accumulate the force @@ -194,23 +192,19 @@ void process_with_vectors( const v4df * restrict V, { const v4df Vtarget = V[target]; // save the target vector in a local vector (much probably a register) - v4df Vbuffer; // The array of particles that are processed in the inner loop - v4df vforce = {0}; // accumulate the forces in separate lanes + v4df Vbuffer; // The array of particles that are processed in the inner loop + v4df vforce = {0}; // accumulate the force components v4df mask_pos = {1,1,1,0}; // used to mask out the mass v4df mask_mass = {0,0,0,1}; // used to mask out the position - // pre-load the first BUNCH of particles - // Vbuffer = V[ngb_list[0]]; - // outern loop - // loop over bunches for ( int i = 0; i < Nngb; i++ ) { - // hide the memory latency by pre-loding - // the next BUNCH + // try to hide the memory latency by + // pre-loading the next particle // v4df preVbuffer; if ( i < Nngb-1 ) @@ -227,7 +221,7 @@ void process_with_vectors( const v4df * restrict V, // the mass product with the i-th neighbour // - v4df register prod = (Vtarget * Vbuffer) * mask_mass; + v4df register prod = (Vtarget * Vbuffer); // get the mass product as a single double // @@ -369,8 +363,10 @@ void process_with_arrays( const double * restrict pos_x, const double * restrict pos_y, const double * restrict pos_z, const double * restrict mass, - const int target, const int *ngb_list, const int Nngb, - double force[3] ) + const int target, + const int * ngb_list, + const int Nngb, + double force[3] ) { const double x = pos_x[target]; @@ -400,22 +396,16 @@ void process_with_arrays( const double * restrict pos_x, nextm = mass[ngb_list[i+1]]; } - /* - vx = pos_x[ngb_list[i]]; - vy = pos_y[ngb_list[i]]; - vz = pos_z[ngb_list[i]]; - vm = mass[ngb_list[i]]; - */ // get distances along x, y and z for the i-th neighbour // - double dx = x - vx; - double dy = y - vy; - double dz = z - vz; - double mm = m * vm; + double register dx = x - vx; + double register dy = y - vy; + double register dz = z - vz; + double register mm = m * vm; // calculate the distance^2 for BUNCH neighbours // - double dist3 = dx*dx+dy*dy+dz*dz; + double register dist3 = dx*dx+dy*dy+dz*dz; dist3 = dist3*sqrt(dist3); @@ -614,7 +604,7 @@ void process_with_arrays_intrinsics( const double * restrict pos_x, IVDEP LOOP_VECTORIZE - LOOP_UNROLL_N(4) + LOOP_UNROLL_N(1) for ( int i = 0; i < _Nngb_; i+=4 ) { __m256d ngbx = _mm256_set_pd ( pos_x[ngb_list[i]], pos_x[ngb_list[i+1]], pos_x[ngb_list[i+2]], pos_x[ngb_list[i+3]] ); @@ -1091,16 +1081,11 @@ int main( int argc, char **argv ) if ( shot == 1 ) PAPI_STOP_CNTR; - - int offset = target*3; - for ( int i = 0; i < 3; i++ ) - force[ offset + i] += myforce[i]; - - /* if ( j == 0 ) */ - /* printf("shot %d timing is %g\n", shot, 
CPU_TIME-now); */ - + } + //printf("%d t%d %g %g %g\n", j, target, myforce[0], myforce[1], myforce[2]); + timing += this_timing; } @@ -1113,7 +1098,7 @@ int main( int argc, char **argv ) assert( myforce[i] != 0 ); assert( !isnan(myforce[i]) ); assert( !isinf(myforce[i]) ); - force[offset+i] /= (Nrepetitions+1); + force[offset+i] = myforce[i] / (Nrepetitions+1); } } diff --git a/src/vect.2b.c b/src/vect.2b.c new file mode 100644 index 0000000000000000000000000000000000000000..181fe49d29c7e300d7723455f69db0127b00f038 --- /dev/null +++ b/src/vect.2b.c @@ -0,0 +1,1009 @@ + +/* ────────────────────────────────────────────────────────────────────────── * + │ │ + │ This file is part of the nbody-vectorization-toy-lab in the fram of the │ + │ SPACE Center of Excellence │ + │ © 2024 │ + │ The work has been started by Luca Tornatore @ INAF (Italian National │ + │ Institute for Astrophysics) and continued in collaboration with │ + │ Vassilis Flouris @ FORTH (Foundation for Research and Technology - Hellas │ + │ │ + │ Consult the accopanying file changes.log for details │ + │ │ + │ contact: │ + │ luca.tornatore@inaf.it │ + │ vflouris@ics.forth.gr │ + │ │ + │ │ + │ ××××××××××××××××××××××××××××××××××××××××××××××××××××××××××××××××××××××××× │ + │ │ + │ This is open source software; you can redistribute it and/or modify │ + │ it under the terms of the "3 clauses BSD license" │ + │ https://opensource.org/license/bsd-3-clause │ + │ │ + │ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS │ + │ “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT │ + │ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS │ + │ FOR A PARTICULAR PURPOSE ARE DISCLAIMED. │ + │ │ + * ────────────────────────────────────────────────────────────────────────── */ + + + +#if defined(__STDC__) +#if (__STDC_VERSION__ <= 201710L) +#error "c2x standard is needed to compile this source" +#endif +#define _XOPEN_SOURCE 700 +#endif + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <time.h> +#include <math.h> +#include <assert.h> + +#include <vector_pragmas.h> +#include <vector_types.h> + +#include "mypapi.h" +//#include <immintrin.h> + +#define CPU_TIME ({struct timespec ts; clock_gettime( CLOCK_PROCESS_CPUTIME_ID, &ts ), (double)ts.tv_sec + \ + (double)ts.tv_nsec * 1e-9;}) + +// with how many neighbours each particle interacts, in avg +// +#define NNEIGHBOURS 200 + +#define NREPETITIONS 5 + +#define DATASIZE 4 // 3 positions + the mass + +// how many different implementations +// +#define NSHOTS 5 +const char *implementation_labels[NSHOTS+1] = {"generate particles", + "Using AoS", + "using AoV (array of vectors)", + "using AoV with intrinsics", + "using SoA", + "using SoA with intrinsics" }; + +// define a vector type +// + +#define DV4SIZE DATASIZE +typedef double v4df __attribute__ ((vector_size (DV4SIZE*sizeof(double)))); +typedef union +{ + v4df P; + double p[DV4SIZE]; +} v4df_u; + + +// define a structure +typedef struct { + double pos[3]; + double mass; + #if defined(FILL) + double array[FILL]; + #endif +} P_t; + + + + +// ··································································· +// +/* + * Processing the particles as structures + * + */ + +void process_with_structure( const P_t * restrict P, void * restrict mem, + const int target, const int *ngb_list, const int Nngb, + double force[3], + double *timing ) +{ + + typedef struct { + double pos[3]; + double mass; } Plocal_t ; + + /* + * move all the neighbours into a local buffer + * here we hide the latency of 
sparse memory access + * and we try to enhance the vectorization by building + * the possibility of a loop over contiguous memory + * + */ + Plocal_t * restrict memP = __builtin_assume_aligned((Plocal_t*)mem, VALIGN); + for ( int i = 0; i < Nngb; i++ ) + memcpy(&memP[i], &P[ngb_list[i]], 4*sizeof(double) ); + + // let's return the timing of the pure calculation so that we + // can compare with v1 + // + + *timing = CPU_TIME; + + Plocal_t Ptarget; + memcpy(&Ptarget, &P[target], 4*sizeof(double) ); + double local_force[3] = {0}; // this will accumulate the total force on target from all the neigbhours + + IVDEP + LOOP_VECTORIZE + VECTOR_ALIGNED + for ( int i = 0; i < Nngb; i++ ) + { + + double register m1m2 = Ptarget.mass * memP[i].mass; + // 1D distances + // + double register dx = Ptarget.pos[0] - memP[i].pos[0]; + double register dy = Ptarget.pos[1] - memP[i].pos[1]; + double register dz = Ptarget.pos[2] - memP[i].pos[2]; + + // square the distance + // + double register dist3 = (dx*dx + dy*dy) + dz*dz; + dist3 = 1.0/(dist3*sqrt(dist3)); + + // accumulate the force + // + local_force[0] += m1m2 * dx * dist3; + local_force[1] += m1m2 * dy * dist3; + local_force[2] += m1m2 * dz * dist3; + + + /* + double check = P[target].mass * memP[i].mass / dist2; + printf("T%d n %d->%d %g %g %g %g\n", + target, i, ngb_list[i], dx*dx, dy*dy, dz*dz, check ); + */ + + } + + for ( int i = 0; i < 3; i++) + force[i] += local_force[i]; + + *timing = CPU_TIME - *timing; + return; +} + + +// ··································································· +// +/* + * Here we trat the particles explicitly as vectors + * The array if structures P is accessed as v4df vectors + * + */ + + + +void process_with_vectors( const v4df *restrict V, + void *restrict mem, + const int target, + const int *ngb_list, + const int Nngb, + double force[3], + double *timing ) +{ + alignas(VALIGN) const v4df Vtarget = V[target]; // save the target vector in a local vector (much probably a register) + alignas(VALIGN) const v4df mask_pos = {1,1,1,0}; // used to mask out the mass + alignas(VALIGN) const v4df mask_mass = {0,0,0,1}; // used to mask out the position + alignas(VALIGN) v4df vforce = {0,0,0,0}; // accumulate the force components + + + + ASSUME_ALIGNED(mem, VALIGN); + ASSUME_ALIGNED(V, VALIGN); + ASSUME_ALIGNED(ngb_list, VALIGN); + v4df *memV = ASSUME_ALIGNED((v4df*)mem, VALIGN); + + IVDEP + LOOP_UNROLL_N(4) + VECTOR_ALIGNED + for ( int i = 0; i < Nngb; i++ ) + memV[i] = V[ ngb_list[i] ]; + + *timing = CPU_TIME; + + // set-up the new iteration spaces + // + //int _Nngb_ = Nngb & (-4); + + IVDEP + VECTOR_ALIGNED + LOOP_UNROLL_N(4) + for ( int i = 0; i < Nngb; i++ ) + { + // the distance^2 of the i-th neighbour + // + v4df register dist = (Vtarget - memV[i]) * mask_pos; + v4df_u register dist2; + dist2.P = dist*dist; + double register scalar_dist3 = (dist2.p[0]+dist2.p[1])+(dist2.p[2]+dist2.p[3]); + scalar_dist3 = 1.0 / (scalar_dist3*sqrt(scalar_dist3)); + + // the mass product with the i-th neighbour + // + v4df register prod = (Vtarget * memV[i]); + + // get the mass product as a single double + // + v4df mm = dist * ((v4df_u)prod).p[3]; + + // get the force from the i-th neighbour + // + vforce += mm * scalar_dist3; + // + // NOTE: the accumulation here above on a single double is obvisouly a bottleneck + // because it imposes a strict constraint on compiler's optimization. 
+ // I + } + + + // accumulate the partial results + // + for ( int i = 0; i < 3; i++ ) + force[i] += ((v4df_u)vforce).p[i]; + + + *timing = CPU_TIME - *timing; + return; + +} + + + + +// ··································································· +// +/* + * + */ + +void process_with_vectors_intrinsics( const double * restrict V, + void * restrict mem, + const int target, + const int * ngb_list, + const int Nngb, + double force[3], + double * timing ) + +{ + long long bb = -1; // used for masking entries of a vector + __m256d register mask_pos = _mm256_set_pd(0, *(double*)&bb, *(double*)&bb, *(double*)&bb); // used to mask out the mass + __m256d register mask_mass = _mm256_set_pd(*(double*)&bb, 0, 0, 0); // used to mask out the position + const __m256d register vtarget = _mm256_load_pd(&V[target*4]); // load the target data + + + double * restrict memV = ASSUME_ALIGNED ( (double*)mem, VALIGN); + __m256d * restrict memVV = ASSUME_ALIGNED ((__m256d*)mem, VALIGN); + __m256d * restrict VV = ASSUME_ALIGNED ((__m256d*)V, VALIGN); + + __m256d Vforce = {0}; + + + IVDEP + LOOP_VECTORIZE + LOOP_UNROLL + for ( int i = 0; i < Nngb; i++ ) + memVV[i] = VV[ngb_list[i]]; + + double now = CPU_TIME; + + IVDEP + LOOP_VECTORIZE + LOOP_UNROLL + for ( int i = 0; i < Nngb; i++ ) + { + //__m256d register vdist; + //__m256d register vdist3; + __m256d vdist; + __m256d vdist3; + // note: the 64bits fields in the following + // comments are reported so that at + // the beginning there are the first + // bits in the vector field, i.e. + // those at the least significant bits + // >> however, when you consider the + // _mm256_set_pd instruction, the arguments + // are in the opposite direction: the first + // one sets the most significant bits and + // so on + // + // least ---> most + + // get [ dx, dy, dz dm ] + vdist = _mm256_sub_pd ( vtarget, memVV[i] ); + // + // get [ dx^2, dy^2, dz^2, dm^2 ] + vdist3 = _mm256_mul_pd ( vdist, vdist ); + + // + // get [ dx^2, dy^2, dz^2, 0 ] + vdist3 = _mm256_and_pd ( vdist3, mask_pos ); + // + // get [ dx^2+dy^2, dx^2+dy^2, dz^2, dz^2] + vdist3 = _mm256_hadd_pd ( vdist3, vdist3 ); + // + // get [ dx^2+dy^2, dz^2, dx^2+dy^2, dz^2 ] + vdist3 = _mm256_permute4x64_pd ( vdist3, 0b00100111); + // + // get [ dx^2+dy^2+dz^2, .., .., .. 
] + vdist3 = _mm256_hadd_pd ( vdist3, vdist3 ); + vdist3 = _mm256_mul_pd ( vdist3, _mm256_sqrt_pd(vdist3) ); + + __m256d mprod; + double m1m2; + // + // get [ x0*x1, y0*y1, z0*z1, m0*m1 ] + mprod = _mm256_mul_pd( vtarget, memVV[i] ); + m1m2 = (((double*)&mprod)[3]); + mprod = _mm256_set_pd( m1m2, m1m2, m1m2, m1m2 ); + + // + // get [ 0, 0, 0, m0*m1 ] + mprod = _mm256_mul_pd( mprod, vdist ); + // + // get [ 0, 0, 0, m0*m1 / (dx^2+dy^2+dz^2) ] + mprod = _mm256_div_pd( mprod, vdist3 ); + + // get the uppermost element from the vector + // --> here we should use the reshuffle+extraction instruction + Vforce += mprod; + + } + + for ( int k = 0; k < 3; k++ ) + force[k] += ((double*)&Vforce)[k]; + + + *timing = CPU_TIME - now; + return; +} + + + +// ··································································· +// +/* + * + */ + +void process_with_arrays( const double * restrict pos_x, + const double * restrict pos_y, + const double * restrict pos_z, + const double * restrict mass, + void * restrict mems[], + const int target, + const int * restrict ngb_list, + const int Nngb, + double force[3], + double * timing ) + +{ + ASSUME_ALIGNED( pos_x, VALIGN ); + ASSUME_ALIGNED( pos_y, VALIGN ); + ASSUME_ALIGNED( pos_z, VALIGN ); + ASSUME_ALIGNED( mass, VALIGN ); + ASSUME_ALIGNED( ngb_list, VALIGN ); + + double *_memx = ASSUME_ALIGNED((double*)mems[0], VALIGN); + double *_memy = ASSUME_ALIGNED((double*)mems[1], VALIGN); + double *_memz = ASSUME_ALIGNED((double*)mems[2], VALIGN); + double *_memm = ASSUME_ALIGNED((double*)mems[3], VALIGN); + + IVDEP + LOOP_VECTORIZE + LOOP_UNROLL_N(4) + for ( int i = 0; i < Nngb; i++ ) { + int k = ngb_list[i]; + _memx[i] = pos_x[k]; } + + IVDEP + LOOP_VECTORIZE + LOOP_UNROLL_N(4) + for ( int i = 0; i < Nngb; i++ ) { + int k = ngb_list[i]; + _memy[i] = pos_y[k]; } + + IVDEP + LOOP_VECTORIZE + LOOP_UNROLL_N(4) + for ( int i = 0; i < Nngb; i++ ) { + int k = ngb_list[i]; + _memz[i] = pos_z[k]; } + + IVDEP + LOOP_VECTORIZE + LOOP_UNROLL_N(4) + for ( int i = 0; i < Nngb; i++ ) { + int k = ngb_list[i]; + _memm[i] = mass [k]; } + + double mytiming = CPU_TIME; + + double vforce[3] = {0}; + const double register x = pos_x[target]; + const double register y = pos_y[target]; + const double register z = pos_z[target]; + const double register m = mass[target]; + + // int _Nngb_ = Nngb & (-(int)BUNCH); + + IVDEP + LOOP_VECTORIZE + LOOP_UNROLL_N(4) + for ( int i = 0; i < Nngb; i++ ) + { + double register dx = x - _memx[i]; + double register dy = y - _memy[i]; + double register dz = z - _memz[i]; + double register mm = m * _memm[i]; + + // distance + double dist3 = (dx*dx + dy*dy) + dz*dz; + dist3 = 1.0 / (dist3*sqrt(dist3)); + + vforce[0] += mm * dx * dist3; + vforce[1] += mm * dy * dist3; + vforce[2] += mm * dz * dist3; + } + + // accumulate on force + // + for ( int k = 0; k < 3; k++ ) + force[k] += vforce[k]; + + + *timing = CPU_TIME - mytiming; + return; +} + + + +// ··································································· +// +/* + * + */ + +void process_with_arrays_vectors( const double * restrict pos_x, + const double * restrict pos_y, + const double * restrict pos_z, + const double * restrict mass, + void * mems[], + const int target, + const int * restrict ngb_list, + const int Nngb, + double force[3], + double * timing) +{ + + ASSUME_ALIGNED(pos_x, VALIGN); + ASSUME_ALIGNED(pos_y, VALIGN); + ASSUME_ALIGNED(pos_z, VALIGN); + ASSUME_ALIGNED( mass, VALIGN ); + ASSUME_ALIGNED( ngb_list, VALIGN ); + + double *_memx = ASSUME_ALIGNED((double*)mems[0], VALIGN); + 
double *_memy = ASSUME_ALIGNED((double*)mems[1], VALIGN); + double *_memz = ASSUME_ALIGNED((double*)mems[2], VALIGN); + double *_memm = ASSUME_ALIGNED((double*)mems[3], VALIGN); + + IVDEP + LOOP_VECTORIZE + LOOP_UNROLL_N(DVSIZE) + for ( int i = 0; i < Nngb; i++ ) { + int k = ngb_list[i]; + _memx[i] = pos_x[k]; + _memy[i] = pos_y[k]; + _memz[i] = pos_z[k]; + _memm[i] = mass [k]; } + + dvector_t *_vmemx = ASSUME_ALIGNED((dvector_t*)mems[0], VALIGN); + dvector_t *_vmemy = ASSUME_ALIGNED((dvector_t*)mems[1], VALIGN); + dvector_t *_vmemz = ASSUME_ALIGNED((dvector_t*)mems[2], VALIGN); + dvector_t *_vmemm = ASSUME_ALIGNED((dvector_t*)mems[3], VALIGN); + + #if defined(__clang__) || defined(__INTEL_LLVM_COMPILER) + dvector_t register vtargetx ATTRIBUTE_ALIGNED(VALIGN) = (dvector_t)(pos_x[target]); + dvector_t register vtargety ATTRIBUTE_ALIGNED(VALIGN) = (dvector_t)(pos_y[target]); + dvector_t register vtargetz ATTRIBUTE_ALIGNED(VALIGN) = (dvector_t)(pos_z[target]); + dvector_t register vtargetm ATTRIBUTE_ALIGNED(VALIGN) = (dvector_t)(mass[target]); + dvector_t one ATTRIBUTE_ALIGNED(VALIGN) = (dvector_t)(1.0); + #else + #define X(v) (pos_x[target]) + dvector_t vtargetx ATTRIBUTE_ALIGNED(VALIGN) = INIT_VECTOR(DVSIZE); + #undef X + #define X(v) (pos_y[target]) + dvector_t vtargety ATTRIBUTE_ALIGNED(VALIGN) = INIT_VECTOR(DVSIZE); + #undef X + #define X(v) (pos_z[target]) + dvector_t vtargetz ATTRIBUTE_ALIGNED(VALIGN) = INIT_VECTOR(DVSIZE); + #undef X + #define X(v) (mass[target]) + dvector_t vtargetm ATTRIBUTE_ALIGNED(VALIGN) = INIT_VECTOR(DVSIZE); + #undef X + #define X(v) 1.0 + dvector_t one ATTRIBUTE_ALIGNED(VALIGN) = INIT_VECTOR(DVSIZE); + #undef X + #define X(v) 0.0 + dvector_t vforcex ATTRIBUTE_ALIGNED(VALIGN) = INIT_VECTOR(DVSIZE); + dvector_t vforcey ATTRIBUTE_ALIGNED(VALIGN) = INIT_VECTOR(DVSIZE); + dvector_t vforcez ATTRIBUTE_ALIGNED(VALIGN) = INIT_VECTOR(DVSIZE); + #undef X + #endif + + + int _Nngb_ = Nngb & 0xFFFFFFFC; + _Nngb_ /= 4; + + double mytiming = CPU_TIME; + + IVDEP + LOOP_VECTORIZE + for ( int i = 0; i < _Nngb_; i++ ) + { + + dvector_t distx = (vtargetx - _vmemx[i]); + dvector_t disty = (vtargety - _vmemy[i]); + dvector_t distz = (vtargetz - _vmemz[i]); + + dvector_t dist2x = distx*distx; + dvector_t dist2y = disty*disty; + dvector_t dist2z = distz*distz; + + dist2x = (dist2x + dist2y) + dist2z; + dvector_t inv_dist3 = one / (dist2x * _mm256_sqrt_pd(dist2x)); + + vforcex += distx * (vtargetm * _vmemm[i]) * inv_dist3; + vforcey += disty * (vtargetm * _vmemm[i]) * inv_dist3; + vforcez += distz * (vtargetm * _vmemm[i]) * inv_dist3; + } + + // process the reminder of the iteration space + // + double local_force[3] = {0}; + for ( int i = Nngb&0xFFFFFFFC; i < Nngb; i++ ) + { + double dist3 = ( + (pos_x[target] - _memx[i])* (pos_x[target] - _memx[i]) + + (pos_y[target] - _memy[i])* (pos_y[target] - _memy[i]) + + (pos_z[target] - _memz[i])* (pos_z[target] - _memz[i]) ); + dist3 = 1.0 / (dist3* sqrt(dist3)); + + local_force[0] += mass[target]*_memm[i] * (pos_x[target] - _memx[i]) * dist3; + local_force[1] += mass[target]*_memm[i] * (pos_y[target] - _memy[i]) * dist3; + local_force[2] += mass[target]*_memm[i] * (pos_z[target] - _memz[i]) * dist3; + } + + for ( int i = 0; i < 3; i++ ) + force[i] += local_force[i]; + + // reduce to a unique double the vector force + // calculated before + // + dvector_u _vforce_; + _vforce_.V = vforcex; + for ( int i = 0; i < 4; i++ ) + force[0] += _vforce_.v[i]; + + _vforce_.V = vforcey; + for ( int i = 0; i < 4; i++ ) + force[1] += _vforce_.v[i]; + + 
_vforce_.V = vforcez; + for ( int i = 0; i < 4; i++ ) + force[2] += _vforce_.v[i]; + + *timing = CPU_TIME - mytiming; + + return; +} + + +int main( int argc, char **argv ) +{ + + int case_to_run = (argc > 1 ? atoi(*(argv+1)) : 0 ); + unsigned int N = (argc > 2 ? atoi(*(argv+2)) : 1000000 ); + long int seed = (argc > 3 ? atoi(*(argv+3)) : -1 ); // set the seed for repeatible runs + int Nrepetitions= (argc > 4 ? atoi(*(argv+4)) : NREPETITIONS ); + int dry_run = (argc > 5 ? atoi(*(argv+5)) : 0 ); // 1 to estimate floats for initialization + int from_file = ( case_to_run > 0 ); + + case_to_run = (case_to_run < 0 ? -case_to_run : case_to_run); // make it positive + + if ( case_to_run > 5 ) { + printf("unknown case %d\n", case_to_run ); + return 0; } + + + P_t * restrict P; + v4df * restrict vpos; + double * restrict pos_x; + double * restrict pos_y; + double * restrict pos_z; + double * restrict mass ; + double * restrict force; + int * restrict ngb_list; + + FILE * outfile = NULL; + FILE * infile = NULL; + + + if ( case_to_run == 0 ) + { + outfile = fopen( "particle.data", "w" ); + fwrite( &N, sizeof(int), 1, outfile ); + fwrite( &Nrepetitions, sizeof(int), 1, outfile ); + } + else + { + if ( from_file ) { + size_t ret; + int Nrep; + infile = fopen( "particle.data", "r" ); + ret = fread( &N, sizeof(int), 1, infile ); + ret = fread( &Nrep, sizeof(int), 1, infile ); + if ( Nrep < Nrepetitions ) { + printf("Nrepetitions is %d, larger than that " + "used to generate the particle file, which was %d.\n" + "This may lead to inf or NaN, better stop.\n", + Nrepetitions, Nrep); + exit(3); } + } + + force = (double *)aligned_alloc(VALIGN, N*3*sizeof(double)); + } + + + printf(" > Running case %d -> \"%s\" with %u points and %d repetitions\n" + " > Vectors are %lu bytes long, %d double long\n", + case_to_run, implementation_labels[case_to_run], N, Nrepetitions, + VSIZE, DVSIZE); + if ( dry_run ) + printf(" >>> DRY RUN :: use to estimate ops outside the actual calculations\n" ); + + + if ( case_to_run > 0 ) + { + switch( case_to_run ) + { + case 1: { + // allocate array of big structures + // + P = (P_t *)aligned_alloc(VALIGN, N * sizeof(P_t)); } break; + + case 2: + case 3: { + // allocate array of small structures (vectors) + // + vpos = (v4df *)aligned_alloc(VALIGN, N * sizeof(v4df)); } break; + + case 4: + case 5: { + // allocate arrays + // + pos_x = (double *)aligned_alloc(VALIGN, N*sizeof(double)); + pos_y = (double *)aligned_alloc(VALIGN, N*sizeof(double)); + pos_z = (double *)aligned_alloc(VALIGN, N*sizeof(double)); + mass = (double *)aligned_alloc(VALIGN, N*sizeof(double)); + }break; + } + + memset( (void*)force, 0, N*3*sizeof(double)); + } + + // initialize particle data + // + { + if ( ! from_file ) { + if ( seed == -1 ) + srand48(time(NULL)); + else + srand48( seed ); } + + for ( int i = 0; i < N; i++ ) + { + double data[4] __attribute__((aligned(VALIGN))); + if ( ! 
from_file ) + for ( int j = 0; j < 4; j++ ) + data[j] = drand48(); + else + { + size_t ret = fread ( data, 4, sizeof(double), infile ); + } + + if ( case_to_run >= 0 ) + switch( case_to_run ) + { + case 0:{ + size_t ret = fwrite( data, sizeof(double), 4, outfile ); } break; + + case 1: { + P[i].pos[0] = data[0]; + P[i].pos[1] = data[1]; + P[i].pos[2] = data[2]; + P[i].mass = data[3]; } break; + + case 2: + case 3: { + v4df_u * _vpos_ = (v4df_u*)vpos; + for ( int j = 0; j < 4; j++ ) + _vpos_[i].p[j] = data[j]; } break; + + case 4: + case 5:{ + pos_x[i] = data[0]; + pos_y[i] = data[1]; + pos_z[i] = data[2]; + mass [i] = data[3]; } break; + } + } + } + + if ( dry_run ) { + printf("dry run: going to clean up\n"); + goto clean; } + + /* ------------------------------------------------------ + * + * simulate interactions + * + * ------------------------------------------------------ */ + + // determine how many particles will be active, from + // a minimum of 1/100-ed to a maximum of 1/100+1/10 + // + + int Nactive; + int AvgNneighbours; + + if ( case_to_run >= 0 && !from_file ) + { + Nactive = N/100 + (int)lrand48() % (N/10); // 11% of particle will be processed at max, 1% as minimum + if ( case_to_run == 0 ) { + size_t ret = fwrite( &Nactive, sizeof(int), 1, outfile );} + } + else { + size_t ret = fread( &Nactive, sizeof(int), 1, infile ); } + + AvgNneighbours = ( NNEIGHBOURS > N*0.1 ? (int)(N*0.1) : NNEIGHBOURS ); + ngb_list = (int*)calloc( AvgNneighbours * 1.2, sizeof(int) ); + + PAPI_INIT; + + printf ( " * %d particles will be active\n", Nactive ); + + double timing = 0; + double intiming = 0; + + void *mem = NULL; + void *mems[4] = {NULL}; + switch ( case_to_run ) + { + case 1: mem = aligned_alloc(VALIGN, AvgNneighbours*1.2 * sizeof(P_t)); break; + case 2: + case 3: mem = aligned_alloc(VALIGN, AvgNneighbours*1.2 * sizeof(v4df)); break; + case 4: + case 5: { + mems[0] = aligned_alloc(VALIGN, AvgNneighbours*1.2 * sizeof(double)); + mems[1] = aligned_alloc(VALIGN, AvgNneighbours*1.2 * sizeof(double)); + mems[2] = aligned_alloc(VALIGN, AvgNneighbours*1.2 * sizeof(double)); + mems[3] = aligned_alloc(VALIGN, AvgNneighbours*1.2 * sizeof(double)); } break; + } + + // loop over active particles + // + for ( int j = 0; j < Nactive; j++ ) + { + // determine which one will be active + // of course it is not important which one will be, + // nor is important that we do not pick the same + // particle more than once in the loop + // (shouldn't happen, though, because the + // random repetition lenght is much larger) + // + int target, Nneighbours; + + if ( !from_file ) + { + target = (int)lrand48() % (N-Nrepetitions); // the index of the to-be-processed particle + + // determine how many will be the neighbours + // ( same comment than before apply) + Nneighbours = AvgNneighbours + (int)mrand48()%(AvgNneighbours/10) ; // with how many particles it will interact + // Nneighbours +- 10% + + + // decide which will be the neighbours + int upperlimit = target+Nrepetitions; + for ( int i = 0; i < Nneighbours; i++ ) + { + int draw = (int)lrand48() % N; + if ( (draw >= target) && (draw <= upperlimit ) ) + draw = upperlimit+1; + ngb_list[i] = draw; + } + + if ( case_to_run == 0 ) + { + /* + printf("iter %d target %d has %u neighbours\n", j, + target, (unsigned)Nneighbours ); + */ + size_t ret; + ret = fwrite( &target, sizeof(int), 1, outfile ); + ret = fwrite( &Nneighbours, sizeof(int), 1, outfile ); + ret = fwrite( &ngb_list[0], sizeof(int), Nneighbours, outfile ); + } + + } + else + { + size_t ret; + ret = 
fread( &target, sizeof(int), 1, infile ); + ret = fread( &Nneighbours, sizeof(int), 1, infile ); + ret = fread( ngb_list, sizeof(int), Nneighbours, infile ); + } + + double myforce[3] = {0}; + double now; + + // now, call the force evaluation with the different implementations + // + + if ( case_to_run > 0 ) { + + double this_timing = 1e10; + double this_inner_timing = 1e10; + for ( int shot = 0; shot <= Nrepetitions; shot++ ) { + + // shot = = iteration is used as a warm-up for the neighbours walk + int mytarget = (target+shot)%N; + + if ( shot >= 1 ) + now = CPU_TIME; + if ( shot == 1 ) + PAPI_START_CNTR; + + double inner_timing; + switch ( case_to_run ) + { + // ------------------------------------------------ + // + case 1: { + process_with_structure( P, mem, mytarget, ngb_list, Nneighbours, myforce, &inner_timing ); + } break; + + // ------------------------------------------------ + // + case 2: { + process_with_vectors( vpos, mem, mytarget, ngb_list, Nneighbours, myforce, &inner_timing ); + } break; + + // ------------------------------------------------ + // + + case 3: { + process_with_vectors_intrinsics( (double*)vpos, mem, mytarget, ngb_list, Nneighbours, myforce, &inner_timing ); + } break; + + // ------------------------------------------------ + // + + case 4: { + process_with_arrays( pos_x, pos_y, pos_z, mass, mems, + mytarget, ngb_list, Nneighbours, myforce, &inner_timing ); + } break; + + // ------------------------------------------------ + // + case 5: { + process_with_arrays_vectors( pos_x, pos_y, pos_z, mass, mems, + mytarget, ngb_list, Nneighbours, myforce, &inner_timing ); + } break; + } + + if (shot >= 1 ) { + double chrono = CPU_TIME - now; + if ( chrono < this_timing ) + this_timing = chrono; + if ( inner_timing < this_inner_timing ) + this_inner_timing = inner_timing; + } + + if ( shot == 1 ) + PAPI_STOP_CNTR; + + } + + //printf("%d t%d %g %g %g\n", j, target, myforce[0], myforce[1], myforce[2]); + + timing += this_timing; + intiming += this_inner_timing; + } + + if ( case_to_run > 0 ) { + #if !defined(NDEBUG) + #warning "you're compiling with assert" + #endif + int offset = target*3; + for ( int i = 0; i < 3; i++ ) { + assert( myforce[i] != 0 ); + assert( !isnan(myforce[i]) ); + assert( !isinf(myforce[i]) ); + force[offset+i] = myforce[i] / (Nrepetitions+1); + } + } + } + + if ( infile != NULL ) + fclose(infile); + if ( outfile != NULL ) + fclose(outfile); + + if ( case_to_run > 0 ) + { + + printf(" + timing and inner_timing for case %d \"%s\" : %g %g\n", + case_to_run, implementation_labels[case_to_run], + timing, intiming ); + + // just to trick the compiler, so that it does not optimize out + // pointless calculations + // + + char filename[100]; + sprintf( filename, "force2b.%d.out", case_to_run ); + FILE *output = fopen( filename, "w" ); + if( output != NULL ) { + fwrite( &N, sizeof(int), 1, output); + fwrite( force, sizeof(double), N*3, output ); + fclose(output); } + else + printf(">>> wow, I was unable to create stupid file\n"); + + free ( force ); + if ( !dry_run) + free ( ngb_list ); + + } + + clean: + if ( mem != NULL ) + free ( mem ); + if ( mems[0] != NULL ) + for ( int i = 0; i < 4; i++ ) + free ( mems[i] ); + switch ( case_to_run ) + { + case 1: { + free ( P ); } break; + case 2: + case 3: { + free ( vpos ); } break; + case 4: + case 5: { + free ( pos_x ), free ( pos_y ), free ( pos_z ), free ( mass ); } break; + } + + #if defined(USE_PAPI) + if ( case_to_run > 0 ) + for ( int i = 0; i < PAPI_EVENTS_NUM; i++ ) + printf("PAPI event %d: %llu\n", + 
i, (unsigned long long)papi_values[i]); + #endif + + + + return 0; +}
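
Note appended after the patch: the step-by-step lane comments in process_with_vectors_intrinsics() (mask, hadd, permute4x64, hadd) are easiest to verify in isolation. The following self-contained sketch is illustrative only and is not part of vect.2b.c; it reproduces just that horizontal reduction on made-up lane values, assumes AVX2 (for _mm256_permute4x64_pd), and builds with e.g. `gcc -O2 -mavx2`.

/*
 * Standalone sketch (illustrative, not part of the patch) of the
 * horizontal reduction used in process_with_vectors_intrinsics():
 * given v = [dx, dy, dz, dm] it leaves dx*dx + dy*dy + dz*dz in
 * every lane.
 */
#include <immintrin.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    /* _mm256_set_pd takes the most-significant lane first, so this is
       [dx, dy, dz, dm] = [1, 2, 5, 7] in memory (least-significant-first) order */
    __m256d v = _mm256_set_pd(7.0, 5.0, 2.0, 1.0);

    /* all-ones bit pattern keeps a lane, zero drops it (same mask idea as above,
       but using memcpy instead of the pointer-cast pun) */
    long long bb = -1;
    double keep;
    memcpy(&keep, &bb, sizeof keep);
    __m256d mask_pos = _mm256_set_pd(0.0, keep, keep, keep);

    __m256d v2 = _mm256_mul_pd(v, v);            /* [dx2, dy2, dz2, dm2]         */
    v2 = _mm256_and_pd(v2, mask_pos);            /* [dx2, dy2, dz2, 0  ]         */
    v2 = _mm256_hadd_pd(v2, v2);                 /* [dx2+dy2, dx2+dy2, dz2, dz2] */
    v2 = _mm256_permute4x64_pd(v2, 0b00100111);  /* interleave the partial sums  */
    v2 = _mm256_hadd_pd(v2, v2);                 /* dx2+dy2+dz2 in every lane    */

    printf("%g (expect %g)\n", ((double*)&v2)[0], 1.0*1.0 + 2.0*2.0 + 5.0*5.0);
    return 0;
}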
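
A second note, also outside the patch: every new routine in vect.2b.c shares the preparation step described in the comments of process_with_structure(), i.e. neighbour data reached through ngb_list is first copied into a contiguous, aligned scratch buffer so the force loop runs over unit-stride memory. The sketch below shows only that gather step; the names gather_doubles and scratch are made up for the example and do not appear in the patch.

/*
 * Minimal sketch (illustrative only) of the gather-then-compute pattern:
 * indirect accesses through the neighbour list are resolved once into a
 * contiguous, aligned scratch buffer, and the hot loop then consumes
 * unit-stride data that the compiler can vectorize.
 */
#include <stdlib.h>
#include <stdio.h>

static void gather_doubles(const double *restrict src,
                           const int    *restrict idx,
                           int n, double *restrict dst)
{
    for (int i = 0; i < n; i++)   /* scattered reads ...   */
        dst[i] = src[idx[i]];     /* ... contiguous writes */
}

int main(void)
{
    enum { N = 8, NNGB = 4 };
    double pos_x[N]  = {0, 10, 20, 30, 40, 50, 60, 70};
    int    ngb[NNGB] = {6, 1, 4, 2};

    /* 32-byte alignment mirrors the VALIGN-ed scratch buffers in the patch */
    double *scratch = aligned_alloc(32, NNGB * sizeof *scratch);
    gather_doubles(pos_x, ngb, NNGB, scratch);

    for (int i = 0; i < NNGB; i++)   /* unit-stride consumption */
        printf("%g ", scratch[i]);   /* prints: 60 10 40 20     */
    printf("\n");

    free(scratch);
    return 0;
}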