From bc875847b3559ace9a9d93b5f394f7c13c2bb24f Mon Sep 17 00:00:00 2001
From: Luca Tornatore <luca.tornatore@inaf.it>
Date: Fri, 29 Nov 2024 09:49:35 +0100
Subject: [PATCH] added 1b - the force in version b is a vector and not a
 scalar; also, I dropped the strategy of treating bunches of particles in the
 loops

---
 src/vect.1b.c | 1177 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 1177 insertions(+)
 create mode 100644 src/vect.1b.c

diff --git a/src/vect.1b.c b/src/vect.1b.c
new file mode 100644
index 0000000..167e21f
--- /dev/null
+++ b/src/vect.1b.c
@@ -0,0 +1,1177 @@
+
+/* ────────────────────────────────────────────────────────────────────────── *
+ │                                                                            │
+ │ This file is part of the nbody-vectorization-toy-lab in the fram of the    │
+ │ SPACE Center of Excellence                                                 │
+ │ © 2024                                                                     │
+ │ The work has been started by Luca Tornatore @ INAF (Italian National       │
+ │ Institute for Astrophysics) and continued in collaboration with            │
+ │ Vassilis Flouris @ FORTH (Foundation for Research and Technology - Hellas  │
+ │                                                                            │
+ │ Consult the accopanying file changes.log for details                       │
+ │                                                                            │
+ │ contact:                                                                   │
+ │  luca.tornatore@inaf.it                                                    │
+ │  vflouris@ics.forth.gr                                                     │
+ │                                                                            │
+ │                                                                            │
+ │ ×××××××××××××××××××××××××××××××××××××××××××××××××××××××××××××××××××××××××  │
+ │                                                                            │
+ │     This is open source software; you can redistribute it and/or modify    │
+ │     it under the terms of the "3 clauses BSD license"                      │
+ │       https://opensource.org/license/bsd-3-clause                          │
+ │                                                                            │
+ │     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS    │
+ │     “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT      │
+ │     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS      │
+ │     FOR A PARTICULAR PURPOSE ARE DISCLAIMED.                               │
+ │                                                                            │
+ * ────────────────────────────────────────────────────────────────────────── */
+
+
+
+
+#if defined(__STDC__)
+#  if (__STDC_VERSION__ >= 199901L)
+#     define _XOPEN_SOURCE 700
+#  endif
+#endif
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+#include <math.h>
+#include <assert.h>
+
+#include <vector_pragmas.h>
+#include <vector_types.h>
+
+#include "mypapi.h"
+//#include <immintrin.h>
+
+#define CPU_TIME ({struct timespec ts; clock_gettime( CLOCK_PROCESS_CPUTIME_ID, &ts ), (double)ts.tv_sec + \
+					 (double)ts.tv_nsec * 1e-9;})
+
+// with how many neighbours each particle interacts, in avg
+//
+#define NNEIGHBOURS 200     
+
+#define NREPETITIONS 5
+
+#define DATASIZE 4              // 3 positions + the mass
+
+// how many different implementations
+//
+#define NSHOTS 5
+const char *implementation_labels[NSHOTS+1] = {"generate particles",
+  "Using AoS",
+  "using AoV (array of vectors)",
+  "using AoV with intrinsics",
+  "using SoA",
+  "using SoA with intrinsics" };
+
+// define a vector type
+//
+
+#define DV4SIZE DATASIZE
+typedef double v4df __attribute__ ((vector_size (DV4SIZE*sizeof(double))));
+typedef union
+{
+  v4df   P;
+  double p[DV4SIZE];
+} v4df_u;
+
+
+// define a structure
+typedef struct {
+  double pos[3];
+  double mass;
+ #if defined(FILL)
+  double array[FILL];
+ #endif
+} P_t;
+
+
+
+
+// ···································································
+//
+/*
+ * Processing the particles as structures
+ * 
+ */
+
+void process_with_structure( const P_t   * restrict P,
+			     const int              target,
+			     const int   * restrict ngb_list,
+			     const int              Nngb,
+			           double           force[3] )
+  
+{
+
+  double local_force[3] = {0}; // this will accumulate the total force on target from all the neigbhours
+
+  P_t register Ptarget = P[target];
+  double data[4], predata[4];
+
+  data[0] = P[ngb_list[0]].pos[0];
+  data[1] = P[ngb_list[0]].pos[1];
+  data[2] = P[ngb_list[0]].pos[2];
+  data[3] = P[ngb_list[0]].mass;
+
+  IVDEP
+  LOOP_VECTORIZE
+  for ( int i = 0; i < Nngb; i++ )
+    {
+      if ( i < Nngb-1)
+	{
+	  // try to hide the memory latency
+	  // by pre-loding the next one
+	  //
+	  int k = ngb_list[i+1];
+	  
+	  predata[0] = P[k].pos[0];
+	  predata[1] = P[k].pos[1];
+	  predata[2] = P[k].pos[2];
+	  predata[3] = P[k].mass;
+	}
+      
+      
+      double m1m2 = Ptarget.mass * data[3];
+      // 1D distances
+      //
+      double dx = Ptarget.pos[0] - data[0];
+      double dy = Ptarget.pos[1] - data[1];
+      double dz = Ptarget.pos[2] - data[2];
+
+      // square the distance
+      //
+      double dist3 = (dx*dx + dy*dy) + dz*dz;
+      dist3 = 1.0/(dist3*sqrt(dist3));
+      
+      // accumulate the force
+      //
+      
+      local_force[0] += m1m2 * dx * dist3;
+      local_force[1] += m1m2 * dy * dist3;
+      local_force[2] += m1m2 * dz * dist3;
+
+      /*
+      printf("T%d n %d->%d %g %g %g %g %g %g\n",
+	     target, i, ngb_list[i], dx*dx, dy*dy, dz*dz,
+	     m1m2 * dx * dist3, m1m2 * dy * dist3, m1m2 * dz * dist3 );
+      */
+      
+      data[0] = predata[0];
+      data[1] = predata[1];
+      data[2] = predata[2];
+      data[3] = predata[3];
+    }
+
+  for ( int i = 0; i < 3; i++)
+    force[i] += local_force[i];
+  
+  return;
+}
+
+
+// ···································································
+//
+/*
+ * Here we trat the particles explicitly as vectors
+ * The array if structures P is accessed as v4df vectors
+ *
+ */
+
+
+
+void process_with_vectors( const v4df   * restrict V,
+			   const int               target,
+			   const int    * restrict ngb_list,
+			   const int               Nngb,
+			         double            force[3] )
+
+{
+  const v4df   Vtarget   = V[target];    // save the target vector in a local vector (much probably a register)
+        v4df   Vbuffer;               // The array of particles that are processed in the inner loop
+        v4df   vforce    = {0};          // accumulate the forces in separate lanes
+        v4df   mask_pos  = {1,1,1,0};    // used to mask out the mass
+        v4df   mask_mass = {0,0,0,1};    // used to mask out the position
+
+	
+  // pre-load the first BUNCH of particles
+  //
+  Vbuffer = V[ngb_list[0]];
+
+  // outern loop
+  // loop over bunches 
+  for ( int i = 0; i < Nngb; i++ )
+    {
+      
+      // hide the memory latency by pre-loding
+      // the next BUNCH 
+      //
+      v4df preVbuffer;
+      if ( i < Nngb-1 )
+	preVbuffer = V[ngb_list[i+1]];
+
+
+      // the distance^2f the i-th neighbour
+      //
+      v4df   register delta = (Vtarget - Vbuffer) * mask_pos;	  
+      v4df_u register delta2;
+      delta2.P = delta * delta;
+      double scalar_dist = delta2.p[0]+delta2.p[1]+delta2.p[2]+delta2.p[3];
+      scalar_dist = pow(scalar_dist, 1.5);  // r^3
+      
+      // the mass product with the i-th neighbour
+      //
+      v4df register prod  = (Vtarget * Vbuffer) * mask_mass;
+      
+      // get the mass product as a single double
+      //
+      double mm = ((v4df_u)prod).p[3] / scalar_dist;
+      
+      // get the force from the i-th neighbour
+      //
+      vforce += mm * delta;
+
+      /*
+      v4df_u _force_;
+      _force_.P = mm*delta;
+      printf("T%d n %d->%d %g %g %g %g %g %g\n",
+	     target, i, ngb_list[i],
+	     delta2.p[0], delta2.p[1], delta2.p[2],
+	     _force_.p[0], _force_.p[1], _force_.p[2] );
+      */
+      Vbuffer = preVbuffer;
+    }
+
+  // accumulate the partial results
+  //
+  for ( int i = 0; i < 3; i++ )
+    force[i] += ((v4df_u)vforce).p[i];
+
+  return;
+}
+
+
+
+// ···································································
+//
+/*
+ *
+ */
+
+void process_with_vectors_intrinsics( const double * restrict V,
+				      const int target,
+				      const int *ngb_list,
+				      const int Nngb,
+				            double force[3] )
+  
+{    
+  long long bb       = -1;     // used for masking entries of a vector
+  __m256d mask_pos   = _mm256_set_pd(0, *(double*)&bb, *(double*)&bb, *(double*)&bb);  // used to mask out the mass
+  __m256d mask_mass  = _mm256_set_pd(*(double*)&bb, 0, 0, 0);                          // used to mask out the position
+  
+  const __m256d register vtarget = _mm256_load_pd(&V[target*4]);                       // load the target data
+    
+  __m256d Vbuffer;
+  __m256d Vforce = {0};
+
+  __m256d *VV = __builtin_assume_aligned ((__m256*)V, VALIGN);
+  
+  Vbuffer = _mm256_load_pd((double*)&VV[ngb_list[0]]);
+  
+  for ( int i = 0; i < Nngb; i++ )
+    {      
+      // hide the memory latency by pre-loding
+      // the next BUNCH 
+      //
+      __m256d preVbuffer;
+      if ( i < Nngb-1 )
+	preVbuffer = _mm256_load_pd((double*)&VV[ngb_list[i+1]]);
+
+
+      __m256d register vdist;
+      __m256d register vdist3;
+      // note: the 64bits fields in the following
+      //       comments are reported so that at
+      //       th beginning there are the first
+      //       bits in the vector field, i.e.
+      //       those at the least significant bits
+      //       >> however, when you consider the
+      //       _mm256_set_pd instruction, the arguments
+      //       are in the opposite direction: the first
+      //       one sets the most significant bits and
+      //       so on
+      //
+      //       least --->  most
+      
+      // get [ dx, dy, ddz dm ]
+      vdist  = _mm256_sub_pd         ( vtarget, Vbuffer );
+      //
+      // get [ dx^2, dy^2, dz^2, dm^2 ]
+      vdist3 = _mm256_mul_pd         ( vdist, vdist      );
+      
+      //
+      // get [ dx^2, dy^2, dz^2, 0 ]
+      vdist3 = _mm256_and_pd         ( vdist3, mask_pos   );      
+      //
+      // get [ dx^2+dy^2, dx^2+dy^2, dz^2, dz^2]
+      vdist3 = _mm256_hadd_pd        ( vdist3, vdist3      );
+      //
+      // get [ dx^2+dy^2, dz^2, dx^2+dy^2, dz^2 ]
+      vdist3 = _mm256_permute4x64_pd ( vdist3, 0b00100111 );
+      //
+      // get [ dx^2+dy^2+dz^2, .., .., .. ]
+      vdist3 = _mm256_hadd_pd        ( vdist3, vdist3      );
+      vdist3 = _mm256_mul_pd         ( vdist3, _mm256_sqrt_pd(vdist3) );
+      
+      __m256d mprod;
+      double  m1m2;
+      //
+      // get [ x0*x1, y0*y1, z0*z1, m0*m1 ]
+      mprod = _mm256_mul_pd( vtarget, Vbuffer );
+      m1m2  = (((double*)&mprod)[3]);
+      mprod = _mm256_set_pd( m1m2, m1m2, m1m2, m1m2 );
+      
+      //
+      // get [ m0*m1 / (dx^2+dy^2+dz^2)^3/2, .., .., .. ]
+      mprod = _mm256_div_pd( mprod, vdist3 );
+      
+      mprod = _mm256_mul_pd( mprod, vdist );
+      
+      // get the uppermost element from the vector
+      // --> here we should use the reshuffle+extraction instruction
+      Vforce += mprod;
+      
+      Vbuffer = preVbuffer;
+      
+    }
+
+  for ( int k = 0; k < 3; k++ )
+    force[k] += ((double*)&Vforce)[k];
+
+  return;
+}
+
+
+
+// ···································································
+//
+/*
+ *
+ */
+
+void process_with_arrays( const double * restrict pos_x,
+			  const double * restrict pos_y,
+			  const double * restrict pos_z,
+			  const double * restrict mass,
+			  const int target, const int *ngb_list, const int Nngb,
+			  double force[3] )
+  
+{
+  const double x = pos_x[target];
+  const double y = pos_y[target];
+  const double z = pos_z[target];
+  const double m = mass [target];
+
+  double vforce[3]  = { 0 };
+  double vx = pos_x[ngb_list[0]];
+  double vy = pos_y[ngb_list[0]];
+  double vz = pos_z[ngb_list[0]];
+  double vm = mass[ngb_list[0]];
+
+  
+  IVDEP
+  LOOP_VECTORIZE
+  LOOP_UNROLL_N(DVSIZE)
+  for ( int i = 0; i < Nngb; i++ )
+    {
+
+      double nextx, nexty, nextz, nextm;
+      if ( i < Nngb-1 )
+	{
+	  nextx = pos_x[ngb_list[i+1]];
+	  nexty = pos_y[ngb_list[i+1]];
+	  nextz = pos_z[ngb_list[i+1]];
+	  nextm =  mass[ngb_list[i+1]];
+	}
+      
+      /*
+      vx = pos_x[ngb_list[i]];
+      vy = pos_y[ngb_list[i]];
+      vz = pos_z[ngb_list[i]];
+      vm =  mass[ngb_list[i]];
+      */
+      // get distances along x, y and z for the i-th neighbour
+      //
+      double dx = x - vx;
+      double dy = y - vy;
+      double dz = z - vz;
+      double mm = m * vm;
+
+      // calculate the distance^2 for BUNCH neighbours
+      //
+      double dist3 = dx*dx+dy*dy+dz*dz;
+      dist3 = dist3*sqrt(dist3);
+
+	  
+      // get the force from BUNCH neighbrous
+      //
+      vforce[0] += mm * dx / dist3;
+      vforce[1] += mm * dy / dist3;
+      vforce[2] += mm * dz / dist3;
+
+      vx = nextx;
+      vy = nexty;
+      vz = nextz;
+      vm = nextm;
+    }
+
+  // accumulate on force
+  //
+  for ( int k = 0; k < 3; k++ )
+    force[k] += vforce[k];
+  
+
+  return;
+}
+
+/*
+void process_with_arrays( const double * restrict pos_x,
+			  const double * restrict pos_y,
+			  const double * restrict pos_z,
+			  const double * restrict mass,
+			  const int target, const int *ngb_list, const int Nngb,
+			  double force[3] )
+  
+{
+ #define BUNCH 1 //DVSIZE
+  
+  const double x[BUNCH] = { [0 ... BUNCH-1] = pos_x[target]};
+  const double y[BUNCH] = { [0 ... BUNCH-1] = pos_y[target]};
+  const double z[BUNCH] = { [0 ... BUNCH-1] = pos_z[target]};
+  const double m[BUNCH] = { [0 ... BUNCH-1] = mass [target]};
+
+  double vforce[BUNCH][3]  = { [0 ... BUNCH-1] = 0 };
+  double vx[BUNCH], vy[BUNCH], vz[BUNCH], vm[BUNCH];  
+
+  int _Nngb_;
+  {
+    int log2_BUNCH = 0;
+    while ( (BUNCH >> ++log2_BUNCH) > 0 );
+    _Nngb_ = Nngb & ((unsigned)-1 << (log2_BUNCH-1));
+  }
+  int _Nngb_last_ = _Nngb_ - BUNCH;
+
+  
+  LOOP_UNROLL_N(BUNCH)
+  for ( int j = 0; j < BUNCH; j++ )
+    vx[j] = pos_x[ngb_list[j]];
+  LOOP_UNROLL_N(BUNCH)
+  for ( int j = 0; j < BUNCH; j++ )
+    vy[j] = pos_y[ngb_list[j]];
+  LOOP_UNROLL_N(BUNCH)
+  for ( int j = 0; j < BUNCH; j++ )
+    vz[j] = pos_z[ngb_list[j]];
+  LOOP_UNROLL_N(BUNCH)
+  for ( int j = 0; j < BUNCH; j++ )
+    vm[j] = mass[ngb_list[j]];
+
+  
+  IVDEP
+  LOOP_VECTORIZE
+  LOOP_UNROLL_N(1)
+  for ( int i = 0; i < _Nngb_; i+=BUNCH )
+    {
+
+      double prevx[BUNCH], prevy[BUNCH], prevz[BUNCH], prevm[BUNCH];
+      if ( i < _Nngb_last_ )
+	{
+	  int next = i + BUNCH;
+	  for ( int j = 0; j < BUNCH; j++ )
+	    prevx[j] = pos_x[ngb_list[next+j]];
+	  for ( int j = 0; j < BUNCH; j++ )
+	    prevy[j] = pos_y[ngb_list[next+j]];
+	  for ( int j = 0; j < BUNCH; j++ )
+	    prevz[j] = pos_z[ngb_list[next+j]];
+	  for ( int j = 0; j < BUNCH; j++ )
+	    prevm[j] = mass[ngb_list[next+j]];
+	}
+
+      // get distances along x, y and z for 4 neighbours
+      //
+      double dx[BUNCH],
+	dy[BUNCH],
+	dz[BUNCH],
+	mm[BUNCH],
+	dist3[BUNCH];
+
+      LOOP_UNROLL_N(BUNCH)
+      for ( int j = 0; j < BUNCH; j++ )
+	dx[j] = x[j] - vx[j];
+      LOOP_UNROLL_N(BUNCH)
+      for ( int j = 0; j < BUNCH; j++ )
+	dy[j] = y[j] - vy[j];
+      LOOP_UNROLL_N(BUNCH)
+      for ( int j = 0; j < BUNCH; j++ )
+	dz[j] = z[j] - vz[j];
+      LOOP_UNROLL_N(BUNCH)
+      for ( int j = 0; j < BUNCH; j++ )
+	mm[j] = m[j] * vm[j];
+
+      // calculate the distance^2 for BUNCH neighbours
+      //
+      LOOP_UNROLL_N(BUNCH)
+	for ( int j = 0; j < BUNCH; j++ )
+	  {
+	    dist3[j] = dx[j]*dx[j]+dy[j]*dy[j]+dz[j]*dz[j];
+	    dist3[j] = dist3[j]*sqrt(dist3[j]);
+	  }
+	  
+      // get the force from BUNCH neighbrous
+      //
+      LOOP_UNROLL_N(BUNCH)
+	for ( int j = 0; j < BUNCH; j++ ) {
+	  vforce[j][0] += mm[j] * dx[j] / dist3[j];
+	  vforce[j][1] += mm[j] * dy[j] / dist3[j];
+	  vforce[j][2] += mm[j] * dz[j] / dist3[j]; }
+
+      for ( int j = 0; j < BUNCH; j++ )
+	vx[j] = prevx[j];
+      for ( int j = 0; j < BUNCH; j++ )
+	vy[j] = prevy[j];
+      for ( int j = 0; j < BUNCH; j++ )
+	vz[j] = prevz[j];
+      for ( int j = 0; j < BUNCH; j++ )
+	vm[j] = prevm[j];
+      
+    }
+
+  // accumulate on force
+  //
+  for ( int j = 0; j < BUNCH; j++ )
+    for ( int k = 0; k < 3; k++ )
+      force[k] += vforce[j][k];
+  
+  // process the reminder of the iteration space
+  //
+  for ( int i = _Nngb_; i < Nngb; i++ )
+    {
+      int k = ngb_list[i];
+      double dist3 = (x[0] - pos_x[k])*(x[0] - pos_x[k]) +
+	(y[0] - pos_y[k])*(y[0] - pos_y[k]) +
+	(z[0] - pos_z[k])*(z[0] - pos_z[k]);
+      dist3 = dist3 * sqrt(dist3);
+      
+      force[0] += mass[target]*mass[ngb_list[i]] * (x[0]-pos_x[k]) / dist3;
+      force[1] += mass[target]*mass[ngb_list[i]] * (y[0]-pos_y[k]) / dist3;
+      force[2] += mass[target]*mass[ngb_list[i]] * (z[0]-pos_z[k]) / dist3;
+    }
+
+  return;
+ #undef BUNCH
+}
+*/
+
+
+// ···································································
+//
+/*
+ *
+ */
+
+void process_with_arrays_intrinsics( const double * restrict pos_x,
+				     const double * restrict pos_y,
+				     const double * restrict pos_z,
+				     const double * restrict mass,
+				     const int target, const int *ngb_list, const int Nngb,
+				           double force[3] )
+{
+
+  pos_x = __builtin_assume_aligned( pos_x, VALIGN );
+  pos_y = __builtin_assume_aligned( pos_y, VALIGN );
+  pos_z = __builtin_assume_aligned( pos_z, VALIGN );
+  mass  = __builtin_assume_aligned(  mass, VALIGN );
+  
+  double myforce[3] = {0};           // this will accumulate the total force on target from all the neigbhours  
+
+  __m256d register xtarget = _mm256_set_pd( pos_x[target], pos_x[target], pos_x[target], pos_x[target] );
+  __m256d register ytarget = _mm256_set_pd( pos_y[target], pos_y[target], pos_y[target], pos_y[target] );
+  __m256d register ztarget = _mm256_set_pd( pos_z[target], pos_z[target], pos_z[target], pos_z[target] );
+  __m256d register mtarget = _mm256_set_pd(  mass[target],  mass[target],  mass[target],  mass[target] );
+  __m256d register vforcex  = _mm256_set_pd( 0, 0, 0, 0 );
+  __m256d register vforcey  = _mm256_set_pd( 0, 0, 0, 0 );
+  __m256d register vforcez  = _mm256_set_pd( 0, 0, 0, 0 );
+
+  
+
+
+  int _Nngb_ = Nngb & 0xFFFFFFFC;
+  
+  IVDEP
+  LOOP_VECTORIZE
+  LOOP_UNROLL_N(4)  
+  for ( int i = 0; i < _Nngb_; i+=4 )
+    {
+      __m256d ngbx = _mm256_set_pd ( pos_x[ngb_list[i]], pos_x[ngb_list[i+1]], pos_x[ngb_list[i+2]], pos_x[ngb_list[i+3]] );
+      __m256d ngby = _mm256_set_pd ( pos_y[ngb_list[i]], pos_y[ngb_list[i+1]], pos_y[ngb_list[i+2]], pos_y[ngb_list[i+3]] );
+      __m256d ngbz = _mm256_set_pd ( pos_z[ngb_list[i]], pos_z[ngb_list[i+1]], pos_z[ngb_list[i+2]], pos_z[ngb_list[i+3]] );
+      __m256d ngbm = _mm256_set_pd (  mass[ngb_list[i]],  mass[ngb_list[i+1]],  mass[ngb_list[i+2]],  mass[ngb_list[i+3]] );
+
+      // calculate dx^2, dy^2, dz^2 for all the 4 neighbours
+      //
+      __m256d deltax = xtarget - ngbx;
+      __m256d deltay = ytarget - ngby;
+      __m256d deltaz = ztarget - ngbz;
+      __m256d  mmass = mtarget * ngbm;
+      
+      __m256d dist3 = deltax*deltax + deltay*deltay + deltaz*deltaz;
+      dist3 = 1.0 / (dist3 * _mm256_sqrt_pd(dist3));
+
+      vforcex += deltax * mmass * dist3;
+      vforcey += deltay * mmass * dist3;
+      vforcez += deltaz * mmass * dist3;
+    }
+
+
+  // getting [ vforcex[0]+vforcex[1], vforcex[0]+vforcex[1], vforcex[2]+vforcex[3], vforcex[2]+vforcex[3] ]
+  __m256d forcex =  _mm256_hadd_pd ( vforcex, vforcex );
+  __m256d forcey =  _mm256_hadd_pd ( vforcey, vforcey );
+  __m256d forcez =  _mm256_hadd_pd ( vforcez, vforcez );
+
+  
+  forcex = _mm256_permute4x64_pd ( forcex, 0b00100111 );
+  forcey = _mm256_permute4x64_pd ( forcey, 0b00100111 );
+  forcez = _mm256_permute4x64_pd ( forcez, 0b00100111 );
+
+  
+  forcex = _mm256_hadd_pd        ( forcex, forcex );
+  forcey = _mm256_hadd_pd        ( forcey, forcey );
+  forcez = _mm256_hadd_pd        ( forcez, forcez );
+
+
+  double register posx = pos_x[target];
+  double register posy = pos_y[target];
+  double register posz = pos_z[target];
+  double register    m =  mass[target];
+  
+  for ( int i = _Nngb_; i < Nngb; i++ )
+    {
+      double dx = posx - pos_x[ngb_list[i]];
+      double dy = posy - pos_y[ngb_list[i]];
+      double dz = posz - pos_z[ngb_list[i]];
+      double mm =    m *  mass[ngb_list[i]];
+
+      double dist3 = ((dx*dx)+(dy*dy)) + (dz*dz);
+      dist3 = 1.0/(dist3*sqrt(dist3));
+
+      force[0] += mm*dx * dist3;
+      force[1] += mm*dy * dist3;
+      force[2] += mm*dz * dist3;
+    }
+  
+  force[0] += ((double*)&forcex)[0];
+  force[1] += ((double*)&forcey)[0];
+  force[2] += ((double*)&forcez)[0];
+  
+  return;
+}
+
+
+/*
+void process_with_arrays_intrinsics( const double * restrict pos_x,
+				     const double * restrict pos_y,
+				     const double * restrict pos_z,
+				     const double * restrict mass,
+				     const int target, const int *ngb_list, const int Nngb.
+				           double force[3] )
+{
+
+ #define BUNCH 4             
+
+  pos_x = __builtin_assume_aligned( pos_x, VALIGN );
+  pos_y = __builtin_assume_aligned( pos_y, VALIGN );
+  pos_z = __builtin_assume_aligned( pos_z, VALIGN );
+  mass  = __builtin_assume_aligned(  mass, VALIGN );
+  
+  double force = 0;           // this will accumulate the total force on target from all the neigbhours  
+
+                              // make an array of 4 vectors with the x,y,z and mass of the target
+  __m256d vtarget[4] = {
+    _mm256_set_pd( pos_x[target], pos_x[target], pos_x[target], pos_x[target] ),
+    _mm256_set_pd( pos_y[target], pos_y[target], pos_y[target], pos_y[target] ),
+    _mm256_set_pd( pos_z[target], pos_z[target], pos_z[target], pos_z[target] ),
+    _mm256_set_pd( mass [target], mass [target], mass [target], mass [target] )};
+  __m256d vforce = _mm256_set_pd( 0, 0, 0, 0 );
+
+    int _Nngb_;
+  {
+    int log2_BUNCH = 0;
+    while ( (BUNCH >> ++log2_BUNCH) > 0 );
+    _Nngb_ = Nngb & ((unsigned)-1 << (log2_BUNCH-1));
+  }
+  int _Nngb_last_ = _Nngb_ - BUNCH;
+
+  int k[BUNCH] = { ngb_list[0], ngb_list[1], ngb_list[2], ngb_list[3] };  
+  // make an array of 4 vectors with the x,y,z and mass of the next 4 neighbours
+  // each row contains either x, y, z, mass; each column refer to a neighbour
+  __m256d v[BUNCH] = {
+    _mm256_set_pd( pos_x[k[3]], pos_x[k[2]], pos_x[k[1]], pos_x[k[0]]),
+    _mm256_set_pd( pos_y[k[3]], pos_y[k[2]], pos_y[k[1]], pos_y[k[0]]),
+    _mm256_set_pd( pos_z[k[3]], pos_z[k[2]], pos_z[k[1]], pos_z[k[0]]),
+    _mm256_set_pd( mass [k[3]], mass [k[2]], mass [k[1]], mass [k[0]]) };
+
+
+  
+  IVDEP
+  LOOP_VECTORIZE
+  LOOP_UNROLL_N(1)  
+  for ( int i = 0; i < _Nngb_; i+=BUNCH )
+    {
+
+      // get the indexes of the next BUNCH neighbours
+      //
+
+      for ( int j = 0; j < BUNCH; j++ )
+	k[j] = ngb_list[i+BUNCH+j];
+      
+      // make an array of 4 vectors with the x,y,z and mass of the next 4 neighbours
+      // each row contains either x, y, z, mass; each column refer to a neighbour
+      
+      __m256d prev[BUNCH] = {
+	_mm256_set_pd( pos_x[k[3]], pos_x[k[2]], pos_x[k[1]], pos_x[k[0]]),
+	_mm256_set_pd( pos_y[k[3]], pos_y[k[2]], pos_y[k[1]], pos_y[k[0]]),
+	_mm256_set_pd( pos_z[k[3]], pos_z[k[2]], pos_z[k[1]], pos_z[k[0]]),
+	_mm256_set_pd( mass [k[3]], mass [k[2]], mass [k[1]], mass [k[0]]) };
+      
+      // calculate dx^2, dy^2, dz^2 for all the 4 neighbours
+      //
+      for ( int j = 0; j < 3; j++ ) {
+	v[j] = _mm256_sub_pd( vtarget[j], v[j] );
+	v[j] = _mm256_mul_pd( v[j], v[j] ); }
+      // calculate mass product
+      //
+      v[3] = _mm256_mul_pd( vtarget[3], v[3] );
+
+      //
+      // dx^2 + dy^2 for all the 4 neighbours      
+      v[0]   = _mm256_add_pd( v[0], v[1] );
+      //
+      // add dz^2 for all the 4 neighbours
+      v[0]   = _mm256_add_pd( v[0], v[2] );
+      //
+      // get mm / (dx^2 + dy^2 + dz^2) for all 4 neighbours
+      v[0]   = _mm256_div_pd( v[3], v[0] );
+      //
+      // accumulate the force on 4 lanes
+      vforce = _mm256_add_pd( vforce, v[0] );
+
+      for ( int j = 0; j < BUNCH; j++ )
+	v[j] = prev[j];
+    }
+
+  // process the reminder of the iteration space
+  //
+  for ( int i = _Nngb_; i < Nngb; i++ )
+    {
+      double dist2 = (
+		      (pos_x[target] - pos_x[ngb_list[i]])*(pos_x[target] - pos_x[ngb_list[i]]) +
+		      (pos_y[target] - pos_y[ngb_list[i]])*(pos_y[target] - pos_y[ngb_list[i]]) +
+		      (pos_z[target] - pos_z[ngb_list[i]])*(pos_z[target] - pos_z[ngb_list[i]]) );
+      
+      force += mass[target]*mass[ngb_list[i]] / dist2;					      
+    }
+
+  // reduce to a unique double the vector force
+  // calculated before
+  //
+  for ( int i = 0; i < 4; i++ )
+    force += ((double*)&vforce)[i];
+
+  return force;
+ #undef BUNCH
+}
+*/
+
+
+int main( int argc, char **argv )
+{
+
+           int case_to_run = (argc > 1 ? atoi(*(argv+1)) : 0 );
+  unsigned int N           = (argc > 2 ? atoi(*(argv+2)) : 1000000 );
+  long     int seed        = (argc > 3 ? atoi(*(argv+3)) : -1 );     // set the seed for repeatible runs
+           int Nrepetitions= (argc > 4 ? atoi(*(argv+4)) : NREPETITIONS );
+           int dry_run     = (argc > 5 ? atoi(*(argv+5)) : 0 );      // 1 to estimate floats for initialization
+           int from_file   = ( case_to_run > 0 );	   
+	   
+	   case_to_run = (case_to_run < 0 ? (-case_to_run) : case_to_run);  // make it positive
+	   
+  if ( (case_to_run < 0) || (case_to_run > 5) ) {
+    printf("unknown case %d\n", case_to_run );
+    return 0; }
+
+  
+
+  P_t     * restrict P;
+  v4df    * restrict vpos;
+  double  * restrict pos_x;
+  double  * restrict pos_y;
+  double  * restrict pos_z;
+  double  * restrict mass ;               
+  double  * restrict force; 
+  int     * restrict ngb_list;
+  
+  FILE   * outfile = NULL;
+  FILE   * infile  = NULL;
+  
+
+  if ( case_to_run == 0 )
+    {
+      outfile = fopen( "particle.data", "w" );
+      fwrite( &N, sizeof(int), 1, outfile );
+      fwrite( &Nrepetitions, sizeof(int), 1, outfile );
+    }
+  else 
+    {
+      if ( from_file ) {
+	size_t ret;
+	int    Nrep;
+	infile = fopen( "particle.data", "r" );
+	ret = fread( &N, sizeof(int), 1, infile );
+	ret = fread( &Nrep, sizeof(int), 1, infile );
+	if ( Nrep < Nrepetitions ) {
+	  printf("Nrepetitions is %d, larger than that "
+		 "used to generate the particle file, which was %d.\n"
+		 "This may lead to inf or NaN, better stop.\n",
+		 Nrepetitions, Nrep);
+	  exit(3); }	  
+      }
+
+      force = (double *)aligned_alloc(VALIGN, N*3*sizeof(double));
+    }
+
+
+  printf("> Running case %d -> \"%s\" with %u points and %d repetitions\n",
+	 case_to_run, implementation_labels[case_to_run], N, Nrepetitions);
+  if ( dry_run )
+    printf(" >>> DRY RUN :: use to estimate ops outside the actual calculations\n" );
+
+  
+  if ( case_to_run > 0 )
+    {
+      switch( case_to_run )
+	{
+	case 1: {
+	  printf ( "structure is %lu bytes long\n", sizeof(P_t));
+	  // allocate array of big structures
+	  //
+	  P = (P_t    *)aligned_alloc(VALIGN,   N * sizeof(P_t)); } break;
+	  
+	case 2:
+	case 3: {      
+	  // allocate array of small structures (vectors)
+	  //
+	  vpos  = (v4df   *)aligned_alloc(VALIGN,   N * sizeof(v4df)); } break;
+	  
+	case 4:
+	case 5: {
+	  // allocate arrays
+	  //
+	  pos_x  = (double *)aligned_alloc(VALIGN, N*sizeof(double));
+	  pos_y  = (double *)aligned_alloc(VALIGN, N*sizeof(double));
+	  pos_z  = (double *)aligned_alloc(VALIGN, N*sizeof(double));
+	  mass   = (double *)aligned_alloc(VALIGN, N*sizeof(double));
+	}break;
+	}
+
+      memset( (void*)force, 0, N*3*sizeof(double));
+    }
+  
+  // initialize particle data
+  //
+    {
+      if ( ! from_file ) {
+	if ( seed == -1 )
+	  srand48(time(NULL));
+	else
+	  srand48( seed ); }
+
+      for ( int i = 0; i < N; i++ )
+	{
+	  double data[4] __attribute__((aligned(VALIGN))); 
+	  if ( ! from_file )
+	    for ( int j = 0; j < 4; j++ )
+	      data[j] = drand48();
+	  else
+	    {
+	      size_t ret = fread ( data, 4, sizeof(double), infile );
+	    }
+
+	  if ( case_to_run >= 0 )	    
+	    switch( case_to_run )
+	      {
+	      case 0:{
+		size_t ret = fwrite( data, sizeof(double), 4, outfile ); } break;
+		
+	      case 1: {		
+		P[i].pos[0] = data[0];
+		P[i].pos[1] = data[1];
+		P[i].pos[2] = data[2];
+		P[i].mass   = data[3]; } break;
+		
+	      case 2:
+	      case 3: {
+		v4df_u * _vpos_ = (v4df_u*)vpos;
+		for ( int j = 0; j < 4; j++ )
+		  _vpos_[i].p[j] = data[j]; } break;
+		
+	      case 4:
+	      case 5:{ 
+		pos_x[i] = data[0];
+		pos_y[i] = data[1];
+		pos_z[i] = data[2];
+		mass [i] = data[3]; } break;
+	      }
+	}
+    }
+
+  if ( dry_run ) {
+    printf("dry run: going to clean up\n");
+    goto clean; }
+  
+  /* ------------------------------------------------------
+   *
+   * simulate interactions
+   *
+   * ------------------------------------------------------ */
+
+  // determine how many particles will be active, from
+  // a minimum of 1/100-ed to a maximum of 1/100+1/10
+  //
+
+  int Nactive;
+  int AvgNneighbours;
+  
+  if ( case_to_run >= 0 && !from_file )
+    {
+      Nactive  = N/100 + (int)lrand48() % (N/10);  // 11% of particle will be processed at max, 1% as minimum
+      if ( case_to_run == 0 ) {
+	size_t ret = fwrite( &Nactive, sizeof(int), 1, outfile );}
+    }
+  else {
+    size_t ret = fread( &Nactive, sizeof(int), 1, infile ); }
+
+  AvgNneighbours = ( NNEIGHBOURS > N*0.1 ? (int)(N*0.1) : NNEIGHBOURS );
+  ngb_list = (int*)calloc( AvgNneighbours * 1.2, sizeof(int) );
+
+  PAPI_INIT;
+  
+  printf ( "> %d particles will be active\n", Nactive );
+
+  double timing = 0;
+  
+  // loop over active particles
+  //
+  for ( int j = 0; j < Nactive; j++ )
+    {
+      // determine which one will be active
+      // of course it is not important which one will be,
+      // nor is important that we do not pick the same
+      // particle more than once in the loop
+      // (shouldn't happen, though, because the
+      // random repetition lenght is much larger)
+      //
+      int target, Nneighbours;
+
+      if ( !from_file )
+	{
+	  target      = (int)lrand48() % N;                              // the index of the to-be-processed particle
+	  
+	  // determine how many will be the neighbours
+	  // ( same comment than before apply)
+	  Nneighbours = AvgNneighbours + (int)mrand48()%(AvgNneighbours/10) ;  // with how many particles it will interact
+	  //   Nneighbours +- 10%
+	  
+	  
+	  // decide which will be the neighbours
+	  int upperlimit = target+Nrepetitions;
+	  for ( int i = 0; i < Nneighbours; i++ )
+	    {
+	      int draw = (int)lrand48() % N;
+	      if ( (draw >= target) && (draw <= upperlimit ) )
+		draw = upperlimit+1;
+	      ngb_list[i] = draw;
+	    }
+
+	  if ( case_to_run == 0 )
+	    {
+	      /*
+	      printf("iter %d target %d has %u neighbours\n", j,
+		     target, (unsigned)Nneighbours );
+	      */
+	      size_t ret;
+	      ret = fwrite( &target, sizeof(int), 1, outfile );
+	      ret = fwrite( &Nneighbours, sizeof(int), 1, outfile );
+	      ret = fwrite( &ngb_list[0], sizeof(int), Nneighbours, outfile );
+	    }
+	  
+	}
+      else 
+	{
+	  size_t ret;
+	  ret = fread( &target, sizeof(int), 1, infile );
+	  ret = fread( &Nneighbours, sizeof(int), 1, infile );
+	  ret = fread( ngb_list, sizeof(int), Nneighbours, infile );
+	}
+
+      double myforce[3] = {0};
+      double now;
+      
+      // now, call the force evaluation with the different implementations
+      //
+      
+      if ( case_to_run > 0 ) {
+
+	double this_timing = 1e10;
+	for ( int shot = 0; shot <= Nrepetitions; shot++ ) {
+
+	  // shot = = iteration is used as a warm-up for the neighbours walk
+	  int mytarget = (target+shot)%N;
+	 
+	  if ( shot >= 1 )
+	    now = CPU_TIME;
+	  if ( shot == 1 )
+	    PAPI_START_CNTR;
+	  
+	  switch ( case_to_run )
+	    {	  
+	      // ------------------------------------------------
+	      //
+	    case 1: {	      
+	      process_with_structure( P, mytarget, ngb_list, Nneighbours, myforce );
+	    } break;
+	    
+	      // ------------------------------------------------
+	      //     
+	    case 2: {
+	      process_with_vectors( vpos, mytarget, ngb_list, Nneighbours, myforce );
+	    } break;
+	    
+	      // ------------------------------------------------
+	      //
+	      
+	    case 3: {
+	      process_with_vectors_intrinsics( (double*)vpos, mytarget, ngb_list, Nneighbours, myforce );
+	    } break;
+	    
+	      // ------------------------------------------------
+	      //
+	      
+	    case 4: {
+	      process_with_arrays( pos_x, pos_y, pos_z, mass,
+				   mytarget, ngb_list, Nneighbours, myforce );
+	    } break;
+
+	      // ------------------------------------------------
+	      //
+	    case 5: {
+	      process_with_arrays_intrinsics( pos_x, pos_y, pos_z, mass,
+					      mytarget, ngb_list, Nneighbours, myforce );
+	    } break;
+	    }
+
+	  if (shot >= 1 ) {
+	    double chrono = CPU_TIME - now;
+	    if ( chrono < this_timing )
+	      this_timing = chrono; }
+
+	  if ( shot == 1 )
+	    PAPI_STOP_CNTR;	    
+
+	  int offset = target*3;
+	  for ( int i = 0; i < 3; i++ )
+	    force[ offset + i] += myforce[i];
+	  
+	  /* if ( j == 0 ) */
+	  /*   printf("shot %d timing is %g\n", shot, CPU_TIME-now); */
+	  
+	}
+
+	timing += this_timing;       
+      }
+
+      if ( case_to_run > 0 ) {
+       #if !defined(NDEBUG)
+       #warning "you're compiling with assert"
+       #endif
+	int offset = target*3;
+	for ( int i = 0; i < 3; i++ ) {
+	    assert( myforce[i] != 0 );
+	    assert( !isnan(myforce[i]) );
+	    assert( !isinf(myforce[i]) ); 
+	    force[offset+i] /= (Nrepetitions+1);
+	  }
+      }
+      
+    }
+
+  if ( infile != NULL )
+    fclose(infile);
+  if ( outfile != NULL )
+    fclose(outfile);
+
+  if ( case_to_run > 0 )
+    {
+      
+      printf("> timing for case %d \"%s\" : %g\n",
+	     case_to_run, implementation_labels[case_to_run],
+	     timing );
+      
+      // just to trick the compiler, so that it does not optimize out
+      // pointless calculations
+      //
+
+      char  filename[100];
+      sprintf( filename, "force1b.%d.out", case_to_run );
+      FILE *output = fopen( filename, "w" );
+      if( output != NULL ) {
+	fwrite( &N, sizeof(int), 1, output);
+	fwrite( force, sizeof(double), N*3, output );
+	fclose(output); }
+      else
+	printf(">>> wow, I was unable to create stupid file\n");
+
+      free ( force    );
+      if ( !dry_run)
+	free ( ngb_list );
+  
+    }
+
+ clean:
+  switch ( case_to_run )
+    {
+    case 1: {
+      free ( P ); } break;
+    case 2:
+    case 3: {
+      free ( vpos ); } break;
+    case 4:
+    case 5: {
+      free ( pos_x ), free ( pos_y ), free ( pos_z ), free ( mass ); } break;
+    }
+
+ #if defined(USE_PAPI)
+  if ( case_to_run > 0 )
+    for ( int i = 0; i < PAPI_EVENTS_NUM; i++ )
+      printf("PAPI event %d: %llu\n",
+	     i, (unsigned long long)papi_values[i]);
+ #endif
+
+  
+  
+  return 0;
+}
-- 
GitLab