diff --git a/src/vect.1b.c b/src/vect.1b.c
index 167e21f89220eb9fc315392e7c6d52421a1f747b..757d5ce04f5cd2ae3023470fcf87d2a63af6b7ef 100644
--- a/src/vect.1b.c
+++ b/src/vect.1b.c
@@ -138,16 +138,14 @@ void process_with_structure( const P_t   * restrict P,
 	}
       
       
-      double m1m2 = Ptarget.mass * data[3];
-      // 1D distances
-      //
-      double dx = Ptarget.pos[0] - data[0];
-      double dy = Ptarget.pos[1] - data[1];
-      double dz = Ptarget.pos[2] - data[2];
+      double register m1m2 = Ptarget.mass * data[3];
+      double register dx = Ptarget.pos[0] - data[0];
+      double register dy = Ptarget.pos[1] - data[1];
+      double register dz = Ptarget.pos[2] - data[2];
 
       // square the distance
       //
-      double dist3 = (dx*dx + dy*dy) + dz*dz;
+      double register dist3 = (dx*dx + dy*dy) + dz*dz;
       dist3 = 1.0/(dist3*sqrt(dist3));
       
       // accumulate the force
@@ -194,23 +192,19 @@ void process_with_vectors( const v4df   * restrict V,
 
 {
   const v4df   Vtarget   = V[target];    // save the target vector in a local vector (much probably a register)
-        v4df   Vbuffer;               // The array of particles that are processed in the inner loop
-        v4df   vforce    = {0};          // accumulate the forces in separate lanes
+        v4df   Vbuffer;                  // The array of particles that are processed in the inner loop
+        v4df   vforce    = {0};          // accumulate the force components
         v4df   mask_pos  = {1,1,1,0};    // used to mask out the mass
         v4df   mask_mass = {0,0,0,1};    // used to mask out the position
 
 	
-  // pre-load the first BUNCH of particles
-  //
   Vbuffer = V[ngb_list[0]];
 
-  // outern loop
-  // loop over bunches 
   for ( int i = 0; i < Nngb; i++ )
     {
       
-      // hide the memory latency by pre-loding
-      // the next BUNCH 
+      // try to hide the memory latency by
+      // pre-loading the next particle 
       //
       v4df preVbuffer;
       if ( i < Nngb-1 )
@@ -227,7 +221,7 @@ void process_with_vectors( const v4df   * restrict V,
       
       // the mass product with the i-th neighbour
       //
-      v4df register prod  = (Vtarget * Vbuffer) * mask_mass;
+      v4df register prod  = (Vtarget * Vbuffer);
       
       // get the mass product as a single double
       //
@@ -369,8 +363,10 @@ void process_with_arrays( const double * restrict pos_x,
 			  const double * restrict pos_y,
 			  const double * restrict pos_z,
 			  const double * restrict mass,
-			  const int target, const int *ngb_list, const int Nngb,
-			  double force[3] )
+			  const int               target,
+			  const int    *          ngb_list,
+			  const int               Nngb,
+			        double            force[3] )
   
 {
   const double x = pos_x[target];
@@ -400,22 +396,16 @@ void process_with_arrays( const double * restrict pos_x,
 	  nextm =  mass[ngb_list[i+1]];
 	}
       
-      /*
-      vx = pos_x[ngb_list[i]];
-      vy = pos_y[ngb_list[i]];
-      vz = pos_z[ngb_list[i]];
-      vm =  mass[ngb_list[i]];
-      */
       // get distances along x, y and z for the i-th neighbour
       //
-      double dx = x - vx;
-      double dy = y - vy;
-      double dz = z - vz;
-      double mm = m * vm;
+      double register dx = x - vx;
+      double register dy = y - vy;
+      double register dz = z - vz;
+      double register mm = m * vm;
 
       // calculate the distance^2 for BUNCH neighbours
       //
-      double dist3 = dx*dx+dy*dy+dz*dz;
+      double register dist3 = dx*dx+dy*dy+dz*dz;
       dist3 = dist3*sqrt(dist3);
 
 	  
@@ -614,7 +604,7 @@ void process_with_arrays_intrinsics( const double * restrict pos_x,
   
   IVDEP
   LOOP_VECTORIZE
-  LOOP_UNROLL_N(4)  
+  LOOP_UNROLL_N(1)  
   for ( int i = 0; i < _Nngb_; i+=4 )
     {
       __m256d ngbx = _mm256_set_pd ( pos_x[ngb_list[i]], pos_x[ngb_list[i+1]], pos_x[ngb_list[i+2]], pos_x[ngb_list[i+3]] );
@@ -1091,16 +1081,11 @@ int main( int argc, char **argv )
 
 	  if ( shot == 1 )
 	    PAPI_STOP_CNTR;	    
-
-	  int offset = target*3;
-	  for ( int i = 0; i < 3; i++ )
-	    force[ offset + i] += myforce[i];
-	  
-	  /* if ( j == 0 ) */
-	  /*   printf("shot %d timing is %g\n", shot, CPU_TIME-now); */
-	  
+	  	  
 	}
 
+	//printf("%d t%d %g %g %g\n", j, target, myforce[0], myforce[1], myforce[2]);
+	
 	timing += this_timing;       
       }
 
@@ -1113,7 +1098,7 @@ int main( int argc, char **argv )
 	    assert( myforce[i] != 0 );
 	    assert( !isnan(myforce[i]) );
 	    assert( !isinf(myforce[i]) ); 
-	    force[offset+i] /= (Nrepetitions+1);
+	    force[offset+i] = myforce[i] / (Nrepetitions+1);
 	  }
       }
       
diff --git a/src/vect.2b.c b/src/vect.2b.c
new file mode 100644
index 0000000000000000000000000000000000000000..181fe49d29c7e300d7723455f69db0127b00f038
--- /dev/null
+++ b/src/vect.2b.c
@@ -0,0 +1,1009 @@
+
+/* ────────────────────────────────────────────────────────────────────────── *
+ │                                                                            │
+ │ This file is part of the nbody-vectorization-toy-lab in the fram of the    │
+ │ SPACE Center of Excellence                                                 │
+ │ © 2024                                                                     │
+ │ The work has been started by Luca Tornatore @ INAF (Italian National       │
+ │ Institute for Astrophysics) and continued in collaboration with            │
+ │ Vassilis Flouris @ FORTH (Foundation for Research and Technology - Hellas  │
+ │                                                                            │
+ │ Consult the accopanying file changes.log for details                       │
+ │                                                                            │
+ │ contact:                                                                   │
+ │  luca.tornatore@inaf.it                                                    │
+ │  vflouris@ics.forth.gr                                                     │
+ │                                                                            │
+ │                                                                            │
+ │ ×××××××××××××××××××××××××××××××××××××××××××××××××××××××××××××××××××××××××  │
+ │                                                                            │
+ │     This is open source software; you can redistribute it and/or modify    │
+ │     it under the terms of the "3 clauses BSD license"                      │
+ │       https://opensource.org/license/bsd-3-clause                          │
+ │                                                                            │
+ │     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS    │
+ │     “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT      │
+ │     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS      │
+ │     FOR A PARTICULAR PURPOSE ARE DISCLAIMED.                               │
+ │                                                                            │
+ * ────────────────────────────────────────────────────────────────────────── */
+
+
+
+#if defined(__STDC__)
+#if (__STDC_VERSION__ <= 201710L)
+#error "c2x standard is needed to compile this source"
+#endif
+#define _XOPEN_SOURCE 700
+#endif
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+#include <math.h>
+#include <assert.h>
+
+#include <vector_pragmas.h>
+#include <vector_types.h>
+
+#include "mypapi.h"
+//#include <immintrin.h>
+
+#define CPU_TIME ({struct timespec ts; clock_gettime( CLOCK_PROCESS_CPUTIME_ID, &ts ), (double)ts.tv_sec + \
+					 (double)ts.tv_nsec * 1e-9;})
+
+// with how many neighbours each particle interacts, in avg
+//
+#define NNEIGHBOURS 200     
+
+#define NREPETITIONS 5
+
+#define DATASIZE 4              // 3 positions + the mass
+
+// how many different implementations
+//
+#define NSHOTS 5
+const char *implementation_labels[NSHOTS+1] = {"generate particles",
+  "Using AoS",
+  "using AoV (array of vectors)",
+  "using AoV with intrinsics",
+  "using SoA",
+  "using SoA with intrinsics" };
+
+// define a vector type
+//
+
+#define DV4SIZE DATASIZE
+typedef double v4df __attribute__ ((vector_size (DV4SIZE*sizeof(double))));
+typedef union
+{
+  v4df   P;
+  double p[DV4SIZE];
+} v4df_u;
+
+
+// define a structure
+typedef struct {
+  double pos[3];
+  double mass;
+ #if defined(FILL)
+  double array[FILL];
+ #endif
+} P_t;
+
+
+
+
+// ···································································
+//
+/*
+ * Processing the particles as structures
+ * 
+ */
+
+void process_with_structure( const P_t * restrict P, void * restrict mem,
+			     const int target, const int *ngb_list, const int Nngb,
+			     double force[3],
+			     double *timing )
+{
+
+  typedef struct {
+    double pos[3];
+    double mass; } Plocal_t ;
+  
+  /*
+   * move all the neighbours into a local buffer
+   * here we hide the latency of sparse memory access
+   * and we try to enhance the vectorization by building
+   * the possibility of a loop over contiguous memory
+   *
+   */
+  Plocal_t * restrict memP = __builtin_assume_aligned((Plocal_t*)mem, VALIGN);
+  for ( int i = 0; i < Nngb; i++ )
+    memcpy(&memP[i], &P[ngb_list[i]], 4*sizeof(double) );
+
+  // let's return the timing of the pure calculation so that we
+  // can compare with v1
+  //
+  
+  *timing = CPU_TIME;
+  
+  Plocal_t Ptarget;
+  memcpy(&Ptarget, &P[target], 4*sizeof(double) );
+  double local_force[3] = {0};            // this will accumulate the total force on target from all the neigbhours
+
+  IVDEP
+  LOOP_VECTORIZE
+  VECTOR_ALIGNED
+  for ( int i = 0; i < Nngb; i++ )
+    {
+
+      double register m1m2 = Ptarget.mass * memP[i].mass;
+      // 1D distances
+      //
+      double register dx = Ptarget.pos[0] - memP[i].pos[0];
+      double register dy = Ptarget.pos[1] - memP[i].pos[1];
+      double register dz = Ptarget.pos[2] - memP[i].pos[2];
+
+      // square the distance
+      //
+      double register dist3 = (dx*dx + dy*dy) + dz*dz;
+      dist3 = 1.0/(dist3*sqrt(dist3));
+      
+      // accumulate the force
+      //
+      local_force[0] += m1m2 * dx * dist3;
+      local_force[1] += m1m2 * dy * dist3;
+      local_force[2] += m1m2 * dz * dist3;
+
+
+      /*
+      double check = P[target].mass * memP[i].mass / dist2;
+      printf("T%d n %d->%d %g %g %g %g\n",
+	     target, i, ngb_list[i], dx*dx, dy*dy, dz*dz, check );
+      */
+
+    }
+
+  for ( int i = 0; i < 3; i++)
+    force[i] += local_force[i];
+
+  *timing = CPU_TIME - *timing;
+  return;
+}
+
+
+// ···································································
+//
+/*
+ * Here we trat the particles explicitly as vectors
+ * The array if structures P is accessed as v4df vectors
+ *
+ */
+
+
+
+void process_with_vectors( const v4df   *restrict V,
+			         void   *restrict mem,
+			   const int     target,
+			   const int    *ngb_list,
+			   const int     Nngb,
+			         double  force[3],
+			         double *timing )
+{
+  alignas(VALIGN) const v4df   Vtarget   = V[target];    // save the target vector in a local vector (much probably a register)
+  alignas(VALIGN) const v4df   mask_pos  = {1,1,1,0};    // used to mask out the mass
+  alignas(VALIGN) const v4df   mask_mass = {0,0,0,1};    // used to mask out the position
+  alignas(VALIGN)       v4df   vforce    = {0,0,0,0};    // accumulate the force components
+
+  
+
+  ASSUME_ALIGNED(mem, VALIGN);
+  ASSUME_ALIGNED(V, VALIGN);
+  ASSUME_ALIGNED(ngb_list, VALIGN);
+  v4df *memV = ASSUME_ALIGNED((v4df*)mem, VALIGN);
+  
+  IVDEP
+  LOOP_UNROLL_N(4)
+  VECTOR_ALIGNED
+  for ( int i = 0; i < Nngb; i++ )
+    memV[i] = V[ ngb_list[i] ];
+
+  *timing = CPU_TIME;
+  
+  // set-up the new iteration spaces
+  //
+  //int _Nngb_ = Nngb & (-4);
+
+  IVDEP
+  VECTOR_ALIGNED
+  LOOP_UNROLL_N(4)
+  for ( int i = 0; i < Nngb; i++ )
+    {
+      // the distance^2 of the i-th neighbour
+      //
+      v4df   register dist  = (Vtarget - memV[i]) * mask_pos;
+      v4df_u register dist2;
+      dist2.P = dist*dist;
+      double register scalar_dist3 = (dist2.p[0]+dist2.p[1])+(dist2.p[2]+dist2.p[3]);
+      scalar_dist3 = 1.0 / (scalar_dist3*sqrt(scalar_dist3));
+      
+      // the mass product with the i-th neighbour
+      //
+      v4df register prod  = (Vtarget * memV[i]);
+      
+      // get the mass product as a single double
+      //      
+      v4df mm = dist * ((v4df_u)prod).p[3];
+      
+      // get the force from the i-th neighbour
+      //
+      vforce += mm * scalar_dist3;
+      //
+      // NOTE: the accumulation here above on a single double is obvisouly a bottleneck
+      //       because it imposes a strict constraint on compiler's optimization.
+      //       I
+    }
+  
+
+  // accumulate the partial results
+  //
+  for ( int i = 0; i < 3; i++ )
+    force[i] += ((v4df_u)vforce).p[i];
+
+  
+  *timing = CPU_TIME - *timing;
+  return;
+
+}
+
+
+
+
+// ···································································
+//
+/*
+ *
+ */
+
+void process_with_vectors_intrinsics( const double * restrict V,
+				            void   * restrict mem,
+				      const int      target,
+				      const int    * ngb_list,
+				      const int      Nngb,
+				            double force[3],
+				            double * timing )
+  
+{
+        long long bb               = -1;                                                             // used for masking entries of a vector
+        __m256d register mask_pos  = _mm256_set_pd(0, *(double*)&bb, *(double*)&bb, *(double*)&bb);  // used to mask out the mass
+        __m256d register mask_mass = _mm256_set_pd(*(double*)&bb, 0, 0, 0);                          // used to mask out the position
+  const __m256d register vtarget   = _mm256_load_pd(&V[target*4]);                       // load the target data    
+
+  
+        double  * restrict memV  = ASSUME_ALIGNED ( (double*)mem, VALIGN);
+        __m256d * restrict memVV = ASSUME_ALIGNED ((__m256d*)mem, VALIGN);
+	__m256d * restrict VV    = ASSUME_ALIGNED ((__m256d*)V, VALIGN);
+
+        __m256d Vforce = {0};
+
+	
+ IVDEP
+ LOOP_VECTORIZE
+ LOOP_UNROLL
+ for ( int i = 0; i < Nngb; i++ )
+   memVV[i] = VV[ngb_list[i]];
+ 
+  double now = CPU_TIME;
+
+  IVDEP
+  LOOP_VECTORIZE
+  LOOP_UNROLL
+  for ( int i = 0; i < Nngb; i++ )
+    {      
+      //__m256d register vdist;
+      //__m256d register vdist3;
+      __m256d vdist; 
+      __m256d vdist3;      
+      // note: the 64bits fields in the following
+      //       comments are reported so that at
+      //       the beginning there are the first
+      //       bits in the vector field, i.e.
+      //       those at the least significant bits
+      //       >> however, when you consider the
+      //       _mm256_set_pd instruction, the arguments
+      //       are in the opposite direction: the first
+      //       one sets the most significant bits and
+      //       so on
+      //
+      //       least --->  most
+      
+      // get [ dx, dy, dz dm ]
+      vdist  = _mm256_sub_pd         ( vtarget, memVV[i] );
+      //
+      // get [ dx^2, dy^2, dz^2, dm^2 ]
+      vdist3 = _mm256_mul_pd         ( vdist, vdist      );
+      
+      //
+      // get [ dx^2, dy^2, dz^2, 0 ]
+      vdist3 = _mm256_and_pd         ( vdist3, mask_pos  );      
+      //
+      // get [ dx^2+dy^2, dx^2+dy^2, dz^2, dz^2]
+      vdist3 = _mm256_hadd_pd        ( vdist3, vdist3    );
+      //
+      // get [ dx^2+dy^2, dz^2, dx^2+dy^2, dz^2 ]
+      vdist3 = _mm256_permute4x64_pd ( vdist3, 0b00100111);
+      //
+      // get [ dx^2+dy^2+dz^2, .., .., .. ]
+      vdist3 = _mm256_hadd_pd        ( vdist3, vdist3    );
+      vdist3 = _mm256_mul_pd         ( vdist3, _mm256_sqrt_pd(vdist3) );
+      
+      __m256d mprod;
+      double m1m2;
+      //
+      // get [ x0*x1, y0*y1, z0*z1, m0*m1 ]
+      mprod = _mm256_mul_pd( vtarget, memVV[i] );
+      m1m2  = (((double*)&mprod)[3]);
+      mprod = _mm256_set_pd( m1m2, m1m2, m1m2, m1m2 );
+
+      //
+      // get [ 0, 0, 0, m0*m1 ]
+      mprod = _mm256_mul_pd( mprod, vdist );      
+      //
+      // get [ 0, 0, 0, m0*m1 / (dx^2+dy^2+dz^2) ]
+      mprod = _mm256_div_pd( mprod, vdist3 );
+      
+      // get the uppermost element from the vector
+      // --> here we should use the reshuffle+extraction instruction
+      Vforce += mprod;
+      
+    }
+
+  for ( int k = 0; k < 3; k++ )
+    force[k] += ((double*)&Vforce)[k];
+  
+  
+  *timing = CPU_TIME - now;
+  return;
+}
+
+
+
+// ···································································
+//
+/*
+ *
+ */
+
+void process_with_arrays( const double * restrict pos_x,
+			  const double * restrict pos_y,
+			  const double * restrict pos_z,
+			  const double * restrict mass,
+			        void   * restrict mems[],
+			  const int               target,
+			  const int    * restrict ngb_list,
+			  const int               Nngb,
+			        double            force[3],
+			        double *          timing )
+  
+{
+  ASSUME_ALIGNED( pos_x, VALIGN );
+  ASSUME_ALIGNED( pos_y, VALIGN );
+  ASSUME_ALIGNED( pos_z, VALIGN );
+  ASSUME_ALIGNED( mass, VALIGN );
+  ASSUME_ALIGNED( ngb_list, VALIGN );
+
+  double *_memx = ASSUME_ALIGNED((double*)mems[0], VALIGN);
+  double *_memy = ASSUME_ALIGNED((double*)mems[1], VALIGN);
+  double *_memz = ASSUME_ALIGNED((double*)mems[2], VALIGN);
+  double *_memm = ASSUME_ALIGNED((double*)mems[3], VALIGN);
+
+  IVDEP
+  LOOP_VECTORIZE
+    LOOP_UNROLL_N(4)
+  for ( int i = 0; i < Nngb; i++ ) {
+    int k = ngb_list[i];
+    _memx[i] = pos_x[k]; }
+
+  IVDEP
+  LOOP_VECTORIZE
+  LOOP_UNROLL_N(4)
+  for ( int i = 0; i < Nngb; i++ ) {
+    int k = ngb_list[i];
+    _memy[i] = pos_y[k]; }
+
+  IVDEP
+  LOOP_VECTORIZE
+  LOOP_UNROLL_N(4)
+  for ( int i = 0; i < Nngb; i++ ) {
+    int k = ngb_list[i];
+    _memz[i] = pos_z[k]; }
+
+  IVDEP
+  LOOP_VECTORIZE
+  LOOP_UNROLL_N(4)
+  for ( int i = 0; i < Nngb; i++ ) {
+    int k = ngb_list[i];
+    _memm[i] = mass [k]; }
+   
+  double mytiming = CPU_TIME;
+
+  double vforce[3] = {0};
+  const double register x = pos_x[target];
+  const double register y = pos_y[target];
+  const double register z = pos_z[target];
+  const double register m =  mass[target];
+
+  //  int _Nngb_ = Nngb & (-(int)BUNCH);
+  
+  IVDEP
+  LOOP_VECTORIZE
+  LOOP_UNROLL_N(4)
+  for ( int i = 0; i < Nngb; i++ )
+    {
+      double register dx = x - _memx[i];
+      double register dy = y - _memy[i];
+      double register dz = z - _memz[i];
+      double register mm = m * _memm[i];
+      
+      // distance
+      double dist3 = (dx*dx + dy*dy) + dz*dz;
+      dist3 = 1.0 / (dist3*sqrt(dist3));
+
+      vforce[0] += mm * dx * dist3;
+      vforce[1] += mm * dy * dist3;
+      vforce[2] += mm * dz * dist3;
+    }
+  
+  // accumulate on force
+  //
+  for ( int k = 0; k < 3; k++ )
+    force[k] += vforce[k];
+  
+  
+  *timing = CPU_TIME - mytiming;
+  return;
+}
+
+
+
+// ···································································
+//
+/*
+ *
+ */
+
+void process_with_arrays_vectors( const double * restrict pos_x,
+				  const double * restrict pos_y,
+				  const double * restrict pos_z,
+				  const double * restrict mass,
+				        void   *          mems[],
+				  const int               target,
+				  const int    * restrict ngb_list,
+				  const int               Nngb,
+       				        double            force[3],
+				        double *          timing)
+{
+  
+  ASSUME_ALIGNED(pos_x, VALIGN);
+  ASSUME_ALIGNED(pos_y, VALIGN);
+  ASSUME_ALIGNED(pos_z, VALIGN);
+  ASSUME_ALIGNED( mass, VALIGN );
+  ASSUME_ALIGNED( ngb_list, VALIGN );
+  
+  double *_memx = ASSUME_ALIGNED((double*)mems[0], VALIGN);
+  double *_memy = ASSUME_ALIGNED((double*)mems[1], VALIGN);
+  double *_memz = ASSUME_ALIGNED((double*)mems[2], VALIGN);
+  double *_memm = ASSUME_ALIGNED((double*)mems[3], VALIGN);
+
+  IVDEP
+  LOOP_VECTORIZE
+  LOOP_UNROLL_N(DVSIZE)
+  for ( int i = 0; i < Nngb; i++ ) {
+    int k = ngb_list[i];
+    _memx[i] = pos_x[k];
+    _memy[i] = pos_y[k];
+    _memz[i] = pos_z[k];
+    _memm[i] = mass [k]; }
+
+  dvector_t *_vmemx = ASSUME_ALIGNED((dvector_t*)mems[0], VALIGN);
+  dvector_t *_vmemy = ASSUME_ALIGNED((dvector_t*)mems[1], VALIGN);
+  dvector_t *_vmemz = ASSUME_ALIGNED((dvector_t*)mems[2], VALIGN);
+  dvector_t *_vmemm = ASSUME_ALIGNED((dvector_t*)mems[3], VALIGN);
+
+ #if defined(__clang__) || defined(__INTEL_LLVM_COMPILER)
+  dvector_t register vtargetx ATTRIBUTE_ALIGNED(VALIGN) = (dvector_t)(pos_x[target]);
+  dvector_t register vtargety ATTRIBUTE_ALIGNED(VALIGN) = (dvector_t)(pos_y[target]);
+  dvector_t register vtargetz ATTRIBUTE_ALIGNED(VALIGN) = (dvector_t)(pos_z[target]);
+  dvector_t register vtargetm ATTRIBUTE_ALIGNED(VALIGN) = (dvector_t)(mass[target]);
+  dvector_t          one      ATTRIBUTE_ALIGNED(VALIGN) = (dvector_t)(1.0);
+ #else
+ #define X(v) (pos_x[target])
+  dvector_t vtargetx ATTRIBUTE_ALIGNED(VALIGN) = INIT_VECTOR(DVSIZE);
+ #undef X
+ #define X(v) (pos_y[target])
+  dvector_t vtargety ATTRIBUTE_ALIGNED(VALIGN) = INIT_VECTOR(DVSIZE);
+ #undef X
+ #define X(v) (pos_z[target])
+  dvector_t vtargetz ATTRIBUTE_ALIGNED(VALIGN) = INIT_VECTOR(DVSIZE);
+ #undef X
+ #define X(v) (mass[target])
+  dvector_t vtargetm ATTRIBUTE_ALIGNED(VALIGN) = INIT_VECTOR(DVSIZE);
+ #undef X
+ #define X(v) 1.0
+  dvector_t          one      ATTRIBUTE_ALIGNED(VALIGN) = INIT_VECTOR(DVSIZE);
+ #undef X
+ #define X(v) 0.0
+  dvector_t vforcex   ATTRIBUTE_ALIGNED(VALIGN) = INIT_VECTOR(DVSIZE);
+  dvector_t vforcey   ATTRIBUTE_ALIGNED(VALIGN) = INIT_VECTOR(DVSIZE);
+  dvector_t vforcez   ATTRIBUTE_ALIGNED(VALIGN) = INIT_VECTOR(DVSIZE);
+ #undef X
+ #endif
+
+  
+  int _Nngb_ = Nngb & 0xFFFFFFFC;
+  _Nngb_ /= 4;
+
+  double mytiming = CPU_TIME;
+    
+  IVDEP
+  LOOP_VECTORIZE
+  for ( int i = 0; i < _Nngb_; i++ )
+    {
+      
+      dvector_t distx = (vtargetx - _vmemx[i]);      
+      dvector_t disty = (vtargety - _vmemy[i]);      
+      dvector_t distz = (vtargetz - _vmemz[i]);
+
+      dvector_t dist2x = distx*distx;
+      dvector_t dist2y = disty*disty;
+      dvector_t dist2z = distz*distz;
+
+      dist2x = (dist2x + dist2y) + dist2z;
+      dvector_t inv_dist3 = one / (dist2x * _mm256_sqrt_pd(dist2x));
+      
+      vforcex += distx * (vtargetm * _vmemm[i]) * inv_dist3;
+      vforcey += disty * (vtargetm * _vmemm[i]) * inv_dist3;
+      vforcez += distz * (vtargetm * _vmemm[i]) * inv_dist3;
+    }
+
+  // process the reminder of the iteration space
+  //
+  double local_force[3] = {0};
+  for ( int i = Nngb&0xFFFFFFFC; i < Nngb; i++ )
+    {
+      double dist3 = (
+		      (pos_x[target] - _memx[i])* (pos_x[target] - _memx[i]) +
+		      (pos_y[target] - _memy[i])* (pos_y[target] - _memy[i]) +
+		      (pos_z[target] - _memz[i])* (pos_z[target] - _memz[i]) );
+      dist3 = 1.0 / (dist3* sqrt(dist3));
+      
+      local_force[0] += mass[target]*_memm[i] * (pos_x[target] - _memx[i]) * dist3;
+      local_force[1] += mass[target]*_memm[i] * (pos_y[target] - _memy[i]) * dist3;
+      local_force[2] += mass[target]*_memm[i] * (pos_z[target] - _memz[i]) * dist3;
+    }
+
+  for ( int i = 0; i < 3; i++ )
+    force[i] += local_force[i];
+  
+  // reduce to a unique double the vector force
+  // calculated before
+  //
+  dvector_u _vforce_;
+  _vforce_.V = vforcex;
+  for ( int i = 0; i < 4; i++ )
+    force[0] += _vforce_.v[i];
+
+  _vforce_.V = vforcey;
+  for ( int i = 0; i < 4; i++ )
+    force[1] += _vforce_.v[i];
+
+  _vforce_.V = vforcez;
+  for ( int i = 0; i < 4; i++ )
+    force[2] += _vforce_.v[i];
+
+  *timing = CPU_TIME - mytiming;
+
+  return;
+}
+
+
+int main( int argc, char **argv )
+{
+
+           int case_to_run = (argc > 1 ? atoi(*(argv+1)) : 0 );
+  unsigned int N           = (argc > 2 ? atoi(*(argv+2)) : 1000000 );
+  long     int seed        = (argc > 3 ? atoi(*(argv+3)) : -1 );     // set the seed for repeatible runs
+           int Nrepetitions= (argc > 4 ? atoi(*(argv+4)) : NREPETITIONS );
+           int dry_run     = (argc > 5 ? atoi(*(argv+5)) : 0 );      // 1 to estimate floats for initialization
+           int from_file   = ( case_to_run > 0 );	   
+	   
+  case_to_run = (case_to_run < 0 ? -case_to_run : case_to_run);  // make it positive
+	   
+  if ( case_to_run > 5 ) {
+    printf("unknown case %d\n", case_to_run );
+    return 0; }
+  
+
+  P_t    * restrict P;
+  v4df   * restrict vpos;
+  double * restrict pos_x;
+  double * restrict pos_y;
+  double * restrict pos_z;
+  double * restrict mass ;               
+  double * restrict force; 
+  int    * restrict ngb_list;
+  
+  FILE   * outfile = NULL;
+  FILE   * infile  = NULL;
+  
+
+  if ( case_to_run == 0 )
+    {
+      outfile = fopen( "particle.data", "w" );
+      fwrite( &N, sizeof(int), 1, outfile );
+      fwrite( &Nrepetitions, sizeof(int), 1, outfile );
+    }
+  else 
+    {
+      if ( from_file ) {
+	size_t ret;
+	int    Nrep;
+	infile = fopen( "particle.data", "r" );
+	ret = fread( &N, sizeof(int), 1, infile );
+	ret = fread( &Nrep, sizeof(int), 1, infile );
+	if ( Nrep < Nrepetitions ) {
+	  printf("Nrepetitions is %d, larger than that "
+		 "used to generate the particle file, which was %d.\n"
+		 "This may lead to inf or NaN, better stop.\n",
+		 Nrepetitions, Nrep);
+	  exit(3); }	  
+      }
+
+      force = (double *)aligned_alloc(VALIGN, N*3*sizeof(double));
+    }
+
+
+  printf(" >  Running case %d -> \"%s\" with %u points and %d repetitions\n"
+	 " >  Vectors are %lu bytes long, %d double long\n",
+	 case_to_run, implementation_labels[case_to_run], N, Nrepetitions,
+	 VSIZE, DVSIZE);
+  if ( dry_run )
+    printf("  >>> DRY RUN :: use to estimate ops outside the actual calculations\n" );
+
+  
+  if ( case_to_run > 0 )
+    {
+      switch( case_to_run )
+	{
+	case 1: {
+	  // allocate array of big structures
+	  //
+	  P = (P_t    *)aligned_alloc(VALIGN,   N * sizeof(P_t)); } break;
+	  
+	case 2:
+	case 3: {      
+	  // allocate array of small structures (vectors)
+	  //
+	  vpos  = (v4df   *)aligned_alloc(VALIGN,   N * sizeof(v4df)); } break;
+	  
+	case 4:
+	case 5: {
+	  // allocate arrays
+	  //
+	  pos_x  = (double *)aligned_alloc(VALIGN, N*sizeof(double));
+	  pos_y  = (double *)aligned_alloc(VALIGN, N*sizeof(double));
+	  pos_z  = (double *)aligned_alloc(VALIGN, N*sizeof(double));
+	  mass   = (double *)aligned_alloc(VALIGN, N*sizeof(double));
+	}break;
+	}
+
+      memset( (void*)force, 0, N*3*sizeof(double));
+    }
+  
+  // initialize particle data
+  //
+    {
+      if ( ! from_file ) {
+	if ( seed == -1 )
+	  srand48(time(NULL));
+	else
+	  srand48( seed ); }
+
+      for ( int i = 0; i < N; i++ )
+	{
+	  double data[4] __attribute__((aligned(VALIGN))); 
+	  if ( ! from_file )
+	    for ( int j = 0; j < 4; j++ )
+	      data[j] = drand48();
+	  else
+	    {
+	      size_t ret = fread ( data, 4, sizeof(double), infile );
+	    }
+
+	  if ( case_to_run >= 0 )	    
+	    switch( case_to_run )
+	      {
+	      case 0:{
+		size_t ret = fwrite( data, sizeof(double), 4, outfile ); } break;
+		
+	      case 1: {		
+		P[i].pos[0] = data[0];
+		P[i].pos[1] = data[1];
+		P[i].pos[2] = data[2];
+		P[i].mass   = data[3]; } break;
+		
+	      case 2:
+	      case 3: {
+		v4df_u * _vpos_ = (v4df_u*)vpos;
+		for ( int j = 0; j < 4; j++ )
+		  _vpos_[i].p[j] = data[j]; } break;
+		
+	      case 4:
+	      case 5:{ 
+		pos_x[i] = data[0];
+		pos_y[i] = data[1];
+		pos_z[i] = data[2];
+		mass [i] = data[3]; } break;
+	      }
+	}
+    }
+
+  if ( dry_run ) {
+    printf("dry run: going to clean up\n");
+    goto clean; }
+  
+  /* ------------------------------------------------------
+   *
+   * simulate interactions
+   *
+   * ------------------------------------------------------ */
+
+  // determine how many particles will be active, from
+  // a minimum of 1/100-ed to a maximum of 1/100+1/10
+  //
+
+  int Nactive;
+  int AvgNneighbours;
+  
+  if ( case_to_run >= 0 && !from_file )
+    {
+      Nactive  = N/100 + (int)lrand48() % (N/10);  // 11% of particle will be processed at max, 1% as minimum
+      if ( case_to_run == 0 ) {
+	size_t ret = fwrite( &Nactive, sizeof(int), 1, outfile );}
+    }
+  else {
+    size_t ret = fread( &Nactive, sizeof(int), 1, infile ); }
+
+  AvgNneighbours = ( NNEIGHBOURS > N*0.1 ? (int)(N*0.1) : NNEIGHBOURS );
+  ngb_list = (int*)calloc( AvgNneighbours * 1.2, sizeof(int) );
+
+  PAPI_INIT;
+  
+  printf ( " *  %d particles will be active\n", Nactive );
+
+  double timing   = 0;
+  double intiming = 0;
+  
+  void *mem = NULL;
+  void *mems[4] = {NULL};
+  switch ( case_to_run )
+    {
+    case 1: mem = aligned_alloc(VALIGN, AvgNneighbours*1.2 * sizeof(P_t)); break;
+    case 2: 
+    case 3: mem = aligned_alloc(VALIGN, AvgNneighbours*1.2 * sizeof(v4df)); break;
+    case 4:
+    case 5: {
+      mems[0] = aligned_alloc(VALIGN, AvgNneighbours*1.2 * sizeof(double));
+      mems[1] = aligned_alloc(VALIGN, AvgNneighbours*1.2 * sizeof(double));
+      mems[2] = aligned_alloc(VALIGN, AvgNneighbours*1.2 * sizeof(double));
+      mems[3] = aligned_alloc(VALIGN, AvgNneighbours*1.2 * sizeof(double)); } break;
+    }
+      
+  // loop over active particles
+  //
+  for ( int j = 0; j < Nactive; j++ )
+    {
+      // determine which one will be active
+      // of course it is not important which one will be,
+      // nor is important that we do not pick the same
+      // particle more than once in the loop
+      // (shouldn't happen, though, because the
+      // random repetition lenght is much larger)
+      //
+      int target, Nneighbours;
+
+      if ( !from_file )
+	{
+	  target      = (int)lrand48() % (N-Nrepetitions);                              // the index of the to-be-processed particle
+	  
+	  // determine how many will be the neighbours
+	  // ( same comment than before apply)
+	  Nneighbours = AvgNneighbours + (int)mrand48()%(AvgNneighbours/10) ;  // with how many particles it will interact
+	  //   Nneighbours +- 10%
+	  
+	  
+	  // decide which will be the neighbours
+	  int upperlimit = target+Nrepetitions;
+	  for ( int i = 0; i < Nneighbours; i++ )
+	    {
+	      int draw = (int)lrand48() % N;
+	      if ( (draw >= target) && (draw <= upperlimit ) )
+		draw = upperlimit+1;
+	      ngb_list[i] = draw;
+	    }
+
+	  if ( case_to_run == 0 )
+	    {
+	      /*
+		printf("iter %d target %d has %u neighbours\n", j,
+		target, (unsigned)Nneighbours );
+	      */
+	      size_t ret;
+	      ret = fwrite( &target, sizeof(int), 1, outfile );
+	      ret = fwrite( &Nneighbours, sizeof(int), 1, outfile );
+	      ret = fwrite( &ngb_list[0], sizeof(int), Nneighbours, outfile );
+	    }
+	  
+	}
+      else 
+	{
+	  size_t ret;
+	  ret = fread( &target, sizeof(int), 1, infile );
+	  ret = fread( &Nneighbours, sizeof(int), 1, infile );
+	  ret = fread( ngb_list, sizeof(int), Nneighbours, infile );
+	}
+
+      double myforce[3] = {0};
+      double now;
+      
+      // now, call the force evaluation with the different implementations
+      //
+      
+      if ( case_to_run > 0 ) {
+
+	double this_timing       = 1e10;
+	double this_inner_timing = 1e10;
+	for ( int shot = 0; shot <= Nrepetitions; shot++ ) {
+
+	  // shot = = iteration is used as a warm-up for the neighbours walk
+	  int mytarget = (target+shot)%N;
+	 
+	  if ( shot >= 1 )
+	    now = CPU_TIME;
+	  if ( shot == 1 )
+	    PAPI_START_CNTR;
+
+	  double inner_timing;
+	  switch ( case_to_run )
+	    {	  
+	      // ------------------------------------------------
+	      //
+	    case 1: {
+	      process_with_structure( P, mem, mytarget, ngb_list, Nneighbours, myforce, &inner_timing );
+	    } break;
+	    
+	      // ------------------------------------------------
+	      //     
+	    case 2: {
+	      process_with_vectors( vpos, mem, mytarget, ngb_list, Nneighbours, myforce, &inner_timing );
+	    } break;
+	    
+	      // ------------------------------------------------
+	      //
+	      
+	    case 3: {
+	      process_with_vectors_intrinsics( (double*)vpos, mem, mytarget, ngb_list, Nneighbours, myforce, &inner_timing );
+	    } break;
+	    
+	      // ------------------------------------------------
+	      //
+
+	    case 4: {
+	      process_with_arrays( pos_x, pos_y, pos_z, mass, mems,
+				   mytarget, ngb_list, Nneighbours, myforce, &inner_timing );
+	    } break;
+
+	      // ------------------------------------------------
+	      //
+	    case 5: {
+	      process_with_arrays_vectors( pos_x, pos_y, pos_z, mass, mems,
+					   mytarget, ngb_list, Nneighbours, myforce, &inner_timing );
+	    } break;
+	    }
+
+	  if (shot >= 1 ) {
+	    double chrono = CPU_TIME - now;
+	    if ( chrono < this_timing )
+	      this_timing = chrono;
+	    if ( inner_timing < this_inner_timing )
+	      this_inner_timing = inner_timing;
+	  }
+
+	  if ( shot == 1 )
+	    PAPI_STOP_CNTR;	    
+	  	  
+	}
+
+	//printf("%d t%d %g %g %g\n", j, target, myforce[0], myforce[1], myforce[2]);
+	
+	timing += this_timing;
+	intiming += this_inner_timing;
+      }
+
+      if ( case_to_run > 0 ) {
+       #if !defined(NDEBUG)
+       #warning "you're compiling with assert"
+       #endif
+	int offset = target*3;
+	for ( int i = 0; i < 3; i++ ) {
+	  assert( myforce[i] != 0 );
+	  assert( !isnan(myforce[i]) );
+	  assert( !isinf(myforce[i]) );
+	  force[offset+i] = myforce[i] / (Nrepetitions+1);
+	}
+      }
+    }
+
+  if ( infile != NULL )
+    fclose(infile);
+  if ( outfile != NULL )
+    fclose(outfile);
+
+  if ( case_to_run > 0 )
+    {
+      
+      printf(" +  timing and inner_timing for case %d \"%s\" : %g %g\n",
+	     case_to_run, implementation_labels[case_to_run],
+	     timing, intiming );
+      
+      // just to trick the compiler, so that it does not optimize out
+      // pointless calculations
+      //
+
+      char  filename[100];
+      sprintf( filename, "force2b.%d.out", case_to_run );
+      FILE *output = fopen( filename, "w" );
+      if( output != NULL ) {
+	fwrite( &N, sizeof(int), 1, output);
+	fwrite( force, sizeof(double), N*3, output );
+	fclose(output); }
+      else
+	printf(">>> wow, I was unable to create stupid file\n");
+
+      free ( force    );
+      if ( !dry_run)
+	free ( ngb_list );
+  
+    }
+
+ clean:
+  if ( mem != NULL )
+    free ( mem );
+  if ( mems[0] != NULL )
+    for ( int i = 0; i < 4; i++ )
+      free ( mems[i] );
+  switch ( case_to_run )
+    {
+    case 1: {
+      free ( P ); } break;
+    case 2:
+    case 3: {
+      free ( vpos ); } break;
+    case 4:
+    case 5: {
+      free ( pos_x ), free ( pos_y ), free ( pos_z ), free ( mass ); } break;
+    }
+
+ #if defined(USE_PAPI)
+  if ( case_to_run > 0 )
+    for ( int i = 0; i < PAPI_EVENTS_NUM; i++ )
+      printf("PAPI event %d: %llu\n",
+	     i, (unsigned long long)papi_values[i]);
+ #endif
+
+  
+  
+  return 0;
+}