Commit 1d708577 authored by Luca Tornatore

finalized one-sided communication in shared-memory

parent a22b9970
@@ -7,7 +7,9 @@ struct ip in;
struct op out, outparam;
struct meta metaData;
struct time timing;
timing_t wt_timing = {0};
timing_t pr_timing = {0};
struct parameter param;
struct fileData data;
@@ -16,6 +18,7 @@ char datapath[900];
int xaxis, yaxis;
int global_rank;
int size;
int verbose_level = 0;
long nsectors;
long startrow;
double resolution, dx, dw, w_supporth;
......
@@ -92,13 +92,24 @@ extern struct meta
} metaData;
extern struct time
{
double setup_time, process_time, mpi_time, fftw_time, tot_time, kernel_time, reduce_time, compose_time, phase_time;
double setup_time1, process_time1, mpi_time1, fftw_time1, tot_time1, kernel_time1, reduce_time1, compose_time1, phase_time1;
double writetime, writetime1;
} timing;
typedef struct {
double setup;      // time spent in initialization, init()
double init;       // time spent in initializing arrays
double process;    // time spent in gridding
double mpi;        // time spent in mpi communications
double fftw;       // time spent in the fft
double kernel;     // time spent in the gridding kernel (wstack)
double mmove;      // time spent in memory movements
double reduce;     // time spent in the whole reduce
double reduce_mpi; // time spent in the inter-host mpi reduce
double reduce_sh;  // time spent in the intra-host shared-memory reduce
double compose;    // time spent composing the sector arrays
double phase;      // time spent in the phase correction
double write;      // time spent writing data
double total;      // total elapsed time
} timing_t;
extern timing_t wt_timing; // wall-clock timings
extern timing_t pr_timing; // process CPU timing
extern struct parameter
{
@@ -127,12 +138,11 @@ extern char datapath[900];
extern int xaxis, yaxis;
extern int global_rank;
extern int size;
extern int verbose_level;
extern long nsectors;
extern long startrow;
extern double resolution, dx, dw, w_supporth;
extern clock_t start, end, start0, startk, endk;
extern struct timespec begin, finish, begin0, begink, finishk;
extern long * histo_send, size_of_grid;
extern double * grid, *gridss, *gridss_real, *gridss_img, *gridss_w;
@@ -142,3 +152,60 @@ extern double * grid, *gridss, *gridss_real, *gridss_img, *gridss_w;
#endif
extern long **sectorarray;
#if defined(DEBUG)
#define dprintf(LEVEL, T, t, ...) if( (verbose_level >= (LEVEL)) && \
( ((t) ==-1 ) || ((T)==(t)) ) ) { \
printf(__VA_ARGS__); fflush(stdout); }
#else
#define dprintf(...)
#endif
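For illustration, two call sites of the macro above (the first is taken from this commit, the second is hypothetical): dprintf prints only when verbose_level is at least LEVEL, and only on the task whose rank T matches t, with t == -1 disabling the task filter.

    dprintf(1, global_rank, 0, "reducing sector %ld..\n", isector);   /* task 0 only, verbose_level >= 1 */
    dprintf(2, global_rank, -1, "local histo %ld\n", histo_send[0]);  /* every task, verbose_level >= 2  */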
#define CPU_TIME_wt ({ struct timespec myts; (clock_gettime( CLOCK_REALTIME, &myts ), (double)myts.tv_sec + (double)myts.tv_nsec * 1e-9);})
#define CPU_TIME_pr ({ struct timespec myts; (clock_gettime( CLOCK_PROCESS_CPUTIME_ID, &myts ), (double)myts.tv_sec + (double)myts.tv_nsec * 1e-9);})
#define CPU_TIME_th ({ struct timespec myts; (clock_gettime( CLOCK_THREAD_CPUTIME_ID, &myts ), (double)myts.tv_sec + (double)myts.tv_nsec * 1e-9);})
#if defined(_OPENMP)
#define TAKE_TIME_START( T ) { \
wt_timing.T = CPU_TIME_wt; \
pr_timing.T = CPU_TIME_pr; }
#define TAKE_TIME_STOP( T ) { \
pr_timing.T = CPU_TIME_pr - pr_timing.T; \
wt_timing.T = CPU_TIME_wt - wt_timing.T; }
#define TAKE_TIME( Twt, Tpr ) { Twt = CPU_TIME_wt; Tpr = CPU_TIME_pr; }
#define ADD_TIME( T, Twt, Tpr ) { \
pr_timing.T += CPU_TIME_pr - Tpr; \
wt_timing.T += CPU_TIME_wt - Twt; \
Tpr = CPU_TIME_pr; Twt = CPU_TIME_wt; }
#else
#define TAKE_TIME_START( T ) wt_timing.T = CPU_TIME_wt
#define TAKE_TIME_STOP( T ) wt_timing.T = CPU_TIME_wt - wt_timing.T
#define TAKE_TIME( Twt, ... ) Twt = CPU_TIME_wt;
#define ADD_TIME( T, Twt, ... ) { wt_timing.T += CPU_TIME_wt - Twt; Twt = CPU_TIME_wt;}
#endif
#define TAKE_TIMEwt_START( T) wt_timing.T = CPU_TIME_wt
#define TAKE_TIMEwt_STOP( T) wt_timing.T = CPU_TIME_wt - wt_timing.T
#define TAKE_TIMEwt( Twt ) Twt = CPU_TIME_wt;
#define ADD_TIMEwt( T, Twt ) { wt_timing.T += CPU_TIME_wt - Twt; Twt = CPU_TIME_wt; }
#if defined(__GNUC__) && !defined(__ICC) && !defined(__INTEL_COMPILER)
#define PRAGMA_IVDEP _Pragma("GCC ivdep")
#else
#define PRAGMA_IVDEP _Pragma("ivdep")
#endif
#define STRINGIFY(a) #a
#define UNROLL(N) _Pragma(STRINGIFY(unroll(N)))
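As a usage sketch of the timing macros above (the enclosing function and kernel are hypothetical, not code from this commit): TAKE_TIME_START/TAKE_TIME_STOP bracket a whole phase and store its duration in the named timing_t field, while TAKE_TIME/ADD_TIME accumulate per-iteration costs; under _OPENMP both wall-clock and process-CPU times are recorded, otherwise only wall-clock.

    extern void heavy_kernel(int i);      /* hypothetical work function */

    void do_phase(void)
    {
        double twt, tpr;
        TAKE_TIME_START(process);         /* wt_timing.process = clock now */
        for (int i = 0; i < 10; i++)
        {
            TAKE_TIME(twt, tpr);          /* snapshot before the kernel */
            heavy_kernel(i);
            ADD_TIME(kernel, twt, tpr);   /* wt_timing.kernel += elapsed, re-snapshot */
        }
        TAKE_TIME_STOP(process);          /* wt_timing.process = now - start */
    }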
ndatasets 1
Datapath1 /beegfs/lofar/cgheller/L798046_SB244_uv.uncorr_130B27932t_121MHz.pre-cal.binMS/
Datapath2 /beegfs/lofar/cgheller/L798046_SB244_uv.uncorr_130B27932t_123MHz.pre-cal.binMS/
Datapath3 /beegfs/lofar/cgheller/L798046_SB244_uv.uncorr_130B27932t_125MHz.pre-cal.binMS/
Datapath1 ../input/
Datapath2 ../input/
Datapath3 ../input/
num_threads 2
w_support 7
grid_size_x 2048
@@ -24,3 +24,4 @@ fftfile3 fft_img.bin
logfile run.log
extension .txt
timingfile timings.dat
verbose_level 1
@@ -2,13 +2,15 @@
#include "allvars.h"
#include "proto.h"
void fftw_data(){
void fftw_data()
{
#ifdef USE_FFTW
// FFT transform the data (using distributed FFTW)
if(global_rank == 0)printf("PERFORMING FFT\n");
clock_gettime(CLOCK_MONOTONIC, &begin);
start = clock();
TAKE_TIME_START(fftw);
fftw_plan plan;
fftw_complex *fftwgrid;
ptrdiff_t alloc_local, local_n0, local_0_start;
@@ -64,21 +66,23 @@ void fftw_data(){
MPI_Barrier(MPI_COMM_WORLD);
#endif
end = clock();
clock_gettime(CLOCK_MONOTONIC, &finish);
timing.fftw_time = ((double) (end - start)) / CLOCKS_PER_SEC;
timing.fftw_time1 = (finish.tv_sec - begin.tv_sec);
timing.fftw_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
clock_gettime(CLOCK_MONOTONIC, &begin);
TAKE_TIME_STOP(fftw);
#endif
}
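For context, fftw_data() builds on FFTW's slab-decomposed MPI transforms; the core pattern looks roughly like the sketch below (a minimal sketch assuming the fftw3-mpi API; the transform sign and flags are placeholders, and fftw_mpi_init() is assumed to have been called after MPI_Init()).

    #include <fftw3-mpi.h>

    /* n0 x n1 is the global grid; each rank owns local_n0 contiguous rows */
    void fft_slab_sketch(ptrdiff_t n0, ptrdiff_t n1)
    {
        ptrdiff_t local_n0, local_0_start;
        ptrdiff_t alloc_local = fftw_mpi_local_size_2d(n0, n1, MPI_COMM_WORLD,
                                                       &local_n0, &local_0_start);
        fftw_complex *fftwgrid = fftw_alloc_complex(alloc_local);
        fftw_plan plan = fftw_mpi_plan_dft_2d(n0, n1, fftwgrid, fftwgrid,
                                              MPI_COMM_WORLD, FFTW_BACKWARD,
                                              FFTW_ESTIMATE);
        /* ... fill rows [local_0_start, local_0_start + local_n0) of the slab ... */
        fftw_execute(plan);
        fftw_destroy_plan(plan);
        fftw_free(fftwgrid);
    }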
void write_fftw_data(){
// Write results
#ifdef USE_FFTW
double twt, tpr;
#ifdef WRITE_DATA
// Write results
TAKE_TIME(twt, tpr);
#ifdef USE_MPI
MPI_Win writewin;
MPI_Win_create(gridss, size_of_grid*sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &writewin);
@@ -150,27 +154,28 @@ void write_fftw_data(){
MPI_Win_free(&writewin);
MPI_Barrier(MPI_COMM_WORLD);
#endif
ADD_TIME(write, twt, tpr);
#endif //WRITE_DATA
// Phase correction
clock_gettime(CLOCK_MONOTONIC, &begin);
start = clock();
TAKE_TIME_START(phase);
if(global_rank == 0)printf("PHASE CORRECTION\n");
double* image_real = (double*) calloc(xaxis*yaxis,sizeof(double));
double* image_imag = (double*) calloc(xaxis*yaxis,sizeof(double));
phase_correction(gridss,image_real,image_imag,xaxis,yaxis,param.num_w_planes,param.grid_size_x,param.grid_size_y,resolution,metaData.wmin,metaData.wmax,param.num_threads);
end = clock();
clock_gettime(CLOCK_MONOTONIC, &finish);
timing.phase_time = ((double) (end - start)) / CLOCKS_PER_SEC;
timing.phase_time1 = (finish.tv_sec - begin.tv_sec);
timing.phase_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
TAKE_TIME_STOP(phase);
#ifdef WRITE_IMAGE
TAKE_TIME(twt, tpr);
if(global_rank == 0)
{
file.pFilereal = fopen (out.fftfile2,"wb");
@@ -208,6 +213,7 @@ void write_fftw_data(){
MPI_Barrier(MPI_COMM_WORLD);
#endif
ADD_TIME(write, twt, tpr);
#endif //WRITE_IMAGE
#endif //FFTW
......
@@ -4,35 +4,35 @@
#include "proto.h"
void gridding(){
void gridding()
{
if(global_rank == 0)printf("GRIDDING DATA\n");
// Create histograms and linked lists
clock_gettime(CLOCK_MONOTONIC, &begin);
start = clock();
TAKE_TIME_START(init);
// Initialize linked list
initialize_array();
TAKE_TIME_STOP(init);
TAKE_TIME_START(process);
// Sector and Gridding data
gridding_data();
TAKE_TIME_STOP(process);
#ifdef USE_MPI
MPI_Barrier(MPI_COMM_WORLD);
#endif
end = clock();
clock_gettime(CLOCK_MONOTONIC, &finish);
timing.process_time = ((double) (end - start)) / CLOCKS_PER_SEC;
timing.process_time1 = (finish.tv_sec - begin.tv_sec);
timing.process_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
clock_gettime(CLOCK_MONOTONIC, &begin);
return;
}
void initialize_array(){
void initialize_array()
{
histo_send = (long*) calloc(nsectors+1,sizeof(long));
int * boundary = (int*) calloc(metaData.Nmeasures,sizeof(int));
@@ -79,13 +79,16 @@ void initialize_array(){
#ifdef VERBOSE
for (int iii=0; iii<nsectors+1; iii++)printf("HISTO %d %d %ld\n", global_rank, iii, histo_send[iii]);
#endif
free( boundary);
return;
}
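In outline, initialize_array() amounts to a counting pass over the measurements followed by a fill pass that records, per sector, the indices belonging to it. A simplified sketch (sector_of() is a hypothetical stand-in for the code's v-coordinate-to-sector mapping; error handling omitted):

    #include <stdlib.h>

    extern long sector_of(double v);   /* hypothetical sector mapping */

    long **build_sector_lists(long nsectors, long nmeas, const double *vv,
                              long **histo_out)
    {
        long  *histo   = calloc(nsectors + 1, sizeof(long));
        long  *fill    = calloc(nsectors + 1, sizeof(long));
        long **sectors = malloc((nsectors + 1) * sizeof(long *));
        for (long i = 0; i < nmeas; i++)       /* pass 1: count per sector */
            histo[sector_of(vv[i])]++;
        for (long s = 0; s <= nsectors; s++)   /* allocate each index list */
            sectors[s] = malloc(histo[s] * sizeof(long));
        for (long i = 0; i < nmeas; i++)       /* pass 2: record indices */
        {
            long s = sector_of(vv[i]);
            sectors[s][fill[s]++] = i;
        }
        free(fill);
        *histo_out = histo;
        return sectors;
    }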
void gridding_data(){
void gridding_data()
{
double shift = (double)(dx*yaxis);
// Open the MPI Memory Window for the slab
#ifdef USE_MPI
MPI_Win_create(grid, size_of_grid*sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &slabwin);
MPI_Win_fence(0,slabwin);
@@ -95,24 +98,20 @@ void gridding_data(){
file.pFile1 = fopen (out.outfile1,"w");
#endif
timing.kernel_time = 0.0;
timing.kernel_time1 = 0.0;
timing.reduce_time = 0.0;
timing.reduce_time1 = 0.0;
timing.compose_time = 0.0;
timing.compose_time1 = 0.0;
// calculate the resolution in radians
resolution = 1.0/MAX(fabs(metaData.uvmin),fabs(metaData.uvmax));
// calculate the resolution in arcsec
double resolution_asec = (3600.0*180.0)/MAX(fabs(metaData.uvmin),fabs(metaData.uvmax))/PI;
if( global_rank == 0 )
printf("RESOLUTION = %f rad, %f arcsec\n", resolution, resolution_asec);
for (long isector = 0; isector < nsectors; isector++)
{
clock_gettime(CLOCK_MONOTONIC, &begink);
startk = clock();
double twt, tpr;
TAKE_TIME(twt, tpr);
// define local destination sector
//isector = (isector_count+rank)%size; // this line must be wrong! [LT]
@@ -132,6 +131,7 @@ void gridding_data(){
long ip = 0;
long inu = 0;
#warning shall we omp-ize this ?
for(long iphi = histo_send[isector]-1; iphi>=0; iphi--)
{
long ilocal = sectorarray[isector][iphi];
@@ -141,33 +141,32 @@ void gridding_data(){
uus[icount] = data.uu[ilocal];
vvs[icount] = data.vv[ilocal]-isector*shift;
wws[icount] = data.ww[ilocal];
for (long ipol=0; ipol<metaData.polarisations; ipol++)
UNROLL(4)
PRAGMA_IVDEP
for (long ipol=0; ipol<metaData.polarisations; ipol++, ip++)
{
weightss[ip] = data.weights[ilocal*metaData.polarisations+ipol];
ip++;
}
for (long ifreq=0; ifreq<metaData.polarisations*metaData.freq_per_chan; ifreq++)
PRAGMA_IVDEP
UNROLL(4)
for (long ifreq=0; ifreq<metaData.polarisations*metaData.freq_per_chan; ifreq++, inu++)
{
visreals[inu] = data.visreal[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq];
visimgs[inu] = data.visimg[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq];
//if(visimgs[inu]>1e10 || visimgs[inu]<-1e10)printf("%f %f %ld %ld %d %ld %ld\n",visreals[inu],visimgs[inu],inu,Nvissec,rank,ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq,metaData.Nvis);
inu++;
}
icount++;
}
clock_gettime(CLOCK_MONOTONIC, &finishk);
endk = clock();
timing.compose_time += ((double) (endk - startk)) / CLOCKS_PER_SEC;
timing.compose_time1 += (finishk.tv_sec - begink.tv_sec);
timing.compose_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0;
ADD_TIME(compose, twt, tpr);
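One possible answer to the #warning above (a sketch, not part of the commit): because ip and inu advance in lockstep with icount, every destination index can be computed from the loop counter, which makes the composition loop safely parallel:

    long nphi = histo_send[isector];
    #pragma omp parallel for
    for (long k = 0; k < nphi; k++)        /* k plays the role of icount */
    {
        long ilocal = sectorarray[isector][nphi - 1 - k];  /* same traversal order */
        long npol   = metaData.polarisations;
        long nfreq  = npol * metaData.freq_per_chan;
        uus[k] = data.uu[ilocal];
        vvs[k] = data.vv[ilocal] - isector * shift;
        wws[k] = data.ww[ilocal];
        for (long ipol = 0; ipol < npol; ipol++)
            weightss[k * npol + ipol] = data.weights[ilocal * npol + ipol];
        for (long ifreq = 0; ifreq < nfreq; ifreq++)
        {
            visreals[k * nfreq + ifreq] = data.visreal[ilocal * nfreq + ifreq];
            visimgs[k * nfreq + ifreq]  = data.visimg[ilocal * nfreq + ifreq];
        }
    }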
#ifndef USE_MPI
double vvmin = 1e20;
double uumax = -1e20;
double vvmax = -1e20;
#warning shall we omp-ize this ?
for (long ipart=0; ipart<Nsec; ipart++)
{
uumin = MIN(uumin,uus[ipart]);
@@ -186,8 +185,7 @@ void gridding_data(){
#ifdef VERBOSE
printf("Processing sector %ld\n",isector);
#endif
clock_gettime(CLOCK_MONOTONIC, &begink);
startk = clock();
TAKE_TIME(twt, tpr);
wstack(param.num_w_planes,
Nsec,
@@ -207,6 +205,8 @@ void gridding_data(){
gridss,
param.num_threads);
ADD_TIME(kernel, twt, tpr);
/* int z =0 ;
* #pragma omp target map(to:test_i_gpu) map(from:z)
* {
@@ -215,42 +215,49 @@ void gridding_data(){
* z = x + test_i_gpu;
* }*/
clock_gettime(CLOCK_MONOTONIC, &finishk);
endk = clock();
timing.kernel_time += ((double) (endk - startk)) / CLOCKS_PER_SEC;
timing.kernel_time1 += (finishk.tv_sec - begink.tv_sec);
timing.kernel_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0;
#ifdef VERBOSE
printf("Processed sector %ld\n",isector);
#endif
clock_gettime(CLOCK_MONOTONIC, &begink);
startk = clock();
/* ----------------
* REDUCE
* ---------------- */
//for (long iii=0; iii<2*xaxis*yaxis*num_w_planes; iii++)printf("--> %f\n",gridss[iii]);
double twt_r, tpr_r;
TAKE_TIME(twt_r, tpr_r);
#ifndef USE_MPI
long stride = isector*2*xaxis*yaxis*num_w_planes;
for (long iii=0; iii<2*xaxis*yaxis*num_w_planes; iii++)
// ..................
#ifndef USE_MPI // REDUCE WITH NO MPI
#pragma omp parallel
{
long stride = isector * size_of_grid;
#pragma omp for
for (long iii=0; iii< size_of_grid; iii++)
gridtot[stride+iii] = gridss[iii];
#endif
}
// ..................
// REDUCE WITH MPI
#else
// Write grid in the corresponding remote slab
#ifdef USE_MPI
// int target_rank = (int)isector; this implied that size >= nsectors
int target_rank = (int)(isector % size);
#ifdef ONE_SIDE
// MPI_Win_lock(MPI_LOCK_SHARED,target_rank,0,slabwin);
// MPI_Accumulate(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,MPI_SUM,slabwin);
// MPI_Win_unlock(target_rank,slabwin);
// for every task, gridss is copied into its own segment of the shared window;
// the copy can be avoided if the shared window coincides with gridss
TAKE_TIME(twt, tpr);
memcpy(Me.win.ptr+isector*size_of_grid, gridss, size_of_grid*sizeof(double));
ADD_TIME(mmove, twt, tpr);
dprintf(1, global_rank, 0, "reducing sector %ld..\n", isector);
TAKE_TIME( twt, tpr);
reduce( isector, target_rank ); // here the reduce is performed within every host
ADD_TIME(reduce_sh, twt, tpr);
if ( Me.Nhosts > 1 )
{
@@ -272,6 +279,7 @@ void gridding_data(){
if( Sector_Comm != MPI_COMM_NULL )
{
double _twt_;
int sector_size;
int sector_rank = 0;
int sector_target;
@@ -281,7 +289,10 @@ void gridding_data(){
if ( global_rank == target_rank)
{
MPI_Send( &sector_rank, 1, MPI_INT, 0, 0, Sector_Comm);
memcpy(grid, Me.swins[Me.Rank[myHOST]].ptr+isector*size_of_grid*sizeof(double), size_of_grid * sizeof(double));
TAKE_TIMEwt( _twt_ );
memcpy(gridss, Me.swins[Me.Rank[myHOST]].ptr+isector*size_of_grid*sizeof(double),
size_of_grid * sizeof(double));
ADD_TIMEwt( mmove, _twt_);
}
if( sector_rank == 0 )
@@ -290,29 +301,35 @@ void gridding_data(){
MPI_Recv( &sector_target, 1, MPI_INT, MPI_ANY_SOURCE, 0, Sector_Comm, &status);
}
TAKE_TIMEwt(_twt_);
MPI_Bcast( &sector_target, 1, MPI_INT, 0, Sector_Comm );
MPI_Reduce(grid, grid, size_of_grid, MPI_DOUBLE,MPI_SUM, sector_target, Sector_Comm);
MPI_Reduce(gridss, grid, size_of_grid, MPI_DOUBLE,MPI_SUM, sector_target, Sector_Comm);
MPI_Comm_free( &Sector_Comm );
ADD_TIMEwt(mpi, _twt_);
}
}
ADD_TIME(reduce_mpi, twt, tpr);
#else // relates to #ifdef ONE_SIDE
{
double _twt_;
TAKE_TIMEwt(_twt_);
MPI_Reduce(gridss,grid,size_of_grid,MPI_DOUBLE,MPI_SUM,target_rank,MPI_COMM_WORLD);
ADD_TIMEwt(mpi, _twt_);
}
#endif // closes #ifdef ONE_SIDE
#endif // closes USE_MPI
clock_gettime(CLOCK_MONOTONIC, &finishk);
endk = clock();
timing.reduce_time += ((double) (endk - startk)) / CLOCKS_PER_SEC;
timing.reduce_time1 += (finishk.tv_sec - begink.tv_sec);
timing.reduce_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0;
// Go to next sector
for (long inull=0; inull<2*param.num_w_planes*xaxis*yaxis; inull++)gridss[inull] = 0.0;
ADD_TIME(reduce, twt_r, tpr_r);
// wipe before getting to the next sector
memset((void*)gridss, 0, size_of_grid * sizeof(double));
// Deallocate all sector arrays
free(uus);
@@ -324,25 +341,18 @@ void gridding_data(){
// End of loop over sector
}
// Finalize MPI communication
#ifdef USE_MPI
MPI_Win_fence(0,slabwin);
#endif
free( histo_send );
#ifndef USE_MPI
fclose(file.pFile1);
#endif
#ifdef USE_MPI
MPI_Win_fence(0,slabwin);
MPI_Barrier(MPI_COMM_WORLD);
#endif
end = clock();
clock_gettime(CLOCK_MONOTONIC, &finish);
timing.process_time = ((double) (end - start)) / CLOCKS_PER_SEC;
timing.process_time1 = (finish.tv_sec - begin.tv_sec);
timing.process_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
clock_gettime(CLOCK_MONOTONIC, &begin);
}
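For reference, the pure one-sided alternative commented out at the top of the reduce section would, spelled out, follow the standard MPI-3 passive-target idiom (a sketch using the commit's own slabwin window and buffers):

    /* accumulate this task's partial grid into the slab owned by target_rank */
    MPI_Win_lock(MPI_LOCK_SHARED, target_rank, 0, slabwin);
    MPI_Accumulate(gridss, size_of_grid, MPI_DOUBLE,         /* origin buffer */
                   target_rank, 0, size_of_grid, MPI_DOUBLE, /* target slab   */
                   MPI_SUM, slabwin);
    MPI_Win_unlock(target_rank, slabwin);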
void write_grided_data()
@@ -355,6 +365,7 @@ void write_grided_data()
printf("WRITING GRIDDED DATA\n");
file.pFilereal = fopen (out.outfile2,"wb");
file.pFileimg = fopen (out.outfile3,"wb");
#ifdef USE_MPI
for (int isector=0; isector<nsectors; isector++)
{
@@ -377,6 +388,7 @@ void write_grided_data()
fseek(file.pFilereal, global_index, SEEK_SET);
fwrite(&gridss_real[index], 1, sizeof(double), file.pFilereal);
}
for (int iw=0; iw<param.num_w_planes; iw++)
for (int iv=0; iv<yaxis; iv++)
for (int iu=0; iu<xaxis; iu++)
@@ -424,7 +436,6 @@ void write_grided_data()
#endif
#endif //WRITE_DATA
}
@@ -434,6 +445,7 @@ void reduce( int sector, int target_rank )
{
int local_rank = Me.Rank[myHOST];
int target_rank_on_myhost = -1;
if( Me.Ranks_to_host[ target_rank ] == Me.myhost )
// exchange rank 0 with target rank
@@ -442,16 +454,32 @@ void reduce( int sector, int target_rank )
// every target rank
{
int r = 0;
while( Me.Ranks_to_myhost[r] != target_rank )
r++;
target_rank_on_myhost = 0;
while( Me.Ranks_to_myhost[target_rank_on_myhost] != target_rank )
target_rank_on_myhost++;
if( r > 0 )
dprintf(2, Me.Rank[myHOST], 0,
"[SEC %d] swapping Host master with target rank %d (%d)\n",
sector, target_rank, target_rank_on_myhost);
if( target_rank_on_myhost > 0 )
// the target is not the task that already has rank 0
// on my host
{
if( local_rank == 0 )
local_rank = r;
if( local_rank == r )
local_rank = target_rank_on_myhost;
else if( local_rank == target_rank_on_myhost )
local_rank = 0;
win_t temp = Me.swins[target_rank_on_myhost];
Me.swins[target_rank_on_myhost] = Me.swins[0];
Me.swins[0] = temp;
temp = Me.scwins[target_rank_on_myhost];
Me.scwins[target_rank_on_myhost] = Me.scwins[0];
Me.scwins[0] = temp;
}
}
@@ -470,6 +498,10 @@ void reduce( int sector, int target_rank )
if( local_rank % threshold == 0)
{
int source = local_rank + (1<<l);
dprintf(2, 0, 0,
"[SEC %d] task %d (%d) getting data from task %d at level %d\n",
sector, local_rank, Me.Rank[myHOST], source, l );
while( *(int*)(Me.scwins[source].ptr) < l )
// sleep 5 usec if the source target is not ready
NSLEEP( 5000 );
@@ -481,8 +513,25 @@ void reduce( int sector, int target_rank )
*(int*)(Me.win_ctrl.ptr) = l;
}
else
{
dprintf(2, 0, 0,
"[SEC %d] task %d (%d) signaling that level %d is done\n",
sector, local_rank, Me.Rank[myHOST], l );
*(int*)(Me.win_ctrl.ptr) = l;
}
}
if ( target_rank_on_myhost > 0 )
{
win_t temp = Me.swins[target_rank_on_myhost];
Me.swins[target_rank_on_myhost] = Me.swins[0];
Me.swins[0] = temp;
temp = Me.scwins[target_rank_on_myhost];
Me.scwins[target_rank_on_myhost] = Me.scwins[0];
Me.scwins[0] = temp;
}
return;
}
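Stripped of the window bookkeeping, the level loop in reduce() performs a binary-tree reduction across the host-local shared segments: at level l, every task whose local rank is a multiple of 2^(l+1) spins on its partner's control word and then sums the partner's segment into its own, so after ceil(log2(ntasks)) levels local rank 0 holds the host-wide sum. A sketch, where buf[r] and ctrl[r] stand for the Me.swins[r].ptr and Me.scwins[r].ptr pointers and ntasks for the number of on-node tasks:

    for (int l = 0; (1 << l) < ntasks; l++)
    {
        int threshold = 1 << (l + 1);
        if (local_rank % threshold == 0)
        {
            int source = local_rank + (1 << l);
            if (source < ntasks)             /* guard for non-power-of-2 counts */
            {
                while (*ctrl[source] < l)    /* partner not yet done with level l-1 */
                    NSLEEP(5000);            /* sleep 5 usec */
                for (long i = 0; i < size_of_grid; i++)
                    buf[local_rank][i] += buf[source][i];
            }
            *ctrl[local_rank] = l;           /* publish: my level l is complete */
        }
        else
            *ctrl[local_rank] = l;           /* sender: signal and drop out of the tree */
    }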
......
@@ -7,8 +7,7 @@
void init(int index)
{
clock_gettime(CLOCK_MONOTONIC, &begin0);
start0 = clock();
TAKE_TIME_START(total);
// DAV: the corresponding KernelLen is calculated within the wstack function. It can be anyway hardcoded for optimization
dx = 1.0/(double)param.grid_size_x;
@@ -37,8 +36,7 @@ void init(int index)
printf("\nTask %d sees %d topology levels\n", global_rank, Me.MAXl);
#endif
clock_gettime(CLOCK_MONOTONIC, &begin);
start = clock();
TAKE_TIME_START(setup);
// INPUT FILES (only the first ndatasets entries are used)
strcpy(datapath,param.datapath_multi[index]);
@@ -64,13 +62,7 @@ void init(int index)
MPI_Barrier(MPI_COMM_WORLD);
#endif
clock_gettime(CLOCK_MONOTONIC, &finish);
end = clock();
timing.setup_time = ((double) (end - start)) / CLOCKS_PER_SEC;
timing.setup_time1 = (finish.tv_sec - begin.tv_sec);
timing.setup_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
TAKE_TIME_STOP(setup);
}
void op_filename() {
@@ -233,6 +225,11 @@ void read_parameter_file(char *fname)
{
strcpy(outparam.timingfile, buf2);
}
if(strcmp(buf1, "verbose_level") == 0)
{
verbose_level = atoi(buf2);
}
if(param.ndatasets > 1)
{
@@ -258,9 +255,12 @@ void read_parameter_file(char *fname)
/* Communicating the relevant parameters to the other processes */
#ifdef USE_MPI
double twt;
TAKE_TIMEwt(twt);
MPI_Bcast(&in, sizeof(struct ip), MPI_BYTE, 0, MPI_COMM_WORLD);
MPI_Bcast(&outparam, sizeof(struct op), MPI_BYTE, 0, MPI_COMM_WORLD);
MPI_Bcast(&param, sizeof(struct parameter), MPI_BYTE, 0, MPI_COMM_WORLD);
ADD_TIMEwt(mpi, twt);
#endif
}
@@ -312,6 +312,7 @@ void readMetaData(char fileLocal[1000]) {
void metaData_calculation() {
int nsub = 1000;
if( global_rank == 0 )
printf("Subtracting last %d measurements\n",nsub);
metaData.Nmeasures = metaData.Nmeasures-nsub;
metaData.Nvis = metaData.Nmeasures*metaData.freq_per_chan*metaData.polarisations;
......
@@ -62,9 +62,11 @@ int numa_init( int Rank, int Size, MPI_Comm *MYWORLD, map_t *Me )
// at my shared-memory level
//
for( int t = 0; t < Me->Ntasks[SHMEMl]; t++ )
if( t != Me->Rank[SHMEMl] )
{
//if( t != Me->Rank[SHMEMl] )
MPI_Win_shared_query( Me->win_ctrl.win, t, &(Me->scwins[t].size),
&(Me->scwins[t].disp), &(Me->scwins[t].ptr) );
}
if( Me->Rank[SHMEMl] != 0 )
MPI_Win_shared_query( win_ctrl_hostmaster, 0, &(win_ctrl_hostmaster_size),
......
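For context, the scwins pointers queried in the loop above come from MPI-3 shared-memory windows; the generic allocation-and-query pattern is (a sketch, not this file's code; t is some on-node rank):

    MPI_Comm nodecomm;                 /* all tasks that share memory with me */
    MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,
                        MPI_INFO_NULL, &nodecomm);

    double *mine;                      /* base of my own segment */
    MPI_Win  win;
    MPI_Win_allocate_shared(size_of_grid * sizeof(double), sizeof(double),
                            MPI_INFO_NULL, nodecomm, &mine, &win);

    MPI_Aint  segsize;                 /* direct load/store access to task t's segment */
    int       segdisp;
    double   *theirs;
    MPI_Win_shared_query(win, t, &segsize, &segdisp, &theirs);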
@@ -2,34 +2,43 @@
#include "allvars.h"
#include "proto.h"
void write_result() {
void write_result()
{
end = clock();
clock_gettime(CLOCK_MONOTONIC, &finish);
timing.tot_time = ((double) (end - start0)) / CLOCKS_PER_SEC;
timing.tot_time1 = (finish.tv_sec - begin0.tv_sec);
timing.tot_time1 += (finish.tv_nsec - begin0.tv_nsec) / 1000000000.0;
TAKE_TIME_STOP( total );
if (global_rank == 0)
{
printf("Setup time: %f sec\n",timing.setup_time);
printf("Process time: %f sec\n",timing.process_time);
printf("Kernel time = %f, Array Composition time %f, Reduce time: %f sec\n",timing.kernel_time,timing.compose_time,timing.reduce_time);
printf("%14s time : %f sec\n", "Setup", wt_timing.setup);
printf("%14s time : %f sec\n", "Process", wt_timing.process);
printf("%14s time : %f sec\n", "Reduce", wt_timing.reduce);
#if defined(USE_MPI)
#if defined(ONE_SIDE)
printf("%14s time : %f sec\n", "Reduce sh", wt_timing.reduce_sh);
printf("%14s time : %f sec\n", "Mmove", wt_timing.mmove);
printf("%14s time : %f sec\n", "ReduceMPI", wt_timing.reduce_mpi);
#endif
printf("%14s time : %f sec\n", "MPI", wt_timing.mpi);
#endif
printf("%10s Kernel time = %f, Array Composition time %f, Reduce time: %f sec\n", "",
wt_timing.kernel,wt_timing.compose,wt_timing.reduce);
#ifdef USE_FFTW
printf("FFTW time: %f sec\n",timing.fftw_time);
printf("Phase time: %f sec\n",timing.phase_time);
printf("%14s time : %f sec\n", "FFTW", wt_timing.fftw);
printf("%14s time : %f sec\n", "Phase",wt_timing.phase);
#endif
printf("TOT time: %f sec\n",timing.tot_time);
printf("%14s time : %f sec\n\n", "TOTAL", wt_timing.total);
if(param.num_threads > 1)
{
printf("PSetup time: %f sec\n",timing.setup_time1);
printf("PProcess time: %f sec\n",timing.process_time1);
printf("PKernel time = %f, PArray Composition time %f, PReduce time: %f sec\n",timing.kernel_time1,timing.compose_time1,timing.reduce_time1);
printf("%14s time : %f sec\n", "PSetup", pr_timing.setup);
printf("%14s time : %f sec\n", "PProcess", pr_timing.process);
printf("%10s PKernel time = %f, PArray Composition time %f, PReduce time: %f sec\n", "",
pr_timing.kernel,pr_timing.compose,pr_timing.reduce);
#ifdef USE_FFTW
printf("PFFTW time: %f sec\n",timing.fftw_time1);
printf("PPhase time: %f sec\n",timing.phase_time1);
printf("%14s time : %f sec\n", "PFFTW", pr_timing.fftw);
printf("%14s time : %f sec\n", "PPhase", pr_timing.phase);
#endif
printf("PTOT time: %f sec\n",timing.tot_time1);
printf("%14s time : %f sec\n", "PTOTAL", pr_timing.total);
}
}
@@ -38,9 +47,13 @@ void write_result() {
file.pFile = fopen (out.timingfile,"w");
if (param.num_threads == 1)
{
fprintf(file.pFile, "%f %f %f %f %f %f %f\n",timing.setup_time,timing.kernel_time,timing.compose_time,timing.reduce_time,timing.fftw_time,timing.phase_time,timing.tot_time);
fprintf(file.pFile, "%f %f %f %f %f %f %f\n",
wt_timing.setup, wt_timing.kernel, wt_timing.compose,
wt_timing.reduce,wt_timing.fftw,wt_timing.phase, wt_timing.total);
} else {
fprintf(file.pFile, "%f %f %f %f %f %f %f\n",timing.setup_time1,timing.kernel_time1,timing.compose_time1,timing.reduce_time1,timing.fftw_time1,timing.phase_time1,timing.tot_time1);
fprintf(file.pFile, "%f %f %f %f %f %f %f\n",
pr_timing.setup, pr_timing.kernel, pr_timing.compose,
pr_timing.reduce,pr_timing.fftw,pr_timing.phase, pr_timing.total);
}
fclose(file.pFile);
}