diff --git a/allvars.c b/allvars.c index 58a1c6b958b4ecf9d2a54bdd0e6b808208dceb37..50de943827fbba9fe9f5ef73eb05ac6440db6159 100644 --- a/allvars.c +++ b/allvars.c @@ -7,7 +7,9 @@ struct ip in; struct op out, outparam; struct meta metaData; -struct time timing; +timing_t wt_timing = {0}; +timing_t pr_timing = {0}; + struct parameter param; struct fileData data; @@ -16,6 +18,7 @@ char datapath[900]; int xaxis, yaxis; int global_rank; int size; +int verbose_level = 0; long nsectors; long startrow; double resolution, dx, dw, w_supporth; diff --git a/allvars.h b/allvars.h index 7792f1d2ed260c28f35b9ff3f84abe569e6a5eb6..685f00325645393223e31c65c47a1f54556a3b2e 100644 --- a/allvars.h +++ b/allvars.h @@ -92,13 +92,24 @@ extern struct meta } metaData; -extern struct time -{ - double setup_time, process_time, mpi_time, fftw_time, tot_time, kernel_time, reduce_time, compose_time, phase_time; - double setup_time1, process_time1, mpi_time1, fftw_time1, tot_time1, kernel_time1, reduce_time1, compose_time1, phase_time1; - double writetime, writetime1; - -} timing; +typedef struct { + double setup; // time spent in initialization, init() + double init; // time spent in initializing arrays + double process; // time spent in gridding the data + double mpi; // time spent in MPI communications + double fftw; // time spent in the FFT + double kernel; // time spent in the gridding kernel (wstack) + double mmove; // time spent in memory movements to/from shared windows + double reduce; // total time spent reducing the grid + double reduce_mpi; // reduce time spent in inter-host MPI + double reduce_sh; // reduce time spent in intra-host shared memory + double compose; // time spent composing the sector arrays + double phase; // time spent in the phase correction + double write; // time spent writing data to disk + double total; } timing_t; + +extern timing_t wt_timing; // wall-clock timings +extern timing_t pr_timing; // process CPU timing extern struct parameter { @@ -127,12 +138,11 @@ extern char datapath[900]; extern int xaxis, yaxis; extern int global_rank; extern int size; +extern int verbose_level; extern long nsectors; extern long startrow; extern double resolution, dx, dw, w_supporth; -extern clock_t start, end, start0, startk, endk; -extern struct timespec begin, finish, begin0, begink, finishk; extern long * histo_send, size_of_grid; extern double * grid, *gridss, *gridss_real, *gridss_img, *gridss_w; @@ -142,3 +152,60 @@ extern double * grid, *gridss, *gridss_real, *gridss_img, *gridss_w; #endif extern long **sectorarray; + + +#if defined(DEBUG) +#define dprintf(LEVEL, T, t, ...) if( (verbose_level >= (LEVEL)) && \ + ( ((t) ==-1 ) || ((T)==(t)) ) ) { \ + printf(__VA_ARGS__); fflush(stdout); } + +#else +#define dprintf(...) +#endif + + +#define CPU_TIME_wt ({ struct timespec myts; (clock_gettime( CLOCK_REALTIME, &myts ), (double)myts.tv_sec + (double)myts.tv_nsec * 1e-9);}) +#define CPU_TIME_pr ({ struct timespec myts; (clock_gettime( CLOCK_PROCESS_CPUTIME_ID, &myts ), (double)myts.tv_sec + (double)myts.tv_nsec * 1e-9);}) +#define CPU_TIME_th ({ struct timespec myts; (clock_gettime( CLOCK_THREAD_CPUTIME_ID, &myts ), (double)myts.tv_sec + (double)myts.tv_nsec * 1e-9);}) + + +#if defined(_OPENMP) +#define TAKE_TIME_START( T ) { \ + wt_timing.T = CPU_TIME_wt; \ + pr_timing.T = CPU_TIME_pr; } + +#define TAKE_TIME_STOP( T ) { \ + pr_timing.T = CPU_TIME_pr - pr_timing.T; \ + wt_timing.T = CPU_TIME_wt - wt_timing.T; } + +#define TAKE_TIME( Twt, Tpr ) { Twt = CPU_TIME_wt; Tpr = CPU_TIME_pr; } +#define ADD_TIME( T, Twt, Tpr ) { \ + pr_timing.T += CPU_TIME_pr - Tpr; \ + wt_timing.T += CPU_TIME_wt - Twt; \ + Tpr = CPU_TIME_pr; Twt = CPU_TIME_wt; } + +#else + +#define TAKE_TIME_START( T ) wt_timing.T = CPU_TIME_wt + +#define TAKE_TIME_STOP( T ) wt_timing.T = CPU_TIME_wt - wt_timing.T + +#define TAKE_TIME( Twt, ...
) Twt = CPU_TIME_wt; +#define ADD_TIME( T, Twt, ... ) { wt_timing.T += CPU_TIME_wt - Twt; Twt = CPU_TIME_wt;} + +#endif + +#define TAKE_TIMEwt_START( T) wt_timing.T = CPU_TIME_wt +#define TAKE_TIMEwt_STOP( T) wt_timing.T = CPU_TIME_wt - wt_timing.T +#define TAKE_TIMEwt( Twt ) Twt = CPU_TIME_wt; +#define ADD_TIMEwt( T, Twt ) { wt_timing.T += CPU_TIME_wt - Twt; Twt = CPU_TIME_wt; } + + +#if defined(__GNUC__) && !defined(__ICC) && !defined(__INTEL_COMPILER) +#define PRAGMA_IVDEP _Pragma("GCC ivdep") +#else +#define PRAGMA_IVDEP _Pragma("ivdep") +#endif + +#define STRINGIFY(a) #a +#define UNROLL(N) _Pragma(STRINGIFY(unroll(N))) diff --git a/data/paramfile.txt b/data/paramfile.txt index e8a2156f3bdbf9f7ca792a22014fd2c64bebd1b5..2146b3bdb6c5c03a4b6ab59acb568302a96ea55a 100644 --- a/data/paramfile.txt +++ b/data/paramfile.txt @@ -1,7 +1,7 @@ ndatasets 1 -Datapath1 /beegfs/lofar/cgheller/L798046_SB244_uv.uncorr_130B27932t_121MHz.pre-cal.binMS/ -Datapath2 /beegfs/lofar/cgheller/L798046_SB244_uv.uncorr_130B27932t_123MHz.pre-cal.binMS/ -Datapath3 /beegfs/lofar/cgheller/L798046_SB244_uv.uncorr_130B27932t_125MHz.pre-cal.binMS/ +Datapath1 ../input/ +Datapath2 ../input/ +Datapath3 ../input/ num_threads 2 w_support 7 grid_size_x 2048 @@ -23,4 +23,5 @@ fftfile2 fft_real.bin fftfile3 fft_img.bin logfile run.log extension .txt -timingfile timings.dat +timingfile timings.dat +verbose_level 1 diff --git a/fourier_transform.c b/fourier_transform.c index 81d7fd8de9189f5267791a9f5cefcd8931016b7b..fda11fe591406919707b1dda220d8b6e3c0a764c 100644 --- a/fourier_transform.c +++ b/fourier_transform.c @@ -2,213 +2,219 @@ #include "allvars.h" #include "proto.h" -void fftw_data(){ - - #ifdef USE_FFTW - // FFT transform the data (using distributed FFTW) - if(global_rank == 0)printf("PERFORMING FFT\n"); - clock_gettime(CLOCK_MONOTONIC, &begin); - start = clock(); - fftw_plan plan; - fftw_complex *fftwgrid; - ptrdiff_t alloc_local, local_n0, local_0_start; - double norm = 1.0/(double)(param.grid_size_x*param.grid_size_y); - - // map the 1D array of complex visibilities to a 2D array required by FFTW (complex[*][2]) - // x is the direction of contiguous data and maps to the second parameter - // y is the parallelized direction and corresponds to the first parameter (--> n0) - // and perform the FFT per w plane - alloc_local = fftw_mpi_local_size_2d(param.grid_size_y, param.grid_size_x, MPI_COMM_WORLD,&local_n0, &local_0_start); - fftwgrid = fftw_alloc_complex(alloc_local); - plan = fftw_mpi_plan_dft_2d(param.grid_size_y, param.grid_size_x, fftwgrid, fftwgrid, MPI_COMM_WORLD, FFTW_BACKWARD, FFTW_ESTIMATE); - - long fftwindex = 0; - long fftwindex2D = 0; - for (int iw=0; iw<param.num_w_planes; iw++) - { - //printf("FFTing plan %d\n",iw); - //select the w-plane to transform - for (int iv=0; iv<yaxis; iv++) - { - for (int iu=0; iu<xaxis; iu++) - { - fftwindex2D = iu + iv*xaxis; - fftwindex = 2*(fftwindex2D + iw*xaxis*yaxis); - fftwgrid[fftwindex2D][0] = grid[fftwindex]; - fftwgrid[fftwindex2D][1] = grid[fftwindex+1]; - } - } - - // do the transform for each w-plane - fftw_execute(plan); - - // save the transformed w-plane - for (int iv=0; iv<yaxis; iv++) - { - for (int iu=0; iu<xaxis; iu++) - { - fftwindex2D = iu + iv*xaxis; - fftwindex = 2*(fftwindex2D + iw*xaxis*yaxis); - gridss[fftwindex] = norm*fftwgrid[fftwindex2D][0]; - gridss[fftwindex+1] = norm*fftwgrid[fftwindex2D][1]; - } - } - - } - - fftw_destroy_plan(plan); - fftw_free(fftwgrid); - - #ifdef USE_MPI - MPI_Win_fence(0,slabwin); - MPI_Barrier(MPI_COMM_WORLD); - #endif - 
- end = clock(); - clock_gettime(CLOCK_MONOTONIC, &finish); - timing.fftw_time = ((double) (end - start)) / CLOCKS_PER_SEC; - timing.fftw_time1 = (finish.tv_sec - begin.tv_sec); - timing.fftw_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0; - clock_gettime(CLOCK_MONOTONIC, &begin); - - #endif +void fftw_data() +{ + + #ifdef USE_FFTW + // FFT transform the data (using distributed FFTW) + if(global_rank == 0)printf("PERFORMING FFT\n"); + + TAKE_TIME_START(fftw); + + fftw_plan plan; + fftw_complex *fftwgrid; + ptrdiff_t alloc_local, local_n0, local_0_start; + double norm = 1.0/(double)(param.grid_size_x*param.grid_size_y); + + // map the 1D array of complex visibilities to a 2D array required by FFTW (complex[*][2]) + // x is the direction of contiguous data and maps to the second parameter + // y is the parallelized direction and corresponds to the first parameter (--> n0) + // and perform the FFT per w plane + alloc_local = fftw_mpi_local_size_2d(param.grid_size_y, param.grid_size_x, MPI_COMM_WORLD,&local_n0, &local_0_start); + fftwgrid = fftw_alloc_complex(alloc_local); + plan = fftw_mpi_plan_dft_2d(param.grid_size_y, param.grid_size_x, fftwgrid, fftwgrid, MPI_COMM_WORLD, FFTW_BACKWARD, FFTW_ESTIMATE); + + long fftwindex = 0; + long fftwindex2D = 0; + for (int iw=0; iw<param.num_w_planes; iw++) + { + //printf("FFTing plan %d\n",iw); + //select the w-plane to transform + for (int iv=0; iv<yaxis; iv++) + { + for (int iu=0; iu<xaxis; iu++) + { + fftwindex2D = iu + iv*xaxis; + fftwindex = 2*(fftwindex2D + iw*xaxis*yaxis); + fftwgrid[fftwindex2D][0] = grid[fftwindex]; + fftwgrid[fftwindex2D][1] = grid[fftwindex+1]; + } + } + + // do the transform for each w-plane + fftw_execute(plan); + + // save the transformed w-plane + for (int iv=0; iv<yaxis; iv++) + { + for (int iu=0; iu<xaxis; iu++) + { + fftwindex2D = iu + iv*xaxis; + fftwindex = 2*(fftwindex2D + iw*xaxis*yaxis); + gridss[fftwindex] = norm*fftwgrid[fftwindex2D][0]; + gridss[fftwindex+1] = norm*fftwgrid[fftwindex2D][1]; + } + } + + } + + fftw_destroy_plan(plan); + fftw_free(fftwgrid); + + #ifdef USE_MPI + MPI_Win_fence(0,slabwin); + MPI_Barrier(MPI_COMM_WORLD); + #endif + + TAKE_TIME_STOP(fftw); + + #endif } void write_fftw_data(){ - #ifdef USE_FFTW - #ifdef WRITE_DATA - // Write results - #ifdef USE_MPI - MPI_Win writewin; - MPI_Win_create(gridss, size_of_grid*sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &writewin); - MPI_Win_fence(0,writewin); - #endif - if (global_rank == 0) - { - printf("WRITING FFT TRANSFORMED DATA\n"); - file.pFilereal = fopen (out.fftfile2,"wb"); - file.pFileimg = fopen (out.fftfile3,"wb"); - #ifdef USE_MPI - for (int isector=0; isector<nsectors; isector++) - { - MPI_Win_lock(MPI_LOCK_SHARED,isector,0,writewin); - MPI_Get(gridss_w,size_of_grid,MPI_DOUBLE,isector,0,size_of_grid,MPI_DOUBLE,writewin); - MPI_Win_unlock(isector,writewin); - for (long i=0; i<size_of_grid/2; i++) - { - gridss_real[i] = gridss_w[2*i]; - gridss_img[i] = gridss_w[2*i+1]; - } - if (param.num_w_planes > 1) - { - for (int iw=0; iw<param.num_w_planes; iw++) + // Write results + + #ifdef USE_FFTW + + double twt, tpr; + + #ifdef WRITE_DATA + + TAKE_TIME(twt, tpr); + + #ifdef USE_MPI + MPI_Win writewin; + MPI_Win_create(gridss, size_of_grid*sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &writewin); + MPI_Win_fence(0,writewin); + #endif + if (global_rank == 0) + { + printf("WRITING FFT TRANSFORMED DATA\n"); + file.pFilereal = fopen (out.fftfile2,"wb"); + file.pFileimg = fopen (out.fftfile3,"wb"); + #ifdef 
USE_MPI + for (int isector=0; isector<nsectors; isector++) + { + MPI_Win_lock(MPI_LOCK_SHARED,isector,0,writewin); + MPI_Get(gridss_w,size_of_grid,MPI_DOUBLE,isector,0,size_of_grid,MPI_DOUBLE,writewin); + MPI_Win_unlock(isector,writewin); + for (long i=0; i<size_of_grid/2; i++) + { + gridss_real[i] = gridss_w[2*i]; + gridss_img[i] = gridss_w[2*i+1]; + } + if (param.num_w_planes > 1) + { + for (int iw=0; iw<param.num_w_planes; iw++) for (int iv=0; iv<yaxis; iv++) - for (int iu=0; iu<xaxis; iu++) - { - long global_index = (iu + (iv+isector*yaxis)*xaxis + iw*param.grid_size_x*param.grid_size_y)*sizeof(double); - long index = iu + iv*xaxis + iw*xaxis*yaxis; - fseek(file.pFilereal, global_index, SEEK_SET); - fwrite(&gridss_real[index], 1, sizeof(double), file.pFilereal); - } - for (int iw=0; iw<param.num_w_planes; iw++) + for (int iu=0; iu<xaxis; iu++) + { + long global_index = (iu + (iv+isector*yaxis)*xaxis + iw*param.grid_size_x*param.grid_size_y)*sizeof(double); + long index = iu + iv*xaxis + iw*xaxis*yaxis; + fseek(file.pFilereal, global_index, SEEK_SET); + fwrite(&gridss_real[index], 1, sizeof(double), file.pFilereal); + } + for (int iw=0; iw<param.num_w_planes; iw++) for (int iv=0; iv<yaxis; iv++) - for (int iu=0; iu<xaxis; iu++) - { - long global_index = (iu + (iv+isector*yaxis)*xaxis + iw*param.grid_size_x*param.grid_size_y)*sizeof(double); - long index = iu + iv*xaxis + iw*xaxis*yaxis; - fseek(file.pFileimg, global_index, SEEK_SET); - fwrite(&gridss_img[index], 1, sizeof(double), file.pFileimg); - } - } - else - { - fwrite(gridss_real, size_of_grid/2, sizeof(double), file.pFilereal); - fwrite(gridss_img, size_of_grid/2, sizeof(double), file.pFileimg); - } - - } - #else - /* - for (int iw=0; iw<param.num_w_planes; iw++) - for (int iv=0; iv<grid_size_y; iv++) - for (int iu=0; iu<grid_size_x; iu++) - { - int isector = 0; - long index = 2*(iu + iv*grid_size_x + iw*grid_size_x*grid_size_y); - double v_norm = sqrt(gridtot[index]*gridtot[index]+gridtot[index+1]*gridtot[index+1]); - fprintf (file.pFile, "%d %d %d %f %f %f\n", iu,iv,iw,gridtot[index],gridtot[index+1],v_norm); - } - */ - #endif - - fclose(file.pFilereal); - fclose(file.pFileimg); - } - #ifdef USE_MPI - MPI_Win_fence(0,writewin); - MPI_Win_free(&writewin); - MPI_Barrier(MPI_COMM_WORLD); - #endif - #endif //WRITE_DATA - - - // Phase correction - - clock_gettime(CLOCK_MONOTONIC, &begin); - start = clock(); - if(global_rank == 0)printf("PHASE CORRECTION\n"); - double* image_real = (double*) calloc(xaxis*yaxis,sizeof(double)); - double* image_imag = (double*) calloc(xaxis*yaxis,sizeof(double)); - - phase_correction(gridss,image_real,image_imag,xaxis,yaxis,param.num_w_planes,param.grid_size_x,param.grid_size_y,resolution,metaData.wmin,metaData.wmax,param.num_threads); - - end = clock(); - clock_gettime(CLOCK_MONOTONIC, &finish); - timing.phase_time = ((double) (end - start)) / CLOCKS_PER_SEC; - timing.phase_time1 = (finish.tv_sec - begin.tv_sec); - timing.phase_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0; - - #ifdef WRITE_IMAGE - - if(global_rank == 0) - { - file.pFilereal = fopen (out.fftfile2,"wb"); - file.pFileimg = fopen (out.fftfile3,"wb"); - fclose(file.pFilereal); - fclose(file.pFileimg); - } - #ifdef USE_MPI - MPI_Barrier(MPI_COMM_WORLD); - #endif - if(global_rank == 0)printf("WRITING IMAGE\n"); - for (int isector=0; isector<size; isector++) - { - #ifdef USE_MPI - MPI_Barrier(MPI_COMM_WORLD); - #endif - if(isector == global_rank) - { - printf("%d writing\n",isector); - file.pFilereal = fopen (out.fftfile2,"ab"); - 
file.pFileimg = fopen (out.fftfile3,"ab"); - - long global_index = isector*(xaxis*yaxis)*sizeof(double); - - fseek(file.pFilereal, global_index, SEEK_SET); - fwrite(image_real, xaxis*yaxis, sizeof(double), file.pFilereal); - fseek(file.pFileimg, global_index, SEEK_SET); - fwrite(image_imag, xaxis*yaxis, sizeof(double), file.pFileimg); - - fclose(file.pFilereal); - fclose(file.pFileimg); - } - } - #ifdef USE_MPI - MPI_Barrier(MPI_COMM_WORLD); - #endif - - #endif //WRITE_IMAGE - - #endif //FFTW + for (int iu=0; iu<xaxis; iu++) + { + long global_index = (iu + (iv+isector*yaxis)*xaxis + iw*param.grid_size_x*param.grid_size_y)*sizeof(double); + long index = iu + iv*xaxis + iw*xaxis*yaxis; + fseek(file.pFileimg, global_index, SEEK_SET); + fwrite(&gridss_img[index], 1, sizeof(double), file.pFileimg); + } + } + else + { + fwrite(gridss_real, size_of_grid/2, sizeof(double), file.pFilereal); + fwrite(gridss_img, size_of_grid/2, sizeof(double), file.pFileimg); + } + + } + #else + /* + for (int iw=0; iw<param.num_w_planes; iw++) + for (int iv=0; iv<grid_size_y; iv++) + for (int iu=0; iu<grid_size_x; iu++) + { + int isector = 0; + long index = 2*(iu + iv*grid_size_x + iw*grid_size_x*grid_size_y); + double v_norm = sqrt(gridtot[index]*gridtot[index]+gridtot[index+1]*gridtot[index+1]); + fprintf (file.pFile, "%d %d %d %f %f %f\n", iu,iv,iw,gridtot[index],gridtot[index+1],v_norm); + } + */ + #endif + + fclose(file.pFilereal); + fclose(file.pFileimg); + } + #ifdef USE_MPI + MPI_Win_fence(0,writewin); + MPI_Win_free(&writewin); + MPI_Barrier(MPI_COMM_WORLD); + #endif + + ADD_TIME(write, twt, tpr); + #endif //WRITE_DATA + + + // Phase correction + + + + TAKE_TIME_START(phase) + if(global_rank == 0)printf("PHASE CORRECTION\n"); + double* image_real = (double*) calloc(xaxis*yaxis,sizeof(double)); + double* image_imag = (double*) calloc(xaxis*yaxis,sizeof(double)); + + phase_correction(gridss,image_real,image_imag,xaxis,yaxis,param.num_w_planes,param.grid_size_x,param.grid_size_y,resolution,metaData.wmin,metaData.wmax,param.num_threads); + + TAKE_TIME_STOP(phase); + + #ifdef WRITE_IMAGE + + TAKE_TIME(twt, tpr); + + if(global_rank == 0) + { + file.pFilereal = fopen (out.fftfile2,"wb"); + file.pFileimg = fopen (out.fftfile3,"wb"); + fclose(file.pFilereal); + fclose(file.pFileimg); + } + #ifdef USE_MPI + MPI_Barrier(MPI_COMM_WORLD); + #endif + if(global_rank == 0)printf("WRITING IMAGE\n"); + for (int isector=0; isector<size; isector++) + { + #ifdef USE_MPI + MPI_Barrier(MPI_COMM_WORLD); + #endif + if(isector == global_rank) + { + printf("%d writing\n",isector); + file.pFilereal = fopen (out.fftfile2,"ab"); + file.pFileimg = fopen (out.fftfile3,"ab"); + + long global_index = isector*(xaxis*yaxis)*sizeof(double); + + fseek(file.pFilereal, global_index, SEEK_SET); + fwrite(image_real, xaxis*yaxis, sizeof(double), file.pFilereal); + fseek(file.pFileimg, global_index, SEEK_SET); + fwrite(image_imag, xaxis*yaxis, sizeof(double), file.pFileimg); + + fclose(file.pFilereal); + fclose(file.pFileimg); + } + } + #ifdef USE_MPI + MPI_Barrier(MPI_COMM_WORLD); + #endif + + ADD_TIME(write, twt, tpr); + #endif //WRITE_IMAGE + + #endif //FFTW } diff --git a/gridding.c b/gridding.c index 758317749388280f2f09acd60aee4924b3998c09..afe7092e6bcf733289898826e9ba6f6bbf72ff1a 100644 --- a/gridding.c +++ b/gridding.c @@ -4,209 +4,209 @@ #include "proto.h" -void gridding(){ - - if(global_rank == 0)printf("GRIDDING DATA\n"); - - // Create histograms and linked lists - - clock_gettime(CLOCK_MONOTONIC, &begin); - start = clock(); +void 
gridding() +{ + + if(global_rank == 0)printf("GRIDDING DATA\n"); + + // Create histograms and linked lists - // Initialize linked list - initialize_array(); + TAKE_TIME_START(init); + + // Initialize linked list + initialize_array(); - //Sector and Gridding data - gridding_data(); + TAKE_TIME_STOP(init); + TAKE_TIME_START(process); - #ifdef USE_MPI - MPI_Barrier(MPI_COMM_WORLD); - #endif + // Sector and Gridding data + gridding_data(); - end = clock(); - clock_gettime(CLOCK_MONOTONIC, &finish); - timing.process_time = ((double) (end - start)) / CLOCKS_PER_SEC; - timing.process_time1 = (finish.tv_sec - begin.tv_sec); - timing.process_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0; - clock_gettime(CLOCK_MONOTONIC, &begin); + TAKE_TIME_STOP(process); + + #ifdef USE_MPI + MPI_Barrier(MPI_COMM_WORLD); + #endif + return; } -void initialize_array(){ +void initialize_array() +{ - histo_send = (long*) calloc(nsectors+1,sizeof(long)); - int * boundary = (int*) calloc(metaData.Nmeasures,sizeof(int)); - double vvh; + histo_send = (long*) calloc(nsectors+1,sizeof(long)); + int * boundary = (int*) calloc(metaData.Nmeasures,sizeof(int)); + double vvh; - for (long iphi = 0; iphi < metaData.Nmeasures; iphi++) - { - boundary[iphi] = -1; - vvh = data.vv[iphi]; //less or equal to 0.6 - int binphi = (int)(vvh*nsectors); //has values expect 0 and nsectors-1. So we use updist and downdist condition - // check if the point influence also neighboring slabs - double updist = (double)((binphi+1)*yaxis)*dx - vvh; - double downdist = vvh - (double)(binphi*yaxis)*dx; - // - histo_send[binphi]++; - if(updist < w_supporth && updist >= 0.0) - {histo_send[binphi+1]++; boundary[iphi] = binphi+1;}; - if(downdist < w_supporth && binphi > 0 && downdist >= 0.0) - {histo_send[binphi-1]++; boundary[iphi] = binphi-1;}; - } + for (long iphi = 0; iphi < metaData.Nmeasures; iphi++) + { + boundary[iphi] = -1; + vvh = data.vv[iphi]; //less or equal to 0.6 + int binphi = (int)(vvh*nsectors); //has values expect 0 and nsectors-1. 
So we use updist and downdist condition + // check if the point influence also neighboring slabs + double updist = (double)((binphi+1)*yaxis)*dx - vvh; + double downdist = vvh - (double)(binphi*yaxis)*dx; + // + histo_send[binphi]++; + if(updist < w_supporth && updist >= 0.0) + {histo_send[binphi+1]++; boundary[iphi] = binphi+1;}; + if(downdist < w_supporth && binphi > 0 && downdist >= 0.0) + {histo_send[binphi-1]++; boundary[iphi] = binphi-1;}; + } - sectorarray = (long**)malloc ((nsectors+1) * sizeof(long*)); - for(int sec=0; sec<(nsectors+1); sec++) - { - sectorarray[sec] = (long*)malloc(histo_send[sec]*sizeof(long)); - } + sectorarray = (long**)malloc ((nsectors+1) * sizeof(long*)); + for(int sec=0; sec<(nsectors+1); sec++) + { + sectorarray[sec] = (long*)malloc(histo_send[sec]*sizeof(long)); + } - long *counter = (long*) calloc(nsectors+1,sizeof(long)); - for (long iphi = 0; iphi < metaData.Nmeasures; iphi++) - { - vvh = data.vv[iphi]; - int binphi = (int)(vvh*nsectors); - double updist = (double)((binphi+1)*yaxis)*dx - vvh; - double downdist = vvh - (double)(binphi*yaxis)*dx; - sectorarray[binphi][counter[binphi]] = iphi; - counter[binphi]++; - if(updist < w_supporth && updist >= 0.0) - { sectorarray[binphi+1][counter[binphi+1]] = iphi; counter[binphi+1]++;}; - if(downdist < w_supporth && binphi > 0 && downdist >= 0.0) - { sectorarray[binphi-1][counter[binphi-1]] = iphi; counter[binphi-1]++;}; - } + long *counter = (long*) calloc(nsectors+1,sizeof(long)); + for (long iphi = 0; iphi < metaData.Nmeasures; iphi++) + { + vvh = data.vv[iphi]; + int binphi = (int)(vvh*nsectors); + double updist = (double)((binphi+1)*yaxis)*dx - vvh; + double downdist = vvh - (double)(binphi*yaxis)*dx; + sectorarray[binphi][counter[binphi]] = iphi; + counter[binphi]++; + if(updist < w_supporth && updist >= 0.0) + { sectorarray[binphi+1][counter[binphi+1]] = iphi; counter[binphi+1]++;}; + if(downdist < w_supporth && binphi > 0 && downdist >= 0.0) + { sectorarray[binphi-1][counter[binphi-1]] = iphi; counter[binphi-1]++;}; + } - #ifdef VERBOSE - for (int iii=0; iii<nsectors+1; iii++)printf("HISTO %d %d %ld\n", global_rank, iii, histo_send[iii]); - #endif -} + #ifdef VERBOSE + for (int iii=0; iii<nsectors+1; iii++)printf("HISTO %d %d %ld\n", global_rank, iii, histo_send[iii]); + #endif -void gridding_data(){ - - double shift = (double)(dx*yaxis); - - // Open the MPI Memory Window for the slab - #ifdef USE_MPI - MPI_Win_create(grid, size_of_grid*sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &slabwin); - MPI_Win_fence(0,slabwin); - #endif - - #ifndef USE_MPI - file.pFile1 = fopen (out.outfile1,"w"); - #endif + free( boundary); + return; +} - timing.kernel_time = 0.0; - timing.kernel_time1 = 0.0; - timing.reduce_time = 0.0; - timing.reduce_time1 = 0.0; - timing.compose_time = 0.0; - timing.compose_time1 = 0.0; +void gridding_data() +{ - // calculate the resolution in radians - resolution = 1.0/MAX(abs(metaData.uvmin),abs(metaData.uvmax)); - - // calculate the resolution in arcsec - double resolution_asec = (3600.0*180.0)/MAX(abs(metaData.uvmin),abs(metaData.uvmax))/PI; + double shift = (double)(dx*yaxis); + + #ifdef USE_MPI + MPI_Win_create(grid, size_of_grid*sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &slabwin); + MPI_Win_fence(0,slabwin); + #endif + + #ifndef USE_MPI + file.pFile1 = fopen (out.outfile1,"w"); + #endif + + // calculate the resolution in radians + resolution = 1.0/MAX(abs(metaData.uvmin),abs(metaData.uvmax)); + + // calculate the resolution in arcsec + double 
resolution_asec = (3600.0*180.0)/MAX(abs(metaData.uvmin),abs(metaData.uvmax))/PI; + if( global_rank == 0 ) printf("RESOLUTION = %f rad, %f arcsec\n", resolution, resolution_asec); - for (long isector = 0; isector < nsectors; isector++) + for (long isector = 0; isector < nsectors; isector++) { - clock_gettime(CLOCK_MONOTONIC, &begink); - startk = clock(); - // define local destination sector - //isector = (isector_count+rank)%size; // this line must be wrong! [LT] - - // allocate sector arrays - long Nsec = histo_send[isector]; - double *uus = (double*) malloc(Nsec*sizeof(double)); - double *vvs = (double*) malloc(Nsec*sizeof(double)); - double *wws = (double*) malloc(Nsec*sizeof(double)); - long Nweightss = Nsec*metaData.polarisations; - long Nvissec = Nweightss*metaData.freq_per_chan; - float *weightss = (float*) malloc(Nweightss*sizeof(float)); - float *visreals = (float*) malloc(Nvissec*sizeof(float)); - float *visimgs = (float*) malloc(Nvissec*sizeof(float)); + double twt, tpr; + + TAKE_TIME(twt, tpr); + + // define local destination sector + //isector = (isector_count+rank)%size; // this line must be wrong! [LT] + + // allocate sector arrays + long Nsec = histo_send[isector]; + double *uus = (double*) malloc(Nsec*sizeof(double)); + double *vvs = (double*) malloc(Nsec*sizeof(double)); + double *wws = (double*) malloc(Nsec*sizeof(double)); + long Nweightss = Nsec*metaData.polarisations; + long Nvissec = Nweightss*metaData.freq_per_chan; + float *weightss = (float*) malloc(Nweightss*sizeof(float)); + float *visreals = (float*) malloc(Nvissec*sizeof(float)); + float *visimgs = (float*) malloc(Nvissec*sizeof(float)); - // select data for this sector - long icount = 0; - long ip = 0; - long inu = 0; + // select data for this sector + long icount = 0; + long ip = 0; + long inu = 0; - for(long iphi = histo_send[isector]-1; iphi>=0; iphi--) + #warning shall we omp-ize this ? 
+ for(long iphi = histo_send[isector]-1; iphi>=0; iphi--) { - long ilocal = sectorarray[isector][iphi]; - //double vvh = data.vv[ilocal]; - //int binphi = (int)(vvh*nsectors); - //if (binphi == isector || boundary[ilocal] == isector) { - uus[icount] = data.uu[ilocal]; - vvs[icount] = data.vv[ilocal]-isector*shift; - wws[icount] = data.ww[ilocal]; - for (long ipol=0; ipol<metaData.polarisations; ipol++) - { - weightss[ip] = data.weights[ilocal*metaData.polarisations+ipol]; - ip++; - } - for (long ifreq=0; ifreq<metaData.polarisations*metaData.freq_per_chan; ifreq++) - { - visreals[inu] = data.visreal[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq]; - visimgs[inu] = data.visimg[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq]; - //if(visimgs[inu]>1e10 || visimgs[inu]<-1e10)printf("%f %f %ld %ld %d %ld %ld\n",visreals[inu],visimgs[inu],inu,Nvissec,rank,ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq,metaData.Nvis); - inu++; - } - icount++; - } - - clock_gettime(CLOCK_MONOTONIC, &finishk); - endk = clock(); - timing.compose_time += ((double) (endk - startk)) / CLOCKS_PER_SEC; - timing.compose_time1 += (finishk.tv_sec - begink.tv_sec); - timing.compose_time1 += (finishk.tv_sec - begink.tv_sec); - timing.compose_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0; - - #ifndef USE_MPI - double vvmin = 1e20; - double uumax = -1e20; - double vvmax = -1e20; - - for (long ipart=0; ipart<Nsec; ipart++) - { - uumin = MIN(uumin,uus[ipart]); - uumax = MAX(uumax,uus[ipart]); - vvmin = MIN(vvmin,vvs[ipart]); - vvmax = MAX(vvmax,vvs[ipart]); + long ilocal = sectorarray[isector][iphi]; + //double vvh = data.vv[ilocal]; + //int binphi = (int)(vvh*nsectors); + //if (binphi == isector || boundary[ilocal] == isector) { + uus[icount] = data.uu[ilocal]; + vvs[icount] = data.vv[ilocal]-isector*shift; + wws[icount] = data.ww[ilocal]; + UNROLL(4) + PRAGMA_IVDEP + for (long ipol=0; ipol<metaData.polarisations; ipol++, ip++) + { + weightss[ip] = data.weights[ilocal*metaData.polarisations+ipol]; + } + + PRAGMA_IVDEP + UNROLL(4) + for (long ifreq=0; ifreq<metaData.polarisations*metaData.freq_per_chan; ifreq++, inu++) + { + visreals[inu] = data.visreal[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq]; + visimgs[inu] = data.visimg[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq]; + //if(visimgs[inu]>1e10 || visimgs[inu]<-1e10)printf("%f %f %ld %ld %d %ld %ld\n",visreals[inu],visimgs[inu],inu,Nvissec,rank,ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq,metaData.Nvis); + } + icount++; + } + + ADD_TIME(compose, twt, tpr); + + #ifndef USE_MPI + double vvmin = 1e20; + double uumax = -1e20; + double vvmax = -1e20; + + #warning shall we omp-ize this ? 
+ for (long ipart=0; ipart<Nsec; ipart++) + { + uumin = MIN(uumin,uus[ipart]); + uumax = MAX(uumax,uus[ipart]); + vvmin = MIN(vvmin,vvs[ipart]); + vvmax = MAX(vvmax,vvs[ipart]); - if(ipart%10 == 0)fprintf (file.pFile, "%ld %f %f %f\n",isector,uus[ipart],vvs[ipart]+isector*shift,wws[ipart]); - } + if(ipart%10 == 0)fprintf (file.pFile, "%ld %f %f %f\n",isector,uus[ipart],vvs[ipart]+isector*shift,wws[ipart]); + } - printf("UU, VV, min, max = %f %f %f %f\n", uumin, uumax, vvmin, vvmax); - #endif - - // Make convolution on the grid + printf("UU, VV, min, max = %f %f %f %f\n", uumin, uumax, vvmin, vvmax); + #endif - #ifdef VERBOSE - printf("Processing sector %ld\n",isector); - #endif - clock_gettime(CLOCK_MONOTONIC, &begink); - startk = clock(); - - wstack(param.num_w_planes, - Nsec, - metaData.freq_per_chan, - metaData.polarisations, - uus, - vvs, - wws, - visreals, - visimgs, - weightss, - dx, - dw, - param.w_support, - xaxis, - yaxis, - gridss, - param.num_threads); + // Make convolution on the grid + #ifdef VERBOSE + printf("Processing sector %ld\n",isector); + #endif + TAKE_TIME(twt, tpr); + + wstack(param.num_w_planes, + Nsec, + metaData.freq_per_chan, + metaData.polarisations, + uus, + vvs, + wws, + visreals, + visimgs, + weightss, + dx, + dw, + param.w_support, + xaxis, + yaxis, + gridss, + param.num_threads); + + ADD_TIME(kernel, twt, tpr); + /* int z =0 ; * #pragma omp target map(to:test_i_gpu) map(from:z) * { @@ -215,134 +215,144 @@ void gridding_data(){ * z = x + test_i_gpu; * }*/ - clock_gettime(CLOCK_MONOTONIC, &finishk); - endk = clock(); - timing.kernel_time += ((double) (endk - startk)) / CLOCKS_PER_SEC; - timing.kernel_time1 += (finishk.tv_sec - begink.tv_sec); - timing.kernel_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0; - #ifdef VERBOSE - printf("Processed sector %ld\n",isector); - #endif - clock_gettime(CLOCK_MONOTONIC, &begink); - startk = clock(); + #ifdef VERBOSE + printf("Processed sector %ld\n",isector); + #endif + /* ---------------- + * REDUCE + * ---------------- */ - //for (long iii=0; iii<2*xaxis*yaxis*num_w_planes; iii++)printf("--> %f\n",gridss[iii]); - - #ifndef USE_MPI - long stride = isector*2*xaxis*yaxis*num_w_planes; - for (long iii=0; iii<2*xaxis*yaxis*num_w_planes; iii++) - gridtot[stride+iii] = gridss[iii]; - #endif + double twt_r, tpr_r; + TAKE_TIME(twt_r, tpr_r); - // Write grid in the corresponding remote slab - #ifdef USE_MPI - // int target_rank = (int)isector; it implied that size >= nsectors - int target_rank = (int)(isector % size); - - #ifdef ONE_SIDE + // .................. + #ifndef USE_MPI // REDUCE WITH NO MPI + + #pragma omp parallel + { + long stride = isector * size_of_grid; + #pragma omp for + for (long iii=0; iii< size_of_grid; iii++) + gridtot[stride+iii] = gridss[iii]; + } - // MPI_Win_lock(MPI_LOCK_SHARED,target_rank,0,slabwin); - // MPI_Accumulate(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,MPI_SUM,slabwin); - // MPI_Win_unlock(target_rank,slabwin); - - // for every task, gridss coincides with the - // that can be avoided if shared window coincides with gridss - memcpy(Me.win.ptr+isector*size_of_grid, gridss, size_of_grid*sizeof(double)); + // .................. 
+ // REDUCE WITH MPI + #else + // Write grid in the corresponding remote slab + // int target_rank = (int)isector; it implied that size >= nsectors + int target_rank = (int)(isector % size); - reduce( isector, target_rank ); // here the reduce is performed within every host + #ifdef ONE_SIDE - if ( Me.Nhosts > 1 ) - { - // here the reduce is performed among hosts - MPI_Barrier(MPI_COMM_WORLD); + // for every task, gridss coincides with the + // that can be avoided if shared window coincides with gridss + + TAKE_TIME(twt, tpr); + memcpy(Me.win.ptr+isector*size_of_grid, gridss, size_of_grid*sizeof(double)); + ADD_TIME(mmove, twt, tpr); + + dprintf(1, global_rank, 0, "reducing sector %ld..\n", isector); + + TAKE_TIME( twt, tpr); + reduce( isector, target_rank ); // here the reduce is performed within every host + ADD_TIME(reduce_sh, twt, tpr); + + if ( Me.Nhosts > 1 ) + { + // here the reduce is performed among hosts + MPI_Barrier(MPI_COMM_WORLD); - int Im_in_the_new_communicator = MPI_UNDEFINED; - if(global_rank == target_rank) - Im_in_the_new_communicator = 1; - else - if( Me.Rank[HOSTS] == 0 ) - { - if( Me.Ranks_to_host[ target_rank ] != Me.myhost ) - Im_in_the_new_communicator = 1; - } + int Im_in_the_new_communicator = MPI_UNDEFINED; + if(global_rank == target_rank) + Im_in_the_new_communicator = 1; + else + if( Me.Rank[HOSTS] == 0 ) + { + if( Me.Ranks_to_host[ target_rank ] != Me.myhost ) + Im_in_the_new_communicator = 1; + } - MPI_Comm Sector_Comm; - MPI_Comm_split( COMM[WORLD], Im_in_the_new_communicator, global_rank, &Sector_Comm); + MPI_Comm Sector_Comm; + MPI_Comm_split( COMM[WORLD], Im_in_the_new_communicator, global_rank, &Sector_Comm); - if( Sector_Comm != MPI_COMM_NULL ) - { - int sector_size; - int sector_rank = 0; - int sector_target; + if( Sector_Comm != MPI_COMM_NULL ) + { + double _twt_; + int sector_size; + int sector_rank = 0; + int sector_target; - MPI_Comm_size( Sector_Comm, §or_size); - MPI_Comm_rank( Sector_Comm, §or_rank); - if ( global_rank == target_rank) - { - MPI_Send( §or_rank, 1, MPI_INT, 0, 0, Sector_Comm); - memcpy(grid, Me.swins[Me.Rank[myHOST]].ptr+isector*size_of_grid*sizeof(double), size_of_grid * sizeof(double)); - } - - if( sector_rank == 0 ) - { - MPI_Status status; - MPI_Recv( §or_target, 1, MPI_INT, MPI_ANY_SOURCE, 0, Sector_Comm, &status); - } - - MPI_Bcast( §or_target, 1, MPI_INT, 0, Sector_Comm ); - - MPI_Reduce(grid, grid, size_of_grid, MPI_DOUBLE,MPI_SUM, sector_target, Sector_Comm); + MPI_Comm_size( Sector_Comm, §or_size); + MPI_Comm_rank( Sector_Comm, §or_rank); + if ( global_rank == target_rank) + { + MPI_Send( §or_rank, 1, MPI_INT, 0, 0, Sector_Comm); + TAKE_TIMEwt( _twt_ ); + memcpy(gridss, Me.swins[Me.Rank[myHOST]].ptr+isector*size_of_grid*sizeof(double), + size_of_grid * sizeof(double)); + ADD_TIMEwt( mmove, _twt_); + } - MPI_Comm_free( &Sector_Comm ); - } - } + if( sector_rank == 0 ) + { + MPI_Status status; + MPI_Recv( §or_target, 1, MPI_INT, MPI_ANY_SOURCE, 0, Sector_Comm, &status); + } - - #else // relates to #ifdef ONE_SIDE + TAKE_TIMEwt(_twt_); + MPI_Bcast( §or_target, 1, MPI_INT, 0, Sector_Comm ); + + MPI_Reduce(gridss, grid, size_of_grid, MPI_DOUBLE,MPI_SUM, sector_target, Sector_Comm); + + MPI_Comm_free( &Sector_Comm ); + ADD_TIMEwt(mpi, _twt_); + } + } + ADD_TIME(reduce_mpi, twt, tpr); - MPI_Reduce(gridss,grid,size_of_grid,MPI_DOUBLE,MPI_SUM,target_rank,MPI_COMM_WORLD); - #endif // closes #ifdef ONE_SIDE - #endif // closes USE_MPI - - clock_gettime(CLOCK_MONOTONIC, &finishk); - endk = clock(); - timing.reduce_time += ((double) 
(endk - startk)) / CLOCKS_PER_SEC; - timing.reduce_time1 += (finishk.tv_sec - begink.tv_sec); - timing.reduce_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0; - // Go to next sector - for (long inull=0; inull<2*param.num_w_planes*xaxis*yaxis; inull++)gridss[inull] = 0.0; - - // Deallocate all sector arrays - free(uus); - free(vvs); - free(wws); - free(weightss); - free(visreals); - free(visimgs); + #else // relates to #ifdef ONE_SIDE + + { + double _twt_; + TAKE_TIMEwt(_twt_); + MPI_Reduce(gridss,grid,size_of_grid,MPI_DOUBLE,MPI_SUM,target_rank,MPI_COMM_WORLD); + ADD_TIMEwt(mpi, _twt_); + } + + #endif // closes #ifdef ONE_SIDE + #endif // closes USE_MPI + + ADD_TIME(reduce, twt_r, tpr_r); + + + // wipe before getting to the next sector + memset((void*)gridss, 0, size_of_grid * sizeof(double)); + + // Deallocate all sector arrays + free(uus); + free(vvs); + free(wws); + free(weightss); + free(visreals); + free(visimgs); // End of loop over sector } - // Finalize MPI communication - #ifdef USE_MPI - MPI_Win_fence(0,slabwin); - #endif - #ifndef USE_MPI - fclose(file.pFile1); - #endif + free( histo_send ); - #ifdef USE_MPI - MPI_Barrier(MPI_COMM_WORLD); - #endif + #ifndef USE_MPI + fclose(file.pFile1); + #endif - end = clock(); - clock_gettime(CLOCK_MONOTONIC, &finish); - timing.process_time = ((double) (end - start)) / CLOCKS_PER_SEC; - timing.process_time1 = (finish.tv_sec - begin.tv_sec); - timing.process_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0; - clock_gettime(CLOCK_MONOTONIC, &begin); + #ifdef USE_MPI + MPI_Win_fence(0,slabwin); + MPI_Barrier(MPI_COMM_WORLD); + #endif + } void write_grided_data() @@ -355,76 +365,77 @@ void write_grided_data() printf("WRITING GRIDDED DATA\n"); file.pFilereal = fopen (out.outfile2,"wb"); file.pFileimg = fopen (out.outfile3,"wb"); - #ifdef USE_MPI - for (int isector=0; isector<nsectors; isector++) - { - MPI_Win_lock(MPI_LOCK_SHARED,isector,0,slabwin); - MPI_Get(gridss,size_of_grid,MPI_DOUBLE,isector,0,size_of_grid,MPI_DOUBLE,slabwin); - MPI_Win_unlock(isector,slabwin); - for (long i=0; i<size_of_grid/2; i++) + + #ifdef USE_MPI + for (int isector=0; isector<nsectors; isector++) + { + MPI_Win_lock(MPI_LOCK_SHARED,isector,0,slabwin); + MPI_Get(gridss,size_of_grid,MPI_DOUBLE,isector,0,size_of_grid,MPI_DOUBLE,slabwin); + MPI_Win_unlock(isector,slabwin); + for (long i=0; i<size_of_grid/2; i++) { - gridss_real[i] = gridss[2*i]; - gridss_img[i] = gridss[2*i+1]; + gridss_real[i] = gridss[2*i]; + gridss_img[i] = gridss[2*i+1]; } - if (param.num_w_planes > 1) + if (param.num_w_planes > 1) { - for (int iw=0; iw<param.num_w_planes; iw++) - for (int iv=0; iv<yaxis; iv++) - for (int iu=0; iu<xaxis; iu++) - { - long global_index = (iu + (iv+isector*yaxis)*xaxis + iw*param.grid_size_x*param.grid_size_y)*sizeof(double); - long index = iu + iv*xaxis + iw*xaxis*yaxis; - fseek(file.pFilereal, global_index, SEEK_SET); - fwrite(&gridss_real[index], 1, sizeof(double), file.pFilereal); - } - for (int iw=0; iw<param.num_w_planes; iw++) - for (int iv=0; iv<yaxis; iv++) - for (int iu=0; iu<xaxis; iu++) - { - long global_index = (iu + (iv+isector*yaxis)*xaxis + iw*param.grid_size_x*param.grid_size_y)*sizeof(double); - long index = iu + iv*xaxis + iw*xaxis*yaxis; - fseek(file.pFileimg, global_index, SEEK_SET); - fwrite(&gridss_img[index], 1, sizeof(double), file.pFileimg); - //double v_norm = sqrt(gridss[index]*gridss[index]+gridss[index+1]*gridss[index+1]); - //fprintf (file.pFile, "%d %d %d %f %f %f\n", 
iu,isector*yaxis+iv,iw,gridss[index],gridss[index+1],v_norm); - } - + for (int iw=0; iw<param.num_w_planes; iw++) + for (int iv=0; iv<yaxis; iv++) + for (int iu=0; iu<xaxis; iu++) + { + long global_index = (iu + (iv+isector*yaxis)*xaxis + iw*param.grid_size_x*param.grid_size_y)*sizeof(double); + long index = iu + iv*xaxis + iw*xaxis*yaxis; + fseek(file.pFilereal, global_index, SEEK_SET); + fwrite(&gridss_real[index], 1, sizeof(double), file.pFilereal); + } + + for (int iw=0; iw<param.num_w_planes; iw++) + for (int iv=0; iv<yaxis; iv++) + for (int iu=0; iu<xaxis; iu++) + { + long global_index = (iu + (iv+isector*yaxis)*xaxis + iw*param.grid_size_x*param.grid_size_y)*sizeof(double); + long index = iu + iv*xaxis + iw*xaxis*yaxis; + fseek(file.pFileimg, global_index, SEEK_SET); + fwrite(&gridss_img[index], 1, sizeof(double), file.pFileimg); + //double v_norm = sqrt(gridss[index]*gridss[index]+gridss[index+1]*gridss[index+1]); + //fprintf (file.pFile, "%d %d %d %f %f %f\n", iu,isector*yaxis+iv,iw,gridss[index],gridss[index+1],v_norm); + } + } - else + else { - for (int iw=0; iw<param.num_w_planes; iw++) - { - long global_index = (xaxis*isector*yaxis + iw*param.grid_size_x*param.grid_size_y)*sizeof(double); - long index = iw*xaxis*yaxis; - fseek(file.pFilereal, global_index, SEEK_SET); - fwrite(&gridss_real[index], xaxis*yaxis, sizeof(double), file.pFilereal); - fseek(file.pFileimg, global_index, SEEK_SET); - fwrite(&gridss_img[index], xaxis*yaxis, sizeof(double), file.pFileimg); - } + for (int iw=0; iw<param.num_w_planes; iw++) + { + long global_index = (xaxis*isector*yaxis + iw*param.grid_size_x*param.grid_size_y)*sizeof(double); + long index = iw*xaxis*yaxis; + fseek(file.pFilereal, global_index, SEEK_SET); + fwrite(&gridss_real[index], xaxis*yaxis, sizeof(double), file.pFilereal); + fseek(file.pFileimg, global_index, SEEK_SET); + fwrite(&gridss_img[index], xaxis*yaxis, sizeof(double), file.pFileimg); + } } } #else - for (int iw=0; iw<param.num_w_planes; iw++) - for (int iv=0; iv<param.grid_size_y; iv++) - for (int iu=0; iu<param.grid_size_x; iu++) - { - long index = 2*(iu + iv*param.grid_size_x + iw*param.grid_size_x*param.grid_size_y); - fwrite(&gridtot[index], 1, sizeof(double), file.pFilereal); - fwrite(&gridtot[index+1], 1, sizeof(double), file.pFileimg); - //double v_norm = sqrt(gridtot[index]*gridtot[index]+gridtot[index+1]*gridtot[index+1]); - //fprintf (file.pFile, "%d %d %d %f %f %f\n", iu,iv,iw,gridtot[index],gridtot[index+1],v_norm); - } - #endif + for (int iw=0; iw<param.num_w_planes; iw++) + for (int iv=0; iv<param.grid_size_y; iv++) + for (int iu=0; iu<param.grid_size_x; iu++) + { + long index = 2*(iu + iv*param.grid_size_x + iw*param.grid_size_x*param.grid_size_y); + fwrite(&gridtot[index], 1, sizeof(double), file.pFilereal); + fwrite(&gridtot[index+1], 1, sizeof(double), file.pFileimg); + //double v_norm = sqrt(gridtot[index]*gridtot[index]+gridtot[index+1]*gridtot[index+1]); + //fprintf (file.pFile, "%d %d %d %f %f %f\n", iu,iv,iw,gridtot[index],gridtot[index+1],v_norm); + } + #endif fclose(file.pFilereal); fclose(file.pFileimg); } - - #ifdef USE_MPI - MPI_Win_fence(0,slabwin); - #endif - - #endif //WRITE_DATA - + + #ifdef USE_MPI + MPI_Win_fence(0,slabwin); + #endif + + #endif //WRITE_DATA } @@ -434,7 +445,8 @@ void reduce( int sector, int target_rank ) { int local_rank = Me.Rank[myHOST]; - + int target_rank_on_myhost = -1; + if( Me.Ranks_to_host[ target_rank ] == Me.myhost ) // exchange rank 0 with target rank // in this way the following log2 alogorithm, @@ -442,16 
+454,32 @@ void reduce( int sector, int target_rank ) // every target rank { - int r = 0; - while( Me.Ranks_to_myhost[r] != target_rank ) - r++; + target_rank_on_myhost = 0; + while( Me.Ranks_to_myhost[target_rank_on_myhost] != target_rank ) + target_rank_on_myhost++; + + dprintf(2, Me.Rank[myHOST], 0, + "[SEC %d] swapping Host master with target rank %d (%d)\n", + sector, target_rank, target_rank_on_myhost); - if( r > 0 ) - { + + if( target_rank_on_myhost > 0 ) + // the target is not the task that already has rank 0 + // on my host + { + if( local_rank == 0 ) - local_rank = r; - if( local_rank == r ) + local_rank = target_rank_on_myhost; + else if( local_rank == target_rank_on_myhost ) local_rank = 0; + + win_t temp = Me.swins[target_rank_on_myhost]; + Me.swins[target_rank_on_myhost] = Me.swins[0]; + Me.swins[0] = temp; + + temp = Me.scwins[target_rank_on_myhost]; + Me.scwins[target_rank_on_myhost] = Me.scwins[0]; + Me.scwins[0] = temp; } } @@ -466,10 +494,14 @@ void reduce( int sector, int target_rank ) for(int l = 0; l < max_level; l++) { int threshold = 1 << (1+l); - + if( local_rank % threshold == 0) - { + { int source = local_rank + (1<<l); + dprintf(2, 0, 0, + "[SEC %d] task %d (%d) getting data from task %d at level %d\n", + sector, local_rank, Me.Rank[myHOST], source, l ); + while( *(int*)(Me.scwins[source].ptr) < l ) // sleep 5 usec if the source target is not ready NSLEEP( 5000 ); @@ -481,7 +513,24 @@ void reduce( int sector, int target_rank ) *(int*)(Me.win_ctrl.ptr) = l; } else - *(int*)(Me.win_ctrl.ptr) = l; + { + dprintf(2, 0, 0, + "[SEC %d] task %d (%d) signaling that level %d is done\n", + sector, local_rank, Me.Rank[myHOST], l ); + + *(int*)(Me.win_ctrl.ptr) = l; + } + } + + if ( target_rank_on_myhost > 0 ) + { + win_t temp = Me.swins[target_rank_on_myhost]; + Me.swins[target_rank_on_myhost] = Me.swins[0]; + Me.swins[0] = temp; + + temp = Me.scwins[target_rank_on_myhost]; + Me.scwins[target_rank_on_myhost] = Me.scwins[0]; + Me.scwins[0] = temp; } return; diff --git a/init.c b/init.c index 8312a3b439f98e8d8c6faed57ded03c89e2e9c99..10dc736ff16b2ff4ef71edbb880f492f930b1877 100644 --- a/init.c +++ b/init.c @@ -7,8 +7,7 @@ void init(int index) { - clock_gettime(CLOCK_MONOTONIC, &begin0); - start0 = clock(); + TAKE_TIME_START(total); // DAV: the corresponding KernelLen is calculated within the wstack function. 
It can be anyway hardcoded for optimization dx = 1.0/(double)param.grid_size_x; @@ -37,8 +36,7 @@ void init(int index) printf("\nTask %d sees %d topology levels\n", global_rank, Me.MAXl); #endif - clock_gettime(CLOCK_MONOTONIC, &begin); - start = clock(); + TAKE_TIME_START(setup); // INPUT FILES (only the first ndatasets entries are used) strcpy(datapath,param.datapath_multi[index]); @@ -46,31 +44,25 @@ void init(int index) //Changing the output file names op_filename(); - + // Read metadata fileName(datapath, in.metafile); readMetaData(filename); - + // Local Calculation metaData_calculation(); - + // Allocate Data Buffer allocate_memory(); - + // Reading Data readData(); - - #ifdef USE_MPI - MPI_Barrier(MPI_COMM_WORLD); - #endif - - clock_gettime(CLOCK_MONOTONIC, &finish); - end = clock(); - timing.setup_time = ((double) (end - start)) / CLOCKS_PER_SEC; - timing.setup_time1 = (finish.tv_sec - begin.tv_sec); - timing.setup_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0; - + #ifdef USE_MPI + MPI_Barrier(MPI_COMM_WORLD); + #endif + + TAKE_TIME_STOP(setup); } void op_filename() { @@ -233,6 +225,11 @@ void read_parameter_file(char *fname) { strcpy(outparam.timingfile, buf2); } + if(strcmp(buf1, "verbose_level") == 0) + { + verbose_level = atoi(buf2); + } + if(param.ndatasets > 1) { @@ -257,11 +254,14 @@ void read_parameter_file(char *fname) /* Communicating the relevent parameters to the other process */ - #ifdef USE_MPI - MPI_Bcast(&in, sizeof(struct ip), MPI_BYTE, 0, MPI_COMM_WORLD); - MPI_Bcast(&outparam, sizeof(struct op), MPI_BYTE, 0, MPI_COMM_WORLD); - MPI_Bcast(&param, sizeof(struct parameter), MPI_BYTE, 0, MPI_COMM_WORLD); - #endif + #ifdef USE_MPI + double twt; + TAKE_TIMEwt(twt); + MPI_Bcast(&in, sizeof(struct ip), MPI_BYTE, 0, MPI_COMM_WORLD); + MPI_Bcast(&outparam, sizeof(struct op), MPI_BYTE, 0, MPI_COMM_WORLD); + MPI_Bcast(&param, sizeof(struct parameter), MPI_BYTE, 0, MPI_COMM_WORLD); + ADD_TIMEwt(mpi, twt); + #endif } @@ -312,7 +312,8 @@ void readMetaData(char fileLocal[1000]) { void metaData_calculation() { int nsub = 1000; - printf("Subtracting last %d measurements\n",nsub); + if( global_rank == 0 ) + printf("Subtracting last %d measurements\n",nsub); metaData.Nmeasures = metaData.Nmeasures-nsub; metaData.Nvis = metaData.Nmeasures*metaData.freq_per_chan*metaData.polarisations; diff --git a/numa.c b/numa.c index fe5867b9cbfd203f30a431a707be23193fd00e69..e880caf8b0362437a0ba10820d988daf5deb26fa 100644 --- a/numa.c +++ b/numa.c @@ -62,9 +62,11 @@ int numa_init( int Rank, int Size, MPI_Comm *MYWORLD, map_t *Me ) // at my shared-memory level // for( int t = 0; t < Me->Ntasks[SHMEMl]; t++ ) - if( t != Me->Rank[SHMEMl] ) - MPI_Win_shared_query( Me->win_ctrl.win, t, &(Me->scwins[t].size), - &(Me->scwins[t].disp), &(Me->scwins[t].ptr) ); + { + //if( t != Me->Rank[SHMEMl] ) + MPI_Win_shared_query( Me->win_ctrl.win, t, &(Me->scwins[t].size), + &(Me->scwins[t].disp), &(Me->scwins[t].ptr) ); + } if( Me->Rank[SHMEMl] != 0 ) MPI_Win_shared_query( win_ctrl_hostmaster, 0, &(win_ctrl_hostmaster_size), diff --git a/result.c b/result.c index 15d310062a2db517902525bf3ac47cb351fe5bc5..7f20ffbd88fbb6d0a5cf2d2acfe01262f3a6f607 100644 --- a/result.c +++ b/result.c @@ -2,51 +2,64 @@ #include "allvars.h" #include "proto.h" -void write_result() { - - end = clock(); - clock_gettime(CLOCK_MONOTONIC, &finish); - timing.tot_time = ((double) (end - start0)) / CLOCKS_PER_SEC; - timing.tot_time1 = (finish.tv_sec - begin0.tv_sec); - timing.tot_time1 += (finish.tv_nsec - begin0.tv_nsec) / 1000000000.0; - - if 
(global_rank == 0) - { - printf("Setup time: %f sec\n",timing.setup_time); - printf("Process time: %f sec\n",timing.process_time); - printf("Kernel time = %f, Array Composition time %f, Reduce time: %f sec\n",timing.kernel_time,timing.compose_time,timing.reduce_time); - #ifdef USE_FFTW - printf("FFTW time: %f sec\n",timing.fftw_time); - printf("Phase time: %f sec\n",timing.phase_time); - #endif - printf("TOT time: %f sec\n",timing.tot_time); - if(param.num_threads > 1) - { - printf("PSetup time: %f sec\n",timing.setup_time1); - printf("PProcess time: %f sec\n",timing.process_time1); - printf("PKernel time = %f, PArray Composition time %f, PReduce time: %f sec\n",timing.kernel_time1,timing.compose_time1,timing.reduce_time1); - #ifdef USE_FFTW - printf("PFFTW time: %f sec\n",timing.fftw_time1); - printf("PPhase time: %f sec\n",timing.phase_time1); - #endif - printf("PTOT time: %f sec\n",timing.tot_time1); - } - } +void write_result() +{ - if (global_rank == 0) - { - file.pFile = fopen (out.timingfile,"w"); - if (param.num_threads == 1) - { - fprintf(file.pFile, "%f %f %f %f %f %f %f\n",timing.setup_time,timing.kernel_time,timing.compose_time,timing.reduce_time,timing.fftw_time,timing.phase_time,timing.tot_time); - } else { - fprintf(file.pFile, "%f %f %f %f %f %f %f\n",timing.setup_time1,timing.kernel_time1,timing.compose_time1,timing.reduce_time1,timing.fftw_time1,timing.phase_time1,timing.tot_time1); - } - fclose(file.pFile); - } - - #ifdef USE_MPI - MPI_Win_fence(0,slabwin); - MPI_Win_free(&slabwin); - #endif + TAKE_TIME_STOP( total ); + + if (global_rank == 0) + { + printf("%14s time : %f sec\n", "Setup", wt_timing.setup); + printf("%14s time : %f sec\n", "Process", wt_timing.process); + printf("%14s time : %f sec\n", "Reduce", wt_timing.reduce); + #if defined(USE_MPI) + #if defined(ONE_SIDE) + printf("%14s time : %f sec\n", "Reduce sh", wt_timing.reduce_sh); + printf("%14s time : %f sec\n", "Mmove", wt_timing.mmove); + printf("%14s time : %f sec\n", "ReduceMPI", wt_timing.reduce_mpi); + #endif + printf("%14s time : %f sec\n", "MPI", wt_timing.mpi); + #endif + printf("%10s Kernel time = %f, Array Composition time %f, Reduce time: %f sec\n", "", + wt_timing.kernel,wt_timing.compose,wt_timing.reduce); + #ifdef USE_FFTW + printf("%14s time : %f sec\n", "FFTW", wt_timing.fftw); + printf("%14s time : %f sec\n", "Phase",wt_timing.phase); + #endif + printf("%14s time : %f sec\n\n", "TOTAL", wt_timing.total); + + if(param.num_threads > 1) + { + printf("%14s time : %f sec\n", "PSetup", pr_timing.setup); + printf("%14s time : %f sec\n", "PProcess", pr_timing.process); + printf("%10s PKernel time = %f, PArray Composition time %f, PReduce time: %f sec\n", "", + pr_timing.kernel,pr_timing.compose,pr_timing.reduce); + #ifdef USE_FFTW + printf("%14s time : %f sec\n", "PFFTW", pr_timing.fftw); + printf("%14s time : %f sec\n", "PPhase", pr_timing.phase); + #endif + printf("%14s time : %f sec\n", "PTOTAL", pr_timing.total); + } + } + + if (global_rank == 0) + { + file.pFile = fopen (out.timingfile,"w"); + if (param.num_threads == 1) + { + fprintf(file.pFile, "%f %f %f %f %f %f %f\n", + wt_timing.setup, wt_timing.kernel, wt_timing.compose, + wt_timing.reduce,wt_timing.fftw,wt_timing.phase, wt_timing.total); + } else { + fprintf(file.pFile, "%f %f %f %f %f %f %f\n", + pr_timing.setup, pr_timing.kernel, pr_timing.compose, + pr_timing.reduce,pr_timing.fftw,pr_timing.phase, pr_timing.total); + } + fclose(file.pFile); + } + + #ifdef USE_MPI + MPI_Win_fence(0,slabwin); + MPI_Win_free(&slabwin); + #endif }
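
Note for reviewers: the self-contained sketch below shows, outside of the imaging code, the double bookkeeping that this patch introduces in allvars.h — one timing_t accumulator filled from CLOCK_REALTIME (wall-clock) and one from CLOCK_PROCESS_CPUTIME_ID (process CPU). It is an illustration of the pattern, not code from the repository: timing_t is trimmed to three fields, busy_work() is a made-up stand-in for wstack(), the file name and build command are hypothetical, and the statement-expression macros are a GCC/Clang extension exactly as in the patch.

/* timing_sketch.c -- minimal, single-process illustration of the
 * TAKE_TIME_START/STOP and TAKE_TIME/ADD_TIME pattern added by this patch.
 * Hypothetical build: cc -O2 timing_sketch.c -o timing_sketch (POSIX clock_gettime assumed). */
#include <stdio.h>
#include <time.h>

typedef struct { double process, kernel, total; } timing_t;

static timing_t wt_timing = {0};   /* wall-clock accumulators  */
static timing_t pr_timing = {0};   /* process-CPU accumulators */

#define CPU_TIME_wt ({ struct timespec ts; clock_gettime(CLOCK_REALTIME, &ts); \
                       (double)ts.tv_sec + (double)ts.tv_nsec * 1e-9; })
#define CPU_TIME_pr ({ struct timespec ts; clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts); \
                       (double)ts.tv_sec + (double)ts.tv_nsec * 1e-9; })

#define TAKE_TIME_START(T) { wt_timing.T = CPU_TIME_wt; pr_timing.T = CPU_TIME_pr; }
#define TAKE_TIME_STOP(T)  { pr_timing.T = CPU_TIME_pr - pr_timing.T; \
                             wt_timing.T = CPU_TIME_wt - wt_timing.T; }
#define TAKE_TIME(Twt, Tpr)   { (Twt) = CPU_TIME_wt; (Tpr) = CPU_TIME_pr; }
#define ADD_TIME(T, Twt, Tpr) { pr_timing.T += CPU_TIME_pr - (Tpr); \
                                wt_timing.T += CPU_TIME_wt - (Twt); \
                                (Tpr) = CPU_TIME_pr; (Twt) = CPU_TIME_wt; }

/* made-up workload standing in for a gridding kernel call */
static double busy_work(long n)
{
    double s = 0.0;
    for (long i = 1; i <= n; i++) s += 1.0 / (double)i;
    return s;
}

int main(void)
{
    double sink = 0.0, twt, tpr;

    TAKE_TIME_START(total);
    TAKE_TIME_START(process);

    for (int sector = 0; sector < 4; sector++)   /* stands in for the sector loop */
    {
        TAKE_TIME(twt, tpr);
        sink += busy_work(20000000);
        ADD_TIME(kernel, twt, tpr);              /* accumulate per-sector kernel time */
    }

    TAKE_TIME_STOP(process);
    TAKE_TIME_STOP(total);

    printf("checksum %g\n", sink);
    printf("process: wall %.3f s, cpu %.3f s\n", wt_timing.process, pr_timing.process);
    printf("kernel : wall %.3f s, cpu %.3f s\n", wt_timing.kernel,  pr_timing.kernel);
    printf("total  : wall %.3f s, cpu %.3f s\n", wt_timing.total,   pr_timing.total);
    return 0;
}

When threads do the work, the two columns diverge (accumulated CPU time exceeds wall time), which is why write_result() prints the pr_timing block only when param.num_threads > 1.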
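The reworked reduce() in gridding.c keeps the log2 pairing of the previous version: at level l, every task whose intra-host rank is a multiple of 2^(l+1) accumulates the slab of the task 2^l above it, after the target rank has been swapped into position 0. The toy program below runs that pairing on plain arrays in one process so the indexing is easy to check; the shared-memory windows, the busy-wait on the control window and the rank/window swap of the real code are deliberately left out, and NTASKS/NELEM are arbitrary illustration values.

/* reduce_sketch.c -- toy version of the intra-host log2 reduction pattern. */
#include <stdio.h>

#define NTASKS 8    /* "tasks" on the host (power of two here) */
#define NELEM  4    /* elements per slab                       */

int main(void)
{
    double buf[NTASKS][NELEM];

    /* each "task" owns a slab; task r holds the value r+1 everywhere */
    for (int r = 0; r < NTASKS; r++)
        for (int i = 0; i < NELEM; i++)
            buf[r][i] = (double)(r + 1);

    int max_level = 0;                        /* ceil(log2(NTASKS)) levels */
    while ((1 << max_level) < NTASKS)
        max_level++;

    for (int l = 0; l < max_level; l++)
        for (int rank = 0; rank < NTASKS; rank++)
            if (rank % (1 << (l + 1)) == 0)   /* this task keeps accumulating */
            {
                int source = rank + (1 << l); /* partner one step above       */
                if (source < NTASKS)
                    for (int i = 0; i < NELEM; i++)
                        buf[rank][i] += buf[source][i];
            }

    /* after the last level, slot 0 holds the sum over all tasks */
    printf("reduced value %g, expected %g\n",
           buf[0][0], (double)NTASKS * (NTASKS + 1) / 2.0);
    return 0;
}

In the patched reduce() the same condition local_rank % (1 << (1+l)) == 0 selects the accumulating task and source = local_rank + (1<<l) selects its partner; the additions in this patch are the dprintf trace lines and the swap (and swap-back) of the swins/scwins window descriptors when the target rank is not already local rank 0.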