diff --git a/.gitignore b/.gitignore
index 2f2b2deae59f09c00e894b90488f0ee3ecb200a2..8edc32e8a54a246064c56e86b61e7ea0eccb1724 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,3 +8,4 @@ w-stackingfftw
 w-stackingCfftw_serial
 w-stackingfftw_serial
 inverse-imaging
+.*
diff --git a/Build/Makefile.M100 b/Build/Makefile.M100
index 737d24294f64606a97860290efd98ece981a1d9c..a1dde127926069cda6b36834d3241ffc7790f0dc 100644
--- a/Build/Makefile.M100
+++ b/Build/Makefile.M100
@@ -13,7 +13,7 @@ NVCC = nvcc
 NVFLAGS = -arch=sm_70 -Xcompiler -mno-float128 -std=c++11
 NVLIB = -L/cineca/prod/opt/compilers/cuda/10.1/none/lib64/ -lcudart -lcuda
 
-OMP= -fopenmp
+OMP= #-fopenmp
 
 CFLAGS +=  -I. $(FFTW_INCL) $(GSL_INCL) $(MPI_INCL)
 
diff --git a/Makefile b/Makefile
index 0a8527d5b6f6a56e44f6bc79615afb8ce15f0575..b8257839d7d2c666afbdf455f35cc389c11c3437 100644
--- a/Makefile
+++ b/Makefile
@@ -29,7 +29,7 @@ endif
 # perform one-side communication (suggested) instead of reduce (only if MPI is active)
 OPT += -DONE_SIDE
 # write the full 3D cube of gridded visibilities and its FFT transform
-#OPT += -DWRITE_DATA
+OPT += -DWRITE_DATA
 # write the final image
 OPT += -DWRITE_IMAGE
 # perform w-stacking phase correction
@@ -71,8 +71,8 @@ serial_cuda:
 	$(CXX) $(OPTIMIZE) $(OPT) -o w-stackingfftw_serial w-stacking-fftw.o w-stacking.o phase_correction.o $(CFLAGS) $(NVLIB) -lm
 
 mpi: $(COBJ)
-	$(MPICC) $(OPTIMIZE) -o w-stackingCfftw   $^  $(CFLAGS) $(LIBS)
-	$(MPICC) $(OPTIMIZE) -o inverse-imaging inverse-imaging.c w-stacking.c $(CFLAGS) $(LIBS)
+	$(MPICC) $(OPTIMIZE) $(OPT) -o w-stackingCfftw   $^  $(CFLAGS) $(LIBS)
+	$(MPICC) $(OPTIMIZE) $(OPT) -o inverse-imaging inverse-imaging.c w-stacking.c $(CFLAGS) $(LIBS)
 
 mpi_cuda:
 	$(NVCC)   $(NVFLAGS) -c w-stacking.cu phase_correction.cu $(NVLIB)
diff --git a/inverse-imaging.c b/inverse-imaging.c
index 2a10345487ef06c191eee3ea001f38f355ed3fde..a3dfbbeaebb15f3b9c0c0050a56e2013c1087674 100644
--- a/inverse-imaging.c
+++ b/inverse-imaging.c
@@ -201,6 +201,7 @@ if(rank == 0){
 	// LOCAL grid size
 	xaxis = local_grid_size_x;
 	yaxis = local_grid_size_y;
+        double shift = (double)(dx*yaxis);
 
 	clock_gettime(CLOCK_MONOTONIC, &begin);
 	start = clock();
@@ -232,6 +233,12 @@ if(rank == 0){
 	fclose(pFile);
 
 
+        // resolution in radians and arcsec; uvmin/uvmax are doubles, so |.| is taken in floating point (integer abs() would truncate)
+        double uv_abs_max = MAX((uvmin < 0.0 ? -uvmin : uvmin), (uvmax < 0.0 ? -uvmax : uvmax));
+        resolution = 1.0/uv_abs_max;
+        double resolution_asec = (3600.0*180.0)/uv_abs_max/PI;
+        printf("RESOLUTION = %f rad, %f arcsec\n", resolution, resolution_asec);
+
 	// WATCH THIS!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 	int nsub = 1000;
 	//int nsub = 10;
@@ -253,8 +260,8 @@ if(rank == 0){
 	long nm_pe = (long)(Nmeasures/size);
 	long remaining = Nmeasures%size;
 
-  long startrow = rank*nm_pe;
-  if (rank == size-1)nm_pe = nm_pe+remaining;
+  	long startrow = rank*nm_pe;
+  	if (rank == size-1)nm_pe = nm_pe+remaining;
 
 	long Nmeasures_tot = Nmeasures;
 	Nmeasures = nm_pe;
@@ -280,7 +287,7 @@ if(rank == 0){
 
         // reading baselines
 
-  if(rank == 0)printf("READING DATA\n");
+        if(rank == 0)printf("READING DATA\n");
 	// Read data
 	strcpy(filename,datapath);
 	strcat(filename,ufile);
@@ -295,7 +302,7 @@ if(rank == 0){
 	strcat(filename,vfile);
 	//printf("Reading %s\n",filename);
 
-  pFile = fopen (filename,"rb");
+        pFile = fopen (filename,"rb");
 	fseek (pFile,startrow*sizeof(double),SEEK_SET);
 	fread(vv,Nmeasures*sizeof(double),1,pFile);
 	fclose(pFile);
@@ -332,15 +339,163 @@ if(rank == 0){
         pFilereal = fopen (filename,"rb");	
         long global_index = rank*(xaxis*yaxis)*sizeof(double);
         fseek(pFilereal, global_index, SEEK_SET);
-	fwrite(image_real, xaxis*yaxis, sizeof(double), pFilereal);
+	fread(image_real, xaxis*yaxis, sizeof(double), pFilereal);
         fclose(pFilereal);
 	// image read
+	// Images are read as raw binary; support for FITS files could easily be added
 	
-  if(rank == 0)printf("FFT TRANSFORMING (from Real to Complex Fourier space)\n");
+        if(rank == 0)printf("FFT TRANSFORMING (from Real to Complex Fourier space)\n");
          
+        fftw_plan plan;
+        ptrdiff_t alloc_local, local_n0, local_0_start;
+	// FFTW real (double) input array, size = local_n0 x 2*(Nx/2+1) (NOTE(review): the fill loop below indexes rows with stride xaxis, not 2*(Nx/2+1) - confirm)
+	double *fftwimage;
+	// FFTW complex output array, size = local_n0 x (Nx/2+1) complex values
+	fftw_complex *fftwgrid;
+
+        alloc_local = fftw_mpi_local_size_2d(grid_size_y, grid_size_x, MPI_COMM_WORLD, &local_n0, &local_0_start);
+        fftwimage = fftw_alloc_real(2 * alloc_local);
+        fftwgrid = fftw_alloc_complex(alloc_local);
+
+	// Create plan for r2c DFT
+	//plan = fftw_mpi_plan_dft_r2c_2d(grid_size_y, grid_size_x, fftwimage, fftwgrid, MPI_COMM_WORLD, FFTW_FORWARD, FFTW_ESTIMATE); 
+	plan = fftw_mpi_plan_dft_r2c_2d(grid_size_y, grid_size_x, fftwimage, fftwgrid, MPI_COMM_WORLD, FFTW_ESTIMATE); 
+
+        // Initialize FFTW arrays
+        long fftwindex = 0;
+        long fftwindex2D = 0;
+	for (long iv = 0; iv < local_n0; iv++)
+	for (long iu = 0; iu < xaxis; iu++)
+        {
+            fftwindex2D = iu + iv*xaxis;
+	    fftwimage[fftwindex2D] = image_real[fftwindex2D];
+	    fftwgrid[fftwindex2D][0] = 0.0;
+	    fftwgrid[fftwindex2D][1] = 0.0;
+	}
+
+        // Perform the FFT
+        fftw_execute(plan);
+        // Finalize the FFT
+	fftw_destroy_plan(plan);
+
+        // Create the grid array 
+	double * grid;
+	long size_of_grid = 2*num_w_planes*xaxis*yaxis;
+        grid = (double*) calloc(size_of_grid,sizeof(double));
+        for (int iw=0; iw<num_w_planes; iw++)
+        {
+            for (int iv=0; iv<yaxis; iv++)
+            {
+               for (int iu=0; iu<xaxis; iu++)
+               {
+                   fftwindex2D = iu + iv*xaxis;
+                   fftwindex = 2*(fftwindex2D + iw*xaxis*yaxis);
+                   grid[fftwindex] = fftwgrid[fftwindex2D][0];
+                   grid[fftwindex+1] = fftwgrid[fftwindex2D][1];
+               }
+            }
+        }
+	fftw_free(fftwgrid);
+	fftw_free(fftwimage);
+
+	// TODO: check whether the sector-grid setup below can be moved inside the WRITE_DATA section
+        // Create sector grid
+        double * gridss;
+        double * gridss_w;
+        double * gridss_real;
+        double * gridss_img;
+        size_of_grid = 2*num_w_planes*xaxis*yaxis;
+        gridss = (double*) calloc(size_of_grid,sizeof(double));
+        gridss_w = (double*) calloc(size_of_grid,sizeof(double));
+        gridss_real = (double*) calloc(size_of_grid/2,sizeof(double));
+        gridss_img = (double*) calloc(size_of_grid/2,sizeof(double));
+        // Create temporary global grid
+#ifndef USE_MPI
+        double * gridtot = (double*) calloc(2*grid_size_x*grid_size_y*num_w_planes,sizeof(double));
+#endif
+
+        // Open the MPI Memory Window for the slab
+#ifdef USE_MPI
+        MPI_Win slabwin;
+        MPI_Win_create(grid, size_of_grid*sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &slabwin);
+        MPI_Win_fence(0,slabwin);
+#endif
+
+#ifdef WRITE_DATA
+        // Write results
+        if (rank == 0)
+        {
+          printf("WRITING GRIDDED DATA\n");
+          pFilereal = fopen (outfile2,"wb");
+          pFileimg = fopen (outfile3,"wb");
+          #ifdef USE_MPI
+          for (int isector=0; isector<nsectors; isector++)
+          {
+              MPI_Win_lock(MPI_LOCK_SHARED,isector,0,slabwin);
+              MPI_Get(gridss,size_of_grid,MPI_DOUBLE,isector,0,size_of_grid,MPI_DOUBLE,slabwin);
+              MPI_Win_unlock(isector,slabwin);
+              for (long i=0; i<size_of_grid/2; i++)
+              {
+                      gridss_real[i] = gridss[2*i];
+                      gridss_img[i] = gridss[2*i+1];
+              }
+              if (num_w_planes > 1)
+              {
+                for (int iw=0; iw<num_w_planes; iw++)
+                for (int iv=0; iv<yaxis; iv++)
+                for (int iu=0; iu<xaxis; iu++)
+                {
+                          long global_index = (iu + (iv+isector*yaxis)*xaxis + iw*grid_size_x*grid_size_y)*sizeof(double);
+                          long index = iu + iv*xaxis + iw*xaxis*yaxis;
+                          fseek(pFilereal, global_index, SEEK_SET);
+                          fwrite(&gridss_real[index], 1, sizeof(double), pFilereal);
+                }
+                for (int iw=0; iw<num_w_planes; iw++)
+                for (int iv=0; iv<yaxis; iv++)
+                for (int iu=0; iu<xaxis; iu++)
+                {
+                          long global_index = (iu + (iv+isector*yaxis)*xaxis + iw*grid_size_x*grid_size_y)*sizeof(double);
+                          long index = iu + iv*xaxis + iw*xaxis*yaxis;
+                          fseek(pFileimg, global_index, SEEK_SET);
+                          fwrite(&gridss_img[index], 1, sizeof(double), pFileimg);
+                          //double v_norm = sqrt(gridss[index]*gridss[index]+gridss[index+1]*gridss[index+1]);
+                          //fprintf (pFile, "%d %d %d %f %f %f\n", iu,isector*yaxis+iv,iw,gridss[index],gridss[index+1],v_norm);
+                }
+
+              } else {
+                for (int iw=0; iw<num_w_planes; iw++)
+                {
+                          long global_index = (xaxis*isector*yaxis + iw*grid_size_x*grid_size_y)*sizeof(double);
+                          long index = iw*xaxis*yaxis;
+                          fseek(pFilereal, global_index, SEEK_SET);
+                          fwrite(&gridss_real[index], xaxis*yaxis, sizeof(double), pFilereal);
+                          fseek(pFileimg, global_index, SEEK_SET);
+                          fwrite(&gridss_img[index], xaxis*yaxis, sizeof(double), pFileimg);
+                }
+              }
+          }
+          #else
+          for (int iw=0; iw<num_w_planes; iw++)
+            for (int iv=0; iv<grid_size_y; iv++)
+               for (int iu=0; iu<grid_size_x; iu++)
+               {
+                          long index = 2*(iu + iv*grid_size_x + iw*grid_size_x*grid_size_y);
+                          fwrite(&gridtot[index], 1, sizeof(double), pFilereal);
+                          fwrite(&gridtot[index+1], 1, sizeof(double), pFileimg);
+                          //double v_norm = sqrt(gridtot[index]*gridtot[index]+gridtot[index+1]*gridtot[index+1]);
+                          //fprintf (pFile, "%d %d %d %f %f %f\n", iu,iv,iw,gridtot[index],gridtot[index+1],v_norm);
+               }
+          #endif
+          fclose(pFilereal);
+          fclose(pFileimg);
+        }
 
+        #ifdef USE_MPI
+        MPI_Win_fence(0,slabwin);
+        #endif
+#endif //WRITE_DATA
 
-  if(rank == 0)printf("DEGRIDDING DATA\n");
+        if(rank == 0)printf("CREATING LINKED LISTS\n");
 
 	// Create histograms and linked lists
         clock_gettime(CLOCK_MONOTONIC, &begin);
@@ -362,20 +517,22 @@ if(rank == 0){
 	double uuh,vvh;
 	for (long iphi = 0; iphi < Nmeasures; iphi++)
 	{
-     boundary[iphi] = -1;
+           boundary[iphi] = -1;
 	   uuh = uu[iphi];
 	   vvh = vv[iphi];
 	   int binphi = (int)(vvh*nsectors);
-	   // check if the point influence also neighboring slabs
+	   // check if the point also influences neighboring slabs
 	   double updist = (double)((binphi+1)*yaxis)*dx - vvh;
 	   double downdist = vvh - (double)(binphi*yaxis)*dx;
 	   //
 	   histo_send[binphi]++;
-     Push(&sectorhead[binphi],iphi);
-     if(updist < w_supporth && updist >= 0.0) {histo_send[binphi+1]++; boundary[iphi] = binphi+1; Push(&sectorhead[binphi+1],iphi);};
+           Push(&sectorhead[binphi],iphi);
+           if(updist < w_supporth && updist >= 0.0) {histo_send[binphi+1]++; boundary[iphi] = binphi+1; Push(&sectorhead[binphi+1],iphi);};
 	   if(downdist < w_supporth && binphi > 0 && downdist >= 0.0) {histo_send[binphi-1]++; boundary[iphi] = binphi-1; Push(&sectorhead[binphi-1],iphi);};
 	}
 
+	// Linked lists and communication histograms created
+
 #ifdef PIPPO
 	struct sectorlist * current;
 	long iiii = 0;
@@ -393,39 +550,9 @@ if(rank == 0){
 #endif
 
 #ifdef VERBOSE
-  for (int iii=0; iii<nsectors+1; iii++)printf("HISTO %d %d %ld\n",rank, iii, histo_send[iii]);
+        for (int iii=0; iii<nsectors+1; iii++)printf("HISTO %d %d %ld\n",rank, iii, histo_send[iii]);
 #endif
 
-// Create sector grid
-  double * gridss;
-  double * gridss_w;
-  double * gridss_real;
-  double * gridss_img;
-  double * grid;
-  long size_of_grid;
-  size_of_grid = 2*num_w_planes*xaxis*yaxis;
-  gridss = (double*) calloc(size_of_grid,sizeof(double));
-  gridss_w = (double*) calloc(size_of_grid,sizeof(double));
-  gridss_real = (double*) calloc(size_of_grid/2,sizeof(double));
-  gridss_img = (double*) calloc(size_of_grid/2,sizeof(double));
-  // Create destination slab
-  grid = (double*) calloc(size_of_grid,sizeof(double));
-  // Create temporary global grid
-#ifndef USE_MPI
-  double * gridtot = (double*) calloc(2*grid_size_x*grid_size_y*num_w_planes,sizeof(double));
-#endif
-  double shift = (double)(dx*yaxis);
-  // Open the MPI Memory Window for the slab
-#ifdef USE_MPI
-  MPI_Win slabwin;
-  MPI_Win_create(grid, size_of_grid*sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &slabwin);
-  MPI_Win_fence(0,slabwin);
-#endif
-#ifndef USE_MPI
-  pFile1 = fopen (outfile1,"w");
-#endif
-
-
 	// loop over files
 	//
 	kernel_time = 0.0;
@@ -435,75 +562,24 @@ if(rank == 0){
 	compose_time = 0.0;
 	compose_time1 = 0.0;
 
-	// MAIN LOOP OVER FILES
+	// MAIN LOOP OVER FILES == FREQUENCIES
 	//
 	for (int ifiles=0; ifiles<ndatasets; ifiles++)
 	{
 	strcpy(filename,datapath_multi[ifiles]);
         printf("Processing %s, %d of %d\n",filename,ifiles+1,ndatasets);
 
-        // Read metadata
-        strcpy(filename,datapath);
-        strcat(filename,metafile);
-        pFile = fopen (filename,"r");
-        fscanf(pFile,"%ld",&Nmeasures0);
-        fscanf(pFile,"%ld",&Nvis0);
-        fscanf(pFile,"%ld",&freq_per_chan0);
-        fscanf(pFile,"%ld",&polarisations0);
-        fscanf(pFile,"%ld",&Ntimes0);
-        fscanf(pFile,"%lf",&dt0);
-        fscanf(pFile,"%lf",&thours0);
-        fscanf(pFile,"%ld",&baselines0);
-        fscanf(pFile,"%lf",&uvmin);
-        fscanf(pFile,"%lf",&uvmax);
-        fscanf(pFile,"%lf",&wmin0);
-        fscanf(pFile,"%lf",&wmax0);
-        fclose(pFile);
-
-        // calculate the resolution in radians
-        resolution = 1.0/MAX(abs(uvmin),abs(uvmax));
-        // calculate the resolution in arcsec
-        double resolution_asec = (3600.0*180.0)/MAX(abs(uvmin),abs(uvmax))/PI;
-        printf("RESOLUTION = %f rad, %f arcsec\n", resolution, resolution_asec);
-
-        strcpy(filename,datapath);
-        strcat(filename,weightsfile);
-        pFile = fopen (filename,"rb");
-        fseek (pFile,startrow*polarisations*sizeof(float),SEEK_SET);
-        fread(weights,(Nweights)*sizeof(float),1,pFile);
-        fclose(pFile);
-
-        strcpy(filename,datapath);
-        strcat(filename,visrealfile);
-
-  pFile = fopen (filename,"rb");
-  fseek (pFile,startrow*freq_per_chan*polarisations*sizeof(float),SEEK_SET);
-  fread(visreal,Nvis*sizeof(float),1,pFile);
-  fclose(pFile);
-  strcpy(filename,datapath);
-  strcat(filename,visimgfile);
-#ifdef VERBOSE
-  printf("Reading %s\n",filename);
-#endif
-  pFile = fopen (filename,"rb");
-  fseek (pFile,startrow*freq_per_chan*polarisations*sizeof(float),SEEK_SET);
-  fread(visimg,Nvis*sizeof(float),1,pFile);
-  fclose(pFile);
-
-#ifdef USE_MPI
-  MPI_Barrier(MPI_COMM_WORLD);
-#endif
-  // Declare temporary arrays for the masking
-  double * uus;
-  double * vvs;
-  double * wws;
-  float * visreals;
-  float * visimgs;
-  float * weightss;
+        // Declare temporary arrays for the masking
+  	double * uus;
+  	double * vvs;
+  	double * wws;
+  	float * visreals;
+  	float * visimgs;
+  	float * weightss;
 	long isector;
 
-  for (long isector_count=0; isector_count<nsectors; isector_count++)
-      {
+  	for (long isector_count=0; isector_count<nsectors; isector_count++)
+      	{
         clock_gettime(CLOCK_MONOTONIC, &begink);
         startk = clock();
         // define local destination sector
@@ -511,27 +587,24 @@ if(rank == 0){
         isector = isector_count;
 	  // allocate sector arrays
         long Nsec = histo_send[isector];
-	      uus = (double*) malloc(Nsec*sizeof(double));
-	      vvs = (double*) malloc(Nsec*sizeof(double));
-	      wws = (double*) malloc(Nsec*sizeof(double));
-	      long Nweightss = Nsec*polarisations;
-	      long Nvissec = Nweightss*freq_per_chan;
-	      weightss = (float*) malloc(Nweightss*sizeof(float));
-	      visreals = (float*) malloc(Nvissec*sizeof(float));
-	      visimgs = (float*) malloc(Nvissec*sizeof(float));
-
-	  // select data for this sector
+	uus = (double*) malloc(Nsec*sizeof(double));
+	vvs = (double*) malloc(Nsec*sizeof(double));
+	wws = (double*) malloc(Nsec*sizeof(double));
+	long Nweightss = Nsec*polarisations;
+	long Nvissec = Nweightss*freq_per_chan;
+	weightss = (float*) malloc(Nweightss*sizeof(float));
+	visreals = (float*) malloc(Nvissec*sizeof(float));
+	visimgs = (float*) malloc(Nvissec*sizeof(float));
+
+	// select data for this sector
         long icount = 0;
-	      long ip = 0;
-	      long inu = 0;
-//CLAAAA
-	       struct sectorlist * current;
-	       current = sectorhead[isector];
-
-
+	long ip = 0;
+	long inu = 0;
+	struct sectorlist * current;
+	current = sectorhead[isector];
 
-	       while (current->index != -1)
-          {
+	while (current->index != -1)
+        {
              long ilocal = current->index;
       	     //double vvh = vv[ilocal];
              //int binphi = (int)(vvh*nsectors);
@@ -540,36 +613,35 @@ if(rank == 0){
 	           vvs[icount] = vv[ilocal]-isector*shift;
 	           wws[icount] = ww[ilocal];
              for (long ipol=0; ipol<polarisations; ipol++)
-                {
-		              weightss[ip] = weights[ilocal*polarisations+ipol];
+             {
+	          weightss[ip] = weights[ilocal*polarisations+ipol];
                   ip++;
-                }
-             for (long ifreq=0; ifreq<polarisations*freq_per_chan; ifreq++)
-                {
-	                 visreals[inu] = visreal[ilocal*polarisations*freq_per_chan+ifreq];
-	                 visimgs[inu] = visimg[ilocal*polarisations*freq_per_chan+ifreq];
-		               //if(visimgs[inu]>1e10 || visimgs[inu]<-1e10)printf("%f %f %ld %ld %d %ld %ld\n",visreals[inu],visimgs[inu],inu,Nvissec,rank,ilocal*polarisations*freq_per_chan+ifreq,Nvis);
-	                 inu++;
-	              }
-	           icount++;
-	     //}
-	            current = current->next;
              }
+             for (long ifreq=0; ifreq<polarisations*freq_per_chan; ifreq++)
+             {
+	          visreals[inu] = visreal[ilocal*polarisations*freq_per_chan+ifreq];
+	          visimgs[inu] = visimg[ilocal*polarisations*freq_per_chan+ifreq];
+	          //if(visimgs[inu]>1e10 || visimgs[inu]<-1e10)printf("%f %f %ld %ld %d %ld %ld\n",visreals[inu],visimgs[inu],inu,Nvissec,rank,ilocal*polarisations*freq_per_chan+ifreq,Nvis);
+	          inu++;
+	     }
+	     icount++;
+	     current = current->next;
+        }
 
-	  clock_gettime(CLOCK_MONOTONIC, &finishk);
-	  endk = clock();
-	  compose_time += ((double) (endk - startk)) / CLOCKS_PER_SEC;
-	  compose_time1 += (finishk.tv_sec - begink.tv_sec);
-	  compose_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0;
+	clock_gettime(CLOCK_MONOTONIC, &finishk);
+	endk = clock();
+	compose_time += ((double) (endk - startk)) / CLOCKS_PER_SEC;
+	compose_time1 += (finishk.tv_sec - begink.tv_sec);
+	compose_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0;
 
     #ifndef USE_MPI
-	  double uumin = 1e20;
-	  double vvmin = 1e20;
-	  double uumax = -1e20;
-	  double vvmax = -1e20;
+	double uumin = 1e20;
+	double vvmin = 1e20;
+	double uumax = -1e20;
+	double vvmax = -1e20;
 
-          for (long ipart=0; ipart<Nsec; ipart++)
-          {
+        for (long ipart=0; ipart<Nsec; ipart++)
+        {
 	       uumin = MIN(uumin,uus[ipart]);
 	       uumax = MAX(uumax,uus[ipart]);
 	       vvmin = MIN(vvmin,vvs[ipart]);
@@ -577,19 +649,19 @@ if(rank == 0){
 
 
                if(ipart%10 == 0)fprintf (pFile, "%ld %f %f %f\n",isector,uus[ipart],vvs[ipart]+isector*shift,wws[ipart]);
-          }
+        }
 
-	  printf("UU, VV, min, max = %f %f %f %f\n", uumin, uumax, vvmin, vvmax);
-          #endif
+	printf("UU, VV, min, max = %f %f %f %f\n", uumin, uumax, vvmin, vvmax);
+        #endif
 
-          // Make convolution on the grid
-          #ifdef VERBOSE
-	  printf("Processing sector %ld\n",isector);
-	  #endif
-          clock_gettime(CLOCK_MONOTONIC, &begink);
-          startk = clock();
+        // Make convolution on the grid
+        #ifdef VERBOSE
+	printf("Processing sector %ld\n",isector);
+	#endif
+        clock_gettime(CLOCK_MONOTONIC, &begink);
+        startk = clock();
 
-          wstack(num_w_planes,
+        wstack(num_w_planes,
                Nsec,
                freq_per_chan,
                polarisations,
@@ -602,8 +674,8 @@ if(rank == 0){
                dx,
                dw,
                w_support,
-	             xaxis,
-	             yaxis,
+	       xaxis,
+	       yaxis,
                gridss,
                num_threads);
 
@@ -616,56 +688,54 @@ if(rank == 0){
   z = x + test_i_gpu;
 }*/
 
+	clock_gettime(CLOCK_MONOTONIC, &finishk);
+	endk = clock();
+	kernel_time += ((double) (endk - startk)) / CLOCKS_PER_SEC;
+	kernel_time1 += (finishk.tv_sec - begink.tv_sec);
+	kernel_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0;
+        #ifdef VERBOSE
+	printf("Processed sector %ld\n",isector);
+        #endif
+        clock_gettime(CLOCK_MONOTONIC, &begink);
+        startk = clock();
 
+        //for (long iii=0; iii<2*xaxis*yaxis*num_w_planes; iii++)printf("--> %f\n",gridss[iii]);
 
-	  clock_gettime(CLOCK_MONOTONIC, &finishk);
-	  endk = clock();
-	  kernel_time += ((double) (endk - startk)) / CLOCKS_PER_SEC;
-	  kernel_time1 += (finishk.tv_sec - begink.tv_sec);
-	  kernel_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0;
-          #ifdef VERBOSE
-	  printf("Processed sector %ld\n",isector);
-          #endif
-          clock_gettime(CLOCK_MONOTONIC, &begink);
-          startk = clock();
-
-          //for (long iii=0; iii<2*xaxis*yaxis*num_w_planes; iii++)printf("--> %f\n",gridss[iii]);
-
-          #ifndef USE_MPI
-	  long stride = isector*2*xaxis*yaxis*num_w_planes;
-          for (long iii=0; iii<2*xaxis*yaxis*num_w_planes; iii++)gridtot[stride+iii] = gridss[iii];
-	  #endif
+        #ifndef USE_MPI
+	long stride = isector*2*xaxis*yaxis*num_w_planes;
+        for (long iii=0; iii<2*xaxis*yaxis*num_w_planes; iii++)gridtot[stride+iii] = gridss[iii];
+	#endif
 
-	  // Write grid in the corresponding remote slab
-          #ifdef USE_MPI
-	  int target_rank = (int)isector;
-	  //int target_rank = (int)(size-isector-1);
-	  #ifdef ONE_SIDE
-	  printf("One Side communication active\n");
-	  MPI_Win_lock(MPI_LOCK_SHARED,target_rank,0,slabwin);
-	  MPI_Accumulate(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,MPI_SUM,slabwin);
-	  MPI_Win_unlock(target_rank,slabwin);
-	  //MPI_Put(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,slabwin);
-          #else
-          MPI_Reduce(gridss,grid,size_of_grid,MPI_DOUBLE,MPI_SUM,target_rank,MPI_COMM_WORLD);
-          #endif //ONE_SIDE
-          #endif //USE_MPI
-
-	  clock_gettime(CLOCK_MONOTONIC, &finishk);
-	  endk = clock();
-	  reduce_time += ((double) (endk - startk)) / CLOCKS_PER_SEC;
-	  reduce_time1 += (finishk.tv_sec - begink.tv_sec);
-	  reduce_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0;
-          // Go to next sector
-	  for (long inull=0; inull<2*num_w_planes*xaxis*yaxis; inull++)gridss[inull] = 0.0;
-
-	  // Deallocate all sector arrays
-	  free(uus);
-	  free(vvs);
-	  free(wws);
-	  free(weightss);
-	  free(visreals);
-	  free(visimgs);
+	// Write grid in the corresponding remote slab
+        #ifdef USE_MPI
+	int target_rank = (int)isector;
+	//int target_rank = (int)(size-isector-1);
+	#ifdef ONE_SIDE
+	printf("One Side communication active\n");
+	MPI_Win_lock(MPI_LOCK_SHARED,target_rank,0,slabwin);
+	MPI_Accumulate(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,MPI_SUM,slabwin);
+	MPI_Win_unlock(target_rank,slabwin);
+	//MPI_Put(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,slabwin);
+        #else
+        MPI_Reduce(gridss,grid,size_of_grid,MPI_DOUBLE,MPI_SUM,target_rank,MPI_COMM_WORLD);
+        #endif //ONE_SIDE
+        #endif //USE_MPI
+
+	clock_gettime(CLOCK_MONOTONIC, &finishk);
+	endk = clock();
+	reduce_time += ((double) (endk - startk)) / CLOCKS_PER_SEC;
+	reduce_time1 += (finishk.tv_sec - begink.tv_sec);
+	reduce_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0;
+        // Go to next sector
+	for (long inull=0; inull<2*num_w_planes*xaxis*yaxis; inull++)gridss[inull] = 0.0;
+
+	// Deallocate all sector arrays
+	free(uus);
+	free(vvs);
+	free(wws);
+	free(weightss);
+	free(visreals);
+	free(visimgs);
         // End of loop over sectors
         }
 	// End of loop over input files
@@ -676,40 +746,6 @@ if(rank == 0){
 	MPI_Win_fence(0,slabwin);
 	#endif
 
-        // Swap left and right parts APPARENTLY NOT NECESSARY
-	/*
-	for (long kswap=0; kswap<num_w_planes; kswap++)
-	for (long jswap=0; jswap<yaxis; jswap++)
-	for (long iswap=0; iswap<xaxis; iswap++)
-	{
-           long index_origin = 2*(iswap + jswap*xaxis + kswap*yaxis*xaxis);
-	   gridss[index_origin] = grid[index_origin];
-	   gridss[index_origin+1] = grid[index_origin+1];
-	}
-	for (long kswap=0; kswap<num_w_planes; kswap++)
-	for (long jswap=0; jswap<yaxis; jswap++)
-	for (long iswap=0; iswap<xaxis/2; iswap++)
-	{
-           long index_origin = 2*(iswap + jswap*xaxis + kswap*yaxis*xaxis);
-	   long index_destination = 2*(iswap+xaxis/2 + jswap*xaxis + kswap*yaxis*xaxis);
-	   grid[index_destination] = gridss[index_origin];
-	   grid[index_destination+1] = gridss[index_origin+1];
-	}
-
-	for (long kswap=0; kswap<num_w_planes; kswap++)
-	for (long jswap=0; jswap<yaxis; jswap++)
-	for (long iswap=xaxis/2; iswap<xaxis; iswap++)
-	{
-           long index_origin = 2*(iswap + jswap*xaxis + kswap*yaxis*xaxis);
-	   long index_destination = 2*(iswap-xaxis/2 + jswap*xaxis + kswap*yaxis*xaxis);
-	   grid[index_destination] = gridss[index_origin];
-	   grid[index_destination+1] = gridss[index_origin+1];
-	}
-	*/
-
-        #ifndef USE_MPI
-        fclose(pFile1);
-        #endif
         #ifdef USE_MPI
 	MPI_Barrier(MPI_COMM_WORLD);
 	#endif
@@ -722,227 +758,6 @@ if(rank == 0){
         clock_gettime(CLOCK_MONOTONIC, &begin);
 
 
-#ifdef WRITE_DATA
-	// Write results
-	if (rank == 0)
-	{
-          printf("WRITING GRIDDED DATA\n");
-          pFilereal = fopen (outfile2,"wb");
-          pFileimg = fopen (outfile3,"wb");
-	  #ifdef USE_MPI
-	  for (int isector=0; isector<nsectors; isector++)
-          {
-	      MPI_Win_lock(MPI_LOCK_SHARED,isector,0,slabwin);
-	      MPI_Get(gridss,size_of_grid,MPI_DOUBLE,isector,0,size_of_grid,MPI_DOUBLE,slabwin);
-	      MPI_Win_unlock(isector,slabwin);
-	      for (long i=0; i<size_of_grid/2; i++)
-	      {
-		      gridss_real[i] = gridss[2*i];
-		      gridss_img[i] = gridss[2*i+1];
-	      }
-	      if (num_w_planes > 1)
-	      {
-                for (int iw=0; iw<num_w_planes; iw++)
-                for (int iv=0; iv<yaxis; iv++)
-                for (int iu=0; iu<xaxis; iu++)
-                {
-			  long global_index = (iu + (iv+isector*yaxis)*xaxis + iw*grid_size_x*grid_size_y)*sizeof(double);
-                          long index = iu + iv*xaxis + iw*xaxis*yaxis;
-			  fseek(pFilereal, global_index, SEEK_SET);
-			  fwrite(&gridss_real[index], 1, sizeof(double), pFilereal);
-                }
-                for (int iw=0; iw<num_w_planes; iw++)
-                for (int iv=0; iv<yaxis; iv++)
-                for (int iu=0; iu<xaxis; iu++)
-                {
-                          long global_index = (iu + (iv+isector*yaxis)*xaxis + iw*grid_size_x*grid_size_y)*sizeof(double);
-                          long index = iu + iv*xaxis + iw*xaxis*yaxis;
-                          fseek(pFileimg, global_index, SEEK_SET);
-                          fwrite(&gridss_img[index], 1, sizeof(double), pFileimg);
-                          //double v_norm = sqrt(gridss[index]*gridss[index]+gridss[index+1]*gridss[index+1]);
-                          //fprintf (pFile, "%d %d %d %f %f %f\n", iu,isector*yaxis+iv,iw,gridss[index],gridss[index+1],v_norm);
-                }
-
-	      } else {
-		for (int iw=0; iw<num_w_planes; iw++)
-		{
-			  long global_index = (xaxis*isector*yaxis + iw*grid_size_x*grid_size_y)*sizeof(double);
-                          long index = iw*xaxis*yaxis;
-			  fseek(pFilereal, global_index, SEEK_SET);
-                          fwrite(&gridss_real[index], xaxis*yaxis, sizeof(double), pFilereal);
-                          fseek(pFileimg, global_index, SEEK_SET);
-                          fwrite(&gridss_img[index], xaxis*yaxis, sizeof(double), pFileimg);
-		}
-	      }
-          }
-	  #else
-          for (int iw=0; iw<num_w_planes; iw++)
-            for (int iv=0; iv<grid_size_y; iv++)
-               for (int iu=0; iu<grid_size_x; iu++)
-               {
-                          long index = 2*(iu + iv*grid_size_x + iw*grid_size_x*grid_size_y);
-		          fwrite(&gridtot[index], 1, sizeof(double), pFilereal);
-		          fwrite(&gridtot[index+1], 1, sizeof(double), pFileimg);
-                          //double v_norm = sqrt(gridtot[index]*gridtot[index]+gridtot[index+1]*gridtot[index+1]);
-                          //fprintf (pFile, "%d %d %d %f %f %f\n", iu,iv,iw,gridtot[index],gridtot[index+1],v_norm);
-               }
-          #endif
-          fclose(pFilereal);
-          fclose(pFileimg);
-	}
-
-        #ifdef USE_MPI
-        MPI_Win_fence(0,slabwin);
-        #endif
-#endif //WRITE_DATA
-
-
-#ifdef USE_FFTW
-	// FFT transform the data (using distributed FFTW)
-
-	if(rank == 0)printf("PERFORMING FFT\n");
-        clock_gettime(CLOCK_MONOTONIC, &begin);
-        start = clock();
-        fftw_plan plan;
-        fftw_complex *fftwgrid;
-	ptrdiff_t alloc_local, local_n0, local_0_start;
-	double norm = 1.0/(double)(grid_size_x*grid_size_y);
-
-        // map the 1D array of complex visibilities to a 2D array required by FFTW (complex[*][2])
-	// x is the direction of contiguous data and maps to the second parameter
-	// y is the parallelized direction and corresponds to the first parameter (--> n0)
-	// and perform the FFT per w plane
-	alloc_local = fftw_mpi_local_size_2d(grid_size_y, grid_size_x, MPI_COMM_WORLD,&local_n0, &local_0_start);
-	fftwgrid = fftw_alloc_complex(alloc_local);
-	plan = fftw_mpi_plan_dft_2d(grid_size_y, grid_size_x, fftwgrid, fftwgrid, MPI_COMM_WORLD, FFTW_BACKWARD, FFTW_ESTIMATE);
-
-	long fftwindex = 0;
-	long fftwindex2D = 0;
-	for (int iw=0; iw<num_w_planes; iw++)
-        {
-            //printf("FFTing plan %d\n",iw);
-            // select the w-plane to transform
-            for (int iv=0; iv<yaxis; iv++)
-            {
-               for (int iu=0; iu<xaxis; iu++)
-               {
-		   fftwindex2D = iu + iv*xaxis;
-		   fftwindex = 2*(fftwindex2D + iw*xaxis*yaxis);
-                   fftwgrid[fftwindex2D][0] = grid[fftwindex];
-                   fftwgrid[fftwindex2D][1] = grid[fftwindex+1];
-	       }
-	    }
-
-            // do the transform for each w-plane
-	    fftw_execute(plan);
-
-	    // save the transformed w-plane
-            for (int iv=0; iv<yaxis; iv++)
-            {
-               for (int iu=0; iu<xaxis; iu++)
-               {
-		   fftwindex2D = iu + iv*xaxis;
-		   fftwindex = 2*(fftwindex2D + iw*xaxis*yaxis);
-                   gridss[fftwindex] = norm*fftwgrid[fftwindex2D][0];
-                   gridss[fftwindex+1] = norm*fftwgrid[fftwindex2D][1];
-               }
-            }
-
-	}
-
-        fftw_destroy_plan(plan);
-
-        #ifdef USE_MPI
-        MPI_Win_fence(0,slabwin);
-	MPI_Barrier(MPI_COMM_WORLD);
-        #endif
-
-        end = clock();
-        clock_gettime(CLOCK_MONOTONIC, &finish);
-        fftw_time = ((double) (end - start)) / CLOCKS_PER_SEC;
-        fftw_time1 = (finish.tv_sec - begin.tv_sec);
-        fftw_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
-        clock_gettime(CLOCK_MONOTONIC, &begin);
-
-#ifdef WRITE_DATA
-        // Write results
-        #ifdef USE_MPI
-	MPI_Win writewin;
-        MPI_Win_create(gridss, size_of_grid*sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &writewin);
-	MPI_Win_fence(0,writewin);
-        #endif
-        if (rank == 0)
-        {
-          printf("WRITING FFT TRANSFORMED DATA\n");
-          pFilereal = fopen (fftfile2,"wb");
-          pFileimg = fopen (fftfile3,"wb");
-          #ifdef USE_MPI
-          for (int isector=0; isector<nsectors; isector++)
-          {
-              MPI_Win_lock(MPI_LOCK_SHARED,isector,0,writewin);
-              MPI_Get(gridss_w,size_of_grid,MPI_DOUBLE,isector,0,size_of_grid,MPI_DOUBLE,writewin);
-              MPI_Win_unlock(isector,writewin);
-	      for (long i=0; i<size_of_grid/2; i++)
-	      {
-		      gridss_real[i] = gridss_w[2*i];
-		      gridss_img[i] = gridss_w[2*i+1];
-	      }
-	      if (num_w_planes > 1)
-	      {
-                for (int iw=0; iw<num_w_planes; iw++)
-                for (int iv=0; iv<yaxis; iv++)
-                for (int iu=0; iu<xaxis; iu++)
-                {
-			  long global_index = (iu + (iv+isector*yaxis)*xaxis + iw*grid_size_x*grid_size_y)*sizeof(double);
-                          long index = iu + iv*xaxis + iw*xaxis*yaxis;
-			  fseek(pFilereal, global_index, SEEK_SET);
-			  fwrite(&gridss_real[index], 1, sizeof(double), pFilereal);
-                }
-                for (int iw=0; iw<num_w_planes; iw++)
-                for (int iv=0; iv<yaxis; iv++)
-                for (int iu=0; iu<xaxis; iu++)
-                {
-                          long global_index = (iu + (iv+isector*yaxis)*xaxis + iw*grid_size_x*grid_size_y)*sizeof(double);
-                          long index = iu + iv*xaxis + iw*xaxis*yaxis;
-                          fseek(pFileimg, global_index, SEEK_SET);
-                          fwrite(&gridss_img[index], 1, sizeof(double), pFileimg);
-                }
-              } else {
-                          fwrite(gridss_real, size_of_grid/2, sizeof(double), pFilereal);
-                          fwrite(gridss_img, size_of_grid/2, sizeof(double), pFileimg);
-              }
-
-
-          }
-          #else
-          /*
-	  for (int iw=0; iw<num_w_planes; iw++)
-            for (int iv=0; iv<grid_size_y; iv++)
-               for (int iu=0; iu<grid_size_x; iu++)
-               {
-                          int isector = 0;
-                          long index = 2*(iu + iv*grid_size_x + iw*grid_size_x*grid_size_y);
-                          double v_norm = sqrt(gridtot[index]*gridtot[index]+gridtot[index+1]*gridtot[index+1]);
-                          fprintf (pFile, "%d %d %d %f %f %f\n", iu,iv,iw,gridtot[index],gridtot[index+1],v_norm);
-               }
-	  */
-          #endif
-          fclose(pFilereal);
-          fclose(pFileimg);
-        }
-
-        #ifdef USE_MPI
-        //MPI_Win_fence(0,writewin);
-        MPI_Win_fence(0,writewin);
-	MPI_Win_free(&writewin);
-	MPI_Barrier(MPI_COMM_WORLD);
-        #endif
-#endif //WRITE_DATA
-
-	fftw_free(fftwgrid);
-
-#endif //USE_FFTW
-
 	end = clock();
 	clock_gettime(CLOCK_MONOTONIC, &finish);
 	tot_time = ((double) (end - start0)) / CLOCKS_PER_SEC;