diff --git a/w-stacking-fftw-cfitsio.c b/w-stacking-fftw-cfitsio.c
new file mode 100644
index 0000000000000000000000000000000000000000..ebba5b31a422598da4d7b913365fe43c2702c017
--- /dev/null
+++ b/w-stacking-fftw-cfitsio.c
@@ -0,0 +1,1089 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#ifdef FITSIO
+#include "/m100/home/userexternal/ederubei/cfitsio-3.49/fitsio.h"
+#endif
+#ifdef USE_MPI
+#include <mpi.h>
+#ifdef USE_FFTW
+#include <fftw3-mpi.h>
+#endif
+#endif
+#include <omp.h>
+#include <math.h>
+#include <time.h>
+#include <unistd.h>
+#ifdef ACCOMP
+#include "w-stacking_omp.h"
+#else
+#include "w-stacking.h"
+#endif
+#define PI 3.14159265359
+#define NUM_OF_SECTORS -1
+#define MIN(X, Y) (((X) < (Y)) ? (X) : (Y))
+#define MAX(X, Y) (((X) > (Y)) ? (X) : (Y))
+#define NOVERBOSE
+#define NFILES 100
+#define FILENAMELENGTH 30
+
+// Linked List set-up
+struct sectorlist {
+     long index;
+     struct sectorlist * next;
+};
+
+void Push(struct sectorlist** headRef, long data) {
+     struct sectorlist* newNode = malloc(sizeof(struct sectorlist));
+     newNode->index = data;
+     newNode->next = *headRef;
+     *headRef = newNode;
+}
+
+// Main Code
+int main(int argc, char * argv[])
+{
+        // Main MPI parameters
+	int rank;
+	int size;
+
+
+        // Define main filenames
+	FILE * pFile;
+	FILE * pFile1;
+	FILE * pFilereal;
+	FILE * pFileimg;
+	// Global filename to be composed
+	char filename[1000];
+
+        // MS paths
+        char datapath[900];
+        char datapath_multi[NFILES][900];
+        char ufile[FILENAMELENGTH] = "ucoord.bin";
+        char vfile[FILENAMELENGTH] = "vcoord.bin";
+        char wfile[FILENAMELENGTH] = "wcoord.bin";
+        char weightsfile[FILENAMELENGTH] = "weights.bin";
+        char visrealfile[FILENAMELENGTH] = "visibilities_real.bin";
+        char visimgfile[FILENAMELENGTH] = "visibilities_img.bin";
+        char metafile[FILENAMELENGTH] = "meta.txt";
+
+        // Mesh related files 
+        char outfile[FILENAMELENGTH] = "grid.txt";
+        char outfile1[FILENAMELENGTH] = "coords.txt";
+        char outfile2[FILENAMELENGTH] = "grid_real.bin";
+        char outfile3[FILENAMELENGTH] = "grid_img.bin";
+        char fftfile[FILENAMELENGTH] = "fft.txt";
+        char fftfile2[FILENAMELENGTH] = "fft_real.bin";
+        char fftfile3[FILENAMELENGTH] = "fft_img.bin";
+        char logfile[FILENAMELENGTH] = "run.log";
+        char extension[FILENAMELENGTH] = ".txt";
+        char srank[4];
+        char timingfile[FILENAMELENGTH] = "timings.dat";
+
+	// Visibilities related variables
+	double * uu;
+	double * vv;
+	double * ww;
+	float * weights;
+	float * visreal;
+	float * visimg;
+
+	long Nmeasures,Nmeasures0;
+	long Nvis,Nvis0;
+	long Nweights,Nweights0;
+	long freq_per_chan,freq_per_chan0;
+	long polarisations,polarisations0;
+        long Ntimes,Ntimes0;
+	double dt,dt0;
+	double thours,thours0;
+	long baselines,baselines0;
+	double uvmin,uvmin0;
+	double uvmax,uvmax0;
+	double wmin,wmin0;
+	double wmax,wmax0;
+	double resolution;
+
+        // Mesh related parameters: global size
+	int grid_size_x = 2048;
+	int grid_size_y = 2048;
+	// Split Mesh size (auto-calculated)
+	int local_grid_size_x;
+	int local_grid_size_y;
+	int xaxis;
+	int yaxis;
+
+	// Number of planes in the w direction
+	int num_w_planes = 8;
+
+	// Size of the convoutional kernel support
+	int w_support = 7;
+
+	// Number of OpenMP threads (input parameter)
+	int num_threads;
+
+        // Resolution 
+	double dx = 0.5/(double)grid_size_x;
+	double dw = 0.5/(double)num_w_planes;
+	// Half support size
+	double w_supporth = (double)((w_support-1)/2)*dx;
+
+
+	// Initialize FITS image parameters
+	fitsfile *fptreal;
+	fitsfile *fptrimg;
+	int status;
+	long nelements;
+//	long fpixel, lpixel;
+	char testfitsreal[FILENAMELENGTH] = "parallel_np2_real.fits";
+	char testfitsimag[FILENAMELENGTH] = "parallel_np2_img.fits";
+
+	long naxis = 2;
+	long naxes[2] = { grid_size_x, grid_size_y };
+
+	nelements = naxes[0] * naxes[1];
+
+
+
+
+	// Internal profiling parameters 
+	clock_t start, end, start0, startk, endk;
+	double setup_time, process_time, mpi_time, fftw_time, tot_time, kernel_time, reduce_time, compose_time, phase_time;
+	double setup_time1, process_time1, mpi_time1, fftw_time1, tot_time1, kernel_time1, reduce_time1, compose_time1, phase_time1;
+	double writetime, writetime1;
+	struct timespec begin, finish, begin0, begink, finishk;
+	double elapsed;
+
+        // Number of sectors in which the mesh is divided along the v direction
+	// IF nsectors < 0, nsectors = NUMBER OF MPI RANKS
+	long nsectors = NUM_OF_SECTORS;
+
+
+        /* Get number of threads, exit if not given */
+        if (argc == 1) {
+          fprintf(stderr, "Usage: %s number_of_OMP_Threads \n", argv[0]);
+          exit(1);
+        }
+        // Set the number of OpenMP threads
+        num_threads = atoi(argv[1]);
+
+        // Number of threads <= 0 not acceptable
+        if (num_threads <= 0) {
+           fprintf(stderr, "Wrong parameter: %s\n\n", argv[1]);
+           fprintf(stderr, "Usage: %s number_of_OMP_Threads \n", argv[0]);
+           exit(1);
+        }
+
+	// Internal profiling
+	clock_gettime(CLOCK_MONOTONIC, &begin0);
+	start0 = clock();
+
+#ifdef USE_MPI
+	// Intialize MPI environment
+	MPI_Init(&argc,&argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+	if(rank == 0)printf("Running with %d MPI tasks\n",size);
+  #ifdef USE_FFTW
+	// Initialize FFTW environment
+	fftw_mpi_init();
+  #endif
+#else
+	// If MPI is not used the two parameters are set for consistency
+	rank = 0;
+	size = 1;
+#endif
+
+  if(rank == 0)printf("Running with %d threads\n",num_threads);
+
+#ifdef ACCOMP
+if(rank == 0){
+  if (0 == omp_get_num_devices()) {
+      printf("No accelerator found ... exit\n");
+      exit(255);
+   }
+   printf("Number of available GPUs %d\n", omp_get_num_devices());
+   #ifdef NVIDIA
+      prtAccelInfo();
+   #endif
+ }
+#endif
+
+	// set the local size of the image
+	local_grid_size_x = grid_size_x;
+	if (nsectors < 0) nsectors = size;
+	local_grid_size_y = grid_size_y/nsectors;
+	//nsectors = size;
+
+	// LOCAL grid size
+	xaxis = local_grid_size_x;
+	yaxis = local_grid_size_y;
+	clock_gettime(CLOCK_MONOTONIC, &begin);
+	start = clock();
+
+        // INPUT FILES (only the first ndatasets entries are used)
+	int ndatasets = 1;
+        strcpy(datapath_multi[0],"/m100_scratch/userexternal/ederubei/L798046_SB244_uv.uncorr_130B27932t_146MHz.pre-cal.binMS/");
+        //strcpy(datapath_multi[0],"/m100_scratch/userexternal/cgheller/gridding/newgauss4_t201806301100_SBL180.binMS/");
+        //strcpy(datapath_multi[0],"/m100_scratch/userexternal/cgheller/gridding/Lofar/L798046_SB244_uv.uncorr_130B27932t_146MHz.pre-cal.binMS/");
+        //strcpy(datapath_multi[1],"/m100_scratch/userexternal/cgheller/gridding/Lofar/L798046_SB244_uv.uncorr_130B27932t_134MHz.pre-cal.binMS/");
+
+	strcpy(datapath,datapath_multi[0]);
+	// Read metadata
+	strcpy(filename,datapath);
+	strcat(filename,metafile);
+	pFile = fopen (filename,"r");
+        fscanf(pFile,"%ld",&Nmeasures);
+        fscanf(pFile,"%ld",&Nvis);
+        fscanf(pFile,"%ld",&freq_per_chan);
+        fscanf(pFile,"%ld",&polarisations);
+        fscanf(pFile,"%ld",&Ntimes);
+        fscanf(pFile,"%lf",&dt);
+        fscanf(pFile,"%lf",&thours);
+        fscanf(pFile,"%ld",&baselines);
+        fscanf(pFile,"%lf",&uvmin);
+        fscanf(pFile,"%lf",&uvmax);
+        fscanf(pFile,"%lf",&wmin);
+        fscanf(pFile,"%lf",&wmax);
+	fclose(pFile);
+
+
+	// WATCH THIS!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+	int nsub = 1000;
+	//int nsub = 10;
+	printf("Subtracting last %d measurements\n",nsub);
+  Nmeasures = Nmeasures-nsub;
+	Nvis = Nmeasures*freq_per_chan*polarisations;
+
+  // calculate the coordinates of the center
+	double uvshift = uvmin/(uvmax-uvmin);
+	//printf("UVSHIFT %f %f %f %f %f\n",uvmin, uvmax, wmin, wmax, uvshift);
+
+	if (rank == 0)
+	{
+	   printf("N. measurements %ld\n",Nmeasures);
+	   printf("N. visibilities %ld\n",Nvis);
+	}
+
+  // Set temporary local size of points
+	long nm_pe = (long)(Nmeasures/size);
+	long remaining = Nmeasures%size;
+
+  long startrow = rank*nm_pe;
+  if (rank == size-1)nm_pe = nm_pe+remaining;
+
+	long Nmeasures_tot = Nmeasures;
+	Nmeasures = nm_pe;
+	long Nvis_tot = Nvis;
+	Nvis = Nmeasures*freq_per_chan*polarisations;
+	Nweights = Nmeasures*polarisations;
+
+#ifdef VERBOSE
+	printf("N. measurements on %d %ld\n",rank,Nmeasures);
+	printf("N. visibilities on %d %ld\n",rank,Nvis);
+#endif
+
+
+	// DAV: all these arrays can be allocatate statically for the sake of optimization. However be careful that if MPI is used
+	// all the sizes are rescaled by the number of MPI tasks
+	// Allocate arrays
+	uu = (double*) calloc(Nmeasures,sizeof(double));
+	vv = (double*) calloc(Nmeasures,sizeof(double));
+	ww = (double*) calloc(Nmeasures,sizeof(double));
+	weights = (float*) calloc(Nweights,sizeof(float));
+	visreal = (float*) calloc(Nvis,sizeof(float));
+	visimg = (float*) calloc(Nvis,sizeof(float));
+
+  if(rank == 0)printf("READING DATA\n");
+	// Read data
+	strcpy(filename,datapath);
+	strcat(filename,ufile);
+	//printf("Reading %s\n",filename);
+
+	pFile = fopen (filename,"rb");
+	fseek (pFile,startrow*sizeof(double),SEEK_SET);
+	fread(uu,Nmeasures*sizeof(double),1,pFile);
+	fclose(pFile);
+
+	strcpy(filename,datapath);
+	strcat(filename,vfile);
+	//printf("Reading %s\n",filename);
+
+  pFile = fopen (filename,"rb");
+	fseek (pFile,startrow*sizeof(double),SEEK_SET);
+	fread(vv,Nmeasures*sizeof(double),1,pFile);
+	fclose(pFile);
+
+	strcpy(filename,datapath);
+	strcat(filename,wfile);
+	//printf("Reading %s\n",filename);
+
+	pFile = fopen (filename,"rb");
+	fseek (pFile,startrow*sizeof(double),SEEK_SET);
+	fread(ww,Nmeasures*sizeof(double),1,pFile);
+	fclose(pFile);
+
+#ifdef USE_MPI
+	MPI_Barrier(MPI_COMM_WORLD);
+#endif
+
+	clock_gettime(CLOCK_MONOTONIC, &finish);
+	end = clock();
+	setup_time = ((double) (end - start)) / CLOCKS_PER_SEC;
+	setup_time1 = (finish.tv_sec - begin.tv_sec);
+	setup_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
+
+
+  if(rank == 0)printf("GRIDDING DATA\n");
+
+	// Create histograms and linked lists
+  clock_gettime(CLOCK_MONOTONIC, &begin);
+  start = clock();
+
+	// Initialize linked list
+	struct sectorlist ** sectorhead;
+	sectorhead = (struct sectorlist **) malloc((nsectors+1) * sizeof(struct sectorlist));
+	for (int isec=0; isec<=nsectors; isec++)
+	{
+		sectorhead[isec] = malloc(sizeof(struct sectorlist));
+		sectorhead[isec]->index = -1;
+		sectorhead[isec]->next = NULL;
+	}
+
+
+	long * histo_send = (long*) calloc(nsectors+1,sizeof(long));
+	int * boundary = (int*) calloc(Nmeasures,sizeof(int));
+	double uuh,vvh;
+	for (long iphi = 0; iphi < Nmeasures; iphi++)
+	{
+     boundary[iphi] = -1;
+	   uuh = uu[iphi];
+	   vvh = vv[iphi];
+	   int binphi = (int)(vvh*nsectors);
+	   // check if the point influence also neighboring slabs
+	   double updist = (double)((binphi+1)*yaxis)*dx - vvh;
+	   double downdist = vvh - (double)(binphi*yaxis)*dx;
+	   //
+	   histo_send[binphi]++;
+     Push(&sectorhead[binphi],iphi);
+     if(updist < w_supporth && updist >= 0.0) {histo_send[binphi+1]++; boundary[iphi] = binphi+1; Push(&sectorhead[binphi+1],iphi);};
+	   if(downdist < w_supporth && binphi > 0 && downdist >= 0.0) {histo_send[binphi-1]++; boundary[iphi] = binphi-1; Push(&sectorhead[binphi-1],iphi);};
+	}
+
+#ifdef PIPPO
+	struct sectorlist * current;
+	long iiii = 0;
+	for (int j=0; j<nsectors; j++)
+	{
+		current = sectorhead[j];
+		iiii = 0;
+		while (current->index != -1)
+		{
+			printf("%d %d %ld %ld %ld\n",rank,j,iiii,histo_send[j],current->index);
+			current = current->next;
+			iiii++;
+		}
+	}
+#endif
+
+#ifdef VERBOSE
+  for (int iii=0; iii<nsectors+1; iii++)printf("HISTO %d %d %ld\n",rank, iii, histo_send[iii]);
+#endif
+
+// Create sector grid
+  double * gridss;
+  double * gridss_w;
+  double * gridss_real;
+  double * gridss_img;
+  double * grid;
+  long size_of_grid;
+  size_of_grid = 2*num_w_planes*xaxis*yaxis;
+  gridss = (double*) calloc(size_of_grid,sizeof(double));
+  gridss_w = (double*) calloc(size_of_grid,sizeof(double));
+  gridss_real = (double*) calloc(size_of_grid/2,sizeof(double));
+  gridss_img = (double*) calloc(size_of_grid/2,sizeof(double));
+  // Create destination slab
+  grid = (double*) calloc(size_of_grid,sizeof(double));
+  // Create temporary global grid
+#ifndef USE_MPI
+  double * gridtot = (double*) calloc(2*grid_size_x*grid_size_y*num_w_planes,sizeof(double));
+#endif
+  double shift = (double)(dx*yaxis);
+  // Open the MPI Memory Window for the slab
+#ifdef USE_MPI
+  MPI_Win slabwin;
+  MPI_Win_create(grid, size_of_grid*sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &slabwin);
+  MPI_Win_fence(0,slabwin);
+#endif
+#ifndef USE_MPI
+  pFile1 = fopen (outfile1,"w");
+#endif
+
+
+	// loop over files
+	//
+	kernel_time = 0.0;
+	kernel_time1 = 0.0;
+	reduce_time = 0.0;
+	reduce_time1 = 0.0;
+	compose_time = 0.0;
+	compose_time1 = 0.0;
+
+	// MAIN LOOP OVER FILES
+	//
+	for (int ifiles=0; ifiles<ndatasets; ifiles++)
+	{
+	strcpy(filename,datapath_multi[ifiles]);
+        printf("Processing %s, %d of %d\n",filename,ifiles+1,ndatasets);
+
+        // Read metadata
+        strcpy(filename,datapath);
+        strcat(filename,metafile);
+        pFile = fopen (filename,"r");
+        fscanf(pFile,"%ld",&Nmeasures0);
+        fscanf(pFile,"%ld",&Nvis0);
+        fscanf(pFile,"%ld",&freq_per_chan0);
+        fscanf(pFile,"%ld",&polarisations0);
+        fscanf(pFile,"%ld",&Ntimes0);
+        fscanf(pFile,"%lf",&dt0);
+        fscanf(pFile,"%lf",&thours0);
+        fscanf(pFile,"%ld",&baselines0);
+        fscanf(pFile,"%lf",&uvmin);
+        fscanf(pFile,"%lf",&uvmax);
+        fscanf(pFile,"%lf",&wmin0);
+        fscanf(pFile,"%lf",&wmax0);
+        fclose(pFile);
+
+        // calculate the resolution in radians
+        resolution = 1.0/MAX(abs(uvmin),abs(uvmax));
+        // calculate the resolution in arcsec
+        double resolution_asec = (3600.0*180.0)/MAX(abs(uvmin),abs(uvmax))/PI;
+        printf("RESOLUTION = %f rad, %f arcsec\n", resolution, resolution_asec);
+
+        strcpy(filename,datapath);
+        strcat(filename,weightsfile);
+        pFile = fopen (filename,"rb");
+        fseek (pFile,startrow*polarisations*sizeof(float),SEEK_SET);
+        fread(weights,(Nweights)*sizeof(float),1,pFile);
+        fclose(pFile);
+
+        strcpy(filename,datapath);
+        strcat(filename,visrealfile);
+
+  pFile = fopen (filename,"rb");
+  fseek (pFile,startrow*freq_per_chan*polarisations*sizeof(float),SEEK_SET);
+  fread(visreal,Nvis*sizeof(float),1,pFile);
+  fclose(pFile);
+  strcpy(filename,datapath);
+  strcat(filename,visimgfile);
+#ifdef VERBOSE
+  printf("Reading %s\n",filename);
+#endif
+  pFile = fopen (filename,"rb");
+  fseek (pFile,startrow*freq_per_chan*polarisations*sizeof(float),SEEK_SET);
+  fread(visimg,Nvis*sizeof(float),1,pFile);
+  fclose(pFile);
+
+#ifdef USE_MPI
+  MPI_Barrier(MPI_COMM_WORLD);
+#endif
+  // Declare temporary arrays for the masking
+  double * uus;
+  double * vvs;
+  double * wws;
+  float * visreals;
+  float * visimgs;
+  float * weightss;
+	long isector;
+
+  for (long isector_count=0; isector_count<nsectors; isector_count++)
+      {
+        clock_gettime(CLOCK_MONOTONIC, &begink);
+        startk = clock();
+        // define local destination sector
+        //isector = (isector_count+rank)%size;
+        isector = isector_count;
+	  // allocate sector arrays
+        long Nsec = histo_send[isector];
+	      uus = (double*) malloc(Nsec*sizeof(double));
+	      vvs = (double*) malloc(Nsec*sizeof(double));
+	      wws = (double*) malloc(Nsec*sizeof(double));
+	      long Nweightss = Nsec*polarisations;
+	      long Nvissec = Nweightss*freq_per_chan;
+	      weightss = (float*) malloc(Nweightss*sizeof(float));
+	      visreals = (float*) malloc(Nvissec*sizeof(float));
+	      visimgs = (float*) malloc(Nvissec*sizeof(float));
+
+	  // select data for this sector
+        long icount = 0;
+	      long ip = 0;
+	      long inu = 0;
+//CLAAAA
+	       struct sectorlist * current;
+	       current = sectorhead[isector];
+
+
+
+	       while (current->index != -1)
+          {
+             long ilocal = current->index;
+      	     //double vvh = vv[ilocal];
+             //int binphi = (int)(vvh*nsectors);
+	           //if (binphi == isector || boundary[ilocal] == isector) {
+	           uus[icount] = uu[ilocal];
+	           vvs[icount] = vv[ilocal]-isector*shift;
+	           wws[icount] = ww[ilocal];
+             for (long ipol=0; ipol<polarisations; ipol++)
+                {
+		              weightss[ip] = weights[ilocal*polarisations+ipol];
+                  ip++;
+                }
+             for (long ifreq=0; ifreq<polarisations*freq_per_chan; ifreq++)
+                {
+	                 visreals[inu] = visreal[ilocal*polarisations*freq_per_chan+ifreq];
+	                 visimgs[inu] = visimg[ilocal*polarisations*freq_per_chan+ifreq];
+		               //if(visimgs[inu]>1e10 || visimgs[inu]<-1e10)printf("%f %f %ld %ld %d %ld %ld\n",visreals[inu],visimgs[inu],inu,Nvissec,rank,ilocal*polarisations*freq_per_chan+ifreq,Nvis);
+	                 inu++;
+	              }
+	           icount++;
+	     //}
+	            current = current->next;
+             }
+
+	  clock_gettime(CLOCK_MONOTONIC, &finishk);
+	  endk = clock();
+	  compose_time += ((double) (endk - startk)) / CLOCKS_PER_SEC;
+	  compose_time1 += (finishk.tv_sec - begink.tv_sec);
+	  compose_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0;
+
+    #ifndef USE_MPI
+	  double uumin = 1e20;
+	  double vvmin = 1e20;
+	  double uumax = -1e20;
+	  double vvmax = -1e20;
+
+          for (long ipart=0; ipart<Nsec; ipart++)
+          {
+	       uumin = MIN(uumin,uus[ipart]);
+	       uumax = MAX(uumax,uus[ipart]);
+	       vvmin = MIN(vvmin,vvs[ipart]);
+	       vvmax = MAX(vvmax,vvs[ipart]);
+
+
+               if(ipart%10 == 0)fprintf (pFile, "%ld %f %f %f\n",isector,uus[ipart],vvs[ipart]+isector*shift,wws[ipart]);
+          }
+
+	  printf("UU, VV, min, max = %f %f %f %f\n", uumin, uumax, vvmin, vvmax);
+          #endif
+
+          // Make convolution on the grid
+          #ifdef VERBOSE
+	  printf("Processing sector %ld\n",isector);
+	  #endif
+          clock_gettime(CLOCK_MONOTONIC, &begink);
+          startk = clock();
+
+          wstack(num_w_planes,
+               Nsec,
+               freq_per_chan,
+               polarisations,
+               uus,
+               vvs,
+               wws,
+               visreals,
+               visimgs,
+               weightss,
+               dx,
+               dw,
+               w_support,
+	             xaxis,
+	             yaxis,
+               gridss,
+               num_threads);
+
+
+/* int z =0 ;
+#pragma omp target map(to:test_i_gpu) map(from:z)
+{
+  int x; // only accessible from accelerator
+  x = 2;
+  z = x + test_i_gpu;
+}*/
+
+
+
+	  clock_gettime(CLOCK_MONOTONIC, &finishk);
+	  endk = clock();
+	  kernel_time += ((double) (endk - startk)) / CLOCKS_PER_SEC;
+	  kernel_time1 += (finishk.tv_sec - begink.tv_sec);
+	  kernel_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0;
+          #ifdef VERBOSE
+	  printf("Processed sector %ld\n",isector);
+          #endif
+          clock_gettime(CLOCK_MONOTONIC, &begink);
+          startk = clock();
+
+          //for (long iii=0; iii<2*xaxis*yaxis*num_w_planes; iii++)printf("--> %f\n",gridss[iii]);
+
+          #ifndef USE_MPI
+	  long stride = isector*2*xaxis*yaxis*num_w_planes;
+          for (long iii=0; iii<2*xaxis*yaxis*num_w_planes; iii++)gridtot[stride+iii] = gridss[iii];
+	  #endif
+
+	  // Write grid in the corresponding remote slab
+          #ifdef USE_MPI
+	  int target_rank = (int)isector;
+	  //int target_rank = (int)(size-isector-1);
+	  #ifdef ONE_SIDE
+	  printf("One Side communication active\n");
+	  MPI_Win_lock(MPI_LOCK_SHARED,target_rank,0,slabwin);
+	  MPI_Accumulate(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,MPI_SUM,slabwin);
+	  MPI_Win_unlock(target_rank,slabwin);
+	  //MPI_Put(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,slabwin);
+          #else
+          MPI_Reduce(gridss,grid,size_of_grid,MPI_DOUBLE,MPI_SUM,target_rank,MPI_COMM_WORLD);
+          #endif //ONE_SIDE
+          #endif //USE_MPI
+
+	  clock_gettime(CLOCK_MONOTONIC, &finishk);
+	  endk = clock();
+	  reduce_time += ((double) (endk - startk)) / CLOCKS_PER_SEC;
+	  reduce_time1 += (finishk.tv_sec - begink.tv_sec);
+	  reduce_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0;
+          // Go to next sector
+	  for (long inull=0; inull<2*num_w_planes*xaxis*yaxis; inull++)gridss[inull] = 0.0;
+
+	  // Deallocate all sector arrays
+	  free(uus);
+	  free(vvs);
+	  free(wws);
+	  free(weightss);
+	  free(visreals);
+	  free(visimgs);
+        // End of loop over sectors
+        }
+	// End of loop over input files
+	}
+
+	// Finalize MPI communication
+        #ifdef USE_MPI
+	MPI_Win_fence(0,slabwin);
+	#endif
+
+        // Swap left and right parts APPARENTLY NOT NECESSARY
+	/*
+	for (long kswap=0; kswap<num_w_planes; kswap++)
+	for (long jswap=0; jswap<yaxis; jswap++)
+	for (long iswap=0; iswap<xaxis; iswap++)
+	{
+           long index_origin = 2*(iswap + jswap*xaxis + kswap*yaxis*xaxis);
+	   gridss[index_origin] = grid[index_origin];
+	   gridss[index_origin+1] = grid[index_origin+1];
+	}
+	for (long kswap=0; kswap<num_w_planes; kswap++)
+	for (long jswap=0; jswap<yaxis; jswap++)
+	for (long iswap=0; iswap<xaxis/2; iswap++)
+	{
+           long index_origin = 2*(iswap + jswap*xaxis + kswap*yaxis*xaxis);
+	   long index_destination = 2*(iswap+xaxis/2 + jswap*xaxis + kswap*yaxis*xaxis);
+	   grid[index_destination] = gridss[index_origin];
+	   grid[index_destination+1] = gridss[index_origin+1];
+	}
+
+	for (long kswap=0; kswap<num_w_planes; kswap++)
+	for (long jswap=0; jswap<yaxis; jswap++)
+	for (long iswap=xaxis/2; iswap<xaxis; iswap++)
+	{
+           long index_origin = 2*(iswap + jswap*xaxis + kswap*yaxis*xaxis);
+	   long index_destination = 2*(iswap-xaxis/2 + jswap*xaxis + kswap*yaxis*xaxis);
+	   grid[index_destination] = gridss[index_origin];
+	   grid[index_destination+1] = gridss[index_origin+1];
+	}
+	*/
+
+        #ifndef USE_MPI
+        fclose(pFile1);
+        #endif
+        #ifdef USE_MPI
+	MPI_Barrier(MPI_COMM_WORLD);
+	#endif
+
+        end = clock();
+        clock_gettime(CLOCK_MONOTONIC, &finish);
+        process_time = ((double) (end - start)) / CLOCKS_PER_SEC;
+        process_time1 = (finish.tv_sec - begin.tv_sec);
+        process_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
+        clock_gettime(CLOCK_MONOTONIC, &begin);
+
+
+#ifdef WRITE_DATA
+	// Write results
+	if (rank == 0)
+	{
+          printf("WRITING GRIDDED DATA\n");
+          pFilereal = fopen (outfile2,"wb");
+          pFileimg = fopen (outfile3,"wb");
+	  #ifdef USE_MPI
+	  for (int isector=0; isector<nsectors; isector++)
+          {
+	      MPI_Win_lock(MPI_LOCK_SHARED,isector,0,slabwin);
+	      MPI_Get(gridss,size_of_grid,MPI_DOUBLE,isector,0,size_of_grid,MPI_DOUBLE,slabwin);
+	      MPI_Win_unlock(isector,slabwin);
+	      for (long i=0; i<size_of_grid/2; i++)
+	      {
+		      gridss_real[i] = gridss[2*i];
+		      gridss_img[i] = gridss[2*i+1];
+	      }
+	      if (num_w_planes > 1)
+	      {
+                for (int iw=0; iw<num_w_planes; iw++)
+                for (int iv=0; iv<yaxis; iv++)
+                for (int iu=0; iu<xaxis; iu++)
+                {
+			  long global_index = (iu + (iv+isector*yaxis)*xaxis + iw*grid_size_x*grid_size_y)*sizeof(double);
+                          long index = iu + iv*xaxis + iw*xaxis*yaxis;
+			  fseek(pFilereal, global_index, SEEK_SET);
+			  fwrite(&gridss_real[index], 1, sizeof(double), pFilereal);
+                }
+                for (int iw=0; iw<num_w_planes; iw++)
+                for (int iv=0; iv<yaxis; iv++)
+                for (int iu=0; iu<xaxis; iu++)
+                {
+                          long global_index = (iu + (iv+isector*yaxis)*xaxis + iw*grid_size_x*grid_size_y)*sizeof(double);
+                          long index = iu + iv*xaxis + iw*xaxis*yaxis;
+                          fseek(pFileimg, global_index, SEEK_SET);
+                          fwrite(&gridss_img[index], 1, sizeof(double), pFileimg);
+                          //double v_norm = sqrt(gridss[index]*gridss[index]+gridss[index+1]*gridss[index+1]);
+                          //fprintf (pFile, "%d %d %d %f %f %f\n", iu,isector*yaxis+iv,iw,gridss[index],gridss[index+1],v_norm);
+                }
+
+	      } else {
+		for (int iw=0; iw<num_w_planes; iw++)
+		{
+			  long global_index = (xaxis*isector*yaxis + iw*grid_size_x*grid_size_y)*sizeof(double);
+                          long index = iw*xaxis*yaxis;
+			  fseek(pFilereal, global_index, SEEK_SET);
+                          fwrite(&gridss_real[index], xaxis*yaxis, sizeof(double), pFilereal);
+                          fseek(pFileimg, global_index, SEEK_SET);
+                          fwrite(&gridss_img[index], xaxis*yaxis, sizeof(double), pFileimg);
+		}
+	      }
+          }
+	  #else
+          for (int iw=0; iw<num_w_planes; iw++)
+            for (int iv=0; iv<grid_size_y; iv++)
+               for (int iu=0; iu<grid_size_x; iu++)
+               {
+                          long index = 2*(iu + iv*grid_size_x + iw*grid_size_x*grid_size_y);
+		          fwrite(&gridtot[index], 1, sizeof(double), pFilereal);
+		          fwrite(&gridtot[index+1], 1, sizeof(double), pFileimg);
+                          //double v_norm = sqrt(gridtot[index]*gridtot[index]+gridtot[index+1]*gridtot[index+1]);
+                          //fprintf (pFile, "%d %d %d %f %f %f\n", iu,iv,iw,gridtot[index],gridtot[index+1],v_norm);
+               }
+          #endif
+          fclose(pFilereal);
+          fclose(pFileimg);
+	}
+
+        #ifdef USE_MPI
+        MPI_Win_fence(0,slabwin);
+        #endif
+#endif //WRITE_DATA
+
+
+#ifdef USE_FFTW
+	// FFT transform the data (using distributed FFTW)
+
+	if(rank == 0)printf("PERFORMING FFT\n");
+        clock_gettime(CLOCK_MONOTONIC, &begin);
+        start = clock();
+        fftw_plan plan;
+        fftw_complex *fftwgrid;
+	ptrdiff_t alloc_local, local_n0, local_0_start;
+	double norm = 1.0/(double)(grid_size_x*grid_size_y);
+
+        // map the 1D array of complex visibilities to a 2D array required by FFTW (complex[*][2])
+	// x is the direction of contiguous data and maps to the second parameter
+	// y is the parallelized direction and corresponds to the first parameter (--> n0)
+	// and perform the FFT per w plane
+	alloc_local = fftw_mpi_local_size_2d(grid_size_y, grid_size_x, MPI_COMM_WORLD,&local_n0, &local_0_start);
+	fftwgrid = fftw_alloc_complex(alloc_local);
+	plan = fftw_mpi_plan_dft_2d(grid_size_y, grid_size_x, fftwgrid, fftwgrid, MPI_COMM_WORLD, FFTW_BACKWARD, FFTW_ESTIMATE);
+
+	long fftwindex = 0;
+	long fftwindex2D = 0;
+	for (int iw=0; iw<num_w_planes; iw++)
+        {
+            //printf("FFTing plan %d\n",iw);
+            // select the w-plane to transform
+            for (int iv=0; iv<yaxis; iv++)
+            {
+               for (int iu=0; iu<xaxis; iu++)
+               {
+		   fftwindex2D = iu + iv*xaxis;
+		   fftwindex = 2*(fftwindex2D + iw*xaxis*yaxis);
+                   fftwgrid[fftwindex2D][0] = grid[fftwindex];
+                   fftwgrid[fftwindex2D][1] = grid[fftwindex+1];
+	       }
+	    }
+
+            // do the transform for each w-plane
+	    fftw_execute(plan);
+
+	    // save the transformed w-plane
+            for (int iv=0; iv<yaxis; iv++)
+            {
+               for (int iu=0; iu<xaxis; iu++)
+               {
+		   fftwindex2D = iu + iv*xaxis;
+		   fftwindex = 2*(fftwindex2D + iw*xaxis*yaxis);
+                   gridss[fftwindex] = norm*fftwgrid[fftwindex2D][0];
+                   gridss[fftwindex+1] = norm*fftwgrid[fftwindex2D][1];
+               }
+            }
+
+	}
+
+        fftw_destroy_plan(plan);
+
+        #ifdef USE_MPI
+        MPI_Win_fence(0,slabwin);
+	MPI_Barrier(MPI_COMM_WORLD);
+        #endif
+
+        end = clock();
+        clock_gettime(CLOCK_MONOTONIC, &finish);
+        fftw_time = ((double) (end - start)) / CLOCKS_PER_SEC;
+        fftw_time1 = (finish.tv_sec - begin.tv_sec);
+        fftw_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
+        clock_gettime(CLOCK_MONOTONIC, &begin);
+
+#ifdef WRITE_DATA
+        // Write results
+        #ifdef USE_MPI
+	MPI_Win writewin;
+        MPI_Win_create(gridss, size_of_grid*sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &writewin);
+	MPI_Win_fence(0,writewin);
+        #endif
+        if (rank == 0)
+        {
+          printf("WRITING FFT TRANSFORMED DATA\n");
+          pFilereal = fopen (fftfile2,"wb");
+          pFileimg = fopen (fftfile3,"wb");
+          #ifdef USE_MPI
+          for (int isector=0; isector<nsectors; isector++)
+          {
+              MPI_Win_lock(MPI_LOCK_SHARED,isector,0,writewin);
+              MPI_Get(gridss_w,size_of_grid,MPI_DOUBLE,isector,0,size_of_grid,MPI_DOUBLE,writewin);
+              MPI_Win_unlock(isector,writewin);
+	      for (long i=0; i<size_of_grid/2; i++)
+	      {
+		      gridss_real[i] = gridss_w[2*i];
+		      gridss_img[i] = gridss_w[2*i+1];
+	      }
+	      if (num_w_planes > 1)
+	      {
+                for (int iw=0; iw<num_w_planes; iw++)
+                for (int iv=0; iv<yaxis; iv++)
+                for (int iu=0; iu<xaxis; iu++)
+                {
+			  long global_index = (iu + (iv+isector*yaxis)*xaxis + iw*grid_size_x*grid_size_y)*sizeof(double);
+                          long index = iu + iv*xaxis + iw*xaxis*yaxis;
+			  fseek(pFilereal, global_index, SEEK_SET);
+			  fwrite(&gridss_real[index], 1, sizeof(double), pFilereal);
+                }
+                for (int iw=0; iw<num_w_planes; iw++)
+                for (int iv=0; iv<yaxis; iv++)
+                for (int iu=0; iu<xaxis; iu++)
+                {
+                          long global_index = (iu + (iv+isector*yaxis)*xaxis + iw*grid_size_x*grid_size_y)*sizeof(double);
+                          long index = iu + iv*xaxis + iw*xaxis*yaxis;
+                          fseek(pFileimg, global_index, SEEK_SET);
+                          fwrite(&gridss_img[index], 1, sizeof(double), pFileimg);
+                }
+              } else {
+                          fwrite(gridss_real, size_of_grid/2, sizeof(double), pFilereal);
+                          fwrite(gridss_img, size_of_grid/2, sizeof(double), pFileimg);
+              }
+
+
+          }
+          #else
+          /*
+	  for (int iw=0; iw<num_w_planes; iw++)
+            for (int iv=0; iv<grid_size_y; iv++)
+               for (int iu=0; iu<grid_size_x; iu++)
+               {
+                          int isector = 0;
+                          long index = 2*(iu + iv*grid_size_x + iw*grid_size_x*grid_size_y);
+                          double v_norm = sqrt(gridtot[index]*gridtot[index]+gridtot[index+1]*gridtot[index+1]);
+                          fprintf (pFile, "%d %d %d %f %f %f\n", iu,iv,iw,gridtot[index],gridtot[index+1],v_norm);
+               }
+	  */
+          #endif
+          fclose(pFilereal);
+          fclose(pFileimg);
+        }
+
+        #ifdef USE_MPI
+        //MPI_Win_fence(0,writewin);
+        MPI_Win_fence(0,writewin);
+	MPI_Win_free(&writewin);
+	MPI_Barrier(MPI_COMM_WORLD);
+        #endif
+#endif //WRITE_DATA
+
+	fftw_free(fftwgrid);
+
+	// Phase correction
+        clock_gettime(CLOCK_MONOTONIC, &begin);
+        start = clock();
+	if(rank == 0)printf("PHASE CORRECTION\n");
+        double* image_real = (double*) calloc(xaxis*yaxis,sizeof(double));
+        double* image_imag = (double*) calloc(xaxis*yaxis,sizeof(double));
+
+        phase_correction(gridss,image_real,image_imag,xaxis,yaxis,num_w_planes,grid_size_x,grid_size_y,resolution,wmin,wmax,num_threads);
+
+        end = clock();
+        clock_gettime(CLOCK_MONOTONIC, &finish);
+        phase_time = ((double) (end - start)) / CLOCKS_PER_SEC;
+        phase_time1 = (finish.tv_sec - begin.tv_sec);
+        phase_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
+#ifdef WRITE_IMAGE
+
+        if(rank == 0)
+        {
+            #ifdef FITSIO
+            printf("REMOVING RESIDUAL FITS FILE\n");
+            remove(testfitsreal);
+            remove(testfitsimag);
+
+
+	    printf("FITS CREATING\n");
+            status = 0;
+
+            fits_create_file(&fptrimg, testfitsimag, &status);
+            fits_create_img(fptrimg, DOUBLE_IMG, naxis, naxes, &status);
+//          fits_write_img(fptrimg, TDOUBLE, fpixel, nelements, image_imag, &status);
+            fits_close_file(fptrimg, &status);
+
+	    status = 0;
+
+            fits_create_file(&fptreal, testfitsreal, &status);
+	    fits_create_img(fptreal, DOUBLE_IMG, naxis, naxes, &status);
+//	    fits_write_img(fptreal, TDOUBLE, fpixel, nelements, image_real, &status);
+	    fits_close_file(fptreal, &status);
+	    #endif
+
+            pFilereal = fopen (fftfile2, "wb");
+	    pFileimg = fopen (fftfile3, "wb");
+	    fclose(pFilereal);
+	    fclose(pFileimg);
+        }
+	#ifdef USE_MPI
+	MPI_Barrier(MPI_COMM_WORLD);
+        #endif
+        if(rank == 0)printf("WRITING IMAGE\n");
+	for (int isector=0; isector<size; isector++)
+	{
+	    #ifdef USE_MPI
+	    MPI_Barrier(MPI_COMM_WORLD);
+            #endif
+	    if(isector == rank)
+            //if(rank == 0)
+	    {
+               #ifdef FITSIO
+
+	       printf("%d writing\n",isector);
+               long * fpixel = (long *) malloc(sizeof(long)*naxis);
+               long * lpixel = (long *) malloc(sizeof(long)*naxis);
+
+
+               fpixel[0] = 1;
+               fpixel[1] = isector*yaxis+1;
+               lpixel[0] = xaxis;
+               lpixel[1] = (isector+1)*yaxis;
+               //printf("fpixel %d, %d\n", fpixel[0], fpixel[1]);
+	       //printf("lpixel %d, %d\n", lpixel[0], lpixel[1]);
+               status = 0;
+	       fits_open_image(&fptreal, testfitsreal, READWRITE, &status);
+               fits_write_subset(fptreal, TDOUBLE, fpixel, lpixel, image_real, &status);
+	       fits_close_file(fptreal, &status);
+
+               status = 0;
+               fits_open_image(&fptrimg, testfitsimag, READWRITE, &status);
+               fits_write_subset(fptrimg, TDOUBLE, fpixel, lpixel, image_imag, &status);
+               fits_close_file(fptrimg, &status);
+
+               #endif
+
+               pFilereal = fopen (fftfile2,"ab");
+               pFileimg = fopen (fftfile3,"ab");
+
+	       long global_index = isector*(xaxis*yaxis)*sizeof(double);
+
+               fseek(pFilereal, global_index, SEEK_SET);
+               fwrite(image_real, xaxis*yaxis, sizeof(double), pFilereal);
+               fseek(pFileimg, global_index, SEEK_SET);
+               fwrite(image_imag, xaxis*yaxis, sizeof(double), pFileimg);
+
+               fclose(pFilereal);
+               fclose(pFileimg);
+	    }
+	}
+	#ifdef USE_MPI
+	MPI_Barrier(MPI_COMM_WORLD);
+        #endif
+
+#endif //WRITE_IMAGE
+
+
+#endif //USE_FFTW
+
+	end = clock();
+	clock_gettime(CLOCK_MONOTONIC, &finish);
+	tot_time = ((double) (end - start0)) / CLOCKS_PER_SEC;
+	tot_time1 = (finish.tv_sec - begin0.tv_sec);
+	tot_time1 += (finish.tv_nsec - begin0.tv_nsec) / 1000000000.0;
+
+        if (rank == 0)
+        {
+          printf("Setup time:    %f sec\n",setup_time);
+          printf("Process time:  %f sec\n",process_time);
+          printf("Kernel time = %f, Array Composition time %f, Reduce time: %f sec\n",kernel_time,compose_time,reduce_time);
+#ifdef USE_FFTW
+          printf("FFTW time:     %f sec\n",fftw_time);
+          printf("Phase time:    %f sec\n",phase_time);
+#endif
+          printf("TOT time:      %f sec\n",tot_time);
+	  if(num_threads > 1)
+          {
+            printf("PSetup time:   %f sec\n",setup_time1);
+            printf("PProcess time: %f sec\n",process_time1);
+            printf("PKernel time = %f, PArray Composition time %f, PReduce time: %f sec\n",kernel_time1,compose_time1,reduce_time1);
+#ifdef USE_FFTW
+            printf("PFFTW time:    %f sec\n",fftw_time1);
+            printf("PPhase time:   %f sec\n",phase_time1);
+#endif
+            printf("PTOT time:     %f sec\n",tot_time1);
+	  }
+        }
+
+	if (rank == 0)
+	{
+	 pFile = fopen (timingfile,"w");
+	 if (num_threads == 1)
+         {
+	   fprintf(pFile, "%f %f %f %f %f %f %f\n",setup_time,kernel_time,compose_time,reduce_time,fftw_time,phase_time,tot_time);
+	 } else {
+	   fprintf(pFile, "%f %f %f %f %f %f %f\n",setup_time1,kernel_time1,compose_time1,reduce_time1,fftw_time1,phase_time1,tot_time1);
+	 }  
+	 fclose(pFile);
+	} 
+
+	// Close MPI environment
+	#ifdef USE_MPI
+        MPI_Win_fence(0,slabwin);
+	MPI_Win_free(&slabwin);
+	MPI_Finalize();
+	#endif
+}