diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..f109a98c7a1a1cbe5d406752091814495af52068
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,47 @@
+OPT += -DUSE_MPI
+OPT += -DUSE_FFTW
+OPT += -DONE_SIDE
+OPT += -DWRITE_DATA
+
+CC = gcc
+CXX = g++
+MPICC = mpicc
+MPICXX = mpiCC
+
+CFLAGS += -O3 -mcpu=native
+CFLAGS += -I.
+LIBS = -L$(FFTW_LIB) -lfftw3_mpi -lfftw3 -lm
+
+NVCC = nvcc
+NVFLAGS = -arch=sm_70 -Xcompiler -mno-float128 -std=c++11
+NVLIB = -L/cineca/prod/opt/compilers/cuda/10.1/none/lib64/ -lcudart -lcuda
+
+DEPS = w-stacking.h
+COBJ = w-stacking.o w-stacking-fftw.o
+
+w-stacking.c: w-stacking.cu
+	cp w-stacking.cu w-stacking.c
+
+%.o: %.c $(DEPS)
+	$(CC) -c -o $@ $< $(CFLAGS) $(OPT)
+
+serial: $(COBJ)
+	$(CC) -o w-stackingCfftw_serial $(CFLAGS) $^ -lm
+
+serial_cuda:
+	$(NVCC) $(NVFLAGS) -c w-stacking.cu $(NVLIB)
+	$(CC) $(CFLAGS) $(OPT) -c w-stacking-fftw.c
+	$(CXX) $(CFLAGS) $(OPT) -o w-stackingfftw_serial w-stacking-fftw.o w-stacking.o $(NVLIB) -lm
+
+mpi: $(COBJ)
+	$(MPICC) -o w-stackingCfftw $(CFLAGS) $^ $(LIBS)
+
+mpi_cuda:
+	$(NVCC) $(NVFLAGS) -c w-stacking.cu $(NVLIB)
+	$(MPICC) $(CFLAGS) $(OPT) -c w-stacking-fftw.c
+	$(MPICXX) $(CFLAGS) $(OPT) -o w-stackingfftw w-stacking-fftw.o w-stacking.o $(NVLIB) $(LIBS) -lm
+
+clean:
+	rm -f *.o
+	rm -f w-stacking.c
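+
+# Usage sketch (module names and paths below are assumptions, not part of this repository):
+#   export FFTW_LIB=/path/to/fftw/lib     # location of the MPI-enabled FFTW3 libraries
+#   make mpi                              # MPI + FFTW build  -> w-stackingCfftw
+#   make mpi_cuda                         # MPI + CUDA build  -> w-stackingfftw
+#   mpirun -np 4 ./w-stackingCfftw 8      # argv[1] is the number of OpenMP threads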
+
diff --git a/peano.c b/peano.c
new file mode 100644
index 0000000000000000000000000000000000000000..dc831e82253fa928570b17f736e419c36b6bbad3
--- /dev/null
+++ b/peano.c
@@ -0,0 +1,130 @@
+#include "peano.h"
+
+//typedef unsigned long long peanokey;
+
+/*  The following rewrite of the original function
+ *  peano_hilbert_key_old() has been written by MARTIN REINECKE.
+ *  It is about a factor 2.3 - 2.5 faster than Volker's old routine!
+ */
+const unsigned char rottable3[48][8] = {
+  {36, 28, 25, 27, 10, 10, 25, 27},
+  {29, 11, 24, 24, 37, 11, 26, 26},
+  {8, 8, 25, 27, 30, 38, 25, 27},
+  {9, 39, 24, 24, 9, 31, 26, 26},
+  {40, 24, 44, 32, 40, 6, 44, 6},
+  {25, 7, 33, 7, 41, 41, 45, 45},
+  {4, 42, 4, 46, 26, 42, 34, 46},
+  {43, 43, 47, 47, 5, 27, 5, 35},
+  {33, 35, 36, 28, 33, 35, 2, 2},
+  {32, 32, 29, 3, 34, 34, 37, 3},
+  {33, 35, 0, 0, 33, 35, 30, 38},
+  {32, 32, 1, 39, 34, 34, 1, 31},
+  {24, 42, 32, 46, 14, 42, 14, 46},
+  {43, 43, 47, 47, 25, 15, 33, 15},
+  {40, 12, 44, 12, 40, 26, 44, 34},
+  {13, 27, 13, 35, 41, 41, 45, 45},
+  {28, 41, 28, 22, 38, 43, 38, 22},
+  {42, 40, 23, 23, 29, 39, 29, 39},
+  {41, 36, 20, 36, 43, 30, 20, 30},
+  {37, 31, 37, 31, 42, 40, 21, 21},
+  {28, 18, 28, 45, 38, 18, 38, 47},
+  {19, 19, 46, 44, 29, 39, 29, 39},
+  {16, 36, 45, 36, 16, 30, 47, 30},
+  {37, 31, 37, 31, 17, 17, 46, 44},
+  {12, 4, 1, 3, 34, 34, 1, 3},
+  {5, 35, 0, 0, 13, 35, 2, 2},
+  {32, 32, 1, 3, 6, 14, 1, 3},
+  {33, 15, 0, 0, 33, 7, 2, 2},
+  {16, 0, 20, 8, 16, 30, 20, 30},
+  {1, 31, 9, 31, 17, 17, 21, 21},
+  {28, 18, 28, 22, 2, 18, 10, 22},
+  {19, 19, 23, 23, 29, 3, 29, 11},
+  {9, 11, 12, 4, 9, 11, 26, 26},
+  {8, 8, 5, 27, 10, 10, 13, 27},
+  {9, 11, 24, 24, 9, 11, 6, 14},
+  {8, 8, 25, 15, 10, 10, 25, 7},
+  {0, 18, 8, 22, 38, 18, 38, 22},
+  {19, 19, 23, 23, 1, 39, 9, 39},
+  {16, 36, 20, 36, 16, 2, 20, 10},
+  {37, 3, 37, 11, 17, 17, 21, 21},
+  {4, 17, 4, 46, 14, 19, 14, 46},
+  {18, 16, 47, 47, 5, 15, 5, 15},
+  {17, 12, 44, 12, 19, 6, 44, 6},
+  {13, 7, 13, 7, 18, 16, 45, 45},
+  {4, 42, 4, 21, 14, 42, 14, 23},
+  {43, 43, 22, 20, 5, 15, 5, 15},
+  {40, 12, 21, 12, 40, 6, 23, 6},
+  {13, 7, 13, 7, 41, 41, 22, 20}
+};
+
+const unsigned char subpix3[48][8] = {
+  {0, 7, 1, 6, 3, 4, 2, 5},
+  {7, 4, 6, 5, 0, 3, 1, 2},
+  {4, 3, 5, 2, 7, 0, 6, 1},
+  {3, 0, 2, 1, 4, 7, 5, 6},
+  {1, 0, 6, 7, 2, 3, 5, 4},
+  {0, 3, 7, 4, 1, 2, 6, 5},
+  {3, 2, 4, 5, 0, 1, 7, 6},
+  {2, 1, 5, 6, 3, 0, 4, 7},
+  {6, 1, 7, 0, 5, 2, 4, 3},
+  {1, 2, 0, 3, 6, 5, 7, 4},
+  {2, 5, 3, 4, 1, 6, 0, 7},
+  {5, 6, 4, 7, 2, 1, 3, 0},
+  {7, 6, 0, 1, 4, 5, 3, 2},
+  {6, 5, 1, 2, 7, 4, 0, 3},
+  {5, 4, 2, 3, 6, 7, 1, 0},
+  {4, 7, 3, 0, 5, 6, 2, 1},
+  {6, 7, 5, 4, 1, 0, 2, 3},
+  {7, 0, 4, 3, 6, 1, 5, 2},
+  {0, 1, 3, 2, 7, 6, 4, 5},
+  {1, 6, 2, 5, 0, 7, 3, 4},
+  {2, 3, 1, 0, 5, 4, 6, 7},
+  {3, 4, 0, 7, 2, 5, 1, 6},
+  {4, 5, 7, 6, 3, 2, 0, 1},
+  {5, 2, 6, 1, 4, 3, 7, 0},
+  {7, 0, 6, 1, 4, 3, 5, 2},
+  {0, 3, 1, 2, 7, 4, 6, 5},
+  {3, 4, 2, 5, 0, 7, 1, 6},
+  {4, 7, 5, 6, 3, 0, 2, 1},
+  {6, 7, 1, 0, 5, 4, 2, 3},
+  {7, 4, 0, 3, 6, 5, 1, 2},
+  {4, 5, 3, 2, 7, 6, 0, 1},
+  {5, 6, 2, 1, 4, 7, 3, 0},
+  {1, 6, 0, 7, 2, 5, 3, 4},
+  {6, 5, 7, 4, 1, 2, 0, 3},
+  {5, 2, 4, 3, 6, 1, 7, 0},
+  {2, 1, 3, 0, 5, 6, 4, 7},
+  {0, 1, 7, 6, 3, 2, 4, 5},
+  {1, 2, 6, 5, 0, 3, 7, 4},
+  {2, 3, 5, 4, 1, 0, 6, 7},
+  {3, 0, 4, 7, 2, 1, 5, 6},
+  {1, 0, 2, 3, 6, 7, 5, 4},
+  {0, 7, 3, 4, 1, 6, 2, 5},
+  {7, 6, 4, 5, 0, 1, 3, 2},
+  {6, 1, 5, 2, 7, 0, 4, 3},
+  {5, 4, 6, 7, 2, 3, 1, 0},
+  {4, 3, 7, 0, 5, 2, 6, 1},
+  {3, 2, 0, 1, 4, 5, 7, 6},
+  {2, 5, 1, 6, 3, 4, 0, 7}
+};
+
+/*! This function computes a Peano-Hilbert key for an integer triplet (x,y,z),
+ *  with x,y,z in the range between 0 and 2^bits-1.
+ */
+peanokey peano_hilbert_key(int x, int y, int z, int bits)
+{
+  int mask;
+  unsigned char rotation = 0;
+  peanokey key = 0;
+
+  for(mask = 1 << (bits - 1); mask > 0; mask >>= 1)
+    {
+      unsigned char pix = ((x & mask) ? 4 : 0) | ((y & mask) ? 2 : 0) | ((z & mask) ? 1 : 0);
+
+      key <<= 3;
+      key |= subpix3[rotation][pix];
+      rotation = rottable3[rotation][pix];
+    }
+
+  return key;
+}
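+
+/* Usage sketch (illustrative only, not part of the original code):
+ *
+ *   peanokey key = peano_hilbert_key(12, 5, 9, 8);   // x,y,z in [0, 2^8-1]
+ *
+ * Sorting points by their keys groups spatially nearby points together,
+ * which is what the key is typically used for.
+ */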
diff --git a/peano.h b/peano.h
new file mode 100644
index 0000000000000000000000000000000000000000..465ac263aea790dbc9d13619deba287aad6f9dc4
--- /dev/null
+++ b/peano.h
@@ -0,0 +1,3 @@
+typedef unsigned long long peanokey;
+
+peanokey peano_hilbert_key(int, int, int, int);
diff --git a/w-stacking-fftw.c b/w-stacking-fftw.c
new file mode 100644
index 0000000000000000000000000000000000000000..4088543ba021d22d28c5540bd99b5f0b2a9de958
--- /dev/null
+++ b/w-stacking-fftw.c
@@ -0,0 +1,860 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#ifdef USE_MPI
+#include <mpi.h>
+#ifdef USE_FFTW
+#include <fftw3-mpi.h>
+#endif
+#endif
+#include <math.h>
+#include <time.h>
+#include "w-stacking.h"
+#define PI 3.14159265359
+#define NUM_OF_SECTORS -1
+#define MIN(X, Y) (((X) < (Y)) ? (X) : (Y))
+#define MAX(X, Y) (((X) > (Y)) ? (X) : (Y))
+#define NOVERBOSE
+#define NFILES 10
+
+// Linked List set-up
+struct sectorlist {
+     long index;
+     struct sectorlist * next;
+};
+
+void Push(struct sectorlist** headRef, long data) {
+     struct sectorlist* newNode = malloc(sizeof(struct sectorlist));
+     newNode->index = data;
+     newNode->next = *headRef;
+     *headRef = newNode; 
+}
+
+// Main Code
+int main(int argc, char * argv[]) 
+{
+	int rank;
+	int size;
+
+	FILE * pFile; 
+	FILE * pFile1;
+	FILE * pFilereal;
+	FILE * pFileimg;
+	char filename[1000];
+	//char datapath[900] = "/m100_scratch/userexternal/cgheller/gridding/old/data/gauss2_t201806301100_SBL180.binMS/";
+	//char datapath[900] = "/m100_scratch/userexternal/cgheller/gridding/newgauss2noconj_t201806301100_SBL180.binMS/";
+	//char datapath[900] = "/m100_scratch/userexternal/cgheller/gridding/gauss1_t201806301100_SBL180.binMS/";
+	//char datapath[900] = "/m100_scratch/userexternal/cgheller/gridding/newgauss4_t201806301100_SBL180.binMS/";
+	//char datapath[900] = "/m100_scratch/userexternal/cgheller/gridding/gauss4_t201806301100_SBL180.binMS/";
+	//char datapath[900] = "/m100_scratch/userexternal/cgheller/gridding/hba-8hrs_t201806301100_SBH255i-test.binMS/";
+	//
+	//char datapath[900] = "/m100_scratch/userexternal/cgheller/gridding/hba-8hrs_gauss4new.binMS/";
+	//char datapath[900] = "/m100_scratch/userexternal/cgheller/Lofar/Observations/L798046_SB244_uv.uncorr_130B27932t_146MHz.pre-cal.binMS/";
+	char datapath[900]; 
+	char datapath_multi[NFILES][900];
+
+	char ufile[30] = "ucoord.bin";
+        char vfile[30] = "vcoord.bin";
+        char wfile[30] = "wcoord.bin";
+        char weightsfile[30] = "weights.bin";
+        char visrealfile[30] = "visibilities_real.bin";
+        char visimgfile[30] = "visibilities_img.bin";
+        char metafile[30] = "meta.txt";
+	char outfile[30] = "grid.txt";
+	char outfile1[30] = "coords.txt";
+	char outfile2[30] = "grid_real.bin";
+	char outfile3[30] = "grid_img.bin";
+	char fftfile[30] = "fft.txt";
+	char fftfile2[30] = "fft_real.bin";
+	char fftfile3[30] = "fft_img.bin";
+	char logfile[30] = "run.log";
+	char extension[30] = ".txt";
+	char srank[4];
+
+	double * uu;
+	double * vv;
+	double * ww;
+	float * weights;
+	float * visreal;
+	float * visimg;
+
+	long Nmeasures;
+	long Nvis;
+	long Nweights;
+	long freq_per_chan;
+	long polarisations;
+        long Ntimes;
+	double dt;
+	double thours;
+	long baselines;
+	double uvmin;
+	double uvmax;
+	double wmin;
+	double wmax;
+
+
+	int grid_size_x = 2048;
+	int grid_size_y = 2048;
+	int local_grid_size_x;// = 100;
+	int local_grid_size_y;// = 100;
+	int xaxis;
+	int yaxis;
+	int num_w_planes = 1;
+	// DAV: the corresponding KernelLen is calculated within the wstack function. It can anyway be hardcoded for optimization
+	int w_support = 7;
+	int num_threads;// = 4;
+	double dx = 1.0/(double)grid_size_x;
+	double dw = 1.0/(double)num_w_planes;
+	double w_supporth = (double)((w_support-1)/2)*dx;
+
+	clock_t start, end, start0, startk, endk;
+	double setup_time, process_time, mpi_time, fftw_time, tot_time, kernel_time, reduce_time, compose_time;
+	double setup_time1, process_time1, mpi_time1, fftw_time1, tot_time1, kernel_time1, reduce_time1, compose_time1;
+	double writetime, writetime1;
+
+	struct timespec begin, finish, begin0, begink, finishk;
+	double elapsed;
+	long nsectors;
+
+	clock_gettime(CLOCK_MONOTONIC, &begin0);
+	start0 = clock();
+        // Set the number of OpenMP threads (first command-line argument)
+        if (argc < 2) { fprintf(stderr,"Usage: %s <num_threads>\n",argv[0]); return 1; }
+        num_threads = atoi(argv[1]);
+
+	// Initialize MPI environment
+#ifdef USE_MPI
+	MPI_Init(&argc,&argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+	if(rank == 0)printf("Running with %d MPI tasks\n",size);
+  #ifdef USE_FFTW
+	fftw_mpi_init();
+  #endif
+#else
+	rank = 0;
+	size = 1;
+#endif
+        if(rank == 0)printf("Running with %d threads\n",num_threads);
+
+	// set the local size of the image
+	local_grid_size_x = grid_size_x;
+	nsectors = NUM_OF_SECTORS;
+	if (nsectors < 0) nsectors = size;
+	local_grid_size_y = grid_size_y/nsectors;
+	//nsectors = size;
+	
+	// LOCAL grid size
+	xaxis = local_grid_size_x;
+	yaxis = local_grid_size_y;
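+	// Example: with grid_size_y = 2048 and 4 MPI tasks, nsectors = 4 and each
+	// rank owns a full-width slab of xaxis x yaxis = 2048 x 512 grid points.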
+
+	clock_gettime(CLOCK_MONOTONIC, &begin);
+	start = clock();
+
+	// CLAAAAAA
+	int ndatasets = 2;
+        strcpy(datapath_multi[0],"/m100_scratch/userexternal/cgheller/gridding/Lofar/L798046_SB244_uv.uncorr_130B27932t_146MHz.pre-cal.binMS/");
+        strcpy(datapath_multi[1],"/m100_scratch/userexternal/cgheller/gridding/Lofar/L798046_SB244_uv.uncorr_130B27932t_134MHz.pre-cal.binMS/");
+
+	strcpy(datapath,datapath_multi[0]);
+	// Read metadata
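+	// (meta.txt lists, in this order: Nmeasures, Nvis, freq_per_chan, polarisations,
+	//  Ntimes, dt, thours, baselines, uvmin, uvmax, wmin, wmax)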
+	strcpy(filename,datapath);
+	strcat(filename,metafile);
+	pFile = fopen (filename,"r");
+        fscanf(pFile,"%ld",&Nmeasures);
+        fscanf(pFile,"%ld",&Nvis);
+        fscanf(pFile,"%ld",&freq_per_chan);
+        fscanf(pFile,"%ld",&polarisations);
+        fscanf(pFile,"%ld",&Ntimes);
+        fscanf(pFile,"%lf",&dt);
+        fscanf(pFile,"%lf",&thours);
+        fscanf(pFile,"%ld",&baselines);
+        fscanf(pFile,"%lf",&uvmin);
+        fscanf(pFile,"%lf",&uvmax);
+        fscanf(pFile,"%lf",&wmin);
+        fscanf(pFile,"%lf",&wmax);
+	fclose(pFile);
+
+	// WATCH THIS!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+	int nsub = 1000;
+	//int nsub = 10;
+	printf("Subtracting last %d measurements\n",nsub);
+        Nmeasures = Nmeasures-nsub;
+	Nvis = Nmeasures*freq_per_chan*polarisations;
+
+        // calculate the coordinates of the center
+	double uvshift = uvmin/(uvmax-uvmin);
+	//printf("UVSHIFT %f %f %f %f %f\n",uvmin, uvmax, wmin, wmax, uvshift);
+
+	if (rank == 0)
+	{
+	   printf("N. measurements %ld\n",Nmeasures);
+	   printf("N. visibilities %ld\n",Nvis);
+	}
+
+        // Set temporary local size of points
+	long nm_pe = (long)(Nmeasures/size);
+	long remaining = Nmeasures%size;
+        
+        long startrow = rank*nm_pe;
+        if (rank == size-1)nm_pe = nm_pe+remaining;
+
+	long Nmeasures_tot = Nmeasures;
+	Nmeasures = nm_pe;
+	long Nvis_tot = Nvis;
+	Nvis = Nmeasures*freq_per_chan*polarisations;
+	Nweights = Nmeasures*polarisations;
+
+        #ifdef VERBOSE
+	printf("N. measurements on %d %ld\n",rank,Nmeasures);
+	printf("N. visibilities on %d %ld\n",rank,Nvis);
+        #endif
+
+	// DAV: all these arrays could be allocated statically for the sake of optimization. However, be careful that,
+	// if MPI is used, all the sizes are rescaled by the number of MPI tasks
+	// Allocate arrays
+	uu = (double*) calloc(Nmeasures,sizeof(double));
+	vv = (double*) calloc(Nmeasures,sizeof(double));
+	ww = (double*) calloc(Nmeasures,sizeof(double));
+	weights = (float*) calloc(Nweights,sizeof(float));
+	visreal = (float*) calloc(Nvis,sizeof(float));
+	visimg = (float*) calloc(Nvis,sizeof(float));
+
+        if(rank == 0)printf("READING DATA\n");
+	// Read data
+	strcpy(filename,datapath);
+	strcat(filename,ufile);
+	//printf("Reading %s\n",filename);
+
+	pFile = fopen (filename,"rb");
+	fseek (pFile,startrow*sizeof(double),SEEK_SET);
+	fread(uu,Nmeasures*sizeof(double),1,pFile);
+	fclose(pFile);
+
+	strcpy(filename,datapath);
+	strcat(filename,vfile);
+	//printf("Reading %s\n",filename);
+
+	pFile = fopen (filename,"rb");
+	fseek (pFile,startrow*sizeof(double),SEEK_SET);
+	fread(vv,Nmeasures*sizeof(double),1,pFile);
+	fclose(pFile);
+
+	strcpy(filename,datapath);
+	strcat(filename,wfile);
+	//printf("Reading %s\n",filename);
+
+	pFile = fopen (filename,"rb");
+	fseek (pFile,startrow*sizeof(double),SEEK_SET);
+	fread(ww,Nmeasures*sizeof(double),1,pFile);
+	fclose(pFile);
+
+        #ifdef USE_MPI
+	MPI_Barrier(MPI_COMM_WORLD);
+        #endif
+
+	clock_gettime(CLOCK_MONOTONIC, &finish);
+	end = clock();
+	setup_time = ((double) (end - start)) / CLOCKS_PER_SEC;
+	setup_time1 = (finish.tv_sec - begin.tv_sec);
+	setup_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
+
+        if(rank == 0)printf("GRIDDING DATA\n");
+
+	// Create histograms and linked lists
+        clock_gettime(CLOCK_MONOTONIC, &begin);
+        start = clock();
+ 
+        //CLAAA
+	// Initialize linked list
+	struct sectorlist ** sectorhead;
+	sectorhead = (struct sectorlist **) malloc((nsectors+1) * sizeof(struct sectorlist *));
+	for (int isec=0; isec<=nsectors; isec++)
+	{
+		sectorhead[isec] = malloc(sizeof(struct sectorlist));
+		sectorhead[isec]->index = -1;
+		sectorhead[isec]->next = NULL;
+	}
+
+
+	long * histo_send = (long*) calloc(nsectors+1,sizeof(long));
+	int * boundary = (int*) calloc(Nmeasures,sizeof(int));
+	double uuh,vvh;
+	for (long iphi = 0; iphi < Nmeasures; iphi++)
+	{
+           boundary[iphi] = -1;
+	   uuh = uu[iphi];
+	   vvh = vv[iphi];
+	   int binphi = (int)(vvh*nsectors);
+	   // check whether the point also influences neighboring slabs
+	   double updist = (double)((binphi+1)*yaxis)*dx - vvh;
+	   double downdist = vvh - (double)(binphi*yaxis)*dx;
+	   //
+	   histo_send[binphi]++;
+           Push(&sectorhead[binphi],iphi);
+           if(updist < w_supporth && updist >= 0.0) {histo_send[binphi+1]++; boundary[iphi] = binphi+1; Push(&sectorhead[binphi+1],iphi);};
+	   if(downdist < w_supporth && binphi > 0 && downdist >= 0.0) {histo_send[binphi-1]++; boundary[iphi] = binphi-1; Push(&sectorhead[binphi-1],iphi);};
+	}
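+	// Example (illustrative): with nsectors = 4, a measurement with vv = 0.30
+	// falls in sector 1; if it also lies within w_supporth of a sector edge it
+	// is pushed onto the neighboring sector's list as well, so that the
+	// convolution kernel can be applied across the slab boundary.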
+
+#ifdef PIPPO
+	struct sectorlist * current;
+	long iiii = 0;
+	for (int j=0; j<nsectors; j++)
+	{
+		current = sectorhead[j];
+		iiii = 0;
+		while (current->index != -1)
+		{
+			printf("%d %d %ld %ld %ld\n",rank,j,iiii,histo_send[j],current->index);
+			current = current->next;
+			iiii++;
+		}
+	}
+#endif
+
+        #ifdef VERBOSE
+        for (int iii=0; iii<nsectors+1; iii++)printf("HISTO %d %d %ld\n",rank, iii, histo_send[iii]);
+        #endif
+
+       // Create sector grid
+        double * gridss;
+        double * gridss_w;
+        double * gridss_real;
+        double * gridss_img;
+	double * grid;
+        long size_of_grid;
+        size_of_grid = 2*num_w_planes*xaxis*yaxis;
+        gridss = (double*) calloc(size_of_grid,sizeof(double));
+        gridss_w = (double*) calloc(size_of_grid,sizeof(double));
+        gridss_real = (double*) calloc(size_of_grid/2,sizeof(double));
+        gridss_img = (double*) calloc(size_of_grid/2,sizeof(double));
+        // Create destination slab
+        grid = (double*) calloc(size_of_grid,sizeof(double));
+        // Create temporary global grid
+        #ifndef USE_MPI
+        double * gridtot = (double*) calloc(2*grid_size_x*grid_size_y*num_w_planes,sizeof(double));
+        #endif
+
+        double shift = (double)(dx*yaxis);
+
+        // Open the MPI Memory Window for the slab
+        #ifdef USE_MPI
+        MPI_Win slabwin;
+        MPI_Win_create(grid, size_of_grid*sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &slabwin);
+        MPI_Win_fence(0,slabwin);
+        #endif
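+        // With ONE_SIDE defined, each rank later adds its sector contribution
+        // directly into the owning rank's slab via MPI_Accumulate(MPI_SUM) on
+        // this window; otherwise an MPI_Reduce per sector is used instead.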
+
+        #ifndef USE_MPI
+        pFile1 = fopen (outfile1,"w");
+        #endif
+
+	// loop over files
+	//
+	kernel_time = 0.0;
+	kernel_time1 = 0.0;
+	reduce_time = 0.0;
+	reduce_time1 = 0.0;
+	compose_time = 0.0;
+	compose_time1 = 0.0;
+	for (int ifiles=0; ifiles<ndatasets; ifiles++)
+	{
+
+	strcpy(filename,datapath_multi[ifiles]);
+        printf("Processing %s, %d of %d\n",filename,ifiles+1,ndatasets);
+        strcat(filename,weightsfile);
+
+        pFile = fopen (filename,"rb");
+        fseek (pFile,startrow*polarisations*sizeof(float),SEEK_SET);
+        fread(weights,(Nweights)*sizeof(float),1,pFile);
+        fclose(pFile);
+
+        strcpy(filename,datapath_multi[ifiles]);
+        strcat(filename,visrealfile);
+        //printf("Reading %s\n",filename);
+
+        pFile = fopen (filename,"rb");
+        fseek (pFile,startrow*freq_per_chan*polarisations*sizeof(float),SEEK_SET);
+        fread(visreal,Nvis*sizeof(float),1,pFile);
+        fclose(pFile);
+
+        strcpy(filename,datapath_multi[ifiles]);
+        strcat(filename,visimgfile);
+        //printf("Reading %s\n",filename);
+
+        pFile = fopen (filename,"rb");
+        fseek (pFile,startrow*freq_per_chan*polarisations*sizeof(float),SEEK_SET);
+        fread(visimg,Nvis*sizeof(float),1,pFile);
+        fclose(pFile);
+
+        #ifdef USE_MPI
+        MPI_Barrier(MPI_COMM_WORLD);
+        #endif
+
+        // Declare temporary arrays for the masking
+        double * uus;
+        double * vvs;
+        double * wws;
+        float * visreals;
+        float * visimgs;
+        float * weightss;
+
+	long isector;
+        for (long isector_count=0; isector_count<nsectors; isector_count++)
+        {
+          clock_gettime(CLOCK_MONOTONIC, &begink);
+          startk = clock();
+          // define local destination sector
+          //isector = (isector_count+rank)%size;
+          isector = isector_count;
+	  // allocate sector arrays     
+          long Nsec = histo_send[isector];
+	  uus = (double*) malloc(Nsec*sizeof(double));
+	  vvs = (double*) malloc(Nsec*sizeof(double));
+	  wws = (double*) malloc(Nsec*sizeof(double));
+	  long Nweightss = Nsec*polarisations;
+	  long Nvissec = Nweightss*freq_per_chan;
+	  weightss = (float*) malloc(Nweightss*sizeof(float));
+	  visreals = (float*) malloc(Nvissec*sizeof(float));
+	  visimgs = (float*) malloc(Nvissec*sizeof(float));
+
+	  // select data for this sector
+          long icount = 0;
+	  long ip = 0;
+	  long inu = 0;
+//CLAAAA
+	  struct sectorlist * current;
+	  current = sectorhead[isector];
+	  while (current->index != -1)
+          {
+             long ilocal = current->index;
+	     //double vvh = vv[ilocal];
+             //int binphi = (int)(vvh*nsectors);
+	     //if (binphi == isector || boundary[ilocal] == isector)
+	     //{
+	       uus[icount] = uu[ilocal];
+	       vvs[icount] = vv[ilocal]-isector*shift;
+	       wws[icount] = ww[ilocal];
+               for (long ipol=0; ipol<polarisations; ipol++)
+	       {
+		  weightss[ip] = weights[ilocal*polarisations+ipol];
+                  ip++;
+	       }
+               for (long ifreq=0; ifreq<polarisations*freq_per_chan; ifreq++)
+	       {
+	          visreals[inu] = visreal[ilocal*polarisations*freq_per_chan+ifreq];
+	          visimgs[inu] = visimg[ilocal*polarisations*freq_per_chan+ifreq];
+		  //if(visimgs[inu]>1e10 || visimgs[inu]<-1e10)printf("%f %f %ld %ld %d %ld %ld\n",visreals[inu],visimgs[inu],inu,Nvissec,rank,ilocal*polarisations*freq_per_chan+ifreq,Nvis);
+	          inu++;
+	       }
+	       icount++;
+	     //}
+	     current = current->next;
+          }
+
+	  clock_gettime(CLOCK_MONOTONIC, &finishk);
+	  endk = clock();
+	  compose_time += ((double) (endk - startk)) / CLOCKS_PER_SEC;
+	  compose_time1 += (finishk.tv_sec - begink.tv_sec);
+	  compose_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0;
+          #ifndef USE_MPI
+	  double uumin = 1e20;
+	  double vvmin = 1e20;
+	  double uumax = -1e20;
+	  double vvmax = -1e20;
+
+          for (long ipart=0; ipart<Nsec; ipart++)
+          {
+	       uumin = MIN(uumin,uus[ipart]);
+	       uumax = MAX(uumax,uus[ipart]);
+	       vvmin = MIN(vvmin,vvs[ipart]);
+	       vvmax = MAX(vvmax,vvs[ipart]);
+
+		  
+               if(ipart%10 == 0)fprintf (pFile1, "%ld %f %f %f\n",isector,uus[ipart],vvs[ipart]+isector*shift,wws[ipart]);
+          }
+
+	  printf("UU, VV, min, max = %f %f %f %f\n", uumin, uumax, vvmin, vvmax);
+          #endif
+
+          // Make convolution on the grid
+          #ifdef VERBOSE
+	  printf("Processing sector %ld\n",isector);
+	  #endif
+          clock_gettime(CLOCK_MONOTONIC, &begink);
+          startk = clock();
+          wstack(num_w_planes,
+               Nsec,
+               freq_per_chan,
+               polarisations,
+               uus,
+               vvs,
+               wws,
+               visreals,
+               visimgs,
+               weightss,
+               dx,
+               dw,
+               w_support,
+	       xaxis,
+	       yaxis,
+               gridss,
+               num_threads);
+	  clock_gettime(CLOCK_MONOTONIC, &finishk);
+	  endk = clock();
+	  kernel_time += ((double) (endk - startk)) / CLOCKS_PER_SEC;
+	  kernel_time1 += (finishk.tv_sec - begink.tv_sec);
+	  kernel_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0;
+          #ifdef VERBOSE
+	  printf("Processed sector %ld\n",isector);
+          #endif
+          clock_gettime(CLOCK_MONOTONIC, &begink);
+          startk = clock();
+
+          //for (long iii=0; iii<2*xaxis*yaxis*num_w_planes; iii++)printf("--> %f\n",gridss[iii]);
+
+          #ifndef USE_MPI
+	  long stride = isector*2*xaxis*yaxis*num_w_planes;
+          for (long iii=0; iii<2*xaxis*yaxis*num_w_planes; iii++)gridtot[stride+iii] = gridss[iii];
+	  #endif
+
+	  // Write grid in the corresponding remote slab
+          #ifdef USE_MPI
+	  int target_rank = (int)isector;
+	  //int target_rank = (int)(size-isector-1);
+	  #ifdef ONE_SIDE
+	  printf("One Side communication active\n");
+	  MPI_Win_lock(MPI_LOCK_SHARED,target_rank,0,slabwin);
+	  MPI_Accumulate(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,MPI_SUM,slabwin); 
+	  MPI_Win_unlock(target_rank,slabwin);
+	  //MPI_Put(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,slabwin); 	   
+          #else
+          MPI_Reduce(gridss,grid,size_of_grid,MPI_DOUBLE,MPI_SUM,target_rank,MPI_COMM_WORLD);
+          #endif //ONE_SIDE
+          #endif //USE_MPI
+
+	  clock_gettime(CLOCK_MONOTONIC, &finishk);
+	  endk = clock();
+	  reduce_time += ((double) (endk - startk)) / CLOCKS_PER_SEC;
+	  reduce_time1 += (finishk.tv_sec - begink.tv_sec);
+	  reduce_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0;
+          // Go to next sector
+	  for (long inull=0; inull<2*num_w_planes*xaxis*yaxis; inull++)gridss[inull] = 0.0;
+
+	  // Deallocate all sector arrays
+	  free(uus);
+	  free(vvs);
+	  free(wws);
+	  free(weightss);
+	  free(visreals);
+	  free(visimgs);
+        // End of loop over sectors
+        }
+	// End of loop over input files
+	}
+
+	// Finalize MPI communication
+        #ifdef USE_MPI
+	MPI_Win_fence(0,slabwin);
+	#endif
+
+        // Swap left and right parts APPARENTLY NOT NECESSARY
+	/*
+	for (long kswap=0; kswap<num_w_planes; kswap++)
+	for (long jswap=0; jswap<yaxis; jswap++)
+	for (long iswap=0; iswap<xaxis; iswap++)
+	{
+           long index_origin = 2*(iswap + jswap*xaxis + kswap*yaxis*xaxis);
+	   gridss[index_origin] = grid[index_origin];
+	   gridss[index_origin+1] = grid[index_origin+1];
+	}
+	for (long kswap=0; kswap<num_w_planes; kswap++)
+	for (long jswap=0; jswap<yaxis; jswap++)
+	for (long iswap=0; iswap<xaxis/2; iswap++)
+	{
+           long index_origin = 2*(iswap + jswap*xaxis + kswap*yaxis*xaxis);
+	   long index_destination = 2*(iswap+xaxis/2 + jswap*xaxis + kswap*yaxis*xaxis);
+	   grid[index_destination] = gridss[index_origin];
+	   grid[index_destination+1] = gridss[index_origin+1];
+	}
+
+	for (long kswap=0; kswap<num_w_planes; kswap++)
+	for (long jswap=0; jswap<yaxis; jswap++)
+	for (long iswap=xaxis/2; iswap<xaxis; iswap++)
+	{
+           long index_origin = 2*(iswap + jswap*xaxis + kswap*yaxis*xaxis);
+	   long index_destination = 2*(iswap-xaxis/2 + jswap*xaxis + kswap*yaxis*xaxis);
+	   grid[index_destination] = gridss[index_origin];
+	   grid[index_destination+1] = gridss[index_origin+1];
+	}
+	*/
+
+        #ifndef USE_MPI
+        fclose(pFile1);
+        #endif
+        #ifdef USE_MPI
+	MPI_Barrier(MPI_COMM_WORLD);
+	#endif
+
+        end = clock();
+        clock_gettime(CLOCK_MONOTONIC, &finish);
+        process_time = ((double) (end - start)) / CLOCKS_PER_SEC;
+        process_time1 = (finish.tv_sec - begin.tv_sec);
+        process_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
+        clock_gettime(CLOCK_MONOTONIC, &begin);
+         
+
+#ifdef WRITE_DATA
+	// Write results
+	if (rank == 0)
+	{
+          printf("WRITING GRIDDED DATA\n");
+          pFilereal = fopen (outfile2,"wb");
+          pFileimg = fopen (outfile3,"wb");
+	  #ifdef USE_MPI
+	  for (int isector=0; isector<nsectors; isector++)
+          {
+	      MPI_Win_lock(MPI_LOCK_SHARED,isector,0,slabwin);
+	      MPI_Get(gridss,size_of_grid,MPI_DOUBLE,isector,0,size_of_grid,MPI_DOUBLE,slabwin);
+	      MPI_Win_unlock(isector,slabwin);
+	      for (long i=0; i<size_of_grid/2; i++)
+	      {
+		      gridss_real[i] = gridss[2*i];
+		      gridss_img[i] = gridss[2*i+1];
+	      }
+	      if (num_w_planes > 1)
+	      {
+                for (int iw=0; iw<num_w_planes; iw++)
+                for (int iv=0; iv<yaxis; iv++)
+                for (int iu=0; iu<xaxis; iu++)
+                {
+			  long global_index = (iu + (iv+isector*yaxis)*xaxis + iw*grid_size_x*grid_size_y)*sizeof(double);
+                          long index = iu + iv*xaxis + iw*xaxis*yaxis;
+			  fseek(pFilereal, global_index, SEEK_SET);
+			  fwrite(&gridss_real[index], 1, sizeof(double), pFilereal);
+                }
+                for (int iw=0; iw<num_w_planes; iw++)
+                for (int iv=0; iv<yaxis; iv++)
+                for (int iu=0; iu<xaxis; iu++)
+                {
+                          long global_index = (iu + (iv+isector*yaxis)*xaxis + iw*grid_size_x*grid_size_y)*sizeof(double);
+                          long index = iu + iv*xaxis + iw*xaxis*yaxis;
+                          fseek(pFileimg, global_index, SEEK_SET);
+                          fwrite(&gridss_img[index], 1, sizeof(double), pFileimg);
+                          //double v_norm = sqrt(gridss[index]*gridss[index]+gridss[index+1]*gridss[index+1]);
+                          //fprintf (pFile, "%d %d %d %f %f %f\n", iu,isector*yaxis+iv,iw,gridss[index],gridss[index+1],v_norm);
+                }
+
+	      } else {
+		for (int iw=0; iw<num_w_planes; iw++)
+		{
+			  long global_index = (xaxis*isector*yaxis + iw*grid_size_x*grid_size_y)*sizeof(double);
+                          long index = iw*xaxis*yaxis;
+			  fseek(pFilereal, global_index, SEEK_SET);
+                          fwrite(&gridss_real[index], xaxis*yaxis, sizeof(double), pFilereal);
+                          fseek(pFileimg, global_index, SEEK_SET);
+                          fwrite(&gridss_img[index], xaxis*yaxis, sizeof(double), pFileimg);
+		}
+	      }
+          }
+	  #else
+          for (int iw=0; iw<num_w_planes; iw++)
+            for (int iv=0; iv<grid_size_y; iv++)
+               for (int iu=0; iu<grid_size_x; iu++)
+               {
+                          long index = 2*(iu + iv*grid_size_x + iw*grid_size_x*grid_size_y);
+		          fwrite(&gridtot[index], 1, sizeof(double), pFilereal);
+		          fwrite(&gridtot[index+1], 1, sizeof(double), pFileimg);
+                          //double v_norm = sqrt(gridtot[index]*gridtot[index]+gridtot[index+1]*gridtot[index+1]);
+                          //fprintf (pFile, "%d %d %d %f %f %f\n", iu,iv,iw,gridtot[index],gridtot[index+1],v_norm);
+               }
+          #endif
+          fclose(pFilereal);
+          fclose(pFileimg);
+	}
+
+        #ifdef USE_MPI
+        MPI_Win_fence(0,slabwin);
+        #endif
+#endif //WRITE_DATA
+
+
+#ifdef USE_FFTW
+	// FFT transform the data (using distributed FFTW)
+	
+	if(rank == 0)printf("PERFORMING FFT\n");
+        clock_gettime(CLOCK_MONOTONIC, &begin);
+        start = clock();
+        fftw_plan plan;
+        fftw_complex *fftwgrid;
+	ptrdiff_t alloc_local, local_n0, local_0_start;
+	double norm = 1.0/(double)(grid_size_x*grid_size_y);
+
+        // map the 1D array of complex visibilities onto the 2D array required by FFTW (complex[*][2]):
+	// x is the direction of contiguous data and maps to the second dimension,
+	// y is the parallelized direction and corresponds to the first dimension (--> n0);
+	// the FFT is then performed one w plane at a time
+	alloc_local = fftw_mpi_local_size_2d(grid_size_y, grid_size_x, MPI_COMM_WORLD,&local_n0, &local_0_start);
+	fftwgrid = fftw_alloc_complex(alloc_local);
+	plan = fftw_mpi_plan_dft_2d(grid_size_y, grid_size_x, fftwgrid, fftwgrid, MPI_COMM_WORLD, FFTW_BACKWARD, FFTW_ESTIMATE); 
+
+	long fftwindex = 0;
+	long fftwindex2D = 0;
+	for (int iw=0; iw<num_w_planes; iw++)
+        {
+            //printf("FFTing plan %d\n",iw);	    
+            // select the w-plane to transform
+            for (int iv=0; iv<yaxis; iv++)
+            {
+               for (int iu=0; iu<xaxis; iu++)
+               {
+		   fftwindex2D = iu + iv*xaxis;
+		   fftwindex = 2*(fftwindex2D + iw*xaxis*yaxis);
+                   fftwgrid[fftwindex2D][0] = grid[fftwindex];
+                   fftwgrid[fftwindex2D][1] = grid[fftwindex+1];
+	       }
+	    }
+
+            // do the transform for each w-plane
+	    fftw_execute(plan);
+
+	    // save the transformed w-plane
+            for (int iv=0; iv<yaxis; iv++)
+            {
+               for (int iu=0; iu<xaxis; iu++)
+               {
+		   fftwindex2D = iu + iv*xaxis;
+		   fftwindex = 2*(fftwindex2D + iw*xaxis*yaxis);
+                   gridss[fftwindex] = norm*fftwgrid[fftwindex2D][0];
+                   gridss[fftwindex+1] = norm*fftwgrid[fftwindex2D][1];
+               }
+            }
+
+	}
+
+        fftw_destroy_plan(plan);
+
+        #ifdef USE_MPI
+        MPI_Win_fence(0,slabwin);
+	MPI_Barrier(MPI_COMM_WORLD);
+        #endif
+
+        end = clock();
+        clock_gettime(CLOCK_MONOTONIC, &finish);
+        fftw_time = ((double) (end - start)) / CLOCKS_PER_SEC;
+        fftw_time1 = (finish.tv_sec - begin.tv_sec);
+        fftw_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
+        clock_gettime(CLOCK_MONOTONIC, &begin);
+
+#ifdef WRITE_DATA
+        // Write results
+        #ifdef USE_MPI
+	MPI_Win writewin;
+        MPI_Win_create(gridss, size_of_grid*sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &writewin);
+	MPI_Win_fence(0,writewin);
+        #endif
+        if (rank == 0)
+        {
+          printf("WRITING FFT TRANSFORMED DATA\n");
+          pFilereal = fopen (fftfile2,"wb");
+          pFileimg = fopen (fftfile3,"wb");
+          #ifdef USE_MPI
+          for (int isector=0; isector<nsectors; isector++)
+          {
+              MPI_Win_lock(MPI_LOCK_SHARED,isector,0,writewin);
+              MPI_Get(gridss_w,size_of_grid,MPI_DOUBLE,isector,0,size_of_grid,MPI_DOUBLE,writewin);
+              MPI_Win_unlock(isector,writewin);
+	      for (long i=0; i<size_of_grid/2; i++)
+	      {
+		      gridss_real[i] = gridss_w[2*i];
+		      gridss_img[i] = gridss_w[2*i+1];
+	      }
+	      if (num_w_planes > 1)
+	      {
+                for (int iw=0; iw<num_w_planes; iw++)
+                for (int iv=0; iv<yaxis; iv++)
+                for (int iu=0; iu<xaxis; iu++)
+                {
+			  long global_index = (iu + (iv+isector*yaxis)*xaxis + iw*grid_size_x*grid_size_y)*sizeof(double);
+                          long index = iu + iv*xaxis + iw*xaxis*yaxis;
+			  fseek(pFilereal, global_index, SEEK_SET);
+			  fwrite(&gridss_real[index], 1, sizeof(double), pFilereal);
+                }
+                for (int iw=0; iw<num_w_planes; iw++)
+                for (int iv=0; iv<yaxis; iv++)
+                for (int iu=0; iu<xaxis; iu++)
+                {         
+                          long global_index = (iu + (iv+isector*yaxis)*xaxis + iw*grid_size_x*grid_size_y)*sizeof(double);
+                          long index = iu + iv*xaxis + iw*xaxis*yaxis;
+                          fseek(pFileimg, global_index, SEEK_SET);
+                          fwrite(&gridss_img[index], 1, sizeof(double), pFileimg);
+                }
+              } else {
+                          fwrite(gridss_real, size_of_grid/2, sizeof(double), pFilereal);
+                          fwrite(gridss_img, size_of_grid/2, sizeof(double), pFileimg);
+              }
+
+	      
+          }
+          #else
+          /*
+	  for (int iw=0; iw<num_w_planes; iw++)
+            for (int iv=0; iv<grid_size_y; iv++)
+               for (int iu=0; iu<grid_size_x; iu++)
+               {
+                          int isector = 0;
+                          long index = 2*(iu + iv*grid_size_x + iw*grid_size_x*grid_size_y);
+                          double v_norm = sqrt(gridtot[index]*gridtot[index]+gridtot[index+1]*gridtot[index+1]);
+                          fprintf (pFile, "%d %d %d %f %f %f\n", iu,iv,iw,gridtot[index],gridtot[index+1],v_norm);
+               }
+	  */
+          #endif
+          fclose(pFilereal);
+          fclose(pFileimg);
+        }
+
+        #ifdef USE_MPI
+        MPI_Win_fence(0,writewin);
+	MPI_Win_free(&writewin);
+	MPI_Barrier(MPI_COMM_WORLD);
+        #endif
+#endif //WRITE_DATA
+ 
+	fftw_free(fftwgrid);
+
+#endif //USE_FFTW
+
+	end = clock();
+	clock_gettime(CLOCK_MONOTONIC, &finish);
+	tot_time = ((double) (end - start0)) / CLOCKS_PER_SEC;
+	tot_time1 = (finish.tv_sec - begin0.tv_sec);
+	tot_time1 += (finish.tv_nsec - begin0.tv_nsec) / 1000000000.0;
+
+        if (rank == 0)
+        {
+          printf("Setup time:    %f sec\n",setup_time);
+          printf("Process time:  %f sec\n",process_time);
+          printf("Kernel time = %f, Array Composition time %f, Reduce time: %f sec\n",kernel_time,compose_time,reduce_time);
+#ifdef USE_FFTW
+          printf("FFTW time:     %f sec\n",fftw_time);
+#endif
+          printf("TOT time:      %f sec\n",tot_time);
+	  if(num_threads > 1)
+          {
+            printf("PSetup time:   %f sec\n",setup_time1);
+            printf("PProcess time: %f sec\n",process_time1);
+            printf("PKernel time = %f, PArray Composition time %f, PReduce time: %f sec\n",kernel_time1,compose_time1,reduce_time1);
+#ifdef USE_FFTW
+            printf("PFFTW time:    %f sec\n",fftw_time1);
+#endif
+            printf("PTOT time:     %f sec\n",tot_time1);
+	  }
+        }
+
+
+	// Close MPI environment
+	#ifdef USE_MPI
+        MPI_Win_fence(0,slabwin);
+	MPI_Win_free(&slabwin);
+	MPI_Finalize();
+	#endif
+}
diff --git a/w-stacking.cu b/w-stacking.cu
new file mode 100644
index 0000000000000000000000000000000000000000..9d91d76a27627505cd2da7b487eedf57c49fbe96
--- /dev/null
+++ b/w-stacking.cu
@@ -0,0 +1,306 @@
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+#include "w-stacking.h"
+#include <math.h>
+#include <stdlib.h>
+#include <stdio.h>
+#define NWORKERS -1    //100
+#define NTHREADS 32
+
+#ifdef __CUDACC__
+double __device__
+#else
+double
+#endif
+gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist)
+{
+     double conv_weight;
+     conv_weight = norm * exp(-((u_dist*u_dist)+(v_dist*v_dist))*std22);
+     return conv_weight;
+}
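+// With std = 1 (as set in wstack below), std22 = 0.5 and norm = std22/PI = 1/(2*PI),
+// so gauss_kernel_norm evaluates the normalized 2D Gaussian exp(-(du^2+dv^2)/2)/(2*PI)
+// at the (u,v) offset of a grid cell from the measurement.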
+
+#ifdef __CUDACC__
+//double __device__ gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist)
+//{
+//     double conv_weight;
+//     conv_weight = norm * exp(-((u_dist*u_dist)+(v_dist*v_dist))*std22);
+//     return conv_weight;
+//}
+
+__global__ void convolve_g(
+     int num_w_planes,
+     long num_points,
+     long freq_per_chan,
+     long polarizations,
+     double* uu,
+     double* vv,
+     double* ww,
+     float* vis_real,
+     float* vis_img,
+     float* weight,
+     double dx,
+     double dw,
+     int KernelLen,
+     int grid_size_x,
+     int grid_size_y,
+     double* grid,
+     double std22)
+
+{
+	//printf("DENTRO AL KERNEL\n");
+        long gid = blockIdx.x*blockDim.x + threadIdx.x;
+	if(gid < num_points)
+	{
+	long i = gid;
+        long visindex = i*freq_per_chan*polarizations;
+	double norm = std22/PI;
+
+        int j, k;
+
+        /* Convert UV coordinates to grid coordinates. */
+        double pos_u = uu[i] / dx;
+        double pos_v = vv[i] / dx;
+        double ww_i  = ww[i] / dw;
+        int grid_w = (int)ww_i;
+        int grid_u = (int)pos_u;
+        int grid_v = (int)pos_v;
+
+        // check the boundaries
+        long jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0;
+        long jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1;
+        long kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0;
+        long kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1;
+        //printf("%ld, %ld, %ld, %ld\n",jmin,jmax,kmin,kmax);
+
+
+        // Convolve this point onto the grid.
+        for (k = kmin; k <= kmax; k++)
+        {
+
+            double v_dist = (double)k+0.5 - pos_v;
+
+            for (j = jmin; j <= jmax; j++)
+            {
+                double u_dist = (double)j+0.5 - pos_u;
+                long iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y);
+		//printf("--> %ld %d %d %d\n",iKer,j,k,grid_w);
+
+                double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist);
+                // Loops over frequencies and polarizations
+                double add_term_real = 0.0;
+                double add_term_img = 0.0;
+                long ifine = visindex;
+                for (long ifreq=0; ifreq<freq_per_chan; ifreq++)
+		{	
+		   long iweight = visindex/freq_per_chan;
+                   for (long ipol=0; ipol<polarizations; ipol++)
+                   {
+                      double vistest = (double)vis_real[ifine];
+                      if (!isnan(vistest))
+                      {
+                         add_term_real += weight[iweight] * vis_real[ifine] * conv_weight;
+                         add_term_img += weight[iweight] * vis_img[ifine] * conv_weight;
+		      }
+                      ifine++;
+		      iweight++;
+                   }
+		}
+		atomicAdd(&(grid[iKer]),add_term_real);
+		atomicAdd(&(grid[iKer+1]),add_term_img);
+            }
+        }
+	}
+}
+#endif
+
+void wstack(
+     int num_w_planes,
+     long num_points,
+     long freq_per_chan,
+     long polarizations,
+     double* uu,
+     double* vv,
+     double* ww,
+     float* vis_real,
+     float* vis_img,
+     float* weight,
+     double dx,
+     double dw,
+     int w_support,
+     int grid_size_x,
+     int grid_size_y,
+     double* grid,
+     int num_threads)
+{
+    long i;
+    long index;
+    long visindex;
+
+    // initialize the convolution kernel
+    // gaussian:
+    int KernelLen = (w_support-1)/2;
+    double std = 1.0;
+    double std22 = 1.0/(2.0*std*std);
+    double norm = std22/PI;
+
+    // Loop over visibilities.
+// Switch between CUDA and CPU versions
+#ifdef __CUDACC__
+    // Define the CUDA set up
+    int Nth = NTHREADS;
+    int Nbl = num_points/Nth + 1;
+    if(NWORKERS == 1) {Nbl = 1; Nth = 1;};
+    long Nvis = num_points*freq_per_chan*polarizations;
+    printf("Running on GPU with %d threads and %d blocks\n",Nth,Nbl);
+
+    // Create GPU arrays and offload them
+    double * uu_g;
+    double * vv_g;
+    double * ww_g;
+    float * vis_real_g;
+    float * vis_img_g;
+    float * weight_g;
+    double * grid_g;
+
+    //for (int i=0; i<100000; i++)grid[i]=23.0;
+    cudaError_t mmm;
+    mmm=cudaMalloc(&uu_g,num_points*sizeof(double));
+    mmm=cudaMalloc(&vv_g,num_points*sizeof(double));
+    mmm=cudaMalloc(&ww_g,num_points*sizeof(double));
+    mmm=cudaMalloc(&vis_real_g,Nvis*sizeof(float));
+    mmm=cudaMalloc(&vis_img_g,Nvis*sizeof(float));
+    mmm=cudaMalloc(&weight_g,(Nvis/freq_per_chan)*sizeof(float));
+    mmm=cudaMalloc(&grid_g,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double));
+    mmm=cudaMemset(grid_g,0,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double));
+
+    mmm=cudaMemcpy(uu_g, uu, num_points*sizeof(double), cudaMemcpyHostToDevice);
+    mmm=cudaMemcpy(vv_g, vv, num_points*sizeof(double), cudaMemcpyHostToDevice);
+    mmm=cudaMemcpy(ww_g, ww, num_points*sizeof(double), cudaMemcpyHostToDevice);
+    mmm=cudaMemcpy(vis_real_g, vis_real, Nvis*sizeof(float), cudaMemcpyHostToDevice);
+    mmm=cudaMemcpy(vis_img_g, vis_img, Nvis*sizeof(float), cudaMemcpyHostToDevice);
+    mmm=cudaMemcpy(weight_g, weight, (Nvis/freq_per_chan)*sizeof(float), cudaMemcpyHostToDevice);
+
+    // Call main GPU Kernel
+    convolve_g <<<Nbl,Nth>>> (
+	       num_w_planes,
+               num_points,
+               freq_per_chan,
+               polarizations,
+               uu_g,
+               vv_g,
+               ww_g,
+               vis_real_g,
+               vis_img_g,
+               weight_g,
+               dx,
+               dw,
+               KernelLen,
+               grid_size_x,
+               grid_size_y,
+               grid_g,
+	       std22);
+
+    mmm = cudaMemcpy(grid, grid_g, 2*num_w_planes*grid_size_x*grid_size_y*sizeof(double), cudaMemcpyDeviceToHost);
+    //for (int i=0; i<100000; i++)printf("%f\n",grid[i]);
+    printf("CUDA ERROR %s\n",cudaGetErrorString(mmm));
+    mmm=cudaFree(uu_g);
+    mmm=cudaFree(vv_g);
+    mmm=cudaFree(ww_g);
+    mmm=cudaFree(vis_real_g);
+    mmm=cudaFree(vis_img_g);
+    mmm=cudaFree(weight_g);
+    mmm=cudaFree(grid_g);
+
+// Switch between CUDA and CPU versions
+#else
+
+#ifdef _OPENMP
+    omp_set_num_threads(num_threads);
+#endif
+    #pragma omp parallel for private(visindex) 
+    for (i = 0; i < num_points; i++)
+    {
+#ifdef _OPENMP
+	//int tid;
+	//tid = omp_get_thread_num();
+	//printf("%d\n",tid);
+#endif
+
+        visindex = i*freq_per_chan*polarizations;
+
+        double sum = 0.0;
+        int j, k;
+	//if (i%1000 == 0)printf("%ld\n",i);
+
+        /* Convert UV coordinates to grid coordinates. */
+        double pos_u = uu[i] / dx;
+        double pos_v = vv[i] / dx;
+        double ww_i  = ww[i] / dw;
+        int grid_w = (int)ww_i;
+        int grid_u = (int)pos_u;
+        int grid_v = (int)pos_v;
+
+	// check the boundaries
+	long jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0;
+	long jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1;
+	long kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0;
+	long kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1;
+        //printf("%d, %ld, %ld, %d, %ld, %ld\n",grid_u,jmin,jmax,grid_v,kmin,kmax);
+
+
+        // Convolve this point onto the grid.
+        for (k = kmin; k <= kmax; k++)
+        {
+
+            double v_dist = (double)k+0.5 - pos_v;
+
+            for (j = jmin; j <= jmax; j++)
+            {
+                double u_dist = (double)j+0.5 - pos_u;
+		long iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y);
+
+		double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist);
+		// Loops over frequencies and polarizations
+		double add_term_real = 0.0;
+		double add_term_img = 0.0;
+		long ifine = visindex;
+		// DAV: the following two loops are performed by each thread separately: no race condition issues
+		for (long ifreq=0; ifreq<freq_per_chan; ifreq++)
+		{	
+		   long iweight = visindex/freq_per_chan;
+	           for (long ipol=0; ipol<polarizations; ipol++)
+	           {
+                      if (!isnan(vis_real[ifine]))
+                      {
+		         //printf("%f %ld\n",weight[iweight],iweight);
+                         add_term_real += weight[iweight] * vis_real[ifine] * conv_weight;
+		         add_term_img += weight[iweight] * vis_img[ifine] * conv_weight;
+			 //if(vis_img[ifine]>1e10 || vis_img[ifine]<-1e10)printf("%f %f %f %f %ld %ld\n",vis_real[ifine],vis_img[ifine],weight[iweight],conv_weight,ifine,num_points*freq_per_chan*polarizations);
+		      }
+		      ifine++;
+		      iweight++;
+		   }
+	        }
+		// DAV: this is the critical call in terms of correctness of the results and of performance
+		#pragma omp atomic
+		grid[iKer] += add_term_real;
+		#pragma omp atomic
+		grid[iKer+1] += add_term_img;
+            }
+        }
+	
+    }
+// End switch between CUDA and CPU versions
+#endif
+    //for (int i=0; i<100000; i++)printf("%f\n",grid[i]);
+}
+
+int test(int nnn)
+{
+	int mmm;
+	
+	mmm = nnn+1;
+	return mmm;
+}	
+
diff --git a/w-stacking.h b/w-stacking.h
new file mode 100644
index 0000000000000000000000000000000000000000..4b8bdc1a10c3212509537eea9e455b39d839e241
--- /dev/null
+++ b/w-stacking.h
@@ -0,0 +1,32 @@
+#ifndef W_PROJECT_H_
+#define W_PROJECT_H_
+
+#define PI 3.14159265359
+#define REAL_TYPE double
+#ifdef __CUDACC__
+extern "C"
+#endif
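+// wstack(num_w_planes, num_points, freq_per_chan, polarizations,
+//        uu, vv, ww, vis_real, vis_img, weight,
+//        dx, dw, w_support, grid_size_x, grid_size_y, grid, num_threads)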
+void wstack(
+     int,
+     long,
+     long,
+     long,
+     double*,
+     double*,
+     double*,
+     float*,
+     float*,
+     float*,
+     double,
+     double,
+     int,
+     int,
+     int,
+     double*,
+     int);
+#ifdef __CUDACC__
+extern "C"
+#endif
+int test(int nnn);
+
+#endif