diff --git a/.gitignore b/.gitignore
index 797747aa16a2fb9000f5268a91f10e289e07f077..77346fd87e48eca5a70138537312d012fec11328 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,6 @@
 *.o
+.DS_Store
+sync.sh
 phase_correction.c
 w-stacking.c
 w-stackingCfftw
diff --git a/Build/Makefile.M100 b/Build/Makefile.M100
new file mode 100644
index 0000000000000000000000000000000000000000..1fa008e8cf29ab9f112d3ad399a6fe4dd5faa901
--- /dev/null
+++ b/Build/Makefile.M100
@@ -0,0 +1,20 @@
+CC = gcc
+CXX = g++
+
+MPICC = mpicc
+MPIC++ = mpiCC
+
+
+#FFTW_INCL= -I/home/taffoni/sw/include
+#FFTW_LIB= -L/home/taffoni/sw/lib
+
+
+NVCC = nvcc
+NVFLAGS = -arch=sm_70 -Xcompiler -mno-float128 -std=c++11
+NVLIB = -L/cineca/prod/opt/compilers/cuda/10.1/none/lib64/ -lcudart -lcuda
+
+OMP= -fopenmp
+
+CFLAGS += -I. $(FFTW_INCL) $(GSL_INCL) $(MPI_INCL)
+
+OPTIMIZE = $(OMP) -O3 -mtune=native
diff --git a/Build/Makefile.Macosx b/Build/Makefile.Macosx
new file mode 100644
index 0000000000000000000000000000000000000000..dbbde55f903759d481ff55f4e8d6919183384ca1
--- /dev/null
+++ b/Build/Makefile.Macosx
@@ -0,0 +1,26 @@
+CC = icc
+CXX = g++-11
+
+MPICC = mpicc
+MPIC++ = mpiCC
+
+FFTW_INCL= -I/usr/local/include
+FFTW_LIB= /usr/local/lib/
+
+GSL_INCL =
+GSL_LIBS =
+
+MPI_LIB =
+MPI_INCL= -I/home/taffoni/sw/Linux_x86_64/21.5/comm_libs/mpi/include
+HDF5_INCL =
+HDF5_LIB =
+
+OMP= -fopenmp
+
+NVCC = nvcc
+NVFLAGS = -arch=sm_70 -Xcompiler -mno-float128 -std=c++11
+NVLIB = -L/home/taffoni -lcudart -lcuda
+
+CFLAGS += -I. $(FFTW_INCL) $(GSL_INCL) $(MPI_INCL)
+
+OPTIMIZE = $(OMP) -O3
diff --git a/Build/Makefile.Magellanus b/Build/Makefile.Magellanus
new file mode 100644
index 0000000000000000000000000000000000000000..8f5dc582b872b0a3355d9b06fa4ebbcabda79843
--- /dev/null
+++ b/Build/Makefile.Magellanus
@@ -0,0 +1,34 @@
+CC = gcc-10
+CXX = g++-10
+
+MPICC = mpicc
+MPIC++ = mpiCC
+
+GSL_INCL = -I/home/taffoni/sw/include
+GSL_LIBS = -L/home/taffoni/sw/lib
+
+FFTW_INCL= -I/home/taffoni/sw/include
+FFTW_LIB= /home/taffoni/sw/lib -lfftw3_mpi -lfftw3
+
+#-L/opt/cluster/openmpi/3.1.3/gnu/8.2.0/lib -lmpi
+MPI_LIB =
+#-I/opt/cluster/openmpi/3.1.3/gnu/8.2.0/include
+MPI_INCL= -I/home/taffoni/sw/Linux_x86_64/21.5/comm_libs/mpi/include
+HDF5_INCL =
+HDF5_LIB =
+
+OMP = -mp=multicore,gpu -Mprof -cuda
+#OMP = -fopenmp
+NVCC = nvcc
+NVFLAGS = -arch=sm_70 -Xcompiler -std=c++11
+NVLIB = -L/home/taffoni/sw/Linux_x86_64/21.5/cuda/11.3/lib64/ -lcudart -lcuda
+
+
+CFLAGS += -I. $(FFTW_INCL) $(GSL_INCL) $(MPI_INCL)
+
+OPTIMIZE = $(OMP) -O3
+
+# OMP GPU SPECIFIC FLAGS
+#OPTIMIZE += -Wno-unused-result -foffload=-lm -ffast-math
+#OPTIMIZE += -fcf-protection=none -fno-stack-protector -foffload=nvptx-none -foffload=-misa=sm_35
+#-ffast-math -fopt-info-all-omp -foffload=-misa=sm_35 -fcf-protection=none -fno-stack-protector -foffload=nvptx-none
diff --git a/Build/Makefile.Marconi b/Build/Makefile.Marconi
new file mode 100644
index 0000000000000000000000000000000000000000..0d68960b481ccaac10ce193c204dc5858c7d362c
--- /dev/null
+++ b/Build/Makefile.Marconi
@@ -0,0 +1,20 @@
+CC = gcc
+CXX = g++
+
+MPICC = mpicc
+MPIC++ = mpiCC
+
+
+FFTW_INCL= -I/home/taffoni/sw/include
+FFTW_LIB= /home/taffoni/sw/lib
+
+
+NVCC = nvcc
+NVFLAGS = -arch=sm_70 -Xcompiler -mno-float128 -std=c++11
+NVLIB = -L/cineca/prod/opt/compilers/cuda/10.1/none/lib64/ -lcudart -lcuda
+
+OMP= -fopenmp
+
+CFLAGS += -I. $(FFTW_INCL) $(GSL_INCL) $(MPI_INCL)
+
+OPTIMIZE = $(OMP) -O3 -mtune=native
diff --git a/Build/Makefile.systype b/Build/Makefile.systype
new file mode 100644
index 0000000000000000000000000000000000000000..04922322304924d1c8a5397ede089516d9a1d256
--- /dev/null
+++ b/Build/Makefile.systype
@@ -0,0 +1,24 @@
+CC = gcc-10
+CXX = g++-10
+
+MPICC = mpicc
+MPIC++ = mpiCC
+
+OPTIMIZE =
+
+
+GSL_INCL =
+GSL_LIB =
+
+FFTW_INCL=
+FFTW_LIB=
+
+NVCC =
+NVFLAGS =
+NVLIB =
+
+CFLAGS +=
+
+MPICHLIB =
+HDF5INCL =
+HDF5LIB =
diff --git a/Makefile b/Makefile
index 99d13891512a600974a7a33c5eb43683eaf22e3f..23d161d0f6b432626329c9ffa52f94d2e31fab2a 100644
--- a/Makefile
+++ b/Makefile
@@ -1,8 +1,31 @@
-# comment/uncomment the various options depending hoe you want to build the program
+# comment/uncomment the various options depending on how you want to build the program
+# Set default values for compiler options if no systype options are given or found
+CC = mpiCC
+CXX = mpiCC
+OPTIMIZE = -std=c++11 -Wall -g -O2
+MPICHLIB = -lmpich
+SWITCHES =
+
+ifdef SYSTYPE
+SYSTYPE := $(SYSTYPE)
+include Build/Makefile.$(SYSTYPE)
+else
+include Build/Makefile.systype
+endif
+
+
+LIBS = -L$(FFTW_LIB) -lfftw3 -lm -lcudart -lcuda
+
 # create MPI code
 OPT += -DUSE_MPI
+#OPT += -DACCOMP
 # use FFTW (it can be switched on ONLY if MPI is active)
-OPT += -DUSE_FFTW
+ifeq (USE_MPI,$(findstring USE_MPI,$(OPT)))
+  OPT += -DUSE_FFTW
+  LIBS = -L$(FFTW_LIB) -lfftw3_mpi -lfftw3 -lm
+endif
+
+#OPT += -DNVIDIA
 # perform one-side communication (suggested) instead of reduce (only if MPI is active)
 OPT += -DONE_SIDE
 # write the full 3D cube of gridded visibilities and its FFT transform
@@ -10,25 +33,8 @@ OPT += -DONE_SIDE
 # write the final image
 OPT += -DWRITE_IMAGE
 # perform w-stacking phase correction
-#OPT += -DPHASE_ON
-
-CC = gcc
-CXX = g++
-ifeq (USE_MPI,$(findstring USE_MPI,$(OPT)))
-  CC = mpicc
-  CXX = mpiCC
-endif
-
-OMP = -fopenmp
-#OMP =
-
-CFLAGS += -O3 -mcpu=native
-CFLAGS += -I.
-LIBS = -L$(FFTW_LIB) -lfftw3_mpi -lfftw3 -lm
+OPT += -DPHASE_ON
-NVCC = nvcc
-NVFLAGS = -arch=sm_70 -Xcompiler -mno-float128 -std=c++11
-NVLIB = -L/cineca/prod/opt/compilers/cuda/10.1/none/lib64/ -lcudart -lcuda
 
 DEPS = w-stacking.h w-stacking-fftw.c w-stacking.cu phase_correction.cu
 COBJ = w-stacking.o w-stacking-fftw.o phase_correction.o
@@ -39,27 +45,40 @@ w-stacking.c: w-stacking.cu
 phase_correction.c: phase_correction.cu
 	cp phase_correction.cu phase_correction.c
 
+ifeq (USE_MPI,$(findstring USE_MPI,$(OPT)))
+%.o: %.c $(DEPS)
+	$(MPICC) $(OPTIMIZE) $(OPT) -c -o $@ $< $(CFLAGS)
+else
 %.o: %.c $(DEPS)
-	$(CC) $(OMP) -c -o $@ $< $(CFLAGS) $(OPT)
+	$(CC) $(OPTIMIZE) $(OPT) -c -o $@ $< $(CFLAGS)
+endif
 
 serial: $(COBJ)
-	$(CC) $(OMP) -o w-stackingCfftw_serial $(CFLAGS) $^ -lm
+	$(CC) $(OPTIMIZE) $(OPT) -o w-stackingCfftw_serial $^ $(LIBS)
+
+serial_omp: phase_correction.c
+	$(CC) $(OPTIMIZE) $(OPT) -o w-stackingOMP_serial w-stacking-fftw.c w-stacking_omp.c $(CFLAGS) $(LIBS)
+
+simple_mpi: phase_correction.c
+	$(MPICC) $(OPTIMIZE) $(OPT) -o w-stackingMPI_simple w-stacking_omp.c w-stacking-fftw.c phase_correction.c $(CFLAGS) $(LIBS)
+
+mpi_omp: phase_correction.c
+	$(MPICC) $(OPTIMIZE) $(OPT) -o w-stackingMPI_omp w-stacking_omp.c w-stacking-fftw.c phase_correction.c $(CFLAGS) $(LIBS)
 
 serial_cuda:
-	$(NVCC) $(OPT) $(NVFLAGS) -c w-stacking.cu phase_correction.cu $(NVLIB)
-	$(CC) $(CFLAGS) $(OPT) -c w-stacking-fftw.c
-	$(CXX) $(CFLAGS) $(OPT) -o w-stackingfftw_serial w-stacking-fftw.o w-stacking.o phase_correction.o $(NVLIB) -lm
+	$(NVCC) $(NVFLAGS) -c w-stacking.cu phase_correction.cu $(NVLIB)
+	$(CC) $(OPTIMIZE) $(OPT) -c w-stacking-fftw.c $(CFLAGS) $(LIBS)
+	$(CXX) $(OPTIMIZE) $(OPT) -o w-stackingfftw_serial w-stacking-fftw.o w-stacking.o phase_correction.o $(CFLAGS) $(NVLIB) -lm
 
-mpi: $(COBJ)
-	$(CC) $(OMP) -o w-stackingCfftw $(CFLAGS) $^ $(LIBS)
+mpi: $(COBJ)
+	$(MPICC) $(OPTIMIZE) -o w-stackingCfftw $^ $(CFLAGS) $(LIBS)
 
 mpi_cuda:
-	$(NVCC) $(NVFLAGS) $(OPT) -c w-stacking.cu phase_correction.cu $(NVLIB)
-	$(CC) $(CFLAGS) $(OPT) -c w-stacking-fftw.c
-	$(CXX) $(CFLAGS) $(OPT) -o w-stackingfftw w-stacking-fftw.o w-stacking.o phase_correction.o $(NVLIB) $(LIBS) -lm
+	$(NVCC) $(NVFLAGS) -c w-stacking.cu phase_correction.cu $(NVLIB)
+	$(MPICC) $(OPTIMIZE) $(OPT) -c w-stacking-fftw.c $(CFLAGS) $(LIBS)
+	$(MPIC++) $(OPTIMIZE) $(OPT) -o w-stackingfftw w-stacking-fftw.o w-stacking.o phase_correction.o $(NVLIB) $(CFLAGS) $(LIBS)
 
 clean:
 	rm *.o
 	rm w-stacking.c
 	rm phase_correction.c
-
diff --git a/phase_correction.cu b/phase_correction.cu
index 9a439248899b8f592124a34e53a34bcd3e35c501..98bd6cc96939e214942e86cf947af32332841e40 100644
--- a/phase_correction.cu
+++ b/phase_correction.cu
@@ -170,6 +170,12 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in
 			double preal, pimag;
 			double radius2 = (xcoord*xcoord+ycoord*ycoord);
 
-			preal = cos(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
-			pimag = sin(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
+			if(radius2 <= 1.0)
+			{
+				preal = cos(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
+				pimag = sin(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
+			} else {
+				preal = cos(-2.0*PI*wterm*(sqrt(radius2-1.0)-1));
+				pimag = 0.0;
+			}
 
@@ -198,7 +204,7 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in
 				image_imag[img_index] += gridss[index+1];
 #endif // end of PHASE_ON
 
-			}
+			}
 	}
 
 #endif // end of __CUDACC__
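Note on the phase_correction.cu hunk above: the factor it applies per w-plane is the standard w-stacking phase correction. For a pixel with direction cosines (l, m) and radius2 = l^2 + m^2 <= 1, each plane is rotated by

    exp(2*pi*i * wterm * (sqrt(1 - radius2) - 1))

whose real and imaginary parts are exactly the cos/sin pair in the first branch. For radius2 > 1 the square root turns imaginary and the exponential degenerates into a real attenuation, which is what the real-only else branch approximates; guarding on radius2 (rather than on a single coordinate) is what keeps both square-root arguments non-negative.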
diff --git a/scripts/gridfftbin.m b/scripts/gridfftbin.m
index d948638fdc3324cb4c0aad449c90c70b6b9fae09..b4bc237f7d2c3c65e1bc0b309ef5d9c3b18810cb 100644
--- a/scripts/gridfftbin.m
+++ b/scripts/gridfftbin.m
@@ -10,7 +10,7 @@ nplanes = 1;
 % id = '4';
 suffix = '';
 id = '';
 datadir = "/Users/cgheller/Work/Gridding/data/";
 filename1 = strcat(datadir,"fft_real",suffix,id,'.bin');
 filename2 = strcat(datadir,"fft_img",suffix,id,'.bin');
 s = dir(filename1);
@@ -52,4 +52,3 @@ xlabel('cell');
 ylabel('cell');
 pngimage = strcat(datadir,'fft-2freq',suffix,id,'.png');
 saveas(gcf,pngimage);
-
diff --git a/scripts/plotgrid.py b/scripts/plotgrid.py
new file mode 100644
index 0000000000000000000000000000000000000000..2cf1bab67a5c54c9befd586a414eb160fa88800e
--- /dev/null
+++ b/scripts/plotgrid.py
@@ -0,0 +1,27 @@
+#!/usr/bin/python3
+import numpy as np
+import matplotlib.pyplot as plt
+filename1 = "fft_real.bin"
+filename2 = "fft_img.bin"
+nplanes = 1
+
+with open(filename1, 'rb') as f1:
+    vreal = np.fromfile(f1, dtype=np.float64)
+with open(filename2, 'rb') as f2:
+    vimg = np.fromfile(f2, dtype=np.float64)
+
+xaxis = int(np.sqrt(vreal.size))
+yaxes = xaxis
+residual = np.vectorize(complex)(vreal, vimg)
+
+cumul2d = residual.reshape((xaxis,yaxes,nplanes), order='F')
+for i in range(nplanes):
+    gridded = np.squeeze(cumul2d[:,:,i])
+    ax = plt.subplot()
+    img = ax.imshow(np.abs(np.fft.fftshift(gridded)), aspect='auto', interpolation='none', origin='lower')
+    ax.set_xlabel('cell')
+    ax.set_ylabel('cell')
+    cbar = plt.colorbar(img)
+    cbar.set_label('norm(FFT)',size=18)
+    figname='grid_image_' + str(i) + '.png'
+    plt.savefig(figname)
diff --git a/w-stacking-fftw.c b/w-stacking-fftw.c
index 0eda87e9b2dcef70714081129aff0fbeb468fae0..908dda8c47943f936d689ce71a202684122aef4e 100644
--- a/w-stacking-fftw.c
+++ b/w-stacking-fftw.c
@@ -7,9 +7,15 @@
 #include <fftw3-mpi.h>
 #endif
 #endif
+#include <omp.h>
 #include <math.h>
 #include <time.h>
+#include <unistd.h>
+#ifdef ACCOMP
+#include "w-stacking_omp.h"
+#else
 #include "w-stacking.h"
+#endif
 #define PI 3.14159265359
 #define NUM_OF_SECTORS -1
 #define MIN(X, Y) (((X) < (Y)) ? (X) : (Y))
@@ -27,16 +33,16 @@
 void Push(struct sectorlist** headRef, long data) {
 	struct sectorlist* newNode = malloc(sizeof(struct sectorlist));
 	newNode->index = data;
 	newNode->next = *headRef;
-	*headRef = newNode;
+	*headRef = newNode;
 }
 
 // Main Code
-int main(int argc, char * argv[])
+int main(int argc, char * argv[])
 {
 	int rank;
 	int size;
-	FILE * pFile;
+	FILE * pFile;
 	FILE * pFile1;
 	FILE * pFilereal;
 	FILE * pFileimg;
@@ -50,16 +56,16 @@ int main(int argc, char * argv[])
 //
 //char datapath[900] = "/m100_scratch/userexternal/cgheller/gridding/hba-8hrs_gauss4new.binMS/";
 //char datapath[900] = "/m100_scratch/userexternal/cgheller/Lofar/Observations/L798046_SB244_uv.uncorr_130B27932t_146MHz.pre-cal.binMS/";
-	char datapath[900];
+	char datapath[900];
 	char datapath_multi[NFILES][900];
 	char ufile[30] = "ucoord.bin";
-	char vfile[30] = "vcoord.bin";
-	char wfile[30] = "wcoord.bin";
-	char weightsfile[30] = "weights.bin";
-	char visrealfile[30] = "visibilities_real.bin";
-	char visimgfile[30] = "visibilities_img.bin";
-	char metafile[30] = "meta.txt";
+	char vfile[30] = "vcoord.bin";
+	char wfile[30] = "wcoord.bin";
+	char weightsfile[30] = "weights.bin";
+	char visrealfile[30] = "visibilities_real.bin";
+	char visimgfile[30] = "visibilities_img.bin";
+	char metafile[30] = "meta.txt";
 	char outfile[30] = "grid.txt";
 	char outfile1[30] = "coords.txt";
 	char outfile2[30] = "grid_real.bin";
@@ -94,7 +100,7 @@ int main(int argc, char * argv[])
 	double wmax,wmax0;
 	double resolution;
 
-	// MESH SIZE
+	// MESH SIZE
 	int grid_size_x = 2048;
 	int grid_size_y = 2048;
 	int local_grid_size_x;// = 8;
@@ -102,6 +108,7 @@ int main(int argc, char * argv[])
 	int xaxis;
 	int yaxis;
 	int num_w_planes = 8;
+	// DAV: the corresponding KernelLen is calculated within the wstack function. It can anyway be hardcoded for optimization
 	int w_support = 7;
 	int num_threads;// = 4;
@@ -117,11 +124,23 @@ int main(int argc, char * argv[])
 	struct timespec begin, finish, begin0, begink, finishk;
 	double elapsed;
 	long nsectors;
+	/* GT: get the number of threads; exit if not given */
+	if(argc == 1) {
+		fprintf(stderr, "Usage: %s number_of_OMP_Threads \n", argv[0]);
+		exit(1);
+	}
+	// Set the number of OpenMP threads
+	num_threads = atoi(argv[1]);
+
+	if ( num_threads == 0 )
+	{
+		fprintf(stderr, "Wrong parameter: %s\n\n", argv[1]);
+		fprintf(stderr, "Usage: %s number_of_OMP_Threads \n", argv[0]);
+		exit(1);
+	}
 
 	clock_gettime(CLOCK_MONOTONIC, &begin0);
 	start0 = clock();
-	// Set the number of OpenMP threads
-	num_threads = atoi(argv[1]);
 
 	// Intialize MPI environment
 #ifdef USE_MPI
@@ -136,7 +155,21 @@ int main(int argc, char * argv[])
 	rank = 0;
 	size = 1;
 #endif
-	if(rank == 0)printf("Running with %d threads\n",num_threads);
+
+	if(rank == 0)printf("Running with %d threads\n",num_threads);
+
+#ifdef ACCOMP
+	if(rank == 0){
+		if (0 == omp_get_num_devices()) {
+			printf("No accelerator found ... exit\n");
exit\n"); + exit(255); + } + printf("Number of available GPUs %d\n", omp_get_num_devices()); + #ifdef NVIDIA + prtAccelInfo(); + #endif + } +#endif // set the local size of the image local_grid_size_x = grid_size_x; @@ -144,7 +177,7 @@ int main(int argc, char * argv[]) if (nsectors < 0) nsectors = size; local_grid_size_y = grid_size_y/nsectors; //nsectors = size; - + // LOCAL grid size xaxis = local_grid_size_x; yaxis = local_grid_size_y; @@ -183,10 +216,10 @@ int main(int argc, char * argv[]) int nsub = 1000; //int nsub = 10; printf("Subtracting last %d measurements\n",nsub); - Nmeasures = Nmeasures-nsub; + Nmeasures = Nmeasures-nsub; Nvis = Nmeasures*freq_per_chan*polarisations; - // calculate the coordinates of the center + // calculate the coordinates of the center double uvshift = uvmin/(uvmax-uvmin); //printf("UVSHIFT %f %f %f %f %f\n",uvmin, uvmax, wmin, wmax, uvshift); @@ -196,12 +229,12 @@ int main(int argc, char * argv[]) printf("N. visibilities %ld\n",Nvis); } - // Set temporary local size of points + // Set temporary local size of points long nm_pe = (long)(Nmeasures/size); long remaining = Nmeasures%size; - - long startrow = rank*nm_pe; - if (rank == size-1)nm_pe = nm_pe+remaining; + + long startrow = rank*nm_pe; + if (rank == size-1)nm_pe = nm_pe+remaining; long Nmeasures_tot = Nmeasures; Nmeasures = nm_pe; @@ -209,10 +242,11 @@ int main(int argc, char * argv[]) Nvis = Nmeasures*freq_per_chan*polarisations; Nweights = Nmeasures*polarisations; - #ifdef VERBOSE +#ifdef VERBOSE printf("N. measurements on %d %ld\n",rank,Nmeasures); printf("N. visibilities on %d %ld\n",rank,Nvis); - #endif +#endif + // DAV: all these arrays can be allocatate statically for the sake of optimization. However be careful that if MPI is used // all the sizes are rescaled by the number of MPI tasks @@ -224,7 +258,7 @@ int main(int argc, char * argv[]) visreal = (float*) calloc(Nvis,sizeof(float)); visimg = (float*) calloc(Nvis,sizeof(float)); - if(rank == 0)printf("READING DATA\n"); + if(rank == 0)printf("READING DATA\n"); // Read data strcpy(filename,datapath); strcat(filename,ufile); @@ -239,7 +273,7 @@ int main(int argc, char * argv[]) strcat(filename,vfile); //printf("Reading %s\n",filename); - pFile = fopen (filename,"rb"); + pFile = fopen (filename,"rb"); fseek (pFile,startrow*sizeof(double),SEEK_SET); fread(vv,Nmeasures*sizeof(double),1,pFile); fclose(pFile); @@ -253,9 +287,9 @@ int main(int argc, char * argv[]) fread(ww,Nmeasures*sizeof(double),1,pFile); fclose(pFile); - #ifdef USE_MPI +#ifdef USE_MPI MPI_Barrier(MPI_COMM_WORLD); - #endif +#endif clock_gettime(CLOCK_MONOTONIC, &finish); end = clock(); @@ -263,13 +297,13 @@ int main(int argc, char * argv[]) setup_time1 = (finish.tv_sec - begin.tv_sec); setup_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0; - if(rank == 0)printf("GRIDDING DATA\n"); + + if(rank == 0)printf("GRIDDING DATA\n"); // Create histograms and linked lists - clock_gettime(CLOCK_MONOTONIC, &begin); - start = clock(); - - //CLAAA + clock_gettime(CLOCK_MONOTONIC, &begin); + start = clock(); + // Initialize linked list struct sectorlist ** sectorhead; sectorhead = (struct sectorlist **) malloc((nsectors+1) * sizeof(struct sectorlist)); @@ -286,7 +320,7 @@ int main(int argc, char * argv[]) double uuh,vvh; for (long iphi = 0; iphi < Nmeasures; iphi++) { - boundary[iphi] = -1; + boundary[iphi] = -1; uuh = uu[iphi]; vvh = vv[iphi]; int binphi = (int)(vvh*nsectors); @@ -295,8 +329,8 @@ int main(int argc, char * argv[]) double downdist = vvh - (double)(binphi*yaxis)*dx; // 
 		histo_send[binphi]++;
-		Push(&sectorhead[binphi],iphi);
-		if(updist < w_supporth && updist >= 0.0) {histo_send[binphi+1]++; boundary[iphi] = binphi+1; Push(&sectorhead[binphi+1],iphi);};
+		Push(&sectorhead[binphi],iphi);
+		if(updist < w_supporth && updist >= 0.0) {histo_send[binphi+1]++; boundary[iphi] = binphi+1; Push(&sectorhead[binphi+1],iphi);};
 		if(downdist < w_supporth && binphi > 0 && downdist >= 0.0) {histo_send[binphi-1]++; boundary[iphi] = binphi-1; Push(&sectorhead[binphi-1],iphi);};
 	}
 
@@ -316,41 +350,39 @@ int main(int argc, char * argv[])
 	}
 #endif
 
-	#ifdef VERBOSE
-	for (int iii=0; iii<nsectors+1; iii++)printf("HISTO %d %d %ld\n",rank, iii, histo_send[iii]);
-	#endif
-
-	// Create sector grid
-	double * gridss;
-	double * gridss_w;
-	double * gridss_real;
-	double * gridss_img;
-	double * grid;
-	long size_of_grid;
-	size_of_grid = 2*num_w_planes*xaxis*yaxis;
-	gridss = (double*) calloc(size_of_grid,sizeof(double));
-	gridss_w = (double*) calloc(size_of_grid,sizeof(double));
-	gridss_real = (double*) calloc(size_of_grid/2,sizeof(double));
-	gridss_img = (double*) calloc(size_of_grid/2,sizeof(double));
-	// Create destination slab
-	grid = (double*) calloc(size_of_grid,sizeof(double));
-	// Create temporary global grid
-	#ifndef USE_MPI
-	double * gridtot = (double*) calloc(2*grid_size_x*grid_size_y*num_w_planes,sizeof(double));
-	#endif
-
-	double shift = (double)(dx*yaxis);
+#ifdef VERBOSE
+	for (int iii=0; iii<nsectors+1; iii++)printf("HISTO %d %d %ld\n",rank, iii, histo_send[iii]);
+#endif
 
-	// Open the MPI Memory Window for the slab
-	#ifdef USE_MPI
-	MPI_Win slabwin;
-	MPI_Win_create(grid, size_of_grid*sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &slabwin);
-	MPI_Win_fence(0,slabwin);
-	#endif
+// Create sector grid
+	double * gridss;
+	double * gridss_w;
+	double * gridss_real;
+	double * gridss_img;
+	double * grid;
+	long size_of_grid;
+	size_of_grid = 2*num_w_planes*xaxis*yaxis;
+	gridss = (double*) calloc(size_of_grid,sizeof(double));
+	gridss_w = (double*) calloc(size_of_grid,sizeof(double));
+	gridss_real = (double*) calloc(size_of_grid/2,sizeof(double));
+	gridss_img = (double*) calloc(size_of_grid/2,sizeof(double));
+	// Create destination slab
+	grid = (double*) calloc(size_of_grid,sizeof(double));
+	// Create temporary global grid
+#ifndef USE_MPI
+	double * gridtot = (double*) calloc(2*grid_size_x*grid_size_y*num_w_planes,sizeof(double));
+#endif
+	double shift = (double)(dx*yaxis);
+	// Open the MPI Memory Window for the slab
+#ifdef USE_MPI
+	MPI_Win slabwin;
+	MPI_Win_create(grid, size_of_grid*sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &slabwin);
+	MPI_Win_fence(0,slabwin);
+#endif
+#ifndef USE_MPI
+	pFile1 = fopen (outfile1,"w");
+#endif
 
-	#ifndef USE_MPI
-	pFile1 = fopen (outfile1,"w");
-	#endif
 	// loop over files
 	//
@@ -365,7 +397,6 @@ int main(int argc, char * argv[])
 	for (int ifiles=0; ifiles<ndatasets; ifiles++)
 	{
-
 	strcpy(filename,datapath_multi[ifiles]);
 	printf("Processing %s, %d of %d\n",filename,ifiles+1,ndatasets);
@@ -400,95 +431,95 @@ int main(int argc, char * argv[])
 	fread(weights,(Nweights)*sizeof(float),1,pFile);
 	fclose(pFile);
 
-	strcpy(filename,datapath);
-	strcat(filename,visrealfile);
-	//printf("Reading %s\n",filename);
-
-	pFile = fopen (filename,"rb");
-	fseek (pFile,startrow*freq_per_chan*polarisations*sizeof(float),SEEK_SET);
-	fread(visreal,Nvis*sizeof(float),1,pFile);
-	fclose(pFile);
+	strcpy(filename,datapath);
+	strcat(filename,visrealfile);
+	pFile = fopen (filename,"rb");
+	fseek (pFile,startrow*freq_per_chan*polarisations*sizeof(float),SEEK_SET);
+	fread(visreal,Nvis*sizeof(float),1,pFile);
+	fclose(pFile);
+	strcpy(filename,datapath);
+	strcat(filename,visimgfile);
+#ifdef VERBOSE
+	printf("Reading %s\n",filename);
+#endif
+	pFile = fopen (filename,"rb");
+	fseek (pFile,startrow*freq_per_chan*polarisations*sizeof(float),SEEK_SET);
+	fread(visimg,Nvis*sizeof(float),1,pFile);
+	fclose(pFile);
 
-	strcpy(filename,datapath);
-	strcat(filename,visimgfile);
-	//printf("Reading %s\n",filename);
+#ifdef USE_MPI
+	MPI_Barrier(MPI_COMM_WORLD);
+#endif
+	// Declare temporary arrays for the masking
+	double * uus;
+	double * vvs;
+	double * wws;
+	float * visreals;
+	float * visimgs;
+	float * weightss;
+	long isector;
 
-	pFile = fopen (filename,"rb");
-	fseek (pFile,startrow*freq_per_chan*polarisations*sizeof(float),SEEK_SET);
-	fread(visimg,Nvis*sizeof(float),1,pFile);
-	fclose(pFile);
+	for (long isector_count=0; isector_count<nsectors; isector_count++)
+	{
+		clock_gettime(CLOCK_MONOTONIC, &begink);
+		startk = clock();
+		// define local destination sector
+		//isector = (isector_count+rank)%size;
+		isector = isector_count;
+		// allocate sector arrays
+		long Nsec = histo_send[isector];
+		uus = (double*) malloc(Nsec*sizeof(double));
+		vvs = (double*) malloc(Nsec*sizeof(double));
+		wws = (double*) malloc(Nsec*sizeof(double));
+		long Nweightss = Nsec*polarisations;
+		long Nvissec = Nweightss*freq_per_chan;
+		weightss = (float*) malloc(Nweightss*sizeof(float));
+		visreals = (float*) malloc(Nvissec*sizeof(float));
+		visimgs = (float*) malloc(Nvissec*sizeof(float));
 
-	#ifdef USE_MPI
-	MPI_Barrier(MPI_COMM_WORLD);
-	#endif
+		// select data for this sector
+		long icount = 0;
+		long ip = 0;
+		long inu = 0;
+//CLAAAA
+		struct sectorlist * current;
+		current = sectorhead[isector];
 
-	// Declare temporary arrays for the masking
-	double * uus;
-	double * vvs;
-	double * wws;
-	float * visreals;
-	float * visimgs;
-	float * weightss;
-	long isector;
-	for (long isector_count=0; isector_count<nsectors; isector_count++)
-	{
-		clock_gettime(CLOCK_MONOTONIC, &begink);
-		startk = clock();
-		// define local destination sector
-		//isector = (isector_count+rank)%size;
-		isector = isector_count;
-		// allocate sector arrays
-		long Nsec = histo_send[isector];
-		uus = (double*) malloc(Nsec*sizeof(double));
-		vvs = (double*) malloc(Nsec*sizeof(double));
-		wws = (double*) malloc(Nsec*sizeof(double));
-		long Nweightss = Nsec*polarisations;
-		long Nvissec = Nweightss*freq_per_chan;
-		weightss = (float*) malloc(Nweightss*sizeof(float));
-		visreals = (float*) malloc(Nvissec*sizeof(float));
-		visimgs = (float*) malloc(Nvissec*sizeof(float));
-		// select data for this sector
-		long icount = 0;
-		long ip = 0;
-		long inu = 0;
-//CLAAAA
-		struct sectorlist * current;
-		current = sectorhead[isector];
-		while (current->index != -1)
+		while (current->index != -1)
 		{
 			long ilocal = current->index;
-			//double vvh = vv[ilocal];
+			//double vvh = vv[ilocal];
 			//int binphi = (int)(vvh*nsectors);
-			//if (binphi == isector || boundary[ilocal] == isector)
-			//{
-			uus[icount] = uu[ilocal];
-			vvs[icount] = vv[ilocal]-isector*shift;
-			wws[icount] = ww[ilocal];
-			for (long ipol=0; ipol<polarisations; ipol++)
-			{
-				weightss[ip] = weights[ilocal*polarisations+ipol];
+			//if (binphi == isector || boundary[ilocal] == isector) {
+			uus[icount] = uu[ilocal];
+			vvs[icount] = vv[ilocal]-isector*shift;
+			wws[icount] = ww[ilocal];
+			for (long ipol=0; ipol<polarisations; ipol++)
+			{
+				weightss[ip] = weights[ilocal*polarisations+ipol];
 				ip++;
-			}
-			for (long ifreq=0; ifreq<polarisations*freq_per_chan; ifreq++)
-			{
-				visreals[inu] = visreal[ilocal*polarisations*freq_per_chan+ifreq];
-				visimgs[inu] = visimg[ilocal*polarisations*freq_per_chan+ifreq];
-				//if(visimgs[inu]>1e10 || visimgs[inu]<-1e10)printf("%f %f %ld %ld %d %ld %ld\n",visreals[inu],visimgs[inu],inu,Nvissec,rank,ilocal*polarisations*freq_per_chan+ifreq,Nvis);
-				inu++;
-			}
-			icount++;
+			}
+			for (long ifreq=0; ifreq<polarisations*freq_per_chan; ifreq++)
+			{
+				visreals[inu] = visreal[ilocal*polarisations*freq_per_chan+ifreq];
+				visimgs[inu] = visimg[ilocal*polarisations*freq_per_chan+ifreq];
+				//if(visimgs[inu]>1e10 || visimgs[inu]<-1e10)printf("%f %f %ld %ld %d %ld %ld\n",visreals[inu],visimgs[inu],inu,Nvissec,rank,ilocal*polarisations*freq_per_chan+ifreq,Nvis);
+				inu++;
+			}
+			icount++;
 			//}
-			current = current->next;
-		}
+			current = current->next;
+		}
 		clock_gettime(CLOCK_MONOTONIC, &finishk);
 		endk = clock();
 		compose_time += ((double) (endk - startk)) / CLOCKS_PER_SEC;
 		compose_time1 += (finishk.tv_sec - begink.tv_sec);
 		compose_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0;
-		#ifndef USE_MPI
+
+		#ifndef USE_MPI
 		double uumin = 1e20;
 		double vvmin = 1e20;
 		double uumax = -1e20;
@@ -501,7 +530,7 @@ int main(int argc, char * argv[])
 			vvmin = MIN(vvmin,vvs[ipart]);
 			vvmax = MAX(vvmax,vvs[ipart]);
 
-
+
 			if(ipart%10 == 0)fprintf (pFile, "%ld %f %f %f\n",isector,uus[ipart],vvs[ipart]+isector*shift,wws[ipart]);
 		}
@@ -514,6 +543,7 @@ int main(int argc, char * argv[])
 		#endif
 		clock_gettime(CLOCK_MONOTONIC, &begink);
 		startk = clock();
+
 		wstack(num_w_planes,
 			Nsec,
 			freq_per_chan,
@@ -527,10 +557,22 @@ int main(int argc, char * argv[])
 			dx,
 			dw,
 			w_support,
-			xaxis,
-			yaxis,
+			xaxis,
+			yaxis,
 			gridss,
 			num_threads);
+
+
+/*		int z =0 ;
+#pragma omp target map(to:test_i_gpu) map(from:z)
+		{
+			int x; // only accessible from accelerator
+			x = 2;
+			z = x + test_i_gpu;
+		}*/
+
+
+
 		clock_gettime(CLOCK_MONOTONIC, &finishk);
 		endk = clock();
 		kernel_time += ((double) (endk - startk)) / CLOCKS_PER_SEC;
@@ -556,9 +598,9 @@ int main(int argc, char * argv[])
 			#ifdef ONE_SIDE
 			printf("One Side communication active\n");
 			MPI_Win_lock(MPI_LOCK_SHARED,target_rank,0,slabwin);
-			MPI_Accumulate(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,MPI_SUM,slabwin);
+			MPI_Accumulate(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,MPI_SUM,slabwin);
 			MPI_Win_unlock(target_rank,slabwin);
-			//MPI_Put(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,slabwin);
+			//MPI_Put(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,slabwin);
 			#else
 			MPI_Reduce(gridss,grid,size_of_grid,MPI_DOUBLE,MPI_SUM,target_rank,MPI_COMM_WORLD);
 			#endif //ONE_SIDE
@@ -633,7 +675,7 @@ int main(int argc, char * argv[])
 	process_time1 = (finish.tv_sec - begin.tv_sec);
 	process_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
 	clock_gettime(CLOCK_MONOTONIC, &begin);
-	
+
 #ifdef WRITE_DATA
 	// Write results
@@ -712,7 +754,7 @@ int main(int argc, char * argv[])
 
 #ifdef USE_FFTW
 	// FFT transform the data (using distributed FFTW)
-	
+
 	if(rank == 0)printf("PERFORMING FFT\n");
 	clock_gettime(CLOCK_MONOTONIC, &begin);
 	start = clock();
@@ -727,13 +769,13 @@ int main(int argc, char * argv[])
 	// and perform the FFT per w plane
 	alloc_local = fftw_mpi_local_size_2d(grid_size_y, grid_size_x, MPI_COMM_WORLD,&local_n0, &local_0_start);
 	fftwgrid = fftw_alloc_complex(alloc_local);
-	plan = fftw_mpi_plan_dft_2d(grid_size_y, grid_size_x, fftwgrid, fftwgrid, MPI_COMM_WORLD, FFTW_BACKWARD, FFTW_ESTIMATE);
+	plan = fftw_mpi_plan_dft_2d(grid_size_y, grid_size_x, fftwgrid, fftwgrid, MPI_COMM_WORLD, FFTW_BACKWARD, FFTW_ESTIMATE);
 
 	long fftwindex = 0;
 	long fftwindex2D = 0;
 	for (int iw=0; iw<num_w_planes; iw++)
 	{
-		//printf("FFTing plan %d\n",iw);
+		//printf("FFTing plan %d\n",iw);
 		// select the w-plane to transform
 		for (int iv=0; iv<yaxis; iv++)
 		{
@@ -814,7 +856,7 @@ int main(int argc, char * argv[])
 	for (int iw=0; iw<num_w_planes; iw++)
 		for (int iv=0; iv<yaxis; iv++)
 			for (int iu=0; iu<xaxis; iu++)
-			{
+			{
 				long global_index = (iu + (iv+isector*yaxis)*xaxis + iw*grid_size_x*grid_size_y)*sizeof(double);
 				long index = iu + iv*xaxis + iw*xaxis*yaxis;
 				fseek(pFileimg, global_index, SEEK_SET);
@@ -825,7 +867,7 @@ int main(int argc, char * argv[])
 			fwrite(gridss_img, size_of_grid/2, sizeof(double), pFileimg);
 		}
 
-	
+
 	}
 #else
 /*
@@ -851,7 +893,7 @@ int main(int argc, char * argv[])
 	MPI_Barrier(MPI_COMM_WORLD);
 #endif
 #endif //WRITE_DATA
-	
+
 	fftw_free(fftwgrid);
 
 	// Phase correction
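The ONE_SIDE path in the hunks above accumulates each sector's gridss buffer into the owning rank's slab with passive-target RMA instead of MPI_Reduce. A minimal, self-contained sketch of the same MPI_Win_lock / MPI_Accumulate pattern (the buffer size N and the fixed target rank 0 are illustrative choices of this example, not values from the code):

    /* rma_accumulate_demo.c -- illustrative sketch only */
    #include <mpi.h>
    #include <stdio.h>
    #define N 8

    int main(int argc, char *argv[])
    {
        int rank, size;
        double local[N], slab[N] = {0.0};
        MPI_Win win;

        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        MPI_Comm_size(MPI_COMM_WORLD, &size);
        for (int i = 0; i < N; i++) local[i] = (double)rank;

        /* every rank exposes its slab, as done for grid with slabwin */
        MPI_Win_create(slab, (MPI_Aint)(N*sizeof(double)), sizeof(double),
                       MPI_INFO_NULL, MPI_COMM_WORLD, &win);

        /* passive-target epoch: lock, sum into the target, unlock */
        int target_rank = 0;
        MPI_Win_lock(MPI_LOCK_SHARED, target_rank, 0, win);
        MPI_Accumulate(local, N, MPI_DOUBLE, target_rank, 0, N, MPI_DOUBLE, MPI_SUM, win);
        MPI_Win_unlock(target_rank, win);

        MPI_Barrier(MPI_COMM_WORLD);  /* ensure all updates landed before reading */
        if (rank == 0) printf("slab[0] = %f (expected: sum of ranks 0..%d)\n", slab[0], size-1);

        MPI_Win_free(&win);
        MPI_Finalize();
        return 0;
    }

MPI_LOCK_SHARED lets several ranks accumulate concurrently; MPI guarantees element-wise atomicity of MPI_Accumulate with the same operation, which is why it fits this reduction better than MPI_Put.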
diff --git a/w-stacking.cu b/w-stacking.cu
index 499189619802f8e94c7a41cd49caa46c8afb7093..4fe36254789fff354e28fb887a5d4bb6c3b37704 100644
--- a/w-stacking.cu
+++ b/w-stacking.cu
@@ -6,6 +6,9 @@
 #include <stdlib.h>
 #include <stdio.h>
 
+#ifdef ACCOMP
+#pragma omp declare target
+#endif
 #ifdef __CUDACC__
 double __device__
 #else
@@ -17,6 +20,9 @@ gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist)
 	conv_weight = norm * exp(-((u_dist*u_dist)+(v_dist*v_dist))*std22);
 	return conv_weight;
 }
+#ifdef ACCOMP
+#pragma omp end declare target
+#endif
 
 #ifdef __CUDACC__
 //double __device__ gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist)
@@ -90,7 +96,7 @@ __global__ void convolve_g(
 	double add_term_img = 0.0;
 	long ifine = visindex;
 	for (long ifreq=0; ifreq<freq_per_chan; ifreq++)
-	{
+	{
 		long iweight = visindex/freq_per_chan;
 		for (long ipol=0; ipol<polarizations; ipol++)
 		{
@@ -111,7 +117,9 @@ __global__ void convolve_g(
 		}
 	}
 #endif
-
+#ifdef ACCOMP
+#pragma omp declare target
+#endif
 void wstack(
 	int num_w_planes,
 	long num_points,
@@ -216,7 +224,14 @@ void wstack(
 #ifdef _OPENMP
 	omp_set_num_threads(num_threads);
 #endif
-	#pragma omp parallel for private(visindex)
+
+#ifdef ACCOMP
+	long Nvis = num_points*freq_per_chan*polarizations;
+	// #pragma omp target data map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan])
+	// #pragma omp target teams distribute parallel for map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan]) map(tofrom: grid[0:2*num_w_planes*grid_size_x*grid_size_y])
+#else
+	#pragma omp parallel for private(visindex)
+#endif
 	for (i = 0; i < num_points; i++)
 	{
 #ifdef _OPENMP
@@ -258,6 +273,7 @@ void wstack(
 					double u_dist = (double)j+0.5 - pos_u;
 					long iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y);
 
+
 					double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist);
 					// Loops over frequencies and polarizations
 					double add_term_real = 0.0;
@@ -265,7 +281,7 @@ void wstack(
 					long ifine = visindex;
 					// DAV: the following two loops are performend by each thread separately: no problems of race conditions
 					for (long ifreq=0; ifreq<freq_per_chan; ifreq++)
-					{
+					{
 						long iweight = visindex/freq_per_chan;
 						for (long ipol=0; ipol<polarizations; ipol++)
 						{
@@ -287,18 +303,20 @@ void wstack(
 						grid[iKer+1] += add_term_img;
 					}
 				}
-
+
 	}
 	// End switch between CUDA and CPU versions
 #endif
 	//for (int i=0; i<100000; i++)printf("%f\n",grid[i]);
 }
+#ifdef ACCOMP
+#pragma omp end declare target
+#endif
 
 int test(int nnn)
 {
 	int mmm;
-
+
 	mmm = nnn+1;
 	return mmm;
-}
-
+}
diff --git a/w-stacking.h b/w-stacking.h
index 35e1495ea90864c6dbc0120b3ba063f28afef583..7de9b80414742d64f91a82642770878aaac1dbf7 100644
--- a/w-stacking.h
+++ b/w-stacking.h
@@ -8,6 +8,7 @@
 #ifdef __CUDACC__
 extern "C"
 #endif
+
 void wstack(
 	int,
 	long,
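In the w-stacking.cu hunks above, gauss_kernel_norm() is bracketed with declare target so that, when ACCOMP is defined, the compiler also emits a device-callable version of the function for the OpenMP offload path. A minimal standalone sketch of that pattern (the function and variable names here are made up for illustration):

    /* declare_target_demo.c -- build with an OpenMP-offload-capable compiler */
    #include <stdio.h>

    #pragma omp declare target
    /* device-callable helper, same role as gauss_kernel_norm() */
    static double square(double x) { return x * x; }
    #pragma omp end declare target

    int main(void)
    {
        double out = 0.0;
        /* the target region may execute on the GPU; square() is callable
           there because it was compiled under declare target */
        #pragma omp target map(tofrom: out)
        out = square(3.0);
        printf("out = %f\n", out);
        return 0;
    }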
diff --git a/w-stacking_omp.c b/w-stacking_omp.c
new file mode 100644
index 0000000000000000000000000000000000000000..b848bc5c117b037d070159a58242f08019cc39e1
--- /dev/null
+++ b/w-stacking_omp.c
@@ -0,0 +1,247 @@
+#include <omp.h>
+#include "w-stacking_omp.h"
+#include <math.h>
+#include <stdlib.h>
+#include <stdio.h>
+#ifdef NVIDIA
+#include <cuda_runtime.h>
+#endif
+
+#ifdef ACCOMP
+#pragma omp declare target
+#endif
+double gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist)
+{
+	double conv_weight;
+	conv_weight = norm * exp(-((u_dist*u_dist)+(v_dist*v_dist))*std22);
+	return conv_weight;
+}
+#ifdef ACCOMP
+#pragma omp end declare target
+#endif
+
+
+
+void wstack(
+	int num_w_planes,
+	long num_points,
+	long freq_per_chan,
+	long polarizations,
+	double* uu,
+	double* vv,
+	double* ww,
+	float* vis_real,
+	float* vis_img,
+	float* weight,
+	double dx,
+	double dw,
+	int w_support,
+	int grid_size_x,
+	int grid_size_y,
+	double* grid,
+	int num_threads)
+{
+	//long index;
+	long visindex;
+
+	// initialize the convolution kernel
+	// gaussian:
+	int KernelLen = (w_support-1)/2;
+	double std = 1.0;
+	double std22 = 1.0/(2.0*std*std);
+	double norm = std22/PI;
+
+	// Loop over visibilities.
+#ifdef _OPENMP
+	omp_set_num_threads(num_threads);
+#endif
+
+#ifdef ACCOMP
+	long Nvis = num_points*freq_per_chan*polarizations;
+	long gpu_weight_dim = Nvis/freq_per_chan;
+	long gpu_grid_dim = 2*num_w_planes*grid_size_x*grid_size_y;
+#pragma omp target teams distribute parallel for private(visindex) \
+map(to:num_points, KernelLen, std, std22, norm, num_w_planes, \
+    uu[0:num_points], vv[0:num_points], ww[0:num_points], \
+    vis_real[0:Nvis], vis_img[0:Nvis], weight[0:gpu_weight_dim], \
+    grid_size_x, grid_size_y, freq_per_chan, polarizations, dx,dw, w_support, num_threads) \
+map(tofrom: grid[0:gpu_grid_dim])
+#endif
+	for (long i = 0; i < num_points; i++)
+	{
+#ifdef _OPENMP
+		//int tid;
+		//tid = omp_get_thread_num();
+		//printf("%d\n",tid);
+#endif
+
+		visindex = i*freq_per_chan*polarizations;
+
+		double sum = 0.0;
+		int j, k;
+		//if (i%1000 == 0)printf("%ld\n",i);
+
+		/* Convert UV coordinates to grid coordinates. */
+		double pos_u = uu[i] / dx;
+		double pos_v = vv[i] / dx;
+		double ww_i = ww[i] / dw;
+		int grid_w = (int)ww_i;
+		int grid_u = (int)pos_u;
+		int grid_v = (int)pos_v;
+
+		// check the boundaries
+		long jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0;
+		long jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1;
+		long kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0;
+		long kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1;
+		//printf("%d, %ld, %ld, %d, %ld, %ld\n",grid_u,jmin,jmax,grid_v,kmin,kmax);
+
+
+		// Convolve this point onto the grid.
+		for (k = kmin; k <= kmax; k++)
+		{
+
+			double v_dist = (double)k+0.5 - pos_v;
+
+			for (j = jmin; j <= jmax; j++)
+			{
+				double u_dist = (double)j+0.5 - pos_u;
+				long iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y);
+
+
+				double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist);
+				// Loops over frequencies and polarizations
+				double add_term_real = 0.0;
+				double add_term_img = 0.0;
+				long ifine = visindex;
+				// DAV: the following two loops are performed by each thread separately: no problems of race conditions
+				for (long ifreq=0; ifreq<freq_per_chan; ifreq++)
+				{
+					long iweight = visindex/freq_per_chan;
+					for (long ipol=0; ipol<polarizations; ipol++)
+					{
+						if (!isnan(vis_real[ifine]))
+						{
+							//printf("%f %ld\n",weight[iweight],iweight);
+							add_term_real += weight[iweight] * vis_real[ifine] * conv_weight;
+							add_term_img += weight[iweight] * vis_img[ifine] * conv_weight;
+							//if(vis_img[ifine]>1e10 || vis_img[ifine]<-1e10)printf("%f %f %f %f %ld %ld\n",vis_real[ifine],vis_img[ifine],weight[iweight],conv_weight,ifine,num_points*freq_per_chan*polarizations);
+						}
+						ifine++;
+						iweight++;
+					}
+				}
+				// DAV: this is the critical call in terms of correctness of the results and of performance
+				#pragma omp atomic
+				grid[iKer] += add_term_real;
+				#pragma omp atomic
+				grid[iKer+1] += add_term_img;
+			}
+		}
+
+	}
+
+	//for (int i=0; i<100000; i++)printf("%f\n",grid[i]);
+}
+
+
+#ifdef NVIDIA
+#define CUDAErrorCheck(funcall) \
+do { \
+	cudaError_t ierr = funcall; \
+	if (cudaSuccess != ierr) { \
+		fprintf(stderr, "%s(line %d) : CUDA RT API error : %s(%d) -> %s\n", \
+		        __FILE__, __LINE__, #funcall, ierr, cudaGetErrorString(ierr)); \
+		exit(ierr); \
+	} \
+} while (0)
+
+static inline int _corePerSM(int major, int minor)
+/**
+ * @brief Give the number of CUDA cores per streaming multiprocessor (SM).
+ *
+ * The number of CUDA cores per SM is determined by the compute capability.
+ *
+ * @param major Major revision number of the compute capability.
+ * @param minor Minor revision number of the compute capability.
+ *
+ * @return The number of CUDA cores per SM.
+ */
+{
+	if (1 == major) {
+		if (0 == minor || 1 == minor || 2 == minor || 3 == minor) return 8;
+	}
+	if (2 == major) {
+		if (0 == minor) return 32;
+		if (1 == minor) return 48;
+	}
+	if (3 == major) {
+		if (0 == minor || 5 == minor || 7 == minor) return 192;
+	}
+	if (5 == major) {
+		if (0 == minor || 2 == minor) return 128;
+	}
+	if (6 == major) {
+		if (0 == minor) return 64;
+		if (1 == minor || 2 == minor) return 128;
+	}
+	if (7 == major) {
+		if (0 == minor || 2 == minor || 5 == minor) return 64;
+	}
+	return -1;
+}
+
+void getGPUInfo(int iaccel)
+{
+	int corePerSM;
+
+	struct cudaDeviceProp dev;
+
+	CUDAErrorCheck(cudaSetDevice(iaccel));
+	CUDAErrorCheck(cudaGetDeviceProperties(&dev, iaccel));
+	corePerSM = _corePerSM(dev.major, dev.minor);
+
+	printf("\n");
+	printf("============================================================\n");
+	printf("CUDA Device name : \"%s\"\n", dev.name);
+	printf("------------------------------------------------------------\n");
+	printf("Comp. Capability : %d.%d\n", dev.major, dev.minor);
+	printf("max clock rate : %.0f MHz\n", dev.clockRate * 1.e-3f);
+	printf("number of SMs : %d\n", dev.multiProcessorCount);
+	printf("cores / SM : %d\n", corePerSM);
+	printf("# of CUDA cores : %d\n", corePerSM * dev.multiProcessorCount);
+	printf("------------------------------------------------------------\n");
+	printf("global memory : %5.0f MBytes\n", dev.totalGlobalMem / 1048576.0f);
+	printf("shared mem. / SM : %5.1f KBytes\n", dev.sharedMemPerMultiprocessor / 1024.0f);
+	printf("32-bit reg. / SM : %d\n", dev.regsPerMultiprocessor);
+	printf("------------------------------------------------------------\n");
+	printf("max # of threads / SM : %d\n", dev.maxThreadsPerMultiProcessor);
+	printf("max # of threads / block : %d\n", dev.maxThreadsPerBlock);
+	printf("max dim. of block : (%d, %d, %d)\n",
+	       dev.maxThreadsDim[0], dev.maxThreadsDim[1], dev.maxThreadsDim[2]);
+	printf("max dim. of grid : (%d, %d, %d)\n",
+	       dev.maxGridSize[0], dev.maxGridSize[1], dev.maxGridSize[2]);
+	printf("warp size : %d\n", dev.warpSize);
+	printf("============================================================\n");
+
+	int z = 0, x = 2;
+	#pragma omp target map(to:x) map(tofrom:z)
+	{
+		z=x+100;
+	}
+}
+
+#endif
+
+
+
+
+
+
+int test(int nnn)
+{
+	int mmm;
+
+	mmm = nnn+1;
+	return mmm;
+}
diff --git a/w-stacking_omp.h b/w-stacking_omp.h
new file mode 100644
index 0000000000000000000000000000000000000000..b655ffe1ecc37a742a175c61754104cb5150bec8
--- /dev/null
+++ b/w-stacking_omp.h
@@ -0,0 +1,82 @@
+#ifndef W_PROJECT_H_
+#define W_PROJECT_H_
+
+#define NWORKERS -1 //100
+#define NTHREADS 32
+#define PI 3.14159265359
+#define REAL_TYPE double
+
+void wstack(
+	int,
+	long,
+	long,
+	long,
+	double*,
+	double*,
+	double*,
+	float*,
+	float*,
+	float*,
+	double,
+	double,
+	int,
+	int,
+	int,
+	double*,
+	int);
+
+int test(int nnn);
+
+double gauss_kernel_norm(
+	double norm,
+	double std22,
+	double u_dist,
+	double v_dist);
+
+
+
+
+void phase_correction(
+	double*,
+	double*,
+	double*,
+	int,
+	int,
+	int,
+	int,
+	int);
+
+#ifdef ACCOMP
+#ifdef NVIDIA
+void getGPUInfo(int);
+#endif
+#pragma omp declare target (gauss_kernel_norm)
+#endif
+
+#ifdef NVIDIA
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef PRTACCELINFO_H
+#define PRTACCELINFO_H
+
+void prtAccelInfo(int iaccel);
+/**<
+ * @brief Print some basic info of an accelerator.
+ *
+ * Strictly speaking, \c prtAccelInfo() can only print the basic info of an
+ * Nvidia CUDA device.
+ *
+ * @param iaccel The index of an accelerator.
+ *
+ * @return \c void.
+ */
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif
+
+#endif // W_PROJECT_H_
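For orientation, a minimal sketch of a call site for the wstack() declared above. The toy sizes, the placeholder data, and the conventions dx = 1/grid_size_x and dw = 1/num_w_planes are assumptions of this example (chosen so grid indices stay in range), not values taken from the repository:

    /* toy_wstack_driver.c -- hypothetical example, not part of the repository */
    #include <stdio.h>
    #include <stdlib.h>
    #include "w-stacking_omp.h"

    int main(void)
    {
        int  num_w_planes = 2, w_support = 7, nx = 64, ny = 64, nthreads = 4;
        long npts = 1, freq_per_chan = 1, pols = 1;
        long nvis = npts*freq_per_chan*pols;

        double *uu = (double*) calloc(npts, sizeof(double));
        double *vv = (double*) calloc(npts, sizeof(double));
        double *ww = (double*) calloc(npts, sizeof(double));
        float  *vr = (float*)  calloc(nvis, sizeof(float));
        float  *vi = (float*)  calloc(nvis, sizeof(float));
        float  *wt = (float*)  calloc(nvis/freq_per_chan, sizeof(float));
        /* one real+imaginary pair per (w-plane, u, v) cell, matching the
           map clause grid[0:2*num_w_planes*grid_size_x*grid_size_y] */
        double *grid = (double*) calloc(2L*num_w_planes*nx*ny, sizeof(double));

        uu[0] = vv[0] = 0.5;  ww[0] = 0.1;  /* one visibility near the grid centre */
        vr[0] = 1.0f;  wt[0] = 1.0f;

        wstack(num_w_planes, npts, freq_per_chan, pols, uu, vv, ww,
               vr, vi, wt, 1.0/nx, 1.0/num_w_planes, w_support, nx, ny,
               grid, nthreads);

        double total = 0.0;
        for (long i = 0; i < 2L*num_w_planes*nx*ny; i++) total += grid[i];
        printf("sum over grid = %f\n", total);

        free(uu); free(vv); free(ww); free(vr); free(vi); free(wt); free(grid);
        return 0;
    }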