diff --git a/Build/Makefile.Magellanus b/Build/Makefile.Magellanus
index c440365f48bd8bc2cbe33011ccaec061bdadb245..c2f78ea3ba66d3dcc4d0c14224d6a274a53bf1ce 100644
--- a/Build/Makefile.Magellanus
+++ b/Build/Makefile.Magellanus
@@ -4,29 +4,31 @@ CXX      =  g++-10
 MPICC    =  mpicc
 MPIC++   =  mpiCC
 
-OPTIMIZE =  -O3
-#-ffast-math  -fopt-info-all-omp -fcf-protection=none -fno-stack-protector -foffload=nvptx-none
-
-
-GSL_INCL =  -I/home/taffoni/sw/include #-I/opt/cluster/openmpi/3.1.3/gnu/8.2.0/include
-GSL_LIBS =  -L/home/taffoni/sw/lib #-L/opt/cluster/openmpi/3.1.3/gnu/8.2.0/lib -lmpi
+GSL_INCL =  -I/home/taffoni/sw/include
+GSL_LIBS =  -L/home/taffoni/sw/lib
 
 FFTW_INCL=  -I/home/taffoni/sw/include
 FFTW_LIB=  -L/home/taffoni/sw/lib   -lfftw3_mpi -lfftw3
 
+#-L/opt/cluster/openmpi/3.1.3/gnu/8.2.0/lib -lmpi
 MPI_LIB =
-MPI_INCL=
+#-I/opt/cluster/openmpi/3.1.3/gnu/8.2.0/include
+MPI_INCL= -I/home/taffoni/sw/Linux_x86_64/21.5/comm_libs/mpi/include
 HDF5_INCL =
 HDF5_LIB  =
 
-OMP = -fopenmp
-
+OMP = -mp=multicore,gpu -Mprof -cuda
+#OMP = -fopenmp
 NVCC = nvcc
 NVFLAGS = -arch=sm_70 -Xcompiler -std=c++11
 NVLIB = -L/home/taffoni/sw/Linux_x86_64/21.5/cuda/11.3/lib64/ -lcudart -lcuda
 
-CFLAGS += $(OPTIMIZE)
-CFLAGS += -I.
-CFLAGS += -I/home/taffoni/sw/Linux_x86_64/21.5/comm_libs/mpi/include
-CFLAGS += $(FFTW_INCL) $(GSL_INCL)
-CFLAGS += $(FFTW_LIB) -lm
+
+CFLAGS +=  -I. $(FFTW_INCL) $(GSL_INCL) $(MPI_INCL)
+
+OPTIMIZE =  $(OMP) -O3
+
+# OMP GPU SPECIFIC FLAGS
+#OPTIMIZE += -Wno-unused-result -foffload=-lm -ffast-math
+#OPTIMIZE += -fcf-protection=none -fno-stack-protector -foffload=nvptx-none -foffload=-misa=sm_35
+#-ffast-math  -fopt-info-all-omp -foffload=-misa=sm_35 -fcf-protection=none -fno-stack-protector -foffload=nvptx-none
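The new `OMP = -mp=multicore,gpu -Mprof -cuda` line switches this build from GCC-style `-fopenmp` host threading (kept as the commented fallback) to NVHPC OpenMP offload; the commented block preserves the equivalent GCC offload flags. A minimal sketch of the kind of construct these flags compile for the GPU (hypothetical `offload_check.c`, not part of the patch):

```c
/* offload_check.c -- hypothetical smoke test, not part of this patch.
 * NVHPC:  nvc -mp=gpu offload_check.c
 * GCC:    gcc -fopenmp -foffload=nvptx-none offload_check.c
 */
#include <omp.h>
#include <stdio.h>

int main(void)
{
    int on_device = 0;
    /* body runs on the GPU when one is present, on the host otherwise */
    #pragma omp target map(from: on_device)
    on_device = !omp_is_initial_device();

    printf("devices=%d ran_on_device=%d\n", omp_get_num_devices(), on_device);
    return 0;
}
```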
diff --git a/Build/Makefile.Marconi b/Build/Makefile.Marconi
index 4fb0d034f58b6a585c7c8846b694822c869ee7d7..4fec93afd7851aad0ebd3e85f449c6c234a7376d 100644
--- a/Build/Makefile.Marconi
+++ b/Build/Makefile.Marconi
@@ -5,16 +5,16 @@ MPICC    =  mpicc
 MPIC++   =  mpiCC
 
 
-CFLAGS += -O3 -mcpu=native
-CFLAGS += -I.
 FFTW_INCL=  -I/home/taffoni/sw/include
-FFTW_LIB=  -L/home/taffoni/sw/lib 
+FFTW_LIB=  -L/home/taffoni/sw/lib
 
-LIBS = $(FFTW_LIB) -lfftw3_mpi -lfftw3 -lm
 
 NVCC = nvcc
 NVFLAGS = -arch=sm_70 -Xcompiler -mno-float128 -std=c++11
 NVLIB = -L/cineca/prod/opt/compilers/cuda/10.1/none/lib64/ -lcudart -lcuda
 
+OMP= -fopenmp
 
-CFLAGS += -O3 -mtune=native
+CFLAGS +=  -I. $(FFTW_INCL) $(GSL_INCL) $(MPI_INCL)
+
+OPTIMIZE = $(OMP) -O3 -mtune=native
diff --git a/Makefile b/Makefile
index b0667357d1d1879ff93abdf7c6b447aa894dfb7b..144f065782f1810a212a951cd3af5ff4fd0269d2 100644
--- a/Makefile
+++ b/Makefile
@@ -14,21 +14,22 @@ include Build/Makefile.systype
 endif
 
 
-
-
+LIBS = $(FFTW_LIB) -lfftw3 -lm -lcudart  -lcuda
 
 # create MPI code
 OPT += -DUSE_MPI
-
+#OPT += -DACCOMP
 # use FFTW (it can be switched on ONLY if MPI is active)
 ifeq (USE_MPI,$(findstring USE_MPI,$(OPT)))
    OPT += -DUSE_FFTW
+   LIBS = $(FFTW_LIB) -lfftw3_mpi -lfftw3 -lm
 endif
 
+#OPT += -DNVIDIA
 # perform one-side communication (suggested) instead of reduce (only if MPI is active)
 OPT += -DONE_SIDE
 # write the full 3D cube of gridded visibilities and its FFT transform
-#OPT += -DWRITE_DATA
+OPT += -DWRITE_DATA
 # write the final image
 OPT += -DWRITE_IMAGE
 # perform w-stacking phase correction
@@ -46,27 +47,36 @@ phase_correction.c: phase_correction.cu
 
 ifeq (USE_MPI,$(findstring USE_MPI,$(OPT)))
 %.o: %.c $(DEPS)
-	$(MPICC)  -c -o $@ $< $(CFLAGS) $(OPT)
+	$(MPICC) $(OPTIMIZE) $(OPT) -c -o $@ $< $(CFLAGS)
 else
 %.o: %.c $(DEPS)
-	$(CC) $(OMP) -c -o $@ $< $(CFLAGS) $(OPT)
+	$(CC) $(OPTIMIZE) $(OPT) -c -o $@ $< $(CFLAGS)
 endif
 
 serial: $(COBJ)
-	$(CC)  -o w-stackingCfftw_serial $(CFLAGS) $^ -lm
+	$(CC) $(OPTIMIZE) $(OPT) -o w-stackingCfftw_serial  $^ $(LIBS)
+
+serial_omp: phase_correction.c
+	$(CC) $(OPTIMIZE) $(OPT) -o w-stackingOMP_serial w-stacking-fftw.c w-stacking_omp.c phase_correction.c $(CFLAGS) $(LIBS)
+
+simple_mpi: phase_correction.c
+	$(MPICC) $(OPTIMIZE) $(OPT) -o w-stackingMPI_simple w-stacking_omp.c w-stacking-fftw.c phase_correction.c  $(CFLAGS) $(LIBS)
+
+mpi_omp: phase_correction.c
+	$(MPICC) $(OPTIMIZE) $(OPT) -o w-stackingMPI_omp w-stacking_omp.c w-stacking-fftw.c phase_correction.c  $(CFLAGS) $(LIBS)
 
 serial_cuda:
 	$(NVCC) $(NVFLAGS) -c w-stacking.cu phase_correction.cu $(NVLIB)
-	$(CC) $(CFLAGS) $(OPT) -c w-stacking-fftw.c
-	$(CXX) $(CFLAGS) $(OPT) -o w-stackingfftw_serial w-stacking-fftw.o w-stacking.o phase_correction.o $(NVLIB) -lm
+	$(CC)  $(OPTIMIZE) $(OPT) -c w-stacking-fftw.c $(CFLAGS) $(LIBS)
+	$(CXX) $(OPTIMIZE) $(OPT) -o w-stackingfftw_serial w-stacking-fftw.o w-stacking.o phase_correction.o $(CFLAGS) $(NVLIB) -lm
 
 mpi: $(COBJ)
-	$(MPICC) -o w-stackingCfftw   $^ $(CFLAGS)
+	$(MPICC) $(OPTIMIZE) -o w-stackingCfftw   $^  $(CFLAGS) $(LIBS)
 
 mpi_cuda:
-	$(NVCC) $(NVFLAGS) -c w-stacking.cu phase_correction.cu $(NVLIB)
-	$(MPICC) $(CFLAGS) $(OPT) -c w-stacking-fftw.c
-	$(MPIC++)  $(OPT)   -o w-stackingfftw w-stacking-fftw.o w-stacking.o phase_correction.o $(NVLIB) $(CFLAGS)
+	$(NVCC)   $(NVFLAGS) -c w-stacking.cu phase_correction.cu $(NVLIB)
+	$(MPICC)  $(OPTIMIZE) $(OPT) -c w-stacking-fftw.c $(CFLAGS) $(LIBS)
+	$(MPIC++) $(OPTIMIZE) $(OPT)   -o w-stackingfftw w-stacking-fftw.o w-stacking.o phase_correction.o $(NVLIB) $(CFLAGS) $(LIBS)
 
 clean:
 	rm *.o
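The `OPT` switches above drive `#ifdef` branches in the sources: `-DUSE_MPI` selects the FFTW-MPI link line and the MPI compile rule, `-DACCOMP` (commented here) swaps `w-stacking.h` for `w-stacking_omp.h`, and `-DONE_SIDE`/`-DWRITE_DATA`/`-DWRITE_IMAGE` gate runtime behaviour. A condensed, hypothetical illustration of the pattern:

```c
/* opt_switches.c -- hypothetical sketch of the compile-time switches;
 * build e.g.:  cc -DUSE_MPI -DONE_SIDE opt_switches.c
 */
#include <stdio.h>

#ifdef ACCOMP
#  define GRIDDER "w-stacking_omp.h (OpenMP offload)"
#else
#  define GRIDDER "w-stacking.h (host/CUDA)"
#endif

int main(void)
{
    printf("gridder backend: %s\n", GRIDDER);
#ifdef USE_MPI
    puts("MPI build: FFTW-MPI gridding enabled");
#ifdef ONE_SIDE
    puts("one-sided accumulation instead of MPI_Reduce");
#endif
#endif
    return 0;
}
```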
diff --git a/phase_correction.cu b/phase_correction.cu
index 9d1cc7e790b8aec7c5a552745719daf862018cff..ba5274674681f9c249901d32d70609bb1b84b6fb 100644
--- a/phase_correction.cu
+++ b/phase_correction.cu
@@ -17,8 +17,12 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in
 
 	for (int iw=0; iw<num_w_planes; iw++)
 	{
-            double wterm = (double)iw/dnum_w_planes;
-
+      double wterm = (double)iw/dnum_w_planes;
+#ifdef ACCOMP
+			#pragma omp target teams distribute parallel for \
+			map(tofrom: image_real[0:xaxis*yaxis], image_imag[0:xaxis*yaxis])	\
+			map (to: gridss[0:2*num_w_planes*xaxis*yaxis])
+#endif
 	    for (int iv=0; iv<yaxis; iv++)
             for (int iu=0; iu<xaxis; iu++)
             {
@@ -26,10 +30,10 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in
 		    long index = 2*(iu+iv*xaxis+xaxis*yaxis*iw);
 		    long img_index = iu+iv*xaxis;
 #ifdef PHASE_ON
-                    double xcoord = (double)(iu-xaxistot/2);
-                    if(xcoord < 0.0)xcoord = (double)(iu+xaxistot/2);
-                    double ycoord = (double)(iv-yaxistot/2);
-                    if(ycoord < 0.0)ycoord = (double)(iv+yaxistot/2);
+        double xcoord = (double)(iu-xaxistot/2);
+        if(xcoord < 0.0) xcoord = (double)(iu+xaxistot/2);
+        double ycoord = (double)(iv-yaxistot/2);
+        if(ycoord < 0.0) ycoord = (double)(iv+yaxistot/2);
 		    xcoord = xcoord/dxaxistot;
 		    ycoord = ycoord/dyaxistot;
 
@@ -43,7 +47,7 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in
 		    } else {
 			    preal = cos(-2.0*PI*wterm*(sqrt(radius2-1.0)-1));
 			    pimag = 0.0;
-		    } 
+		    }
 
 
 		    double p,q,r,s;
@@ -53,14 +57,18 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in
 		    s = pimag;
 
 		    //printf("%d %d %d %ld %ld\n",iu,iv,iw,index,img_index);
+				#pragma omp atomic
 		    image_real[img_index] += p*r-q*s;
+				#pragma omp atomic
 		    image_imag[img_index] += p*s+q*r;
 #else
+		    #pragma omp atomic
 		    image_real[img_index] += gridss[index];
+		    #pragma omp atomic
 		    image_imag[img_index] += gridss[index+1];
 #endif
 
-            }  
+            }
 	}
 
 
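The `ACCOMP` hunk above offloads each w-plane's (iu,iv) sweep with `target teams distribute parallel for`, mapping the image planes `tofrom` and the gridded data `to`, and guards the per-pixel accumulations with `omp atomic`. A self-contained sketch of that offload-plus-atomic pattern (illustrative only; sizes and names are invented):

```c
/* Sketch of the target + atomic accumulation pattern used above. */
#include <stdio.h>

#define NPIX   1024
#define PLANES 8

int main(void)
{
    static double image[NPIX];
    static double grid[PLANES * NPIX];

    for (long i = 0; i < PLANES * NPIX; i++) grid[i] = 1.0;

    for (int w = 0; w < PLANES; w++) {
        /* one offloaded region per w-plane, as in phase_correction() */
        #pragma omp target teams distribute parallel for \
                map(tofrom: image[0:NPIX]) map(to: grid[0:PLANES*NPIX])
        for (long i = 0; i < NPIX; i++) {
            /* the atomic mirrors the guard above; it matters whenever
               several iterations can land on the same image index */
            #pragma omp atomic
            image[i] += grid[w * NPIX + i];
        }
    }
    printf("image[0] = %.1f (expect %d.0)\n", image[0], PLANES);
    return 0;
}
```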
diff --git a/w-stacking-fftw.c b/w-stacking-fftw.c
index 4d8130f87c6b66cef76611158d7b80076a3744ab..b1d32ae9b3ff5e40137ef6f704600cadc80fcb25 100644
--- a/w-stacking-fftw.c
+++ b/w-stacking-fftw.c
@@ -7,9 +7,15 @@
 #include <fftw3-mpi.h>
 #endif
 #endif
+#include <omp.h>
 #include <math.h>
 #include <time.h>
+#include <unistd.h>
+#ifdef ACCOMP
+#include "w-stacking_omp.h"
+#else
 #include "w-stacking.h"
+#endif
 #define PI 3.14159265359
 #define NUM_OF_SECTORS -1
 #define MIN(X, Y) (((X) < (Y)) ? (X) : (Y))
@@ -27,16 +33,16 @@ void Push(struct sectorlist** headRef, long data) {
      struct sectorlist* newNode = malloc(sizeof(struct sectorlist));
      newNode->index = data;
      newNode->next = *headRef;
-     *headRef = newNode; 
+     *headRef = newNode;
 }
 
 // Main Code
-int main(int argc, char * argv[]) 
+int main(int argc, char * argv[])
 {
 	int rank;
 	int size;
 
-	FILE * pFile; 
+	FILE * pFile;
 	FILE * pFile1;
 	FILE * pFilereal;
 	FILE * pFileimg;
@@ -50,16 +56,16 @@ int main(int argc, char * argv[])
 	//
 	//char datapath[900] = "/m100_scratch/userexternal/cgheller/gridding/hba-8hrs_gauss4new.binMS/";
 	//char datapath[900] = "/m100_scratch/userexternal/cgheller/Lofar/Observations/L798046_SB244_uv.uncorr_130B27932t_146MHz.pre-cal.binMS/";
-	char datapath[900]; 
+	char datapath[900];
 	char datapath_multi[NFILES][900];
 
 	char ufile[30] = "ucoord.bin";
-        char vfile[30] = "vcoord.bin";
-        char wfile[30] = "wcoord.bin";
-        char weightsfile[30] = "weights.bin";
-        char visrealfile[30] = "visibilities_real.bin";
-        char visimgfile[30] = "visibilities_img.bin";
-        char metafile[30] = "meta.txt";
+  char vfile[30] = "vcoord.bin";
+  char wfile[30] = "wcoord.bin";
+  char weightsfile[30] = "weights.bin";
+  char visrealfile[30] = "visibilities_real.bin";
+  char visimgfile[30] = "visibilities_img.bin";
+  char metafile[30] = "meta.txt";
 	char outfile[30] = "grid.txt";
 	char outfile1[30] = "coords.txt";
 	char outfile2[30] = "grid_real.bin";
@@ -83,7 +89,7 @@ int main(int argc, char * argv[])
 	long Nweights;
 	long freq_per_chan;
 	long polarisations;
-        long Ntimes;
+  long Ntimes;
 	double dt;
 	double thours;
 	long baselines;
@@ -92,7 +98,7 @@ int main(int argc, char * argv[])
 	double wmin;
 	double wmax;
 
-        // MESH SIZE
+  // MESH SIZE
 	int grid_size_x = 2048;
 	int grid_size_y = 2048;
 	int local_grid_size_x;// = 100;
@@ -100,6 +106,8 @@ int main(int argc, char * argv[])
 	int xaxis;
 	int yaxis;
 	int num_w_planes = 1;
+
+
 	// DAV: the corresponding KernelLen is calculated within the wstack function. It can be anyway hardcoded for optimization
 	int w_support = 7;
 	int num_threads;// = 4;
@@ -115,11 +123,23 @@ int main(int argc, char * argv[])
 	struct timespec begin, finish, begin0, begink, finishk;
 	double elapsed;
 	long nsectors;
+  /* GT: get number of OpenMP threads; exit if not given */
+  if(argc == 1) {
+    fprintf(stderr, "Usage: %s number_of_OMP_Threads \n", argv[0]);
+    exit(1);
+  }
+  // Set the number of OpenMP threads
+  num_threads = atoi(argv[1]);
+
+  if ( num_threads == 0 )
+  {
+    fprintf(stderr, "Wrong parameter: %s\n\n", argv[1]);
+    fprintf(stderr, "Usage: %s number_of_OMP_Threads \n", argv[0]);
+    exit(1);
+  }
 
 	clock_gettime(CLOCK_MONOTONIC, &begin0);
 	start0 = clock();
-        // Set the number of OpenMP threads
-        num_threads = atoi(argv[1]);
 
 	// Intialize MPI environment
 #ifdef USE_MPI
@@ -134,7 +154,21 @@ int main(int argc, char * argv[])
 	rank = 0;
 	size = 1;
 #endif
-        if(rank == 0)printf("Running with %d threads\n",num_threads);
+
+  if(rank == 0)printf("Running with %d threads\n",num_threads);
+
+#ifdef ACCOMP
+  if(rank == 0){
+    if (0 == omp_get_num_devices()) {
+      printf("No accelerator found ... exit\n");
+      exit(255);
+    }
+    printf("Number of available GPUs %d\n", omp_get_num_devices());
+#ifdef NVIDIA
+    prtAccelInfo(0);  /* header declares prtAccelInfo(int iaccel); report device 0 */
+#endif
+  }
+#endif
 
 	// set the local size of the image
 	local_grid_size_x = grid_size_x;
@@ -142,7 +176,7 @@ int main(int argc, char * argv[])
 	if (nsectors < 0) nsectors = size;
 	local_grid_size_y = grid_size_y/nsectors;
 	//nsectors = size;
-	
+
 	// LOCAL grid size
 	xaxis = local_grid_size_x;
 	yaxis = local_grid_size_y;
@@ -180,10 +214,10 @@ int main(int argc, char * argv[])
 	int nsub = 1000;
 	//int nsub = 10;
 	printf("Subtracting last %d measurements\n",nsub);
-        Nmeasures = Nmeasures-nsub;
+  Nmeasures = Nmeasures-nsub;
 	Nvis = Nmeasures*freq_per_chan*polarisations;
 
-        // calculate the coordinates of the center
+  // calculate the coordinates of the center
 	double uvshift = uvmin/(uvmax-uvmin);
 	//printf("UVSHIFT %f %f %f %f %f\n",uvmin, uvmax, wmin, wmax, uvshift);
 
@@ -193,12 +227,12 @@ int main(int argc, char * argv[])
 	   printf("N. visibilities %ld\n",Nvis);
 	}
 
-        // Set temporary local size of points
+  // Set temporary local size of points
 	long nm_pe = (long)(Nmeasures/size);
 	long remaining = Nmeasures%size;
-        
-        long startrow = rank*nm_pe;
-        if (rank == size-1)nm_pe = nm_pe+remaining;
+
+  long startrow = rank*nm_pe;
+  if (rank == size-1)nm_pe = nm_pe+remaining;
 
 	long Nmeasures_tot = Nmeasures;
 	Nmeasures = nm_pe;
@@ -206,10 +240,11 @@ int main(int argc, char * argv[])
 	Nvis = Nmeasures*freq_per_chan*polarisations;
 	Nweights = Nmeasures*polarisations;
 
-        #ifdef VERBOSE
+#ifdef VERBOSE
 	printf("N. measurements on %d %ld\n",rank,Nmeasures);
 	printf("N. visibilities on %d %ld\n",rank,Nvis);
-        #endif
+#endif
+
 
 	// DAV: all these arrays can be allocatate statically for the sake of optimization. However be careful that if MPI is used
 	// all the sizes are rescaled by the number of MPI tasks
@@ -221,7 +256,7 @@ int main(int argc, char * argv[])
 	visreal = (float*) calloc(Nvis,sizeof(float));
 	visimg = (float*) calloc(Nvis,sizeof(float));
 
-        if(rank == 0)printf("READING DATA\n");
+  if(rank == 0)printf("READING DATA\n");
 	// Read data
 	strcpy(filename,datapath);
 	strcat(filename,ufile);
@@ -236,7 +271,7 @@ int main(int argc, char * argv[])
 	strcat(filename,vfile);
 	//printf("Reading %s\n",filename);
 
-	pFile = fopen (filename,"rb");
+  pFile = fopen (filename,"rb");
 	fseek (pFile,startrow*sizeof(double),SEEK_SET);
 	fread(vv,Nmeasures*sizeof(double),1,pFile);
 	fclose(pFile);
@@ -250,9 +285,9 @@ int main(int argc, char * argv[])
 	fread(ww,Nmeasures*sizeof(double),1,pFile);
 	fclose(pFile);
 
-        #ifdef USE_MPI
+#ifdef USE_MPI
 	MPI_Barrier(MPI_COMM_WORLD);
-        #endif
+#endif
 
 	clock_gettime(CLOCK_MONOTONIC, &finish);
 	end = clock();
@@ -260,13 +295,13 @@ int main(int argc, char * argv[])
 	setup_time1 = (finish.tv_sec - begin.tv_sec);
 	setup_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
 
-        if(rank == 0)printf("GRIDDING DATA\n");
+
+  if(rank == 0)printf("GRIDDING DATA\n");
 
 	// Create histograms and linked lists
-        clock_gettime(CLOCK_MONOTONIC, &begin);
-        start = clock();
- 
-        //CLAAA
+  clock_gettime(CLOCK_MONOTONIC, &begin);
+  start = clock();
+
 	// Initialize linked list
 	struct sectorlist ** sectorhead;
 	sectorhead = (struct sectorlist **) malloc((nsectors+1) * sizeof(struct sectorlist));
@@ -283,7 +318,7 @@ int main(int argc, char * argv[])
 	double uuh,vvh;
 	for (long iphi = 0; iphi < Nmeasures; iphi++)
 	{
-           boundary[iphi] = -1;
+     boundary[iphi] = -1;
 	   uuh = uu[iphi];
 	   vvh = vv[iphi];
 	   int binphi = (int)(vvh*nsectors);
@@ -292,8 +327,8 @@ int main(int argc, char * argv[])
 	   double downdist = vvh - (double)(binphi*yaxis)*dx;
 	   //
 	   histo_send[binphi]++;
-           Push(&sectorhead[binphi],iphi);
-           if(updist < w_supporth && updist >= 0.0) {histo_send[binphi+1]++; boundary[iphi] = binphi+1; Push(&sectorhead[binphi+1],iphi);};
+     Push(&sectorhead[binphi],iphi);
+     if(updist < w_supporth && updist >= 0.0) {histo_send[binphi+1]++; boundary[iphi] = binphi+1; Push(&sectorhead[binphi+1],iphi);};
 	   if(downdist < w_supporth && binphi > 0 && downdist >= 0.0) {histo_send[binphi-1]++; boundary[iphi] = binphi-1; Push(&sectorhead[binphi-1],iphi);};
 	}
 
@@ -313,41 +348,39 @@ int main(int argc, char * argv[])
 	}
 #endif
 
-        #ifdef VERBOSE
-        for (int iii=0; iii<nsectors+1; iii++)printf("HISTO %d %d %ld\n",rank, iii, histo_send[iii]);
-        #endif
-
-       // Create sector grid
-        double * gridss;
-        double * gridss_w;
-        double * gridss_real;
-        double * gridss_img;
-	double * grid;
-        long size_of_grid;
-        size_of_grid = 2*num_w_planes*xaxis*yaxis;
-        gridss = (double*) calloc(size_of_grid,sizeof(double));
-        gridss_w = (double*) calloc(size_of_grid,sizeof(double));
-        gridss_real = (double*) calloc(size_of_grid/2,sizeof(double));
-        gridss_img = (double*) calloc(size_of_grid/2,sizeof(double));
-        // Create destination slab
-        grid = (double*) calloc(size_of_grid,sizeof(double));
-        // Create temporary global grid
-        #ifndef USE_MPI
-        double * gridtot = (double*) calloc(2*grid_size_x*grid_size_y*num_w_planes,sizeof(double));
-        #endif
-
-        double shift = (double)(dx*yaxis);
+#ifdef VERBOSE
+  for (int iii=0; iii<nsectors+1; iii++)printf("HISTO %d %d %ld\n",rank, iii, histo_send[iii]);
+#endif
 
-        // Open the MPI Memory Window for the slab
-        #ifdef USE_MPI
-        MPI_Win slabwin;
-        MPI_Win_create(grid, size_of_grid*sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &slabwin);
-        MPI_Win_fence(0,slabwin);
-        #endif
+// Create sector grid
+  double * gridss;
+  double * gridss_w;
+  double * gridss_real;
+  double * gridss_img;
+  double * grid;
+  long size_of_grid;
+  size_of_grid = 2*num_w_planes*xaxis*yaxis;
+  gridss = (double*) calloc(size_of_grid,sizeof(double));
+  gridss_w = (double*) calloc(size_of_grid,sizeof(double));
+  gridss_real = (double*) calloc(size_of_grid/2,sizeof(double));
+  gridss_img = (double*) calloc(size_of_grid/2,sizeof(double));
+  // Create destination slab
+  grid = (double*) calloc(size_of_grid,sizeof(double));
+  // Create temporary global grid
+#ifndef USE_MPI
+  double * gridtot = (double*) calloc(2*grid_size_x*grid_size_y*num_w_planes,sizeof(double));
+#endif
+  double shift = (double)(dx*yaxis);
+  // Open the MPI Memory Window for the slab
+#ifdef USE_MPI
+  MPI_Win slabwin;
+  MPI_Win_create(grid, size_of_grid*sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &slabwin);
+  MPI_Win_fence(0,slabwin);
+#endif
+#ifndef USE_MPI
+  pFile1 = fopen (outfile1,"w");
+#endif
 
-        #ifndef USE_MPI
-        pFile1 = fopen (outfile1,"w");
-        #endif
 
 	// loop over files
 	//
@@ -357,107 +390,110 @@ int main(int argc, char * argv[])
 	reduce_time1 = 0.0;
 	compose_time = 0.0;
 	compose_time1 = 0.0;
+
 	for (int ifiles=0; ifiles<ndatasets; ifiles++)
 	{
-
 	strcpy(filename,datapath_multi[ifiles]);
-        printf("Processing %s, %d of %d\n",filename,ifiles+1,ndatasets);
-        strcat(filename,weightsfile);
-
-        pFile = fopen (filename,"rb");
-        fseek (pFile,startrow*polarisations*sizeof(float),SEEK_SET);
-        fread(weights,(Nweights)*sizeof(float),1,pFile);
-        fclose(pFile);
+  printf("Processing %s, %d of %d\n",filename,ifiles+1,ndatasets);
+  strcat(filename,weightsfile);
+
+  pFile = fopen (filename,"rb");
+  fseek (pFile,startrow*polarisations*sizeof(float),SEEK_SET);
+  fread(weights,(Nweights)*sizeof(float),1,pFile);
+  fclose(pFile);
+  strcpy(filename,datapath);
+  strcat(filename,visrealfile);
+#ifdef VERBOSE
+  printf("Reading %s\n",filename);
+#endif
 
-        strcpy(filename,datapath);
-        strcat(filename,visrealfile);
-        //printf("Reading %s\n",filename);
+  pFile = fopen (filename,"rb");
+  fseek (pFile,startrow*freq_per_chan*polarisations*sizeof(float),SEEK_SET);
+  fread(visreal,Nvis*sizeof(float),1,pFile);
+  fclose(pFile);
+  strcpy(filename,datapath);
+  strcat(filename,visimgfile);
+#ifdef VERBOSE
+  printf("Reading %s\n",filename);
+#endif
+  pFile = fopen (filename,"rb");
+  fseek (pFile,startrow*freq_per_chan*polarisations*sizeof(float),SEEK_SET);
+  fread(visimg,Nvis*sizeof(float),1,pFile);
+  fclose(pFile);
 
-        pFile = fopen (filename,"rb");
-        fseek (pFile,startrow*freq_per_chan*polarisations*sizeof(float),SEEK_SET);
-        fread(visreal,Nvis*sizeof(float),1,pFile);
-        fclose(pFile);
+#ifdef USE_MPI
+  MPI_Barrier(MPI_COMM_WORLD);
+#endif
+  // Declare temporary arrays for the masking
+  double * uus;
+  double * vvs;
+  double * wws;
+  float * visreals;
+  float * visimgs;
+  float * weightss;
+	long isector;
 
-        strcpy(filename,datapath);
-        strcat(filename,visimgfile);
-        //printf("Reading %s\n",filename);
+  for (long isector_count=0; isector_count<nsectors; isector_count++)
+      {
+        clock_gettime(CLOCK_MONOTONIC, &begink);
+        startk = clock();
+        // define local destination sector
+        //isector = (isector_count+rank)%size;
+        isector = isector_count;
+	  // allocate sector arrays
+        long Nsec = histo_send[isector];
+	      uus = (double*) malloc(Nsec*sizeof(double));
+	      vvs = (double*) malloc(Nsec*sizeof(double));
+	      wws = (double*) malloc(Nsec*sizeof(double));
+	      long Nweightss = Nsec*polarisations;
+	      long Nvissec = Nweightss*freq_per_chan;
+	      weightss = (float*) malloc(Nweightss*sizeof(float));
+	      visreals = (float*) malloc(Nvissec*sizeof(float));
+	      visimgs = (float*) malloc(Nvissec*sizeof(float));
 
-        pFile = fopen (filename,"rb");
-        fseek (pFile,startrow*freq_per_chan*polarisations*sizeof(float),SEEK_SET);
-        fread(visimg,Nvis*sizeof(float),1,pFile);
-        fclose(pFile);
+	  // select data for this sector
+        long icount = 0;
+	      long ip = 0;
+	      long inu = 0;
+	      struct sectorlist * current;
+	      current = sectorhead[isector];
 
-        #ifdef USE_MPI
-        MPI_Barrier(MPI_COMM_WORLD);
-        #endif
 
-        // Declare temporary arrays for the masking
-        double * uus;
-        double * vvs;
-        double * wws;
-        float * visreals;
-        float * visimgs;
-        float * weightss;
 
-	long isector;
-        for (long isector_count=0; isector_count<nsectors; isector_count++)
-        {
-          clock_gettime(CLOCK_MONOTONIC, &begink);
-          startk = clock();
-          // define local destination sector
-          //isector = (isector_count+rank)%size;
-          isector = isector_count;
-	  // allocate sector arrays     
-          long Nsec = histo_send[isector];
-	  uus = (double*) malloc(Nsec*sizeof(double));
-	  vvs = (double*) malloc(Nsec*sizeof(double));
-	  wws = (double*) malloc(Nsec*sizeof(double));
-	  long Nweightss = Nsec*polarisations;
-	  long Nvissec = Nweightss*freq_per_chan;
-	  weightss = (float*) malloc(Nweightss*sizeof(float));
-	  visreals = (float*) malloc(Nvissec*sizeof(float));
-	  visimgs = (float*) malloc(Nvissec*sizeof(float));
-
-	  // select data for this sector
-          long icount = 0;
-	  long ip = 0;
-	  long inu = 0;
-//CLAAAA
-	  struct sectorlist * current;
-	  current = sectorhead[isector];
-	  while (current->index != -1)
+	       while (current->index != -1)
           {
              long ilocal = current->index;
-	     //double vvh = vv[ilocal];
+      	     //double vvh = vv[ilocal];
              //int binphi = (int)(vvh*nsectors);
-	     //if (binphi == isector || boundary[ilocal] == isector)
-	     //{
-	       uus[icount] = uu[ilocal];
-	       vvs[icount] = vv[ilocal]-isector*shift;
-	       wws[icount] = ww[ilocal];
-               for (long ipol=0; ipol<polarisations; ipol++)
-	       {
-		  weightss[ip] = weights[ilocal*polarisations+ipol];
+	           //if (binphi == isector || boundary[ilocal] == isector) {
+	           uus[icount] = uu[ilocal];
+	           vvs[icount] = vv[ilocal]-isector*shift;
+	           wws[icount] = ww[ilocal];
+             for (long ipol=0; ipol<polarisations; ipol++)
+                {
+		              weightss[ip] = weights[ilocal*polarisations+ipol];
                   ip++;
-	       }
-               for (long ifreq=0; ifreq<polarisations*freq_per_chan; ifreq++)
-	       {
-	          visreals[inu] = visreal[ilocal*polarisations*freq_per_chan+ifreq];
-	          visimgs[inu] = visimg[ilocal*polarisations*freq_per_chan+ifreq];
-		  //if(visimgs[inu]>1e10 || visimgs[inu]<-1e10)printf("%f %f %ld %ld %d %ld %ld\n",visreals[inu],visimgs[inu],inu,Nvissec,rank,ilocal*polarisations*freq_per_chan+ifreq,Nvis);
-	          inu++;
-	       }
-	       icount++;
+                }
+             for (long ifreq=0; ifreq<polarisations*freq_per_chan; ifreq++)
+                {
+	                 visreals[inu] = visreal[ilocal*polarisations*freq_per_chan+ifreq];
+	                 visimgs[inu] = visimg[ilocal*polarisations*freq_per_chan+ifreq];
+		               //if(visimgs[inu]>1e10 || visimgs[inu]<-1e10)printf("%f %f %ld %ld %d %ld %ld\n",visreals[inu],visimgs[inu],inu,Nvissec,rank,ilocal*polarisations*freq_per_chan+ifreq,Nvis);
+	                 inu++;
+	              }
+	           icount++;
 	     //}
-	     current = current->next;
-          }
+	            current = current->next;
+             }
 
 	  clock_gettime(CLOCK_MONOTONIC, &finishk);
 	  endk = clock();
 	  compose_time += ((double) (endk - startk)) / CLOCKS_PER_SEC;
 	  compose_time1 += (finishk.tv_sec - begink.tv_sec);
 	  compose_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0;
-          #ifndef USE_MPI
+
+    #ifndef USE_MPI
 	  double uumin = 1e20;
 	  double vvmin = 1e20;
 	  double uumax = -1e20;
@@ -470,7 +506,7 @@ int main(int argc, char * argv[])
 	       vvmin = MIN(vvmin,vvs[ipart]);
 	       vvmax = MAX(vvmax,vvs[ipart]);
 
-		  
+
                if(ipart%10 == 0)fprintf (pFile, "%ld %f %f %f\n",isector,uus[ipart],vvs[ipart]+isector*shift,wws[ipart]);
           }
 
@@ -483,6 +519,7 @@ int main(int argc, char * argv[])
 	  #endif
           clock_gettime(CLOCK_MONOTONIC, &begink);
           startk = clock();
+
           wstack(num_w_planes,
                Nsec,
                freq_per_chan,
@@ -496,10 +533,22 @@ int main(int argc, char * argv[])
                dx,
                dw,
                w_support,
-	       xaxis,
-	       yaxis,
+	             xaxis,
+	             yaxis,
                gridss,
                num_threads);
+
 	  clock_gettime(CLOCK_MONOTONIC, &finishk);
 	  endk = clock();
 	  kernel_time += ((double) (endk - startk)) / CLOCKS_PER_SEC;
@@ -525,9 +574,9 @@ int main(int argc, char * argv[])
 	  #ifdef ONE_SIDE
 	  printf("One Side communication active\n");
 	  MPI_Win_lock(MPI_LOCK_SHARED,target_rank,0,slabwin);
-	  MPI_Accumulate(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,MPI_SUM,slabwin); 
+	  MPI_Accumulate(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,MPI_SUM,slabwin);
 	  MPI_Win_unlock(target_rank,slabwin);
-	  //MPI_Put(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,slabwin); 	   
+	  //MPI_Put(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,slabwin);
           #else
           MPI_Reduce(gridss,grid,size_of_grid,MPI_DOUBLE,MPI_SUM,target_rank,MPI_COMM_WORLD);
           #endif //ONE_SIDE
@@ -602,7 +651,7 @@ int main(int argc, char * argv[])
         process_time1 = (finish.tv_sec - begin.tv_sec);
         process_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
         clock_gettime(CLOCK_MONOTONIC, &begin);
-         
+
 
 #ifdef WRITE_DATA
 	// Write results
@@ -681,7 +730,7 @@ int main(int argc, char * argv[])
 
 #ifdef USE_FFTW
 	// FFT transform the data (using distributed FFTW)
-	
+
 	if(rank == 0)printf("PERFORMING FFT\n");
         clock_gettime(CLOCK_MONOTONIC, &begin);
         start = clock();
@@ -696,13 +745,13 @@ int main(int argc, char * argv[])
 	// and perform the FFT per w plane
 	alloc_local = fftw_mpi_local_size_2d(grid_size_y, grid_size_x, MPI_COMM_WORLD,&local_n0, &local_0_start);
 	fftwgrid = fftw_alloc_complex(alloc_local);
-	plan = fftw_mpi_plan_dft_2d(grid_size_y, grid_size_x, fftwgrid, fftwgrid, MPI_COMM_WORLD, FFTW_BACKWARD, FFTW_ESTIMATE); 
+	plan = fftw_mpi_plan_dft_2d(grid_size_y, grid_size_x, fftwgrid, fftwgrid, MPI_COMM_WORLD, FFTW_BACKWARD, FFTW_ESTIMATE);
 
 	long fftwindex = 0;
 	long fftwindex2D = 0;
 	for (int iw=0; iw<num_w_planes; iw++)
         {
-            //printf("FFTing plan %d\n",iw);	    
+            //printf("FFTing plan %d\n",iw);
             // select the w-plane to transform
             for (int iv=0; iv<yaxis; iv++)
             {
@@ -783,7 +832,7 @@ int main(int argc, char * argv[])
                 for (int iw=0; iw<num_w_planes; iw++)
                 for (int iv=0; iv<yaxis; iv++)
                 for (int iu=0; iu<xaxis; iu++)
-                {         
+                {
                           long global_index = (iu + (iv+isector*yaxis)*xaxis + iw*grid_size_x*grid_size_y)*sizeof(double);
                           long index = iu + iv*xaxis + iw*xaxis*yaxis;
                           fseek(pFileimg, global_index, SEEK_SET);
@@ -794,7 +843,7 @@ int main(int argc, char * argv[])
                           fwrite(gridss_img, size_of_grid/2, sizeof(double), pFileimg);
               }
 
-	      
+
           }
           #else
           /*
@@ -820,15 +869,15 @@ int main(int argc, char * argv[])
 	MPI_Barrier(MPI_COMM_WORLD);
         #endif
 #endif //WRITE_DATA
- 
+
 	fftw_free(fftwgrid);
 
 	// Phase correction
         clock_gettime(CLOCK_MONOTONIC, &begin);
         start = clock();
 	if(rank == 0)printf("PHASE CORRECTION\n");
-        double* image_real = (double*) calloc(xaxis*yaxis,sizeof(double));	
-        double* image_imag = (double*) calloc(xaxis*yaxis,sizeof(double));	
+        double* image_real = (double*) calloc(xaxis*yaxis,sizeof(double));
+        double* image_imag = (double*) calloc(xaxis*yaxis,sizeof(double));
         phase_correction(gridss,image_real,image_imag,xaxis,yaxis,num_w_planes,grid_size_x,grid_size_y);
 
         end = clock();
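For reference, the `ONE_SIDE` path above exposes each rank's `grid` slab through an MPI window and adds remote contributions with `MPI_Accumulate(..., MPI_SUM, ...)` under a shared lock. A minimal standalone sketch of that communication pattern (hedged; the file name and sizes are invented):

```c
/* one_side_demo.c -- hypothetical sketch of the RMA-accumulate pattern.
 * mpicc one_side_demo.c -o demo && mpirun -np 4 ./demo
 */
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
    int rank, size;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    const long n = 16;                          /* toy slab size */
    double *slab  = calloc(n, sizeof(double));  /* exposed window memory */
    double *local = malloc(n * sizeof(double)); /* this rank's contribution */
    for (long i = 0; i < n; i++) local[i] = 1.0;

    MPI_Win win;
    MPI_Win_create(slab, n * sizeof(double), sizeof(double),
                   MPI_INFO_NULL, MPI_COMM_WORLD, &win);

    /* every rank sums its buffer into rank 0's slab, as the gridding
       loop does per target sector */
    MPI_Win_lock(MPI_LOCK_SHARED, 0, 0, win);
    MPI_Accumulate(local, n, MPI_DOUBLE, 0, 0, n, MPI_DOUBLE, MPI_SUM, win);
    MPI_Win_unlock(0, win);

    MPI_Barrier(MPI_COMM_WORLD);                /* all updates completed */
    if (rank == 0) {
        MPI_Win_lock(MPI_LOCK_SHARED, 0, 0, win);   /* sync local view */
        printf("slab[0] = %.1f (expect %d.0)\n", slab[0], size);
        MPI_Win_unlock(0, win);
    }

    MPI_Win_free(&win);
    free(slab); free(local);
    MPI_Finalize();
    return 0;
}
```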
diff --git a/w-stacking.cu b/w-stacking.cu
index 9c76f82008b0742f88ff01dfef36843807ac777d..a6411b3869dd26e2691f6f5359d75174a7aa341c 100644
--- a/w-stacking.cu
+++ b/w-stacking.cu
@@ -6,6 +6,9 @@
 #include <stdlib.h>
 #include <stdio.h>
 
+#ifdef ACCOMP
+#pragma omp  declare target
+#endif
 #ifdef __CUDACC__
 double __device__
 #else
@@ -17,6 +20,9 @@ gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist)
      conv_weight = norm * exp(-((u_dist*u_dist)+(v_dist*v_dist))*std22);
      return conv_weight;
 }
+#ifdef ACCOMP
+#pragma omp end declare target
+#endif
 
 #ifdef __CUDACC__
 //double __device__ gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist)
@@ -90,7 +96,7 @@ __global__ void convolve_g(
                 double add_term_img = 0.0;
                 long ifine = visindex;
                 for (long ifreq=0; ifreq<freq_per_chan; ifreq++)
-		{	
+		{
 		   long iweight = visindex/freq_per_chan;
                    for (long ipol=0; ipol<polarizations; ipol++)
                    {
@@ -111,7 +117,9 @@ __global__ void convolve_g(
 	}
 }
 #endif
-
+#ifdef ACCOMP
+#pragma  omp declare target
+#endif
 void wstack(
      int num_w_planes,
      long num_points,
@@ -216,7 +224,14 @@ void wstack(
 #ifdef _OPENMP
     omp_set_num_threads(num_threads);
 #endif
-    #pragma omp parallel for private(visindex) 
+
+#ifdef ACCOMP
+    long Nvis = num_points*freq_per_chan*polarizations;
+  //  #pragma omp target data map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan])
+  //  #pragma omp target teams distribute parallel for  map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan]) map(tofrom: grid[0:2*num_w_planes*grid_size_x*grid_size_y])
+#else
+    #pragma omp parallel for private(visindex)
+#endif
     for (i = 0; i < num_points; i++)
     {
 #ifdef _OPENMP
@@ -258,6 +273,7 @@ void wstack(
                 double u_dist = (double)j+0.5 - pos_u;
 		long iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y);
 
+
 		double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist);
 		// Loops over frequencies and polarizations
 		double add_term_real = 0.0;
@@ -265,7 +281,7 @@ void wstack(
 		long ifine = visindex;
 		// DAV: the following two loops are performend by each thread separately: no problems of race conditions
 		for (long ifreq=0; ifreq<freq_per_chan; ifreq++)
-		{	
+		{
 		   long iweight = visindex/freq_per_chan;
 	           for (long ipol=0; ipol<polarizations; ipol++)
 	           {
@@ -287,18 +303,20 @@ void wstack(
 		grid[iKer+1] += add_term_img;
             }
         }
-	
+
     }
 // End switch between CUDA and CPU versions
 #endif
     //for (int i=0; i<100000; i++)printf("%f\n",grid[i]);
 }
+#ifdef ACCOMP
+#pragma  omp end declare target
+#endif
 
 int test(int nnn)
 {
 	int mmm;
-	
+
 	mmm = nnn+1;
 	return mmm;
-}	
-
+}
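The `#pragma omp declare target` / `end declare target` pair added around `gauss_kernel_norm` asks the OpenMP compiler to emit a device-callable version of the function alongside the host one, so it can be invoked from inside a `target` region. A tiny illustration of the mechanism (same kernel formula, hypothetical driver):

```c
/* declare_target_demo.c -- hypothetical; shows host+device compilation. */
#include <math.h>
#include <stdio.h>

#pragma omp declare target
static double gauss_norm(double norm, double std22, double du, double dv)
{
    /* same Gaussian convolution weight as gauss_kernel_norm() above */
    return norm * exp(-((du * du) + (dv * dv)) * std22);
}
#pragma omp end declare target

int main(void)
{
    double dev_val = 0.0;
    #pragma omp target map(from: dev_val)
    dev_val = gauss_norm(1.0, 0.5, 0.1, 0.2);   /* device-side call */

    printf("device %f vs host %f\n", dev_val, gauss_norm(1.0, 0.5, 0.1, 0.2));
    return 0;
}
```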
diff --git a/w-stacking.h b/w-stacking.h
index 5c675e474d2174bd47b54b4dd98c2a7faf95fd4b..b58c9d64acd8b56353606086ef4f67025556c5cc 100644
--- a/w-stacking.h
+++ b/w-stacking.h
@@ -8,6 +8,7 @@
 #ifdef __CUDACC__
 extern "C"
 #endif
+
 void wstack(
      int,
      long,
@@ -45,4 +46,4 @@ void phase_correction(
      int,
      int);
 
-#endif 
+#endif
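The `extern "C"` guard in this header exists because the implementations live in `.cu` files that nvcc compiles as C++: without it, `wstack` and `phase_correction` would get mangled names and the objects produced by `mpicc`/`gcc` could not link against them. A two-file toy of the same linkage issue (hypothetical names):

```c
/* add_one.cu -- compiled by nvcc (C++ rules), so C linkage is forced: */
extern "C" int add_one(int n) { return n + 1; }

/* main.c -- compiled as C; sees the unmangled symbol:
 *
 *   #include <stdio.h>
 *   int add_one(int);
 *   int main(void) { printf("%d\n", add_one(41)); return 0; }
 *
 * Build everything with:  nvcc add_one.cu main.c -o demo
 */
```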
diff --git a/w-stacking_omp.c b/w-stacking_omp.c
index 5403f2d8b620045d5767d934a3702a189462dfa3..b848bc5c117b037d070159a58242f08019cc39e1 100644
--- a/w-stacking_omp.c
+++ b/w-stacking_omp.c
@@ -3,6 +3,9 @@
 #include <math.h>
 #include <stdlib.h>
 #include <stdio.h>
+#ifdef NVIDIA
+#include <cuda_runtime.h>
+#endif
 
 #ifdef ACCOMP
 #pragma omp  declare target
@@ -17,9 +20,8 @@ double gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist
 #pragma omp end declare target
 #endif
 
-#ifdef ACCOMP
-#pragma  omp declare target
-#endif
+
+
 void wstack(
      int num_w_planes,
      long num_points,
@@ -39,8 +41,7 @@ void wstack(
      double* grid,
      int num_threads)
 {
-    long i;
-    long index;
+    //long index;
     long visindex;
 
     // initialize the convolution kernel
@@ -57,12 +58,16 @@ void wstack(
 
 #ifdef ACCOMP
     long Nvis = num_points*freq_per_chan*polarizations;
-  //  #pragma omp target data map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan])
-    #pragma omp target teams distribute parallel for private(visindex)  map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan]) map(tofrom: grid[0:2*num_w_planes*grid_size_x*grid_size_y])
-#else
-    #pragma omp parallel for private(visindex)
+    long gpu_weight_dim = Nvis/freq_per_chan;
+    long gpu_grid_dim = 2*num_w_planes*grid_size_x*grid_size_y;
+#pragma omp target teams distribute parallel for private(visindex) \
+map(to:num_points, KernelLen, std,  std22, norm, num_w_planes, \
+  uu[0:num_points], vv[0:num_points], ww[0:num_points], \
+  vis_real[0:Nvis], vis_img[0:Nvis], weight[0:gpu_weight_dim], \
+  grid_size_x, grid_size_y, freq_per_chan, polarizations, dx,dw, w_support, num_threads) \
+  map(tofrom: grid[0:gpu_grid_dim])
 #endif
-    for (i = 0; i < num_points; i++)
+    for (long i = 0; i < num_points; i++)
     {
 #ifdef _OPENMP
 	//int tid;
@@ -135,12 +140,104 @@ void wstack(
         }
 
     }
+
     //for (int i=0; i<100000; i++)printf("%f\n",grid[i]);
 }
-#ifdef ACCOMP
-#pragma  omp end declare target
+
+
+#ifdef NVIDIA
+#define CUDAErrorCheck(funcall)                                         \
+do {                                                                    \
+  cudaError_t ierr = funcall;                                           \
+  if (cudaSuccess != ierr) {                                            \
+    fprintf(stderr, "%s(line %d) : CUDA RT API error : %s(%d) -> %s\n", \
+    __FILE__, __LINE__, #funcall, ierr, cudaGetErrorString(ierr));      \
+    exit(ierr);                                                         \
+  }                                                                     \
+} while (0)
+
+static inline int _corePerSM(int major, int minor)
+/**
+ * @brief Give the number of CUDA cores per streaming multiprocessor (SM).
+ *
+ * The number of CUDA cores per SM is determined by the compute capability.
+ *
+ * @param major Major revision number of the compute capability.
+ * @param minor Minor revision number of the compute capability.
+ *
+ * @return The number of CUDA cores per SM.
+ */
+{
+  if (1 == major) {
+    if (0 == minor || 1 == minor || 2 == minor || 3 == minor) return 8;
+  }
+  if (2 == major) {
+    if (0 == minor) return 32;
+    if (1 == minor) return 48;
+  }
+  if (3 == major) {
+    if (0 == minor || 5 == minor || 7 == minor) return 192;
+  }
+  if (5 == major) {
+    if (0 == minor || 2 == minor) return 128;
+  }
+  if (6 == major) {
+    if (0 == minor) return 64;
+    if (1 == minor || 2 == minor) return 128;
+  }
+  if (7 == major) {
+    if (0 == minor || 2 == minor || 5 == minor) return 64;
+  }
+  return -1;
+}
+
+void getGPUInfo(int iaccel)
+{
+  int corePerSM;
+
+  struct cudaDeviceProp dev;
+
+  CUDAErrorCheck(cudaSetDevice(iaccel));
+  CUDAErrorCheck(cudaGetDeviceProperties(&dev, iaccel));
+  corePerSM = _corePerSM(dev.major, dev.minor);
+
+  printf("\n");
+  printf("============================================================\n");
+  printf("CUDA Device name : \"%s\"\n", dev.name);
+  printf("------------------------------------------------------------\n");
+  printf("Comp. Capability : %d.%d\n", dev.major, dev.minor);
+  printf("max clock rate   : %.0f MHz\n", dev.clockRate * 1.e-3f);
+  printf("number of SMs    : %d\n", dev.multiProcessorCount);
+  printf("cores  /  SM     : %d\n", corePerSM);
+  printf("# of CUDA cores  : %d\n", corePerSM * dev.multiProcessorCount);
+  printf("------------------------------------------------------------\n");
+  printf("global memory    : %5.0f MBytes\n", dev.totalGlobalMem / 1048576.0f);
+  printf("shared mem. / SM : %5.1f KBytes\n", dev.sharedMemPerMultiprocessor / 1024.0f);
+  printf("32-bit reg. / SM : %d\n", dev.regsPerMultiprocessor);
+  printf("------------------------------------------------------------\n");
+  printf("max # of threads / SM    : %d\n", dev.maxThreadsPerMultiProcessor);
+  printf("max # of threads / block : %d\n", dev.maxThreadsPerBlock);
+  printf("max dim. of block        : (%d, %d, %d)\n",
+      dev.maxThreadsDim[0], dev.maxThreadsDim[1], dev.maxThreadsDim[2]);
+  printf("max dim. of grid         : (%d, %d, %d)\n",
+      dev.maxGridSize[0],   dev.maxGridSize[1],   dev.maxGridSize[2]);
+  printf("warp size                : %d\n", dev.warpSize);
+  printf("============================================================\n");
+
+  /* tiny offload smoke test: run one trivial target region on the device */
+  int z = 0, x = 2;
+  #pragma omp target map(to:x) map(tofrom:z)
+  {
+    z = x + 100;
+  }
+}
+
 #endif
 
+
 int test(int nnn)
 {
 	int mmm;
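`CUDAErrorCheck` wraps each CUDA runtime call so a failure reports the file, line, failing expression, and decoded error string before exiting, which is why `getGPUInfo()` can call `cudaSetDevice`/`cudaGetDeviceProperties` without per-call error plumbing. Typical usage, as a standalone sketch reusing the macro defined above:

```c
/* check_demo.c -- usage sketch for the CUDAErrorCheck() macro above.
 * Build e.g.:  nvcc -x cu check_demo.c -o check_demo
 */
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

#define CUDAErrorCheck(funcall)                                         \
do {                                                                    \
  cudaError_t ierr = funcall;                                           \
  if (cudaSuccess != ierr) {                                            \
    fprintf(stderr, "%s(line %d) : CUDA RT API error : %s(%d) -> %s\n", \
      __FILE__, __LINE__, #funcall, ierr, cudaGetErrorString(ierr));    \
    exit(ierr);                                                         \
  }                                                                     \
} while (0)

int main(void)
{
    int ndev = 0;
    CUDAErrorCheck(cudaGetDeviceCount(&ndev)); /* aborts with context on failure */
    printf("%d CUDA device(s) visible\n", ndev);
    if (ndev > 0)
        CUDAErrorCheck(cudaSetDevice(0));      /* as getGPUInfo() begins */
    return 0;
}
```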
diff --git a/w-stacking_omp.h b/w-stacking_omp.h
index d07e2bffd8479f67d345381dec4a3658cb976db9..b655ffe1ecc37a742a175c61754104cb5150bec8 100644
--- a/w-stacking_omp.h
+++ b/w-stacking_omp.h
@@ -34,6 +34,9 @@ double gauss_kernel_norm(
   double u_dist,
   double v_dist);
 
 void phase_correction(
      double*,
      double*,
@@ -43,3 +46,37 @@ void phase_correction(
      int,
      int,
      int);
+
+#ifdef ACCOMP
+#ifdef NVIDIA
+void getGPUInfo(int);
+#endif
+#pragma omp declare target (gauss_kernel_norm)
+#endif
+
+#ifdef NVIDIA
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef PRTACCELINFO_H
+#define PRTACCELINFO_H
+
+void prtAccelInfo(int iaccel);
+/**<
+ * @brief Print some basic info of an accelerator.
+ *
+ * Strictly speaking, \c prtAccelInfo() can only print the basic info of an
+ * Nvidia CUDA device.
+ *
+ * @param iaccel The index of an accelerator.
+ *
+ * @return \c void.
+ */
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif