Commit 732d1d44 authored by Emanuele De Rubeis

Full AMD GPUs enabling

parent f7773175
@@ -91,18 +91,29 @@ OPT += -DGAUSS_HI_PRECISION
 # use GPU to perform FFT
 #OPT += -DCUFFTMP
-# FULL GPU SUPPORT - Recommended for full NVIDIA GPU code execution
+# FULL NVIDIA GPU SUPPORT - Recommended for full NVIDIA GPU code execution
 OPT += -DFULL_NVIDIA
 ifeq (FULL_NVIDIA,$(findstring FULL_NVIDIA,$(OPT)))
 OPT += -DCUDACC -DNCCL_REDUCE -DCUFFTMP
 endif
+# use HIP for GPUs
+#OPT += -DHIPCC
 # support for AMD GPUs
-#OPT += __HIP_PLATFORM_AMD__
+#OPT += -D__HIP_PLATFORM_AMD__
 # use AMD GPU to perform the reduce
 #OPT += -DRCCL_REDUCE
+# FULL AMD GPU SUPPORT - Recommended for full AMD GPU code execution
+#OPT += -DFULL_AMD
+ifeq (FULL_AMD,$(findstring FULL_AMD,$(OPT)))
+OPT += -DHIPCC -DRCCL_REDUCE -D__HIP_PLATFORM_AMD__
+endif
 # =======================================================
 # OUTPUT HANDLING
@@ -167,6 +178,11 @@ CC_OBJ_ACC = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform
 DEPS_ACC_CUDA = w-stacking.h w-stacking.cu phase_correction.cu
 OBJ_ACC_CUDA = phase_correction.o w-stacking.o
+# ----- define which files will be compiled by HIPCC for AMD
+#
+DEPS_ACC_HIP = w-stacking.hip.hpp w-stacking.hip.cpp phase_correction.hip.cpp
+OBJ_ACC_HIP = phase_correction.hip.o w-stacking.hip.o
 # ----- define which files will be compiled by NVC with OMP offloading for wither Nvidia or AMD
 #
 DEPS_ACC_OMP = w-stacking.h phase_correction.c w-stacking.c
@@ -180,14 +196,12 @@ ifeq (CUDACC,$(findstring CUDACC,$(OPT)))
 DEPS_NCCL_REDUCE = gridding_nccl.cu
 OBJ_NCCL_REDUCE = gridding_nccl.o
-DEPS_RCCL_REDUCE = gridding_rccl.cu
-OBJ_RCCL_REDUCE = gridding_rccl.o
+else ifeq (HIPCC,$(findstring HIPCC,$(OPT)))
+DEPS_RCCL_REDUCE = gridding_rccl.hip.cpp allvars_rccl.hip.hpp
+OBJ_RCCL_REDUCE = gridding_rccl.hip.o
 else
 DEPS_NCCL_REDUCE = gridding_nccl.cpp
 OBJ_NCCL_REDUCE = gridding_nccl.o
-DEPS_RCCL_REDUCE = gridding_rccl.cpp
-OBJ_RCCL_REDUCE = gridding_rccl.o
 endif
 # ----- define what files will be compiled by NVCC for Nvidia cufftMP implementation of FFT
@@ -211,6 +225,8 @@ ifeq (ACCOMP,$(findstring ACCOMP, $(OPT)))
 OBJ = $(CC_OBJ_ACC)
 else ifeq (CUDACC,$(findstring CUDACC, $(OPT)))
 OBJ = $(CC_OBJ_ACC)
+else ifeq (HIPCC,$(findstring HIPCC, $(OPT)))
+OBJ = $(CC_OBJ_ACC)
 else
 OBJ = $(CC_OBJ_NOACC)
 endif
@@ -240,6 +256,22 @@ cuda_fft.cpp: cuda_fft.cu
 gridding_nccl.cpp: gridding_nccl.cu
 	cp gridding_nccl.cu gridding_nccl.cpp
+gridding_rccl.cpp: gridding_rccl.cu
+	cp gridding_rccl.cu gridding_rccl.cpp
+else ifneq (HIPCC,$(findstring HIPCC,$(OPT)))
+w-stacking.c: w-stacking.cu
+	cp w-stacking.cu w-stacking.c
+phase_correction.c: phase_correction.cu
+	cp phase_correction.cu phase_correction.c
+cuda_fft.cpp: cuda_fft.cu
+	cp cuda_fft.cu cuda_fft.cpp
+gridding_nccl.cpp: gridding_nccl.cu
+	cp gridding_nccl.cu gridding_nccl.cpp
 gridding_rccl.cpp: gridding_rccl.cu
 	cp gridding_rccl.cu gridding_rccl.cpp
 else
@@ -277,6 +309,15 @@ $(OBJ_ACC_CUDA): $(DEPS_ACC_CUDA)
 OBJ += $(OBJ_ACC_CUDA)
 endif
+ifeq (HIPCC,$(findstring HIPCC,$(OPT)))
+EXEC_EXT := $(EXEC_EXT)_acc-hip
+LINKER=$(MPIC++)
+FLAGS=$(OPTIMIZE)
+LIBS=$(AMDLIB)
+$(OBJ_ACC_HIP): $(DEPS_ACC_HIP)
+	$(HIPCC) $(OPT) $(OPT_HIPCC) $(CFLAGS) -c w-stacking.hip.cpp phase_correction.hip.cpp $(LIBS)
+OBJ += $(OBJ_ACC_HIP)
+endif
 ifeq (ACCOMP,$(findstring ACCOMP,$(OPT)))
@@ -317,6 +358,16 @@ LIBS=$(NVLIB_3)
 $(OBJ_NCCL_REDUCE): $(DEPS_NCCL_REDUCE)
 	$(NVCC) $(OPT_NVCC) $(OPT) -c $^ $(LIBS)
 OBJ += $(OBJ_NCCL_REDUCE)
+else ifeq (HIPCC,$(findstring HIPCC,$(OPT)))
+EXEC_EXT := $(EXEC_EXT)_acc-reduce
+LINKER=$(MPIC++)
+FLAGS=$(OPTIMIZE) $(CFLAGS)
+LIBS=$(AMDLIB_3)
+$(OBJ_NCCL_REDUCE): $(DEPS_NCCL_REDUCE)
+	$(MPIC++) $(FLAGS) $(OPT) -c $^ $(CFLAGS) $(LIBS)
+OBJ += $(OBJ_NCCL_REDUCE)
 else
 EXEC_EXT := $(EXEC_EXT)_acc-reduce
...
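/* =====================================================================
 * New file in this commit (name inferred from the Makefile hunks above,
 * likely allvars_rccl.hip.hpp): the HIP/RCCL variant of the
 * global-variables header.
 * ===================================================================== */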
/* file to store global variables*/
#if defined(__STDC__)
# if (__STDC_VERSION__ >= 199901L)
# define _XOPEN_SOURCE 700
# endif
#endif
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <unistd.h>
#if !defined( RCCL_REDUCE ) && !defined(__HIPCC__)
#include <stdatomic.h>
#endif
#include <mpi.h>
#if defined (_OPENMP)
#include <omp.h>
#endif
#if defined(USE_FFTW) && !defined(CUFFTMP) // use MPI fftw
#include <fftw3-mpi.h>
#endif
#if defined(ACCOMP)
#include "w-stacking_omp.h"
#else
#include "w-stacking.hip.hpp"
#endif
#if defined(NVIDIA)
#include <cuda_runtime.h>
#endif
#include "fft.h"
#include "numa.h"
#include "timing.h"
#include "errcodes.h"
#define PI 3.14159265359
#define MIN(X, Y) (((X) < (Y)) ? (X) : (Y))
#define MAX(X, Y) (((X) > (Y)) ? (X) : (Y))
#define NOVERBOSE
#define NFILES 100
#define NAME_LEN 50
#define LONGNAME_LEN 1000
#define REDUCE_MPI 0
#define REDUCE_RING 1
#if defined(DEBUG)
#define dprintf(LEVEL, T, t, ...) if( (verbose_level >= (LEVEL)) && \
( ((t) ==-1 ) || ((T)==(t)) ) ) { \
printf(__VA_ARGS__); fflush(stdout); }
#else
#define dprintf(...)
#endif
typedef double double_t;
#if defined(DOUBLE_PRECISION)
typedef double float_t;
#else
typedef float float_t;
#endif
typedef unsigned int uint;
typedef unsigned long long ull;
extern struct io
{
FILE * pFile;
FILE * pFile1;
FILE * pFilereal;
FILE * pFileimg;
} file;
extern struct ip
{
char ufile[NAME_LEN];
char vfile[NAME_LEN];
char wfile[NAME_LEN];
char weightsfile[NAME_LEN];
char visrealfile[NAME_LEN];
char visimgfile[NAME_LEN];
char metafile[NAME_LEN];
char paramfile[NAME_LEN];
} in;
extern struct op
{
char outfile[NAME_LEN];
char outfile1[NAME_LEN];
char outfile2[NAME_LEN];
char outfile3[NAME_LEN];
char fftfile[NAME_LEN];
char fftfile_writedata1[NAME_LEN];
char fftfile_writedata2[NAME_LEN];
char fftfile2[NAME_LEN];
char fftfile3[NAME_LEN];
char logfile[NAME_LEN];
char extension[NAME_LEN];
char timingfile[NAME_LEN];
} out, outparam;
extern struct meta
{
uint Nmeasures;
uint Nvis;
uint Nweights;
uint freq_per_chan;
uint polarisations;
uint Ntimes;
double dt;
double thours;
uint baselines;
double uvmin;
double uvmax;
double wmin;
double wmax;
} metaData;
extern struct parameter
{
int num_threads;
int ndatasets;
char datapath_multi[NFILES][LONGNAME_LEN];
int grid_size_x;
int grid_size_y;
int num_w_planes;
int w_support;
int reduce_method;
} param;
extern struct fileData
{
double * uu;
double * vv;
double * ww;
float * weights;
float * visreal;
float * visimg;
}data;
extern char filename[LONGNAME_LEN], buf[NAME_LEN], num_buf[NAME_LEN];
extern char datapath[LONGNAME_LEN];
extern int xaxis, yaxis;
extern int rank;
extern int size;
extern uint nsectors;
extern uint startrow;
extern double_t resolution, dx, dw, w_supporth;
extern uint **sectorarray;
extern uint *histo_send;
extern int verbose_level;
extern uint size_of_grid;
extern double_t *grid_pointers, *grid, *gridss, *gridss_real, *gridss_img, *gridss_w, *grid_gpu, *gridss_gpu;
extern MPI_Comm MYMPI_COMM_WORLD;
extern MPI_Win slabwin;
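/* =====================================================================
 * New file in this commit (apparently gridding_rccl.hip.cpp, per the
 * Makefile hunks above): GPU gridding reduce using RCCL.
 * ===================================================================== */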
#include "allvars_rccl.hip.hpp"
#include "proto.h"
#include <hip/hip_runtime.h>
#include <rccl.h>
/*
 * Implements the gridding of data on the GPU
 * using the RCCL library (the ROCm port of NCCL)
 *
 */
#if defined( RCCL_REDUCE )
/*
#define NCCLCHECK(cmd) do {
ncclResult_t r = cmd;
if (r!= ncclSuccess) {
printf("Failed, NCCL error %s:%d '%s'\n",
__FILE__,__LINE__,ncclGetErrorString(r));
exit(EXIT_FAILURE);
}
} while(0)
*/
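/* The commented-out check above hints at the intended error handling. Below is
 * a minimal illustrative sketch (not part of the committed code; the macro
 * names RCCLCHECK/HIPCHECK are placeholders): RCCL exposes the same
 * ncclResult_t / ncclGetErrorString() API as NCCL, and HIP provides
 * hipGetErrorString(), so the unchecked calls in gridding_data() could be
 * wrapped as follows. */
#define RCCLCHECK(cmd) do {                                        \
          ncclResult_t r = (cmd);                                  \
          if (r != ncclSuccess) {                                  \
            printf("RCCL error %s:%d '%s'\n",                      \
                   __FILE__, __LINE__, ncclGetErrorString(r));     \
            exit(EXIT_FAILURE);                                    \
          }                                                        \
        } while(0)
#define HIPCHECK(cmd) do {                                         \
          hipError_t e = (cmd);                                    \
          if (e != hipSuccess) {                                   \
            printf("HIP error %s:%d '%s'\n",                       \
                   __FILE__, __LINE__, hipGetErrorString(e));      \
            exit(EXIT_FAILURE);                                    \
          }                                                        \
        } while(0)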
static uint64_t getHostHash(const char* string) {
// Based on DJB2a, result = result * 33 ^ char
uint64_t result = 5381;
for (int c = 0; string[c] != '\0'; c++){
result = ((result << 5) + result) ^ string[c];
}
return result;
}
static void getHostName(char* hostname, int maxlen) {
gethostname(hostname, maxlen);
for (int i=0; i< maxlen; i++) {
if (hostname[i] == '.') {
hostname[i] = '\0';
return;
}
}
}
void gridding_data(){
double shift = (double)(dx*yaxis);
timing_wt.kernel = 0.0;
timing_wt.reduce = 0.0;
timing_wt.reduce_mpi = 0.0;
timing_wt.reduce_sh = 0.0;
timing_wt.compose = 0.0;
// calculate the resolution in radians
resolution = 1.0/MAX(fabs(metaData.uvmin),fabs(metaData.uvmax));
// calculate the resolution in arcsec
double resolution_asec = (3600.0*180.0)/MAX(fabs(metaData.uvmin),fabs(metaData.uvmax))/PI;
if ( rank == 0 )
printf("RESOLUTION = %f rad, %f arcsec\n", resolution, resolution_asec);
// find the largest value in histo_send[]
//
//Initialize nccl
//double *gridss_gpu, *grid_gpu;
int local_rank = 0;
uint64_t hostHashs[size];
char hostname[1024];
getHostName(hostname, 1024);
hostHashs[rank] = getHostHash(hostname);
MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD);
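// The local (per-node) rank is the number of lower-ranked MPI tasks sharing
// this host's hash; it is used below by hipSetDevice() to pick the GPU.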
for (int p=0; p<size; p++) {
if (p == rank) break;
if (hostHashs[p] == hostHashs[rank]) local_rank++;
}
ncclUniqueId id;
ncclComm_t comm;
hipError_t nnn;
hipStream_t stream_reduce, stream_stacking;
if (rank == 0) ncclGetUniqueId(&id);
MPI_Bcast((void *)&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD);
hipSetDevice(local_rank);
int n = hipMalloc(&grid_gpu, 2*param.num_w_planes*xaxis*yaxis * sizeof(double));
n = hipMalloc(&gridss_gpu, 2*param.num_w_planes*xaxis*yaxis * sizeof(double));
n = hipStreamCreate(&stream_reduce);
n = hipStreamCreate(&stream_stacking);
ncclCommInitRank(&comm, size, id, rank);
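// A single RCCL communicator spans all MPI ranks: each sector is stacked on
// the local GPU (stream_stacking) and then reduced with ncclReduce onto the
// GPU of the rank owning that slab (stream_reduce).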
for (uint isector = 0; isector < nsectors; isector++)
{
double start = CPU_TIME_wt;
uint Nsec = histo_send[isector];
uint Nweightss = Nsec*metaData.polarisations;
uint Nvissec = Nweightss*metaData.freq_per_chan;
double_t *memory = (double*) malloc ( (Nsec*3)*sizeof(double_t) +
(Nvissec*2+Nweightss)*sizeof(float_t) );
if ( memory == NULL )
shutdown_wstacking(NOT_ENOUGH_MEM_STACKING, "Not enough memory for stacking", __FILE__, __LINE__);
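// Single allocation laid out as: uus, vvs, wws (Nsec doubles each), then
// weightss (Nweightss floats), visreals and visimgs (Nvissec floats each).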
double_t *uus = (double_t*) memory;
double_t *vvs = (double_t*) uus+Nsec;
double_t *wws = (double_t*) vvs+Nsec;
float_t *weightss = (float_t*)((double_t*)wws+Nsec);
float_t *visreals = (float_t*)weightss + Nweightss;
float_t *visimgs = (float_t*)visreals + Nvissec;
// select data for this sector
uint icount = 0;
uint ip = 0;
uint inu = 0;
#warning "this loop should be threaded"
#warning "the counter of this loop should not be int"
for(int iphi = histo_send[isector]-1; iphi>=0; iphi--)
{
uint ilocal = sectorarray[isector][iphi];
uus[icount] = data.uu[ilocal];
vvs[icount] = data.vv[ilocal]-isector*shift;
wws[icount] = data.ww[ilocal];
for (uint ipol=0; ipol<metaData.polarisations; ipol++)
{
weightss[ip] = data.weights[ilocal*metaData.polarisations+ipol];
ip++;
}
for (uint ifreq=0; ifreq<metaData.polarisations*metaData.freq_per_chan; ifreq++)
{
visreals[inu] = data.visreal[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq];
visimgs[inu] = data.visimg[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq];
inu++;
}
icount++;
}
double uumin = 1e20;
double vvmin = 1e20;
double uumax = -1e20;
double vvmax = -1e20;
#pragma omp parallel reduction( min: uumin, vvmin) reduction( max: uumax, vvmax) num_threads(param.num_threads)
{
double my_uumin = 1e20;
double my_vvmin = 1e20;
double my_uumax = -1e20;
double my_vvmax = -1e20;
#pragma omp for
for (uint ipart=0; ipart<Nsec; ipart++)
{
my_uumin = MIN(my_uumin, uus[ipart]);
my_uumax = MAX(my_uumax, uus[ipart]);
my_vvmin = MIN(my_vvmin, vvs[ipart]);
my_vvmax = MAX(my_vvmax, vvs[ipart]);
}
uumin = MIN( uumin, my_uumin );
uumax = MAX( uumax, my_uumax );
vvmin = MIN( vvmin, my_vvmin );
vvmax = MAX( vvmax, my_vvmax );
}
timing_wt.compose += CPU_TIME_wt - start;
//printf("UU, VV, min, max = %f %f %f %f\n", uumin, uumax, vvmin, vvmax);
// Make convolution on the grid
#ifdef VERBOSE
printf("Processing sector %ld\n",isector);
#endif
start = CPU_TIME_wt;
//We have to call different GPUs per MPI task!!! [GL]
#ifdef HIPCC
wstack(param.num_w_planes,
Nsec,
metaData.freq_per_chan,
metaData.polarisations,
uus,
vvs,
wws,
visreals,
visimgs,
weightss,
dx,
dw,
param.w_support,
xaxis,
yaxis,
gridss_gpu,
param.num_threads,
rank,
stream_stacking);
#else
wstack(param.num_w_planes,
Nsec,
metaData.freq_per_chan,
metaData.polarisations,
uus,
vvs,
wws,
visreals,
visimgs,
weightss,
dx,
dw,
param.w_support,
xaxis,
yaxis,
gridss,
param.num_threads,
rank);
#endif
//Allocate memory on devices non-blocking for the host
///////////////////////////////////////////////////////
timing_wt.kernel += CPU_TIME_wt - start;
#ifdef VERBOSE
printf("Processed sector %ld\n",isector);
#endif
if( size > 1 )
{
// Write grid in the corresponding remote slab
// int target_rank = (int)isector; it implied that size >= nsectors
int target_rank = (int)(isector % size);
start = CPU_TIME_wt;
ncclReduce(gridss_gpu, grid_gpu, size_of_grid, ncclDouble, ncclSum, target_rank, comm, stream_reduce);
n = hipStreamSynchronize(stream_reduce);
timing_wt.reduce += CPU_TIME_wt - start;
// Go to next sector
nnn = hipMemset( gridss_gpu, 0.0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) );
if (nnn != hipSuccess) {printf("!!! gridding_rccl.hip.cpp hipMemset ERROR %d !!!\n", nnn);}
}
free(memory);
}
//Copy data back from device to host (to be deleted in next steps)
n = hipMemcpyAsync(grid, grid_gpu, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), hipMemcpyDeviceToHost, stream_reduce);
MPI_Barrier(MPI_COMM_WORLD);
n = hipStreamSynchronize(stream_reduce);
n = hipFree(grid_gpu);
n = hipFree(gridss_gpu);
n = hipStreamDestroy(stream_reduce);
n = hipStreamDestroy(stream_stacking);
ncclCommDestroy(comm);
return;
}
#endif
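/* =====================================================================
 * New file in this commit (apparently phase_correction.hip.cpp): HIP port
 * of the phase-correction step.
 * ===================================================================== */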
#ifdef _OPENMP
#include <omp.h>
#endif
#if !defined(ACCOMP)
#include "w-stacking.hip.hpp"
#else
#include "w-stacking_omp.h"
#endif
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include "errcodes.h"
#include "proto.h"
#ifdef __HIPCC__
__global__ void phase_g(int xaxis,
int yaxis,
int num_w_planes,
double * gridss,
double * image_real,
double * image_imag,
double wmin,
double dw,
double dwnorm,
int xaxistot,
int yaxistot,
double resolution,
int nbucket)
{
long gid = blockIdx.x*blockDim.x + threadIdx.x;
double add_term_real;
double add_term_img;
double wterm;
long arraysize = (long)((xaxis*yaxis*num_w_planes)/nbucket);
if(gid < arraysize)
{
long gid_aux = nbucket*gid;
for(int iaux=0; iaux<nbucket; iaux++)
{
int iw = gid_aux/(xaxis*yaxis);
int ivaux = gid_aux%(xaxis*yaxis);
int iv = ivaux/xaxis;
int iu = ivaux%xaxis;
long index = 2*gid_aux;
long img_index = iu+iv*xaxis;
wterm = wmin + iw*dw;
#ifdef PHASE_ON
if (num_w_planes > 1)
{
double xcoord = (double)(iu-xaxistot/2);
if(xcoord < 0.0)xcoord = (double)(iu+xaxistot/2);
xcoord = sin(xcoord*resolution);
double ycoord = (double)(iv-yaxistot/2);
if(ycoord < 0.0)ycoord = (double)(iv+yaxistot/2);
ycoord = sin(ycoord*resolution);
double preal, pimag;
double radius2 = (xcoord*xcoord+ycoord*ycoord);
preal = cos(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
pimag = sin(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
double p,q,r,s;
p = gridss[index];
q = gridss[index+1];
r = preal;
s = pimag;
//printf("%d %d %d %ld %ld\n",iu,iv,iw,index,img_index);
add_term_real = (p*r-q*s)*dwnorm*sqrt(1-radius2);
add_term_img = (p*s+q*r)*dwnorm*sqrt(1-radius2);
atomicAdd(&(image_real[img_index]),add_term_real);
atomicAdd(&(image_imag[img_index]),add_term_img);
} else {
atomicAdd(&(image_real[img_index]),gridss[index]);
atomicAdd(&(image_imag[img_index]),gridss[index+1]);
}
#else
atomicAdd(&(image_real[img_index]),gridss[index]);
atomicAdd(&(image_imag[img_index]),gridss[index+1]);
#endif // end of PHASE_ON
gid_aux++;
}
}
}
#endif
void phase_correction(double* gridss, double* image_real, double* image_imag, int xaxis, int yaxis, int num_w_planes, int xaxistot, int yaxistot,
double resolution, double wmin, double wmax, int num_threads, int rank)
{
double dw = (wmax-wmin)/(double)num_w_planes;
double wterm = wmin+0.5*dw;
double dwnorm = dw/(wmax-wmin);
#ifdef HIPCC
// WARNING: nbucket MUST be chosen such that xaxis*yaxis*num_w_planes is a multiple of nbucket
int nbucket = 1;
int Nth = NTHREADS;
long Nbl = (long)((num_w_planes*xaxis*yaxis)/Nth/nbucket) + 1;
if(NWORKERS == 1) {Nbl = 1; Nth = 1;};
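// Launch geometry: Nbl blocks of Nth threads cover the num_w_planes*xaxis*yaxis
// cells of gridss, each thread folding nbucket consecutive cells into the image.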
int ndevices;
int m = hipGetDeviceCount(&ndevices);
m = hipSetDevice(rank % ndevices);
if ( rank == 0 ) {
if (0 == ndevices) {
shutdown_wstacking(NO_ACCELERATORS_FOUND, "No accelerators found", __FILE__, __LINE__ );
}
}
printf("Running rank %d using GPU %d\n", rank, rank % ndevices);
#ifdef NVIDIA
prtAccelInfo();
#endif
hipError_t mmm;
double * image_real_g;
double * image_imag_g;
double * gridss_g;
//Copy gridss on the device
mmm=hipMalloc(&gridss_g, 2*num_w_planes*xaxis*yaxis*sizeof(double));
mmm=hipMemcpy(gridss_g, gridss, 2*num_w_planes*xaxis*yaxis*sizeof(double), hipMemcpyHostToDevice);
mmm=hipMalloc(&image_real_g, xaxis*yaxis*sizeof(double));
//printf("HIP ERROR 2 %s\n",hipGetErrorString(mmm));
mmm=hipMalloc(&image_imag_g, xaxis*yaxis*sizeof(double));
//printf("HIP ERROR 3 %s\n",hipGetErrorString(mmm));
//printf("HIP ERROR 4 %s\n",hipGetErrorString(mmm));
mmm=hipMemset(image_real_g, 0.0, xaxis*yaxis*sizeof(double));
//printf("HIP ERROR 5 %s\n",hipGetErrorString(mmm));
mmm=hipMemset(image_imag_g, 0.0, xaxis*yaxis*sizeof(double));
//printf("HIP ERROR 6 %s\n",hipGetErrorString(mmm));
// call the phase correction kernel
phase_g <<<Nbl,Nth>>> (xaxis,
yaxis,
num_w_planes,
gridss_g,
image_real_g,
image_imag_g,
wmin,
dw,
dwnorm,
xaxistot,
yaxistot,
resolution,
nbucket);
mmm = hipMemcpy(image_real, image_real_g, xaxis*yaxis*sizeof(double), hipMemcpyDeviceToHost);
//printf("HIP ERROR 7 %s\n",hipGetErrorString(mmm));
mmm = hipMemcpy(image_imag, image_imag_g, xaxis*yaxis*sizeof(double), hipMemcpyDeviceToHost);
//printf("HIP ERROR 8 %s\n",hipGetErrorString(mmm));
mmm= hipFree(gridss_g);
#else
#ifndef ACCOMP
#ifdef _OPENMP
omp_set_num_threads(num_threads);
#endif
#pragma omp parallel for collapse(2) private(wterm)
for (int iw=0; iw<num_w_planes; iw++)
{
for (int iv=0; iv<yaxis; iv++)
for (int iu=0; iu<xaxis; iu++)
{
unsigned int index = 2*(iu+iv*xaxis+xaxis*yaxis*iw);
unsigned int img_index = iu+iv*xaxis;
wterm = wmin + iw*dw;
#ifdef PHASE_ON
if (num_w_planes > 1)
{
double xcoord = (double)(iu-xaxistot/2);
if(xcoord < 0.0)xcoord = (double)(iu+xaxistot/2);
xcoord = sin(xcoord*resolution);
double ycoord = (double)(iv-yaxistot/2);
if(ycoord < 0.0)ycoord = (double)(iv+yaxistot/2);
ycoord = sin(ycoord*resolution);
double preal, pimag;
double radius2 = (xcoord*xcoord+ycoord*ycoord);
if(xcoord <= 1.0)
{
preal = cos(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
pimag = sin(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
} else {
preal = cos(-2.0*PI*wterm*(sqrt(radius2-1.0)-1));
pimag = 0.0;
}
preal = cos(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
pimag = sin(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
double p,q,r,s;
p = gridss[index];
q = gridss[index+1];
r = preal;
s = pimag;
//printf("%d %d %d %ld %ld\n",iu,iv,iw,index,img_index);
#pragma omp atomic
image_real[img_index] += (p*r-q*s)*dwnorm*sqrt(1-radius2);
#pragma omp atomic
image_imag[img_index] += (p*s+q*r)*dwnorm*sqrt(1-radius2);
} else {
#pragma omp atomic
image_real[img_index] += gridss[index];
#pragma omp atomic
image_imag[img_index] += gridss[index+1];
}
#else
#pragma omp atomic
image_real[img_index] += gridss[index];
#pragma omp atomic
image_imag[img_index] += gridss[index+1];
#endif // end of PHASE_ON
}
}
#else
omp_set_default_device(rank % omp_get_num_devices());
#if !defined(__clang__)
#pragma omp target teams distribute parallel for collapse(2) simd private(wterm) map(to:gridss[0:2*num_w_planes*xaxis*yaxis]) map(from:image_real[0:xaxis*yaxis]) map(from:image_imag[0:xaxis*yaxis])
#else
#pragma omp target teams distribute parallel for collapse(2) private(wterm) map(to:gridss[0:2*num_w_planes*xaxis*yaxis]) map(from:image_real[0:xaxis*yaxis]) map(from:image_imag[0:xaxis*yaxis])
#endif
for (int iw=0; iw<num_w_planes; iw++)
{
for (int iv=0; iv<yaxis; iv++)
for (int iu=0; iu<xaxis; iu++)
{
long index = 2*(iu+iv*xaxis+xaxis*yaxis*iw);
long img_index = iu+iv*xaxis;
wterm = wmin + iw*dw;
#ifdef PHASE_ON
if (num_w_planes > 1)
{
double xcoord = (double)(iu-xaxistot/2);
if(xcoord < 0.0)xcoord = (double)(iu+xaxistot/2);
xcoord = sin(xcoord*resolution);
double ycoord = (double)(iv-yaxistot/2);
if(ycoord < 0.0)ycoord = (double)(iv+yaxistot/2);
ycoord = sin(ycoord*resolution);
double preal, pimag;
double radius2 = (xcoord*xcoord+ycoord*ycoord);
if(xcoord <= 1.0)
{
preal = cos(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
pimag = sin(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
} else {
preal = cos(-2.0*PI*wterm*(sqrt(radius2-1.0)-1));
pimag = 0.0;
}
preal = cos(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
pimag = sin(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
double p,q,r,s;
p = gridss[index];
q = gridss[index+1];
r = preal;
s = pimag;
//printf("%d %d %d %ld %ld\n",iu,iv,iw,index,img_index);
#pragma omp atomic
image_real[img_index] += (p*r-q*s)*dwnorm*sqrt(1-radius2);
#pragma omp atomic
image_imag[img_index] += (p*s+q*r)*dwnorm*sqrt(1-radius2);
} else {
#pragma omp atomic
image_real[img_index] += gridss[index];
#pragma omp atomic
image_imag[img_index] += gridss[index+1];
}
#else
#pragma omp atomic
image_real[img_index] += gridss[index];
#pragma omp atomic
image_imag[img_index] += gridss[index+1];
#endif // end of PHASE_ON
}
}
#endif
#endif // end of HIPCC
}
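/* =====================================================================
 * New file in this commit (apparently w-stacking.hip.cpp): HIP port of
 * the w-stacking gridding kernels.
 * ===================================================================== */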
#ifdef _OPENMP
#include <omp.h>
#endif
#include "w-stacking.hip.hpp"
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#ifdef __HIPCC__
#include "allvars_nccl.hip.hpp"
#endif
#include "proto.h"
#ifdef ACCOMP
#pragma omp declare target
#endif
#ifdef __HIPCC__
double __device__
#else
double
#endif
// Gaussian Kernel
gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist)
{
double conv_weight;
conv_weight = norm * exp(-((u_dist*u_dist)+(v_dist*v_dist))*std22);
return conv_weight;
}
void makeGaussKernel(double * kernel,
int KernelLen,
int increaseprecision,
double std22)
{
double norm = std22/PI;
int n = increaseprecision*KernelLen, mid = n / 2;
for (int i = 0; i != mid + 1; i++) {
double term = (double)i/(double)increaseprecision;
kernel[mid + i] = sqrt(norm) * exp(-(term*term)*std22);
}
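// Each 1-D sample stores sqrt(norm)*exp(-x^2*std22), so the product of a
// u-sample and a v-sample in the gridding loops reproduces
// gauss_kernel_norm(norm, std22, u, v) = norm*exp(-(u^2+v^2)*std22).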
for (int i = 0; i != mid; i++) kernel[i] = kernel[n - 1 - i];
// for (int i = 0; i < n; i++) printf("%f\n",kernel[i]);
}
// Kaiser-Bessel Kernel: it is adapted from WSClean
double bessel0(double x, double precision) {
// Calculate I_0 = SUM over m from 0 -> inf of [ (x/2)^(2m) / (m!)^2 ]
// This is the unnormalized bessel function of order 0.
double d = 0.0, ds = 1.0, sum = 1.0;
do {
d += 2.0;
ds *= x * x / (d * d);
sum += ds;
} while (ds > sum * precision);
return sum;
}
void makeKaiserBesselKernel(double * kernel,
int KernelLen,
int increaseprecision,
double alpha,
double overSamplingFactor,
int withSinc) {
int n = increaseprecision*KernelLen, mid = n / 2;
double * sincKernel = (double*)malloc((mid + 1) * sizeof(*sincKernel));
const double filterRatio = 1.0 / overSamplingFactor;
sincKernel[0] = filterRatio;
for (int i = 1; i != mid + 1; i++) {
double x = i;
sincKernel[i] =
withSinc ? (sin(PI * filterRatio * x) / (PI * x)) : filterRatio;
}
const double normFactor = overSamplingFactor / bessel0(alpha, 1e-8);
for (int i = 0; i != mid + 1; i++) {
double term = (double)i / mid;
kernel[mid + i] = sincKernel[i] *
bessel0(alpha * sqrt(1.0 - (term * term)), 1e-8) *
normFactor;
}
for (int i = 0; i != mid; i++) kernel[i] = kernel[n - 1 - i];
//for (int i = 0; i < n; i++) printf("%f\n",kernel[i]);
}
#ifdef ACCOMP
#pragma omp end declare target
#endif
#ifdef __HIPCC__
//double __device__ gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist)
//{
// double conv_weight;
// conv_weight = norm * exp(-((u_dist*u_dist)+(v_dist*v_dist))*std22);
// return conv_weight;
//}
__global__ void convolve_g(
int num_w_planes,
uint num_points,
uint freq_per_chan,
uint polarizations,
double* uu,
double* vv,
double* ww,
float* vis_real,
float* vis_img,
float* weight,
double dx,
double dw,
int KernelLen,
int grid_size_x,
int grid_size_y,
double* grid,
#if defined(GAUSS_HI_PRECISION)
double std22
#else
double std22,
double* convkernel
#endif
)
{
//printf("DENTRO AL KERNEL\n");
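// One thread per visibility: locate its (u,v,w) cell and scatter the
// convolution-weighted real/imaginary contributions into the surrounding
// kernel-support window of the grid with atomicAdd.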
uint gid = blockIdx.x*blockDim.x + threadIdx.x;
if(gid < num_points)
{
uint i = gid;
uint visindex = i*freq_per_chan*polarizations;
double norm = std22/PI;
int j, k;
/* Convert UV coordinates to grid coordinates. */
double pos_u = uu[i] / dx;
double pos_v = vv[i] / dx;
double ww_i = ww[i] / dw;
int grid_w = (int)ww_i;
int grid_u = (int)pos_u;
int grid_v = (int)pos_v;
// check the boundaries
uint jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0;
uint jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1;
uint kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0;
uint kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1;
// Convolve this point onto the grid.
for (k = kmin; k <= kmax; k++)
{
double v_dist = (double)k - pos_v;
int increaseprecision = 5;
for (j = jmin; j <= jmax; j++)
{
double u_dist = (double)j+0.5 - pos_u;
uint iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y);
int jKer = (int)(increaseprecision * (fabs(u_dist+(double)KernelLen)));
int kKer = (int)(increaseprecision * (fabs(v_dist+(double)KernelLen)));
#ifdef GAUSS_HI_PRECISION
double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist);
#endif
#ifdef GAUSS
double conv_weight = convkernel[jKer]*convkernel[kKer];
#endif
#ifdef KAISERBESSEL
double conv_weight = convkernel[jKer]*convkernel[kKer];
#endif
// Loops over frequencies and polarizations
double add_term_real = 0.0;
double add_term_img = 0.0;
uint ifine = visindex;
for (uint ifreq=0; ifreq<freq_per_chan; ifreq++)
{
uint iweight = visindex/freq_per_chan;
for (uint ipol=0; ipol<polarizations; ipol++)
{
double vistest = (double)vis_real[ifine];
if (!isnan(vistest))
{
add_term_real += weight[iweight] * vis_real[ifine] * conv_weight;
add_term_img += weight[iweight] * vis_img[ifine] * conv_weight;
}
ifine++;
iweight++;
}
}
atomicAdd(&(grid[iKer]),add_term_real);
atomicAdd(&(grid[iKer+1]),add_term_img);
}
}
}
}
#endif
#ifdef ACCOMP
#pragma omp declare target
#endif
void wstack(
int num_w_planes,
uint num_points,
uint freq_per_chan,
uint polarizations,
double* uu,
double* vv,
double* ww,
float* vis_real,
float* vis_img,
float* weight,
double dx,
double dw,
int w_support,
int grid_size_x,
int grid_size_y,
double* grid,
int num_threads,
#ifdef HIPCC
int rank,
hipStream_t stream_stacking
#else
int rank
#endif
)
{
uint i;
//uint index;
uint visindex;
// initialize the convolution kernel
// gaussian:
int KernelLen = (w_support-1)/2;
int increaseprecision = 5; // this number must be odd: increaseprecision*w_support must be odd (w_support must be odd)
double std = 1.0;
double std22 = 1.0/(2.0*std*std);
double norm = std22/PI;
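// std22 = 1/(2*sigma^2); norm = std22/PI = 1/(2*pi*sigma^2), the normalisation
// of the 2-D Gaussian exp(-(u^2+v^2)/(2*sigma^2)).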
double * convkernel = (double*)malloc(increaseprecision*w_support*sizeof(*convkernel));
#ifdef GAUSS
makeGaussKernel(convkernel,w_support,increaseprecision,std22);
#endif
#ifdef KAISERBESSEL
double overSamplingFactor = 1.0;
int withSinc = 0;
double alpha = 8.6;
makeKaiserBesselKernel(convkernel, w_support, increaseprecision, alpha, overSamplingFactor, withSinc);
#endif
// Loop over visibilities.
// Switch between HIP and CPU versions
#ifdef __HIPCC__
// Define the HIP set up
int Nth = NTHREADS;
uint Nbl = (uint)(num_points/Nth) + 1;
if(NWORKERS == 1) {Nbl = 1; Nth = 1;};
uint Nvis = num_points*freq_per_chan*polarizations;
int ndevices;
int num = hipGetDeviceCount(&ndevices);
int res = hipSetDevice(rank % ndevices);
if ( rank == 0 ) {
if (0 == ndevices) {
shutdown_wstacking(NO_ACCELERATORS_FOUND, "No accelerators found", __FILE__, __LINE__ );
}
}
#ifdef NVIDIA
prtAccelInfo();
#endif
// Create GPU arrays and offload them
double * uu_g;
double * vv_g;
double * ww_g;
float * vis_real_g;
float * vis_img_g;
float * weight_g;
//double * grid_g;
double * convkernel_g;
//Create the event inside stream stacking
//hipEvent_t event_kernel;
//for (int i=0; i<100000; i++)grid[i]=23.0;
hipError_t mmm;
//mmm=hipEventCreate(&event_kernel);
mmm=hipMalloc(&uu_g,num_points*sizeof(double));
mmm=hipMalloc(&vv_g,num_points*sizeof(double));
mmm=hipMalloc(&ww_g,num_points*sizeof(double));
mmm=hipMalloc(&vis_real_g,Nvis*sizeof(float));
mmm=hipMalloc(&vis_img_g,Nvis*sizeof(float));
mmm=hipMalloc(&weight_g,(Nvis/freq_per_chan)*sizeof(float));
//mmm=hipMalloc(&grid_g,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double));
#if !defined(GAUSS_HI_PRECISION)
mmm=hipMalloc(&convkernel_g,increaseprecision*w_support*sizeof(double));
#endif
if (mmm != hipSuccess) {printf("!!! w-stacking.hip.cpp hipMalloc ERROR %d !!!\n", mmm);}
//mmm=hipMemset(grid_g,0.0,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double));
if (mmm != hipSuccess) {printf("!!! w-stacking.hip.cpp hipMemset ERROR %d !!!\n", mmm);}
mmm=hipMemcpyAsync(uu_g, uu, num_points*sizeof(double), hipMemcpyHostToDevice, stream_stacking);
mmm=hipMemcpyAsync(vv_g, vv, num_points*sizeof(double), hipMemcpyHostToDevice, stream_stacking);
mmm=hipMemcpyAsync(ww_g, ww, num_points*sizeof(double), hipMemcpyHostToDevice, stream_stacking);
mmm=hipMemcpyAsync(vis_real_g, vis_real, Nvis*sizeof(float), hipMemcpyHostToDevice, stream_stacking);
mmm=hipMemcpyAsync(vis_img_g, vis_img, Nvis*sizeof(float), hipMemcpyHostToDevice, stream_stacking);
mmm=hipMemcpyAsync(weight_g, weight, (Nvis/freq_per_chan)*sizeof(float), hipMemcpyHostToDevice, stream_stacking);
#if !defined(GAUSS_HI_PRECISION)
mmm=hipMemcpyAsync(convkernel_g, convkernel, increaseprecision*w_support*sizeof(double), hipMemcpyHostToDevice, stream_stacking);
#endif
if (mmm != hipSuccess) {printf("!!! w-stacking.hip.cpp hipMemcpyAsync ERROR %d !!!\n", mmm);}
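// All host-to-device copies are enqueued on stream_stacking, so the kernel
// launched below on the same stream is ordered after them without an explicit
// synchronisation.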
// Call main GPU Kernel
#if defined(GAUSS_HI_PRECISION)
convolve_g <<<Nbl,Nth,0,stream_stacking>>> (
num_w_planes,
num_points,
freq_per_chan,
polarizations,
uu_g,
vv_g,
ww_g,
vis_real_g,
vis_img_g,
weight_g,
dx,
dw,
KernelLen,
grid_size_x,
grid_size_y,
grid,
std22
);
#else
convolve_g <<<Nbl,Nth,0,stream_stacking>>> (
num_w_planes,
num_points,
freq_per_chan,
polarizations,
uu_g,
vv_g,
ww_g,
vis_real_g,
vis_img_g,
weight_g,
dx,
dw,
KernelLen,
grid_size_x,
grid_size_y,
grid,
std22,
convkernel_g
);
#endif
mmm=hipStreamSynchronize(stream_stacking);
//Record the event
//mmm=hipEventRecord(event_kernel,stream_stacking);
//Wait until the kernel ends
//mmm=hipStreamWaitEvent(stream_stacking,event_kernel);
//for (int i=0; i<100000; i++)printf("%f\n",grid[i]);
if (mmm != hipSuccess)
printf("HIP ERROR %s\n",hipGetErrorString(mmm));
mmm=hipFree(uu_g);
mmm=hipFree(vv_g);
mmm=hipFree(ww_g);
mmm=hipFree(vis_real_g);
mmm=hipFree(vis_img_g);
mmm=hipFree(weight_g);
//mmm=hipFree(grid_g);
#if !defined(GAUSS_HI_PRECISION)
mmm=hipFree(convkernel_g);
#endif
// Switch between HIP and CPU versions
# else
#ifdef _OPENMP
omp_set_num_threads(num_threads);
#endif
#if defined(ACCOMP) && (GPU_STACKING)
omp_set_default_device(rank % omp_get_num_devices());
uint Nvis = num_points*freq_per_chan*polarizations;
#pragma omp target teams distribute parallel for private(visindex) map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan]) map(tofrom:grid[0:2*num_w_planes*grid_size_x*grid_size_y])
#else
#pragma omp parallel for private(visindex)
#endif
for (i = 0; i < num_points; i++)
{
#ifdef _OPENMP
//int tid;
//tid = omp_get_thread_num();
//printf("%d\n",tid);
#endif
visindex = i*freq_per_chan*polarizations;
double sum = 0.0;
int j, k;
//if (i%1000 == 0)printf("%ld\n",i);
/* Convert UV coordinates to grid coordinates. */
double pos_u = uu[i] / dx;
double pos_v = vv[i] / dx;
double ww_i = ww[i] / dw;
int grid_w = (int)ww_i;
int grid_u = (int)pos_u;
int grid_v = (int)pos_v;
// check the boundaries
uint jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0;
uint jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1;
uint kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0;
uint kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1;
//printf("%d, %ld, %ld, %d, %ld, %ld\n",grid_u,jmin,jmax,grid_v,kmin,kmax);
// Convolve this point onto the grid.
for (k = kmin; k <= kmax; k++)
{
//double v_dist = (double)k+0.5 - pos_v;
double v_dist = (double)k - pos_v;
for (j = jmin; j <= jmax; j++)
{
//double u_dist = (double)j+0.5 - pos_u;
double u_dist = (double)j - pos_u;
uint iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y);
int jKer = (int)(increaseprecision * (fabs(u_dist+(double)KernelLen)));
int kKer = (int)(increaseprecision * (fabs(v_dist+(double)KernelLen)));
#ifdef GAUSS_HI_PRECISION
double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist);
#endif
#ifdef GAUSS
double conv_weight = convkernel[jKer]*convkernel[kKer];
//if(jKer < 0 || jKer >= 35 || kKer < 0 || kKer >= 35)
// printf("%f %d %f %d\n",fabs(u_dist+(double)KernelLen),jKer,fabs(v_dist+(double)KernelLen),kKer);
//printf("%d %d %d %d %f %f %f %f %f\n",jKer, j, kKer, k, pos_u, pos_v, u_dist,v_dist,conv_weight);
#endif
#ifdef KAISERBESSEL
double conv_weight = convkernel[jKer]*convkernel[kKer];
#endif
// Loops over frequencies and polarizations
double add_term_real = 0.0;
double add_term_img = 0.0;
uint ifine = visindex;
// DAV: the following two loops are performend by each thread separately: no problems of race conditions
for (uint ifreq=0; ifreq<freq_per_chan; ifreq++)
{
uint iweight = visindex/freq_per_chan;
for (uint ipol=0; ipol<polarizations; ipol++)
{
if (!isnan(vis_real[ifine]))
{
//printf("%f %ld\n",weight[iweight],iweight);
add_term_real += weight[iweight] * vis_real[ifine] * conv_weight;
add_term_img += weight[iweight] * vis_img[ifine] * conv_weight;
//if(vis_img[ifine]>1e10 || vis_img[ifine]<-1e10)printf("%f %f %f %f %ld %ld\n",vis_real[ifine],vis_img[ifine],weight[iweight],conv_weight,ifine,num_points*freq_per_chan*polarizations);
}
ifine++;
iweight++;
}
}
// DAV: this is the critical call in terms of correctness of the results and of performance
#pragma omp atomic
grid[iKer] += add_term_real;
#pragma omp atomic
grid[iKer+1] += add_term_img;
}
}
}
#if defined(ACCOMP) && (GPU_STACKING)
#pragma omp target exit data map(delete:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan], grid[0:2*num_w_planes*grid_size_x*grid_size_y])
#endif
// End switch between HIP and CPU versions
#endif
//for (int i=0; i<100000; i++)printf("%f\n",grid[i]);
}
#ifdef ACCOMP
#pragma omp end declare target
#endif
int test(int nnn)
{
int mmm;
mmm = nnn+1;
return mmm;
}
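/* =====================================================================
 * New file in this commit (apparently w-stacking.hip.hpp): header with the
 * HIP-aware prototypes of wstack(), phase_correction() and cuda_fft().
 * ===================================================================== */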
#ifndef W_PROJECT_H_
#define W_PROJECT_H_
#define NWORKERS -1 //100
#define NTHREADS 64 //AMD WARP SIZE
#define PI 3.14159265359
#define REAL_TYPE double
#include <mpi.h>
#include <hip/hip_runtime.h>
#ifdef __HIPCC__
extern "C"
#endif
#ifdef __cplusplus
extern "C" {
void wstack(
int,
unsigned int,
unsigned int,
unsigned int,
double*,
double*,
double*,
float*,
float*,
float*,
double,
double,
int,
int,
int,
double*,
int,
int,
hipStream_t);
}
#else
void wstack(
int,
unsigned int,
unsigned int,
unsigned int,
double*,
double*,
double*,
float*,
float*,
float*,
double,
double,
int,
int,
int,
double*,
int,
int);
#endif
#ifdef __HIPCC__
extern "C"
#endif
int test(int nnn);
#ifdef __HIPCC__
extern "C"
#endif
void phase_correction(
double*,
double*,
double*,
int,
int,
int,
int,
int,
double,
double,
double,
int,
int);
double gauss_kernel_norm(
double norm,
double std22,
double u_dist,
double v_dist);
#ifdef ACCOMP
#pragma omp declare target (gauss_kernel_norm)
#endif
#ifdef __HIPCC__
extern "C"
#endif
void cuda_fft(
int,
int,
int,
int,
int,
double*,
double*,
int,
MPI_Comm);
#endif