diff --git a/Build/Makefile.leo b/Build/Makefile.leo
index ce350a18f3de19fa7b3f91c9238bfd7d6f0a3d1a..2f19b24f5d6c5d177261cb52bb46e5a143874425 100644
--- a/Build/Makefile.leo
+++ b/Build/Makefile.leo
@@ -9,8 +9,14 @@ OPT_PURE_MPI = -O4 -march=native -mavx -mavx2
 
 OMP_GPU = -mp=multicore,gpu -gpu=cuda11.8 -gpu=cc80
 
-CUDA_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/cuda/11.8/include
-CUDA_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/cuda/11.8/lib64 -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/cuda/11.8/targets/x86_64-linux/lib/stubs
+###CUDA_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/cuda/11.8/include
+###CUDA_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/cuda/11.8/lib64 -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/cuda/11.8/targets/x86_64-linux/lib/stubs
+
+
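+# CUDA include/library paths for the NVHPC 23.11 / CUDA 12.3 toolchain (Spack install on Leonardo)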
+CUDA_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.11-tgvw3c2exrfgdvn5qdw3rybzd3dbkkim/Linux_x86_64/23.11/cuda/12.3/include
+CUDA_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.11-tgvw3c2exrfgdvn5qdw3rybzd3dbkkim/Linux_x86_64/23.11/cuda/12.3/lib -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.11-tgvw3c2exrfgdvn5qdw3rybzd3dbkkim/Linux_x86_64/23.11/cuda/12.3/targets/x86_64-linux/lib/stubs
+
+
 
 FFTW_INCL=
 FFTW_LIB=
@@ -19,18 +25,33 @@ FFTW_LIB=
 ##########################################################
 #NVIDIA CUFFTMP
 
-CUFFTMP_LIB  = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/math_libs/11.8/lib64
-CUFFTMP_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/math_libs/11.8/include/cufftmp
+###CUFFTMP_LIB  = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/math_libs/11.8/lib64
+###CUFFTMP_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/math_libs/11.8/include/cufftmp
+
+CUFFTMP_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.11-tgvw3c2exrfgdvn5qdw3rybzd3dbkkim/Linux_x86_64/23.11/math_libs/12.3/include/cufftmp
+CUFFTMP_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.11-tgvw3c2exrfgdvn5qdw3rybzd3dbkkim/Linux_x86_64/23.11/math_libs/12.3/lib64
+
+
 ##########################################################
 
-NVSHMEM_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nvshmem_cufftmp_compat/include/
-NVSHMEM_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nvshmem_cufftmp_compat/lib/
+###NVSHMEM_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nvshmem_cufftmp_compat/include/
+###NVSHMEM_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nvshmem_cufftmp_compat/lib/
+
+NVSHMEM_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.11-tgvw3c2exrfgdvn5qdw3rybzd3dbkkim/Linux_x86_64/23.11/comm_libs/12.3/nvshmem/include
+NVSHMEM_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.11-tgvw3c2exrfgdvn5qdw3rybzd3dbkkim/Linux_x86_64/23.11/comm_libs/12.3/nvshmem/lib
+
 
 ##########################################################
 #NVIDIA NCCL REDUCE
 
-NCCL_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nccl/include
-NCCL_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nccl/lib
+###NCCL_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nccl/include
+###NCCL_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.5-pdmwq3k5perrhdqyrv2hspv4poqrb2dr/Linux_x86_64/23.5/comm_libs/11.8/nccl/lib
+
+
+NCCL_INC = -I/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.11-tgvw3c2exrfgdvn5qdw3rybzd3dbkkim/Linux_x86_64/23.11/comm_libs/12.3/nccl/include
+NCCL_LIB = -L/leonardo/prod/spack/03/install/0.19/linux-rhel8-icelake/gcc-11.3.0/nvhpc-23.11-tgvw3c2exrfgdvn5qdw3rybzd3dbkkim/Linux_x86_64/23.11/comm_libs/12.3/nccl/lib
+
+
 ##########################################################
 
 NVC = nvc 
diff --git a/allvars.h b/allvars.h
index 5ee68b1374734496aeb08814bb9729737205ecd2..88e062e0cf96e9777acf0ef06c53b039492121a7 100755
--- a/allvars.h
+++ b/allvars.h
@@ -175,4 +175,3 @@ extern double_t *grid_pointers, *grid, *gridss, *gridss_real, *gridss_img, *grid
 
 extern MPI_Comm MYMPI_COMM_WORLD;
 extern MPI_Win  slabwin;
-
diff --git a/allvars_nccl.h b/allvars_nccl.h
index b67b03295fb2c04b1e7f995634447d09552f8e40..d102d6835b548d11e3569e340cedd3321a115039 100755
--- a/allvars_nccl.h
+++ b/allvars_nccl.h
@@ -173,4 +173,3 @@ extern double_t *grid_pointers, *grid, *gridss, *gridss_real, *gridss_img, *grid
 
 extern MPI_Comm MYMPI_COMM_WORLD;
 extern MPI_Win  slabwin;
-
diff --git a/cuda_fft.cu b/cuda_fft.cu
index 8efae0a334105942caecba4c556eb4afdff442f8..3a28d999365d823faa6efcc8179e3ebc49555f00 100755
--- a/cuda_fft.cu
+++ b/cuda_fft.cu
@@ -89,7 +89,10 @@ void cuda_fft(
 
   
   // Alloco fftwgrid su GPU utilizzando cudaMalloc
-  mmm=cudaMalloc(&fftwgrid, sizeof(cufftDoubleComplex)*yaxis*xaxis);
+
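+  // Keep the element count in 64-bit (unsigned long long) so xaxis*yaxis cannot overflow a 32-bit int when computing the allocation size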
+  long long unsigned size_finta_fft = (long long unsigned)((long long unsigned)xaxis*(long long unsigned)yaxis);
+
+  mmm=cudaMalloc(&fftwgrid, (size_t)(size_finta_fft*sizeof(cufftDoubleComplex)));
   if (mmm != cudaSuccess) {printf("!!! cuda_fft.cu cudaMalloc ERROR %d !!!\n", mmm);}
 
   int Nth = 32;
@@ -149,7 +152,7 @@ void cuda_fft(
       cudaStreamSynchronize(stream);
 
       //Copy the array to be transformed onto the descriptor structure array 
-      mmm = cudaMemcpy(fftwgrid_g->descriptor->data[0], fftwgrid, xaxis*yaxis*sizeof(cufftDoubleComplex), cudaMemcpyDeviceToDevice);
+      mmm = cudaMemcpy(fftwgrid_g->descriptor->data[0], fftwgrid, (size_t)(size_finta_fft*sizeof(cufftDoubleComplex)), cudaMemcpyDeviceToDevice);
       if (mmm != cudaSuccess) {printf("!!! cudaMemcpy 1 ERROR %d !!!\n", mmm);}
 
       //Perform the FFT
@@ -166,7 +169,7 @@ void cuda_fft(
       if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMemcpy dtd fftwgrid ERROR %d !!!\n", status);}
 
       //Copy the result descriptor structure array again onto the original fftwgrid
-      mmm = cudaMemcpy(fftwgrid, fftwgrid_g2->descriptor->data[0], xaxis*yaxis*sizeof(cufftDoubleComplex), cudaMemcpyDeviceToDevice);
+      mmm = cudaMemcpy(fftwgrid, fftwgrid_g2->descriptor->data[0], (size_t)(size_finta_fft*sizeof(cufftDoubleComplex)), cudaMemcpyDeviceToDevice);
       if (mmm != cudaSuccess) {printf("!!! cudaMemcpy 2 ERROR %d !!!\n", mmm);}
 
       //Write gridss starting from fftwgrid
diff --git a/gridding_nccl.cu b/gridding_nccl.cu
index b10cb5cc513a0e8f40b514060b722349d7f935a9..36b1d2cd4322b12c77efc4d59ac90ee144b4440f 100755
--- a/gridding_nccl.cu
+++ b/gridding_nccl.cu
@@ -95,10 +95,12 @@ void gridding_data(){
 
   cudaSetDevice(local_rank);
 
-  nnn = cudaMalloc(&grid_gpu, 2*param.num_w_planes*xaxis*yaxis * sizeof(double));
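+  // Size of the grid_gpu / gridss_gpu buffers as a 64-bit element count (2 doubles per w-plane cell), avoiding 32-bit int overflow for large grids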
+  long long unsigned size_finta = (long long unsigned)(2*(long long unsigned)param.num_w_planes*(long long unsigned)xaxis*(long long unsigned)yaxis); 
+  
+  nnn = cudaMalloc(&grid_gpu, (size_t)(size_finta*sizeof(double)));
   if (nnn != cudaSuccess) {printf("!!! gridding_nccl.cu cudaMalloc &grid_gpu ERROR %d !!!\n", nnn);}
 
-  nnn = cudaMalloc(&gridss_gpu, 2*param.num_w_planes*xaxis*yaxis * sizeof(double));
+  nnn = cudaMalloc(&gridss_gpu, (size_t)(size_finta*sizeof(double)));
   if (nnn != cudaSuccess) {printf("!!! gridding_nccl.cu cudaMalloc &gridss_gpu ERROR %d !!!\n", nnn);}
   
   nnn = cudaStreamCreate(&stream_reduce);
@@ -202,27 +204,27 @@ void gridding_data(){
       start = CPU_TIME_wt;
 	    
      //We have to call different GPUs per MPI task!!! [GL]
-     #ifdef CUDACC
-      wstack(param.num_w_planes,
-	     Nsec,
-	     metaData.freq_per_chan,
-	     metaData.polarisations,
-	     uus,
-	     vvs,
-	     wws,
-	     visreals,
-	     visimgs,
-	     weightss,
-	     dx,
-	     dw,
-	     param.w_support,
-	     xaxis,
-	     yaxis,
-	     gridss_gpu,
-	     param.num_threads,
-	     rank,
-	     stream_stacking);
-      #else
+#ifdef CUDACC
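+      // w-plane count and grid axes are cast to long long unsigned to match the 64-bit wstack signature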
+      wstack((long long unsigned)param.num_w_planes,
+             Nsec,
+             metaData.freq_per_chan,
+             metaData.polarisations,
+             uus,
+             vvs,
+             wws,
+             visreals,
+             visimgs,
+             weightss,
+             dx,
+             dw,
+             param.w_support,
+             (long long unsigned)xaxis,
+             (long long unsigned)yaxis,
+             gridss_gpu,
+             param.num_threads,
+             rank,
+             stream_stacking);
+#else
       wstack(param.num_w_planes,
 	     Nsec,
 	     metaData.freq_per_chan,
@@ -241,7 +243,7 @@ void gridding_data(){
 	     gridss,
 	     param.num_threads,
 	     rank);
-      #endif
+#endif
       //Allocate memory on devices non-blocking for the host                                                                                   
       ///////////////////////////////////////////////////////
 
@@ -269,7 +271,7 @@ void gridding_data(){
 	  timing_wt.reduce += CPU_TIME_wt - start;
 
 	  // Go to next sector
-	  nnn = cudaMemset( gridss_gpu, 0.0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) );
+	  nnn = cudaMemset( gridss_gpu, 0.0, (size_t)(size_finta*sizeof(double)) );
 	  if (nnn != cudaSuccess) {printf("!!! gridding_nccl.cu cudaMemset ERROR %d !!!\n", nnn);}
 	}
 
@@ -282,7 +284,7 @@ void gridding_data(){
   //cudaMemcpyAsync(grid, grid_gpu, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), cudaMemcpyDeviceToHost, stream_reduce);
   
 #if !defined(CUFFTMP)
-  cudaMemcpyAsync(grid, grid_gpu, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), cudaMemcpyDeviceToHost, stream_reduce);
+  cudaMemcpyAsync(grid, grid_gpu, (size_t)(size_finta*sizeof(double)), cudaMemcpyDeviceToHost, stream_reduce);
   cudaStreamSynchronize(stream_reduce);
 #endif
 
diff --git a/phase_correction.cu b/phase_correction.cu
index 5ed1345a90294e2646984a5e67628fb0a5d05b96..9dcee918de696e9f0e30349e78d6da175078833e 100755
--- a/phase_correction.cu
+++ b/phase_correction.cu
@@ -30,65 +30,65 @@ __global__ void phase_g(int xaxis,
 			double resolution,
 			int nbucket)
 {
-	long gid = blockIdx.x*blockDim.x + threadIdx.x;
-	double add_term_real;
-	double add_term_img;
-	double wterm;
-	long arraysize = (long)((xaxis*yaxis*num_w_planes)/nbucket);
-
-	if(gid < arraysize)
+  long gid = blockIdx.x*blockDim.x + threadIdx.x;
+  double add_term_real;
+  double add_term_img;
+  double wterm;
+  long arraysize = (long)((xaxis*yaxis*num_w_planes)/nbucket);
+
+  if(gid < arraysize)
+    {
+      long gid_aux = nbucket*gid;
+      for(int iaux=0; iaux<nbucket; iaux++) 
 	{
-	  long gid_aux = nbucket*gid;
-	  for(int iaux=0; iaux<nbucket; iaux++) 
-          {
-		int iw = gid_aux/(xaxis*yaxis);
-		int ivaux = gid_aux%(xaxis*yaxis);
-		int iv = ivaux/xaxis;
-		int iu = ivaux%xaxis;
-		long index = 2*gid_aux;
-		long img_index = iu+iv*xaxis;
+	  int iw = gid_aux/(xaxis*yaxis);
+	  int ivaux = gid_aux%(xaxis*yaxis);
+	  int iv = ivaux/xaxis;
+	  int iu = ivaux%xaxis;
+	  long index = 2*gid_aux;
+	  long img_index = iu+iv*xaxis;
 
-                wterm = wmin + iw*dw;
+	  wterm = wmin + iw*dw;
 
 #ifdef PHASE_ON
-                if (num_w_planes > 1)
-                {
-                    double xcoord = (double)(iu-xaxistot/2);
-                    if(xcoord < 0.0)xcoord = (double)(iu+xaxistot/2);
-                    xcoord = sin(xcoord*resolution);
-                    double ycoord = (double)(iv-yaxistot/2);
-                    if(ycoord < 0.0)ycoord = (double)(iv+yaxistot/2);
-                    ycoord = sin(ycoord*resolution);
-
-                    double preal, pimag;
-                    double radius2 = (xcoord*xcoord+ycoord*ycoord);
-
-                    preal = cos(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
-                    pimag = sin(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
-
-                    double p,q,r,s;
-                    p = gridss[index];
-                    q = gridss[index+1];
-                    r = preal;
-                    s = pimag;
-
-                    //printf("%d %d %d %ld %ld\n",iu,iv,iw,index,img_index);
-
-		    add_term_real = (p*r-q*s)*dwnorm*sqrt(1-radius2);
-		    add_term_img = (p*s+q*r)*dwnorm*sqrt(1-radius2);
-		    atomicAdd(&(image_real[img_index]),add_term_real);
-		    atomicAdd(&(image_imag[img_index]),add_term_img);
-                } else {
-		    atomicAdd(&(image_real[img_index]),gridss[index]);
-		    atomicAdd(&(image_imag[img_index]),gridss[index+1]);
-                }
+	  if (num_w_planes > 1)
+	    {
+	      double xcoord = (double)(iu-xaxistot/2);
+	      if(xcoord < 0.0)xcoord = (double)(iu+xaxistot/2);
+	      xcoord = sin(xcoord*resolution);
+	      double ycoord = (double)(iv-yaxistot/2);
+	      if(ycoord < 0.0)ycoord = (double)(iv+yaxistot/2);
+	      ycoord = sin(ycoord*resolution);
+
+	      double preal, pimag;
+	      double radius2 = (xcoord*xcoord+ycoord*ycoord);
+
+	      preal = cos(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
+	      pimag = sin(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
+
+	      double p,q,r,s;
+	      p = gridss[index];
+	      q = gridss[index+1];
+	      r = preal;
+	      s = pimag;
+
+	      //printf("%d %d %d %ld %ld\n",iu,iv,iw,index,img_index);
+
+	      add_term_real = (p*r-q*s)*dwnorm*sqrt(1-radius2);
+	      add_term_img = (p*s+q*r)*dwnorm*sqrt(1-radius2);
+	      atomicAdd(&(image_real[img_index]),add_term_real);
+	      atomicAdd(&(image_imag[img_index]),add_term_img);
+	    } else {
+	    atomicAdd(&(image_real[img_index]),gridss[index]);
+	    atomicAdd(&(image_imag[img_index]),gridss[index+1]);
+	  }
 #else
-		atomicAdd(&(image_real[img_index]),gridss[index]);
-		atomicAdd(&(image_imag[img_index]),gridss[index+1]);
+	  atomicAdd(&(image_real[img_index]),gridss[index]);
+	  atomicAdd(&(image_imag[img_index]),gridss[index+1]);
 #endif // end of PHASE_ON
-		gid_aux++;
-           }
+	  gid_aux++;
 	}
+    }
 
 }
 
@@ -97,227 +97,229 @@ __global__ void phase_g(int xaxis,
 void phase_correction(double* gridss, double* image_real, double* image_imag, int xaxis, int yaxis, int num_w_planes, int xaxistot, int yaxistot,
 		      double resolution, double wmin, double wmax, int num_threads, int rank)
 {
-        double dw = (wmax-wmin)/(double)num_w_planes;
-	double wterm = wmin+0.5*dw;
-	double dwnorm = dw/(wmax-wmin);
-
+  double dw = (wmax-wmin)/(double)num_w_planes;
+  double wterm = wmin+0.5*dw;
+  double dwnorm = dw/(wmax-wmin);
+  
 #ifdef CUDACC
-
-	// WARNING: nbucket MUST be chosen such that xaxis*yaxis*num_w_planes is a multiple of nbucket
-	int nbucket = 1;
-	int Nth = NTHREADS;
-        long Nbl = (long)((num_w_planes*xaxis*yaxis)/Nth/nbucket) + 1;
-        if(NWORKERS == 1) {Nbl = 1; Nth = 1;};
-
-	int ndevices;
-	cudaGetDeviceCount(&ndevices);
-	cudaSetDevice(rank % ndevices);
-
-	if ( rank == 0 ) {
-	  if (0 == ndevices) {
-
-	    shutdown_wstacking(NO_ACCELERATORS_FOUND, "No accelerators found", __FILE__, __LINE__ );
-	  }
-
-	}
-	  printf("Running rank %d using GPU %d\n", rank, rank % ndevices);
-	 #ifdef NVIDIA
-	  prtAccelInfo();
-	 #endif
-
-	cudaError_t mmm;
-	double * image_real_g;
-	double * image_imag_g;
-
+  
+  // WARNING: nbucket MUST be chosen such that xaxis*yaxis*num_w_planes is a multiple of nbucket
+  int nbucket = 1;
+  int Nth = NTHREADS;
+  long Nbl = (long)((num_w_planes*xaxis*yaxis)/Nth/nbucket) + 1;
+  if(NWORKERS == 1) {Nbl = 1; Nth = 1;};
+  
+  int ndevices;
+  cudaGetDeviceCount(&ndevices);
+  cudaSetDevice(rank % ndevices);
+  
+  if ( rank == 0 ) {
+    if (0 == ndevices) {
+      
+      shutdown_wstacking(NO_ACCELERATORS_FOUND, "No accelerators found", __FILE__, __LINE__ );
+    }
+    
+  }
+  printf("Running rank %d using GPU %d\n", rank, rank % ndevices);
+#ifdef NVIDIA
+  prtAccelInfo();
+#endif
+  
+  cudaError_t mmm;
+  double * image_real_g;
+  double * image_imag_g;
+  
 #if !defined(CUFFTMP)
-        double * gridss_g;
-
-	mmm = cudaMalloc(&gridss_g, 2*num_w_planes*xaxis*yaxis*sizeof(double));
-        mmm = cudaMemcpy(gridss_g, gridss, 2*num_w_planes*xaxis*yaxis*sizeof(double), cudaMemcpyHostToDevice);
+  double * gridss_g;
+  
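+  // 64-bit element count for the gridss device buffer and host-to-device copy, preventing int overflow when 2*num_w_planes*xaxis*yaxis is large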
+  long long unsigned size_finta = (long long unsigned)(2*(long long unsigned)num_w_planes*(long long unsigned)xaxis*(long long unsigned)yaxis);
+  
+  mmm = cudaMalloc(&gridss_g, (size_t)(size_finta*sizeof(double)));
+  mmm = cudaMemcpy(gridss_g, gridss, (size_t)(size_finta*sizeof(double)), cudaMemcpyHostToDevice);
 #endif
-	
-	mmm=cudaMalloc(&image_real_g, xaxis*yaxis*sizeof(double));
-	//printf("CUDA ERROR 2 %s\n",cudaGetErrorString(mmm));
-	mmm=cudaMalloc(&image_imag_g, xaxis*yaxis*sizeof(double));
-	//printf("CUDA ERROR 3 %s\n",cudaGetErrorString(mmm));
-
-	//printf("CUDA ERROR 4 %s\n",cudaGetErrorString(mmm));
-	mmm=cudaMemset(image_real_g, 0.0, xaxis*yaxis*sizeof(double));
-	//printf("CUDA ERROR 5 %s\n",cudaGetErrorString(mmm));
-	mmm=cudaMemset(image_imag_g, 0.0, xaxis*yaxis*sizeof(double));
-	//printf("CUDA ERROR 6 %s\n",cudaGetErrorString(mmm));
-
-	// call the phase correction kernel
-	phase_g <<<Nbl,Nth>>> (xaxis,
-                               yaxis,
-			       num_w_planes,
+		   
+  mmm=cudaMalloc(&image_real_g, xaxis*yaxis*sizeof(double));
+  //printf("CUDA ERROR 2 %s\n",cudaGetErrorString(mmm));
+  mmm=cudaMalloc(&image_imag_g, xaxis*yaxis*sizeof(double));
+  //printf("CUDA ERROR 3 %s\n",cudaGetErrorString(mmm));
+  
+  //printf("CUDA ERROR 4 %s\n",cudaGetErrorString(mmm));
+  mmm=cudaMemset(image_real_g, 0.0, xaxis*yaxis*sizeof(double));
+  //printf("CUDA ERROR 5 %s\n",cudaGetErrorString(mmm));
+  mmm=cudaMemset(image_imag_g, 0.0, xaxis*yaxis*sizeof(double));
+  //printf("CUDA ERROR 6 %s\n",cudaGetErrorString(mmm));
+		   
+  // call the phase correction kernel
+  phase_g <<<Nbl,Nth>>> (xaxis,
+			 yaxis,
+			 num_w_planes,
 #if defined(CUFFTMP)
-                               gridss,
+			 gridss,
 #else
-			       gridss_g,
+			 gridss_g,
 #endif
-                               image_real_g,
-                               image_imag_g,
-                               wmin,
-                               dw,
-                               dwnorm,
-                               xaxistot,
-                               yaxistot,
-                               resolution,
-			       nbucket);
-
-	mmm = cudaMemcpy(image_real, image_real_g, xaxis*yaxis*sizeof(double), cudaMemcpyDeviceToHost);
-	//printf("CUDA ERROR 7 %s\n",cudaGetErrorString(mmm));
-	mmm = cudaMemcpy(image_imag, image_imag_g, xaxis*yaxis*sizeof(double), cudaMemcpyDeviceToHost);
-	//printf("CUDA ERROR 8 %s\n",cudaGetErrorString(mmm));
-
+			 image_real_g,
+			 image_imag_g,
+			 wmin,
+			 dw,
+			 dwnorm,
+			 xaxistot,
+			 yaxistot,
+			 resolution,
+			 nbucket);
+		   
+  mmm = cudaMemcpy(image_real, image_real_g, xaxis*yaxis*sizeof(double), cudaMemcpyDeviceToHost);
+  //printf("CUDA ERROR 7 %s\n",cudaGetErrorString(mmm));
+  mmm = cudaMemcpy(image_imag, image_imag_g, xaxis*yaxis*sizeof(double), cudaMemcpyDeviceToHost);
+  //printf("CUDA ERROR 8 %s\n",cudaGetErrorString(mmm));
+		   
 #if !defined(CUFFTMP)
-	cudaFree(gridss_g);
+  cudaFree(gridss_g);
 #else
-	cudaFree(gridss);
+  cudaFree(gridss);
 #endif
-
-
+		   
+		   
 #else
-
+		   
 #ifndef ACCOMP
-	
+		   
 #ifdef _OPENMP
-	omp_set_num_threads(num_threads);
+  omp_set_num_threads(num_threads);
 #endif
-	
-       #pragma omp parallel for collapse(2) private(wterm) 
-	for (int iw=0; iw<num_w_planes; iw++)
-	{
-	    for (int iv=0; iv<yaxis; iv++)
-            for (int iu=0; iu<xaxis; iu++)
-            {
-
-		unsigned int index = 2*(iu+iv*xaxis+xaxis*yaxis*iw);
-		unsigned int img_index = iu+iv*xaxis;
-		wterm = wmin + iw*dw;
+		   
+#pragma omp parallel for collapse(2) private(wterm) 
+  for (int iw=0; iw<num_w_planes; iw++)
+    {
+      for (int iv=0; iv<yaxis; iv++)
+	for (int iu=0; iu<xaxis; iu++)
+	  {
+			     
+	    unsigned int index = 2*(iu+iv*xaxis+xaxis*yaxis*iw);
+	    unsigned int img_index = iu+iv*xaxis;
+	    wterm = wmin + iw*dw;
 #ifdef PHASE_ON
-                if (num_w_planes > 1)
-		{
-                    double xcoord = (double)(iu-xaxistot/2);
-                    if(xcoord < 0.0)xcoord = (double)(iu+xaxistot/2);
-		    xcoord = sin(xcoord*resolution);
-                    double ycoord = (double)(iv-yaxistot/2);
-                    if(ycoord < 0.0)ycoord = (double)(iv+yaxistot/2);
-		    ycoord = sin(ycoord*resolution);
-
-		    double preal, pimag;
-		    double radius2 = (xcoord*xcoord+ycoord*ycoord);
-		    if(xcoord <= 1.0)
-		    {
-			    preal = cos(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
-			    pimag = sin(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
-		    } else {
-			    preal = cos(-2.0*PI*wterm*(sqrt(radius2-1.0)-1));
-			    pimag = 0.0;
-		    }
-
+	    if (num_w_planes > 1)
+	      {
+		double xcoord = (double)(iu-xaxistot/2);
+		if(xcoord < 0.0)xcoord = (double)(iu+xaxistot/2);
+		xcoord = sin(xcoord*resolution);
+		double ycoord = (double)(iv-yaxistot/2);
+		if(ycoord < 0.0)ycoord = (double)(iv+yaxistot/2);
+		ycoord = sin(ycoord*resolution);
+				 
+		double preal, pimag;
+		double radius2 = (xcoord*xcoord+ycoord*ycoord);
+		if(xcoord <= 1.0)
+		  {
 		    preal = cos(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
 		    pimag = sin(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
-
-		    double p,q,r,s;
-		    p = gridss[index];
-		    q = gridss[index+1];
-		    r = preal;
-		    s = pimag;
-
-		    //printf("%d %d %d %ld %ld\n",iu,iv,iw,index,img_index);
-		   #pragma omp atomic
-		    image_real[img_index] += (p*r-q*s)*dwnorm*sqrt(1-radius2);
-		   #pragma omp atomic
-		    image_imag[img_index] += (p*s+q*r)*dwnorm*sqrt(1-radius2);
-	        } else {
-		 #pragma omp atomic
-		  image_real[img_index] += gridss[index];
-		 #pragma omp atomic
-		  image_imag[img_index] += gridss[index+1];
+		  } else {
+		  preal = cos(-2.0*PI*wterm*(sqrt(radius2-1.0)-1));
+		  pimag = 0.0;
 		}
+				 
+		preal = cos(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
+		pimag = sin(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
+
+		double p,q,r,s;
+		p = gridss[index];
+		q = gridss[index+1];
+		r = preal;
+		s = pimag;
+
+		//printf("%d %d %d %ld %ld\n",iu,iv,iw,index,img_index);
+#pragma omp atomic
+		image_real[img_index] += (p*r-q*s)*dwnorm*sqrt(1-radius2);
+#pragma omp atomic
+		image_imag[img_index] += (p*s+q*r)*dwnorm*sqrt(1-radius2);
+	      } else {
+#pragma omp atomic
+	      image_real[img_index] += gridss[index];
+#pragma omp atomic
+	      image_imag[img_index] += gridss[index+1];
+	    }
 #else
-	       #pragma omp atomic
-  	        image_real[img_index] += gridss[index];
-	       #pragma omp atomic
-		image_imag[img_index] += gridss[index+1];
+#pragma omp atomic
+	    image_real[img_index] += gridss[index];
+#pragma omp atomic
+	    image_imag[img_index] += gridss[index+1];
 #endif // end of PHASE_ON
 
-            }
-	}
+	  }
+    }
 
 #else
-	omp_set_default_device(rank % omp_get_num_devices());
+  omp_set_default_device(rank % omp_get_num_devices());
 	
-       #if !defined(__clang__)
+#if !defined(__clang__)
 
-       #pragma omp target teams distribute parallel for collapse(2) simd private(wterm) map(to:gridss[0:2*num_w_planes*xaxis*yaxis]) map(from:image_real[0:xaxis*yaxis]) map(from:image_imag[0:xaxis*yaxis])
+#pragma omp target teams distribute parallel for collapse(2) simd private(wterm) map(to:gridss[0:2*num_w_planes*xaxis*yaxis]) map(from:image_real[0:xaxis*yaxis]) map(from:image_imag[0:xaxis*yaxis])
 
-       #else
+#else
 
-       #pragma omp target teams distribute parallel for collapse(2) private(wterm) map(to:gridss[0:2*num_w_planes*xaxis*yaxis]) map(from:image_real[0:xaxis*yaxis]) map(from:image_imag[0:xaxis*yaxis])
-       #endif
+#pragma omp target teams distribute parallel for collapse(2) private(wterm) map(to:gridss[0:2*num_w_planes*xaxis*yaxis]) map(from:image_real[0:xaxis*yaxis]) map(from:image_imag[0:xaxis*yaxis])
+#endif
 	
-	for (int iw=0; iw<num_w_planes; iw++)
-	{
-	    for (int iv=0; iv<yaxis; iv++)
-            for (int iu=0; iu<xaxis; iu++)
-            {
-
-		long index = 2*(iu+iv*xaxis+xaxis*yaxis*iw);
-		long img_index = iu+iv*xaxis;
-		wterm = wmin + iw*dw;
+  for (int iw=0; iw<num_w_planes; iw++)
+    {
+      for (int iv=0; iv<yaxis; iv++)
+	for (int iu=0; iu<xaxis; iu++)
+	  {
+
+	    long index = 2*(iu+iv*xaxis+xaxis*yaxis*iw);
+	    long img_index = iu+iv*xaxis;
+	    wterm = wmin + iw*dw;
 #ifdef PHASE_ON
-                if (num_w_planes > 1)
-		{
-                    double xcoord = (double)(iu-xaxistot/2);
-                    if(xcoord < 0.0)xcoord = (double)(iu+xaxistot/2);
-		    xcoord = sin(xcoord*resolution);
-                    double ycoord = (double)(iv-yaxistot/2);
-                    if(ycoord < 0.0)ycoord = (double)(iv+yaxistot/2);
-		    ycoord = sin(ycoord*resolution);
-
-		    double preal, pimag;
-		    double radius2 = (xcoord*xcoord+ycoord*ycoord);
-		    if(xcoord <= 1.0)
-		    {
-			    preal = cos(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
-			    pimag = sin(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
-		    } else {
-			    preal = cos(-2.0*PI*wterm*(sqrt(radius2-1.0)-1));
-			    pimag = 0.0;
-		    }
-
+	    if (num_w_planes > 1)
+	      {
+		double xcoord = (double)(iu-xaxistot/2);
+		if(xcoord < 0.0)xcoord = (double)(iu+xaxistot/2);
+		xcoord = sin(xcoord*resolution);
+		double ycoord = (double)(iv-yaxistot/2);
+		if(ycoord < 0.0)ycoord = (double)(iv+yaxistot/2);
+		ycoord = sin(ycoord*resolution);
+
+		double preal, pimag;
+		double radius2 = (xcoord*xcoord+ycoord*ycoord);
+		if(xcoord <= 1.0)
+		  {
 		    preal = cos(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
 		    pimag = sin(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
-
-		    double p,q,r,s;
-		    p = gridss[index];
-		    q = gridss[index+1];
-		    r = preal;
-		    s = pimag;
-
-		    //printf("%d %d %d %ld %ld\n",iu,iv,iw,index,img_index);
-		    #pragma omp atomic
-		    image_real[img_index] += (p*r-q*s)*dwnorm*sqrt(1-radius2);
-		    #pragma omp atomic
-		    image_imag[img_index] += (p*s+q*r)*dwnorm*sqrt(1-radius2);
-	        } else {
-		    #pragma omp atomic
-		    image_real[img_index] += gridss[index];
-		    #pragma omp atomic
-		    image_imag[img_index] += gridss[index+1];
+		  } else {
+		  preal = cos(-2.0*PI*wterm*(sqrt(radius2-1.0)-1));
+		  pimag = 0.0;
 		}
+
+		preal = cos(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
+		pimag = sin(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
+
+		double p,q,r,s;
+		p = gridss[index];
+		q = gridss[index+1];
+		r = preal;
+		s = pimag;
+
+		//printf("%d %d %d %ld %ld\n",iu,iv,iw,index,img_index);
+#pragma omp atomic
+		image_real[img_index] += (p*r-q*s)*dwnorm*sqrt(1-radius2);
+#pragma omp atomic
+		image_imag[img_index] += (p*s+q*r)*dwnorm*sqrt(1-radius2);
+	      } else {
+#pragma omp atomic
+	      image_real[img_index] += gridss[index];
+#pragma omp atomic
+	      image_imag[img_index] += gridss[index+1];
+	    }
 #else
-		#pragma omp atomic
-  	        image_real[img_index] += gridss[index];
-		#pragma omp atomic
-		image_imag[img_index] += gridss[index+1];
+#pragma omp atomic
+	    image_real[img_index] += gridss[index];
+#pragma omp atomic
+	    image_imag[img_index] += gridss[index+1];
 #endif // end of PHASE_ON
 
-            }
-	}
+	  }
+    }
 	
 #endif	
 #endif // end of __CUDACC__
diff --git a/w-stacking.cu b/w-stacking.cu
index 22136113a182c58491db811739376bdb567d1f76..f52889a37fd597ba08328a00c9c75ad7c5efd13c 100755
--- a/w-stacking.cu
+++ b/w-stacking.cu
@@ -209,329 +209,328 @@ __global__ void convolve_g(
 #pragma  omp declare target
 #endif
 void wstack(
-     int num_w_planes,
-     uint num_points,
-     uint freq_per_chan,
-     uint polarizations,
-     double* uu,
-     double* vv,
-     double* ww,
-     float* vis_real,
-     float* vis_img,
-     float* weight,
-     double dx,
-     double dw,
-     int w_support,
-     int grid_size_x,
-     int grid_size_y,
-     double* grid,
-     int num_threads,
-    #ifdef CUDACC
-     int rank,
-     cudaStream_t stream_stacking
-    #else
-     int rank
-    #endif
-	    )
-	    
+	    long long unsigned num_w_planes,
+	    uint num_points,
+	    uint freq_per_chan,
+	    uint polarizations,
+	    double* uu,
+	    double* vv,
+	    double* ww,
+	    float* vis_real,
+	    float* vis_img,
+	    float* weight,
+	    double dx,
+	    double dw,
+	    int w_support,
+	    long long unsigned grid_size_x,
+	    long long unsigned grid_size_y,
+	    double* grid,
+	    int num_threads,
+#ifdef NCCL_REDUCE
+	    int rank,
+	    cudaStream_t stream_stacking
+#else
+	    int rank
+#endif
+            )
 {
-    uint i;
-    //uint index;
-    uint visindex;
-
-    // initialize the convolution kernel
-    // gaussian:
-    int KernelLen = (w_support-1)/2;
-    int increaseprecision = 5; // this number must be odd: increaseprecison*w_support must be odd (w_support must be odd)
-    double std = 1.0;
-    double std22 = 1.0/(2.0*std*std);
-    double norm = std22/PI;
-    double * convkernel = (double*)malloc(increaseprecision*w_support*sizeof(*convkernel));
-
-    #ifdef GAUSS
-    makeGaussKernel(convkernel,w_support,increaseprecision,std22);
-    #endif
-    #ifdef KAISERBESSEL
-    double overSamplingFactor = 1.0;
-    int withSinc = 0;
-    double alpha = 8.6;
-    makeKaiserBesselKernel(convkernel, w_support, increaseprecision, alpha, overSamplingFactor, withSinc);
-    #endif
-
-
-    // Loop over visibilities.
-// Switch between CUDA and GPU versions
-   #ifdef __CUDACC__
-    // Define the CUDA set up
-    int Nth = NTHREADS;
-    uint Nbl = (uint)(num_points/Nth) + 1;
-    if(NWORKERS == 1) {Nbl = 1; Nth = 1;};
-    uint Nvis = num_points*freq_per_chan*polarizations;
-
-    int ndevices;
-    cudaGetDeviceCount(&ndevices);
-    cudaSetDevice(rank % ndevices);
-
-    if ( rank == 0 ) {
-      if (0 == ndevices) {
-
-	shutdown_wstacking(NO_ACCELERATORS_FOUND, "No accelerators found", __FILE__, __LINE__ );
-      }
+  uint i;
+  //uint index;
+  uint visindex;
+  
+  // initialize the convolution kernel
+  // gaussian:
+  int KernelLen = (w_support-1)/2;
+  int increaseprecision = 5; // this number must be odd: increaseprecision*w_support must be odd (w_support must be odd)
+  double std = 1.0;
+  double std22 = 1.0/(2.0*std*std);
+  double norm = std22/PI;
+  double * convkernel = (double*)malloc(increaseprecision*w_support*sizeof(*convkernel));
+  
+#ifdef GAUSS
+  makeGaussKernel(convkernel,w_support,increaseprecision,std22);
+#endif
+#ifdef KAISERBESSEL
+  double overSamplingFactor = 1.0;
+  int withSinc = 0;
+  double alpha = 8.6;
+  makeKaiserBesselKernel(convkernel, w_support, increaseprecision, alpha, overSamplingFactor, withSinc);
+#endif
+  
+  
+  // Loop over visibilities.
+  // Switch between CUDA and GPU versions
+#ifdef __CUDACC__
+  // Define the CUDA set up
+  int Nth = NTHREADS;
+  uint Nbl = (uint)(num_points/Nth) + 1;
+  if(NWORKERS == 1) {Nbl = 1; Nth = 1;};
+  uint Nvis = num_points*freq_per_chan*polarizations;
+  
+  int ndevices;
+  cudaGetDeviceCount(&ndevices);
+  cudaSetDevice(rank % ndevices);
+  
+  if ( rank == 0 ) {
+    if (0 == ndevices) {
+      
+      shutdown_wstacking(NO_ACCELERATORS_FOUND, "No accelerators found", __FILE__, __LINE__ );
     }
-
-     #ifdef NVIDIA
-      prtAccelInfo();
-     #endif
-
-    // Create GPU arrays and offload them
-    double * uu_g;
-    double * vv_g;
-    double * ww_g;
-    float * vis_real_g;
-    float * vis_img_g;
-    float * weight_g;
-    double * convkernel_g;
+  }
+  
+#ifdef NVIDIA
+  prtAccelInfo();
+#endif
+  
+  // Create GPU arrays and offload them
+  double * uu_g;
+  double * vv_g;
+  double * ww_g;
+  float * vis_real_g;
+  float * vis_img_g;
+  float * weight_g;
+  double * convkernel_g;
 #if !defined(NCCL_REDUCE)
-    double * grid_g;
+  double * grid_g;
 #endif
 #if !defined(NCCL_REDUCE)
-    cudaStream_t stream_stacking;
-    cudaStreamCreate(&stream_stacking);
+  cudaStream_t stream_stacking;
+  cudaStreamCreate(&stream_stacking);
 #endif
-    
-    //Create the event inside stream stacking
-    //cudaEvent_t event_kernel;
-    
-    //for (int i=0; i<100000; i++)grid[i]=23.0;
-    cudaError_t mmm;
-    //mmm=cudaEventCreate(&event_kernel);
-    mmm=cudaMalloc(&uu_g,num_points*sizeof(double));
-    mmm=cudaMalloc(&vv_g,num_points*sizeof(double));
-    mmm=cudaMalloc(&ww_g,num_points*sizeof(double));
-    mmm=cudaMalloc(&vis_real_g,Nvis*sizeof(float));
-    mmm=cudaMalloc(&vis_img_g,Nvis*sizeof(float));
-    mmm=cudaMalloc(&weight_g,(Nvis/freq_per_chan)*sizeof(float));
-    //mmm=cudaMalloc(&grid_g,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double));
+  
+  //Create the event inside stream stacking
+  //cudaEvent_t event_kernel;
+  
+  //for (int i=0; i<100000; i++)grid[i]=23.0;
+  cudaError_t mmm;
+  //mmm=cudaEventCreate(&event_kernel);
+  mmm=cudaMalloc(&uu_g,num_points*sizeof(double));
+  mmm=cudaMalloc(&vv_g,num_points*sizeof(double));
+  mmm=cudaMalloc(&ww_g,num_points*sizeof(double));
+  mmm=cudaMalloc(&vis_real_g,Nvis*sizeof(float));
+  mmm=cudaMalloc(&vis_img_g,Nvis*sizeof(float));
+  mmm=cudaMalloc(&weight_g,(Nvis/freq_per_chan)*sizeof(float));
+  //mmm=cudaMalloc(&grid_g,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double));
 
 #if !defined(NCCL_REDUCE)
-    mmm = cudaMalloc(&grid_g,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double));
+  mmm = cudaMalloc(&grid_g,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double));
 #endif
 
 #if !defined(GAUSS_HI_PRECISION)
-    mmm=cudaMalloc(&convkernel_g,increaseprecision*w_support*sizeof(double));
+  mmm=cudaMalloc(&convkernel_g,increaseprecision*w_support*sizeof(double));
 #endif    
-    if (mmm != cudaSuccess) {printf("!!! w-stacking.cu cudaMalloc ERROR %d !!!\n", mmm);}
+  if (mmm != cudaSuccess) {printf("!!! w-stacking.cu cudaMalloc ERROR %d !!!\n", mmm);}
     
 #if !defined(NCCL_REDUCE)
-    mmm=cudaMemset(grid_g,0.0,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double));
-    if (mmm != cudaSuccess) {printf("!!! w-stacking.cu cudaMemset ERROR %d !!!\n", mmm);}
+  mmm=cudaMemset(grid_g,0.0,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double));
+  if (mmm != cudaSuccess) {printf("!!! w-stacking.cu cudaMemset ERROR %d !!!\n", mmm);}
 #endif
     
-    mmm=cudaMemcpyAsync(uu_g, uu, num_points*sizeof(double), cudaMemcpyHostToDevice, stream_stacking);
-    mmm=cudaMemcpyAsync(vv_g, vv, num_points*sizeof(double), cudaMemcpyHostToDevice, stream_stacking);
-    mmm=cudaMemcpyAsync(ww_g, ww, num_points*sizeof(double), cudaMemcpyHostToDevice, stream_stacking);
-    mmm=cudaMemcpyAsync(vis_real_g, vis_real, Nvis*sizeof(float), cudaMemcpyHostToDevice, stream_stacking);
-    mmm=cudaMemcpyAsync(vis_img_g, vis_img, Nvis*sizeof(float), cudaMemcpyHostToDevice, stream_stacking);
-    mmm=cudaMemcpyAsync(weight_g, weight, (Nvis/freq_per_chan)*sizeof(float), cudaMemcpyHostToDevice, stream_stacking);
+  mmm=cudaMemcpyAsync(uu_g, uu, num_points*sizeof(double), cudaMemcpyHostToDevice, stream_stacking);
+  mmm=cudaMemcpyAsync(vv_g, vv, num_points*sizeof(double), cudaMemcpyHostToDevice, stream_stacking);
+  mmm=cudaMemcpyAsync(ww_g, ww, num_points*sizeof(double), cudaMemcpyHostToDevice, stream_stacking);
+  mmm=cudaMemcpyAsync(vis_real_g, vis_real, Nvis*sizeof(float), cudaMemcpyHostToDevice, stream_stacking);
+  mmm=cudaMemcpyAsync(vis_img_g, vis_img, Nvis*sizeof(float), cudaMemcpyHostToDevice, stream_stacking);
+  mmm=cudaMemcpyAsync(weight_g, weight, (Nvis/freq_per_chan)*sizeof(float), cudaMemcpyHostToDevice, stream_stacking);
 
 
-   #if !defined(GAUSS_HI_PRECISION)
-    mmm=cudaMemcpyAsync(convkernel_g, convkernel, increaseprecision*w_support*sizeof(double), cudaMemcpyHostToDevice, stream_stacking);
-   #endif
+#if !defined(GAUSS_HI_PRECISION)
+  mmm=cudaMemcpyAsync(convkernel_g, convkernel, increaseprecision*w_support*sizeof(double), cudaMemcpyHostToDevice, stream_stacking);
+#endif
 
-    if (mmm != cudaSuccess) {printf("!!! w-stacking.cu cudaMemcpyAsync ERROR %d !!!\n", mmm);}
+  if (mmm != cudaSuccess) {printf("!!! w-stacking.cu cudaMemcpyAsync ERROR %d !!!\n", mmm);}
 
-    // Call main GPU Kernel
+  // Call main GPU Kernel
 #if defined(GAUSS_HI_PRECISION)
-    convolve_g <<<Nbl,Nth,0,stream_stacking>>> (
-						num_w_planes,
-						num_points,
-						freq_per_chan,
-						polarizations,
-						uu_g,
-						vv_g,
-						ww_g,
-						vis_real_g,
-						vis_img_g,
-						weight_g,
-						dx,
-						dw,
-						KernelLen,
-						grid_size_x,
-						grid_size_y,
+  convolve_g <<<Nbl,Nth,0,stream_stacking>>> (
+					      num_w_planes,
+					      num_points,
+					      freq_per_chan,
+					      polarizations,
+					      uu_g,
+					      vv_g,
+					      ww_g,
+					      vis_real_g,
+					      vis_img_g,
+					      weight_g,
+					      dx,
+					      dw,
+					      KernelLen,
+					      grid_size_x,
+					      grid_size_y,
 #if !defined(NCCL_REDUCE)
-						grid_g,
+					      grid_g,
 #else
-						grid,
+					      grid,
 #endif
-						std22
-                                                );
+					      std22
+					      );
 #else
-    convolve_g <<<Nbl,Nth,0,stream_stacking>>> (
-						num_w_planes,
-						num_points,
-						freq_per_chan,
-						polarizations,
-						uu_g,
-						vv_g,
-						ww_g,
-						vis_real_g,
-						vis_img_g,
-						weight_g,
-						dx,
-						dw,
-						KernelLen,
-						grid_size_x,
-						grid_size_y,
+  convolve_g <<<Nbl,Nth,0,stream_stacking>>> (
+					      num_w_planes,
+					      num_points,
+					      freq_per_chan,
+					      polarizations,
+					      uu_g,
+					      vv_g,
+					      ww_g,
+					      vis_real_g,
+					      vis_img_g,
+					      weight_g,
+					      dx,
+					      dw,
+					      KernelLen,
+					      grid_size_x,
+					      grid_size_y,
 #if !defined(NCCL_REDUCE)
-						grid_g,
+					      grid_g,
 #else
-						grid,
+					      grid,
 #endif
-						std22,
-						convkernel_g
-                                                );
+					      std22,
+					      convkernel_g
+					      );
 #endif    
     
-    mmm=cudaStreamSynchronize(stream_stacking);
-    //Record the event
-    //mmm=cudaEventRecord(event_kernel,stream_stacking);
+  mmm=cudaStreamSynchronize(stream_stacking);
+  //Record the event
+  //mmm=cudaEventRecord(event_kernel,stream_stacking);
     
-    //Wait until the kernel ends
-    //mmm=cudaStreamWaitEvent(stream_stacking,event_kernel);
+  //Wait until the kernel ends
+  //mmm=cudaStreamWaitEvent(stream_stacking,event_kernel);
     
-    //for (int i=0; i<100000; i++)printf("%f\n",grid[i]);
+  //for (int i=0; i<100000; i++)printf("%f\n",grid[i]);
     
 #if !defined(NCCL_REDUCE)
-    mmm=cudaMemcpy(grid, grid_g, 2*num_w_planes*grid_size_x*grid_size_y*sizeof(double), cudaMemcpyDeviceToHost);
+  mmm=cudaMemcpy(grid, grid_g, 2*num_w_planes*grid_size_x*grid_size_y*sizeof(double), cudaMemcpyDeviceToHost);
 #endif
 
-    if (mmm != cudaSuccess)
-      printf("CUDA ERROR %s\n",cudaGetErrorString(mmm));
+  if (mmm != cudaSuccess)
+    printf("CUDA ERROR %s\n",cudaGetErrorString(mmm));
     
-    mmm=cudaFree(uu_g);
-    mmm=cudaFree(vv_g);
-    mmm=cudaFree(ww_g);
-    mmm=cudaFree(vis_real_g);
-    mmm=cudaFree(vis_img_g);
-    mmm=cudaFree(weight_g);
+  mmm=cudaFree(uu_g);
+  mmm=cudaFree(vv_g);
+  mmm=cudaFree(ww_g);
+  mmm=cudaFree(vis_real_g);
+  mmm=cudaFree(vis_img_g);
+  mmm=cudaFree(weight_g);
 
 #if !defined(NCCL_REDUCE)
-    mmm=cudaFree(grid_g);
+  mmm=cudaFree(grid_g);
 #endif
     
 #if !defined(GAUSS_HI_PRECISION)
-    mmm=cudaFree(convkernel_g);
+  mmm=cudaFree(convkernel_g);
 #endif
 
-// Switch between CUDA and GPU versions
+  // Switch between CUDA and GPU versions
 # else
 
 #ifdef _OPENMP
-    omp_set_num_threads(num_threads);
+  omp_set_num_threads(num_threads);
 #endif
 
-   #if defined(ACCOMP) && (GPU_STACKING)
-    omp_set_default_device(rank % omp_get_num_devices());
-    uint Nvis = num_points*freq_per_chan*polarizations;
-   #pragma omp target teams distribute parallel for private(visindex) map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan]) map(tofrom:grid[0:2*num_w_planes*grid_size_x*grid_size_y])
+#if defined(ACCOMP) && (GPU_STACKING)
+  omp_set_default_device(rank % omp_get_num_devices());
+  uint Nvis = num_points*freq_per_chan*polarizations;
+#pragma omp target teams distribute parallel for private(visindex) map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan]) map(tofrom:grid[0:2*num_w_planes*grid_size_x*grid_size_y])
 #else
-    #pragma omp parallel for private(visindex)
+#pragma omp parallel for private(visindex)
 #endif
-    for (i = 0; i < num_points; i++)
+  for (i = 0; i < num_points; i++)
     {
 #ifdef _OPENMP
-	//int tid;
-	//tid = omp_get_thread_num();
-	//printf("%d\n",tid);
+      //int tid;
+      //tid = omp_get_thread_num();
+      //printf("%d\n",tid);
 #endif
 
-        visindex = i*freq_per_chan*polarizations;
+      visindex = i*freq_per_chan*polarizations;
 
-        double sum = 0.0;
-        int j, k;
-	//if (i%1000 == 0)printf("%ld\n",i);
+      double sum = 0.0;
+      int j, k;
+      //if (i%1000 == 0)printf("%ld\n",i);
 
-        /* Convert UV coordinates to grid coordinates. */
-        double pos_u = uu[i] / dx;
-        double pos_v = vv[i] / dx;
-        double ww_i  = ww[i] / dw;
+      /* Convert UV coordinates to grid coordinates. */
+      double pos_u = uu[i] / dx;
+      double pos_v = vv[i] / dx;
+      double ww_i  = ww[i] / dw;
 	
-        int grid_w = (int)ww_i;
-        int grid_u = (int)pos_u;
-        int grid_v = (int)pos_v;
+      int grid_w = (int)ww_i;
+      int grid_u = (int)pos_u;
+      int grid_v = (int)pos_v;
 
-	// check the boundaries
-	uint jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0;
-	uint jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1;
-	uint kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0;
-	uint kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1;
-        //printf("%d, %ld, %ld, %d, %ld, %ld\n",grid_u,jmin,jmax,grid_v,kmin,kmax);
+      // check the boundaries
+      uint jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0;
+      uint jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1;
+      uint kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0;
+      uint kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1;
+      //printf("%d, %ld, %ld, %d, %ld, %ld\n",grid_u,jmin,jmax,grid_v,kmin,kmax);
 
 
-        // Convolve this point onto the grid.
-        for (k = kmin; k <= kmax; k++)
+      // Convolve this point onto the grid.
+      for (k = kmin; k <= kmax; k++)
         {
 
-            //double v_dist = (double)k+0.5 - pos_v;
-            double v_dist = (double)k - pos_v;
+	  //double v_dist = (double)k+0.5 - pos_v;
+	  double v_dist = (double)k - pos_v;
 
-            for (j = jmin; j <= jmax; j++)
+	  for (j = jmin; j <= jmax; j++)
             {
-                //double u_dist = (double)j+0.5 - pos_u;
-                double u_dist = (double)j - pos_u;
-		uint iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y);
-                int jKer = (int)(increaseprecision * (fabs(u_dist+(double)KernelLen)));
-                int kKer = (int)(increaseprecision * (fabs(v_dist+(double)KernelLen)));
-
-                #ifdef GAUSS_HI_PRECISION
-		double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist);
-                #endif
-                #ifdef GAUSS
-		double conv_weight = convkernel[jKer]*convkernel[kKer];
-		//if(jKer < 0 || jKer >= 35 || kKer < 0 || kKer >= 35)
-		//	printf("%f %d %f %d\n",fabs(u_dist+(double)KernelLen),jKer,fabs(v_dist+(double)KernelLen),kKer);
-		//printf("%d %d %d %d %f %f %f %f %f\n",jKer, j, kKer, k, pos_u, pos_v, u_dist,v_dist,conv_weight);
-                #endif
-                #ifdef KAISERBESSEL
-		double conv_weight = convkernel[jKer]*convkernel[kKer];
-	        #endif
-		// Loops over frequencies and polarizations
-		double add_term_real = 0.0;
-		double add_term_img = 0.0;
-		uint ifine = visindex;
-		// DAV: the following two loops are performend by each thread separately: no problems of race conditions
-		for (uint ifreq=0; ifreq<freq_per_chan; ifreq++)
+	      //double u_dist = (double)j+0.5 - pos_u;
+	      double u_dist = (double)j - pos_u;
+	      uint iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y);
+	      int jKer = (int)(increaseprecision * (fabs(u_dist+(double)KernelLen)));
+	      int kKer = (int)(increaseprecision * (fabs(v_dist+(double)KernelLen)));
+
+#ifdef GAUSS_HI_PRECISION
+	      double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist);
+#endif
+#ifdef GAUSS
+	      double conv_weight = convkernel[jKer]*convkernel[kKer];
+	      //if(jKer < 0 || jKer >= 35 || kKer < 0 || kKer >= 35)
+	      //	printf("%f %d %f %d\n",fabs(u_dist+(double)KernelLen),jKer,fabs(v_dist+(double)KernelLen),kKer);
+	      //printf("%d %d %d %d %f %f %f %f %f\n",jKer, j, kKer, k, pos_u, pos_v, u_dist,v_dist,conv_weight);
+#endif
+#ifdef KAISERBESSEL
+	      double conv_weight = convkernel[jKer]*convkernel[kKer];
+#endif
+	      // Loops over frequencies and polarizations
+	      double add_term_real = 0.0;
+	      double add_term_img = 0.0;
+	      uint ifine = visindex;
+	      // DAV: the following two loops are performed by each thread separately: no race-condition problems
+	      for (uint ifreq=0; ifreq<freq_per_chan; ifreq++)
 		{
-		   uint iweight = visindex/freq_per_chan;
-	           for (uint ipol=0; ipol<polarizations; ipol++)
-	           {
+		  uint iweight = visindex/freq_per_chan;
+		  for (uint ipol=0; ipol<polarizations; ipol++)
+		    {
                       if (!isnan(vis_real[ifine]))
-                      {
-		         //printf("%f %ld\n",weight[iweight],iweight);
-                         add_term_real += weight[iweight] * vis_real[ifine] * conv_weight;
-		         add_term_img += weight[iweight] * vis_img[ifine] * conv_weight;
-			 //if(vis_img[ifine]>1e10 || vis_img[ifine]<-1e10)printf("%f %f %f %f %ld %ld\n",vis_real[ifine],vis_img[ifine],weight[iweight],conv_weight,ifine,num_points*freq_per_chan*polarizations);
-		      }
+			{
+			  //printf("%f %ld\n",weight[iweight],iweight);
+			  add_term_real += weight[iweight] * vis_real[ifine] * conv_weight;
+			  add_term_img += weight[iweight] * vis_img[ifine] * conv_weight;
+			  //if(vis_img[ifine]>1e10 || vis_img[ifine]<-1e10)printf("%f %f %f %f %ld %ld\n",vis_real[ifine],vis_img[ifine],weight[iweight],conv_weight,ifine,num_points*freq_per_chan*polarizations);
+			}
 		      ifine++;
 		      iweight++;
-		   }
+		    }
 	        }
-		// DAV: this is the critical call in terms of correctness of the results and of performance
-		#pragma omp atomic
-		grid[iKer] += add_term_real;
-		#pragma omp atomic
-		grid[iKer+1] += add_term_img;
+	      // DAV: this is the critical call in terms of correctness of the results and of performance
+#pragma omp atomic
+	      grid[iKer] += add_term_real;
+#pragma omp atomic
+	      grid[iKer+1] += add_term_img;
             }
         }
 	
     }
-   #if defined(ACCOMP) && (GPU_STACKING)
-   #pragma omp target exit data map(delete:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan], grid[0:2*num_w_planes*grid_size_x*grid_size_y])
-   #endif
-    // End switch between CUDA and CPU versions
+#if defined(ACCOMP) && (GPU_STACKING)
+#pragma omp target exit data map(delete:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan], grid[0:2*num_w_planes*grid_size_x*grid_size_y])
+#endif
+  // End switch between CUDA and CPU versions
 #endif
-    //for (int i=0; i<100000; i++)printf("%f\n",grid[i]);
+  //for (int i=0; i<100000; i++)printf("%f\n",grid[i]);
 }
 
 #ifdef ACCOMP
@@ -540,8 +539,8 @@ void wstack(
 
 int test(int nnn)
 {
-	int mmm;
+  int mmm;
 
-	mmm = nnn+1;
-	return mmm;
+  mmm = nnn+1;
+  return mmm;
 }
diff --git a/w-stacking.h b/w-stacking.h
index c4b3aaa8c50bfc4663dbd6da98c20ba365e76bfa..845e923562bb7444365462ba5ad3e92cff252863 100755
--- a/w-stacking.h
+++ b/w-stacking.h
@@ -15,7 +15,7 @@ extern "C"
 #ifdef __cplusplus
 extern "C" {
 void wstack(
-     int,
+     long long unsigned,
      unsigned int,
      unsigned int,
      unsigned int,
@@ -28,14 +28,13 @@ void wstack(
      double,
      double,
      int,
-     int,
-     int,
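+     /* grid_size_x, grid_size_y */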
+     long long unsigned,
+     long long unsigned,
      double*,
      int,
      int,
      cudaStream_t);
 }
-
 #else 
 void wstack(
      int,