diff --git a/Makefile b/Makefile
index cfa29569c6eb59db4e077c4a4329e1719872cfd1..b083ad76df666a7315de05d6999796851d4dfcca 100755
--- a/Makefile
+++ b/Makefile
@@ -63,11 +63,11 @@ OPT += -DPHASE_ON
 # SELECT THE GRIDDING KERNEL: GAUSS, GAUSS_HI_PRECISION, KAISERBESSEL
 
 
-OPT += -DGAUSS_HI_PRECISION
+#OPT += -DGAUSS_HI_PRECISION
 
 #OPT += -DGAUSS
 
-#OPT += -DKAISERBESSEL
+OPT += -DKAISERBESSEL
 
 # ========================================================
 
@@ -77,16 +77,16 @@ OPT += -DGAUSS_HI_PRECISION
 #OPT += -DNVIDIA
 
 # use CUDA for GPUs
-OPT += -DCUDACC
+#OPT += -DCUDACC
 
 # use GPU acceleration via OMP 
 #OPT += -DACCOMP
 
 # perform stacking on GPUs
-OPT += -DGPU_STACKING
+#OPT += -DGPU_STACKING
 
 # use NVIDIA GPU to perform the reduce
-OPT += -DNCCL_REDUCE
+#OPT += -DNCCL_REDUCE
 
 # use GPU to perform FFT
 #OPT += -DCUFFTMP
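
Note: the OPT entries above become -D preprocessor symbols, so exactly one gridding kernel (and at most one offload backend) should be active at a time. A minimal sketch of how the kernel sources are expected to branch on these symbols (illustrative structure only; the actual dispatch lives in w-stacking.cu / w-stacking.hip.cpp, cf. makeKaiserBesselKernel()):

    #if defined(GAUSS_HI_PRECISION)
      /* evaluate the Gaussian kernel directly at full precision */
    #elif defined(GAUSS)
      /* tabulated Gaussian kernel */
    #elif defined(KAISERBESSEL)
      /* tabulated Kaiser-Bessel kernel */
    #else
      #error "select one gridding kernel in the Makefile"
    #endif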
diff --git a/allvars.c b/allvars.c
index 0f43b70840905185b5938cc5512c0432ad99de5f..89ee345e28b0a8befaf042b4a40b10847a060282 100755
--- a/allvars.c
+++ b/allvars.c
@@ -18,12 +18,12 @@ char   datapath[LONGNAME_LEN];
 int    xaxis, yaxis;
 int    rank;
 int    size;
-uint   nsectors;
-uint   startrow;
+myuint   nsectors;
+myuint   startrow;
 double resolution, dx, dw, w_supporth;
 
-uint **sectorarray   = NULL;
-uint  *histo_send    = NULL;
+myuint **sectorarray   = NULL;
+myuint  *histo_send    = NULL;
 int    verbose_level = 0; 
 
 timing_t timing_wt;
@@ -31,12 +31,8 @@ double   reduce_mpi_time;
 double   reduce_shmem_time;
 
 
-uint     size_of_grid;
+myuint     size_of_grid;
 double   *grid_pointers = NULL, *grid, *gridss, *gridss_real, *gridss_img, *gridss_w, *grid_gpu, *gridss_gpu;
 
 MPI_Comm      MYMPI_COMM_WORLD; 
 MPI_Win       slabwin;
-
-
-
-
diff --git a/allvars.h b/allvars.h
index c434035ba89fcfe91c553c624c46e172854cb988..889403e210e0fd608f88dc8bc51ae342ceb8f887 100755
--- a/allvars.h
+++ b/allvars.h
@@ -73,8 +73,8 @@ typedef double float_t;
 typedef float float_t;
 #endif
 
-typedef unsigned int       uint;
-typedef unsigned long long ull;
+typedef unsigned int       myuint;
+typedef unsigned long long myull;
 
 
 extern struct io
@@ -104,6 +104,8 @@ extern struct op
         char outfile2[NAME_LEN];
         char outfile3[NAME_LEN];
         char fftfile[NAME_LEN];
+        char gridded_writedata1[NAME_LEN];
+        char gridded_writedata2[NAME_LEN];
         char fftfile_writedata1[NAME_LEN];
         char fftfile_writedata2[NAME_LEN];
         char fftfile2[NAME_LEN];
@@ -111,21 +113,21 @@ extern struct op
         char logfile[NAME_LEN];
         char extension[NAME_LEN];
         char timingfile[NAME_LEN];
 
 } out, outparam;
 
 extern struct meta
 {
 
-  unsigned long   Nmeasures;
-  unsigned long   Nvis;
-  uint   Nweights;
-  uint   freq_per_chan;
-  uint   polarisations;
-  uint   Ntimes;
+  myuint   Nmeasures;
+  myull   Nvis;
+  myuint   Nweights;
+  myuint   freq_per_chan;
+  myuint   polarisations;
+  myuint   Ntimes;
   double dt;
   double thours;
-  uint   baselines;
+  myuint   baselines;
   double uvmin;
   double uvmax;
   double wmin;
@@ -161,16 +163,16 @@ extern char datapath[LONGNAME_LEN];
 extern int  xaxis, yaxis;
 extern int  rank;
 extern int  size;
-extern uint nsectors;
-extern uint startrow;
+extern myuint nsectors;
+extern myuint startrow;
 extern double_t resolution, dx, dw, w_supporth;
 
-extern uint **sectorarray;
-extern uint  *histo_send;
+extern myuint **sectorarray;
+extern myuint  *histo_send;
 extern int    verbose_level; 
 
 
-extern uint    size_of_grid;
+extern myuint    size_of_grid;
 extern double_t *grid_pointers, *grid, *gridss, *gridss_real, *gridss_img, *gridss_w, *grid_gpu, *gridss_gpu;
 
 extern MPI_Comm MYMPI_COMM_WORLD;
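
Background on the renames in this header: glibc's <sys/types.h> (pulled in transitively by the MPI and CUDA headers) already provides a legacy typedef named uint, so a project-level `typedef unsigned int uint;` collides with it — an outright error under C99 rules, a silent redefinition under C11. Renaming to myuint/myull sidesteps the clash. A minimal reproduction, assuming glibc with _DEFAULT_SOURCE:

    #include <sys/types.h>       /* glibc: typedef unsigned int uint;      */
    typedef unsigned int uint;   /* redefinition: error in C99, OK in C11  */
    typedef unsigned int myuint; /* no collision, compiles everywhere      */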
diff --git a/allvars_nccl.h b/allvars_nccl.h
index 225968efd31c13bfbb6dd2219d19ca1117819e2e..bab1ebdea151972f87e6b1e9fa0ee91c9cc30118 100755
--- a/allvars_nccl.h
+++ b/allvars_nccl.h
@@ -71,8 +71,8 @@ typedef double float_t;
 typedef float float_t;
 #endif
 
-typedef unsigned int       uint;
-typedef unsigned long long ull;
+typedef unsigned int       myuint;
+typedef unsigned long long myull;
 
 
 extern struct io
@@ -102,6 +102,8 @@ extern struct op
         char outfile2[NAME_LEN];
         char outfile3[NAME_LEN];
         char fftfile[NAME_LEN];
+        char gridded_writedata1[NAME_LEN];
+        char gridded_writedata2[NAME_LEN];
         char fftfile_writedata1[NAME_LEN];
         char fftfile_writedata2[NAME_LEN];
         char fftfile2[NAME_LEN];
@@ -109,21 +111,21 @@ extern struct op
         char logfile[NAME_LEN];
         char extension[NAME_LEN];
         char timingfile[NAME_LEN];
 
 } out, outparam;
 
 extern struct meta
 {
 
-  unsigned long   Nmeasures;
-  unsigned long   Nvis;
-  uint   Nweights;
-  uint   freq_per_chan;
-  uint   polarisations;
-  uint   Ntimes;
+  myuint   Nmeasures;
+  myull   Nvis;
+  myuint   Nweights;
+  myuint   freq_per_chan;
+  myuint   polarisations;
+  myuint   Ntimes;
   double dt;
   double thours;
-  uint   baselines;
+  myuint   baselines;
   double uvmin;
   double uvmax;
   double wmin;
@@ -159,16 +161,16 @@ extern char datapath[LONGNAME_LEN];
 extern int  xaxis, yaxis;
 extern int  rank;
 extern int  size;
-extern uint nsectors;
-extern uint startrow;
+extern myuint nsectors;
+extern myuint startrow;
 extern double_t resolution, dx, dw, w_supporth;
 
-extern uint **sectorarray;
-extern uint  *histo_send;
+extern myuint **sectorarray;
+extern myuint  *histo_send;
 extern int    verbose_level; 
 
 
-extern uint    size_of_grid;
+extern myuint    size_of_grid;
 extern double_t *grid_pointers, *grid, *gridss, *gridss_real, *gridss_img, *gridss_w, *grid_gpu, *gridss_gpu;
 
 extern MPI_Comm MYMPI_COMM_WORLD;
diff --git a/allvars_rccl.h b/allvars_rccl.h
index 6e09fcae29beb32ac26369abbad053b426fb5a16..5e9691effa40db742c0422d5bfa1a09e4ae67e2c 100755
--- a/allvars_rccl.h
+++ b/allvars_rccl.h
@@ -74,7 +74,7 @@ typedef double float_ty;
 typedef float float_ty;
 #endif
 
-typedef unsigned int       uint;
+typedef unsigned int       myuint;
 typedef unsigned long long ull;
 
 
@@ -105,6 +105,8 @@ extern struct op
         char outfile2[NAME_LEN];
         char outfile3[NAME_LEN];
         char fftfile[NAME_LEN];
+        char gridded_writedata1[NAME_LEN];
+        char gridded_writedata2[NAME_LEN];
         char fftfile_writedata1[NAME_LEN];
         char fftfile_writedata2[NAME_LEN];
         char fftfile2[NAME_LEN];
@@ -112,21 +114,21 @@ extern struct op
         char logfile[NAME_LEN];
         char extension[NAME_LEN];
         char timingfile[NAME_LEN];
 
 } out, outparam;
 
 extern struct meta
 {
 
-  uint   Nmeasures;
-  uint   Nvis;
-  uint   Nweights;
-  uint   freq_per_chan;
-  uint   polarisations;
-  uint   Ntimes;
+  myuint   Nmeasures;
+  myuint   Nvis;
+  myuint   Nweights;
+  myuint   freq_per_chan;
+  myuint   polarisations;
+  myuint   Ntimes;
   double dt;
   double thours;
-  uint   baselines;
+  myuint   baselines;
   double uvmin;
   double uvmax;
   double wmin;
@@ -162,16 +164,16 @@ extern char datapath[LONGNAME_LEN];
 extern int  xaxis, yaxis;
 extern int  rank;
 extern int  size;
-extern uint nsectors;
-extern uint startrow;
+extern myuint nsectors;
+extern myuint startrow;
 extern double_ty resolution, dx, dw, w_supporth;
 
-extern uint **sectorarray;
-extern uint  *histo_send;
+extern myuint **sectorarray;
+extern myuint  *histo_send;
 extern int    verbose_level; 
 
 
-extern uint    size_of_grid;
+extern myuint    size_of_grid;
 extern double_ty *grid_pointers, *grid, *gridss, *gridss_real, *gridss_img, *gridss_w;
 
 extern MPI_Comm MYMPI_COMM_WORLD;
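
One size note on this header: unlike allvars.h and allvars_nccl.h above, which promote Nvis to myull, here Nvis stays a 32-bit myuint (and ull keeps its old name). Nvis = Nmeasures × freq_per_chan × polarisations, so with for instance 10^7 measures, 120 channels (cf. the 120ch dataset in data/paramfile.txt) and 4 polarisations, Nvis ≈ 4.8×10^9, which already exceeds 2^32 − 1 ≈ 4.29×10^9; the RCCL path therefore only stays safe for datasets below that product.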
diff --git a/cuda_fft.cu b/cuda_fft.cu
index 3a28d999365d823faa6efcc8179e3ebc49555f00..c23f8302fb0ef7b90201471fc69e38c8ca3a5c4c 100755
--- a/cuda_fft.cu
+++ b/cuda_fft.cu
@@ -96,7 +96,7 @@ void cuda_fft(
   if (mmm != cudaSuccess) {printf("!!! cuda_fft.cu cudaMalloc ERROR %d !!!\n", mmm);}
 
   int Nth = 32;
-  unsigned int Nbl = (unsigned int)((yaxis*xaxis)/Nth + 1);  
+  myuint Nbl = (myuint)((yaxis*xaxis)/Nth + 1);  
   
 
   // Plan creation
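
The block count above rounds up by unconditionally adding 1, which launches one superfluous (guarded-out) block whenever yaxis*xaxis is an exact multiple of Nth. The conventional ceiling division avoids that; a sketch:

    int    Nth = 32;
    myuint N   = (myuint)yaxis * (myuint)xaxis;  /* unsigned product of the two axes */
    myuint Nbl = (N + Nth - 1) / Nth;            /* ceil(N/Nth), no extra block      */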
diff --git a/data/paramfile.txt b/data/paramfile.txt
index 3b6991582618a1fd2d0d955130338c36e3134b91..3e509ce790a23d4c57ccdcd7914350ab201eac70 100755
--- a/data/paramfile.txt
+++ b/data/paramfile.txt
@@ -1,13 +1,13 @@
 ndatasets              1
 Datapath1		./data/tail01_L720378_SB001_uv_12DFF03B0t_121MHz_12DFF03BFt_143MHz_120ch_flag.binMS/
 #Datapath2              /beegfs/lofar/cgheller/L798046_SB244_uv.uncorr_130B27932t_123MHz.pre-cal.binMS/
 #Datapath3              /beegfs/lofar/cgheller/L798046_SB244_uv.uncorr_130B27932t_125MHz.pre-cal.binMS/
 num_threads            1
 w_support              7
 reduce_method 	       0
-grid_size_x            16384
-grid_size_y            16384
-num_w_planes           32
+grid_size_x            4096
+grid_size_y            4096
+num_w_planes           4
 ufile		       ucoord.bin
 vfile    	       vcoord.bin
 wfile                  wcoord.bin
@@ -20,6 +20,8 @@ outfile1	       coords.txt
 outfile2	       grid_real.bin
 outfile3 	       grid_img.bin
 fftfile		       fft.txt
+gridded_writedata1      gridded_data_real.bin
+gridded_writedata2      gridded_data_img.bin
 fftfile_writedata1     ffted_real.bin
 fftfile_writedata2     ffted_img.bin
 fftfile2	       fft_real.bin
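
The smaller defaults make the shipped test case tractable: the complex w-stacked grid needs 2 × num_w_planes × grid_size_x × grid_size_y × sizeof(double) bytes, i.e. 2·32·16384²·8 = 128 GiB with the old values versus 2·4·4096²·8 = 1 GiB with the new ones (for the full grid; each MPI rank holds a 1/nsectors slab of it plus a gridss buffer of the same slab size).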
diff --git a/fourier_transform.c b/fourier_transform.c
index 088889c30b68f4d06115e5b24e4a6d9ede118252..0b432aa81607d2c4e60a1bc8c31dfefb20c6c68b 100755
--- a/fourier_transform.c
+++ b/fourier_transform.c
@@ -35,8 +35,8 @@ void fftw_data ( void )
   fftwgrid = fftw_alloc_complex(alloc_local);
   plan = fftw_mpi_plan_dft_2d(param.grid_size_y, param.grid_size_x, fftwgrid, fftwgrid, MPI_COMM_WORLD, FFTW_BACKWARD, FFTW_ESTIMATE);
 
-  uint fftwindex = 0;
-  uint fftwindex2D = 0;
+  myuint fftwindex = 0;
+  myuint fftwindex2D = 0;
   for (int iw=0; iw<param.num_w_planes; iw++)
     {
       //printf("FFTing plan %d\n",iw);
@@ -147,7 +147,7 @@ void write_fftw_data(){
 	  MPI_Win_lock(MPI_LOCK_SHARED,isector,0,writewin);
 	  MPI_Get(gridss_w,size_of_grid,MPI_DOUBLE,isector,0,size_of_grid,MPI_DOUBLE,writewin);
 	  MPI_Win_unlock(isector,writewin);
-	  for (uint i=0; i<size_of_grid/2; i++)
+	  for (myuint i=0; i<size_of_grid/2; i++)
 	    {
 	      gridss_real[i] = gridss_w[2*i];
 	      gridss_img[i] = gridss_w[2*i+1];
@@ -158,8 +158,8 @@ void write_fftw_data(){
                 for (int iv=0; iv<yaxis; iv++)
 		  for (int iu=0; iu<xaxis; iu++)
 		    {
-		      uint global_index = (iu + (iv+isector*yaxis)*xaxis + iw*param.grid_size_x*param.grid_size_y)*sizeof(double);
-		      uint index = iu + iv*xaxis + iw*xaxis*yaxis;
+		      long global_index = ((long)iu + ((long)iv + (long)isector*yaxis)*xaxis + (long)iw*param.grid_size_x*param.grid_size_y)*(long)sizeof(double);
+		      size_t index = (size_t)iu + (size_t)iv*xaxis + (size_t)iw*xaxis*yaxis;
 		      fseek(file.pFilereal, global_index, SEEK_SET);
 		      fwrite(&gridss_real[index], 1, sizeof(double), file.pFilereal);
 		    }
@@ -167,8 +167,8 @@ void write_fftw_data(){
                 for (int iv=0; iv<yaxis; iv++)
 		  for (int iu=0; iu<xaxis; iu++)
 		    {
-		      uint global_index = (iu + (iv+isector*yaxis)*xaxis + iw*param.grid_size_x*param.grid_size_y)*sizeof(double);
-		      uint index = iu + iv*xaxis + iw*xaxis*yaxis;
+		      long global_index = ((long)iu + ((long)iv + (long)isector*yaxis)*xaxis + (long)iw*param.grid_size_x*param.grid_size_y)*(long)sizeof(double);
+		      size_t index = (size_t)iu + (size_t)iv*xaxis + (size_t)iw*xaxis*yaxis;
 		      fseek(file.pFileimg, global_index, SEEK_SET);
 		      fwrite(&gridss_img[index], 1, sizeof(double), file.pFileimg);
 		    }
@@ -186,7 +186,7 @@ void write_fftw_data(){
 	for (int iu=0; iu<grid_size_x; iu++)
 	{
 	int isector = 0;
-	uint index = 2*(iu + iv*grid_size_x + iw*grid_size_x*grid_size_y);
+	size_t index = 2*((size_t)iu + (size_t)iv*grid_size_x + (size_t)iw*grid_size_x*grid_size_y);
 	double v_norm = sqrt(gridtot[index]*gridtot[index]+gridtot[index+1]*gridtot[index+1]);
 	fprintf (file.pFile, "%d %d %d %f %f %f\n", iu,iv,iw,gridtot[index],gridtot[index+1],v_norm);
 	}
@@ -255,8 +255,8 @@ void write_fftw_data(){
   if(rank == 0)printf("WRITING IMAGE\n");
 
 #ifdef FITSIO
-  uint * fpixel = (uint *) malloc(sizeof(uint)*naxis);
-  uint * lpixel = (uint *) malloc(sizeof(uint)*naxis);
+  long * fpixel = (long *) malloc(sizeof(long)*naxis);
+  long * lpixel = (long *) malloc(sizeof(long)*naxis);
 #endif
 
 #ifdef FITSIO
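
On the widened file offsets above: global_index is a byte offset spanning the whole num_w_planes × grid_size_y × grid_size_x cube, up to 32·16384²·8 ≈ 6.9×10^10 bytes at the old parameter-file settings — far beyond any 32-bit type. The computation is therefore done in long (matching fseek's long offset argument), with the casts applied before the multiplications rather than after, and fpixel/lpixel are kept as long arrays since that is the element type cfitsio reads for pixel coordinates.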
diff --git a/gridding.c b/gridding.c
index 10cf812837ad873e1f36ab6b11885f9173ae96e8..8e40b48a28ebe6f6336ea6ec2aaa88ccaef60b44 100755
--- a/gridding.c
+++ b/gridding.c
@@ -4,7 +4,7 @@
 
 
 
-void free_array       ( uint *, uint **, int );
+void free_array       ( myuint *, myuint **, int );
 void initialize_array ( void );
 void gridding_data    ( void );
 
@@ -38,7 +38,7 @@ void gridding()
     cmp_t mygetmax = { 0 };
 
    #pragma omp for
-    for (uint inorm=0; inorm<metaData.Nmeasures; inorm++)
+    for (myuint inorm=0; inorm<metaData.Nmeasures; inorm++)
       {
 	mygetmin.u = MIN(mygetmin.u, data.uu[inorm]);
 	mygetmin.v = MIN(mygetmin.v, data.vv[inorm]);
@@ -74,7 +74,7 @@ void gridding()
   maxg = maxg + offset*maxg;
 
  #pragma omp parallel for num_threads(param.num_threads)
-  for (uint inorm=0; inorm < metaData.Nmeasures; inorm++)
+  for (myuint inorm=0; inorm < metaData.Nmeasures; inorm++)
     {
       data.uu[inorm] = (data.uu[inorm]+maxg) / (2.0*maxg);
       data.vv[inorm] = (data.vv[inorm]+maxg) / (2.0*maxg);
@@ -115,14 +115,14 @@ void gridding()
 
 //   .....................................................................
 //
-void free_array( uint *histo_send, uint **sectorarrays, int nsectors )
+void free_array( myuint *histo_send, myuint **sectorarrays, int nsectors )
 //
 // releases memory allocated for gridding and reduce ops
 //
   
 { 
   
-  for ( uint i = nsectors-1; i > 0; i-- )
+  for ( myuint i = nsectors+1; i-- > 0; )
     free( sectorarrays[i] );
 
   free( sectorarrays );
@@ -144,9 +144,9 @@ void initialize_array()
   
 {
 
-  histo_send = (uint*) calloc(nsectors+1, sizeof(uint));
+  histo_send = (myuint*) calloc(nsectors+1, sizeof(myuint));
 
-  for (uint iphi = 0; iphi < metaData.Nmeasures; iphi++)
+  for (myuint iphi = 0; iphi < metaData.Nmeasures; iphi++)
     {
       double vvh = data.vv[iphi];              //less or equal to 0.6
       int binphi = (int)(vvh*nsectors); //has values expect 0 and nsectors-1.
@@ -164,15 +164,15 @@ void initialize_array()
 	histo_send[binphi-1]++;
     }
 
-  sectorarray = (uint**)malloc ((nsectors+1) * sizeof(uint*));
-  uint  *counter     = (uint*) calloc (nsectors+1, sizeof(uint));
-  for(uint sec=0; sec<(nsectors+1); sec++)
+  sectorarray = (myuint**)malloc ((nsectors+1) * sizeof(myuint*));
+  myuint  *counter     = (myuint*) calloc (nsectors+1, sizeof(myuint));
+  for(myuint sec=0; sec<(nsectors+1); sec++)
     {
-      sectorarray[sec] = (uint*)malloc(histo_send[sec]*sizeof(uint));
+      sectorarray[sec] = (myuint*)malloc(histo_send[sec]*sizeof(myuint));
     }
     
     
-  for (uint iphi = 0; iphi < metaData.Nmeasures; iphi++)
+  for (myuint iphi = 0; iphi < metaData.Nmeasures; iphi++)
     {
       double vvh      = data.vv[iphi];
       int    binphi   = (int)(vvh*nsectors);
@@ -208,15 +208,18 @@ void write_gridded_data()
 {
 
  #ifdef WRITE_DATA
-  /*
-#error "This routine has not yet been harmonized with new stuf !!"
+
   // Write gridded results
+  MPI_Win slabwin;
+  MPI_Win_create(gridss, size_of_grid*sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &slabwin);
+  MPI_Win_fence(0,slabwin);
+
   
   if (rank == 0)
     {
       printf("WRITING GRIDDED DATA\n");
-      file.pFilereal = fopen (out.outfile2,"wb");
-      file.pFileimg = fopen (out.outfile3,"wb");
+      file.pFilereal = fopen (out.gridded_writedata1,"wb");
+      file.pFileimg = fopen (out.gridded_writedata2,"wb");
 
       for (int isector=0; isector<nsectors; isector++)
 	{
@@ -227,7 +230,7 @@ void write_gridded_data()
 	  MPI_Get(gridss,size_of_grid,MPI_DOUBLE,isector,0,size_of_grid,MPI_DOUBLE,slabwin);
 	  MPI_Win_unlock(isector,slabwin);
 	 #endif
-	  for (uint i=0; i<size_of_grid/2; i++)
+	  for (myuint i=0; i<size_of_grid/2; i++)
 	    {
 	      gridss_real[i] = gridss[2*i];
 	      gridss_img[i] = gridss[2*i+1];
@@ -238,8 +241,8 @@ void write_gridded_data()
 		for (int iv=0; iv<yaxis; iv++)
 		  for (int iu=0; iu<xaxis; iu++)
 		    {
-		      uint global_index = (iu + (iv+isector*yaxis)*xaxis + iw*param.grid_size_x*param.grid_size_y)*sizeof(double);
-		      uint index = iu + iv*xaxis + iw*xaxis*yaxis;
+		      long global_index = ((long)iu + ((long)iv + (long)isector*yaxis)*xaxis + (long)iw*param.grid_size_x*param.grid_size_y)*(long)sizeof(double);
+		      size_t index = (size_t)iu + (size_t)iv*xaxis + (size_t)iw*xaxis*yaxis;
 		      fseek(file.pFilereal, global_index, SEEK_SET);
 		      fwrite(&gridss_real[index], 1, sizeof(double), file.pFilereal);
 		    }
@@ -247,8 +250,8 @@ void write_gridded_data()
 		for (int iv=0; iv<yaxis; iv++)
 		  for (int iu=0; iu<xaxis; iu++)
 		    {
-		      uint global_index = (iu + (iv+isector*yaxis)*xaxis + iw*param.grid_size_x*param.grid_size_y)*sizeof(double);
-		      uint index = iu + iv*xaxis + iw*xaxis*yaxis;
+		      long global_index = ((long)iu + ((long)iv + (long)isector*yaxis)*xaxis + (long)iw*param.grid_size_x*param.grid_size_y)*(long)sizeof(double);
+		      size_t index = (size_t)iu + (size_t)iv*xaxis + (size_t)iw*xaxis*yaxis;
 		      fseek(file.pFileimg, global_index, SEEK_SET);
 		      fwrite(&gridss_img[index], 1, sizeof(double), file.pFileimg);
 		      //double v_norm = sqrt(gridss[index]*gridss[index]+gridss[index+1]*gridss[index+1]);
@@ -260,8 +263,8 @@ void write_gridded_data()
 	    {
 	      for (int iw=0; iw<param.num_w_planes; iw++)
 		{
-		  uint global_index = (xaxis*isector*yaxis + iw*param.grid_size_x*param.grid_size_y)*sizeof(double);
-		  uint index = iw*xaxis*yaxis;
+		  long global_index = ((long)xaxis*isector*yaxis + (long)iw*param.grid_size_x*param.grid_size_y)*(long)sizeof(double);
+		  size_t index = (size_t)iw*xaxis*yaxis;
 		  fseek(file.pFilereal, global_index, SEEK_SET);
 		  fwrite(&gridss_real[index], xaxis*yaxis, sizeof(double), file.pFilereal);
 		  fseek(file.pFileimg, global_index, SEEK_SET);
@@ -269,13 +272,12 @@ void write_gridded_data()
 		}
 	    }
 	}
-
       fclose(file.pFilereal);
       fclose(file.pFileimg);
     }
   
   MPI_Win_fence(0,slabwin);
-  */
+  MPI_Win_free(&slabwin);
+  MPI_Barrier(MPI_COMM_WORLD);
  #endif //WRITE_DATA 
-
 }
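
For reference, the one-sided pattern that write_gridded_data now relies on, reduced to its minimal form (a sketch, assuming one sector per rank as the loop above does; gridss_w stands in for any local receive buffer):

    MPI_Win win;
    MPI_Win_create(gridss, size_of_grid*sizeof(double), sizeof(double),
                   MPI_INFO_NULL, MPI_COMM_WORLD, &win);   /* collective */
    if (rank == 0)
      for (int target = 0; target < nsectors; target++) {
        MPI_Win_lock(MPI_LOCK_SHARED, target, 0, win);     /* open access epoch  */
        MPI_Get(gridss_w, size_of_grid, MPI_DOUBLE,
                target, 0, size_of_grid, MPI_DOUBLE, win);
        MPI_Win_unlock(target, win);                       /* completes the Get  */
        /* ... write gridss_w to file ... */
      }
    MPI_Win_free(&win);                                    /* collective */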
diff --git a/gridding_cpu.c b/gridding_cpu.c
index dab71a93bfa377a2ab4c01ec8460d3224aae5b4b..29cc9408cf6a69930abf77e0bd503aa3ed39632f 100755
--- a/gridding_cpu.c
+++ b/gridding_cpu.c
@@ -85,14 +85,14 @@ void gridding_data()
   // find the largest value in histo_send[]
   //
   
-  for (uint isector = 0; isector < nsectors; isector++)
+  for (myuint isector = 0; isector < nsectors; isector++)
     {
 
       double start = CPU_TIME_wt;
 
-      uint Nsec            = histo_send[isector];
-      uint Nweightss       = Nsec*metaData.polarisations;
-      uint Nvissec         = Nweightss*metaData.freq_per_chan;
+      myuint Nsec            = histo_send[isector];
+      myuint Nweightss       = Nsec*metaData.polarisations;
+      myull  Nvissec         = (myull)Nweightss*metaData.freq_per_chan;
       double_t *memory     = (double*) malloc ( (Nsec*3)*sizeof(double_t) +
 						(Nvissec*2+Nweightss)*sizeof(float_t) );
 
@@ -109,26 +109,26 @@ void gridding_data()
       
       
       // select data for this sector
-      uint icount = 0;
-      uint ip = 0;
-      uint inu = 0;
+      myuint icount = 0;
+      myuint ip = 0;
+      myuint inu = 0;
 
       #warning "this loop should be threaded"
       #warning "the counter of this loop should not be int"
       for( int iphi = histo_send[isector]-1; iphi >=0 ; iphi--)
         {
 	  
-	  uint ilocal = sectorarray[isector][iphi];
+	  myuint ilocal = sectorarray[isector][iphi];
 
 	  uus[icount] = data.uu[ilocal];
 	  vvs[icount] = data.vv[ilocal]-isector*shift;
 	  wws[icount] = data.ww[ilocal];
-	  for (uint ipol=0; ipol<metaData.polarisations; ipol++)
+	  for (myuint ipol=0; ipol<metaData.polarisations; ipol++)
 	    {
 	      weightss[ip] = data.weights[ilocal*metaData.polarisations+ipol];
 	      ip++;
 	    }
-	  for (uint ifreq=0; ifreq<metaData.polarisations*metaData.freq_per_chan; ifreq++)
+	  for (myuint ifreq=0; ifreq<metaData.polarisations*metaData.freq_per_chan; ifreq++)
 	    {
 	      visreals[inu] = data.visreal[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq];
 	      visimgs[inu]  = data.visimg[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq];
@@ -150,7 +150,7 @@ void gridding_data()
 	double my_vvmax = -1e20;
 
        #pragma omp for 
-	for (uint ipart=0; ipart<Nsec; ipart++)
+	for (myuint ipart=0; ipart<Nsec; ipart++)
 	  {
 	    my_uumin = MIN(my_uumin, uus[ipart]);
 	    my_uumax = MAX(my_uumax, uus[ipart]);
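
On the (myull) cast introduced for Nvissec above: in C the product of two 32-bit unsigned operands is evaluated in 32 bits regardless of the destination type, so without the cast the multiplication wraps before the widening assignment ever happens. A two-line illustration:

    myuint a = 3000000, b = 2000;   /* a*b = 6.0e9, above 2^32                 */
    myull bad  = a * b;             /* wraps: 6000000000 - 2^32 = 1705032704   */
    myull good = (myull)a * b;      /* 64-bit multiply: 6000000000 as intended */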
diff --git a/gridding_nccl.cu b/gridding_nccl.cu
index 36b1d2cd4322b12c77efc4d59ac90ee144b4440f..8c0fd0077a0c66c1f53a2f46c48d930eac92810e 100755
--- a/gridding_nccl.cu
+++ b/gridding_nccl.cu
@@ -111,14 +111,14 @@ void gridding_data(){
 
   ncclCommInitRank(&comm, size, id, rank);
 
-  for (uint isector = 0; isector < nsectors; isector++)
+  for (myuint isector = 0; isector < nsectors; isector++)
     {
 
       double start = CPU_TIME_wt;
 
-      uint Nsec            = histo_send[isector];
-      uint Nweightss       = Nsec*metaData.polarisations;
-      uint Nvissec         = Nweightss*metaData.freq_per_chan;
+      myuint Nsec            = histo_send[isector];
+      myuint Nweightss       = Nsec*metaData.polarisations;
+      myull  Nvissec         = (myull)Nweightss*metaData.freq_per_chan;
       double_t *memory     = (double*) malloc ( (Nsec*3)*sizeof(double_t) +
 						(Nvissec*2+Nweightss)*sizeof(float_t) );
 
@@ -135,26 +135,26 @@ void gridding_data(){
       
       
       // select data for this sector
-      uint icount = 0;
-      uint ip = 0;
-      uint inu = 0;
+      myuint icount = 0;
+      myuint ip = 0;
+      myuint inu = 0;
 
       #warning "this loop should be threaded"
       #warning "the counter of this loop should not be int"
       for(int iphi = histo_send[isector]-1; iphi>=0; iphi--)
         {
 
-	  uint ilocal = sectorarray[isector][iphi];
+	  myuint ilocal = sectorarray[isector][iphi];
 
 	  uus[icount] = data.uu[ilocal];
 	  vvs[icount] = data.vv[ilocal]-isector*shift;
 	  wws[icount] = data.ww[ilocal];
-	  for (uint ipol=0; ipol<metaData.polarisations; ipol++)
+	  for (myuint ipol=0; ipol<metaData.polarisations; ipol++)
 	    {
 	      weightss[ip] = data.weights[ilocal*metaData.polarisations+ipol];
 	      ip++;
 	    }
-	  for (uint ifreq=0; ifreq<metaData.polarisations*metaData.freq_per_chan; ifreq++)
+	  for (myuint ifreq=0; ifreq<metaData.polarisations*metaData.freq_per_chan; ifreq++)
 	    {
 	      visreals[inu] = data.visreal[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq];
 	      visimgs[inu]  = data.visimg[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq];
@@ -177,7 +177,7 @@ void gridding_data(){
         double my_vvmax = -1e20;
 
        #pragma omp for
-        for (uint ipart=0; ipart<Nsec; ipart++)
+        for (myuint ipart=0; ipart<Nsec; ipart++)
           {
             my_uumin = MIN(my_uumin, uus[ipart]);
             my_uumax = MAX(my_uumax, uus[ipart]);
diff --git a/gridding_rccl.cpp b/gridding_rccl.cpp
deleted file mode 100644
index 8d0efd5e1bb500eb3551199f931873bcef2b0ced..0000000000000000000000000000000000000000
--- a/gridding_rccl.cpp
+++ /dev/null
@@ -1,273 +0,0 @@
-#include "allvars_rccl.h"
-#include "proto.h"
-#include <hip/hip_runtime.h>
-#include <rccl.h>
-
-/* 
- * Implements the gridding of data via GPU
- * by using NCCL library
- *
- */
-
-
-#if defined( RCCL_REDUCE )
-
-/*
-#define NCCLCHECK(cmd) do {                         
-ncclResult_t r = cmd;                             
-if (r!= ncclSuccess) {                            
-  printf("Failed, NCCL error %s:%d '%s'\n",       
-	 __FILE__,__LINE__,ncclGetErrorString(r));   
-  exit(EXIT_FAILURE);                             
- }                                                 
-} while(0)
-*/  
-
-static uint64_t getHostHash(const char* string) {
-  // Based on DJB2a, result = result * 33 ^ char                                                                                                 
-  uint64_t result = 5381;
-  for (int c = 0; string[c] != '\0'; c++){
-    result = ((result << 5) + result) ^ string[c];
-  }
-  return result;
-}
-
-
-static void getHostName(char* hostname, int maxlen) {
-  gethostname(hostname, maxlen);
-  for (int i=0; i< maxlen; i++) {
-    if (hostname[i] == '.') {
-        hostname[i] = '\0';
-        return;
-    }
-  }  
-}
-
-
-
-
-void gridding_data(){
-
-  double shift = (double)(dx*yaxis);
-
-  timing_wt.kernel     = 0.0;
-  timing_wt.reduce     = 0.0;
-  timing_wt.reduce_mpi = 0.0;
-  timing_wt.reduce_sh  = 0.0;
-  timing_wt.compose    = 0.0;
-  
-  // calculate the resolution in radians
-  resolution = 1.0/MAX(fabs(metaData.uvmin),fabs(metaData.uvmax));
-
-  // calculate the resolution in arcsec 
-  double resolution_asec = (3600.0*180.0)/MAX(fabs(metaData.uvmin),fabs(metaData.uvmax))/PI;
-  if ( rank == 0 )
-    printf("RESOLUTION = %f rad, %f arcsec\n", resolution, resolution_asec);
-
- 
-
-    //Initialize nccl
-
-  double * grid_gpu, *gridss_gpu;
-  int local_rank = 0;
-
-  uint64_t hostHashs[size];
-  char hostname[1024];
-  getHostName(hostname, 1024);
-  hostHashs[rank] = getHostHash(hostname);
-  MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD);
-  for (int p=0; p<size; p++) {
-     if (p == rank) break;
-     if (hostHashs[p] == hostHashs[rank]) local_rank++;
-  }
-  
-  ncclUniqueId id;
-  ncclComm_t comm;
-  hipStream_t stream_reduce;
-
-  if (rank == 0) ncclGetUniqueId(&id);
-  MPI_Bcast((void *)&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD);
-
-  hipSetDevice(local_rank);
-
-  hipMalloc(&grid_gpu, 2*param.num_w_planes*xaxis*yaxis * sizeof(double));
-  hipMalloc(&gridss_gpu, 2*param.num_w_planes*xaxis*yaxis * sizeof(double));
-  hipStreamCreate(&stream_reduce);
-  
-  ncclCommInitRank(&comm, size, id, rank);
-
-  for (uint isector = 0; isector < nsectors; isector++)
-    {
-
-      double start = CPU_TIME_wt;
-
-      uint Nsec            = histo_send[isector];
-      uint Nweightss       = Nsec*metaData.polarisations;
-      uint Nvissec         = Nweightss*metaData.freq_per_chan;
-      double_ty *memory     = (double*) malloc ( (Nsec*3)*sizeof(double_ty) +
-						(Nvissec*2+Nweightss)*sizeof(float_ty) );
-
-      if ( memory == NULL )
-	shutdown_wstacking(NOT_ENOUGH_MEM_STACKING, "Not enough memory for stacking", __FILE__, __LINE__);
-  
-      double_ty *uus        = (double_ty*) memory;
-      double_ty *vvs        = (double_ty*) uus+Nsec;
-      double_ty *wws        = (double_ty*) vvs+Nsec;
-      float_ty  *weightss   = (float_ty*)((double_t*)wws+Nsec);
-      float_ty  *visreals   = (float_ty*)weightss + Nweightss;
-      float_ty  *visimgs    = (float_ty*)visreals + Nvissec;
-  
-      
-      
-      // select data for this sector
-      uint icount = 0;
-      uint ip = 0;
-      uint inu = 0;
-
-      #warning "this loop should be threaded"
-      #warning "the counter of this loop should not be int"
-      for(int iphi = histo_send[isector]-1; iphi>=0; iphi--)
-        {
-
-	  uint ilocal = sectorarray[isector][iphi];
-
-	  uus[icount] = data.uu[ilocal];
-	  vvs[icount] = data.vv[ilocal]-isector*shift;
-	  wws[icount] = data.ww[ilocal];
-	  for (uint ipol=0; ipol<metaData.polarisations; ipol++)
-	    {
-	      weightss[ip] = data.weights[ilocal*metaData.polarisations+ipol];
-	      ip++;
-	    }
-	  for (uint ifreq=0; ifreq<metaData.polarisations*metaData.freq_per_chan; ifreq++)
-	    {
-	      visreals[inu] = data.visreal[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq];
-	      visimgs[inu]  = data.visimg[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq];
-
-	      inu++;
-	    }
-	  icount++;
-	}
-
-      double uumin = 1e20;
-      double vvmin = 1e20;
-      double uumax = -1e20;
-      double vvmax = -1e20;
-
-      #pragma omp parallel reduction( min: uumin, vvmin) reduction( max: uumax, vvmax) num_threads(param.num_threads)
-      {
-        double my_uumin = 1e20;
-        double my_vvmin = 1e20;
-        double my_uumax = -1e20;
-        double my_vvmax = -1e20;
-
-       #pragma omp for
-        for (uint ipart=0; ipart<Nsec; ipart++)
-          {
-            my_uumin = MIN(my_uumin, uus[ipart]);
-            my_uumax = MAX(my_uumax, uus[ipart]);
-            my_vvmin = MIN(my_vvmin, vvs[ipart]);
-            my_vvmax = MAX(my_vvmax, vvs[ipart]);
-          }
-
-        uumin = MIN( uumin, my_uumin );
-        uumax = MAX( uumax, my_uumax );
-        vvmin = MIN( vvmin, my_vvmin );
-        vvmax = MAX( vvmax, my_vvmax );
-      }
-
-      timing_wt.compose += CPU_TIME_wt - start;
-
-      //printf("UU, VV, min, max = %f %f %f %f\n", uumin, uumax, vvmin, vvmax);
-      
-      // Make convolution on the grid
-
-     #ifdef VERBOSE
-      printf("Processing sector %ld\n",isector);
-     #endif
-
-      start = CPU_TIME_wt;
-
-      double *stacking_target_array;
-      if ( size > 1 )
-	stacking_target_array = gridss;
-      else
-	stacking_target_array = grid;
-      
-     //We have to call different GPUs per MPI task!!! [GL]
-      wstack(param.num_w_planes,
-	     Nsec,
-	     metaData.freq_per_chan,
-	     metaData.polarisations,
-	     uus,
-	     vvs,
-	     wws,
-	     visreals,
-	     visimgs,
-	     weightss,
-	     dx,
-	     dw,
-	     param.w_support,
-	     xaxis,
-	     yaxis,
-	     stacking_target_array,
-	     param.num_threads,
-	     rank);
-      //Allocate memory on devices non-blocking for the host                                                                                   
-      ///////////////////////////////////////////////////////
-
-
-      timing_wt.kernel += CPU_TIME_wt - start;
-	    
-      hipMemcpyAsync(gridss_gpu, gridss, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), hipMemcpyHostToDevice, stream_reduce);
-      
-     #ifdef VERBOSE
-      printf("Processed sector %ld\n",isector);
-     #endif
-      
-
-      if( size > 1 )
-	{
-     
-	  // Write grid in the corresponding remote slab
-     
-	  // int target_rank = (int)isector;    it implied that size >= nsectors
-	  int target_rank = (int)(isector % size);
-	  
-	  	  
-	  hipStreamSynchronize(stream_reduce);
-    
-          start = CPU_TIME_wt;
-
-	  ncclReduce(gridss_gpu, grid_gpu, size_of_grid, ncclDouble, ncclSum, target_rank, comm, stream_reduce);
-	  hipStreamSynchronize(stream_reduce);
-      
-	  timing_wt.reduce += CPU_TIME_wt - start;
-
-		  
-	  // Go to next sector
-	  memset ( gridss, 0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) );	  
-	}	
-
-      free(memory);
-    }
-
-  //Copy data back from device to host (to be deleted in next steps)
-
-  hipMemcpyAsync(grid, grid_gpu, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), hipMemcpyDeviceToHost, stream_reduce);
-  
-  MPI_Barrier(MPI_COMM_WORLD);
-
-  hipStreamSynchronize(stream_reduce);
-  hipFree(gridss_gpu);
-  hipFree(grid_gpu);
-  
-  hipStreamDestroy(stream_reduce);
-  
-  ncclCommDestroy(comm);
-
-  return;
-  
-}
-
-#endif
diff --git a/gridding_rccl.hip.cpp b/gridding_rccl.hip.cpp
index e54fa4d6d0a28f83d4c42716d50036466541eedb..19b93c8cb2f9cee3991c7c3d5aa0fd92483e8e0d 100755
--- a/gridding_rccl.hip.cpp
+++ b/gridding_rccl.hip.cpp
@@ -101,14 +101,14 @@ void gridding_data(){
   
   ncclCommInitRank(&comm, size, id, rank);
 
-  for (uint isector = 0; isector < nsectors; isector++)
+  for (myuint isector = 0; isector < nsectors; isector++)
     {
 
       double start = CPU_TIME_wt;
 
-      uint Nsec            = histo_send[isector];
-      uint Nweightss       = Nsec*metaData.polarisations;
-      uint Nvissec         = Nweightss*metaData.freq_per_chan;
+      myuint Nsec            = histo_send[isector];
+      myuint Nweightss       = Nsec*metaData.polarisations;
+      myull  Nvissec         = (myull)Nweightss*metaData.freq_per_chan;
       double_t *memory     = (double*) malloc ( (Nsec*3)*sizeof(double_t) +
 						(Nvissec*2+Nweightss)*sizeof(float_t) );
 
@@ -125,26 +125,26 @@ void gridding_data(){
       
       
       // select data for this sector
-      uint icount = 0;
-      uint ip = 0;
-      uint inu = 0;
+      myuint icount = 0;
+      myuint ip = 0;
+      myuint inu = 0;
 
       #warning "this loop should be threaded"
       #warning "the counter of this loop should not be int"
       for(int iphi = histo_send[isector]-1; iphi>=0; iphi--)
         {
 
-	  uint ilocal = sectorarray[isector][iphi];
+	  myuint ilocal = sectorarray[isector][iphi];
 
 	  uus[icount] = data.uu[ilocal];
 	  vvs[icount] = data.vv[ilocal]-isector*shift;
 	  wws[icount] = data.ww[ilocal];
-	  for (uint ipol=0; ipol<metaData.polarisations; ipol++)
+	  for (myuint ipol=0; ipol<metaData.polarisations; ipol++)
 	    {
 	      weightss[ip] = data.weights[ilocal*metaData.polarisations+ipol];
 	      ip++;
 	    }
-	  for (uint ifreq=0; ifreq<metaData.polarisations*metaData.freq_per_chan; ifreq++)
+	  for (myuint ifreq=0; ifreq<metaData.polarisations*metaData.freq_per_chan; ifreq++)
 	    {
 	      visreals[inu] = data.visreal[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq];
 	      visimgs[inu]  = data.visimg[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq];
@@ -167,7 +167,7 @@ void gridding_data(){
         double my_vvmax = -1e20;
 
        #pragma omp for
-        for (uint ipart=0; ipart<Nsec; ipart++)
+        for (myuint ipart=0; ipart<Nsec; ipart++)
           {
             my_uumin = MIN(my_uumin, uus[ipart]);
             my_uumax = MAX(my_uumax, uus[ipart]);
diff --git a/init.c b/init.c
index 76566bbe6eaca15c81a274a5d3e9ca08d63f057f..79a7c5d514f43309d1b60c6ba2114ec32322713b 100755
--- a/init.c
+++ b/init.c
@@ -83,7 +83,15 @@ void op_filename() {
    	strcpy(buf, num_buf);
    	strcat(buf, outparam.fftfile);
    	strcpy(out.fftfile, buf);
-#ifdef WRITE_DATA	
+#ifdef WRITE_DATA
+        strcpy(buf, num_buf);
+        strcat(buf, outparam.gridded_writedata1);
+        strcpy(out.gridded_writedata1, buf);
+
+        strcpy(buf, num_buf);
+        strcat(buf, outparam.gridded_writedata2);
+        strcpy(out.gridded_writedata2, buf);
+
         strcpy(buf, num_buf);
         strcat(buf, outparam.fftfile_writedata1);
         strcpy(out.fftfile_writedata1, buf);
@@ -213,6 +221,14 @@ void read_parameter_file(char *fname)
 		  strcpy(outparam.fftfile, buf2);
 		}
 #ifdef WRITE_DATA
+	      if(strcmp(buf1, "gridded_writedata1") == 0)
+                {
+                  strcpy(outparam.gridded_writedata1, buf2);
+                }
+	      if(strcmp(buf1, "gridded_writedata2") == 0)
+                {
+                  strcpy(outparam.gridded_writedata2, buf2);
+                }
 	      if(strcmp(buf1, "fftfile_writedata1") == 0)
                 {
                   strcpy(outparam.fftfile_writedata1, buf2);
@@ -334,15 +350,15 @@ void metaData_calculation() {
      }
 
      // Set temporary local size of points
-     uint nm_pe = (uint)(metaData.Nmeasures/size);
-     uint remaining = metaData.Nmeasures%size;
+     myuint nm_pe = (myuint)(metaData.Nmeasures/size);
+     myuint remaining = metaData.Nmeasures%size;
 
      startrow = rank*nm_pe;
      if (rank == size-1)nm_pe = nm_pe+remaining;
 
-     uint Nmeasures_tot = metaData.Nmeasures;
+     //myuint Nmeasures_tot = metaData.Nmeasures;
      metaData.Nmeasures = nm_pe;
-     unsigned long Nvis_tot = metaData.Nvis;
+     //unsigned long Nvis_tot = metaData.Nvis;
      metaData.Nvis = metaData.Nmeasures*metaData.freq_per_chan*metaData.polarisations;
      metaData.Nweights = metaData.Nmeasures*metaData.polarisations;
 
diff --git a/main.c b/main.c
index eece36980e15f57753c2fe9d4eaaba92c7d8634c..63540cd88be41a225d8779803b0c579119bd783d 100755
--- a/main.c
+++ b/main.c
@@ -107,8 +107,8 @@ int main(int argc, char * argv[])
   char testfitsreal[NAME_LEN] = "parallel_real.fits";
   char testfitsimag[NAME_LEN] = "parallel_img.fits";
 
-  uint naxis = 2;
-  uint naxes[2] = { grid_size_x, grid_size_y };
+  int  naxis = 2;
+  long naxes[2] = { grid_size_x, grid_size_y };
  #endif
 
   
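
The int/long types kept for naxis/naxes above (rather than myuint) follow cfitsio's signatures: image creation and subset I/O take an int axis count and long axis/pixel arrays, e.g. fits_create_img(fitsfile *fptr, int bitpix, int naxis, long *naxes, int *status), so passing a 4-byte unsigned array where the library reads 8-byte longs would corrupt the call. The same reasoning applies to the fpixel/lpixel buffers in fourier_transform.c.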
diff --git a/w-stacking.cu b/w-stacking.cu
index 9aa0d6684489352a5f173330f08be64257ec39a2..d7f1c36690948a0103ef9251e2a24e97ce9475eb 100755
--- a/w-stacking.cu
+++ b/w-stacking.cu
@@ -103,9 +103,9 @@ void makeKaiserBesselKernel(double * kernel,
 
 __global__ void convolve_g(
 			   int num_w_planes,
-			   uint num_points,
-			   uint freq_per_chan,
-			   uint polarizations,
+			   myuint num_points,
+			   myuint freq_per_chan,
+			   myuint polarizations,
 			   double* uu,
 			   double* vv,
 			   double* ww,
@@ -130,11 +130,11 @@ __global__ void convolve_g(
 
 {
   //printf("DENTRO AL KERNEL\n");
-  uint gid = blockIdx.x*blockDim.x + threadIdx.x;
+  myuint gid = blockIdx.x*blockDim.x + threadIdx.x;
   if(gid < num_points)
     {
-      uint i = gid;
-      uint visindex = i*freq_per_chan*polarizations;
+      myuint i = gid;
+      unsigned long visindex = (unsigned long)i*freq_per_chan*polarizations;
       double norm = std22/PI;
 
       int j, k;
@@ -149,10 +149,10 @@ __global__ void convolve_g(
       int grid_v = (int)pos_v;
 
       // check the boundaries
-      uint jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0;
-      uint jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1;
-      uint kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0;
-      uint kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1;
+      myuint jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0;
+      myuint jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1;
+      myuint kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0;
+      myuint kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1;
 
 
       // Convolve this point onto the grid.
@@ -165,7 +165,7 @@ __global__ void convolve_g(
 	  for (j = jmin; j <= jmax; j++)
             {
 	      double u_dist = (double)j+0.5 - pos_u;
-	      uint iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y);
+	      myuint iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y);
 	      int jKer = (int)(increaseprecision * (fabs(u_dist+(double)KernelLen)));
 	      int kKer = (int)(increaseprecision * (fabs(v_dist+(double)KernelLen)));
 	      
@@ -182,11 +182,11 @@ __global__ void convolve_g(
 	      // Loops over frequencies and polarizations
 	      double add_term_real = 0.0;
 	      double add_term_img = 0.0;
-	      uint ifine = visindex;
-	      for (uint ifreq=0; ifreq<freq_per_chan; ifreq++)
+	      unsigned long ifine = visindex;
+	      for (myuint ifreq=0; ifreq<freq_per_chan; ifreq++)
 		{
-		  uint iweight = visindex/freq_per_chan;
-		  for (uint ipol=0; ipol<polarizations; ipol++)
+		  myuint iweight = visindex/freq_per_chan;
+		  for (myuint ipol=0; ipol<polarizations; ipol++)
 		    {
                       double vistest = (double)vis_real[ifine];
                       if (!isnan(vistest))
@@ -209,10 +209,14 @@ __global__ void convolve_g(
 #pragma  omp declare target
 #endif
 void wstack(
+#ifdef __CUDACC__
 	    long long unsigned num_w_planes,
-	    uint num_points,
-	    uint freq_per_chan,
-	    uint polarizations,
+#else
+	    int num_w_planes,
+#endif
+	    myuint num_points,
+	    myuint freq_per_chan,
+	    myuint polarizations,
 	    double* uu,
 	    double* vv,
 	    double* ww,
@@ -222,8 +226,13 @@ void wstack(
 	    double dx,
 	    double dw,
 	    int w_support,
+#ifdef __CUDACC__
 	    long long unsigned grid_size_x,
 	    long long unsigned grid_size_y,
+#else
+	    int grid_size_x,
+	    int grid_size_y,
+#endif
 	    double* grid,
 	    int num_threads,
 #ifdef NCCL_REDUCE
@@ -234,9 +243,9 @@ void wstack(
 #endif
             )
 {
-  uint i;
-  //uint index;
-  uint visindex;
+  myuint i;
+  //myuint index;
+  unsigned long visindex;
   
   // initialize the convolution kernel
   // gaussian:
@@ -263,9 +272,9 @@ void wstack(
 #ifdef __CUDACC__
   // Define the CUDA set up
   int Nth = NTHREADS;
-  uint Nbl = (uint)(num_points/Nth) + 1;
+  myuint Nbl = (myuint)(num_points/Nth) + 1;
   if(NWORKERS == 1) {Nbl = 1; Nth = 1;};
-  uint Nvis = num_points*freq_per_chan*polarizations;
+  unsigned long Nvis = (unsigned long)num_points*freq_per_chan*polarizations;
   
   int ndevices;
   cudaGetDeviceCount(&ndevices);
@@ -432,7 +441,7 @@ void wstack(
 
 #if defined(ACCOMP) && (GPU_STACKING)
   omp_set_default_device(rank % omp_get_num_devices());
-  uint Nvis = num_points*freq_per_chan*polarizations;
+  unsigned long Nvis = (unsigned long)num_points*freq_per_chan*polarizations;
 #pragma omp target teams distribute parallel for private(visindex) map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan]) map(tofrom:grid[0:2*num_w_planes*grid_size_x*grid_size_y])
 #else
 #pragma omp parallel for private(visindex)
@@ -461,10 +470,10 @@ void wstack(
       int grid_v = (int)pos_v;
 
       // check the boundaries
-      uint jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0;
-      uint jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1;
-      uint kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0;
-      uint kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1;
+      myuint jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0;
+      myuint jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1;
+      myuint kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0;
+      myuint kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1;
       //printf("%d, %ld, %ld, %d, %ld, %ld\n",grid_u,jmin,jmax,grid_v,kmin,kmax);
 
 
@@ -479,7 +488,7 @@ void wstack(
             {
 	      double u_dist = (double)j+0.5 - pos_u;
 	      //double u_dist = (double)j - pos_u;
-	      uint iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y);
+	      myuint iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y);
 	      int jKer = (int)(increaseprecision * (fabs(u_dist+(double)KernelLen)));
 	      int kKer = (int)(increaseprecision * (fabs(v_dist+(double)KernelLen)));
 
@@ -498,12 +507,12 @@ void wstack(
 	      // Loops over frequencies and polarizations
 	      double add_term_real = 0.0;
 	      double add_term_img = 0.0;
-	      uint ifine = visindex;
+	      unsigned long ifine = visindex;
 	      // DAV: the following two loops are performend by each thread separately: no problems of race conditions
-	      for (uint ifreq=0; ifreq<freq_per_chan; ifreq++)
+	      for (myuint ifreq=0; ifreq<freq_per_chan; ifreq++)
 		{
-		  uint iweight = visindex/freq_per_chan;
-		  for (uint ipol=0; ipol<polarizations; ipol++)
+		  myuint iweight = visindex/freq_per_chan;
+		  for (myuint ipol=0; ipol<polarizations; ipol++)
 		    {
                       if (!isnan(vis_real[ifine]))
 			{
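
The same 32-bit-multiply rule noted for Nvissec in gridding_cpu.c applies inside the kernel above: i, freq_per_chan and polarizations are all myuint, so the initializers of visindex and Nvis cast the first operand to unsigned long before multiplying. On the device side this is sound because CUDA keeps long at the host's size, i.e. 8 bytes on LP64 Linux targets.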
diff --git a/w-stacking.hip.cpp b/w-stacking.hip.cpp
index 5c376f3c7d8a6566d84593bbeafe7dd0162edfe9..bc5b7301b7f039b1bf6495b08e535757a6d5204a 100755
--- a/w-stacking.hip.cpp
+++ b/w-stacking.hip.cpp
@@ -99,9 +99,9 @@ void makeKaiserBesselKernel(double * kernel,
 
 __global__ void convolve_g(
 			   int num_w_planes,
-			   uint num_points,
-			   uint freq_per_chan,
-			   uint polarizations,
+			   myuint num_points,
+			   myuint freq_per_chan,
+			   myuint polarizations,
 			   double* uu,
 			   double* vv,
 			   double* ww,
@@ -126,11 +126,11 @@ __global__ void convolve_g(
 
 {
   //printf("DENTRO AL KERNEL\n");
-  uint gid = blockIdx.x*blockDim.x + threadIdx.x;
+  myuint gid = blockIdx.x*blockDim.x + threadIdx.x;
   if(gid < num_points)
     {
-      uint i = gid;
-      uint visindex = i*freq_per_chan*polarizations;
+      myuint i = gid;
+      unsigned long visindex = (unsigned long)i*freq_per_chan*polarizations;
       double norm = std22/PI;
 
       int j, k;
@@ -145,10 +145,10 @@ __global__ void convolve_g(
       int grid_v = (int)pos_v;
 
       // check the boundaries
-      uint jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0;
-      uint jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1;
-      uint kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0;
-      uint kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1;
+      myuint jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0;
+      myuint jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1;
+      myuint kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0;
+      myuint kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1;
 
 
       // Convolve this point onto the grid.
@@ -161,7 +161,7 @@ __global__ void convolve_g(
 	  for (j = jmin; j <= jmax; j++)
             {
 	      double u_dist = (double)j+0.5 - pos_u;
-	      uint iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y);
+	      myuint iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y);
 	      int jKer = (int)(increaseprecision * (fabs(u_dist+(double)KernelLen)));
 	      int kKer = (int)(increaseprecision * (fabs(v_dist+(double)KernelLen)));
 	      
@@ -178,11 +178,11 @@ __global__ void convolve_g(
 	      // Loops over frequencies and polarizations
 	      double add_term_real = 0.0;
 	      double add_term_img = 0.0;
-	      uint ifine = visindex;
-	      for (uint ifreq=0; ifreq<freq_per_chan; ifreq++)
+	      unsigned long ifine = visindex;
+	      for (myuint ifreq=0; ifreq<freq_per_chan; ifreq++)
 		{
-		  uint iweight = visindex/freq_per_chan;
-		  for (uint ipol=0; ipol<polarizations; ipol++)
+		  myuint iweight = visindex/freq_per_chan;
+		  for (myuint ipol=0; ipol<polarizations; ipol++)
 		    {
                       double vistest = (double)vis_real[ifine];
                       if (!isnan(vistest))
@@ -206,9 +206,9 @@ __global__ void convolve_g(
 #endif
 void wstack(
      int num_w_planes,
-     uint num_points,
-     uint freq_per_chan,
-     uint polarizations,
+     myuint num_points,
+     myuint freq_per_chan,
+     myuint polarizations,
      double* uu,
      double* vv,
      double* ww,
@@ -231,9 +231,9 @@ void wstack(
 	    )
 	    
 {
-    uint i;
-    //uint index;
-    uint visindex;
+    myuint i;
+    //myuint index;
+    unsigned long visindex;
 
     // initialize the convolution kernel
     // gaussian:
@@ -260,9 +260,9 @@ void wstack(
    #ifdef __HIPCC__
     // Define the HIP set up
     int Nth = NTHREADS;
-    uint Nbl = (uint)(num_points/Nth) + 1;
+    myuint Nbl = (myuint)(num_points/Nth) + 1;
     if(NWORKERS == 1) {Nbl = 1; Nth = 1;};
-    uint Nvis = num_points*freq_per_chan*polarizations;
+    unsigned long Nvis = (unsigned long)num_points*freq_per_chan*polarizations;
 
     int ndevices;
     int num = hipGetDeviceCount(&ndevices);
@@ -404,7 +404,7 @@ void wstack(
 
    #if defined(ACCOMP) && (GPU_STACKING)
     omp_set_default_device(rank % omp_get_num_devices());
-    uint Nvis = num_points*freq_per_chan*polarizations;
+    unsigned long Nvis = (unsigned long)num_points*freq_per_chan*polarizations;
    #pragma omp target teams distribute parallel for private(visindex) map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan]) map(tofrom:grid[0:2*num_w_planes*grid_size_x*grid_size_y])
 #else
     #pragma omp parallel for private(visindex)
@@ -433,10 +433,10 @@ void wstack(
         int grid_v = (int)pos_v;
 
 	// check the boundaries
-	uint jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0;
-	uint jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1;
-	uint kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0;
-	uint kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1;
+	myuint jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0;
+	myuint jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1;
+	myuint kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0;
+	myuint kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1;
         //printf("%d, %ld, %ld, %d, %ld, %ld\n",grid_u,jmin,jmax,grid_v,kmin,kmax);
 
 
@@ -451,7 +451,7 @@ void wstack(
             {
                 //double u_dist = (double)j+0.5 - pos_u;
                 double u_dist = (double)j - pos_u;
-		uint iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y);
+		myuint iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y);
                 int jKer = (int)(increaseprecision * (fabs(u_dist+(double)KernelLen)));
                 int kKer = (int)(increaseprecision * (fabs(v_dist+(double)KernelLen)));
 
@@ -470,12 +470,12 @@ void wstack(
 		// Loops over frequencies and polarizations
 		double add_term_real = 0.0;
 		double add_term_img = 0.0;
-		uint ifine = visindex;
+		unsigned long ifine = visindex;
 		// DAV: the following two loops are performend by each thread separately: no problems of race conditions
-		for (uint ifreq=0; ifreq<freq_per_chan; ifreq++)
+		for (myuint ifreq=0; ifreq<freq_per_chan; ifreq++)
 		{
-		   uint iweight = visindex/freq_per_chan;
-	           for (uint ipol=0; ipol<polarizations; ipol++)
+		   myuint iweight = visindex/freq_per_chan;
+	           for (myuint ipol=0; ipol<polarizations; ipol++)
 	           {
                       if (!isnan(vis_real[ifine]))
                       {
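
Both GPU paths now lean on unsigned long being 64-bit, which holds on LP64 Linux but not on LLP64 platforms. A cheap guard near the top of w-stacking.cu / w-stacking.hip.cpp would make the assumption explicit — a sketch, not present in the sources:

    #include <assert.h>  /* C11 static_assert (a keyword in C++) */
    static_assert(sizeof(unsigned long) == 8,
                  "w-stacking index arithmetic assumes 64-bit unsigned long (LP64)");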