diff --git a/Makefile b/Makefile
index 9d6921cc0895d8139818e537a00c10cdf7d05f2e..165575e64fbf47d15af899c8ddd38c9bb67d845f 100644
--- a/Makefile
+++ b/Makefile
@@ -33,22 +33,18 @@ OPT += -DNVIDIA
 # write the final image
 OPT += -DWRITE_IMAGE
 # perform w-stacking phase correction
-<<<<<<< HEAD
 OPT += -DPHASE_ON
 # GPU support for FFT using cuFFTMP
 OPT += -DCUDA_FFT
-# Support CFITSIO
-=======
-#OPT += -DPHASE_ON
 # Support CFITSIO !!! Remember to add the path to the CFITSIO library to LD_LIBRARY_PATH
->>>>>>> main
 #OPT += -DFITSIO
 # Perform true parallel images writing
 #OPT += -DPARALLELIO
 # Normalize uvw in case it is not done in the binMS
-OPT += -DNORMALIZE_UVW
+#OPT += -DNORMALIZE_UVW
 # Gridding kernel: GAUSS, GAUSS_HI_PRECISION, KAISERBESSEL
-OPT += -DGAUSS_HI_PRECISION
+OPT += -DGAUSS
+#OPT += -DVERBOSE
 
 ifeq (FITSIO,$(findstring FITSIO,$(OPT)))
 	LIBS += -L$(FITSIO_LIB) -lcfitsio
@@ -100,3 +96,5 @@ clean:
 	rm *.o
 	rm w-stacking.c
 	rm phase_correction.c
+# module load spectrum_mpi/10.3.1--binary && module load fftw && module load cuda/11.3 && module load profile/candidate && module load hpc-sdk/2022--binary && LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/m100_work/IscrC_CD-DLS/cfitsio-3.49 \
+# && LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/cineca/prod/opt/compilers/hpc-sdk/2022/binary/Linux_ppc64le/2022/math_libs/11.8/lib64 && export LD_LIBRARY_PATH
diff --git a/phase_correction.cu b/phase_correction.cu
index 98bd6cc96939e214942e86cf947af32332841e40..1be47b2bffc7e39b164169c66f8b5f5ecabf2860 100644
--- a/phase_correction.cu
+++ b/phase_correction.cu
@@ -98,7 +98,7 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in
 	// WARNING: nbucket MUST be chosen such that xaxis*yaxis*num_w_planes is a multiple of nbucket
 	int nbucket = 1;
         int Nth = NTHREADS;
-        long Nbl = (long)((num_w_planes*xaxis*yaxis)/Nth/nbucket) + 1;
+        long Nbl = (long)((num_w_planes*xaxis*yaxis)/nbucket/Nth) + 1;
         if(NWORKERS == 1) {Nbl = 1; Nth = 1;};
         printf("Running on GPU with %d threads and %d blocks\n",Nth,Nbl);
 	
diff --git a/w-stacking-fftw.c b/w-stacking-fftw.c
index 70024c21be588c92273cdc3d47c17f689db920e3..3d274a6fcbe830e5e3a921dcba94dbb0ca958bb6 100644
--- a/w-stacking-fftw.c
+++ b/w-stacking-fftw.c
@@ -106,8 +106,8 @@ int main(int argc, char * argv[])
 	double resolution;
 
         // Mesh related parameters: global size
-	int grid_size_x = 4096;
-	int grid_size_y = 4096;
+	int grid_size_x = 8192;
+	int grid_size_y = 8192;
 	// Split Mesh size (auto-calculated)
 	int local_grid_size_x;
 	int local_grid_size_y;
@@ -115,7 +115,7 @@ int main(int argc, char * argv[])
 	int yaxis;
 
 	// Number of planes in the w direction
-	int num_w_planes = 1;
+	int num_w_planes = 2;
 
 	// Size of the convoutional kernel support
 	int w_support = 7;
@@ -131,23 +131,13 @@ int main(int argc, char * argv[])
 
 
 	// Initialize FITS image parameters
-<<<<<<< HEAD
 
 #ifdef FITSIO
 	fitsfile *fptreal;
 	fitsfile *fptrimg;
 	int status;
-	char testfitsreal[FILENAMELENGTH] = "parallel_np8_real.fits";
-	char testfitsimag[FILENAMELENGTH] = "parallel_np8_img.fits";
-=======
-	#ifdef FITSIO
-	fitsfile *fptreal;
-	fitsfile *fptrimg;
-	int status;
-	char testfitsreal[FILENAMELENGTH] = "parallel_np2_real.fits";
-	char testfitsimag[FILENAMELENGTH] = "parallel_np2_img.fits";
-        #endif
->>>>>>> main
+	char testfitsreal[FILENAMELENGTH] = "parallel_real.fits";
+	char testfitsimag[FILENAMELENGTH] = "parallel_img.fits";
 
 	long naxis = 2;
 	long naxes[2] = { grid_size_x, grid_size_y };
@@ -241,7 +231,9 @@ if(rank == 0){
 
         // INPUT FILES (only the first ndatasets entries are used)
 	int ndatasets = 1;
-        strcpy(datapath_multi[0],"/m100_scratch/userexternal/cgheller/SKA3/ZW2_IFRQ_0-1-5.binMS/");
+//	strcpy(datapath_multi[0],"/m100_scratch/userexternal/cgheller/SKA3/ZW2_IFRQ_0-1-5.binMS/");
+	strcpy(datapath_multi[0],"/m100_scratch/userexternal/ederubei/L798046_SB244_uv.uncorr_130B27932t_146MHz.pre-cal.binMS/");
+
 
 	strcpy(datapath,datapath_multi[0]);
 	// Read metadata
@@ -266,7 +258,7 @@ if(rank == 0){
 	// WATCH THIS!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 	int nsub = 1000;
 	//int nsub = 10;
-	if(rank == 0)printf("Subtracting last %d measurements\n",nsub);
+	printf("Subtracting last %d measurements\n",nsub);
         Nmeasures = Nmeasures-nsub;
 	Nvis = Nmeasures*freq_per_chan*polarisations;
 
@@ -345,8 +337,8 @@ if(rank == 0){
 	double maxu = -1e20;
 	double maxv = -1e20;
 	double maxw = -1e20;
-	
-        for (long inorm=0; inorm<Nmeasures; inorm++)
+
+	for (long inorm=0; inorm<Nmeasures; inorm++)
 	    {
                minu = MIN(minu,uu[inorm]);
                minv = MIN(minv,vv[inorm]);
@@ -504,7 +496,7 @@ if(rank == 0){
 	for (int ifiles=0; ifiles<ndatasets; ifiles++)
 	{
 	strcpy(filename,datapath_multi[ifiles]);
-        if(rank == 0)printf("Processing %s, %d of %d\n",filename,ifiles+1,ndatasets);
+        printf("Processing %s, %d of %d\n",filename,ifiles+1,ndatasets);
 
         // Read metadata
         strcpy(filename,datapath);
@@ -528,7 +520,7 @@ if(rank == 0){
         resolution = 1.0/MAX(abs(uvmin),abs(uvmax));
         // calculate the resolution in arcsec
         double resolution_asec = (3600.0*180.0)/MAX(abs(uvmin),abs(uvmax))/PI;
-        if(rank == 0)printf("RESOLUTION = %f rad, %f arcsec\n", resolution, resolution_asec);
+        printf("RESOLUTION = %f rad, %f arcsec\n", resolution, resolution_asec);
 
         strcpy(filename,datapath);
         strcat(filename,weightsfile);
diff --git a/w-stacking.cu b/w-stacking.cu
index cbad05c30e47fdb45ff8c86e424f2f00967c5522..fa7ad81409e48ff9d639f83225f116e5dce4cd53 100644
--- a/w-stacking.cu
+++ b/w-stacking.cu
@@ -59,7 +59,7 @@ void makeKaiserBesselKernel(double * kernel,
                             double overSamplingFactor,
                             int withSinc) {
   int n = increaseprecision*KernelLen, mid = n / 2;
-  double * sincKernel = malloc((mid + 1)*sizeof(*sincKernel));
+  double * sincKernel = (double*)malloc((mid + 1) * sizeof(*sincKernel));
   const double filterRatio = 1.0 / overSamplingFactor;
   sincKernel[0] = filterRatio;
   for (int i = 1; i != mid + 1; i++) {
@@ -209,7 +209,7 @@ void wstack(
     double std = 1.0;
     double std22 = 1.0/(2.0*std*std);
     double norm = std22/PI;
-    double * convkernel = malloc(increaseprecision*w_support*sizeof(*convkernel));
+    double * convkernel = (double*)malloc(increaseprecision*w_support*sizeof(*convkernel));
     double overSamplingFactor = 1.0;
     int withSinc = 0;
     double alpha = 8.6;
@@ -220,7 +220,7 @@ void wstack(
     makeKaiserBesselKernel(convkernel, w_support, increaseprecision, alpha, overSamplingFactor, withSinc);
     #endif
 
-    
+
     // Loop over visibilities.
 // Switch between CUDA and GPU versions
 #ifdef __CUDACC__
@@ -249,7 +249,10 @@ void wstack(
     mmm=cudaMalloc(&vis_img_g,Nvis*sizeof(float));
     mmm=cudaMalloc(&weight_g,(Nvis/freq_per_chan)*sizeof(float));
     mmm=cudaMalloc(&grid_g,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double));
+    if (mmm != cudaSuccess) {printf("!!! w-stacking.cu cudaMalloc ERROR %d !!!\n", mmm);}
     mmm=cudaMemset(grid_g,0.0,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double));
+    if (mmm != cudaSuccess) {printf("!!! w-stacking.cu cudaMemset ERROR %d !!!\n", mmm);}
+
 
     mmm=cudaMemcpy(uu_g, uu, num_points*sizeof(double), cudaMemcpyHostToDevice);
     mmm=cudaMemcpy(vv_g, vv, num_points*sizeof(double), cudaMemcpyHostToDevice);
@@ -257,6 +260,7 @@ void wstack(
     mmm=cudaMemcpy(vis_real_g, vis_real, Nvis*sizeof(float), cudaMemcpyHostToDevice);
     mmm=cudaMemcpy(vis_img_g, vis_img, Nvis*sizeof(float), cudaMemcpyHostToDevice);
     mmm=cudaMemcpy(weight_g, weight, (Nvis/freq_per_chan)*sizeof(float), cudaMemcpyHostToDevice);
+    if (mmm != cudaSuccess) {printf("!!! w-stacking.cu cudaMemcpy ERROR %d !!!\n", mmm);}
 
     // Call main GPU Kernel
     convolve_g <<<Nbl,Nth>>> (
@@ -348,7 +352,7 @@ void wstack(
                 int jKer = (int)(increaseprecision * (fabs(u_dist+(double)KernelLen)));
                 int kKer = (int)(increaseprecision * (fabs(v_dist+(double)KernelLen)));
 
-                #ifdef GAUSS_HI_PRECISION 
+                #ifdef GAUSS_HI_PRECISION
 		double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist);
                 #endif
                 #ifdef GAUSS
@@ -359,7 +363,7 @@ void wstack(
                 #endif
                 #ifdef KAISERBESSEL
 		double conv_weight = convkernel[jKer]*convkernel[kKer];
-	        #endif	
+	        #endif
 		// Loops over frequencies and polarizations
 		double add_term_real = 0.0;
 		double add_term_img = 0.0;