From eacb3f9ec9ff661d2ba720d22d8922f3298f8104 Mon Sep 17 00:00:00 2001
From: Claudio Gheller <cgheller@login02.m100.cineca.it>
Date: Thu, 11 Nov 2021 17:31:13 +0100
Subject: [PATCH] bug fix in the CUDA implementation. nbucket defined to reduce
 the number of blocks

---
 phase_correction.cu | 26 +++++++++++++++++---------
 w-stacking-fftw.c   |  8 ++++----
 2 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/phase_correction.cu b/phase_correction.cu
index b581471..d43e4cf 100644
--- a/phase_correction.cu
+++ b/phase_correction.cu
@@ -19,20 +19,25 @@ __global__ void phase_g(int xaxis,
 			double dwnorm,
 			int xaxistot,
 			int yaxistot,
-			double resolution)
+			double resolution,
+			int nbucket)
 {
 	long gid = blockIdx.x*blockDim.x + threadIdx.x;
 	double add_term_real;
 	double add_term_img;
 	double wterm;
-	long arraysize = xaxis*yaxis*num_w_planes;
+	long arraysize = (long)((xaxis*yaxis*num_w_planes)/nbucket + 1);
 
 	if(gid < arraysize)
 	{
-		int iw = (int)(gid/(xaxis*yaxis));
-		int iv = (int)((gid%(xaxis*yaxis))/xaxis);
-		int iu = (iv%yaxis);
-		long index = 2*gid;
+	  long gid_aux = nbucket*gid;
+	  for(int iaux=0; iaux<nbucket; iaux++) 
+          {
+		int iw = gid_aux/(xaxis*yaxis);
+		int ivaux = gid_aux%(xaxis*yaxis);
+		int iv = ivaux/xaxis;
+		int iu = ivaux%xaxis;
+		long index = 2*gid_aux;
 		long img_index = iu+iv*xaxis;
 
                 wterm = wmin + iw*dw;
@@ -73,7 +78,8 @@ __global__ void phase_g(int xaxis,
 		atomicAdd(&(image_real[img_index]),gridss[index]);
 		atomicAdd(&(image_imag[img_index]),gridss[index+1]);
 #endif // end of PHASE_ON
-
+		gid_aux++;
+           }
 	}
 
 }
@@ -89,8 +95,9 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in
 
 #ifdef __CUDACC__
 
+	int nbucket = 32;
         int Nth = NTHREADS;
-        long Nbl = (long)((num_w_planes*xaxis*yaxis)/Nth) + 1;
+        long Nbl = (long)((num_w_planes*xaxis*yaxis)/Nth/nbucket) + 1;
         if(NWORKERS == 1) {Nbl = 1; Nth = 1;};
         printf("Running on GPU with %d threads and %d blocks\n",Nth,Nbl);
 	
@@ -120,7 +127,8 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in
                                dwnorm,
                                xaxistot,
                                yaxistot,
-                               resolution);
+                               resolution,
+			       nbucket);
 
 	mmm = cudaMemcpy(image_real, image_real_g, xaxis*yaxis*sizeof(double), cudaMemcpyDeviceToHost);
 	mmm = cudaMemcpy(image_imag, image_imag_g, xaxis*yaxis*sizeof(double), cudaMemcpyDeviceToHost);
diff --git a/w-stacking-fftw.c b/w-stacking-fftw.c
index 07f25be..1580715 100644
--- a/w-stacking-fftw.c
+++ b/w-stacking-fftw.c
@@ -94,8 +94,8 @@ int main(int argc, char * argv[])
 	double resolution;
 
         // MESH SIZE
-	int grid_size_x = 256;
-	int grid_size_y = 256;
+	int grid_size_x = 2048;
+	int grid_size_y = 2048;
 	int local_grid_size_x;// = 8;
 	int local_grid_size_y;// = 8;
 	int xaxis;
@@ -153,9 +153,9 @@ int main(int argc, char * argv[])
 
 	// INPUT FILES (only the first ndatasets entries are used)
 	int ndatasets = 1;
-        strcpy(datapath_multi[0],"data/newgauss2noconj_t201806301100_SBL180.binMS/");
+        //strcpy(datapath_multi[0],"data/newgauss2noconj_t201806301100_SBL180.binMS/");
         //strcpy(datapath_multi[0],"/m100_scratch/userexternal/cgheller/gridding/newgauss4_t201806301100_SBL180.binMS/");
-        //strcpy(datapath_multi[0],"/m100_scratch/userexternal/cgheller/gridding/Lofar/L798046_SB244_uv.uncorr_130B27932t_146MHz.pre-cal.binMS/");
+        strcpy(datapath_multi[0],"/m100_scratch/userexternal/cgheller/gridding/Lofar/L798046_SB244_uv.uncorr_130B27932t_146MHz.pre-cal.binMS/");
         //strcpy(datapath_multi[1],"/m100_scratch/userexternal/cgheller/gridding/Lofar/L798046_SB244_uv.uncorr_130B27932t_134MHz.pre-cal.binMS/");
 
 	strcpy(datapath,datapath_multi[0]);
-- 
GitLab