bug fix in the CUDA implementation. nbucket defined to reduce the number of blocks

eacb3f9e · Claudio Gheller · 9d47e94e · eacb3f9e · eacb3f9e
Commit eacb3f9e authored 3 years ago by Claudio Gheller
--- a/phase_correction.cu
+++ b/phase_correction.cu
@@ -19,20 +19,25 @@ __global__ void phase_g(int xaxis,
 			double dwnorm,
 			int xaxistot,
 			int yaxistot,
-			double resolution)
+			double resolution,
+			int nbucket)
 {
 	long gid = blockIdx.x*blockDim.x + threadIdx.x;
 	double add_term_real;
 	double add_term_img;
 	double wterm;
-	long arraysize = xaxis*yaxis*num_w_planes;
+	long arraysize = (long)((xaxis*yaxis*num_w_planes)/nbucket + 1);
 	if(gid < arraysize)
 	{
-		int iw = (int)(gid/(xaxis*yaxis));
+	  long gid_aux = nbucket*gid;
-		int iv = (int)((gid%(xaxis*yaxis))/xaxis);
+	  for(int iaux=0; iaux<nbucket; iaux++) 
-		int iu = (iv%yaxis);
+          {
-		long index = 2*gid;
+		int iw = gid_aux/(xaxis*yaxis);
+		int ivaux = gid_aux%(xaxis*yaxis);
+		int iv = ivaux/xaxis;
+		int iu = ivaux%xaxis;
+		long index = 2*gid_aux;
 		long img_index = iu+iv*xaxis;
                wterm = wmin + iw*dw;
@@ -73,7 +78,8 @@ __global__ void phase_g(int xaxis,
 		atomicAdd(&(image_real[img_index]),gridss[index]);
 		atomicAdd(&(image_imag[img_index]),gridss[index+1]);
 #endif // end of PHASE_ON
+		gid_aux++;
+           }
 	}
 }
@@ -89,8 +95,9 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in
 #ifdef __CUDACC__
+	int nbucket = 32;
        int Nth = NTHREADS;
-        long Nbl = (long)((num_w_planes*xaxis*yaxis)/Nth) + 1;
+        long Nbl = (long)((num_w_planes*xaxis*yaxis)/Nth/nbucket) + 1;
        if(NWORKERS == 1) {Nbl = 1; Nth = 1;};
        printf("Running on GPU with %d threads and %d blocks\n",Nth,Nbl);
@@ -120,7 +127,8 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in
                               dwnorm,
                               xaxistot,
                               yaxistot,
-                               resolution);
+                               resolution,
+			       nbucket);
 	mmm = cudaMemcpy(image_real, image_real_g, xaxis*yaxis*sizeof(double), cudaMemcpyDeviceToHost);
 	mmm = cudaMemcpy(image_imag, image_imag_g, xaxis*yaxis*sizeof(double), cudaMemcpyDeviceToHost);

--- a/w-stacking-fftw.c
+++ b/w-stacking-fftw.c
@@ -94,8 +94,8 @@ int main(int argc, char * argv[])
 	double resolution;
        // MESH SIZE
-	int grid_size_x = 256;
+	int grid_size_x = 2048;
-	int grid_size_y = 256;
+	int grid_size_y = 2048;
 	int local_grid_size_x;// = 8;
 	int local_grid_size_y;// = 8;
 	int xaxis;
@@ -153,9 +153,9 @@ int main(int argc, char * argv[])
 	// INPUT FILES (only the first ndatasets entries are used)
 	int ndatasets = 1;
-        strcpy(datapath_multi[0],"data/newgauss2noconj_t201806301100_SBL180.binMS/");
+        //strcpy(datapath_multi[0],"data/newgauss2noconj_t201806301100_SBL180.binMS/");
        //strcpy(datapath_multi[0],"/m100_scratch/userexternal/cgheller/gridding/newgauss4_t201806301100_SBL180.binMS/");
-        //strcpy(datapath_multi[0],"/m100_scratch/userexternal/cgheller/gridding/Lofar/L798046_SB244_uv.uncorr_130B27932t_146MHz.pre-cal.binMS/");
+        strcpy(datapath_multi[0],"/m100_scratch/userexternal/cgheller/gridding/Lofar/L798046_SB244_uv.uncorr_130B27932t_146MHz.pre-cal.binMS/");
        //strcpy(datapath_multi[1],"/m100_scratch/userexternal/cgheller/gridding/Lofar/L798046_SB244_uv.uncorr_130B27932t_134MHz.pre-cal.binMS/");
 	strcpy(datapath,datapath_multi[0]);