From 9e323f267f2fb8d81e177470ece8a8eebefc5fa9 Mon Sep 17 00:00:00 2001
From: "giovanni.lacopo" <giovanni.lacopo@inaf.it>
Date: Thu, 9 Nov 2023 10:55:16 +0100
Subject: [PATCH] Stacking/Reduce on GPU with reduced communication

---
 Makefile                              |  61 ++++++--
 allvars_nccl.h                        |   2 -
 gridding_nccl.cpp => gridding_nccl.cu |  48 ++++--
 w-stacking.cu                         | 216 ++++++++++++++++----------
 w-stacking.h                          |  11 +-
 5 files changed, 221 insertions(+), 117 deletions(-)
 rename gridding_nccl.cpp => gridding_nccl.cu (90%)

diff --git a/Makefile b/Makefile
index fde90c6..f6c49b1 100644
--- a/Makefile
+++ b/Makefile
@@ -59,9 +59,9 @@ OPT += -DPHASE_ON
 #OPT += -DNORMALIZE_UVW
 
 # Gridding kernel: GAUSS, GAUSS_HI_PRECISION, KAISERBESSEL
-OPT += -DGAUSS_HI_PRECISION
+#OPT += -DGAUSS_HI_PRECISION
 
-#OPT += -DGAUSS
+OPT += -DGAUSS
 
 #OPT += -DKAISERBESSEL
 
@@ -77,7 +77,7 @@ OPT += -DGAUSS_HI_PRECISION
 #OPT += -DCUDACC
 
 # use GPU acceleration via OMP 
-OPT += -DACCOMP
+#OPT += -DACCOMP
 
 # perform stacking on GPUs
 #OPT += -DGPU_STACKING
@@ -125,7 +125,7 @@ DEPS = w-stacking.h  main.c allvars.h
 # ----- define which files will be compiled by MPICC
 #
 # these are the OBJS that will be compiled by C compiler if no acceleration (neither with CUDA nor with OpenMP) is provided
-CC_OBJ_NOACC = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o w-stacking.o phase_correction.o
+CC_OBJ_NOACC = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform_3d.o result.o numa.o reduce.o w-stacking.o phase_correction.o
 
 # these are the OBJs that will be compiled by the normal MPICC compiler if GPU acceleration is switched on
 CC_OBJ_ACC = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o
@@ -138,17 +138,26 @@ OBJ_ACC_CUDA = phase_correction.o w-stacking.o
 
 # ----- define which files will be compiled by NVC with OMP offloading for wither Nvidia or AMD
 #
-DEPS_ACC_OMP = w-stacking.h phase_correction.c w-stacking.c 
-OBJ_ACC_OMP = phase_correction.o w-stacking.o 
+DEPS_ACC_OMP = w-stacking.h phase_correction.c w-stacking.c
+OBJ_ACC_OMP = phase_correction.o w-stacking.o
+
 
 
 # ----- define what files will be compiled by NVC with OMP offloading when the stacking reduce is
 #       offloaded on GPU
+ifeq (CUDACC,$(findstring CUDACC,$(OPT)))
+DEPS_NCCL_REDUCE = gridding_nccl.cu
+OBJ_NCCL_REDUCE  = gridding_nccl.o
+
+DEPS_RCCL_REDUCE = gridding_rccl.cu
+OBJ_RCCL_REDUCE  = gridding_rccl.o
+else
 DEPS_NCCL_REDUCE = gridding_nccl.cpp
 OBJ_NCCL_REDUCE  = gridding_nccl.o
 
 DEPS_RCCL_REDUCE = gridding_rccl.cpp
 OBJ_RCCL_REDUCE  = gridding_rccl.o
+endif
 
 # ----- define what files will be compiled by NVCC for Nvidia cufftMP implementation of FFT
 #
@@ -196,6 +205,12 @@ phase_correction.c: phase_correction.cu
 
 cuda_fft.cpp: cuda_fft.cu
 	cp cuda_fft.cu cuda_fft.cpp
+
+gridding_nccl.cpp: gridding_nccl.cu
+	cp gridding_nccl.cu gridding_nccl.cpp
+
+gridding_rccl.cpp: gridding_rccl.cu
+	cp gridding_rccl.cu gridding_rccl.cpp
 else
 w-stacking.c: w-stacking.cu
 	rm -f w-stacking.c
@@ -206,6 +221,12 @@ phase_correction.c: phase_correction.cu
 cuda_fft.cpp: cuda_fft.cu
 	rm -f cuda_fft.cpp
 	touch cuda_fft.cpp
+gridding_nccl.cpp: gridding_nccl.cu
+	rm -f gridding_nccl.cpp
+	touch gridding_nccl.cpp
+gridding_rccl.cpp: gridding_rccl.cu
+	rm -f gridding_rccl.cpp
+	touch gridding_rccl.cpp
 endif
 
 
@@ -221,7 +242,7 @@ LINKER=$(MPIC++)
 FLAGS=$(OPTIMIZE) 
 LIBS=$(NVLIB)
 $(OBJ_ACC_CUDA): $(DEPS_ACC_CUDA)
-	$(NVCC) $(OPT) $(OPT_NVCC) $(CFLAGS) -c *.cu $(LIBS)
+	$(NVCC) $(OPT) $(OPT_NVCC) $(CFLAGS) -c w-stacking.cu phase_correction.cu $(LIBS)
 OBJ += $(OBJ_ACC_CUDA)
 endif
 
@@ -256,6 +277,17 @@ endif
 
 
 ifeq (NCCL_REDUCE,$(findstring NCCL_REDUCE,$(OPT)))
+
+ifeq (CUDACC,$(findstring CUDACC,$(OPT)))
+EXEC_EXT := $(EXEC_EXT)_acc-reduce
+LINKER=$(MPIC++)
+FLAGS=$(OPTIMIZE)
+LIBS=$(NVLIB_3)
+$(OBJ_NCCL_REDUCE): $(DEPS_NCCL_REDUCE)
+	$(NVCC) $(OPT_NVCC) $(OPT) -c $^ $(LIBS)
+OBJ += $(OBJ_NCCL_REDUCE)
+else
+
 EXEC_EXT := $(EXEC_EXT)_acc-reduce
 LINKER=$(NVC++)
 FLAGS=$(NVFLAGS) $(CFLAGS)
@@ -264,6 +296,7 @@ $(OBJ_NCCL_REDUCE): $(DEPS_NCCL_REDUCE)
 	$(NVC++) $(FLAGS) $(OPT) -c $^ $(LIBS)
 OBJ += $(OBJ_NCCL_REDUCE)
 endif
+endif
 
 ifeq (RCCL_REDUCE,$(findstring RCCL_REDUCE,$(OPT)))
 EXEC_EXT := $(EXEC_EXT)_acc-reduce
@@ -319,9 +352,11 @@ clean:
 	rm -f w-stacking.c
 	rm -f phase_correction.c
 	rm -f cuda_fft.cpp
-
-cleanall:
-	rm -f $(EXEC)$(EXT)
-	rm -f *.o
-	rm -f w-stacking.c
-	rm -f phase_correction.c
+	rm -f gridding_nccl.cpp
+	rm -f gridding_rccl.cpp
+
+#cleanall:
+#	rm -f $(EXEC)$(EXT)
+#	rm -f *.o
+#	rm -f w-stacking.c
+#	rm -f phase_correction.c
diff --git a/allvars_nccl.h b/allvars_nccl.h
index 032a26e..67b5ed6 100644
--- a/allvars_nccl.h
+++ b/allvars_nccl.h
@@ -23,8 +23,6 @@
 #include <omp.h>
 #endif
 
-
-
 #if defined(USE_FFTW) && !defined(CUFFTMP) // use MPI fftw
 #include <fftw3-mpi.h>
 #endif
diff --git a/gridding_nccl.cpp b/gridding_nccl.cu
similarity index 90%
rename from gridding_nccl.cpp
rename to gridding_nccl.cu
index 556ee6d..67c8897 100644
--- a/gridding_nccl.cpp
+++ b/gridding_nccl.cu
@@ -1,4 +1,3 @@
-
 #include "allvars_nccl.h"
 #include "proto.h"
 #include <cuda.h>
@@ -72,7 +71,7 @@ void gridding_data(){
   
     //Initialize nccl
 
-  double * grid_gpu, *gridss_gpu;
+  double *grid_gpu, *gridss_gpu;
   int local_rank = 0;
 
   uint64_t hostHashs[size];
@@ -87,7 +86,8 @@ void gridding_data(){
   
   ncclUniqueId id;
   ncclComm_t comm;
-  cudaStream_t stream_reduce;
+  cudaError_t nnn;
+  cudaStream_t stream_reduce, stream_stacking;
 
   if (rank == 0) ncclGetUniqueId(&id);
   MPI_Bcast((void *)&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD);
@@ -97,6 +97,7 @@ void gridding_data(){
   cudaMalloc(&grid_gpu, 2*param.num_w_planes*xaxis*yaxis * sizeof(double));
   cudaMalloc(&gridss_gpu, 2*param.num_w_planes*xaxis*yaxis * sizeof(double));
   cudaStreamCreate(&stream_reduce);
+  cudaStreamCreate(&stream_stacking);
   
   ncclCommInitRank(&comm, size, id, rank);
 
@@ -189,17 +190,38 @@ void gridding_data(){
      #ifdef VERBOSE
       printf("Processing sector %ld\n",isector);
      #endif
-
-   
+  
       double *stacking_target_array;
       if ( size > 1 )
 	stacking_target_array = gridss;
       else
 	stacking_target_array = grid;
 
+      
       start = CPU_TIME_wt;
 	    
      //We have to call different GPUs per MPI task!!! [GL]
+     #ifdef CUDACC
+      wstack(param.num_w_planes,
+	     Nsec,
+	     metaData.freq_per_chan,
+	     metaData.polarisations,
+	     uus,
+	     vvs,
+	     wws,
+	     visreals,
+	     visimgs,
+	     weightss,
+	     dx,
+	     dw,
+	     param.w_support,
+	     xaxis,
+	     yaxis,
+	     gridss_gpu,
+	     param.num_threads,
+	     rank,
+	     stream_stacking);
+      #else
       wstack(param.num_w_planes,
 	     Nsec,
 	     metaData.freq_per_chan,
@@ -218,13 +240,12 @@ void gridding_data(){
 	     stacking_target_array,
 	     param.num_threads,
 	     rank);
+      #endif
       //Allocate memory on devices non-blocking for the host                                                                                   
       ///////////////////////////////////////////////////////
 
 
       timing_wt.kernel += CPU_TIME_wt - start;
-	    
-      cudaMemcpyAsync(gridss_gpu, gridss, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), cudaMemcpyHostToDevice, stream_reduce);
       
      #ifdef VERBOSE
       printf("Processed sector %ld\n",isector);
@@ -239,8 +260,6 @@ void gridding_data(){
 	  // int target_rank = (int)isector;    it implied that size >= nsectors
 	  int target_rank = (int)(isector % size);
 
-	  cudaStreamSynchronize(stream_reduce);
-
 	  start = CPU_TIME_wt;
 
 	  ncclReduce(gridss_gpu, grid_gpu, size_of_grid, ncclDouble, ncclSum, target_rank, comm, stream_reduce);
@@ -248,26 +267,25 @@ void gridding_data(){
       
 	  timing_wt.reduce += CPU_TIME_wt - start;
 
-		  
 	  // Go to next sector
-	  memset ( gridss, 0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) );	  
-	}	
+	  nnn = cudaMemset( gridss_gpu, 0.0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) );
+	  if (nnn != cudaSuccess) {printf("!!! w-stacking.cu cudaMemset ERROR %d !!!\n", nnn);}
+	}
 
       free(memory);
     }
 
   //Copy data back from device to host (to be deleted in next steps)
-
   
   cudaMemcpyAsync(grid, grid_gpu, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), cudaMemcpyDeviceToHost, stream_reduce);
   
   MPI_Barrier(MPI_COMM_WORLD);
 
   cudaStreamSynchronize(stream_reduce);
-  cudaFree(gridss_gpu);
   cudaFree(grid_gpu);
-  
+  cudaFree(gridss_gpu);
   cudaStreamDestroy(stream_reduce);
+  cudaStreamDestroy(stream_stacking);
   
   ncclCommDestroy(comm);
 
diff --git a/w-stacking.cu b/w-stacking.cu
index a01344c..98804ca 100644
--- a/w-stacking.cu
+++ b/w-stacking.cu
@@ -98,88 +98,101 @@ void makeKaiserBesselKernel(double * kernel,
 //}
 
 __global__ void convolve_g(
-     int num_w_planes,
-     uint num_points,
-     uint freq_per_chan,
-     uint polarizations,
-     double* uu,
-     double* vv,
-     double* ww,
-     float* vis_real,
-     float* vis_img,
-     float* weight,
-     double dx,
-     double dw,
-     int KernelLen,
-     int grid_size_x,
-     int grid_size_y,
-     double* grid,
-     double std22)
+			   int num_w_planes,
+			   uint num_points,
+			   uint freq_per_chan,
+			   uint polarizations,
+			   double* uu,
+			   double* vv,
+			   double* ww,
+			   float* vis_real,
+			   float* vis_img,
+			   float* weight,
+			   double dx,
+			   double dw,
+			   int KernelLen,
+			   int grid_size_x,
+			   int grid_size_y,
+			   double* grid,
+			   double std22)
+			   
+
 
 {
-	//printf("DENTRO AL KERNEL\n");
-        uint gid = blockIdx.x*blockDim.x + threadIdx.x;
-	if(gid < num_points)
-	{
-	uint i = gid;
-        uint visindex = i*freq_per_chan*polarizations;
-	double norm = std22/PI;
+  //printf("DENTRO AL KERNEL\n");
+  uint gid = blockIdx.x*blockDim.x + threadIdx.x;
+  if(gid < num_points)
+    {
+      uint i = gid;
+      uint visindex = i*freq_per_chan*polarizations;
+      double norm = std22/PI;
 
-        int j, k;
+      int j, k;
 
-        /* Convert UV coordinates to grid coordinates. */
-        double pos_u = uu[i] / dx;
-        double pos_v = vv[i] / dx;
-        double ww_i  = ww[i] / dw;
-        int grid_w = (int)ww_i;
-        int grid_u = (int)pos_u;
-        int grid_v = (int)pos_v;
+      /* Convert UV coordinates to grid coordinates. */
+      double pos_u = uu[i] / dx;
+      double pos_v = vv[i] / dx;
+      double ww_i  = ww[i] / dw;
 
-        // check the boundaries
-        uint jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0;
-        uint jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1;
-        uint kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0;
-        uint kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1;
-        //printf("%ld, %ld, %ld, %ld\n",jmin,jmax,kmin,kmax);
+      int grid_w = (int)ww_i;
+      int grid_u = (int)pos_u;
+      int grid_v = (int)pos_v;
 
+      // check the boundaries
+      uint jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0;
+      uint jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1;
+      uint kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0;
+      uint kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1;
 
-        // Convolve this point onto the grid.
-        for (k = kmin; k <= kmax; k++)
-        {
 
-            double v_dist = (double)k+0.5 - pos_v;
+      // Convolve this point onto the grid.
+      for (k = kmin; k <= kmax; k++)
+        {
 
-            for (j = jmin; j <= jmax; j++)
+	  double v_dist = (double)k - pos_v;
+	  int increaseprecision = 5;
+	  
+	  for (j = jmin; j <= jmax; j++)
             {
-                double u_dist = (double)j+0.5 - pos_u;
-                uint iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y);
-		//printf("--> %ld %d %d %d\n",iKer,j,k,grid_w);
-
-                double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist);
-                // Loops over frequencies and polarizations
-                double add_term_real = 0.0;
-                double add_term_img = 0.0;
-                uint ifine = visindex;
-                for (uint ifreq=0; ifreq<freq_per_chan; ifreq++)
+	      double u_dist = (double)j+0.5 - pos_u;
+	      uint iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y);
+	      int jKer = (int)(increaseprecision * (fabs(u_dist+(double)KernelLen)));
+	      int kKer = (int)(increaseprecision * (fabs(v_dist+(double)KernelLen)));
+	      
+	     #ifdef GAUSS_HI_PRECISION
+	      double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist);
+	     #endif
+	     #ifdef GAUSS
+	      double conv_weight = convkernel[jKer]*convkernel[kKer];
+	     #endif
+	     #ifdef KAISERBESSEL
+	      double conv_weight = convkernel[jKer]*convkernel[kKer];
+	     #endif
+
+	      // Loops over frequencies and polarizations
+	      double add_term_real = 0.0;
+	      double add_term_img = 0.0;
+	      uint ifine = visindex;
+	      for (uint ifreq=0; ifreq<freq_per_chan; ifreq++)
 		{
-		   uint iweight = visindex/freq_per_chan;
-                   for (uint ipol=0; ipol<polarizations; ipol++)
-                   {
+		  uint iweight = visindex/freq_per_chan;
+		  for (uint ipol=0; ipol<polarizations; ipol++)
+		    {
                       double vistest = (double)vis_real[ifine];
                       if (!isnan(vistest))
-                      {
-                         add_term_real += weight[iweight] * vis_real[ifine] * conv_weight;
-                         add_term_img += weight[iweight] * vis_img[ifine] * conv_weight;
-		      }
+			{
+			  add_term_real += weight[iweight] * vis_real[ifine] * conv_weight;
+			  add_term_img += weight[iweight] * vis_img[ifine] * conv_weight;
+			}
                       ifine++;
 		      iweight++;
-                   }
+		    }
 		}
-		atomicAdd(&(grid[iKer]),add_term_real);
-		atomicAdd(&(grid[iKer+1]),add_term_img);
+	      atomicAdd(&(grid[iKer]),add_term_real);
+	      atomicAdd(&(grid[iKer+1]),add_term_img);
             }
         }
-	}
+    }
 }
 #endif
 #ifdef ACCOMP
@@ -203,7 +216,14 @@ void wstack(
      int grid_size_y,
      double* grid,
      int num_threads,
-     int rank)
+    #ifdef CUDACC
+     int rank,
+     cudaStream_t stream_stacking
+    #else
+     int rank
+    #endif
+	    )
+	    
 {
     uint i;
     //uint index;
@@ -249,13 +269,10 @@ void wstack(
       }
     }
 
-      printf("Running rank %d/%d using GPU %d\n", rank, size, rank % ndevices);
      #ifdef NVIDIA
       prtAccelInfo();
      #endif
 
-    printf("Running on GPU with %d threads and %d blocks\n",Nth,Nbl);
-
     // Create GPU arrays and offload them
     double * uu_g;
     double * vv_g;
@@ -263,32 +280,48 @@ void wstack(
     float * vis_real_g;
     float * vis_img_g;
     float * weight_g;
-    double * grid_g;
-
+    //double * grid_g;
+    double * convkernel_g;
+    
+    //Create the event inside stream stacking
+    //cudaEvent_t event_kernel;
+    
     //for (int i=0; i<100000; i++)grid[i]=23.0;
     cudaError_t mmm;
+    //mmm=cudaEventCreate(&event_kernel);
     mmm=cudaMalloc(&uu_g,num_points*sizeof(double));
     mmm=cudaMalloc(&vv_g,num_points*sizeof(double));
     mmm=cudaMalloc(&ww_g,num_points*sizeof(double));
     mmm=cudaMalloc(&vis_real_g,Nvis*sizeof(float));
     mmm=cudaMalloc(&vis_img_g,Nvis*sizeof(float));
     mmm=cudaMalloc(&weight_g,(Nvis/freq_per_chan)*sizeof(float));
-    mmm=cudaMalloc(&grid_g,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double));
+    //mmm=cudaMalloc(&grid_g,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double));
+    /*
+   #if !defined(GAUSS_HI_PRECISION)
+    mmm=cudaMalloc(&convkernel_g,increaseprecision*w_support*sizeof(double));
+   #endif
+    */
     if (mmm != cudaSuccess) {printf("!!! w-stacking.cu cudaMalloc ERROR %d !!!\n", mmm);}
-    mmm=cudaMemset(grid_g,0.0,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double));
+    //mmm=cudaMemset(grid_g,0.0,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double));
     if (mmm != cudaSuccess) {printf("!!! w-stacking.cu cudaMemset ERROR %d !!!\n", mmm);}
 
 
-    mmm=cudaMemcpy(uu_g, uu, num_points*sizeof(double), cudaMemcpyHostToDevice);
-    mmm=cudaMemcpy(vv_g, vv, num_points*sizeof(double), cudaMemcpyHostToDevice);
-    mmm=cudaMemcpy(ww_g, ww, num_points*sizeof(double), cudaMemcpyHostToDevice);
-    mmm=cudaMemcpy(vis_real_g, vis_real, Nvis*sizeof(float), cudaMemcpyHostToDevice);
-    mmm=cudaMemcpy(vis_img_g, vis_img, Nvis*sizeof(float), cudaMemcpyHostToDevice);
-    mmm=cudaMemcpy(weight_g, weight, (Nvis/freq_per_chan)*sizeof(float), cudaMemcpyHostToDevice);
-    if (mmm != cudaSuccess) {printf("!!! w-stacking.cu cudaMemcpy ERROR %d !!!\n", mmm);}
+    mmm=cudaMemcpyAsync(uu_g, uu, num_points*sizeof(double), cudaMemcpyHostToDevice, stream_stacking);
+    mmm=cudaMemcpyAsync(vv_g, vv, num_points*sizeof(double), cudaMemcpyHostToDevice, stream_stacking);
+    mmm=cudaMemcpyAsync(ww_g, ww, num_points*sizeof(double), cudaMemcpyHostToDevice, stream_stacking);
+    mmm=cudaMemcpyAsync(vis_real_g, vis_real, Nvis*sizeof(float), cudaMemcpyHostToDevice, stream_stacking);
+    mmm=cudaMemcpyAsync(vis_img_g, vis_img, Nvis*sizeof(float), cudaMemcpyHostToDevice, stream_stacking);
+    mmm=cudaMemcpyAsync(weight_g, weight, (Nvis/freq_per_chan)*sizeof(float), cudaMemcpyHostToDevice, stream_stacking);
+
+    /*
+   #if !defined(GAUSS_HI_PRECISION)
+    mmm=cudaMemcpyAsync(convkernel_g, convkernel, increaseprecision*w_support*sizeof(double), cudaMemcpyHostToDevice, stream_stacking);
+   #endif
+    */
+    if (mmm != cudaSuccess) {printf("!!! w-stacking.cu cudaMemcpyAsync ERROR %d !!!\n", mmm);}
     
     // Call main GPU Kernel
-    convolve_g <<<Nbl,Nth>>> (
+    convolve_g <<<Nbl,Nth,0,stream_stacking>>> (
 	       num_w_planes,
                num_points,
                freq_per_chan,
@@ -304,20 +337,33 @@ void wstack(
                KernelLen,
                grid_size_x,
                grid_size_y,
-               grid_g,
-	       std22);
+               grid,
+	       std22
+						);
 
-    mmm = cudaMemcpy(grid, grid_g, 2*num_w_planes*grid_size_x*grid_size_y*sizeof(double), cudaMemcpyDeviceToHost);
+    mmm=cudaStreamSynchronize(stream_stacking);
+    //Record the event
+    //mmm=cudaEventRecord(event_kernel,stream_stacking);
+
+    //Wait until the kernel ends
+    //mmm=cudaStreamWaitEvent(stream_stacking,event_kernel);
+    
     //for (int i=0; i<100000; i++)printf("%f\n",grid[i]);
+    
     printf("CUDA ERROR %s\n",cudaGetErrorString(mmm));
+
     mmm=cudaFree(uu_g);
     mmm=cudaFree(vv_g);
     mmm=cudaFree(ww_g);
     mmm=cudaFree(vis_real_g);
     mmm=cudaFree(vis_img_g);
     mmm=cudaFree(weight_g);
-    mmm=cudaFree(grid_g);
-
+    //mmm=cudaFree(grid_g);
+    /*
+   #if !defined(GAUSS_HI_PRECISION)
+    mmm=cudaFree(convkernel_g);
+   #endif
+    */
 // Switch between CUDA and GPU versions
 # else
 
diff --git a/w-stacking.h b/w-stacking.h
index c953ee6..c4b3aaa 100644
--- a/w-stacking.h
+++ b/w-stacking.h
@@ -32,10 +32,11 @@ void wstack(
      int,
      double*,
      int,
-     int);
+     int,
+     cudaStream_t);
 }
 
-#else
+#else 
 void wstack(
      int,
      unsigned int,
@@ -58,6 +59,7 @@ void wstack(
 #endif
 
 
+
 #ifdef __CUDACC__
 extern "C"
 #endif
@@ -81,6 +83,11 @@ void phase_correction(
      int,
      int);
 
+double gauss_kernel_norm(
+  double norm,
+  double std22,
+  double u_dist,
+  double v_dist);
 
 #ifdef ACCOMP
 #pragma omp declare target (gauss_kernel_norm)
-- 
GitLab