From 404e0296d3a267344dec9bd0e4f5832b78f77386 Mon Sep 17 00:00:00 2001
From: "giovanni.lacopo" <giovanni.lacopo@inaf.it>
Date: Mon, 18 Sep 2023 11:25:47 +0200
Subject: [PATCH] Choose whether to perform stacking on CPUs or GPUs

---
 Makefile      | 15 +++++++++------
 w-stacking.cu |  4 ++--
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/Makefile b/Makefile
index 1be9de6..fde90c6 100644
--- a/Makefile
+++ b/Makefile
@@ -59,9 +59,9 @@ OPT += -DPHASE_ON
 #OPT += -DNORMALIZE_UVW
 
 # Gridding kernel: GAUSS, GAUSS_HI_PRECISION, KAISERBESSEL
-#OPT += -DGAUSS_HI_PRECISION
+OPT += -DGAUSS_HI_PRECISION
 
-OPT += -DGAUSS
+#OPT += -DGAUSS
 
 #OPT += -DKAISERBESSEL
 
@@ -77,7 +77,10 @@ OPT += -DGAUSS
 #OPT += -DCUDACC
 
 # use GPU acceleration via OMP 
-#OPT += -DACCOMP
+OPT += -DACCOMP
+
+# perform stacking on GPUs (w-stacking.cu checks this flag together with ACCOMP)
+#OPT += -DGPU_STACKING
 
 # use NVIDIA GPU to perform the reduce
 #OPT += -DNCCL_REDUCE
@@ -135,8 +138,8 @@ OBJ_ACC_CUDA = phase_correction.o w-stacking.o
 
 # ----- define which files will be compiled by NVC with OMP offloading for wither Nvidia or AMD
 #
-DEPS_ACC_OMP = w-stacking_omp.h phase_correction.c w-stacking_omp.c
-OBJ_ACC_OMP = phase_correction.o w-stacking_omp.o
+DEPS_ACC_OMP = w-stacking.h phase_correction.c w-stacking.c 
+OBJ_ACC_OMP = phase_correction.o w-stacking.o 
 
 
 # ----- define what files will be compiled by NVC with OMP offloading when the stacking reduce is
@@ -174,7 +177,7 @@ endif
 
 ifeq (USE_FFTW,$(findstring USE_FFTW,$(OPT)))
 CFLAGS += $(FFTW_MPI_INC)
-ifeq (HIBRYD_FFTW,$(findstring HYBRID_FFTW,$(OPT)))
+ifeq (HYBRID_FFTW,$(findstring HYBRID_FFTW,$(OPT)))
 FFTWLIBS = $(FFTW_MPI_LIB) -lfftw3_omp -lfftw3_mpi -lfftw3 -lm
 else
 FFTWLIBS = $(FFTW_MPI_LIB) -lfftw3_mpi -lfftw3 -lm
diff --git a/w-stacking.cu b/w-stacking.cu
index 309fd6e..a01344c 100644
--- a/w-stacking.cu
+++ b/w-stacking.cu
@@ -325,7 +325,7 @@ void wstack(
     omp_set_num_threads(num_threads);
 #endif
 
-#ifdef ACCOMP
+   #if defined(ACCOMP) && defined(GPU_STACKING) // OpenMP target offload only when both flags are defined
     omp_set_default_device(rank % omp_get_num_devices());
     uint Nvis = num_points*freq_per_chan*polarizations;
    #pragma omp target teams distribute parallel for private(visindex) map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan]) map(tofrom:grid[0:2*num_w_planes*grid_size_x*grid_size_y])
@@ -420,7 +420,7 @@ void wstack(
         }
 	
     }
-   #ifdef ACCOMP
+   #if defined(ACCOMP) && defined(GPU_STACKING) // must match the guard around the target region above
    #pragma omp target exit data map(delete:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan], grid[0:2*num_w_planes*grid_size_x*grid_size_y])
    #endif
     // End switch between CUDA and CPU versions
-- 
GitLab