From 011d30c6dc887d41319d18bb797c34c319289d15 Mon Sep 17 00:00:00 2001 From: "giovanni.lacopo" Date: Fri, 8 Sep 2023 12:29:31 +0200 Subject: [PATCH] CUDA bug fixing --- Makefile | 42 +++++++++++++++++++++++++++++++++--------- allvars.h | 3 --- allvars_nccl.h | 2 +- main.c | 20 -------------------- phase_correction.cu | 19 ++++++++++++++++++- w-stacking.cu | 37 +++++++++++++++++++++++++++++++------ w-stacking.h | 2 -- 7 files changed, 83 insertions(+), 42 deletions(-) diff --git a/Makefile b/Makefile index 61afb16..5384ac4 100644 --- a/Makefile +++ b/Makefile @@ -37,7 +37,7 @@ FFTWLIBS = OPT += -DUSE_FFTW # use omp-ized version of fftw routines -OPT += -DHYBRID_FFTW +#OPT += -DHYBRID_FFTW # write the full 3D cube of gridded visibilities and its FFT transform #OPT += -DWRITE_DATA @@ -73,10 +73,10 @@ OPT += -DPHASE_ON #OPT += -DNVIDIA #use cuda for GPUs -#OPT += -DCUDACC +OPT += -DCUDACC # use GPU acceleration via OMP -OPT += -DACCOMP +#OPT += -DACCOMP # use NVIDIA GPU to perform the reduce #OPT += -DNCCL_REDUCE @@ -85,7 +85,7 @@ OPT += -DACCOMP #OPT += -DRCCL_REDUCE # use GPU to perform FFT -#OPT += -DCUFFTMP +OPT += -DCUFFTMP #support for AMD GPUs #OPT += __HIP_PLATFORM_AMD__ @@ -141,8 +141,13 @@ OBJ_RCCL_REDUCE = gridding_rccl.o # ----- define what files will be compiled by NVCC for Nvidia cufftMP implementation of FFT # -DEPS_ACC_CUFFTMP = w-stacking_omp.h cuda_fft.cpp +ifeq (CUDACC,$(findstring CUDACC,$(OPT))) +DEPS_ACC_CUFFTMP = cuda_fft.cu +OBJ_ACC_CUFFTMP = cuda_fft.o +else +DEPS_ACC_CUFFTMP = cuda_fft.cpp OBJ_ACC_CUFFTMP = cuda_fft.o +endif # ----------------------------------------------------- @@ -173,13 +178,19 @@ w-stacking.c: w-stacking.cu phase_correction.c: phase_correction.cu cp phase_correction.cu phase_correction.c + +cuda_fft.cpp: cuda_fft.cu + cp cuda_fft.cu cuda_fft.cpp else w-stacking.c: w-stacking.cu - rm -f w-stacking.cun + rm -f w-stacking.c touch w-stacking.c phase_correction.c: phase_correction.cu rm -f phase_correction.c touch phase_correction.c +cuda_fft.cpp: cuda_fft.cu + rm -f cuda_fft.cpp + touch cuda_fft.cpp endif @@ -192,7 +203,7 @@ endif ifeq (CUDACC,$(findstring CUDACC,$(OPT))) EXEC_EXT := $(EXEC_EXT)_acc-cuda LINKER=$(MPIC++) -FLAGS=$(OPTIMIZE) +FLAGS=$(OPTIMIZE) LIBS=$(NVLIB) $(OBJ_ACC_CUDA): $(DEPS_ACC_CUDA) $(NVCC) $(OPT) $(OPT_NVCC) $(CFLAGS) -c *.cu $(LIBS) @@ -250,15 +261,28 @@ OBJ += $(OBJ_RCCL_REDUCE) endif ifeq (CUFFTMP,$(findstring CUFFTMP,$(OPT))) + +ifeq (CUDACC,$(findstring CUDACC,$(OPT))) +EXEC_EXT := $(EXEC_EXT)_acc-fft +LINKER=$(MPIC++) +FLAGS=$(OPTIMIZE) +LIBS=$(NVLIB_2) +$(OBJ_ACC_CUFFTMP): $(DEPS_ACC_CUFFTMP) + $(NVCC) $(OPT_NVCC) $(OPT) -c $^ $(LIBS) +OBJ += $(OBJ_ACC_CUFFTMP) + +else + EXEC_EXT := $(EXEC_EXT)_acc-fft LINKER=$(NVC++) FLAGS=$(NVFLAGS) $(CFLAGS) -LIBS=$(NVLIB) $(NVLIB_2) +LIBS=$(NVLIB_2) $(OBJ_ACC_CUFFTMP): $(DEPS_ACC_CUFFTMP) $(NVC++) $(FLAGS) $(OPT) -c $^ $(LIBS) OBJ += $(OBJ_ACC_CUFFTMP) endif +endif ################################################################################### @@ -270,11 +294,11 @@ w-stacking: $(OBJ) $(DEPS) Makefile %.o: %.c $(DEPS) $(MPICC) $(OPTIMIZE) $(OPT) -c -o $@ $< $(CFLAGS) - clean: rm -f *.o rm -f w-stacking.c rm -f phase_correction.c + rm -f cuda_fft.cpp cleanall: rm -f $(EXEC)$(EXT) diff --git a/allvars.h b/allvars.h index 8c83722..9ebf91a 100644 --- a/allvars.h +++ b/allvars.h @@ -34,9 +34,6 @@ #include "w-stacking.h" #endif -#if defined(CUDACC) -#include -#endif #if defined(NVIDIA) #include diff --git a/allvars_nccl.h b/allvars_nccl.h index b48763f..032a26e 100644 --- a/allvars_nccl.h +++ b/allvars_nccl.h @@ -13,7 +13,7 @@ #include -#if !defined( NCCL_REDUCE ) +#if !defined( NCCL_REDUCE ) && !defined(__CUDACC__) #include #endif diff --git a/main.c b/main.c index 4643c63..d94e0b9 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,6 @@ #include "allvars.h" #include "proto.h" - void shutdown_wstacking( int errcode, char *message, char *fname, int linenum ) { if ( ( rank == 0 ) && @@ -77,25 +76,6 @@ int main(int argc, char * argv[]) FFT_INIT; - #if defined(CUDACC) - int ndevices; - cudaGetDeviceCount(&ndevices); - cudaSetDevice(rank % ndevices); - - if ( rank == 0 ) { - if (0 == ndevices) { - - shutdown_wstacking(NO_ACCELERATORS_FOUND, "No accelerators found", __FILE__, __LINE__ ); - } - - - printf("Running rank %d/%d using GPU %d\n", rank, size, rank % ndevices); - #ifdef NVIDIA - prtAccelInfo(); - #endif - } - #endif - #ifdef ACCOMP if ( rank == 0 ) { if (0 == omp_get_num_devices()) { diff --git a/phase_correction.cu b/phase_correction.cu index 37974d4..644f59d 100644 --- a/phase_correction.cu +++ b/phase_correction.cu @@ -11,6 +11,8 @@ #include #include #include +#include "errcodes.h" +#include "proto.h" #ifdef __CUDACC__ @@ -106,7 +108,22 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in int Nth = NTHREADS; long Nbl = (long)((num_w_planes*xaxis*yaxis)/Nth/nbucket) + 1; if(NWORKERS == 1) {Nbl = 1; Nth = 1;}; - + + int ndevices; + cudaGetDeviceCount(&ndevices); + cudaSetDevice(rank % ndevices); + + if ( rank == 0 ) { + if (0 == ndevices) { + + shutdown_wstacking(NO_ACCELERATORS_FOUND, "No accelerators found", __FILE__, __LINE__ ); + } + + } + printf("Running rank %d using GPU %d\n", rank, rank % ndevices); + #ifdef NVIDIA + prtAccelInfo(); + #endif cudaError_t mmm; double * image_real_g; diff --git a/w-stacking.cu b/w-stacking.cu index d5aae63..c11f622 100644 --- a/w-stacking.cu +++ b/w-stacking.cu @@ -5,7 +5,15 @@ #include #include #include +#include "errcodes.h" + +#ifdef __CUDACC__ +#include "allvars_nccl.h" +#else #include "allvars.h" +#endif + +#include "proto.h" #ifdef ACCOMP #pragma omp declare target @@ -109,8 +117,7 @@ __global__ void convolve_g( int grid_size_x, int grid_size_y, double* grid, - double std22, - int rank) + double std22) { //printf("DENTRO AL KERNEL\n"); @@ -226,12 +233,29 @@ void wstack( // Loop over visibilities. // Switch between CUDA and GPU versions -#ifdef __CUDACC__ + #ifdef __CUDACC__ // Define the CUDA set up int Nth = NTHREADS; uint Nbl = (uint)(num_points/Nth) + 1; if(NWORKERS == 1) {Nbl = 1; Nth = 1;}; uint Nvis = num_points*freq_per_chan*polarizations; + + int ndevices; + cudaGetDeviceCount(&ndevices); + cudaSetDevice(rank % ndevices); + + if ( rank == 0 ) { + if (0 == ndevices) { + + shutdown_wstacking(NO_ACCELERATORS_FOUND, "No accelerators found", __FILE__, __LINE__ ); + } + } + + printf("Running rank %d/%d using GPU %d\n", rank, size, rank % ndevices); + #ifdef NVIDIA + prtAccelInfo(); + #endif + printf("Running on GPU with %d threads and %d blocks\n",Nth,Nbl); // Create GPU arrays and offload them @@ -264,7 +288,7 @@ void wstack( mmm=cudaMemcpy(vis_img_g, vis_img, Nvis*sizeof(float), cudaMemcpyHostToDevice); mmm=cudaMemcpy(weight_g, weight, (Nvis/freq_per_chan)*sizeof(float), cudaMemcpyHostToDevice); if (mmm != cudaSuccess) {printf("!!! w-stacking.cu cudaMemcpy ERROR %d !!!\n", mmm);} - + // Call main GPU Kernel convolve_g <<>> ( num_w_planes, @@ -396,12 +420,13 @@ void wstack( grid[iKer+1] += add_term_img; } } - + } -// End switch between CUDA and CPU versions + // End switch between CUDA and CPU versions #endif //for (int i=0; i<100000; i++)printf("%f\n",grid[i]); } + #ifdef ACCOMP #pragma omp end declare target #endif diff --git a/w-stacking.h b/w-stacking.h index c74fb06..f06bbbc 100644 --- a/w-stacking.h +++ b/w-stacking.h @@ -8,8 +8,6 @@ #include - - #ifdef __CUDACC__ extern "C" #endif -- GitLab