Skip to content
Snippets Groups Projects
Commit 9e323f26 authored by Giovanni Lacopo's avatar Giovanni Lacopo
Browse files

Stacking/Reduce on GPU with reduced communication

parent a42e9056
No related branches found
No related tags found
No related merge requests found
......@@ -59,9 +59,9 @@ OPT += -DPHASE_ON
#OPT += -DNORMALIZE_UVW
# Gridding kernel: GAUSS, GAUSS_HI_PRECISION, KAISERBESSEL
OPT += -DGAUSS_HI_PRECISION
#OPT += -DGAUSS_HI_PRECISION
#OPT += -DGAUSS
OPT += -DGAUSS
#OPT += -DKAISERBESSEL
......@@ -77,7 +77,7 @@ OPT += -DGAUSS_HI_PRECISION
#OPT += -DCUDACC
# use GPU acceleration via OMP
OPT += -DACCOMP
#OPT += -DACCOMP
# perform stacking on GPUs
#OPT += -DGPU_STACKING
......@@ -125,7 +125,7 @@ DEPS = w-stacking.h main.c allvars.h
# ----- define which files will be compiled by MPICC
#
# these are the OBJS that will be compiled by C compiler if no acceleration (neither with CUDA nor with OpenMP) is provided
CC_OBJ_NOACC = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o w-stacking.o phase_correction.o
CC_OBJ_NOACC = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform_3d.o result.o numa.o reduce.o w-stacking.o phase_correction.o
# these are the OBJs that will be compiled by the normal MPICC compiler if GPU acceleration is switched on
CC_OBJ_ACC = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o
......@@ -142,13 +142,22 @@ DEPS_ACC_OMP = w-stacking.h phase_correction.c w-stacking.c
OBJ_ACC_OMP = phase_correction.o w-stacking.o
# ----- define what files will be compiled by NVC with OMP offloading when the stacking reduce is
# offloaded on GPU
ifeq (CUDACC,$(findstring CUDACC,$(OPT)))
DEPS_NCCL_REDUCE = gridding_nccl.cu
OBJ_NCCL_REDUCE = gridding_nccl.o
DEPS_RCCL_REDUCE = gridding_rccl.cu
OBJ_RCCL_REDUCE = gridding_rccl.o
else
DEPS_NCCL_REDUCE = gridding_nccl.cpp
OBJ_NCCL_REDUCE = gridding_nccl.o
DEPS_RCCL_REDUCE = gridding_rccl.cpp
OBJ_RCCL_REDUCE = gridding_rccl.o
endif
# ----- define what files will be compiled by NVCC for Nvidia cufftMP implementation of FFT
#
......@@ -196,6 +205,12 @@ phase_correction.c: phase_correction.cu
cuda_fft.cpp: cuda_fft.cu
cp cuda_fft.cu cuda_fft.cpp
gridding_nccl.cpp: gridding_nccl.cu
cp gridding_nccl.cu gridding_nccl.cpp
gridding_rccl.cpp: gridding_rccl.cu
cp gridding_rccl.cu gridding_rccl.cpp
else
w-stacking.c: w-stacking.cu
rm -f w-stacking.c
......@@ -206,6 +221,12 @@ phase_correction.c: phase_correction.cu
cuda_fft.cpp: cuda_fft.cu
rm -f cuda_fft.cpp
touch cuda_fft.cpp
gridding_nccl.cpp: gridding_nccl.cu
rm -f gridding_nccl.cpp
touch gridding_nccl.cpp
gridding_rccl.cpp: gridding_rccl.cu
rm -f gridding_rccl.cpp
touch gridding_rccl.cpp
endif
......@@ -221,7 +242,7 @@ LINKER=$(MPIC++)
FLAGS=$(OPTIMIZE)
LIBS=$(NVLIB)
$(OBJ_ACC_CUDA): $(DEPS_ACC_CUDA)
$(NVCC) $(OPT) $(OPT_NVCC) $(CFLAGS) -c *.cu $(LIBS)
$(NVCC) $(OPT) $(OPT_NVCC) $(CFLAGS) -c w-stacking.cu phase_correction.cu $(LIBS)
OBJ += $(OBJ_ACC_CUDA)
endif
......@@ -256,6 +277,17 @@ endif
ifeq (NCCL_REDUCE,$(findstring NCCL_REDUCE,$(OPT)))
ifeq (CUDACC,$(findstring CUDACC,$(OPT)))
EXEC_EXT := $(EXEC_EXT)_acc-reduce
LINKER=$(MPIC++)
FLAGS=$(OPTIMIZE)
LIBS=$(NVLIB_3)
$(OBJ_NCCL_REDUCE): $(DEPS_NCCL_REDUCE)
$(NVCC) $(OPT_NVCC) $(OPT) -c $^ $(LIBS)
OBJ += $(OBJ_NCCL_REDUCE)
else
EXEC_EXT := $(EXEC_EXT)_acc-reduce
LINKER=$(NVC++)
FLAGS=$(NVFLAGS) $(CFLAGS)
......@@ -264,6 +296,7 @@ $(OBJ_NCCL_REDUCE): $(DEPS_NCCL_REDUCE)
$(NVC++) $(FLAGS) $(OPT) -c $^ $(LIBS)
OBJ += $(OBJ_NCCL_REDUCE)
endif
endif
ifeq (RCCL_REDUCE,$(findstring RCCL_REDUCE,$(OPT)))
EXEC_EXT := $(EXEC_EXT)_acc-reduce
......@@ -319,9 +352,11 @@ clean:
rm -f w-stacking.c
rm -f phase_correction.c
rm -f cuda_fft.cpp
cleanall:
rm -f $(EXEC)$(EXT)
rm -f *.o
rm -f w-stacking.c
rm -f phase_correction.c
rm -f gridding_nccl.cpp
rm -f gridding_rccl.cpp
#cleanall:
# rm -f $(EXEC)$(EXT)
# rm -f *.o
# rm -f w-stacking.c
# rm -f phase_correction.c
......@@ -23,8 +23,6 @@
#include <omp.h>
#endif
#if defined(USE_FFTW) && !defined(CUFFTMP) // use MPI fftw
#include <fftw3-mpi.h>
#endif
......
#include "allvars_nccl.h"
#include "proto.h"
#include <cuda.h>
......@@ -87,7 +86,8 @@ void gridding_data(){
ncclUniqueId id;
ncclComm_t comm;
cudaStream_t stream_reduce;
cudaError_t nnn;
cudaStream_t stream_reduce, stream_stacking;
if (rank == 0) ncclGetUniqueId(&id);
MPI_Bcast((void *)&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD);
......@@ -97,6 +97,7 @@ void gridding_data(){
cudaMalloc(&grid_gpu, 2*param.num_w_planes*xaxis*yaxis * sizeof(double));
cudaMalloc(&gridss_gpu, 2*param.num_w_planes*xaxis*yaxis * sizeof(double));
cudaStreamCreate(&stream_reduce);
cudaStreamCreate(&stream_stacking);
ncclCommInitRank(&comm, size, id, rank);
......@@ -190,16 +191,37 @@ void gridding_data(){
printf("Processing sector %ld\n",isector);
#endif
double *stacking_target_array;
if ( size > 1 )
stacking_target_array = gridss;
else
stacking_target_array = grid;
start = CPU_TIME_wt;
//We have to call different GPUs per MPI task!!! [GL]
#ifdef CUDACC
wstack(param.num_w_planes,
Nsec,
metaData.freq_per_chan,
metaData.polarisations,
uus,
vvs,
wws,
visreals,
visimgs,
weightss,
dx,
dw,
param.w_support,
xaxis,
yaxis,
gridss_gpu,
param.num_threads,
rank,
stream_stacking);
#else
wstack(param.num_w_planes,
Nsec,
metaData.freq_per_chan,
......@@ -218,14 +240,13 @@ void gridding_data(){
stacking_target_array,
param.num_threads,
rank);
#endif
//Allocate memory on devices non-blocking for the host
///////////////////////////////////////////////////////
timing_wt.kernel += CPU_TIME_wt - start;
cudaMemcpyAsync(gridss_gpu, gridss, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), cudaMemcpyHostToDevice, stream_reduce);
#ifdef VERBOSE
printf("Processed sector %ld\n",isector);
#endif
......@@ -239,8 +260,6 @@ void gridding_data(){
// int target_rank = (int)isector; it implied that size >= nsectors
int target_rank = (int)(isector % size);
cudaStreamSynchronize(stream_reduce);
start = CPU_TIME_wt;
ncclReduce(gridss_gpu, grid_gpu, size_of_grid, ncclDouble, ncclSum, target_rank, comm, stream_reduce);
......@@ -248,9 +267,9 @@ void gridding_data(){
timing_wt.reduce += CPU_TIME_wt - start;
// Go to next sector
memset ( gridss, 0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) );
nnn = cudaMemset( gridss_gpu, 0.0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) );
if (nnn != cudaSuccess) {printf("!!! w-stacking.cu cudaMemset ERROR %d !!!\n", nnn);}
}
free(memory);
......@@ -258,16 +277,15 @@ void gridding_data(){
//Copy data back from device to host (to be deleted in next steps)
cudaMemcpyAsync(grid, grid_gpu, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), cudaMemcpyDeviceToHost, stream_reduce);
MPI_Barrier(MPI_COMM_WORLD);
cudaStreamSynchronize(stream_reduce);
cudaFree(gridss_gpu);
cudaFree(grid_gpu);
cudaFree(gridss_gpu);
cudaStreamDestroy(stream_reduce);
cudaStreamDestroy(stream_stacking);
ncclCommDestroy(comm);
......
......@@ -116,6 +116,8 @@ __global__ void convolve_g(
double* grid,
double std22)
{
//printf("DENTRO AL KERNEL\n");
uint gid = blockIdx.x*blockDim.x + threadIdx.x;
......@@ -131,6 +133,7 @@ __global__ void convolve_g(
double pos_u = uu[i] / dx;
double pos_v = vv[i] / dx;
double ww_i = ww[i] / dw;
int grid_w = (int)ww_i;
int grid_u = (int)pos_u;
int grid_v = (int)pos_v;
......@@ -140,22 +143,32 @@ __global__ void convolve_g(
uint jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1;
uint kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0;
uint kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1;
//printf("%ld, %ld, %ld, %ld\n",jmin,jmax,kmin,kmax);
// Convolve this point onto the grid.
for (k = kmin; k <= kmax; k++)
{
double v_dist = (double)k+0.5 - pos_v;
double v_dist = (double)k - pos_v;
int increaseprecision = 5;
for (j = jmin; j <= jmax; j++)
{
double u_dist = (double)j+0.5 - pos_u;
uint iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y);
//printf("--> %ld %d %d %d\n",iKer,j,k,grid_w);
int jKer = (int)(increaseprecision * (fabs(u_dist+(double)KernelLen)));
int kKer = (int)(increaseprecision * (fabs(v_dist+(double)KernelLen)));
#ifdef GAUSS_HI_PRECISION
double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist);
#endif
#ifdef GAUSS
double conv_weight = convkernel[jKer]*convkernel[kKer];
#endif
#ifdef KAISERBESSEL
double conv_weight = convkernel[jKer]*convkernel[kKer];
#endif
// Loops over frequencies and polarizations
double add_term_real = 0.0;
double add_term_img = 0.0;
......@@ -203,7 +216,14 @@ void wstack(
int grid_size_y,
double* grid,
int num_threads,
int rank)
#ifdef CUDACC
int rank,
cudaStream_t stream_stacking
#else
int rank
#endif
)
{
uint i;
//uint index;
......@@ -249,13 +269,10 @@ void wstack(
}
}
printf("Running rank %d/%d using GPU %d\n", rank, size, rank % ndevices);
#ifdef NVIDIA
prtAccelInfo();
#endif
printf("Running on GPU with %d threads and %d blocks\n",Nth,Nbl);
// Create GPU arrays and offload them
double * uu_g;
double * vv_g;
......@@ -263,32 +280,48 @@ void wstack(
float * vis_real_g;
float * vis_img_g;
float * weight_g;
double * grid_g;
//double * grid_g;
double * convkernel_g;
//Create the event inside stream stacking
//cudaEvent_t event_kernel;
//for (int i=0; i<100000; i++)grid[i]=23.0;
cudaError_t mmm;
//mmm=cudaEventCreate(&event_kernel);
mmm=cudaMalloc(&uu_g,num_points*sizeof(double));
mmm=cudaMalloc(&vv_g,num_points*sizeof(double));
mmm=cudaMalloc(&ww_g,num_points*sizeof(double));
mmm=cudaMalloc(&vis_real_g,Nvis*sizeof(float));
mmm=cudaMalloc(&vis_img_g,Nvis*sizeof(float));
mmm=cudaMalloc(&weight_g,(Nvis/freq_per_chan)*sizeof(float));
mmm=cudaMalloc(&grid_g,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double));
//mmm=cudaMalloc(&grid_g,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double));
/*
#if !defined(GAUSS_HI_PRECISION)
mmm=cudaMalloc(&convkernel_g,increaseprecision*w_support*sizeof(double));
#endif
*/
if (mmm != cudaSuccess) {printf("!!! w-stacking.cu cudaMalloc ERROR %d !!!\n", mmm);}
mmm=cudaMemset(grid_g,0.0,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double));
//mmm=cudaMemset(grid_g,0.0,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double));
if (mmm != cudaSuccess) {printf("!!! w-stacking.cu cudaMemset ERROR %d !!!\n", mmm);}
mmm=cudaMemcpy(uu_g, uu, num_points*sizeof(double), cudaMemcpyHostToDevice);
mmm=cudaMemcpy(vv_g, vv, num_points*sizeof(double), cudaMemcpyHostToDevice);
mmm=cudaMemcpy(ww_g, ww, num_points*sizeof(double), cudaMemcpyHostToDevice);
mmm=cudaMemcpy(vis_real_g, vis_real, Nvis*sizeof(float), cudaMemcpyHostToDevice);
mmm=cudaMemcpy(vis_img_g, vis_img, Nvis*sizeof(float), cudaMemcpyHostToDevice);
mmm=cudaMemcpy(weight_g, weight, (Nvis/freq_per_chan)*sizeof(float), cudaMemcpyHostToDevice);
if (mmm != cudaSuccess) {printf("!!! w-stacking.cu cudaMemcpy ERROR %d !!!\n", mmm);}
mmm=cudaMemcpyAsync(uu_g, uu, num_points*sizeof(double), cudaMemcpyHostToDevice, stream_stacking);
mmm=cudaMemcpyAsync(vv_g, vv, num_points*sizeof(double), cudaMemcpyHostToDevice, stream_stacking);
mmm=cudaMemcpyAsync(ww_g, ww, num_points*sizeof(double), cudaMemcpyHostToDevice, stream_stacking);
mmm=cudaMemcpyAsync(vis_real_g, vis_real, Nvis*sizeof(float), cudaMemcpyHostToDevice, stream_stacking);
mmm=cudaMemcpyAsync(vis_img_g, vis_img, Nvis*sizeof(float), cudaMemcpyHostToDevice, stream_stacking);
mmm=cudaMemcpyAsync(weight_g, weight, (Nvis/freq_per_chan)*sizeof(float), cudaMemcpyHostToDevice, stream_stacking);
/*
#if !defined(GAUSS_HI_PRECISION)
mmm=cudaMemcpyAsync(convkernel_g, convkernel, increaseprecision*w_support*sizeof(double), cudaMemcpyHostToDevice, stream_stacking);
#endif
*/
if (mmm != cudaSuccess) {printf("!!! w-stacking.cu cudaMemcpyAsync ERROR %d !!!\n", mmm);}
// Call main GPU Kernel
convolve_g <<<Nbl,Nth>>> (
convolve_g <<<Nbl,Nth,0,stream_stacking>>> (
num_w_planes,
num_points,
freq_per_chan,
......@@ -304,20 +337,33 @@ void wstack(
KernelLen,
grid_size_x,
grid_size_y,
grid_g,
std22);
grid,
std22
);
mmm=cudaStreamSynchronize(stream_stacking);
//Record the event
//mmm=cudaEventRecord(event_kernel,stream_stacking);
//Wait until the kernel ends
//mmm=cudaStreamWaitEvent(stream_stacking,event_kernel);
mmm = cudaMemcpy(grid, grid_g, 2*num_w_planes*grid_size_x*grid_size_y*sizeof(double), cudaMemcpyDeviceToHost);
//for (int i=0; i<100000; i++)printf("%f\n",grid[i]);
printf("CUDA ERROR %s\n",cudaGetErrorString(mmm));
mmm=cudaFree(uu_g);
mmm=cudaFree(vv_g);
mmm=cudaFree(ww_g);
mmm=cudaFree(vis_real_g);
mmm=cudaFree(vis_img_g);
mmm=cudaFree(weight_g);
mmm=cudaFree(grid_g);
//mmm=cudaFree(grid_g);
/*
#if !defined(GAUSS_HI_PRECISION)
mmm=cudaFree(convkernel_g);
#endif
*/
// Switch between CUDA and GPU versions
# else
......
......@@ -32,7 +32,8 @@ void wstack(
int,
double*,
int,
int);
int,
cudaStream_t);
}
#else
......@@ -58,6 +59,7 @@ void wstack(
#endif
#ifdef __CUDACC__
extern "C"
#endif
......@@ -81,6 +83,11 @@ void phase_correction(
int,
int);
double gauss_kernel_norm(
double norm,
double std22,
double u_dist,
double v_dist);
#ifdef ACCOMP
#pragma omp declare target (gauss_kernel_norm)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment