Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
H
HPC_Imaging
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Container registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Claudio Gheller
HPC_Imaging
Commits
9e323f26
Commit
9e323f26
authored
Nov 9, 2023
by
Giovanni Lacopo
Browse files
Options
Downloads
Patches
Plain Diff
Stacking/Reduce on GPU with reduced communication
parent
a42e9056
No related branches found
No related tags found
No related merge requests found
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
Makefile
+48
-13
48 additions, 13 deletions
Makefile
allvars_nccl.h
+0
-2
0 additions, 2 deletions
allvars_nccl.h
gridding_nccl.cu
+33
-15
33 additions, 15 deletions
gridding_nccl.cu
w-stacking.cu
+131
-85
131 additions, 85 deletions
w-stacking.cu
w-stacking.h
+9
-2
9 additions, 2 deletions
w-stacking.h
with
221 additions
and
117 deletions
Makefile
+
48
−
13
View file @
9e323f26
...
...
@@ -59,9 +59,9 @@ OPT += -DPHASE_ON
#OPT += -DNORMALIZE_UVW
# Gridding kernel: GAUSS, GAUSS_HI_PRECISION, KAISERBESSEL
OPT
+=
-DGAUSS_HI_PRECISION
#
OPT += -DGAUSS_HI_PRECISION
#
OPT += -DGAUSS
OPT
+=
-DGAUSS
#OPT += -DKAISERBESSEL
...
...
@@ -77,7 +77,7 @@ OPT += -DGAUSS_HI_PRECISION
#OPT += -DCUDACC
# use GPU acceleration via OMP
OPT
+=
-DACCOMP
#
OPT += -DACCOMP
# perform stacking on GPUs
#OPT += -DGPU_STACKING
...
...
@@ -125,7 +125,7 @@ DEPS = w-stacking.h main.c allvars.h
# ----- define which files will be compiled by MPICC
#
# these are the OBJS that will be compiled by C compiler if no acceleration (neither with CUDA nor with OpenMP) is provided
CC_OBJ_NOACC
=
allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o w-stacking.o phase_correction.o
CC_OBJ_NOACC
=
allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform
_3d
.o result.o numa.o reduce.o w-stacking.o phase_correction.o
# these are the OBJs that will be compiled by the normal MPICC compiler if GPU acceleration is switched on
CC_OBJ_ACC
=
allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform.o result.o numa.o reduce.o
...
...
@@ -142,13 +142,22 @@ DEPS_ACC_OMP = w-stacking.h phase_correction.c w-stacking.c
OBJ_ACC_OMP
=
phase_correction.o w-stacking.o
# ----- define what files will be compiled by NVC with OMP offloading when the stacking reduce is
# offloaded on GPU
ifeq
(CUDACC,$(findstring CUDACC,$(OPT)))
DEPS_NCCL_REDUCE
=
gridding_nccl.cu
OBJ_NCCL_REDUCE
=
gridding_nccl.o
DEPS_RCCL_REDUCE
=
gridding_rccl.cu
OBJ_RCCL_REDUCE
=
gridding_rccl.o
else
DEPS_NCCL_REDUCE
=
gridding_nccl.cpp
OBJ_NCCL_REDUCE
=
gridding_nccl.o
DEPS_RCCL_REDUCE
=
gridding_rccl.cpp
OBJ_RCCL_REDUCE
=
gridding_rccl.o
endif
# ----- define what files will be compiled by NVCC for Nvidia cufftMP implementation of FFT
#
...
...
@@ -196,6 +205,12 @@ phase_correction.c: phase_correction.cu
cuda_fft.cpp
:
cuda_fft.cu
cp
cuda_fft.cu cuda_fft.cpp
gridding_nccl.cpp
:
gridding_nccl.cu
cp
gridding_nccl.cu gridding_nccl.cpp
gridding_rccl.cpp
:
gridding_rccl.cu
cp
gridding_rccl.cu gridding_rccl.cpp
else
w-stacking.c
:
w-stacking.cu
rm
-f
w-stacking.c
...
...
@@ -206,6 +221,12 @@ phase_correction.c: phase_correction.cu
cuda_fft.cpp
:
cuda_fft.cu
rm
-f
cuda_fft.cpp
touch
cuda_fft.cpp
gridding_nccl.cpp
:
gridding_nccl.cu
rm
-f
gridding_nccl.cpp
touch
gridding_nccl.cpp
gridding_rccl.cpp
:
gridding_rccl.cu
rm
-f
gridding_rccl.cpp
touch
gridding_rccl.cpp
endif
...
...
@@ -221,7 +242,7 @@ LINKER=$(MPIC++)
FLAGS
=
$(
OPTIMIZE
)
LIBS
=
$(
NVLIB
)
$(OBJ_ACC_CUDA)
:
$(DEPS_ACC_CUDA)
$(
NVCC
)
$(
OPT
)
$(
OPT_NVCC
)
$(
CFLAGS
)
-c
*
.cu
$(
LIBS
)
$(
NVCC
)
$(
OPT
)
$(
OPT_NVCC
)
$(
CFLAGS
)
-c
w-stacking.cu phase_correction
.cu
$(
LIBS
)
OBJ
+=
$(
OBJ_ACC_CUDA
)
endif
...
...
@@ -256,6 +277,17 @@ endif
ifeq
(NCCL_REDUCE,$(findstring NCCL_REDUCE,$(OPT)))
ifeq
(CUDACC,$(findstring CUDACC,$(OPT)))
EXEC_EXT
:=
$(
EXEC_EXT
)
_acc-reduce
LINKER
=
$(
MPIC++
)
FLAGS
=
$(
OPTIMIZE
)
LIBS
=
$(
NVLIB_3
)
$(OBJ_NCCL_REDUCE)
:
$(DEPS_NCCL_REDUCE)
$(
NVCC
)
$(
OPT_NVCC
)
$(
OPT
)
-c
$^
$(
LIBS
)
OBJ
+=
$(
OBJ_NCCL_REDUCE
)
else
EXEC_EXT
:=
$(
EXEC_EXT
)
_acc-reduce
LINKER
=
$(
NVC++
)
FLAGS
=
$(
NVFLAGS
)
$(
CFLAGS
)
...
...
@@ -264,6 +296,7 @@ $(OBJ_NCCL_REDUCE): $(DEPS_NCCL_REDUCE)
$(
NVC++
)
$(
FLAGS
)
$(
OPT
)
-c
$^
$(
LIBS
)
OBJ
+=
$(
OBJ_NCCL_REDUCE
)
endif
endif
ifeq
(RCCL_REDUCE,$(findstring RCCL_REDUCE,$(OPT)))
EXEC_EXT
:=
$(
EXEC_EXT
)
_acc-reduce
...
...
@@ -319,9 +352,11 @@ clean:
rm
-f
w-stacking.c
rm
-f
phase_correction.c
rm
-f
cuda_fft.cpp
cleanall
:
rm
-f
$(
EXEC
)$(
EXT
)
rm
-f
*
.o
rm
-f
w-stacking.c
rm
-f
phase_correction.c
rm
-f
gridding_nccl.cpp
rm
-f
gridding_rccl.cpp
#cleanall:
# rm -f $(EXEC)$(EXT)
# rm -f *.o
# rm -f w-stacking.c
# rm -f phase_correction.c
This diff is collapsed.
Click to expand it.
allvars_nccl.h
+
0
−
2
View file @
9e323f26
...
...
@@ -23,8 +23,6 @@
#include
<omp.h>
#endif
#if defined(USE_FFTW) && !defined(CUFFTMP) // use MPI fftw
#include
<fftw3-mpi.h>
#endif
...
...
This diff is collapsed.
Click to expand it.
gridding_nccl.c
pp
→
gridding_nccl.c
u
+
33
−
15
View file @
9e323f26
#include
"allvars_nccl.h"
#include
"proto.h"
#include
<cuda.h>
...
...
@@ -87,7 +86,8 @@ void gridding_data(){
ncclUniqueId
id
;
ncclComm_t
comm
;
cudaStream_t
stream_reduce
;
cudaError_t
nnn
;
cudaStream_t
stream_reduce
,
stream_stacking
;
if
(
rank
==
0
)
ncclGetUniqueId
(
&
id
);
MPI_Bcast
((
void
*
)
&
id
,
sizeof
(
id
),
MPI_BYTE
,
0
,
MPI_COMM_WORLD
);
...
...
@@ -97,6 +97,7 @@ void gridding_data(){
cudaMalloc
(
&
grid_gpu
,
2
*
param
.
num_w_planes
*
xaxis
*
yaxis
*
sizeof
(
double
));
cudaMalloc
(
&
gridss_gpu
,
2
*
param
.
num_w_planes
*
xaxis
*
yaxis
*
sizeof
(
double
));
cudaStreamCreate
(
&
stream_reduce
);
cudaStreamCreate
(
&
stream_stacking
);
ncclCommInitRank
(
&
comm
,
size
,
id
,
rank
);
...
...
@@ -190,16 +191,37 @@ void gridding_data(){
printf
(
"Processing sector %ld
\n
"
,
isector
);
#endif
double
*
stacking_target_array
;
if
(
size
>
1
)
stacking_target_array
=
gridss
;
else
stacking_target_array
=
grid
;
start
=
CPU_TIME_wt
;
//We have to call different GPUs per MPI task!!! [GL]
#ifdef CUDACC
wstack
(
param
.
num_w_planes
,
Nsec
,
metaData
.
freq_per_chan
,
metaData
.
polarisations
,
uus
,
vvs
,
wws
,
visreals
,
visimgs
,
weightss
,
dx
,
dw
,
param
.
w_support
,
xaxis
,
yaxis
,
gridss_gpu
,
param
.
num_threads
,
rank
,
stream_stacking
);
#else
wstack
(
param
.
num_w_planes
,
Nsec
,
metaData
.
freq_per_chan
,
...
...
@@ -218,14 +240,13 @@ void gridding_data(){
stacking_target_array
,
param
.
num_threads
,
rank
);
#endif
//Allocate memory on devices non-blocking for the host
///////////////////////////////////////////////////////
timing_wt
.
kernel
+=
CPU_TIME_wt
-
start
;
cudaMemcpyAsync
(
gridss_gpu
,
gridss
,
2
*
param
.
num_w_planes
*
xaxis
*
yaxis
*
sizeof
(
double
),
cudaMemcpyHostToDevice
,
stream_reduce
);
#ifdef VERBOSE
printf
(
"Processed sector %ld
\n
"
,
isector
);
#endif
...
...
@@ -239,8 +260,6 @@ void gridding_data(){
// int target_rank = (int)isector; it implied that size >= nsectors
int
target_rank
=
(
int
)(
isector
%
size
);
cudaStreamSynchronize
(
stream_reduce
);
start
=
CPU_TIME_wt
;
ncclReduce
(
gridss_gpu
,
grid_gpu
,
size_of_grid
,
ncclDouble
,
ncclSum
,
target_rank
,
comm
,
stream_reduce
);
...
...
@@ -248,9 +267,9 @@ void gridding_data(){
timing_wt
.
reduce
+=
CPU_TIME_wt
-
start
;
// Go to next sector
memset
(
gridss
,
0
,
2
*
param
.
num_w_planes
*
xaxis
*
yaxis
*
sizeof
(
double
)
);
nnn
=
cudaMemset
(
gridss_gpu
,
0.0
,
2
*
param
.
num_w_planes
*
xaxis
*
yaxis
*
sizeof
(
double
)
);
if
(
nnn
!=
cudaSuccess
)
{
printf
(
"!!! w-stacking.cu cudaMemset ERROR %d !!!
\n
"
,
nnn
);}
}
free
(
memory
);
...
...
@@ -258,16 +277,15 @@ void gridding_data(){
//Copy data back from device to host (to be deleted in next steps)
cudaMemcpyAsync
(
grid
,
grid_gpu
,
2
*
param
.
num_w_planes
*
xaxis
*
yaxis
*
sizeof
(
double
),
cudaMemcpyDeviceToHost
,
stream_reduce
);
MPI_Barrier
(
MPI_COMM_WORLD
);
cudaStreamSynchronize
(
stream_reduce
);
cudaFree
(
gridss_gpu
);
cudaFree
(
grid_gpu
);
cudaFree
(
gridss_gpu
);
cudaStreamDestroy
(
stream_reduce
);
cudaStreamDestroy
(
stream_stacking
);
ncclCommDestroy
(
comm
);
...
...
This diff is collapsed.
Click to expand it.
w-stacking.cu
+
131
−
85
View file @
9e323f26
...
...
@@ -116,6 +116,8 @@ __global__ void convolve_g(
double
*
grid
,
double
std22
)
{
//printf("DENTRO AL KERNEL\n");
uint
gid
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
...
...
@@ -131,6 +133,7 @@ __global__ void convolve_g(
double
pos_u
=
uu
[
i
]
/
dx
;
double
pos_v
=
vv
[
i
]
/
dx
;
double
ww_i
=
ww
[
i
]
/
dw
;
int
grid_w
=
(
int
)
ww_i
;
int
grid_u
=
(
int
)
pos_u
;
int
grid_v
=
(
int
)
pos_v
;
...
...
@@ -140,22 +143,32 @@ __global__ void convolve_g(
uint
jmax
=
(
grid_u
<
grid_size_x
-
KernelLen
)
?
grid_u
+
KernelLen
:
grid_size_x
-
1
;
uint
kmin
=
(
grid_v
>
KernelLen
-
1
)
?
grid_v
-
KernelLen
:
0
;
uint
kmax
=
(
grid_v
<
grid_size_y
-
KernelLen
)
?
grid_v
+
KernelLen
:
grid_size_y
-
1
;
//printf("%ld, %ld, %ld, %ld\n",jmin,jmax,kmin,kmax);
// Convolve this point onto the grid.
for
(
k
=
kmin
;
k
<=
kmax
;
k
++
)
{
double
v_dist
=
(
double
)
k
+
0.5
-
pos_v
;
double
v_dist
=
(
double
)
k
-
pos_v
;
int
increaseprecision
=
5
;
for
(
j
=
jmin
;
j
<=
jmax
;
j
++
)
{
double
u_dist
=
(
double
)
j
+
0.5
-
pos_u
;
uint
iKer
=
2
*
(
j
+
k
*
grid_size_x
+
grid_w
*
grid_size_x
*
grid_size_y
);
//printf("--> %ld %d %d %d\n",iKer,j,k,grid_w);
int
jKer
=
(
int
)(
increaseprecision
*
(
fabs
(
u_dist
+
(
double
)
KernelLen
)));
int
kKer
=
(
int
)(
increaseprecision
*
(
fabs
(
v_dist
+
(
double
)
KernelLen
)));
#ifdef GAUSS_HI_PRECISION
double
conv_weight
=
gauss_kernel_norm
(
norm
,
std22
,
u_dist
,
v_dist
);
#endif
#ifdef GAUSS
double
conv_weight
=
convkernel
[
jKer
]
*
convkernel
[
kKer
];
#endif
#ifdef KAISERBESSEL
double
conv_weight
=
convkernel
[
jKer
]
*
convkernel
[
kKer
];
#endif
// Loops over frequencies and polarizations
double
add_term_real
=
0.0
;
double
add_term_img
=
0.0
;
...
...
@@ -203,7 +216,14 @@ void wstack(
int
grid_size_y
,
double
*
grid
,
int
num_threads
,
int
rank
)
#ifdef CUDACC
int
rank
,
cudaStream_t
stream_stacking
#else
int
rank
#endif
)
{
uint
i
;
//uint index;
...
...
@@ -249,13 +269,10 @@ void wstack(
}
}
printf
(
"Running rank %d/%d using GPU %d
\n
"
,
rank
,
size
,
rank
%
ndevices
);
#ifdef NVIDIA
prtAccelInfo
();
#endif
printf
(
"Running on GPU with %d threads and %d blocks
\n
"
,
Nth
,
Nbl
);
// Create GPU arrays and offload them
double
*
uu_g
;
double
*
vv_g
;
...
...
@@ -263,32 +280,48 @@ void wstack(
float
*
vis_real_g
;
float
*
vis_img_g
;
float
*
weight_g
;
double
*
grid_g
;
//double * grid_g;
double
*
convkernel_g
;
//Create the event inside stream stacking
//cudaEvent_t event_kernel;
//for (int i=0; i<100000; i++)grid[i]=23.0;
cudaError_t
mmm
;
//mmm=cudaEventCreate(&event_kernel);
mmm
=
cudaMalloc
(
&
uu_g
,
num_points
*
sizeof
(
double
));
mmm
=
cudaMalloc
(
&
vv_g
,
num_points
*
sizeof
(
double
));
mmm
=
cudaMalloc
(
&
ww_g
,
num_points
*
sizeof
(
double
));
mmm
=
cudaMalloc
(
&
vis_real_g
,
Nvis
*
sizeof
(
float
));
mmm
=
cudaMalloc
(
&
vis_img_g
,
Nvis
*
sizeof
(
float
));
mmm
=
cudaMalloc
(
&
weight_g
,(
Nvis
/
freq_per_chan
)
*
sizeof
(
float
));
mmm
=
cudaMalloc
(
&
grid_g
,
2
*
num_w_planes
*
grid_size_x
*
grid_size_y
*
sizeof
(
double
));
//mmm=cudaMalloc(&grid_g,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double));
/*
#if !defined(GAUSS_HI_PRECISION)
mmm=cudaMalloc(&convkernel_g,increaseprecision*w_support*sizeof(double));
#endif
*/
if
(
mmm
!=
cudaSuccess
)
{
printf
(
"!!! w-stacking.cu cudaMalloc ERROR %d !!!
\n
"
,
mmm
);}
mmm
=
cudaMemset
(
grid_g
,
0.0
,
2
*
num_w_planes
*
grid_size_x
*
grid_size_y
*
sizeof
(
double
));
//
mmm=cudaMemset(grid_g,0.0,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double));
if
(
mmm
!=
cudaSuccess
)
{
printf
(
"!!! w-stacking.cu cudaMemset ERROR %d !!!
\n
"
,
mmm
);}
mmm
=
cudaMemcpy
(
uu_g
,
uu
,
num_points
*
sizeof
(
double
),
cudaMemcpyHostToDevice
);
mmm
=
cudaMemcpy
(
vv_g
,
vv
,
num_points
*
sizeof
(
double
),
cudaMemcpyHostToDevice
);
mmm
=
cudaMemcpy
(
ww_g
,
ww
,
num_points
*
sizeof
(
double
),
cudaMemcpyHostToDevice
);
mmm
=
cudaMemcpy
(
vis_real_g
,
vis_real
,
Nvis
*
sizeof
(
float
),
cudaMemcpyHostToDevice
);
mmm
=
cudaMemcpy
(
vis_img_g
,
vis_img
,
Nvis
*
sizeof
(
float
),
cudaMemcpyHostToDevice
);
mmm
=
cudaMemcpy
(
weight_g
,
weight
,
(
Nvis
/
freq_per_chan
)
*
sizeof
(
float
),
cudaMemcpyHostToDevice
);
if
(
mmm
!=
cudaSuccess
)
{
printf
(
"!!! w-stacking.cu cudaMemcpy ERROR %d !!!
\n
"
,
mmm
);}
mmm
=
cudaMemcpyAsync
(
uu_g
,
uu
,
num_points
*
sizeof
(
double
),
cudaMemcpyHostToDevice
,
stream_stacking
);
mmm
=
cudaMemcpyAsync
(
vv_g
,
vv
,
num_points
*
sizeof
(
double
),
cudaMemcpyHostToDevice
,
stream_stacking
);
mmm
=
cudaMemcpyAsync
(
ww_g
,
ww
,
num_points
*
sizeof
(
double
),
cudaMemcpyHostToDevice
,
stream_stacking
);
mmm
=
cudaMemcpyAsync
(
vis_real_g
,
vis_real
,
Nvis
*
sizeof
(
float
),
cudaMemcpyHostToDevice
,
stream_stacking
);
mmm
=
cudaMemcpyAsync
(
vis_img_g
,
vis_img
,
Nvis
*
sizeof
(
float
),
cudaMemcpyHostToDevice
,
stream_stacking
);
mmm
=
cudaMemcpyAsync
(
weight_g
,
weight
,
(
Nvis
/
freq_per_chan
)
*
sizeof
(
float
),
cudaMemcpyHostToDevice
,
stream_stacking
);
/*
#if !defined(GAUSS_HI_PRECISION)
mmm=cudaMemcpyAsync(convkernel_g, convkernel, increaseprecision*w_support*sizeof(double), cudaMemcpyHostToDevice, stream_stacking);
#endif
*/
if
(
mmm
!=
cudaSuccess
)
{
printf
(
"!!! w-stacking.cu cudaMemcpyAsync ERROR %d !!!
\n
"
,
mmm
);}
// Call main GPU Kernel
convolve_g
<<<
Nbl
,
Nth
>>>
(
convolve_g
<<<
Nbl
,
Nth
,
0
,
stream_stacking
>>>
(
num_w_planes
,
num_points
,
freq_per_chan
,
...
...
@@ -304,20 +337,33 @@ void wstack(
KernelLen
,
grid_size_x
,
grid_size_y
,
grid_g
,
std22
);
grid
,
std22
);
mmm
=
cudaStreamSynchronize
(
stream_stacking
);
//Record the event
//mmm=cudaEventRecord(event_kernel,stream_stacking);
//Wait until the kernel ends
//mmm=cudaStreamWaitEvent(stream_stacking,event_kernel);
mmm
=
cudaMemcpy
(
grid
,
grid_g
,
2
*
num_w_planes
*
grid_size_x
*
grid_size_y
*
sizeof
(
double
),
cudaMemcpyDeviceToHost
);
//for (int i=0; i<100000; i++)printf("%f\n",grid[i]);
printf
(
"CUDA ERROR %s
\n
"
,
cudaGetErrorString
(
mmm
));
mmm
=
cudaFree
(
uu_g
);
mmm
=
cudaFree
(
vv_g
);
mmm
=
cudaFree
(
ww_g
);
mmm
=
cudaFree
(
vis_real_g
);
mmm
=
cudaFree
(
vis_img_g
);
mmm
=
cudaFree
(
weight_g
);
mmm
=
cudaFree
(
grid_g
);
//mmm=cudaFree(grid_g);
/*
#if !defined(GAUSS_HI_PRECISION)
mmm=cudaFree(convkernel_g);
#endif
*/
// Switch between CUDA and GPU versions
# else
...
...
This diff is collapsed.
Click to expand it.
w-stacking.h
+
9
−
2
View file @
9e323f26
...
...
@@ -32,7 +32,8 @@ void wstack(
int
,
double
*
,
int
,
int
);
int
,
cudaStream_t
);
}
#else
...
...
@@ -58,6 +59,7 @@ void wstack(
#endif
#ifdef __CUDACC__
extern
"C"
#endif
...
...
@@ -81,6 +83,11 @@ void phase_correction(
int
,
int
);
double
gauss_kernel_norm
(
double
norm
,
double
std22
,
double
u_dist
,
double
v_dist
);
#ifdef ACCOMP
#pragma omp declare target (gauss_kernel_norm)
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment