Commit c66c4671 authored by Emanuele De Rubeis

Compilation updates - CPU/GPU interoperability

parent 84e3702e
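The changes below wire the CPU and GPU builds together: the Makefile defaults now select the CUDA path (CUDACC, GPU_STACKING, NCCL_REDUCE) instead of FULL_AMD, the cuFFT buffers and copies are corrected from 2*xaxis*yaxis to xaxis*yaxis complex elements, and most CUDA runtime calls now capture and check their return codes. As a minimal sketch of that check-after-call pattern (the CHECK_CUDA macro name is illustrative only; the repository writes each check out inline):

#include <cstdio>
#include <cuda_runtime.h>

// Illustrative helper for the check-after-call pattern used throughout
// this commit; the repository inlines the checks instead of using a macro.
#define CHECK_CUDA(call)                                              \
  do {                                                                \
    cudaError_t err = (call);                                         \
    if (err != cudaSuccess)                                           \
      printf("!!! %s:%d CUDA ERROR %d (%s) !!!\n",                    \
             __FILE__, __LINE__, err, cudaGetErrorString(err));       \
  } while (0)

int main() {
  double *grid_gpu;
  CHECK_CUDA(cudaMalloc(&grid_gpu, 1024 * sizeof(double)));
  CHECK_CUDA(cudaMemset(grid_gpu, 0, 1024 * sizeof(double)));
  CHECK_CUDA(cudaFree(grid_gpu));
  return 0;
}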
@@ -20,8 +20,8 @@ endif
 LINKER=$(MPICC)
-FFTW_MPI_INC = -I/opt/cray/pe/fftw/3.3.10.5/x86_rome/include
-FFTW_MPI_LIB = -L/opt/cray/pe/fftw/3.3.10.5/x86_rome/lib
+FFTW_MPI_INC =
+FFTW_MPI_LIB =
 CFLAGS += -I./
@@ -63,11 +63,11 @@ OPT += -DPHASE_ON
 # SELECT THE GRIDDING KERNEL: GAUSS, GAUSS_HI_PRECISION, KAISERBESSEL
-#OPT += -DGAUSS_HI_PRECISION
+OPT += -DGAUSS_HI_PRECISION
 #OPT += -DGAUSS
-OPT += -DKAISERBESSEL
+#OPT += -DKAISERBESSEL
 # ========================================================
@@ -77,16 +77,16 @@ OPT += -DKAISERBESSEL
 #OPT += -DNVIDIA
 # use CUDA for GPUs
-#OPT += -DCUDACC
+OPT += -DCUDACC
 # use GPU acceleration via OMP
 #OPT += -DACCOMP
 # perform stacking on GPUs
-#OPT += -DGPU_STACKING
+OPT += -DGPU_STACKING
 # use NVIDIA GPU to perform the reduce
-#OPT += -DNCCL_REDUCE
+OPT += -DNCCL_REDUCE
 # use GPU to perform FFT
 #OPT += -DCUFFTMP
@@ -109,7 +109,7 @@ endif
 #OPT += -DRCCL_REDUCE
 # FULL AMD GPU SUPPORT - Recommended for full AMD GPU code execution
-OPT += -DFULL_AMD
+#OPT += -DFULL_AMD
 ifeq (FULL_AMD,$(findstring FULL_AMD,$(OPT)))
 OPT += -DHIPCC -DRCCL_REDUCE -D__HIP_PLATFORM_AMD__
 endif
@@ -30,7 +30,6 @@ __global__ void write_grid(
 fftwgrid[fftwindex2D].x = grid[fftwindex];
 fftwgrid[fftwindex2D].y = grid[fftwindex+1];
-}
 }
@@ -88,8 +87,9 @@ void cuda_fft(
 cufftDoubleComplex *fftwgrid;
 // Allocate fftwgrid on the GPU using cudaMalloc
-mmm=cudaMalloc(&fftwgrid, sizeof(cufftDoubleComplex)*2*yaxis*xaxis);
+mmm=cudaMalloc(&fftwgrid, sizeof(cufftDoubleComplex)*yaxis*xaxis);
+if (mmm != cudaSuccess) {printf("!!! cuda_fft.cu cudaMalloc ERROR %d !!!\n", mmm);}
 int Nth = 32;
@@ -146,8 +146,11 @@ void cuda_fft(
 status = cufftXtMalloc(plan, &fftwgrid_g, CUFFT_XT_FORMAT_INPLACE);
 if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMalloc ERROR %d !!!\n", status);}
 cudaStreamSynchronize(stream);
 //Copy the array to be transformed onto the descriptor structure array
-cudaMemcpy(fftwgrid_g->descriptor->data[0], fftwgrid, 2*xaxis*yaxis*sizeof(cufftDoubleComplex), cudaMemcpyDeviceToDevice);
+mmm = cudaMemcpy(fftwgrid_g->descriptor->data[0], fftwgrid, xaxis*yaxis*sizeof(cufftDoubleComplex), cudaMemcpyDeviceToDevice);
+if (mmm != cudaSuccess) {printf("!!! cudaMemcpy 1 ERROR %d !!!\n", mmm);}
 //Perform the FFT
 status = cufftXtExecDescriptor(plan, fftwgrid_g, fftwgrid_g, CUFFT_INVERSE);
@@ -163,7 +166,8 @@ void cuda_fft(
 if (status != CUFFT_SUCCESS) {printf("!!! cufftXtMemcpy dtd fftwgrid ERROR %d !!!\n", status);}
 //Copy the result descriptor structure array again onto the original fftwgrid
-cudaMemcpy(fftwgrid, fftwgrid_g2->descriptor->data[0], 2*xaxis*yaxis*sizeof(cufftDoubleComplex), cudaMemcpyDeviceToDevice);
+mmm = cudaMemcpy(fftwgrid, fftwgrid_g2->descriptor->data[0], xaxis*yaxis*sizeof(cufftDoubleComplex), cudaMemcpyDeviceToDevice);
+if (mmm != cudaSuccess) {printf("!!! cudaMemcpy 2 ERROR %d !!!\n", mmm);}
 //Write gridss starting from fftwgrid
 write_gridss<<<Nbl, Nth>>>(num_w_planes, xaxis, yaxis, fftwgrid, gridss, norm, iw);
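For orientation, the descriptor-based cufftXt flow that cuda_fft follows in this hunk is sketched below. Assumptions: plan is an already-created distributed cuFFT plan (communicator attachment and plan creation are omitted), and local_elems stands for the xaxis*yaxis complex elements owned by this rank; this is a sketch, not the repository's exact code.

#include <cstdio>
#include <cuda_runtime.h>
#include <cufftXt.h>

// Hedged sketch of the descriptor workflow above: stage data into the
// library-managed descriptor, execute in place, copy the result back.
void xt_fft_sketch(cufftHandle plan, cufftDoubleComplex *fftwgrid,
                   size_t local_elems) {
  cudaLibXtDesc *desc;
  cufftResult status = cufftXtMalloc(plan, &desc, CUFFT_XT_FORMAT_INPLACE);
  if (status != CUFFT_SUCCESS) printf("!!! cufftXtMalloc ERROR %d !!!\n", status);

  // Stage the device-resident local slab into the descriptor buffer.
  cudaMemcpy(desc->descriptor->data[0], fftwgrid,
             local_elems * sizeof(cufftDoubleComplex),
             cudaMemcpyDeviceToDevice);

  // Execute the inverse FFT in place through the descriptor.
  status = cufftXtExecDescriptor(plan, desc, desc, CUFFT_INVERSE);
  if (status != CUFFT_SUCCESS) printf("!!! cufftXtExecDescriptor ERROR %d !!!\n", status);

  // Copy the transformed slab back and release the descriptor.
  cudaMemcpy(fftwgrid, desc->descriptor->data[0],
             local_elems * sizeof(cufftDoubleComplex),
             cudaMemcpyDeviceToDevice);
  cufftXtFree(desc);
}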
@@ -191,4 +195,3 @@ void cuda_fft(
 }
 #endif
@@ -104,8 +104,6 @@ void fftw_data ( void )
 // FFT transform the data using cuFFT
 if(rank==0)printf("PERFORMING CUDA FFT\n");
-MPI_Barrier(MPI_COMM_WORLD);
 double start = CPU_TIME_wt;
@@ -120,8 +118,6 @@ void fftw_data ( void )
 rank,
 MPI_COMM_WORLD);
-MPI_Barrier(MPI_COMM_WORLD);
 timing_wt.cufftmp += CPU_TIME_wt - start;
 return;
@@ -213,7 +209,7 @@ void write_fftw_data(){
 double* image_real = (double*) calloc(xaxis*yaxis,sizeof(double));
 double* image_imag = (double*) calloc(xaxis*yaxis,sizeof(double));
-#ifdef CUDACC
+#ifdef CUFFTMP
 phase_correction(gridss_gpu,image_real,image_imag,xaxis,yaxis,param.num_w_planes,param.grid_size_x,param.grid_size_y,resolution,metaData.wmin,metaData.wmax,param.num_threads,rank);
 #else
 phase_correction(gridss,image_real,image_imag,xaxis,yaxis,param.num_w_planes,param.grid_size_x,param.grid_size_y,resolution,metaData.wmin,metaData.wmax,param.num_threads,rank);
@@ -95,10 +95,17 @@ void gridding_data(){
 cudaSetDevice(local_rank);
-cudaMalloc(&grid_gpu, 2*param.num_w_planes*xaxis*yaxis * sizeof(double));
-cudaMalloc(&gridss_gpu, 2*param.num_w_planes*xaxis*yaxis * sizeof(double));
-cudaStreamCreate(&stream_reduce);
+nnn = cudaMalloc(&grid_gpu, 2*param.num_w_planes*xaxis*yaxis * sizeof(double));
+if (nnn != cudaSuccess) {printf("!!! gridding_nccl.cu cudaMalloc &grid_gpu ERROR %d !!!\n", nnn);}
+nnn = cudaMalloc(&gridss_gpu, 2*param.num_w_planes*xaxis*yaxis * sizeof(double));
+if (nnn != cudaSuccess) {printf("!!! gridding_nccl.cu cudaMalloc &gridss_gpu ERROR %d !!!\n", nnn);}
+nnn = cudaStreamCreate(&stream_reduce);
+if (nnn != cudaSuccess) {printf("!!! gridding_nccl.cu cudaStreamCreate &stream_reduce ERROR %d !!!\n", nnn);}
+nnn = cudaStreamCreate(&stream_stacking);
+if (nnn != cudaSuccess) {printf("!!! gridding_nccl.cu cudaStreamCreate &stream_stacking ERROR %d !!!\n", nnn);}
 ncclCommInitRank(&comm, size, id, rank);
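The communicator initialized here lets the per-rank grids be reduced directly between GPUs. Below is a minimal sketch of that init-plus-reduce sequence, assuming one GPU per MPI rank and that id was created on rank 0 with ncclGetUniqueId and broadcast over MPI; whether the repository uses ncclReduce or ncclAllReduce is not visible in this hunk, so ncclAllReduce is an assumption.

#include <cstdio>
#include <cuda_runtime.h>
#include <nccl.h>

// Hedged sketch: sum every rank's gridss_gpu (count doubles) into
// grid_gpu without staging through host memory.
void nccl_reduce_sketch(ncclUniqueId id, int rank, int size,
                        double *gridss_gpu, double *grid_gpu, size_t count,
                        cudaStream_t stream_reduce) {
  ncclComm_t comm;
  ncclCommInitRank(&comm, size, id, rank);

  ncclResult_t r = ncclAllReduce(gridss_gpu, grid_gpu, count, ncclDouble,
                                 ncclSum, comm, stream_reduce);
  if (r != ncclSuccess) printf("!!! NCCL ERROR %d !!!\n", r);

  cudaStreamSynchronize(stream_reduce);
  ncclCommDestroy(comm);
}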
@@ -263,7 +270,7 @@ void gridding_data(){
 // Go to next sector
 nnn = cudaMemset( gridss_gpu, 0.0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) );
-if (nnn != cudaSuccess) {printf("!!! w-stacking.cu cudaMemset ERROR %d !!!\n", nnn);}
+if (nnn != cudaSuccess) {printf("!!! gridding_nccl.cu cudaMemset ERROR %d !!!\n", nnn);}
 }
 free(memory);
@@ -274,11 +281,18 @@ void gridding_data(){
 //cudaMemcpyAsync(grid, grid_gpu, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), cudaMemcpyDeviceToHost, stream_reduce);
+#if !defined(CUFFTMP)
+cudaMemcpyAsync(grid, grid_gpu, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), cudaMemcpyDeviceToHost, stream_reduce);
+cudaStreamSynchronize(stream_reduce);
+#endif
 MPI_Barrier(MPI_COMM_WORLD);
 //cudaStreamSynchronize(stream_reduce);
 // cudaFree(grid_gpu);
 //cudaFree(gridss_gpu);
+#if !defined(CUFFTMP)
+cudaFree(grid_gpu);
+cudaFree(gridss_gpu);
+#endif
 cudaStreamDestroy(stream_reduce);
 cudaStreamDestroy(stream_stacking);
@@ -129,6 +129,13 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in
 double * image_real_g;
 double * image_imag_g;
+#if !defined(CUFFTMP)
+double * gridss_g;
+mmm = cudaMalloc(&gridss_g, 2*num_w_planes*xaxis*yaxis*sizeof(double));
+mmm = cudaMemcpy(gridss_g, gridss, 2*num_w_planes*xaxis*yaxis*sizeof(double), cudaMemcpyHostToDevice);
+#endif
 mmm=cudaMalloc(&image_real_g, xaxis*yaxis*sizeof(double));
 //printf("CUDA ERROR 2 %s\n",cudaGetErrorString(mmm));
 mmm=cudaMalloc(&image_imag_g, xaxis*yaxis*sizeof(double));
@@ -144,7 +151,11 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in
 phase_g <<<Nbl,Nth>>> (xaxis,
 yaxis,
 num_w_planes,
+#if defined(CUFFTMP)
 gridss,
+#else
+gridss_g,
+#endif
 image_real_g,
 image_imag_g,
 wmin,
@@ -160,7 +171,13 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in
 mmm = cudaMemcpy(image_imag, image_imag_g, xaxis*yaxis*sizeof(double), cudaMemcpyDeviceToHost);
 //printf("CUDA ERROR 8 %s\n",cudaGetErrorString(mmm));
-mmm= cudaFree(gridss);
+#if !defined(CUFFTMP)
+cudaFree(gridss_g);
+#else
+cudaFree(gridss);
+#endif
 #else
 #ifndef ACCOMP
@@ -8,6 +8,10 @@
 #ifdef __CUDACC__
 #include "allvars_nccl.h"
+#elif HIPCC
+#include "allvars_rccl.h"
+#else
+#include "allvars.h"
 #endif
 #include "proto.h"
@@ -286,8 +290,14 @@ void wstack(
 float * vis_real_g;
 float * vis_img_g;
 float * weight_g;
-//double * grid_g;
 double * convkernel_g;
+#if !defined(NCCL_REDUCE)
+double * grid_g;
+#endif
+#if !defined(NCCL_REDUCE)
+cudaStream_t stream_stacking;
+cudaStreamCreate(&stream_stacking);
+#endif
 //Create the event inside stream stacking
 //cudaEvent_t event_kernel;
@@ -303,14 +313,19 @@ void wstack(
 mmm=cudaMalloc(&weight_g,(Nvis/freq_per_chan)*sizeof(float));
-//mmm=cudaMalloc(&grid_g,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double));
+#if !defined(NCCL_REDUCE)
+mmm = cudaMalloc(&grid_g,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double));
+#endif
 #if !defined(GAUSS_HI_PRECISION)
 mmm=cudaMalloc(&convkernel_g,increaseprecision*w_support*sizeof(double));
 #endif
 if (mmm != cudaSuccess) {printf("!!! w-stacking.cu cudaMalloc ERROR %d !!!\n", mmm);}
-//mmm=cudaMemset(grid_g,0.0,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double));
-if (mmm != cudaSuccess) {printf("!!! w-stacking.cu cudaMemset ERROR %d !!!\n", mmm);}
+#if !defined(NCCL_REDUCE)
+mmm=cudaMemset(grid_g,0.0,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double));
+if (mmm != cudaSuccess) {printf("!!! w-stacking.cu cudaMemset ERROR %d !!!\n", mmm);}
+#endif
 mmm=cudaMemcpyAsync(uu_g, uu, num_points*sizeof(double), cudaMemcpyHostToDevice, stream_stacking);
 mmm=cudaMemcpyAsync(vv_g, vv, num_points*sizeof(double), cudaMemcpyHostToDevice, stream_stacking);
@@ -344,7 +359,11 @@ void wstack(
 KernelLen,
 grid_size_x,
 grid_size_y,
+#if !defined(NCCL_REDUCE)
 grid_g,
+#else
+grid,
+#endif
 std22
 );
 #else
@@ -364,13 +383,16 @@ void wstack(
 KernelLen,
 grid_size_x,
 grid_size_y,
+#if !defined(NCCL_REDUCE)
 grid_g,
+#else
+grid,
+#endif
 std22,
 convkernel_g
 );
 #endif
 mmm=cudaStreamSynchronize(stream_stacking);
 //Record the event
 //mmm=cudaEventRecord(event_kernel,stream_stacking);
@@ -380,6 +402,10 @@ void wstack(
 //for (int i=0; i<100000; i++)printf("%f\n",grid[i]);
+#if !defined(NCCL_REDUCE)
 mmm=cudaMemcpy(grid, grid_g, 2*num_w_planes*grid_size_x*grid_size_y*sizeof(double), cudaMemcpyDeviceToHost);
+#endif
 if (mmm != cudaSuccess)
 printf("CUDA ERROR %s\n",cudaGetErrorString(mmm));
@@ -389,7 +415,10 @@ void wstack(
 mmm=cudaFree(vis_real_g);
 mmm=cudaFree(vis_img_g);
 mmm=cudaFree(weight_g);
-//mmm=cudaFree(grid_g);
+#if !defined(NCCL_REDUCE)
+mmm=cudaFree(grid_g);
+#endif
 #if !defined(GAUSS_HI_PRECISION)
 mmm=cudaFree(convkernel_g);
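All of the wstack edits above implement one pattern, the same one the phase_correction hunks apply with CUFFTMP: when NCCL_REDUCE is defined the kernel writes straight into the device-resident grid owned by gridding_data, otherwise wstack allocates its own grid_g staging buffer and copies the result back to the host. A condensed sketch of that selection (grid_elems stands for 2*num_w_planes*grid_size_x*grid_size_y; the kernel launch is elided):

#include <cuda_runtime.h>

// Hedged condensation of wstack's buffer selection. With NCCL_REDUCE,
// grid is already a device pointer and is written directly; without it,
// a local device buffer is zeroed, filled, and copied back to the host.
void grid_target_sketch(double *grid, size_t grid_elems) {
#if !defined(NCCL_REDUCE)
  double *grid_g;                  // wstack-owned device staging buffer
  cudaMalloc(&grid_g, grid_elems * sizeof(double));
  cudaMemset(grid_g, 0, grid_elems * sizeof(double));
  double *target = grid_g;
#else
  double *target = grid;           // persistent device grid from the caller
#endif
  (void)target;                    // ... launch the gridding kernel on target ...
#if !defined(NCCL_REDUCE)
  cudaMemcpy(grid, grid_g, grid_elems * sizeof(double), cudaMemcpyDeviceToHost);
  cudaFree(grid_g);
#endif
}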