Commit 732d1d44 authored by Emanuele De Rubeis

Full AMD GPUs enabling

parent f7773175
@@ -91,18 +91,29 @@ OPT += -DGAUSS_HI_PRECISION
 # use GPU to perform FFT
 #OPT += -DCUFFTMP
-# FULL GPU SUPPORT - Recommended for full NVIDIA GPU code execution
+# FULL NVIDIA GPU SUPPORT - Recommended for full NVIDIA GPU code execution
 OPT += -DFULL_NVIDIA
 ifeq (FULL_NVIDIA,$(findstring FULL_NVIDIA,$(OPT)))
 OPT += -DCUDACC -DNCCL_REDUCE -DCUFFTMP
 endif
+# use HIP for GPUs
+#OPT += -DHIPCC
 # support for AMD GPUs
-#OPT += __HIP_PLATFORM_AMD__
+#OPT += -D__HIP_PLATFORM_AMD__
 # use AMD GPU to perform the reduce
 #OPT += -DRCCL_REDUCE
+# FULL AMD GPU SUPPORT - Recommended for full AMD GPU code execution
+#OPT += -DFULL_AMD
+ifeq (FULL_AMD,$(findstring FULL_AMD,$(OPT)))
+OPT += -DHIPCC -DRCCL_REDUCE -D__HIP_PLATFORM_AMD__
+endif
 # =======================================================
 # OUTPUT HANDLING
@@ -167,6 +178,11 @@ CC_OBJ_ACC = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform
 DEPS_ACC_CUDA = w-stacking.h w-stacking.cu phase_correction.cu
 OBJ_ACC_CUDA = phase_correction.o w-stacking.o
+# ----- define which files will be compiled by HIPCC for AMD
+#
+DEPS_ACC_HIP = w-stacking.hip.hpp w-stacking.hip.cpp phase_correction.hip.cpp
+OBJ_ACC_HIP = phase_correction.hip.o w-stacking.hip.o
 # ----- define which files will be compiled by NVC with OMP offloading for wither Nvidia or AMD
 #
 DEPS_ACC_OMP = w-stacking.h phase_correction.c w-stacking.c
@@ -180,14 +196,12 @@ ifeq (CUDACC,$(findstring CUDACC,$(OPT)))
 DEPS_NCCL_REDUCE = gridding_nccl.cu
 OBJ_NCCL_REDUCE = gridding_nccl.o
-DEPS_RCCL_REDUCE = gridding_rccl.cu
-OBJ_RCCL_REDUCE = gridding_rccl.o
+else ifeq (HIPCC,$(findstring HIPCC,$(OPT)))
+DEPS_RCCL_REDUCE = gridding_rccl.hip.cpp allvars_rccl.hip.hpp
+OBJ_RCCL_REDUCE = gridding_rccl.hip.o
 else
 DEPS_NCCL_REDUCE = gridding_nccl.cpp
 OBJ_NCCL_REDUCE = gridding_nccl.o
-DEPS_RCCL_REDUCE = gridding_rccl.cpp
-OBJ_RCCL_REDUCE = gridding_rccl.o
 endif
 # ----- define what files will be compiled by NVCC for Nvidia cufftMP implementation of FFT
@@ -211,6 +225,8 @@ ifeq (ACCOMP,$(findstring ACCOMP, $(OPT)))
 OBJ = $(CC_OBJ_ACC)
 else ifeq (CUDACC,$(findstring CUDACC, $(OPT)))
 OBJ = $(CC_OBJ_ACC)
+else ifeq (HIPCC,$(findstring HIPCC, $(OPT)))
+OBJ = $(CC_OBJ_ACC)
 else
 OBJ = $(CC_OBJ_NOACC)
 endif
@@ -240,6 +256,22 @@ cuda_fft.cpp: cuda_fft.cu
 gridding_nccl.cpp: gridding_nccl.cu
 	cp gridding_nccl.cu gridding_nccl.cpp
+gridding_rccl.cpp: gridding_rccl.cu
+	cp gridding_rccl.cu gridding_rccl.cpp
+else ifneq (HIPCC,$(findstring HIPCC,$(OPT)))
+w-stacking.c: w-stacking.cu
+	cp w-stacking.cu w-stacking.c
+phase_correction.c: phase_correction.cu
+	cp phase_correction.cu phase_correction.c
+cuda_fft.cpp: cuda_fft.cu
+	cp cuda_fft.cu cuda_fft.cpp
+gridding_nccl.cpp: gridding_nccl.cu
+	cp gridding_nccl.cu gridding_nccl.cpp
 gridding_rccl.cpp: gridding_rccl.cu
 	cp gridding_rccl.cu gridding_rccl.cpp
 else
@@ -277,6 +309,15 @@ $(OBJ_ACC_CUDA): $(DEPS_ACC_CUDA)
 OBJ += $(OBJ_ACC_CUDA)
 endif
+ifeq (HIPCC,$(findstring HIPCC,$(OPT)))
+EXEC_EXT := $(EXEC_EXT)_acc-hip
+LINKER=$(MPIC++)
+FLAGS=$(OPTIMIZE)
+LIBS=$(AMDLIB)
+$(OBJ_ACC_HIP): $(DEPS_ACC_HIP)
+	$(HIPCC) $(OPT) $(OPT_HIPCC) $(CFLAGS) -c w-stacking.hip.cpp phase_correction.hip.cpp $(LIBS)
+OBJ += $(OBJ_ACC_HIP)
+endif
 ifeq (ACCOMP,$(findstring ACCOMP,$(OPT)))
@@ -317,6 +358,16 @@ LIBS=$(NVLIB_3)
 $(OBJ_NCCL_REDUCE): $(DEPS_NCCL_REDUCE)
 	$(NVCC) $(OPT_NVCC) $(OPT) -c $^ $(LIBS)
 OBJ += $(OBJ_NCCL_REDUCE)
+else ifeq (HIPCC,$(findstring HIPCC,$(OPT)))
+EXEC_EXT := $(EXEC_EXT)_acc-reduce
+LINKER=$(MPIC++)
+FLAGS=$(OPTIMIZE) $(CFLAGS)
+LIBS=$(AMDLIB_3)
+$(OBJ_NCCL_REDUCE): $(DEPS_NCCL_REDUCE)
+	$(MPIC++) $(FLAGS) $(OPT) -c $^ $(CFLAGS) $(LIBS)
+OBJ += $(OBJ_NCCL_REDUCE)
 else
 EXEC_EXT := $(EXEC_EXT)_acc-reduce
...
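/* =====================================================================
 * New file in this commit (name inferred from the Makefile hunks above,
 * likely allvars_rccl.hip.hpp): the HIP/RCCL variant of the
 * global-variables header.
 * ===================================================================== */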
/* file to store global variables*/
#if defined(__STDC__)
# if (__STDC_VERSION__ >= 199901L)
# define _XOPEN_SOURCE 700
# endif
#endif
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <unistd.h>
#if !defined( RCCL_REDUCE ) && !defined(__HIPCC__)
#include <stdatomic.h>
#endif
#include <mpi.h>
#if defined (_OPENMP)
#include <omp.h>
#endif
#if defined(USE_FFTW) && !defined(CUFFTMP) // use MPI fftw
#include <fftw3-mpi.h>
#endif
#if defined(ACCOMP)
#include "w-stacking_omp.h"
#else
#include "w-stacking.hip.hpp"
#endif
#if defined(NVIDIA)
#include <cuda_runtime.h>
#endif
#include "fft.h"
#include "numa.h"
#include "timing.h"
#include "errcodes.h"
#define PI 3.14159265359
#define MIN(X, Y) (((X) < (Y)) ? (X) : (Y))
#define MAX(X, Y) (((X) > (Y)) ? (X) : (Y))
#define NOVERBOSE
#define NFILES 100
#define NAME_LEN 50
#define LONGNAME_LEN 1000
#define REDUCE_MPI 0
#define REDUCE_RING 1
#if defined(DEBUG)
#define dprintf(LEVEL, T, t, ...) if( (verbose_level >= (LEVEL)) && \
( ((t) ==-1 ) || ((T)==(t)) ) ) { \
printf(__VA_ARGS__); fflush(stdout); }
#else
#define dprintf(...)
#endif
typedef double double_t;
#if defined(DOUBLE_PRECISION)
typedef double float_t;
#else
typedef float float_t;
#endif
typedef unsigned int uint;
typedef unsigned long long ull;
extern struct io
{
FILE * pFile;
FILE * pFile1;
FILE * pFilereal;
FILE * pFileimg;
} file;
extern struct ip
{
char ufile[NAME_LEN];
char vfile[NAME_LEN];
char wfile[NAME_LEN];
char weightsfile[NAME_LEN];
char visrealfile[NAME_LEN];
char visimgfile[NAME_LEN];
char metafile[NAME_LEN];
char paramfile[NAME_LEN];
} in;
extern struct op
{
char outfile[NAME_LEN];
char outfile1[NAME_LEN];
char outfile2[NAME_LEN];
char outfile3[NAME_LEN];
char fftfile[NAME_LEN];
char fftfile_writedata1[NAME_LEN];
char fftfile_writedata2[NAME_LEN];
char fftfile2[NAME_LEN];
char fftfile3[NAME_LEN];
char logfile[NAME_LEN];
char extension[NAME_LEN];
char timingfile[NAME_LEN];
} out, outparam;
extern struct meta
{
uint Nmeasures;
uint Nvis;
uint Nweights;
uint freq_per_chan;
uint polarisations;
uint Ntimes;
double dt;
double thours;
uint baselines;
double uvmin;
double uvmax;
double wmin;
double wmax;
} metaData;
extern struct parameter
{
int num_threads;
int ndatasets;
char datapath_multi[NFILES][LONGNAME_LEN];
int grid_size_x;
int grid_size_y;
int num_w_planes;
int w_support;
int reduce_method;
} param;
extern struct fileData
{
double * uu;
double * vv;
double * ww;
float * weights;
float * visreal;
float * visimg;
}data;
extern char filename[LONGNAME_LEN], buf[NAME_LEN], num_buf[NAME_LEN];
extern char datapath[LONGNAME_LEN];
extern int xaxis, yaxis;
extern int rank;
extern int size;
extern uint nsectors;
extern uint startrow;
extern double_t resolution, dx, dw, w_supporth;
extern uint **sectorarray;
extern uint *histo_send;
extern int verbose_level;
extern uint size_of_grid;
extern double_t *grid_pointers, *grid, *gridss, *gridss_real, *gridss_img, *gridss_w, *grid_gpu, *gridss_gpu;
extern MPI_Comm MYMPI_COMM_WORLD;
extern MPI_Win slabwin;
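/* =====================================================================
 * New file in this commit (apparently gridding_rccl.hip.cpp, per the
 * Makefile hunks above): GPU gridding reduce using RCCL.
 * ===================================================================== */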
#include "allvars_rccl.hip.hpp"
#include "proto.h"
#include <hip/hip_runtime.h>
#include <rccl.h>
/*
 * Implements the gridding of data on the GPU
 * using the RCCL library (the ROCm port of NCCL)
 *
 */
#if defined( RCCL_REDUCE )
/*
#define NCCLCHECK(cmd) do {
ncclResult_t r = cmd;
if (r!= ncclSuccess) {
printf("Failed, NCCL error %s:%d '%s'\n",
__FILE__,__LINE__,ncclGetErrorString(r));
exit(EXIT_FAILURE);
}
} while(0)
*/
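/* The commented-out check above hints at the intended error handling. Below is
 * a minimal illustrative sketch (not part of the committed code; the macro
 * names RCCLCHECK/HIPCHECK are placeholders): RCCL exposes the same
 * ncclResult_t / ncclGetErrorString() API as NCCL, and HIP provides
 * hipGetErrorString(), so the unchecked calls in gridding_data() could be
 * wrapped as follows. */
#define RCCLCHECK(cmd) do {                                        \
          ncclResult_t r = (cmd);                                  \
          if (r != ncclSuccess) {                                  \
            printf("RCCL error %s:%d '%s'\n",                      \
                   __FILE__, __LINE__, ncclGetErrorString(r));     \
            exit(EXIT_FAILURE);                                    \
          }                                                        \
        } while(0)
#define HIPCHECK(cmd) do {                                         \
          hipError_t e = (cmd);                                    \
          if (e != hipSuccess) {                                   \
            printf("HIP error %s:%d '%s'\n",                       \
                   __FILE__, __LINE__, hipGetErrorString(e));      \
            exit(EXIT_FAILURE);                                    \
          }                                                        \
        } while(0)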
static uint64_t getHostHash(const char* string) {
// Based on DJB2a, result = result * 33 ^ char
uint64_t result = 5381;
for (int c = 0; string[c] != '\0'; c++){
result = ((result << 5) + result) ^ string[c];
}
return result;
}
static void getHostName(char* hostname, int maxlen) {
gethostname(hostname, maxlen);
for (int i=0; i< maxlen; i++) {
if (hostname[i] == '.') {
hostname[i] = '\0';
return;
}
}
}
void gridding_data(){
double shift = (double)(dx*yaxis);
timing_wt.kernel = 0.0;
timing_wt.reduce = 0.0;
timing_wt.reduce_mpi = 0.0;
timing_wt.reduce_sh = 0.0;
timing_wt.compose = 0.0;
// calculate the resolution in radians
resolution = 1.0/MAX(fabs(metaData.uvmin),fabs(metaData.uvmax));
// calculate the resolution in arcsec
double resolution_asec = (3600.0*180.0)/MAX(fabs(metaData.uvmin),fabs(metaData.uvmax))/PI;
if ( rank == 0 )
printf("RESOLUTION = %f rad, %f arcsec\n", resolution, resolution_asec);
// find the largest value in histo_send[]
//
//Initialize nccl
//double *gridss_gpu, *grid_gpu;
int local_rank = 0;
uint64_t hostHashs[size];
char hostname[1024];
getHostName(hostname, 1024);
hostHashs[rank] = getHostHash(hostname);
MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD);
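// The local (per-node) rank is the number of lower-ranked MPI tasks sharing
// this host's hash; it is used below by hipSetDevice() to pick the GPU.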
for (int p=0; p<size; p++) {
if (p == rank) break;
if (hostHashs[p] == hostHashs[rank]) local_rank++;
}
ncclUniqueId id;
ncclComm_t comm;
hipError_t nnn;
hipStream_t stream_reduce, stream_stacking;
if (rank == 0) ncclGetUniqueId(&id);
MPI_Bcast((void *)&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD);
hipSetDevice(local_rank);
int n = hipMalloc(&grid_gpu, 2*param.num_w_planes*xaxis*yaxis * sizeof(double));
n = hipMalloc(&gridss_gpu, 2*param.num_w_planes*xaxis*yaxis * sizeof(double));
n = hipStreamCreate(&stream_reduce);
n = hipStreamCreate(&stream_stacking);
ncclCommInitRank(&comm, size, id, rank);
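// A single RCCL communicator spans all MPI ranks: each sector is stacked on
// the local GPU (stream_stacking) and then reduced with ncclReduce onto the
// GPU of the rank owning that slab (stream_reduce).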
for (uint isector = 0; isector < nsectors; isector++)
{
double start = CPU_TIME_wt;
uint Nsec = histo_send[isector];
uint Nweightss = Nsec*metaData.polarisations;
uint Nvissec = Nweightss*metaData.freq_per_chan;
double_t *memory = (double*) malloc ( (Nsec*3)*sizeof(double_t) +
(Nvissec*2+Nweightss)*sizeof(float_t) );
if ( memory == NULL )
shutdown_wstacking(NOT_ENOUGH_MEM_STACKING, "Not enough memory for stacking", __FILE__, __LINE__);
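// Single allocation laid out as: uus, vvs, wws (Nsec doubles each), then
// weightss (Nweightss floats), visreals and visimgs (Nvissec floats each).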
double_t *uus = (double_t*) memory;
double_t *vvs = (double_t*) uus+Nsec;
double_t *wws = (double_t*) vvs+Nsec;
float_t *weightss = (float_t*)((double_t*)wws+Nsec);
float_t *visreals = (float_t*)weightss + Nweightss;
float_t *visimgs = (float_t*)visreals + Nvissec;
// select data for this sector
uint icount = 0;
uint ip = 0;
uint inu = 0;
#warning "this loop should be threaded"
#warning "the counter of this loop should not be int"
for(int iphi = histo_send[isector]-1; iphi>=0; iphi--)
{
uint ilocal = sectorarray[isector][iphi];
uus[icount] = data.uu[ilocal];
vvs[icount] = data.vv[ilocal]-isector*shift;
wws[icount] = data.ww[ilocal];
for (uint ipol=0; ipol<metaData.polarisations; ipol++)
{
weightss[ip] = data.weights[ilocal*metaData.polarisations+ipol];
ip++;
}
for (uint ifreq=0; ifreq<metaData.polarisations*metaData.freq_per_chan; ifreq++)
{
visreals[inu] = data.visreal[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq];
visimgs[inu] = data.visimg[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq];
inu++;
}
icount++;
}
double uumin = 1e20;
double vvmin = 1e20;
double uumax = -1e20;
double vvmax = -1e20;
#pragma omp parallel reduction( min: uumin, vvmin) reduction( max: uumax, vvmax) num_threads(param.num_threads)
{
double my_uumin = 1e20;
double my_vvmin = 1e20;
double my_uumax = -1e20;
double my_vvmax = -1e20;
#pragma omp for
for (uint ipart=0; ipart<Nsec; ipart++)
{
my_uumin = MIN(my_uumin, uus[ipart]);
my_uumax = MAX(my_uumax, uus[ipart]);
my_vvmin = MIN(my_vvmin, vvs[ipart]);
my_vvmax = MAX(my_vvmax, vvs[ipart]);
}
uumin = MIN( uumin, my_uumin );
uumax = MAX( uumax, my_uumax );
vvmin = MIN( vvmin, my_vvmin );
vvmax = MAX( vvmax, my_vvmax );
}
timing_wt.compose += CPU_TIME_wt - start;
//printf("UU, VV, min, max = %f %f %f %f\n", uumin, uumax, vvmin, vvmax);
// Make convolution on the grid
#ifdef VERBOSE
printf("Processing sector %ld\n",isector);
#endif
start = CPU_TIME_wt;
//We have to call different GPUs per MPI task!!! [GL]
#ifdef HIPCC
wstack(param.num_w_planes,
Nsec,
metaData.freq_per_chan,
metaData.polarisations,
uus,
vvs,
wws,
visreals,
visimgs,
weightss,
dx,
dw,
param.w_support,
xaxis,
yaxis,
gridss_gpu,
param.num_threads,
rank,
stream_stacking);
#else
wstack(param.num_w_planes,
Nsec,
metaData.freq_per_chan,
metaData.polarisations,
uus,
vvs,
wws,
visreals,
visimgs,
weightss,
dx,
dw,
param.w_support,
xaxis,
yaxis,
gridss,
param.num_threads,
rank);
#endif
//Allocate memory on devices non-blocking for the host
///////////////////////////////////////////////////////
timing_wt.kernel += CPU_TIME_wt - start;
#ifdef VERBOSE
printf("Processed sector %ld\n",isector);
#endif
if( size > 1 )
{
// Write grid in the corresponding remote slab
// int target_rank = (int)isector; it implied that size >= nsectors
int target_rank = (int)(isector % size);
start = CPU_TIME_wt;
ncclReduce(gridss_gpu, grid_gpu, size_of_grid, ncclDouble, ncclSum, target_rank, comm, stream_reduce);
n = hipStreamSynchronize(stream_reduce);
timing_wt.reduce += CPU_TIME_wt - start;
// Go to next sector
nnn = hipMemset( gridss_gpu, 0.0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) );
if (nnn != hipSuccess) {printf("!!! gridding_rccl.hip.cpp hipMemset ERROR %d !!!\n", nnn);}
}
free(memory);
}
//Copy data back from device to host (to be deleted in next steps)
n = hipMemcpyAsync(grid, grid_gpu, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), hipMemcpyDeviceToHost, stream_reduce);
MPI_Barrier(MPI_COMM_WORLD);
n = hipStreamSynchronize(stream_reduce);
n = hipFree(grid_gpu);
n = hipFree(gridss_gpu);
n = hipStreamDestroy(stream_reduce);
n = hipStreamDestroy(stream_stacking);
ncclCommDestroy(comm);
return;
}
#endif
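/* =====================================================================
 * New file in this commit (apparently phase_correction.hip.cpp): HIP port
 * of the phase-correction step.
 * ===================================================================== */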
#ifdef _OPENMP
#include <omp.h>
#endif
#if !defined(ACCOMP)
#include "w-stacking.hip.hpp"
#else
#include "w-stacking_omp.h"
#endif
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include "errcodes.h"
#include "proto.h"
#ifdef __HIPCC__
__global__ void phase_g(int xaxis,
int yaxis,
int num_w_planes,
double * gridss,
double * image_real,
double * image_imag,
double wmin,
double dw,
double dwnorm,
int xaxistot,
int yaxistot,
double resolution,
int nbucket)
{
long gid = blockIdx.x*blockDim.x + threadIdx.x;
double add_term_real;
double add_term_img;
double wterm;
long arraysize = (long)((xaxis*yaxis*num_w_planes)/nbucket);
if(gid < arraysize)
{
long gid_aux = nbucket*gid;
for(int iaux=0; iaux<nbucket; iaux++)
{
int iw = gid_aux/(xaxis*yaxis);
int ivaux = gid_aux%(xaxis*yaxis);
int iv = ivaux/xaxis;
int iu = ivaux%xaxis;
long index = 2*gid_aux;
long img_index = iu+iv*xaxis;
wterm = wmin + iw*dw;
#ifdef PHASE_ON
if (num_w_planes > 1)
{
double xcoord = (double)(iu-xaxistot/2);
if(xcoord < 0.0)xcoord = (double)(iu+xaxistot/2);
xcoord = sin(xcoord*resolution);
double ycoord = (double)(iv-yaxistot/2);
if(ycoord < 0.0)ycoord = (double)(iv+yaxistot/2);
ycoord = sin(ycoord*resolution);
double preal, pimag;
double radius2 = (xcoord*xcoord+ycoord*ycoord);
preal = cos(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
pimag = sin(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
double p,q,r,s;
p = gridss[index];
q = gridss[index+1];
r = preal;
s = pimag;
//printf("%d %d %d %ld %ld\n",iu,iv,iw,index,img_index);
add_term_real = (p*r-q*s)*dwnorm*sqrt(1-radius2);
add_term_img = (p*s+q*r)*dwnorm*sqrt(1-radius2);
atomicAdd(&(image_real[img_index]),add_term_real);
atomicAdd(&(image_imag[img_index]),add_term_img);
} else {
atomicAdd(&(image_real[img_index]),gridss[index]);
atomicAdd(&(image_imag[img_index]),gridss[index+1]);
}
#else
atomicAdd(&(image_real[img_index]),gridss[index]);
atomicAdd(&(image_imag[img_index]),gridss[index+1]);
#endif // end of PHASE_ON
gid_aux++;
}
}
}
#endif
void phase_correction(double* gridss, double* image_real, double* image_imag, int xaxis, int yaxis, int num_w_planes, int xaxistot, int yaxistot,
double resolution, double wmin, double wmax, int num_threads, int rank)
{
double dw = (wmax-wmin)/(double)num_w_planes;
double wterm = wmin+0.5*dw;
double dwnorm = dw/(wmax-wmin);
#ifdef HIPCC
// WARNING: nbucket MUST be chosen such that xaxis*yaxis*num_w_planes is a multiple of nbucket
int nbucket = 1;
int Nth = NTHREADS;
long Nbl = (long)((num_w_planes*xaxis*yaxis)/Nth/nbucket) + 1;
if(NWORKERS == 1) {Nbl = 1; Nth = 1;};
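// Launch geometry: Nbl blocks of Nth threads cover the num_w_planes*xaxis*yaxis
// cells of gridss, each thread folding nbucket consecutive cells into the image.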
int ndevices;
int m = hipGetDeviceCount(&ndevices);
m = hipSetDevice(rank % ndevices);
if ( rank == 0 ) {
if (0 == ndevices) {
shutdown_wstacking(NO_ACCELERATORS_FOUND, "No accelerators found", __FILE__, __LINE__ );
}
}
printf("Running rank %d using GPU %d\n", rank, rank % ndevices);
#ifdef NVIDIA
prtAccelInfo();
#endif
hipError_t mmm;
double * image_real_g;
double * image_imag_g;
double * gridss_g;
//Copy gridss on the device
mmm=hipMalloc(&gridss_g, 2*num_w_planes*xaxis*yaxis*sizeof(double));
mmm=hipMemcpy(gridss_g, gridss, 2*num_w_planes*xaxis*yaxis*sizeof(double), hipMemcpyHostToDevice);
mmm=hipMalloc(&image_real_g, xaxis*yaxis*sizeof(double));
//printf("HIP ERROR 2 %s\n",hipGetErrorString(mmm));
mmm=hipMalloc(&image_imag_g, xaxis*yaxis*sizeof(double));
//printf("HIP ERROR 3 %s\n",hipGetErrorString(mmm));
//printf("HIP ERROR 4 %s\n",hipGetErrorString(mmm));
mmm=hipMemset(image_real_g, 0.0, xaxis*yaxis*sizeof(double));
//printf("HIP ERROR 5 %s\n",hipGetErrorString(mmm));
mmm=hipMemset(image_imag_g, 0.0, xaxis*yaxis*sizeof(double));
//printf("HIP ERROR 6 %s\n",hipGetErrorString(mmm));
// call the phase correction kernel
phase_g <<<Nbl,Nth>>> (xaxis,
yaxis,
num_w_planes,
gridss_g,
image_real_g,
image_imag_g,
wmin,
dw,
dwnorm,
xaxistot,
yaxistot,
resolution,
nbucket);
mmm = hipMemcpy(image_real, image_real_g, xaxis*yaxis*sizeof(double), hipMemcpyDeviceToHost);
//printf("HIP ERROR 7 %s\n",hipGetErrorString(mmm));
mmm = hipMemcpy(image_imag, image_imag_g, xaxis*yaxis*sizeof(double), hipMemcpyDeviceToHost);
//printf("HIP ERROR 8 %s\n",hipGetErrorString(mmm));
mmm= hipFree(gridss_g);
#else
#ifndef ACCOMP
#ifdef _OPENMP
omp_set_num_threads(num_threads);
#endif
#pragma omp parallel for collapse(2) private(wterm)
for (int iw=0; iw<num_w_planes; iw++)
{
for (int iv=0; iv<yaxis; iv++)
for (int iu=0; iu<xaxis; iu++)
{
unsigned int index = 2*(iu+iv*xaxis+xaxis*yaxis*iw);
unsigned int img_index = iu+iv*xaxis;
wterm = wmin + iw*dw;
#ifdef PHASE_ON
if (num_w_planes > 1)
{
double xcoord = (double)(iu-xaxistot/2);
if(xcoord < 0.0)xcoord = (double)(iu+xaxistot/2);
xcoord = sin(xcoord*resolution);
double ycoord = (double)(iv-yaxistot/2);
if(ycoord < 0.0)ycoord = (double)(iv+yaxistot/2);
ycoord = sin(ycoord*resolution);
double preal, pimag;
double radius2 = (xcoord*xcoord+ycoord*ycoord);
if(xcoord <= 1.0)
{
preal = cos(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
pimag = sin(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
} else {
preal = cos(-2.0*PI*wterm*(sqrt(radius2-1.0)-1));
pimag = 0.0;
}
preal = cos(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
pimag = sin(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
double p,q,r,s;
p = gridss[index];
q = gridss[index+1];
r = preal;
s = pimag;
//printf("%d %d %d %ld %ld\n",iu,iv,iw,index,img_index);
#pragma omp atomic
image_real[img_index] += (p*r-q*s)*dwnorm*sqrt(1-radius2);
#pragma omp atomic
image_imag[img_index] += (p*s+q*r)*dwnorm*sqrt(1-radius2);
} else {
#pragma omp atomic
image_real[img_index] += gridss[index];
#pragma omp atomic
image_imag[img_index] += gridss[index+1];
}
#else
#pragma omp atomic
image_real[img_index] += gridss[index];
#pragma omp atomic
image_imag[img_index] += gridss[index+1];
#endif // end of PHASE_ON
}
}
#else
omp_set_default_device(rank % omp_get_num_devices());
#if !defined(__clang__)
#pragma omp target teams distribute parallel for collapse(2) simd private(wterm) map(to:gridss[0:2*num_w_planes*xaxis*yaxis]) map(from:image_real[0:xaxis*yaxis]) map(from:image_imag[0:xaxis*yaxis])
#else
#pragma omp target teams distribute parallel for collapse(2) private(wterm) map(to:gridss[0:2*num_w_planes*xaxis*yaxis]) map(from:image_real[0:xaxis*yaxis]) map(from:image_imag[0:xaxis*yaxis])
#endif
for (int iw=0; iw<num_w_planes; iw++)
{
for (int iv=0; iv<yaxis; iv++)
for (int iu=0; iu<xaxis; iu++)
{
long index = 2*(iu+iv*xaxis+xaxis*yaxis*iw);
long img_index = iu+iv*xaxis;
wterm = wmin + iw*dw;
#ifdef PHASE_ON
if (num_w_planes > 1)
{
double xcoord = (double)(iu-xaxistot/2);
if(xcoord < 0.0)xcoord = (double)(iu+xaxistot/2);
xcoord = sin(xcoord*resolution);
double ycoord = (double)(iv-yaxistot/2);
if(ycoord < 0.0)ycoord = (double)(iv+yaxistot/2);
ycoord = sin(ycoord*resolution);
double preal, pimag;
double radius2 = (xcoord*xcoord+ycoord*ycoord);
if(xcoord <= 1.0)
{
preal = cos(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
pimag = sin(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
} else {
preal = cos(-2.0*PI*wterm*(sqrt(radius2-1.0)-1));
pimag = 0.0;
}
preal = cos(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
pimag = sin(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
double p,q,r,s;
p = gridss[index];
q = gridss[index+1];
r = preal;
s = pimag;
//printf("%d %d %d %ld %ld\n",iu,iv,iw,index,img_index);
#pragma omp atomic
image_real[img_index] += (p*r-q*s)*dwnorm*sqrt(1-radius2);
#pragma omp atomic
image_imag[img_index] += (p*s+q*r)*dwnorm*sqrt(1-radius2);
} else {
#pragma omp atomic
image_real[img_index] += gridss[index];
#pragma omp atomic
image_imag[img_index] += gridss[index+1];
}
#else
#pragma omp atomic
image_real[img_index] += gridss[index];
#pragma omp atomic
image_imag[img_index] += gridss[index+1];
#endif // end of PHASE_ON
}
}
#endif
#endif // end of HIPCC
}
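/* =====================================================================
 * New file in this commit (apparently w-stacking.hip.cpp): HIP port of
 * the w-stacking gridding kernels.
 * ===================================================================== */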
#ifdef _OPENMP
#include <omp.h>
#endif
#include "w-stacking.hip.hpp"
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#ifdef __HIPCC__
#include "allvars_nccl.hip.hpp"
#endif
#include "proto.h"
#ifdef ACCOMP
#pragma omp declare target
#endif
#ifdef __HIPCC__
double __device__
#else
double
#endif
// Gaussian Kernel
gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist)
{
double conv_weight;
conv_weight = norm * exp(-((u_dist*u_dist)+(v_dist*v_dist))*std22);
return conv_weight;
}
void makeGaussKernel(double * kernel,
int KernelLen,
int increaseprecision,
double std22)
{
double norm = std22/PI;
int n = increaseprecision*KernelLen, mid = n / 2;
for (int i = 0; i != mid + 1; i++) {
double term = (double)i/(double)increaseprecision;
kernel[mid + i] = sqrt(norm) * exp(-(term*term)*std22);
}
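// Each 1-D sample stores sqrt(norm)*exp(-x^2*std22), so the product of a
// u-sample and a v-sample in the gridding loops reproduces
// gauss_kernel_norm(norm, std22, u, v) = norm*exp(-(u^2+v^2)*std22).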
for (int i = 0; i != mid; i++) kernel[i] = kernel[n - 1 - i];
// for (int i = 0; i < n; i++) printf("%f\n",kernel[i]);
}
// Kaiser-Bessel Kernel: it is adapted from WSClean
double bessel0(double x, double precision) {
// Calculate I_0 = SUM over m from 0 -> inf of [ (x/2)^(2m) / (m!)^2 ]
// This is the unnormalized bessel function of order 0.
double d = 0.0, ds = 1.0, sum = 1.0;
do {
d += 2.0;
ds *= x * x / (d * d);
sum += ds;
} while (ds > sum * precision);
return sum;
}
void makeKaiserBesselKernel(double * kernel,
int KernelLen,
int increaseprecision,
double alpha,
double overSamplingFactor,
int withSinc) {
int n = increaseprecision*KernelLen, mid = n / 2;
double * sincKernel = (double*)malloc((mid + 1) * sizeof(*sincKernel));
const double filterRatio = 1.0 / overSamplingFactor;
sincKernel[0] = filterRatio;
for (int i = 1; i != mid + 1; i++) {
double x = i;
sincKernel[i] =
withSinc ? (sin(PI * filterRatio * x) / (PI * x)) : filterRatio;
}
const double normFactor = overSamplingFactor / bessel0(alpha, 1e-8);
for (int i = 0; i != mid + 1; i++) {
double term = (double)i / mid;
kernel[mid + i] = sincKernel[i] *
bessel0(alpha * sqrt(1.0 - (term * term)), 1e-8) *
normFactor;
}
for (int i = 0; i != mid; i++) kernel[i] = kernel[n - 1 - i];
//for (int i = 0; i < n; i++) printf("%f\n",kernel[i]);
}
#ifdef ACCOMP
#pragma omp end declare target
#endif
#ifdef __HIPCC__
//double __device__ gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist)
//{
// double conv_weight;
// conv_weight = norm * exp(-((u_dist*u_dist)+(v_dist*v_dist))*std22);
// return conv_weight;
//}
__global__ void convolve_g(
int num_w_planes,
uint num_points,
uint freq_per_chan,
uint polarizations,
double* uu,
double* vv,
double* ww,
float* vis_real,
float* vis_img,
float* weight,
double dx,
double dw,
int KernelLen,
int grid_size_x,
int grid_size_y,
double* grid,
#if defined(GAUSS_HI_PRECISION)
double std22
#else
double std22,
double* convkernel
#endif
)
{
//printf("DENTRO AL KERNEL\n");
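// One thread per visibility: locate its (u,v,w) cell and scatter the
// convolution-weighted real/imaginary contributions into the surrounding
// kernel-support window of the grid with atomicAdd.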
uint gid = blockIdx.x*blockDim.x + threadIdx.x;
if(gid < num_points)
{
uint i = gid;
uint visindex = i*freq_per_chan*polarizations;
double norm = std22/PI;
int j, k;
/* Convert UV coordinates to grid coordinates. */
double pos_u = uu[i] / dx;
double pos_v = vv[i] / dx;
double ww_i = ww[i] / dw;
int grid_w = (int)ww_i;
int grid_u = (int)pos_u;
int grid_v = (int)pos_v;
// check the boundaries
uint jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0;
uint jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1;
uint kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0;
uint kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1;
// Convolve this point onto the grid.
for (k = kmin; k <= kmax; k++)
{
double v_dist = (double)k - pos_v;
int increaseprecision = 5;
for (j = jmin; j <= jmax; j++)
{
double u_dist = (double)j+0.5 - pos_u;
uint iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y);
int jKer = (int)(increaseprecision * (fabs(u_dist+(double)KernelLen)));
int kKer = (int)(increaseprecision * (fabs(v_dist+(double)KernelLen)));
#ifdef GAUSS_HI_PRECISION
double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist);
#endif
#ifdef GAUSS
double conv_weight = convkernel[jKer]*convkernel[kKer];
#endif
#ifdef KAISERBESSEL
double conv_weight = convkernel[jKer]*convkernel[kKer];
#endif
// Loops over frequencies and polarizations
double add_term_real = 0.0;
double add_term_img = 0.0;
uint ifine = visindex;
for (uint ifreq=0; ifreq<freq_per_chan; ifreq++)
{
uint iweight = visindex/freq_per_chan;
for (uint ipol=0; ipol<polarizations; ipol++)
{
double vistest = (double)vis_real[ifine];
if (!isnan(vistest))
{
add_term_real += weight[iweight] * vis_real[ifine] * conv_weight;
add_term_img += weight[iweight] * vis_img[ifine] * conv_weight;
}
ifine++;
iweight++;
}
}
atomicAdd(&(grid[iKer]),add_term_real);
atomicAdd(&(grid[iKer+1]),add_term_img);
}
}
}
}
#endif
#ifdef ACCOMP
#pragma omp declare target
#endif
void wstack(
int num_w_planes,
uint num_points,
uint freq_per_chan,
uint polarizations,
double* uu,
double* vv,
double* ww,
float* vis_real,
float* vis_img,
float* weight,
double dx,
double dw,
int w_support,
int grid_size_x,
int grid_size_y,
double* grid,
int num_threads,
#ifdef HIPCC
int rank,
hipStream_t stream_stacking
#else
int rank
#endif
)
{
uint i;
//uint index;
uint visindex;
// initialize the convolution kernel
// gaussian:
int KernelLen = (w_support-1)/2;
int increaseprecision = 5; // this number must be odd: increaseprecision*w_support must be odd (w_support must be odd)
double std = 1.0;
double std22 = 1.0/(2.0*std*std);
double norm = std22/PI;
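// std22 = 1/(2*sigma^2); norm = std22/PI = 1/(2*pi*sigma^2), the normalisation
// of the 2-D Gaussian exp(-(u^2+v^2)/(2*sigma^2)).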
double * convkernel = (double*)malloc(increaseprecision*w_support*sizeof(*convkernel));
#ifdef GAUSS
makeGaussKernel(convkernel,w_support,increaseprecision,std22);
#endif
#ifdef KAISERBESSEL
double overSamplingFactor = 1.0;
int withSinc = 0;
double alpha = 8.6;
makeKaiserBesselKernel(convkernel, w_support, increaseprecision, alpha, overSamplingFactor, withSinc);
#endif
// Loop over visibilities.
// Switch between HIP and CPU versions
#ifdef __HIPCC__
// Define the HIP set up
int Nth = NTHREADS;
uint Nbl = (uint)(num_points/Nth) + 1;
if(NWORKERS == 1) {Nbl = 1; Nth = 1;};
uint Nvis = num_points*freq_per_chan*polarizations;
int ndevices;
int num = hipGetDeviceCount(&ndevices);
int res = hipSetDevice(rank % ndevices);
if ( rank == 0 ) {
if (0 == ndevices) {
shutdown_wstacking(NO_ACCELERATORS_FOUND, "No accelerators found", __FILE__, __LINE__ );
}
}
#ifdef NVIDIA
prtAccelInfo();
#endif
// Create GPU arrays and offload them
double * uu_g;
double * vv_g;
double * ww_g;
float * vis_real_g;
float * vis_img_g;
float * weight_g;
//double * grid_g;
double * convkernel_g;
//Create the event inside stream stacking
//hipEvent_t event_kernel;
//for (int i=0; i<100000; i++)grid[i]=23.0;
hipError_t mmm;
//mmm=hipEventCreate(&event_kernel);
mmm=hipMalloc(&uu_g,num_points*sizeof(double));
mmm=hipMalloc(&vv_g,num_points*sizeof(double));
mmm=hipMalloc(&ww_g,num_points*sizeof(double));
mmm=hipMalloc(&vis_real_g,Nvis*sizeof(float));
mmm=hipMalloc(&vis_img_g,Nvis*sizeof(float));
mmm=hipMalloc(&weight_g,(Nvis/freq_per_chan)*sizeof(float));
//mmm=hipMalloc(&grid_g,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double));
#if !defined(GAUSS_HI_PRECISION)
mmm=hipMalloc(&convkernel_g,increaseprecision*w_support*sizeof(double));
#endif
if (mmm != hipSuccess) {printf("!!! w-stacking.hip.cpp hipMalloc ERROR %d !!!\n", mmm);}
//mmm=hipMemset(grid_g,0.0,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double));
if (mmm != hipSuccess) {printf("!!! w-stacking.hip.cpp hipMemset ERROR %d !!!\n", mmm);}
mmm=hipMemcpyAsync(uu_g, uu, num_points*sizeof(double), hipMemcpyHostToDevice, stream_stacking);
mmm=hipMemcpyAsync(vv_g, vv, num_points*sizeof(double), hipMemcpyHostToDevice, stream_stacking);
mmm=hipMemcpyAsync(ww_g, ww, num_points*sizeof(double), hipMemcpyHostToDevice, stream_stacking);
mmm=hipMemcpyAsync(vis_real_g, vis_real, Nvis*sizeof(float), hipMemcpyHostToDevice, stream_stacking);
mmm=hipMemcpyAsync(vis_img_g, vis_img, Nvis*sizeof(float), hipMemcpyHostToDevice, stream_stacking);
mmm=hipMemcpyAsync(weight_g, weight, (Nvis/freq_per_chan)*sizeof(float), hipMemcpyHostToDevice, stream_stacking);
#if !defined(GAUSS_HI_PRECISION)
mmm=hipMemcpyAsync(convkernel_g, convkernel, increaseprecision*w_support*sizeof(double), hipMemcpyHostToDevice, stream_stacking);
#endif
if (mmm != hipSuccess) {printf("!!! w-stacking.hip.cpp hipMemcpyAsync ERROR %d !!!\n", mmm);}
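// All host-to-device copies are enqueued on stream_stacking, so the kernel
// launched below on the same stream is ordered after them without an explicit
// synchronisation.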
// Call main GPU Kernel
#if defined(GAUSS_HI_PRECISION)
convolve_g <<<Nbl,Nth,0,stream_stacking>>> (
num_w_planes,
num_points,
freq_per_chan,
polarizations,
uu_g,
vv_g,
ww_g,
vis_real_g,
vis_img_g,
weight_g,
dx,
dw,
KernelLen,
grid_size_x,
grid_size_y,
grid,
std22
);
#else
convolve_g <<<Nbl,Nth,0,stream_stacking>>> (
num_w_planes,
num_points,
freq_per_chan,
polarizations,
uu_g,
vv_g,
ww_g,
vis_real_g,
vis_img_g,
weight_g,
dx,
dw,
KernelLen,
grid_size_x,
grid_size_y,
grid,
std22,
convkernel_g
);
#endif
mmm=hipStreamSynchronize(stream_stacking);
//Record the event
//mmm=hipEventRecord(event_kernel,stream_stacking);
//Wait until the kernel ends
//mmm=hipStreamWaitEvent(stream_stacking,event_kernel);
//for (int i=0; i<100000; i++)printf("%f\n",grid[i]);
if (mmm != hipSuccess)
printf("HIP ERROR %s\n",hipGetErrorString(mmm));
mmm=hipFree(uu_g);
mmm=hipFree(vv_g);
mmm=hipFree(ww_g);
mmm=hipFree(vis_real_g);
mmm=hipFree(vis_img_g);
mmm=hipFree(weight_g);
//mmm=hipFree(grid_g);
#if !defined(GAUSS_HI_PRECISION)
mmm=hipFree(convkernel_g);
#endif
// Switch between HIP and CPU versions
# else
#ifdef _OPENMP
omp_set_num_threads(num_threads);
#endif
#if defined(ACCOMP) && (GPU_STACKING)
omp_set_default_device(rank % omp_get_num_devices());
uint Nvis = num_points*freq_per_chan*polarizations;
#pragma omp target teams distribute parallel for private(visindex) map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan]) map(tofrom:grid[0:2*num_w_planes*grid_size_x*grid_size_y])
#else
#pragma omp parallel for private(visindex)
#endif
for (i = 0; i < num_points; i++)
{
#ifdef _OPENMP
//int tid;
//tid = omp_get_thread_num();
//printf("%d\n",tid);
#endif
visindex = i*freq_per_chan*polarizations;
double sum = 0.0;
int j, k;
//if (i%1000 == 0)printf("%ld\n",i);
/* Convert UV coordinates to grid coordinates. */
double pos_u = uu[i] / dx;
double pos_v = vv[i] / dx;
double ww_i = ww[i] / dw;
int grid_w = (int)ww_i;
int grid_u = (int)pos_u;
int grid_v = (int)pos_v;
// check the boundaries
uint jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0;
uint jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1;
uint kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0;
uint kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1;
//printf("%d, %ld, %ld, %d, %ld, %ld\n",grid_u,jmin,jmax,grid_v,kmin,kmax);
// Convolve this point onto the grid.
for (k = kmin; k <= kmax; k++)
{
//double v_dist = (double)k+0.5 - pos_v;
double v_dist = (double)k - pos_v;
for (j = jmin; j <= jmax; j++)
{
//double u_dist = (double)j+0.5 - pos_u;
double u_dist = (double)j - pos_u;
uint iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y);
int jKer = (int)(increaseprecision * (fabs(u_dist+(double)KernelLen)));
int kKer = (int)(increaseprecision * (fabs(v_dist+(double)KernelLen)));
#ifdef GAUSS_HI_PRECISION
double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist);
#endif
#ifdef GAUSS
double conv_weight = convkernel[jKer]*convkernel[kKer];
//if(jKer < 0 || jKer >= 35 || kKer < 0 || kKer >= 35)
// printf("%f %d %f %d\n",fabs(u_dist+(double)KernelLen),jKer,fabs(v_dist+(double)KernelLen),kKer);
//printf("%d %d %d %d %f %f %f %f %f\n",jKer, j, kKer, k, pos_u, pos_v, u_dist,v_dist,conv_weight);
#endif
#ifdef KAISERBESSEL
double conv_weight = convkernel[jKer]*convkernel[kKer];
#endif
// Loops over frequencies and polarizations
double add_term_real = 0.0;
double add_term_img = 0.0;
uint ifine = visindex;
// DAV: the following two loops are performend by each thread separately: no problems of race conditions
for (uint ifreq=0; ifreq<freq_per_chan; ifreq++)
{
uint iweight = visindex/freq_per_chan;
for (uint ipol=0; ipol<polarizations; ipol++)
{
if (!isnan(vis_real[ifine]))
{
//printf("%f %ld\n",weight[iweight],iweight);
add_term_real += weight[iweight] * vis_real[ifine] * conv_weight;
add_term_img += weight[iweight] * vis_img[ifine] * conv_weight;
//if(vis_img[ifine]>1e10 || vis_img[ifine]<-1e10)printf("%f %f %f %f %ld %ld\n",vis_real[ifine],vis_img[ifine],weight[iweight],conv_weight,ifine,num_points*freq_per_chan*polarizations);
}
ifine++;
iweight++;
}
}
// DAV: this is the critical call in terms of correctness of the results and of performance
#pragma omp atomic
grid[iKer] += add_term_real;
#pragma omp atomic
grid[iKer+1] += add_term_img;
}
}
}
#if defined(ACCOMP) && (GPU_STACKING)
#pragma omp target exit data map(delete:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan], grid[0:2*num_w_planes*grid_size_x*grid_size_y])
#endif
// End switch between HIP and CPU versions
#endif
//for (int i=0; i<100000; i++)printf("%f\n",grid[i]);
}
#ifdef ACCOMP
#pragma omp end declare target
#endif
int test(int nnn)
{
int mmm;
mmm = nnn+1;
return mmm;
}
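/* =====================================================================
 * New file in this commit (apparently w-stacking.hip.hpp): header with the
 * HIP-aware prototypes of wstack(), phase_correction() and cuda_fft().
 * ===================================================================== */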
#ifndef W_PROJECT_H_
#define W_PROJECT_H_
#define NWORKERS -1 //100
#define NTHREADS 64 //AMD WARP SIZE
#define PI 3.14159265359
#define REAL_TYPE double
#include <mpi.h>
#include <hip/hip_runtime.h>
#ifdef __HIPCC__
extern "C"
#endif
#ifdef __cplusplus
extern "C" {
void wstack(
int,
unsigned int,
unsigned int,
unsigned int,
double*,
double*,
double*,
float*,
float*,
float*,
double,
double,
int,
int,
int,
double*,
int,
int,
hipStream_t);
}
#else
void wstack(
int,
unsigned int,
unsigned int,
unsigned int,
double*,
double*,
double*,
float*,
float*,
float*,
double,
double,
int,
int,
int,
double*,
int,
int);
#endif
#ifdef __HIPCC__
extern "C"
#endif
int test(int nnn);
#ifdef __HIPCC__
extern "C"
#endif
void phase_correction(
double*,
double*,
double*,
int,
int,
int,
int,
int,
double,
double,
double,
int,
int);
double gauss_kernel_norm(
double norm,
double std22,
double u_dist,
double v_dist);
#ifdef ACCOMP
#pragma omp declare target (gauss_kernel_norm)
#endif
#ifdef __HIPCC__
extern "C"
#endif
void cuda_fft(
int,
int,
int,
int,
int,
double*,
double*,
int,
MPI_Comm);
#endif