diff --git a/Makefile b/Makefile index 7c98d0f4f214dc8470a8b0f5f53db552a5a96426..6c927cf832feaf1f51aeb578f146f4b1d7e73b80 100755 --- a/Makefile +++ b/Makefile @@ -91,18 +91,29 @@ OPT += -DGAUSS_HI_PRECISION # use GPU to perform FFT #OPT += -DCUFFTMP -# FULL GPU SUPPORT - Recommended for full NVIDIA GPU code execution +# FULL NVIDIA GPU SUPPORT - Recommended for full NVIDIA GPU code execution OPT += -DFULL_NVIDIA ifeq (FULL_NVIDIA,$(findstring FULL_NVIDIA,$(OPT))) OPT += -DCUDACC -DNCCL_REDUCE -DCUFFTMP endif + + +# use HIP for GPUs +#OPT += -DHIPCC + # support for AMD GPUs -#OPT += __HIP_PLATFORM_AMD__ +#OPT += -D__HIP_PLATFORM_AMD__ # use AMD GPU to perform the reduce #OPT += -DRCCL_REDUCE +# FULL AMD GPU SUPPORT - Recommended for full AMD GPU code execution +#OPT += -DFULL_AMD +ifeq (FULL_AMD,$(findstring FULL_AMD,$(OPT))) +OPT += -DHIPCC -DRCCL_REDUCE -D__HIP_PLATFORM_AMD__ +endif + # ======================================================= # OUTPUT HANDLING @@ -167,6 +178,11 @@ CC_OBJ_ACC = allvars.o main.o init.o gridding.o gridding_cpu.o fourier_transform DEPS_ACC_CUDA = w-stacking.h w-stacking.cu phase_correction.cu OBJ_ACC_CUDA = phase_correction.o w-stacking.o +# ----- define which files will be compiled by HIPCC for AMD +# +DEPS_ACC_HIP = w-stacking.hip.hpp w-stacking.hip.cpp phase_correction.hip.cpp +OBJ_ACC_HIP = phase_correction.hip.o w-stacking.hip.o + # ----- define which files will be compiled by NVC with OMP offloading for wither Nvidia or AMD # DEPS_ACC_OMP = w-stacking.h phase_correction.c w-stacking.c @@ -180,14 +196,12 @@ ifeq (CUDACC,$(findstring CUDACC,$(OPT))) DEPS_NCCL_REDUCE = gridding_nccl.cu OBJ_NCCL_REDUCE = gridding_nccl.o -DEPS_RCCL_REDUCE = gridding_rccl.cu -OBJ_RCCL_REDUCE = gridding_rccl.o +else ifeq (HIPCC,$(findstring HIPCC,$(OPT))) +DEPS_RCCL_REDUCE = gridding_rccl.hip.cpp allvars_rccl.hip.hpp +OBJ_RCCL_REDUCE = gridding_rccl.hip.o else DEPS_NCCL_REDUCE = gridding_nccl.cpp OBJ_NCCL_REDUCE = gridding_nccl.o - -DEPS_RCCL_REDUCE = gridding_rccl.cpp -OBJ_RCCL_REDUCE = gridding_rccl.o endif # ----- define what files will be compiled by NVCC for Nvidia cufftMP implementation of FFT @@ -211,7 +225,9 @@ ifeq (ACCOMP,$(findstring ACCOMP, $(OPT))) OBJ = $(CC_OBJ_ACC) else ifeq (CUDACC,$(findstring CUDACC, $(OPT))) OBJ = $(CC_OBJ_ACC) -else +else ifeq (HIPCC,$(findstring HIPCC, $(OPT))) +OBJ = $(CC_OBJ_ACC) +else OBJ = $(CC_OBJ_NOACC) endif @@ -240,6 +256,22 @@ cuda_fft.cpp: cuda_fft.cu gridding_nccl.cpp: gridding_nccl.cu cp gridding_nccl.cu gridding_nccl.cpp +gridding_rccl.cpp: gridding_rccl.cu + cp gridding_rccl.cu gridding_rccl.cpp + +else ifneq (HIPCC,$(findstring HIPCC,$(OPT))) +w-stacking.c: w-stacking.cu + cp w-stacking.cu w-stacking.c + +phase_correction.c: phase_correction.cu + cp phase_correction.cu phase_correction.c + +cuda_fft.cpp: cuda_fft.cu + cp cuda_fft.cu cuda_fft.cpp + +gridding_nccl.cpp: gridding_nccl.cu + cp gridding_nccl.cu gridding_nccl.cpp + gridding_rccl.cpp: gridding_rccl.cu cp gridding_rccl.cu gridding_rccl.cpp else @@ -277,6 +309,15 @@ $(OBJ_ACC_CUDA): $(DEPS_ACC_CUDA) OBJ += $(OBJ_ACC_CUDA) endif +ifeq (HIPCC,$(findstring HIPCC,$(OPT))) +EXEC_EXT := $(EXEC_EXT)_acc-hip +LINKER=$(MPIC++) +FLAGS=$(OPTIMIZE) +LIBS=$(AMDLIB) +$(OBJ_ACC_HIP): $(DEPS_ACC_HIP) + $(HIPCC) $(OPT) $(OPT_HIPCC) $(CFLAGS) -c w-stacking.hip.cpp phase_correction.hip.cpp $(LIBS) +OBJ += $(OBJ_ACC_HIP) +endif ifeq (ACCOMP,$(findstring ACCOMP,$(OPT))) @@ -317,6 +358,16 @@ LIBS=$(NVLIB_3) $(OBJ_NCCL_REDUCE): $(DEPS_NCCL_REDUCE) $(NVCC) $(OPT_NVCC) $(OPT) -c $^ $(LIBS) OBJ += $(OBJ_NCCL_REDUCE) + +else ifeq (HIPCC,$(findstring HIPCC,$(OPT))) +EXEC_EXT := $(EXEC_EXT)_acc-reduce +LINKER=$(MPIC++) +FLAGS=$(OPTIMIZE) $(CFLAGS) +LIBS=$(AMDLIB_3) +$(OBJ_NCCL_REDUCE): $(DEPS_NCCL_REDUCE) + $(MPIC++) $(FLAGS) $(OPT) -c $^ $(CFLAGS) $(LIBS) +OBJ += $(OBJ_NCCL_REDUCE) + else EXEC_EXT := $(EXEC_EXT)_acc-reduce diff --git a/allvars_rccl.hip.hpp b/allvars_rccl.hip.hpp new file mode 100755 index 0000000000000000000000000000000000000000..04bb24c3681b3b28b60b7e9968ff280ae0f0d94b --- /dev/null +++ b/allvars_rccl.hip.hpp @@ -0,0 +1,176 @@ +/* file to store global variables*/ + +#if defined(__STDC__) +# if (__STDC_VERSION__ >= 199901L) +# define _XOPEN_SOURCE 700 +# endif +#endif + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <math.h> +#include <unistd.h> + + +#if !defined( RCCL_REDUCE ) && !defined(__HIPCC__) +#include <stdatomic.h> +#endif + +#include <mpi.h> + +#if defined (_OPENMP) +#include <omp.h> +#endif + +#if defined(USE_FFTW) && !defined(CUFFTMP) // use MPI fftw +#include <fftw3-mpi.h> +#endif + +#if defined(ACCOMP) +#include "w-stacking_omp.h" +#else +#include "w-stacking.hip.hpp" +#endif + +#if defined(NVIDIA) +#include <cuda_runtime.h> +#endif + +#include "fft.h" +#include "numa.h" +#include "timing.h" +#include "errcodes.h" + +#define PI 3.14159265359 +#define MIN(X, Y) (((X) < (Y)) ? (X) : (Y)) +#define MAX(X, Y) (((X) > (Y)) ? (X) : (Y)) +#define NOVERBOSE +#define NFILES 100 + +#define NAME_LEN 50 +#define LONGNAME_LEN 1000 + + +#define REDUCE_MPI 0 +#define REDUCE_RING 1 + +#if defined(DEBUG) +#define dprintf(LEVEL, T, t, ...) if( (verbose_level >= (LEVEL)) && \ + ( ((t) ==-1 ) || ((T)==(t)) ) ) { \ + printf(__VA_ARGS__); fflush(stdout); } + +#else +#define dprintf(...) +#endif + +typedef double double_t; +#if defined(DOUBLE_PRECISION) +typedef double float_t; +#else +typedef float float_t; +#endif + +typedef unsigned int uint; +typedef unsigned long long ull; + + +extern struct io +{ + FILE * pFile; + FILE * pFile1; + FILE * pFilereal; + FILE * pFileimg; +} file; + +extern struct ip +{ + char ufile[NAME_LEN]; + char vfile[NAME_LEN]; + char wfile[NAME_LEN]; + char weightsfile[NAME_LEN]; + char visrealfile[NAME_LEN]; + char visimgfile[NAME_LEN]; + char metafile[NAME_LEN]; + char paramfile[NAME_LEN]; +} in; + +extern struct op +{ + char outfile[NAME_LEN]; + char outfile1[NAME_LEN]; + char outfile2[NAME_LEN]; + char outfile3[NAME_LEN]; + char fftfile[NAME_LEN]; + char fftfile_writedata1[NAME_LEN]; + char fftfile_writedata2[NAME_LEN]; + char fftfile2[NAME_LEN]; + char fftfile3[NAME_LEN]; + char logfile[NAME_LEN]; + char extension[NAME_LEN]; + char timingfile[NAME_LEN]; + +} out, outparam; + +extern struct meta +{ + + uint Nmeasures; + uint Nvis; + uint Nweights; + uint freq_per_chan; + uint polarisations; + uint Ntimes; + double dt; + double thours; + uint baselines; + double uvmin; + double uvmax; + double wmin; + double wmax; +} metaData; + + +extern struct parameter +{ + int num_threads; + int ndatasets; + char datapath_multi[NFILES][LONGNAME_LEN]; + int grid_size_x; + int grid_size_y; + int num_w_planes; + int w_support; + int reduce_method; +} param; + +extern struct fileData +{ + double * uu; + double * vv; + double * ww; + float * weights; + float * visreal; + float * visimg; +}data; + + +extern char filename[LONGNAME_LEN], buf[NAME_LEN], num_buf[NAME_LEN]; +extern char datapath[LONGNAME_LEN]; +extern int xaxis, yaxis; +extern int rank; +extern int size; +extern uint nsectors; +extern uint startrow; +extern double_t resolution, dx, dw, w_supporth; + +extern uint **sectorarray; +extern uint *histo_send; +extern int verbose_level; + + +extern uint size_of_grid; +extern double_t *grid_pointers, *grid, *gridss, *gridss_real, *gridss_img, *gridss_w, *grid_gpu, *gridss_gpu; + +extern MPI_Comm MYMPI_COMM_WORLD; +extern MPI_Win slabwin; + diff --git a/gridding_rccl.hip.cpp b/gridding_rccl.hip.cpp new file mode 100755 index 0000000000000000000000000000000000000000..dba42eeec2580d387322856e21392998c25d7791 --- /dev/null +++ b/gridding_rccl.hip.cpp @@ -0,0 +1,290 @@ +#include "allvars_rccl.hip.hpp" +#include "proto.h" +#include <hip/hip_runtime.h> +#include <rccl.h> + +/* + * Implements the gridding of data via GPU + * by using NCCL library + * + */ + + +#if defined( RCCL_REDUCE ) + +/* +#define NCCLCHECK(cmd) do { +ncclResult_t r = cmd; +if (r!= ncclSuccess) { + printf("Failed, NCCL error %s:%d '%s'\n", + __FILE__,__LINE__,ncclGetErrorString(r)); + exit(EXIT_FAILURE); + } +} while(0) +*/ + + +static uint64_t getHostHash(const char* string) { + // Based on DJB2a, result = result * 33 ^ char + uint64_t result = 5381; + for (int c = 0; string[c] != '\0'; c++){ + result = ((result << 5) + result) ^ string[c]; + } + return result; +} + + +static void getHostName(char* hostname, int maxlen) { + gethostname(hostname, maxlen); + for (int i=0; i< maxlen; i++) { + if (hostname[i] == '.') { + hostname[i] = '\0'; + return; + } + } +} + + + + +void gridding_data(){ + + double shift = (double)(dx*yaxis); + + timing_wt.kernel = 0.0; + timing_wt.reduce = 0.0; + timing_wt.reduce_mpi = 0.0; + timing_wt.reduce_sh = 0.0; + timing_wt.compose = 0.0; + + // calculate the resolution in radians + resolution = 1.0/MAX(fabs(metaData.uvmin),fabs(metaData.uvmax)); + + // calculate the resolution in arcsec + double resolution_asec = (3600.0*180.0)/MAX(fabs(metaData.uvmin),fabs(metaData.uvmax))/PI; + if ( rank == 0 ) + printf("RESOLUTION = %f rad, %f arcsec\n", resolution, resolution_asec); + + // find the largest value in histo_send[] + // + + + //Initialize nccl + + //double *gridss_gpu, *grid_gpu; + int local_rank = 0; + + uint64_t hostHashs[size]; + char hostname[1024]; + getHostName(hostname, 1024); + hostHashs[rank] = getHostHash(hostname); + MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD); + for (int p=0; p<size; p++) { + if (p == rank) break; + if (hostHashs[p] == hostHashs[rank]) local_rank++; + } + + ncclUniqueId id; + ncclComm_t comm; + hipError_t nnn; + hipStream_t stream_reduce, stream_stacking; + + if (rank == 0) ncclGetUniqueId(&id); + MPI_Bcast((void *)&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD); + + hipSetDevice(local_rank); + + int n = hipMalloc(&grid_gpu, 2*param.num_w_planes*xaxis*yaxis * sizeof(double)); + n = hipMalloc(&gridss_gpu, 2*param.num_w_planes*xaxis*yaxis * sizeof(double)); + n = hipStreamCreate(&stream_reduce); + n = hipStreamCreate(&stream_stacking); + + ncclCommInitRank(&comm, size, id, rank); + + for (uint isector = 0; isector < nsectors; isector++) + { + + double start = CPU_TIME_wt; + + uint Nsec = histo_send[isector]; + uint Nweightss = Nsec*metaData.polarisations; + uint Nvissec = Nweightss*metaData.freq_per_chan; + double_t *memory = (double*) malloc ( (Nsec*3)*sizeof(double_t) + + (Nvissec*2+Nweightss)*sizeof(float_t) ); + + if ( memory == NULL ) + shutdown_wstacking(NOT_ENOUGH_MEM_STACKING, "Not enough memory for stacking", __FILE__, __LINE__); + + double_t *uus = (double_t*) memory; + double_t *vvs = (double_t*) uus+Nsec; + double_t *wws = (double_t*) vvs+Nsec; + float_t *weightss = (float_t*)((double_t*)wws+Nsec); + float_t *visreals = (float_t*)weightss + Nweightss; + float_t *visimgs = (float_t*)visreals + Nvissec; + + + + // select data for this sector + uint icount = 0; + uint ip = 0; + uint inu = 0; + + #warning "this loop should be threaded" + #warning "the counter of this loop should not be int" + for(int iphi = histo_send[isector]-1; iphi>=0; iphi--) + { + + uint ilocal = sectorarray[isector][iphi]; + + uus[icount] = data.uu[ilocal]; + vvs[icount] = data.vv[ilocal]-isector*shift; + wws[icount] = data.ww[ilocal]; + for (uint ipol=0; ipol<metaData.polarisations; ipol++) + { + weightss[ip] = data.weights[ilocal*metaData.polarisations+ipol]; + ip++; + } + for (uint ifreq=0; ifreq<metaData.polarisations*metaData.freq_per_chan; ifreq++) + { + visreals[inu] = data.visreal[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq]; + visimgs[inu] = data.visimg[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq]; + + inu++; + } + icount++; + } + + double uumin = 1e20; + double vvmin = 1e20; + double uumax = -1e20; + double vvmax = -1e20; + + #pragma omp parallel reduction( min: uumin, vvmin) reduction( max: uumax, vvmax) num_threads(param.num_threads) + { + double my_uumin = 1e20; + double my_vvmin = 1e20; + double my_uumax = -1e20; + double my_vvmax = -1e20; + + #pragma omp for + for (uint ipart=0; ipart<Nsec; ipart++) + { + my_uumin = MIN(my_uumin, uus[ipart]); + my_uumax = MAX(my_uumax, uus[ipart]); + my_vvmin = MIN(my_vvmin, vvs[ipart]); + my_vvmax = MAX(my_vvmax, vvs[ipart]); + } + + uumin = MIN( uumin, my_uumin ); + uumax = MAX( uumax, my_uumax ); + vvmin = MIN( vvmin, my_vvmin ); + vvmax = MAX( vvmax, my_vvmax ); + } + + timing_wt.compose += CPU_TIME_wt - start; + + //printf("UU, VV, min, max = %f %f %f %f\n", uumin, uumax, vvmin, vvmax); + + // Make convolution on the grid + + #ifdef VERBOSE + printf("Processing sector %ld\n",isector); + #endif + + start = CPU_TIME_wt; + + //We have to call different GPUs per MPI task!!! [GL] + #ifdef HIPCC + wstack(param.num_w_planes, + Nsec, + metaData.freq_per_chan, + metaData.polarisations, + uus, + vvs, + wws, + visreals, + visimgs, + weightss, + dx, + dw, + param.w_support, + xaxis, + yaxis, + gridss_gpu, + param.num_threads, + rank, + stream_stacking); + #else + wstack(param.num_w_planes, + Nsec, + metaData.freq_per_chan, + metaData.polarisations, + uus, + vvs, + wws, + visreals, + visimgs, + weightss, + dx, + dw, + param.w_support, + xaxis, + yaxis, + gridss, + param.num_threads, + rank); + #endif + //Allocate memory on devices non-blocking for the host + /////////////////////////////////////////////////////// + + + timing_wt.kernel += CPU_TIME_wt - start; + + #ifdef VERBOSE + printf("Processed sector %ld\n",isector); + #endif + + + if( size > 1 ) + { + + // Write grid in the corresponding remote slab + + // int target_rank = (int)isector; it implied that size >= nsectors + int target_rank = (int)(isector % size); + + start = CPU_TIME_wt; + + ncclReduce(gridss_gpu, grid_gpu, size_of_grid, ncclDouble, ncclSum, target_rank, comm, stream_reduce); + n = hipStreamSynchronize(stream_reduce); + + timing_wt.reduce += CPU_TIME_wt - start; + + // Go to next sector + nnn = hipMemset( gridss_gpu, 0.0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) ); + if (nnn != hipSuccess) {printf("!!! w-stacking.cu hipMemset ERROR %d !!!\n", nnn);} + } + + free(memory); + } + + + //Copy data back from device to host (to be deleted in next steps) + + n = hipMemcpyAsync(grid, grid_gpu, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), hipMemcpyDeviceToHost, stream_reduce); + + MPI_Barrier(MPI_COMM_WORLD); + + n = hipStreamSynchronize(stream_reduce); + n = hipFree(grid_gpu); + n = hipFree(gridss_gpu); + n = hipStreamDestroy(stream_reduce); + n = hipStreamDestroy(stream_stacking); + + ncclCommDestroy(comm); + + return; + +} + +#endif diff --git a/phase_correction.hip.cpp b/phase_correction.hip.cpp new file mode 100755 index 0000000000000000000000000000000000000000..4e287130d8e2c0be82addcf53b2e9c5c1fa8c1b7 --- /dev/null +++ b/phase_correction.hip.cpp @@ -0,0 +1,315 @@ +#ifdef _OPENMP +#include <omp.h> +#endif + +#if !defined(ACCOMP) +#include "w-stacking.hip.hpp" +#else +#include "w-stacking_omp.h" +#endif + +#include <math.h> +#include <stdlib.h> +#include <stdio.h> +#include "errcodes.h" +#include "proto.h" + +#ifdef __HIPCC__ + +__global__ void phase_g(int xaxis, + int yaxis, + int num_w_planes, + double * gridss, + double * image_real, + double * image_imag, + double wmin, + double dw, + double dwnorm, + int xaxistot, + int yaxistot, + double resolution, + int nbucket) +{ + long gid = blockIdx.x*blockDim.x + threadIdx.x; + double add_term_real; + double add_term_img; + double wterm; + long arraysize = (long)((xaxis*yaxis*num_w_planes)/nbucket); + + if(gid < arraysize) + { + long gid_aux = nbucket*gid; + for(int iaux=0; iaux<nbucket; iaux++) + { + int iw = gid_aux/(xaxis*yaxis); + int ivaux = gid_aux%(xaxis*yaxis); + int iv = ivaux/xaxis; + int iu = ivaux%xaxis; + long index = 2*gid_aux; + long img_index = iu+iv*xaxis; + + wterm = wmin + iw*dw; + +#ifdef PHASE_ON + if (num_w_planes > 1) + { + double xcoord = (double)(iu-xaxistot/2); + if(xcoord < 0.0)xcoord = (double)(iu+xaxistot/2); + xcoord = sin(xcoord*resolution); + double ycoord = (double)(iv-yaxistot/2); + if(ycoord < 0.0)ycoord = (double)(iv+yaxistot/2); + ycoord = sin(ycoord*resolution); + + double preal, pimag; + double radius2 = (xcoord*xcoord+ycoord*ycoord); + + preal = cos(2.0*PI*wterm*(sqrt(1-radius2)-1.0)); + pimag = sin(2.0*PI*wterm*(sqrt(1-radius2)-1.0)); + + double p,q,r,s; + p = gridss[index]; + q = gridss[index+1]; + r = preal; + s = pimag; + + //printf("%d %d %d %ld %ld\n",iu,iv,iw,index,img_index); + + add_term_real = (p*r-q*s)*dwnorm*sqrt(1-radius2); + add_term_img = (p*s+q*r)*dwnorm*sqrt(1-radius2); + atomicAdd(&(image_real[img_index]),add_term_real); + atomicAdd(&(image_imag[img_index]),add_term_img); + } else { + atomicAdd(&(image_real[img_index]),gridss[index]); + atomicAdd(&(image_imag[img_index]),gridss[index+1]); + } +#else + atomicAdd(&(image_real[img_index]),gridss[index]); + atomicAdd(&(image_imag[img_index]),gridss[index+1]); +#endif // end of PHASE_ON + gid_aux++; + } + } + +} + +#endif + +void phase_correction(double* gridss, double* image_real, double* image_imag, int xaxis, int yaxis, int num_w_planes, int xaxistot, int yaxistot, + double resolution, double wmin, double wmax, int num_threads, int rank) +{ + double dw = (wmax-wmin)/(double)num_w_planes; + double wterm = wmin+0.5*dw; + double dwnorm = dw/(wmax-wmin); + +#ifdef HIPCC + + // WARNING: nbucket MUST be chosen such that xaxis*yaxis*num_w_planes is a multiple of nbucket + int nbucket = 1; + int Nth = NTHREADS; + long Nbl = (long)((num_w_planes*xaxis*yaxis)/Nth/nbucket) + 1; + if(NWORKERS == 1) {Nbl = 1; Nth = 1;}; + + int ndevices; + int m = hipGetDeviceCount(&ndevices); + m = hipSetDevice(rank % ndevices); + + if ( rank == 0 ) { + if (0 == ndevices) { + + shutdown_wstacking(NO_ACCELERATORS_FOUND, "No accelerators found", __FILE__, __LINE__ ); + } + + } + printf("Running rank %d using GPU %d\n", rank, rank % ndevices); + #ifdef NVIDIA + prtAccelInfo(); + #endif + + hipError_t mmm; + double * image_real_g; + double * image_imag_g; + double * gridss_g; + + + //Copy gridss on the device + mmm=hipMalloc(&gridss_g, 2*num_w_planes*xaxis*yaxis*sizeof(double)); + mmm=hipMemcpy(gridss_g, gridss, 2*num_w_planes*xaxis*yaxis*sizeof(double), hipMemcpyHostToDevice); + + mmm=hipMalloc(&image_real_g, xaxis*yaxis*sizeof(double)); + //printf("HIP ERROR 2 %s\n",hipGetErrorString(mmm)); + mmm=hipMalloc(&image_imag_g, xaxis*yaxis*sizeof(double)); + //printf("HIP ERROR 3 %s\n",hipGetErrorString(mmm)); + + //printf("HIP ERROR 4 %s\n",hipGetErrorString(mmm)); + mmm=hipMemset(image_real_g, 0.0, xaxis*yaxis*sizeof(double)); + //printf("HIP ERROR 5 %s\n",hipGetErrorString(mmm)); + mmm=hipMemset(image_imag_g, 0.0, xaxis*yaxis*sizeof(double)); + //printf("HIP ERROR 6 %s\n",hipGetErrorString(mmm)); + + // call the phase correction kernel + phase_g <<<Nbl,Nth>>> (xaxis, + yaxis, + num_w_planes, + gridss_g, + image_real_g, + image_imag_g, + wmin, + dw, + dwnorm, + xaxistot, + yaxistot, + resolution, + nbucket); + + mmm = hipMemcpy(image_real, image_real_g, xaxis*yaxis*sizeof(double), hipMemcpyDeviceToHost); + //printf("HIP ERROR 7 %s\n",hipGetErrorString(mmm)); + mmm = hipMemcpy(image_imag, image_imag_g, xaxis*yaxis*sizeof(double), hipMemcpyDeviceToHost); + //printf("HIP ERROR 8 %s\n",hipGetErrorString(mmm)); + + mmm= hipFree(gridss_g); +#else + +#ifndef ACCOMP + +#ifdef _OPENMP + omp_set_num_threads(num_threads); +#endif + + #pragma omp parallel for collapse(2) private(wterm) + for (int iw=0; iw<num_w_planes; iw++) + { + for (int iv=0; iv<yaxis; iv++) + for (int iu=0; iu<xaxis; iu++) + { + + unsigned int index = 2*(iu+iv*xaxis+xaxis*yaxis*iw); + unsigned int img_index = iu+iv*xaxis; + wterm = wmin + iw*dw; +#ifdef PHASE_ON + if (num_w_planes > 1) + { + double xcoord = (double)(iu-xaxistot/2); + if(xcoord < 0.0)xcoord = (double)(iu+xaxistot/2); + xcoord = sin(xcoord*resolution); + double ycoord = (double)(iv-yaxistot/2); + if(ycoord < 0.0)ycoord = (double)(iv+yaxistot/2); + ycoord = sin(ycoord*resolution); + + double preal, pimag; + double radius2 = (xcoord*xcoord+ycoord*ycoord); + if(xcoord <= 1.0) + { + preal = cos(2.0*PI*wterm*(sqrt(1-radius2)-1.0)); + pimag = sin(2.0*PI*wterm*(sqrt(1-radius2)-1.0)); + } else { + preal = cos(-2.0*PI*wterm*(sqrt(radius2-1.0)-1)); + pimag = 0.0; + } + + preal = cos(2.0*PI*wterm*(sqrt(1-radius2)-1.0)); + pimag = sin(2.0*PI*wterm*(sqrt(1-radius2)-1.0)); + + double p,q,r,s; + p = gridss[index]; + q = gridss[index+1]; + r = preal; + s = pimag; + + //printf("%d %d %d %ld %ld\n",iu,iv,iw,index,img_index); + #pragma omp atomic + image_real[img_index] += (p*r-q*s)*dwnorm*sqrt(1-radius2); + #pragma omp atomic + image_imag[img_index] += (p*s+q*r)*dwnorm*sqrt(1-radius2); + } else { + #pragma omp atomic + image_real[img_index] += gridss[index]; + #pragma omp atomic + image_imag[img_index] += gridss[index+1]; + } +#else + #pragma omp atomic + image_real[img_index] += gridss[index]; + #pragma omp atomic + image_imag[img_index] += gridss[index+1]; +#endif // end of PHASE_ON + + } + } + +#else + omp_set_default_device(rank % omp_get_num_devices()); + + #if !defined(__clang__) + + #pragma omp target teams distribute parallel for collapse(2) simd private(wterm) map(to:gridss[0:2*num_w_planes*xaxis*yaxis]) map(from:image_real[0:xaxis*yaxis]) map(from:image_imag[0:xaxis*yaxis]) + + #else + + #pragma omp target teams distribute parallel for collapse(2) private(wterm) map(to:gridss[0:2*num_w_planes*xaxis*yaxis]) map(from:image_real[0:xaxis*yaxis]) map(from:image_imag[0:xaxis*yaxis]) + #endif + + for (int iw=0; iw<num_w_planes; iw++) + { + for (int iv=0; iv<yaxis; iv++) + for (int iu=0; iu<xaxis; iu++) + { + + long index = 2*(iu+iv*xaxis+xaxis*yaxis*iw); + long img_index = iu+iv*xaxis; + wterm = wmin + iw*dw; +#ifdef PHASE_ON + if (num_w_planes > 1) + { + double xcoord = (double)(iu-xaxistot/2); + if(xcoord < 0.0)xcoord = (double)(iu+xaxistot/2); + xcoord = sin(xcoord*resolution); + double ycoord = (double)(iv-yaxistot/2); + if(ycoord < 0.0)ycoord = (double)(iv+yaxistot/2); + ycoord = sin(ycoord*resolution); + + double preal, pimag; + double radius2 = (xcoord*xcoord+ycoord*ycoord); + if(xcoord <= 1.0) + { + preal = cos(2.0*PI*wterm*(sqrt(1-radius2)-1.0)); + pimag = sin(2.0*PI*wterm*(sqrt(1-radius2)-1.0)); + } else { + preal = cos(-2.0*PI*wterm*(sqrt(radius2-1.0)-1)); + pimag = 0.0; + } + + preal = cos(2.0*PI*wterm*(sqrt(1-radius2)-1.0)); + pimag = sin(2.0*PI*wterm*(sqrt(1-radius2)-1.0)); + + double p,q,r,s; + p = gridss[index]; + q = gridss[index+1]; + r = preal; + s = pimag; + + //printf("%d %d %d %ld %ld\n",iu,iv,iw,index,img_index); + #pragma omp atomic + image_real[img_index] += (p*r-q*s)*dwnorm*sqrt(1-radius2); + #pragma omp atomic + image_imag[img_index] += (p*s+q*r)*dwnorm*sqrt(1-radius2); + } else { + #pragma omp atomic + image_real[img_index] += gridss[index]; + #pragma omp atomic + image_imag[img_index] += gridss[index+1]; + } +#else + #pragma omp atomic + image_real[img_index] += gridss[index]; + #pragma omp atomic + image_imag[img_index] += gridss[index+1]; +#endif // end of PHASE_ON + + } + } + +#endif +#endif // end of __HIPCC__ + + +} diff --git a/w-stacking.hip.cpp b/w-stacking.hip.cpp new file mode 100755 index 0000000000000000000000000000000000000000..37db97cd94eaa78afc9133602cd93f7f287fd326 --- /dev/null +++ b/w-stacking.hip.cpp @@ -0,0 +1,518 @@ +#ifdef _OPENMP +#include <omp.h> +#endif +#include "w-stacking.hip.hpp" +#include <math.h> +#include <stdlib.h> +#include <stdio.h> + +#ifdef __HIPCC__ +#include "allvars_nccl.hip.hpp" +#endif + +#include "proto.h" + +#ifdef ACCOMP +#pragma omp declare target +#endif +#ifdef __HIPCC__ +double __device__ +#else +double +#endif +// Gaussian Kernel +gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist) +{ + double conv_weight; + conv_weight = norm * exp(-((u_dist*u_dist)+(v_dist*v_dist))*std22); + return conv_weight; +} + +void makeGaussKernel(double * kernel, + int KernelLen, + int increaseprecision, + double std22) +{ + + double norm = std22/PI; + int n = increaseprecision*KernelLen, mid = n / 2; + for (int i = 0; i != mid + 1; i++) { + double term = (double)i/(double)increaseprecision; + kernel[mid + i] = sqrt(norm) * exp(-(term*term)*std22); + } + + for (int i = 0; i != mid; i++) kernel[i] = kernel[n - 1 - i]; +// for (int i = 0; i < n; i++) printf("%f\n",kernel[i]); + +} + +// Kaiser-Bessel Kernel: it is adapted from WSClean +double bessel0(double x, double precision) { + // Calculate I_0 = SUM of m 0 -> inf [ (x/2)^(2m) ] + // This is the unnormalized bessel function of order 0. + double d = 0.0, ds = 1.0, sum = 1.0; + do { + d += 2.0; + ds *= x * x / (d * d); + sum += ds; + } while (ds > sum * precision); + return sum; +} +void makeKaiserBesselKernel(double * kernel, + int KernelLen, + int increaseprecision, + double alpha, + double overSamplingFactor, + int withSinc) { + int n = increaseprecision*KernelLen, mid = n / 2; + double * sincKernel = (double*)malloc((mid + 1) * sizeof(*sincKernel)); + const double filterRatio = 1.0 / overSamplingFactor; + sincKernel[0] = filterRatio; + for (int i = 1; i != mid + 1; i++) { + double x = i; + sincKernel[i] = + withSinc ? (sin(PI * filterRatio * x) / (PI * x)) : filterRatio; + } + const double normFactor = overSamplingFactor / bessel0(alpha, 1e-8); + for (int i = 0; i != mid + 1; i++) { + double term = (double)i / mid; + kernel[mid + i] = sincKernel[i] * + bessel0(alpha * sqrt(1.0 - (term * term)), 1e-8) * + normFactor; + } + for (int i = 0; i != mid; i++) kernel[i] = kernel[n - 1 - i]; + //for (int i = 0; i < n; i++) printf("%f\n",kernel[i]); +} + + +#ifdef ACCOMP +#pragma omp end declare target +#endif + +#ifdef __HIPCC__ +//double __device__ gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist) +//{ +// double conv_weight; +// conv_weight = norm * exp(-((u_dist*u_dist)+(v_dist*v_dist))*std22); +// return conv_weight; +//} + +__global__ void convolve_g( + int num_w_planes, + uint num_points, + uint freq_per_chan, + uint polarizations, + double* uu, + double* vv, + double* ww, + float* vis_real, + float* vis_img, + float* weight, + double dx, + double dw, + int KernelLen, + int grid_size_x, + int grid_size_y, + double* grid, + #if defined(GAUSS_HI_PRECISION) + double std22 + #else + double std22, + double* convkernel + #endif + ) + + + +{ + //printf("DENTRO AL KERNEL\n"); + uint gid = blockIdx.x*blockDim.x + threadIdx.x; + if(gid < num_points) + { + uint i = gid; + uint visindex = i*freq_per_chan*polarizations; + double norm = std22/PI; + + int j, k; + + /* Convert UV coordinates to grid coordinates. */ + double pos_u = uu[i] / dx; + double pos_v = vv[i] / dx; + double ww_i = ww[i] / dw; + + int grid_w = (int)ww_i; + int grid_u = (int)pos_u; + int grid_v = (int)pos_v; + + // check the boundaries + uint jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0; + uint jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1; + uint kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0; + uint kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1; + + + // Convolve this point onto the grid. + for (k = kmin; k <= kmax; k++) + { + + double v_dist = (double)k - pos_v; + int increaseprecision = 5; + + for (j = jmin; j <= jmax; j++) + { + double u_dist = (double)j+0.5 - pos_u; + uint iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y); + int jKer = (int)(increaseprecision * (fabs(u_dist+(double)KernelLen))); + int kKer = (int)(increaseprecision * (fabs(v_dist+(double)KernelLen))); + + #ifdef GAUSS_HI_PRECISION + double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist); + #endif + #ifdef GAUSS + double conv_weight = convkernel[jKer]*convkernel[kKer]; + #endif + #ifdef KAISERBESSEL + double conv_weight = convkernel[jKer]*convkernel[kKer]; + #endif + + // Loops over frequencies and polarizations + double add_term_real = 0.0; + double add_term_img = 0.0; + uint ifine = visindex; + for (uint ifreq=0; ifreq<freq_per_chan; ifreq++) + { + uint iweight = visindex/freq_per_chan; + for (uint ipol=0; ipol<polarizations; ipol++) + { + double vistest = (double)vis_real[ifine]; + if (!isnan(vistest)) + { + add_term_real += weight[iweight] * vis_real[ifine] * conv_weight; + add_term_img += weight[iweight] * vis_img[ifine] * conv_weight; + } + ifine++; + iweight++; + } + } + atomicAdd(&(grid[iKer]),add_term_real); + atomicAdd(&(grid[iKer+1]),add_term_img); + } + } + } +} +#endif +#ifdef ACCOMP +#pragma omp declare target +#endif +void wstack( + int num_w_planes, + uint num_points, + uint freq_per_chan, + uint polarizations, + double* uu, + double* vv, + double* ww, + float* vis_real, + float* vis_img, + float* weight, + double dx, + double dw, + int w_support, + int grid_size_x, + int grid_size_y, + double* grid, + int num_threads, + #ifdef HIPCC + int rank, + hipStream_t stream_stacking + #else + int rank + #endif + ) + +{ + uint i; + //uint index; + uint visindex; + + // initialize the convolution kernel + // gaussian: + int KernelLen = (w_support-1)/2; + int increaseprecision = 5; // this number must be odd: increaseprecison*w_support must be odd (w_support must be odd) + double std = 1.0; + double std22 = 1.0/(2.0*std*std); + double norm = std22/PI; + double * convkernel = (double*)malloc(increaseprecision*w_support*sizeof(*convkernel)); + + #ifdef GAUSS + makeGaussKernel(convkernel,w_support,increaseprecision,std22); + #endif + #ifdef KAISERBESSEL + double overSamplingFactor = 1.0; + int withSinc = 0; + double alpha = 8.6; + makeKaiserBesselKernel(convkernel, w_support, increaseprecision, alpha, overSamplingFactor, withSinc); + #endif + + + // Loop over visibilities. +// Switch between HIP and GPU versions + #ifdef __HIPCC__ + // Define the HIP set up + int Nth = NTHREADS; + uint Nbl = (uint)(num_points/Nth) + 1; + if(NWORKERS == 1) {Nbl = 1; Nth = 1;}; + uint Nvis = num_points*freq_per_chan*polarizations; + + int ndevices; + int num = hipGetDeviceCount(&ndevices); + int res = hipSetDevice(rank % ndevices); + + if ( rank == 0 ) { + if (0 == ndevices) { + + shutdown_wstacking(NO_ACCELERATORS_FOUND, "No accelerators found", __FILE__, __LINE__ ); + } + } + + #ifdef NVIDIA + prtAccelInfo(); + #endif + + // Create GPU arrays and offload them + double * uu_g; + double * vv_g; + double * ww_g; + float * vis_real_g; + float * vis_img_g; + float * weight_g; + //double * grid_g; + double * convkernel_g; + + //Create the event inside stream stacking + //hipEvent_t event_kernel; + + //for (int i=0; i<100000; i++)grid[i]=23.0; + hipError_t mmm; + //mmm=hipEventCreate(&event_kernel); + mmm=hipMalloc(&uu_g,num_points*sizeof(double)); + mmm=hipMalloc(&vv_g,num_points*sizeof(double)); + mmm=hipMalloc(&ww_g,num_points*sizeof(double)); + mmm=hipMalloc(&vis_real_g,Nvis*sizeof(float)); + mmm=hipMalloc(&vis_img_g,Nvis*sizeof(float)); + mmm=hipMalloc(&weight_g,(Nvis/freq_per_chan)*sizeof(float)); + //mmm=hipMalloc(&grid_g,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double)); + + #if !defined(GAUSS_HI_PRECISION) + mmm=hipMalloc(&convkernel_g,increaseprecision*w_support*sizeof(double)); + #endif + + if (mmm != hipSuccess) {printf("!!! w-stacking.cu hipMalloc ERROR %d !!!\n", mmm);} + //mmm=hipMemset(grid_g,0.0,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double)); + if (mmm != hipSuccess) {printf("!!! w-stacking.cu hipMemset ERROR %d !!!\n", mmm);} + + + mmm=hipMemcpyAsync(uu_g, uu, num_points*sizeof(double), hipMemcpyHostToDevice, stream_stacking); + mmm=hipMemcpyAsync(vv_g, vv, num_points*sizeof(double), hipMemcpyHostToDevice, stream_stacking); + mmm=hipMemcpyAsync(ww_g, ww, num_points*sizeof(double), hipMemcpyHostToDevice, stream_stacking); + mmm=hipMemcpyAsync(vis_real_g, vis_real, Nvis*sizeof(float), hipMemcpyHostToDevice, stream_stacking); + mmm=hipMemcpyAsync(vis_img_g, vis_img, Nvis*sizeof(float), hipMemcpyHostToDevice, stream_stacking); + mmm=hipMemcpyAsync(weight_g, weight, (Nvis/freq_per_chan)*sizeof(float), hipMemcpyHostToDevice, stream_stacking); + + + #if !defined(GAUSS_HI_PRECISION) + mmm=hipMemcpyAsync(convkernel_g, convkernel, increaseprecision*w_support*sizeof(double), hipMemcpyHostToDevice, stream_stacking); + #endif + + if (mmm != hipSuccess) {printf("!!! w-stacking.cu hipMemcpyAsync ERROR %d !!!\n", mmm);} + + // Call main GPU Kernel + #if defined(GAUSS_HI_PRECISION) + convolve_g <<<Nbl,Nth,0,stream_stacking>>> ( + num_w_planes, + num_points, + freq_per_chan, + polarizations, + uu_g, + vv_g, + ww_g, + vis_real_g, + vis_img_g, + weight_g, + dx, + dw, + KernelLen, + grid_size_x, + grid_size_y, + grid, + std22 + ); + + #else + convolve_g <<<Nbl,Nth,0,stream_stacking>>> ( + num_w_planes, + num_points, + freq_per_chan, + polarizations, + uu_g, + vv_g, + ww_g, + vis_real_g, + vis_img_g, + weight_g, + dx, + dw, + KernelLen, + grid_size_x, + grid_size_y, + grid, + std22, + convkernel_g + ); + #endif + + mmm=hipStreamSynchronize(stream_stacking); + //Record the event + //mmm=hipEventRecord(event_kernel,stream_stacking); + + //Wait until the kernel ends + //mmm=hipStreamWaitEvent(stream_stacking,event_kernel); + + //for (int i=0; i<100000; i++)printf("%f\n",grid[i]); + + if (mmm != hipSuccess) + printf("HIP ERROR %s\n",hipGetErrorString(mmm)); + + mmm=hipFree(uu_g); + mmm=hipFree(vv_g); + mmm=hipFree(ww_g); + mmm=hipFree(vis_real_g); + mmm=hipFree(vis_img_g); + mmm=hipFree(weight_g); + //mmm=hipFree(grid_g); + + #if !defined(GAUSS_HI_PRECISION) + mmm=hipFree(convkernel_g); + #endif + +// Switch between HIP and GPU versions +# else + +#ifdef _OPENMP + omp_set_num_threads(num_threads); +#endif + + #if defined(ACCOMP) && (GPU_STACKING) + omp_set_default_device(rank % omp_get_num_devices()); + uint Nvis = num_points*freq_per_chan*polarizations; + #pragma omp target teams distribute parallel for private(visindex) map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan]) map(tofrom:grid[0:2*num_w_planes*grid_size_x*grid_size_y]) +#else + #pragma omp parallel for private(visindex) +#endif + for (i = 0; i < num_points; i++) + { +#ifdef _OPENMP + //int tid; + //tid = omp_get_thread_num(); + //printf("%d\n",tid); +#endif + + visindex = i*freq_per_chan*polarizations; + + double sum = 0.0; + int j, k; + //if (i%1000 == 0)printf("%ld\n",i); + + /* Convert UV coordinates to grid coordinates. */ + double pos_u = uu[i] / dx; + double pos_v = vv[i] / dx; + double ww_i = ww[i] / dw; + + int grid_w = (int)ww_i; + int grid_u = (int)pos_u; + int grid_v = (int)pos_v; + + // check the boundaries + uint jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0; + uint jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1; + uint kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0; + uint kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1; + //printf("%d, %ld, %ld, %d, %ld, %ld\n",grid_u,jmin,jmax,grid_v,kmin,kmax); + + + // Convolve this point onto the grid. + for (k = kmin; k <= kmax; k++) + { + + //double v_dist = (double)k+0.5 - pos_v; + double v_dist = (double)k - pos_v; + + for (j = jmin; j <= jmax; j++) + { + //double u_dist = (double)j+0.5 - pos_u; + double u_dist = (double)j - pos_u; + uint iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y); + int jKer = (int)(increaseprecision * (fabs(u_dist+(double)KernelLen))); + int kKer = (int)(increaseprecision * (fabs(v_dist+(double)KernelLen))); + + #ifdef GAUSS_HI_PRECISION + double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist); + #endif + #ifdef GAUSS + double conv_weight = convkernel[jKer]*convkernel[kKer]; + //if(jKer < 0 || jKer >= 35 || kKer < 0 || kKer >= 35) + // printf("%f %d %f %d\n",fabs(u_dist+(double)KernelLen),jKer,fabs(v_dist+(double)KernelLen),kKer); + //printf("%d %d %d %d %f %f %f %f %f\n",jKer, j, kKer, k, pos_u, pos_v, u_dist,v_dist,conv_weight); + #endif + #ifdef KAISERBESSEL + double conv_weight = convkernel[jKer]*convkernel[kKer]; + #endif + // Loops over frequencies and polarizations + double add_term_real = 0.0; + double add_term_img = 0.0; + uint ifine = visindex; + // DAV: the following two loops are performend by each thread separately: no problems of race conditions + for (uint ifreq=0; ifreq<freq_per_chan; ifreq++) + { + uint iweight = visindex/freq_per_chan; + for (uint ipol=0; ipol<polarizations; ipol++) + { + if (!isnan(vis_real[ifine])) + { + //printf("%f %ld\n",weight[iweight],iweight); + add_term_real += weight[iweight] * vis_real[ifine] * conv_weight; + add_term_img += weight[iweight] * vis_img[ifine] * conv_weight; + //if(vis_img[ifine]>1e10 || vis_img[ifine]<-1e10)printf("%f %f %f %f %ld %ld\n",vis_real[ifine],vis_img[ifine],weight[iweight],conv_weight,ifine,num_points*freq_per_chan*polarizations); + } + ifine++; + iweight++; + } + } + // DAV: this is the critical call in terms of correctness of the results and of performance + #pragma omp atomic + grid[iKer] += add_term_real; + #pragma omp atomic + grid[iKer+1] += add_term_img; + } + } + + } + #if defined(ACCOMP) && (GPU_STACKING) + #pragma omp target exit data map(delete:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan], grid[0:2*num_w_planes*grid_size_x*grid_size_y]) + #endif + // End switch between HIP and CPU versions +#endif + //for (int i=0; i<100000; i++)printf("%f\n",grid[i]); +} + +#ifdef ACCOMP +#pragma omp end declare target +#endif + +int test(int nnn) +{ + int mmm; + + mmm = nnn+1; + return mmm; +} diff --git a/w-stacking.hip.hpp b/w-stacking.hip.hpp new file mode 100755 index 0000000000000000000000000000000000000000..128c60b8036beb72dffe038da7c2b29e5e6c8dc0 --- /dev/null +++ b/w-stacking.hip.hpp @@ -0,0 +1,113 @@ +#ifndef W_PROJECT_H_ +#define W_PROJECT_H_ + +#define NWORKERS -1 //100 +#define NTHREADS 64 //AMD WARP SIZE +#define PI 3.14159265359 +#define REAL_TYPE double + +#include <mpi.h> +#include <hip/hip_runtime.h> + +#ifdef __HIPCC__ +extern "C" +#endif + +#ifdef __cplusplus +extern "C" { +void wstack( + int, + unsigned int, + unsigned int, + unsigned int, + double*, + double*, + double*, + float*, + float*, + float*, + double, + double, + int, + int, + int, + double*, + int, + int, + hipStream_t); +} + +#else +void wstack( + int, + unsigned int, + unsigned int, + unsigned int, + double*, + double*, + double*, + float*, + float*, + float*, + double, + double, + int, + int, + int, + double*, + int, + int); +#endif + + + +#ifdef __HIPCC__ +extern "C" +#endif +int test(int nnn); + +#ifdef __HIPCC__ +extern "C" +#endif +void phase_correction( + double*, + double*, + double*, + int, + int, + int, + int, + int, + double, + double, + double, + int, + int); + +double gauss_kernel_norm( + double norm, + double std22, + double u_dist, + double v_dist); + +#ifdef ACCOMP +#pragma omp declare target (gauss_kernel_norm) +#endif + +#ifdef __HIPCC__ +extern "C" +#endif + +void cuda_fft( + int, + int, + int, + int, + int, + double*, + double*, + int, + MPI_Comm); + + +#endif