diff --git a/.gitignore b/.gitignore
index 797747aa16a2fb9000f5268a91f10e289e07f077..77346fd87e48eca5a70138537312d012fec11328 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,6 @@
 *.o
+.DS_Store
+sync.sh
 phase_correction.c
 w-stacking.c
 w-stackingCfftw
diff --git a/Build/Makefile.M100 b/Build/Makefile.M100
new file mode 100644
index 0000000000000000000000000000000000000000..1fa008e8cf29ab9f112d3ad399a6fe4dd5faa901
--- /dev/null
+++ b/Build/Makefile.M100
@@ -0,0 +1,20 @@
+CC = gcc
+CXX = g++
+
+MPICC = mpicc
+MPIC++ = mpiCC
+
+
+#FFTW_INCL= -I/home/taffoni/sw/include
+#FFTW_LIB= -L/home/taffoni/sw/lib
+
+
+NVCC = nvcc
+NVFLAGS = -arch=sm_70 -Xcompiler -mno-float128 -std=c++11
+NVLIB = -L/cineca/prod/opt/compilers/cuda/10.1/none/lib64/ -lcudart -lcuda
+
+OMP= -fopenmp
+
+CFLAGS += -I. $(FFTW_INCL) $(GSL_INCL) $(MPI_INCL)
+
+OPTIMIZE = $(OMP) -O3 -mtune=native
diff --git a/Build/Makefile.Macosx b/Build/Makefile.Macosx
new file mode 100644
index 0000000000000000000000000000000000000000..dbbde55f903759d481ff55f4e8d6919183384ca1
--- /dev/null
+++ b/Build/Makefile.Macosx
@@ -0,0 +1,26 @@
+CC = icc
+CXX = g++-11
+
+MPICC = mpicc
+MPIC++ = mpiCC
+
+FFTW_INCL= -I/usr/local/include
+FFTW_LIB= /usr/local/lib/
+
+GSL_INCL =
+GSL_LIBS =
+
+MPI_LIB =
+MPI_INCL= -I/home/taffoni/sw/Linux_x86_64/21.5/comm_libs/mpi/include
+HDF5_INCL =
+HDF5_LIB =
+
+OMP= -fopenmp
+
+NVCC = nvcc
+NVFLAGS = -arch=sm_70 -Xcompiler -mno-float128 -std=c++11
+NVLIB = -L/home/taffoni -lcudart -lcuda
+
+CFLAGS += -I. $(FFTW_INCL) $(GSL_INCL) $(MPI_INCL)
+
+OPTIMIZE = $(OMP) -O3
diff --git a/Build/Makefile.Magellanus b/Build/Makefile.Magellanus
new file mode 100644
index 0000000000000000000000000000000000000000..8f5dc582b872b0a3355d9b06fa4ebbcabda79843
--- /dev/null
+++ b/Build/Makefile.Magellanus
@@ -0,0 +1,34 @@
+CC = gcc-10
+CXX = g++-10
+
+MPICC = mpicc
+MPIC++ = mpiCC
+
+GSL_INCL = -I/home/taffoni/sw/include
+GSL_LIBS = -L/home/taffoni/sw/lib
+
+FFTW_INCL= -I/home/taffoni/sw/include
+FFTW_LIB= /home/taffoni/sw/lib -lfftw3_mpi -lfftw3
+
+#-L/opt/cluster/openmpi/3.1.3/gnu/8.2.0/lib -lmpi
+MPI_LIB =
+#-I/opt/cluster/openmpi/3.1.3/gnu/8.2.0/include
+MPI_INCL= -I/home/taffoni/sw/Linux_x86_64/21.5/comm_libs/mpi/include
+HDF5_INCL =
+HDF5_LIB =
+
+OMP = -mp=multicore,gpu -Mprof -cuda
+#OMP = -fopenmp
+NVCC = nvcc
+NVFLAGS = -arch=sm_70 -Xcompiler -std=c++11
+NVLIB = -L/home/taffoni/sw/Linux_x86_64/21.5/cuda/11.3/lib64/ -lcudart -lcuda
+
+
+CFLAGS += -I. $(FFTW_INCL) $(GSL_INCL) $(MPI_INCL)
+
+OPTIMIZE = $(OMP) -O3
+
+# OMP GPU SPECIFIC FLAGS
+#OPTIMIZE += -Wno-unused-result -foffload=-lm -ffast-math
+#OPTIMIZE += -fcf-protection=none -fno-stack-protector -foffload=nvptx-none -foffload=-misa=sm_35
+#-ffast-math -fopt-info-all-omp -foffload=-misa=sm_35 -fcf-protection=none -fno-stack-protector -foffload=nvptx-none
diff --git a/Build/Makefile.Marconi b/Build/Makefile.Marconi
new file mode 100644
index 0000000000000000000000000000000000000000..0d68960b481ccaac10ce193c204dc5858c7d362c
--- /dev/null
+++ b/Build/Makefile.Marconi
@@ -0,0 +1,20 @@
+CC = gcc
+CXX = g++
+
+MPICC = mpicc
+MPIC++ = mpiCC
+
+
+FFTW_INCL= -I/home/taffoni/sw/include
+FFTW_LIB= /home/taffoni/sw/lib
+
+
+NVCC = nvcc
+NVFLAGS = -arch=sm_70 -Xcompiler -mno-float128 -std=c++11
+NVLIB = -L/cineca/prod/opt/compilers/cuda/10.1/none/lib64/ -lcudart -lcuda
+
+OMP= -fopenmp
+
+CFLAGS += -I. $(FFTW_INCL) $(GSL_INCL) $(MPI_INCL)
+
+OPTIMIZE = $(OMP) -O3 -mtune=native
diff --git a/Build/Makefile.systype b/Build/Makefile.systype
new file mode 100644
index 0000000000000000000000000000000000000000..04922322304924d1c8a5397ede089516d9a1d256
--- /dev/null
+++ b/Build/Makefile.systype
@@ -0,0 +1,24 @@
+CC = gcc-10
+CXX = g++-10
+
+MPICC = mpicc
+MPIC++ = mpiCC
+
+OPTIMIZE =
+
+
+GSL_INCL =
+GSL_LIB =
+
+FFTW_INCL=
+FFTW_LIB=
+
+NVCC =
+NVFLAGS =
+NVLIB =
+
+CFLAGS +=
+
+MPICHLIB =
+HDF5INCL =
+HDF5LIB =
diff --git a/Makefile b/Makefile
index 99d13891512a600974a7a33c5eb43683eaf22e3f..23d161d0f6b432626329c9ffa52f94d2e31fab2a 100644
--- a/Makefile
+++ b/Makefile
@@ -1,8 +1,31 @@
-# comment/uncomment the various options depending hoe you want to build the program
+# comment/uncomment the various options depending on how you want to build the program
+# Set default values for compiler options if no systype options are given or found
+CC = mpiCC
+CXX = mpiCC
+OPTIMIZE = -std=c++11 -Wall -g -O2
+MPICHLIB = -lmpich
+SWITCHES =
+
+ifdef SYSTYPE
+SYSTYPE := $(SYSTYPE)
+include Build/Makefile.$(SYSTYPE)
+else
+include Build/Makefile.systype
+endif
+
+
+LIBS = -L$(FFTW_LIB) -lfftw3 -lm -lcudart -lcuda
+
 # create MPI code
 OPT += -DUSE_MPI
+#OPT += -DACCOMP
 # use FFTW (it can be switched on ONLY if MPI is active)
-OPT += -DUSE_FFTW
+ifeq (USE_MPI,$(findstring USE_MPI,$(OPT)))
+  OPT += -DUSE_FFTW
+  LIBS = -L$(FFTW_LIB) -lfftw3_mpi -lfftw3 -lm
+endif
+
+#OPT += -DNVIDIA
 # perform one-side communication (suggested) instead of reduce (only if MPI is active)
 OPT += -DONE_SIDE
 # write the full 3D cube of gridded visibilities and its FFT transform
@@ -10,25 +33,8 @@ OPT += -DONE_SIDE
 # write the final image
 OPT += -DWRITE_IMAGE
 # perform w-stacking phase correction
-#OPT += -DPHASE_ON
-
-CC = gcc
-CXX = g++
-ifeq (USE_MPI,$(findstring USE_MPI,$(OPT)))
-  CC = mpicc
-  CXX = mpiCC
-endif
-
-OMP = -fopenmp
-#OMP =
-
-CFLAGS += -O3 -mcpu=native
-CFLAGS += -I.
-LIBS = -L$(FFTW_LIB) -lfftw3_mpi -lfftw3 -lm
+OPT += -DPHASE_ON
-NVCC = nvcc
-NVFLAGS = -arch=sm_70 -Xcompiler -mno-float128 -std=c++11
-NVLIB = -L/cineca/prod/opt/compilers/cuda/10.1/none/lib64/ -lcudart -lcuda
 
 DEPS = w-stacking.h w-stacking-fftw.c w-stacking.cu phase_correction.cu
 COBJ = w-stacking.o w-stacking-fftw.o phase_correction.o
@@ -39,27 +45,40 @@ w-stacking.c: w-stacking.cu
 phase_correction.c: phase_correction.cu
 	cp phase_correction.cu phase_correction.c
 
+ifeq (USE_MPI,$(findstring USE_MPI,$(OPT)))
+%.o: %.c $(DEPS)
+	$(MPICC) $(OPTIMIZE) $(OPT) -c -o $@ $< $(CFLAGS)
+else
 %.o: %.c $(DEPS)
-	$(CC) $(OMP) -c -o $@ $< $(CFLAGS) $(OPT)
+	$(CC) $(OPTIMIZE) $(OPT) -c -o $@ $< $(CFLAGS)
+endif
 
 serial: $(COBJ)
-	$(CC) $(OMP) -o w-stackingCfftw_serial $(CFLAGS) $^ -lm
+	$(CC) $(OPTIMIZE) $(OPT) -o w-stackingCfftw_serial $^ $(LIBS)
+
+serial_omp: phase_correction.c
+	$(CC) $(OPTIMIZE) $(OPT) -o w-stackingOMP_serial w-stacking-fftw.c w-stacking_omp.c $(CFLAGS) $(LIBS)
+
+simple_mpi: phase_correction.c
+	$(MPICC) $(OPTIMIZE) $(OPT) -o w-stackingMPI_simple w-stacking_omp.c w-stacking-fftw.c phase_correction.c $(CFLAGS) $(LIBS)
+
+mpi_omp: phase_correction.c
+	$(MPICC) $(OPTIMIZE) $(OPT) -o w-stackingMPI_omp w-stacking_omp.c w-stacking-fftw.c phase_correction.c $(CFLAGS) $(LIBS)
 
 serial_cuda:
-	$(NVCC) $(OPT) $(NVFLAGS) -c w-stacking.cu phase_correction.cu $(NVLIB)
-	$(CC) $(CFLAGS) $(OPT) -c w-stacking-fftw.c
-	$(CXX) $(CFLAGS) $(OPT) -o w-stackingfftw_serial w-stacking-fftw.o w-stacking.o phase_correction.o $(NVLIB) -lm
+	$(NVCC) $(NVFLAGS) -c w-stacking.cu phase_correction.cu $(NVLIB)
+	$(CC) $(OPTIMIZE) $(OPT) -c w-stacking-fftw.c $(CFLAGS) $(LIBS)
+	$(CXX) $(OPTIMIZE) $(OPT) -o w-stackingfftw_serial w-stacking-fftw.o w-stacking.o phase_correction.o $(CFLAGS) $(NVLIB) -lm
 
-mpi: $(COBJ)
-	$(CC) $(OMP) -o w-stackingCfftw $(CFLAGS) $^ $(LIBS)
+mpi: $(COBJ)
+	$(MPICC) $(OPTIMIZE) -o w-stackingCfftw $^ $(CFLAGS) $(LIBS)
 
 mpi_cuda:
-	$(NVCC) $(NVFLAGS) $(OPT) -c w-stacking.cu phase_correction.cu $(NVLIB)
-	$(CC) $(CFLAGS) $(OPT) -c w-stacking-fftw.c
-	$(CXX) $(CFLAGS) $(OPT) -o w-stackingfftw w-stacking-fftw.o w-stacking.o phase_correction.o $(NVLIB) $(LIBS) -lm
+	$(NVCC) $(NVFLAGS) -c w-stacking.cu phase_correction.cu $(NVLIB)
+	$(MPICC) $(OPTIMIZE) $(OPT) -c w-stacking-fftw.c $(CFLAGS) $(LIBS)
+	$(MPIC++) $(OPTIMIZE) $(OPT) -o w-stackingfftw w-stacking-fftw.o w-stacking.o phase_correction.o $(NVLIB) $(CFLAGS) $(LIBS)
 
 clean:
 	rm *.o
 	rm w-stacking.c
 	rm phase_correction.c
-
diff --git a/phase_correction.cu b/phase_correction.cu
index 9a439248899b8f592124a34e53a34bcd3e35c501..98bd6cc96939e214942e86cf947af32332841e40 100644
--- a/phase_correction.cu
+++ b/phase_correction.cu
@@ -170,6 +170,12 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in
 			double preal, pimag;
 			double radius2 = (xcoord*xcoord+ycoord*ycoord);
 
-			preal = cos(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
-			pimag = sin(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
+			if(radius2 <= 1.0)
+			{
+				preal = cos(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
+				pimag = sin(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
+			} else {
+				preal = cos(-2.0*PI*wterm*(sqrt(radius2-1.0)-1));
+				pimag = 0.0;
+			}
 
@@ -198,7 +204,7 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in
 				image_imag[img_index] += gridss[index+1];
 #endif // end of PHASE_ON
 
-			}
+			}
 	}
 
 #endif // end of __CUDACC__
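Note on the phase_correction.cu hunk above: the factor it applies per w-plane is the standard w-stacking phase correction. For a pixel with direction cosines (l, m) and radius2 = l^2 + m^2 <= 1, each plane is rotated by

    exp(2*pi*i * wterm * (sqrt(1 - radius2) - 1))

whose real and imaginary parts are exactly the cos/sin pair in the first branch. For radius2 > 1 the square root turns imaginary and the exponential degenerates into a real attenuation, which is what the real-only else branch approximates; guarding on radius2 (rather than on a single coordinate) is what keeps both square-root arguments non-negative.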
diff --git a/scripts/gridfftbin.m b/scripts/gridfftbin.m
index d948638fdc3324cb4c0aad449c90c70b6b9fae09..b4bc237f7d2c3c65e1bc0b309ef5d9c3b18810cb 100644
--- a/scripts/gridfftbin.m
+++ b/scripts/gridfftbin.m
@@ -10,7 +10,7 @@ nplanes = 1;
 % id = '4';
 suffix = '';
 id = '';
 datadir = "/Users/cgheller/Work/Gridding/data/";
 filename1 = strcat(datadir,"fft_real",suffix,id,'.bin');
 filename2 = strcat(datadir,"fft_img",suffix,id,'.bin');
 s = dir(filename1);
@@ -52,4 +52,3 @@ xlabel('cell');
 ylabel('cell');
 pngimage = strcat(datadir,'fft-2freq',suffix,id,'.png');
 saveas(gcf,pngimage);
-
diff --git a/scripts/plotgrid.py b/scripts/plotgrid.py
new file mode 100644
index 0000000000000000000000000000000000000000..2cf1bab67a5c54c9befd586a414eb160fa88800e
--- /dev/null
+++ b/scripts/plotgrid.py
@@ -0,0 +1,27 @@
+#!/usr/bin/python3
+import numpy as np
+import matplotlib.pyplot as plt
+filename1 = "fft_real.bin"
+filename2 = "fft_img.bin"
+nplanes = 1
+
+with open(filename1, 'rb') as f1:
+    vreal = np.fromfile(f1, dtype=np.float64)
+with open(filename2, 'rb') as f2:
+    vimg = np.fromfile(f2, dtype=np.float64)
+
+xaxis = int(np.sqrt(vreal.size))
+yaxes = xaxis
+residual = np.vectorize(complex)(vreal, vimg)
+
+cumul2d = residual.reshape((xaxis,yaxes,nplanes), order='F')
+for i in range(nplanes):
+    gridded = np.squeeze(cumul2d[:,:,i])
+    ax = plt.subplot()
+    img = ax.imshow(np.abs(np.fft.fftshift(gridded)), aspect='auto', interpolation='none', origin='lower')
+    ax.set_xlabel('cell')
+    ax.set_ylabel('cell')
+    cbar = plt.colorbar(img)
+    cbar.set_label('norm(FFT)',size=18)
+    figname='grid_image_' + str(i) + '.png'
+    plt.savefig(figname)
diff --git a/w-stacking-fftw.c b/w-stacking-fftw.c
index 0eda87e9b2dcef70714081129aff0fbeb468fae0..908dda8c47943f936d689ce71a202684122aef4e 100644
--- a/w-stacking-fftw.c
+++ b/w-stacking-fftw.c
@@ -7,9 +7,15 @@
 #include <fftw3-mpi.h>
 #endif
 #endif
+#include <omp.h>
 #include <math.h>
 #include <time.h>
+#include <unistd.h>
+#ifdef ACCOMP
+#include "w-stacking_omp.h"
+#else
 #include "w-stacking.h"
+#endif
 #define PI 3.14159265359
 #define NUM_OF_SECTORS -1
 #define MIN(X, Y) (((X) < (Y)) ? (X) : (Y))
@@ -27,16 +33,16 @@
 void Push(struct sectorlist** headRef, long data) {
 	struct sectorlist* newNode = malloc(sizeof(struct sectorlist));
 	newNode->index = data;
 	newNode->next = *headRef;
-	*headRef = newNode;
+	*headRef = newNode;
 }
 
 // Main Code
-int main(int argc, char * argv[])
+int main(int argc, char * argv[])
 {
 	int rank;
 	int size;
-	FILE * pFile;
+	FILE * pFile;
 	FILE * pFile1;
 	FILE * pFilereal;
 	FILE * pFileimg;
@@ -50,16 +56,16 @@ int main(int argc, char * argv[])
 //
 //char datapath[900] = "/m100_scratch/userexternal/cgheller/gridding/hba-8hrs_gauss4new.binMS/";
 //char datapath[900] = "/m100_scratch/userexternal/cgheller/Lofar/Observations/L798046_SB244_uv.uncorr_130B27932t_146MHz.pre-cal.binMS/";
-	char datapath[900];
+	char datapath[900];
 	char datapath_multi[NFILES][900];
 	char ufile[30] = "ucoord.bin";
-	char vfile[30] = "vcoord.bin";
-	char wfile[30] = "wcoord.bin";
-	char weightsfile[30] = "weights.bin";
-	char visrealfile[30] = "visibilities_real.bin";
-	char visimgfile[30] = "visibilities_img.bin";
-	char metafile[30] = "meta.txt";
+	char vfile[30] = "vcoord.bin";
+	char wfile[30] = "wcoord.bin";
+	char weightsfile[30] = "weights.bin";
+	char visrealfile[30] = "visibilities_real.bin";
+	char visimgfile[30] = "visibilities_img.bin";
+	char metafile[30] = "meta.txt";
 	char outfile[30] = "grid.txt";
 	char outfile1[30] = "coords.txt";
 	char outfile2[30] = "grid_real.bin";
@@ -94,7 +100,7 @@ int main(int argc, char * argv[])
 	double wmax,wmax0;
 	double resolution;
 
-	// MESH SIZE
+	// MESH SIZE
 	int grid_size_x = 2048;
 	int grid_size_y = 2048;
 	int local_grid_size_x;// = 8;
@@ -102,6 +108,7 @@ int main(int argc, char * argv[])
 	int xaxis;
 	int yaxis;
 	int num_w_planes = 8;
+	// DAV: the corresponding KernelLen is calculated within the wstack function. It can anyway be hardcoded for optimization
 	int w_support = 7;
 	int num_threads;// = 4;
@@ -117,11 +124,23 @@ int main(int argc, char * argv[])
 	struct timespec begin, finish, begin0, begink, finishk;
 	double elapsed;
 	long nsectors;
+	/* GT: get the number of threads; exit if not given */
+	if(argc == 1) {
+		fprintf(stderr, "Usage: %s number_of_OMP_Threads \n", argv[0]);
+		exit(1);
+	}
+	// Set the number of OpenMP threads
+	num_threads = atoi(argv[1]);
+
+	if ( num_threads == 0 )
+	{
+		fprintf(stderr, "Wrong parameter: %s\n\n", argv[1]);
+		fprintf(stderr, "Usage: %s number_of_OMP_Threads \n", argv[0]);
+		exit(1);
+	}
 
 	clock_gettime(CLOCK_MONOTONIC, &begin0);
 	start0 = clock();
-	// Set the number of OpenMP threads
-	num_threads = atoi(argv[1]);
 
 	// Intialize MPI environment
 #ifdef USE_MPI
@@ -136,7 +155,21 @@ int main(int argc, char * argv[])
 	rank = 0;
 	size = 1;
 #endif
-	if(rank == 0)printf("Running with %d threads\n",num_threads);
+
+	if(rank == 0)printf("Running with %d threads\n",num_threads);
+
+#ifdef ACCOMP
+	if(rank == 0){
+		if (0 == omp_get_num_devices()) {
+			printf("No accelerator found ... exit\n");
exit\n"); + exit(255); + } + printf("Number of available GPUs %d\n", omp_get_num_devices()); + #ifdef NVIDIA + prtAccelInfo(); + #endif + } +#endif // set the local size of the image local_grid_size_x = grid_size_x; @@ -144,7 +177,7 @@ int main(int argc, char * argv[]) if (nsectors < 0) nsectors = size; local_grid_size_y = grid_size_y/nsectors; //nsectors = size; - + // LOCAL grid size xaxis = local_grid_size_x; yaxis = local_grid_size_y; @@ -183,10 +216,10 @@ int main(int argc, char * argv[]) int nsub = 1000; //int nsub = 10; printf("Subtracting last %d measurements\n",nsub); - Nmeasures = Nmeasures-nsub; + Nmeasures = Nmeasures-nsub; Nvis = Nmeasures*freq_per_chan*polarisations; - // calculate the coordinates of the center + // calculate the coordinates of the center double uvshift = uvmin/(uvmax-uvmin); //printf("UVSHIFT %f %f %f %f %f\n",uvmin, uvmax, wmin, wmax, uvshift); @@ -196,12 +229,12 @@ int main(int argc, char * argv[]) printf("N. visibilities %ld\n",Nvis); } - // Set temporary local size of points + // Set temporary local size of points long nm_pe = (long)(Nmeasures/size); long remaining = Nmeasures%size; - - long startrow = rank*nm_pe; - if (rank == size-1)nm_pe = nm_pe+remaining; + + long startrow = rank*nm_pe; + if (rank == size-1)nm_pe = nm_pe+remaining; long Nmeasures_tot = Nmeasures; Nmeasures = nm_pe; @@ -209,10 +242,11 @@ int main(int argc, char * argv[]) Nvis = Nmeasures*freq_per_chan*polarisations; Nweights = Nmeasures*polarisations; - #ifdef VERBOSE +#ifdef VERBOSE printf("N. measurements on %d %ld\n",rank,Nmeasures); printf("N. visibilities on %d %ld\n",rank,Nvis); - #endif +#endif + // DAV: all these arrays can be allocatate statically for the sake of optimization. However be careful that if MPI is used // all the sizes are rescaled by the number of MPI tasks @@ -224,7 +258,7 @@ int main(int argc, char * argv[]) visreal = (float*) calloc(Nvis,sizeof(float)); visimg = (float*) calloc(Nvis,sizeof(float)); - if(rank == 0)printf("READING DATA\n"); + if(rank == 0)printf("READING DATA\n"); // Read data strcpy(filename,datapath); strcat(filename,ufile); @@ -239,7 +273,7 @@ int main(int argc, char * argv[]) strcat(filename,vfile); //printf("Reading %s\n",filename); - pFile = fopen (filename,"rb"); + pFile = fopen (filename,"rb"); fseek (pFile,startrow*sizeof(double),SEEK_SET); fread(vv,Nmeasures*sizeof(double),1,pFile); fclose(pFile); @@ -253,9 +287,9 @@ int main(int argc, char * argv[]) fread(ww,Nmeasures*sizeof(double),1,pFile); fclose(pFile); - #ifdef USE_MPI +#ifdef USE_MPI MPI_Barrier(MPI_COMM_WORLD); - #endif +#endif clock_gettime(CLOCK_MONOTONIC, &finish); end = clock(); @@ -263,13 +297,13 @@ int main(int argc, char * argv[]) setup_time1 = (finish.tv_sec - begin.tv_sec); setup_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0; - if(rank == 0)printf("GRIDDING DATA\n"); + + if(rank == 0)printf("GRIDDING DATA\n"); // Create histograms and linked lists - clock_gettime(CLOCK_MONOTONIC, &begin); - start = clock(); - - //CLAAA + clock_gettime(CLOCK_MONOTONIC, &begin); + start = clock(); + // Initialize linked list struct sectorlist ** sectorhead; sectorhead = (struct sectorlist **) malloc((nsectors+1) * sizeof(struct sectorlist)); @@ -286,7 +320,7 @@ int main(int argc, char * argv[]) double uuh,vvh; for (long iphi = 0; iphi < Nmeasures; iphi++) { - boundary[iphi] = -1; + boundary[iphi] = -1; uuh = uu[iphi]; vvh = vv[iphi]; int binphi = (int)(vvh*nsectors); @@ -295,8 +329,8 @@ int main(int argc, char * argv[]) double downdist = vvh - (double)(binphi*yaxis)*dx; // 
 		histo_send[binphi]++;
-		Push(&sectorhead[binphi],iphi);
-		if(updist < w_supporth && updist >= 0.0) {histo_send[binphi+1]++; boundary[iphi] = binphi+1; Push(&sectorhead[binphi+1],iphi);};
+		Push(&sectorhead[binphi],iphi);
+		if(updist < w_supporth && updist >= 0.0) {histo_send[binphi+1]++; boundary[iphi] = binphi+1; Push(&sectorhead[binphi+1],iphi);};
 		if(downdist < w_supporth && binphi > 0 && downdist >= 0.0) {histo_send[binphi-1]++; boundary[iphi] = binphi-1; Push(&sectorhead[binphi-1],iphi);};
 	}
 
@@ -316,41 +350,39 @@ int main(int argc, char * argv[])
 	}
 #endif
 
-	#ifdef VERBOSE
-	for (int iii=0; iii<nsectors+1; iii++)printf("HISTO %d %d %ld\n",rank, iii, histo_send[iii]);
-	#endif
-
-	// Create sector grid
-	double * gridss;
-	double * gridss_w;
-	double * gridss_real;
-	double * gridss_img;
-	double * grid;
-	long size_of_grid;
-	size_of_grid = 2*num_w_planes*xaxis*yaxis;
-	gridss = (double*) calloc(size_of_grid,sizeof(double));
-	gridss_w = (double*) calloc(size_of_grid,sizeof(double));
-	gridss_real = (double*) calloc(size_of_grid/2,sizeof(double));
-	gridss_img = (double*) calloc(size_of_grid/2,sizeof(double));
-	// Create destination slab
-	grid = (double*) calloc(size_of_grid,sizeof(double));
-	// Create temporary global grid
-	#ifndef USE_MPI
-	double * gridtot = (double*) calloc(2*grid_size_x*grid_size_y*num_w_planes,sizeof(double));
-	#endif
-
-	double shift = (double)(dx*yaxis);
+#ifdef VERBOSE
+	for (int iii=0; iii<nsectors+1; iii++)printf("HISTO %d %d %ld\n",rank, iii, histo_send[iii]);
+#endif
 
-	// Open the MPI Memory Window for the slab
-	#ifdef USE_MPI
-	MPI_Win slabwin;
-	MPI_Win_create(grid, size_of_grid*sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &slabwin);
-	MPI_Win_fence(0,slabwin);
-	#endif
+// Create sector grid
+	double * gridss;
+	double * gridss_w;
+	double * gridss_real;
+	double * gridss_img;
+	double * grid;
+	long size_of_grid;
+	size_of_grid = 2*num_w_planes*xaxis*yaxis;
+	gridss = (double*) calloc(size_of_grid,sizeof(double));
+	gridss_w = (double*) calloc(size_of_grid,sizeof(double));
+	gridss_real = (double*) calloc(size_of_grid/2,sizeof(double));
+	gridss_img = (double*) calloc(size_of_grid/2,sizeof(double));
+	// Create destination slab
+	grid = (double*) calloc(size_of_grid,sizeof(double));
+	// Create temporary global grid
+#ifndef USE_MPI
+	double * gridtot = (double*) calloc(2*grid_size_x*grid_size_y*num_w_planes,sizeof(double));
+#endif
+	double shift = (double)(dx*yaxis);
+	// Open the MPI Memory Window for the slab
+#ifdef USE_MPI
+	MPI_Win slabwin;
+	MPI_Win_create(grid, size_of_grid*sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &slabwin);
+	MPI_Win_fence(0,slabwin);
+#endif
+#ifndef USE_MPI
+	pFile1 = fopen (outfile1,"w");
+#endif
 
-	#ifndef USE_MPI
-	pFile1 = fopen (outfile1,"w");
-	#endif
 	// loop over files
 	//
@@ -365,7 +397,6 @@ int main(int argc, char * argv[])
 	for (int ifiles=0; ifiles<ndatasets; ifiles++)
 	{
-
 	strcpy(filename,datapath_multi[ifiles]);
 	printf("Processing %s, %d of %d\n",filename,ifiles+1,ndatasets);
@@ -400,95 +431,95 @@ int main(int argc, char * argv[])
 	fread(weights,(Nweights)*sizeof(float),1,pFile);
 	fclose(pFile);
 
-	strcpy(filename,datapath);
-	strcat(filename,visrealfile);
-	//printf("Reading %s\n",filename);
-
-	pFile = fopen (filename,"rb");
-	fseek (pFile,startrow*freq_per_chan*polarisations*sizeof(float),SEEK_SET);
-	fread(visreal,Nvis*sizeof(float),1,pFile);
-	fclose(pFile);
+	strcpy(filename,datapath);
+	strcat(filename,visrealfile);
+	pFile = fopen (filename,"rb");
+	fseek (pFile,startrow*freq_per_chan*polarisations*sizeof(float),SEEK_SET);
+	fread(visreal,Nvis*sizeof(float),1,pFile);
+	fclose(pFile);
+	strcpy(filename,datapath);
+	strcat(filename,visimgfile);
+#ifdef VERBOSE
+	printf("Reading %s\n",filename);
+#endif
+	pFile = fopen (filename,"rb");
+	fseek (pFile,startrow*freq_per_chan*polarisations*sizeof(float),SEEK_SET);
+	fread(visimg,Nvis*sizeof(float),1,pFile);
+	fclose(pFile);
 
-	strcpy(filename,datapath);
-	strcat(filename,visimgfile);
-	//printf("Reading %s\n",filename);
+#ifdef USE_MPI
+	MPI_Barrier(MPI_COMM_WORLD);
+#endif
+	// Declare temporary arrays for the masking
+	double * uus;
+	double * vvs;
+	double * wws;
+	float * visreals;
+	float * visimgs;
+	float * weightss;
+	long isector;
 
-	pFile = fopen (filename,"rb");
-	fseek (pFile,startrow*freq_per_chan*polarisations*sizeof(float),SEEK_SET);
-	fread(visimg,Nvis*sizeof(float),1,pFile);
-	fclose(pFile);
+	for (long isector_count=0; isector_count<nsectors; isector_count++)
+	{
+		clock_gettime(CLOCK_MONOTONIC, &begink);
+		startk = clock();
+		// define local destination sector
+		//isector = (isector_count+rank)%size;
+		isector = isector_count;
+		// allocate sector arrays
+		long Nsec = histo_send[isector];
+		uus = (double*) malloc(Nsec*sizeof(double));
+		vvs = (double*) malloc(Nsec*sizeof(double));
+		wws = (double*) malloc(Nsec*sizeof(double));
+		long Nweightss = Nsec*polarisations;
+		long Nvissec = Nweightss*freq_per_chan;
+		weightss = (float*) malloc(Nweightss*sizeof(float));
+		visreals = (float*) malloc(Nvissec*sizeof(float));
+		visimgs = (float*) malloc(Nvissec*sizeof(float));
 
-	#ifdef USE_MPI
-	MPI_Barrier(MPI_COMM_WORLD);
-	#endif
+		// select data for this sector
+		long icount = 0;
+		long ip = 0;
+		long inu = 0;
+//CLAAAA
+		struct sectorlist * current;
+		current = sectorhead[isector];
 
-	// Declare temporary arrays for the masking
-	double * uus;
-	double * vvs;
-	double * wws;
-	float * visreals;
-	float * visimgs;
-	float * weightss;
-	long isector;
-	for (long isector_count=0; isector_count<nsectors; isector_count++)
-	{
-		clock_gettime(CLOCK_MONOTONIC, &begink);
-		startk = clock();
-		// define local destination sector
-		//isector = (isector_count+rank)%size;
-		isector = isector_count;
-		// allocate sector arrays
-		long Nsec = histo_send[isector];
-		uus = (double*) malloc(Nsec*sizeof(double));
-		vvs = (double*) malloc(Nsec*sizeof(double));
-		wws = (double*) malloc(Nsec*sizeof(double));
-		long Nweightss = Nsec*polarisations;
-		long Nvissec = Nweightss*freq_per_chan;
-		weightss = (float*) malloc(Nweightss*sizeof(float));
-		visreals = (float*) malloc(Nvissec*sizeof(float));
-		visimgs = (float*) malloc(Nvissec*sizeof(float));
-		// select data for this sector
-		long icount = 0;
-		long ip = 0;
-		long inu = 0;
-//CLAAAA
-		struct sectorlist * current;
-		current = sectorhead[isector];
-		while (current->index != -1)
+		while (current->index != -1)
 		{
 			long ilocal = current->index;
-			//double vvh = vv[ilocal];
+			//double vvh = vv[ilocal];
 			//int binphi = (int)(vvh*nsectors);
-			//if (binphi == isector || boundary[ilocal] == isector)
-			//{
-			uus[icount] = uu[ilocal];
-			vvs[icount] = vv[ilocal]-isector*shift;
-			wws[icount] = ww[ilocal];
-			for (long ipol=0; ipol<polarisations; ipol++)
-			{
-				weightss[ip] = weights[ilocal*polarisations+ipol];
+			//if (binphi == isector || boundary[ilocal] == isector) {
+			uus[icount] = uu[ilocal];
+			vvs[icount] = vv[ilocal]-isector*shift;
+			wws[icount] = ww[ilocal];
+			for (long ipol=0; ipol<polarisations; ipol++)
+			{
+				weightss[ip] = weights[ilocal*polarisations+ipol];
 				ip++;
-			}
-			for (long ifreq=0; ifreq<polarisations*freq_per_chan; ifreq++)
-			{
-				visreals[inu] = visreal[ilocal*polarisations*freq_per_chan+ifreq];
-				visimgs[inu] = visimg[ilocal*polarisations*freq_per_chan+ifreq];
-				//if(visimgs[inu]>1e10 || visimgs[inu]<-1e10)printf("%f %f %ld %ld %d %ld %ld\n",visreals[inu],visimgs[inu],inu,Nvissec,rank,ilocal*polarisations*freq_per_chan+ifreq,Nvis);
-				inu++;
-			}
-			icount++;
+			}
+			for (long ifreq=0; ifreq<polarisations*freq_per_chan; ifreq++)
+			{
+				visreals[inu] = visreal[ilocal*polarisations*freq_per_chan+ifreq];
+				visimgs[inu] = visimg[ilocal*polarisations*freq_per_chan+ifreq];
+				//if(visimgs[inu]>1e10 || visimgs[inu]<-1e10)printf("%f %f %ld %ld %d %ld %ld\n",visreals[inu],visimgs[inu],inu,Nvissec,rank,ilocal*polarisations*freq_per_chan+ifreq,Nvis);
+				inu++;
+			}
+			icount++;
 			//}
-			current = current->next;
-		}
+			current = current->next;
+		}
 		clock_gettime(CLOCK_MONOTONIC, &finishk);
 		endk = clock();
 		compose_time += ((double) (endk - startk)) / CLOCKS_PER_SEC;
 		compose_time1 += (finishk.tv_sec - begink.tv_sec);
 		compose_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0;
-		#ifndef USE_MPI
+
+		#ifndef USE_MPI
 		double uumin = 1e20;
 		double vvmin = 1e20;
 		double uumax = -1e20;
@@ -501,7 +530,7 @@ int main(int argc, char * argv[])
 			vvmin = MIN(vvmin,vvs[ipart]);
 			vvmax = MAX(vvmax,vvs[ipart]);
 
-
+
 			if(ipart%10 == 0)fprintf (pFile, "%ld %f %f %f\n",isector,uus[ipart],vvs[ipart]+isector*shift,wws[ipart]);
 		}
@@ -514,6 +543,7 @@ int main(int argc, char * argv[])
 		#endif
 		clock_gettime(CLOCK_MONOTONIC, &begink);
 		startk = clock();
+
 		wstack(num_w_planes,
 			Nsec,
 			freq_per_chan,
@@ -527,10 +557,22 @@ int main(int argc, char * argv[])
 			dx,
 			dw,
 			w_support,
-			xaxis,
-			yaxis,
+			xaxis,
+			yaxis,
 			gridss,
 			num_threads);
+
+
+/*		int z =0 ;
+#pragma omp target map(to:test_i_gpu) map(from:z)
+		{
+			int x; // only accessible from accelerator
+			x = 2;
+			z = x + test_i_gpu;
+		}*/
+
+
+
 		clock_gettime(CLOCK_MONOTONIC, &finishk);
 		endk = clock();
 		kernel_time += ((double) (endk - startk)) / CLOCKS_PER_SEC;
@@ -556,9 +598,9 @@ int main(int argc, char * argv[])
 			#ifdef ONE_SIDE
 			printf("One Side communication active\n");
 			MPI_Win_lock(MPI_LOCK_SHARED,target_rank,0,slabwin);
-			MPI_Accumulate(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,MPI_SUM,slabwin);
+			MPI_Accumulate(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,MPI_SUM,slabwin);
 			MPI_Win_unlock(target_rank,slabwin);
-			//MPI_Put(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,slabwin);
+			//MPI_Put(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,slabwin);
 			#else
 			MPI_Reduce(gridss,grid,size_of_grid,MPI_DOUBLE,MPI_SUM,target_rank,MPI_COMM_WORLD);
 			#endif //ONE_SIDE
@@ -633,7 +675,7 @@ int main(int argc, char * argv[])
 	process_time1 = (finish.tv_sec - begin.tv_sec);
 	process_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
 	clock_gettime(CLOCK_MONOTONIC, &begin);
-	
+
 #ifdef WRITE_DATA
 	// Write results
@@ -712,7 +754,7 @@ int main(int argc, char * argv[])
 
 #ifdef USE_FFTW
 	// FFT transform the data (using distributed FFTW)
-	
+
 	if(rank == 0)printf("PERFORMING FFT\n");
 	clock_gettime(CLOCK_MONOTONIC, &begin);
 	start = clock();
@@ -727,13 +769,13 @@ int main(int argc, char * argv[])
 	// and perform the FFT per w plane
 	alloc_local = fftw_mpi_local_size_2d(grid_size_y, grid_size_x, MPI_COMM_WORLD,&local_n0, &local_0_start);
 	fftwgrid = fftw_alloc_complex(alloc_local);
-	plan = fftw_mpi_plan_dft_2d(grid_size_y, grid_size_x, fftwgrid, fftwgrid, MPI_COMM_WORLD, FFTW_BACKWARD, FFTW_ESTIMATE);
+	plan = fftw_mpi_plan_dft_2d(grid_size_y, grid_size_x, fftwgrid, fftwgrid, MPI_COMM_WORLD, FFTW_BACKWARD, FFTW_ESTIMATE);
 
 	long fftwindex = 0;
 	long fftwindex2D = 0;
 	for (int iw=0; iw<num_w_planes; iw++)
 	{
-		//printf("FFTing plan %d\n",iw);
+		//printf("FFTing plan %d\n",iw);
 		// select the w-plane to transform
 		for (int iv=0; iv<yaxis; iv++)
 		{
@@ -814,7 +856,7 @@ int main(int argc, char * argv[])
 	for (int iw=0; iw<num_w_planes; iw++)
 		for (int iv=0; iv<yaxis; iv++)
 			for (int iu=0; iu<xaxis; iu++)
-			{
+			{
 				long global_index = (iu + (iv+isector*yaxis)*xaxis + iw*grid_size_x*grid_size_y)*sizeof(double);
 				long index = iu + iv*xaxis + iw*xaxis*yaxis;
 				fseek(pFileimg, global_index, SEEK_SET);
@@ -825,7 +867,7 @@ int main(int argc, char * argv[])
 			fwrite(gridss_img, size_of_grid/2, sizeof(double), pFileimg);
 		}
 
-	
+
 	}
 #else
 /*
@@ -851,7 +893,7 @@ int main(int argc, char * argv[])
 	MPI_Barrier(MPI_COMM_WORLD);
 #endif
 #endif //WRITE_DATA
-	
+
 	fftw_free(fftwgrid);
 
 	// Phase correction
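The ONE_SIDE path in the hunks above accumulates each sector's gridss buffer into the owning rank's slab with passive-target RMA instead of MPI_Reduce. A minimal, self-contained sketch of the same MPI_Win_lock / MPI_Accumulate pattern (the buffer size N and the fixed target rank 0 are illustrative choices of this example, not values from the code):

    /* rma_accumulate_demo.c -- illustrative sketch only */
    #include <mpi.h>
    #include <stdio.h>
    #define N 8

    int main(int argc, char *argv[])
    {
        int rank, size;
        double local[N], slab[N] = {0.0};
        MPI_Win win;

        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        MPI_Comm_size(MPI_COMM_WORLD, &size);
        for (int i = 0; i < N; i++) local[i] = (double)rank;

        /* every rank exposes its slab, as done for grid with slabwin */
        MPI_Win_create(slab, (MPI_Aint)(N*sizeof(double)), sizeof(double),
                       MPI_INFO_NULL, MPI_COMM_WORLD, &win);

        /* passive-target epoch: lock, sum into the target, unlock */
        int target_rank = 0;
        MPI_Win_lock(MPI_LOCK_SHARED, target_rank, 0, win);
        MPI_Accumulate(local, N, MPI_DOUBLE, target_rank, 0, N, MPI_DOUBLE, MPI_SUM, win);
        MPI_Win_unlock(target_rank, win);

        MPI_Barrier(MPI_COMM_WORLD);  /* ensure all updates landed before reading */
        if (rank == 0) printf("slab[0] = %f (expected: sum of ranks 0..%d)\n", slab[0], size-1);

        MPI_Win_free(&win);
        MPI_Finalize();
        return 0;
    }

MPI_LOCK_SHARED lets several ranks accumulate concurrently; MPI guarantees element-wise atomicity of MPI_Accumulate with the same operation, which is why it fits this reduction better than MPI_Put.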
diff --git a/w-stacking.cu b/w-stacking.cu
index 499189619802f8e94c7a41cd49caa46c8afb7093..4fe36254789fff354e28fb887a5d4bb6c3b37704 100644
--- a/w-stacking.cu
+++ b/w-stacking.cu
@@ -6,6 +6,9 @@
 #include <stdlib.h>
 #include <stdio.h>
 
+#ifdef ACCOMP
+#pragma omp declare target
+#endif
 #ifdef __CUDACC__
 double __device__
 #else
@@ -17,6 +20,9 @@ gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist)
 	conv_weight = norm * exp(-((u_dist*u_dist)+(v_dist*v_dist))*std22);
 	return conv_weight;
 }
+#ifdef ACCOMP
+#pragma omp end declare target
+#endif
 
 #ifdef __CUDACC__
 //double __device__ gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist)
@@ -90,7 +96,7 @@ __global__ void convolve_g(
 	double add_term_img = 0.0;
 	long ifine = visindex;
 	for (long ifreq=0; ifreq<freq_per_chan; ifreq++)
-	{
+	{
 		long iweight = visindex/freq_per_chan;
 		for (long ipol=0; ipol<polarizations; ipol++)
 		{
@@ -111,7 +117,9 @@ __global__ void convolve_g(
 		}
 	}
 #endif
-
+#ifdef ACCOMP
+#pragma omp declare target
+#endif
 void wstack(
 	int num_w_planes,
 	long num_points,
@@ -216,7 +224,14 @@ void wstack(
 #ifdef _OPENMP
 	omp_set_num_threads(num_threads);
 #endif
-	#pragma omp parallel for private(visindex)
+
+#ifdef ACCOMP
+	long Nvis = num_points*freq_per_chan*polarizations;
+	// #pragma omp target data map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan])
+	// #pragma omp target teams distribute parallel for map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan]) map(tofrom: grid[0:2*num_w_planes*grid_size_x*grid_size_y])
+#else
+	#pragma omp parallel for private(visindex)
+#endif
 	for (i = 0; i < num_points; i++)
 	{
 #ifdef _OPENMP
@@ -258,6 +273,7 @@ void wstack(
 					double u_dist = (double)j+0.5 - pos_u;
 					long iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y);
 
+
 					double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist);
 					// Loops over frequencies and polarizations
 					double add_term_real = 0.0;
@@ -265,7 +281,7 @@ void wstack(
 					long ifine = visindex;
 					// DAV: the following two loops are performend by each thread separately: no problems of race conditions
 					for (long ifreq=0; ifreq<freq_per_chan; ifreq++)
-					{
+					{
 						long iweight = visindex/freq_per_chan;
 						for (long ipol=0; ipol<polarizations; ipol++)
 						{
@@ -287,18 +303,20 @@ void wstack(
 						grid[iKer+1] += add_term_img;
 					}
 				}
-
+
 	}
 	// End switch between CUDA and CPU versions
 #endif
 	//for (int i=0; i<100000; i++)printf("%f\n",grid[i]);
 }
+#ifdef ACCOMP
+#pragma omp end declare target
+#endif
 
 int test(int nnn)
 {
 	int mmm;
-
+
 	mmm = nnn+1;
 	return mmm;
-}
-
+}
diff --git a/w-stacking.h b/w-stacking.h
index 35e1495ea90864c6dbc0120b3ba063f28afef583..7de9b80414742d64f91a82642770878aaac1dbf7 100644
--- a/w-stacking.h
+++ b/w-stacking.h
@@ -8,6 +8,7 @@
 #ifdef __CUDACC__
 extern "C"
 #endif
+
 void wstack(
 	int,
 	long,
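In the w-stacking.cu hunks above, gauss_kernel_norm() is bracketed with declare target so that, when ACCOMP is defined, the compiler also emits a device-callable version of the function for the OpenMP offload path. A minimal standalone sketch of that pattern (the function and variable names here are made up for illustration):

    /* declare_target_demo.c -- build with an OpenMP-offload-capable compiler */
    #include <stdio.h>

    #pragma omp declare target
    /* device-callable helper, same role as gauss_kernel_norm() */
    static double square(double x) { return x * x; }
    #pragma omp end declare target

    int main(void)
    {
        double out = 0.0;
        /* the target region may execute on the GPU; square() is callable
           there because it was compiled under declare target */
        #pragma omp target map(tofrom: out)
        out = square(3.0);
        printf("out = %f\n", out);
        return 0;
    }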
diff --git a/w-stacking_omp.c b/w-stacking_omp.c
new file mode 100644
index 0000000000000000000000000000000000000000..b848bc5c117b037d070159a58242f08019cc39e1
--- /dev/null
+++ b/w-stacking_omp.c
@@ -0,0 +1,247 @@
+#include <omp.h>
+#include "w-stacking_omp.h"
+#include <math.h>
+#include <stdlib.h>
+#include <stdio.h>
+#ifdef NVIDIA
+#include <cuda_runtime.h>
+#endif
+
+#ifdef ACCOMP
+#pragma omp declare target
+#endif
+double gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist)
+{
+	double conv_weight;
+	conv_weight = norm * exp(-((u_dist*u_dist)+(v_dist*v_dist))*std22);
+	return conv_weight;
+}
+#ifdef ACCOMP
+#pragma omp end declare target
+#endif
+
+
+
+void wstack(
+	int num_w_planes,
+	long num_points,
+	long freq_per_chan,
+	long polarizations,
+	double* uu,
+	double* vv,
+	double* ww,
+	float* vis_real,
+	float* vis_img,
+	float* weight,
+	double dx,
+	double dw,
+	int w_support,
+	int grid_size_x,
+	int grid_size_y,
+	double* grid,
+	int num_threads)
+{
+	//long index;
+	long visindex;
+
+	// initialize the convolution kernel
+	// gaussian:
+	int KernelLen = (w_support-1)/2;
+	double std = 1.0;
+	double std22 = 1.0/(2.0*std*std);
+	double norm = std22/PI;
+
+	// Loop over visibilities.
+#ifdef _OPENMP
+	omp_set_num_threads(num_threads);
+#endif
+
+#ifdef ACCOMP
+	long Nvis = num_points*freq_per_chan*polarizations;
+	long gpu_weight_dim = Nvis/freq_per_chan;
+	long gpu_grid_dim = 2*num_w_planes*grid_size_x*grid_size_y;
+#pragma omp target teams distribute parallel for private(visindex) \
+map(to:num_points, KernelLen, std, std22, norm, num_w_planes, \
+    uu[0:num_points], vv[0:num_points], ww[0:num_points], \
+    vis_real[0:Nvis], vis_img[0:Nvis], weight[0:gpu_weight_dim], \
+    grid_size_x, grid_size_y, freq_per_chan, polarizations, dx,dw, w_support, num_threads) \
+map(tofrom: grid[0:gpu_grid_dim])
+#endif
+	for (long i = 0; i < num_points; i++)
+	{
+#ifdef _OPENMP
+		//int tid;
+		//tid = omp_get_thread_num();
+		//printf("%d\n",tid);
+#endif
+
+		visindex = i*freq_per_chan*polarizations;
+
+		double sum = 0.0;
+		int j, k;
+		//if (i%1000 == 0)printf("%ld\n",i);
+
+		/* Convert UV coordinates to grid coordinates. */
+		double pos_u = uu[i] / dx;
+		double pos_v = vv[i] / dx;
+		double ww_i = ww[i] / dw;
+		int grid_w = (int)ww_i;
+		int grid_u = (int)pos_u;
+		int grid_v = (int)pos_v;
+
+		// check the boundaries
+		long jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0;
+		long jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1;
+		long kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0;
+		long kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1;
+		//printf("%d, %ld, %ld, %d, %ld, %ld\n",grid_u,jmin,jmax,grid_v,kmin,kmax);
+
+
+		// Convolve this point onto the grid.
+		for (k = kmin; k <= kmax; k++)
+		{
+
+			double v_dist = (double)k+0.5 - pos_v;
+
+			for (j = jmin; j <= jmax; j++)
+			{
+				double u_dist = (double)j+0.5 - pos_u;
+				long iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y);
+
+
+				double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist);
+				// Loops over frequencies and polarizations
+				double add_term_real = 0.0;
+				double add_term_img = 0.0;
+				long ifine = visindex;
+				// DAV: the following two loops are performed by each thread separately: no problems of race conditions
+				for (long ifreq=0; ifreq<freq_per_chan; ifreq++)
+				{
+					long iweight = visindex/freq_per_chan;
+					for (long ipol=0; ipol<polarizations; ipol++)
+					{
+						if (!isnan(vis_real[ifine]))
+						{
+							//printf("%f %ld\n",weight[iweight],iweight);
+							add_term_real += weight[iweight] * vis_real[ifine] * conv_weight;
+							add_term_img += weight[iweight] * vis_img[ifine] * conv_weight;
+							//if(vis_img[ifine]>1e10 || vis_img[ifine]<-1e10)printf("%f %f %f %f %ld %ld\n",vis_real[ifine],vis_img[ifine],weight[iweight],conv_weight,ifine,num_points*freq_per_chan*polarizations);
+						}
+						ifine++;
+						iweight++;
+					}
+				}
+				// DAV: this is the critical call in terms of correctness of the results and of performance
+				#pragma omp atomic
+				grid[iKer] += add_term_real;
+				#pragma omp atomic
+				grid[iKer+1] += add_term_img;
+			}
+		}
+
+	}
+
+	//for (int i=0; i<100000; i++)printf("%f\n",grid[i]);
+}
+
+
+#ifdef NVIDIA
+#define CUDAErrorCheck(funcall) \
+do { \
+	cudaError_t ierr = funcall; \
+	if (cudaSuccess != ierr) { \
+		fprintf(stderr, "%s(line %d) : CUDA RT API error : %s(%d) -> %s\n", \
+		        __FILE__, __LINE__, #funcall, ierr, cudaGetErrorString(ierr)); \
+		exit(ierr); \
+	} \
+} while (0)
+
+static inline int _corePerSM(int major, int minor)
+/**
+ * @brief Give the number of CUDA cores per streaming multiprocessor (SM).
+ *
+ * The number of CUDA cores per SM is determined by the compute capability.
+ *
+ * @param major Major revision number of the compute capability.
+ * @param minor Minor revision number of the compute capability.
+ *
+ * @return The number of CUDA cores per SM.
+ */
+{
+	if (1 == major) {
+		if (0 == minor || 1 == minor || 2 == minor || 3 == minor) return 8;
+	}
+	if (2 == major) {
+		if (0 == minor) return 32;
+		if (1 == minor) return 48;
+	}
+	if (3 == major) {
+		if (0 == minor || 5 == minor || 7 == minor) return 192;
+	}
+	if (5 == major) {
+		if (0 == minor || 2 == minor) return 128;
+	}
+	if (6 == major) {
+		if (0 == minor) return 64;
+		if (1 == minor || 2 == minor) return 128;
+	}
+	if (7 == major) {
+		if (0 == minor || 2 == minor || 5 == minor) return 64;
+	}
+	return -1;
+}
+
+void getGPUInfo(int iaccel)
+{
+	int corePerSM;
+
+	struct cudaDeviceProp dev;
+
+	CUDAErrorCheck(cudaSetDevice(iaccel));
+	CUDAErrorCheck(cudaGetDeviceProperties(&dev, iaccel));
+	corePerSM = _corePerSM(dev.major, dev.minor);
+
+	printf("\n");
+	printf("============================================================\n");
+	printf("CUDA Device name : \"%s\"\n", dev.name);
+	printf("------------------------------------------------------------\n");
+	printf("Comp. Capability : %d.%d\n", dev.major, dev.minor);
+	printf("max clock rate : %.0f MHz\n", dev.clockRate * 1.e-3f);
+	printf("number of SMs : %d\n", dev.multiProcessorCount);
+	printf("cores / SM : %d\n", corePerSM);
+	printf("# of CUDA cores : %d\n", corePerSM * dev.multiProcessorCount);
+	printf("------------------------------------------------------------\n");
+	printf("global memory : %5.0f MBytes\n", dev.totalGlobalMem / 1048576.0f);
+	printf("shared mem. / SM : %5.1f KBytes\n", dev.sharedMemPerMultiprocessor / 1024.0f);
+	printf("32-bit reg. / SM : %d\n", dev.regsPerMultiprocessor);
+	printf("------------------------------------------------------------\n");
+	printf("max # of threads / SM : %d\n", dev.maxThreadsPerMultiProcessor);
+	printf("max # of threads / block : %d\n", dev.maxThreadsPerBlock);
+	printf("max dim. of block : (%d, %d, %d)\n",
+	       dev.maxThreadsDim[0], dev.maxThreadsDim[1], dev.maxThreadsDim[2]);
+	printf("max dim. of grid : (%d, %d, %d)\n",
+	       dev.maxGridSize[0], dev.maxGridSize[1], dev.maxGridSize[2]);
+	printf("warp size : %d\n", dev.warpSize);
+	printf("============================================================\n");
+
+	int z = 0, x = 2;
+	#pragma omp target map(to:x) map(tofrom:z)
+	{
+		z=x+100;
+	}
+}
+
+#endif
+
+
+
+
+
+
+int test(int nnn)
+{
+	int mmm;
+
+	mmm = nnn+1;
+	return mmm;
+}
diff --git a/w-stacking_omp.h b/w-stacking_omp.h
new file mode 100644
index 0000000000000000000000000000000000000000..b655ffe1ecc37a742a175c61754104cb5150bec8
--- /dev/null
+++ b/w-stacking_omp.h
@@ -0,0 +1,82 @@
+#ifndef W_PROJECT_H_
+#define W_PROJECT_H_
+
+#define NWORKERS -1 //100
+#define NTHREADS 32
+#define PI 3.14159265359
+#define REAL_TYPE double
+
+void wstack(
+	int,
+	long,
+	long,
+	long,
+	double*,
+	double*,
+	double*,
+	float*,
+	float*,
+	float*,
+	double,
+	double,
+	int,
+	int,
+	int,
+	double*,
+	int);
+
+int test(int nnn);
+
+double gauss_kernel_norm(
+	double norm,
+	double std22,
+	double u_dist,
+	double v_dist);
+
+
+
+
+void phase_correction(
+	double*,
+	double*,
+	double*,
+	int,
+	int,
+	int,
+	int,
+	int);
+
+#ifdef ACCOMP
+#ifdef NVIDIA
+void getGPUInfo(int);
+#endif
+#pragma omp declare target (gauss_kernel_norm)
+#endif
+
+#ifdef NVIDIA
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef PRTACCELINFO_H
+#define PRTACCELINFO_H
+
+void prtAccelInfo(int iaccel);
+/**<
+ * @brief Print some basic info of an accelerator.
+ *
+ * Strictly speaking, \c prtAccelInfo() can only print the basic info of an
+ * Nvidia CUDA device.
+ *
+ * @param iaccel The index of an accelerator.
+ *
+ * @return \c void.
+ */
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif
+
+#endif // W_PROJECT_H_
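For orientation, a minimal sketch of a call site for the wstack() declared above. The toy sizes, the placeholder data, and the conventions dx = 1/grid_size_x and dw = 1/num_w_planes are assumptions of this example (chosen so grid indices stay in range), not values taken from the repository:

    /* toy_wstack_driver.c -- hypothetical example, not part of the repository */
    #include <stdio.h>
    #include <stdlib.h>
    #include "w-stacking_omp.h"

    int main(void)
    {
        int  num_w_planes = 2, w_support = 7, nx = 64, ny = 64, nthreads = 4;
        long npts = 1, freq_per_chan = 1, pols = 1;
        long nvis = npts*freq_per_chan*pols;

        double *uu = (double*) calloc(npts, sizeof(double));
        double *vv = (double*) calloc(npts, sizeof(double));
        double *ww = (double*) calloc(npts, sizeof(double));
        float  *vr = (float*)  calloc(nvis, sizeof(float));
        float  *vi = (float*)  calloc(nvis, sizeof(float));
        float  *wt = (float*)  calloc(nvis/freq_per_chan, sizeof(float));
        /* one real+imaginary pair per (w-plane, u, v) cell, matching the
           map clause grid[0:2*num_w_planes*grid_size_x*grid_size_y] */
        double *grid = (double*) calloc(2L*num_w_planes*nx*ny, sizeof(double));

        uu[0] = vv[0] = 0.5;  ww[0] = 0.1;  /* one visibility near the grid centre */
        vr[0] = 1.0f;  wt[0] = 1.0f;

        wstack(num_w_planes, npts, freq_per_chan, pols, uu, vv, ww,
               vr, vi, wt, 1.0/nx, 1.0/num_w_planes, w_support, nx, ny,
               grid, nthreads);

        double total = 0.0;
        for (long i = 0; i < 2L*num_w_planes*nx*ny; i++) total += grid[i];
        printf("sum over grid = %f\n", total);

        free(uu); free(vv); free(ww); free(vr); free(vi); free(wt); free(grid);
        return 0;
    }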