Commit 668da3ac authored by Giuliano Taffoni

New implementation compatible with gcc and GPU info

parent fa112e57
Branch: openmp
@@ -4,29 +4,31 @@ CXX = g++-10
 MPICC = mpicc
 MPIC++ = mpiCC
-OPTIMIZE = -O3
-#-ffast-math -fopt-info-all-omp -fcf-protection=none -fno-stack-protector -foffload=nvptx-none
-GSL_INCL = -I/home/taffoni/sw/include
-GSL_LIBS = -L/home/taffoni/sw/lib
+GSL_INCL = -I/home/taffoni/sw/include
+GSL_LIBS = -L/home/taffoni/sw/lib
+#-I/opt/cluster/openmpi/3.1.3/gnu/8.2.0/include
+#-L/opt/cluster/openmpi/3.1.3/gnu/8.2.0/lib -lmpi
 FFTW_INCL= -I/home/taffoni/sw/include
 FFTW_LIB= -L/home/taffoni/sw/lib -lfftw3_mpi -lfftw3
-#-L/opt/cluster/openmpi/3.1.3/gnu/8.2.0/lib -lmpi
 MPI_LIB =
-MPI_INCL=
+#-I/opt/cluster/openmpi/3.1.3/gnu/8.2.0/include
+MPI_INCL= -I/home/taffoni/sw/Linux_x86_64/21.5/comm_libs/mpi/include
 HDF5_INCL =
 HDF5_LIB =
-OMP = -fopenmp
+OMP = -mp=multicore,gpu -Mprof -cuda
+#OMP = -fopenmp
 NVCC = nvcc
 NVFLAGS = -arch=sm_70 -Xcompiler -std=c++11
 NVLIB = -L/home/taffoni/sw/Linux_x86_64/21.5/cuda/11.3/lib64/ -lcudart -lcuda
-CFLAGS += $(OPTIMIZE)
-CFLAGS += -I.
-CFLAGS += -I/home/taffoni/sw/Linux_x86_64/21.5/comm_libs/mpi/include
-CFLAGS += $(FFTW_INCL) $(GSL_INCL)
-CFLAGS += $(FFTW_LIB) -lm
-#-ffast-math -fopt-info-all-omp -foffload=-misa=sm_35 -fcf-protection=none -fno-stack-protector -foffload=nvptx-none
+CFLAGS += -I. $(FFTW_INCL) $(GSL_INCL) $(MPI_INCL)
+OPTIMIZE = $(OMP) -O3
+# OMP GPU SPECIFIC FLAGS
+#OPTIMIZE += -Wno-unused-result -foffload=-lm -ffast-math
+#OPTIMIZE += -fcf-protection=none -fno-stack-protector -foffload=nvptx-none -foffload=-misa=sm_35
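
For reference: `OMP = -mp=multicore,gpu -Mprof -cuda` selects the NVHPC OpenMP offload path, while the commented `#OMP = -fopenmp` line preserves the gcc build. A minimal offload smoke test that either toolchain should accept (an illustrative standalone file, not part of this commit; build with `nvc -mp=gpu` or `gcc -fopenmp -foffload=nvptx-none`):

    /* offload_check.c -- illustrative smoke test, not in the repository */
    #include <stdio.h>
    #include <omp.h>

    int main(void)
    {
        int on_host = 1;
        /* runs on the device if one is available */
        #pragma omp target map(from:on_host)
        {
            on_host = omp_is_initial_device();
        }
        printf("devices: %d, target region ran on %s\n",
               omp_get_num_devices(), on_host ? "host" : "GPU");
        return 0;
    }
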
@@ -5,16 +5,16 @@ MPICC = mpicc
 MPIC++ = mpiCC
-CFLAGS += -O3 -mcpu=native
-CFLAGS += -I.
 FFTW_INCL= -I/home/taffoni/sw/include
 FFTW_LIB= -L/home/taffoni/sw/lib
+LIBS = $(FFTW_LIB) -lfftw3_mpi -lfftw3 -lm
 NVCC = nvcc
 NVFLAGS = -arch=sm_70 -Xcompiler -mno-float128 -std=c++11
 NVLIB = -L/cineca/prod/opt/compilers/cuda/10.1/none/lib64/ -lcudart -lcuda
-CFLAGS += -O3 -mtune=native
+OMP= -fopenmp
+CFLAGS += -I. $(FFTW_INCL) $(GSL_INCL) $(MPI_INCL)
+OPTIMIZE = $(OMP) -O3 -mtune=native
@@ -14,21 +14,22 @@ include Build/Makefile.systype
 endif
+LIBS = $(FFTW_LIB) -lfftw3 -lm -lcudart -lcuda
 # create MPI code
 OPT += -DUSE_MPI
+#OPT += -DACCOMP
 # use FFTW (it can be switched on ONLY if MPI is active)
 ifeq (USE_MPI,$(findstring USE_MPI,$(OPT)))
 OPT += -DUSE_FFTW
+LIBS = $(FFTW_LIB) -lfftw3_mpi -lfftw3 -lm
 endif
+#OPT += -DNVIDIA
 # perform one-side communication (suggested) instead of reduce (only if MPI is active)
 OPT += -DONE_SIDE
 # write the full 3D cube of gridded visibilities and its FFT transform
-#OPT += -DWRITE_DATA
+OPT += -DWRITE_DATA
 # write the final image
 OPT += -DWRITE_IMAGE
 # perform w-stacking phase correction
@@ -46,27 +47,36 @@ phase_correction.c: phase_correction.cu
 ifeq (USE_MPI,$(findstring USE_MPI,$(OPT)))
 %.o: %.c $(DEPS)
-	$(MPICC) -c -o $@ $< $(CFLAGS) $(OPT)
+	$(MPICC) $(OPTIMIZE) $(OPT) -c -o $@ $< $(CFLAGS)
 else
 %.o: %.c $(DEPS)
-	$(CC) $(OMP) -c -o $@ $< $(CFLAGS) $(OPT)
+	$(CC) $(OPTIMIZE) $(OPT) -c -o $@ $< $(CFLAGS)
 endif
 serial: $(COBJ)
-	$(CC) -o w-stackingCfftw_serial $(CFLAGS) $^ -lm
+	$(CC) $(OPTIMIZE) $(OPT) -o w-stackingCfftw_serial $^ $(LIBS)
+serial_omp: phase_correction.c
+	$(CC) $(OPTIMIZE) $(OPT) -o w-stackingOMP_serial w-stacking-fftw.c w-stacking_omp.c $(CFLAGS) $(LIBS)
+simple_mpi: phase_correction.c
+	$(MPICC) $(OPTIMIZE) $(OPT) -o w-stackingMPI_simple w-stacking_omp.c w-stacking-fftw.c phase_correction.c $(CFLAGS) $(LIBS)
+mpi_omp: phase_correction.c
+	$(MPICC) $(OPTIMIZE) $(OPT) -o w-stackingMPI_omp w-stacking_omp.c w-stacking-fftw.c phase_correction.c $(CFLAGS) $(LIBS)
 serial_cuda:
 	$(NVCC) $(NVFLAGS) -c w-stacking.cu phase_correction.cu $(NVLIB)
-	$(CC) $(CFLAGS) $(OPT) -c w-stacking-fftw.c
-	$(CXX) $(CFLAGS) $(OPT) -o w-stackingfftw_serial w-stacking-fftw.o w-stacking.o phase_correction.o $(NVLIB) -lm
+	$(CC) $(OPTIMIZE) $(OPT) -c w-stacking-fftw.c $(CFLAGS) $(LIBS)
+	$(CXX) $(OPTIMIZE) $(OPT) -o w-stackingfftw_serial w-stacking-fftw.o w-stacking.o phase_correction.o $(CFLAGS) $(NVLIB) -lm
 mpi: $(COBJ)
-	$(MPICC) -o w-stackingCfftw $^ $(CFLAGS)
+	$(MPICC) $(OPTIMIZE) -o w-stackingCfftw $^ $(CFLAGS) $(LIBS)
 mpi_cuda:
 	$(NVCC) $(NVFLAGS) -c w-stacking.cu phase_correction.cu $(NVLIB)
-	$(MPICC) $(CFLAGS) $(OPT) -c w-stacking-fftw.c
-	$(MPIC++) $(OPT) -o w-stackingfftw w-stacking-fftw.o w-stacking.o phase_correction.o $(NVLIB) $(CFLAGS)
+	$(MPICC) $(OPTIMIZE) $(OPT) -c w-stacking-fftw.c $(CFLAGS) $(LIBS)
+	$(MPIC++) $(OPTIMIZE) $(OPT) -o w-stackingfftw w-stacking-fftw.o w-stacking.o phase_correction.o $(NVLIB) $(CFLAGS) $(LIBS)
 clean:
 	rm *.o
@@ -18,7 +18,11 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in
 for (int iw=0; iw<num_w_planes; iw++)
 {
     double wterm = (double)iw/dnum_w_planes;
+#ifdef ACCOMP
+#pragma omp target teams distribute parallel for \
+    map(tofrom: image_real[0:xaxis*yaxis], image_imag[0:xaxis*yaxis]) \
+    map(to: gridss[0:2*num_w_planes*xaxis*yaxis])
+#endif
     for (int iv=0; iv<yaxis; iv++)
     for (int iu=0; iu<xaxis; iu++)
     {
@@ -53,10 +57,14 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in
         s = pimag;
         //printf("%d %d %d %ld %ld\n",iu,iv,iw,index,img_index);
+#pragma omp atomic
         image_real[img_index] += p*r-q*s;
+#pragma omp atomic
         image_imag[img_index] += p*s+q*r;
 #else
+#pragma omp atomic
         image_real[img_index] += gridss[index];
+#pragma omp atomic
         image_imag[img_index] += gridss[index+1];
 #endif
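
The ACCOMP branch added above offloads the doubly nested (iv,iu) loop with `target teams distribute parallel for` and protects the image accumulations with `omp atomic`, since concurrent iterations may update the same image cell. A self-contained sketch of the same pattern, with illustrative names and sizes:

    /* Sketch of the offload-plus-atomic accumulation pattern. */
    #include <stdio.h>
    #define N 1024

    int main(void)
    {
        double img[N] = {0.0};
        double contrib[4 * N];
        for (int i = 0; i < 4 * N; i++) contrib[i] = 1.0;

        /* several iterations hit the same img cell: atomics keep
           the device-side accumulation race-free */
        #pragma omp target teams distribute parallel for \
                map(tofrom: img[0:N]) map(to: contrib[0:4*N])
        for (int i = 0; i < 4 * N; i++) {
            #pragma omp atomic
            img[i % N] += contrib[i];
        }

        printf("img[0] = %f (expected 4.0)\n", img[0]);
        return 0;
    }
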
@@ -7,9 +7,15 @@
 #include <fftw3-mpi.h>
 #endif
 #endif
+#include <omp.h>
 #include <math.h>
 #include <time.h>
+#include <unistd.h>
+#ifdef ACCOMP
+#include "w-stacking_omp.h"
+#else
 #include "w-stacking.h"
+#endif
 #define PI 3.14159265359
 #define NUM_OF_SECTORS -1
 #define MIN(X, Y) (((X) < (Y)) ? (X) : (Y))
@@ -100,6 +106,8 @@ int main(int argc, char * argv[])
 int xaxis;
 int yaxis;
 int num_w_planes = 1;
 // DAV: the corresponding KernelLen is calculated within the wstack function. It can be anyway hardcoded for optimization
 int w_support = 7;
 int num_threads;// = 4;
@@ -115,11 +123,23 @@ int main(int argc, char * argv[])
 struct timespec begin, finish, begin0, begink, finishk;
 double elapsed;
 long nsectors;
+/* GT: get the number of threads; exit if not given */
+if(argc == 1) {
+    fprintf(stderr, "Usage: %s number_of_OMP_Threads \n", argv[0]);
+    exit(1);
+}
+// Set the number of OpenMP threads
+num_threads = atoi(argv[1]);
+if ( num_threads == 0 )
+{
+    fprintf(stderr, "Wrong parameter: %s\n\n", argv[1]);
+    fprintf(stderr, "Usage: %s number_of_OMP_Threads \n", argv[0]);
+    exit(1);
+}
 clock_gettime(CLOCK_MONOTONIC, &begin0);
 start0 = clock();
-// Set the number of OpenMP threads
-num_threads = atoi(argv[1]);
 // Initialize MPI environment
 #ifdef USE_MPI
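
The new check catches a missing argument and the 0 that `atoi` returns on garbage. A stricter variant would use `strtol`; the sketch below is one way to harden this, not what the commit does:

    /* Illustrative stricter parsing of the thread-count argument. */
    #include <errno.h>
    #include <stdlib.h>

    static int parse_threads(const char *s)
    {
        char *end;
        errno = 0;
        long v = strtol(s, &end, 10);
        /* reject trailing junk, overflow and non-positive counts */
        if (errno != 0 || *end != '\0' || v <= 0 || v > 65536) return -1;
        return (int)v;
    }
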
@@ -134,8 +154,22 @@ int main(int argc, char * argv[])
 rank = 0;
 size = 1;
 #endif
 if(rank == 0)printf("Running with %d threads\n",num_threads);
+#ifdef ACCOMP
+if(rank == 0){
+    if (0 == omp_get_num_devices()) {
+        printf("No accelerator found ... exit\n");
+        exit(255);
+    }
+    printf("Number of available GPUs %d\n", omp_get_num_devices());
+#ifdef NVIDIA
+    prtAccelInfo();
+#endif
+}
+#endif
 // set the local size of the image
 local_grid_size_x = grid_size_x;
 nsectors = NUM_OF_SECTORS;
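
The block above only verifies that at least one accelerator is visible. On multi-GPU nodes a common companion pattern, not present in this commit, is to bind each MPI rank to its own device:

    /* Sketch: round-robin GPU binding per MPI rank (illustrative). */
    #include <omp.h>

    void bind_device(int rank)
    {
        int ndev = omp_get_num_devices();
        if (ndev > 0)
            omp_set_default_device(rank % ndev);
    }
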
@@ -211,6 +245,7 @@ int main(int argc, char * argv[])
 printf("N. visibilities on %d %ld\n",rank,Nvis);
 #endif
 // DAV: all these arrays can be allocated statically for the sake of optimization. However be careful that if MPI is used
 // all the sizes are rescaled by the number of MPI tasks
 // Allocate arrays
@@ -260,13 +295,13 @@ int main(int argc, char * argv[])
 setup_time1 = (finish.tv_sec - begin.tv_sec);
 setup_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
 if(rank == 0)printf("GRIDDING DATA\n");
 // Create histograms and linked lists
 clock_gettime(CLOCK_MONOTONIC, &begin);
 start = clock();
-//CLAAA
 // Initialize linked list
 struct sectorlist ** sectorhead;
 sectorhead = (struct sectorlist **) malloc((nsectors+1) * sizeof(struct sectorlist));
@@ -335,20 +370,18 @@ int main(int argc, char * argv[])
 #ifndef USE_MPI
 double * gridtot = (double*) calloc(2*grid_size_x*grid_size_y*num_w_planes,sizeof(double));
 #endif
 double shift = (double)(dx*yaxis);
 // Open the MPI Memory Window for the slab
 #ifdef USE_MPI
 MPI_Win slabwin;
 MPI_Win_create(grid, size_of_grid*sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &slabwin);
 MPI_Win_fence(0,slabwin);
 #endif
 #ifndef USE_MPI
 pFile1 = fopen (outfile1,"w");
 #endif
 // loop over files
 //
 kernel_time = 0.0;
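
`slabwin` is the window used by the `-DONE_SIDE` option, which accumulates gridded visibilities directly into remote slabs instead of reducing them. A generic sketch of that MPI one-sided idiom (function and argument names are illustrative):

    /* Sketch: one-sided accumulation into a remote slab. */
    #include <mpi.h>

    void accumulate_slab(MPI_Win win, double *local, int count, int target_rank)
    {
        MPI_Win_fence(0, win);
        /* sum 'local' into the target's window at displacement 0 */
        MPI_Accumulate(local, count, MPI_DOUBLE,
                       target_rank, 0, count, MPI_DOUBLE,
                       MPI_SUM, win);
        MPI_Win_fence(0, win);
    }
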
@@ -357,9 +390,9 @@ int main(int argc, char * argv[])
 reduce_time1 = 0.0;
 compose_time = 0.0;
 compose_time1 = 0.0;
 for (int ifiles=0; ifiles<ndatasets; ifiles++)
 {
     strcpy(filename,datapath_multi[ifiles]);
     printf("Processing %s, %d of %d\n",filename,ifiles+1,ndatasets);
     strcat(filename,weightsfile);
@@ -368,20 +401,21 @@ int main(int argc, char * argv[])
     fseek (pFile,startrow*polarisations*sizeof(float),SEEK_SET);
     fread(weights,(Nweights)*sizeof(float),1,pFile);
     fclose(pFile);
     strcpy(filename,datapath);
     strcat(filename,visrealfile);
-    //printf("Reading %s\n",filename);
+#ifdef VERBOSE
+    printf("Reading %s\n",filename);
+#endif
     pFile = fopen (filename,"rb");
     fseek (pFile,startrow*freq_per_chan*polarisations*sizeof(float),SEEK_SET);
     fread(visreal,Nvis*sizeof(float),1,pFile);
     fclose(pFile);
     strcpy(filename,datapath);
     strcat(filename,visimgfile);
-    //printf("Reading %s\n",filename);
+#ifdef VERBOSE
+    printf("Reading %s\n",filename);
+#endif
     pFile = fopen (filename,"rb");
     fseek (pFile,startrow*freq_per_chan*polarisations*sizeof(float),SEEK_SET);
     fread(visimg,Nvis*sizeof(float),1,pFile);
@@ -390,7 +424,6 @@ int main(int argc, char * argv[])
 #ifdef USE_MPI
     MPI_Barrier(MPI_COMM_WORLD);
 #endif
     // Declare temporary arrays for the masking
     double * uus;
     double * vvs;
@@ -398,8 +431,8 @@ int main(int argc, char * argv[])
     float * visreals;
     float * visimgs;
     float * weightss;
     long isector;
     for (long isector_count=0; isector_count<nsectors; isector_count++)
     {
         clock_gettime(CLOCK_MONOTONIC, &begink);
@@ -425,13 +458,15 @@ int main(int argc, char * argv[])
         //CLAAAA
         struct sectorlist * current;
         current = sectorhead[isector];
         while (current->index != -1)
         {
             long ilocal = current->index;
             //double vvh = vv[ilocal];
             //int binphi = (int)(vvh*nsectors);
-            //if (binphi == isector || boundary[ilocal] == isector)
-            //{
+            //if (binphi == isector || boundary[ilocal] == isector) {
             uus[icount] = uu[ilocal];
             vvs[icount] = vv[ilocal]-isector*shift;
             wws[icount] = ww[ilocal];
@@ -457,6 +492,7 @@ int main(int argc, char * argv[])
         compose_time += ((double) (endk - startk)) / CLOCKS_PER_SEC;
         compose_time1 += (finishk.tv_sec - begink.tv_sec);
         compose_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0;
 #ifndef USE_MPI
         double uumin = 1e20;
         double vvmin = 1e20;
@@ -483,6 +519,7 @@ int main(int argc, char * argv[])
 #endif
         clock_gettime(CLOCK_MONOTONIC, &begink);
         startk = clock();
         wstack(num_w_planes,
                Nsec,
                freq_per_chan,
@@ -500,6 +537,18 @@ int main(int argc, char * argv[])
                yaxis,
                gridss,
                num_threads);
+        /* int z = 0;
+        #pragma omp target map(to:test_i_gpu) map(from:z)
+        {
+            int x; // only accessible from accelerator
+            x = 2;
+            z = x + test_i_gpu;
+        } */
         clock_gettime(CLOCK_MONOTONIC, &finishk);
         endk = clock();
         kernel_time += ((double) (endk - startk)) / CLOCKS_PER_SEC;
@@ -6,6 +6,9 @@
 #include <stdlib.h>
 #include <stdio.h>
+#ifdef ACCOMP
+#pragma omp declare target
+#endif
 #ifdef __CUDACC__
 double __device__
 #else
@@ -17,6 +20,9 @@ gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist)
     conv_weight = norm * exp(-((u_dist*u_dist)+(v_dist*v_dist))*std22);
     return conv_weight;
 }
+#ifdef ACCOMP
+#pragma omp end declare target
+#endif
 #ifdef __CUDACC__
 //double __device__ gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist)
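
Wrapping `gauss_kernel_norm` in `declare target` compiles the helper for the device as well, so the offloaded loops can call it. The general shape, as a stand-alone sketch:

    /* Sketch: a helper compiled for both host and device. */
    #include <math.h>

    #pragma omp declare target
    static double device_callable(double x)
    {
        return exp(-x * x);   /* callable inside target regions */
    }
    #pragma omp end declare target
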
@@ -111,7 +117,9 @@ __global__ void convolve_g(
     }
 }
 #endif
+#ifdef ACCOMP
+#pragma omp declare target
+#endif
 void wstack(
     int num_w_planes,
     long num_points,
@@ -216,7 +224,14 @@ void wstack(
 #ifdef _OPENMP
     omp_set_num_threads(num_threads);
 #endif
+#ifdef ACCOMP
+    long Nvis = num_points*freq_per_chan*polarizations;
+// #pragma omp target data map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan])
+// #pragma omp target teams distribute parallel for map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan]) map(tofrom: grid[0:2*num_w_planes*grid_size_x*grid_size_y])
+#else
 #pragma omp parallel for private(visindex)
+#endif
     for (i = 0; i < num_points; i++)
     {
 #ifdef _OPENMP
@@ -258,6 +273,7 @@ void wstack(
             double u_dist = (double)j+0.5 - pos_u;
             long iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y);
             double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist);
             // Loops over frequencies and polarizations
             double add_term_real = 0.0;
@@ -293,6 +309,9 @@ void wstack(
 #endif
     //for (int i=0; i<100000; i++)printf("%f\n",grid[i]);
 }
+#ifdef ACCOMP
+#pragma omp end declare target
+#endif
 int test(int nnn)
 {
@@ -301,4 +320,3 @@ int test(int nnn)
     mmm = nnn+1;
     return mmm;
 }
@@ -8,6 +8,7 @@
 #ifdef __CUDACC__
 extern "C"
 #endif
 void wstack(
     int,
     long,
@@ -3,6 +3,9 @@
 #include <math.h>
 #include <stdlib.h>
 #include <stdio.h>
+#ifdef NVIDIA
+#include <cuda_runtime.h>
+#endif
 #ifdef ACCOMP
 #pragma omp declare target
@@ -17,9 +20,8 @@ double gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist
 #pragma omp end declare target
 #endif
-#ifdef ACCOMP
-#pragma omp declare target
-#endif
 void wstack(
     int num_w_planes,
     long num_points,
@@ -39,8 +41,7 @@ void wstack(
     double* grid,
     int num_threads)
 {
-    long i;
-    long index;
+    //long index;
     long visindex;
     // initialize the convolution kernel
@@ -57,12 +58,16 @@ void wstack(
 #ifdef ACCOMP
     long Nvis = num_points*freq_per_chan*polarizations;
-// #pragma omp target data map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan])
-#pragma omp target teams distribute parallel for private(visindex) map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan]) map(tofrom: grid[0:2*num_w_planes*grid_size_x*grid_size_y])
-#else
-#pragma omp parallel for private(visindex)
+    long gpu_weight_dim = Nvis/freq_per_chan;
+    long gpu_grid_dim = 2*num_w_planes*grid_size_x*grid_size_y;
+#pragma omp target teams distribute parallel for private(visindex) \
+    map(to: num_points, KernelLen, std, std22, norm, num_w_planes, \
+        uu[0:num_points], vv[0:num_points], ww[0:num_points], \
+        vis_real[0:Nvis], vis_img[0:Nvis], weight[0:gpu_weight_dim], \
+        grid_size_x, grid_size_y, freq_per_chan, polarizations, dx, dw, w_support, num_threads) \
+    map(tofrom: grid[0:gpu_grid_dim])
 #endif
-    for (i = 0; i < num_points; i++)
+    for (long i = 0; i < num_points; i++)
     {
 #ifdef _OPENMP
     //int tid;
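
The rewritten pragma maps the array sections explicitly and also lists the scalars. Scalars referenced in a `target` construct are firstprivate by default (OpenMP 4.5 and later), so listing them is mostly defensive; the array sections are the essential part, because the arrays are pointers whose extents the compiler cannot see. A condensed sketch of the same clause structure, with illustrative names:

    /* Sketch: array sections vs. scalars in a target region. */
    void saxpy_offload(long n, double alpha, const double *a, double *b)
    {
        /* n and alpha would be firstprivate even if unlisted; the
           pointer-backed arrays need explicit sections */
        #pragma omp target teams distribute parallel for \
                map(to: a[0:n]) map(tofrom: b[0:n])
        for (long i = 0; i < n; i++)
            b[i] += alpha * a[i];
    }
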
@@ -135,12 +140,104 @@ void wstack(
         }
     }
     //for (int i=0; i<100000; i++)printf("%f\n",grid[i]);
 }
-#ifdef ACCOMP
-#pragma omp end declare target
+#ifdef NVIDIA
+#define CUDAErrorCheck(funcall) \
+do { \
+    cudaError_t ierr = funcall; \
+    if (cudaSuccess != ierr) { \
+        fprintf(stderr, "%s(line %d) : CUDA RT API error : %s(%d) -> %s\n", \
+                __FILE__, __LINE__, #funcall, ierr, cudaGetErrorString(ierr)); \
+        exit(ierr); \
+    } \
+} while (0)
+
+static inline int _corePerSM(int major, int minor)
+/**
+ * @brief Give the number of CUDA cores per streaming multiprocessor (SM).
+ *
+ * The number of CUDA cores per SM is determined by the compute capability.
+ *
+ * @param major Major revision number of the compute capability.
+ * @param minor Minor revision number of the compute capability.
+ *
+ * @return The number of CUDA cores per SM.
+ */
+{
+    if (1 == major) {
+        if (0 == minor || 1 == minor || 2 == minor || 3 == minor) return 8;
+    }
+    if (2 == major) {
+        if (0 == minor) return 32;
+        if (1 == minor) return 48;
+    }
+    if (3 == major) {
+        if (0 == minor || 5 == minor || 7 == minor) return 192;
+    }
+    if (5 == major) {
+        if (0 == minor || 2 == minor) return 128;
+    }
+    if (6 == major) {
+        if (0 == minor) return 64;
+        if (1 == minor || 2 == minor) return 128;
+    }
+    if (7 == major) {
+        if (0 == minor || 2 == minor || 5 == minor) return 64;
+    }
+    return -1;
+}
+
+void getGPUInfo(int iaccel)
+{
+    int corePerSM;
+    struct cudaDeviceProp dev;
+    CUDAErrorCheck(cudaSetDevice(iaccel));
+    CUDAErrorCheck(cudaGetDeviceProperties(&dev, iaccel));
+    corePerSM = _corePerSM(dev.major, dev.minor);
+    printf("\n");
+    printf("============================================================\n");
+    printf("CUDA Device name : \"%s\"\n", dev.name);
+    printf("------------------------------------------------------------\n");
+    printf("Comp. Capability : %d.%d\n", dev.major, dev.minor);
+    printf("max clock rate : %.0f MHz\n", dev.clockRate * 1.e-3f);
+    printf("number of SMs : %d\n", dev.multiProcessorCount);
+    printf("cores / SM : %d\n", corePerSM);
+    printf("# of CUDA cores : %d\n", corePerSM * dev.multiProcessorCount);
+    printf("------------------------------------------------------------\n");
+    printf("global memory : %5.0f MBytes\n", dev.totalGlobalMem / 1048576.0f);
+    printf("shared mem. / SM : %5.1f KBytes\n", dev.sharedMemPerMultiprocessor / 1024.0f);
+    printf("32-bit reg. / SM : %d\n", dev.regsPerMultiprocessor);
+    printf("------------------------------------------------------------\n");
+    printf("max # of threads / SM : %d\n", dev.maxThreadsPerMultiProcessor);
+    printf("max # of threads / block : %d\n", dev.maxThreadsPerBlock);
+    printf("max dim. of block : (%d, %d, %d)\n",
+           dev.maxThreadsDim[0], dev.maxThreadsDim[1], dev.maxThreadsDim[2]);
+    printf("max dim. of grid : (%d, %d, %d)\n",
+           dev.maxGridSize[0], dev.maxGridSize[1], dev.maxGridSize[2]);
+    printf("warp size : %d\n", dev.warpSize);
+    printf("============================================================\n");
+    int z = 0, x = 2;
+#pragma omp target map(to:x) map(tofrom:z)
+    {
+        z = x + 100;
+    }
+}
 #endif
 int test(int nnn)
 {
     int mmm;
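
A hypothetical driver for the new helper, assuming an `-DACCOMP -DNVIDIA` build linked against the CUDA runtime:

    /* Illustrative use of getGPUInfo() before gridding starts. */
    #include <omp.h>
    #include <stdio.h>

    extern void getGPUInfo(int);

    int main(void)
    {
        if (omp_get_num_devices() == 0) {
            fprintf(stderr, "no accelerator found\n");
            return 255;
        }
        getGPUInfo(0);   /* compute capability, SMs, memory, limits */
        return 0;
    }
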
@@ -34,6 +34,9 @@ double gauss_kernel_norm(
     double u_dist,
     double v_dist);
 void phase_correction(
     double*,
     double*,
@@ -43,3 +46,37 @@ void phase_correction(
     int,
     int,
     int);
+#ifdef ACCOMP
+#ifdef NVIDIA
+void getGPUInfo(int);
+#endif
+#pragma omp declare target (gauss_kernel_norm)
+#endif
+
+#ifdef NVIDIA
+#ifdef __cplusplus
+extern "C" {
+#endif
+#ifndef PRTACCELINFO_H
+#define PRTACCELINFO_H
+void prtAccelInfo(int iaccel);
+/**<
+ * @brief Print some basic info of an accelerator.
+ *
+ * Strictly speaking, \c prtAccelInfo() can only print the basic info of an
+ * Nvidia CUDA device.
+ *
+ * @param iaccel The index of an accelerator.
+ *
+ * @return \c void.
+ */
+#endif
+#ifdef __cplusplus
+}
+#endif
+#endif