Commit 64adac63 authored by Claudio Gheller

merge with openmp branch WITHOUT ACCOMP directives

parents 477b8f6a 668da3ac
*.o
.DS_Store
sync.sh
phase_correction.c
w-stacking.c
w-stackingCfftw
......
CC = gcc
CXX = g++
MPICC = mpicc
MPIC++ = mpiCC
#FFTW_INCL= -I/home/taffoni/sw/include
#FFTW_LIB= -L/home/taffoni/sw/lib
NVCC = nvcc
NVFLAGS = -arch=sm_70 -Xcompiler -mno-float128 -std=c++11
NVLIB = -L/cineca/prod/opt/compilers/cuda/10.1/none/lib64/ -lcudart -lcuda
OMP= -fopenmp
CFLAGS += -I. $(FFTW_INCL) $(GSL_INCL) $(MPI_INCL)
OPTIMIZE = $(OMP) -O3 -mtune=native
CC = icc
CXX = g++-11
MPICC = mpicc
MPIC++ = mpiCC
FFTW_INCL= -I/usr/local/include
FFTW_LIB= /usr/local/lib/
GSL_INCL =
GSL_LIBS =
MPI_LIB =
MPI_INCL= -I/home/taffoni/sw/Linux_x86_64/21.5/comm_libs/mpi/include
HDF5_INCL =
HDF5_LIB =
OMP= -fopenmp
NVCC = nvcc
NVFLAGS = -arch=sm_70 -Xcompiler -mno-float128 -std=c++11
NVLIB = -L/home/taffoni -lcudart -lcuda
CFLAGS += -I. $(FFTW_INCL) $(GSL_INCL) $(MPI_INCL)
OPTIMIZE = $(OMP) -O3
CC = gcc-10
CXX = g++-10
MPICC = mpicc
MPIC++ = mpiCC
GSL_INCL = -I/home/taffoni/sw/include
GSL_LIBS = -L/home/taffoni/sw/lib
FFTW_INCL= -I/home/taffoni/sw/include
FFTW_LIB= /home/taffoni/sw/lib -lfftw3_mpi -lfftw3
#-L/opt/cluster/openmpi/3.1.3/gnu/8.2.0/lib -lmpi
MPI_LIB =
#-I/opt/cluster/openmpi/3.1.3/gnu/8.2.0/include
MPI_INCL= -I/home/taffoni/sw/Linux_x86_64/21.5/comm_libs/mpi/include
HDF5_INCL =
HDF5_LIB =
OMP = -mp=multicore,gpu -Mprof -cuda
#OMP = -fopenmp
NVCC = nvcc
NVFLAGS = -arch=sm_70 -Xcompiler -std=c++11
NVLIB = -L/home/taffoni/sw/Linux_x86_64/21.5/cuda/11.3/lib64/ -lcudart -lcuda
CFLAGS += -I. $(FFTW_INCL) $(GSL_INCL) $(MPI_INCL)
OPTIMIZE = $(OMP) -O3
# OMP GPU SPECIFIC FLAGS
#OPTIMIZE += -Wno-unused-result -foffload=-lm -ffast-math
#OPTIMIZE += -fcf-protection=none -fno-stack-protector -foffload=nvptx-none -foffload=-misa=sm_35
#-ffast-math -fopt-info-all-omp -foffload=-misa=sm_35 -fcf-protection=none -fno-stack-protector -foffload=nvptx-none
CC = gcc
CXX = g++
MPICC = mpicc
MPIC++ = mpiCC
FFTW_INCL= -I/home/taffoni/sw/include
FFTW_LIB= /home/taffoni/sw/lib
NVCC = nvcc
NVFLAGS = -arch=sm_70 -Xcompiler -mno-float128 -std=c++11
NVLIB = -L/cineca/prod/opt/compilers/cuda/10.1/none/lib64/ -lcudart -lcuda
OMP= -fopenmp
CFLAGS += -I. $(FFTW_INCL) $(GSL_INCL) $(MPI_INCL)
OPTIMIZE = $(OMP) -O3 -mtune=native
CC = gcc-10
CXX = g++-10
MPICC = mpicc
MPIC++ = mpiCC
OPTIMIZE =
GSL_INCL =
GSL_LIB =
FFTW_INCL=
FFTW_LIB=
NVCC =
NVFLAGS =
NVLIB =
CFLAGS +=
MPICHLIB =
HDF5INCL =
HDF5LIB =
# comment/uncomment the various options depending on how you want to build the program
# Set default values for compiler options if no systype options are given or found
CC = mpiCC
CXX = mpiCC
OPTIMIZE = -std=c++11 -Wall -g -O2
MPICHLIB = -lmpich
SWITCHES =
ifdef SYSTYPE
SYSTYPE := $(SYSTYPE)
include Build/Makefile.$(SYSTYPE)
else
include Build/Makefile.systype
endif
LIBS = -L$(FFTW_LIB) -lfftw3 -lm -lcudart -lcuda
# create MPI code
OPT += -DUSE_MPI
#OPT += -DACCOMP
# use FFTW (it can be switched on ONLY if MPI is active)
ifeq (USE_MPI,$(findstring USE_MPI,$(OPT)))
OPT += -DUSE_FFTW
LIBS = -L$(FFTW_LIB) -lfftw3_mpi -lfftw3 -lm
endif
#OPT += -DNVIDIA
# perform one-sided communication (recommended) instead of a reduce (only if MPI is active)
OPT += -DONE_SIDE
# write the full 3D cube of gridded visibilities and its FFT transform
......@@ -10,25 +33,8 @@ OPT += -DONE_SIDE
# write the final image
OPT += -DWRITE_IMAGE
# perform w-stacking phase correction
#OPT += -DPHASE_ON
CC = gcc
CXX = g++
ifeq (USE_MPI,$(findstring USE_MPI,$(OPT)))
CC = mpicc
CXX = mpiCC
endif
OMP = -fopenmp
#OMP =
OPT += -DPHASE_ON
CFLAGS += -O3 -mcpu=native
CFLAGS += -I.
LIBS = -L$(FFTW_LIB) -lfftw3_mpi -lfftw3 -lm
NVCC = nvcc
NVFLAGS = -arch=sm_70 -Xcompiler -mno-float128 -std=c++11
NVLIB = -L/cineca/prod/opt/compilers/cuda/10.1/none/lib64/ -lcudart -lcuda
DEPS = w-stacking.h w-stacking-fftw.c w-stacking.cu phase_correction.cu
COBJ = w-stacking.o w-stacking-fftw.o phase_correction.o
......@@ -39,27 +45,40 @@ w-stacking.c: w-stacking.cu
phase_correction.c: phase_correction.cu
cp phase_correction.cu phase_correction.c
ifeq (USE_MPI,$(findstring USE_MPI,$(OPT)))
%.o: %.c $(DEPS)
$(CC) $(OMP) -c -o $@ $< $(CFLAGS) $(OPT)
$(MPICC) $(OPTIMIZE) $(OPT) -c -o $@ $< $(CFLAGS)
else
%.o: %.c $(DEPS)
$(CC) $(OPTIMIZE) $(OPT) -c -o $@ $< $(CFLAGS)
endif
serial: $(COBJ)
$(CC) $(OMP) -o w-stackingCfftw_serial $(CFLAGS) $^ -lm
$(CC) $(OPTIMIZE) $(OPT) -o w-stackingCfftw_serial $^ $(LIBS)
serial_omp: phase_correction.c
$(CC) $(OPTIMIZE) $(OPT) -o w-stackingOMP_serial w-stacking-fftw.c w-stacking_omp.c $(CFLAGS) $(LIBS)
simple_mpi: phase_correction.c
$(MPICC) $(OPTIMIZE) $(OPT) -o w-stackingMPI_simple w-stacking_omp.c w-stacking-fftw.c phase_correction.c $(CFLAGS) $(LIBS)
mpi_omp: phase_correction.c
$(MPICC) $(OPTIMIZE) $(OPT) -o w-stackingMPI_omp w-stacking_omp.c w-stacking-fftw.c phase_correction.c $(CFLAGS) $(LIBS)
serial_cuda:
$(NVCC) $(OPT) $(NVFLAGS) -c w-stacking.cu phase_correction.cu $(NVLIB)
$(CC) $(CFLAGS) $(OPT) -c w-stacking-fftw.c
$(CXX) $(CFLAGS) $(OPT) -o w-stackingfftw_serial w-stacking-fftw.o w-stacking.o phase_correction.o $(NVLIB) -lm
$(NVCC) $(NVFLAGS) -c w-stacking.cu phase_correction.cu $(NVLIB)
$(CC) $(OPTIMIZE) $(OPT) -c w-stacking-fftw.c $(CFLAGS) $(LIBS)
$(CXX) $(OPTIMIZE) $(OPT) -o w-stackingfftw_serial w-stacking-fftw.o w-stacking.o phase_correction.o $(CFLAGS) $(NVLIB) -lm
mpi: $(COBJ)
$(CC) $(OMP) -o w-stackingCfftw $(CFLAGS) $^ $(LIBS)
$(MPICC) $(OPTIMIZE) -o w-stackingCfftw $^ $(CFLAGS) $(LIBS)
mpi_cuda:
$(NVCC) $(NVFLAGS) $(OPT) -c w-stacking.cu phase_correction.cu $(NVLIB)
$(CC) $(CFLAGS) $(OPT) -c w-stacking-fftw.c
$(CXX) $(CFLAGS) $(OPT) -o w-stackingfftw w-stacking-fftw.o w-stacking.o phase_correction.o $(NVLIB) $(LIBS) -lm
$(NVCC) $(NVFLAGS) -c w-stacking.cu phase_correction.cu $(NVLIB)
$(MPICC) $(OPTIMIZE) $(OPT) -c w-stacking-fftw.c $(CFLAGS) $(LIBS)
$(MPIC++) $(OPTIMIZE) $(OPT) -o w-stackingfftw w-stacking-fftw.o w-stacking.o phase_correction.o $(NVLIB) $(CFLAGS) $(LIBS)
clean:
rm *.o
rm w-stacking.c
rm phase_correction.c
......@@ -170,6 +170,14 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in
double preal, pimag;
double radius2 = (xcoord*xcoord+ycoord*ycoord);
if(xcoord <= 1.0)
{
preal = cos(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
pimag = sin(2.0*PI*wterm*(sqrt(1-radius2)-1.0));
} else {
preal = cos(-2.0*PI*wterm*(sqrt(radius2-1.0)-1));
pimag = 0.0;
}
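For reference, the two branches above evaluate the usual w-stacking phase screen. Writing $r^{2}$ for radius2 (the squared, normalized pixel radius) and $w$ for wterm, the correction applied when the first branch is taken is

  $e^{\,2\pi i\,w\left(\sqrt{1-r^{2}}-1\right)} \;=\; \cos\!\big(2\pi w(\sqrt{1-r^{2}}-1)\big) \;+\; i\,\sin\!\big(2\pi w(\sqrt{1-r^{2}}-1)\big)$,

while in the other branch the code keeps only the real factor $\cos\!\big(-2\pi w(\sqrt{r^{2}-1}-1)\big)$ and sets the imaginary part to zero.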
......
......@@ -10,7 +10,8 @@ nplanes = 1;
% id = '4';
suffix = '';
id = '';
datadir = "/Users/cgheller/Work/Gridding/data/";
d
atadir = "/Users/cgheller/Work/Gridding/data/";
filename1 = strcat(datadir,"fft_real",suffix,id,'.bin');
filename2 = strcat(datadir,"fft_img",suffix,id,'.bin');
s = dir(filename1);
......@@ -52,4 +53,3 @@ xlabel('cell');
ylabel('cell');
pngimage = strcat(datadir,'fft-2freq',suffix,id,'.png');
saveas(gcf,pngimage);
#!/usr/bin/python3
import numpy as np
import matplotlib.pyplot as plt

# Binary dumps of the real and imaginary parts of the gridded FFT,
# written by the C code as flat arrays of float64
filename1 = "fft_real.bin"
filename2 = "fft_img.bin"
nplanes = 1

with open(filename1, 'rb') as f1:
    vreal = np.fromfile(f1, dtype=np.float64)
with open(filename2, 'rb') as f2:
    vimg = np.fromfile(f2, dtype=np.float64)

# the image is square: infer the grid size from the number of samples
xaxis = int(np.sqrt(vreal.size))
yaxes = xaxis

# combine the two dumps into a complex array and rebuild the (x, y, plane) cube;
# order='F' matches the on-disk layout
residual = np.vectorize(complex)(vreal, vimg)
cumul2d = residual.reshape((xaxis, yaxes, nplanes), order='F')

# plot the magnitude of each plane, shifting the zero frequency to the centre
for i in range(nplanes):
    gridded = np.squeeze(cumul2d[:, :, i])
    ax = plt.subplot()
    img = ax.imshow(np.abs(np.fft.fftshift(gridded)), aspect='auto', interpolation='none', origin='lower')
    ax.set_xlabel('cell')
    ax.set_ylabel('cell')
    cbar = plt.colorbar(img)
    cbar.set_label('norm(FFT)', size=18)
    figname = 'grid_image_' + str(i) + '.png'
    plt.savefig(figname)
......@@ -7,9 +7,15 @@
#include <fftw3-mpi.h>
#endif
#endif
#include <omp.h>
#include <math.h>
#include <time.h>
#include <unistd.h>
#ifdef ACCOMP
#include "w-stacking_omp.h"
#else
#include "w-stacking.h"
#endif
#define PI 3.14159265359
#define NUM_OF_SECTORS -1
#define MIN(X, Y) (((X) < (Y)) ? (X) : (Y))
......@@ -102,6 +108,7 @@ int main(int argc, char * argv[])
int xaxis;
int yaxis;
int num_w_planes = 8;
// DAV: the corresponding KernelLen is calculated within the wstack function. It can anyway be hardcoded for optimization
int w_support = 7;
int num_threads;// = 4;
......@@ -117,11 +124,23 @@ int main(int argc, char * argv[])
struct timespec begin, finish, begin0, begink, finishk;
double elapsed;
long nsectors;
/* GT: get the number of OMP threads; exit if it is not given */
if(argc == 1) {
fprintf(stderr, "Usage: %s number_of_OMP_Threads \n", argv[0]);
exit(1);
}
// Set the number of OpenMP threads
num_threads = atoi(argv[1]);
if ( num_threads == 0 )
{
fprintf(stderr, "Wrong parameter: %s\n\n", argv[1]);
fprintf(stderr, "Usage: %s number_of_OMP_Threads \n", argv[0]);
exit(1);
}
clock_gettime(CLOCK_MONOTONIC, &begin0);
start0 = clock();
// Set the number of OpenMP threads
num_threads = atoi(argv[1]);
// Initialize MPI environment
#ifdef USE_MPI
......@@ -136,8 +155,22 @@ int main(int argc, char * argv[])
rank = 0;
size = 1;
#endif
if(rank == 0)printf("Running with %d threads\n",num_threads);
#ifdef ACCOMP
if(rank == 0){
if (0 == omp_get_num_devices()) {
printf("No accelerator found ... exit\n");
exit(255);
}
printf("Number of available GPUs %d\n", omp_get_num_devices());
#ifdef NVIDIA
prtAccelInfo();
#endif
}
#endif
// set the local size of the image
local_grid_size_x = grid_size_x;
nsectors = NUM_OF_SECTORS;
......@@ -214,6 +247,7 @@ int main(int argc, char * argv[])
printf("N. visibilities on %d %ld\n",rank,Nvis);
#endif
// DAV: all these arrays can be allocated statically for the sake of optimization. However, be careful that if MPI is used,
// all the sizes are rescaled by the number of MPI tasks
// Allocate arrays
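The allocations themselves sit in the elided part of this hunk; purely as an illustration of the remark above, a slab-decomposed allocation could look like the sketch below (local_grid_size_y is a hypothetical name, and the real code may size and allocate the buffers differently):

// illustrative sketch only, not the elided allocation code
long local_grid_size_y = grid_size_y / size;                         // each MPI task owns one y-slab
long size_of_grid = 2L*num_w_planes*grid_size_x*local_grid_size_y;   // interleaved re/im values per w-plane
double * grid   = (double*) calloc(size_of_grid, sizeof(double));    // local portion of the gridded cube
double * gridss = (double*) calloc(size_of_grid, sizeof(double));    // per-sector scratch buffer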
......@@ -263,13 +297,13 @@ int main(int argc, char * argv[])
setup_time1 = (finish.tv_sec - begin.tv_sec);
setup_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
if(rank == 0)printf("GRIDDING DATA\n");
// Create histograms and linked lists
clock_gettime(CLOCK_MONOTONIC, &begin);
start = clock();
//CLAAA
// Initialize linked list
struct sectorlist ** sectorhead;
sectorhead = (struct sectorlist **) malloc((nsectors+1) * sizeof(struct sectorlist));
......@@ -338,20 +372,18 @@ int main(int argc, char * argv[])
#ifndef USE_MPI
double * gridtot = (double*) calloc(2*grid_size_x*grid_size_y*num_w_planes,sizeof(double));
#endif
double shift = (double)(dx*yaxis);
// Open the MPI Memory Window for the slab
#ifdef USE_MPI
MPI_Win slabwin;
MPI_Win_create(grid, size_of_grid*sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &slabwin);
MPI_Win_fence(0,slabwin);
#endif
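With -DONE_SIDE the per-sector results are pushed into the owning rank's slab through this window rather than combined with a collective reduce. A minimal sketch of such an update, where count, target_rank and target_disp are illustrative placeholders and not the code's actual bookkeeping:

#ifdef ONE_SIDE
/* sketch only: add the local sector buffer gridss (count doubles) into the
   slab owned by target_rank, at displacement target_disp inside slabwin */
MPI_Accumulate(gridss, count, MPI_DOUBLE,
               target_rank, target_disp, count, MPI_DOUBLE,
               MPI_SUM, slabwin);
MPI_Win_fence(0, slabwin);
#endif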
#ifndef USE_MPI
pFile1 = fopen (outfile1,"w");
#endif
// loop over files
//
kernel_time = 0.0;
......@@ -365,7 +397,6 @@ int main(int argc, char * argv[])
//
for (int ifiles=0; ifiles<ndatasets; ifiles++)
{
strcpy(filename,datapath_multi[ifiles]);
printf("Processing %s, %d of %d\n",filename,ifiles+1,ndatasets);
......@@ -400,19 +431,15 @@ int main(int argc, char * argv[])
fread(weights,(Nweights)*sizeof(float),1,pFile);
fclose(pFile);
strcpy(filename,datapath);
strcat(filename,visrealfile);
//printf("Reading %s\n",filename);
pFile = fopen (filename,"rb");
fseek (pFile,startrow*freq_per_chan*polarisations*sizeof(float),SEEK_SET);
fread(visreal,Nvis*sizeof(float),1,pFile);
fclose(pFile);
strcpy(filename,datapath);
strcat(filename,visimgfile);
//printf("Reading %s\n",filename);
#ifdef VERBOSE
printf("Reading %s\n",filename);
#endif
pFile = fopen (filename,"rb");
fseek (pFile,startrow*freq_per_chan*polarisations*sizeof(float),SEEK_SET);
fread(visimg,Nvis*sizeof(float),1,pFile);
......@@ -421,7 +448,6 @@ int main(int argc, char * argv[])
#ifdef USE_MPI
MPI_Barrier(MPI_COMM_WORLD);
#endif
// Declare temporary arrays for the masking
double * uus;
double * vvs;
......@@ -429,8 +455,8 @@ int main(int argc, char * argv[])
float * visreals;
float * visimgs;
float * weightss;
long isector;
for (long isector_count=0; isector_count<nsectors; isector_count++)
{
clock_gettime(CLOCK_MONOTONIC, &begink);
......@@ -456,13 +482,15 @@ int main(int argc, char * argv[])
//CLAAAA
struct sectorlist * current;
current = sectorhead[isector];
while (current->index != -1)
{
long ilocal = current->index;
//double vvh = vv[ilocal];
//int binphi = (int)(vvh*nsectors);
//if (binphi == isector || boundary[ilocal] == isector)
//{
//if (binphi == isector || boundary[ilocal] == isector) {
uus[icount] = uu[ilocal];
vvs[icount] = vv[ilocal]-isector*shift;
wws[icount] = ww[ilocal];
......@@ -488,6 +516,7 @@ int main(int argc, char * argv[])
compose_time += ((double) (endk - startk)) / CLOCKS_PER_SEC;
compose_time1 += (finishk.tv_sec - begink.tv_sec);
compose_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0;
#ifndef USE_MPI
double uumin = 1e20;
double vvmin = 1e20;
......@@ -514,6 +543,7 @@ int main(int argc, char * argv[])
#endif
clock_gettime(CLOCK_MONOTONIC, &begink);
startk = clock();
wstack(num_w_planes,
Nsec,
freq_per_chan,
......@@ -531,6 +561,18 @@ int main(int argc, char * argv[])
yaxis,
gridss,
num_threads);
/* int z =0 ;
#pragma omp target map(to:test_i_gpu) map(from:z)
{
int x; // only accessible from accelerator
x = 2;
z = x + test_i_gpu;
}*/
clock_gettime(CLOCK_MONOTONIC, &finishk);
endk = clock();
kernel_time += ((double) (endk - startk)) / CLOCKS_PER_SEC;
......
......@@ -6,6 +6,9 @@
#include <stdlib.h>
#include <stdio.h>
#ifdef ACCOMP
#pragma omp declare target
#endif
#ifdef __CUDACC__
double __device__
#else
......@@ -17,6 +20,9 @@ gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist)
conv_weight = norm * exp(-((u_dist*u_dist)+(v_dist*v_dist))*std22);
return conv_weight;
}
#ifdef ACCOMP
#pragma omp end declare target
#endif
#ifdef __CUDACC__
//double __device__ gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist)
......@@ -111,7 +117,9 @@ __global__ void convolve_g(
}
}
#endif
#ifdef ACCOMP
#pragma omp declare target
#endif
void wstack(
int num_w_planes,
long num_points,
......@@ -216,7 +224,14 @@ void wstack(
#ifdef _OPENMP
omp_set_num_threads(num_threads);
#endif
#ifdef ACCOMP
long Nvis = num_points*freq_per_chan*polarizations;
// #pragma omp target data map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan])
// #pragma omp target teams distribute parallel for map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan]) map(tofrom: grid[0:2*num_w_planes*grid_size_x*grid_size_y])
#else
#pragma omp parallel for private(visindex)
#endif
for (i = 0; i < num_points; i++)
{
#ifdef _OPENMP
......@@ -258,6 +273,7 @@ void wstack(
double u_dist = (double)j+0.5 - pos_u;
long iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y);
double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist);
// Loops over frequencies and polarizations
double add_term_real = 0.0;
......@@ -293,6 +309,9 @@ void wstack(
#endif
//for (int i=0; i<100000; i++)printf("%f\n",grid[i]);
}
#ifdef ACCOMP
#pragma omp end declare target
#endif
int test(int nnn)
{
......@@ -301,4 +320,3 @@ int test(int nnn)
mmm = nnn+1;
return mmm;
}
......@@ -8,6 +8,7 @@
#ifdef __CUDACC__
extern "C"
#endif
void wstack(
int,
long,
......
#include <omp.h>
#include "w-stacking_omp.h"
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#ifdef NVIDIA
#include <cuda_runtime.h>
#endif
#ifdef ACCOMP
#pragma omp declare target
#endif
double gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist)
{
double conv_weight;
conv_weight = norm * exp(-((u_dist*u_dist)+(v_dist*v_dist))*std22);
return conv_weight;
}
#ifdef ACCOMP
#pragma omp end declare target
#endif
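For reference, with std = $\sigma$ (set to 1 in wstack below), std22 = $1/(2\sigma^{2})$ and norm = std22/$\pi$ = $1/(2\pi\sigma^{2})$, this function evaluates the 2D Gaussian gridding kernel at the offset $(\Delta u, \Delta v)$ between a visibility and a grid cell:

  $C(\Delta u,\Delta v) \;=\; \frac{1}{2\pi\sigma^{2}}\,\exp\!\left(-\frac{\Delta u^{2}+\Delta v^{2}}{2\sigma^{2}}\right)$.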
void wstack(
int num_w_planes,
long num_points,
long freq_per_chan,
long polarizations,
double* uu,
double* vv,
double* ww,
float* vis_real,
float* vis_img,
float* weight,
double dx,
double dw,
int w_support,
int grid_size_x,
int grid_size_y,
double* grid,
int num_threads)
{
//long index;
long visindex;
// initialize the convolution kernel
// gaussian:
int KernelLen = (w_support-1)/2;
double std = 1.0;
double std22 = 1.0/(2.0*std*std);
double norm = std22/PI;
// Loop over visibilities.
#ifdef _OPENMP
omp_set_num_threads(num_threads);
#endif
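/* With ACCOMP the visibility loop below becomes a single OpenMP target region:
   the coordinate, visibility and weight arrays are copied to the device with
   map(to:...), the grid is mapped tofrom so the accumulated values come back,
   and the iterations are distributed over teams of device threads. */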
#ifdef ACCOMP
long Nvis = num_points*freq_per_chan*polarizations;
long gpu_weight_dim = Nvis/freq_per_chan;
long gpu_grid_dim = 2*num_w_planes*grid_size_x*grid_size_y;
#pragma omp target teams distribute parallel for private(visindex) \
map(to:num_points, KernelLen, std, std22, norm, num_w_planes, \
uu[0:num_points], vv[0:num_points], ww[0:num_points], \
vis_real[0:Nvis], vis_img[0:Nvis], weight[0:gpu_weight_dim], \
grid_size_x, grid_size_y, freq_per_chan, polarizations, dx,dw, w_support, num_threads) \
map(tofrom: grid[0:gpu_grid_dim])
#endif
for (long i = 0; i < num_points; i++)
{
#ifdef _OPENMP
//int tid;
//tid = omp_get_thread_num();
//printf("%d\n",tid);
#endif
visindex = i*freq_per_chan*polarizations;
double sum = 0.0;
int j, k;
//if (i%1000 == 0)printf("%ld\n",i);
/* Convert UV coordinates to grid coordinates. */
double pos_u = uu[i] / dx;
double pos_v = vv[i] / dx;
double ww_i = ww[i] / dw;
int grid_w = (int)ww_i;
int grid_u = (int)pos_u;
int grid_v = (int)pos_v;
// check the boundaries
long jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0;
long jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1;
long kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0;
long kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1;
//printf("%d, %ld, %ld, %d, %ld, %ld\n",grid_u,jmin,jmax,grid_v,kmin,kmax);
// Convolve this point onto the grid.
for (k = kmin; k <= kmax; k++)
{
double v_dist = (double)k+0.5 - pos_v;
for (j = jmin; j <= jmax; j++)
{
double u_dist = (double)j+0.5 - pos_u;
long iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y);
double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist);
// Loops over frequencies and polarizations
double add_term_real = 0.0;
double add_term_img = 0.0;
long ifine = visindex;
// DAV: the following two loops are performed by each thread separately, so there are no race conditions here
for (long ifreq=0; ifreq<freq_per_chan; ifreq++)
{
long iweight = visindex/freq_per_chan;
for (long ipol=0; ipol<polarizations; ipol++)
{
if (!isnan(vis_real[ifine]))
{
//printf("%f %ld\n",weight[iweight],iweight);
add_term_real += weight[iweight] * vis_real[ifine] * conv_weight;
add_term_img += weight[iweight] * vis_img[ifine] * conv_weight;
//if(vis_img[ifine]>1e10 || vis_img[ifine]<-1e10)printf("%f %f %f %f %ld %ld\n",vis_real[ifine],vis_img[ifine],weight[iweight],conv_weight,ifine,num_points*freq_per_chan*polarizations);
}
ifine++;
iweight++;
}
}
// DAV: this is the critical update in terms of correctness of the results and of performance:
// concurrent threads can target the same grid cell, so the accumulation below must be atomic
#pragma omp atomic
grid[iKer] += add_term_real;
#pragma omp atomic
grid[iKer+1] += add_term_img;
}
}
}
//for (int i=0; i<100000; i++)printf("%f\n",grid[i]);
}
#ifdef NVIDIA
#define CUDAErrorCheck(funcall) \
do { \
cudaError_t ierr = funcall; \
if (cudaSuccess != ierr) { \
fprintf(stderr, "%s(line %d) : CUDA RT API error : %s(%d) -> %s\n", \
__FILE__, __LINE__, #funcall, ierr, cudaGetErrorString(ierr)); \
exit(ierr); \
} \
} while (0)
static inline int _corePerSM(int major, int minor)
/**
* @brief Give the number of CUDA cores per streaming multiprocessor (SM).
*
* The number of CUDA cores per SM is determined by the compute capability.
*
* @param major Major revision number of the compute capability.
* @param minor Minor revision number of the compute capability.
*
* @return The number of CUDA cores per SM.
*/
{
if (1 == major) {
if (0 == minor || 1 == minor || 2 == minor || 3 == minor) return 8;
}
if (2 == major) {
if (0 == minor) return 32;
if (1 == minor) return 48;
}
if (3 == major) {
if (0 == minor || 5 == minor || 7 == minor) return 192;
}
if (5 == major) {
if (0 == minor || 2 == minor) return 128;
}
if (6 == major) {
if (0 == minor) return 64;
if (1 == minor || 2 == minor) return 128;
}
if (7 == major) {
if (0 == minor || 2 == minor || 5 == minor) return 64;
}
return -1;
}
void getGPUInfo(int iaccel)
{
int corePerSM;
struct cudaDeviceProp dev;
CUDAErrorCheck(cudaSetDevice(iaccel));
CUDAErrorCheck(cudaGetDeviceProperties(&dev, iaccel));
corePerSM = _corePerSM(dev.major, dev.minor);
printf("\n");
printf("============================================================\n");
printf("CUDA Device name : \"%s\"\n", dev.name);
printf("------------------------------------------------------------\n");
printf("Comp. Capability : %d.%d\n", dev.major, dev.minor);
printf("max clock rate : %.0f MHz\n", dev.clockRate * 1.e-3f);
printf("number of SMs : %d\n", dev.multiProcessorCount);
printf("cores / SM : %d\n", corePerSM);
printf("# of CUDA cores : %d\n", corePerSM * dev.multiProcessorCount);
printf("------------------------------------------------------------\n");
printf("global memory : %5.0f MBytes\n", dev.totalGlobalMem / 1048576.0f);
printf("shared mem. / SM : %5.1f KBytes\n", dev.sharedMemPerMultiprocessor / 1024.0f);
printf("32-bit reg. / SM : %d\n", dev.regsPerMultiprocessor);
printf("------------------------------------------------------------\n");
printf("max # of threads / SM : %d\n", dev.maxThreadsPerMultiProcessor);
printf("max # of threads / block : %d\n", dev.maxThreadsPerBlock);
printf("max dim. of block : (%d, %d, %d)\n",
dev.maxThreadsDim[0], dev.maxThreadsDim[1], dev.maxThreadsDim[2]);
printf("max dim. of grid : (%d, %d, %d)\n",
dev.maxGridSize[0], dev.maxGridSize[1], dev.maxGridSize[2]);
printf("warp size : %d\n", dev.warpSize);
printf("============================================================\n");
int z = 0, x = 2;
#pragma omp target map(to:x) map(tofrom:z)
{
z=x+100;
}
}
#endif
int test(int nnn)
{
int mmm;
mmm = nnn+1;
return mmm;
}
#ifndef W_PROJECT_H_
#define W_PROJECT_H_
#endif
#define NWORKERS -1 //100
#define NTHREADS 32
#define PI 3.14159265359
#define REAL_TYPE double
void wstack(
int,
long,
long,
long,
double*,
double*,
double*,
float*,
float*,
float*,
double,
double,
int,
int,
int,
double*,
int);
int test(int nnn);
double gauss_kernel_norm(
double norm,
double std22,
double u_dist,
double v_dist);
void phase_correction(
double*,
double*,
double*,
int,
int,
int,
int,
int);
#ifdef ACCOMP
#ifdef NVIDIA
void getGPUInfo(int);
#endif
#pragma omp declare target (gauss_kernel_norm)
#endif
#ifdef NVIDIA
#ifdef __cplusplus
extern "C" {
#endif
#ifndef PRTACCELINFO_H
#define PRTACCELINFO_H
void prtAccelInfo(int iaccel);
/**<
* @brief Print some basic info of an accelerator.
*
* Strictly speaking, \c prtAccelInfo() can only print the basic info of an
* Nvidia CUDA device.
*
* @param iaccel The index of an accelerator.
*
* @return \c void.
*/
#endif
#ifdef __cplusplus
}
#endif
#endif