Select Git revision
w-stacking.cu
-
Claudio Gheller authoredClaudio Gheller authored
w-stacking.cu 11.58 KiB
#ifdef _OPENMP
#include <omp.h>
#endif
#include "w-stacking.h"
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#ifdef ACCOMP
#pragma omp declare target
#endif
#ifdef __CUDACC__
double __device__
#else
double
#endif
// Gaussian Kernel
gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist)
{
double conv_weight;
conv_weight = norm * exp(-((u_dist*u_dist)+(v_dist*v_dist))*std22);
return conv_weight;
}
void makeGaussKernel(double * kernel,
int KernelLen,
double std22)
{
double norm = std22/PI;
int n = KernelLen, mid = n / 2;
for (int i = 0; i != mid + 1; i++) {
double term = (double)i / mid;
kernel[mid + i] = sqrt(norm) * exp(-(term*term)*std22);
}
for (int i = 0; i != mid; i++) kernel[i] = kernel[n - 1 - i];
}
// Kaiser-Bessel Kernel: it is adapted from WSClean
double bessel0(double x, double precision) {
// Calculate I_0 = SUM of m 0 -> inf [ (x/2)^(2m) ]
// This is the unnormalized bessel function of order 0.
double d = 0.0, ds = 1.0, sum = 1.0;
do {
d += 2.0;
ds *= x * x / (d * d);
sum += ds;
} while (ds > sum * precision);
return sum;
}
void makeKaiserBesselKernel(double * kernel,
int KernelLen,
double alpha,
double overSamplingFactor,
int withSinc) {
int n = KernelLen, mid = n / 2;
double * sincKernel = malloc((mid + 1)*sizeof(*sincKernel));
const double filterRatio = 1.0 / overSamplingFactor;
sincKernel[0] = filterRatio;
for (int i = 1; i != mid + 1; i++) {
double x = i;
sincKernel[i] =
withSinc ? (sin(PI * filterRatio * x) / (PI * x)) : filterRatio;
}
const double normFactor = overSamplingFactor / bessel0(alpha, 1e-8);
for (int i = 0; i != mid + 1; i++) {
double term = (double)i / mid;
kernel[mid + i] = sincKernel[i] *
bessel0(alpha * sqrt(1.0 - (term * term)), 1e-8) *
normFactor;
}
for (int i = 0; i != mid; i++) kernel[i] = kernel[n - 1 - i];
}
#ifdef ACCOMP
#pragma omp end declare target
#endif
#ifdef __CUDACC__
//double __device__ gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist)
//{
// double conv_weight;
// conv_weight = norm * exp(-((u_dist*u_dist)+(v_dist*v_dist))*std22);
// return conv_weight;
//}
__global__ void convolve_g(
int num_w_planes,
long num_points,
long freq_per_chan,
long polarizations,
double* uu,
double* vv,
double* ww,
float* vis_real,
float* vis_img,
float* weight,
double dx,
double dw,
int KernelLen,
int grid_size_x,
int grid_size_y,
double* grid,
double std22)
{
//printf("DENTRO AL KERNEL\n");
long gid = blockIdx.x*blockDim.x + threadIdx.x;
if(gid < num_points)
{
long i = gid;
long visindex = i*freq_per_chan*polarizations;
double norm = std22/PI;
int j, k;
/* Convert UV coordinates to grid coordinates. */
double pos_u = uu[i] / dx;
double pos_v = vv[i] / dx;
double ww_i = ww[i] / dw;
int grid_w = (int)ww_i;
int grid_u = (int)pos_u;
int grid_v = (int)pos_v;
// check the boundaries
long jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0;
long jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1;
long kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0;
long kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1;
//printf("%ld, %ld, %ld, %ld\n",jmin,jmax,kmin,kmax);
// Convolve this point onto the grid.
for (k = kmin; k <= kmax; k++)
{
double v_dist = (double)k+0.5 - pos_v;
for (j = jmin; j <= jmax; j++)
{
double u_dist = (double)j+0.5 - pos_u;
long iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y);
//printf("--> %ld %d %d %d\n",iKer,j,k,grid_w);
double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist);
// Loops over frequencies and polarizations
double add_term_real = 0.0;
double add_term_img = 0.0;
long ifine = visindex;
for (long ifreq=0; ifreq<freq_per_chan; ifreq++)
{
long iweight = visindex/freq_per_chan;
for (long ipol=0; ipol<polarizations; ipol++)
{
double vistest = (double)vis_real[ifine];
if (!isnan(vistest))
{
add_term_real += weight[iweight] * vis_real[ifine] * conv_weight;
add_term_img += weight[iweight] * vis_img[ifine] * conv_weight;
}
ifine++;
iweight++;
}
}
atomicAdd(&(grid[iKer]),add_term_real);
atomicAdd(&(grid[iKer+1]),add_term_img);
}
}
}
}
#endif
#ifdef ACCOMP
#pragma omp declare target
#endif
void wstack(
int num_w_planes,
long num_points,
long freq_per_chan,
long polarizations,
double* uu,
double* vv,
double* ww,
float* vis_real,
float* vis_img,
float* weight,
double dx,
double dw,
int w_support,
int grid_size_x,
int grid_size_y,
double* grid,
int num_threads)
{
long i;
long index;
long visindex;
// initialize the convolution kernel
// gaussian:
int KernelLen = (w_support-1)/2;
double std = 1.0;
double std22 = 1.0/(2.0*std*std);
double norm = std22/PI;
double * convkernel = malloc(w_support*sizeof(*convkernel));
makeGaussKernel(convkernel,w_support,std22);
// Loop over visibilities.
// Switch between CUDA and GPU versions
#ifdef __CUDACC__
// Define the CUDA set up
int Nth = NTHREADS;
long Nbl = (long)(num_points/Nth) + 1;
if(NWORKERS == 1) {Nbl = 1; Nth = 1;};
long Nvis = num_points*freq_per_chan*polarizations;
printf("Running on GPU with %d threads and %d blocks\n",Nth,Nbl);
// Create GPU arrays and offload them
double * uu_g;
double * vv_g;
double * ww_g;
float * vis_real_g;
float * vis_img_g;
float * weight_g;
double * grid_g;
//for (int i=0; i<100000; i++)grid[i]=23.0;
cudaError_t mmm;
mmm=cudaMalloc(&uu_g,num_points*sizeof(double));
mmm=cudaMalloc(&vv_g,num_points*sizeof(double));
mmm=cudaMalloc(&ww_g,num_points*sizeof(double));
mmm=cudaMalloc(&vis_real_g,Nvis*sizeof(float));
mmm=cudaMalloc(&vis_img_g,Nvis*sizeof(float));
mmm=cudaMalloc(&weight_g,(Nvis/freq_per_chan)*sizeof(float));
mmm=cudaMalloc(&grid_g,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double));
mmm=cudaMemset(grid_g,0.0,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double));
mmm=cudaMemcpy(uu_g, uu, num_points*sizeof(double), cudaMemcpyHostToDevice);
mmm=cudaMemcpy(vv_g, vv, num_points*sizeof(double), cudaMemcpyHostToDevice);
mmm=cudaMemcpy(ww_g, ww, num_points*sizeof(double), cudaMemcpyHostToDevice);
mmm=cudaMemcpy(vis_real_g, vis_real, Nvis*sizeof(float), cudaMemcpyHostToDevice);
mmm=cudaMemcpy(vis_img_g, vis_img, Nvis*sizeof(float), cudaMemcpyHostToDevice);
mmm=cudaMemcpy(weight_g, weight, (Nvis/freq_per_chan)*sizeof(float), cudaMemcpyHostToDevice);
// Call main GPU Kernel
convolve_g <<<Nbl,Nth>>> (
num_w_planes,
num_points,
freq_per_chan,
polarizations,
uu_g,
vv_g,
ww_g,
vis_real_g,
vis_img_g,
weight_g,
dx,
dw,
KernelLen,
grid_size_x,
grid_size_y,
grid_g,
std22);
mmm = cudaMemcpy(grid, grid_g, 2*num_w_planes*grid_size_x*grid_size_y*sizeof(double), cudaMemcpyDeviceToHost);
//for (int i=0; i<100000; i++)printf("%f\n",grid[i]);
printf("CUDA ERROR %s\n",cudaGetErrorString(mmm));
mmm=cudaFree(uu_g);
mmm=cudaFree(vv_g);
mmm=cudaFree(ww_g);
mmm=cudaFree(vis_real_g);
mmm=cudaFree(vis_img_g);
mmm=cudaFree(weight_g);
mmm=cudaFree(grid_g);
// Switch between CUDA and GPU versions
# else
#ifdef _OPENMP
omp_set_num_threads(num_threads);
#endif
#ifdef ACCOMP
long Nvis = num_points*freq_per_chan*polarizations;
// #pragma omp target data map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan])
// #pragma omp target teams distribute parallel for map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan]) map(tofrom: grid[0:2*num_w_planes*grid_size_x*grid_size_y])
#else
#pragma omp parallel for private(visindex)
#endif
for (i = 0; i < num_points; i++)
{
#ifdef _OPENMP
//int tid;
//tid = omp_get_thread_num();
//printf("%d\n",tid);
#endif
visindex = i*freq_per_chan*polarizations;
double sum = 0.0;
int j, k;
//if (i%1000 == 0)printf("%ld\n",i);
/* Convert UV coordinates to grid coordinates. */
double pos_u = uu[i] / dx;
double pos_v = vv[i] / dx;
double ww_i = ww[i] / dw;
int grid_w = (int)ww_i;
int grid_u = (int)pos_u;
int grid_v = (int)pos_v;
// check the boundaries
long jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0;
long jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1;
long kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0;
long kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1;
//printf("%d, %ld, %ld, %d, %ld, %ld\n",grid_u,jmin,jmax,grid_v,kmin,kmax);
// Convolve this point onto the grid.
for (k = kmin; k <= kmax; k++)
{
double v_dist = (double)k+0.5 - pos_v;
for (j = jmin; j <= jmax; j++)
{
int jKer = (int)j-pos_u+KernelLen;
int kKer = (int)k-pos_v+KernelLen;
double u_dist = (double)j+0.5 - pos_u;
long iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y);
//double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist);
double conv_weight = convkernel[jKer]*convkernel[kKer];
// Loops over frequencies and polarizations
double add_term_real = 0.0;
double add_term_img = 0.0;
long ifine = visindex;
// DAV: the following two loops are performend by each thread separately: no problems of race conditions
for (long ifreq=0; ifreq<freq_per_chan; ifreq++)
{
long iweight = visindex/freq_per_chan;
for (long ipol=0; ipol<polarizations; ipol++)
{
if (!isnan(vis_real[ifine]))
{
//printf("%f %ld\n",weight[iweight],iweight);
add_term_real += weight[iweight] * vis_real[ifine] * conv_weight;
add_term_img += weight[iweight] * vis_img[ifine] * conv_weight;
//if(vis_img[ifine]>1e10 || vis_img[ifine]<-1e10)printf("%f %f %f %f %ld %ld\n",vis_real[ifine],vis_img[ifine],weight[iweight],conv_weight,ifine,num_points*freq_per_chan*polarizations);
}
ifine++;
iweight++;
}
}
// DAV: this is the critical call in terms of correctness of the results and of performance
#pragma omp atomic
grid[iKer] += add_term_real;
#pragma omp atomic
grid[iKer+1] += add_term_img;
}
}
}
// End switch between CUDA and CPU versions
#endif
//for (int i=0; i<100000; i++)printf("%f\n",grid[i]);
}
#ifdef ACCOMP
#pragma omp end declare target
#endif
int test(int nnn)
{
int mmm;
mmm = nnn+1;
return mmm;
}