Select Git revision
w-stacking.hip.cpp
-
Emanuele De Rubeis authoredEmanuele De Rubeis authored
w-stacking.hip.cpp 15.51 KiB
#ifdef _OPENMP
#include <omp.h>
#endif
#include "w-stacking.hip.hpp"
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#ifdef __HIPCC__
#include "allvars_rccl.hip.hpp"
#endif
#include "proto.h"
#ifdef ACCOMP
#pragma omp declare target
#endif
#ifdef __HIPCC__
double __device__
#else
double
#endif
// Gaussian Kernel
gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist)
{
double conv_weight;
conv_weight = norm * exp(-((u_dist*u_dist)+(v_dist*v_dist))*std22);
return conv_weight;
}
void makeGaussKernel(double * kernel,
int KernelLen,
int increaseprecision,
double std22)
{
double norm = std22/PI;
int n = increaseprecision*KernelLen, mid = n / 2;
for (int i = 0; i != mid + 1; i++) {
double term = (double)i/(double)increaseprecision;
kernel[mid + i] = sqrt(norm) * exp(-(term*term)*std22);
}
for (int i = 0; i != mid; i++) kernel[i] = kernel[n - 1 - i];
// for (int i = 0; i < n; i++) printf("%f\n",kernel[i]);
}
// Kaiser-Bessel Kernel: it is adapted from WSClean
double bessel0(double x, double precision) {
// Calculate I_0 = SUM of m 0 -> inf [ (x/2)^(2m) ]
// This is the unnormalized bessel function of order 0.
double d = 0.0, ds = 1.0, sum = 1.0;
do {
d += 2.0;
ds *= x * x / (d * d);
sum += ds;
} while (ds > sum * precision);
return sum;
}
void makeKaiserBesselKernel(double * kernel,
int KernelLen,
int increaseprecision,
double alpha,
double overSamplingFactor,
int withSinc) {
int n = increaseprecision*KernelLen, mid = n / 2;
double * sincKernel = (double*)malloc((mid + 1) * sizeof(*sincKernel));
const double filterRatio = 1.0 / overSamplingFactor;
sincKernel[0] = filterRatio;
for (int i = 1; i != mid + 1; i++) {
double x = i;
sincKernel[i] =
withSinc ? (sin(PI * filterRatio * x) / (PI * x)) : filterRatio;
}
const double normFactor = overSamplingFactor / bessel0(alpha, 1e-8);
for (int i = 0; i != mid + 1; i++) {
double term = (double)i / mid;
kernel[mid + i] = sincKernel[i] *
bessel0(alpha * sqrt(1.0 - (term * term)), 1e-8) *
normFactor;
}
for (int i = 0; i != mid; i++) kernel[i] = kernel[n - 1 - i];
//for (int i = 0; i < n; i++) printf("%f\n",kernel[i]);
}
#ifdef ACCOMP
#pragma omp end declare target
#endif
#ifdef __HIPCC__
//double __device__ gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist)
//{
// double conv_weight;
// conv_weight = norm * exp(-((u_dist*u_dist)+(v_dist*v_dist))*std22);
// return conv_weight;
//}
__global__ void convolve_g(
int num_w_planes,
myuint num_points,
myuint freq_per_chan,
myuint polarizations,
double* uu,
double* vv,
double* ww,
float* vis_real,
float* vis_img,
float* weight,
double dx,
double dw,
int KernelLen,
int grid_size_x,
int grid_size_y,
double* grid,
#if defined(GAUSS_HI_PRECISION)
double std22
#else
double std22,
double* convkernel
#endif
)
{
//printf("DENTRO AL KERNEL\n");
myuint gid = blockIdx.x*blockDim.x + threadIdx.x;
if(gid < num_points)
{
myuint i = gid;
myull visindex = i*freq_per_chan*polarizations;
double norm = std22/PI;
int j, k;
/* Convert UV coordinates to grid coordinates. */
double pos_u = uu[i] / dx;
double pos_v = vv[i] / dx;
double ww_i = ww[i] / dw;
int grid_w = (int)ww_i;
int grid_u = (int)pos_u;
int grid_v = (int)pos_v;
// check the boundaries
myuint jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0;
myuint jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1;
myuint kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0;
myuint kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1;
// Convolve this point onto the grid.
for (k = kmin; k <= kmax; k++)
{
double v_dist = (double)k - pos_v;
int increaseprecision = 5;
for (j = jmin; j <= jmax; j++)
{
double u_dist = (double)j+0.5 - pos_u;
myuint iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y);
int jKer = (int)(increaseprecision * (fabs(u_dist+(double)KernelLen)));
int kKer = (int)(increaseprecision * (fabs(v_dist+(double)KernelLen)));
#ifdef GAUSS_HI_PRECISION
double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist);
#endif
#ifdef GAUSS
double conv_weight = convkernel[jKer]*convkernel[kKer];
#endif
#ifdef KAISERBESSEL
double conv_weight = convkernel[jKer]*convkernel[kKer];
#endif
// Loops over frequencies and polarizations
double add_term_real = 0.0;
double add_term_img = 0.0;
myull ifine = visindex;
for (myuint ifreq=0; ifreq<freq_per_chan; ifreq++)
{
myuint iweight = visindex/freq_per_chan;
for (myuint ipol=0; ipol<polarizations; ipol++)
{
double vistest = (double)vis_real[ifine];
if (!isnan(vistest))
{
add_term_real += weight[iweight] * vis_real[ifine] * conv_weight;
add_term_img += weight[iweight] * vis_img[ifine] * conv_weight;
}
ifine++;
iweight++;
}
}
atomicAdd(&(grid[iKer]),add_term_real);
atomicAdd(&(grid[iKer+1]),add_term_img);
}
}
}
}
#endif
#ifdef ACCOMP
#pragma omp declare target
#endif
void wstack(
int num_w_planes,
myuint num_points,
myuint freq_per_chan,
myuint polarizations,
double* uu,
double* vv,
double* ww,
float* vis_real,
float* vis_img,
float* weight,
double dx,
double dw,
int w_support,
int grid_size_x,
int grid_size_y,
double* grid,
int num_threads,
#ifdef HIPCC
int rank,
hipStream_t stream_stacking
#else
int rank
#endif
)
{
myuint i;
//myuint index;
myull visindex;
// initialize the convolution kernel
// gaussian:
int KernelLen = (w_support-1)/2;
int increaseprecision = 5; // this number must be odd: increaseprecison*w_support must be odd (w_support must be odd)
double std = 1.0;
double std22 = 1.0/(2.0*std*std);
double norm = std22/PI;
double * convkernel = (double*)malloc(increaseprecision*w_support*sizeof(*convkernel));
#ifdef GAUSS
makeGaussKernel(convkernel,w_support,increaseprecision,std22);
#endif
#ifdef KAISERBESSEL
double overSamplingFactor = 1.0;
int withSinc = 0;
double alpha = 8.6;
makeKaiserBesselKernel(convkernel, w_support, increaseprecision, alpha, overSamplingFactor, withSinc);
#endif
// Loop over visibilities.
// Switch between HIP and GPU versions
#ifdef __HIPCC__
// Define the HIP set up
int Nth = NTHREADS;
myuint Nbl = (myuint)(num_points/Nth) + 1;
if(NWORKERS == 1) {Nbl = 1; Nth = 1;};
myull Nvis = num_points*freq_per_chan*polarizations;
int ndevices;
int num = hipGetDeviceCount(&ndevices);
int res = hipSetDevice(rank % ndevices);
if ( rank == 0 ) {
if (0 == ndevices) {
shutdown_wstacking(NO_ACCELERATORS_FOUND, "No accelerators found", __FILE__, __LINE__ );
}
}
#ifdef NVIDIA
prtAccelInfo();
#endif
// Create GPU arrays and offload them
double * uu_g;
double * vv_g;
double * ww_g;
float * vis_real_g;
float * vis_img_g;
float * weight_g;
//double * grid_g;
double * convkernel_g;
//Create the event inside stream stacking
//hipEvent_t event_kernel;
//for (int i=0; i<100000; i++)grid[i]=23.0;
hipError_t mmm;
//mmm=hipEventCreate(&event_kernel);
mmm=hipMalloc(&uu_g,num_points*sizeof(double));
mmm=hipMalloc(&vv_g,num_points*sizeof(double));
mmm=hipMalloc(&ww_g,num_points*sizeof(double));
mmm=hipMalloc(&vis_real_g,Nvis*sizeof(float));
mmm=hipMalloc(&vis_img_g,Nvis*sizeof(float));
mmm=hipMalloc(&weight_g,(Nvis/freq_per_chan)*sizeof(float));
//mmm=hipMalloc(&grid_g,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double));
#if !defined(GAUSS_HI_PRECISION)
mmm=hipMalloc(&convkernel_g,increaseprecision*w_support*sizeof(double));
#endif
if (mmm != hipSuccess) {printf("!!! w-stacking.cu hipMalloc ERROR %d !!!\n", mmm);}
//mmm=hipMemset(grid_g,0.0,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double));
if (mmm != hipSuccess) {printf("!!! w-stacking.cu hipMemset ERROR %d !!!\n", mmm);}
mmm=hipMemcpyAsync(uu_g, uu, num_points*sizeof(double), hipMemcpyHostToDevice, stream_stacking);
mmm=hipMemcpyAsync(vv_g, vv, num_points*sizeof(double), hipMemcpyHostToDevice, stream_stacking);
mmm=hipMemcpyAsync(ww_g, ww, num_points*sizeof(double), hipMemcpyHostToDevice, stream_stacking);
mmm=hipMemcpyAsync(vis_real_g, vis_real, Nvis*sizeof(float), hipMemcpyHostToDevice, stream_stacking);
mmm=hipMemcpyAsync(vis_img_g, vis_img, Nvis*sizeof(float), hipMemcpyHostToDevice, stream_stacking);
mmm=hipMemcpyAsync(weight_g, weight, (Nvis/freq_per_chan)*sizeof(float), hipMemcpyHostToDevice, stream_stacking);
#if !defined(GAUSS_HI_PRECISION)
mmm=hipMemcpyAsync(convkernel_g, convkernel, increaseprecision*w_support*sizeof(double), hipMemcpyHostToDevice, stream_stacking);
#endif
if (mmm != hipSuccess) {printf("!!! w-stacking.cu hipMemcpyAsync ERROR %d !!!\n", mmm);}
// Call main GPU Kernel
#if defined(GAUSS_HI_PRECISION)
convolve_g <<<Nbl,Nth,0,stream_stacking>>> (
num_w_planes,
num_points,
freq_per_chan,
polarizations,
uu_g,
vv_g,
ww_g,
vis_real_g,
vis_img_g,
weight_g,
dx,
dw,
KernelLen,
grid_size_x,
grid_size_y,
grid,
std22
);
#else
convolve_g <<<Nbl,Nth,0,stream_stacking>>> (
num_w_planes,
num_points,
freq_per_chan,
polarizations,
uu_g,
vv_g,
ww_g,
vis_real_g,
vis_img_g,
weight_g,
dx,
dw,
KernelLen,
grid_size_x,
grid_size_y,
grid,
std22,
convkernel_g
);
#endif
mmm=hipStreamSynchronize(stream_stacking);
//Record the event
//mmm=hipEventRecord(event_kernel,stream_stacking);
//Wait until the kernel ends
//mmm=hipStreamWaitEvent(stream_stacking,event_kernel);
//for (int i=0; i<100000; i++)printf("%f\n",grid[i]);
if (mmm != hipSuccess)
printf("HIP ERROR %s\n",hipGetErrorString(mmm));
mmm=hipFree(uu_g);
mmm=hipFree(vv_g);
mmm=hipFree(ww_g);
mmm=hipFree(vis_real_g);
mmm=hipFree(vis_img_g);
mmm=hipFree(weight_g);
//mmm=hipFree(grid_g);
#if !defined(GAUSS_HI_PRECISION)
mmm=hipFree(convkernel_g);
#endif
// Switch between HIP and GPU versions
# else
#ifdef _OPENMP
omp_set_num_threads(num_threads);
#endif
#if defined(ACCOMP) && (GPU_STACKING)
omp_set_default_device(rank % omp_get_num_devices());
myull Nvis = num_points*freq_per_chan*polarizations;
#pragma omp target teams distribute parallel for private(visindex) map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan]) map(tofrom:grid[0:2*num_w_planes*grid_size_x*grid_size_y])
#else
#pragma omp parallel for private(visindex)
#endif
for (i = 0; i < num_points; i++)
{
#ifdef _OPENMP
//int tid;
//tid = omp_get_thread_num();
//printf("%d\n",tid);
#endif
visindex = i*freq_per_chan*polarizations;
double sum = 0.0;
int j, k;
//if (i%1000 == 0)printf("%ld\n",i);
/* Convert UV coordinates to grid coordinates. */
double pos_u = uu[i] / dx;
double pos_v = vv[i] / dx;
double ww_i = ww[i] / dw;
int grid_w = (int)ww_i;
int grid_u = (int)pos_u;
int grid_v = (int)pos_v;
// check the boundaries
myuint jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0;
myuint jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1;
myuint kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0;
myuint kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1;
//printf("%d, %ld, %ld, %d, %ld, %ld\n",grid_u,jmin,jmax,grid_v,kmin,kmax);
// Convolve this point onto the grid.
for (k = kmin; k <= kmax; k++)
{
//double v_dist = (double)k+0.5 - pos_v;
double v_dist = (double)k - pos_v;
for (j = jmin; j <= jmax; j++)
{
//double u_dist = (double)j+0.5 - pos_u;
double u_dist = (double)j - pos_u;
myuint iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y);
int jKer = (int)(increaseprecision * (fabs(u_dist+(double)KernelLen)));
int kKer = (int)(increaseprecision * (fabs(v_dist+(double)KernelLen)));
#ifdef GAUSS_HI_PRECISION
double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist);
#endif
#ifdef GAUSS
double conv_weight = convkernel[jKer]*convkernel[kKer];
//if(jKer < 0 || jKer >= 35 || kKer < 0 || kKer >= 35)
// printf("%f %d %f %d\n",fabs(u_dist+(double)KernelLen),jKer,fabs(v_dist+(double)KernelLen),kKer);
//printf("%d %d %d %d %f %f %f %f %f\n",jKer, j, kKer, k, pos_u, pos_v, u_dist,v_dist,conv_weight);
#endif
#ifdef KAISERBESSEL
double conv_weight = convkernel[jKer]*convkernel[kKer];
#endif
// Loops over frequencies and polarizations
double add_term_real = 0.0;
double add_term_img = 0.0;
myull ifine = visindex;
// DAV: the following two loops are performend by each thread separately: no problems of race conditions
for (myuint ifreq=0; ifreq<freq_per_chan; ifreq++)
{
myuint iweight = visindex/freq_per_chan;
for (myuint ipol=0; ipol<polarizations; ipol++)
{
if (!isnan(vis_real[ifine]))
{
//printf("%f %ld\n",weight[iweight],iweight);
add_term_real += weight[iweight] * vis_real[ifine] * conv_weight;
add_term_img += weight[iweight] * vis_img[ifine] * conv_weight;
//if(vis_img[ifine]>1e10 || vis_img[ifine]<-1e10)printf("%f %f %f %f %ld %ld\n",vis_real[ifine],vis_img[ifine],weight[iweight],conv_weight,ifine,num_points*freq_per_chan*polarizations);
}
ifine++;
iweight++;
}
}
// DAV: this is the critical call in terms of correctness of the results and of performance
#pragma omp atomic
grid[iKer] += add_term_real;
#pragma omp atomic
grid[iKer+1] += add_term_img;
}
}
}
#if defined(ACCOMP) && (GPU_STACKING)
#pragma omp target exit data map(delete:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan], grid[0:2*num_w_planes*grid_size_x*grid_size_y])
#endif
// End switch between HIP and CPU versions
#endif
//for (int i=0; i<100000; i++)printf("%f\n",grid[i]);
}
#ifdef ACCOMP
#pragma omp end declare target
#endif
int test(int nnn)
{
int mmm;
mmm = nnn+1;
return mmm;
}