#ifdef _OPENMP #include #endif #include "w-stacking.hip.hpp" #include #include #include #ifdef __HIPCC__ #include "allvars_rccl.hip.hpp" #endif #include "proto.h" #ifdef ACCOMP #pragma omp declare target #endif #ifdef __HIPCC__ double __device__ #else double #endif // Gaussian Kernel gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist) { double conv_weight; conv_weight = norm * exp(-((u_dist*u_dist)+(v_dist*v_dist))*std22); return conv_weight; } void makeGaussKernel(double * kernel, int KernelLen, int increaseprecision, double std22) { double norm = std22/PI; int n = increaseprecision*KernelLen, mid = n / 2; for (int i = 0; i != mid + 1; i++) { double term = (double)i/(double)increaseprecision; kernel[mid + i] = sqrt(norm) * exp(-(term*term)*std22); } for (int i = 0; i != mid; i++) kernel[i] = kernel[n - 1 - i]; // for (int i = 0; i < n; i++) printf("%f\n",kernel[i]); } // Kaiser-Bessel Kernel: it is adapted from WSClean double bessel0(double x, double precision) { // Calculate I_0 = SUM of m 0 -> inf [ (x/2)^(2m) ] // This is the unnormalized bessel function of order 0. double d = 0.0, ds = 1.0, sum = 1.0; do { d += 2.0; ds *= x * x / (d * d); sum += ds; } while (ds > sum * precision); return sum; } void makeKaiserBesselKernel(double * kernel, int KernelLen, int increaseprecision, double alpha, double overSamplingFactor, int withSinc) { int n = increaseprecision*KernelLen, mid = n / 2; double * sincKernel = (double*)malloc((mid + 1) * sizeof(*sincKernel)); const double filterRatio = 1.0 / overSamplingFactor; sincKernel[0] = filterRatio; for (int i = 1; i != mid + 1; i++) { double x = i; sincKernel[i] = withSinc ? (sin(PI * filterRatio * x) / (PI * x)) : filterRatio; } const double normFactor = overSamplingFactor / bessel0(alpha, 1e-8); for (int i = 0; i != mid + 1; i++) { double term = (double)i / mid; kernel[mid + i] = sincKernel[i] * bessel0(alpha * sqrt(1.0 - (term * term)), 1e-8) * normFactor; } for (int i = 0; i != mid; i++) kernel[i] = kernel[n - 1 - i]; //for (int i = 0; i < n; i++) printf("%f\n",kernel[i]); } #ifdef ACCOMP #pragma omp end declare target #endif #ifdef __HIPCC__ //double __device__ gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist) //{ // double conv_weight; // conv_weight = norm * exp(-((u_dist*u_dist)+(v_dist*v_dist))*std22); // return conv_weight; //} __global__ void convolve_g( int num_w_planes, myuint num_points, myuint freq_per_chan, myuint polarizations, double* uu, double* vv, double* ww, float* vis_real, float* vis_img, float* weight, double dx, double dw, int KernelLen, int grid_size_x, int grid_size_y, double* grid, #if defined(GAUSS_HI_PRECISION) double std22 #else double std22, double* convkernel #endif ) { //printf("DENTRO AL KERNEL\n"); myuint gid = blockIdx.x*blockDim.x + threadIdx.x; if(gid < num_points) { myuint i = gid; myull visindex = i*freq_per_chan*polarizations; double norm = std22/PI; int j, k; /* Convert UV coordinates to grid coordinates. */ double pos_u = uu[i] / dx; double pos_v = vv[i] / dx; double ww_i = ww[i] / dw; int grid_w = (int)ww_i; int grid_u = (int)pos_u; int grid_v = (int)pos_v; // check the boundaries myuint jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0; myuint jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1; myuint kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0; myuint kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1; // Convolve this point onto the grid. for (k = kmin; k <= kmax; k++) { double v_dist = (double)k - pos_v; int increaseprecision = 5; for (j = jmin; j <= jmax; j++) { double u_dist = (double)j+0.5 - pos_u; myuint iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y); int jKer = (int)(increaseprecision * (fabs(u_dist+(double)KernelLen))); int kKer = (int)(increaseprecision * (fabs(v_dist+(double)KernelLen))); #ifdef GAUSS_HI_PRECISION double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist); #endif #ifdef GAUSS double conv_weight = convkernel[jKer]*convkernel[kKer]; #endif #ifdef KAISERBESSEL double conv_weight = convkernel[jKer]*convkernel[kKer]; #endif // Loops over frequencies and polarizations double add_term_real = 0.0; double add_term_img = 0.0; myull ifine = visindex; for (myuint ifreq=0; ifreq>> ( num_w_planes, num_points, freq_per_chan, polarizations, uu_g, vv_g, ww_g, vis_real_g, vis_img_g, weight_g, dx, dw, KernelLen, grid_size_x, grid_size_y, grid, std22 ); #else convolve_g <<>> ( num_w_planes, num_points, freq_per_chan, polarizations, uu_g, vv_g, ww_g, vis_real_g, vis_img_g, weight_g, dx, dw, KernelLen, grid_size_x, grid_size_y, grid, std22, convkernel_g ); #endif mmm=hipStreamSynchronize(stream_stacking); //Record the event //mmm=hipEventRecord(event_kernel,stream_stacking); //Wait until the kernel ends //mmm=hipStreamWaitEvent(stream_stacking,event_kernel); //for (int i=0; i<100000; i++)printf("%f\n",grid[i]); if (mmm != hipSuccess) printf("HIP ERROR %s\n",hipGetErrorString(mmm)); mmm=hipFree(uu_g); mmm=hipFree(vv_g); mmm=hipFree(ww_g); mmm=hipFree(vis_real_g); mmm=hipFree(vis_img_g); mmm=hipFree(weight_g); //mmm=hipFree(grid_g); #if !defined(GAUSS_HI_PRECISION) mmm=hipFree(convkernel_g); #endif // Switch between HIP and GPU versions # else #ifdef _OPENMP omp_set_num_threads(num_threads); #endif #if defined(ACCOMP) && (GPU_STACKING) omp_set_default_device(rank % omp_get_num_devices()); myull Nvis = num_points*freq_per_chan*polarizations; #pragma omp target teams distribute parallel for private(visindex) map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan]) map(tofrom:grid[0:2*num_w_planes*grid_size_x*grid_size_y]) #else #pragma omp parallel for private(visindex) #endif for (i = 0; i < num_points; i++) { #ifdef _OPENMP //int tid; //tid = omp_get_thread_num(); //printf("%d\n",tid); #endif visindex = i*freq_per_chan*polarizations; double sum = 0.0; int j, k; //if (i%1000 == 0)printf("%ld\n",i); /* Convert UV coordinates to grid coordinates. */ double pos_u = uu[i] / dx; double pos_v = vv[i] / dx; double ww_i = ww[i] / dw; int grid_w = (int)ww_i; int grid_u = (int)pos_u; int grid_v = (int)pos_v; // check the boundaries myuint jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0; myuint jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1; myuint kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0; myuint kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1; //printf("%d, %ld, %ld, %d, %ld, %ld\n",grid_u,jmin,jmax,grid_v,kmin,kmax); // Convolve this point onto the grid. for (k = kmin; k <= kmax; k++) { //double v_dist = (double)k+0.5 - pos_v; double v_dist = (double)k - pos_v; for (j = jmin; j <= jmax; j++) { //double u_dist = (double)j+0.5 - pos_u; double u_dist = (double)j - pos_u; myuint iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y); int jKer = (int)(increaseprecision * (fabs(u_dist+(double)KernelLen))); int kKer = (int)(increaseprecision * (fabs(v_dist+(double)KernelLen))); #ifdef GAUSS_HI_PRECISION double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist); #endif #ifdef GAUSS double conv_weight = convkernel[jKer]*convkernel[kKer]; //if(jKer < 0 || jKer >= 35 || kKer < 0 || kKer >= 35) // printf("%f %d %f %d\n",fabs(u_dist+(double)KernelLen),jKer,fabs(v_dist+(double)KernelLen),kKer); //printf("%d %d %d %d %f %f %f %f %f\n",jKer, j, kKer, k, pos_u, pos_v, u_dist,v_dist,conv_weight); #endif #ifdef KAISERBESSEL double conv_weight = convkernel[jKer]*convkernel[kKer]; #endif // Loops over frequencies and polarizations double add_term_real = 0.0; double add_term_img = 0.0; myull ifine = visindex; // DAV: the following two loops are performend by each thread separately: no problems of race conditions for (myuint ifreq=0; ifreq1e10 || vis_img[ifine]<-1e10)printf("%f %f %f %f %ld %ld\n",vis_real[ifine],vis_img[ifine],weight[iweight],conv_weight,ifine,num_points*freq_per_chan*polarizations); } ifine++; iweight++; } } // DAV: this is the critical call in terms of correctness of the results and of performance #pragma omp atomic grid[iKer] += add_term_real; #pragma omp atomic grid[iKer+1] += add_term_img; } } } #if defined(ACCOMP) && (GPU_STACKING) #pragma omp target exit data map(delete:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan], grid[0:2*num_w_planes*grid_size_x*grid_size_y]) #endif // End switch between HIP and CPU versions #endif //for (int i=0; i<100000; i++)printf("%f\n",grid[i]); } #ifdef ACCOMP #pragma omp end declare target #endif int test(int nnn) { int mmm; mmm = nnn+1; return mmm; }