#ifdef _OPENMP #include #endif #include "w-stacking.h" #include #include #include #ifdef ACCOMP #pragma omp declare target #endif #ifdef __CUDACC__ double __device__ #else double #endif // Gaussian Kernel gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist) { double conv_weight; conv_weight = norm * exp(-((u_dist*u_dist)+(v_dist*v_dist))*std22); return conv_weight; } void makeGaussKernel(double * kernel, int KernelLen, double std22) { double norm = std22/PI; int n = KernelLen, mid = n / 2; for (int i = 0; i != mid + 1; i++) { double term = (double)i / mid; kernel[mid + i] = sqrt(norm) * exp(-(term*term)*std22); } for (int i = 0; i != mid; i++) kernel[i] = kernel[n - 1 - i]; } // Kaiser-Bessel Kernel: it is adapted from WSClean double bessel0(double x, double precision) { // Calculate I_0 = SUM of m 0 -> inf [ (x/2)^(2m) ] // This is the unnormalized bessel function of order 0. double d = 0.0, ds = 1.0, sum = 1.0; do { d += 2.0; ds *= x * x / (d * d); sum += ds; } while (ds > sum * precision); return sum; } void makeKaiserBesselKernel(double * kernel, int KernelLen, double alpha, double overSamplingFactor, int withSinc) { int n = KernelLen, mid = n / 2; double * sincKernel = malloc((mid + 1)*sizeof(*sincKernel)); const double filterRatio = 1.0 / overSamplingFactor; sincKernel[0] = filterRatio; for (int i = 1; i != mid + 1; i++) { double x = i; sincKernel[i] = withSinc ? (sin(PI * filterRatio * x) / (PI * x)) : filterRatio; } const double normFactor = overSamplingFactor / bessel0(alpha, 1e-8); for (int i = 0; i != mid + 1; i++) { double term = (double)i / mid; kernel[mid + i] = sincKernel[i] * bessel0(alpha * sqrt(1.0 - (term * term)), 1e-8) * normFactor; } for (int i = 0; i != mid; i++) kernel[i] = kernel[n - 1 - i]; } #ifdef ACCOMP #pragma omp end declare target #endif #ifdef __CUDACC__ //double __device__ gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist) //{ // double conv_weight; // conv_weight = norm * exp(-((u_dist*u_dist)+(v_dist*v_dist))*std22); // return conv_weight; //} __global__ void convolve_g( int num_w_planes, long num_points, long freq_per_chan, long polarizations, double* uu, double* vv, double* ww, float* vis_real, float* vis_img, float* weight, double dx, double dw, int KernelLen, int grid_size_x, int grid_size_y, double* grid, double std22) { //printf("DENTRO AL KERNEL\n"); long gid = blockIdx.x*blockDim.x + threadIdx.x; if(gid < num_points) { long i = gid; long visindex = i*freq_per_chan*polarizations; double norm = std22/PI; int j, k; /* Convert UV coordinates to grid coordinates. */ double pos_u = uu[i] / dx; double pos_v = vv[i] / dx; double ww_i = ww[i] / dw; int grid_w = (int)ww_i; int grid_u = (int)pos_u; int grid_v = (int)pos_v; // check the boundaries long jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0; long jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1; long kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0; long kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1; //printf("%ld, %ld, %ld, %ld\n",jmin,jmax,kmin,kmax); // Convolve this point onto the grid. for (k = kmin; k <= kmax; k++) { double v_dist = (double)k+0.5 - pos_v; for (j = jmin; j <= jmax; j++) { double u_dist = (double)j+0.5 - pos_u; long iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y); //printf("--> %ld %d %d %d\n",iKer,j,k,grid_w); double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist); // Loops over frequencies and polarizations double add_term_real = 0.0; double add_term_img = 0.0; long ifine = visindex; for (long ifreq=0; ifreq>> ( num_w_planes, num_points, freq_per_chan, polarizations, uu_g, vv_g, ww_g, vis_real_g, vis_img_g, weight_g, dx, dw, KernelLen, grid_size_x, grid_size_y, grid_g, std22); mmm = cudaMemcpy(grid, grid_g, 2*num_w_planes*grid_size_x*grid_size_y*sizeof(double), cudaMemcpyDeviceToHost); //for (int i=0; i<100000; i++)printf("%f\n",grid[i]); printf("CUDA ERROR %s\n",cudaGetErrorString(mmm)); mmm=cudaFree(uu_g); mmm=cudaFree(vv_g); mmm=cudaFree(ww_g); mmm=cudaFree(vis_real_g); mmm=cudaFree(vis_img_g); mmm=cudaFree(weight_g); mmm=cudaFree(grid_g); // Switch between CUDA and GPU versions # else #ifdef _OPENMP omp_set_num_threads(num_threads); #endif #ifdef ACCOMP long Nvis = num_points*freq_per_chan*polarizations; // #pragma omp target data map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan]) // #pragma omp target teams distribute parallel for map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan]) map(tofrom: grid[0:2*num_w_planes*grid_size_x*grid_size_y]) #else #pragma omp parallel for private(visindex) #endif for (i = 0; i < num_points; i++) { #ifdef _OPENMP //int tid; //tid = omp_get_thread_num(); //printf("%d\n",tid); #endif visindex = i*freq_per_chan*polarizations; double sum = 0.0; int j, k; //if (i%1000 == 0)printf("%ld\n",i); /* Convert UV coordinates to grid coordinates. */ double pos_u = uu[i] / dx; double pos_v = vv[i] / dx; double ww_i = ww[i] / dw; int grid_w = (int)ww_i; int grid_u = (int)pos_u; int grid_v = (int)pos_v; // check the boundaries long jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0; long jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1; long kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0; long kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1; //printf("%d, %ld, %ld, %d, %ld, %ld\n",grid_u,jmin,jmax,grid_v,kmin,kmax); // Convolve this point onto the grid. for (k = kmin; k <= kmax; k++) { double v_dist = (double)k+0.5 - pos_v; for (j = jmin; j <= jmax; j++) { int jKer = (int)j-pos_u+KernelLen; int kKer = (int)k-pos_v+KernelLen; double u_dist = (double)j+0.5 - pos_u; long iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y); //double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist); double conv_weight = convkernel[jKer]*convkernel[kKer]; // Loops over frequencies and polarizations double add_term_real = 0.0; double add_term_img = 0.0; long ifine = visindex; // DAV: the following two loops are performend by each thread separately: no problems of race conditions for (long ifreq=0; ifreq1e10 || vis_img[ifine]<-1e10)printf("%f %f %f %f %ld %ld\n",vis_real[ifine],vis_img[ifine],weight[iweight],conv_weight,ifine,num_points*freq_per_chan*polarizations); } ifine++; iweight++; } } // DAV: this is the critical call in terms of correctness of the results and of performance #pragma omp atomic grid[iKer] += add_term_real; #pragma omp atomic grid[iKer+1] += add_term_img; } } } // End switch between CUDA and CPU versions #endif //for (int i=0; i<100000; i++)printf("%f\n",grid[i]); } #ifdef ACCOMP #pragma omp end declare target #endif int test(int nnn) { int mmm; mmm = nnn+1; return mmm; }