Skip to content
Snippets Groups Projects
Select Git revision
  • 7205d63dcca9d2196b6640b1b755077ae5f30589
  • main default protected
  • merge
  • split_input
  • revamp
  • test_derubeis
  • glacopo
  • inverse_imaging
  • openmp
  • RICKv2.0
  • RICKv1.0
  • r0.1c
  • end-of-re_structuring
13 results

w-stacking.cu

Blame
  • w-stacking.cu 11.58 KiB
    #ifdef _OPENMP
    #include <omp.h>
    #endif
    #include "w-stacking.h"
    #include <math.h>
    #include <stdlib.h>
    #include <stdio.h>
    
    #ifdef ACCOMP
    #pragma omp  declare target
    #endif
    #ifdef __CUDACC__
    double __device__
    #else
    double
    #endif
    // Gaussian Kernel
    gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist)
    {
         double conv_weight;
         conv_weight = norm * exp(-((u_dist*u_dist)+(v_dist*v_dist))*std22);
         return conv_weight;
    }
    
    void makeGaussKernel(double * kernel,
    		     int KernelLen,
    		     double std22)
    {
    
      double norm = std22/PI;
      int n = KernelLen, mid = n / 2;
      for (int i = 0; i != mid + 1; i++) {
          double term = (double)i / mid;
          kernel[mid + i] = sqrt(norm) * exp(-(term*term)*std22);
      }
    
      for (int i = 0; i != mid; i++) kernel[i] = kernel[n - 1 - i];
    
    }
    
    // Kaiser-Bessel Kernel: it is adapted from WSClean
    double bessel0(double x, double precision) {
      // Calculate I_0 = SUM of m 0 -> inf [ (x/2)^(2m) ]
      // This is the unnormalized bessel function of order 0.
      double d = 0.0, ds = 1.0, sum = 1.0;
      do {
        d += 2.0;
        ds *= x * x / (d * d);
        sum += ds;
      } while (ds > sum * precision);
      return sum;
    }
    void makeKaiserBesselKernel(double * kernel,
    		            int KernelLen,
                                double alpha,
                                double overSamplingFactor,
                                int withSinc) {
      int n = KernelLen, mid = n / 2;
      double * sincKernel = malloc((mid + 1)*sizeof(*sincKernel));
      const double filterRatio = 1.0 / overSamplingFactor;
      sincKernel[0] = filterRatio;
      for (int i = 1; i != mid + 1; i++) {
        double x = i;
        sincKernel[i] =
            withSinc ? (sin(PI * filterRatio * x) / (PI * x)) : filterRatio;
      }
      const double normFactor = overSamplingFactor / bessel0(alpha, 1e-8);
      for (int i = 0; i != mid + 1; i++) {
        double term = (double)i / mid;
        kernel[mid + i] = sincKernel[i] *
                          bessel0(alpha * sqrt(1.0 - (term * term)), 1e-8) *
                          normFactor;
      }
      for (int i = 0; i != mid; i++) kernel[i] = kernel[n - 1 - i];
    }
    
    
    #ifdef ACCOMP
    #pragma omp end declare target
    #endif
    
    #ifdef __CUDACC__
    //double __device__ gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist)
    //{
    //     double conv_weight;
    //     conv_weight = norm * exp(-((u_dist*u_dist)+(v_dist*v_dist))*std22);
    //     return conv_weight;
    //}
    
    __global__ void convolve_g(
         int num_w_planes,
         long num_points,
         long freq_per_chan,
         long polarizations,
         double* uu,
         double* vv,
         double* ww,
         float* vis_real,
         float* vis_img,
         float* weight,
         double dx,
         double dw,
         int KernelLen,
         int grid_size_x,
         int grid_size_y,
         double* grid,
         double std22)
    
    {
    	//printf("DENTRO AL KERNEL\n");
            long gid = blockIdx.x*blockDim.x + threadIdx.x;
    	if(gid < num_points)
    	{
    	long i = gid;
            long visindex = i*freq_per_chan*polarizations;
    	double norm = std22/PI;
    
            int j, k;
    
            /* Convert UV coordinates to grid coordinates. */
            double pos_u = uu[i] / dx;
            double pos_v = vv[i] / dx;
            double ww_i  = ww[i] / dw;
            int grid_w = (int)ww_i;
            int grid_u = (int)pos_u;
            int grid_v = (int)pos_v;
    
            // check the boundaries
            long jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0;
            long jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1;
            long kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0;
            long kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1;
            //printf("%ld, %ld, %ld, %ld\n",jmin,jmax,kmin,kmax);
    
    
            // Convolve this point onto the grid.
            for (k = kmin; k <= kmax; k++)
            {
    
                double v_dist = (double)k+0.5 - pos_v;
    
                for (j = jmin; j <= jmax; j++)
                {
                    double u_dist = (double)j+0.5 - pos_u;
                    long iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y);
    		//printf("--> %ld %d %d %d\n",iKer,j,k,grid_w);
    
                    double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist);
                    // Loops over frequencies and polarizations
                    double add_term_real = 0.0;
                    double add_term_img = 0.0;
                    long ifine = visindex;
                    for (long ifreq=0; ifreq<freq_per_chan; ifreq++)
    		{
    		   long iweight = visindex/freq_per_chan;
                       for (long ipol=0; ipol<polarizations; ipol++)
                       {
                          double vistest = (double)vis_real[ifine];
                          if (!isnan(vistest))
                          {
                             add_term_real += weight[iweight] * vis_real[ifine] * conv_weight;
                             add_term_img += weight[iweight] * vis_img[ifine] * conv_weight;
    		      }
                          ifine++;
    		      iweight++;
                       }
    		}
    		atomicAdd(&(grid[iKer]),add_term_real);
    		atomicAdd(&(grid[iKer+1]),add_term_img);
                }
            }
    	}
    }
    #endif
    #ifdef ACCOMP
    #pragma  omp declare target
    #endif
    void wstack(
         int num_w_planes,
         long num_points,
         long freq_per_chan,
         long polarizations,
         double* uu,
         double* vv,
         double* ww,
         float* vis_real,
         float* vis_img,
         float* weight,
         double dx,
         double dw,
         int w_support,
         int grid_size_x,
         int grid_size_y,
         double* grid,
         int num_threads)
    {
        long i;
        long index;
        long visindex;
    
        // initialize the convolution kernel
        // gaussian:
        int KernelLen = (w_support-1)/2;
        double std = 1.0;
        double std22 = 1.0/(2.0*std*std);
        double norm = std22/PI;
        double * convkernel = malloc(w_support*sizeof(*convkernel));
        makeGaussKernel(convkernel,w_support,std22);
        
        // Loop over visibilities.
    // Switch between CUDA and GPU versions
    #ifdef __CUDACC__
        // Define the CUDA set up
        int Nth = NTHREADS;
        long Nbl = (long)(num_points/Nth) + 1;
        if(NWORKERS == 1) {Nbl = 1; Nth = 1;};
        long Nvis = num_points*freq_per_chan*polarizations;
        printf("Running on GPU with %d threads and %d blocks\n",Nth,Nbl);
    
        // Create GPU arrays and offload them
        double * uu_g;
        double * vv_g;
        double * ww_g;
        float * vis_real_g;
        float * vis_img_g;
        float * weight_g;
        double * grid_g;
    
        //for (int i=0; i<100000; i++)grid[i]=23.0;
        cudaError_t mmm;
        mmm=cudaMalloc(&uu_g,num_points*sizeof(double));
        mmm=cudaMalloc(&vv_g,num_points*sizeof(double));
        mmm=cudaMalloc(&ww_g,num_points*sizeof(double));
        mmm=cudaMalloc(&vis_real_g,Nvis*sizeof(float));
        mmm=cudaMalloc(&vis_img_g,Nvis*sizeof(float));
        mmm=cudaMalloc(&weight_g,(Nvis/freq_per_chan)*sizeof(float));
        mmm=cudaMalloc(&grid_g,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double));
        mmm=cudaMemset(grid_g,0.0,2*num_w_planes*grid_size_x*grid_size_y*sizeof(double));
    
        mmm=cudaMemcpy(uu_g, uu, num_points*sizeof(double), cudaMemcpyHostToDevice);
        mmm=cudaMemcpy(vv_g, vv, num_points*sizeof(double), cudaMemcpyHostToDevice);
        mmm=cudaMemcpy(ww_g, ww, num_points*sizeof(double), cudaMemcpyHostToDevice);
        mmm=cudaMemcpy(vis_real_g, vis_real, Nvis*sizeof(float), cudaMemcpyHostToDevice);
        mmm=cudaMemcpy(vis_img_g, vis_img, Nvis*sizeof(float), cudaMemcpyHostToDevice);
        mmm=cudaMemcpy(weight_g, weight, (Nvis/freq_per_chan)*sizeof(float), cudaMemcpyHostToDevice);
    
        // Call main GPU Kernel
        convolve_g <<<Nbl,Nth>>> (
    	       num_w_planes,
                   num_points,
                   freq_per_chan,
                   polarizations,
                   uu_g,
                   vv_g,
                   ww_g,
                   vis_real_g,
                   vis_img_g,
                   weight_g,
                   dx,
                   dw,
                   KernelLen,
                   grid_size_x,
                   grid_size_y,
                   grid_g,
    	       std22);
    
        mmm = cudaMemcpy(grid, grid_g, 2*num_w_planes*grid_size_x*grid_size_y*sizeof(double), cudaMemcpyDeviceToHost);
        //for (int i=0; i<100000; i++)printf("%f\n",grid[i]);
        printf("CUDA ERROR %s\n",cudaGetErrorString(mmm));
        mmm=cudaFree(uu_g);
        mmm=cudaFree(vv_g);
        mmm=cudaFree(ww_g);
        mmm=cudaFree(vis_real_g);
        mmm=cudaFree(vis_img_g);
        mmm=cudaFree(weight_g);
        mmm=cudaFree(grid_g);
    
    // Switch between CUDA and GPU versions
    # else
    
    #ifdef _OPENMP
        omp_set_num_threads(num_threads);
    #endif
    
    #ifdef ACCOMP
        long Nvis = num_points*freq_per_chan*polarizations;
      //  #pragma omp target data map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan])
      //  #pragma omp target teams distribute parallel for  map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan]) map(tofrom: grid[0:2*num_w_planes*grid_size_x*grid_size_y])
    #else
        #pragma omp parallel for private(visindex)
    #endif
        for (i = 0; i < num_points; i++)
        {
    #ifdef _OPENMP
    	//int tid;
    	//tid = omp_get_thread_num();
    	//printf("%d\n",tid);
    #endif
    
            visindex = i*freq_per_chan*polarizations;
    
            double sum = 0.0;
            int j, k;
    	//if (i%1000 == 0)printf("%ld\n",i);
    
            /* Convert UV coordinates to grid coordinates. */
            double pos_u = uu[i] / dx;
            double pos_v = vv[i] / dx;
            double ww_i  = ww[i] / dw;
            int grid_w = (int)ww_i;
            int grid_u = (int)pos_u;
            int grid_v = (int)pos_v;
    
    	// check the boundaries
    	long jmin = (grid_u > KernelLen - 1) ? grid_u - KernelLen : 0;
    	long jmax = (grid_u < grid_size_x - KernelLen) ? grid_u + KernelLen : grid_size_x - 1;
    	long kmin = (grid_v > KernelLen - 1) ? grid_v - KernelLen : 0;
    	long kmax = (grid_v < grid_size_y - KernelLen) ? grid_v + KernelLen : grid_size_y - 1;
            //printf("%d, %ld, %ld, %d, %ld, %ld\n",grid_u,jmin,jmax,grid_v,kmin,kmax);
    
    
            // Convolve this point onto the grid.
            for (k = kmin; k <= kmax; k++)
            {
    
                double v_dist = (double)k+0.5 - pos_v;
    
                for (j = jmin; j <= jmax; j++)
                {
    		int jKer = (int)j-pos_u+KernelLen;
    		int kKer = (int)k-pos_v+KernelLen;
                    double u_dist = (double)j+0.5 - pos_u;
    		long iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y);
    
    
    		//double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist);
    		double conv_weight = convkernel[jKer]*convkernel[kKer];
    		// Loops over frequencies and polarizations
    		double add_term_real = 0.0;
    		double add_term_img = 0.0;
    		long ifine = visindex;
    		// DAV: the following two loops are performend by each thread separately: no problems of race conditions
    		for (long ifreq=0; ifreq<freq_per_chan; ifreq++)
    		{
    		   long iweight = visindex/freq_per_chan;
    	           for (long ipol=0; ipol<polarizations; ipol++)
    	           {
                          if (!isnan(vis_real[ifine]))
                          {
    		         //printf("%f %ld\n",weight[iweight],iweight);
                             add_term_real += weight[iweight] * vis_real[ifine] * conv_weight;
    		         add_term_img += weight[iweight] * vis_img[ifine] * conv_weight;
    			 //if(vis_img[ifine]>1e10 || vis_img[ifine]<-1e10)printf("%f %f %f %f %ld %ld\n",vis_real[ifine],vis_img[ifine],weight[iweight],conv_weight,ifine,num_points*freq_per_chan*polarizations);
    		      }
    		      ifine++;
    		      iweight++;
    		   }
    	        }
    		// DAV: this is the critical call in terms of correctness of the results and of performance
    		#pragma omp atomic
    		grid[iKer] += add_term_real;
    		#pragma omp atomic
    		grid[iKer+1] += add_term_img;
                }
            }
    
        }
    // End switch between CUDA and CPU versions
    #endif
        //for (int i=0; i<100000; i++)printf("%f\n",grid[i]);
    }
    #ifdef ACCOMP
    #pragma  omp end declare target
    #endif
    
    int test(int nnn)
    {
    	int mmm;
    
    	mmm = nnn+1;
    	return mmm;
    }