#include "allvars_nccl.h"
#include "proto.h"
#include <cuda.h>
#include <cuda_runtime.h>
#include <nccl.h>

/* 
 * Implements the gridding of the data on the GPU,
 * using the NCCL library for the reduction across MPI tasks.
 *
 */


#if defined( NCCL_REDUCE )

/*
#define NCCLCHECK(cmd) do {                         
ncclResult_t r = cmd;                             
if (r!= ncclSuccess) {                            
  printf("Failed, NCCL error %s:%d '%s'\n",       
	 __FILE__,__LINE__,ncclGetErrorString(r));   
  exit(EXIT_FAILURE);                             
 }                                                 
} while(0)
*/  


/*
 * Hash a NUL-terminated string with a DJB2a-style recurrence:
 * starting from 5381, every character updates the hash as
 * hash = (hash * 33) ^ character.
 */
static uint64_t getHostHash(const char* string) {
  uint64_t hash = 5381;
  const char *cursor = string;

  while (*cursor != '\0') {
    // hash * 33 is the same value as (hash << 5) + hash
    hash = (hash * 33) ^ *cursor;
    ++cursor;
  }

  return hash;
}


/*
 * Store the short host name of this machine in `hostname`.
 *
 *   hostname : caller-provided buffer that receives the name
 *   maxlen   : capacity of `hostname` in bytes
 *
 * The name reported by gethostname() is cut at its first '.', so only
 * the leading label is kept.  On return the buffer always holds a
 * NUL-terminated string; it is the empty string when gethostname()
 * reports an error.  Nothing is done if `hostname` is NULL or
 * `maxlen` is not positive.
 */
static void getHostName(char* hostname, int maxlen) {
  if (hostname == NULL || maxlen <= 0)
    return;

  if (gethostname(hostname, maxlen) != 0) {
    // the buffer content is not reliable after a failure
    hostname[0] = '\0';
    return;
  }

  // a truncated name is not guaranteed to be NUL-terminated
  hostname[maxlen-1] = '\0';

  // stop at the terminator: bytes beyond it were never written
  for (int i = 0; hostname[i] != '\0'; i++) {
    if (hostname[i] == '.') {
      hostname[i] = '\0';
      return;
    }
  }
}




/*
 * Print a diagnostic when a CUDA runtime call did not succeed.
 * Execution continues afterwards.
 *
 *   err  : status returned by the CUDA runtime call
 *   what : text identifying the failed call in the message
 */
static void gridding_nccl_report_cuda(cudaError_t err, const char *what)
{
  if (err != cudaSuccess)
    printf("!!! gridding_nccl.cu %s ERROR %d !!!\n", what, (int)err);
}

/*
 * Print a diagnostic and abort the MPI job when an NCCL call did not
 * succeed.
 *
 *   res  : status returned by the NCCL call
 *   what : text identifying the failed call in the message
 */
static void gridding_nccl_check_nccl(ncclResult_t res, const char *what)
{
  if (res != ncclSuccess)
    {
      printf("!!! gridding_nccl.cu %s NCCL ERROR '%s' !!!\n", what, ncclGetErrorString(res));
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
}


/*
 * Grid the visibilities sector by sector and reduce the per-sector
 * grids across the MPI tasks with NCCL.
 *
 * Steps:
 *   - reset the timing_wt counters and compute the resolution;
 *   - derive a node-local rank from the host-name hashes and select
 *     the CUDA device with that index;
 *   - allocate grid_gpu and gridss_gpu, create two streams and the
 *     NCCL communicator;
 *   - for every sector: pack the data of the sector into one host
 *     buffer, call wstack(), and (if size > 1) ncclReduce gridss_gpu
 *     into grid_gpu on rank (isector % size), then clear gridss_gpu;
 *   - unless CUFFTMP is defined, copy grid_gpu back to the host array
 *     grid and free the device buffers.
 *
 * Takes no arguments and returns nothing; it reads and writes
 * variables declared outside this function (resolution, timing_wt,
 * grid, grid_gpu, gridss_gpu, ...).
 */
void gridding_data(){

  double shift = (double)(dx*yaxis);

  timing_wt.kernel     = 0.0;
  timing_wt.reduce     = 0.0;
  timing_wt.reduce_mpi = 0.0;
  timing_wt.reduce_sh  = 0.0;
  timing_wt.compose    = 0.0;

  // calculate the resolution in radians
  resolution = 1.0/MAX(fabs(metaData.uvmin),fabs(metaData.uvmax));

  // calculate the resolution in arcsec
  double resolution_asec = (3600.0*180.0)/MAX(fabs(metaData.uvmin),fabs(metaData.uvmax))/PI;
  if ( rank == 0 )
    printf("RESOLUTION = %f rad, %f arcsec\n", resolution, resolution_asec);

  // Initialize nccl

  // The local rank is the number of lower-numbered MPI tasks whose
  // host name hashes to the same value as ours.
  int local_rank = 0;

  uint64_t *hostHashs = (uint64_t*) malloc( (size_t)size*sizeof(uint64_t) );
  if ( hostHashs == NULL )
    shutdown_wstacking(NOT_ENOUGH_MEM_STACKING, "Not enough memory for stacking", __FILE__, __LINE__);

  char hostname[1024] = "";
  getHostName(hostname, 1024);
  hostHashs[rank] = getHostHash(hostname);
  MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD);
  for (int p = 0; p < rank; p++)
    if (hostHashs[p] == hostHashs[rank]) local_rank++;
  free(hostHashs);

  ncclUniqueId id;
  ncclComm_t comm;
  cudaError_t nnn;
  cudaStream_t stream_reduce, stream_stacking;

  // rank 0 creates the NCCL id, every other task receives it
  if (rank == 0)
    gridding_nccl_check_nccl(ncclGetUniqueId(&id), "ncclGetUniqueId");
  MPI_Bcast((void *)&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD);

  gridding_nccl_report_cuda(cudaSetDevice(local_rank), "cudaSetDevice");

  long long unsigned size_finta = (long long unsigned)(2*(long long unsigned)param.num_w_planes*(long long unsigned)xaxis*(long long unsigned)yaxis);
  const size_t grid_bytes = (size_t)(size_finta*sizeof(double));

  nnn = cudaMalloc(&grid_gpu, grid_bytes);
  if (nnn != cudaSuccess)
    {
      gridding_nccl_report_cuda(nnn, "cudaMalloc &grid_gpu");
      shutdown_wstacking(NOT_ENOUGH_MEM_STACKING, "Not enough device memory for stacking", __FILE__, __LINE__);
    }

  nnn = cudaMalloc(&gridss_gpu, grid_bytes);
  if (nnn != cudaSuccess)
    {
      gridding_nccl_report_cuda(nnn, "cudaMalloc &gridss_gpu");
      shutdown_wstacking(NOT_ENOUGH_MEM_STACKING, "Not enough device memory for stacking", __FILE__, __LINE__);
    }

  // cudaMalloc does not clear the memory. gridss_gpu is cleared after
  // every reduce below, so it must also start from zero; grid_gpu is
  // only written on the root of a reduce and is copied back by every task.
  gridding_nccl_report_cuda(cudaMemset(gridss_gpu, 0, grid_bytes), "cudaMemset gridss_gpu");
  gridding_nccl_report_cuda(cudaMemset(grid_gpu, 0, grid_bytes), "cudaMemset grid_gpu");

  nnn = cudaStreamCreate(&stream_reduce);
  gridding_nccl_report_cuda(nnn, "cudaStreamCreate &stream_reduce");

  nnn = cudaStreamCreate(&stream_stacking);
  gridding_nccl_report_cuda(nnn, "cudaStreamCreate &stream_stacking");

  gridding_nccl_check_nccl(ncclCommInitRank(&comm, size, id, rank), "ncclCommInitRank");

  for (uint isector = 0; isector < nsectors; isector++)
    {

      double start = CPU_TIME_wt;

      uint   Nsec      = histo_send[isector];
      // sizes are computed in size_t so that large sectors do not wrap
      size_t Nweightss = (size_t)Nsec*metaData.polarisations;
      size_t Nvissec   = Nweightss*metaData.freq_per_chan;
      size_t membytes  = ((size_t)Nsec*3)*sizeof(double_t) +
                         (Nvissec*2+Nweightss)*sizeof(float_t);

      // one host buffer holds, in this order: uu, vv, ww (double),
      // weights, real and imaginary visibilities (float).
      // An empty sector still allocates one byte, so that a NULL from
      // malloc(0) is not mistaken for an out-of-memory condition.
      double_t *memory = (double_t*) malloc ( membytes > 0 ? membytes : 1 );

      if ( memory == NULL )
        shutdown_wstacking(NOT_ENOUGH_MEM_STACKING, "Not enough memory for stacking", __FILE__, __LINE__);

      double_t *uus      = memory;
      double_t *vvs      = uus + Nsec;
      double_t *wws      = vvs + Nsec;
      float_t  *weightss = (float_t*)(wws + Nsec);
      float_t  *visreals = weightss + Nweightss;
      float_t  *visimgs  = visreals + Nvissec;

      // select data for this sector
      size_t icount = 0;
      size_t ip     = 0;
      size_t inu    = 0;

      const size_t vis_per_point = (size_t)metaData.polarisations*metaData.freq_per_chan;

      #warning "this loop should be threaded"
      // walk the sector entries from the last one to the first one
      for (uint iphi = Nsec; iphi-- > 0; )
        {
          size_t ilocal = sectorarray[isector][iphi];

          uus[icount] = data.uu[ilocal];
          vvs[icount] = data.vv[ilocal]-isector*shift;
          wws[icount] = data.ww[ilocal];
          for (uint ipol=0; ipol<metaData.polarisations; ipol++)
            {
              weightss[ip] = data.weights[ilocal*metaData.polarisations+ipol];
              ip++;
            }
          for (size_t ifreq=0; ifreq<vis_per_point; ifreq++)
            {
              visreals[inu] = data.visreal[ilocal*vis_per_point+ifreq];
              visimgs[inu]  = data.visimg[ilocal*vis_per_point+ifreq];
              inu++;
            }
          icount++;
        }

      // extrema of the selected u and v coordinates
      double uumin = 1e20;
      double vvmin = 1e20;
      double uumax = -1e20;
      double vvmax = -1e20;

      #pragma omp parallel for reduction( min: uumin, vvmin) reduction( max: uumax, vvmax) num_threads(param.num_threads)
      for (uint ipart=0; ipart<Nsec; ipart++)
        {
          uumin = MIN(uumin, uus[ipart]);
          uumax = MAX(uumax, uus[ipart]);
          vvmin = MIN(vvmin, vvs[ipart]);
          vvmax = MAX(vvmax, vvs[ipart]);
        }

      timing_wt.compose += CPU_TIME_wt - start;

      //printf("UU, VV, min, max = %f %f %f %f\n", uumin, uumax, vvmin, vvmax);

      // Make convolution on the grid

     #ifdef VERBOSE
      printf("Processing sector %u\n",isector);
     #endif

      start = CPU_TIME_wt;

     //We have to call different GPUs per MPI task!!! [GL]
#ifdef CUDACC
      wstack((long long unsigned)param.num_w_planes,
             Nsec,
             metaData.freq_per_chan,
             metaData.polarisations,
             uus,
             vvs,
             wws,
             visreals,
             visimgs,
             weightss,
             dx,
             dw,
             param.w_support,
             (long long unsigned)xaxis,
             (long long unsigned)yaxis,
             gridss_gpu,
             param.num_threads,
             rank,
             stream_stacking);
#else
      wstack(param.num_w_planes,
             Nsec,
             metaData.freq_per_chan,
             metaData.polarisations,
             uus,
             vvs,
             wws,
             visreals,
             visimgs,
             weightss,
             dx,
             dw,
             param.w_support,
             xaxis,
             yaxis,
             gridss,
             param.num_threads,
             rank);
#endif

      // wstack is handed stream_stacking while the reduce below runs on
      // stream_reduce. Whether wstack waits for its own work is not
      // visible from here, so wait explicitly before gridss_gpu is read
      // (and before the host buffer is freed).
      gridding_nccl_report_cuda(cudaStreamSynchronize(stream_stacking), "cudaStreamSynchronize stream_stacking");

      timing_wt.kernel += CPU_TIME_wt - start;

     #ifdef VERBOSE
      printf("Processed sector %u\n",isector);
     #endif

      // NOTE(review): with size == 1 nothing is written to grid_gpu,
      // although grid_gpu is what is copied back below - confirm that a
      // single-task run is not expected with NCCL_REDUCE.
      if( size > 1 )
        {

          // Write grid in the corresponding remote slab

          // int target_rank = (int)isector;    it implied that size >= nsectors
          int target_rank = (int)(isector % size);

          start = CPU_TIME_wt;

          // NOTE(review): the element count is size_of_grid while the
          // buffers hold size_finta doubles - confirm that they agree.
          gridding_nccl_check_nccl(ncclReduce(gridss_gpu, grid_gpu, size_of_grid, ncclDouble, ncclSum, target_rank, comm, stream_reduce),
                                   "ncclReduce");
          gridding_nccl_report_cuda(cudaStreamSynchronize(stream_reduce), "cudaStreamSynchronize stream_reduce");

          timing_wt.reduce += CPU_TIME_wt - start;

          // Go to next sector
          nnn = cudaMemset( gridss_gpu, 0, grid_bytes );
          gridding_nccl_report_cuda(nnn, "cudaMemset");
        }

      free(memory);
    }


  //Copy data back from device to host (to be deleted in next steps)

#if !defined(CUFFTMP)
  gridding_nccl_report_cuda(cudaMemcpyAsync(grid, grid_gpu, grid_bytes, cudaMemcpyDeviceToHost, stream_reduce),
                            "cudaMemcpyAsync grid");
  gridding_nccl_report_cuda(cudaStreamSynchronize(stream_reduce), "cudaStreamSynchronize stream_reduce");
#endif

  MPI_Barrier(MPI_COMM_WORLD);

  // with CUFFTMP defined the device buffers are neither copied back nor freed here
#if !defined(CUFFTMP)
  gridding_nccl_report_cuda(cudaFree(grid_gpu), "cudaFree grid_gpu");
  gridding_nccl_report_cuda(cudaFree(gridss_gpu), "cudaFree gridss_gpu");
#endif

  gridding_nccl_report_cuda(cudaStreamDestroy(stream_reduce), "cudaStreamDestroy stream_reduce");
  gridding_nccl_report_cuda(cudaStreamDestroy(stream_stacking), "cudaStreamDestroy stream_stacking");

  gridding_nccl_check_nccl(ncclCommDestroy(comm), "ncclCommDestroy");

  return;

}

#endif
