//////////////////////////////////////////////////////////////////////////////////////////////////
// Assignment : write a CUDA code that does:
//             - each thread generates a pair of points as (x, y) coordinates randomly distributed;
//             - each thread computes the euclidean distance d = sqrt((x1 - x2)^2 + (y1 - y2)^2);
//             - the kernel prints the maximum distance;
//             - use one CUDA block and shared memory within the block
//////////////////////////////////////////////////////////////////////////////////////////////////

//////////////////////////////////////////////////////////////////////////////////////////////////
// Author: David Goz
// mail  : david.goz@inaf.it
// date  : 06.07.2024
// code tested using nvhpc
//
// - Compile the code:
//   $ nvc++ classwork.cu -o classwork -lm
// - Run the code:
//   $ ./classwork
//////////////////////////////////////////////////////////////////////////////////////////////////

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <assert.h>
#include <cuda.h>
#include <cuda_runtime.h>

/* number of point pairs allocated and initialised on the host in main() */
#define NUMPOINTS  1024
/* threads per block used for the kernel launch; it is also the length of */
/* the statically allocated shared-memory array inside the kernel         */
#define BLOCKSIZE  NUMPOINTS

/* compile-time guard: the code assumes exactly one thread per pair of points */
#if NUMPOINTS != BLOCKSIZE
#error NUMPOINTS must be equal to BLOCKSIZE
#endif

/* compile-time guard on the block size                                    */
/* NOTE(review): 1024 is presumably the per-block thread limit of the      */
/* targeted CUDA devices - confirm against the device's compute capability */
#if BLOCKSIZE > 1024
#error BLOCKSIZE cannot be larger than 1024
#endif

/* square of A; the argument is expanded (hence evaluated) twice, so do */
/* not pass expressions with side effects                                */
#define SQUARE(A)      ((A) * (A))

/* floating-point type used for coordinates and distances */
typedef double MyData;

/* A pair of 2D points: point k (k = 0, 1) has coordinates (x[k], y[k]). */
typedef struct PairPoints
{
  /* x coordinates of the two points */
  MyData x[2];
  /* y coordinates of the two points */
  MyData y[2];
} PairPoints;

/*
 * GPUDistance - print the maximum euclidean distance among 'size' pairs of points.
 *
 * points : device pointer to an array of at least 'size' PairPoints
 * size   : number of pairs stored in 'points'; nothing is printed if size <= 0
 *
 * Launch layout: a one-dimensional block is expected. Only the block with
 * blockIdx.x == 0 does any work, every other block exits immediately.
 * At most BLOCKSIZE threads of that block take part; each one walks the
 * input with a block-wide stride, so 'size' may be smaller or larger than
 * the number of threads.
 *
 * Shared memory: BLOCKSIZE * sizeof(MyData) bytes, statically allocated.
 *
 * Output: thread 0 prints the maximum distance with the device-side printf.
 */
__global__ void GPUDistance(const PairPoints *const points,
                            const int               size)
{
  // shared-block memory statically allocated: one partial maximum per thread
  __shared__ MyData distance[BLOCKSIZE];

  /* both tests give the same answer for every thread of a block, so a block */
  /* leaves as a whole and no thread skips a barrier that its siblings reach */
  if ((blockIdx.x != 0) || (size <= 0))
    return;

  /* local (i.e. within the block) thread's ID */
  const int localID = threadIdx.x;
  /* number of threads owning a slot in distance[] */
  const int threads = (int)blockDim.x;
  const int workers = ((threads < BLOCKSIZE) ? threads : BLOCKSIZE);

  if (localID < workers)
    {
      /* -1 is smaller than any distance: it marks "no pair seen" */
      MyData my_max = -1.0;

      /* block-stride loop: consecutive threads read consecutive pairs */
      for (size_t i=(size_t)localID ; i<(size_t)size ; i+=(size_t)workers)
        {
          const PairPoints myPoints = points[i];
          const MyData pair_distance_X2 = SQUARE(myPoints.x[0] - myPoints.x[1]);
          const MyData pair_distance_Y2 = SQUARE(myPoints.y[0] - myPoints.y[1]);
          const MyData pair_distance    = sqrt(pair_distance_X2 + pair_distance_Y2);

          my_max = ((my_max < pair_distance) ? pair_distance : my_max);
        }

      /* store the per-thread maximum in shared memory */
      distance[localID] = my_max;
    }

  // block level synchronization barrier: all partial maxima are now visible
  __syncthreads();

  /* tree reduction in shared memory, valid for any number of slots:        */
  /* each step folds the upper part [half, active) onto the lower part.     */
  /* 'active' is identical in all threads, so every thread executes the     */
  /* same number of barriers                                                */
  int active = workers;
  while (active > 1)
    {
      const int half = ((active + 1) / 2);

      if (localID < (active - half))
        {
          const MyData other = distance[localID + half];
          if (distance[localID] < other)
            distance[localID] = other;
        }

      /* written slots [0, active - half) never overlap the slots read, */
      /* [half, active); the barrier separates one step from the next   */
      __syncthreads();

      active = half;
    }

  /* the master thread within the block reports the max */
  if (localID == 0)
    printf("\t GPU maximum distance: %lg\n", distance[0]);

  return;
}

/*
 * CPUMaxDistance - host reference: print the maximum euclidean distance
 *                  among 'size' pairs of points.
 *
 * points : host pointer to an array of at least 'size' PairPoints
 * size   : number of pairs to inspect; if size <= 0 no pair is read and
 *          the sentinel value -1 is printed
 */
void CPUMaxDistance(const PairPoints *const points,
                    const        int        size)
{
  /* -1 is smaller than any distance: it marks "no pair seen" */
  MyData distance = -1.0;

  /* the index is signed on purpose: comparing a size_t counter with a   */
  /* negative 'size' would convert 'size' to a huge unsigned value and   */
  /* the loop would run past the end of the array                        */
  for (int i=0 ; i<size ; i++)
    {
      const MyData pair_distance_X2 = SQUARE(points[i].x[0] - points[i].x[1]);
      const MyData pair_distance_Y2 = SQUARE(points[i].y[0] - points[i].y[1]);
      const MyData pair_distance    = sqrt(pair_distance_X2 + pair_distance_Y2);

      distance = ((distance < pair_distance) ? pair_distance : distance);
    }

  printf("\n\t CPU maximum distance: %lg\n", distance);

  return;
}

/* Abort the program with a diagnostic if a CUDA runtime call failed. */
/* 'what' names the operation that produced 'status'.                 */
static void checkCuda(const cudaError_t status, const char *const what)
{
  if (status != cudaSuccess)
    {
      fprintf(stderr, "\n\t CUDA error (%s): %s\n", what, cudaGetErrorString(status));
      exit(EXIT_FAILURE);
    }

  return;
}

/*
 * Generates NUMPOINTS random pairs of points on the host, copies them to
 * the device, launches GPUDistance on a single block and prints the CPU
 * reference result for comparison.
 * Returns 0 on success; exits with EXIT_FAILURE on allocation or CUDA errors.
 */
int main()
{
  /* host allocation (checked explicitly: assert() vanishes with NDEBUG) */
  PairPoints *h_points = (PairPoints *)malloc(NUMPOINTS * sizeof(*h_points));
  if (h_points == NULL)
    {
      fprintf(stderr, "\n\t Cannot allocate host memory\n");
      return EXIT_FAILURE;
    }

  /* device allocation */
  PairPoints *d_points = NULL;
  checkCuda(cudaMalloc((void **)&d_points, (NUMPOINTS * sizeof(*d_points))), "cudaMalloc");

  /* initialization: coordinates uniformly distributed in [0, 1) */
  srand48(time(NULL));
  for(size_t i=0 ; i<NUMPOINTS ; i++)
    {
      h_points[i].x[0] = drand48();
      h_points[i].x[1] = drand48();
      h_points[i].y[0] = drand48();
      h_points[i].y[1] = drand48();
    }

  /* copy data to the device's memory */
  checkCuda(cudaMemcpy(d_points, h_points, (NUMPOINTS * sizeof(*d_points)), cudaMemcpyHostToDevice),
            "cudaMemcpy host to device");

  /* one block of BLOCKSIZE threads */
  const dim3 grid(1, 1, 1);
  const dim3 block(BLOCKSIZE, 1, 1);

  // launch kernel
  GPUDistance<<< grid, block >>>(d_points, NUMPOINTS);
  /* a kernel launch returns nothing: configuration errors are reported here */
  checkCuda(cudaGetLastError(), "GPUDistance launch");

  // check the result on the host while the kernel is executing on the GPU
  CPUMaxDistance(h_points, NUMPOINTS);

  free(h_points);

  // device synchronization: also reports errors raised while the kernel ran
  checkCuda(cudaDeviceSynchronize(), "GPUDistance execution");

  // free memory
  checkCuda(cudaFree(d_points), "cudaFree");

  return 0;
}
