//////////////////////////////////////////////////////////////////////////////////////////////////
//
// OpenMP GPU Offload is available only on systems with NVIDIA GPUs with compute capability '>= cc70'
//
// Assignment : write an OMP-GPU code that does:
//             - each thread generates a pair of points as (x, y) coordinates randomly distributed;
//             - each thread computes the euclidean distance d = sqrt((x1 - x2)^2 + (y1 - y2)^2);
//             - the kernel prints the maximum distance;
//             - the host manages the GPU asynchronously.
//////////////////////////////////////////////////////////////////////////////////////////////////

//////////////////////////////////////////////////////////////////////////////////////////////////
// Author: David Goz
// mail  : david.goz@inaf.it
// date  : 06.07.2024
// code tested using nvhpc
//
// - Compile the code:
//   $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v classwork_async.c -o classwork_async_omp -lm
// - Run the code:
//   $ ./classwork_async_omp
//////////////////////////////////////////////////////////////////////////////////////////////////

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>
#include <time.h>
#include <assert.h>

/* number of point pairs allocated in main and handed to both the GPU and the CPU routines */
#define NUMPOINTS  65536
/* square of A; the argument is expanded twice, so it must be free of side effects */
#define SQUARE(A)  ((A) * (A))

/* NDEBUG compiles out the '#if !defined(NDEBUG)' diagnostic block in GPUMaxDistance. */
/* NOTE: it is defined after <assert.h> has already been included, so this definition */
/* does not by itself disable the assert() macro in this file.                        */
#define NDEBUG

/* floating-point type used for the coordinates and for the distances */
typedef double MyData;

/* A pair of 2D points: point k (k = 0, 1) has coordinates (x[k], y[k]). */
typedef struct PairPoints
{
  MyData x[2];   /* x coordinates of the two points */
  MyData y[2];   /* y coordinates of the two points */
} PairPoints;

/* global host-device variable (i.e. present in both global memories):  */
/* a copy of 'DeviceMaxDistance' is created in the global memory of the */
/* GPU from the beginning of the program till its end. The kernel in    */
/* GPUMaxDistance reduces the maximum distance into it, and the host    */
/* copy is refreshed through a 'target update from' directive.          */
#pragma omp declare target
MyData DeviceMaxDistance;
#pragma omp end declare target

/*
 * Asynchronously computes, on the GPU, the maximum euclidean distance among
 * the pairs of points and fetches the result into the host copy of the
 * global variable 'DeviceMaxDistance'.
 *
 *   points : host array of pairs of points (read-only, copied to the device)
 *   size   : number of elements of 'points'
 *
 * The function returns immediately: both the kernel and the device-to-host
 * update are deferred target tasks. The caller must:
 *   - keep 'points' alive and unmodified until the tasks have completed;
 *   - synchronize (e.g. '#pragma omp taskwait') before reading
 *     'DeviceMaxDistance' on the host.
 * Nothing is offloaded when 'points' is NULL or 'size' is not positive.
 */
void GPUMaxDistance(const PairPoints *const points,
		    const int               size)
{
  /* an empty or invalid array section must not reach the map clause */
  if ((points == NULL) || (size <= 0))
    return;

  /* All the clauses follow the complete combined directive name.          */
  /* 'nowait' turns the target region into a deferred task, 'depend(out:)' */
  /* marks it as the producer of 'DeviceMaxDistance'.                      */
#pragma omp target teams distribute parallel for \
        nowait depend(out: DeviceMaxDistance)    \
        reduction(max: DeviceMaxDistance)        \
        map(to: points[0:size])
  for (int i=0 ; i<size ; i++)
    {
#if !defined(NDEBUG)

      const int nteam  = omp_get_num_teams();
      const int team   = omp_get_team_num();
      const int tid    = omp_get_thread_num();
      const int nthr   = omp_get_num_threads();
      const int whoAmI = tid + (team * nthr);

      /* only the very first thread of the first team reports the GPU configuration */
      if (!omp_is_initial_device() && !whoAmI)
	{
	  printf("\n\t GPU is running:");
	  printf("\n\t\t Number of teams: %d - Number of threads per team: %d\n\n", nteam, nthr);
	}

#endif /* NDEBUG */

      const PairPoints myPoint      = points[i];
      const MyData pair_distance_X2 = SQUARE(myPoint.x[0] - myPoint.x[1]);
      const MyData pair_distance_Y2 = SQUARE(myPoint.y[0] - myPoint.y[1]);
      const MyData pair_distance    = sqrt(pair_distance_X2 + pair_distance_Y2);

      /* max-reduction: each thread updates its private copy of the reduction variable */
      DeviceMaxDistance = ((DeviceMaxDistance < pair_distance) ? pair_distance : DeviceMaxDistance);
    } /* end-target region */

  /* data synchronization host-device, the host fetches data from the GPU. */
  /* The 'nowait' clause lets the update execute asynchronously with       */
  /* respect to the encountering host thread; 'depend(in:)' orders it      */
  /* after the kernel above, otherwise the two deferred tasks would be     */
  /* unordered and the update could fetch the value before the reduction   */
  /* has completed.                                                        */
#pragma omp target update from(DeviceMaxDistance) nowait depend(in: DeviceMaxDistance)

  return;
}

/*
 * Host reference implementation: sequentially scans the 'size' pairs stored
 * in 'points', keeps the largest euclidean distance between the two points
 * of a pair and prints it on stdout. The running maximum starts from -1.0,
 * which is the value printed when no pair is visited.
 */
void CPUMaxDistance(const PairPoints *const points,
		    const        int        size)
{
  MyData best = -1.0;
  size_t idx  = 0;

  while (idx < size)
    {
      const PairPoints *const pair = &points[idx];

      /* coordinate differences between the two points of the pair */
      const MyData dx  = pair->x[0] - pair->x[1];
      const MyData dy  = pair->y[0] - pair->y[1];
      const MyData dx2 = dx * dx;
      const MyData dy2 = dy * dy;
      const MyData len = sqrt(dx2 + dy2);

      if (best < len)
	best = len;

      idx++;
    }

  printf("\n\t CPU maximum distance: %lg\n", best);
}

/*
 * Program entry point:
 *   - allocates and randomly initializes NUMPOINTS pairs of points;
 *   - launches the GPU computation without waiting for it;
 *   - computes and prints the reference result on the CPU in the meantime;
 *   - waits for the GPU tasks and prints the GPU result.
 * Returns 0 on success, EXIT_FAILURE if the host allocation fails.
 */
int main(void)
{
  /* host allocation */
  PairPoints *points = malloc(NUMPOINTS * sizeof(*points));
  if (points == NULL)
    {
      /* explicit check: unlike assert(), it cannot be compiled out by NDEBUG */
      fprintf(stderr, "\n\t Cannot allocate %zu bytes for the points ... aborting ...\n",
	      (size_t)NUMPOINTS * sizeof(*points));
      return EXIT_FAILURE;
    }

  /* initialization: coordinates uniformly distributed in [0, 1) */
  srand48(time(NULL));
  for(size_t i=0 ; i<NUMPOINTS ; i++)
    {
      points[i].x[0] = drand48();
      points[i].x[1] = drand48();
      points[i].y[0] = drand48();
      points[i].y[1] = drand48();
    }

  /* launch the kernel asynchronously (i.e. the host does */
  /* not wait for the completion of the GPU execution)    */
  GPUMaxDistance(points, NUMPOINTS);

  /* check the result on the host while the GPU may still be working */
  CPUMaxDistance(points, NUMPOINTS);

  /* host-device synchronization in order to ensure that the */
  /* kernel has been executed on the GPU and the global      */
  /* variable 'DeviceMaxDistance' is synchronized between    */
  /* host and device data environment.                       */
  #pragma omp taskwait

  printf("\n\t GPU maximum distance: %lg\n", DeviceMaxDistance);

  /* 'points' can be released only now: the GPU tasks that read it have completed */
  free(points);

  return 0;
}
