//////////////////////////////////////////////////////////////////////////////////////////////////
//
// OpenMP GPU Offload is available only on systems with NVIDIA GPUs with compute capability '>= cc70'
//
// Assignment: write an OMP-GPU code corresponding to the
// following sequential C code
//
// #include <stdio.h>
// #define ROW 4
// #define COL 8
// int main()
// {
//   int Matrix[ROW * COL];
//
//   for (int row=0 ; row<ROW ; row++)
//      for (int col=0 ; col<COL ; col++)
//         Matrix[(row * COL) + col] = ((row * COL) + col);
//
//   return 0;
// }
//////////////////////////////////////////////////////////////////////////////////////////////////

//////////////////////////////////////////////////////////////////////////////////////////////////
// Author: David Goz
// mail  : david.goz@inaf.it
// date  : 08.07.2024
// code tested using nvhpc
//
// - Compile the code:
//   $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v classwork.c -o classwork_omp
// - Run the code:
//   $ ./classwork_omp
//////////////////////////////////////////////////////////////////////////////////////////////////

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <omp.h>

/* number of threads requested per team; also the chunk size used to derive the number of teams */
#define BLOCKSIZE 32
/* logical matrix shape; the matrix is stored as a flat array of ROW * COL elements */
#define ROW       1089
#define COL       1111

/* Disables the debug printout guarded by '#if !defined(NDEBUG)' in GPUMatrix.    */
/* NOTE: it is defined after <assert.h> has been included, so it has no effect on */
/*       the assert() macro in this translation unit.                             */
#define NDEBUG

/* element type of the matrix */
typedef int MyData;

/*
 * GPUMatrix - set Matrix[i] = i for every i in [0, dim) inside an OpenMP target region.
 *
 * Matrix : host buffer holding at least 'dim' elements. It is mapped 'from',
 *          i.e. it is only written by the kernel and copied back to the host
 *          when the target region ends; its previous content is not read.
 * dim    : number of elements to fill.
 *
 * The call is a no-op when Matrix is NULL or dim is not positive.
 */
void GPUMatrix(MyData *Matrix,
               const int dim)
{
  /* a NULL buffer or a non-positive length cannot form a valid array section */
  if ((Matrix == NULL) || (dim <= 0))
    return;

  /* one team per BLOCKSIZE-sized chunk of iterations, derived from 'dim'  */
  /* (not from the ROW/COL macros) so that any caller-supplied size works. */
  /* Written as ((dim - 1) / BLOCKSIZE) + 1 to avoid overflowing 'int'.    */
  const int nteams = ((dim - 1) / BLOCKSIZE) + 1;

  /* exploit synchronously the GPU (no 'nowait' clause)                                        */
  /* allocate a buffer of size 'dim' and copy it to host memory at the end of kernel execution */
  /* create 'nteams' teams and distribute the loop iterations across them                      */
  /* create BLOCKSIZE threads per each team                                                    */

#pragma omp target                                  \
            teams distribute num_teams(nteams)      \
            parallel for num_threads(BLOCKSIZE)     \
            map(from: Matrix[0:dim])
  for (int index=0 ; index<dim ; index++) /* signed index: same type as 'dim' */
    {
#if !defined(NDEBUG)

      /* debug only: the first thread of the first team reports the launch geometry */
      const int nteam  = omp_get_num_teams();
      const int team   = omp_get_team_num();
      const int tid    = omp_get_thread_num();
      const int nthr   = omp_get_num_threads();
      const int whoAmI = tid + (team * nthr);

      if (!omp_is_initial_device() && !whoAmI)
        {
          printf("\n\t GPU is running:");
          printf("\n\t\t Number of teams: %d - Number of threads per team: %d\n\n", nteam, nthr);
        }

#endif /* NDEBUG */

      Matrix[index] = (MyData)index;
    } /* end-target region */

  return;
}

/*
 * Allocate a ROW x COL matrix on the host, fill it through GPUMatrix() and
 * verify that every element equals its own linear index.
 *
 * Returns EXIT_SUCCESS when the result is correct, EXIT_FAILURE when the
 * allocation fails or the verification detects a mismatch.
 */
int main(void)
{
  /* host matrix . 1D mapping matrix[i][j] ---> matrix[(i * COL) + j] */
  const size_t nelem = (size_t)ROW * (size_t)COL;
  MyData *matrix = malloc(nelem * sizeof(*matrix));

  /* explicit check: unlike assert(), it cannot be compiled out */
  if (matrix == NULL)
    {
      fprintf(stderr, "\n\t Cannot allocate the host matrix \n\n");
      return EXIT_FAILURE;
    }

  // kernel
  GPUMatrix(matrix, (ROW * COL));

  // check the result
  int flag = 0;
  for (size_t row=0 ; row<ROW ; row++)
    {
      const size_t i = (row * COL);

      for (size_t col=0 ; col<COL ; col++)
        {
          /* compare in the element type to avoid a signed/unsigned comparison */
          if (matrix[i + col] != (MyData)(i + col))
            flag = -1;
        } /* col loop */
    } /* row loop */

  // free host memory
  free(matrix);

  if (flag)
    printf("\n\t Result wrong! \n\n");
  else
    printf("\n\t Result OK! \n\n");

  /* propagate the outcome of the check to the caller */
  return (flag ? EXIT_FAILURE : EXIT_SUCCESS);
}
