//////////////////////////////////////////////////////////////////////////////////////////////////
//
// OpenMP GPU Offload is available only on systems with NVIDIA GPUs with compute capability '>= cc70'
//
// Assignment: write an OMP-GPU code corresponding to the
// following sequential C code
//
// #include <stdio.h>
// int main()
// {
//   /* Matrix multiplication: C = A X B */
//
//   int A[N][N], B[N][N], C[N][N];
//
//   for (int ii=0 ; ii<N ; ii++)
//      for (int jj=0 ; jj<N ; jj++)
//         for (int kk=0 ; kk<N ; kk++)
//            C[ii][jj] += A[ii][kk] * B[kk][jj];
//
//   return 0;
// }
//////////////////////////////////////////////////////////////////////////////////////////////////

//////////////////////////////////////////////////////////////////////////////////////////////////
// Author: David Goz
// mail  : david.goz@inaf.it
// date  : 08.07.2024
// code tested using nvhpc
//
// - Compile the code:
//   $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v classwork.c -o classwork_omp
// - Run the code:
//   $ ./classwork_omp <N>
//////////////////////////////////////////////////////////////////////////////////////////////////

#include <assert.h>
#include <errno.h>
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
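// Uncomment to compile out the per-thread debug printf in GPU_MM.
// Note: being placed after the inclusion of <assert.h>, it does not disable assert().
//#define NDEBUG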

/* Threads requested per team in GPU_MM; also the divisor used there to size the number of teams. */
#define BLOCKSIZE 32
/* Element count of a SIZE x SIZE matrix. The argument is expanded twice, so pass side-effect-free expressions. */
#define SQUARE(SIZE) ((SIZE) * (SIZE))

/* Integer type used for matrix elements, sizes and loop indices throughout this file. */
typedef long int MyData;

/*
 * Fills the two input matrices stored back to back in one buffer.
 *
 * AB : buffer holding at least 2*N*N elements; the first N*N elements
 *      are matrix A, the following N*N elements are matrix B
 * N  : matrix order
 *
 * On return A[row][col] = row - col and every element of B is 1
 * (both matrices are stored row-major).
 */
void matrix_init(      MyData *const AB,
		 const MyData        N)
{
  MyData *const restrict first  = AB;             /* matrix A */
  MyData *const restrict second = AB + SQUARE(N); /* matrix B */

  for (MyData row = 0; row < N; row++)
    {
      /* linear index of the first element of the current row */
      const MyData offset = row * N;

      for (MyData col = 0; col < N; col++)
        {
          first[offset + col]  = (row - col);
          second[offset + col] = 1;
        }
    }
}

/*
 * Computes C = A x B for N x N matrices stored row-major in contiguous
 * arrays, offloading the (ii, jj) iteration space with OpenMP target.
 *
 * A, B : input matrices (N*N elements each), only read
 * C    : output matrix (N*N elements); every element is overwritten
 * N    : matrix order
 *
 * No map clause is given on the target construct: the caller is expected
 * to have made A, B and C present on the device beforehand (main does it
 * with 'omp target enter data') and to copy C back afterwards.
 *
 * Unless NDEBUG is defined, every (ii, jj) iteration prints its thread
 * coordinates, i.e. N^2 lines of output.
 */
void GPU_MM(const MyData *const restrict A,
	    const MyData *const restrict B,
	          MyData *const restrict C,
	    const MyData                 N)
{
  /* requesting ceil(N^2 / BLOCKSIZE) teams of BLOCKSIZE threads each (about N^2 threads) */
  /* mapping is not specified because we rely on the 'omp target enter/exit data' constructs */
  /* the combined directive name is kept contiguous and the clauses follow it, */
  /* as required by the OpenMP directive syntax                                */

#pragma omp target teams distribute parallel for collapse(2)          \
            num_teams((SQUARE(N) + BLOCKSIZE - 1) / BLOCKSIZE)        \
            num_threads(BLOCKSIZE)
  for (MyData ii=0 ; ii<N ; ii++)
    {
      for (MyData jj=0 ; jj<N ; jj++)
	{
#if !defined(NDEBUG)

	  /* ID within the team (CUDA block) */
	  const MyData localID  = omp_get_thread_num();
	  /* Team ID (ID CUDA block) */
	  const MyData team     = omp_get_team_num();
	  /* Team size (CUDA block size) */
	  const MyData nthr     = omp_get_num_threads();
	  /* global thread index */
	  const MyData globalID = (localID + (team * nthr));

	  printf("\n\t globalID: %ld - ii: %ld - jj: %ld\n", globalID, ii, jj);

#endif /* NDEBUG */

	  /* dot product of row ii of A with column jj of B, accumulated */
	  /* in a local variable so that C is written exactly once       */
	  MyData sum = 0;
	  for (MyData kk=0 ; kk<N ; kk++)
	    {
	      sum += (A[(ii * N) + kk] * B[(kk * N) + jj]);
	    } /* kk loop */
	  C[(ii * N) + jj] = sum;
	} /* jj loop */
    } /* ii loop */

  return;
}

/*
 * Validates the product matrix and prints "Result OK" or "Result WRONG".
 *
 * GPU_MM : N x N result matrix to validate (row-major)
 * A      : N x N left operand of the product (row-major)
 * N      : matrix order
 *
 * Every element of row ii of the result is compared with the sum of the
 * elements of row ii of A. That is the expected product only when every
 * element of B is 1, which is how matrix_init fills B.
 */
void check(const MyData *const restrict GPU_MM,
	   const MyData *const restrict A,
	   const MyData                 N)
{
  int mismatch = 0;

  /* the scan stops at the first wrong element: one is enough to fail */
  for (MyData ii = 0; (ii < N) && !mismatch; ii++)
    {
      /* expected value shared by all the elements of row ii */
      MyData row_a = 0;
      for (MyData kk = 0; kk < N; kk++)
        {
          row_a += A[(ii * N) + kk];
        }

      for (MyData jj = 0; jj < N; jj++)
        {
          if (GPU_MM[(ii * N) + jj] != row_a)
            {
              mismatch = 1;
              break;
            }
        }
    }

  if (mismatch)
    {
      printf("\n\t Result WRONG \n\n");
    }
  else
    {
      printf("\n\t Result OK \n\n");
    }

  return;
}

int main(int arg, char *argv[])
{
  MyData N;
  
  if (arg < 2)
    {
      printf("\n\t Usage: $ ./classwork <number_of_matrix_elements> \n");
      exit(EXIT_FAILURE);
    }
  else
    {
      N = atoi(argv[1]);
      if (N <= 0)
	{
	  printf("\n\t Number of matrix elements must be greater than zero \n");
	  exit(EXIT_FAILURE);
	}
      else
	{
	  printf("\n\t Matrix size    : %d x %d", N, N);
	}
    }

  /* host memory allocation */
  MyData *buffer = (MyData *)malloc(3 * SQUARE(N) * sizeof(MyData));
  assert(buffer != NULL);
  /* set-up host pointers */
  MyData *const restrict A = buffer;
  MyData *const restrict B = A + SQUARE(N);
  MyData *const restrict C = B + SQUARE(N);
  
  // matrix initialization
  matrix_init(buffer, N);

  /* device memory allocation and copy */
#pragma omp target enter data map(to   : A[0:SQUARE(N)]) \
                              map(to   : B[0:SQUARE(N)]) \
                              map(alloc: C[0:SQUARE(N)])

  GPU_MM(A, B, C, N);

  /* host-device data synchronization */
#pragma omp target exit data map(from: C[0:SQUARE(N)])

  // check the result
  check(C, A, N);

  free(buffer);
  
  return 0;
}
