////////////////////////////////////////////////////////////////////////////////////////////////
// - You are given a 512 x 512 integer matrix M;
// - Each row is assigned to a thread-block;
// - Each thread is assigned a matrix element M[i][j];
// - It changes M[i][j] to M[i][j] + M[i][j+1] (where possible);
// - Exploit shared-memory.
////////////////////////////////////////////////////////////////////////////////////////////////

//////////////////////////////////////////////////////////////////////////////////////////////////
// Author: David Goz
// mail  : david.goz@inaf.it
// date  : 14.07.2024
// code tested using nvhpc
//
// - Compile the code:
//   $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v classwork.c -o classwork_omp
// - Run the code:
//   $ ./classwork_omp
//////////////////////////////////////////////////////////////////////////////////////////////////

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#include <unistd.h>

#include <omp.h>

// Compile-time configuration of the exercise.
// N is the number of rows and of columns of the square matrix; the matrix is
// stored as a flat array of SIZE elements indexed as (i * N) + j.
#define N                     512
#define SIZE                  (N * N) // matrix size
typedef int64_t MyData;               // do not change
// BLOCKSIZE also sizes the per-team staging buffer (BLOCKSIZE + 1 elements)
// used by the shared-memory kernels.
#define BLOCKSIZE             N       // number of threads per block

// sanity check
// NOTE(review): 1024 is presumably the maximum number of threads per block
// supported by the target GPU - confirm for the device in use.
#if BLOCKSIZE > 1024
#error BLOCKSIZE cannot be larger than 1024
#endif

// The kernels request N threads per team and use the thread index as the
// column index, so one thread per column is required.
#if BLOCKSIZE != N
#error BLOCKSIZE must be equal to N
#endif

// Number of timed repetitions of each implementation; the reported time is
// the average over LOOP runs.
#define LOOP 100
// When NDEBUG is defined the debug printf sections inside the GPU kernels are
// compiled out. It is defined after <assert.h> has been included, so it does
// not disable the assert() calls of this file.
#define NDEBUG

/*
 * Return the current wall-clock time in seconds.
 *
 * The value is the calendar time (seconds since the epoch, TIME_UTC base)
 * with the nanosecond part folded in as a fraction. Callers are expected to
 * use only the difference between two readings to measure elapsed time.
 * A wall clock is used (rather than the CPU time consumed by the process)
 * so that time spent waiting for an offloaded region is accounted for.
 *
 * Being seconds since the epoch stored in a double, the reading resolves to
 * roughly a quarter of a microsecond.
 *
 * Returns 0.0 if the clock cannot be read.
 */
double wall_time(void)
{
  struct timespec ts = {0};

  /* timespec_get() returns the requested base on success, 0 on failure */
  if (timespec_get(&ts, TIME_UTC) != TIME_UTC)
    return 0.0;

  return ((double) ts.tv_sec + ((double) ts.tv_nsec * 1.0e-9));
}

/*
 * Host reference implementation.
 *
 * Every element of a row, except the last one, is replaced in place by the
 * sum of itself and its right-hand neighbour. Columns are visited left to
 * right, so the neighbour that is read has not been modified yet.
 *
 * matrix : N x N elements stored row by row, updated in place
 */
void CPU_matrix(MyData *const matrix)
{
  for (size_t row = 0; row < N; ++row)
    {
      /* first element of the current row */
      MyData *const line = matrix + (row * N);

      for (size_t col = 0; (col + 1) < N; ++col)
        line[col] += line[col + 1];
    }
}

/*
 * Global-memory implementation written with loops.
 *
 * For every row i and column j:
 *   matrix[i][j] = buffer[i][j] + buffer[i][j+1]   if j <  N-1
 *   matrix[i][j] = buffer[i][j]                    if j == N-1
 *
 * The i loop is distributed across teams (N teams requested) and the j loop
 * across the threads of each team (N threads requested).
 *
 * matrix : output, N x N elements stored row by row
 * buffer : input, same layout, only read
 *
 * No map clause is used here: main() calls this function inside a
 * target data region that maps both arrays.
 */
void GPU_matrix_loops(      MyData *const restrict matrix,
                      const MyData *const restrict buffer)
{
#pragma omp target teams distribute num_teams(N)
  for (size_t i=0 ; i<N ; i++)
    {
      #pragma omp parallel for num_threads(N)
      for (size_t j=0 ; j<N ; j++)
        {
#if !defined(NDEBUG)

          const int team   = omp_get_team_num();
          const int tid    = omp_get_thread_num();
          const int nthr   = omp_get_num_threads();
          const int whoAmI = tid + (team * nthr);

          /* i and j are size_t: convert them explicitly so that the */
          /* arguments match the %ld conversions of the format string */
          if (!omp_is_initial_device() && !tid)
            printf("\n\t\t Team: %d - nthr: %d - whoAmI: %d - i: %ld - j: %ld\n\n",
                   team, nthr, whoAmI, (long)i, (long)j);

#endif /* NDEBUG */

          /* the last column has no right-hand neighbour: copy it unchanged */
          matrix[(i * N) + j] = ((j < N-1)                                       ?
                                 (buffer[(i * N) + j] + buffer[(i * N) + j + 1]) :
                                 buffer[(i * N) + j]);
        } /* j loop distributed across threads within a team */
    } /* i loop distributed across teams */

  return;
}

/*
 * Global-memory implementation written without work-sharing loops.
 *
 * Each team uses its team number as the row index and each thread of the
 * team uses its thread number as the column index, so every (team, thread)
 * pair handles exactly one element:
 *   matrix[row][col] = buffer[row][col] + buffer[row][col+1]   if col <  N-1
 *   matrix[row][col] = buffer[row][col]                        if col == N-1
 *
 * N teams of N threads are requested; the result covers the whole matrix
 * only if the runtime grants them.
 *
 * matrix : output, N x N elements stored row by row
 * buffer : input, same layout, only read
 */
void GPU_matrix_no_loops(      MyData *const restrict matrix,
                         const MyData *const restrict buffer)
{
 #pragma omp target
  {
   #pragma omp teams num_teams(N)
    {
      /* one row per team */
      const size_t row = omp_get_team_num();

     #pragma omp parallel firstprivate(matrix, buffer, row) num_threads(N)
      {
        /* one column per thread */
        const size_t col = omp_get_thread_num();
        const size_t idx = (row * N) + col;

        MyData value = buffer[idx];

        /* add the right-hand neighbour, which the last column lacks */
        if (col < (N - 1))
          value += buffer[idx + 1];

        matrix[idx] = value;
      } /* omp parallel */
    } /* omp teams */
  } /* omp target */
}

/*
 * In-place implementation that stages each row in a per-team buffer,
 * written with loops.
 *
 * The i loop (rows) is distributed across teams (N teams requested). For
 * each row two parallel loops over the columns are executed by the team
 * (N threads requested):
 *   1) the row is copied into buffer[0 .. N-1] and buffer[N] is set to 0;
 *   2) matrix[i][j] is overwritten with buffer[j] + buffer[j+1]; thanks to
 *      the trailing zero the last column keeps its original value.
 * The end of the first parallel region guarantees that the buffer is
 * complete before it is read by the second one.
 *
 * buffer is declared inside the teams region, i.e. one instance per team;
 * NOTE(review): it is meant to be placed in GPU shared memory, but the
 * actual placement is decided by the compiler - confirm with -Minfo.
 *
 * matrix : N x N elements stored row by row, updated in place
 *
 * No map clause is used here: main() makes matrix present on the device
 * (target enter data / target update) before calling this function.
 */
void GPU_matrix_shared_loops(MyData *const restrict matrix)
{
  #pragma omp target teams distribute num_teams(N)
  for (size_t i=0 ; i<N ; i++)
    {
      MyData buffer[BLOCKSIZE + 1];

      #pragma omp parallel for shared(i, buffer) num_threads(N)
      for (size_t j=0 ; j<N ; j++)
        {
#if !defined(NDEBUG)

          const int team   = omp_get_team_num();
          const int tid    = omp_get_thread_num();
          const int nthr   = omp_get_num_threads();
          const int whoAmI = tid + (team * nthr);

          /* i and j are size_t: convert them explicitly so that the */
          /* arguments match the %ld conversions of the format string */
          if (!omp_is_initial_device() && !tid)
            printf("\n\t\t Team: %d - nthr: %d - whoAmI: %d - i: %ld - j: %ld\n\n",
                   team, nthr, whoAmI, (long)i, (long)j);

#endif /* NDEBUG */

          /* shared memory initialization                            */
          /* (only j == 0 reaches the extra element buffer[BLOCKSIZE]) */
          for (size_t item=j ; item<(BLOCKSIZE + 1) ; item+=BLOCKSIZE)
            buffer[item] = (MyData)0;

          /* load data into shared memory */
          const size_t globalID = ((i * N) + j);

          buffer[j] = matrix[globalID];
        } /* implicit thread-block synchronization */

      #pragma omp parallel for shared(i , buffer) num_threads(N)
      for (size_t j=0 ; j<N ; j++)
        {
          const size_t globalID = ((i * N) + j);

          matrix[globalID] = (buffer[j] + buffer[j + 1]);
        } /* implicit thread-block synchronization */
    } /* i loop distributed across teams */

  return;
}

/*
 * In-place implementation that stages each row in a per-team buffer,
 * written without work-sharing loops.
 *
 * Each team handles the row given by its team number and each thread the
 * column given by its thread number (N teams of N threads are requested).
 * Two consecutive parallel regions are used:
 *   1) every thread clears and then fills its own buffer slot; thread 0
 *      additionally clears the extra slot buffer[BLOCKSIZE];
 *   2) every thread overwrites its matrix element with
 *      buffer[col] + buffer[col + 1].
 * The end of the first region acts as the barrier between the two stages.
 *
 * buffer is declared inside the teams region, i.e. one instance per team;
 * NOTE(review): it is meant to be placed in GPU shared memory, but the
 * actual placement is decided by the compiler - confirm with -Minfo.
 *
 * matrix : N x N elements stored row by row, updated in place
 */
void GPU_matrix_shared_no_loops(MyData *const restrict matrix)
{
  #pragma omp target
  {
    #pragma omp teams num_teams(N)
    {
      MyData buffer[BLOCKSIZE + 1];

      /* offset of the first element of the row owned by this team */
      const size_t row_start = (N * omp_get_team_num());

      /* stage 1: copy the row into the staging buffer */
      #pragma omp parallel shared(buffer) firstprivate(row_start, matrix) num_threads(N)
      {
        const size_t lane = omp_get_thread_num();

        size_t slot = lane;
        while (slot < (BLOCKSIZE + 1))
          {
            buffer[slot] = (MyData)0;
            slot += BLOCKSIZE;
          }

        buffer[lane] = matrix[row_start + lane];
      }

      /* stage 2: write back the sum of each element and its neighbour */
      #pragma omp parallel shared(buffer) firstprivate(row_start, matrix) num_threads(N)
      {
        const size_t lane = omp_get_thread_num();

        matrix[row_start + lane] = (buffer[lane] + buffer[lane + 1]);
      }
    } /* omp teams */
  } /* omp target */
}

/*
 * Compare the host reference against a device result and print the verdict.
 *
 * cpu_matrix : reference result, SIZE elements
 * gpu_matrix : result to validate, SIZE elements
 * msg        : text appended to the "Result OK" / "Result wrong" message
 *
 * The result is reported as wrong as soon as one element differs.
 */
void check(const MyData *const restrict cpu_matrix,
           const MyData *const restrict gpu_matrix,
           const char   *const restrict msg)
{
  int mismatch = 0;

  /* a single differing element is enough: stop at the first one so that */
  /* later matching elements cannot hide it                              */
  for (size_t i=0 ; i<SIZE ; i++)
    if (cpu_matrix[i] != gpu_matrix[i])
      {
        mismatch = 1;
        break;
      }

  if (!mismatch)
    printf("\n\t Result OK %s", msg);
  else
    printf("\n\t Result wrong %s", msg);

  return;
}

/*
 * Driver: builds a random N x N matrix, computes the reference result on the
 * host and then runs the four device implementations. Each implementation is
 * executed LOOP times, its average time is printed and its result is
 * validated against the host reference.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE if host memory cannot be allocated.
 */
int main(void)
{
  double elapsed;

  // A          : host reference input matrix
  // matrix     : host matrix where the CPU computation is performed
  // gpu_matrix : host copy of the result of each GPU implementation
  MyData *A          = malloc(SIZE * sizeof *A);
  MyData *matrix     = malloc(SIZE * sizeof *matrix);
  MyData *gpu_matrix = malloc(SIZE * sizeof *gpu_matrix);

  // explicit check: it must not vanish when assertions are compiled out
  if ((A == NULL) || (matrix == NULL) || (gpu_matrix == NULL))
    {
      fprintf(stderr, "\n\t Cannot allocate host memory ... aborting ...\n");
      free(gpu_matrix);
      free(matrix);
      free(A);
      return EXIT_FAILURE;
    }

  for (size_t i=0 ; i<SIZE ; i++)
    A[i] = (lrand48() % SIZE);

  ////////////////////////// CPU MATRIX //////////////////////////////////////////
  // perform the actual computation
  elapsed = 0.0;
  for (unsigned short int loop=0 ; loop<LOOP ; loop++)
    {
      // CPU_matrix works in place: restore the input before every run
      memcpy(matrix, A, (SIZE * sizeof(MyData)));

      const double start = wall_time();
      CPU_matrix(matrix);
      elapsed += (wall_time() - start);
    }
  printf("\n\t CPU time %lg [s]\n", (elapsed / LOOP));
  ////////////////////////////////////////////////////////////////////////////////

  ////////////////////////////// GPU MATRIX //////////////////////////////////////
  // A is copied to the device on entry, gpu_matrix is copied back on exit
#pragma omp target data map(to: A[0:SIZE]) map(from: gpu_matrix[0:SIZE])
  {
    elapsed = 0.0;
    for (unsigned short int loop=0 ; loop<LOOP ; loop++)
      {
        double start = wall_time();
        GPU_matrix_loops(gpu_matrix, A);
        elapsed += (wall_time() - start);
      }
  } /* omp target data */

  check(matrix, gpu_matrix, "\t global memory implementation with loops --->");
  printf("\n\t GPU time  %lg [s]\n", (elapsed / LOOP));
  //////////////////////////////////////////////////////////////////////////////////////////

  ////////////////////////////// GPU MATRIX //////////////////////////////////////

#pragma omp target data map(to: A[0:SIZE]) map(from: gpu_matrix[0:SIZE])
  {
    elapsed = 0.0;
    for (unsigned short int loop=0 ; loop<LOOP ; loop++)
      {
        double start = wall_time();
        GPU_matrix_no_loops(gpu_matrix, A);
        elapsed += (wall_time() - start);
      }
  } /* omp target data */

  check(matrix, gpu_matrix, "\t global memory implementation without loops --->");
  printf("\n\t GPU time  %lg [s]\n", (elapsed / LOOP));
  //////////////////////////////////////////////////////////////////////////////////////////

  ////////////////////////// GPU MATRIX shared memory //////////////////////////////////////
  // the shared-memory kernels work in place: gpu_matrix holds the input
  memcpy(gpu_matrix, A, (SIZE * sizeof(MyData)));
  /* gpu_matrix is allocated on the GPU */
  #pragma omp target enter data map(alloc: gpu_matrix[0:SIZE])

  elapsed = 0.0;
  for (unsigned short int loop=0 ; loop<LOOP ; loop++)
    {
      // restore data into gpu_matrix
      #pragma omp target update to(gpu_matrix[0:SIZE])

      double start = wall_time();
      GPU_matrix_shared_loops(gpu_matrix);
      elapsed += (wall_time() - start);
    }

  // copy the result back to the host and release the device copy
  #pragma omp target exit data map(from: gpu_matrix[0:SIZE])
  check(matrix, gpu_matrix, "\t shared memory implementation with loops --->");
  printf("\n\t GPU time  %lg [s]\n", (elapsed / LOOP));
  //////////////////////////////////////////////////////////////////////////////////////////

  ////////////////////////// GPU MATRIX shared memory //////////////////////////////////////
  memcpy(gpu_matrix, A, (SIZE * sizeof(MyData)));
  /* gpu_matrix is allocated on the GPU */
  #pragma omp target enter data map(alloc: gpu_matrix[0:SIZE])

  elapsed = 0.0;
  for (unsigned short int loop=0 ; loop<LOOP ; loop++)
    {
      // restore data into gpu_matrix
      #pragma omp target update to(gpu_matrix[0:SIZE])

      double start = wall_time();
      GPU_matrix_shared_no_loops(gpu_matrix);
      elapsed += (wall_time() - start);
    }

  // copy the result back to the host and release the device copy
  #pragma omp target exit data map(from: gpu_matrix[0:SIZE])
  check(matrix, gpu_matrix, "\t shared memory implementation without loops --->");
  printf("\n\t GPU time  %lg [s]\n", (elapsed / LOOP));
  //////////////////////////////////////////////////////////////////////////////////////////

  // free CPU memory
  free(gpu_matrix);
  free(matrix);
  free(A);

  return EXIT_SUCCESS;
}
