//////////////////////////////////////////////////////////////////////////////////////////////////
//
// OpenMP GPU Offload is available only on systems with NVIDIA GPUs with compute capability '>= cc70'
//
// Assignment : write an OMP-GPU code corresponding to the
// following sequential C code
//
// #include <stdio.h>
// #define N 100
// int main()
// {
//   int A[N];
//
//   for (int i=0 ; i<N ; i++)
//     A[i] = (i * i);
//
//   return 0;
// }
//////////////////////////////////////////////////////////////////////////////////////////////////

//////////////////////////////////////////////////////////////////////////////////////////////////
// Author: David Goz
// mail  : david.goz@inaf.it
// date  : 06.07.2024
// code tested using nvhpc
//
// - Compile the code:
//   $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v classwork_2.c -o classwork_2_omp
// - Run the code:
//   $ ./classwork_2_omp

//////////////////////////////////////////////////////////////////////////////////////////////////

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

#include <openacc.h>
#if defined(_OPENMP)
#include <omp.h>   /* declares the omp_get_*() and omp_is_initial_device() runtime routines */
#endif

/* Number of elements of the host array A allocated in main(). */
#define N        100
/* Compiles out the diagnostic printf block inside GPUkernel, which is guarded
 * by a !defined(NDEBUG) test. Because this definition appears after
 * <assert.h> has already been included, it does not disable assert() in this
 * file: the assert macro is fixed by the state of NDEBUG at the point of
 * inclusion. */
#define NDEBUG

/*
 * GPUkernel - store i * i into A[i] for every i in [0, size) from within an
 *             OpenMP target region.
 *
 * A    : host buffer with room for at least `size` ints. It is mapped with
 *        map(from: ...), so its previous contents are not transferred to the
 *        device and the computed values are copied back to the host when the
 *        target data region ends.
 * size : number of elements to compute. Nothing is done when size <= 0 or
 *        when A is NULL.
 *
 * The loop bound is `size` (the same extent that is mapped), so the routine
 * never touches elements outside A[0:size].
 * NOTE(review): i * i is evaluated in int arithmetic; assuming a 32-bit int
 * this overflows for size > 46341 - confirm the intended range with callers.
 */
void GPUkernel(      int *A,
               const int  size)
{
  /* nothing to map or compute for a missing or empty buffer */
  if ((A == NULL) || (size <= 0))
    return;

  /* map A to the address space of the accelerator */
#pragma omp target data map(from: A[0:size])
  {
    /* kernel to be executed on the accelerator */
    #pragma omp target
    {
      /* create a bunch of teams (CUDA blocks) */
      #pragma omp teams
      {
        /* distribute the teams over index iterations and */
        /* spawn a bunch of threads within each team */
        #pragma omp distribute parallel for
        for (int i=0 ; i<size ; i++)
          {
            /* the diagnostics rely on the OpenMP runtime routines, so they */
            /* are compiled only when OpenMP is enabled and NDEBUG is unset */
#if defined(_OPENMP) && !defined(NDEBUG)

            const int team  = omp_get_team_num();
            const int nteam = omp_get_num_teams();
            const int tid   = omp_get_thread_num();
            const int nthr  = omp_get_num_threads();

            /* global thread index across all the teams */
            const int whoAmI = tid + (team * nthr);

            if (!omp_is_initial_device())
              {
                /* a single thread reports the launch configuration */
                if (whoAmI == 0)
                  {
                    printf("\n\t GPU is executing GPUkernel\n" );
                    printf("\n\t team : %d",   team);
                    printf("\n\t nteam: %d",   nteam);
                    printf("\n\t tid  : %d",   tid);
                    printf("\n\t nthr : %d\n", nthr);
                  }
              }
            else
              {
                printf("\n\t CPU is executing GPUkernel\n" );
              }

#endif /* _OPENMP && !NDEBUG */

            A[i] = (i * i);
          } /* omp distribute parallel for */
      } /* omp teams */
    } /* omp target */
  } /* omp target data */

  return;
}

/*
 * Allocate a host array of N ints, let GPUkernel fill it, print every
 * element and release the memory.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE when the host array cannot be
 * allocated.
 */
int main(void)
{
  /* host array */
  int *A = malloc(N * sizeof(*A));
  if (A == NULL)
    {
      /* explicit check: unlike assert(), it cannot be compiled out */
      fprintf(stderr, "\n\t Cannot allocate the host array of %d integers\n", N);
      return EXIT_FAILURE;
    }

  // kernel launch
  GPUkernel(A, N);

  // check the result
  printf("\n");
  for (size_t i=0 ; i<N ; i++)
    printf("\t A[%zu] = %d", i, A[i]); /* %zu matches the size_t index */
  printf("\n\n");

  // free host memory
  free(A);

  return EXIT_SUCCESS;
}
