//////////////////////////////////////////////////////////////////////////////////////////////////
//
// OpenACC GPU Offload is available only on systems with NVIDIA GPUs with compute capability '>= cc70'
//
// Assignment : write an OPENACC-GPU code corresponding to the
// following sequential C code
//
// #include <stdio.h>
// #define N 100
// int main()
// {
//   for (int i=0 ; i<N ; i++)
//     printf("%d\n", (i * i));

//   return 0;
// }
//////////////////////////////////////////////////////////////////////////////////////////////////

//////////////////////////////////////////////////////////////////////////////////////////////////
// Author: David Goz
// mail  : david.goz@inaf.it
// date  : 06.07.2024
// code tested using nvhpc
//
// - Compile the code to run on the GPU:
//   $ nvc -acc -gpu=ccnative,debug,lineinfo -Minfo=all -v classwork_1.c -o classwork_1_acc
// - Run the code:
//   $ ./classwork_1_acc
// - Check the result:
//   $ ./classwork_1_acc | tail -n 100 | sort -nk 5

//////////////////////////////////////////////////////////////////////////////////////////////////

#include <stdio.h>
#include <openacc.h>

/* Number of loop iterations: main() passes it as the 'size' argument of both kernels. */
#define N        100
/* NOTE(review): not referenced anywhere in the visible code - presumably the
   intended number of threads per block; confirm before relying on it. */
#define NThreads 1024

/* While defined, the diagnostic sections guarded by '#if !defined(NDEBUG)'
   inside the kernels are compiled out. */
#define NDEBUG

/*
 * GPUkernelSerial - print the squares of 0 .. size-1 from one OpenACC lane.
 *
 * size : number of iterations; nothing is printed when size <= 0.
 *
 * The work is wrapped in an OpenACC 'serial' construct, i.e. a compute region
 * that runs with one gang, one worker and a vector length of one. 'seq' is a
 * clause of the 'loop' directive (it is not accepted on 'parallel'), so the
 * sequential request is expressed on the loop itself.
 *
 * Only the OpenACC runtime API is used: the file is built with '-acc' and
 * does not include <omp.h>, so no omp_* routine is available here.
 */
void GPUkernelSerial(const int size)
{
#pragma acc serial
  {
    /* acc_on_device() only exists in an OpenACC build, hence the extra guard */
#if !defined(NDEBUG) && defined(_OPENACC)

    if (acc_on_device(acc_device_not_host))
      printf("\n\t GPU is executing GPUkernelSerial\n");
    else
      printf("\n\t CPU is executing GPUkernelSerial\n");

#endif /* !NDEBUG && _OPENACC */

    /* a serial region has exactly one execution lane, so its index is 0 */
    const int whoAmI = 0;

#pragma acc loop seq
    for (int i = 0; i < size; i++)
      printf("Hello from OPENACC-GPU thread: %d - result %d\n", whoAmI, (i * i));
  } /* acc serial region - the host waits here for completion */
}

/*
 * GPUkernelParallel - print the squares of 0 .. size-1 with the iterations
 * distributed across OpenACC gangs and vector lanes.
 *
 * size : number of iterations; nothing is printed when size <= 0.
 *
 * The loop is offloaded with 'acc parallel loop' so that it is actually
 * handled by an OpenACC build ('-acc'); OpenMP target directives are not
 * enabled by that flag and the omp_* runtime is not declared in this file.
 * The order of the output lines is unspecified because iterations run
 * concurrently.
 */
void GPUkernelParallel(const int size)
{
  /* acc_on_device() only exists in an OpenACC build, hence the extra guard */
#if !defined(NDEBUG) && defined(_OPENACC)

  /* Separate serial region: statements placed directly inside a 'parallel'
     region but outside a loop are executed redundantly by every gang, which
     would repeat the message. */
#pragma acc serial
  {
    if (acc_on_device(acc_device_not_host))
      printf("\n\t GPU is executing GPUkernelParallel\n");
    else
      printf("\n\t CPU is executing GPUkernelParallel\n");
  }

#endif /* !NDEBUG && _OPENACC */

#pragma acc parallel loop gang vector
  for (int i = 0; i < size; i++)
    {
      /* OpenACC offers no portable runtime call returning the gang/vector
         index, so the iteration index is reported as the identifier.
         NOTE(review): whether it equals the hardware thread index depends on
         the compiler's loop mapping - confirm if that matters. */
      const int whoAmI = i;

      printf("Hello from OPENACC-GPU thread: %d - result %d\n", whoAmI, (i * i));
    } /* acc parallel region - the host waits here for completion */
}

/*
 * Program entry point: announces and runs the serial kernel, then announces
 * and runs the parallel kernel, each over N iterations.
 */
int main()
{
  /* kernel launch: serial variant */
  fputs("\n\t The host issues the kernel on the GPU in serial\n", stdout);
  GPUkernelSerial(N);

  /* kernel launch: parallel variant */
  fputs("\n\t The host issues the kernel on the GPU in parallel\n", stdout);
  GPUkernelParallel(N);

  return 0;
}
