////////////////////////////////////////////////////////////////////////////////////////////////////
// 
//
// Author: David Goz
// mail  : david.goz@inaf.it
// date  : 28.08.2024
// code tested using nvhpc
//
// The code does not compile with nvc
// The code compiles with clang

////////////////////////////////////////////////////////////////////////////////////////////////////


#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <assert.h>

// element type of the vectors summed in main()
typedef int MyData;
// number of vector elements handled by each device; main() also uses it as
// the exclusive upper bound of the random values stored in the vectors
#define N_PER_DEV   1000000

// NOTE(review): NDEBUG is defined after <assert.h> has already been included,
// so it has no effect on assert() in this file (the macro is configured by the
// state of NDEBUG at the point of inclusion)
#define NDEBUG

// Multi-GPU sum reduction using OpenMP target offload.
//
// Two host vectors A and B of (NumDev * N_PER_DEV) elements are filled with
// pseudo-random values in [0, N_PER_DEV). The sum of all the elements is
// computed on the host (sum_cpu) and on the devices (sum_gpu): one host
// thread per device offloads its own N_PER_DEV-sized slice and the partial
// sums are combined by means of a task reduction. The two results are then
// compared.
//
// Returns 0 after printing the outcome of the comparison, EXIT_FAILURE when
// no device is available or the host buffer cannot be allocated. The process
// is terminated with EXIT_FAILURE when the runtime does not provide one host
// thread per device.
int main()
{
  // get the number of the available devices
  const int NumDev = omp_get_num_devices();
  if (NumDev <= 0)
    {
      // explicit check: it must not depend on assert() being enabled
      fprintf(stderr, "\n\t No target device available \n");
      return EXIT_FAILURE;
    }

  // global vector size
  const int size = (NumDev * N_PER_DEV);

  // the byte count is evaluated in size_t, so (2 * size) cannot overflow int
  MyData *buffer = malloc((size_t)2 * (size_t)size * sizeof(*buffer));
  if (buffer == NULL)
    {
      fprintf(stderr, "\n\t Cannot allocate the host buffer \n");
      return EXIT_FAILURE;
    }

  // both restrict pointers are derived from the unqualified 'buffer':
  // initializing one restrict pointer from another restrict pointer declared
  // in the same block is undefined behaviour
  MyData *const restrict A = buffer;
  MyData *const restrict B = buffer + size;

  // rand() keeps a hidden global state and is not required to be thread-safe,
  // hence the vectors are filled by a single thread
  for (int i=0 ; i<size ; i++)
    {
      A[i] = rand() % N_PER_DEV;
      B[i] = rand() % N_PER_DEV;
    }

  // the accumulators are wider than MyData: the total is of the order of
  // (size * N_PER_DEV) and does not fit in an int.
  // NOTE: this assumes MyData is an integer type
  long long sum_cpu = 0;

 #pragma omp parallel for simd reduction(+: sum_cpu)
  for (int i=0 ; i<size ; i++)
    sum_cpu += (A[i] + B[i]);

  long long sum_gpu = 0;

  // one host thread per device; the 'task' modifier lets the target tasks
  // generated below take part in the reduction through in_reduction
#pragma omp parallel num_threads(NumDev) reduction(task, +:sum_gpu)
  {
   #pragma omp single
    {
      if (NumDev != omp_get_num_threads())
        {
          fprintf(stderr, "\n\t Cannot spawn one thread per device (%d requested, %d provided) \n",
                  NumDev, omp_get_num_threads());
          exit(EXIT_FAILURE);
        }
      else
        {
          printf("\n\t Using %d GPUs \n", NumDev);
          fflush(stdout);
        }
    } // implicit barrier

    // thread 'tid' drives device 'tid' and owns the slice [offset, offset + N_PER_DEV)
    const int tid    = omp_get_thread_num();
    const int offset = (tid * N_PER_DEV);

    // only the slice owned by the thread is copied to its device
   #pragma omp target                                             \
               map(to: A[offset:N_PER_DEV], B[offset:N_PER_DEV]) \
               device(tid)                                       \
               in_reduction(+: sum_gpu)
   #pragma omp loop reduction(+: sum_gpu)
    for (int i=offset ; i<(offset + N_PER_DEV) ; i++)
      sum_gpu += (A[i] + B[i]);
  } // omp parallel

  if (sum_cpu == sum_gpu)
    printf("\n\t Result OK \n");
  else
    printf("\n\t Result wrong \n");

  free(buffer);

  return 0;
}
