////////////////////////////////////////////////////////////////////////////////////////////////////
//
// Splitting the asynchronous vector addition task graph across four devices
//
// Author: David Goz
// mail  : david.goz@inaf.it
// date  : 28.08.2024
// code tested using nvhpc
//
// - Compile the code:
//   $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v asynchronous.c -o asynchronous_omp
// - Run the code:
//   $ export OMP_TARGET_OFFLOAD=mandatory
//   $ ./asynchronous_omp
////////////////////////////////////////////////////////////////////////////////////////////////////


#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <assert.h>

typedef int MyData;

#define NDEBUG

// Verify the result array and report the outcome on stdout.
//
// C    : array to inspect (read only)
// size : number of elements in C
//
// Prints "Result OK" when every element equals 98 (this also covers
// size == 0) and "Result wrong" as soon as one element differs.
void check(const MyData *const C,
	   const size_t        size)
{
  const MyData expected = 98;
  size_t idx = 0;

  // Skip the leading run of correct elements: the scan stops either at
  // the first mismatch or at the end of the array.
  while ((idx < size) && (C[idx] == expected))
    idx++;

  // Reaching the end means that no mismatch was found.
  if (idx == size)
    printf("\n\t Result OK \n");
  else
    printf("\n\t Result wrong \n");

  return;
}

// Build an asynchronous task graph spread over four offload devices:
//
//   device 0: A[i] =  i  ---+
//   device 1: B[i] = -i  ---+--> device 3: C[i] += A[i] + B[i] - 1 --> host: check(C)
//   device 2: C[i] = 99  ---+
//
// Every kernel is a deferred target task (nowait); the ordering is enforced
// only by the depend clauses on the array sections A, B and C.
//
// Returns 0 on completion; exits with EXIT_FAILURE when fewer than four
// devices are available or when the host allocation fails.
int main(void)
{
  // The kernels below are pinned to devices 0..3, hence at least four
  // offload devices are required (additional devices are simply unused).
  const int NumDev = omp_get_num_devices();

  if (NumDev < 4)
    {
      printf("\n\t The program runs using 4 GPUs... aborting...\n");
      exit(EXIT_FAILURE);
    }

  const int size = 1000000;

  // One host allocation holding the three vectors back to back: A | B | C.
  // The product is evaluated in size_t to avoid int overflow.
  MyData *buffer = malloc(3 * (size_t)size * sizeof(*buffer));
  if (buffer == NULL)
    {
      // explicit check: it must not vanish when assertions are disabled
      printf("\n\t Cannot allocate host memory... aborting...\n");
      exit(EXIT_FAILURE);
    }
  MyData *const restrict A = buffer;
  MyData *const restrict B = A + size;
  MyData *const restrict C = B + size;

  // init A on GPU 0
  // map(from:) because the kernel overwrites every element: there is no
  // point in copying the uninitialised host data to the device.
  #pragma omp target nowait                 \
                     map(from: A[0:size])   \
                     depend(out: A[0:size]) \
                     device(0)
  {
    #pragma omp loop
    for (int i=0 ; i<size ; i++)
      A[i] = (MyData)i;
  } // device 0
  
  // init B on GPU 1
  #pragma omp target nowait                 \
                     map(from: B[0:size])   \
                     depend(out: B[0:size]) \
                     device(1)
  {
    #pragma omp loop
    for (int i=0 ; i<size ; i++)
      B[i] = (MyData)-i;
  } // device 1

  // init C on GPU 2
  #pragma omp target nowait                 \
                     map(from: C[0:size])   \
                     depend(out: C[0:size]) \
                     device(2)
  {
    #pragma omp loop
    for (int i=0 ; i<size ; i++)
      C[i] = 99;
  } // device 2

  // perform the calculation on GPU 3
  // The in/inout dependences make this task wait for the three init tasks.
  // Result: C[i] = 99 + (i - i - 1) = 98, which is the value check() expects.
  #pragma omp target nowait                                                    \
                     map(to: A[0:size], B[0:size]) map(tofrom: C[0:size])      \
                     depend(in: A[0:size], B[0:size]) depend(inout: C[0:size]) \
                     device(3)
  {
    #pragma omp loop
    for (int i=0 ; i<size ; i++)
      C[i] += (A[i] + B[i] - 1);
  } // device 3

  // host task: runs after the device 3 task has produced C
  #pragma omp task depend(in: C[0:size])
  check(C, size);

  // All the tasks above may be deferred: wait for their completion before
  // releasing the host memory they read and write.
  #pragma omp taskwait
  
  free(buffer);
  
  return 0;
}
