////////////////////////////////////////////////////////////////////////////////////////////////////
//
// Passing OpenMP-mapped device data to cuBLAS.
//
// Author: David Goz
// mail  : david.goz@inaf.it
// date  : 02.09.2024
// code tested using nvhpc
//
// - Compile the code:
//   $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v
//         hybrid_cuda_omp.c -o hybrid_cuda_omp -lm -lcudart -lcublas
// - Run the code:
//   $ export OMP_TARGET_OFFLOAD=mandatory
//   $ ./hybrid_cuda_omp
////////////////////////////////////////////////////////////////////////////////////////////////////

#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <assert.h>
#include <float.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#define N     512
#define SIZE  ((N) * (N))
#define ALPHA 1.0
#define BETA  0.0

typedef double MyData;

// Fill the three host matrices with their initial values.
//
// A : every element set to 1.0
// B : every element set to 2.0
// C : every element set to 0.0
//
// Each pointer is indexed over SIZE (= N * N) contiguous elements, so the
// matrices are walked as flat arrays in a single pass.
void InitHost(MyData *const restrict A,
              MyData *const restrict B,
              MyData *const restrict C)
{
  // (a host-side "#pragma omp parallel for" could be placed here)
  for (int idx = 0; idx < SIZE; idx++)
    {
      A[idx] = 1.0;
      B[idx] = 2.0;
      C[idx] = 0.0;
    }
}

// Fill the three matrices inside an OpenMP target region.
//
// A : every element set to 1.0
// B : every element set to 2.0
// C : every element set to 0.0
//
// No map clause is given on the target construct, so the arrays are
// presumably expected to have been mapped by the caller beforehand
// (e.g. with "target enter data") -- verify against the call site.
void InitDev(MyData *const restrict A,
             MyData *const restrict B,
             MyData *const restrict C)
{
  // The two loops are kept perfectly nested, as required by collapse(2)
  #pragma omp target teams loop collapse(2)
  for (int row = 0; row < N; row++)
    for (int col = 0; col < N; col++)
      {
        const int offset = (row * N) + col;

        A[offset] = 1.0;
        B[offset] = 2.0;
        C[offset] = 0.0;
      }
}

// Reference matrix-matrix product evaluated on the host:
//
//   C = alpha * A * B + beta * C
//
// A, B and C are N x N matrices indexed as [row * N + column].
// C is read (scaled by beta) and then overwritten, so it must hold valid
// values on entry.
// The plain triple loop is intentional: this is the naive reference version.
void HostDgemm(MyData *const restrict A,
               MyData *const restrict B,
               MyData *const restrict C,
               const MyData           alpha,
               const MyData           beta)
{
  // (a host-side "#pragma omp parallel for" could be placed here)
  for (int row = 0; row < N; row++)
    {
      // Start of the current row in A and in C
      const MyData *const a_row = &A[row * N];
      MyData       *const c_row = &C[row * N];

      for (int col = 0; col < N; col++)
        {
          // Dot product between row 'row' of A and column 'col' of B
          MyData acc = 0.0;
          for (int k = 0; k < N; k++)
            acc += a_row[k] * B[(k * N) + col];

          c_row[col] = (alpha * acc) + (beta * c_row[col]);
        }
    }
}

// Compare the host result against the device result, element by element,
// over SIZE entries and print the outcome on stdout.
//
// host_array : reference values
// dev_array  : values to be validated
//
// Two entries match when their absolute difference does not exceed
// FLT_EPSILON. The test is written as a negated "<=" on purpose: every
// ordered comparison involving NaN is false, so a NaN difference (a NaN in
// either array) is reported as a mismatch instead of silently passing.
void check(MyData *const restrict host_array,
           MyData *const restrict dev_array)
{
  int flag = 0;

  for (size_t i=0 ; i<SIZE ; i++)
    {
      const MyData diff = fabs(host_array[i] - dev_array[i]);

      if (!(diff <= FLT_EPSILON))
        {
          // One mismatch is enough to reject the result
          flag = 1;
          break;
        }
    }

  if (!flag)
    printf("\n\t Result OK");
  else
    printf("\n\t Result wrong");

  return;
}

int main()
{
  // Host allocation
  MyData *buffer = (MyData *)malloc(4 * SIZE * sizeof(MyData));
  assert(buffer != NULL);
  MyData *const restrict A  = buffer;
  MyData *const restrict B  = A + SIZE;
  MyData *const restrict C  = B + SIZE;
  MyData *const restrict CC = C + SIZE;

  // Spawning 2 host threads
  #pragma omp parallel num_threads(2)
  {
    // Evaluate the Dgemm on the host
    #pragma omp single nowait
    {
      InitHost(A, B, CC);
      HostDgemm(A, B, CC, ALPHA, BETA);
    } // omp single

    #pragma omp single nowait
    {
      // Initialize cuBLAS library
      cublasHandle_t handle;
      cublasCreate(&handle);
      
      // Allocate A, B, C on the device
      #pragma omp target enter data map(alloc: A[0:SIZE], B[0:SIZE], C[0:SIZE])

      // Init device with blocking omp target directive
      InitDev(A, B, C);

      // Define a target data region where A, B, and C pointers
      // refer to device's address space
      #pragma omp target data use_device_addr(A, B, C)
      {
	MyData const alpha = ALPHA;
	MyData const beta  = BETA;
	
	cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N,
		    &alpha, A, N, B, N, &beta, C, N);

	// CUDA synchronization point
	cudaDeviceSynchronize();
      }

      // Fetch data from the device and deallocate
      #pragma omp target exit data map(from: C[0:SIZE]) map(delete: A[0:SIZE], B[0:SIZE])
      
      cublasDestroy(handle);
    } // omp single
  } // synchronization point

  check(CC, C);
  
  free(buffer);
  
  return 0;
}
