hybrid_cublas_omp.c

////////////////////////////////////////////////////////////////////////////////////////////////////
//
// Using CUDA data in OpenMP
//
// Author: David Goz
// mail  : david.goz@inaf.it
// date  : 03.09.2024
// code tested using nvhpc
//
// - Compile the code:
//      - using nvc
//          $ nvc -O3 -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v
//                 hybrid_cuda_omp.c -o hybrid_cuda_omp -lm -lcudart -lcublas
//      - using clang
//          $ clang -O3 -v -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda 
//                 hybrid_cuda_omp.c -o hybrid_cuda_omp -lm -lcudart -lcublas
//
// - Run the code:
//   $ export OMP_TARGET_OFFLOAD=mandatory
//   $ ./hybrid_cuda_omp
////////////////////////////////////////////////////////////////////////////////////////////////////

#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <assert.h>
#include <float.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <omp.h>

#define N       2048
#define SIZE    ((N) * (N))
#define ALPHA   1.0
#define BETA    0.0
#define HOST    0
#define DEV     1
#define LOOP    10
#define INIT    0
#define KERNEL  1
#define DATA    2

typedef double MyData;

static double _time[2][3];
static int thr[2];

double process_time()
{
  struct timespec ts;
  clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
  const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9;

  return ret;
}

double thread_time()
{
  struct timespec ts;
  clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
  const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9;

  return ret;
}

void InitHost(MyData *const restrict A,
	      MyData *const restrict B,
	      MyData *const restrict C,
	      int    *const restrict thr)
{
  double start;

  #pragma omp parallel 
  {
    #pragma omp barrier
    #pragma omp master
    {
      *thr = omp_get_num_threads();
      start = thread_time();
    }
   #pragma omp barrier

    #pragma omp for collapse(2)
    for (int i=0 ; i<N ; i++)
      for (int j=0 ; j<N ; j++)
	{
	  A[(i * N) + j] = 1.0;
	  B[(i * N) + j] = 2.0;
	  C[(i * N) + j] = 0.0;
	}

    #pragma omp master
    {
      _time[HOST][INIT] += (thread_time() - start); 
    }
  } // omp parallel

  return;
}

void InitDev(MyData *const restrict A,
	     MyData *const restrict B,
	     MyData *const restrict C)
{
  const double start = thread_time();

 #pragma omp target teams loop collapse(2)
  for (int i=0 ; i<N ; i++)
    for (int j=0 ; j<N ; j++)
      {
	A[(i * N) + j] = 1.0;
	B[(i * N) + j] = 2.0;
	C[(i * N) + j] = 0.0;
      }

  _time[DEV][INIT] += (thread_time() - start);

  return;
}

void HostDgemm(MyData *const restrict A,
	       MyData *const restrict B,
	       MyData *const restrict C,
	       const MyData           alpha,
	       const MyData           beta,
	       int    *const restrict thr)
{
  // C = alpha * A * B + beta * C;

  double start;

  // naive calculation
 #pragma omp parallel
  {
    #pragma omp barrier
    #pragma omp master
    {
      *thr = omp_get_num_threads();
      start = thread_time();
    }
    #pragma omp barrier

    #pragma omp for collapse(2)
    for (int i=0 ; i<N ; i++)
      for (int j=0 ; j<N ; j++)
	{
	  MyData sum = 0.0;
	  for (int k=0 ; k<N ; k++)
	    sum += A[(i * N) + k] * B[(k * N) + j];
  
	  C[(i * N) + j] = (alpha * sum) + (beta * C[(i * N) + j]);
	}

    #pragma omp master
    {
      _time[HOST][KERNEL] += (thread_time() - start);
    }
  } // omp parallel

  return;
}

void check(MyData *const restrict host_array,
	   MyData *const restrict dev_array)
{
  int flag = 0;
  for (size_t i=0 ; i<SIZE ; i++)
    flag = ((fabs(host_array[i] - dev_array[i]) > FLT_EPSILON) ? 1 : flag);

  if (!flag)
    printf("\n\t Result OK \n");
  else
    printf("\n\t Result wrong \n");
  
  return;
}

int main()
{
  // Host allocation
  MyData *buffer = (MyData *)malloc(4 * SIZE * sizeof(MyData));
  assert(buffer != NULL);
  MyData *const restrict A     = buffer;
  MyData *const restrict B     = A + SIZE;
  MyData *const restrict C_CPU = B + SIZE;
  MyData *const restrict C_GPU = C_CPU + SIZE;

  // Spawning 2 host threads
  #pragma omp parallel num_threads(2)
  {
    // Evaluate the Dgemm on the host
    #pragma omp single nowait
    {
      // allowing nested parallelism
      omp_set_max_active_levels(2);
      
      for (int loop=0 ; loop<LOOP ; loop++)
	{
	  InitHost(A, B, C_CPU, &thr[0]);
	  HostDgemm(A, B, C_CPU, ALPHA, BETA, &thr[1]);
	}
    } // omp single

    #pragma omp single nowait
    {
      // Initialize cuBLAS library
      cublasHandle_t handle;
      cublasCreate(&handle);
      // Allocate A, B, C on the device using CUDA API
      MyData *d_buffer = NULL;
      cudaMalloc((void **)&d_buffer, (3 * SIZE * sizeof(MyData)));
      assert(d_buffer != NULL);
      MyData *const restrict d_A = d_buffer;
      MyData *const restrict d_B = d_A + SIZE;
      MyData *const restrict d_C = d_B + SIZE;

      // get the default device
      const int dev = omp_get_default_device();

      // Associate external device pointers
      omp_target_associate_ptr(A,     d_A, (SIZE * sizeof(MyData)), 0, dev);
      omp_target_associate_ptr(B,     d_B, (SIZE * sizeof(MyData)), 0, dev);
      omp_target_associate_ptr(C_GPU, d_C, (SIZE * sizeof(MyData)), 0, dev);
      
      for (int loop=0 ; loop<LOOP ; loop++)
	{
	  // Init device with blocking omp target directive
	  InitDev(A, B, C_GPU);

	  // Apply DGEMM using device pointers directly
	  const MyData alpha = ALPHA;
	  const MyData beta  = BETA;

	  double start = thread_time();
	  cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N,
		      &alpha, 
		      d_A, N, 
		      d_B, N, 
		      &beta, 
		      d_C, N);

	  // CUDA synchronization point
	  cudaDeviceSynchronize();
	  _time[DEV][KERNEL] += (thread_time() - start);

	  // Fetch data from the device and deallocate
	  start = thread_time();
	  cudaMemcpy(C_GPU, d_C, (SIZE * sizeof(MyData)), cudaMemcpyDeviceToHost);
	  _time[DEV][DATA] += (thread_time() - start);
	} // LOOP

      // release pointer association
      omp_target_disassociate_ptr(A, dev);
      omp_target_disassociate_ptr(B, dev);
      omp_target_disassociate_ptr(C_GPU, dev);
      
      // deallocate device's memory
      cudaFree(d_buffer);

      cublasDestroy(handle);
    } // omp single
  } // synchronization point

  check(C_CPU, C_GPU);

  free(buffer);

  printf("\n\t Matrix size: %d x %d\n", N, N);

  printf("\n\t Host execution time:");
  printf("\n\t\t Init      : %lg [s] - threads: %d", _time[HOST][INIT]/LOOP, thr[0]);
  printf("\n\t\t Dgemm     : %lg [s] - threads: %d\n", _time[HOST][KERNEL]/LOOP, thr[1]);

  printf("\n\t Device execution time:");
  printf("\n\t\t Init      : %lg [s]", _time[DEV][INIT]/LOOP);
  printf("\n\t\t Dgemm     : %lg [s]", _time[DEV][KERNEL]/LOOP);
  printf("\n\t\t Fetch data: %lg [s]\n\n", _time[DEV][DATA]/LOOP);
  return 0;
}