Skip to content
Snippets Groups Projects
Select Git revision
  • ee2646840b127634f680f06aa111643fdfafe822
  • main default protected
  • hpc_school
  • energy
  • openacc
  • mpi_bugfix
6 results

hybrid_cublas_omp.c

Blame
  • hybrid_cublas_omp.c 6.78 KiB
    ////////////////////////////////////////////////////////////////////////////////////////////////////
    //
    // Using CUDA data in OpenMP
    //
    // Author: David Goz
    // mail  : david.goz@inaf.it
    // date  : 03.09.2024
    // code tested using nvhpc
    //
    // - Compile the code:
    //      - using nvc
    //          $ nvc -O3 -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v
    //                 hybrid_cuda_omp.c -o hybrid_cuda_omp -lm -lcudart -lcublas
    //      - using clang
    //          $ clang -O3 -v -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda 
    //                 hybrid_cuda_omp.c -o hybrid_cuda_omp -lm -lcudart -lcublas
    //
    // - Run the code:
    //   $ export OMP_TARGET_OFFLOAD=mandatory
    //   $ ./hybrid_cuda_omp
    ////////////////////////////////////////////////////////////////////////////////////////////////////
    
    #include <cuda_runtime.h>
    #include <cublas_v2.h>
    #include <assert.h>
    #include <float.h>
    #include <math.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>
    #include <omp.h>
    
    #define N       2048
    #define SIZE    ((N) * (N))
    #define ALPHA   1.0
    #define BETA    0.0
    #define HOST    0
    #define DEV     1
    #define LOOP    10
    #define INIT    0
    #define KERNEL  1
    #define DATA    2
    
    typedef double MyData;
    
    static double _time[2][3];
    static int thr[2];
    
    double process_time()
    {
      struct timespec ts;
      clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
      const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9;
    
      return ret;
    }
    
    double thread_time()
    {
      struct timespec ts;
      clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
      const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9;
    
      return ret;
    }
    
    void InitHost(MyData *const restrict A,
    	      MyData *const restrict B,
    	      MyData *const restrict C,
    	      int    *const restrict thr)
    {
      double start;
    
      #pragma omp parallel 
      {
        #pragma omp barrier
        #pragma omp master
        {
          *thr = omp_get_num_threads();
          start = thread_time();
        }
       #pragma omp barrier
    
        #pragma omp for collapse(2)
        for (int i=0 ; i<N ; i++)
          for (int j=0 ; j<N ; j++)
    	{
    	  A[(i * N) + j] = 1.0;
    	  B[(i * N) + j] = 2.0;
    	  C[(i * N) + j] = 0.0;
    	}
    
        #pragma omp master
        {
          _time[HOST][INIT] += (thread_time() - start); 
        }
      } // omp parallel
    
      return;
    }
    
    void InitDev(MyData *const restrict A,
    	     MyData *const restrict B,
    	     MyData *const restrict C)
    {
      const double start = thread_time();
    
     #pragma omp target teams loop collapse(2)
      for (int i=0 ; i<N ; i++)
        for (int j=0 ; j<N ; j++)
          {
    	A[(i * N) + j] = 1.0;
    	B[(i * N) + j] = 2.0;
    	C[(i * N) + j] = 0.0;
          }
    
      _time[DEV][INIT] += (thread_time() - start);
    
      return;
    }
    
    void HostDgemm(MyData *const restrict A,
    	       MyData *const restrict B,
    	       MyData *const restrict C,
    	       const MyData           alpha,
    	       const MyData           beta,
    	       int    *const restrict thr)
    {
      // C = alpha * A * B + beta * C;
    
      double start;
    
      // naive calculation
     #pragma omp parallel
      {
        #pragma omp barrier
        #pragma omp master
        {
          *thr = omp_get_num_threads();
          start = thread_time();
        }
        #pragma omp barrier
    
        #pragma omp for collapse(2)
        for (int i=0 ; i<N ; i++)
          for (int j=0 ; j<N ; j++)
    	{
    	  MyData sum = 0.0;
    	  for (int k=0 ; k<N ; k++)
    	    sum += A[(i * N) + k] * B[(k * N) + j];
      
    	  C[(i * N) + j] = (alpha * sum) + (beta * C[(i * N) + j]);
    	}
    
        #pragma omp master
        {
          _time[HOST][KERNEL] += (thread_time() - start);
        }
      } // omp parallel
    
      return;
    }
    
    void check(MyData *const restrict host_array,
    	   MyData *const restrict dev_array)
    {
      int flag = 0;
      for (size_t i=0 ; i<SIZE ; i++)
        flag = ((fabs(host_array[i] - dev_array[i]) > FLT_EPSILON) ? 1 : flag);
    
      if (!flag)
        printf("\n\t Result OK \n");
      else
        printf("\n\t Result wrong \n");
      
      return;
    }
    
    int main()
    {
      // Host allocation
      MyData *buffer = (MyData *)malloc(4 * SIZE * sizeof(MyData));
      assert(buffer != NULL);
      MyData *const restrict A     = buffer;
      MyData *const restrict B     = A + SIZE;
      MyData *const restrict C_CPU = B + SIZE;
      MyData *const restrict C_GPU = C_CPU + SIZE;
    
      // Spawning 2 host threads
      #pragma omp parallel num_threads(2)
      {
        // Evaluate the Dgemm on the host
        #pragma omp single nowait
        {
          // allowing nested parallelism
          omp_set_max_active_levels(2);
          
          for (int loop=0 ; loop<LOOP ; loop++)
    	{
    	  InitHost(A, B, C_CPU, &thr[0]);
    	  HostDgemm(A, B, C_CPU, ALPHA, BETA, &thr[1]);
    	}
        } // omp single
    
        #pragma omp single nowait
        {
          // Initialize cuBLAS library
          cublasHandle_t handle;
          cublasCreate(&handle);
    
          // Allocate A, B, C on the device using CUDA API
          MyData *d_buffer = NULL;
          cudaMalloc((void **)&d_buffer, (3 * SIZE * sizeof(MyData)));
          assert(d_buffer != NULL);
          MyData *const restrict d_A = d_buffer;
          MyData *const restrict d_B = d_A + SIZE;
          MyData *const restrict d_C = d_B + SIZE;
    
          // get the default device
          const int dev = omp_get_default_device();
    
          // Associate external device pointers
          omp_target_associate_ptr(A,     d_A, (SIZE * sizeof(MyData)), 0, dev);
          omp_target_associate_ptr(B,     d_B, (SIZE * sizeof(MyData)), 0, dev);
          omp_target_associate_ptr(C_GPU, d_C, (SIZE * sizeof(MyData)), 0, dev);
          
          for (int loop=0 ; loop<LOOP ; loop++)
    	{
    	  // Init device with blocking omp target directive
    	  InitDev(A, B, C_GPU);
    
    	  // Apply DGEMM using device pointers directly
    	  const MyData alpha = ALPHA;
    	  const MyData beta  = BETA;
    
    	  double start = thread_time();
    	  cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N,
    		      &alpha, 
    		      d_A, N, 
    		      d_B, N, 
    		      &beta, 
    		      d_C, N);
    
    	  // CUDA synchronization point
    	  cudaDeviceSynchronize();
    	  _time[DEV][KERNEL] += (thread_time() - start);
    
    	  // Fetch data from the device and deallocate
    	  start = thread_time();
    	  cudaMemcpy(C_GPU, d_C, (SIZE * sizeof(MyData)), cudaMemcpyDeviceToHost);
    	  _time[DEV][DATA] += (thread_time() - start);
    	} // LOOP
    
          // release pointer association
          omp_target_disassociate_ptr(A, dev);
          omp_target_disassociate_ptr(B, dev);
          omp_target_disassociate_ptr(C_GPU, dev);
          
          // deallocate device's memory
          cudaFree(d_buffer);
    
          cublasDestroy(handle);
        } // omp single
      } // synchronization point
    
      check(C_CPU, C_GPU);
    
      free(buffer);
    
      printf("\n\t Matrix size: %d x %d\n", N, N);
    
      printf("\n\t Host execution time:");
      printf("\n\t\t Init      : %lg [s] - threads: %d", _time[HOST][INIT]/LOOP, thr[0]);
      printf("\n\t\t Dgemm     : %lg [s] - threads: %d\n", _time[HOST][KERNEL]/LOOP, thr[1]);
    
      printf("\n\t Device execution time:");
      printf("\n\t\t Init      : %lg [s]", _time[DEV][INIT]/LOOP);
      printf("\n\t\t Dgemm     : %lg [s]", _time[DEV][KERNEL]/LOOP);
      printf("\n\t\t Fetch data: %lg [s]\n\n", _time[DEV][DATA]/LOOP);
      
      return 0;
    }