Select Git revision
hybrid_cublas_omp.c
hybrid_cublas_omp.c 6.78 KiB
////////////////////////////////////////////////////////////////////////////////////////////////////
//
// Using CUDA data in OpenMP
//
// Author: David Goz
// mail : david.goz@inaf.it
// date : 03.09.2024
// code tested using nvhpc
//
// - Compile the code:
// - using nvc
// $ nvc -O3 -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v
// hybrid_cuda_omp.c -o hybrid_cuda_omp -lm -lcudart -lcublas
// - using clang
// $ clang -O3 -v -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda
// hybrid_cuda_omp.c -o hybrid_cuda_omp -lm -lcudart -lcublas
//
// - Run the code:
// $ export OMP_TARGET_OFFLOAD=mandatory
// $ ./hybrid_cuda_omp
////////////////////////////////////////////////////////////////////////////////////////////////////
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <assert.h>
#include <float.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <omp.h>
#define N 2048
#define SIZE ((N) * (N))
#define ALPHA 1.0
#define BETA 0.0
#define HOST 0
#define DEV 1
#define LOOP 10
#define INIT 0
#define KERNEL 1
#define DATA 2
typedef double MyData;
static double _time[2][3];
static int thr[2];
double process_time()
{
struct timespec ts;
clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9;
return ret;
}
double thread_time()
{
struct timespec ts;
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9;
return ret;
}
void InitHost(MyData *const restrict A,
MyData *const restrict B,
MyData *const restrict C,
int *const restrict thr)
{
double start;
#pragma omp parallel
{
#pragma omp barrier
#pragma omp master
{
*thr = omp_get_num_threads();
start = thread_time();
}
#pragma omp barrier
#pragma omp for collapse(2)
for (int i=0 ; i<N ; i++)
for (int j=0 ; j<N ; j++)
{
A[(i * N) + j] = 1.0;
B[(i * N) + j] = 2.0;
C[(i * N) + j] = 0.0;
}
#pragma omp master
{
_time[HOST][INIT] += (thread_time() - start);
}
} // omp parallel
return;
}
void InitDev(MyData *const restrict A,
MyData *const restrict B,
MyData *const restrict C)
{
const double start = thread_time();
#pragma omp target teams loop collapse(2)
for (int i=0 ; i<N ; i++)
for (int j=0 ; j<N ; j++)
{
A[(i * N) + j] = 1.0;
B[(i * N) + j] = 2.0;
C[(i * N) + j] = 0.0;
}
_time[DEV][INIT] += (thread_time() - start);
return;
}
void HostDgemm(MyData *const restrict A,
MyData *const restrict B,
MyData *const restrict C,
const MyData alpha,
const MyData beta,
int *const restrict thr)
{
// C = alpha * A * B + beta * C;
double start;
// naive calculation
#pragma omp parallel
{
#pragma omp barrier
#pragma omp master
{
*thr = omp_get_num_threads();
start = thread_time();
}
#pragma omp barrier
#pragma omp for collapse(2)
for (int i=0 ; i<N ; i++)
for (int j=0 ; j<N ; j++)
{
MyData sum = 0.0;
for (int k=0 ; k<N ; k++)
sum += A[(i * N) + k] * B[(k * N) + j];
C[(i * N) + j] = (alpha * sum) + (beta * C[(i * N) + j]);
}
#pragma omp master
{
_time[HOST][KERNEL] += (thread_time() - start);
}
} // omp parallel
return;
}
void check(MyData *const restrict host_array,
MyData *const restrict dev_array)
{
int flag = 0;
for (size_t i=0 ; i<SIZE ; i++)
flag = ((fabs(host_array[i] - dev_array[i]) > FLT_EPSILON) ? 1 : flag);
if (!flag)
printf("\n\t Result OK \n");
else
printf("\n\t Result wrong \n");
return;
}
int main()
{
// Host allocation
MyData *buffer = (MyData *)malloc(4 * SIZE * sizeof(MyData));
assert(buffer != NULL);
MyData *const restrict A = buffer;
MyData *const restrict B = A + SIZE;
MyData *const restrict C_CPU = B + SIZE;
MyData *const restrict C_GPU = C_CPU + SIZE;
// Spawning 2 host threads
#pragma omp parallel num_threads(2)
{
// Evaluate the Dgemm on the host
#pragma omp single nowait
{
// allowing nested parallelism
omp_set_max_active_levels(2);
for (int loop=0 ; loop<LOOP ; loop++)
{
InitHost(A, B, C_CPU, &thr[0]);
HostDgemm(A, B, C_CPU, ALPHA, BETA, &thr[1]);
}
} // omp single
#pragma omp single nowait
{
// Initialize cuBLAS library
cublasHandle_t handle;
cublasCreate(&handle);
// Allocate A, B, C on the device using CUDA API
MyData *d_buffer = NULL;
cudaMalloc((void **)&d_buffer, (3 * SIZE * sizeof(MyData)));
assert(d_buffer != NULL);
MyData *const restrict d_A = d_buffer;
MyData *const restrict d_B = d_A + SIZE;
MyData *const restrict d_C = d_B + SIZE;
// get the default device
const int dev = omp_get_default_device();
// Associate external device pointers
omp_target_associate_ptr(A, d_A, (SIZE * sizeof(MyData)), 0, dev);
omp_target_associate_ptr(B, d_B, (SIZE * sizeof(MyData)), 0, dev);
omp_target_associate_ptr(C_GPU, d_C, (SIZE * sizeof(MyData)), 0, dev);
for (int loop=0 ; loop<LOOP ; loop++)
{
// Init device with blocking omp target directive
InitDev(A, B, C_GPU);
// Apply DGEMM using device pointers directly
const MyData alpha = ALPHA;
const MyData beta = BETA;
double start = thread_time();
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N,
&alpha,
d_A, N,
d_B, N,
&beta,
d_C, N);
// CUDA synchronization point
cudaDeviceSynchronize();
_time[DEV][KERNEL] += (thread_time() - start);
// Fetch data from the device and deallocate
start = thread_time();
cudaMemcpy(C_GPU, d_C, (SIZE * sizeof(MyData)), cudaMemcpyDeviceToHost);
_time[DEV][DATA] += (thread_time() - start);
} // LOOP
// release pointer association
omp_target_disassociate_ptr(A, dev);
omp_target_disassociate_ptr(B, dev);
omp_target_disassociate_ptr(C_GPU, dev);
// deallocate device's memory
cudaFree(d_buffer);
cublasDestroy(handle);
} // omp single
} // synchronization point
check(C_CPU, C_GPU);
free(buffer);
printf("\n\t Matrix size: %d x %d\n", N, N);
printf("\n\t Host execution time:");
printf("\n\t\t Init : %lg [s] - threads: %d", _time[HOST][INIT]/LOOP, thr[0]);
printf("\n\t\t Dgemm : %lg [s] - threads: %d\n", _time[HOST][KERNEL]/LOOP, thr[1]);
printf("\n\t Device execution time:");
printf("\n\t\t Init : %lg [s]", _time[DEV][INIT]/LOOP);
printf("\n\t\t Dgemm : %lg [s]", _time[DEV][KERNEL]/LOOP);
printf("\n\t\t Fetch data: %lg [s]\n\n", _time[DEV][DATA]/LOOP);
return 0;
}