diff --git a/cuda-omp/hybrid/hybrid_cuda_omp.c b/cuda-omp/hybrid/hybrid_cuda_omp.c
new file mode 100644
index 0000000000000000000000000000000000000000..823be5dba91c9d5d13dc7dc146a609ac862f2497
--- /dev/null
+++ b/cuda-omp/hybrid/hybrid_cuda_omp.c
@@ -0,0 +1,159 @@
+////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Passing OpenMP data to cuBLAS.
+//
+// Author: David Goz
+// mail  : david.goz@inaf.it
+// date  : 02.09.2024
+// code tested using nvhpc
+//
+// - Compile the code:
+//   $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v
+//     hybrid_cuda_omp.c -o hybrid_cuda_omp -lm -lcudart -lcublas
+// - Run the code:
+//   $ export OMP_TARGET_OFFLOAD=mandatory
+//   $ ./hybrid_cuda_omp
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <float.h>
+#include <assert.h>
+#include <cublas_v2.h>
+#include <cuda_runtime.h>
+
+#define N     512
+#define SIZE  ((N) * (N))
+#define ALPHA 1.0
+#define BETA  0.0
+
+typedef double MyData;
+
+void InitHost(MyData *const restrict A,
+              MyData *const restrict B,
+              MyData *const restrict C)
+{
+  //#pragma omp parallel for collapse(2)
+  for (int i=0 ; i<N ; i++)
+    for (int j=0 ; j<N ; j++)
+      {
+        // deterministic values, so that InitDev can reproduce them on the device
+        A[(i * N) + j] = (MyData)(i + j) / (MyData)N;
+        B[(i * N) + j] = (MyData)(i - j) / (MyData)N;
+        C[(i * N) + j] = (MyData)0;
+      }
+
+  return;
+}
+
+void InitDev(MyData *const restrict A,
+             MyData *const restrict B,
+             MyData *const restrict C)
+{
+  // same initialization as InitHost, performed in the device's address space
+  #pragma omp target teams distribute parallel for collapse(2)
+  for (int i=0 ; i<N ; i++)
+    for (int j=0 ; j<N ; j++)
+      {
+        A[(i * N) + j] = (MyData)(i + j) / (MyData)N;
+        B[(i * N) + j] = (MyData)(i - j) / (MyData)N;
+        C[(i * N) + j] = (MyData)0;
+      }
+
+  return;
+}
+
+void HostDgemm(const MyData *const restrict A,
+               const MyData *const restrict B,
+                     MyData *const restrict C,
+               const MyData alpha,
+               const MyData beta)
+{
+  // column-major Dgemm, i.e. the same convention used by cublasDgemm,
+  // so that host and device results can be compared element-wise
+  for (int j=0 ; j<N ; j++)
+    for (int i=0 ; i<N ; i++)
+      {
+        MyData sum = (MyData)0;
+        for (int k=0 ; k<N ; k++)
+          sum += A[(k * N) + i] * B[(j * N) + k];
+
+        C[(j * N) + i] = (alpha * sum) + (beta * C[(j * N) + i]);
+      }
+
+  return;
+}
+
+void check(const MyData *const restrict CC,
+           const MyData *const restrict C)
+{
+  int flag = 0;
+  for (int i=0 ; i<SIZE ; i++)
+    flag = ((fabs(CC[i] - C[i]) > FLT_EPSILON) ? 1 : flag);
+
+  if (!flag)
+    printf("\n\t Result OK");
+  else
+    printf("\n\t Result wrong");
+
+  return;
+}
+
+int main()
+{
+  // Host allocation
+  MyData *buffer = (MyData *)malloc(4 * SIZE * sizeof(MyData));
+  assert(buffer != NULL);
+  MyData *const restrict A  = buffer;
+  MyData *const restrict B  = A + SIZE;
+  MyData *const restrict C  = B + SIZE;
+  MyData *const restrict CC = C + SIZE;
+
+  // Spawning 2 host threads
+  #pragma omp parallel num_threads(2)
+  {
+    // Evaluate the Dgemm on the host
+    #pragma omp single nowait
+    {
+      InitHost(A, B, CC);
+      HostDgemm(A, B, CC, ALPHA, BETA);
+    } // omp single
+
+    #pragma omp single nowait
+    {
+      // Initialize cuBLAS library
+      cublasHandle_t handle;
+      cublasCreate(&handle);
+
+      // Allocate A, B, C on the device
+      #pragma omp target enter data map(alloc: A[0:SIZE], B[0:SIZE], C[0:SIZE])
+
+      // Init device buffers with a blocking omp target directive
+      InitDev(A, B, C);
+
+      // Define a target data region where the A, B, and C pointers
+      // refer to the device's address space
+      #pragma omp target data use_device_addr(A, B, C)
+      {
+        MyData const alpha = ALPHA;
+        MyData const beta  = BETA;
+
+        cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N,
+                    &alpha, A, N, B, N, &beta, C, N);
+
+        // CUDA synchronization point
+        cudaDeviceSynchronize();
+      }
+
+      // Fetch data from the device and deallocate
+      #pragma omp target exit data map(from: C[0:SIZE]) map(delete: A[0:SIZE], B[0:SIZE])
+
+      cublasDestroy(handle);
+    } // omp single
+  } // synchronization point
+
+  check(CC, C);
+
+  free(buffer);
+
+  return 0;
+}
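One robustness note on the file above: the status codes returned by cublasCreate, cublasDgemm, and cublasDestroy are discarded. A minimal sketch of the same call sequence with checking; the CUBLAS_CHECK helper macro is hypothetical, not part of the repository:

#include <stdio.h>
#include <stdlib.h>
#include <cublas_v2.h>

// hypothetical helper: print and abort on any cuBLAS failure
#define CUBLAS_CHECK(call)                                   \
  do {                                                       \
    const cublasStatus_t s = (call);                         \
    if (s != CUBLAS_STATUS_SUCCESS) {                        \
      fprintf(stderr, "cuBLAS error %d at %s:%d\n",          \
              (int)s, __FILE__, __LINE__);                   \
      exit(EXIT_FAILURE);                                    \
    }                                                        \
  } while (0)

// usage: wrap every cuBLAS call, e.g.
//   CUBLAS_CHECK(cublasCreate(&handle));
//   CUBLAS_CHECK(cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N,
//                            &alpha, A, N, B, N, &beta, C, N));
//   CUBLAS_CHECK(cublasDestroy(handle));

The do/while(0) wrapper makes the macro behave like a single statement, so it composes safely with if/else around the call sites.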
diff --git a/cuda-omp/omp/miscellaneous/asynchronous.c b/cuda-omp/omp/miscellaneous/asynchronous.c
new file mode 100644
index 0000000000000000000000000000000000000000..974861e154f879a5b6f3c50c78eb6e24ee2cb96e
--- /dev/null
+++ b/cuda-omp/omp/miscellaneous/asynchronous.c
@@ -0,0 +1,110 @@
+////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Splitting the asynchronous vector addition task graph across four devices
+//
+// Author: David Goz
+// mail  : david.goz@inaf.it
+// date  : 28.08.2024
+// code tested using nvhpc
+//
+// - Compile the code:
+//   $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v asynchronous.c -o asynchronous_omp
+// - Run the code:
+//   $ export OMP_TARGET_OFFLOAD=mandatory
+//   $ ./asynchronous_omp
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <omp.h>
+
+typedef int MyData;
+
+#define NDEBUG
+
+void check(const MyData *const C,
+           const size_t size)
+{
+  int flag = 0;
+  for (size_t i=0 ; i
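The remainder of asynchronous.c is not shown here. As a reference for the pattern its header advertises, an asynchronous vector-addition task graph spread across the available devices, here is a minimal self-contained sketch; every name and size in it (CHUNK, the init values) is illustrative and not taken from the file:

#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

#define CHUNK 1024  /* illustrative per-device chunk size */

int main(void)
{
  const int ndev = omp_get_num_devices();
  if (ndev == 0) { fprintf(stderr, "no offload device available\n"); return 1; }

  const int size = ndev * CHUNK;
  int *A = malloc(size * sizeof(int));
  int *B = malloc(size * sizeof(int));
  int *C = malloc(size * sizeof(int));

  for (int i=0 ; i<size ; i++) { A[i] = i; B[i] = 2 * i; }

  // one deferred (nowait) target task per device, each working on its own chunk;
  // the generating host thread continues immediately after each launch
  for (int dev=0 ; dev<ndev ; dev++)
    {
      const int off = dev * CHUNK;

      #pragma omp target teams distribute parallel for device(dev) nowait \
              map(to: A[off:CHUNK], B[off:CHUNK]) map(from: C[off:CHUNK])
      for (int i=off ; i<(off + CHUNK) ; i++)
        C[i] = A[i] + B[i];
    }

  // wait for all the deferred target tasks to complete
  #pragma omp taskwait

  printf("C[0] = %d, C[%d] = %d\n", C[0], size - 1, C[size - 1]);

  free(A); free(B); free(C);

  return 0;
}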