// NOTE(review): As stored, this file is collapsed onto very long physical
// lines. Because the next line begins with `//`, a C compiler treats all of
// it (banner, #include/#define directives, the MyData typedef, the _time/thr
// statics, process_time, thread_time and the start of InitHost) as a single
// line comment, so none of it is compiled in this form.
// NOTE(review): The text also looks truncated: the nine `#include` directives
// carry no header names, and the InitHost loop header stops at
// `for (int i=0 ; i` and continues directly with `FLT_EPSILON) ?`.
// Presumably every `<...>` span was stripped -- TODO restore this file from
// the original hybrid_cuda_omp.c before relying on it.
// Visible helpers: process_time() and thread_time() call clock_gettime() with
// CLOCK_PROCESS_CPUTIME_ID and CLOCK_THREAD_CPUTIME_ID respectively and return
// tv_sec + tv_nsec * 1.0e-9 as a double; the return value of clock_gettime()
// is not checked.
//////////////////////////////////////////////////////////////////////////////////////////////////// // // Using CUDA data in OpenMP // // Author: David Goz // mail : david.goz@inaf.it // date : 03.09.2024 // code tested using nvhpc // // - Compile the code: // - using nvc // $ nvc -O3 -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v // hybrid_cuda_omp.c -o hybrid_cuda_omp -lm -lcudart -lcublas // - using clang // $ clang -O3 -v -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda // hybrid_cuda_omp.c -o hybrid_cuda_omp -lm -lcudart -lcublas // // - Run the code: // $ export OMP_TARGET_OFFLOAD=mandatory // $ ./hybrid_cuda_omp //////////////////////////////////////////////////////////////////////////////////////////////////// #include #include #include #include #include #include #include #include #include #define N 2048 #define SIZE ((N) * (N)) #define ALPHA 1.0 #define BETA 0.0 #define HOST 0 #define DEV 1 #define LOOP 10 #define INIT 0 #define KERNEL 1 #define DATA 2 typedef double MyData; static double _time[2][3]; static int thr[2]; double process_time() { struct timespec ts; clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts); const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9; return ret; } double thread_time() { struct timespec ts; clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts); const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9; return ret; } void InitHost(MyData *const restrict A, MyData *const restrict B, MyData *const restrict C, int *const restrict thr) { double start; #pragma omp parallel { #pragma omp barrier #pragma omp master { *thr = omp_get_num_threads(); start = thread_time(); } #pragma omp barrier #pragma omp for collapse(2) for (int i=0 ; i FLT_EPSILON) ? 
1 : flag); if (!flag) printf("\n\t Result OK \n"); else printf("\n\t Result wrong \n"); return; } int main() { // Host allocation MyData *buffer = (MyData *)malloc(4 * SIZE * sizeof(MyData)); assert(buffer != NULL); MyData *const restrict A = buffer; MyData *const restrict B = A + SIZE; MyData *const restrict C_CPU = B + SIZE; MyData *const restrict C_GPU = C_CPU + SIZE; // Spawning 2 host threads #pragma omp parallel num_threads(2) { // Evaluate the Dgemm on the host #pragma omp single nowait { // allowing nested parallelism omp_set_max_active_levels(2); for (int loop=0 ; loop