diff --git a/cuda-omp/cuda/7/mat_mult_block.cu b/cuda-omp/cuda/7/mat_mult_block.cu index 7808f021a87df0329a4784f72fd1966aa133e737..814f21c8f935f7e60198fad20af441cc465f1330 100644 --- a/cuda-omp/cuda/7/mat_mult_block.cu +++ b/cuda-omp/cuda/7/mat_mult_block.cu @@ -257,9 +257,9 @@ int main() const dim3 block = {(BLOCK * BLOCK), 1, 1}; /////////////////////////// GPU naive block algorithm //////////////////////////////////////// - time = 0.0; GPU_mat_mult_block<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N); // warm-up cudaDeviceSynchronize(); + time = 0.0; for (unsigned short int loop=0 ; loop<LOOP ; loop++) { const double start = wall_time(); @@ -274,9 +274,9 @@ int main() ///////////////////////////////////////////////////////////////////////////////////// /////////////////////////// GPU block shared memory algorithm /////////////////////// - time = 0.0; GPU_mat_mult_block_shared<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N); // warm-up cudaDeviceSynchronize(); + time = 0.0; for (unsigned short int loop=0 ; loop<LOOP ; loop++) { const double start = wall_time(); diff --git a/cuda-omp/hybrid/hybrid_cublas_omp.c b/cuda-omp/hybrid/hybrid_cublas_omp.c new file mode 100644 index 0000000000000000000000000000000000000000..ab6aabe9a9984b5b0f30c2b5e7c91b94293f2336 --- /dev/null +++ b/cuda-omp/hybrid/hybrid_cublas_omp.c @@ -0,0 +1,282 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Using CUDA data in OpenMP +// +// Author: David Goz +// mail : david.goz@inaf.it +// date : 03.09.2024 +// code tested using nvhpc +// +// - Compile the code: +// - using nvc +// $ nvc -O3 -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v +// hybrid_cuda_omp.c -o hybrid_cuda_omp -lm -lcudart -lcublas +// - using clang +// $ clang -O3 -v -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda +// hybrid_cuda_omp.c -o hybrid_cuda_omp -lm -lcudart -lcublas +// +// - Run the code: +// $ export OMP_TARGET_OFFLOAD=mandatory +// $ ./hybrid_cuda_omp +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include <cuda_runtime.h> +#include <cublas_v2.h> +#include <assert.h> +#include <float.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <time.h> +#include <omp.h> + +#define N 2048 +#define SIZE ((N) * (N)) +#define ALPHA 1.0 +#define BETA 0.0 +#define HOST 0 +#define DEV 1 +#define LOOP 10 +#define INIT 0 +#define KERNEL 1 +#define DATA 2 + +typedef double MyData; + +static double _time[2][3]; +static int thr[2]; + +double process_time() +{ + struct timespec ts; + clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts); + const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9; + + return ret; +} + +double thread_time() +{ + struct timespec ts; + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts); + const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9; + + return ret; +} + +void InitHost(MyData *const restrict A, + MyData *const restrict B, + MyData *const restrict C, + int *const restrict thr) +{ + double start; + + #pragma omp parallel + { + #pragma omp barrier + #pragma omp master + { + *thr = omp_get_num_threads(); + start = thread_time(); + } + #pragma omp barrier + + #pragma omp for collapse(2) + for (int i=0 ; i<N ; i++) + for (int j=0 ; j<N ; j++) + { + A[(i * N) + j] = 1.0; + B[(i * N) + j] = 2.0; + C[(i * N) + j] = 0.0; + } + + #pragma omp master + { + _time[HOST][INIT] += (thread_time() - start); + } + } // omp parallel + + return; +} + +void InitDev(MyData *const restrict A, + MyData *const restrict B, + MyData *const restrict C) +{ + const double start = thread_time(); + + #pragma omp target teams loop collapse(2) + for (int i=0 ; i<N ; i++) + for (int j=0 ; j<N ; j++) + { + A[(i * N) + j] = 1.0; + B[(i * N) + j] = 2.0; + C[(i * N) + j] = 0.0; + } + + _time[DEV][INIT] += (thread_time() - start); + + return; +} + +void HostDgemm(MyData *const restrict A, + MyData *const restrict B, + MyData *const restrict C, + const MyData alpha, + const MyData beta, + int *const restrict thr) +{ + // C = alpha * A * B + beta * C; + + double start; + + // naive calculation + #pragma omp parallel + { + #pragma omp barrier + #pragma omp master + { + *thr = omp_get_num_threads(); + start = thread_time(); + } + #pragma omp barrier + + #pragma omp for collapse(2) + for (int i=0 ; i<N ; i++) + for (int j=0 ; j<N ; j++) + { + MyData sum = 0.0; + for (int k=0 ; k<N ; k++) + sum += A[(i * N) + k] * B[(k * N) + j]; + + C[(i * N) + j] = (alpha * sum) + (beta * C[(i * N) + j]); + } + + #pragma omp master + { + _time[HOST][KERNEL] += (thread_time() - start); + } + } // omp parallel + + return; +} + +void check(MyData *const restrict host_array, + MyData *const restrict dev_array) +{ + int flag = 0; + for (size_t i=0 ; i<SIZE ; i++) + flag = ((fabs(host_array[i] - dev_array[i]) > FLT_EPSILON) ? 1 : flag); + + if (!flag) + printf("\n\t Result OK \n"); + else + printf("\n\t Result wrong \n"); + + return; +} + +int main() +{ + // Host allocation + MyData *buffer = (MyData *)malloc(4 * SIZE * sizeof(MyData)); + assert(buffer != NULL); + MyData *const restrict A = buffer; + MyData *const restrict B = A + SIZE; + MyData *const restrict C_CPU = B + SIZE; + MyData *const restrict C_GPU = C_CPU + SIZE; + + // Spawning 2 host threads + #pragma omp parallel num_threads(2) + { + // Evaluate the Dgemm on the host + #pragma omp single nowait + { + // allowing nested parallelism + omp_set_max_active_levels(2); + + for (int loop=0 ; loop<LOOP ; loop++) + { + InitHost(A, B, C_CPU, &thr[0]); + HostDgemm(A, B, C_CPU, ALPHA, BETA, &thr[1]); + } + } // omp single + + #pragma omp single nowait + { + // Initialize cuBLAS library + cublasHandle_t handle; + cublasCreate(&handle); + + // Allocate A, B, C on the device using CUDA API + MyData *d_buffer = NULL; + cudaMalloc((void **)&d_buffer, (3 * SIZE * sizeof(MyData))); + assert(d_buffer != NULL); + MyData *const restrict d_A = d_buffer; + MyData *const restrict d_B = d_A + SIZE; + MyData *const restrict d_C = d_B + SIZE; + + // get the default device + const int dev = omp_get_default_device(); + + // Associate external device pointers + omp_target_associate_ptr(A, d_A, (SIZE * sizeof(MyData)), 0, dev); + omp_target_associate_ptr(B, d_B, (SIZE * sizeof(MyData)), 0, dev); + omp_target_associate_ptr(C_GPU, d_C, (SIZE * sizeof(MyData)), 0, dev); + + for (int loop=0 ; loop<LOOP ; loop++) + { + // Init device with blocking omp target directive + InitDev(A, B, C_GPU); + + // Apply DGEMM using device pointers directly + const MyData alpha = ALPHA; + const MyData beta = BETA; + + double start = thread_time(); + cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, + &alpha, + d_A, N, + d_B, N, + &beta, + d_C, N); + + // CUDA synchronization point + cudaDeviceSynchronize(); + _time[DEV][KERNEL] += (thread_time() - start); + + // Fetch data from the device and deallocate + start = thread_time(); + cudaMemcpy(C_GPU, d_C, (SIZE * sizeof(MyData)), cudaMemcpyDeviceToHost); + _time[DEV][DATA] += (thread_time() - start); + } // LOOP + + // release pointer association + omp_target_disassociate_ptr(A, dev); + omp_target_disassociate_ptr(B, dev); + omp_target_disassociate_ptr(C_GPU, dev); + + // deallocate device's memory + cudaFree(d_buffer); + + cublasDestroy(handle); + } // omp single + } // synchronization point + + check(C_CPU, C_GPU); + + free(buffer); + + printf("\n\t Matrix size: %d x %d\n", N, N); + + printf("\n\t Host execution time:"); + printf("\n\t\t Init : %lg [s] - threads: %d", _time[HOST][INIT]/LOOP, thr[0]); + printf("\n\t\t Dgemm : %lg [s] - threads: %d\n", _time[HOST][KERNEL]/LOOP, thr[1]); + + printf("\n\t Device execution time:"); + printf("\n\t\t Init : %lg [s]", _time[DEV][INIT]/LOOP); + printf("\n\t\t Dgemm : %lg [s]", _time[DEV][KERNEL]/LOOP); + printf("\n\t\t Fetch data: %lg [s]\n\n", _time[DEV][DATA]/LOOP); + + return 0; +} diff --git a/cuda-omp/hybrid/hybrid_cuda_omp.c b/cuda-omp/hybrid/hybrid_cuda_omp.c deleted file mode 100644 index 823be5dba91c9d5d13dc7dc146a609ac862f2497..0000000000000000000000000000000000000000 --- a/cuda-omp/hybrid/hybrid_cuda_omp.c +++ /dev/null @@ -1,159 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////////////////////////// -// -// Passing OpenMP data to cuBlas. -// -// Author: David Goz -// mail : david.goz@inaf.it -// date : 02.09.2024 -// code tested using nvhpc -// -// - Compile the code: -// $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v -// hybrid_cuda_omp.c -o hybrid_cuda_omp -lm -lcudart -lcublas -// - Run the code: -// $ export OMP_TARGET_OFFLOAD=mandatory -// $ ./hybrid_cuda_omp -//////////////////////////////////////////////////////////////////////////////////////////////////// - -#include <cuda_runtime.h> -#include <cublas_v2.h> -#include <assert.h> -#include <float.h> -#include <math.h> -#include <stdio.h> -#include <stdlib.h> - -#define N 512 -#define SIZE ((N) * (N)) -#define ALPHA 1.0 -#define BETA 0.0 - -typedef double MyData; - -void InitHost(MyData *const restrict A, - MyData *const restrict B, - MyData *const restrict C) -{ - //#pragma omp parallel for collapse(2) - for (int i=0 ; i<N ; i++) - for (int j=0 ; j<N ; j++) - { - A[(i * N) + j] = 1.0; - B[(i * N) + j] = 2.0; - C[(i * N) + j] = 0.0; - } -} - -void InitDev(MyData *const restrict A, - MyData *const restrict B, - MyData *const restrict C) -{ - #pragma omp target teams loop collapse(2) - for (int i=0 ; i<N ; i++) - for (int j=0 ; j<N ; j++) - { - A[(i * N) + j] = 1.0; - B[(i * N) + j] = 2.0; - C[(i * N) + j] = 0.0; - } - - return; -} - -void HostDgemm(MyData *const restrict A, - MyData *const restrict B, - MyData *const restrict C, - const MyData alpha, - const MyData beta) -{ - // C = alpha * A * B + beta * C; - - // naive calculation - // #pragma omp parallel for collapse(2) - for (int i=0 ; i<N ; i++) - for (int j=0 ; j<N ; j++) - { - MyData sum = 0.0; - for (int k=0 ; k<N ; k++) - sum += A[(i * N) + k] * B[(k * N) + j]; - - C[(i * N) + j] = (alpha * sum) + (beta * C[(i * N) + j]); - } - - return; -} - -void check(MyData *const restrict host_array, - MyData *const restrict dev_array) -{ - int flag = 0; - for (size_t i=0 ; i<SIZE ; i++) - flag = ((fabs(host_array[i] - dev_array[i]) > FLT_EPSILON) ? 1 : flag); - - if (!flag) - printf("\n\t Result OK"); - else - printf("\n\t Result wrong"); - - return; -} - -int main() -{ - // Host allocation - MyData *buffer = (MyData *)malloc(4 * SIZE * sizeof(MyData)); - assert(buffer != NULL); - MyData *const restrict A = buffer; - MyData *const restrict B = A + SIZE; - MyData *const restrict C = B + SIZE; - MyData *const restrict CC = C + SIZE; - - // Spawning 2 host threads - #pragma omp parallel num_threads(2) - { - // Evaluate the Dgemm on the host - #pragma omp single nowait - { - InitHost(A, B, CC); - HostDgemm(A, B, CC, ALPHA, BETA); - } // omp single - - #pragma omp single nowait - { - // Initialize cuBLAS library - cublasHandle_t handle; - cublasCreate(&handle); - - // Allocate A, B, C on the device - #pragma omp target enter data map(alloc: A[0:SIZE], B[0:SIZE], C[0:SIZE]) - - // Init device with blocking omp target directive - InitDev(A, B, C); - - // Define a target data region where A, B, and C pointers - // refer to device's address space - #pragma omp target data use_device_addr(A, B, C) - { - MyData const alpha = ALPHA; - MyData const beta = BETA; - - cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, - &alpha, A, N, B, N, &beta, C, N); - - // CUDA synchronization point - cudaDeviceSynchronize(); - } - - // Fetch data from the device and deallocate - #pragma omp target exit data map(from: C[0:SIZE]) map(delete: A[0:SIZE], B[0:SIZE]) - - cublasDestroy(handle); - } // omp single - } // synchronization point - - check(CC, C); - - free(buffer); - - return 0; -} diff --git a/cuda-omp/hybrid/hybrid_omp_cublas.c b/cuda-omp/hybrid/hybrid_omp_cublas.c new file mode 100644 index 0000000000000000000000000000000000000000..126eef389924cfbb0aeb0c696730d4b019716fa2 --- /dev/null +++ b/cuda-omp/hybrid/hybrid_omp_cublas.c @@ -0,0 +1,268 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Passing OpenMP data to foreign runtime (cuBLAS library). +// +// Author: David Goz +// mail : david.goz@inaf.it +// date : 02.09.2024 +// code tested using nvhpc +// +// - Compile the code: +// - using nvc +// $ nvc -O3 -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v +// hybrid_omp_cuda.c -o hybrid_omp_cuda -lm -lcudart -lcublas +// - using clang +// $ clang -O3 -v -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda +// hybrid_omp_cuda.c -o hybrid_omp_cuda -lm -lcudart -lcublas +// +// - Run the code: +// $ export OMP_TARGET_OFFLOAD=mandatory +// $ ./hybrid_omp_cuda +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include <cuda_runtime.h> +#include <cublas_v2.h> +#include <assert.h> +#include <float.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <time.h> +#include <omp.h> + +#define N 2048 +#define SIZE ((N) * (N)) +#define ALPHA 1.0 +#define BETA 0.0 +#define HOST 0 +#define DEV 1 +#define LOOP 10 +#define INIT 0 +#define KERNEL 1 +#define DATA 2 + +typedef double MyData; + +static double _time[2][3]; +static int thr[2]; + +double process_time() +{ + struct timespec ts; + clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts); + const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9; + + return ret; +} + +double thread_time() +{ + struct timespec ts; + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts); + const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9; + + return ret; +} + +void InitHost(MyData *const restrict A, + MyData *const restrict B, + MyData *const restrict C, + int *const restrict thr) +{ + double start; + + #pragma omp parallel + { + #pragma omp barrier + #pragma omp master + { + *thr = omp_get_num_threads(); + start = thread_time(); + } + #pragma omp barrier + + #pragma omp for collapse(2) + for (int i=0 ; i<N ; i++) + for (int j=0 ; j<N ; j++) + { + A[(i * N) + j] = 1.0; + B[(i * N) + j] = 2.0; + C[(i * N) + j] = 0.0; + } + + #pragma omp master + { + _time[HOST][INIT] += (thread_time() - start); + } + } // omp parallel + + return; +} + +void InitDev(MyData *const restrict A, + MyData *const restrict B, + MyData *const restrict C) +{ + const double start = thread_time(); + + #pragma omp target teams loop collapse(2) + for (int i=0 ; i<N ; i++) + for (int j=0 ; j<N ; j++) + { + A[(i * N) + j] = 1.0; + B[(i * N) + j] = 2.0; + C[(i * N) + j] = 0.0; + } + + _time[DEV][INIT] += (thread_time() - start); + + return; +} + +void HostDgemm(MyData *const restrict A, + MyData *const restrict B, + MyData *const restrict C, + const MyData alpha, + const MyData beta, + int *const restrict thr) +{ + // C = alpha * A * B + beta * C; + + double start; + + // naive calculation + #pragma omp parallel + { + #pragma omp barrier + #pragma omp master + { + *thr = omp_get_num_threads(); + start = thread_time(); + } + #pragma omp barrier + + #pragma omp for collapse(2) + for (int i=0 ; i<N ; i++) + for (int j=0 ; j<N ; j++) + { + MyData sum = 0.0; + for (int k=0 ; k<N ; k++) + sum += A[(i * N) + k] * B[(k * N) + j]; + + C[(i * N) + j] = (alpha * sum) + (beta * C[(i * N) + j]); + } + + #pragma omp master + { + _time[HOST][KERNEL] += (thread_time() - start); + } + } // omp parallel + + return; +} + +void check(MyData *const restrict host_array, + MyData *const restrict dev_array) +{ + int flag = 0; + for (size_t i=0 ; i<SIZE ; i++) + flag = ((fabs(host_array[i] - dev_array[i]) > FLT_EPSILON) ? 1 : flag); + + if (!flag) + printf("\n\t Result OK \n"); + else + printf("\n\t Result wrong \n"); + + return; +} + +int main() +{ + // Host allocation + MyData *buffer = (MyData *)malloc(4 * SIZE * sizeof(MyData)); + assert(buffer != NULL); + MyData *const restrict A = buffer; + MyData *const restrict B = A + SIZE; + MyData *const restrict C = B + SIZE; + MyData *const restrict CC = C + SIZE; + + // Spawning 2 host threads + #pragma omp parallel num_threads(2) + { + // Evaluate the Dgemm on the host + #pragma omp single nowait + { + // allowing nested parallelism + omp_set_max_active_levels(2); + + for (int loop=0 ; loop<LOOP ; loop++) + { + InitHost(A, B, CC, &thr[0]); + HostDgemm(A, B, CC, ALPHA, BETA, &thr[1]); + } + } // omp single + + #pragma omp single nowait + { + // Initialize cuBLAS library + cublasHandle_t handle; + cublasCreate(&handle); + + // Allocate A, B, C on the device + #pragma omp target enter data map(alloc: A[0:SIZE], B[0:SIZE], C[0:SIZE]) + + for (int loop=0 ; loop<LOOP ; loop++) + { + // Init device with blocking omp target directive + InitDev(A, B, C); + + // Define a target data region where A, B, and C pointers + // refer to device's address space + #pragma omp target data use_device_ptr(A, B, C) + { + const MyData alpha = ALPHA; + const MyData beta = BETA; + + const double start = thread_time(); + cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, + &alpha, + A, N, + B, N, + &beta, + C, N); + + // CUDA synchronization point + cudaDeviceSynchronize(); + _time[DEV][KERNEL] += (thread_time() - start); + } + + // Fetch data from the device and deallocate + const double start = thread_time(); + #pragma omp target update from (C[0:SIZE]) + _time[DEV][DATA] += (thread_time() - start); + } // LOOP + + // deallocate device's memory + #pragma omp target exit data map(delete: A[0:SIZE], B[0:SIZE], C[0:SIZE]) + + cublasDestroy(handle); + } // omp single + } // synchronization point + + check(CC, C); + + free(buffer); + + printf("\n\t Matrix size: %d x %d\n", N, N); + + printf("\n\t Host execution time:"); + printf("\n\t\t Init : %lg [s] - threads: %d", _time[HOST][INIT]/LOOP, thr[0]); + printf("\n\t\t Dgemm : %lg [s] - threads: %d\n", _time[HOST][KERNEL]/LOOP, thr[1]); + + printf("\n\t Device execution time:"); + printf("\n\t\t Init : %lg [s]", _time[DEV][INIT]/LOOP); + printf("\n\t\t Dgemm : %lg [s]", _time[DEV][KERNEL]/LOOP); + printf("\n\t\t Fetch data: %lg [s]\n\n", _time[DEV][DATA]/LOOP); + + return 0; +} diff --git a/cuda-omp/hybrid/hybrid_omp_cuda.cu b/cuda-omp/hybrid/hybrid_omp_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..221044bcc4b57ec9faa8e418a5ea294a82f18de7 --- /dev/null +++ b/cuda-omp/hybrid/hybrid_omp_cuda.cu @@ -0,0 +1,281 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Passing OpenMP data to foreign runtime (cuBLAS library). +// +// Author: David Goz +// mail : david.goz@inaf.it +// date : 03.09.2024 +// code tested using nvhpc +// +// - Compile the code: +// - using nvc +// $ nvc++ -O3 -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v +// hybrid_omp_cuda.cu -o hybrid_omp_cuda -lm && ./hybrid_omp_cuda +// +// - using clang +// $ clang -O3 -v -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda +// hybrid_omp_cuda.c -o hybrid_omp_cuda -lm +// +// - Run the code: +// $ export OMP_TARGET_OFFLOAD=mandatory +// $ ./hybrid_omp_cuda +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include <cuda.h> +#include <math.h> +#include <assert.h> +#include <float.h> +#include <stdio.h> +#include <stdlib.h> +#include <time.h> +#include <omp.h> + +#define N 1024 +#define SIZE ((N) * (N)) +#define HOST 0 +#define DEV 1 +#define LOOP 10 +#define INIT 0 +#define KERNEL 1 +#define DATA 2 +#define BLOCKSIZE 1024 + +typedef double MyData; + +static double _time[2][3]; +static int thr[2]; + +double process_time() +{ + struct timespec ts; + clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts); + const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9; + + return ret; +} + +double thread_time() +{ + struct timespec ts; + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts); + const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9; + + return ret; +} + +void InitHost(MyData *const restrict A, + MyData *const restrict B, + MyData *const restrict C, + int *const restrict thr) +{ + double start; + + #pragma omp parallel + { + #pragma omp barrier + #pragma omp master + { + *thr = omp_get_num_threads(); + start = thread_time(); + } + #pragma omp barrier + + #pragma omp for collapse(2) + for (int i=0 ; i<N ; i++) + for (int j=0 ; j<N ; j++) + { + A[(i * N) + j] = 1.0; + B[(i * N) + j] = (1.0 / M_PI); + C[(i * N) + j] = 0.0; + } + + #pragma omp master + { + _time[HOST][INIT] += (thread_time() - start); + } + } // omp parallel + + return; +} + +void InitDev(MyData *const restrict A, + MyData *const restrict B, + MyData *const restrict C) +{ + const double start = thread_time(); + + #pragma omp target teams loop collapse(2) is_device_ptr(A, B, C) + for (int i=0 ; i<N ; i++) + for (int j=0 ; j<N ; j++) + { + A[(i * N) + j] = 1.0; + B[(i * N) + j] = (1.0 / M_PI); + C[(i * N) + j] = 0.0; + } + + _time[DEV][INIT] += (thread_time() - start); + + return; +} + +void HostMM(MyData *const restrict A, + MyData *const restrict B, + MyData *const restrict C, + int *const restrict thr) +{ + // C = alpha * A * B + beta * C; + + double start; + + // naive calculation + #pragma omp parallel + { + #pragma omp barrier + #pragma omp master + { + *thr = omp_get_num_threads(); + start = thread_time(); + } + #pragma omp barrier + + #pragma omp for collapse(2) + for (int i=0 ; i<N ; i++) + for (int j=0 ; j<N ; j++) + { + MyData sum = 0.0; + for (int k=0 ; k<N ; k++) + sum += A[(i * N) + k] * B[(k * N) + j]; + + C[(i * N) + j] = sum; + } + + #pragma omp master + { + _time[HOST][KERNEL] += (thread_time() - start); + } + } // omp parallel + + return; +} + +__global__ void DevMM(MyData *const restrict A, + MyData *const restrict B, + MyData *const restrict C, + const int n) +{ + const int size = (n * n); + const int globalID = threadIdx.x + (blockIdx.x * blockDim.x); + + if (globalID >= size) + return; + + const int i = (globalID / N); + const int j = (globalID % N); + + MyData sum = 0.0; + for (int k=0 ; k<N ; k++) + sum += (A[(i * N) + k] * B[(k * N) + j]); + + C[(i * N) + j] = sum; + + return; +} + +void check(MyData *const restrict host_array, + MyData *const restrict dev_array) +{ + int flag = 0; + for (size_t i=0 ; i<SIZE ; i++) + flag = ((fabs(host_array[i] - dev_array[i]) > FLT_EPSILON) ? 1 : flag); + + if (!flag) + printf("\n\t Result OK \n"); + else + printf("\n\t Result wrong \n"); + + return; +} + +int main() +{ + // Host allocation + MyData *h_buffer = (MyData *)malloc(2 * SIZE * sizeof(MyData)); + assert(h_buffer != NULL); + MyData *const restrict C_HOST = h_buffer; + MyData *const restrict C_DEV = C_HOST + SIZE; + + // Spawning 2 host threads + #pragma omp parallel num_threads(2) + { + // Evaluate the Dgemm on the host + #pragma omp single nowait + { + // allowing nested parallelism + omp_set_max_active_levels(2); + + MyData *tmp = (MyData *)malloc(2 * SIZE * sizeof(MyData)); + MyData *const restrict A = tmp; + MyData *const restrict B = A + SIZE; + + for (int loop=0 ; loop<LOOP ; loop++) + { + InitHost(A, B, C_HOST, &thr[0]); + HostMM(A, B, C_HOST, &thr[1]); + } + + free(tmp); + } // omp single + + #pragma omp single nowait + { + // Device allocation + const int dev = omp_get_default_device(); + const int host = omp_get_initial_device(); + MyData *d_buffer = (MyData *)omp_target_alloc((3 * SIZE * sizeof(MyData)), dev); + assert(d_buffer != NULL); + MyData *const restrict d_A = d_buffer; + MyData *const restrict d_B = d_A + SIZE; + MyData *const restrict d_C = d_B + SIZE; + + const dim3 nblock = {((SIZE + BLOCKSIZE - 1) / BLOCKSIZE), 1, 1}; + const dim3 block = {BLOCKSIZE, 1, 1}; + + for (int loop=0 ; loop<LOOP ; loop++) + { + // Init device with blocking omp target directive + InitDev(d_A, d_B, d_C); + + double start = thread_time(); + DevMM<<< nblock, block >>>(d_A, d_B, d_C, N); + // CUDA synchronization point + cudaDeviceSynchronize(); + _time[DEV][KERNEL] += (thread_time() - start); + + // Fetch data from the device and deallocate + start = thread_time(); + omp_target_memcpy(C_DEV, d_C, (SIZE * sizeof(MyData)), 0, 0, host, dev); + _time[DEV][DATA] += (thread_time() - start); + } // LOOP + + // deallocate device's memory + omp_target_free(d_buffer, dev); + } // omp single + } // synchronization point + + check(C_HOST, C_DEV); + + free(h_buffer); + + printf("\n\t Matrix size: %d x %d\n", N, N); + + printf("\n\t Host execution time:"); + printf("\n\t\t Init : %lg [s] - threads: %d", _time[HOST][INIT]/LOOP, thr[0]); + printf("\n\t\t Dgemm : %lg [s] - threads: %d\n", _time[HOST][KERNEL]/LOOP, thr[1]); + + printf("\n\t Device execution time:"); + printf("\n\t\t Init : %lg [s]", _time[DEV][INIT]/LOOP); + printf("\n\t\t Dgemm : %lg [s]", _time[DEV][KERNEL]/LOOP); + printf("\n\t\t Fetch data: %lg [s]\n\n", _time[DEV][DATA]/LOOP); + + return 0; +} diff --git a/cuda-omp/omp/miscellaneous/host_device_address_space.c b/cuda-omp/omp/miscellaneous/host_device_address_space.c new file mode 100644 index 0000000000000000000000000000000000000000..7b2c81b475c83ee209721735ca3b86b69ef9d75b --- /dev/null +++ b/cuda-omp/omp/miscellaneous/host_device_address_space.c @@ -0,0 +1,76 @@ +////////////////////////////////////////////////////////////////////////////////////////////////// +// Author: David Goz +// mail : david.goz@inaf.it +// date : 03.09.2024 +// code tested using nvhpc +// +// - Compile the code: +// - nvc complains about the '#pragma omp target has_device_addr' +// - clang works +////////////////////////////////////////////////////////////////////////////////////////////////// + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <assert.h> + +static int answer = 42; +#define SIZE 3 + +int main() +{ + int *A = (int *)malloc(SIZE * sizeof(int)); + assert(A != NULL); + memset(A, 0, (SIZE * sizeof(int))); + + printf("\n\t Host address of 'answer' [value: %d] is: %p", answer, &answer); + printf("\n\t Host address of 'A' is: %p", A); + + // Map 'answer' to the device + #pragma omp target data map(alloc: answer, A[0:SIZE]) + { + #pragma omp target update to(answer, A[0:SIZE]) + + // Update 'answer' on the device + #pragma omp target map(answer) + { + answer += 1; + for (int i=0 ; i<SIZE ; i++) + A[i] += (i + 1); + } + + #pragma omp target update from(answer, A[0:SIZE]) + + printf("\n\t Address of 'answer' [value: %d] in target data region is: %p", + answer, &answer); + printf("\n\t Address of 'A' in target data region is: %p", A); + printf("\n\t Value of A:"); + for (int i=0 ; i<SIZE ; i++) + printf("\n\t\t A[%d] = %d", i, A[i]); + printf("\n"); + + #pragma omp target data use_device_addr(answer) use_device_ptr(A) + { + #pragma omp target has_device_addr(answer) + { + answer += 1; + for (int i=0 ; i<SIZE ; i++) + A[i] += (i + 1); + } + + printf("\n\t Device address of 'answer' is: %p", &answer); + printf("\n\t Device address of 'A' is: %p", A); + } + + #pragma omp target update from(answer, A[0:SIZE]) + printf("\n\t Last value of 'answer' is: %d", answer); + printf("\n\t Last value if 'A' is:"); + for (int i=0 ; i<SIZE ; i++) + printf("\n\t\t A[%d] = %d", i, A[i]); + printf("\n\n"); + } // omp target data + + free(A); + + return 0; +} diff --git a/cuda-omp/omp/miscellaneous/structure.c b/cuda-omp/omp/miscellaneous/structure.c index fc0041c02e55eb438685ab113ca2da41b7b658a0..7147e2d55e31525f273cc98b48da0dd01cfecfea 100644 --- a/cuda-omp/omp/miscellaneous/structure.c +++ b/cuda-omp/omp/miscellaneous/structure.c @@ -86,6 +86,6 @@ int main() free(A.ptr); free(B.ptr); free(C.ptr); - + return 0; } diff --git a/cuda-omp/omp/miscellaneous/structure_routines.c b/cuda-omp/omp/miscellaneous/structure_routines.c new file mode 100644 index 0000000000000000000000000000000000000000..91299044a7b23ea5f08729d8803663ef3b973e80 --- /dev/null +++ b/cuda-omp/omp/miscellaneous/structure_routines.c @@ -0,0 +1,126 @@ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Author: David Goz +// mail : david.goz@inaf.it +// date : 31.07.2024 +// code tested using nvhpc +// +// - Compile the code: +// $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v structure_routines.c -o structure_routines_omp +// - Run the code: +// $ export OMP_TARGET_OFFLOAD=mandatory +// $ ./structure_routines_omp +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include <stdio.h> +#include <stdlib.h> +#include <assert.h> +#include <omp.h> + +#define SIZE 8 +#define SIZE_2 (SIZE / 2) +typedef double MyData; + +typedef struct my_span +{ + size_t N; + MyData *A; + MyData *B; +} span; + +span d_S; +#pragma omp declare target(d_S) + +void allocate( span *my_struct, + const size_t size) +{ + /* allocate the buffer on the host memory */ + my_struct->A = (MyData *)calloc(size, sizeof(MyData)); + my_struct->B = (MyData *)calloc(size, sizeof(MyData)); + assert((my_struct->A != NULL) && (my_struct->B != NULL)); + my_struct->N = size; + + for (size_t i=0 ; i<size; i++) + { + my_struct->A[i] = (MyData)(3 * i); + my_struct->B[i] = (MyData)(2 * i); + } + + return; +} + +void print(const span *const ptr, + const char *const string) +{ + int flag = 0; + + printf("\n"); + for (int i=0 ; i<ptr->N ; i++) + { + printf("\n\t %s[%d] = %lg", string, i, ptr->A[i]); + printf("\n\t %s[%d] = %lg", string, i, ptr->B[i]); + flag = (((ptr->A[i] != 0) || (ptr->B[i] != 0)) ? 1 : flag); + } + printf("\n"); + + if (flag) + printf("\n\t Result wrong \n\n"); + else + printf("\n\t Result OK \n\n"); + + return; +} + +int main() +{ + /* host allocation */ + span h_S; + allocate(&h_S, SIZE); + + /* allocating GPU memory using OMP routines */ + const int dev = omp_get_default_device(); + const int host = omp_get_initial_device(); + MyData *d_buffer = (double *)omp_target_alloc(2 * SIZE * sizeof(MyData), dev); + assert(d_buffer != NULL); + + /* set the pointers within the GPU */ +#pragma omp target is_device_ptr(d_buffer) device(dev) + { + d_S.N = SIZE; + d_S.A = d_buffer; + d_S.B = d_buffer + SIZE; + } + + /* copy data to the GPU */ + omp_target_memcpy(d_buffer, h_S.A, (SIZE * sizeof(MyData)), 0, 0, dev, host); + omp_target_memcpy(d_buffer, h_S.B, (SIZE * sizeof(MyData)), (SIZE * sizeof(MyData)), 0, dev, host); + + /* perform the calculation on the GPU */ +#pragma omp target device(dev) + { + #pragma omp loop + for (size_t i=0 ; i<d_S.N ; i++) + { + d_S.A[i] -= (MyData)(3 * i); + d_S.B[i] -= (MyData)(2 * i); + } + } + + /* copy data from the GPU */ + omp_target_memcpy(h_S.A, d_buffer, (SIZE * sizeof(MyData)), + 0, 0, host, dev); + omp_target_memcpy(h_S.B, d_buffer, (SIZE * sizeof(MyData)), + 0, (SIZE * sizeof(MyData)), host, dev); + + /* check the data */ + print(&h_S, "d_S"); + + /* free GPU memory */ + omp_target_free(d_buffer, dev); + + /* free host memory */ + free(h_S.A); + free(h_S.B); + + return 0; +}