diff --git a/cuda-omp/omp/7/mat_mult.c b/cuda-omp/omp/7/mat_mult.c index 789e7f2a73299e8fa3fcc7039efdd033a0da999e..76e5ab2394a70d3956d8d5c1fc00549e26bb8d08 100644 --- a/cuda-omp/omp/7/mat_mult.c +++ b/cuda-omp/omp/7/mat_mult.c @@ -88,7 +88,7 @@ void GPU_mat_mult(const MyData *const restrict A, C[(i * size) + j] = value; } // omp threads } // omp target teams - + return; } diff --git a/cuda-omp/omp/miscellaneous/memory_management.c b/cuda-omp/omp/miscellaneous/memory_management.c new file mode 100644 index 0000000000000000000000000000000000000000..373945ef40be86abe2ae3b0c9cb09b6a0a391e62 --- /dev/null +++ b/cuda-omp/omp/miscellaneous/memory_management.c @@ -0,0 +1,420 @@ +//////////////////////////////////////////////////////////////////////////////////////////////// +// - Block matrix multiplication algorithm +// +// const size_t Nblocks = (N / Bsize); +// +// // loop over blocks of matrix C +// for (size_t ib=0 ; ib<Nblocks ; ib++) +// { +// for (size_t jb=0 ; jb<Nblocks ; jb++) +// { +// +// // loop over blocks of rows of A +// for (size_t kb=0 ; kb<Nblocks ; kb++) +// { +// +// // Matrix multiplication within a block +// for (size_t i=(ib * Bsize) ; i<((ib + 1) * Bsize) ; i++) +// for (size_t j=(jb * Bsize) ; j<((jb + 1) * Bsize) ; j++) +// for (size_t k=(kb * Bsize) ; k<((kb + 1) * Bsize) ; k++) +// C[(i * N) + j] += A[(i * N) + k] * B[(k * N) + j]; +// +// } // kb +// } // jb +// } // ib +// +// - Exploit GPU shared-memory. +// - Use explicit target memory allocator and memory management OpenMP APIs +//////////////////////////////////////////////////////////////////////////////////////////////// + +////////////////////////////////////////////////////////////////////////////////////////////////// +// Author: David Goz +// mail : david.goz@inaf.it +// date : 26.08.2024 +// code tested using nvhpc +// +// - Compile the code: +// $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v memory_management.c -o memory_management_omp +// - Run the code: +// $ export OMP_TARGET_OFFLOAD=MANDATORY +// $ ./memory_management_omp +// +// N.B. 
team size in the range [32, 1024) +////////////////////////////////////////////////////////////////////////////////////////////////// + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <time.h> +#include <assert.h> +#include <omp.h> +#include <string.h> +#include <math.h> +#include <float.h> + +#define N 1024 +#define SIZE (N * N) // matrix size +typedef double MyData; // do not change +#define _BLOCK_ 16 +#define BLOCKSIZE ((_BLOCK_) * (_BLOCK_)) /* 32 <= BLOCKSIZE < 1024 */ + +// sanity check +#if _BLOCK_*_BLOCK_ > 1024 +#error _BLOCK_*_BLOCK_ cannot larger than 1024 +#endif + +#if _BLOCK_ > N +#error _BLOCK_ > N +#endif + +#define LOOP 100 +#define NDEBUG + +double wall_time() +{ + struct timespec ts; + clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts); + const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9; + + return ret; +} + +void CPU_mat_mult_block(const MyData *const restrict A, + const MyData *const restrict B, + MyData *const restrict C, + const size_t size) +{ + const size_t Nblocks = (size / _BLOCK_); + + // loop over blocks of matrix C + for (size_t ib=0 ; ib<Nblocks ; ib++) + { + for (size_t jb=0 ; jb<Nblocks ; jb++) + { + // loop over blocks of rows of A + for (size_t kb=0 ; kb<Nblocks ; kb++) + { + // Matrix multiplication within a block + for (size_t i=(ib * _BLOCK_) ; i<((ib + 1) * _BLOCK_) ; i++) + for (size_t j=(jb * _BLOCK_) ; j<((jb + 1) * _BLOCK_) ; j++) + for (size_t k=(kb * _BLOCK_) ; k<((kb + 1) * _BLOCK_) ; k++) + C[(i * size) + j] += A[(i * size) + k] * B[(k * size) + j]; + } // kb + } // jb + } // ib + + return; +} + +void GPU_mat_mult_block(const MyData *const restrict A, + const MyData *const restrict B, + MyData *const restrict C, + const size_t size) +{ + const size_t Nblocks = (size / _BLOCK_); + + #pragma omp target \ + teams distribute collapse(2) num_teams(Nblocks * Nblocks) \ + thread_limit(BLOCKSIZE) \ + is_device_ptr(A, B, C) + for (size_t ib=0 ; ib<Nblocks ; ib++) + { + for (size_t jb=0 ; jb<Nblocks ; jb++) + { + for (size_t kb=0 ; kb<Nblocks ; kb++) + { + #pragma omp parallel for collapse(2) num_threads(BLOCKSIZE) + for (size_t i=(ib * _BLOCK_) ; i<((ib + 1) * _BLOCK_) ; i++) + { + for (size_t j=(jb * _BLOCK_) ; j<((jb + 1) * _BLOCK_) ; j++) + { + MyData value = ((kb == 0) ? 
(MyData)0 : C[(i * size) + j]); + for (size_t k=(kb * _BLOCK_) ; k<((kb + 1) * _BLOCK_) ; k++) + value += A[(i * size) + k] * B[(k * size) + j]; + + C[(i * size) + j] = value; + } // j + } // i + } // kb + } // jb + } // ib + + return; +} + +void GPU_mat_mult_block_shared(const MyData *const restrict A, + const MyData *const restrict B, + MyData *const restrict C, + const size_t size) +{ + const size_t Nblocks = (size / _BLOCK_); + // Team local copies of tiles + MyData Ablock[BLOCKSIZE]; + MyData Bblock[BLOCKSIZE]; + MyData Cblock[BLOCKSIZE]; + + #pragma omp target \ + teams distribute collapse(2) num_teams(Nblocks * Nblocks) \ + private(Ablock, Bblock, Cblock) \ + thread_limit(BLOCKSIZE) \ + is_device_ptr(A, B, C) + for (size_t ib=0 ; ib<Nblocks ; ib++) + { + for (size_t jb=0 ; jb<Nblocks ; jb++) + { + for (size_t kb=0 ; kb<Nblocks ; kb++) + { + #pragma omp parallel num_threads(BLOCKSIZE) + { + if (kb == 0) // Cblock initialization + { + #pragma omp for collapse(2) nowait + for (size_t i=0 ; i<_BLOCK_ ; i++) + for (size_t j=0 ; j<_BLOCK_ ; j++) + Cblock[(i * _BLOCK_) + j] = (MyData)0; + } + + // copy block of A into pteam memory Ablock + #pragma omp for collapse(2) nowait // implicit barrier is removed + for (size_t i=(ib * _BLOCK_) ; i<((ib + 1) * _BLOCK_) ; i++) + for (size_t k=(kb * _BLOCK_) ; k<((kb + 1) * _BLOCK_) ; k++) + Ablock[(i % _BLOCK_) * _BLOCK_ + (k % _BLOCK_)] = A[(i * N) + k]; + + // copy block of B into pteam memory Bblock + #pragma omp for collapse(2) // implicit barrier to ensure that shared Ablock and Bblock can be accessed by all threads within the block + for (size_t j=(jb * _BLOCK_) ; j<((jb + 1) * _BLOCK_) ; j++) + for (size_t k=(kb * _BLOCK_) ; k<((kb + 1) * _BLOCK_) ; k++) + Bblock[(k % _BLOCK_) * _BLOCK_ + (j % _BLOCK_)] = B[(k * N) + j]; + + // matrix multiply within the block + #pragma omp for collapse(2) nowait + for (size_t i=(ib * _BLOCK_) ; i<((ib + 1) * _BLOCK_) ; i++) + for (size_t j=(jb * _BLOCK_) ; j<((jb + 1) * _BLOCK_) ; j++) + { + MyData value = (MyData)0; + for (size_t k=(kb * _BLOCK_) ; k<((kb + 1) * _BLOCK_) ; k++) + value += Ablock[(i % _BLOCK_) * _BLOCK_ + (k % _BLOCK_)] * + Bblock[(k % _BLOCK_) * _BLOCK_ + (j % _BLOCK_)]; + + Cblock[((i % _BLOCK_) * _BLOCK_) + (j % _BLOCK_)] += value; + + if (kb == (Nblocks - 1)) + C[(i * N) + j] = Cblock[((i % _BLOCK_) * _BLOCK_) + (j % _BLOCK_)]; + } + } // omp parallel + } // kb + } // jb + } // ib + + return; +} + +void GPU_mat_mult_block_shared_no_loops(const MyData *const restrict A, + const MyData *const restrict B, + MyData *const restrict C, + const size_t size) +{ + const size_t Nblocks = (size / _BLOCK_); + // Team local copies of tiles + MyData Ablock[BLOCKSIZE]; + MyData Bblock[BLOCKSIZE]; + MyData Cblock[BLOCKSIZE]; + + #pragma omp target \ + teams num_teams(Nblocks * Nblocks) \ + private(Ablock, Bblock, Cblock) \ + thread_limit(BLOCKSIZE) \ + is_device_ptr(A, B, C) + { + const size_t ib = omp_get_team_num() / Nblocks; + const size_t jb = omp_get_team_num() % Nblocks; + + #pragma omp parallel num_threads(BLOCKSIZE) + { +#if !defined(NDEBUG) + + /* ID within the team (CUDA block) */ + const int localID = omp_get_thread_num(); + /* Team ID (ID CUDA block) */ + const int team = omp_get_team_num(); + /* Team size (CUDA block size) */ + const int nthr = omp_get_num_threads(); + /* global thread index */ + const int globalID = (localID + (team * nthr)); + + printf("\n\t globalID: %d - localID: %d - nthr: %d - team: %d\n", + globalID, localID, nthr, team); + + #pragma omp barrier + +#endif /* NDEBUG */ 
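+
+      // Each team of BLOCKSIZE threads computes one _BLOCK_ x _BLOCK_ tile of C:
+      // the team number selects the tile (ib, jb) above, and the thread number
+      // within the team is decomposed below into the (i_local, j_local) entry of
+      // that tile (the OpenMP analogue of the CUDA blockIdx/threadIdx mapping).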
+ + // local matrix's indexes mapped to each OMP GPU-thread within its own team + const size_t i_local = omp_get_thread_num() / _BLOCK_; + const size_t j_local = omp_get_thread_num() % _BLOCK_; + + // global matrix's indexes mapped to each OMP GPU-thread + const size_t i_global = i_local + (ib * _BLOCK_); + const size_t j_global = j_local + (jb * _BLOCK_); + + // Cblock initialization + Cblock[(i_local * _BLOCK_) + j_local] = (MyData)0; + + // loop over blocks + for (size_t block=0 ; block<Nblocks ; block++) + { + const size_t j_A = j_local + (block * _BLOCK_); + const size_t i_B = i_local + (block * _BLOCK_); + + // waits until all threads in the thread block have reached this point and shared memory accesses + #pragma omp barrier + + Ablock[(i_local * _BLOCK_) + j_local] = A[(i_global * size) + j_A]; + Bblock[(i_local * _BLOCK_) + j_local] = B[(i_B * size) + j_global]; + + // waits until all threads in the thread block have reached this point and shared memory accesses + #pragma omp barrier + + // perform the matrix multiplication within the block + MyData value = (MyData)0; + for (size_t k=0 ; k<_BLOCK_ ; k++) + value += Ablock[(i_local * _BLOCK_) + k] * Bblock[(k * _BLOCK_) + j_local]; + + // store the partial result in shared memory + Cblock[(i_local * _BLOCK_) + j_local] += value; + } // block loop + + // store the result into global memory + C[(i_global * size) + j_global] = Cblock[(i_local * _BLOCK_) + j_local]; + } // omp parallel + } // target teams + + return; +} + +void check(const MyData *const __restrict__ cpu_matrix, + const MyData *const __restrict__ gpu_matrix) +{ + int flag = 0; + for (size_t i=0 ; i<SIZE ; i++) + flag = ((fabs(cpu_matrix[i] - gpu_matrix[i]) > FLT_EPSILON) ? 1 : flag); + + if (!flag) + printf("\n\t Result OK"); + else + printf("\n\t Result wrong"); + +#if !defined(NDEBUG) + + for (size_t i=0 ; i<N ; i++) + { + printf("\n"); + for (size_t j=0 ; j<N ; j++) + { + const size_t index = ((i * N) + j); + printf("\n\t cpu_matrix[%d] = %lg - gpu_matrix[%d] = %lg - diff = %lg", + index, cpu_matrix[index], index, gpu_matrix[index], fabs(cpu_matrix[index] - gpu_matrix[index])); + } + } + printf("\n"); + +#endif // NDEBUG + + return; +} + +int main() +{ + double time; + MyData *buffer_cpu = (MyData *)calloc(4 * SIZE, sizeof(MyData)); + assert(buffer_cpu != NULL); + + // host reference matrix A + MyData *const restrict A = buffer_cpu; + MyData *const restrict B = A + SIZE; + MyData *const restrict C_CPU = B + SIZE; + MyData *const restrict C_GPU_HOST = C_CPU + SIZE; + for (size_t i=0 ; i<SIZE ; i++) + { + A[i] = drand48(); + B[i] = drand48(); + } + + ////////////////////////// CPU naive algorithm /////////////////////////////////////////////////// + CPU_mat_mult_block(A, B, C_CPU, N); + ////////////////////////////////////////////////////////////////////////////////////////////////// + + // get the host id + const int dev_host = omp_get_initial_device(); + + // get the default device + const int dev_gpu = omp_get_default_device(); + assert(dev_host != dev_gpu); + + // alloc data to the GPU + MyData *const buffer_gpu = (MyData *)omp_target_alloc(3 * SIZE * sizeof(MyData), dev_gpu); + assert(buffer_gpu != NULL); + MyData *const restrict A_GPU = buffer_gpu; + MyData *const restrict B_GPU = A_GPU + SIZE; + MyData *const restrict C_GPU = B_GPU + SIZE; + + // copy data to GPU + omp_target_memcpy(buffer_gpu, buffer_cpu, (2 * SIZE * sizeof(MyData)), 0, 0, dev_gpu, dev_host); + + /////////////////////////// GPU naive block algorithm //////////////////////////////////////////// 
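+  // Timing protocol: one untimed warm-up launch (so that one-time costs such as
+  // device initialization and kernel loading are not measured), followed by LOOP
+  // timed launches whose average is reported.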
+ time = 0.0; + GPU_mat_mult_block(A_GPU, B_GPU, C_GPU, N); // warm-up + for (unsigned short int loop=0 ; loop<LOOP ; loop++) + { + const double start = wall_time(); + GPU_mat_mult_block(A_GPU, B_GPU, C_GPU, N); + time += (wall_time() - start); + } + // copy data device-to-host + omp_target_memcpy(C_GPU_HOST, C_GPU, (SIZE * sizeof(MyData)), 0, 0, dev_host, dev_gpu); + + check(C_CPU, C_GPU_HOST); + memset(C_GPU_HOST, 0, (SIZE * sizeof(MyData))); + printf("\n\t GPU block time %lg [s]\n", (time / LOOP)); + ////////////////////////////////////////////////////////////////////////////////////////////////// + + /////////////////////////// GPU naive block shared memory algorithm ////////////////////////////// + time = 0.0; + GPU_mat_mult_block_shared(A_GPU, B_GPU, C_GPU, N); // warm-up + for (unsigned short int loop=0 ; loop<LOOP ; loop++) + { + const double start = wall_time(); + GPU_mat_mult_block_shared(A_GPU, B_GPU, C_GPU, N); + time += (wall_time() - start); + } + + omp_target_memcpy(C_GPU_HOST, C_GPU, (SIZE * sizeof(MyData)), 0, 0, dev_host, dev_gpu); + check(C_CPU, C_GPU_HOST); + memset(C_GPU_HOST, 0, (SIZE * sizeof(MyData))); + printf("\n\t GPU block pteam memory time %lg [s]\n", (time / LOOP)); + ////////////////////////////////////////////////////////////////////////////////////////////////// + + /////////////////////////// GPU naive block shared memory no loops algorithm ///////////////////// + time = 0.0; + GPU_mat_mult_block_shared_no_loops(A_GPU, B_GPU, C_GPU, N); // warm-up + for (unsigned short int loop=0 ; loop<LOOP ; loop++) + { + const double start = wall_time(); + GPU_mat_mult_block_shared_no_loops(A_GPU, B_GPU, C_GPU, N); + time += (wall_time() - start); + } + + omp_target_memcpy(C_GPU_HOST, C_GPU, (SIZE * sizeof(MyData)), 0, 0, dev_host, dev_gpu); + check(C_CPU, C_GPU_HOST); + printf("\n\t GPU block no loops pteam memory time %lg [s]\n", (time / LOOP)); + ////////////////////////////////////////////////////////////////////////////////////////////////// + + // free CPU memory + free(buffer_cpu); + // free GPU memory + omp_target_free(buffer_gpu, dev_gpu); + + printf("\n"); + + return EXIT_SUCCESS; +} diff --git a/cuda-omp/omp/miscellaneous/unified_shared_memory.c b/cuda-omp/omp/miscellaneous/unified_shared_memory.c new file mode 100644 index 0000000000000000000000000000000000000000..2228c8eb63521bce6e43aaf3c4cd39a7f8c04567 --- /dev/null +++ b/cuda-omp/omp/miscellaneous/unified_shared_memory.c @@ -0,0 +1,389 @@ +//////////////////////////////////////////////////////////////////////////////////////////////// +// - Block matrix multiplication algorithm +// +// const size_t Nblocks = (N / Bsize); +// +// // loop over blocks of matrix C +// for (size_t ib=0 ; ib<Nblocks ; ib++) +// { +// for (size_t jb=0 ; jb<Nblocks ; jb++) +// { +// +// // loop over blocks of rows of A +// for (size_t kb=0 ; kb<Nblocks ; kb++) +// { +// +// // Matrix multiplication within a block +// for (size_t i=(ib * Bsize) ; i<((ib + 1) * Bsize) ; i++) +// for (size_t j=(jb * Bsize) ; j<((jb + 1) * Bsize) ; j++) +// for (size_t k=(kb * Bsize) ; k<((kb + 1) * Bsize) ; k++) +// C[(i * N) + j] += A[(i * N) + k] * B[(k * N) + j]; +// +// } // kb +// } // jb +// } // ib +// +// - Exploit GPU shared-memory. 
+// - Use OpenMP unified shared memory (requires unified_shared_memory) instead of
+//   explicit device allocation and host-device copies
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+//////////////////////////////////////////////////////////////////////////////////////////////////
+// Author: David Goz
+// mail : david.goz@inaf.it
+// date : 26.08.2024
+// code tested using nvhpc
+//
+// - Compile the code:
+// $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v unified_shared_memory.c -o unified_shared_memory_omp
+// - Run the code:
+// $ export OMP_TARGET_OFFLOAD=MANDATORY
+// $ ./unified_shared_memory_omp
+//
+// N.B. team size in the range [32, 1024)
+//////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <time.h>
+#include <assert.h>
+#include <omp.h>
+#include <string.h>
+#include <math.h>
+#include <float.h>
+
+#define N 1024
+#define SIZE (N * N) // matrix size
+typedef double MyData; // do not change
+#define _BLOCK_ 16
+#define BLOCKSIZE ((_BLOCK_) * (_BLOCK_)) /* 32 <= BLOCKSIZE < 1024 */
+
+// sanity check
+#if _BLOCK_*_BLOCK_ > 1024
+#error _BLOCK_*_BLOCK_ cannot be larger than 1024
+#endif
+
+#if _BLOCK_ > N
+#error _BLOCK_ > N
+#endif
+
+#define LOOP 100
+#define NDEBUG
+
+double wall_time()
+{
+  struct timespec ts;
+  clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
+  const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9;
+
+  return ret;
+}
+
+void CPU_mat_mult_block(const MyData *const restrict A,
+                        const MyData *const restrict B,
+                              MyData *const restrict C,
+                        const size_t size)
+{
+  const size_t Nblocks = (size / _BLOCK_);
+
+  // loop over blocks of matrix C
+  for (size_t ib=0 ; ib<Nblocks ; ib++)
+  {
+    for (size_t jb=0 ; jb<Nblocks ; jb++)
+    {
+      // loop over blocks of rows of A
+      for (size_t kb=0 ; kb<Nblocks ; kb++)
+      {
+        // Matrix multiplication within a block
+        for (size_t i=(ib * _BLOCK_) ; i<((ib + 1) * _BLOCK_) ; i++)
+          for (size_t j=(jb * _BLOCK_) ; j<((jb + 1) * _BLOCK_) ; j++)
+            for (size_t k=(kb * _BLOCK_) ; k<((kb + 1) * _BLOCK_) ; k++)
+              C[(i * size) + j] += A[(i * size) + k] * B[(k * size) + j];
+      } // kb
+    } // jb
+  } // ib
+
+  return;
+}
+
+void GPU_mat_mult_block(const MyData *const restrict A,
+                        const MyData *const restrict B,
+                              MyData *const restrict C,
+                        const size_t size)
+{
+  const size_t Nblocks = (size / _BLOCK_);
+
+  #pragma omp requires unified_shared_memory
+
+  #pragma omp target \
+          teams distribute collapse(2) num_teams(Nblocks * Nblocks) \
+          thread_limit(BLOCKSIZE)
+  for (size_t ib=0 ; ib<Nblocks ; ib++)
+  {
+    for (size_t jb=0 ; jb<Nblocks ; jb++)
+    {
+      for (size_t kb=0 ; kb<Nblocks ; kb++)
+      {
+        #pragma omp parallel for collapse(2) num_threads(BLOCKSIZE)
+        for (size_t i=(ib * _BLOCK_) ; i<((ib + 1) * _BLOCK_) ; i++)
+        {
+          for (size_t j=(jb * _BLOCK_) ; j<((jb + 1) * _BLOCK_) ; j++)
+          {
+            MyData value = ((kb == 0) ?
(MyData)0 : C[(i * size) + j]); + for (size_t k=(kb * _BLOCK_) ; k<((kb + 1) * _BLOCK_) ; k++) + value += A[(i * size) + k] * B[(k * size) + j]; + + C[(i * size) + j] = value; + } // j + } // i + } // kb + } // jb + } // ib + + return; +} + +void GPU_mat_mult_block_shared(const MyData *const restrict A, + const MyData *const restrict B, + MyData *const restrict C, + const size_t size) +{ + const size_t Nblocks = (size / _BLOCK_); + // Team local copies of tiles + MyData Ablock[BLOCKSIZE]; + MyData Bblock[BLOCKSIZE]; + MyData Cblock[BLOCKSIZE]; + + #pragma omp requires unified_shared_memory + + #pragma omp target \ + teams distribute collapse(2) num_teams(Nblocks * Nblocks) \ + private(Ablock, Bblock, Cblock) \ + thread_limit(BLOCKSIZE) + for (size_t ib=0 ; ib<Nblocks ; ib++) + { + for (size_t jb=0 ; jb<Nblocks ; jb++) + { + for (size_t kb=0 ; kb<Nblocks ; kb++) + { + #pragma omp parallel num_threads(BLOCKSIZE) + { + if (kb == 0) // Cblock initialization + { + #pragma omp for collapse(2) nowait + for (size_t i=0 ; i<_BLOCK_ ; i++) + for (size_t j=0 ; j<_BLOCK_ ; j++) + Cblock[(i * _BLOCK_) + j] = (MyData)0; + } + + // copy block of A into pteam memory Ablock + #pragma omp for collapse(2) nowait // implicit barrier is removed + for (size_t i=(ib * _BLOCK_) ; i<((ib + 1) * _BLOCK_) ; i++) + for (size_t k=(kb * _BLOCK_) ; k<((kb + 1) * _BLOCK_) ; k++) + Ablock[(i % _BLOCK_) * _BLOCK_ + (k % _BLOCK_)] = A[(i * N) + k]; + + // copy block of B into pteam memory Bblock + #pragma omp for collapse(2) // implicit barrier to ensure that shared Ablock and Bblock can be accessed by all threads within the block + for (size_t j=(jb * _BLOCK_) ; j<((jb + 1) * _BLOCK_) ; j++) + for (size_t k=(kb * _BLOCK_) ; k<((kb + 1) * _BLOCK_) ; k++) + Bblock[(k % _BLOCK_) * _BLOCK_ + (j % _BLOCK_)] = B[(k * N) + j]; + + // matrix multiply within the block + #pragma omp for collapse(2) nowait + for (size_t i=(ib * _BLOCK_) ; i<((ib + 1) * _BLOCK_) ; i++) + for (size_t j=(jb * _BLOCK_) ; j<((jb + 1) * _BLOCK_) ; j++) + { + MyData value = (MyData)0; + for (size_t k=(kb * _BLOCK_) ; k<((kb + 1) * _BLOCK_) ; k++) + value += Ablock[(i % _BLOCK_) * _BLOCK_ + (k % _BLOCK_)] * + Bblock[(k % _BLOCK_) * _BLOCK_ + (j % _BLOCK_)]; + + Cblock[((i % _BLOCK_) * _BLOCK_) + (j % _BLOCK_)] += value; + + if (kb == (Nblocks - 1)) + C[(i * N) + j] = Cblock[((i % _BLOCK_) * _BLOCK_) + (j % _BLOCK_)]; + } + } // omp parallel + } // kb + } // jb + } // ib + + return; +} + +void GPU_mat_mult_block_shared_no_loops(const MyData *const restrict A, + const MyData *const restrict B, + MyData *const restrict C, + const size_t size) +{ + const size_t Nblocks = (size / _BLOCK_); + // Team local copies of tiles + MyData Ablock[BLOCKSIZE]; + MyData Bblock[BLOCKSIZE]; + MyData Cblock[BLOCKSIZE]; + + #pragma omp requires unified_shared_memory + + #pragma omp target \ + teams num_teams(Nblocks * Nblocks) \ + private(Ablock, Bblock, Cblock) \ + thread_limit(BLOCKSIZE) + { + const size_t ib = omp_get_team_num() / Nblocks; + const size_t jb = omp_get_team_num() % Nblocks; + + #pragma omp parallel num_threads(BLOCKSIZE) + { + // local matrix's indexes mapped to each OMP GPU-thread within its own team + const size_t i_local = omp_get_thread_num() / _BLOCK_; + const size_t j_local = omp_get_thread_num() % _BLOCK_; + + // global matrix's indexes mapped to each OMP GPU-thread + const size_t i_global = i_local + (ib * _BLOCK_); + const size_t j_global = j_local + (jb * _BLOCK_); + + // Cblock initialization + Cblock[(i_local * _BLOCK_) + j_local] = (MyData)0; + + // 
loop over blocks + for (size_t block=0 ; block<Nblocks ; block++) + { + const size_t j_A = j_local + (block * _BLOCK_); + const size_t i_B = i_local + (block * _BLOCK_); + + // waits until all threads in the thread block have reached this point and shared memory accesses + #pragma omp barrier + + Ablock[(i_local * _BLOCK_) + j_local] = A[(i_global * size) + j_A]; + Bblock[(i_local * _BLOCK_) + j_local] = B[(i_B * size) + j_global]; + + // waits until all threads in the thread block have reached this point and shared memory accesses + #pragma omp barrier + + // perform the matrix multiplication within the block + MyData value = (MyData)0; + for (size_t k=0 ; k<_BLOCK_ ; k++) + value += Ablock[(i_local * _BLOCK_) + k] * Bblock[(k * _BLOCK_) + j_local]; + + // store the partial result in shared memory + Cblock[(i_local * _BLOCK_) + j_local] += value; + } // block loop + + // store the result into global memory + C[(i_global * size) + j_global] = Cblock[(i_local * _BLOCK_) + j_local]; + } // omp parallel + } // target teams + + return; +} + +void check(const MyData *const __restrict__ cpu_matrix, + const MyData *const __restrict__ gpu_matrix) +{ + int flag = 0; + for (size_t i=0 ; i<SIZE ; i++) + flag = ((fabs(cpu_matrix[i] - gpu_matrix[i]) > FLT_EPSILON) ? 1 : flag); + + if (!flag) + printf("\n\t Result OK"); + else + printf("\n\t Result wrong"); + +#if !defined(NDEBUG) + + for (size_t i=0 ; i<N ; i++) + { + printf("\n"); + for (size_t j=0 ; j<N ; j++) + { + const size_t index = ((i * N) + j); + printf("\n\t cpu_matrix[%d] = %lg - gpu_matrix[%d] = %lg - diff = %lg", + index, cpu_matrix[index], index, gpu_matrix[index], fabs(cpu_matrix[index] - gpu_matrix[index])); + } + } + printf("\n"); + +#endif // NDEBUG + + return; +} + +int main() +{ + double time; + MyData *buffer_cpu = (MyData *)calloc(4 * SIZE, sizeof(MyData)); + assert(buffer_cpu != NULL); + + // host reference matrix A + MyData *const restrict A = buffer_cpu; + MyData *const restrict B = A + SIZE; + MyData *const restrict C_CPU = B + SIZE; + MyData *const restrict C_GPU = C_CPU + SIZE; + for (size_t i=0 ; i<SIZE ; i++) + { + A[i] = drand48(); + B[i] = drand48(); + } + + ////////////////////////// CPU naive algorithm /////////////////////////////////////////////////// + CPU_mat_mult_block(A, B, C_CPU, N); + ////////////////////////////////////////////////////////////////////////////////////////////////// + + // get the host id + const int dev_host = omp_get_initial_device(); + + // get the default device + const int dev_gpu = omp_get_default_device(); + assert(dev_host != dev_gpu); + + /////////////////////////// GPU naive block algorithm //////////////////////////////////////////// + time = 0.0; + GPU_mat_mult_block(A, B, C_GPU, N); // warm-up + for (unsigned short int loop=0 ; loop<LOOP ; loop++) + { + const double start = wall_time(); + GPU_mat_mult_block(A, B, C_GPU, N); + time += (wall_time() - start); + } + + check(C_CPU, C_GPU); + memset(C_GPU, 0, (SIZE * sizeof(MyData))); + printf("\n\t GPU block time %lg [s]\n", (time / LOOP)); + ////////////////////////////////////////////////////////////////////////////////////////////////// + + /////////////////////////// GPU naive block shared memory algorithm ////////////////////////////// + time = 0.0; + GPU_mat_mult_block_shared(A, B, C_GPU, N); // warm-up + for (unsigned short int loop=0 ; loop<LOOP ; loop++) + { + const double start = wall_time(); + GPU_mat_mult_block_shared(A, B, C_GPU, N); + time += (wall_time() - start); + } + + check(C_CPU, C_GPU); + memset(C_GPU, 0, (SIZE * 
sizeof(MyData))); + printf("\n\t GPU block pteam memory time %lg [s]\n", (time / LOOP)); + ////////////////////////////////////////////////////////////////////////////////////////////////// + + /////////////////////////// GPU naive block shared memory no loops algorithm ///////////////////// + time = 0.0; + GPU_mat_mult_block_shared_no_loops(A, B, C_GPU, N); // warm-up + for (unsigned short int loop=0 ; loop<LOOP ; loop++) + { + const double start = wall_time(); + GPU_mat_mult_block_shared_no_loops(A, B, C_GPU, N); + time += (wall_time() - start); + } + + check(C_CPU, C_GPU); + printf("\n\t GPU block no loops pteam memory time %lg [s]\n", (time / LOOP)); + ////////////////////////////////////////////////////////////////////////////////////////////////// + + // free CPU memory + free(buffer_cpu); + + printf("\n"); + + return EXIT_SUCCESS; +}