diff --git a/cuda-omp/cuda/7/mat_mult.cu b/cuda-omp/cuda/7/mat_mult.cu
new file mode 100644
index 0000000000000000000000000000000000000000..302f3e5bee56438b02cd37672bff62f641c5d247
--- /dev/null
+++ b/cuda-omp/cuda/7/mat_mult.cu
@@ -0,0 +1,176 @@
+////////////////////////////////////////////////////////////////////////////////////////////////
+// - Naive matrix multiplication algorithm
+//   for (size_t i=0 ; i<N ; i++)
+//     for (size_t j=0 ; j<N ; j++)
+//       for (size_t k=0 ; k<N ; k++)
+//         C[(i * N) + j] += A[(i * N) + k] * B[(k * N) + j];
+//
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+//////////////////////////////////////////////////////////////////////////////////////////////////
+// Author: David Goz
+// mail  : david.goz@inaf.it
+// date  : 20.08.2024
+// code tested using nvhpc
+//
+// - Compile the code:
+//   $ nvc++ mat_mult.cu -o mat_mult
+// - Run the code:
+//   $ ./mat_mult
+//////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <time.h>
+#include <assert.h>
+#include <cuda.h>
+#include <string.h>
+#include <math.h>
+#include <float.h>
+
+#define N 1024
+#define SIZE (N * N) // matrix size
+typedef double MyData; // do not change
+#define BLOCKSIZE 1024
+
+// sanity check
+#if BLOCKSIZE > 1024
+#error BLOCKSIZE must be <= 1024
+#endif
+
+#define LOOP 100
+#define NDEBUG
+
+double wall_time()
+{
+  struct timespec ts;
+  clock_gettime(CLOCK_MONOTONIC, &ts); // wall-clock time; a CPU-time clock would misreport time spent blocked in cudaDeviceSynchronize
+  const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9;
+
+  return ret;
+}
+
+void CPU_mat_mult(const MyData *const restrict A,
+                  const MyData *const restrict B,
+                  MyData       *const restrict C,
+                  const size_t size)
+{
+  for (size_t i=0 ; i<size ; i++)
+    for (size_t j=0 ; j<size ; j++)
+      for (size_t k=0 ; k<size ; k++)
+        C[(i * size) + j] += (A[(i * size) + k] * B[(k * size) + j]);
+
+  return;
+}
+
+__global__ void GPU_mat_mult(const MyData *const restrict A,
+                             const MyData *const restrict B,
+                             MyData       *const restrict C,
+                             const size_t size)
+{
+  const size_t globalID = threadIdx.x + (blockIdx.x * blockDim.x);
+  if (globalID >= (size * size))
+    return;
+
+  const size_t i = globalID / size;
+  const size_t j = globalID % size;
+
+  MyData value = (MyData)0;
+  for (size_t k=0 ; k<size ; k++)
+    value += A[(i * size) + k] * B[(k * size) + j];
+
+  C[(i * size) + j] = value;
+
+  return;
+}
+
+void check(const MyData *const __restrict__ cpu_matrix,
+           const MyData *const __restrict__ gpu_matrix)
+{
+  int flag = 0;
+  for (size_t i=0 ; i<SIZE ; i++)
+    flag = ((fabs(cpu_matrix[i] - gpu_matrix[i]) > FLT_EPSILON) ?
+            1 : flag);
+
+  if (!flag)
+    printf("\n\t Result OK");
+  else
+    printf("\n\t Result wrong");
+
+#if !defined(NDEBUG)
+
+  for (size_t i=0 ; i<N ; i++)
+    {
+      printf("\n");
+      for (size_t j=0 ; j<N ; j++)
+        {
+          const size_t index = ((i * N) + j);
+          printf("\n\t cpu_matrix[%zu] = %lg - gpu_matrix[%zu] = %lg - diff = %lg",
+                 index, cpu_matrix[index], index, gpu_matrix[index], fabs(cpu_matrix[index] - gpu_matrix[index]));
+        }
+    }
+  printf("\n");
+
+#endif // NDEBUG
+
+  return;
+}
+
+int main()
+{
+  double time;
+  MyData *buffer_cpu = (MyData *)calloc(4 * SIZE, sizeof(MyData));
+  assert(buffer_cpu != NULL);
+
+  // host matrices A, B, C plus the buffer receiving the GPU result
+  MyData *const restrict A_CPU      = buffer_cpu;
+  MyData *const restrict B_CPU      = A_CPU + SIZE;
+  MyData *const restrict C_CPU      = B_CPU + SIZE;
+  MyData *const restrict C_GPU_host = C_CPU + SIZE;
+  for (size_t i=0 ; i<SIZE ; i++)
+    {
+      A_CPU[i] = drand48();
+      B_CPU[i] = drand48();
+    }
+
+  ////////////////////////// CPU naive algorithm //////////////////////////////////////////
+  CPU_mat_mult(A_CPU, B_CPU, C_CPU, N);
+  /////////////////////////////////////////////////////////////////////////////////////////
+
+  // copy/alloc data to the GPU
+  MyData *buffer_gpu = NULL;
+  cudaMalloc((void **)&buffer_gpu, (3 * SIZE * sizeof(MyData)));
+  MyData *const restrict A_GPU = buffer_gpu;
+  MyData *const restrict B_GPU = A_GPU + SIZE;
+  MyData *const restrict C_GPU = B_GPU + SIZE;
+  cudaMemcpy(A_GPU, A_CPU, (2 * SIZE * sizeof(MyData)), cudaMemcpyHostToDevice);
+
+  const dim3 block   = {BLOCKSIZE, 1, 1};
+  const dim3 nblocks = {(SIZE + BLOCKSIZE - 1) / BLOCKSIZE, 1, 1};
+
+  /////////////////////////// GPU naive algorithm ////////////////////////////////////////
+  time = 0.0;
+  GPU_mat_mult<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N); // warm-up
+  cudaDeviceSynchronize();
+  for (unsigned short int loop=0 ; loop<LOOP ; loop++)
+    {
+      const double start = wall_time();
+      GPU_mat_mult<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N);
+      cudaDeviceSynchronize();
+      time += (wall_time() - start);
+    }
+  cudaMemcpy(C_GPU_host, C_GPU, (SIZE * sizeof(MyData)), cudaMemcpyDeviceToHost);
+
+  check(C_CPU, C_GPU_host);
+  printf("\n\t GPU naive time %lg [s]\n", (time / LOOP));
+  /////////////////////////////////////////////////////////////////////////////////////
+
+  // free CPU memory
+  free(buffer_cpu);
+  // free GPU memory
+  cudaFree(buffer_gpu);
+
+  printf("\n");
+
+  return EXIT_SUCCESS;
+}
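
Note (illustration, not part of the patch): the naive kernel above flattens the N x N
output onto a 1-D grid and recovers the row/column pair with one division and one
modulo. A minimal host-only sketch of that index mapping and of the round-up grid
size, reusing the constants of mat_mult.cu:

    #include <stdio.h>
    #include <stddef.h>

    #define N 1024
    #define SIZE (N * N)
    #define BLOCKSIZE 1024

    int main(void)
    {
      // same round-up formula used for the kernel launch
      const size_t nblocks = (SIZE + BLOCKSIZE - 1) / BLOCKSIZE; // 1024 blocks of 1024 threads
      printf("grid: %zu blocks of %d threads\n", nblocks, BLOCKSIZE);

      // consecutive global IDs share the same row i and step through columns j,
      // so adjacent threads read consecutive elements of each row of B (coalesced)
      for (size_t globalID = 0; globalID < 4; globalID++)
        printf("thread %zu -> C[%zu][%zu]\n", globalID, globalID / N, globalID % N);

      return 0;
    }
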
diff --git a/cuda-omp/cuda/7/mat_mult_block.cu b/cuda-omp/cuda/7/mat_mult_block.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7808f021a87df0329a4784f72fd1966aa133e737
--- /dev/null
+++ b/cuda-omp/cuda/7/mat_mult_block.cu
@@ -0,0 +1,301 @@
+////////////////////////////////////////////////////////////////////////////////////////////////
+// - Block matrix multiplication algorithm
+//
+//   const size_t Nblocks = (N / Bsize);
+//
+//   // loop over blocks of matrix C
+//   for (size_t ib=0 ; ib<Nblocks ; ib++)
+//     {
+//       for (size_t jb=0 ; jb<Nblocks ; jb++)
+//         {
+//           // loop over blocks of rows of A
+//           for (size_t kb=0 ; kb<Nblocks ; kb++)
+//             {
+//               // Matrix multiplication within a block
+//               for (size_t i=(ib * Bsize) ; i<((ib + 1) * Bsize) ; i++)
+//                 for (size_t j=(jb * Bsize) ; j<((jb + 1) * Bsize) ; j++)
+//                   for (size_t k=(kb * Bsize) ; k<((kb + 1) * Bsize) ; k++)
+//                     C[(i * N) + j] += A[(i * N) + k] * B[(k * N) + j];
+//             } // kb
+//         } // jb
+//     } // ib
+//
+// - Exploit GPU shared-memory.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+//////////////////////////////////////////////////////////////////////////////////////////////////
+// Author: David Goz
+// mail  : david.goz@inaf.it
+// date  : 22.08.2024
+// code tested using nvhpc
+//
+// - Compile the code:
+//   $ nvc++ mat_mult_block.cu -o mat_mult_block
+// - Run the code:
+//   $ ./mat_mult_block
+//////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <time.h>
+#include <assert.h>
+#include <cuda.h>
+#include <string.h>
+#include <math.h>
+#include <float.h>
+
+#define N 1024
+#define SIZE (N * N) // matrix size
+typedef double MyData; // do not change
+#define BLOCK 16
+
+// sanity check
+#if BLOCK * BLOCK > 1024
+#error BLOCK * BLOCK must be <= 1024
+#endif
+
+#if BLOCK * BLOCK > SIZE
+#error BLOCK * BLOCK must be <= SIZE
+#endif
+
+#define LOOP 100
+#define NDEBUG
+
+double wall_time()
+{
+  struct timespec ts;
+  clock_gettime(CLOCK_MONOTONIC, &ts); // wall-clock time; a CPU-time clock would misreport time spent blocked in cudaDeviceSynchronize
+  const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9;
+
+  return ret;
+}
+
+void CPU_mat_mult(const MyData *const restrict A,
+                  const MyData *const restrict B,
+                  MyData       *const restrict C,
+                  const size_t size)
+{
+  for (size_t i=0 ; i<size ; i++)
+    for (size_t j=0 ; j<size ; j++)
+      for (size_t k=0 ; k<size ; k++)
+        C[(i * size) + j] += (A[(i * size) + k] * B[(k * size) + j]);
+
+  return;
+}
+
+void CPU_mat_mult_block(const MyData *const restrict A,
+                        const MyData *const restrict B,
+                        MyData       *const restrict C,
+                        const size_t size)
+{
+  const size_t Nblocks = (size / BLOCK);
+
+  // loop over blocks of matrix C
+  for (size_t ib=0 ; ib<Nblocks ; ib++)
+    {
+      for (size_t jb=0 ; jb<Nblocks ; jb++)
+        {
+          // loop over blocks of rows of A
+          for (size_t kb=0 ; kb<Nblocks ; kb++)
+            {
+              // Matrix multiplication within a block
+              for (size_t i=(ib * BLOCK) ; i<((ib + 1) * BLOCK) ; i++)
+                for (size_t j=(jb * BLOCK) ; j<((jb + 1) * BLOCK) ; j++)
+                  for (size_t k=(kb * BLOCK) ; k<((kb + 1) * BLOCK) ; k++)
+                    C[(i * size) + j] += A[(i * size) + k] * B[(k * size) + j];
+            } // kb
+        } // jb
+    } // ib
+
+  return;
+}
+
+__global__ void GPU_mat_mult_block(const MyData *const restrict A,
+                                   const MyData *const restrict B,
+                                   MyData       *const restrict C,
+                                   const size_t size)
+{
+  const size_t size2 = (size * size);
+
+  const size_t globalID = threadIdx.x + (blockIdx.x * blockDim.x);
+  if (globalID >= size2)
+    return;
+
+  const size_t Nblocks  = size / BLOCK;           // number of blocks to loop over A and B
+  const size_t ib       = blockIdx.x / Nblocks;   // indexes of the starting matrix block
+  const size_t jb       = blockIdx.x % Nblocks;
+  const size_t i_local  = threadIdx.x / BLOCK;    // local matrix indexes mapped to each CUDA thread
+  const size_t j_local  = threadIdx.x % BLOCK;    // within its own block
+  const size_t i_global = i_local + (ib * BLOCK); // global matrix indexes mapped to each CUDA thread
+  const size_t j_global = j_local + (jb * BLOCK); // N.B. uncoalesced memory accesses to A and B matrices
+
+  C[(i_global * size) + j_global] = (MyData)0;
+
+  // loop over blocks
+  for (size_t block=0 ; block<Nblocks ; block++)
+    {
+      const size_t j_A = (block * BLOCK);
+      const size_t i_B = (block * BLOCK);
+
+      // perform the matrix multiplication within the block
+      MyData value = (MyData)0;
+      for (size_t k=0 ; k<BLOCK ; k++)
+        value += A[(i_global * size) + k + j_A] * B[((k + i_B) * size) + j_global];
+
+      C[(i_global * size) + j_global] += value;
+    }
+
+  return;
+}
+
+__global__ void GPU_mat_mult_block_shared(const MyData *const restrict A,
+                                          const MyData *const restrict B,
+                                          MyData       *const restrict C,
+                                          const size_t size)
+{
+  __shared__ MyData Ablock[BLOCK * BLOCK];
+  __shared__ MyData Bblock[BLOCK * BLOCK];
+  __shared__ MyData Cblock[BLOCK * BLOCK];
+
+  const size_t globalID = threadIdx.x + (blockIdx.x * blockDim.x);
+  const size_t size2    = (size * size);
+  if (globalID >= size2)
+    return;
+
+  const size_t Nblocks  = size / BLOCK;           // number of blocks to loop over
+  const size_t ib       = blockIdx.x / Nblocks;   // indexes of the starting matrix block
+  const size_t jb       = blockIdx.x % Nblocks;
+  const size_t i_local  = threadIdx.x / BLOCK;    // local matrix indexes mapped to each CUDA thread
+  const size_t j_local  = threadIdx.x % BLOCK;    // within its own block
+  const size_t i_global = i_local + (ib * BLOCK); // global matrix indexes mapped to each CUDA thread
+  const size_t j_global = j_local + (jb * BLOCK); // N.B. uncoalesced memory accesses to A and B matrices
+
+  // Init Cblock
+  Cblock[(i_local * BLOCK) + j_local] = (MyData)0;
+
+  // loop over blocks
+  for (size_t block=0 ; block<Nblocks ; block++)
+    {
+      const size_t j_A = (block * BLOCK);
+      const size_t i_B = (block * BLOCK);
+
+      // waits until all threads in the thread block have reached this point and shared memory accesses
+      // made by these threads prior to __syncthreads() are visible to all threads in the block
+      __syncthreads();
+
+      // copy block of A into shared memory
+      Ablock[(i_local * BLOCK) + j_local] = A[(i_global * size) + j_local + j_A];
+      // copy block of B into shared memory
+      Bblock[(i_local * BLOCK) + j_local] = B[((i_local + i_B) * size) + j_global];
+
+      // waits until all threads in the thread block have reached this point and shared memory accesses
+      // made by these threads prior to __syncthreads() are visible to all threads in the block
+      __syncthreads();
+
+      // perform the matrix multiplication within the block
+      MyData value = (MyData)0;
+      for (size_t k=0 ; k<BLOCK ; k++)
+        value += Ablock[(i_local * BLOCK) + k] * Bblock[(k * BLOCK) + j_local];
+
+      // store the partial result in shared memory
+      Cblock[(i_local * BLOCK) + j_local] += value;
+    }
+
+  C[(i_global * size) + j_global] = Cblock[(i_local * BLOCK) + j_local];
+
+  return;
+}
+
+void check(const MyData *const restrict cpu_matrix,
+           const MyData *const restrict gpu_matrix)
+{
+  int flag = 0;
+  for (size_t i=0 ; i<SIZE ; i++)
+    flag = ((fabs(cpu_matrix[i] - gpu_matrix[i]) > FLT_EPSILON) ?
+            1 : flag);
+
+  if (!flag)
+    printf("\n\t Result OK");
+  else
+    printf("\n\t Result wrong");
+
+  return;
+}
+
+int main()
+{
+  double time;
+  MyData *buffer_cpu = (MyData *)calloc(4 * SIZE, sizeof(MyData));
+  assert(buffer_cpu != NULL);
+
+  // host matrices A, B, C plus the buffer receiving the GPU result
+  MyData *const restrict A_CPU      = buffer_cpu;
+  MyData *const restrict B_CPU      = A_CPU + SIZE;
+  MyData *const restrict C_CPU      = B_CPU + SIZE;
+  MyData *const restrict C_GPU_host = C_CPU + SIZE;
+  for (size_t i=0 ; i<SIZE ; i++)
+    {
+      A_CPU[i] = drand48();
+      B_CPU[i] = drand48();
+    }
+
+  ////////////////////////// CPU block algorithm //////////////////////////////////////////
+  CPU_mat_mult_block(A_CPU, B_CPU, C_CPU, N);
+  /////////////////////////////////////////////////////////////////////////////////////////
+
+  // copy/alloc data to the GPU
+  MyData *buffer_gpu = NULL;
+  cudaMalloc((void **)&buffer_gpu, (3 * SIZE * sizeof(MyData)));
+  MyData *const A_GPU = buffer_gpu;
+  MyData *const B_GPU = A_GPU + SIZE;
+  MyData *const C_GPU = B_GPU + SIZE;
+  cudaMemcpy(A_GPU, A_CPU, (2 * SIZE * sizeof(MyData)), cudaMemcpyHostToDevice);
+
+  const dim3 nblocks = {(SIZE + (BLOCK * BLOCK) - 1) / (BLOCK * BLOCK), 1, 1};
+  const dim3 block   = {(BLOCK * BLOCK), 1, 1};
+
+  /////////////////////////// GPU naive block algorithm ////////////////////////////////
+  time = 0.0;
+  GPU_mat_mult_block<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N); // warm-up
+  cudaDeviceSynchronize();
+  for (unsigned short int loop=0 ; loop<LOOP ; loop++)
+    {
+      const double start = wall_time();
+      GPU_mat_mult_block<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N);
+      cudaDeviceSynchronize();
+      time += (wall_time() - start);
+    }
+  cudaMemcpy(C_GPU_host, C_GPU, (SIZE * sizeof(MyData)), cudaMemcpyDeviceToHost);
+
+  check(C_CPU, C_GPU_host);
+  printf("\n\t GPU naive block time %lg [s]\n", (time / LOOP));
+  /////////////////////////////////////////////////////////////////////////////////////
+
+  /////////////////////////// GPU block shared memory algorithm ///////////////////////
+  time = 0.0;
+  GPU_mat_mult_block_shared<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N); // warm-up
+  cudaDeviceSynchronize();
+  for (unsigned short int loop=0 ; loop<LOOP ; loop++)
+    {
+      const double start = wall_time();
+      GPU_mat_mult_block_shared<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N);
+      cudaDeviceSynchronize();
+      time += (wall_time() - start);
+    }
+  cudaMemcpy(C_GPU_host, C_GPU, (SIZE * sizeof(MyData)), cudaMemcpyDeviceToHost);
+
+  check(C_CPU, C_GPU_host);
+  printf("\n\t GPU block shared memory time %lg [s]\n", (time / LOOP));
+  /////////////////////////////////////////////////////////////////////////////////////
+
+  // free CPU memory
+  free(buffer_cpu);
+  // free GPU memory
+  cudaFree(buffer_gpu);
+
+  printf("\n");
+
+  return EXIT_SUCCESS;
+}
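
Note (illustration, not part of the patch): the block decomposition above only regroups
the k-accumulation into tiles; for each C element the partial products are still added
in ascending k order, so the blocked traversal reproduces the naive result exactly. A
small host-only sketch (N = 8, 2 x 2 tiles, hypothetical names) that checks the two
traversals against each other:

    #include <stdio.h>
    #include <stdlib.h>
    #include <math.h>

    #define N 8
    #define BLOCK 2

    int main(void)
    {
      static double A[N * N], B[N * N], C_naive[N * N], C_block[N * N]; // zero-initialized

      for (size_t i = 0; i < (size_t)(N * N); i++)
        {
          A[i] = drand48();
          B[i] = drand48();
        }

      // naive triple loop
      for (size_t i = 0; i < N; i++)
        for (size_t j = 0; j < N; j++)
          for (size_t k = 0; k < N; k++)
            C_naive[(i * N) + j] += A[(i * N) + k] * B[(k * N) + j];

      // blocked traversal: same sums, accumulated tile by tile
      const size_t Nblocks = N / BLOCK;
      for (size_t ib = 0; ib < Nblocks; ib++)
        for (size_t jb = 0; jb < Nblocks; jb++)
          for (size_t kb = 0; kb < Nblocks; kb++)
            for (size_t i = ib * BLOCK; i < (ib + 1) * BLOCK; i++)
              for (size_t j = jb * BLOCK; j < (jb + 1) * BLOCK; j++)
                for (size_t k = kb * BLOCK; k < (kb + 1) * BLOCK; k++)
                  C_block[(i * N) + j] += A[(i * N) + k] * B[(k * N) + j];

      double max_diff = 0.0;
      for (size_t i = 0; i < (size_t)(N * N); i++)
        max_diff = fmax(max_diff, fabs(C_naive[i] - C_block[i]));

      printf("max |naive - blocked| = %lg\n", max_diff); // expected: 0
      return 0;
    }
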
diff --git a/cuda-omp/omp/6/classwork_omp b/cuda-omp/omp/6/classwork_omp
deleted file mode 100755
index 6766c7bdf174daf8fcd104c4bc67d02ba4885384..0000000000000000000000000000000000000000
Binary files a/cuda-omp/omp/6/classwork_omp and /dev/null differ
diff --git a/cuda-omp/omp/7/mat_mult.c b/cuda-omp/omp/7/mat_mult.c
index 8ad7fcd4c5ca812149749ffdfe12cbb98ab7ab32..789e7f2a73299e8fa3fcc7039efdd033a0da999e 100644
--- a/cuda-omp/omp/7/mat_mult.c
+++ b/cuda-omp/omp/7/mat_mult.c
@@ -14,9 +14,10 @@
 // code tested using nvhpc
 //
 // - Compile the code:
-//   $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v classwork.c -o classwork_omp
+//   $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v mat_mult.c -o mat_mult_omp
 // - Run the code:
-//   $ ./classwork_omp
+//   $ export OMP_TARGET_OFFLOAD=MANDATORY
+//   $ ./mat_mult_omp
 //////////////////////////////////////////////////////////////////////////////////////////////////
 
 #include <stdio.h>
@@ -26,8 +27,10 @@
 #include <assert.h>
 #include <omp.h>
 #include <string.h>
+#include <math.h>
+#include <float.h>
 
-#define N 512
+#define N 1024
 #define SIZE (N * N) // matrix size
 typedef double MyData; // do not change
 
@@ -61,22 +64,30 @@ void GPU_mat_mult(const MyData *const restrict A,
                   MyData *const restrict C,
                   const size_t size)
 {
-  #pragma omp target
-  {
-    #pragma omp teams distribute num_teams(size)
-    for (size_t i=0 ; i<size ; i++)
-      {
-        #pragma omp parallel for num_threads(size)
-        for (size_t j=0 ; j<size ; j++)
-          {
-            MyData value = (MyData)0;
-            for (size_t k=0 ; k<size ; k++)
-              value += (A[(i * size) + k] * B[(k * size) + j]);
-
-            C[(i * size) + j] = value;
-          } // omp thread
-      } // omp teams
-  } // omp target
+  #pragma omp target teams distribute num_teams(size)
+  for (size_t i=0 ; i<size ; i++)
+    {
+      #pragma omp parallel for firstprivate(i) num_threads(size)
+      for (size_t j=0 ; j<size ; j++)
+        {
+#if !defined(NDEBUG)
+
+          const int num_teams   = omp_get_num_teams();
+          const int num_threads = omp_get_num_threads();
+
+          if ((omp_get_team_num() == 0) && (omp_get_thread_num() == 0))
+            printf("\n\t Kernel GPU_mat_mult: teams: %d - threads: %d\n",
+                   num_teams, num_threads);
+
+#endif // NDEBUG
+
+          MyData value = (MyData)0;
+          for (size_t k=0 ; k<size ; k++)
+            value += (A[(i * size) + k] * B[(k * size) + j]);
+
+          C[(i * size) + j] = value;
+        } // omp threads
+    } // omp target teams
 
   return;
 }
@@ -86,23 +97,31 @@ void GPU_mat_mult_no_loops(const MyData *const restrict A,
                            MyData *const restrict C,
                            const size_t size)
 {
-  #pragma omp target
+  #pragma omp target teams num_teams(size)
   {
-    #pragma omp teams num_teams(size)
+    const size_t team_size = (size * omp_get_team_num());
+
+    #pragma omp parallel firstprivate(team_size) num_threads(size)
     {
-      const size_t team_size = (size * omp_get_team_num());
+#if !defined(NDEBUG)
+
+      const int num_teams   = omp_get_num_teams();
+      const int num_threads = omp_get_num_threads();
 
-      #pragma omp parallel firstprivate(team_size) num_threads(size)
-      {
-        const size_t tid = omp_get_thread_num();
-        MyData value = (MyData)0;
-        for (size_t k=0 ; k<size ; k++)
-          value += (A[team_size + k] * B[(k * size) + tid]);
-
-        C[team_size + tid] = value;
-      } // omp threads
-    } // omp teams
-  } // omp target
+      if ((omp_get_team_num() == 0) && (omp_get_thread_num() == 0))
+        printf("\n\t Kernel GPU_mat_mult_no_loops: teams: %d - threads: %d\n",
+               num_teams, num_threads);
+
+#endif // NDEBUG
+
+      const size_t tid = omp_get_thread_num();
+      MyData value = (MyData)0;
+      for (size_t k=0 ; k<size ; k++)
+        value += (A[team_size + k] * B[(k * size) + tid]);
+
+      C[team_size + tid] = value;
+    } // omp threads
+  } // omp target teams
 
   return;
 }
@@ -110,14 +129,30 @@ void GPU_mat_mult_no_loops(const MyData *const restrict A,
 void check(const MyData *const __restrict__ cpu_matrix,
            const MyData *const __restrict__ gpu_matrix)
 {
-  int flag;
+  int flag = 0;
   for (size_t i=0 ; i<SIZE ; i++)
-    flag = ((cpu_matrix[i] != gpu_matrix[i]) ? 1 : 0);
+    flag = ((fabs(cpu_matrix[i] - gpu_matrix[i]) > FLT_EPSILON) ?
+            1 : flag);
 
   if (!flag)
     printf("\n\t Result OK");
   else
     printf("\n\t Result wrong");
+
+#if !defined(NDEBUG)
+
+  for (size_t i=0 ; i<N ; i++)
+    {
+      printf("\n");
+      for (size_t j=0 ; j<N ; j++)
+        {
+          const size_t index = ((i * N) + j);
+          printf("\n\t cpu_matrix[%zu] = %lg - gpu_matrix[%zu] = %lg - diff = %lg",
+                 index, cpu_matrix[index], index, gpu_matrix[index], fabs(cpu_matrix[index] - gpu_matrix[index]));
+        }
+    }
+  printf("\n");
+
+#endif // NDEBUG
 
   return;
 }
@@ -148,6 +183,7 @@ int main()
 
   /////////////////////////// GPU naive algorithm ////////////////////////////////////////
   time = 0.0;
+  GPU_mat_mult(A, B, C_GPU, N); // warm-up
   for (unsigned short int loop=0 ; loop<LOOP ; loop++)
     {
       const double start = wall_time();
@@ -162,6 +198,7 @@ int main()
 
   /////////////////////////// GPU naive no loops algorithm ////////////////////////////
   time = 0.0;
+  GPU_mat_mult_no_loops(A, B, C_GPU, N); // warm-up
   for (unsigned short int loop=0 ; loop<LOOP ; loop++)
     {
      const double start = wall_time();
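
Note (sketch, not what the patch uses): the patch pins one team per row via
num_teams(size) and one thread per column via num_threads(size), which runtimes may
clamp anyway. A compact alternative is to let the compiler flatten both loops with
collapse(2) and leave the team/thread split to the runtime. GPU_mat_mult_collapse is
a hypothetical name; it assumes A, B, C are already mapped on the device (e.g. via
the same "#pragma omp target enter data" used in main()):

    #include <stddef.h>

    // hypothetical variant, not part of the patch; same contract as GPU_mat_mult
    void GPU_mat_mult_collapse(const double *restrict A, const double *restrict B,
                               double *restrict C, const size_t size)
    {
      // one flattened iteration space of size*size elements of C
      #pragma omp target teams distribute parallel for collapse(2)
      for (size_t i = 0; i < size; i++)
        for (size_t j = 0; j < size; j++)
          {
            double value = 0.0;
            for (size_t k = 0; k < size; k++)
              value += A[(i * size) + k] * B[(k * size) + j];

            C[(i * size) + j] = value;
          }
    }
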
diff --git a/cuda-omp/omp/7/mat_mult_block.c b/cuda-omp/omp/7/mat_mult_block.c
index cd84ad1ee00f8f85afc523f058ceb3b6f29cb922..9c3d81e7352e7db3c5a330eb66a98934e1252824 100644
--- a/cuda-omp/omp/7/mat_mult_block.c
+++ b/cuda-omp/omp/7/mat_mult_block.c
@@ -11,29 +11,32 @@
 //
 //       // loop over blocks of rows of A
 //       for (size_t kb=0 ; kb<Nblocks ; kb++)
+//         {
 //
-//
-//
-//       for (size_t i=0 ; i<N ; i++)
-//         for (size_t j=0 ; j<N ; j++)
-//           for (size_t k=0 ; k<_N ; k++)
+//           // Matrix multiplication within a block
+//           for (size_t i=(ib * Bsize) ; i<((ib + 1) * Bsize) ; i++)
+//             for (size_t j=(jb * Bsize) ; j<((jb + 1) * Bsize) ; j++)
+//               for (size_t k=(kb * Bsize) ; k<((kb + 1) * Bsize) ; k++)
 //                 C[(i * N) + j] += A[(i * N) + k] * B[(k * N) + j];
 //
+//         } // kb
 //     } // jb
 // } // ib
-// - Exploit shared-memory.
+//
+// - Exploit GPU shared-memory.
 ////////////////////////////////////////////////////////////////////////////////////////////////
 
 //////////////////////////////////////////////////////////////////////////////////////////////////
 // Author: David Goz
 // mail  : david.goz@inaf.it
-// date  : 31.07.2024
+// date  : 20.08.2024
 // code tested using nvhpc
 //
 // - Compile the code:
-//   $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v classwork.c -o classwork_omp
+//   $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v mat_mult_block.c -o mat_mult_block_omp
 // - Run the code:
-//   $ ./classwork_omp
+//   $ export OMP_TARGET_OFFLOAD=MANDATORY
+//   $ ./mat_mult_block_omp
 //////////////////////////////////////////////////////////////////////////////////////////////////
 
 #include <stdio.h>
@@ -43,15 +46,18 @@
 #include <assert.h>
 #include <omp.h>
 #include <string.h>
+#include <math.h>
+#include <float.h>
 
-#define N 512
-#define SIZE (N * N) // matrix size
+#define N 1024
+#define SIZE (N * N) // matrix size
 typedef double MyData; // do not change
-#define BLOCKSIZE 32 // number of threads per block
+#define _BLOCK_ 16
+#define BLOCKSIZE ((_BLOCK_) * (_BLOCK_))
 
 // sanity check
-#if BLOCKSIZE > 1024
-#error BLOCKSIZE cannot be larger than 1024
+#if _BLOCK_*_BLOCK_ > 1024
+#error _BLOCK_*_BLOCK_ cannot be larger than 1024
 #endif
 
 #define LOOP 100
@@ -66,81 +72,244 @@ double wall_time()
   return ret;
 }
 
-void CPU_mat_mult(const MyData *const restrict A,
-                  const MyData *const restrict B,
-                  MyData *const restrict C,
-                  const size_t size)
+void CPU_mat_mult_block(const MyData *const restrict A,
+                        const MyData *const restrict B,
+                        MyData *const restrict C,
+                        const size_t size)
 {
-  for (size_t i=0 ; i<size ; i++)
-    for (size_t j=0 ; j<size ; j++)
-      for (size_t k=0 ; k<size ; k++)
-        C[(i * size) + j] += (A[(i * size) + k] * B[(k * size) + j]);
+  const size_t Nblocks = (size / _BLOCK_);
 
+  // loop over blocks of matrix C
+  for (size_t ib=0 ; ib<Nblocks ; ib++)
+    {
+      for (size_t jb=0 ; jb<Nblocks ; jb++)
+        {
+          // loop over blocks of rows of A
+          for (size_t kb=0 ; kb<Nblocks ; kb++)
+            {
+              // Matrix multiplication within a block
+              for (size_t i=(ib * _BLOCK_) ; i<((ib + 1) * _BLOCK_) ; i++)
+                for (size_t j=(jb * _BLOCK_) ; j<((jb + 1) * _BLOCK_) ; j++)
+                  for (size_t k=(kb * _BLOCK_) ; k<((kb + 1) * _BLOCK_) ; k++)
+                    C[(i * size) + j] += A[(i * size) + k] * B[(k * size) + j];
+            } // kb
+        } // jb
+    } // ib
+
   return;
 }
 
-void GPU_mat_mult(const MyData *const restrict A,
-                  const MyData *const restrict B,
-                  MyData *const restrict C,
-                  const size_t size)
+void GPU_mat_mult_block(const MyData *const restrict A,
+                        const MyData *const restrict B,
+                        MyData *const restrict C,
+                        const size_t size)
 {
-  #pragma omp target
-  {
-    #pragma omp teams distribute num_teams(size)
-    for (size_t i=0 ; i<size ; i++)
-      {
-        #pragma omp parallel for num_threads(size)
-        for (size_t j=0 ; j<size ; j++)
-          {
-            MyData value = (MyData)0;
-            for (size_t k=0 ; k<size ; k++)
-              value += (A[(i * size) + k] * B[(k * size) + j]);
-
-            C[(i * size) + j] = value;
-          } // omp thread
-      } // omp teams
-  } // omp target
+  const size_t Nblocks = (size / _BLOCK_);
+
+  #pragma omp target teams distribute collapse(2) num_teams(Nblocks * Nblocks) thread_limit(BLOCKSIZE)
+  for (size_t ib=0 ; ib<Nblocks ; ib++)
+    {
+      for (size_t jb=0 ; jb<Nblocks ; jb++)
+        {
+          for (size_t kb=0 ; kb<Nblocks ; kb++)
+            {
+              #pragma omp parallel for collapse(2) num_threads(BLOCKSIZE)
+              for (size_t i=(ib * _BLOCK_) ; i<((ib + 1) * _BLOCK_) ; i++)
+                {
+                  for (size_t j=(jb * _BLOCK_) ; j<((jb + 1) * _BLOCK_) ; j++)
+                    {
+#if !defined(NDEBUG)
+
+                      const int num_teams   = omp_get_num_teams();
+                      const int num_threads = omp_get_num_threads();
+
+                      if ((omp_get_team_num() == 0) && (omp_get_thread_num() == 0) && (kb == 0))
+                        printf("\n\t Kernel GPU_mat_mult_block: teams: %d - threads: %d\n",
+                               num_teams, num_threads);
+
+#endif // NDEBUG
+
+                      MyData value = ((kb == 0) ? (MyData)0 : C[(i * size) + j]);
+                      for (size_t k=(kb * _BLOCK_) ; k<((kb + 1) * _BLOCK_) ; k++)
+                        value += A[(i * size) + k] * B[(k * size) + j];
+
+                      C[(i * size) + j] = value;
+                    } // j
+                } // i
+            } // kb
+        } // jb
+    } // ib
+
+  return;
+}
+
+void GPU_mat_mult_block_shared(const MyData *const restrict A,
+                               const MyData *const restrict B,
+                               MyData *const restrict C,
+                               const size_t size)
+{
+  const size_t Nblocks = (size / _BLOCK_);
+  // Team local copies of tiles
+  MyData Ablock[BLOCKSIZE];
+  MyData Bblock[BLOCKSIZE];
+  MyData Cblock[BLOCKSIZE];
+
+  // #pragma omp target uses_allocators(omp_pteam_mem_alloc) allocate(omp_pteam_mem_alloc: Ablock, Bblock)
+  #pragma omp target teams distribute collapse(2) num_teams(Nblocks * Nblocks) \
+          private(Ablock, Bblock, Cblock)                                      \
+          thread_limit(BLOCKSIZE)
+  for (size_t ib=0 ; ib<Nblocks ; ib++)
+    {
+      for (size_t jb=0 ; jb<Nblocks ; jb++)
+        {
+          for (size_t kb=0 ; kb<Nblocks ; kb++)
+            {
+              #pragma omp parallel num_threads(BLOCKSIZE)
+              {
+#if !defined(NDEBUG)
+
+                const int num_teams   = omp_get_num_teams();
+                const int num_threads = omp_get_num_threads();
+
+                if ((omp_get_team_num() == 0) && (omp_get_thread_num() == 0) && (kb == 0))
+                  printf("\n\t Kernel GPU_mat_mult_block_shared: teams: %d - threads: %d\n",
+                         num_teams, num_threads);
+
+#endif // NDEBUG
+
+                if (kb == 0) // Cblock initialization
+                  {
+                    #pragma omp for collapse(2) nowait
+                    for (size_t i=0 ; i<_BLOCK_ ; i++)
+                      for (size_t j=0 ; j<_BLOCK_ ; j++)
+                        Cblock[(i * _BLOCK_) + j] = (MyData)0;
+                  }
+
+                // copy block of A into pteam memory Ablock
+                #pragma omp for collapse(2) nowait // implicit barrier is removed
+                for (size_t i=(ib * _BLOCK_) ; i<((ib + 1) * _BLOCK_) ; i++)
+                  for (size_t k=(kb * _BLOCK_) ; k<((kb + 1) * _BLOCK_) ; k++)
+                    Ablock[(i % _BLOCK_) * _BLOCK_ + (k % _BLOCK_)] = A[(i * N) + k];
+
+                // copy block of B into pteam memory Bblock
+                #pragma omp for collapse(2) // implicit barrier ensures Ablock and Bblock can be accessed by all threads within the team
+                for (size_t j=(jb * _BLOCK_) ; j<((jb + 1) * _BLOCK_) ; j++)
+                  for (size_t k=(kb * _BLOCK_) ; k<((kb + 1) * _BLOCK_) ; k++)
+                    Bblock[(k % _BLOCK_) * _BLOCK_ + (j % _BLOCK_)] = B[(k * N) + j];
+
+                // matrix multiply within the block
+                #pragma omp for collapse(2) nowait
+                for (size_t i=(ib * _BLOCK_) ; i<((ib + 1) * _BLOCK_) ; i++)
+                  for (size_t j=(jb * _BLOCK_) ; j<((jb + 1) * _BLOCK_) ; j++)
+                    {
+                      MyData value = (MyData)0;
+                      for (size_t k=(kb * _BLOCK_) ; k<((kb + 1) * _BLOCK_) ; k++)
+                        value += Ablock[(i % _BLOCK_) * _BLOCK_ + (k % _BLOCK_)] *
+                                 Bblock[(k % _BLOCK_) * _BLOCK_ + (j % _BLOCK_)];
+
+                      Cblock[((i % _BLOCK_) * _BLOCK_) + (j % _BLOCK_)] += value;
+
+                      if (kb == (Nblocks - 1))
+                        C[(i * N) + j] = Cblock[((i % _BLOCK_) * _BLOCK_) + (j % _BLOCK_)];
+                    }
+              } // omp parallel
+            } // kb
+        } // jb
+    } // ib
 
   return;
 }
 
-void GPU_mat_mult_no_loops(const MyData *const restrict A,
-                           const MyData *const restrict B,
-                           MyData *const restrict C,
-                           const size_t size)
+void GPU_mat_mult_block_shared_no_loops(const MyData *const restrict A,
+                                        const MyData *const restrict B,
+                                        MyData *const restrict C,
+                                        const size_t size)
 {
-  #pragma omp target
+  const size_t Nblocks = (size / _BLOCK_);
+  // Team local copies of tiles
+  MyData Ablock[BLOCKSIZE];
+  MyData Bblock[BLOCKSIZE];
+  MyData Cblock[BLOCKSIZE];
+
+  #pragma omp target teams num_teams(Nblocks * Nblocks) \
+          private(Ablock, Bblock, Cblock)               \
+          thread_limit(BLOCKSIZE)
   {
-    #pragma omp teams num_teams(size)
+    const size_t ib = omp_get_team_num() / Nblocks;
+    const size_t jb = omp_get_team_num() % Nblocks;
+
+    #pragma omp parallel num_threads(BLOCKSIZE)
     {
-      const size_t team_size = (size * omp_get_team_num());
+      // local matrix indexes mapped to each OMP GPU-thread within its own team
+      const size_t i_local = omp_get_thread_num() / _BLOCK_;
+      const size_t j_local = omp_get_thread_num() % _BLOCK_;
+
+      // global matrix indexes mapped to each OMP GPU-thread
+      const size_t i_global = i_local + (ib * _BLOCK_);
+      const size_t j_global = j_local + (jb * _BLOCK_);
 
-      #pragma omp parallel firstprivate(team_size) num_threads(size)
-      {
-        const size_t tid = omp_get_thread_num();
-        MyData value = (MyData)0;
-        for (size_t k=0 ; k<size ; k++)
-          value += (A[team_size + k] * B[(k * size) + tid]);
-
-        C[team_size + tid] = value;
-      } // omp threads
-    } // omp teams
-  } // omp target
+      // Cblock initialization
+      Cblock[(i_local * _BLOCK_) + j_local] = (MyData)0;
+
+      // loop over blocks
+      for (size_t block=0 ; block<Nblocks ; block++)
+        {
+          const size_t j_A = j_local + (block * _BLOCK_);
+          const size_t i_B = i_local + (block * _BLOCK_);
+
+          // wait until all threads in the team have reached this point, so that
+          // Ablock and Bblock are not overwritten while still in use
+          #pragma omp barrier
+
+          Ablock[(i_local * _BLOCK_) + j_local] = A[(i_global * size) + j_A];
+          Bblock[(i_local * _BLOCK_) + j_local] = B[(i_B * size) + j_global];
+
+          // wait until all threads in the team have loaded their elements of
+          // Ablock and Bblock before the tiles are used
+          #pragma omp barrier
+
+          // perform the matrix multiplication within the block
+          MyData value = (MyData)0;
+          for (size_t k=0 ; k<_BLOCK_ ; k++)
+            value += Ablock[(i_local * _BLOCK_) + k] * Bblock[(k * _BLOCK_) + j_local];
+
+          // store the partial result in shared memory
+          Cblock[(i_local * _BLOCK_) + j_local] += value;
+        } // block loop
+
+      // store the result into global memory
+      C[(i_global * size) + j_global] = Cblock[(i_local * _BLOCK_) + j_local];
+    } // omp parallel
+  } // target teams
 
   return;
 }
 
 void check(const MyData *const __restrict__ cpu_matrix,
            const MyData *const __restrict__ gpu_matrix)
 {
-  int flag;
+  int flag = 0;
   for (size_t i=0 ; i<SIZE ; i++)
-    flag = ((cpu_matrix[i] != gpu_matrix[i]) ? 1 : 0);
+    flag = ((fabs(cpu_matrix[i] - gpu_matrix[i]) > FLT_EPSILON) ?
+            1 : flag);
 
   if (!flag)
     printf("\n\t Result OK");
   else
     printf("\n\t Result wrong");
+
+#if !defined(NDEBUG)
+
+  for (size_t i=0 ; i<N ; i++)
+    {
+      printf("\n");
+      for (size_t j=0 ; j<N ; j++)
+        {
+          const size_t index = ((i * N) + j);
+          printf("\n\t cpu_matrix[%zu] = %lg - gpu_matrix[%zu] = %lg - diff = %lg",
+                 index, cpu_matrix[index], index, gpu_matrix[index], fabs(cpu_matrix[index] - gpu_matrix[index]));
+        }
+    }
+  printf("\n");
+
+#endif // NDEBUG
 
   return;
 }
@@ -162,40 +331,57 @@ int main()
       B[i] = drand48();
     }
 
-  ////////////////////////// CPU naive algorithm //////////////////////////////////////////
-  CPU_mat_mult(A, B, C_CPU, N);
-  /////////////////////////////////////////////////////////////////////////////////////////
+  ////////////////////////// CPU block algorithm ////////////////////////////////////////////////////
+  CPU_mat_mult_block(A, B, C_CPU, N);
+  //////////////////////////////////////////////////////////////////////////////////////////////////
 
   // copy/alloc data to the GPU
   #pragma omp target enter data map(to: A[0:SIZE], B[0:SIZE]) map(alloc: C_GPU[0:SIZE])
 
-  /////////////////////////// GPU naive algorithm ////////////////////////////////////////
+  /////////////////////////// GPU naive block algorithm /////////////////////////////////////////////
+  time = 0.0;
+  GPU_mat_mult_block(A, B, C_GPU, N); // warm-up
+  for (unsigned short int loop=0 ; loop<LOOP ; loop++)
+    {
+      const double start = wall_time();
+      GPU_mat_mult_block(A, B, C_GPU, N);
+      time += (wall_time() - start);
+    }
+
+  #pragma omp target update from(C_GPU[0:SIZE])
+  check(C_CPU, C_GPU);
+  printf("\n\t GPU block time %lg [s]\n", (time / LOOP));
+  //////////////////////////////////////////////////////////////////////////////////////////////////
+
+  /////////////////////////// GPU naive block shared memory algorithm //////////////////////////////
   time = 0.0;
+  GPU_mat_mult_block_shared(A, B, C_GPU, N); // warm-up
   for (unsigned short int loop=0 ; loop<LOOP ; loop++)
     {
       const double start = wall_time();
-      GPU_mat_mult(A, B, C_GPU, N);
+      GPU_mat_mult_block_shared(A, B, C_GPU, N);
       time += (wall_time() - start);
     }
 
   #pragma omp target update from(C_GPU[0:SIZE])
   check(C_CPU, C_GPU);
-  printf("\n\t GPU naive time %lg [s]\n", (time / LOOP));
-  ////////////////////////////////////////////////////////////////////////////////
+  printf("\n\t GPU block pteam memory time %lg [s]\n", (time / LOOP));
+  //////////////////////////////////////////////////////////////////////////////////////////////////
 
-  /////////////////////////// GPU naive no loops algorithm ////////////////////////////
+  /////////////////////////// GPU naive block shared memory no loops algorithm /////////////////////
   time = 0.0;
+  GPU_mat_mult_block_shared_no_loops(A, B, C_GPU, N); // warm-up
   for (unsigned short int loop=0 ; loop<LOOP ; loop++)
    {
      const double start = wall_time();
-      GPU_mat_mult_no_loops(A, B, C_GPU, N);
+      GPU_mat_mult_block_shared_no_loops(A, B, C_GPU, N);
      time += (wall_time() - start);
    }
 
   #pragma omp target update from(C_GPU[0:SIZE])
   check(C_CPU, C_GPU);
-  printf("\n\t GPU naive no loops time %lg [s]\n", (time / LOOP));
-  ////////////////////////////////////////////////////////////////////////////////
+  printf("\n\t GPU block no loops pteam memory time %lg [s]\n", (time / LOOP));
+  //////////////////////////////////////////////////////////////////////////////////////////////////
 
   // free CPU memory
   free(buffer);
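
Note (illustration, not part of the patch): a quick sanity check of the launch
geometry that both block versions rely on, with N = 1024 and _BLOCK_ = 16 as in
mat_mult_block.c; one team per 16 x 16 tile of C and one thread per tile element:

    #include <stdio.h>
    #include <stddef.h>

    #define N 1024
    #define _BLOCK_ 16
    #define BLOCKSIZE ((_BLOCK_) * (_BLOCK_))

    int main(void)
    {
      const size_t Nblocks = N / _BLOCK_;                         // 64 tiles per matrix dimension
      printf("teams   : %zu\n", Nblocks * Nblocks);               // 4096 teams, one per C tile
      printf("threads : %d\n",  BLOCKSIZE);                       // 256 threads, one per tile element
      printf("coverage: %zu\n", Nblocks * Nblocks * BLOCKSIZE);   // 1048576 = N * N elements of C
      return 0;
    }
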