Commit 4921ce23 authored by David Goz

cuda/omp matrix multiply examples

parent 7e6f37d5
////////////////////////////////////////////////////////////////////////////////////////////////
// - Naive matrix multiplication algorithm
// for (size_t i=0 ; i<N ; i++)
// for (size_t j=0 ; j<N ; j++)
// for (size_t k=0 ; k<N ; k++)
// C[(i * N) + j] += A[(i * N) + k] * B[(k * N) + j];
//
////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////
// Author: David Goz
// mail : david.goz@inaf.it
// date : 20.08.2024
// code tested using nvhpc
//
// - Compile the code:
// $ nvc++ mat_mult.cu -o mat_mult
// - Run the code:
// $ ./mat_mult
//////////////////////////////////////////////////////////////////////////////////////////////////
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <time.h>
#include <assert.h>
#include <cuda.h>
#include <string.h>
#include <math.h>
#include <float.h>
#define N 1024
#define SIZE (N * N) // matrix size
typedef double MyData; // do not change
#define BLOCKSIZE 1024
// sanity check
#if BLOCKSIZE > 1024
#error BLOCKSIZE must be <= 1024
#endif
#define LOOP 100
#define NDEBUG
double wall_time()
{
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts); // monotonic wall-clock time
const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9;
return ret;
}
void CPU_mat_mult(const MyData *const restrict A,
const MyData *const restrict B,
MyData *const restrict C,
const size_t size)
{
for (size_t i=0 ; i<size ; i++)
for (size_t j=0 ; j<size ; j++)
for (size_t k=0 ; k<size ; k++)
C[(i * size) + j] += (A[(i * size) + k] * B[(k * size) + j]);
return;
}
__global__ void GPU_mat_mult(const MyData *const restrict A,
const MyData *const restrict B,
MyData *const restrict C,
const size_t size)
{
const size_t globalID = threadIdx.x + (blockIdx.x * blockDim.x);
if (globalID >= SIZE)
return;
const size_t i = globalID / size;
const size_t j = globalID % size;
MyData value = (MyData)0;
for (size_t k=0 ; k<size ; k++)
value += A[(i * size) + k] * B[(k * size) + j];
C[(i * size) + j] = value;
return;
}
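//////////////////////////////////////////////////////////////////////////////////////////////////
// Possible alternative (a sketch, not part of the file above; the helper name is illustrative):
// timing a single kernel launch with CUDA events instead of a host-side clock. The cudaEvent*
// calls are standard CUDA runtime API.
//////////////////////////////////////////////////////////////////////////////////////////////////
static double time_GPU_mat_mult_events(const MyData *const A,
                                       const MyData *const B,
                                       MyData       *const C,
                                       const dim3          grid,
                                       const dim3          threads)
{
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  cudaEventRecord(start);
  GPU_mat_mult<<< grid, threads >>>(A, B, C, N);
  cudaEventRecord(stop);
  cudaEventSynchronize(stop);                     // wait until the kernel has finished

  float elapsed_ms = 0.0f;
  cudaEventElapsedTime(&elapsed_ms, start, stop); // elapsed time in milliseconds

  cudaEventDestroy(start);
  cudaEventDestroy(stop);

  return ((double)elapsed_ms * 1.0e-3);           // seconds, to match wall_time()
}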
void check(const MyData *const __restrict__ cpu_matrix,
const MyData *const __restrict__ gpu_matrix)
{
int flag = 0;
for (size_t i=0 ; i<SIZE ; i++)
flag = ((fabs(cpu_matrix[i] - gpu_matrix[i]) > FLT_EPSILON) ? 1 : flag);
if (!flag)
printf("\n\t Result OK");
else
printf("\n\t Result wrong");
#if !defined(NDEBUG)
for (size_t i=0 ; i<N ; i++)
{
printf("\n");
for (size_t j=0 ; j<N ; j++)
{
const size_t index = ((i * N) + j);
printf("\n\t cpu_matrix[%d] = %lg - gpu_matrix[%d] = %lg - diff = %lg",
index, cpu_matrix[index], index, gpu_matrix[index], fabs(cpu_matrix[index] - gpu_matrix[index]));
}
}
printf("\n");
#endif // NDEBUG
return;
}
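//////////////////////////////////////////////////////////////////////////////////////////////////
// Possible addition (a sketch; the code below does not use it): the CUDA runtime calls in main()
// are unchecked, and a small wrapper makes failures visible. cudaError_t, cudaSuccess and
// cudaGetErrorString are standard CUDA runtime API; the macro name CUDA_CHECK is illustrative.
//////////////////////////////////////////////////////////////////////////////////////////////////
#define CUDA_CHECK(call)                                              \
  do {                                                                \
    const cudaError_t err = (call);                                   \
    if (err != cudaSuccess)                                           \
      {                                                               \
        fprintf(stderr, "CUDA error \"%s\" at %s:%d\n",               \
                cudaGetErrorString(err), __FILE__, __LINE__);         \
        exit(EXIT_FAILURE);                                           \
      }                                                               \
  } while (0)
// e.g. CUDA_CHECK(cudaMalloc((void **)&buffer_gpu, (3 * SIZE * sizeof(MyData))));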
int main()
{
double time;
MyData *buffer_cpu = (MyData *)calloc(4 * SIZE, sizeof(MyData));
assert(buffer_cpu != NULL);
// host reference matrix A
MyData *const restrict A_CPU = buffer_cpu;
MyData *const restrict B_CPU = A_CPU + SIZE;
MyData *const restrict C_CPU = B_CPU + SIZE;
MyData *const restrict C_GPU_host = C_CPU + SIZE;
for (size_t i=0 ; i<SIZE ; i++)
{
A_CPU[i] = drand48();
B_CPU[i] = drand48();
}
////////////////////////// CPU naive algorithm //////////////////////////////////////////
CPU_mat_mult(A_CPU, B_CPU, C_CPU, N);
/////////////////////////////////////////////////////////////////////////////////////////
// copy/alloc data to the GPU
MyData *buffer_gpu = NULL;
cudaMalloc((void **)&buffer_gpu, (3 * SIZE * sizeof(MyData)));
MyData *const restrict A_GPU = buffer_gpu;
MyData *const restrict B_GPU = A_GPU + SIZE;
MyData *const restrict C_GPU = B_GPU + SIZE;
cudaMemcpy(A_GPU, A_CPU, (2 * SIZE * sizeof(MyData)), cudaMemcpyHostToDevice);
const dim3 block = {BLOCKSIZE, 1, 1};
const dim3 nblocks = {(SIZE + BLOCKSIZE -1)/BLOCKSIZE, 1, 1};
/////////////////////////// GPU naive algorithm ////////////////////////////////////////
time = 0.0;
GPU_mat_mult<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N); // warm-up
cudaDeviceSynchronize();
for (unsigned short int loop=0 ; loop<LOOP ; loop++)
{
const double start = wall_time();
GPU_mat_mult<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N);
cudaDeviceSynchronize();
time += (wall_time() - start);
}
cudaMemcpy(C_GPU_host, C_GPU, (SIZE * sizeof(MyData)), cudaMemcpyDeviceToHost);
check(C_CPU, C_GPU_host);
printf("\n\t GPU naive time %lg [s]\n", (time / LOOP));
/////////////////////////////////////////////////////////////////////////////////////
// free CPU memory
free(buffer_cpu);
// free GPU memory
cudaFree(buffer_gpu);
printf("\n");
return EXIT_SUCCESS;
}
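//////////////////////////////////////////////////////////////////////////////////////////////////
// Note (illustrative, using the LOOP-averaged time printed above): the naive algorithm performs
// 2 * N^3 floating-point operations per run, so the measured time converts to throughput as, e.g.
//
//   const double gflops = (2.0 * (double)N * N * N) / (time / LOOP) / 1.0e9;
//   printf("\n\t GPU naive throughput %lg [GFLOP/s]\n", gflops);
//////////////////////////////////////////////////////////////////////////////////////////////////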
////////////////////////////////////////////////////////////////////////////////////////////////
// - Block matrix multiplication algorithm
//
// const size_t Nblocks = (N / Bsize);
//
// // loop over blocks of matrix C
// for (size_t ib=0 ; ib<Nblocks ; ib++)
// {
// for (size_t jb=0 ; jb<Nblocks ; jb++)
// {
//
// // loop over blocks of rows of A
// for (size_t kb=0 ; kb<Nblocks ; kb++)
// {
//
// // Matrix multiplication within a block
// for (size_t i=(ib * Bsize) ; i<((ib + 1) * Bsize) ; i++)
// for (size_t j=(jb * Bsize) ; j<((jb + 1) * Bsize) ; j++)
// for (size_t k=(kb * Bsize) ; k<((kb + 1) * Bsize) ; k++)
// C[(i * N) + j] += A[(i * N) + k] * B[(k * N) + j];
//
// } // kb
// } // jb
// } // ib
//
// - Exploit GPU shared-memory.
////////////////////////////////////////////////////////////////////////////////////////////////
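//////////////////////////////////////////////////////////////////////////////////////////////////
// Sizing note (worked out for the values used below, N = 1024 and BLOCK = 16): each tile is
// BLOCK * BLOCK = 256 doubles = 2 KiB, so the three tiles Ablock, Bblock and Cblock take 6 KiB of
// shared memory per thread block, well within the 48 KiB that CUDA devices provide per block by
// default; each thread block runs BLOCK * BLOCK = 256 threads.
//////////////////////////////////////////////////////////////////////////////////////////////////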
//////////////////////////////////////////////////////////////////////////////////////////////////
// Author: David Goz
// mail : david.goz@inaf.it
// date : 22.08.2024
// code tested using nvhpc
//
// - Compile the code:
// $ nvc++ mat_mult_block.cu -o mat_mult_block
// - Run the code:
// $ ./mat_mult_block
//////////////////////////////////////////////////////////////////////////////////////////////////
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <time.h>
#include <assert.h>
#include <cuda.h>
#include <string.h>
#include <math.h>
#include <float.h>
#define N 1024
#define SIZE (N * N) // matrix size
typedef double MyData; // do not change
#define BLOCK 16
// sanity check
#if BLOCK * BLOCK > 1024
#error BLOCK * BLOCK must be <= 1024
#endif
#if BLOCK * BLOCK > SIZE
#error BLOCK * BLOCK must be <= SIZE
#endif
#define LOOP 100
#define NDEBUG
double wall_time()
{
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts); // monotonic wall-clock time
const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9;
return ret;
}
void CPU_mat_mult(const MyData *const restrict A,
const MyData *const restrict B,
MyData *const restrict C,
const size_t size)
{
for (size_t i=0 ; i<size ; i++)
for (size_t j=0 ; j<size ; j++)
for (size_t k=0 ; k<size ; k++)
C[(i * size) + j] += (A[(i * size) + k] * B[(k * size) + j]);
return;
}
void CPU_mat_mult_block(const MyData *const restrict A,
const MyData *const restrict B,
MyData *const restrict C,
const size_t size)
{
const size_t Nblocks = (size / BLOCK);
// loop over blocks of matrix C
for (size_t ib=0 ; ib<Nblocks ; ib++)
{
for (size_t jb=0 ; jb<Nblocks ; jb++)
{
// loop over blocks of rows of A
for (size_t kb=0 ; kb<Nblocks ; kb++)
{
// Matrix multiplication within a block
for (size_t i=(ib * BLOCK) ; i<((ib + 1) * BLOCK) ; i++)
for (size_t j=(jb * BLOCK) ; j<((jb + 1) * BLOCK) ; j++)
for (size_t k=(kb * BLOCK) ; k<((kb + 1) * BLOCK) ; k++)
C[(i * N) + j] += A[(i * N) + k] * B[(k * N) + j];
} // kb
} // jb
} // ib
return;
}
__global__ void GPU_mat_mult_block(const MyData *const restrict A,
const MyData *const restrict B,
MyData *const restrict C,
const size_t size)
{
const size_t size2 = (size * size);
const size_t globalID = threadIdx.x + (blockIdx.x * blockDim.x);
if (globalID >= size2)
return;
const size_t Nblocks = size / BLOCK; // number of blocks to loop over A and B
const size_t ib = blockIdx.x / Nblocks; // indices of the matrix block assigned to this thread block
const size_t jb = blockIdx.x % Nblocks;
const size_t i_local = threadIdx.x / BLOCK; // local matrix indices mapped to each CUDA thread
const size_t j_local = threadIdx.x % BLOCK; // within its own block
const size_t i_global = i_local + (ib * BLOCK); // global matrix indices mapped to each CUDA thread
const size_t j_global = j_local + (jb * BLOCK); // N.B. uncoalesced memory accesses to the A and B matrices
C[(i_global * size) + j_global] = (MyData)0;
// loop over blocks
for (size_t block=0 ; block<Nblocks ; block++)
{
const size_t j_A = (block * BLOCK);
const size_t i_B = (block * BLOCK);
// perform the matrix multiplication within the block
MyData value = (MyData)0;
for (size_t k=0 ; k<BLOCK ; k++)
value += A[(i_global * size) + k + j_A] * B[((k + i_B) * size) + j_global];
C[(i_global * size) + j_global] += value;
}
return;
}
__global__ void GPU_mat_mult_block_shared(const MyData *const restrict A,
const MyData *const restrict B,
MyData *const restrict C,
const size_t size)
{
__shared__ MyData Ablock[BLOCK * BLOCK];
__shared__ MyData Bblock[BLOCK * BLOCK];
__shared__ MyData Cblock[BLOCK * BLOCK];
const size_t globalID = threadIdx.x + (blockIdx.x * blockDim.x);
const size_t size2 = (size * size);
if (globalID >= size2)
return;
const size_t Nblocks = size / BLOCK; // number of blocks to loop over
const size_t ib = blockIdx.x / Nblocks; // indices of the matrix block assigned to this thread block
const size_t jb = blockIdx.x % Nblocks;
const size_t i_local = threadIdx.x / BLOCK; // local matrix indices mapped to each CUDA thread
const size_t j_local = threadIdx.x % BLOCK; // within its own block
const size_t i_global = i_local + (ib * BLOCK); // global matrix indices mapped to each CUDA thread
const size_t j_global = j_local + (jb * BLOCK); // N.B. uncoalesced memory accesses to the A and B matrices
// Init Cblock
Cblock[(i_local * BLOCK) + j_local] = (MyData)0;
// loop over blocks
for (size_t block=0 ; block<Nblocks ; block++)
{
const size_t j_A = (block * BLOCK);
const size_t i_B = (block * BLOCK);
// waits until all threads in the thread block have reached this point and shared memory accesses
// made by these threads prior to __syncthreads() are visible to all threads in the block.
__syncthreads();
// copy block of A into shared memory
Ablock[(i_local * BLOCK) + j_local] = A[(i_global * size) + j_local + j_A];
// copy block of B into shared memory
Bblock[(i_local * BLOCK) + j_local] = B[((i_local + i_B) * size) + j_global];
// waits until all threads in the thread block have reached this point and shared memory accesses
// made by these threads prior to __syncthreads() are visible to all threads in the block.
__syncthreads();
// perform the matrix multiplication within the block
MyData value = (MyData)0;
for (size_t k=0 ; k<BLOCK ; k++)
value += Ablock[(i_local * BLOCK) + k] * Bblock[(k * BLOCK) + j_local];
// store the partial result in shared memory
Cblock[(i_local * BLOCK) + j_local] += value;
}
C[(i_global * size) + j_global] = Cblock[(i_local * BLOCK) + j_local];
return;
}
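//////////////////////////////////////////////////////////////////////////////////////////////////
// Possible variant (a sketch, not part of the file above; the kernel name is illustrative):
// the same shared-memory tiling written with a 2D thread block, which keeps threadIdx.x running
// along matrix columns so that loads of B and stores of C are coalesced. It assumes, as above,
// that size is a multiple of BLOCK, and would be launched as
//   GPU_mat_mult_tile2D<<< dim3(N / BLOCK, N / BLOCK), dim3(BLOCK, BLOCK) >>>(A_GPU, B_GPU, C_GPU, N);
//////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void GPU_mat_mult_tile2D(const MyData *const A,
                                    const MyData *const B,
                                    MyData       *const C,
                                    const size_t size)
{
  __shared__ MyData Atile[BLOCK][BLOCK];
  __shared__ MyData Btile[BLOCK][BLOCK];

  const size_t row = (blockIdx.y * BLOCK) + threadIdx.y;
  const size_t col = (blockIdx.x * BLOCK) + threadIdx.x;

  MyData value = (MyData)0;
  for (size_t tile=0 ; tile<(size / BLOCK) ; tile++)
    {
      // each thread loads one element of the A tile and one element of the B tile
      Atile[threadIdx.y][threadIdx.x] = A[(row * size) + (tile * BLOCK) + threadIdx.x];
      Btile[threadIdx.y][threadIdx.x] = B[(((tile * BLOCK) + threadIdx.y) * size) + col];
      __syncthreads();                 // tiles must be complete before they are read

      for (size_t k=0 ; k<BLOCK ; k++)
        value += Atile[threadIdx.y][k] * Btile[k][threadIdx.x];
      __syncthreads();                 // all reads done before the tiles are overwritten
    }

  C[(row * size) + col] = value;
}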
void check(const MyData *const restrict cpu_matrix,
const MyData *const restrict gpu_matrix)
{
int flag = 0;
for (size_t i=0 ; i<SIZE ; i++)
flag = ((fabs(cpu_matrix[i] - gpu_matrix[i]) > FLT_EPSILON) ? 1 : flag);
if (!flag)
printf("\n\t Result OK");
else
printf("\n\t Result wrong");
return;
}
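//////////////////////////////////////////////////////////////////////////////////////////////////
// Note (illustrative): each entry of C accumulates N = 1024 products, so a size-aware bound is
// more robust than the fixed FLT_EPSILON used above, e.g.
//
//   const MyData tol = (MyData)N * DBL_EPSILON * fabs(cpu_matrix[i]);
//   flag = ((fabs(cpu_matrix[i] - gpu_matrix[i]) > tol) ? 1 : flag);
//////////////////////////////////////////////////////////////////////////////////////////////////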
int main()
{
double time;
MyData *buffer_cpu = (MyData *)calloc(4 * SIZE, sizeof(MyData));
assert(buffer_cpu != NULL);
// host reference matrix A
MyData *const restrict A_CPU = buffer_cpu;
MyData *const restrict B_CPU = A_CPU + SIZE;
MyData *const restrict C_CPU = B_CPU + SIZE;
MyData *const restrict C_GPU_host = C_CPU + SIZE;
for (size_t i=0 ; i<SIZE ; i++)
{
A_CPU[i] = drand48();
B_CPU[i] = drand48();
}
////////////////////////// CPU naive algorithm //////////////////////////////////////////
CPU_mat_mult_block(A_CPU, B_CPU, C_CPU, N);
/////////////////////////////////////////////////////////////////////////////////////////
// copy/alloc data to the GPU
MyData *buffer_gpu = NULL;
cudaMalloc((void **)&buffer_gpu, (3 * SIZE * sizeof(MyData)));
MyData *const A_GPU = buffer_gpu;
MyData *const B_GPU = A_GPU + SIZE;
MyData *const C_GPU = B_GPU + SIZE;
cudaMemcpy(A_GPU, A_CPU, (2 * SIZE * sizeof(MyData)), cudaMemcpyHostToDevice);
const dim3 nblocks = {(SIZE + (BLOCK * BLOCK) -1)/(BLOCK * BLOCK), 1, 1};
const dim3 block = {(BLOCK * BLOCK), 1, 1};
/////////////////////////// GPU naive block algorithm ////////////////////////////////////////
time = 0.0;
GPU_mat_mult_block<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N); // warm-up
cudaDeviceSynchronize();
for (unsigned short int loop=0 ; loop<LOOP ; loop++)
{
const double start = wall_time();
GPU_mat_mult_block<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N);
cudaDeviceSynchronize();
time += (wall_time() - start);
}
cudaMemcpy(C_GPU_host, C_GPU, (SIZE * sizeof(MyData)), cudaMemcpyDeviceToHost);
check(C_CPU, C_GPU_host);
printf("\n\t GPU naive block time %lg [s]\n", (time / LOOP));
/////////////////////////////////////////////////////////////////////////////////////
/////////////////////////// GPU block shared memory algorithm ///////////////////////
time = 0.0;
GPU_mat_mult_block_shared<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N); // warm-up
cudaDeviceSynchronize();
for (unsigned short int loop=0 ; loop<LOOP ; loop++)
{
const double start = wall_time();
GPU_mat_mult_block_shared<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N);
cudaDeviceSynchronize();
time += (wall_time() - start);
}
cudaMemcpy(C_GPU_host, C_GPU, (SIZE * sizeof(MyData)), cudaMemcpyDeviceToHost);
check(C_CPU, C_GPU_host);
printf("\n\t GPU block shared memory time %lg [s]\n", (time / LOOP));
/////////////////////////////////////////////////////////////////////////////////////
// free CPU memory
free(buffer_cpu);
// free GPU memory
cudaFree(buffer_gpu);
printf("\n");
return EXIT_SUCCESS;
}
File deleted
@@ -14,9 +14,10 @@
// code tested using nvhpc
//
// - Compile the code:
// $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v classwork.c -o classwork_omp
// $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v mat_mult.c -o mat_mult_omp
// - Run the code:
// $ ./classwork_omp
// $ export OMP_TARGET_OFFLOAD=MANDATORY
// $ ./mat_mult_omp
//////////////////////////////////////////////////////////////////////////////////////////////////
#include <stdio.h>
@@ -26,8 +27,10 @@
#include <assert.h>
#include <omp.h>
#include <string.h>
#include <math.h>
#include <float.h>
#define N 512
#define N 1024
#define SIZE (N * N) // matrix size
typedef double MyData; // do not change
@@ -61,22 +64,30 @@ void GPU_mat_mult(const MyData *const restrict A,
MyData *const restrict C,
const size_t size)
{
#pragma omp target
{
#pragma omp teams distribute num_teams(size)
#pragma omp target teams distribute num_teams(size)
for (size_t i=0 ; i<size ; i++)
{
#pragma omp parallel for num_threads(size)
#pragma omp parallel for firstprivate(i) num_threads(size)
for (size_t j=0 ; j<size ; j++)
{
#if !defined(NDEBUG)
const int num_teams = omp_get_num_teams();
const int num_threads = omp_get_num_threads();
if ((omp_get_team_num() == 0) && (omp_get_thread_num() == 0))
printf("\n\t Kernel GPU_mat_mult: teams: %d - threads: %d\n",
num_teams, num_threads);
#endif // NDEBUG
MyData value = (MyData)0;
for (size_t k=0 ; k<size ; k++)
value += (A[(i * size) + k] * B[(k * size) + j]);
C[(i * size) + j] = value;
} // omp thread
} // omp teams
} // omp target
} // omp threads
} // omp target teams
return;
}
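//////////////////////////////////////////////////////////////////////////////////////////////////
// Possible alternative (a sketch with the same loop body, not part of this commit; it assumes
// A, B and C_GPU are already mapped to the device, as main() does with target enter data):
// the two parallel levels can also be expressed as one combined construct, leaving the league
// and team sizes to the runtime:
//
//   #pragma omp target teams distribute parallel for collapse(2)
//   for (size_t i=0 ; i<size ; i++)
//     for (size_t j=0 ; j<size ; j++)
//       {
//         MyData value = (MyData)0;
//         for (size_t k=0 ; k<size ; k++)
//           value += (A[(i * size) + k] * B[(k * size) + j]);
//         C[(i * size) + j] = value;
//       }
//////////////////////////////////////////////////////////////////////////////////////////////////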
@@ -86,14 +97,23 @@ void GPU_mat_mult_no_loops(const MyData *const restrict A,
MyData *const restrict C,
const size_t size)
{
#pragma omp target
{
#pragma omp teams num_teams(size)
#pragma omp target teams num_teams(size)
{
const size_t team_size = (size * omp_get_team_num());
#pragma omp parallel firstprivate(team_size) num_threads(size)
{
#if !defined(NDEBUG)
const int num_teams = omp_get_num_teams();
const int num_threads = omp_get_num_threads();
if ((omp_get_team_num() == 0) && (omp_get_thread_num() == 0))
printf("\n\t Kernel GPU_mat_mult_no_loops: teams: %d - threads: %d\n",
num_teams, num_threads);
#endif // NDEBUG
const size_t tid = omp_get_thread_num();
MyData value = (MyData)0;
for (size_t k=0 ; k<size ; k++)
@@ -101,8 +121,7 @@ void GPU_mat_mult_no_loops(const MyData *const restrict A,
C[team_size + tid] = value;
} // omp threads
} // omp teams
} // omp target
} // omp target teams
return;
}
@@ -110,15 +129,31 @@ void GPU_mat_mult_no_loops(const MyData *const restrict A,
void check(const MyData *const __restrict__ cpu_matrix,
const MyData *const __restrict__ gpu_matrix)
{
int flag;
int flag = 0;
for (size_t i=0 ; i<SIZE ; i++)
flag = ((cpu_matrix[i] != gpu_matrix[i]) ? 1 : 0);
flag = ((fabs(cpu_matrix[i] - gpu_matrix[i]) > FLT_EPSILON) ? 1 : flag);
if (!flag)
printf("\n\t Result OK");
else
printf("\n\t Result wrong");
#if !defined(NDEBUG)
for (size_t i=0 ; i<N ; i++)
{
printf("\n");
for (size_t j=0 ; j<N ; j++)
{
const size_t index = ((i * N) + j);
printf("\n\t cpu_matrix[%d] = %lg - gpu_matrix[%d] = %lg - diff = %lg",
index, cpu_matrix[index], index, gpu_matrix[index], fabs(cpu_matrix[index] - gpu_matrix[index]));
}
}
printf("\n");
#endif // NDEBUG
return;
}
@@ -148,6 +183,7 @@ int main()
/////////////////////////// GPU naive algorithm ////////////////////////////////////////
time = 0.0;
GPU_mat_mult(A, B, C_GPU, N); // warm-up
for (unsigned short int loop=0 ; loop<LOOP ; loop++)
{
const double start = wall_time();
@@ -162,6 +198,7 @@ int main()
/////////////////////////// GPU naive no loops algorithm ////////////////////////////
time = 0.0;
GPU_mat_mult_no_loops(A, B, C_GPU, N); // warm-up
for (unsigned short int loop=0 ; loop<LOOP ; loop++)
{
const double start = wall_time();
......
@@ -11,29 +11,32 @@
//
// // loop over blocks of rows of A
// for (size_t kb=0 ; kb<Nblocks ; kb++)
// {
//
//
//
// for (size_t i=0 ; i<N ; i++)
// for (size_t j=0 ; j<N ; j++)
// for (size_t k=0 ; k<_N ; k++)
// // Matrix multiplication within a block
// for (size_t i=(ib * Bsize) ; i<((ib + 1) * Bsize) ; i++)
// for (size_t j=(jb * Bsize) ; j<((jb + 1) * Bsize) ; j++)
// for (size_t k=(kb * Bsize) ; k<((kb + 1) * Bsize) ; k++)
// C[(i * N) + j] += A[(i * N) + k] * B[(k * N) + j];
//
// } // kb
// } // jb
// } // ib
// - Exploit shared-memory.
//
// - Exploit GPU shared-memory.
////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////
// Author: David Goz
// mail : david.goz@inaf.it
// date : 31.07.2024
// date : 20.08.2024
// code tested using nvhpc
//
// - Compile the code:
// $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v classwork.c -o classwork_omp
// $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v mat_mult_block.c -o mat_mult_block_omp
// - Run the code:
// $ ./classwork_omp
// $ export OMP_TARGET_OFFLOAD=MANDATORY
// $ ./mat_mult_block_omp
//////////////////////////////////////////////////////////////////////////////////////////////////
#include <stdio.h>
@@ -43,15 +46,18 @@
#include <assert.h>
#include <omp.h>
#include <string.h>
#include <math.h>
#include <float.h>
#define N 512
#define N 1024
#define SIZE (N * N) // matrix size
typedef double MyData; // do not change
#define BLOCKSIZE 32 // number of threads per block
#define _BLOCK_ 16
#define BLOCKSIZE ((_BLOCK_) * (_BLOCK_))
// sanity check
#if BLOCKSIZE > 1024
#error BLOCKSIZE cannot be larger than 1024
#if _BLOCK_*_BLOCK_ > 1024
#error _BLOCK_ * _BLOCK_ cannot be larger than 1024
#endif
#define LOOP 100
@@ -66,66 +72,213 @@ double wall_time()
return ret;
}
void CPU_mat_mult(const MyData *const restrict A,
void CPU_mat_mult_block(const MyData *const restrict A,
const MyData *const restrict B,
MyData *const restrict C,
const size_t size)
{
for (size_t i=0 ; i<size ; i++)
for (size_t j=0 ; j<size ; j++)
for (size_t k=0 ; k<size ; k++)
C[(i * size) + j] += (A[(i * size) + k] * B[(k * size) + j]);
const size_t Nblocks = (size / _BLOCK_);
// loop over blocks of matrix C
for (size_t ib=0 ; ib<Nblocks ; ib++)
{
for (size_t jb=0 ; jb<Nblocks ; jb++)
{
// loop over blocks of rows of A
for (size_t kb=0 ; kb<Nblocks ; kb++)
{
// Matrix multiplication within a block
for (size_t i=(ib * _BLOCK_) ; i<((ib + 1) * _BLOCK_) ; i++)
for (size_t j=(jb * _BLOCK_) ; j<((jb + 1) * _BLOCK_) ; j++)
for (size_t k=(kb * _BLOCK_) ; k<((kb + 1) * _BLOCK_) ; k++)
C[(i * size) + j] += A[(i * size) + k] * B[(k * size) + j];
} // kb
} // jb
} // ib
return;
}
void GPU_mat_mult(const MyData *const restrict A,
void GPU_mat_mult_block(const MyData *const restrict A,
const MyData *const restrict B,
MyData *const restrict C,
const size_t size)
{
#pragma omp target
const size_t Nblocks = (size / _BLOCK_);
#pragma omp target teams distribute collapse(2) num_teams(Nblocks * Nblocks) thread_limit(BLOCKSIZE)
for (size_t ib=0 ; ib<Nblocks ; ib++)
{
#pragma omp teams distribute num_teams(size)
for (size_t i=0 ; i<size ; i++)
for (size_t jb=0 ; jb<Nblocks ; jb++)
{
#pragma omp parallel for num_threads(size)
for (size_t j=0 ; j<size ; j++)
for (size_t kb=0 ; kb<Nblocks ; kb++)
{
MyData value = (MyData)0;
for (size_t k=0 ; k<size ; k++)
value += (A[(i * size) + k] * B[(k * size) + j]);
#pragma omp parallel for collapse(2) num_threads(BLOCKSIZE)
for (size_t i=(ib * _BLOCK_) ; i<((ib + 1) * _BLOCK_) ; i++)
{
for (size_t j=(jb * _BLOCK_) ; j<((jb + 1) * _BLOCK_) ; j++)
{
#if !defined(NDEBUG)
const int num_teams = omp_get_num_teams();
const int num_threads = omp_get_num_threads();
if ((omp_get_team_num() == 0) && (omp_get_thread_num() == 0) && (kb == 0))
printf("\n\t Kernel GPU_mat_mult: teams: %d - threads: %d\n",
num_teams, num_threads);
#endif // NDEBUG
MyData value = ((kb == 0) ? (MyData)0 : C[(i * size) + j]);
for (size_t k=(kb * _BLOCK_) ; k<((kb + 1) * _BLOCK_) ; k++)
value += A[(i * size) + k] * B[(k * size) + j];
C[(i * size) + j] = value;
} // omp thread
} // omp teams
} // omp target
} // j
} // i
} // kb
} // jb
} // ib
return;
}
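//////////////////////////////////////////////////////////////////////////////////////////////////
// Note (illustrative): the kb loop is deliberately kept out of the collapse(2) above; successive
// kb iterations accumulate into the same C[(i * size) + j], so distributing kb across teams as
// well would let different teams race on the same element of C.
//////////////////////////////////////////////////////////////////////////////////////////////////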
void GPU_mat_mult_no_loops(const MyData *const restrict A,
void GPU_mat_mult_block_shared(const MyData *const restrict A,
const MyData *const restrict B,
MyData *const restrict C,
const size_t size)
{
#pragma omp target
const size_t Nblocks = (size / _BLOCK_);
// Team local copies of tiles
MyData Ablock[BLOCKSIZE];
MyData Bblock[BLOCKSIZE];
MyData Cblock[BLOCKSIZE];
// #pragma omp target uses_allocators(omp_pteam_mem_alloc) allocate(omp_pteam_mem_alloc: Ablock, Bblock)
#pragma omp target teams distribute collapse(2) num_teams(Nblocks * Nblocks) \
private(Ablock, Bblock, Cblock) \
thread_limit(BLOCKSIZE)
for (size_t ib=0 ; ib<Nblocks ; ib++)
{
for (size_t jb=0 ; jb<Nblocks ; jb++)
{
for (size_t kb=0 ; kb<Nblocks ; kb++)
{
#pragma omp teams num_teams(size)
#pragma omp parallel num_threads(BLOCKSIZE)
{
const size_t team_size = (size * omp_get_team_num());
#if !defined(NDEBUG)
#pragma omp parallel firstprivate(team_size) num_threads(size)
const int num_teams = omp_get_num_teams();
const int num_threads = omp_get_num_threads();
if ((omp_get_team_num() == 0) && (omp_get_thread_num() == 0) && (kb == 0))
printf("\n\t Kernel GPU_mat_mult: teams: %d - threads: %d\n",
num_teams, num_threads);
#endif // NDEBUG
if (kb == 0) // Cblock initialization
{
#pragma omp for collapse(2) nowait
for (size_t i=0 ; i<_BLOCK_ ; i++)
for (size_t j=0 ; j<_BLOCK_ ; j++)
Cblock[(i * _BLOCK_) + j] = (MyData)0;
}
// copy block of A into pteam memory Ablock
#pragma omp for collapse(2) nowait // implicit barrier is removed
for (size_t i=(ib * _BLOCK_) ; i<((ib + 1) * _BLOCK_) ; i++)
for (size_t k=(kb * _BLOCK_) ; k<((kb + 1) * _BLOCK_) ; k++)
Ablock[(i % _BLOCK_) * _BLOCK_ + (k % _BLOCK_)] = A[(i * N) + k];
// copy block of B into pteam memory Bblock
#pragma omp for collapse(2) // implicit barrier to ensure that shared Ablock and Bblock can be accessed by all threads within the block
for (size_t j=(jb * _BLOCK_) ; j<((jb + 1) * _BLOCK_) ; j++)
for (size_t k=(kb * _BLOCK_) ; k<((kb + 1) * _BLOCK_) ; k++)
Bblock[(k % _BLOCK_) * _BLOCK_ + (j % _BLOCK_)] = B[(k * N) + j];
// matrix multiply within the block
#pragma omp for collapse(2) nowait
for (size_t i=(ib * _BLOCK_) ; i<((ib + 1) * _BLOCK_) ; i++)
for (size_t j=(jb * _BLOCK_) ; j<((jb + 1) * _BLOCK_) ; j++)
{
const size_t tid = omp_get_thread_num();
MyData value = (MyData)0;
for (size_t k=0 ; k<size ; k++)
value += (A[team_size + k] * B[(k * size) + tid]);
for (size_t k=(kb * _BLOCK_) ; k<((kb + 1) * _BLOCK_) ; k++)
value += Ablock[(i % _BLOCK_) * _BLOCK_ + (k % _BLOCK_)] *
Bblock[(k % _BLOCK_) * _BLOCK_ + (j % _BLOCK_)];
C[team_size + tid] = value;
} // omp threads
} // omp teams
} // omp target
Cblock[((i % _BLOCK_) * _BLOCK_) + (j % _BLOCK_)] += value;
if (kb == (Nblocks - 1))
C[(i * N) + j] = Cblock[((i % _BLOCK_) * _BLOCK_) + (j % _BLOCK_)];
}
} // omp parallel
} // kb
} // jb
} // ib
return;
}
void GPU_mat_mult_block_shared_no_loops(const MyData *const restrict A,
const MyData *const restrict B,
MyData *const restrict C,
const size_t size)
{
const size_t Nblocks = (size / _BLOCK_);
// Team local copies of tiles
MyData Ablock[BLOCKSIZE];
MyData Bblock[BLOCKSIZE];
MyData Cblock[BLOCKSIZE];
#pragma omp target teams num_teams(Nblocks * Nblocks) \
private(Ablock, Bblock, Cblock) \
thread_limit(BLOCKSIZE)
{
const size_t ib = omp_get_team_num() / Nblocks;
const size_t jb = omp_get_team_num() % Nblocks;
#pragma omp parallel num_threads(BLOCKSIZE)
{
// local matrix's indexes mapped to each OMP GPU-thread within its own team
const size_t i_local = omp_get_thread_num() / _BLOCK_;
const size_t j_local = omp_get_thread_num() % _BLOCK_;
// global matrix's indexes mapped to each OMP GPU-thread
const size_t i_global = i_local + (ib * _BLOCK_);
const size_t j_global = j_local + (jb * _BLOCK_);
// Cblock initialization
Cblock[(i_local * _BLOCK_) + j_local] = (MyData)0;
// loop over blocks
for (size_t block=0 ; block<Nblocks ; block++)
{
const size_t j_A = j_local + (block * _BLOCK_);
const size_t i_B = i_local + (block * _BLOCK_);
// wait until all threads in the team have reached this point, so the tiles from the previous iteration are no longer in use
#pragma omp barrier
Ablock[(i_local * _BLOCK_) + j_local] = A[(i_global * size) + j_A];
Bblock[(i_local * _BLOCK_) + j_local] = B[(i_B * size) + j_global];
// wait until all threads in the team have filled Ablock and Bblock before they are read
#pragma omp barrier
// perform the matrix multiplication within the block
MyData value = (MyData)0;
for (size_t k=0 ; k<_BLOCK_ ; k++)
value += Ablock[(i_local * _BLOCK_) + k] * Bblock[(k * _BLOCK_) + j_local];
// store the partial result in shared memory
Cblock[(i_local * _BLOCK_) + j_local] += value;
} // block loop
// store the result into global memory
C[(i_global * size) + j_global] = Cblock[(i_local * _BLOCK_) + j_local];
} // omp parallel
} // target teams
return;
}
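//////////////////////////////////////////////////////////////////////////////////////////////////
// Note (illustrative): this kernel mirrors the CUDA shared-memory version one-to-one --
// omp_get_team_num() plays the role of blockIdx.x, omp_get_thread_num() of threadIdx.x,
// "#pragma omp barrier" of __syncthreads(), and the private(Ablock, Bblock, Cblock) tiles of
// __shared__ arrays (subject to the compiler placing team-private data in on-chip memory).
//////////////////////////////////////////////////////////////////////////////////////////////////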
@@ -133,15 +286,31 @@ void GPU_mat_mult_no_loops(const MyData *const restrict A,
void check(const MyData *const __restrict__ cpu_matrix,
const MyData *const __restrict__ gpu_matrix)
{
int flag;
int flag = 0;
for (size_t i=0 ; i<SIZE ; i++)
flag = ((cpu_matrix[i] != gpu_matrix[i]) ? 1 : 0);
flag = ((fabs(cpu_matrix[i] - gpu_matrix[i]) > FLT_EPSILON) ? 1 : flag);
if (!flag)
printf("\n\t Result OK");
else
printf("\n\t Result wrong");
#if !defined(NDEBUG)
for (size_t i=0 ; i<N ; i++)
{
printf("\n");
for (size_t j=0 ; j<N ; j++)
{
const size_t index = ((i * N) + j);
printf("\n\t cpu_matrix[%d] = %lg - gpu_matrix[%d] = %lg - diff = %lg",
index, cpu_matrix[index], index, gpu_matrix[index], fabs(cpu_matrix[index] - gpu_matrix[index]));
}
}
printf("\n");
#endif // NDEBUG
return;
}
@@ -162,40 +331,57 @@ int main()
B[i] = drand48();
}
////////////////////////// CPU naive algorithm //////////////////////////////////////////
CPU_mat_mult(A, B, C_CPU, N);
/////////////////////////////////////////////////////////////////////////////////////////
////////////////////////// CPU naive algorithm ///////////////////////////////////////////////////
CPU_mat_mult_block(A, B, C_CPU, N);
//////////////////////////////////////////////////////////////////////////////////////////////////
// copy/alloc data to the GPU
#pragma omp target enter data map(to: A[0:SIZE], B[0:SIZE]) map(alloc: C_GPU[0:SIZE])
/////////////////////////// GPU naive algorithm ////////////////////////////////////////
/////////////////////////// GPU naive block algorithm ////////////////////////////////////////////
time = 0.0;
GPU_mat_mult_block(A, B, C_GPU, N); // warm-up
for (unsigned short int loop=0 ; loop<LOOP ; loop++)
{
const double start = wall_time();
GPU_mat_mult_block(A, B, C_GPU, N);
time += (wall_time() - start);
}
#pragma omp target update from(C_GPU[0:SIZE])
check(C_CPU, C_GPU);
printf("\n\t GPU block time %lg [s]\n", (time / LOOP));
//////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////// GPU naive block shared memory algorithm //////////////////////////////
time = 0.0;
GPU_mat_mult_block_shared(A, B, C_GPU, N); // warm-up
for (unsigned short int loop=0 ; loop<LOOP ; loop++)
{
const double start = wall_time();
GPU_mat_mult(A, B, C_GPU, N);
GPU_mat_mult_block_shared(A, B, C_GPU, N);
time += (wall_time() - start);
}
#pragma omp target update from(C_GPU[0:SIZE])
check(C_CPU, C_GPU);
printf("\n\t GPU naive time %lg [s]\n", (time / LOOP));
////////////////////////////////////////////////////////////////////////////////
printf("\n\t GPU block pteam memory time %lg [s]\n", (time / LOOP));
//////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////// GPU naive no loops algorithm ////////////////////////////
/////////////////////////// GPU naive block shared memory no loops algorithm /////////////////////
time = 0.0;
GPU_mat_mult_block_shared_no_loops(A, B, C_GPU, N); // warm-up
for (unsigned short int loop=0 ; loop<LOOP ; loop++)
{
const double start = wall_time();
GPU_mat_mult_no_loops(A, B, C_GPU, N);
GPU_mat_mult_block_shared_no_loops(A, B, C_GPU, N);
time += (wall_time() - start);
}
#pragma omp target update from(C_GPU[0:SIZE])
check(C_CPU, C_GPU);
printf("\n\t GPU naive no loops time %lg [s]\n", (time / LOOP));
////////////////////////////////////////////////////////////////////////////////
printf("\n\t GPU block no loops pteam memory time %lg [s]\n", (time / LOOP));
//////////////////////////////////////////////////////////////////////////////////////////////////
// free CPU memory
free(buffer);
......