Commit aa704d3b authored by David Goz

memory management and unified memory

parent 4921ce23
////////////////////////////////////////////////////////////////////////////////////////////////
// - Block matrix multiplication algorithm
//
// const size_t Nblocks = (N / Bsize);
//
// // loop over blocks of matrix C
// for (size_t ib=0 ; ib<Nblocks ; ib++)
// {
// for (size_t jb=0 ; jb<Nblocks ; jb++)
// {
//
// // loop over blocks of rows of A
// for (size_t kb=0 ; kb<Nblocks ; kb++)
// {
//
// // Matrix multiplication within a block
// for (size_t i=(ib * Bsize) ; i<((ib + 1) * Bsize) ; i++)
// for (size_t j=(jb * Bsize) ; j<((jb + 1) * Bsize) ; j++)
// for (size_t k=(kb * Bsize) ; k<((kb + 1) * Bsize) ; k++)
// C[(i * N) + j] += A[(i * N) + k] * B[(k * N) + j];
//
// } // kb
// } // jb
// } // ib
//
// - Exploit GPU shared-memory.
// - Use explicit target memory allocator and memory management OpenMP APIs
////////////////////////////////////////////////////////////////////////////////////////////////
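////////////////////////////////////////////////////////////////////////////////////////////////
// - Minimal sketch of the explicit OpenMP device-memory pattern used in main() below
//   (illustrative only; h_buf, d_buf and bytes are placeholder names):
//
//   const int host = omp_get_initial_device();
//   const int gpu  = omp_get_default_device();
//
//   // raw allocation on the device
//   double *d_buf = (double *)omp_target_alloc(bytes, gpu);
//
//   // host-to-device copy: dst, src, length, dst_offset, src_offset, dst_device, src_device
//   omp_target_memcpy(d_buf, h_buf, bytes, 0, 0, gpu, host);
//
//   // ... target regions access d_buf through is_device_ptr(d_buf) ...
//
//   // device-to-host copy and cleanup
//   omp_target_memcpy(h_buf, d_buf, bytes, 0, 0, host, gpu);
//   omp_target_free(d_buf, gpu);
////////////////////////////////////////////////////////////////////////////////////////////////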
//////////////////////////////////////////////////////////////////////////////////////////////////
// Author: David Goz
// mail : david.goz@inaf.it
// date : 26.08.2024
// code tested using nvhpc
//
// - Compile the code:
// $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v memory_management.c -o memory_management_omp
// - Run the code:
// $ export OMP_TARGET_OFFLOAD=MANDATORY
// $ ./memory_management_omp
//
// N.B. team size in the range [32, 1024)
//////////////////////////////////////////////////////////////////////////////////////////////////
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <time.h>
#include <assert.h>
#include <omp.h>
#include <string.h>
#include <math.h>
#include <float.h>
#define N 1024
#define SIZE (N * N) // matrix size
typedef double MyData; // do not change
#define _BLOCK_ 16
#define BLOCKSIZE ((_BLOCK_) * (_BLOCK_)) /* 32 <= BLOCKSIZE < 1024 */
// sanity check
#if _BLOCK_*_BLOCK_ > 1024
#error _BLOCK_*_BLOCK_ cannot be larger than 1024
#endif
#if _BLOCK_ > N
#error _BLOCK_ > N
#endif
#define LOOP 100
#define NDEBUG
double wall_time()
{
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts); // monotonic wall-clock time, not per-process CPU time
const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9;
return ret;
}
void CPU_mat_mult_block(const MyData *const restrict A,
const MyData *const restrict B,
MyData *const restrict C,
const size_t size)
{
const size_t Nblocks = (size / _BLOCK_);
// loop over blocks of matrix C
for (size_t ib=0 ; ib<Nblocks ; ib++)
{
for (size_t jb=0 ; jb<Nblocks ; jb++)
{
// loop over blocks of rows of A
for (size_t kb=0 ; kb<Nblocks ; kb++)
{
// Matrix multiplication within a block
for (size_t i=(ib * _BLOCK_) ; i<((ib + 1) * _BLOCK_) ; i++)
for (size_t j=(jb * _BLOCK_) ; j<((jb + 1) * _BLOCK_) ; j++)
for (size_t k=(kb * _BLOCK_) ; k<((kb + 1) * _BLOCK_) ; k++)
C[(i * size) + j] += A[(i * size) + k] * B[(k * size) + j];
} // kb
} // jb
} // ib
return;
}
void GPU_mat_mult_block(const MyData *const restrict A,
const MyData *const restrict B,
MyData *const restrict C,
const size_t size)
{
const size_t Nblocks = (size / _BLOCK_);
#pragma omp target \
teams distribute collapse(2) num_teams(Nblocks * Nblocks) \
thread_limit(BLOCKSIZE) \
is_device_ptr(A, B, C)
for (size_t ib=0 ; ib<Nblocks ; ib++)
{
for (size_t jb=0 ; jb<Nblocks ; jb++)
{
for (size_t kb=0 ; kb<Nblocks ; kb++)
{
#pragma omp parallel for collapse(2) num_threads(BLOCKSIZE)
for (size_t i=(ib * _BLOCK_) ; i<((ib + 1) * _BLOCK_) ; i++)
{
for (size_t j=(jb * _BLOCK_) ; j<((jb + 1) * _BLOCK_) ; j++)
{
MyData value = ((kb == 0) ? (MyData)0 : C[(i * size) + j]);
for (size_t k=(kb * _BLOCK_) ; k<((kb + 1) * _BLOCK_) ; k++)
value += A[(i * size) + k] * B[(k * size) + j];
C[(i * size) + j] = value;
} // j
} // i
} // kb
} // jb
} // ib
return;
}
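////////////////////////////////////////////////////////////////////////////////////////////////
// Note (back-of-the-envelope estimate, ignoring caches): in the naive block kernel above every
// operand is fetched from global memory, so each element of A and B is read once for every
// element of C it contributes to, i.e. N times. The next two kernels stage the _BLOCK_ x _BLOCK_
// tiles in per-team (shared) memory, so each element is loaded from global memory only once per
// tile, i.e. N/_BLOCK_ times, reducing global-memory traffic by roughly a factor of _BLOCK_.
////////////////////////////////////////////////////////////////////////////////////////////////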
void GPU_mat_mult_block_shared(const MyData *const restrict A,
const MyData *const restrict B,
MyData *const restrict C,
const size_t size)
{
const size_t Nblocks = (size / _BLOCK_);
// Team local copies of tiles
MyData Ablock[BLOCKSIZE];
MyData Bblock[BLOCKSIZE];
MyData Cblock[BLOCKSIZE];
#pragma omp target \
teams distribute collapse(2) num_teams(Nblocks * Nblocks) \
private(Ablock, Bblock, Cblock) \
thread_limit(BLOCKSIZE) \
is_device_ptr(A, B, C)
for (size_t ib=0 ; ib<Nblocks ; ib++)
{
for (size_t jb=0 ; jb<Nblocks ; jb++)
{
for (size_t kb=0 ; kb<Nblocks ; kb++)
{
#pragma omp parallel num_threads(BLOCKSIZE)
{
if (kb == 0) // Cblock initialization
{
#pragma omp for collapse(2) nowait
for (size_t i=0 ; i<_BLOCK_ ; i++)
for (size_t j=0 ; j<_BLOCK_ ; j++)
Cblock[(i * _BLOCK_) + j] = (MyData)0;
}
// copy the current block of A into per-team (shared) memory Ablock
#pragma omp for collapse(2) nowait // implicit barrier is removed
for (size_t i=(ib * _BLOCK_) ; i<((ib + 1) * _BLOCK_) ; i++)
for (size_t k=(kb * _BLOCK_) ; k<((kb + 1) * _BLOCK_) ; k++)
Ablock[(i % _BLOCK_) * _BLOCK_ + (k % _BLOCK_)] = A[(i * size) + k];
// copy the current block of B into per-team (shared) memory Bblock
#pragma omp for collapse(2) // implicit barrier to ensure that shared Ablock and Bblock can be accessed by all threads within the block
for (size_t j=(jb * _BLOCK_) ; j<((jb + 1) * _BLOCK_) ; j++)
for (size_t k=(kb * _BLOCK_) ; k<((kb + 1) * _BLOCK_) ; k++)
Bblock[(k % _BLOCK_) * _BLOCK_ + (j % _BLOCK_)] = B[(k * size) + j];
// matrix multiply within the block
#pragma omp for collapse(2) nowait
for (size_t i=(ib * _BLOCK_) ; i<((ib + 1) * _BLOCK_) ; i++)
for (size_t j=(jb * _BLOCK_) ; j<((jb + 1) * _BLOCK_) ; j++)
{
MyData value = (MyData)0;
for (size_t k=(kb * _BLOCK_) ; k<((kb + 1) * _BLOCK_) ; k++)
value += Ablock[(i % _BLOCK_) * _BLOCK_ + (k % _BLOCK_)] *
Bblock[(k % _BLOCK_) * _BLOCK_ + (j % _BLOCK_)];
Cblock[((i % _BLOCK_) * _BLOCK_) + (j % _BLOCK_)] += value;
if (kb == (Nblocks - 1))
C[(i * size) + j] = Cblock[((i % _BLOCK_) * _BLOCK_) + (j % _BLOCK_)];
}
} // omp parallel
} // kb
} // jb
} // ib
return;
}
void GPU_mat_mult_block_shared_no_loops(const MyData *const restrict A,
const MyData *const restrict B,
MyData *const restrict C,
const size_t size)
{
const size_t Nblocks = (size / _BLOCK_);
// Team local copies of tiles
MyData Ablock[BLOCKSIZE];
MyData Bblock[BLOCKSIZE];
MyData Cblock[BLOCKSIZE];
#pragma omp target \
teams num_teams(Nblocks * Nblocks) \
private(Ablock, Bblock, Cblock) \
thread_limit(BLOCKSIZE) \
is_device_ptr(A, B, C)
{
const size_t ib = omp_get_team_num() / Nblocks;
const size_t jb = omp_get_team_num() % Nblocks;
#pragma omp parallel num_threads(BLOCKSIZE)
{
#if !defined(NDEBUG)
/* ID within the team (CUDA block) */
const int localID = omp_get_thread_num();
/* Team ID (CUDA block ID) */
const int team = omp_get_team_num();
/* Team size (CUDA block size) */
const int nthr = omp_get_num_threads();
/* global thread index */
const int globalID = (localID + (team * nthr));
printf("\n\t globalID: %d - localID: %d - nthr: %d - team: %d\n",
globalID, localID, nthr, team);
#pragma omp barrier
#endif /* NDEBUG */
// local matrix indices mapped to each OpenMP GPU thread within its own team
const size_t i_local = omp_get_thread_num() / _BLOCK_;
const size_t j_local = omp_get_thread_num() % _BLOCK_;
// global matrix indices mapped to each OpenMP GPU thread
const size_t i_global = i_local + (ib * _BLOCK_);
const size_t j_global = j_local + (jb * _BLOCK_);
// Cblock initialization
Cblock[(i_local * _BLOCK_) + j_local] = (MyData)0;
// loop over blocks
for (size_t block=0 ; block<Nblocks ; block++)
{
const size_t j_A = j_local + (block * _BLOCK_);
const size_t i_B = i_local + (block * _BLOCK_);
// barrier: ensure the previous iteration has finished reading the tiles before they are overwritten
#pragma omp barrier
Ablock[(i_local * _BLOCK_) + j_local] = A[(i_global * size) + j_A];
Bblock[(i_local * _BLOCK_) + j_local] = B[(i_B * size) + j_global];
// barrier: ensure Ablock and Bblock are fully loaded before any thread starts computing
#pragma omp barrier
// perform the matrix multiplication within the block
MyData value = (MyData)0;
for (size_t k=0 ; k<_BLOCK_ ; k++)
value += Ablock[(i_local * _BLOCK_) + k] * Bblock[(k * _BLOCK_) + j_local];
// store the partial result in shared memory
Cblock[(i_local * _BLOCK_) + j_local] += value;
} // block loop
// store the result into global memory
C[(i_global * size) + j_global] = Cblock[(i_local * _BLOCK_) + j_local];
} // omp parallel
} // target teams
return;
}
void check(const MyData *const __restrict__ cpu_matrix,
const MyData *const __restrict__ gpu_matrix)
{
int flag = 0;
for (size_t i=0 ; i<SIZE ; i++)
flag = ((fabs(cpu_matrix[i] - gpu_matrix[i]) > FLT_EPSILON) ? 1 : flag);
if (!flag)
printf("\n\t Result OK");
else
printf("\n\t Result wrong");
#if !defined(NDEBUG)
for (size_t i=0 ; i<N ; i++)
{
printf("\n");
for (size_t j=0 ; j<N ; j++)
{
const size_t index = ((i * N) + j);
printf("\n\t cpu_matrix[%d] = %lg - gpu_matrix[%d] = %lg - diff = %lg",
index, cpu_matrix[index], index, gpu_matrix[index], fabs(cpu_matrix[index] - gpu_matrix[index]));
}
}
printf("\n");
#endif // NDEBUG
return;
}
int main()
{
double time;
MyData *buffer_cpu = (MyData *)calloc(4 * SIZE, sizeof(MyData));
assert(buffer_cpu != NULL);
// host reference matrix A
MyData *const restrict A = buffer_cpu;
MyData *const restrict B = A + SIZE;
MyData *const restrict C_CPU = B + SIZE;
MyData *const restrict C_GPU_HOST = C_CPU + SIZE;
for (size_t i=0 ; i<SIZE ; i++)
{
A[i] = drand48();
B[i] = drand48();
}
////////////////////////// CPU block algorithm (reference) //////////////////////////////////////
CPU_mat_mult_block(A, B, C_CPU, N);
//////////////////////////////////////////////////////////////////////////////////////////////////
// get the host id
const int dev_host = omp_get_initial_device();
// get the default device
const int dev_gpu = omp_get_default_device();
assert(dev_host != dev_gpu);
// alloc data to the GPU
MyData *const buffer_gpu = (MyData *)omp_target_alloc(3 * SIZE * sizeof(MyData), dev_gpu);
assert(buffer_gpu != NULL);
MyData *const restrict A_GPU = buffer_gpu;
MyData *const restrict B_GPU = A_GPU + SIZE;
MyData *const restrict C_GPU = B_GPU + SIZE;
// copy data to GPU
omp_target_memcpy(buffer_gpu, buffer_cpu, (2 * SIZE * sizeof(MyData)), 0, 0, dev_gpu, dev_host);
/////////////////////////// GPU naive block algorithm ////////////////////////////////////////////
time = 0.0;
GPU_mat_mult_block(A_GPU, B_GPU, C_GPU, N); // warm-up
for (unsigned short int loop=0 ; loop<LOOP ; loop++)
{
const double start = wall_time();
GPU_mat_mult_block(A_GPU, B_GPU, C_GPU, N);
time += (wall_time() - start);
}
// copy data device-to-host
omp_target_memcpy(C_GPU_HOST, C_GPU, (SIZE * sizeof(MyData)), 0, 0, dev_host, dev_gpu);
check(C_CPU, C_GPU_HOST);
memset(C_GPU_HOST, 0, (SIZE * sizeof(MyData)));
printf("\n\t GPU block time %lg [s]\n", (time / LOOP));
//////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////// GPU naive block shared memory algorithm //////////////////////////////
time = 0.0;
GPU_mat_mult_block_shared(A_GPU, B_GPU, C_GPU, N); // warm-up
for (unsigned short int loop=0 ; loop<LOOP ; loop++)
{
const double start = wall_time();
GPU_mat_mult_block_shared(A_GPU, B_GPU, C_GPU, N);
time += (wall_time() - start);
}
omp_target_memcpy(C_GPU_HOST, C_GPU, (SIZE * sizeof(MyData)), 0, 0, dev_host, dev_gpu);
check(C_CPU, C_GPU_HOST);
memset(C_GPU_HOST, 0, (SIZE * sizeof(MyData)));
printf("\n\t GPU block pteam memory time %lg [s]\n", (time / LOOP));
//////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////// GPU naive block shared memory no loops algorithm /////////////////////
time = 0.0;
GPU_mat_mult_block_shared_no_loops(A_GPU, B_GPU, C_GPU, N); // warm-up
for (unsigned short int loop=0 ; loop<LOOP ; loop++)
{
const double start = wall_time();
GPU_mat_mult_block_shared_no_loops(A_GPU, B_GPU, C_GPU, N);
time += (wall_time() - start);
}
omp_target_memcpy(C_GPU_HOST, C_GPU, (SIZE * sizeof(MyData)), 0, 0, dev_host, dev_gpu);
check(C_CPU, C_GPU_HOST);
printf("\n\t GPU block no loops pteam memory time %lg [s]\n", (time / LOOP));
//////////////////////////////////////////////////////////////////////////////////////////////////
// free CPU memory
free(buffer_cpu);
// free GPU memory
omp_target_free(buffer_gpu, dev_gpu);
printf("\n");
return EXIT_SUCCESS;
}
////////////////////////////////////////////////////////////////////////////////////////////////
// - Block matrix multiplication algorithm
//
// const size_t Nblocks = (N / Bsize);
//
// // loop over blocks of matrix C
// for (size_t ib=0 ; ib<Nblocks ; ib++)
// {
// for (size_t jb=0 ; jb<Nblocks ; jb++)
// {
//
// // loop over blocks of rows of A
// for (size_t kb=0 ; kb<Nblocks ; kb++)
// {
//
// // Matrix multiplication within a block
// for (size_t i=(ib * Bsize) ; i<((ib + 1) * Bsize) ; i++)
// for (size_t j=(jb * Bsize) ; j<((jb + 1) * Bsize) ; j++)
// for (size_t k=(kb * Bsize) ; k<((kb + 1) * Bsize) ; k++)
// C[(i * N) + j] += A[(i * N) + k] * B[(k * N) + j];
//
// } // kb
// } // jb
// } // ib
//
// - Exploit GPU shared-memory.
// - Use explicit target memory allocator and memory management OpenMP APIs
////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////
// Author: David Goz
// mail : david.goz@inaf.it
// date : 26.08.2024
// code tested using nvhpc
//
// - Compile the code:
// $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v memory_management.c -o memory_management_omp
// - Run the code:
// $ export OMP_TARGET_OFFLOAD=MANDATORY
// $ ./memory_management_omp
//
// N.B. team size in the range [32, 1024)
//////////////////////////////////////////////////////////////////////////////////////////////////
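////////////////////////////////////////////////////////////////////////////////////////////////
// - Minimal sketch of the unified-shared-memory pattern used in this version (illustrative
//   only; buf, bytes and n are placeholder names). Once unified shared memory is requested,
//   plain host pointers can be dereferenced inside target regions without map clauses,
//   is_device_ptr, or explicit omp_target_memcpy calls:
//
//   #pragma omp requires unified_shared_memory
//
//   double *buf = (double *)malloc(bytes);   // ordinary host allocation
//
//   #pragma omp target teams distribute parallel for
//   for (size_t i = 0; i < n; i++)
//     buf[i] *= 2.0;                         // host pointer used directly on the device
//
//   free(buf);
////////////////////////////////////////////////////////////////////////////////////////////////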
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <time.h>
#include <assert.h>
#include <omp.h>
#include <string.h>
#include <math.h>
#include <float.h>
#define N 1024
#define SIZE (N * N) // matrix size
typedef double MyData; // do not change
#define _BLOCK_ 16
#define BLOCKSIZE ((_BLOCK_) * (_BLOCK_)) /* 32 <= BLOCKSIZE < 1024 */
// sanity check
#if _BLOCK_*_BLOCK_ > 1024
#error _BLOCK_*_BLOCK_ cannot be larger than 1024
#endif
#if _BLOCK_ > N
#error _BLOCK_ > N
#endif
#define LOOP 100
#define NDEBUG
// request unified shared memory (declarative directive, file scope):
// host pointers can be dereferenced directly inside the target regions below
#pragma omp requires unified_shared_memory
double wall_time()
{
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts); // monotonic wall-clock time, not per-process CPU time
const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9;
return ret;
}
void CPU_mat_mult_block(const MyData *const restrict A,
const MyData *const restrict B,
MyData *const restrict C,
const size_t size)
{
const size_t Nblocks = (size / _BLOCK_);
// loop over blocks of matrix C
for (size_t ib=0 ; ib<Nblocks ; ib++)
{
for (size_t jb=0 ; jb<Nblocks ; jb++)
{
// loop over blocks of rows of A
for (size_t kb=0 ; kb<Nblocks ; kb++)
{
// Matrix multiplication within a block
for (size_t i=(ib * _BLOCK_) ; i<((ib + 1) * _BLOCK_) ; i++)
for (size_t j=(jb * _BLOCK_) ; j<((jb + 1) * _BLOCK_) ; j++)
for (size_t k=(kb * _BLOCK_) ; k<((kb + 1) * _BLOCK_) ; k++)
C[(i * size) + j] += A[(i * size) + k] * B[(k * size) + j];
} // kb
} // jb
} // ib
return;
}
void GPU_mat_mult_block(const MyData *const restrict A,
const MyData *const restrict B,
MyData *const restrict C,
const size_t size)
{
const size_t Nblocks = (size / _BLOCK_);
#pragma omp target \
teams distribute collapse(2) num_teams(Nblocks * Nblocks) \
thread_limit(BLOCKSIZE)
for (size_t ib=0 ; ib<Nblocks ; ib++)
{
for (size_t jb=0 ; jb<Nblocks ; jb++)
{
for (size_t kb=0 ; kb<Nblocks ; kb++)
{
#pragma omp parallel for collapse(2) num_threads(BLOCKSIZE)
for (size_t i=(ib * _BLOCK_) ; i<((ib + 1) * _BLOCK_) ; i++)
{
for (size_t j=(jb * _BLOCK_) ; j<((jb + 1) * _BLOCK_) ; j++)
{
MyData value = ((kb == 0) ? (MyData)0 : C[(i * size) + j]);
for (size_t k=(kb * _BLOCK_) ; k<((kb + 1) * _BLOCK_) ; k++)
value += A[(i * size) + k] * B[(k * size) + j];
C[(i * size) + j] = value;
} // j
} // i
} // kb
} // jb
} // ib
return;
}
void GPU_mat_mult_block_shared(const MyData *const restrict A,
const MyData *const restrict B,
MyData *const restrict C,
const size_t size)
{
const size_t Nblocks = (size / _BLOCK_);
// Team local copies of tiles
MyData Ablock[BLOCKSIZE];
MyData Bblock[BLOCKSIZE];
MyData Cblock[BLOCKSIZE];
#pragma omp target \
teams distribute collapse(2) num_teams(Nblocks * Nblocks) \
private(Ablock, Bblock, Cblock) \
thread_limit(BLOCKSIZE)
for (size_t ib=0 ; ib<Nblocks ; ib++)
{
for (size_t jb=0 ; jb<Nblocks ; jb++)
{
for (size_t kb=0 ; kb<Nblocks ; kb++)
{
#pragma omp parallel num_threads(BLOCKSIZE)
{
if (kb == 0) // Cblock initialization
{
#pragma omp for collapse(2) nowait
for (size_t i=0 ; i<_BLOCK_ ; i++)
for (size_t j=0 ; j<_BLOCK_ ; j++)
Cblock[(i * _BLOCK_) + j] = (MyData)0;
}
// copy the current block of A into per-team (shared) memory Ablock
#pragma omp for collapse(2) nowait // implicit barrier is removed
for (size_t i=(ib * _BLOCK_) ; i<((ib + 1) * _BLOCK_) ; i++)
for (size_t k=(kb * _BLOCK_) ; k<((kb + 1) * _BLOCK_) ; k++)
Ablock[(i % _BLOCK_) * _BLOCK_ + (k % _BLOCK_)] = A[(i * size) + k];
// copy the current block of B into per-team (shared) memory Bblock
#pragma omp for collapse(2) // implicit barrier to ensure that shared Ablock and Bblock can be accessed by all threads within the block
for (size_t j=(jb * _BLOCK_) ; j<((jb + 1) * _BLOCK_) ; j++)
for (size_t k=(kb * _BLOCK_) ; k<((kb + 1) * _BLOCK_) ; k++)
Bblock[(k % _BLOCK_) * _BLOCK_ + (j % _BLOCK_)] = B[(k * size) + j];
// matrix multiply within the block
#pragma omp for collapse(2) nowait
for (size_t i=(ib * _BLOCK_) ; i<((ib + 1) * _BLOCK_) ; i++)
for (size_t j=(jb * _BLOCK_) ; j<((jb + 1) * _BLOCK_) ; j++)
{
MyData value = (MyData)0;
for (size_t k=(kb * _BLOCK_) ; k<((kb + 1) * _BLOCK_) ; k++)
value += Ablock[(i % _BLOCK_) * _BLOCK_ + (k % _BLOCK_)] *
Bblock[(k % _BLOCK_) * _BLOCK_ + (j % _BLOCK_)];
Cblock[((i % _BLOCK_) * _BLOCK_) + (j % _BLOCK_)] += value;
if (kb == (Nblocks - 1))
C[(i * size) + j] = Cblock[((i % _BLOCK_) * _BLOCK_) + (j % _BLOCK_)];
}
} // omp parallel
} // kb
} // jb
} // ib
return;
}
void GPU_mat_mult_block_shared_no_loops(const MyData *const restrict A,
const MyData *const restrict B,
MyData *const restrict C,
const size_t size)
{
const size_t Nblocks = (size / _BLOCK_);
// Team local copies of tiles
MyData Ablock[BLOCKSIZE];
MyData Bblock[BLOCKSIZE];
MyData Cblock[BLOCKSIZE];
#pragma omp target \
teams num_teams(Nblocks * Nblocks) \
private(Ablock, Bblock, Cblock) \
thread_limit(BLOCKSIZE)
{
const size_t ib = omp_get_team_num() / Nblocks;
const size_t jb = omp_get_team_num() % Nblocks;
#pragma omp parallel num_threads(BLOCKSIZE)
{
// local matrix indices mapped to each OpenMP GPU thread within its own team
const size_t i_local = omp_get_thread_num() / _BLOCK_;
const size_t j_local = omp_get_thread_num() % _BLOCK_;
// global matrix indices mapped to each OpenMP GPU thread
const size_t i_global = i_local + (ib * _BLOCK_);
const size_t j_global = j_local + (jb * _BLOCK_);
// Cblock initialization
Cblock[(i_local * _BLOCK_) + j_local] = (MyData)0;
// loop over blocks
for (size_t block=0 ; block<Nblocks ; block++)
{
const size_t j_A = j_local + (block * _BLOCK_);
const size_t i_B = i_local + (block * _BLOCK_);
// barrier: ensure the previous iteration has finished reading the tiles before they are overwritten
#pragma omp barrier
Ablock[(i_local * _BLOCK_) + j_local] = A[(i_global * size) + j_A];
Bblock[(i_local * _BLOCK_) + j_local] = B[(i_B * size) + j_global];
// barrier: ensure Ablock and Bblock are fully loaded before any thread starts computing
#pragma omp barrier
// perform the matrix multiplication within the block
MyData value = (MyData)0;
for (size_t k=0 ; k<_BLOCK_ ; k++)
value += Ablock[(i_local * _BLOCK_) + k] * Bblock[(k * _BLOCK_) + j_local];
// store the partial result in shared memory
Cblock[(i_local * _BLOCK_) + j_local] += value;
} // block loop
// store the result into global memory
C[(i_global * size) + j_global] = Cblock[(i_local * _BLOCK_) + j_local];
} // omp parallel
} // target teams
return;
}
void check(const MyData *const __restrict__ cpu_matrix,
const MyData *const __restrict__ gpu_matrix)
{
int flag = 0;
for (size_t i=0 ; i<SIZE ; i++)
flag = ((fabs(cpu_matrix[i] - gpu_matrix[i]) > FLT_EPSILON) ? 1 : flag);
if (!flag)
printf("\n\t Result OK");
else
printf("\n\t Result wrong");
#if !defined(NDEBUG)
for (size_t i=0 ; i<N ; i++)
{
printf("\n");
for (size_t j=0 ; j<N ; j++)
{
const size_t index = ((i * N) + j);
printf("\n\t cpu_matrix[%d] = %lg - gpu_matrix[%d] = %lg - diff = %lg",
index, cpu_matrix[index], index, gpu_matrix[index], fabs(cpu_matrix[index] - gpu_matrix[index]));
}
}
printf("\n");
#endif // NDEBUG
return;
}
int main()
{
double time;
MyData *buffer_cpu = (MyData *)calloc(4 * SIZE, sizeof(MyData));
assert(buffer_cpu != NULL);
// host reference matrix A
MyData *const restrict A = buffer_cpu;
MyData *const restrict B = A + SIZE;
MyData *const restrict C_CPU = B + SIZE;
MyData *const restrict C_GPU = C_CPU + SIZE;
for (size_t i=0 ; i<SIZE ; i++)
{
A[i] = drand48();
B[i] = drand48();
}
////////////////////////// CPU block algorithm (reference) //////////////////////////////////////
CPU_mat_mult_block(A, B, C_CPU, N);
//////////////////////////////////////////////////////////////////////////////////////////////////
// get the host id
const int dev_host = omp_get_initial_device();
// get the default device
const int dev_gpu = omp_get_default_device();
assert(dev_host != dev_gpu);
/////////////////////////// GPU naive block algorithm ////////////////////////////////////////////
time = 0.0;
GPU_mat_mult_block(A, B, C_GPU, N); // warm-up
for (unsigned short int loop=0 ; loop<LOOP ; loop++)
{
const double start = wall_time();
GPU_mat_mult_block(A, B, C_GPU, N);
time += (wall_time() - start);
}
check(C_CPU, C_GPU);
memset(C_GPU, 0, (SIZE * sizeof(MyData)));
printf("\n\t GPU block time %lg [s]\n", (time / LOOP));
//////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////// GPU naive block shared memory algorithm //////////////////////////////
time = 0.0;
GPU_mat_mult_block_shared(A, B, C_GPU, N); // warm-up
for (unsigned short int loop=0 ; loop<LOOP ; loop++)
{
const double start = wall_time();
GPU_mat_mult_block_shared(A, B, C_GPU, N);
time += (wall_time() - start);
}
check(C_CPU, C_GPU);
memset(C_GPU, 0, (SIZE * sizeof(MyData)));
printf("\n\t GPU block pteam memory time %lg [s]\n", (time / LOOP));
//////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////// GPU naive block shared memory no loops algorithm /////////////////////
time = 0.0;
GPU_mat_mult_block_shared_no_loops(A, B, C_GPU, N); // warm-up
for (unsigned short int loop=0 ; loop<LOOP ; loop++)
{
const double start = wall_time();
GPU_mat_mult_block_shared_no_loops(A, B, C_GPU, N);
time += (wall_time() - start);
}
check(C_CPU, C_GPU);
printf("\n\t GPU block no loops pteam memory time %lg [s]\n", (time / LOOP));
//////////////////////////////////////////////////////////////////////////////////////////////////
// free CPU memory
free(buffer_cpu);
printf("\n");
return EXIT_SUCCESS;
}