Skip to content
Snippets Groups Projects
Commit 7e6f37d5 authored by David Goz's avatar David Goz :sleeping:
Browse files

omp block matrix

parent 86b043e3
No related branches found
No related tags found
No related merge requests found
////////////////////////////////////////////////////////////////////////////////////////////////
// - Naive matrix multiplication algorithm
// for (size_t i=0 ; i<N ; i++)
// for (size_t j=0 ; j<N ; j++)
// for (size_t k=0 ; k<N ; k++)
// C[(i * N) + j] += A[(i * N) + k] * B[(k * N) + j];
//
////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////
// Author: David Goz
// mail : david.goz@inaf.it
// date : 31.07.2024
// code tested using nvhpc
//
// - Compile the code:
// $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v classwork.c -o classwork_omp
// - Run the code:
// $ ./classwork_omp
//////////////////////////////////////////////////////////////////////////////////////////////////
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <time.h>
#include <assert.h>
#include <omp.h>
#include <string.h>
#define N 512
#define SIZE (N * N) // matrix size
typedef double MyData; // do not change
#define LOOP 100
#define NDEBUG
//
// Return a timestamp in seconds, used to time the kernels.
// BUG FIX: the original queried CLOCK_PROCESS_CPUTIME_ID, i.e. CPU time
// consumed by this process, NOT elapsed wall-clock time — misleading when
// timing offloaded (GPU) work, where the host may idle or busy-wait.
// timespec_get(TIME_UTC) is standard C11 and returns calendar (wall) time.
//
double wall_time()
{
  struct timespec ts;

  timespec_get(&ts, TIME_UTC);

  return (double)ts.tv_sec + (double)ts.tv_nsec * 1.0e-9;
}
/*
 * Host reference implementation: classic triple-loop (row, col, k)
 * matrix product. Accumulates into C with '+=', so the caller must
 * supply C pre-initialized (main() zeroes it via calloc).
 */
void CPU_mat_mult(const MyData *const restrict A,
                  const MyData *const restrict B,
                  MyData *const restrict C,
                  const size_t size)
{
  for (size_t row = 0; row < size; row++)
    {
      /* hoist the row bases of A and C out of the inner loops */
      const MyData *const A_row = &A[row * size];
      MyData       *const C_row = &C[row * size];

      for (size_t col = 0; col < size; col++)
        for (size_t k = 0; k < size; k++)
          C_row[col] += A_row[k] * B[(k * size) + col];
    }
}
void GPU_mat_mult(const MyData *const restrict A,
const MyData *const restrict B,
MyData *const restrict C,
const size_t size)
{
#pragma omp target
{
#pragma omp teams distribute num_teams(size)
for (size_t i=0 ; i<size ; i++)
{
#pragma omp parallel for num_threads(size)
for (size_t j=0 ; j<size ; j++)
{
MyData value = (MyData)0;
for (size_t k=0 ; k<size ; k++)
value += (A[(i * size) + k] * B[(k * size) + j]);
C[(i * size) + j] = value;
} // omp thread
} // omp teams
} // omp target
return;
}
//
// Same product as GPU_mat_mult, but the i/j loops are replaced by the
// OpenMP ids: the team number selects the row, the thread number the
// column.
// NOTE(review): correctness requires the runtime to grant exactly 'size'
// teams and 'size' threads per team — fewer would leave elements of C
// unwritten, more would index out of bounds. TODO confirm on the target
// device.
//
void GPU_mat_mult_no_loops(const MyData *const restrict A,
const MyData *const restrict B,
MyData *const restrict C,
const size_t size)
{
#pragma omp target
{
#pragma omp teams num_teams(size)
{
// flattened offset of this team's row (misnamed: it is a row
// offset, not a team size)
const size_t team_size = (size * omp_get_team_num());
#pragma omp parallel firstprivate(team_size) num_threads(size)
{
// column handled by this thread
const size_t tid = omp_get_thread_num();
MyData value = (MyData)0;
// dot product of row 'team' of A with column 'tid' of B
for (size_t k=0 ; k<size ; k++)
value += (A[team_size + k] * B[(k * size) + tid]);
C[team_size + tid] = value;
} // omp threads
} // omp teams
} // omp target
return;
}
//
// Element-wise comparison of the CPU reference result against the GPU
// result over all SIZE entries; prints a verdict.
// BUG FIX: the original assigned 'flag' afresh on every iteration, so
// only the LAST element was actually compared — a mismatch anywhere else
// went undetected. Mismatches are now OR-accumulated over all elements,
// and 'flag' is initialized.
// NOTE: exact floating-point equality is intentional — every kernel here
// performs the k-accumulation in the same order, so results are expected
// to be bit-identical.
//
void check(const MyData *const __restrict__ cpu_matrix,
           const MyData *const __restrict__ gpu_matrix)
{
  int flag = 0;

  for (size_t i=0 ; i<SIZE ; i++)
    flag |= (cpu_matrix[i] != gpu_matrix[i]);

  if (!flag)
    printf("\n\t Result OK");
  else
    printf("\n\t Result wrong");

  return;
}
//
// Driver: builds random N x N matrices A and B, computes the reference
// product on the CPU, then times the two GPU kernels (LOOP iterations
// each) and checks their results against the CPU reference.
//
int main()
{
  double time;

  // single contiguous allocation for the four N x N matrices;
  // calloc zeroes it, which CPU_mat_mult relies on (it accumulates into C)
  MyData *buffer = (MyData *)calloc(4 * SIZE, sizeof(MyData));
  assert(buffer != NULL);

  MyData *const restrict A     = buffer;
  MyData *const restrict B     = A + SIZE;
  MyData *const restrict C_CPU = B + SIZE;
  MyData *const restrict C_GPU = C_CPU + SIZE;

  for (size_t i=0 ; i<SIZE ; i++)
    {
      A[i] = drand48();
      B[i] = drand48();
    }

  ////////////////////////// CPU naive algorithm //////////////////////////
  CPU_mat_mult(A, B, C_CPU, N);

  // copy A and B to the GPU; C_GPU only needs device storage
#pragma omp target enter data map(to: A[0:SIZE], B[0:SIZE]) map(alloc: C_GPU[0:SIZE])

  ////////////////////////// GPU naive algorithm //////////////////////////
  time = 0.0;
  for (unsigned short int loop=0 ; loop<LOOP ; loop++)
    {
      const double start = wall_time();
      GPU_mat_mult(A, B, C_GPU, N);
      time += (wall_time() - start);
    }
#pragma omp target update from(C_GPU[0:SIZE])
  check(C_CPU, C_GPU);
  printf("\n\t GPU naive time %lg [s]\n", (time / LOOP));

  ////////////////////// GPU naive no loops algorithm /////////////////////
  time = 0.0;
  for (unsigned short int loop=0 ; loop<LOOP ; loop++)
    {
      const double start = wall_time();
      GPU_mat_mult_no_loops(A, B, C_GPU, N);
      time += (wall_time() - start);
    }
#pragma omp target update from(C_GPU[0:SIZE])
  check(C_CPU, C_GPU);
  printf("\n\t GPU naive no loops time %lg [s]\n", (time / LOOP));

  // Release the device mappings BEFORE freeing the host buffer: the map
  // clauses identify mappings through the host pointer values.
  // BUG FIX: the original (a) deleted C_CPU, which was never mapped,
  // instead of C_GPU, and (b) ran this exit-data AFTER free(buffer),
  // using freed host pointers.
#pragma omp target exit data map(delete: A[0:SIZE], B[0:SIZE], C_GPU[0:SIZE])

  // free host memory
  free(buffer);

  printf("\n");

  return EXIT_SUCCESS;
}
////////////////////////////////////////////////////////////////////////////////////////////////
// - Block matrix multiplication algorithm
//
// const size_t Nblocks = (N / Bsize);
//
// // loop over blocks of matrix C
// for (size_t ib=0 ; ib<Nblocks ; ib++)
// {
// for (size_t jb=0 ; jb<Nblocks ; jb++)
// {
//
// // loop over blocks of rows of A
// for (size_t kb=0 ; kb<Nblocks ; kb++)
//
//
//
// for (size_t i=0 ; i<N ; i++)
// for (size_t j=0 ; j<N ; j++)
// for (size_t k=0 ; k<N ; k++)
// C[(i * N) + j] += A[(i * N) + k] * B[(k * N) + j];
//
// } // jb
// } // ib
// - Exploit shared-memory.
////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////
// Author: David Goz
// mail : david.goz@inaf.it
// date : 31.07.2024
// code tested using nvhpc
//
// - Compile the code:
// $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v classwork.c -o classwork_omp
// - Run the code:
// $ ./classwork_omp
//////////////////////////////////////////////////////////////////////////////////////////////////
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <time.h>
#include <assert.h>
#include <omp.h>
#include <string.h>
#define N 512
#define SIZE (N * N) // matrix size
typedef double MyData; // do not change
#define BLOCKSIZE 32 // number of threads per block
// sanity check
#if BLOCKSIZE > 1024
#error BLOCKSIZE cannot be larger than 1024
#endif
#define LOOP 100
#define NDEBUG
//
// Return a timestamp in seconds, used to time the kernels.
// BUG FIX: the original queried CLOCK_PROCESS_CPUTIME_ID, i.e. CPU time
// consumed by this process, NOT elapsed wall-clock time — misleading when
// timing offloaded (GPU) work, where the host may idle or busy-wait.
// timespec_get(TIME_UTC) is standard C11 and returns calendar (wall) time.
//
double wall_time()
{
  struct timespec ts;

  timespec_get(&ts, TIME_UTC);

  return (double)ts.tv_sec + (double)ts.tv_nsec * 1.0e-9;
}
/*
 * Host reference implementation of the matrix product, classic ijk
 * loop order. Uses '+=' so results accumulate onto whatever C already
 * holds (main() provides a zeroed C via calloc).
 */
void CPU_mat_mult(const MyData *const restrict A,
                  const MyData *const restrict B,
                  MyData *const restrict C,
                  const size_t size)
{
  for (size_t i = 0; i < size; ++i)
    {
      for (size_t j = 0; j < size; ++j)
        {
          /* destination index is invariant over the k loop */
          const size_t c_idx = (i * size) + j;

          for (size_t k = 0; k < size; ++k)
            C[c_idx] += A[(i * size) + k] * B[(k * size) + j];
        }
    }
}
//
// Naive matrix multiply offloaded with OpenMP target directives.
// Row index i is distributed across teams; column index j across the
// threads of each team; each thread accumulates its dot product in a
// private 'value' and writes its C element exactly once.
// A, B and C must already be mapped on the device (see the
// 'target enter data' in main()).
// NOTE(review): num_teams(size)/num_threads(size) request 'size' (=N)
// teams and threads per team; a device runtime may clamp these requests —
// TODO confirm full coverage on the target GPU.
//
void GPU_mat_mult(const MyData *const restrict A,
const MyData *const restrict B,
MyData *const restrict C,
const size_t size)
{
#pragma omp target
{
// one team per row of C
#pragma omp teams distribute num_teams(size)
for (size_t i=0 ; i<size ; i++)
{
// one thread per column within the row
#pragma omp parallel for num_threads(size)
for (size_t j=0 ; j<size ; j++)
{
MyData value = (MyData)0;
for (size_t k=0 ; k<size ; k++)
value += (A[(i * size) + k] * B[(k * size) + j]);
C[(i * size) + j] = value;
} // omp thread
} // omp teams
} // omp target
return;
}
//
// Same product as GPU_mat_mult, but the i/j loops are replaced by the
// OpenMP ids: the team number selects the row, the thread number the
// column.
// NOTE(review): correctness requires the runtime to grant exactly 'size'
// teams and 'size' threads per team — fewer would leave elements of C
// unwritten, more would index out of bounds. TODO confirm on the target
// device.
//
void GPU_mat_mult_no_loops(const MyData *const restrict A,
const MyData *const restrict B,
MyData *const restrict C,
const size_t size)
{
#pragma omp target
{
#pragma omp teams num_teams(size)
{
// flattened offset of this team's row (misnamed: it is a row
// offset, not a team size)
const size_t team_size = (size * omp_get_team_num());
#pragma omp parallel firstprivate(team_size) num_threads(size)
{
// column handled by this thread
const size_t tid = omp_get_thread_num();
MyData value = (MyData)0;
// dot product of row 'team' of A with column 'tid' of B
for (size_t k=0 ; k<size ; k++)
value += (A[team_size + k] * B[(k * size) + tid]);
C[team_size + tid] = value;
} // omp threads
} // omp teams
} // omp target
return;
}
//
// Element-wise comparison of the CPU reference result against the GPU
// result over all SIZE entries; prints a verdict.
// BUG FIX: the original assigned 'flag' afresh on every iteration, so
// only the LAST element was actually compared — a mismatch anywhere else
// went undetected. Mismatches are now OR-accumulated over all elements,
// and 'flag' is initialized.
// NOTE: exact floating-point equality is intentional — every kernel here
// performs the k-accumulation in the same order, so results are expected
// to be bit-identical.
//
void check(const MyData *const __restrict__ cpu_matrix,
           const MyData *const __restrict__ gpu_matrix)
{
  int flag = 0;

  for (size_t i=0 ; i<SIZE ; i++)
    flag |= (cpu_matrix[i] != gpu_matrix[i]);

  if (!flag)
    printf("\n\t Result OK");
  else
    printf("\n\t Result wrong");

  return;
}
//
// Driver: builds random N x N matrices A and B, computes the reference
// product on the CPU, then times the two GPU kernels (LOOP iterations
// each) and checks their results against the CPU reference.
//
int main()
{
  double time;

  // single contiguous allocation for the four N x N matrices;
  // calloc zeroes it, which CPU_mat_mult relies on (it accumulates into C)
  MyData *buffer = (MyData *)calloc(4 * SIZE, sizeof(MyData));
  assert(buffer != NULL);

  MyData *const restrict A     = buffer;
  MyData *const restrict B     = A + SIZE;
  MyData *const restrict C_CPU = B + SIZE;
  MyData *const restrict C_GPU = C_CPU + SIZE;

  for (size_t i=0 ; i<SIZE ; i++)
    {
      A[i] = drand48();
      B[i] = drand48();
    }

  ////////////////////////// CPU naive algorithm //////////////////////////
  CPU_mat_mult(A, B, C_CPU, N);

  // copy A and B to the GPU; C_GPU only needs device storage
#pragma omp target enter data map(to: A[0:SIZE], B[0:SIZE]) map(alloc: C_GPU[0:SIZE])

  ////////////////////////// GPU naive algorithm //////////////////////////
  time = 0.0;
  for (unsigned short int loop=0 ; loop<LOOP ; loop++)
    {
      const double start = wall_time();
      GPU_mat_mult(A, B, C_GPU, N);
      time += (wall_time() - start);
    }
#pragma omp target update from(C_GPU[0:SIZE])
  check(C_CPU, C_GPU);
  printf("\n\t GPU naive time %lg [s]\n", (time / LOOP));

  ////////////////////// GPU naive no loops algorithm /////////////////////
  time = 0.0;
  for (unsigned short int loop=0 ; loop<LOOP ; loop++)
    {
      const double start = wall_time();
      GPU_mat_mult_no_loops(A, B, C_GPU, N);
      time += (wall_time() - start);
    }
#pragma omp target update from(C_GPU[0:SIZE])
  check(C_CPU, C_GPU);
  printf("\n\t GPU naive no loops time %lg [s]\n", (time / LOOP));

  // Release the device mappings BEFORE freeing the host buffer: the map
  // clauses identify mappings through the host pointer values.
  // BUG FIX: the original (a) deleted C_CPU, which was never mapped,
  // instead of C_GPU, and (b) ran this exit-data AFTER free(buffer),
  // using freed host pointers.
#pragma omp target exit data map(delete: A[0:SIZE], B[0:SIZE], C_GPU[0:SIZE])

  // free host memory
  free(buffer);

  printf("\n");

  return EXIT_SUCCESS;
}
......@@ -17,7 +17,8 @@
#include <assert.h>
#include <omp.h>
#define N 8
#define SIZE 8
#define SIZE_2 (SIZE / 2)
typedef double MyData;
typedef struct my_span
......@@ -29,26 +30,20 @@ typedef struct my_span
void allocate( span *my_struct,
const size_t size)
{
span tmp;
/* allocate the buffer on the host memory */
tmp.ptr = (MyData *)malloc(size * sizeof(MyData));
assert(tmp.ptr != NULL);
tmp.N = size;
/* declare how the object 'span' has to be mapped on the device */
#pragma omp declare mapper(span tmp) map(from: tmp, tmp.ptr[0: tmp.N])
my_struct->ptr = tmp.ptr;
my_struct->N = tmp.N;
my_struct->ptr = (MyData *)calloc(size, sizeof(MyData));
assert(my_struct->ptr != NULL);
my_struct->N = size;
return;
}
void print(const span *const A)
void print(const span *const A,
const char *const string)
{
printf("\n");
for (size_t i=0 ; i<A->N ; i++)
printf("\n\t array[%i] = %lg", i, A->ptr[i]);
for (int i=0 ; i<A->N ; i++)
printf("\n\t %s[%d] = %lg", string, i, A->ptr[i]);
printf("\n\n");
return;
......@@ -56,27 +51,41 @@ void print(const span *const A)
int main()
{
span A, B;
span A, B, C;
allocate(&A, N);
allocate(&B, N);
allocate(&A, SIZE);
allocate(&B, SIZE);
allocate(&C, SIZE);
/* declare how the object 'span' has to be mapped on the device */
#pragma omp declare mapper(all : span S) map(to: S) map(from: S.ptr[0 : S.N])
#pragma omp declare mapper(left : span S) map(to: S) map(from: S.ptr[0 : S.N/2])
#pragma omp declare mapper(right: span S) map(to: S) map(from: S.ptr[S.N/2: S.N/2])
/* init on the GPU */
#pragma omp target
{
for (size_t i=0 ; i<N ; i++)
#pragma omp target map(mapper(all): A) map(mapper(left): B) map(mapper(right): C)
{
#pragma omp loop
for (size_t i=0 ; i<SIZE ; i++)
A.ptr[i] = (MyData)(i);
B.ptr[i] = (MyData)(2 * i);
}
#pragma omp loop
for (size_t ii=0 ; ii<SIZE_2 ; ii++)
B.ptr[ii] = (MyData)(ii);
#pragma omp loop
for (size_t iii=SIZE_2 ; iii<SIZE ; iii++)
C.ptr[iii] = (MyData)(iii);
}
print(&A);
print(&B);
print(&A, "A");
print(&B, "B");
print(&C, "C");
/* free the host's memory */
free(A.ptr);
free(B.ptr);
free(C.ptr);
return 0;
}
File added
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment