Skip to content
Snippets Groups Projects
Commit ee264684 authored by David Goz's avatar David Goz
Browse files

miscellaneous update

parent 726c5ecb
No related branches found
No related tags found
No related merge requests found
......@@ -257,9 +257,9 @@ int main()
const dim3 block = {(BLOCK * BLOCK), 1, 1};
/////////////////////////// GPU naive block algorithm ////////////////////////////////////////
time = 0.0;
GPU_mat_mult_block<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N); // warm-up
cudaDeviceSynchronize();
time = 0.0;
for (unsigned short int loop=0 ; loop<LOOP ; loop++)
{
const double start = wall_time();
......@@ -274,9 +274,9 @@ int main()
/////////////////////////////////////////////////////////////////////////////////////
/////////////////////////// GPU block shared memory algorithm ///////////////////////
time = 0.0;
GPU_mat_mult_block_shared<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N); // warm-up
cudaDeviceSynchronize();
time = 0.0;
for (unsigned short int loop=0 ; loop<LOOP ; loop++)
{
const double start = wall_time();
......
////////////////////////////////////////////////////////////////////////////////////////////////////
//
// Using CUDA data in OpenMP
//
// Author: David Goz
// mail : david.goz@inaf.it
// date : 03.09.2024
// code tested using nvhpc
//
// - Compile the code:
// - using nvc
// $ nvc -O3 -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v
// hybrid_cuda_omp.c -o hybrid_cuda_omp -lm -lcudart -lcublas
// - using clang
// $ clang -O3 -v -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda
// hybrid_cuda_omp.c -o hybrid_cuda_omp -lm -lcudart -lcublas
//
// - Run the code:
// $ export OMP_TARGET_OFFLOAD=mandatory
// $ ./hybrid_cuda_omp
////////////////////////////////////////////////////////////////////////////////////////////////////
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <assert.h>
#include <float.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <omp.h>
// Matrix dimension: every matrix is N x N and is addressed as [(i * N) + j]
#define N 2048
// Number of elements of one N x N matrix
#define SIZE ((N) * (N))
// Scalars of the DGEMM operation C = ALPHA * A * B + BETA * C
#define ALPHA 1.0
#define BETA 0.0
// First index of _time[][]: where the measured work ran
#define HOST 0
#define DEV 1
// Number of repetitions of each measurement (reported timings are divided by LOOP)
#define LOOP 10
// Second index of _time[][]: which phase was measured
#define INIT 0
#define KERNEL 1
#define DATA 2
// Floating-point type of the matrix elements
typedef double MyData;
// Accumulated timings in seconds, indexed as _time[HOST|DEV][INIT|KERNEL|DATA]
static double _time[2][3];
// Thread counts reported by the host routines: thr[0] for InitHost, thr[1] for HostDgemm
static int thr[2];
/*
 * Return the CPU time consumed so far by the whole process, in seconds,
 * as read from the CLOCK_PROCESS_CPUTIME_ID clock.
 */
double process_time()
{
  struct timespec now;
  clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &now);

  /* whole seconds plus the nanosecond remainder scaled to seconds */
  const double seconds     = (double)(now.tv_sec);
  const double nanoseconds = (double)(now.tv_nsec);

  return (seconds + (nanoseconds * 1.0e-9));
}
/*
 * Return the CPU time consumed so far by the calling thread, in seconds,
 * as read from the CLOCK_THREAD_CPUTIME_ID clock.
 */
double thread_time()
{
  struct timespec now;
  clock_gettime(CLOCK_THREAD_CPUTIME_ID, &now);

  /* whole seconds plus the nanosecond remainder scaled to seconds */
  const double seconds     = (double)(now.tv_sec);
  const double nanoseconds = (double)(now.tv_nsec);

  return (seconds + (nanoseconds * 1.0e-9));
}
/*
 * Initialize the host matrices with an OpenMP thread team:
 * every element of A is set to 1.0, of B to 2.0 and of C to 0.0.
 *
 *   A, B, C : N x N matrices, written as [(i * N) + j]
 *   thr     : receives the number of threads of the team
 *
 * The CPU time spent by the master thread between the two barriers-delimited
 * points is accumulated into _time[HOST][INIT].
 */
void InitHost(MyData *const restrict A,
              MyData *const restrict B,
              MyData *const restrict C,
              int    *const restrict thr)
{
  /* declared outside the parallel region: shared, written by the master only */
  double t_begin;

  #pragma omp parallel
  {
    #pragma omp barrier

    #pragma omp master
    {
      *thr    = omp_get_num_threads();
      t_begin = thread_time();
    }

    /* nobody starts the loop before the master has sampled the clock */
    #pragma omp barrier

    #pragma omp for collapse(2)
    for (int row=0 ; row<N ; row++)
      for (int col=0 ; col<N ; col++)
        {
          const int idx = (row * N) + col;

          A[idx] = 1.0;
          B[idx] = 2.0;
          C[idx] = 0.0;
        }

    /* reached after the implicit barrier closing the worksharing loop */
    #pragma omp master
    {
      const double elapsed = (thread_time() - t_begin);
      _time[HOST][INIT] += elapsed;
    }
  } // omp parallel

  return;
}
/*
 * Initialize the matrices inside an OpenMP target region:
 * every element of A is set to 1.0, of B to 2.0 and of C to 0.0.
 *
 * The directive carries no map clause; main() associates device storage
 * with these host pointers (omp_target_associate_ptr) before calling.
 * The CPU time of the calling thread spent around the target region is
 * accumulated into _time[DEV][INIT].
 */
void InitDev(MyData *const restrict A,
             MyData *const restrict B,
             MyData *const restrict C)
{
  const double t_begin = thread_time();

  #pragma omp target teams loop collapse(2)
  for (int row=0 ; row<N ; row++)
    for (int col=0 ; col<N ; col++)
      {
        const int idx = (row * N) + col;

        A[idx] = 1.0;
        B[idx] = 2.0;
        C[idx] = 0.0;
      }

  const double elapsed = (thread_time() - t_begin);
  _time[DEV][INIT] += elapsed;

  return;
}
/*
 * Naive host matrix multiplication with an OpenMP thread team:
 *
 *     C = alpha * A * B + beta * C
 *
 *   A, B, C     : N x N matrices addressed as [(i * N) + j]
 *   alpha, beta : scalars of the operation
 *   thr         : receives the number of threads of the team
 *
 * The CPU time spent by the master thread is accumulated into
 * _time[HOST][KERNEL].
 */
void HostDgemm(MyData *const restrict A,
               MyData *const restrict B,
               MyData *const restrict C,
               const MyData           alpha,
               const MyData           beta,
               int    *const restrict thr)
{
  /* declared outside the parallel region: shared, written by the master only */
  double t_begin;

  #pragma omp parallel
  {
    #pragma omp barrier

    #pragma omp master
    {
      *thr    = omp_get_num_threads();
      t_begin = thread_time();
    }

    /* nobody starts computing before the master has sampled the clock */
    #pragma omp barrier

    #pragma omp for collapse(2)
    for (int i=0 ; i<N ; i++)
      for (int j=0 ; j<N ; j++)
        {
          const MyData *const a_row = &A[i * N];
          const int           c_idx = (i * N) + j;

          /* dot product of row i of A and column j of B */
          MyData acc = 0.0;
          for (int k=0 ; k<N ; k++)
            acc += a_row[k] * B[(k * N) + j];

          C[c_idx] = (alpha * acc) + (beta * C[c_idx]);
        }

    /* reached after the implicit barrier closing the worksharing loop */
    #pragma omp master
    {
      const double elapsed = (thread_time() - t_begin);
      _time[HOST][KERNEL] += elapsed;
    }
  } // omp parallel

  return;
}
/*
 * Compare the host and the device results element by element and print
 * "Result OK" when all SIZE elements agree within an absolute tolerance
 * of FLT_EPSILON, "Result wrong" otherwise.
 *
 *   host_array : SIZE elements computed on the host
 *   dev_array  : SIZE elements fetched from the device
 */
void check(MyData *const restrict host_array,
           MyData *const restrict dev_array)
{
  int mismatch = 0;

  for (size_t i=0 ; i<(size_t)SIZE ; i++)
    {
      const double diff = fabs(host_array[i] - dev_array[i]);

      /* written as a negated '<=' so that a NaN difference (every ordered
         comparison with NaN is false) is reported as a mismatch */
      if (!(diff <= FLT_EPSILON))
        {
          mismatch = 1;
          break; /* one mismatch is enough to decide */
        }
    }

  if (!mismatch)
    printf("\n\t Result OK \n");
  else
    printf("\n\t Result wrong \n");

  return;
}
int main()
{
// Host allocation
MyData *buffer = (MyData *)malloc(4 * SIZE * sizeof(MyData));
assert(buffer != NULL);
MyData *const restrict A = buffer;
MyData *const restrict B = A + SIZE;
MyData *const restrict C_CPU = B + SIZE;
MyData *const restrict C_GPU = C_CPU + SIZE;
// Spawning 2 host threads
#pragma omp parallel num_threads(2)
{
// Evaluate the Dgemm on the host
#pragma omp single nowait
{
// allowing nested parallelism
omp_set_max_active_levels(2);
for (int loop=0 ; loop<LOOP ; loop++)
{
InitHost(A, B, C_CPU, &thr[0]);
HostDgemm(A, B, C_CPU, ALPHA, BETA, &thr[1]);
}
} // omp single
#pragma omp single nowait
{
// Initialize cuBLAS library
cublasHandle_t handle;
cublasCreate(&handle);
// Allocate A, B, C on the device using CUDA API
MyData *d_buffer = NULL;
cudaMalloc((void **)&d_buffer, (3 * SIZE * sizeof(MyData)));
assert(d_buffer != NULL);
MyData *const restrict d_A = d_buffer;
MyData *const restrict d_B = d_A + SIZE;
MyData *const restrict d_C = d_B + SIZE;
// get the default device
const int dev = omp_get_default_device();
// Associate external device pointers
omp_target_associate_ptr(A, d_A, (SIZE * sizeof(MyData)), 0, dev);
omp_target_associate_ptr(B, d_B, (SIZE * sizeof(MyData)), 0, dev);
omp_target_associate_ptr(C_GPU, d_C, (SIZE * sizeof(MyData)), 0, dev);
for (int loop=0 ; loop<LOOP ; loop++)
{
// Init device with blocking omp target directive
InitDev(A, B, C_GPU);
// Apply DGEMM using device pointers directly
const MyData alpha = ALPHA;
const MyData beta = BETA;
double start = thread_time();
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N,
&alpha,
d_A, N,
d_B, N,
&beta,
d_C, N);
// CUDA synchronization point
cudaDeviceSynchronize();
_time[DEV][KERNEL] += (thread_time() - start);
// Fetch data from the device and deallocate
start = thread_time();
cudaMemcpy(C_GPU, d_C, (SIZE * sizeof(MyData)), cudaMemcpyDeviceToHost);
_time[DEV][DATA] += (thread_time() - start);
} // LOOP
// release pointer association
omp_target_disassociate_ptr(A, dev);
omp_target_disassociate_ptr(B, dev);
omp_target_disassociate_ptr(C_GPU, dev);
// deallocate device's memory
cudaFree(d_buffer);
cublasDestroy(handle);
} // omp single
} // synchronization point
check(C_CPU, C_GPU);
free(buffer);
printf("\n\t Matrix size: %d x %d\n", N, N);
printf("\n\t Host execution time:");
printf("\n\t\t Init : %lg [s] - threads: %d", _time[HOST][INIT]/LOOP, thr[0]);
printf("\n\t\t Dgemm : %lg [s] - threads: %d\n", _time[HOST][KERNEL]/LOOP, thr[1]);
printf("\n\t Device execution time:");
printf("\n\t\t Init : %lg [s]", _time[DEV][INIT]/LOOP);
printf("\n\t\t Dgemm : %lg [s]", _time[DEV][KERNEL]/LOOP);
printf("\n\t\t Fetch data: %lg [s]\n\n", _time[DEV][DATA]/LOOP);
return 0;
}
////////////////////////////////////////////////////////////////////////////////////////////////////
//
// Passing OpenMP data to cuBlas.
// Passing OpenMP data to foreign runtime (cuBLAS library).
//
// Author: David Goz
// mail : david.goz@inaf.it
......@@ -8,11 +8,16 @@
// code tested using nvhpc
//
// - Compile the code:
// $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v
// hybrid_cuda_omp.c -o hybrid_cuda_omp -lm -lcudart -lcublas
// - using nvc
// $ nvc -O3 -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v
// hybrid_omp_cuda.c -o hybrid_omp_cuda -lm -lcudart -lcublas
// - using clang
// $ clang -O3 -v -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda
// hybrid_omp_cuda.c -o hybrid_omp_cuda -lm -lcudart -lcublas
//
// - Run the code:
// $ export OMP_TARGET_OFFLOAD=mandatory
// $ ./hybrid_cuda_omp
// $ ./hybrid_omp_cuda
////////////////////////////////////////////////////////////////////////////////////////////////////
#include <cuda_runtime.h>
......@@ -22,19 +27,61 @@
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <omp.h>
#define N 512
#define N 2048
#define SIZE ((N) * (N))
#define ALPHA 1.0
#define BETA 0.0
#define HOST 0
#define DEV 1
#define LOOP 10
#define INIT 0
#define KERNEL 1
#define DATA 2
typedef double MyData;
static double _time[2][3];
static int thr[2];
double process_time()
{
struct timespec ts;
clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9;
return ret;
}
double thread_time()
{
struct timespec ts;
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9;
return ret;
}
void InitHost(MyData *const restrict A,
MyData *const restrict B,
MyData *const restrict C)
MyData *const restrict C,
int *const restrict thr)
{
double start;
#pragma omp parallel
{
#pragma omp barrier
#pragma omp master
{
//#pragma omp parallel for collapse(2)
*thr = omp_get_num_threads();
start = thread_time();
}
#pragma omp barrier
#pragma omp for collapse(2)
for (int i=0 ; i<N ; i++)
for (int j=0 ; j<N ; j++)
{
......@@ -42,12 +89,22 @@ void InitHost(MyData *const restrict A,
B[(i * N) + j] = 2.0;
C[(i * N) + j] = 0.0;
}
#pragma omp master
{
_time[HOST][INIT] += (thread_time() - start);
}
} // omp parallel
return;
}
void InitDev(MyData *const restrict A,
MyData *const restrict B,
MyData *const restrict C)
{
const double start = thread_time();
#pragma omp target teams loop collapse(2)
for (int i=0 ; i<N ; i++)
for (int j=0 ; j<N ; j++)
......@@ -57,6 +114,8 @@ void InitDev(MyData *const restrict A,
C[(i * N) + j] = 0.0;
}
_time[DEV][INIT] += (thread_time() - start);
return;
}
......@@ -64,12 +123,25 @@ void HostDgemm(MyData *const restrict A,
MyData *const restrict B,
MyData *const restrict C,
const MyData alpha,
const MyData beta)
const MyData beta,
int *const restrict thr)
{
// C = alpha * A * B + beta * C;
double start;
// naive calculation
// #pragma omp parallel for collapse(2)
#pragma omp parallel
{
#pragma omp barrier
#pragma omp master
{
*thr = omp_get_num_threads();
start = thread_time();
}
#pragma omp barrier
#pragma omp for collapse(2)
for (int i=0 ; i<N ; i++)
for (int j=0 ; j<N ; j++)
{
......@@ -80,6 +152,12 @@ void HostDgemm(MyData *const restrict A,
C[(i * N) + j] = (alpha * sum) + (beta * C[(i * N) + j]);
}
#pragma omp master
{
_time[HOST][KERNEL] += (thread_time() - start);
}
} // omp parallel
return;
}
......@@ -91,9 +169,9 @@ void check(MyData *const restrict host_array,
flag = ((fabs(host_array[i] - dev_array[i]) > FLT_EPSILON) ? 1 : flag);
if (!flag)
printf("\n\t Result OK");
printf("\n\t Result OK \n");
else
printf("\n\t Result wrong");
printf("\n\t Result wrong \n");
return;
}
......@@ -114,8 +192,14 @@ int main()
// Evaluate the Dgemm on the host
#pragma omp single nowait
{
InitHost(A, B, CC);
HostDgemm(A, B, CC, ALPHA, BETA);
// allowing nested parallelism
omp_set_max_active_levels(2);
for (int loop=0 ; loop<LOOP ; loop++)
{
InitHost(A, B, CC, &thr[0]);
HostDgemm(A, B, CC, ALPHA, BETA, &thr[1]);
}
} // omp single
#pragma omp single nowait
......@@ -127,25 +211,39 @@ int main()
// Allocate A, B, C on the device
#pragma omp target enter data map(alloc: A[0:SIZE], B[0:SIZE], C[0:SIZE])
for (int loop=0 ; loop<LOOP ; loop++)
{
// Init device with blocking omp target directive
InitDev(A, B, C);
// Define a target data region where A, B, and C pointers
// refer to device's address space
#pragma omp target data use_device_addr(A, B, C)
#pragma omp target data use_device_ptr(A, B, C)
{
MyData const alpha = ALPHA;
MyData const beta = BETA;
const MyData alpha = ALPHA;
const MyData beta = BETA;
const double start = thread_time();
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N,
&alpha, A, N, B, N, &beta, C, N);
&alpha,
A, N,
B, N,
&beta,
C, N);
// CUDA synchronization point
cudaDeviceSynchronize();
_time[DEV][KERNEL] += (thread_time() - start);
}
// Fetch data from the device and deallocate
#pragma omp target exit data map(from: C[0:SIZE]) map(delete: A[0:SIZE], B[0:SIZE])
const double start = thread_time();
#pragma omp target update from (C[0:SIZE])
_time[DEV][DATA] += (thread_time() - start);
} // LOOP
// deallocate device's memory
#pragma omp target exit data map(delete: A[0:SIZE], B[0:SIZE], C[0:SIZE])
cublasDestroy(handle);
} // omp single
......@@ -155,5 +253,16 @@ int main()
free(buffer);
printf("\n\t Matrix size: %d x %d\n", N, N);
printf("\n\t Host execution time:");
printf("\n\t\t Init : %lg [s] - threads: %d", _time[HOST][INIT]/LOOP, thr[0]);
printf("\n\t\t Dgemm : %lg [s] - threads: %d\n", _time[HOST][KERNEL]/LOOP, thr[1]);
printf("\n\t Device execution time:");
printf("\n\t\t Init : %lg [s]", _time[DEV][INIT]/LOOP);
printf("\n\t\t Dgemm : %lg [s]", _time[DEV][KERNEL]/LOOP);
printf("\n\t\t Fetch data: %lg [s]\n\n", _time[DEV][DATA]/LOOP);
return 0;
}
////////////////////////////////////////////////////////////////////////////////////////////////////
//
// Passing OpenMP-allocated device data to a foreign runtime (hand-written CUDA kernel).
//
// Author: David Goz
// mail : david.goz@inaf.it
// date : 03.09.2024
// code tested using nvhpc
//
// - Compile the code:
// - using nvc++
// $ nvc++ -O3 -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v
// hybrid_omp_cuda.cu -o hybrid_omp_cuda -lm && ./hybrid_omp_cuda
//
// - using clang
// $ clang -O3 -v -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda
// hybrid_omp_cuda.c -o hybrid_omp_cuda -lm
//
// - Run the code:
// $ export OMP_TARGET_OFFLOAD=mandatory
// $ ./hybrid_omp_cuda
////////////////////////////////////////////////////////////////////////////////////////////////////
#include <cuda.h>
#include <math.h>
#include <assert.h>
#include <float.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <omp.h>
// Matrix dimension: every matrix is N x N and is addressed as [(i * N) + j]
#define N 1024
// Number of elements of one N x N matrix
#define SIZE ((N) * (N))
// First index of _time[][]: where the measured work ran
#define HOST 0
#define DEV 1
// Number of repetitions of each measurement (reported timings are divided by LOOP)
#define LOOP 10
// Second index of _time[][]: which phase was measured
#define INIT 0
#define KERNEL 1
#define DATA 2
// Number of CUDA threads per block used to launch DevMM
#define BLOCKSIZE 1024
// Floating-point type of the matrix elements
typedef double MyData;
// Accumulated timings in seconds, indexed as _time[HOST|DEV][INIT|KERNEL|DATA]
static double _time[2][3];
// Thread counts reported by the host routines: thr[0] for InitHost, thr[1] for HostMM
static int thr[2];
// Return the CPU time consumed so far by the whole process, in seconds
// (CLOCK_PROCESS_CPUTIME_ID clock).
double process_time()
{
  struct timespec stamp;
  clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &stamp);

  // whole seconds plus the nanosecond remainder scaled to seconds
  const double whole    = (double)(stamp.tv_sec);
  const double fraction = (double)(stamp.tv_nsec) * 1.0e-9;

  return (whole + fraction);
}
// Return the CPU time consumed so far by the calling thread, in seconds
// (CLOCK_THREAD_CPUTIME_ID clock).
double thread_time()
{
  struct timespec stamp;
  clock_gettime(CLOCK_THREAD_CPUTIME_ID, &stamp);

  // whole seconds plus the nanosecond remainder scaled to seconds
  const double whole    = (double)(stamp.tv_sec);
  const double fraction = (double)(stamp.tv_nsec) * 1.0e-9;

  return (whole + fraction);
}
// Initialize the host matrices with an OpenMP thread team:
// every element of A is set to 1.0, of B to 1/pi and of C to 0.0.
//
//   A, B, C : N x N matrices, written as [(i * N) + j]
//   thr     : receives the number of threads of the team
//
// The CPU time spent by the master thread is accumulated into
// _time[HOST][INIT].
void InitHost(MyData *const restrict A,
              MyData *const restrict B,
              MyData *const restrict C,
              int    *const restrict thr)
{
  // declared outside the parallel region: shared, written by the master only
  double t_begin;

  #pragma omp parallel
  {
    #pragma omp barrier

    #pragma omp master
    {
      *thr    = omp_get_num_threads();
      t_begin = thread_time();
    }

    // nobody starts the loop before the master has sampled the clock
    #pragma omp barrier

    #pragma omp for collapse(2)
    for (int row=0 ; row<N ; row++)
      for (int col=0 ; col<N ; col++)
        {
          const int idx = (row * N) + col;

          A[idx] = 1.0;
          B[idx] = (1.0 / M_PI);
          C[idx] = 0.0;
        }

    // reached after the implicit barrier closing the worksharing loop
    #pragma omp master
    {
      const double elapsed = (thread_time() - t_begin);
      _time[HOST][INIT] += elapsed;
    }
  } // omp parallel

  return;
}
// Initialize the matrices inside an OpenMP target region:
// every element of A is set to 1.0, of B to 1/pi and of C to 0.0.
//
// A, B and C are declared as device pointers (is_device_ptr), so they are
// used inside the target region as they are, without any mapping.
// The CPU time of the calling thread spent around the target region is
// accumulated into _time[DEV][INIT].
void InitDev(MyData *const restrict A,
             MyData *const restrict B,
             MyData *const restrict C)
{
  const double t_begin = thread_time();

  #pragma omp target teams loop collapse(2) is_device_ptr(A, B, C)
  for (int row=0 ; row<N ; row++)
    for (int col=0 ; col<N ; col++)
      {
        const int idx = (row * N) + col;

        A[idx] = 1.0;
        B[idx] = (1.0 / M_PI);
        C[idx] = 0.0;
      }

  const double elapsed = (thread_time() - t_begin);
  _time[DEV][INIT] += elapsed;

  return;
}
// Naive host matrix multiplication with an OpenMP thread team:
//
//     C = A * B
//
//   A, B, C : N x N matrices addressed as [(i * N) + j]
//   thr     : receives the number of threads of the team
//
// The CPU time spent by the master thread is accumulated into
// _time[HOST][KERNEL].
void HostMM(MyData *const restrict A,
            MyData *const restrict B,
            MyData *const restrict C,
            int    *const restrict thr)
{
  // declared outside the parallel region: shared, written by the master only
  double t_begin;

  #pragma omp parallel
  {
    #pragma omp barrier

    #pragma omp master
    {
      *thr    = omp_get_num_threads();
      t_begin = thread_time();
    }

    // nobody starts computing before the master has sampled the clock
    #pragma omp barrier

    #pragma omp for collapse(2)
    for (int i=0 ; i<N ; i++)
      for (int j=0 ; j<N ; j++)
        {
          const MyData *const a_row = &A[i * N];

          // dot product of row i of A and column j of B
          MyData acc = 0.0;
          for (int k=0 ; k<N ; k++)
            acc += a_row[k] * B[(k * N) + j];

          C[(i * N) + j] = acc;
        }

    // reached after the implicit barrier closing the worksharing loop
    #pragma omp master
    {
      const double elapsed = (thread_time() - t_begin);
      _time[HOST][KERNEL] += elapsed;
    }
  } // omp parallel

  return;
}
// Naive device matrix multiplication C = A * B for n x n matrices
// addressed as [(i * n) + j].
//
// Launch layout: 1D grid of 1D blocks; each thread computes one element
// of C, so at least (n * n) threads must be launched in total. Threads
// beyond (n * n) exit immediately. No shared memory is used.
//
//   A, B : input matrices (device memory)
//   C    : output matrix  (device memory)
//   n    : matrix dimension
//
// The dimension is taken from the argument 'n' everywhere (bound, index
// decomposition and inner loop), so the kernel is correct for any n and
// not only when n equals the compile-time macro N.
__global__ void DevMM(MyData *const restrict A,
                      MyData *const restrict B,
                      MyData *const restrict C,
                      const int n)
{
  // nothing to compute (also avoids a division by zero below)
  if (n <= 0)
    return;

  const int size     = (n * n);
  const int globalID = threadIdx.x + (blockIdx.x * blockDim.x);

  // the grid rarely divides the data evenly: guard the tail
  if (globalID >= size)
    return;

  // row and column of the element of C owned by this thread
  const int i = (globalID / n);
  const int j = (globalID % n);

  MyData sum = 0.0;
  for (int k=0 ; k<n ; k++)
    sum += (A[(i * n) + k] * B[(k * n) + j]);

  C[(i * n) + j] = sum;

  return;
}
// Compare the host and the device results element by element and print
// "Result OK" when all SIZE elements agree within an absolute tolerance
// of FLT_EPSILON, "Result wrong" otherwise.
//
//   host_array : SIZE elements computed on the host
//   dev_array  : SIZE elements fetched from the device
void check(MyData *const restrict host_array,
           MyData *const restrict dev_array)
{
  int mismatch = 0;

  for (size_t i=0 ; i<(size_t)SIZE ; i++)
    {
      const double diff = fabs(host_array[i] - dev_array[i]);

      // written as a negated '<=' so that a NaN difference (every ordered
      // comparison with NaN is false) is reported as a mismatch
      if (!(diff <= FLT_EPSILON))
        {
          mismatch = 1;
          break; // one mismatch is enough to decide
        }
    }

  if (!mismatch)
    printf("\n\t Result OK \n");
  else
    printf("\n\t Result wrong \n");

  return;
}
// Run the same matrix product concurrently on the host (OpenMP threads)
// and on the device (CUDA kernel operating on memory allocated with
// omp_target_alloc), then compare the two results and print the averaged
// timings.
int main()
{
  // Host allocation: one buffer holding the host and the device results
  MyData *h_buffer = (MyData *)malloc(2 * SIZE * sizeof(MyData));
  assert(h_buffer != NULL);
  MyData *const restrict C_HOST = h_buffer;
  MyData *const restrict C_DEV  = C_HOST + SIZE;

  // Allow nested parallelism. The routine is called from the sequential
  // part of the program: inside an explicit parallel region its effect
  // is implementation defined.
  omp_set_max_active_levels(2);

  // Spawning 2 host threads
  #pragma omp parallel num_threads(2)
  {
    // Evaluate the matrix product on the host
    #pragma omp single nowait
    {
      MyData *tmp = (MyData *)malloc(2 * SIZE * sizeof(MyData));
      assert(tmp != NULL);
      MyData *const restrict A = tmp;
      MyData *const restrict B = A + SIZE;

      for (int loop=0 ; loop<LOOP ; loop++)
        {
          InitHost(A, B, C_HOST, &thr[0]);
          HostMM(A, B, C_HOST, &thr[1]);
        }

      free(tmp);
    } // omp single

    // Evaluate the matrix product on the device
    #pragma omp single nowait
    {
      // Device allocation
      const int dev  = omp_get_default_device();
      const int host = omp_get_initial_device();
      MyData *d_buffer = (MyData *)omp_target_alloc((3 * SIZE * sizeof(MyData)), dev);
      assert(d_buffer != NULL);
      MyData *const restrict d_A = d_buffer;
      MyData *const restrict d_B = d_A + SIZE;
      MyData *const restrict d_C = d_B + SIZE;

      // one CUDA thread per element of C (ceil-division for the grid size)
      const dim3 nblock = {((SIZE + BLOCKSIZE - 1) / BLOCKSIZE), 1, 1};
      const dim3 block  = {BLOCKSIZE, 1, 1};

      for (int loop=0 ; loop<LOOP ; loop++)
        {
          // Init device with blocking omp target directive
          InitDev(d_A, d_B, d_C);

          double start = thread_time();
          DevMM<<< nblock, block >>>(d_A, d_B, d_C, N);
          // a kernel launch returns no status: launch-configuration errors
          // are only visible through cudaGetLastError()
          const cudaError_t launch_rc = cudaGetLastError();
          // CUDA synchronization point (also reports execution errors)
          const cudaError_t sync_rc = cudaDeviceSynchronize();
          _time[DEV][KERNEL] += (thread_time() - start);
          // Return codes are stored first and asserted afterwards, so the
          // calls are still executed when assertions are compiled out.
          assert(launch_rc == cudaSuccess);
          assert(sync_rc == cudaSuccess);
          (void)launch_rc;
          (void)sync_rc;

          // Fetch the result from the device
          // (omp_target_memcpy returns zero on success)
          start = thread_time();
          const int copy_rc = omp_target_memcpy(C_DEV, d_C, (SIZE * sizeof(MyData)), 0, 0, host, dev);
          _time[DEV][DATA] += (thread_time() - start);
          assert(copy_rc == 0);
          (void)copy_rc;
        } // LOOP

      // deallocate device's memory
      omp_target_free(d_buffer, dev);
    } // omp single
  } // synchronization point

  check(C_HOST, C_DEV);
  free(h_buffer);

  printf("\n\t Matrix size: %d x %d\n", N, N);
  printf("\n\t Host execution time:");
  printf("\n\t\t Init : %lg [s] - threads: %d", _time[HOST][INIT]/LOOP, thr[0]);
  printf("\n\t\t Dgemm : %lg [s] - threads: %d\n", _time[HOST][KERNEL]/LOOP, thr[1]);
  printf("\n\t Device execution time:");
  printf("\n\t\t Init : %lg [s]", _time[DEV][INIT]/LOOP);
  printf("\n\t\t Dgemm : %lg [s]", _time[DEV][KERNEL]/LOOP);
  printf("\n\t\t Fetch data: %lg [s]\n\n", _time[DEV][DATA]/LOOP);

  return 0;
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// Author: David Goz
// mail : david.goz@inaf.it
// date : 03.09.2024
// code tested using nvhpc
//
// - Compile the code:
// - nvc complains about the '#pragma omp target has_device_addr'
// - clang works
//////////////////////////////////////////////////////////////////////////////////////////////////
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
// Scalar with static storage that main() maps to the device and updates there
static int answer = 42;
// Number of elements of the integer array 'A' allocated in main()
#define SIZE 3
/*
 * Show how the address of a mapped scalar ('answer') and of a mapped array
 * ('A') changes between the host and the device data environment, using
 * the use_device_addr / use_device_ptr clauses of 'target data' and the
 * has_device_addr / is_device_ptr clauses of 'target'.
 */
int main()
{
  int *A = (int *)malloc(SIZE * sizeof(int));
  assert(A != NULL);
  memset(A, 0, (SIZE * sizeof(int)));

  /* '%p' expects a 'void *' argument: pointers are cast explicitly */
  printf("\n\t Host address of 'answer' [value: %d] is: %p", answer, (void *)&answer);
  printf("\n\t Host address of 'A' is: %p", (void *)A);

  // Map 'answer' and 'A' to the device (allocation only, no copy)
  #pragma omp target data map(alloc: answer, A[0:SIZE])
  {
    // Copy the host values into the device storage
    #pragma omp target update to(answer, A[0:SIZE])

    // Update 'answer' and 'A' on the device
    #pragma omp target map(answer)
    {
      answer += 1;
      for (int i=0 ; i<SIZE ; i++)
        A[i] += (i + 1);
    }

    #pragma omp target update from(answer, A[0:SIZE])

    printf("\n\t Address of 'answer' [value: %d] in target data region is: %p",
           answer, (void *)&answer);
    printf("\n\t Address of 'A' in target data region is: %p", (void *)A);
    printf("\n\t Value of A:");
    for (int i=0 ; i<SIZE ; i++)
      printf("\n\t\t A[%d] = %d", i, A[i]);
    printf("\n");

    // Inside this region '&answer' and 'A' refer to device addresses
    #pragma omp target data use_device_addr(answer) use_device_ptr(A)
    {
      // 'A' now holds a device pointer: it is declared as such, instead of
      // relying on the implicit pointer mapping, whose result for a pointer
      // that is not a mapped host address differs between OpenMP versions.
      #pragma omp target has_device_addr(answer) is_device_ptr(A)
      {
        answer += 1;
        for (int i=0 ; i<SIZE ; i++)
          A[i] += (i + 1);
      }

      printf("\n\t Device address of 'answer' is: %p", (void *)&answer);
      printf("\n\t Device address of 'A' is: %p", (void *)A);
    }

    #pragma omp target update from(answer, A[0:SIZE])

    printf("\n\t Last value of 'answer' is: %d", answer);
    printf("\n\t Last value of 'A' is:");
    for (int i=0 ; i<SIZE ; i++)
      printf("\n\t\t A[%d] = %d", i, A[i]);
    printf("\n\n");
  } // omp target data

  free(A);

  return 0;
}
////////////////////////////////////////////////////////////////////////////////////////////////////
//
// Author: David Goz
// mail : david.goz@inaf.it
// date : 31.07.2024
// code tested using nvhpc
//
// - Compile the code:
// $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v structure_routines.c -o structure_routines_omp
// - Run the code:
// $ export OMP_TARGET_OFFLOAD=mandatory
// $ ./structure_routines_omp
////////////////////////////////////////////////////////////////////////////////////////////////////
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <omp.h>
/* Number of elements of each of the two arrays held by a span */
#define SIZE 8
/* Half of SIZE (not referenced by the code visible in this file) */
#define SIZE_2 (SIZE / 2)
/* Floating-point type of the array elements */
typedef double MyData;
/* Pair of equally sized arrays together with their length */
typedef struct my_span
{
/* number of elements of A and of B */
size_t N;
/* first array */
MyData *A;
/* second array */
MyData *B;
} span;
/* Global instance that is also created on the device (declare target):
   main() fills its device copy with pointers into device memory */
span d_S;
#pragma omp declare target(d_S)
/*
 * Allocate the two zero-initialized host arrays of a span and fill them:
 * A[i] = 3 * i and B[i] = 2 * i for i in [0, size).
 *
 *   my_struct : span to set up (A, B and N are overwritten)
 *   size      : number of elements of each array
 *
 * The program is aborted by assert() if either allocation fails.
 */
void allocate(span *my_struct,
              const size_t size)
{
  /* allocate the buffers on the host memory */
  MyData *const first  = (MyData *)calloc(size, sizeof(MyData));
  MyData *const second = (MyData *)calloc(size, sizeof(MyData));

  my_struct->A = first;
  my_struct->B = second;
  assert((first != NULL) && (second != NULL));

  my_struct->N = size;

  size_t k = 0;
  while (k < size)
    {
      first[k]  = (MyData)(3 * k);
      second[k] = (MyData)(2 * k);
      k++;
    }

  return;
}
/*
 * Print every element of the two arrays of a span and report the outcome:
 * "Result OK" when all the elements of A and B are zero, "Result wrong"
 * otherwise.
 *
 *   ptr    : span to print and check
 *   string : label printed in front of every element
 */
void print(const span *const ptr,
           const char *const string)
{
  int flag = 0;

  printf("\n");
  /* the index has the same type as ptr->N (size_t), which avoids a
     signed/unsigned comparison and an int overflow for large N */
  for (size_t i=0 ; i<ptr->N ; i++)
    {
      printf("\n\t %s[%zu] = %lg", string, i, ptr->A[i]);
      printf("\n\t %s[%zu] = %lg", string, i, ptr->B[i]);
      /* any non-zero element marks the result as wrong */
      flag = (((ptr->A[i] != 0) || (ptr->B[i] != 0)) ? 1 : flag);
    }
  printf("\n");

  if (flag)
    printf("\n\t Result wrong \n\n");
  else
    printf("\n\t Result OK \n\n");

  return;
}
int main()
{
/* host allocation */
span h_S;
allocate(&h_S, SIZE);
/* allocating GPU memory using OMP routines */
const int dev = omp_get_default_device();
const int host = omp_get_initial_device();
MyData *d_buffer = (double *)omp_target_alloc(2 * SIZE * sizeof(MyData), dev);
assert(d_buffer != NULL);
/* set the pointers within the GPU */
#pragma omp target is_device_ptr(d_buffer) device(dev)
{
d_S.N = SIZE;
d_S.A = d_buffer;
d_S.B = d_buffer + SIZE;
}
/* copy data to the GPU */
omp_target_memcpy(d_buffer, h_S.A, (SIZE * sizeof(MyData)), 0, 0, dev, host);
omp_target_memcpy(d_buffer, h_S.B, (SIZE * sizeof(MyData)), (SIZE * sizeof(MyData)), 0, dev, host);
/* perform the calculation on the GPU */
#pragma omp target device(dev)
{
#pragma omp loop
for (size_t i=0 ; i<d_S.N ; i++)
{
d_S.A[i] -= (MyData)(3 * i);
d_S.B[i] -= (MyData)(2 * i);
}
}
/* copy data from the GPU */
omp_target_memcpy(h_S.A, d_buffer, (SIZE * sizeof(MyData)),
0, 0, host, dev);
omp_target_memcpy(h_S.B, d_buffer, (SIZE * sizeof(MyData)),
0, (SIZE * sizeof(MyData)), host, dev);
/* check the data */
print(&h_S, "d_S");
/* free GPU memory */
omp_target_free(d_buffer, dev);
/* free host memory */
free(h_S.A);
free(h_S.B);
return 0;
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment