Commit 668da3ac authored by Giuliano Taffoni

New implementation compatible with gcc and GPU info

parent fa112e57
Branch: openmp
@@ -4,29 +4,31 @@ CXX = g++-10
MPICC = mpicc
MPIC++ = mpiCC
OPTIMIZE = -O3
#-ffast-math -fopt-info-all-omp -fcf-protection=none -fno-stack-protector -foffload=nvptx-none
GSL_INCL = -I/home/taffoni/sw/include #-I/opt/cluster/openmpi/3.1.3/gnu/8.2.0/include
GSL_LIBS = -L/home/taffoni/sw/lib #-L/opt/cluster/openmpi/3.1.3/gnu/8.2.0/lib -lmpi
GSL_INCL = -I/home/taffoni/sw/include
GSL_LIBS = -L/home/taffoni/sw/lib
FFTW_INCL= -I/home/taffoni/sw/include
FFTW_LIB= -L/home/taffoni/sw/lib -lfftw3_mpi -lfftw3
#-L/opt/cluster/openmpi/3.1.3/gnu/8.2.0/lib -lmpi
MPI_LIB =
MPI_INCL=
#-I/opt/cluster/openmpi/3.1.3/gnu/8.2.0/include
MPI_INCL= -I/home/taffoni/sw/Linux_x86_64/21.5/comm_libs/mpi/include
HDF5_INCL =
HDF5_LIB =
OMP = -fopenmp
OMP = -mp=multicore,gpu -Mprof -cuda
#OMP = -fopenmp
NVCC = nvcc
NVFLAGS = -arch=sm_70 -Xcompiler -std=c++11
NVLIB = -L/home/taffoni/sw/Linux_x86_64/21.5/cuda/11.3/lib64/ -lcudart -lcuda
CFLAGS += $(OPTIMIZE)
CFLAGS += -I.
CFLAGS += -I/home/taffoni/sw/Linux_x86_64/21.5/comm_libs/mpi/include
CFLAGS += $(FFTW_INCL) $(GSL_INCL)
CFLAGS += $(FFTW_LIB) -lm
CFLAGS += -I. $(FFTW_INCL) $(GSL_INCL) $(MPI_INCL)
OPTIMIZE = $(OMP) -O3
# OMP GPU SPECIFIC FLAGS
#OPTIMIZE += -Wno-unused-result -foffload=-lm -ffast-math
#OPTIMIZE += -fcf-protection=none -fno-stack-protector -foffload=nvptx-none -foffload=-misa=sm_35
#-ffast-math -fopt-info-all-omp -foffload=-misa=sm_35 -fcf-protection=none -fno-stack-protector -foffload=nvptx-none
@@ -5,16 +5,16 @@ MPICC = mpicc
MPIC++ = mpiCC
CFLAGS += -O3 -mcpu=native
CFLAGS += -I.
FFTW_INCL= -I/home/taffoni/sw/include
FFTW_LIB= -L/home/taffoni/sw/lib
LIBS = $(FFTW_LIB) -lfftw3_mpi -lfftw3 -lm
NVCC = nvcc
NVFLAGS = -arch=sm_70 -Xcompiler -mno-float128 -std=c++11
NVLIB = -L/cineca/prod/opt/compilers/cuda/10.1/none/lib64/ -lcudart -lcuda
OMP= -fopenmp
CFLAGS += -O3 -mtune=native
CFLAGS += -I. $(FFTW_INCL) $(GSL_INCL) $(MPI_INCL)
OPTIMIZE = $(OMP) -O3 -mtune=native
@@ -14,21 +14,22 @@ include Build/Makefile.systype
endif
LIBS = $(FFTW_LIB) -lfftw3 -lm -lcudart -lcuda
# create MPI code
OPT += -DUSE_MPI
#OPT += -DACCOMP
# use FFTW (it can be switched on ONLY if MPI is active)
ifeq (USE_MPI,$(findstring USE_MPI,$(OPT)))
OPT += -DUSE_FFTW
LIBS = $(FFTW_LIB) -lfftw3_mpi -lfftw3 -lm
endif
#OPT += -DNVIDIA
# perform one-sided communication (suggested) instead of reduce (only if MPI is active)
OPT += -DONE_SIDE
# write the full 3D cube of gridded visibilities and its FFT transform
#OPT += -DWRITE_DATA
OPT += -DWRITE_DATA
# write the final image
OPT += -DWRITE_IMAGE
# perform w-stacking phase correction
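For orientation, a minimal sketch (not code from this repository; the function accumulate_slab and the names local_slab, global_grid, slab_len and target_rank are illustrative) of the two accumulation strategies the ONE_SIDE switch chooses between: a collective MPI_Reduce of each rank's slab versus one-sided MPI_Accumulate into an exposed window, the same MPI_Win machinery the code opens later with MPI_Win_create/MPI_Win_fence.

    #include <mpi.h>

    /* Hypothetical sketch: add a local slab of gridded data into the global grid,
       either collectively or one-sidedly depending on -DONE_SIDE. */
    void accumulate_slab(double *local_slab, double *global_grid, int slab_len,
                         int target_rank, MPI_Win win)
    {
    #ifdef ONE_SIDE
        /* One-sided: each rank adds its contribution directly into the memory
           window exposed by target_rank; fences delimit the access epoch. */
        MPI_Win_fence(0, win);
        MPI_Accumulate(local_slab, slab_len, MPI_DOUBLE,
                       target_rank, 0, slab_len, MPI_DOUBLE, MPI_SUM, win);
        MPI_Win_fence(0, win);
    #else
        /* Collective: sum all slabs into global_grid on target_rank. */
        MPI_Reduce(local_slab, global_grid, slab_len, MPI_DOUBLE,
                   MPI_SUM, target_rank, MPI_COMM_WORLD);
    #endif
    }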
@@ -46,27 +47,36 @@ phase_correction.c: phase_correction.cu
ifeq (USE_MPI,$(findstring USE_MPI,$(OPT)))
%.o: %.c $(DEPS)
$(MPICC) -c -o $@ $< $(CFLAGS) $(OPT)
$(MPICC) $(OPTIMIZE) $(OPT) -c -o $@ $< $(CFLAGS)
else
%.o: %.c $(DEPS)
$(CC) $(OMP) -c -o $@ $< $(CFLAGS) $(OPT)
$(CC) $(OPTIMIZE) $(OPT) -c -o $@ $< $(CFLAGS)
endif
serial: $(COBJ)
$(CC) -o w-stackingCfftw_serial $(CFLAGS) $^ -lm
$(CC) $(OPTIMIZE) $(OPT) -o w-stackingCfftw_serial $^ $(LIBS)
serial_omp: phase_correction.c
$(CC) $(OPTIMIZE) $(OPT) -o w-stackingOMP_serial w-stacking-fftw.c w-stacking_omp.c $(CFLAGS) $(LIBS)
simple_mpi: phase_correction.c
$(MPICC) $(OPTIMIZE) $(OPT) -o w-stackingMPI_simple w-stacking_omp.c w-stacking-fftw.c phase_correction.c $(CFLAGS) $(LIBS)
mpi_omp: phase_correction.c
$(MPICC) $(OPTIMIZE) $(OPT) -o w-stackingMPI_omp w-stacking_omp.c w-stacking-fftw.c phase_correction.c $(CFLAGS) $(LIBS)
serial_cuda:
$(NVCC) $(NVFLAGS) -c w-stacking.cu phase_correction.cu $(NVLIB)
$(CC) $(CFLAGS) $(OPT) -c w-stacking-fftw.c
$(CXX) $(CFLAGS) $(OPT) -o w-stackingfftw_serial w-stacking-fftw.o w-stacking.o phase_correction.o $(NVLIB) -lm
$(CC) $(OPTIMIZE) $(OPT) -c w-stacking-fftw.c $(CFLAGS) $(LIBS)
$(CXX) $(OPTIMIZE) $(OPT) -o w-stackingfftw_serial w-stacking-fftw.o w-stacking.o phase_correction.o $(CFLAGS) $(NVLIB) -lm
mpi: $(COBJ)
$(MPICC) -o w-stackingCfftw $^ $(CFLAGS)
$(MPICC) $(OPTIMIZE) -o w-stackingCfftw $^ $(CFLAGS) $(LIBS)
mpi_cuda:
$(NVCC) $(NVFLAGS) -c w-stacking.cu phase_correction.cu $(NVLIB)
$(MPICC) $(CFLAGS) $(OPT) -c w-stacking-fftw.c
$(MPIC++) $(OPT) -o w-stackingfftw w-stacking-fftw.o w-stacking.o phase_correction.o $(NVLIB) $(CFLAGS)
$(MPICC) $(OPTIMIZE) $(OPT) -c w-stacking-fftw.c $(CFLAGS) $(LIBS)
$(MPIC++) $(OPTIMIZE) $(OPT) -o w-stackingfftw w-stacking-fftw.o w-stacking.o phase_correction.o $(NVLIB) $(CFLAGS) $(LIBS)
clean:
rm *.o
@@ -18,7 +18,11 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in
for (int iw=0; iw<num_w_planes; iw++)
{
double wterm = (double)iw/dnum_w_planes;
#ifdef ACCOMP
#pragma omp target teams distribute parallel for \
map(tofrom: image_real[0:xaxis*yaxis], image_imag[0:xaxis*yaxis]) \
map (to: gridss[0:2*num_w_planes*xaxis*yaxis])
#endif
for (int iv=0; iv<yaxis; iv++)
for (int iu=0; iu<xaxis; iu++)
{
@@ -53,10 +57,14 @@ void phase_correction(double* gridss, double* image_real, double* image_imag, in
s = pimag;
//printf("%d %d %d %ld %ld\n",iu,iv,iw,index,img_index);
#pragma omp atomic
image_real[img_index] += p*r-q*s;
#pragma omp atomic
image_imag[img_index] += p*s+q*r;
#else
#pragma omp atomic
image_real[img_index] += gridss[index];
#pragma omp atomic
image_imag[img_index] += gridss[index+1];
#endif
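For reference: the two atomic updates just above accumulate the real and imaginary parts of a complex product, (p + iq)(r + is) = (pr - qs) + i(ps + qr), where (r, s) is the gridded value read from gridss at index and (p, q) is, presumably, the w-dependent phase factor derived from wterm in the lines elided above.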
@@ -7,9 +7,15 @@
#include <fftw3-mpi.h>
#endif
#endif
#include <omp.h>
#include <math.h>
#include <time.h>
#include <unistd.h>
#ifdef ACCOMP
#include "w-stacking_omp.h"
#else
#include "w-stacking.h"
#endif
#define PI 3.14159265359
#define NUM_OF_SECTORS -1
#define MIN(X, Y) (((X) < (Y)) ? (X) : (Y))
@@ -100,6 +106,8 @@ int main(int argc, char * argv[])
int xaxis;
int yaxis;
int num_w_planes = 1;
// DAV: the corresponding KernelLen is calculated within the wstack function. It can nevertheless be hardcoded for optimization
int w_support = 7;
int num_threads;// = 4;
@@ -115,11 +123,23 @@ int main(int argc, char * argv[])
struct timespec begin, finish, begin0, begink, finishk;
double elapsed;
long nsectors;
/* GT: get number of threads; exit if not given */
if(argc == 1) {
fprintf(stderr, "Usage: %s number_of_OMP_Threads \n", argv[0]);
exit(1);
}
// Set the number of OpenMP threads
num_threads = atoi(argv[1]);
if ( num_threads == 0 )
{
fprintf(stderr, "Wrong parameter: %s\n\n", argv[1]);
fprintf(stderr, "Usage: %s number_of_OMP_Threads \n", argv[0]);
exit(1);
}
clock_gettime(CLOCK_MONOTONIC, &begin0);
start0 = clock();
// Set the number of OpenMP threads
num_threads = atoi(argv[1]);
// Initialize MPI environment
#ifdef USE_MPI
@@ -134,8 +154,22 @@ int main(int argc, char * argv[])
rank = 0;
size = 1;
#endif
if(rank == 0)printf("Running with %d threads\n",num_threads);
#ifdef ACCOMP
if(rank == 0){
if (0 == omp_get_num_devices()) {
printf("No accelerator found ... exit\n");
exit(255);
}
printf("Number of available GPUs %d\n", omp_get_num_devices());
#ifdef NVIDIA
prtAccelInfo();
#endif
}
#endif
// set the local size of the image
local_grid_size_x = grid_size_x;
nsectors = NUM_OF_SECTORS;
@@ -211,6 +245,7 @@ int main(int argc, char * argv[])
printf("N. visibilities on %d %ld\n",rank,Nvis);
#endif
// DAV: all these arrays can be allocated statically for the sake of optimization. However, be careful that if MPI is used
// all the sizes are rescaled by the number of MPI tasks
// Allocate arrays
@@ -260,13 +295,13 @@ int main(int argc, char * argv[])
setup_time1 = (finish.tv_sec - begin.tv_sec);
setup_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
if(rank == 0)printf("GRIDDING DATA\n");
// Create histograms and linked lists
clock_gettime(CLOCK_MONOTONIC, &begin);
start = clock();
//CLAAA
// Initialize linked list
struct sectorlist ** sectorhead;
sectorhead = (struct sectorlist **) malloc((nsectors+1) * sizeof(struct sectorlist));
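As a rough illustration (the real struct sectorlist is defined elsewhere in the repository; the next field and the helper sector_push below are assumptions inferred from the traversal while (current->index != -1) further down), each sector keeps a linked list of the visibility indices that fall into its slab:

    #include <stdlib.h>

    /* Illustrative only: assumed shape of the per-sector linked list. */
    struct sectorlist {
        long index;              /* visibility index in this sector (-1 marks the sentinel) */
        struct sectorlist *next;
    };

    /* Hypothetical helper: prepend visibility ilocal to the list of sector isec. */
    static void sector_push(struct sectorlist **sectorhead, long isec, long ilocal)
    {
        struct sectorlist *node = malloc(sizeof *node);
        node->index = ilocal;
        node->next  = sectorhead[isec];
        sectorhead[isec] = node;
    }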
@@ -335,20 +370,18 @@ int main(int argc, char * argv[])
#ifndef USE_MPI
double * gridtot = (double*) calloc(2*grid_size_x*grid_size_y*num_w_planes,sizeof(double));
#endif
double shift = (double)(dx*yaxis);
// Open the MPI Memory Window for the slab
#ifdef USE_MPI
MPI_Win slabwin;
MPI_Win_create(grid, size_of_grid*sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &slabwin);
MPI_Win_fence(0,slabwin);
#endif
#ifndef USE_MPI
pFile1 = fopen (outfile1,"w");
#endif
// loop over files
//
kernel_time = 0.0;
@@ -357,9 +390,9 @@ int main(int argc, char * argv[])
reduce_time1 = 0.0;
compose_time = 0.0;
compose_time1 = 0.0;
for (int ifiles=0; ifiles<ndatasets; ifiles++)
{
strcpy(filename,datapath_multi[ifiles]);
printf("Processing %s, %d of %d\n",filename,ifiles+1,ndatasets);
strcat(filename,weightsfile);
@@ -368,20 +401,21 @@ int main(int argc, char * argv[])
fseek (pFile,startrow*polarisations*sizeof(float),SEEK_SET);
fread(weights,(Nweights)*sizeof(float),1,pFile);
fclose(pFile);
strcpy(filename,datapath);
strcat(filename,visrealfile);
//printf("Reading %s\n",filename);
#ifdef VERBOSE
printf("Reading %s\n",filename);
#endif
pFile = fopen (filename,"rb");
fseek (pFile,startrow*freq_per_chan*polarisations*sizeof(float),SEEK_SET);
fread(visreal,Nvis*sizeof(float),1,pFile);
fclose(pFile);
strcpy(filename,datapath);
strcat(filename,visimgfile);
//printf("Reading %s\n",filename);
#ifdef VERBOSE
printf("Reading %s\n",filename);
#endif
pFile = fopen (filename,"rb");
fseek (pFile,startrow*freq_per_chan*polarisations*sizeof(float),SEEK_SET);
fread(visimg,Nvis*sizeof(float),1,pFile);
@@ -390,7 +424,6 @@ int main(int argc, char * argv[])
#ifdef USE_MPI
MPI_Barrier(MPI_COMM_WORLD);
#endif
// Declare temporary arrays for the masking
double * uus;
double * vvs;
@@ -398,8 +431,8 @@ int main(int argc, char * argv[])
float * visreals;
float * visimgs;
float * weightss;
long isector;
for (long isector_count=0; isector_count<nsectors; isector_count++)
{
clock_gettime(CLOCK_MONOTONIC, &begink);
@@ -425,13 +458,15 @@ int main(int argc, char * argv[])
//CLAAAA
struct sectorlist * current;
current = sectorhead[isector];
while (current->index != -1)
{
long ilocal = current->index;
//double vvh = vv[ilocal];
//int binphi = (int)(vvh*nsectors);
//if (binphi == isector || boundary[ilocal] == isector)
//{
//if (binphi == isector || boundary[ilocal] == isector) {
uus[icount] = uu[ilocal];
vvs[icount] = vv[ilocal]-isector*shift;
wws[icount] = ww[ilocal];
@@ -457,6 +492,7 @@ int main(int argc, char * argv[])
compose_time += ((double) (endk - startk)) / CLOCKS_PER_SEC;
compose_time1 += (finishk.tv_sec - begink.tv_sec);
compose_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0;
#ifndef USE_MPI
double uumin = 1e20;
double vvmin = 1e20;
@@ -483,6 +519,7 @@ int main(int argc, char * argv[])
#endif
clock_gettime(CLOCK_MONOTONIC, &begink);
startk = clock();
wstack(num_w_planes,
Nsec,
freq_per_chan,
@@ -500,6 +537,18 @@ int main(int argc, char * argv[])
yaxis,
gridss,
num_threads);
/* int z =0 ;
#pragma omp target map(to:test_i_gpu) map(from:z)
{
int x; // only accessible from accelerator
x = 2;
z = x + test_i_gpu;
}*/
clock_gettime(CLOCK_MONOTONIC, &finishk);
endk = clock();
kernel_time += ((double) (endk - startk)) / CLOCKS_PER_SEC;
@@ -6,6 +6,9 @@
#include <stdlib.h>
#include <stdio.h>
#ifdef ACCOMP
#pragma omp declare target
#endif
#ifdef __CUDACC__
double __device__
#else
@@ -17,6 +20,9 @@ gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist)
conv_weight = norm * exp(-((u_dist*u_dist)+(v_dist*v_dist))*std22);
return conv_weight;
}
#ifdef ACCOMP
#pragma omp end declare target
#endif
#ifdef __CUDACC__
//double __device__ gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist)
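For background, a generic OpenMP illustration (not taken from this code base; the helper square() is hypothetical) of the pattern being applied to gauss_kernel_norm above: a routine bracketed by declare target is also compiled for the accelerator, so it can be called from inside a target region.

    #include <stdio.h>

    #pragma omp declare target
    static double square(double x) { return x * x; }   /* callable on the device */
    #pragma omp end declare target

    int main(void)
    {
        double sum = 0.0;
        /* square() may be invoked in the offloaded loop because it was declared target */
        #pragma omp target teams distribute parallel for map(tofrom: sum) reduction(+: sum)
        for (int i = 0; i < 1000; i++)
            sum += square((double) i);
        printf("sum = %f\n", sum);
        return 0;
    }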
@@ -111,7 +117,9 @@ __global__ void convolve_g(
}
}
#endif
#ifdef ACCOMP
#pragma omp declare target
#endif
void wstack(
int num_w_planes,
long num_points,
@@ -216,7 +224,14 @@ void wstack(
#ifdef _OPENMP
omp_set_num_threads(num_threads);
#endif
#ifdef ACCOMP
long Nvis = num_points*freq_per_chan*polarizations;
// #pragma omp target data map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan])
// #pragma omp target teams distribute parallel for map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan]) map(tofrom: grid[0:2*num_w_planes*grid_size_x*grid_size_y])
#else
#pragma omp parallel for private(visindex)
#endif
for (i = 0; i < num_points; i++)
{
#ifdef _OPENMP
@@ -258,6 +273,7 @@ void wstack(
double u_dist = (double)j+0.5 - pos_u;
long iKer = 2 * (j + k*grid_size_x + grid_w*grid_size_x*grid_size_y);
double conv_weight = gauss_kernel_norm(norm,std22,u_dist,v_dist);
// Loops over frequencies and polarizations
double add_term_real = 0.0;
@@ -293,6 +309,9 @@ void wstack(
#endif
//for (int i=0; i<100000; i++)printf("%f\n",grid[i]);
}
#ifdef ACCOMP
#pragma omp end declare target
#endif
int test(int nnn)
{
@@ -301,4 +320,3 @@ int test(int nnn)
mmm = nnn+1;
return mmm;
}
@@ -8,6 +8,7 @@
#ifdef __CUDACC__
extern "C"
#endif
void wstack(
int,
long,
@@ -3,6 +3,9 @@
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#ifdef NVIDIA
#include <cuda_runtime.h>
#endif
#ifdef ACCOMP
#pragma omp declare target
@@ -17,9 +20,8 @@ double gauss_kernel_norm(double norm, double std22, double u_dist, double v_dist
#pragma omp end declare target
#endif
#ifdef ACCOMP
#pragma omp declare target
#endif
void wstack(
int num_w_planes,
long num_points,
@@ -39,8 +41,7 @@ void wstack(
double* grid,
int num_threads)
{
long i;
long index;
//long index;
long visindex;
// initialize the convolution kernel
@@ -57,12 +58,16 @@ void wstack(
#ifdef ACCOMP
long Nvis = num_points*freq_per_chan*polarizations;
// #pragma omp target data map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan])
#pragma omp target teams distribute parallel for private(visindex) map(to:uu[0:num_points], vv[0:num_points], ww[0:num_points], vis_real[0:Nvis], vis_img[0:Nvis], weight[0:Nvis/freq_per_chan]) map(tofrom: grid[0:2*num_w_planes*grid_size_x*grid_size_y])
#else
#pragma omp parallel for private(visindex)
long gpu_weight_dim = Nvis/freq_per_chan;
long gpu_grid_dim = 2*num_w_planes*grid_size_x*grid_size_y;
#pragma omp target teams distribute parallel for private(visindex) \
map(to:num_points, KernelLen, std, std22, norm, num_w_planes, \
uu[0:num_points], vv[0:num_points], ww[0:num_points], \
vis_real[0:Nvis], vis_img[0:Nvis], weight[0:gpu_weight_dim], \
grid_size_x, grid_size_y, freq_per_chan, polarizations, dx,dw, w_support, num_threads) \
map(tofrom: grid[0:gpu_grid_dim])
#endif
for (i = 0; i < num_points; i++)
for (long i = 0; i < num_points; i++)
{
#ifdef _OPENMP
//int tid;
@@ -135,12 +140,104 @@ void wstack(
}
}
//for (int i=0; i<100000; i++)printf("%f\n",grid[i]);
}
#ifdef ACCOMP
#pragma omp end declare target
#ifdef NVIDIA
#define CUDAErrorCheck(funcall) \
do { \
cudaError_t ierr = funcall; \
if (cudaSuccess != ierr) { \
fprintf(stderr, "%s(line %d) : CUDA RT API error : %s(%d) -> %s\n", \
__FILE__, __LINE__, #funcall, ierr, cudaGetErrorString(ierr)); \
exit(ierr); \
} \
} while (0)
static inline int _corePerSM(int major, int minor)
/**
* @brief Give the number of CUDA cores per streaming multiprocessor (SM).
*
* The number of CUDA cores per SM is determined by the compute capability.
*
* @param major Major revision number of the compute capability.
* @param minor Minor revision number of the compute capability.
*
* @return The number of CUDA cores per SM.
*/
{
if (1 == major) {
if (0 == minor || 1 == minor || 2 == minor || 3 == minor) return 8;
}
if (2 == major) {
if (0 == minor) return 32;
if (1 == minor) return 48;
}
if (3 == major) {
if (0 == minor || 5 == minor || 7 == minor) return 192;
}
if (5 == major) {
if (0 == minor || 2 == minor) return 128;
}
if (6 == major) {
if (0 == minor) return 64;
if (1 == minor || 2 == minor) return 128;
}
if (7 == major) {
if (0 == minor || 2 == minor || 5 == minor) return 64;
}
return -1;
}
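As a worked example of the table above: a compute capability 7.0 device such as the Tesla V100 falls into the major == 7 branch and returns 64 cores per SM; with its 80 SMs that gives 80 * 64 = 5120 CUDA cores, the product getGPUInfo() below reports as corePerSM * dev.multiProcessorCount.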
void getGPUInfo(int iaccel)
{
int corePerSM;
struct cudaDeviceProp dev;
CUDAErrorCheck(cudaSetDevice(iaccel));
CUDAErrorCheck(cudaGetDeviceProperties(&dev, iaccel));
corePerSM = _corePerSM(dev.major, dev.minor);
printf("\n");
printf("============================================================\n");
printf("CUDA Device name : \"%s\"\n", dev.name);
printf("------------------------------------------------------------\n");
printf("Comp. Capability : %d.%d\n", dev.major, dev.minor);
printf("max clock rate : %.0f MHz\n", dev.clockRate * 1.e-3f);
printf("number of SMs : %d\n", dev.multiProcessorCount);
printf("cores / SM : %d\n", corePerSM);
printf("# of CUDA cores : %d\n", corePerSM * dev.multiProcessorCount);
printf("------------------------------------------------------------\n");
printf("global memory : %5.0f MBytes\n", dev.totalGlobalMem / 1048576.0f);
printf("shared mem. / SM : %5.1f KBytes\n", dev.sharedMemPerMultiprocessor / 1024.0f);
printf("32-bit reg. / SM : %d\n", dev.regsPerMultiprocessor);
printf("------------------------------------------------------------\n");
printf("max # of threads / SM : %d\n", dev.maxThreadsPerMultiProcessor);
printf("max # of threads / block : %d\n", dev.maxThreadsPerBlock);
printf("max dim. of block : (%d, %d, %d)\n",
dev.maxThreadsDim[0], dev.maxThreadsDim[1], dev.maxThreadsDim[2]);
printf("max dim. of grid : (%d, %d, %d)\n",
dev.maxGridSize[0], dev.maxGridSize[1], dev.maxGridSize[2]);
printf("warp size : %d\n", dev.warpSize);
printf("============================================================\n");
int z = 0, x = 2;
#pragma omp target map(to:x) map(tofrom:z)
{
z=x+100;
}
}
#endif
int test(int nnn)
{
int mmm;
@@ -34,6 +34,9 @@ double gauss_kernel_norm(
double u_dist,
double v_dist);
void phase_correction(
double*,
double*,
@@ -43,3 +46,37 @@ void phase_correction(
int,
int,
int);
#ifdef ACCOMP
#ifdef NVIDIA
void getGPUInfo(int);
#endif
#pragma omp declare target (gauss_kernel_norm)
#endif
#ifdef NVIDIA
#ifdef __cplusplus
extern "C" {
#endif
#ifndef PRTACCELINFO_H
#define PRTACCELINFO_H
void prtAccelInfo(int iaccel);
/**<
* @brief Print some basic info of an accelerator.
*
* Strictly speaking, \c prtAccelInfo() can only print the basic info of an
* Nvidia CUDA device.
*
* @param iaccel The index of an accelerator.
*
* @return \c void.
*/
#endif
#ifdef __cplusplus
}
#endif
#endif