Commit 1d708577 authored by Luca Tornatore

finalized one-sided communication in shared-memory

parent a22b9970
@@ -7,7 +7,9 @@ struct ip in;
struct op out, outparam;
struct meta metaData;
struct time timing;
timing_t wt_timing = {0};
timing_t pr_timing = {0};
struct parameter param;
struct fileData data;
@@ -16,6 +18,7 @@ char datapath[900];
int xaxis, yaxis;
int global_rank;
int size;
int verbose_level = 0;
long nsectors;
long startrow;
double resolution, dx, dw, w_supporth;
......
@@ -92,13 +92,24 @@ extern struct meta
} metaData;
extern struct time
{
double setup_time, process_time, mpi_time, fftw_time, tot_time, kernel_time, reduce_time, compose_time, phase_time;
double setup_time1, process_time1, mpi_time1, fftw_time1, tot_time1, kernel_time1, reduce_time1, compose_time1, phase_time1;
double writetime, writetime1;
} timing;
typedef struct {
double setup;      // time spent in initialization, init()
double init;       // time spent in initializing arrays
double process;    // time spent in gridding
double mpi;        // time spent in mpi communications
double fftw;       // time spent in the fft
double kernel;     // time spent in the gridding kernel (wstack)
double mmove;      // time spent in memory movements
double reduce;     // time spent in the whole reduce
double reduce_mpi; // time spent in the inter-host mpi reduce
double reduce_sh;  // time spent in the intra-host shared-memory reduce
double compose;    // time spent composing the sector arrays
double phase;      // time spent in the phase correction
double write;      // time spent writing data
double total;      // total elapsed time
} timing_t;
extern timing_t wt_timing; // wall-clock timings
extern timing_t pr_timing; // process CPU timing
extern struct parameter
{
@@ -127,12 +138,11 @@ extern char datapath[900];
extern int xaxis, yaxis;
extern int global_rank;
extern int size;
extern int verbose_level;
extern long nsectors;
extern long startrow;
extern double resolution, dx, dw, w_supporth;
extern clock_t start, end, start0, startk, endk;
extern struct timespec begin, finish, begin0, begink, finishk;
extern long * histo_send, size_of_grid;
extern double * grid, *gridss, *gridss_real, *gridss_img, *gridss_w;
@@ -142,3 +152,60 @@ extern double * grid, *gridss, *gridss_real, *gridss_img, *gridss_w;
#endif
extern long **sectorarray;
#if defined(DEBUG)
#define dprintf(LEVEL, T, t, ...) if( (verbose_level >= (LEVEL)) && \
( ((t) ==-1 ) || ((T)==(t)) ) ) { \
printf(__VA_ARGS__); fflush(stdout); }
#else
#define dprintf(...)
#endif
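For illustration, two call sites of the macro above (the first is taken from this commit, the second is hypothetical): dprintf prints only when verbose_level is at least LEVEL, and only on the task whose rank T matches t, with t == -1 disabling the task filter.

    dprintf(1, global_rank, 0, "reducing sector %ld..\n", isector);   /* task 0 only, verbose_level >= 1 */
    dprintf(2, global_rank, -1, "local histo %ld\n", histo_send[0]);  /* every task, verbose_level >= 2  */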
#define CPU_TIME_wt ({ struct timespec myts; (clock_gettime( CLOCK_REALTIME, &myts ), (double)myts.tv_sec + (double)myts.tv_nsec * 1e-9);})
#define CPU_TIME_pr ({ struct timespec myts; (clock_gettime( CLOCK_PROCESS_CPUTIME_ID, &myts ), (double)myts.tv_sec + (double)myts.tv_nsec * 1e-9);})
#define CPU_TIME_th ({ struct timespec myts; (clock_gettime( CLOCK_THREAD_CPUTIME_ID, &myts ), (double)myts.tv_sec + (double)myts.tv_nsec * 1e-9);})
#if defined(_OPENMP)
#define TAKE_TIME_START( T ) { \
wt_timing.T = CPU_TIME_wt; \
pr_timing.T = CPU_TIME_pr; }
#define TAKE_TIME_STOP( T ) { \
pr_timing.T = CPU_TIME_pr - pr_timing.T; \
wt_timing.T = CPU_TIME_wt - wt_timing.T; }
#define TAKE_TIME( Twt, Tpr ) { Twt = CPU_TIME_wt; Tpr = CPU_TIME_pr; }
#define ADD_TIME( T, Twt, Tpr ) { \
pr_timing.T += CPU_TIME_pr - Tpr; \
wt_timing.T += CPU_TIME_wt - Twt; \
Tpr = CPU_TIME_pr; Twt = CPU_TIME_wt; }
#else
#define TAKE_TIME_START( T ) wt_timing.T = CPU_TIME_wt
#define TAKE_TIME_STOP( T ) wt_timing.T = CPU_TIME_wt - wt_timing.T
#define TAKE_TIME( Twt, ... ) Twt = CPU_TIME_wt;
#define ADD_TIME( T, Twt, ... ) { wt_timing.T += CPU_TIME_wt - Twt; Twt = CPU_TIME_wt;}
#endif
#define TAKE_TIMEwt_START( T) wt_timing.T = CPU_TIME_wt
#define TAKE_TIMEwt_STOP( T) wt_timing.T = CPU_TIME_wt - wt_timing.T
#define TAKE_TIMEwt( Twt ) Twt = CPU_TIME_wt;
#define ADD_TIMEwt( T, Twt ) { wt_timing.T += CPU_TIME_wt - Twt; Twt = CPU_TIME_wt; }
#if defined(__GNUC__) && !defined(__ICC) && !defined(__INTEL_COMPILER)
#define PRAGMA_IVDEP _Pragma("GCC ivdep")
#else
#define PRAGMA_IVDEP _Pragma("ivdep")
#endif
#define STRINGIFY(a) #a
#define UNROLL(N) _Pragma(STRINGIFY(unroll(N)))
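As a usage sketch of the timing macros above (the enclosing function and kernel are hypothetical, not code from this commit): TAKE_TIME_START/TAKE_TIME_STOP bracket a whole phase and store its duration in the named timing_t field, while TAKE_TIME/ADD_TIME accumulate per-iteration costs; under _OPENMP both wall-clock and process-CPU times are recorded, otherwise only wall-clock.

    extern void heavy_kernel(int i);      /* hypothetical work function */

    void do_phase(void)
    {
        double twt, tpr;
        TAKE_TIME_START(process);         /* wt_timing.process = clock now */
        for (int i = 0; i < 10; i++)
        {
            TAKE_TIME(twt, tpr);          /* snapshot before the kernel */
            heavy_kernel(i);
            ADD_TIME(kernel, twt, tpr);   /* wt_timing.kernel += elapsed, re-snapshot */
        }
        TAKE_TIME_STOP(process);          /* wt_timing.process = now - start */
    }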
ndatasets 1
Datapath1 /beegfs/lofar/cgheller/L798046_SB244_uv.uncorr_130B27932t_121MHz.pre-cal.binMS/
Datapath2 /beegfs/lofar/cgheller/L798046_SB244_uv.uncorr_130B27932t_123MHz.pre-cal.binMS/
Datapath3 /beegfs/lofar/cgheller/L798046_SB244_uv.uncorr_130B27932t_125MHz.pre-cal.binMS/
Datapath1 ../input/
Datapath2 ../input/
Datapath3 ../input/
num_threads 2
w_support 7
grid_size_x 2048
@@ -24,3 +24,4 @@ fftfile3 fft_img.bin
logfile run.log
extension .txt
timingfile timings.dat
verbose_level 1
@@ -2,13 +2,15 @@
#include "allvars.h"
#include "proto.h"
void fftw_data(){
void fftw_data()
{
#ifdef USE_FFTW
// FFT transform the data (using distributed FFTW)
if(global_rank == 0)printf("PERFORMING FFT\n");
clock_gettime(CLOCK_MONOTONIC, &begin);
start = clock();
TAKE_TIME_START(fftw);
fftw_plan plan;
fftw_complex *fftwgrid;
ptrdiff_t alloc_local, local_n0, local_0_start;
@@ -64,21 +66,23 @@ void fftw_data(){
MPI_Barrier(MPI_COMM_WORLD);
#endif
end = clock();
clock_gettime(CLOCK_MONOTONIC, &finish);
timing.fftw_time = ((double) (end - start)) / CLOCKS_PER_SEC;
timing.fftw_time1 = (finish.tv_sec - begin.tv_sec);
timing.fftw_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
clock_gettime(CLOCK_MONOTONIC, &begin);
TAKE_TIME_STOP(fftw);
#endif
}
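For context, fftw_data() builds on FFTW's slab-decomposed MPI transforms; the core pattern looks roughly like the sketch below (a minimal sketch assuming the fftw3-mpi API; the transform sign and flags are placeholders, and fftw_mpi_init() is assumed to have been called after MPI_Init()).

    #include <fftw3-mpi.h>

    /* n0 x n1 is the global grid; each rank owns local_n0 contiguous rows */
    void fft_slab_sketch(ptrdiff_t n0, ptrdiff_t n1)
    {
        ptrdiff_t local_n0, local_0_start;
        ptrdiff_t alloc_local = fftw_mpi_local_size_2d(n0, n1, MPI_COMM_WORLD,
                                                       &local_n0, &local_0_start);
        fftw_complex *fftwgrid = fftw_alloc_complex(alloc_local);
        fftw_plan plan = fftw_mpi_plan_dft_2d(n0, n1, fftwgrid, fftwgrid,
                                              MPI_COMM_WORLD, FFTW_BACKWARD,
                                              FFTW_ESTIMATE);
        /* ... fill rows [local_0_start, local_0_start + local_n0) of the slab ... */
        fftw_execute(plan);
        fftw_destroy_plan(plan);
        fftw_free(fftwgrid);
    }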
void write_fftw_data(){
// Write results
#ifdef USE_FFTW
double twt, tpr;
#ifdef WRITE_DATA
// Write results
TAKE_TIME(twt, tpr);
#ifdef USE_MPI
MPI_Win writewin;
MPI_Win_create(gridss, size_of_grid*sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &writewin);
@@ -150,27 +154,28 @@ void write_fftw_data(){
MPI_Win_free(&writewin);
MPI_Barrier(MPI_COMM_WORLD);
#endif
ADD_TIME(write, twt, tpr);
#endif //WRITE_DATA
// Phase correction
clock_gettime(CLOCK_MONOTONIC, &begin);
start = clock();
TAKE_TIME_START(phase);
if(global_rank == 0)printf("PHASE CORRECTION\n");
double* image_real = (double*) calloc(xaxis*yaxis,sizeof(double));
double* image_imag = (double*) calloc(xaxis*yaxis,sizeof(double));
phase_correction(gridss,image_real,image_imag,xaxis,yaxis,param.num_w_planes,param.grid_size_x,param.grid_size_y,resolution,metaData.wmin,metaData.wmax,param.num_threads);
end = clock();
clock_gettime(CLOCK_MONOTONIC, &finish);
timing.phase_time = ((double) (end - start)) / CLOCKS_PER_SEC;
timing.phase_time1 = (finish.tv_sec - begin.tv_sec);
timing.phase_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
TAKE_TIME_STOP(phase);
#ifdef WRITE_IMAGE
TAKE_TIME(twt, tpr);
if(global_rank == 0)
{
file.pFilereal = fopen (out.fftfile2,"wb");
@@ -208,6 +213,7 @@ void write_fftw_data(){
MPI_Barrier(MPI_COMM_WORLD);
#endif
ADD_TIME(write, twt, tpr);
#endif //WRITE_IMAGE
#endif //FFTW
......
@@ -4,35 +4,35 @@
#include "proto.h"
void gridding(){
void gridding()
{
if(global_rank == 0)printf("GRIDDING DATA\n");
// Create histograms and linked lists
clock_gettime(CLOCK_MONOTONIC, &begin);
start = clock();
TAKE_TIME_START(init);
// Initialize linked list
initialize_array();
TAKE_TIME_STOP(init);
TAKE_TIME_START(process);
// Sector and Gridding data
gridding_data();
TAKE_TIME_STOP(process);
#ifdef USE_MPI
MPI_Barrier(MPI_COMM_WORLD);
#endif
end = clock();
clock_gettime(CLOCK_MONOTONIC, &finish);
timing.process_time = ((double) (end - start)) / CLOCKS_PER_SEC;
timing.process_time1 = (finish.tv_sec - begin.tv_sec);
timing.process_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
clock_gettime(CLOCK_MONOTONIC, &begin);
return;
}
void initialize_array(){
void initialize_array()
{
histo_send = (long*) calloc(nsectors+1,sizeof(long));
int * boundary = (int*) calloc(metaData.Nmeasures,sizeof(int));
@@ -79,13 +79,16 @@ void initialize_array(){
#ifdef VERBOSE
for (int iii=0; iii<nsectors+1; iii++)printf("HISTO %d %d %ld\n", global_rank, iii, histo_send[iii]);
#endif
free( boundary);
return;
}
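In outline, initialize_array() amounts to a counting pass over the measurements followed by a fill pass that records, per sector, the indices belonging to it. A simplified sketch (sector_of() is a hypothetical stand-in for the code's v-coordinate-to-sector mapping; error handling omitted):

    #include <stdlib.h>

    extern long sector_of(double v);   /* hypothetical sector mapping */

    long **build_sector_lists(long nsectors, long nmeas, const double *vv,
                              long **histo_out)
    {
        long  *histo   = calloc(nsectors + 1, sizeof(long));
        long  *fill    = calloc(nsectors + 1, sizeof(long));
        long **sectors = malloc((nsectors + 1) * sizeof(long *));
        for (long i = 0; i < nmeas; i++)       /* pass 1: count per sector */
            histo[sector_of(vv[i])]++;
        for (long s = 0; s <= nsectors; s++)   /* allocate each index list */
            sectors[s] = malloc(histo[s] * sizeof(long));
        for (long i = 0; i < nmeas; i++)       /* pass 2: record indices */
        {
            long s = sector_of(vv[i]);
            sectors[s][fill[s]++] = i;
        }
        free(fill);
        *histo_out = histo;
        return sectors;
    }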
void gridding_data(){
void gridding_data()
{
double shift = (double)(dx*yaxis);
// Open the MPI Memory Window for the slab
#ifdef USE_MPI
MPI_Win_create(grid, size_of_grid*sizeof(double), sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &slabwin);
MPI_Win_fence(0,slabwin);
@@ -95,24 +98,20 @@ void gridding_data(){
file.pFile1 = fopen (out.outfile1,"w");
#endif
timing.kernel_time = 0.0;
timing.kernel_time1 = 0.0;
timing.reduce_time = 0.0;
timing.reduce_time1 = 0.0;
timing.compose_time = 0.0;
timing.compose_time1 = 0.0;
// calculate the resolution in radians
resolution = 1.0/MAX(fabs(metaData.uvmin),fabs(metaData.uvmax));
// calculate the resolution in arcsec
double resolution_asec = (3600.0*180.0)/MAX(fabs(metaData.uvmin),fabs(metaData.uvmax))/PI;
if( global_rank == 0 )
printf("RESOLUTION = %f rad, %f arcsec\n", resolution, resolution_asec);
for (long isector = 0; isector < nsectors; isector++)
{
clock_gettime(CLOCK_MONOTONIC, &begink);
startk = clock();
double twt, tpr;
TAKE_TIME(twt, tpr);
// define local destination sector
//isector = (isector_count+rank)%size; // this line must be wrong! [LT]
@@ -132,6 +131,7 @@ void gridding_data(){
long ip = 0;
long inu = 0;
#warning shall we omp-ize this ?
for(long iphi = histo_send[isector]-1; iphi>=0; iphi--)
{
long ilocal = sectorarray[isector][iphi];
@@ -141,33 +141,32 @@ void gridding_data(){
uus[icount] = data.uu[ilocal];
vvs[icount] = data.vv[ilocal]-isector*shift;
wws[icount] = data.ww[ilocal];
for (long ipol=0; ipol<metaData.polarisations; ipol++)
UNROLL(4)
PRAGMA_IVDEP
for (long ipol=0; ipol<metaData.polarisations; ipol++, ip++)
{
weightss[ip] = data.weights[ilocal*metaData.polarisations+ipol];
ip++;
}
for (long ifreq=0; ifreq<metaData.polarisations*metaData.freq_per_chan; ifreq++)
PRAGMA_IVDEP
UNROLL(4)
for (long ifreq=0; ifreq<metaData.polarisations*metaData.freq_per_chan; ifreq++, inu++)
{
visreals[inu] = data.visreal[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq];
visimgs[inu] = data.visimg[ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq];
//if(visimgs[inu]>1e10 || visimgs[inu]<-1e10)printf("%f %f %ld %ld %d %ld %ld\n",visreals[inu],visimgs[inu],inu,Nvissec,rank,ilocal*metaData.polarisations*metaData.freq_per_chan+ifreq,metaData.Nvis);
inu++;
}
icount++;
}
clock_gettime(CLOCK_MONOTONIC, &finishk);
endk = clock();
timing.compose_time += ((double) (endk - startk)) / CLOCKS_PER_SEC;
timing.compose_time1 += (finishk.tv_sec - begink.tv_sec);
timing.compose_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0;
ADD_TIME(compose, twt, tpr);
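One possible answer to the #warning above (a sketch, not part of the commit): because ip and inu advance in lockstep with icount, every destination index can be computed from the loop counter, which makes the composition loop safely parallel:

    long nphi = histo_send[isector];
    #pragma omp parallel for
    for (long k = 0; k < nphi; k++)        /* k plays the role of icount */
    {
        long ilocal = sectorarray[isector][nphi - 1 - k];  /* same traversal order */
        long npol   = metaData.polarisations;
        long nfreq  = npol * metaData.freq_per_chan;
        uus[k] = data.uu[ilocal];
        vvs[k] = data.vv[ilocal] - isector * shift;
        wws[k] = data.ww[ilocal];
        for (long ipol = 0; ipol < npol; ipol++)
            weightss[k * npol + ipol] = data.weights[ilocal * npol + ipol];
        for (long ifreq = 0; ifreq < nfreq; ifreq++)
        {
            visreals[k * nfreq + ifreq] = data.visreal[ilocal * nfreq + ifreq];
            visimgs[k * nfreq + ifreq]  = data.visimg[ilocal * nfreq + ifreq];
        }
    }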
#ifndef USE_MPI
double vvmin = 1e20;
double uumax = -1e20;
double vvmax = -1e20;
#warning shall we omp-ize this ?
for (long ipart=0; ipart<Nsec; ipart++)
{
uumin = MIN(uumin,uus[ipart]);
@@ -186,8 +185,7 @@ void gridding_data(){
#ifdef VERBOSE
printf("Processing sector %ld\n",isector);
#endif
clock_gettime(CLOCK_MONOTONIC, &begink);
startk = clock();
TAKE_TIME(twt, tpr);
wstack(param.num_w_planes,
Nsec,
@@ -207,6 +205,8 @@ void gridding_data(){
gridss,
param.num_threads);
ADD_TIME(kernel, twt, tpr);
/* int z =0 ;
* #pragma omp target map(to:test_i_gpu) map(from:z)
* {
@@ -215,42 +215,49 @@ void gridding_data(){
* z = x + test_i_gpu;
* }*/
clock_gettime(CLOCK_MONOTONIC, &finishk);
endk = clock();
timing.kernel_time += ((double) (endk - startk)) / CLOCKS_PER_SEC;
timing.kernel_time1 += (finishk.tv_sec - begink.tv_sec);
timing.kernel_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0;
#ifdef VERBOSE
printf("Processed sector %ld\n",isector);
#endif
clock_gettime(CLOCK_MONOTONIC, &begink);
startk = clock();
/* ----------------
* REDUCE
* ---------------- */
//for (long iii=0; iii<2*xaxis*yaxis*num_w_planes; iii++)printf("--> %f\n",gridss[iii]);
double twt_r, tpr_r;
TAKE_TIME(twt_r, tpr_r);
#ifndef USE_MPI
long stride = isector*2*xaxis*yaxis*num_w_planes;
for (long iii=0; iii<2*xaxis*yaxis*num_w_planes; iii++)
// ..................
#ifndef USE_MPI // REDUCE WITH NO MPI
#pragma omp parallel
{
long stride = isector * size_of_grid;
#pragma omp for
for (long iii=0; iii< size_of_grid; iii++)
gridtot[stride+iii] = gridss[iii];
#endif
}
// ..................
// REDUCE WITH MPI
#else
// Write grid in the corresponding remote slab
#ifdef USE_MPI
// int target_rank = (int)isector; this implied that size >= nsectors
int target_rank = (int)(isector % size);
#ifdef ONE_SIDE
// MPI_Win_lock(MPI_LOCK_SHARED,target_rank,0,slabwin);
// MPI_Accumulate(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,MPI_SUM,slabwin);
// MPI_Win_unlock(target_rank,slabwin);
// for every task, gridss is copied into its own segment of the shared window;
// the copy can be avoided if the shared window coincides with gridss
TAKE_TIME(twt, tpr);
memcpy(Me.win.ptr+isector*size_of_grid, gridss, size_of_grid*sizeof(double));
ADD_TIME(mmove, twt, tpr);
dprintf(1, global_rank, 0, "reducing sector %ld..\n", isector);
TAKE_TIME( twt, tpr);
reduce( isector, target_rank ); // here the reduce is performed within every host
ADD_TIME(reduce_sh, twt, tpr);
if ( Me.Nhosts > 1 )
{
@@ -272,6 +279,7 @@ void gridding_data(){
if( Sector_Comm != MPI_COMM_NULL )
{
double _twt_;
int sector_size;
int sector_rank = 0;
int sector_target;
@@ -281,7 +289,10 @@ void gridding_data(){
if ( global_rank == target_rank)
{
MPI_Send( &sector_rank, 1, MPI_INT, 0, 0, Sector_Comm);
memcpy(grid, Me.swins[Me.Rank[myHOST]].ptr+isector*size_of_grid*sizeof(double), size_of_grid * sizeof(double));
TAKE_TIMEwt( _twt_ );
memcpy(gridss, Me.swins[Me.Rank[myHOST]].ptr+isector*size_of_grid*sizeof(double),
size_of_grid * sizeof(double));
ADD_TIMEwt( mmove, _twt_);
}
if( sector_rank == 0 )
@@ -290,29 +301,35 @@ void gridding_data(){
MPI_Recv( &sector_target, 1, MPI_INT, MPI_ANY_SOURCE, 0, Sector_Comm, &status);
}
TAKE_TIMEwt(_twt_);
MPI_Bcast( &sector_target, 1, MPI_INT, 0, Sector_Comm );
MPI_Reduce(grid, grid, size_of_grid, MPI_DOUBLE,MPI_SUM, sector_target, Sector_Comm);
MPI_Reduce(gridss, grid, size_of_grid, MPI_DOUBLE,MPI_SUM, sector_target, Sector_Comm);
MPI_Comm_free( &Sector_Comm );
ADD_TIMEwt(mpi, _twt_);
}
}
ADD_TIME(reduce_mpi, twt, tpr);
#else // relates to #ifdef ONE_SIDE
{
double _twt_;
TAKE_TIMEwt(_twt_);
MPI_Reduce(gridss,grid,size_of_grid,MPI_DOUBLE,MPI_SUM,target_rank,MPI_COMM_WORLD);
ADD_TIMEwt(mpi, _twt_);
}
#endif // closes #ifdef ONE_SIDE
#endif // closes USE_MPI
clock_gettime(CLOCK_MONOTONIC, &finishk);
endk = clock();
timing.reduce_time += ((double) (endk - startk)) / CLOCKS_PER_SEC;
timing.reduce_time1 += (finishk.tv_sec - begink.tv_sec);
timing.reduce_time1 += (finishk.tv_nsec - begink.tv_nsec) / 1000000000.0;
// Go to next sector
for (long inull=0; inull<2*param.num_w_planes*xaxis*yaxis; inull++)gridss[inull] = 0.0;
ADD_TIME(reduce, twt_r, tpr_r);
// wipe before getting to the next sector
memset((void*)gridss, 0, size_of_grid * sizeof(double));
// Deallocate all sector arrays
free(uus);
@@ -324,25 +341,18 @@ void gridding_data(){
// End of loop over sector
}
// Finalize MPI communication
#ifdef USE_MPI
MPI_Win_fence(0,slabwin);
#endif
free( histo_send );
#ifndef USE_MPI
fclose(file.pFile1);
#endif
#ifdef USE_MPI
MPI_Win_fence(0,slabwin);
MPI_Barrier(MPI_COMM_WORLD);
#endif
end = clock();
clock_gettime(CLOCK_MONOTONIC, &finish);
timing.process_time = ((double) (end - start)) / CLOCKS_PER_SEC;
timing.process_time1 = (finish.tv_sec - begin.tv_sec);
timing.process_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
clock_gettime(CLOCK_MONOTONIC, &begin);
}
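For reference, the pure one-sided alternative commented out at the top of the reduce section would, spelled out, follow the standard MPI-3 passive-target idiom (a sketch using the commit's own slabwin window and buffers):

    /* accumulate this task's partial grid into the slab owned by target_rank */
    MPI_Win_lock(MPI_LOCK_SHARED, target_rank, 0, slabwin);
    MPI_Accumulate(gridss, size_of_grid, MPI_DOUBLE,         /* origin buffer */
                   target_rank, 0, size_of_grid, MPI_DOUBLE, /* target slab   */
                   MPI_SUM, slabwin);
    MPI_Win_unlock(target_rank, slabwin);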
void write_grided_data()
@@ -355,6 +365,7 @@ void write_grided_data()
printf("WRITING GRIDDED DATA\n");
file.pFilereal = fopen (out.outfile2,"wb");
file.pFileimg = fopen (out.outfile3,"wb");
#ifdef USE_MPI
for (int isector=0; isector<nsectors; isector++)
{
@@ -377,6 +388,7 @@ void write_grided_data()
fseek(file.pFilereal, global_index, SEEK_SET);
fwrite(&gridss_real[index], 1, sizeof(double), file.pFilereal);
}
for (int iw=0; iw<param.num_w_planes; iw++)
for (int iv=0; iv<yaxis; iv++)
for (int iu=0; iu<xaxis; iu++)
@@ -424,7 +436,6 @@ void write_grided_data()
#endif
#endif //WRITE_DATA
}
@@ -434,6 +445,7 @@ void reduce( int sector, int target_rank )
{
int local_rank = Me.Rank[myHOST];
int target_rank_on_myhost = -1;
if( Me.Ranks_to_host[ target_rank ] == Me.myhost )
// exchange rank 0 with target rank
@@ -442,16 +454,32 @@ void reduce( int sector, int target_rank )
// every target rank
{
int r = 0;
while( Me.Ranks_to_myhost[r] != target_rank )
r++;
target_rank_on_myhost = 0;
while( Me.Ranks_to_myhost[target_rank_on_myhost] != target_rank )
target_rank_on_myhost++;
if( r > 0 )
dprintf(2, Me.Rank[myHOST], 0,
"[SEC %d] swapping Host master with target rank %d (%d)\n",
sector, target_rank, target_rank_on_myhost);
if( target_rank_on_myhost > 0 )
// the target is not the task that already has rank 0
// on my host
{
if( local_rank == 0 )
local_rank = r;
if( local_rank == r )
local_rank = target_rank_on_myhost;
else if( local_rank == target_rank_on_myhost )
local_rank = 0;
win_t temp = Me.swins[target_rank_on_myhost];
Me.swins[target_rank_on_myhost] = Me.swins[0];
Me.swins[0] = temp;
temp = Me.scwins[target_rank_on_myhost];
Me.scwins[target_rank_on_myhost] = Me.scwins[0];
Me.scwins[0] = temp;
}
}
@@ -470,6 +498,10 @@ void reduce( int sector, int target_rank )
if( local_rank % threshold == 0)
{
int source = local_rank + (1<<l);
dprintf(2, 0, 0,
"[SEC %d] task %d (%d) getting data from task %d at level %d\n",
sector, local_rank, Me.Rank[myHOST], source, l );
while( *(int*)(Me.scwins[source].ptr) < l )
// sleep 5 usec if the source target is not ready
NSLEEP( 5000 );
@@ -481,8 +513,25 @@ void reduce( int sector, int target_rank )
*(int*)(Me.win_ctrl.ptr) = l;
}
else
{
dprintf(2, 0, 0,
"[SEC %d] task %d (%d) signaling that level %d is done\n",
sector, local_rank, Me.Rank[myHOST], l );
*(int*)(Me.win_ctrl.ptr) = l;
}
}
if ( target_rank_on_myhost > 0 )
{
win_t temp = Me.swins[target_rank_on_myhost];
Me.swins[target_rank_on_myhost] = Me.swins[0];
Me.swins[0] = temp;
temp = Me.scwins[target_rank_on_myhost];
Me.scwins[target_rank_on_myhost] = Me.scwins[0];
Me.scwins[0] = temp;
}
return;
}
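Stripped of the window bookkeeping, the level loop in reduce() performs a binary-tree reduction across the host-local shared segments: at level l, every task whose local rank is a multiple of 2^(l+1) spins on its partner's control word and then sums the partner's segment into its own, so after ceil(log2(ntasks)) levels local rank 0 holds the host-wide sum. A sketch, where buf[r] and ctrl[r] stand for the Me.swins[r].ptr and Me.scwins[r].ptr pointers and ntasks for the number of on-node tasks:

    for (int l = 0; (1 << l) < ntasks; l++)
    {
        int threshold = 1 << (l + 1);
        if (local_rank % threshold == 0)
        {
            int source = local_rank + (1 << l);
            if (source < ntasks)             /* guard for non-power-of-2 counts */
            {
                while (*ctrl[source] < l)    /* partner not yet done with level l-1 */
                    NSLEEP(5000);            /* sleep 5 usec */
                for (long i = 0; i < size_of_grid; i++)
                    buf[local_rank][i] += buf[source][i];
            }
            *ctrl[local_rank] = l;           /* publish: my level l is complete */
        }
        else
            *ctrl[local_rank] = l;           /* sender: signal and drop out of the tree */
    }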
......
@@ -7,8 +7,7 @@
void init(int index)
{
clock_gettime(CLOCK_MONOTONIC, &begin0);
start0 = clock();
TAKE_TIME_START(total);
// DAV: the corresponding KernelLen is calculated within the wstack function. It can be anyway hardcoded for optimization
dx = 1.0/(double)param.grid_size_x;
@@ -37,8 +36,7 @@ void init(int index)
printf("\nTask %d sees %d topology levels\n", global_rank, Me.MAXl);
#endif
clock_gettime(CLOCK_MONOTONIC, &begin);
start = clock();
TAKE_TIME_START(setup);
// INPUT FILES (only the first ndatasets entries are used)
strcpy(datapath,param.datapath_multi[index]);
@@ -64,13 +62,7 @@ void init(int index)
MPI_Barrier(MPI_COMM_WORLD);
#endif
clock_gettime(CLOCK_MONOTONIC, &finish);
end = clock();
timing.setup_time = ((double) (end - start)) / CLOCKS_PER_SEC;
timing.setup_time1 = (finish.tv_sec - begin.tv_sec);
timing.setup_time1 += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
TAKE_TIME_STOP(setup);
}
void op_filename() {
@@ -233,6 +225,11 @@ void read_parameter_file(char *fname)
{
strcpy(outparam.timingfile, buf2);
}
if(strcmp(buf1, "verbose_level") == 0)
{
verbose_level = atoi(buf2);
}
if(param.ndatasets > 1)
{
@@ -258,9 +255,12 @@ void read_parameter_file(char *fname)
/* Communicating the relevant parameters to the other processes */
#ifdef USE_MPI
double twt;
TAKE_TIMEwt(twt);
MPI_Bcast(&in, sizeof(struct ip), MPI_BYTE, 0, MPI_COMM_WORLD);
MPI_Bcast(&outparam, sizeof(struct op), MPI_BYTE, 0, MPI_COMM_WORLD);
MPI_Bcast(&param, sizeof(struct parameter), MPI_BYTE, 0, MPI_COMM_WORLD);
ADD_TIMEwt(mpi, twt);
#endif
}
@@ -312,6 +312,7 @@ void readMetaData(char fileLocal[1000]) {
void metaData_calculation() {
int nsub = 1000;
if( global_rank == 0 )
printf("Subtracting last %d measurements\n",nsub);
metaData.Nmeasures = metaData.Nmeasures-nsub;
metaData.Nvis = metaData.Nmeasures*metaData.freq_per_chan*metaData.polarisations;
......
@@ -62,9 +62,11 @@ int numa_init( int Rank, int Size, MPI_Comm *MYWORLD, map_t *Me )
// at my shared-memory level
//
for( int t = 0; t < Me->Ntasks[SHMEMl]; t++ )
if( t != Me->Rank[SHMEMl] )
{
//if( t != Me->Rank[SHMEMl] )
MPI_Win_shared_query( Me->win_ctrl.win, t, &(Me->scwins[t].size),
&(Me->scwins[t].disp), &(Me->scwins[t].ptr) );
}
if( Me->Rank[SHMEMl] != 0 )
MPI_Win_shared_query( win_ctrl_hostmaster, 0, &(win_ctrl_hostmaster_size),
......
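For context, the scwins pointers queried in the loop above come from MPI-3 shared-memory windows; the generic allocation-and-query pattern is (a sketch, not this file's code; t is some on-node rank):

    MPI_Comm nodecomm;                 /* all tasks that share memory with me */
    MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,
                        MPI_INFO_NULL, &nodecomm);

    double *mine;                      /* base of my own segment */
    MPI_Win  win;
    MPI_Win_allocate_shared(size_of_grid * sizeof(double), sizeof(double),
                            MPI_INFO_NULL, nodecomm, &mine, &win);

    MPI_Aint  segsize;                 /* direct load/store access to task t's segment */
    int       segdisp;
    double   *theirs;
    MPI_Win_shared_query(win, t, &segsize, &segdisp, &theirs);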
@@ -2,34 +2,43 @@
#include "allvars.h"
#include "proto.h"
void write_result() {
void write_result()
{
end = clock();
clock_gettime(CLOCK_MONOTONIC, &finish);
timing.tot_time = ((double) (end - start0)) / CLOCKS_PER_SEC;
timing.tot_time1 = (finish.tv_sec - begin0.tv_sec);
timing.tot_time1 += (finish.tv_nsec - begin0.tv_nsec) / 1000000000.0;
TAKE_TIME_STOP( total );
if (global_rank == 0)
{
printf("Setup time: %f sec\n",timing.setup_time);
printf("Process time: %f sec\n",timing.process_time);
printf("Kernel time = %f, Array Composition time %f, Reduce time: %f sec\n",timing.kernel_time,timing.compose_time,timing.reduce_time);
printf("%14s time : %f sec\n", "Setup", wt_timing.setup);
printf("%14s time : %f sec\n", "Process", wt_timing.process);
printf("%14s time : %f sec\n", "Reduce", wt_timing.reduce);
#if defined(USE_MPI)
#if defined(ONE_SIDE)
printf("%14s time : %f sec\n", "Reduce sh", wt_timing.reduce_sh);
printf("%14s time : %f sec\n", "Mmove", wt_timing.mmove);
printf("%14s time : %f sec\n", "ReduceMPI", wt_timing.reduce_mpi);
#endif
printf("%14s time : %f sec\n", "MPI", wt_timing.mpi);
#endif
printf("%10s Kernel time = %f, Array Composition time %f, Reduce time: %f sec\n", "",
wt_timing.kernel,wt_timing.compose,wt_timing.reduce);
#ifdef USE_FFTW
printf("FFTW time: %f sec\n",timing.fftw_time);
printf("Phase time: %f sec\n",timing.phase_time);
printf("%14s time : %f sec\n", "FFTW", wt_timing.fftw);
printf("%14s time : %f sec\n", "Phase",wt_timing.phase);
#endif
printf("TOT time: %f sec\n",timing.tot_time);
printf("%14s time : %f sec\n\n", "TOTAL", wt_timing.total);
if(param.num_threads > 1)
{
printf("PSetup time: %f sec\n",timing.setup_time1);
printf("PProcess time: %f sec\n",timing.process_time1);
printf("PKernel time = %f, PArray Composition time %f, PReduce time: %f sec\n",timing.kernel_time1,timing.compose_time1,timing.reduce_time1);
printf("%14s time : %f sec\n", "PSetup", pr_timing.setup);
printf("%14s time : %f sec\n", "PProcess", pr_timing.process);
printf("%10s PKernel time = %f, PArray Composition time %f, PReduce time: %f sec\n", "",
pr_timing.kernel,pr_timing.compose,pr_timing.reduce);
#ifdef USE_FFTW
printf("PFFTW time: %f sec\n",timing.fftw_time1);
printf("PPhase time: %f sec\n",timing.phase_time1);
printf("%14s time : %f sec\n", "PFFTW", pr_timing.fftw);
printf("%14s time : %f sec\n", "PPhase", pr_timing.phase);
#endif
printf("PTOT time: %f sec\n",timing.tot_time1);
printf("%14s time : %f sec\n", "PTOTAL", pr_timing.total);
}
}
@@ -38,9 +47,13 @@ void write_result() {
file.pFile = fopen (out.timingfile,"w");
if (param.num_threads == 1)
{
fprintf(file.pFile, "%f %f %f %f %f %f %f\n",timing.setup_time,timing.kernel_time,timing.compose_time,timing.reduce_time,timing.fftw_time,timing.phase_time,timing.tot_time);
fprintf(file.pFile, "%f %f %f %f %f %f %f\n",
wt_timing.setup, wt_timing.kernel, wt_timing.compose,
wt_timing.reduce,wt_timing.fftw,wt_timing.phase, wt_timing.total);
} else {
fprintf(file.pFile, "%f %f %f %f %f %f %f\n",timing.setup_time1,timing.kernel_time1,timing.compose_time1,timing.reduce_time1,timing.fftw_time1,timing.phase_time1,timing.tot_time1);
fprintf(file.pFile, "%f %f %f %f %f %f %f\n",
pr_timing.setup, pr_timing.kernel, pr_timing.compose,
pr_timing.reduce,pr_timing.fftw,pr_timing.phase, pr_timing.total);
}
fclose(file.pFile);
}