Skip to content
Snippets Groups Projects
Commit 71bccbe1 authored by Nandhana Sakhtivel's avatar Nandhana Sakhtivel
Browse files

Binomial Communication added but not working

parent 34c4d74f
No related branches found
No related tags found
No related merge requests found
......@@ -45,3 +45,5 @@ int Ntasks_local;
double **swins = NULL;
int **cwins = NULL;
int max_level = 0;
double *end_4, *end_reduce;
int dsize_4, iter=0;
......@@ -268,3 +268,5 @@ extern blocks_t blocks;
extern double **swins;
extern int **cwins;
extern int max_level;
extern double *end_4, *end_reduce;
extern int dsize_4, iter;
......@@ -136,12 +136,14 @@ void gridding_data()
copy_win_ptrs( (void***)&swins, Me.swins, Me.Ntasks[Me.SHMEMl] );
copy_win_ptrs( (void***)&cwins, Me.scwins, Me.Ntasks[Me.SHMEMl] );
int dsize_4 = (size_of_grid/4)*4;
double *end_4 = (double*)Me.win.ptr + dsize_4;
double *end = (double*)Me.win.ptr + datasize;
MPI_Barrier(MPI_COMM_WORLD);
// printf("The no of task in shared memory %d, host %d\n", Me.Ntasks[Me.SHMEMl], Me.Ntasks[myHOST]);
dsize_4 = (size_of_grid/4)*4;
end_4 = (double*)Me.win.ptr + dsize_4;
end_reduce = (double*)Me.win.ptr + size_of_grid;
while( (1<< (++max_level) ) < Me.Ntasks[Me.SHMEMl] );
// printf("Max level %d my rank %d\n",max_level, global_rank);
*(int*)Me.win_ctrl.ptr = DATA_FREE;
*((int*)Me.win_ctrl.ptr+1) = FINAL_FREE;
MPI_Barrier(*(Me.COMM[myHOST]));
......@@ -298,7 +300,8 @@ void gridding_data()
#ifdef ONE_SIDE
printf("One Side communication active\n");
printf("One Side communication active");
//printf("One Side communication active my rank %d target rank %d\n", global_rank, target_rank);
//MPI_Win_lock(MPI_LOCK_SHARED,target_rank,0,slabwin);
//MPI_Accumulate(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,MPI_SUM,slabwin);
......@@ -349,10 +352,10 @@ void gridding_data()
MPI_Barrier(MPI_COMM_WORLD);
if(Me.Nhosts>1)
// if(Me.Nhosts>1)
memcpy(Me.sfwins[Me.Rank[myHOST]].ptr, grid, size_of_grid);
else
memcpy(Me.sfwins[global_rank].ptr, grid, size_of_grid);
// else
// memcpy(Me.sfwins[global_rank].ptr, grid, size_of_grid);
#endif
......
......@@ -31,3 +31,5 @@ void write_result();
int reduce_ring (int );
int reduce_binomial (int );
int shmem_reduce_ring ( int, int, int_t, map_t *, double * restrict, blocks_t *);
int shmem_reduce_binomial( int, int, int, map_t *, double * restrict, int );
......@@ -165,6 +165,148 @@ int reduce_ring (int target_rank)
return 0;
}
// Reduce the local gridded data onto target_rank using a binomial tree
// over the node-local shared-memory windows (thread 1), followed by an
// inter-host MPI_Ireduce performed by the hosts' masters (thread 0).
// Returns 0; on a shared-memory reduce failure it prints a diagnostic
// and shuts the job down.
// NOTE(review): the commit message says this path is "not working";
// the review notes below mark the most likely culprits — confirm each.
int reduce_binomial ( int target_rank )
{
/* -------------------------------------------------
*
* USE THE SHARED MEMORY WINDOWS TO REDUCE DATA
* ------------------------------------------------- */
{
timing.rtime = CPU_TIME_rt;
timing.ttotal = CPU_TIME_pr;
// Exactly two threads: thread 1 drives the intra-node shared-memory
// reduction; thread 0 handles the cross-host MPI reduction.
#pragma omp parallel num_threads(2)
{
int thid = omp_get_thread_num();
if( thid == 1 )
{
// Spin until the data window may be overwritten, i.e. until
// win_ctrl holds the value DATA_FREE.
ACQUIRE_CTRL((int*)Me.win_ctrl.ptr, DATA_FREE, timing.tspin, != )
// Stage this task's local grid into the shared-memory window.
// NOTE(review): sizeof(gridss) is the size of the object `gridss`
// itself — if gridss is a pointer (not a fixed array) this copies
// only sizeof(double*) bytes instead of size_of_grid doubles;
// compare with the `size_of_grid` used everywhere else — TODO confirm.
memcpy(Me.win.ptr, gridss, sizeof(gridss));
if( Me.Ntasks[myHOST] > 1 )
{
// Publish the control tag for this sector: the binomial levels
// for target_rank occupy the range [value, value+max_level].
int value = target_rank * (max_level+1);
atomic_store((int*)Me.win_ctrl.ptr, value);
double start = CPU_TIME_tr;
// printf("Im before shmem_reduce my rank %d target rank %d size_of_grid %d\n", global_rank, target_rank, size_of_grid);
int ret = shmem_reduce_binomial( target_rank, target_rank, size_of_grid, &Me, (double*)Me.win.ptr, max_level );
//printf("Im after shmem_reduce my rank %d target rank %d\n", global_rank, target_rank);
timing.treduce += CPU_TIME_tr - start;
if( ret != 0 )
{
// NOTE(review): after MPI_Finalize() execution falls through and
// the code below still runs (and thread 0 may still be in MPI
// calls) — an exit()/MPI_Abort() is probably intended here.
printf("Task %d : shared-memory reduce for sector %d has returned "
"an error code %d : better stop here\n",
global_rank, target_rank, ret );
free(cwins);
free(swins);
numa_shutdown(global_rank, 0, &MYMPI_COMM_WORLD, &Me);
MPI_Finalize();
}
}
else
// Single task on this host: nothing to reduce locally, release the window.
atomic_store((int*)Me.win_ctrl.ptr, DATA_FREE);
// The final copy below is done by the reduction target itself, or —
// when the target lives on another host — by this host's local master,
// which must stage the result for the inter-host reduce.
int Im_target = (global_rank == target_rank);
int Im_NOT_target_but_Im_master = (Me.Nhosts>1) &&
(Me.Ranks_to_host[target_rank]!=Me.myhost) && (Me.Rank[myHOST]==0);
if( Im_target || Im_NOT_target_but_Im_master )
{
// Wait until the "final" half of the window is free for writing.
ACQUIRE_CTRL((int*)Me.win_ctrl.ptr+1, FINAL_FREE, timing.tspin, != );
double start = CPU_TIME_tr;
// Copy the reduced data from the first half of the window into the
// staging area at offset size_of_grid; manually unrolled by 4 up to
// end_4 (the largest multiple-of-4 boundary), remainder element-wise.
double * restrict final = (double*)Me.win.ptr + size_of_grid;
double * restrict run = (double*)Me.win.ptr;
for( ; run < end_4; run += 4, final += 4 ) {
*final = *run;
*(final+1) = *(run+1);
*(final+2) = *(run+2);
*(final+3) = *(run+3); }
for( ; run < end_reduce; run++, final++ )
*final = *run;
timing.tmovmemory += CPU_TIME_tr - start;
// NOTE(review): leftover debug print — remove once working.
printf("Im inside I'm target my rank %d target rank %d\n", global_rank, target_rank);
// Signal thread 0 that the staged data for target_rank is ready,
// then release the data window for the next sector.
atomic_store(((int*)Me.win_ctrl.ptr+1), target_rank);
atomic_store((int*)Me.win_ctrl.ptr, DATA_FREE);
atomic_thread_fence(memory_order_release);
}
}
else
{
//MPI_Barrier(*Me.COMM[myHOST]);
/*
*
* REDUCE AMONG HOSTS
*/
// Thread 0, only on each host's local master and only when there is
// more than one host: combine the per-host partial results.
if ( (Me.Nhosts > 1) && (Me.Rank[myHOST] == 0) )
{
double start = CPU_TIME_tr;
int target_task = Me.Ranks_to_host[target_rank];
int Im_hosting_target = Me.Ranks_to_host[target_rank] == Me.myhost;
// If the global target lives on this host, find its local rank so we
// read from / write to that task's windows instead of our own.
int target = 0;
if( Im_hosting_target )
while( (target < Me.Ntasks[Me.SHMEMl]) &&
(Me.Ranks_to_myhost[target] != target_rank) )
target++;
int *ctrl_ptr = ( target == 0 ? (int*)Me.win_ctrl.ptr+1 : ((int*)Me.scwins[target].ptr)+1 );
// Send from the staging half (offset size_of_grid) of the relevant
// window; only the host that owns the target receives.
double *send_buffer = ( Im_hosting_target ? (double*)Me.swins[target].ptr+size_of_grid :
(double*)Me.win.ptr+size_of_grid );
double *recv_buffer = ( Im_hosting_target ? (double*)Me.sfwins[target].ptr : NULL );
timingmpi.tmpi_setup += CPU_TIME_tr - start;
double tstart = CPU_TIME_tr;
// Wait for thread 1 (possibly of another local task) to flag the
// staged data as ready (tag == target_rank).
ACQUIRE_CTRL( ctrl_ptr, target_rank, timing.tspin, != );
timingmpi.tmpi_reduce_wait += CPU_TIME_tr - tstart;
tstart = CPU_TIME_tr;
// NOTE(review): MPI_Ireduce immediately followed by MPI_Wait is
// effectively a blocking MPI_Reduce; also `target_task` (a host id
// from Ranks_to_host) is used as the root rank in COMM[HOSTS] —
// verify that mapping is intended.
MPI_Ireduce(send_buffer, recv_buffer, size_of_grid, MPI_DOUBLE, MPI_SUM, target_task, COMM[HOSTS], &requests[target_rank]);
timingmpi.tmpi_reduce += CPU_TIME_tr - tstart;
MPI_Wait( &requests[target_rank], MPI_STATUS_IGNORE );
// Release the "final" half of the window for the next round.
atomic_store(ctrl_ptr, FINAL_FREE);
iter++;
timingmpi.tmpi += CPU_TIME_tr - start;
fflush(stdout);
}
} // closes thread 0
atomic_thread_fence(memory_order_release);
}
timing.rtime = CPU_TIME_rt - timing.rtime;
timing.ttotal = CPU_TIME_pr - timing.ttotal;
// NOTE(review): cwins/swins are file-scope window-pointer arrays filled
// once in gridding_data(); freeing them on every reduce call makes any
// subsequent call use dangling pointers — confirm whether this free
// belongs in a teardown path instead.
free(cwins);
free(swins);
}
return 0;
}
int shmem_reduce_ring( int sector, int target_rank, int_t size_of_grid, map_t *Me, double * restrict data, blocks_t *blocks )
{
......@@ -315,9 +457,10 @@ int shmem_reduce_ring( int sector, int target_rank, int_t size_of_grid, map_t *M
return 0;
}
int shmem_reduce( int sector, int target_rank, int dsize, map_t *Me, double * restrict data, int max_level )
int shmem_reduce_binomial( int sector, int target_rank, int dsize, map_t *Me, double * restrict data, int max_level )
{
//printf("Im inside shmem_reduce_binomial my rank %d target rank %d\n", global_rank, target_rank);
int local_rank = Me->Rank[Me->SHMEMl];
int target_rank_on_myhost = -1;
......@@ -354,7 +497,7 @@ int shmem_reduce( int sector, int target_rank, int dsize, map_t *Me, double * re
if( target_rank_on_myhost == Me->Ntasks[Me->SHMEMl] )
return -1;
}
// printf("Im after ist if shmem_reduce_binomial my rank %d target rank %d\n", global_rank, target_rank);
// Here we start the reduction
//
......@@ -364,6 +507,7 @@ int shmem_reduce( int sector, int target_rank, int dsize, map_t *Me, double * re
int my_maxlevel = max_level;
while( (local_rank % (1<<my_maxlevel)) ) my_maxlevel--;
//printf("my max_level %d max level %d my rank %d\n", my_maxlevel, max_level, global_rank);
dprintf(1, 0, 0, "@ SEC %d t %d (%d), %d %d\n",
sector, local_rank, oRank, *(int*)Me->win_ctrl.ptr, my_maxlevel);
......@@ -383,13 +527,13 @@ int shmem_reduce( int sector, int target_rank, int dsize, map_t *Me, double * re
//
{
int I_m_target = local_rank < source;
// printf("Im inside the 1st if of reduction for loop inside shmem_reduce_binomial my rank %d target rank %d I'm target %d source %d\n", global_rank, target_rank, I_m_target, source);
// prepare pointers for the summation loop
double * restrict my_source = ( I_m_target ? swins[source] : data + dsize2);
double * restrict my_target = ( I_m_target ? data : swins[source]+dsize2 );
my_source = __builtin_assume_aligned( my_source, 8);
my_target = __builtin_assume_aligned( my_target, 8);
// printf("Im inside the 1st if of reduction after source and target assignment for loop inside shmem_reduce_binomial my rank %d target rank %d\n", global_rank, target_rank);
#if defined(DEBUG)
int my_start = ( I_m_target ? 0 : dsize2);
#endif
......@@ -398,7 +542,7 @@ int shmem_reduce( int sector, int target_rank, int dsize, map_t *Me, double * re
// are ready to be used (control tag must have // the value of the current sector )
int ctrl = sector*(max_level+1)+l;
ACQUIRE_CTRL( cwins[source], ctrl, timing.tspin_in, < );
// printf("Im inside the 1st if of reduction after aquire ctrl for loop inside shmem_reduce_binomial my rank %d target rank %d\n", global_rank, target_rank);
// performs the summation loop
//
double * my_end = my_source+dsize2;
......@@ -414,23 +558,31 @@ int shmem_reduce( int sector, int target_rank, int dsize, map_t *Me, double * re
summations += (!sector)*dsize2;
if( dsize2 < 16 )
{
// printf("Im inside the if dsize2<16 of reduction after aquire ctrl for loop inside shmem_reduce_binomial my rank %d target rank %d\n", global_rank, target_rank);
for( ; my_source < my_end; my_source++, my_target++)
*my_target += *my_source;
// printf("Im inside the if dsize2<16 of reduction after for loop inside shmem_reduce_binomial my rank %d target rank %d\n", global_rank, target_rank);
}
else
{
// printf("Im inside the else dsize2<16 of reduction after aquire ctrl for loop inside shmem_reduce_binomial my rank %d target rank %d\n", global_rank, target_rank);
double * my_end_4 = my_source+dsize2_4;
// printf("Im inside the else dsize2<16 of reduction after my_end_4 for loop inside shmem_reduce_binomial my rank %d target rank %d\n", global_rank, target_rank);
for( ; my_source < my_end_4; my_source+=4, my_target+=4 )
{
// printf("I'm inside the beginning of the for loop for adding source my source %lf rank %d target rank %d\n",*my_source, global_rank, target_rank);
*my_target += *my_source;
*(my_target+1) += *(my_source+1);
*(my_target+2) += *(my_source+2);
*(my_target+3) += *(my_source+3);
// printf("I'm inside the for loop for adding source my source %lf rank %d target rank %d\n",*my_source, global_rank, target_rank);
}
// printf("Im inside the else dsize2<16 of reduction after 1st for loop inside shmem_reduce_binomial my rank %d target rank %d\n", global_rank, target_rank);
for( ; my_source < my_end; my_source++, my_target++)
*my_target += *my_source;
// printf("Im inside the else dsize2<16 of reduction after 2st for loop inside shmem_reduce_binomial my rank %d target rank %d\n", global_rank, target_rank);
}
//printf("Im inside the 1st if of reduction after summation for loop inside shmem_reduce_binomial my rank %d target rank %d\n", global_rank, target_rank);
timing.tsum += CPU_TIME_tr - tstart;
#if defined(USE_PAPI)
if( sector == 0 )
......@@ -450,7 +602,7 @@ int shmem_reduce( int sector, int target_rank, int dsize, map_t *Me, double * re
dprintf(1,0,0, "- SEC %d l %d t %d <-> %d done : %d\n",
sector, l, local_rank, source, *(int*)(Me->win_ctrl.ptr));
//printf("Im at the end of reduction for loop inside shmem_reduce_binomial my rank %d target rank %d\n", global_rank, target_rank);
}
else
......@@ -462,6 +614,7 @@ int shmem_reduce( int sector, int target_rank, int dsize, map_t *Me, double * re
atomic_thread_fence(memory_order_release);
}
// printf("Im after reduction for loop inside shmem_reduce_binomial my rank %d target rank %d\n", global_rank, target_rank);
if ( target_rank_on_myhost > 0 )
{
......@@ -472,8 +625,10 @@ int shmem_reduce( int sector, int target_rank, int dsize, map_t *Me, double * re
temp = (void*)cwins[target_rank_on_myhost];
cwins[target_rank_on_myhost] = cwins[0];
cwins[0] = (int*)temp;
// printf("Im inside targetrankonmyhost %d inside shmem_reduce_binomial my rank %d target rank %d\n", target_rank_on_myhost ,global_rank, target_rank);
}
return 0;
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment