From 71bccbe1d9efa5bf457364542d99e1730437f9c0 Mon Sep 17 00:00:00 2001
From: nandhanas
Date: Fri, 27 Jan 2023 19:32:13 +0100
Subject: [PATCH] Binomial communication added but not working

---
 allvars.c  |   2 +
 allvars.h  |   2 +
 gridding.c |  21 ++++---
 proto.h    |   4 +-
 reduce.c   | 181 +++++++++++++++++++++++++++++++++++++++++++++++++----
 5 files changed, 187 insertions(+), 23 deletions(-)

diff --git a/allvars.c b/allvars.c
index 461de29..f74e55f 100644
--- a/allvars.c
+++ b/allvars.c
@@ -45,3 +45,5 @@ int Ntasks_local;
 double **swins = NULL;
 int **cwins = NULL;
 int max_level = 0;
+double *end_4, *end_reduce;
+int dsize_4, iter=0;
diff --git a/allvars.h b/allvars.h
index 34e19ed..34653a2 100644
--- a/allvars.h
+++ b/allvars.h
@@ -268,3 +268,5 @@ extern blocks_t blocks;
 extern double **swins;
 extern int **cwins;
 extern int max_level;
+extern double *end_4, *end_reduce;
+extern int dsize_4, iter;
diff --git a/gridding.c b/gridding.c
index a2add15..aa19541 100644
--- a/gridding.c
+++ b/gridding.c
@@ -136,12 +136,14 @@ void gridding_data()
   copy_win_ptrs( (void***)&swins, Me.swins, Me.Ntasks[Me.SHMEMl] );
   copy_win_ptrs( (void***)&cwins, Me.scwins, Me.Ntasks[Me.SHMEMl] );
 
-  int dsize_4 = (size_of_grid/4)*4;
-  double *end_4 = (double*)Me.win.ptr + dsize_4;
-  double *end = (double*)Me.win.ptr + datasize;
+  MPI_Barrier(MPI_COMM_WORLD);
+  // printf("The no of task in shared memory %d, host %d\n", Me.Ntasks[Me.SHMEMl], Me.Ntasks[myHOST]);
+  dsize_4 = (size_of_grid/4)*4;
+  end_4 = (double*)Me.win.ptr + dsize_4;
+  end_reduce = (double*)Me.win.ptr + size_of_grid;
 
   while( (1<< (++max_level) ) < Me.Ntasks[Me.SHMEMl] );
-
+  // printf("Max level %d my rank %d\n",max_level, global_rank);
   *(int*)Me.win_ctrl.ptr = DATA_FREE;
   *((int*)Me.win_ctrl.ptr+1) = FINAL_FREE;
   MPI_Barrier(*(Me.COMM[myHOST]));
@@ -298,7 +300,8 @@ void gridding_data()
 
 #ifdef ONE_SIDE
 
-            printf("One Side communication active\n");
+            printf("One Side communication active");
+            //printf("One Side communication active my rank %d target rank %d\n", global_rank, target_rank);
            //MPI_Win_lock(MPI_LOCK_SHARED,target_rank,0,slabwin);
            //MPI_Accumulate(gridss,size_of_grid,MPI_DOUBLE,target_rank,0,size_of_grid,MPI_DOUBLE,MPI_SUM,slabwin);
 
@@ -312,7 +315,7 @@ void gridding_data()
            int res = reduce_binomial(target_rank);
 #endif
 
-           // printf("I'm outside reduce global rank %d target rank %d local_rank %d \n", global_rank, target_rank, Me.Rank[HOSTS]);
+           //printf("I'm outside reduce global rank %d target rank %d local_rank %d \n", global_rank, target_rank, Me.Rank[HOSTS]);
 
 #else    // relates to #ifdef ONE_SIDE
 
@@ -349,10 +352,10 @@ void gridding_data()
 
     MPI_Barrier(MPI_COMM_WORLD);
 
-    if(Me.Nhosts>1)
+    // if(Me.Nhosts>1)
       memcpy(Me.sfwins[Me.Rank[myHOST]].ptr, grid, size_of_grid);
-    else
-      memcpy(Me.sfwins[global_rank].ptr, grid, size_of_grid);
+    // else
+    //   memcpy(Me.sfwins[global_rank].ptr, grid, size_of_grid);
 
 #endif
 
diff --git a/proto.h b/proto.h
index ba69b4c..80c7519 100644
--- a/proto.h
+++ b/proto.h
@@ -30,4 +30,6 @@ void write_result();
 
 int reduce_ring (int );
 int reduce_binomial (int );
-int shmem_reduce_ring ( int, int, int_t, map_t *, double * restrict, blocks_t *);
+int shmem_reduce_ring ( int, int, int_t, map_t *, double * restrict, blocks_t *);
+int shmem_reduce_binomial( int, int, int, map_t *, double * restrict, int );
+
diff --git a/reduce.c b/reduce.c
index dd06166..b5e2b29 100644
--- a/reduce.c
+++ b/reduce.c
@@ -165,6 +165,148 @@ int reduce_ring (int target_rank)
   return 0;
 }
 
+int reduce_binomial ( int target_rank )
+{
+
+
+  /* -------------------------------------------------
+   *
+   *  USE THE SHARED MEMORY WINDOWS TO REDUCE DATA
+   * ------------------------------------------------- */
+
+  {
+    timing.rtime  = CPU_TIME_rt;
+    timing.ttotal = CPU_TIME_pr;
+   #pragma omp parallel num_threads(2)
+    {
+      int thid = omp_get_thread_num();
+
+
+
+      if( thid == 1 )
+        {
+          // check that the data in Me.win
+          // can be overwritten by new data
+          // -> this condition is true when
+          // win_ctrl has the value "DATA_FREE"
+
+          ACQUIRE_CTRL((int*)Me.win_ctrl.ptr, DATA_FREE, timing.tspin, != )
+          memcpy(Me.win.ptr, gridss, sizeof(gridss));
+          if( Me.Ntasks[myHOST] > 1 )
+            {
+              int value = target_rank * (max_level+1);
+              atomic_store((int*)Me.win_ctrl.ptr, value);
+
+              double start = CPU_TIME_tr;
+              // printf("Im before shmem_reduce my rank %d target rank %d size_of_grid %d\n", global_rank, target_rank, size_of_grid);
+              int ret = shmem_reduce_binomial( target_rank, target_rank, size_of_grid, &Me, (double*)Me.win.ptr, max_level );
+              //printf("Im after shmem_reduce my rank %d target rank %d\n", global_rank, target_rank);
+              timing.treduce += CPU_TIME_tr - start;
+              if( ret != 0 )
+                {
+                  printf("Task %d : shared-memory reduce for sector %d has returned "
+                         "an error code %d : better stop here\n",
+                         global_rank, target_rank, ret );
+                  free(cwins);
+                  free(swins);
+                  numa_shutdown(global_rank, 0, &MYMPI_COMM_WORLD, &Me);
+                  MPI_Finalize();
+                }
+
+            }
+          else
+            atomic_store((int*)Me.win_ctrl.ptr, DATA_FREE);
+
+          int Im_target = (global_rank == target_rank);
+          int Im_NOT_target_but_Im_master = (Me.Nhosts>1) &&
+            (Me.Ranks_to_host[target_rank]!=Me.myhost) && (Me.Rank[myHOST]==0);
+
+          if( Im_target || Im_NOT_target_but_Im_master )
+            {
+              ACQUIRE_CTRL((int*)Me.win_ctrl.ptr+1, FINAL_FREE, timing.tspin, != );
+              double start = CPU_TIME_tr;
+              double * restrict final = (double*)Me.win.ptr + size_of_grid;
+              double * restrict run   = (double*)Me.win.ptr;
+              for( ; run < end_4; run += 4, final += 4 ) {
+                *final     = *run;
+                *(final+1) = *(run+1);
+                *(final+2) = *(run+2);
+                *(final+3) = *(run+3); }
+              for( ; run < end_reduce; run++, final++ )
+                *final = *run;
+              timing.tmovmemory += CPU_TIME_tr - start;
+              printf("Im inside I'm target my rank %d target rank %d\n", global_rank, target_rank);
+
+              atomic_store(((int*)Me.win_ctrl.ptr+1), target_rank);
+              atomic_store((int*)Me.win_ctrl.ptr, DATA_FREE);
+              atomic_thread_fence(memory_order_release);
+            }
+
+        }
+      else
+        {
+          //MPI_Barrier(*Me.COMM[myHOST]);
+          /*
+           *
+           * REDUCE AMONG HOSTS
+           */
+
+          if ( (Me.Nhosts > 1) && (Me.Rank[myHOST] == 0) )
+            {
+              double start = CPU_TIME_tr;
+
+              int target_task       = Me.Ranks_to_host[target_rank];
+              int Im_hosting_target = Me.Ranks_to_host[target_rank] == Me.myhost;
+              int target            = 0;
+
+              if( Im_hosting_target )
+                while( (target < Me.Ntasks[Me.SHMEMl]) &&
+                       (Me.Ranks_to_myhost[target] != target_rank) )
+                  target++;
+
+
+              int *ctrl_ptr = ( target == 0 ? (int*)Me.win_ctrl.ptr+1 : ((int*)Me.scwins[target].ptr)+1 );
+
+              double *send_buffer = ( Im_hosting_target ? (double*)Me.swins[target].ptr+size_of_grid :
+                                      (double*)Me.win.ptr+size_of_grid );
+              double *recv_buffer = ( Im_hosting_target ? (double*)Me.sfwins[target].ptr : NULL );
+
+              timingmpi.tmpi_setup += CPU_TIME_tr - start;
+
+              double tstart = CPU_TIME_tr;
+
+              ACQUIRE_CTRL( ctrl_ptr, target_rank, timing.tspin, != );
+
+              timingmpi.tmpi_reduce_wait += CPU_TIME_tr - tstart;
+
+              tstart = CPU_TIME_tr;
+              MPI_Ireduce(send_buffer, recv_buffer, size_of_grid, MPI_DOUBLE, MPI_SUM, target_task, COMM[HOSTS], &requests[target_rank]);
+              timingmpi.tmpi_reduce += CPU_TIME_tr - tstart;
+
+              MPI_Wait( &requests[target_rank], MPI_STATUS_IGNORE );
+              atomic_store(ctrl_ptr, FINAL_FREE);
+
+              iter++;
+              timingmpi.tmpi += CPU_TIME_tr - start;
+              fflush(stdout);
+            }
+
+        }   // closes thread 0
+      atomic_thread_fence(memory_order_release);
+
+
+    }
+    timing.rtime  = CPU_TIME_rt - timing.rtime;
+    timing.ttotal = CPU_TIME_pr - timing.ttotal;
+
+    free(cwins);
+    free(swins);
+
+
+  }
+
+  return 0;
+}
+
+
 int shmem_reduce_ring( int sector, int target_rank, int_t size_of_grid, map_t *Me, double * restrict data, blocks_t *blocks )
 {
@@ -211,7 +353,7 @@ int shmem_reduce_ring( int sector, int target_rank, int_t size_of_grid, map_t *M
   for(int t = 0; t < end; t++)
     {
       // prepare pointers for the summation loop
-      int_t dsize = blocks->Bsize[myblock];
+      int_t dsize = blocks->Bsize[myblock];
       double * restrict my_source = (double*)Me->swins[target].ptr + blocks->Bstart[myblock];
       double * restrict my_target = data + blocks->Bstart[myblock];
       my_source = __builtin_assume_aligned( my_source, 8);
@@ -315,9 +457,10 @@ int shmem_reduce_ring( int sector, int target_rank, int_t size_of_grid, map_t *M
   return 0;
 }
 
-int shmem_reduce( int sector, int target_rank, int dsize, map_t *Me, double * restrict data, int max_level )
+int shmem_reduce_binomial( int sector, int target_rank, int dsize, map_t *Me, double * restrict data, int max_level )
 {
 
+  //printf("Im inside shmem_reduce_binomial my rank %d target rank %d\n", global_rank, target_rank);
   int local_rank = Me->Rank[Me->SHMEMl];
   int target_rank_on_myhost = -1;
@@ -354,7 +497,7 @@ int shmem_reduce( int sector, int target_rank, int dsize, map_t *Me, double * re
       if( target_rank_on_myhost == Me->Ntasks[Me->SHMEMl] )
        return -1;
     }
-
+  // printf("Im after ist if shmem_reduce_binomial my rank %d target rank %d\n", global_rank, target_rank);
   // Here we start the reduction
   //
@@ -364,6 +507,7 @@ int shmem_reduce( int sector, int target_rank, int dsize, map_t *Me, double * re
   int my_maxlevel = max_level;
   while( (local_rank % (1<win_ctrl.ptr, my_maxlevel);
@@ -383,13 +527,13 @@ int shmem_reduce( int sector, int target_rank, int dsize, map_t *Me, double * re
       //
        {
          int I_m_target = local_rank < source;
-
-         // prepare pointers for the summation loop
-         double * restrict my_source = ( I_m_target ? swins[source] : data + dsize2);
+         // printf("Im inside the 1st if of reduction for loop inside shmem_reduce_binomial my rank %d target rank %d I'm target %d source %d\n", global_rank, target_rank, I_m_target, source);
+         // prepare pointers for the summation loop
+         double * restrict my_source = ( I_m_target ? swins[source] : data + dsize2);
          double * restrict my_target = ( I_m_target ? data : swins[source]+dsize2 );
          my_source = __builtin_assume_aligned( my_source, 8);
          my_target = __builtin_assume_aligned( my_target, 8);
-
+         // printf("Im inside the 1st if of reduction after source and target assignment for loop inside shmem_reduce_binomial my rank %d target rank %d\n", global_rank, target_rank);
 #if defined(DEBUG)
          int my_start = ( I_m_target ? 0 : dsize2);
 #endif
@@ -398,7 +542,7 @@ int shmem_reduce( int sector, int target_rank, int dsize, map_t *Me, double * re
          // are ready to be used (control tag must have
          // the value of the current sector )
          int ctrl = sector*(max_level+1)+l;
          ACQUIRE_CTRL( cwins[source], ctrl, timing.tspin_in, < );
-
+         // printf("Im inside the 1st if of reduction after aquire ctrl for loop inside shmem_reduce_binomial my rank %d target rank %d\n", global_rank, target_rank);
          // performs the summation loop
          //
          double * my_end = my_source+dsize2;
@@ -414,23 +558,31 @@ int shmem_reduce( int sector, int target_rank, int dsize, map_t *Me, double * re
          summations += (!sector)*dsize2;
          if( dsize2 < 16 )
            {
+             // printf("Im inside the if dsize2<16 of reduction after aquire ctrl for loop inside shmem_reduce_binomial my rank %d target rank %d\n", global_rank, target_rank);
              for( ; my_source < my_end; my_source++, my_target++)
                *my_target += *my_source;
+             // printf("Im inside the if dsize2<16 of reduction after for loop inside shmem_reduce_binomial my rank %d target rank %d\n", global_rank, target_rank);
            }
          else
-           {
+           {
+             // printf("Im inside the else dsize2<16 of reduction after aquire ctrl for loop inside shmem_reduce_binomial my rank %d target rank %d\n", global_rank, target_rank);
              double * my_end_4 = my_source+dsize2_4;
-
+             // printf("Im inside the else dsize2<16 of reduction after my_end_4 for loop inside shmem_reduce_binomial my rank %d target rank %d\n", global_rank, target_rank);
              for( ; my_source < my_end_4; my_source+=4, my_target+=4 ) {
+               // printf("I'm inside the beginning of the for loop for adding source my source %lf rank %d target rank %d\n",*my_source, global_rank, target_rank);
-               *my_target += *my_source;
+               *my_target += *my_source;
                *(my_target+1) += *(my_source+1);
                *(my_target+2) += *(my_source+2);
-               *(my_target+3) += *(my_source+3);
+               *(my_target+3) += *(my_source+3);
+               // printf("I'm inside the for loop for adding source my source %lf rank %d target rank %d\n",*my_source, global_rank, target_rank);
              }
+             // printf("Im inside the else dsize2<16 of reduction after 1st for loop inside shmem_reduce_binomial my rank %d target rank %d\n", global_rank, target_rank);
              for( ; my_source < my_end; my_source++, my_target++)
                *my_target += *my_source;
+             // printf("Im inside the else dsize2<16 of reduction after 2st for loop inside shmem_reduce_binomial my rank %d target rank %d\n", global_rank, target_rank);
            }
+         //printf("Im inside the 1st if of reduction after summation for loop inside shmem_reduce_binomial my rank %d target rank %d\n", global_rank, target_rank);
          timing.tsum += CPU_TIME_tr - tstart;
 #if defined(USE_PAPI)
          if( sector == 0 )
@@ -450,7 +602,7 @@ int shmem_reduce( int sector, int target_rank, int dsize, map_t *Me, double * re
 
          dprintf(1,0,0, "- SEC %d l %d t %d <-> %d done : %d\n",
                  sector, l, local_rank, source, *(int*)(Me->win_ctrl.ptr));
-
+         //printf("Im at the end of reduction for loop inside shmem_reduce_binomial my rank %d target rank %d\n", global_rank, target_rank);
        }
 
       else
@@ -462,6 +614,7 @@ int shmem_reduce( int sector, int target_rank, int dsize, map_t *Me, double * re
          atomic_thread_fence(memory_order_release);
        }
 
+  // printf("Im after reduction for loop inside shmem_reduce_binomial my rank %d target rank %d\n", global_rank, target_rank);
 
   if ( target_rank_on_myhost > 0 )
     {
@@ -472,8 +625,10 @@ int shmem_reduce( int sector, int target_rank, int dsize, map_t *Me, double * re
       temp = (void*)cwins[target_rank_on_myhost];
       cwins[target_rank_on_myhost] = cwins[0];
       cwins[0] = (int*)temp;
+      // printf("Im inside targetrankonmyhost %d inside shmem_reduce_binomial my rank %d target rank %d\n", target_rank_on_myhost ,global_rank, target_rank);
     }
 
   return 0;
 }
+
-- 
GitLab
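
For reference, here is a minimal, self-contained sketch of the binomial-tree pairing that reduce_binomial / shmem_reduce_binomial aim for, written with plain MPI point-to-point calls. It is illustrative only: the function name binomial_reduce, the buffer buf, its length n and the temporary tmp are not part of the patch, and the patch itself performs the intra-host step through shared-memory windows (ACQUIRE_CTRL spin-waits and atomic_store control tags) rather than MPI_Send/MPI_Recv. At level l a rank whose low l bits are zero accumulates data from the partner rank obtained by setting bit l; every other rank sends once, to the partner obtained by clearing its lowest set bit, and then drops out.

#include <mpi.h>
#include <stdlib.h>

/* Illustrative sketch, not the patch's implementation: binomial-tree
 * reduction of n doubles onto rank `root` over communicator `comm`.  */
void binomial_reduce(double *buf, int n, int root, MPI_Comm comm)
{
    int rank, size;
    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);

    /* rotate ranks so that `root` plays the role of virtual rank 0 */
    int vrank = (rank - root + size) % size;
    double *tmp = malloc(n * sizeof(double));

    for (int level = 0; (1 << level) < size; level++) {
        if (vrank % (1 << (level + 1)) != 0)
            break;                               /* stop receiving; this rank sends below */
        int partner = vrank | (1 << level);      /* virtual rank of my sender at this level */
        if (partner >= size)
            continue;                            /* no partner (non-power-of-two size) */
        int src = (partner + root) % size;       /* back to the real rank */
        MPI_Recv(tmp, n, MPI_DOUBLE, src, level, comm, MPI_STATUS_IGNORE);
        for (int i = 0; i < n; i++)              /* accumulate the partner's contribution */
            buf[i] += tmp[i];
    }

    if (vrank != 0) {
        /* sender: the receiver is obtained by clearing my lowest set bit */
        int level = 0;
        while ((vrank & (1 << level)) == 0) level++;
        int dst = ((vrank & ~(1 << level)) + root) % size;
        MPI_Send(buf, n, MPI_DOUBLE, dst, level, comm);
    }
    free(tmp);
}

With P ranks this takes ceil(log2 P) levels instead of the P-1 steps of a ring reduction, which is the motivation for the switch the patch is experimenting with; the patch additionally confines this tree to the ranks of one shared-memory host and lets the per-host masters finish with MPI_Ireduce across hosts.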