From ea94cf8223ca181f814f3d0d9daf8354d78214ad Mon Sep 17 00:00:00 2001
From: "giovanni.lacopo"
Date: Fri, 8 Sep 2023 14:41:53 +0200
Subject: [PATCH] Memory bug fixing

---
 gridding_cpu.c    | 51 ++++++++++++++++++-------------------
 gridding_nccl.cpp | 55 +++++++++++++++++++++--------------------------
 gridding_rccl.cpp | 53 +++++++++++++++++++--------------------------
 3 files changed, 67 insertions(+), 92 deletions(-)

diff --git a/gridding_cpu.c b/gridding_cpu.c
index 1afc80c..dab71a9 100644
--- a/gridding_cpu.c
+++ b/gridding_cpu.c
@@ -84,37 +84,29 @@ void gridding_data()
 
   // find the largest value in histo_send[]
   //
-  uint Nsec = histo_send[0];
-  for (uint isector = 1; isector < nsectors; isector++)
-    Nsec = ( Nsec < histo_send[isector] ? histo_send[isector] : Nsec );
-
-  uint Nweightss = Nsec*metaData.polarisations;
-  uint Nvissec = Nweightss*metaData.freq_per_chan;
-
-  // allocate sector arrays
-  // note: we use the largest allocation among all sectors
-  //
-  unsigned long long int mem_size = (Nsec*3)*sizeof(double_t) + (Nvissec*2+Nweightss)*sizeof(float_t);
-  double_t *memory = (double*) malloc ( (Nsec*3)*sizeof(double_t) +
-                                        (Nvissec*2+Nweightss)*sizeof(float_t) );
-
-  if ( memory == NULL )
-    shutdown_wstacking(NOT_ENOUGH_MEM_STACKING, "Not enough memory for stacking", __FILE__, __LINE__);
-
-  double_t *uus = (double_t*) memory;
-  double_t *vvs = (double_t*) uus+Nsec;
-  double_t *wws = (double_t*) vvs+Nsec;
-  float_t *weightss = (float_t*)((double_t*)wws+Nsec);
-  float_t *visreals = (float_t*)weightss + Nweightss;
-  float_t *visimgs = (float_t*)visreals + Nvissec;
-
   for (uint isector = 0; isector < nsectors; isector++)
     {
       double start = CPU_TIME_wt;
 
-      memset( memory, 0, mem_size );
+      uint Nsec = histo_send[isector];
+      uint Nweightss = Nsec*metaData.polarisations;
+      uint Nvissec = Nweightss*metaData.freq_per_chan;
+      double_t *memory = (double*) malloc ( (Nsec*3)*sizeof(double_t) +
+                                            (Nvissec*2+Nweightss)*sizeof(float_t) );
+
+      if ( memory == NULL )
+        shutdown_wstacking(NOT_ENOUGH_MEM_STACKING, "Not enough memory for stacking", __FILE__, __LINE__);
+
+      double_t *uus = (double_t*) memory;
+      double_t *vvs = (double_t*) uus+Nsec;
+      double_t *wws = (double_t*) vvs+Nsec;
+      float_t *weightss = (float_t*)((double_t*)wws+Nsec);
+      float_t *visreals = (float_t*)weightss + Nweightss;
+      float_t *visimgs = (float_t*)visreals + Nvissec;
+
+
       // select data for this sector
       uint icount = 0;
@@ -183,14 +175,14 @@ void gridding_data()
           printf("Processing sector %ld\n",isector);
         #endif
 
-          start = CPU_TIME_wt;
-
           double *stacking_target_array;
           if ( size > 1 )
             stacking_target_array = gridss;
           else
             stacking_target_array = grid;
-
+
+          start = CPU_TIME_wt;
+
           //We have to call different GPUs per MPI task!!! [GL]
           wstack(param.num_w_planes,
                  Nsec,
@@ -259,10 +251,9 @@ void gridding_data()
           memset ( gridss, 0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) );
         }
+      free(memory);
     }
 
-  free( memory );
-
   if ( size > 1 )
     {
       double start = CPU_TIME_wt;
diff --git a/gridding_nccl.cpp b/gridding_nccl.cpp
index 412d99e..c1a246b 100644
--- a/gridding_nccl.cpp
+++ b/gridding_nccl.cpp
@@ -68,32 +68,8 @@ void gridding_data(){
   // find the largest value in histo_send[]
   //
-  uint Nsec = histo_send[0];
-  for (uint isector = 1; isector < nsectors; isector++)
-    Nsec = ( Nsec < histo_send[isector] ? histo_send[isector] : Nsec );
-
-  uint Nweightss = Nsec*metaData.polarisations;
-  uint Nvissec = Nweightss*metaData.freq_per_chan;
-
-  // allocate sector arrays
-  // note: we use the largest allocation among all sectors
-  //
-  unsigned long long int mem_size = (Nsec*3)*sizeof(double_t) + (Nvissec*2+Nweightss)*sizeof(float_t);
-  double_t *memory = (double*) malloc ( (Nsec*3)*sizeof(double_t) +
-                                        (Nvissec*2+Nweightss)*sizeof(float_t) );
-
-  if ( memory == NULL )
-    shutdown_wstacking(NOT_ENOUGH_MEM_STACKING, "Not enough memory for stacking", __FILE__, __LINE__);
-
-  double_t *uus = (double*) memory;
-  double_t *vvs = (double*) uus+Nsec;
-  double_t *wws = (double*) vvs+Nsec;
-  float_t *weightss = (float_t*)(wws+Nsec);
-  float_t *visreals = weightss + Nweightss;
-  float_t *visimgs = visreals + Nvissec;
-
-
-
+
+
   //Initialize nccl
 
   double * grid_gpu, *gridss_gpu;
@@ -129,7 +105,23 @@ void gridding_data(){
 
       double start = CPU_TIME_wt;
 
-      memset( memory, 0, mem_size );
+      uint Nsec = histo_send[isector];
+      uint Nweightss = Nsec*metaData.polarisations;
+      uint Nvissec = Nweightss*metaData.freq_per_chan;
+      double_t *memory = (double*) malloc ( (Nsec*3)*sizeof(double_t) +
+                                            (Nvissec*2+Nweightss)*sizeof(float_t) );
+
+      if ( memory == NULL )
+        shutdown_wstacking(NOT_ENOUGH_MEM_STACKING, "Not enough memory for stacking", __FILE__, __LINE__);
+
+      double_t *uus = (double_t*) memory;
+      double_t *vvs = (double_t*) uus+Nsec;
+      double_t *wws = (double_t*) vvs+Nsec;
+      float_t *weightss = (float_t*)((double_t*)wws+Nsec);
+      float_t *visreals = (float_t*)weightss + Nweightss;
+      float_t *visimgs = (float_t*)visreals + Nvissec;
+
+
       // select data for this sector
 
       uint icount = 0;
@@ -246,10 +238,10 @@ void gridding_data(){
 
           // int target_rank = (int)isector;  it implied that size >= nsectors
           int target_rank = (int)(isector % size);
 
-          start = CPU_TIME_wt;
-
           cudaStreamSynchronize(stream_reduce);
-
+
+          start = CPU_TIME_wt;
+
           ncclReduce(gridss_gpu, grid_gpu, size_of_grid, ncclDouble, ncclSum, target_rank, comm, stream_reduce);
 
           cudaStreamSynchronize(stream_reduce);
@@ -260,11 +252,12 @@ void gridding_data(){
           memset ( gridss, 0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) );
         }
+      free(memory);
     }
 
   //Copy data back from device to host (to be deleted in next steps)
-  free(memory);
+
   cudaMemcpyAsync(grid, grid_gpu, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), cudaMemcpyDeviceToHost, stream_reduce);
 
   MPI_Barrier(MPI_COMM_WORLD);
diff --git a/gridding_rccl.cpp b/gridding_rccl.cpp
index 2adeaf5..026878c 100644
--- a/gridding_rccl.cpp
+++ b/gridding_rccl.cpp
@@ -64,33 +64,7 @@ void gridding_data(){
 
   if ( rank == 0 )
     printf("RESOLUTION = %f rad, %f arcsec\n", resolution, resolution_asec);
 
-  // find the largest value in histo_send[]
-  //
-  uint Nsec = histo_send[0];
-  for (uint isector = 1; isector < nsectors; isector++)
-    Nsec = ( Nsec < histo_send[isector] ? histo_send[isector] : Nsec );
-
-  uint Nweightss = Nsec*metaData.polarisations;
-  uint Nvissec = Nweightss*metaData.freq_per_chan;
-
-  // allocate sector arrays
-  // note: we use the largest allocation among all sectors
-  //
-  unsigned long long int mem_size = (Nsec*3)*sizeof(double_t) + (Nvissec*2+Nweightss)*sizeof(float_t);
-  double_t *memory = (double*) malloc ( (Nsec*3)*sizeof(double_t) +
-                                        (Nvissec*2+Nweightss)*sizeof(float_t) );
-
-  if ( memory == NULL )
-    shutdown_wstacking(NOT_ENOUGH_MEM_STACKING, "Not enough memory for stacking", __FILE__, __LINE__);
-
-  double_ty *uus = (double*) memory;
-  double_ty *vvs = (double*) uus+Nsec;
-  double_ty *wws = (double*) vvs+Nsec;
-  float_ty *weightss = (float_ty*)(wws+Nsec);
-  float_ty *visreals = weightss + Nweightss;
-  float_ty *visimgs = visreals + Nvissec;
-
-
+
 
   //Initialize nccl
@@ -127,7 +101,23 @@ void gridding_data(){
 
      double start = CPU_TIME_wt;
 
-      memset( memory, 0, mem_size );
+      uint Nsec = histo_send[isector];
+      uint Nweightss = Nsec*metaData.polarisations;
+      uint Nvissec = Nweightss*metaData.freq_per_chan;
+      double_t *memory = (double*) malloc ( (Nsec*3)*sizeof(double_t) +
+                                            (Nvissec*2+Nweightss)*sizeof(float_t) );
+
+      if ( memory == NULL )
+        shutdown_wstacking(NOT_ENOUGH_MEM_STACKING, "Not enough memory for stacking", __FILE__, __LINE__);
+
+      double_t *uus = (double_t*) memory;
+      double_t *vvs = (double_t*) uus+Nsec;
+      double_t *wws = (double_t*) vvs+Nsec;
+      float_t *weightss = (float_t*)((double_t*)wws+Nsec);
+      float_t *visreals = (float_t*)weightss + Nweightss;
+      float_t *visimgs = (float_t*)visreals + Nvissec;
+
+
       // select data for this sector
 
       uint icount = 0;
@@ -244,10 +234,11 @@ void gridding_data(){
 
          // int target_rank = (int)isector;  it implied that size >= nsectors
          int target_rank = (int)(isector % size);
 
-         start = CPU_TIME_wt;
-
+         hipStreamSynchronize(stream_reduce);
+         start = CPU_TIME_wt;
+
          ncclReduce(gridss_gpu, grid_gpu, size_of_grid, ncclDouble, ncclSum, target_rank, comm, stream_reduce);
 
          hipStreamSynchronize(stream_reduce);
@@ -258,11 +249,11 @@ void gridding_data(){
          memset ( gridss, 0, 2*param.num_w_planes*xaxis*yaxis * sizeof(double) );
        }
+      free(memory);
     }
 
   //Copy data back from device to host (to be deleted in next steps)
-  free(memory);
 
   hipMemcpyAsync(grid, grid_gpu, 2*param.num_w_planes*xaxis*yaxis*sizeof(double), hipMemcpyDeviceToHost, stream_reduce);
 
   MPI_Barrier(MPI_COMM_WORLD);
-- 
GitLab
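
Reviewer note (not part of the patch): the sketch below is a minimal, self-contained illustration of the per-sector allocation pattern this patch switches to in all three files: size the buffer from histo_send[isector] inside the loop, carve the uus/vvs/wws/weightss/visreals/visimgs views out of one block, and free it at the end of the iteration. The NSECTORS, POLARISATIONS, and FREQ_PER_CHAN constants and the histo_send values are made-up placeholders, and the fill loops stand in for the real sector selection and wstack call; this is not the project's actual gridding code.

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-ins for the real metadata and sector histogram;
   the values below are illustrative only. */
#define NSECTORS       4
#define POLARISATIONS  4
#define FREQ_PER_CHAN  8

int main(void)
{
    /* Per-sector visibility counts (histo_send[] in the real code). */
    unsigned int histo_send[NSECTORS] = { 10, 250, 3, 40 };

    for (unsigned int isector = 0; isector < NSECTORS; isector++)
    {
        /* Size the buffer for this sector only, instead of keeping one
           worst-case buffer alive for the whole loop. */
        unsigned int Nsec      = histo_send[isector];
        unsigned int Nweightss = Nsec * POLARISATIONS;
        unsigned int Nvissec   = Nweightss * FREQ_PER_CHAN;

        size_t bytes = (size_t)Nsec * 3 * sizeof(double)
                     + ((size_t)Nvissec * 2 + Nweightss) * sizeof(float);

        double *memory = malloc(bytes);
        if (memory == NULL) {
            fprintf(stderr, "not enough memory for sector %u\n", isector);
            return EXIT_FAILURE;
        }

        /* Carve the coordinate and visibility views out of the single block:
           the three double arrays first, then the float arrays. */
        double *uus      = memory;
        double *vvs      = uus + Nsec;
        double *wws      = vvs + Nsec;
        float  *weightss = (float *)(wws + Nsec);
        float  *visreals = weightss + Nweightss;
        float  *visimgs  = visreals + Nvissec;

        /* Fill the views (placeholder for the real sector selection and gridding). */
        for (unsigned int i = 0; i < Nsec; i++)
            uus[i] = vvs[i] = wws[i] = 0.0;
        for (unsigned int i = 0; i < Nweightss; i++)
            weightss[i] = 1.0f;
        for (unsigned int i = 0; i < Nvissec; i++)
            visreals[i] = visimgs[i] = 0.0f;

        /* Every iteration releases its own buffer, so peak memory follows
           the current sector rather than the largest one. */
        free(memory);
    }
    return 0;
}

The trade-off is one malloc/free pair per sector instead of a single worst-case allocation plus a memset every iteration; peak memory then tracks the data actually present in the current sector.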