diff --git a/src/adp/adp.c b/src/adp/adp.c
index 35f61e2a3a87f8b9a524414147619c8894bfb350..e796b8db7dc6a0ca93d79525026c54920b05850e 100644
--- a/src/adp/adp.c
+++ b/src/adp/adp.c
@@ -1,4 +1,5 @@
 #include "adp.h"
+#include <unistd.h>
 
 const border_t border_null = {.density = -1.0, .error = 0, .idx = NOBORDER};
 const sparse_border_t sparse_border_null = {.density = -1.0, .error = 0, .idx = NOBORDER, .i = NOBORDER, .j = NOBORDER};
@@ -298,8 +299,8 @@ void compute_density_kstarnn_rma_v2(global_context_t* ctx, const float_t d, int
     idx_t* ks = (idx_t*)MY_MALLOC(ctx -> local_n_points * sizeof(idx_t));
 
     for(int i = 0; i < ctx -> local_n_points; ++i) den[i] = ctx -> local_datapoints[i].log_rho;
-    for(int i = 0; i < ctx -> local_n_points; ++i) ks[i] = ctx -> local_datapoints[i].kstar;
-    for(int i = 0; i < ctx -> local_n_points; ++i) gs[i] = ctx -> local_datapoints[i].g;
+    for(int i = 0; i < ctx -> local_n_points; ++i) ks[i] = ctx -> local_datapoints[i].kstar;
+    for(int i = 0; i < ctx -> local_n_points; ++i) gs[i] = ctx -> local_datapoints[i].g;
 
     ordered_buffer_to_file(ctx, den, sizeof(float_t), ctx -> local_n_points, "bb/ordered_density.npy");
     ordered_buffer_to_file(ctx, ks, sizeof(idx_t), ctx -> local_n_points, "bb/ks.npy");
@@ -617,6 +618,8 @@ lock_t h1_lock_acquire(global_context_t* ctx, MPI_Win lock_window, int owner, id
     while(result == LOCK_ACQUIRED && err == MPI_SUCCESS)
     {
         err = MPI_Compare_and_swap(&state, &compare, &result, MPI_LOCK_T, owner, pos, lock_window);
+        MPI_Win_flush(owner, lock_window);
+        usleep(100);
     }
 
     if(err != MPI_SUCCESS)
@@ -677,8 +680,7 @@ clusters_t Heuristic1(global_context_t *ctx)
     MPI_Win_fence(0, win_datapoints);
     MPI_Win_lock_all(0, win_datapoints);
 
-#if defined(THREAD_FUNNELED)
-#else
+#if !defined(THREAD_FUNNELED)
     #pragma omp parallel for
 #endif
     for(idx_t i = 0; i < n; ++i)
@@ -741,14 +743,17 @@ clusters_t Heuristic1(global_context_t *ctx)
     MPI_Win_create(lock_array, n * sizeof(lock_t), sizeof(lock_t), MPI_INFO_NULL, ctx -> mpi_communicator, &win_locks);
     MPI_Win_fence(0, win_locks);
 
+#ifdef EXP_H1
     MPI_Win_lock_all(0, win_to_remove_mask);
     MPI_Win_lock_all(0, win_locks);
+#endif
 
+#ifdef EXP_H1
+    printf("Using experimental h1\n");
+#endif
 
-
-#if defined(THREAD_FUNNELED)
-#else
-    #pragma omp parallel for
+#if !defined(THREAD_FUNNELED)
+    #pragma omp parallel for schedule(dynamic)
 #endif
     for(idx_t p = 0; p < n; ++p)
     {
@@ -769,36 +774,38 @@ clusters_t Heuristic1(global_context_t *ctx)
             *
             *
             */
-#ifdef EXPERIMENTAL_H1
-            int owner = foreign_owner(ctx, jidx);
-            idx_t jpos = jidx - ctx -> rank_idx_start[owner];
-
-            lock_t state = LOCK_FREE;
-
-            state = h1_lock_acquire(ctx, win_locks, owner, jpos, state);
+#ifdef EXP_H1
+            #pragma omp critical (h1_exp)
+            {
+                int owner = foreign_owner(ctx, jidx);
+                idx_t jpos = jidx - ctx -> rank_idx_start[owner];
 
-            heap_node mask_element;
-            MPI_Request request;
+                lock_t state = LOCK_FREE;
 
-            MPI_Rget(&mask_element, sizeof(heap_node), MPI_BYTE,
-                    owner, jpos * sizeof(heap_node), sizeof(heap_node), MPI_BYTE, win_to_remove_mask, &request);
-            MPI_Wait(&request, MPI_STATUS_IGNORE);
+                state = h1_lock_acquire(ctx, win_locks, owner, jpos, state);
 
-            int flag = mask_element.array_idx == MY_SIZE_MAX;
-            if(flag || i_point.g > mask_element.value )
-            {
-                heap_node tmp_mask_element = {.array_idx = i_point.array_idx, .value = i_point.g};
+                heap_node mask_element;
                 MPI_Request request;
-                MPI_Rput(&tmp_mask_element, sizeof(heap_node), MPI_BYTE, owner,
-                        jpos*sizeof(heap_node), sizeof(heap_node), MPI_BYTE, win_to_remove_mask, &request);
+
+                MPI_Rget(&mask_element, sizeof(heap_node), MPI_BYTE,
+                        owner, jpos * sizeof(heap_node), sizeof(heap_node), MPI_BYTE, win_to_remove_mask, &request);
                 MPI_Wait(&request, MPI_STATUS_IGNORE);
 
-            }
+                int flag = mask_element.array_idx == MY_SIZE_MAX;
+                if(flag || i_point.g > mask_element.value )
+                {
+                    heap_node tmp_mask_element = {.array_idx = i_point.array_idx, .value = i_point.g};
+                    MPI_Request request;
+                    MPI_Rput(&tmp_mask_element, sizeof(heap_node), MPI_BYTE, owner,
+                            jpos*sizeof(heap_node), sizeof(heap_node), MPI_BYTE, win_to_remove_mask, &request);
+                    MPI_Wait(&request, MPI_STATUS_IGNORE);
 
-            state = h1_lock_free(ctx, win_locks, owner, jpos, state);
-#else
+                }
 
-            #pragma omp critical (h1_centers_elimination)
+                state = h1_lock_free(ctx, win_locks, owner, jpos, state);
+            }
+#else
+            #pragma omp critical (centers_elimination)
             {
                 int owner = foreign_owner(ctx, jidx);
                 idx_t jpos = jidx - ctx -> rank_idx_start[owner];
@@ -822,13 +829,16 @@ clusters_t Heuristic1(global_context_t *ctx)
                 }
 
                 MPI_Win_unlock(owner, win_to_remove_mask);
+            }
 #endif
             }
         }
     }
 
+#ifdef EXP_H1
     MPI_Win_unlock_all(win_to_remove_mask);
     MPI_Win_unlock_all(win_locks);
+#endif
 
     MPI_Win_fence(0, win_to_remove_mask);
     MPI_Win_fence(0, win_locks);
@@ -1815,8 +1825,6 @@ void Heuristic3(global_context_t* ctx, clusters_t* cluster, float_t Z, int halo)
 
     qsort(centers_dp, cluster -> centers.count, sizeof(datapoint_info_t), compare_dp_by_cidx);
 
-    printf("Centers\n");
-
     master_finds_borders(ctx, cluster, Z, surviving_clusters, centers_dp);
     master_fixes_border_matrix_and_centers(ctx, cluster, Z, old_to_new, surviving_clusters, nclus);
     free(centers_dp);
diff --git a/src/main/main.c b/src/main/main.c
index 8c90cfee2667a35126dd0b26b865511c4dd3f6d5..ab54c20845aaac44fd54f539341929b38759f381 100644
--- a/src/main/main.c
+++ b/src/main/main.c
@@ -232,52 +232,7 @@ void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx)
     if (ctx->mpi_rank == 0)
     {
         data = read_data_file(ctx,ctx -> input_data_file, ctx -> dims, ctx -> input_data_in_float32);
-
-        //data = read_data_file(ctx, "../norm_data/50_blobs_more_var.npy", MY_TRUE);
-        //ctx->dims = 2;
-        //data = read_data_file(ctx, "../norm_data/blobs_small.npy", MY_FALSE);
-        //data = read_data_file(ctx, "../norm_data/blobs_small.npy", MY_FALSE);
-        // std_g0163178_Me14_091_0000
-
-        // 100M points
-        // 2D
-        // std_g2980844_091_0000
-        //data = read_data_file(ctx,"../norm_data/huge_blobs.npy",MY_FALSE);
-        // 2B points
-        // data = read_data_file(ctx,"../norm_data/very_huge_blobs.npy",MY_FALSE);
-        // data = read_data_file(ctx,"../norm_data/hd_blobs.npy",5,MY_FALSE);
-
-        //1B points
-        // data = read_data_file(ctx,"../norm_data/eds_box_acc_normalized",5,MY_FALSE);
-        // data = read_data_file(ctx,"../norm_data/eds_box_6d",6,MY_FALSE);
-
-        // 190M points
-        // std_g2980844_091_0000
-        // data = read_data_file(ctx,"../norm_data/std_g2980844_091_0000",5,MY_TRUE);
-
-        /* 1M points ca.*/
-        //data = read_data_file(ctx,"../norm_data/std_LR_091_0001",5,MY_TRUE);
-
-        /* BOX */
-        // data = read_data_file(ctx,"../norm_data/std_Box_256_30_092_0000",MY_TRUE);
-
-        /* 8M points */
-
-        // data = read_data_file(ctx,"../norm_data/std_g0144846_Me14_091_0001",5,MY_TRUE);
-
-        //88M
-        // data = read_data_file(ctx,"../norm_data/std_g5503149_091_0000",MY_TRUE);
-
-        //
-        //34 M
-        //data = read_data_file(ctx,"../norm_data/std_g1212639_091_0001",MY_TRUE);
-
-        //for weak scalability
-        //ctx->n_points = ctx->n_points / 4;
-        //ctx->n_points = (ctx->n_points / 32) * ctx -> world_size;
-
         get_dataset_diagnostics(ctx, data);
-
     }
 
     /* communicate the total number of points*/
@@ -320,9 +275,7 @@ void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx)
                 int count_send = MIN(default_msg_len, send_counts[i] - already_sent_points);
                 MPI_Send(data + displacements[i] + already_sent_points, count_send, MPI_MY_FLOAT, i, ctx -> mpi_rank, ctx -> mpi_communicator);
                 already_sent_points += count_send;
-                //DB_PRINT("[RANK 0] has sent to rank %d %d elements out of %lu\n",i, already_sent_points, send_counts[i]);
             }
-            //DB_PRINT("------------------------------------------------\n");
         }
     }
     else
diff --git a/src/tree/kdtreeV2.c b/src/tree/kdtreeV2.c
index b3b634a3db12cdbe0b43180b4161bf3ef203269a..3547975ea8c13d49c359583429cc3b9384b57577 100644
--- a/src/tree/kdtreeV2.c
+++ b/src/tree/kdtreeV2.c
@@ -228,9 +228,6 @@ kdnode_v2* make_tree_kdnode_v2(kdnode_v2* t, int start, int end, kdnode_v2* pare
 
 
     }
-
-
-
     int median_idx = -1;
     //if ((end - start) < 0) return 0;
 
diff --git a/src/tree/tree.c b/src/tree/tree.c
index 12f181e3ec0ba4068f08e027c594c145df53622a..876dd85c5c9ebe40f0093ee2f5d2d095c97bd4da 100644
--- a/src/tree/tree.c
+++ b/src/tree/tree.c
@@ -1478,7 +1478,7 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre
     //ctx -> __local_heap_buffers = (heap_node*)MY_MALLOC(ctx -> local_n_points * k * sizeof(heap_node));
     MPI_Alloc_mem(ctx -> local_n_points * k * sizeof(heap_node), MPI_INFO_NULL, &(ctx -> __local_heap_buffers));
 
-    #pragma omp parallel for
+    #pragma omp parallel for schedule(dynamic)
     for(int p = 0; p < ctx -> local_n_points; ++p)
     {
         idx_t idx = local_tree -> _nodes[p].array_idx;
@@ -1663,7 +1663,7 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre
         //if(count_rcv_work_batches[p] > 0)
         {
             //heap_batches_per_node[p] = (heap_node*)MY_MALLOC(k * point_to_rcv_count[p] * sizeof(heap_node));
-            #pragma omp parallel for
+            #pragma omp parallel for schedule(dynamic)
             for(int batch = 0; batch < point_to_rcv_count[p]; ++batch)
             {
                 heap H;
@@ -1822,7 +1822,7 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre
         }
 
         /* merge lists */
-        #pragma omp paralell for
+        #pragma omp parallel for
         for(int b = 0; b < ngbh_to_recv[rank_to_recv]; ++b)
         {
             int idx = local_idx_of_the_point[rank_to_recv][b];
@@ -1843,6 +1843,7 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre
     }
 
 
+    MPI_Barrier(ctx -> mpi_communicator);
 
 
 }
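
Note on the h1_lock_acquire change: the hunk above adds an MPI_Win_flush plus a short usleep back-off inside the MPI_Compare_and_swap retry loop. The repository's lock_t, LOCK_FREE/LOCK_ACQUIRED and MPI_LOCK_T definitions are not part of this patch, so the stand-alone sketch below uses plain int lock words and hypothetical helper names (rma_lock_acquire / rma_lock_release) purely to illustrate the CAS + flush + back-off pattern; it is not the project's actual implementation, and it assumes the caller has already opened a passive-target epoch on the window with MPI_Win_lock_all.

#include <mpi.h>
#include <unistd.h>

#define LOCK_FREE     0
#define LOCK_ACQUIRED 1

/* Spin until the lock word at displacement `pos` on rank `owner` is swapped
 * from LOCK_FREE to LOCK_ACQUIRED.  MPI_Win_flush completes the pending CAS
 * at the target before its result is inspected; usleep throttles the retry
 * rate so the owner's progress engine is not flooded with RMA traffic. */
static void rma_lock_acquire(MPI_Win win, int owner, MPI_Aint pos)
{
    int desired  = LOCK_ACQUIRED;
    int expected = LOCK_FREE;
    int observed = LOCK_ACQUIRED;   /* anything != LOCK_FREE enters the loop */

    while (observed != LOCK_FREE)
    {
        MPI_Compare_and_swap(&desired, &expected, &observed,
                             MPI_INT, owner, pos, win);
        MPI_Win_flush(owner, win);  /* make the CAS result valid locally */
        if (observed != LOCK_FREE)
            usleep(100);            /* back off before retrying */
    }
}

/* Release by atomically writing LOCK_FREE back into the lock word. */
static void rma_lock_release(MPI_Win win, int owner, MPI_Aint pos)
{
    int unlocked = LOCK_FREE;
    MPI_Accumulate(&unlocked, 1, MPI_INT, owner, pos, 1, MPI_INT,
                   MPI_REPLACE, win);
    MPI_Win_flush(owner, win);
}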