diff --git a/src/tree/tree.c b/src/tree/tree.c index bd1b165b7a36b76006393a07f179b850ee9fb0e1..01b51527b95ad496d649af7b88d8adbe5a2aefcc 100644 --- a/src/tree/tree.c +++ b/src/tree/tree.c @@ -19,8 +19,8 @@ #include <omp.h> #include <sys/sysinfo.h> -//#define WRITE_NGBH -//#define WRITE_TOP_NODES +#define WRITE_NGBH +#define WRITE_TOP_NODES /* * Maximum bytes to send with a single mpi send/recv, used @@ -1315,6 +1315,180 @@ void tree_walk( } +void tree_walk_v2_find_n_points( + global_context_t* ctx, + top_kdtree_node_t* root, + int point_idx, + float_t max_dist, + float_t* point, + int* point_to_send_capacity) +{ + if(root -> owner != -1 && root -> owner != ctx -> mpi_rank) + { + #pragma omp atomic update + point_to_send_capacity[root -> owner]++; + } + else + { + /* tree walk */ + int split_var = root -> split_dim; + float_t hp_distance = point[split_var] - root -> split_val; + __builtin_prefetch(root -> lch, 0, 3); + __builtin_prefetch(root -> rch, 0, 3); + + int side = hp_distance > 0.f; + + switch (side) + { + case TOP_TREE_LCH: + if(root -> lch) + { + /* walk on the left */ + tree_walk_v2_find_n_points(ctx, root -> lch, point_idx, max_dist, point, point_to_send_capacity); + } + break; + + case TOP_TREE_RCH: + if(root -> rch) + { + /* walk on the right */ + tree_walk_v2_find_n_points(ctx, root -> rch, point_idx, max_dist, point, point_to_send_capacity); + } + break; + + default: + break; + } + + int c = max_dist > (hp_distance * hp_distance); + + //if(c || (H -> count) < (H -> N)) + if(c) + { + + switch (side) + { + case HP_LEFT_SIDE: + if(root -> rch) + { + /* walk on the right */ + tree_walk_v2_find_n_points(ctx, root -> rch, point_idx, max_dist, point, point_to_send_capacity); + } + break; + + case HP_RIGHT_SIDE: + if(root -> lch) + { + /* walk on the left */ + tree_walk_v2_find_n_points(ctx, root -> lch, point_idx, max_dist, point, point_to_send_capacity); + } + break; + + default: + break; + } + } + } + +} + +void tree_walk_v2_append_points( + global_context_t* ctx, + top_kdtree_node_t* root, + int point_idx, + float_t max_dist, + float_t* point, + float_t** data_to_send_per_proc, + int** local_idx_of_the_point, + int* point_to_send_count) +{ + if(root -> owner != -1 && root -> owner != ctx -> mpi_rank) + { + /* put the leaf on the requests array */ + int owner = root -> owner; + + + int idx; + + #pragma omp atomic capture + idx = point_to_send_count[owner]++; + + int len = ctx -> dims; + + float_t* base = data_to_send_per_proc[owner] + (len * idx); + + memcpy(base, point, ctx -> dims * sizeof(float_t)); + local_idx_of_the_point[owner][idx] = point_idx; + } + else + { + /* tree walk */ + int split_var = root -> split_dim; + float_t hp_distance = point[split_var] - root -> split_val; + __builtin_prefetch(root -> lch, 0, 3); + __builtin_prefetch(root -> rch, 0, 3); + + int side = hp_distance > 0.f; + + switch (side) + { + case TOP_TREE_LCH: + if(root -> lch) + { + /* walk on the left */ + tree_walk_v2_append_points(ctx, root -> lch, point_idx, max_dist, point, + data_to_send_per_proc, local_idx_of_the_point, point_to_send_count); + } + break; + + case TOP_TREE_RCH: + if(root -> rch) + { + /* walk on the right */ + tree_walk_v2_append_points(ctx, root -> rch, point_idx, max_dist, point, + data_to_send_per_proc, local_idx_of_the_point, point_to_send_count); + } + break; + + default: + break; + } + + int c = max_dist > (hp_distance * hp_distance); + + //if(c || (H -> count) < (H -> N)) + if(c) + { + + switch (side) + { + case HP_LEFT_SIDE: + if(root -> rch) + { + /* walk on the right */ + tree_walk_v2_append_points(ctx, root -> rch, point_idx, max_dist, point, + data_to_send_per_proc, local_idx_of_the_point, point_to_send_count); + } + break; + + case HP_RIGHT_SIDE: + if(root -> lch) + { + /* walk on the left */ + tree_walk_v2_append_points(ctx, root -> lch, point_idx, max_dist, point, + data_to_send_per_proc, local_idx_of_the_point, point_to_send_count); + } + break; + + default: + break; + } + } + } + +} + + void convert_heap_idx_to_global(global_context_t* ctx, heap* H) { for(uint64_t i = 0; i < H -> count; ++i) @@ -1387,23 +1561,27 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre for(int i = 0; i < ctx -> world_size; ++i) { - //data_to_send_per_proc[i] = (float_t*)malloc(100 * (1 + ctx -> dims) * sizeof(float_t)); + /* allocate it afterwards */ + + /* OLD VERSION data_to_send_per_proc[i] = (float_t*)malloc(100 * (ctx -> dims) * sizeof(float_t)); local_idx_of_the_point[i] = (int*)malloc(100 * sizeof(int)); point_to_snd_capacity[i] = 100; + */ + + /* NEW VERSION with double tree walk */ + point_to_snd_capacity[i] = 0; point_to_snd_count[i] = 0; } /* for each point walk the tree and find to which proc send data */ /* actually compute intersection of ngbh radius of each point to node box */ + /* OLD VERSION SINGLE TREE WALK */ + /* #pragma omp parallel for for(int i = 0; i < ctx -> local_n_points; ++i) { - /* - MPI_DB_PRINT("%lu\n",dp_info[i].array_idx); - if(i > 10) break; - */ float_t max_dist = dp_info[i].ngbh.data[0].value; float_t* point = ctx -> local_data + (i * ctx -> dims); @@ -1411,6 +1589,38 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre point, data_to_send_per_proc, local_idx_of_the_point, point_to_snd_count, point_to_snd_capacity); } + */ + + /* NEW VERSION double tree walk */ + #pragma omp parallel for + for(int i = 0; i < ctx -> local_n_points; ++i) + { + float_t max_dist = dp_info[i].ngbh.data[0].value; + float_t* point = ctx -> local_data + (i * ctx -> dims); + + tree_walk_v2_find_n_points(ctx, top_tree -> root, i, max_dist, point, point_to_snd_capacity); + + } + + /* allocate needed space */ + for(int i = 0; i < ctx -> world_size; ++i) + { + int np = point_to_snd_capacity[i]; + data_to_send_per_proc[i] = (float_t*)malloc(np * (ctx -> dims) * sizeof(float_t)); + local_idx_of_the_point[i] = (int*)malloc(np * sizeof(int)); + + } + + + #pragma omp parallel for + for(int i = 0; i < ctx -> local_n_points; ++i) + { + float_t max_dist = dp_info[i].ngbh.data[0].value; + float_t* point = ctx -> local_data + (i * ctx -> dims); + + tree_walk_v2_append_points(ctx, top_tree -> root, i, max_dist, point, data_to_send_per_proc, local_idx_of_the_point, point_to_snd_count); + } + elapsed_time = TIME_STOP; LOG_WRITE("Finding points to refine", elapsed_time); @@ -1958,7 +2168,7 @@ void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx) //ctx -> n_points = 48*5*2000; ctx->n_points = ctx->n_points / ctx->dims; - ctx->n_points = (ctx->n_points * 5) / 10; + ctx->n_points = (ctx->n_points * 6) / 10; // ctx -> n_points = ctx -> world_size * 1000; //ctx -> n_points = 10000000 * ctx -> world_size; @@ -2061,7 +2271,7 @@ void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx) elapsed_time = TIME_STOP; LOG_WRITE("Total time for all knn search", elapsed_time) - find_foreign_nodes(ctx, dp_info); + //find_foreign_nodes(ctx, dp_info); #if defined (WRITE_NGBH)