diff --git a/.ipynb_checkpoints/var-checkpoint.py b/.ipynb_checkpoints/var-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..708cc54871c1a4c9d1ce62bd903ec4960a193898 --- /dev/null +++ b/.ipynb_checkpoints/var-checkpoint.py @@ -0,0 +1,6 @@ +import numpy as np + +d = np.fromfile("../norm_data/std_LR_091_0000", dtype=np.float32) +print(d.shape) +d = d.reshape((d.shape[0]//5,5)) +print(np.cov(d.T)) diff --git a/Makefile b/Makefile index 8df32476d9ac91b9171f382b9ea62f1c606e8b33..9c34660ebdf14daa9a557904972344b9d18bc740 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,7 @@ LDFLAGS=-lm all: main -obj=src/main/main.c src/tree/tree.c src/common/common.c src/tree/kdtreeV2.c src/tree/heap.c +obj=src/main/main.c src/tree/tree.c src/common/common.c src/tree/kdtreeV2.c src/tree/heap.c src/adp/adp.c main: ${obj} ${CC} ${CFLAGS} ${LDFLAGS} ${obj} -o $@ diff --git a/src/adp/adp.c b/src/adp/adp.c new file mode 100644 index 0000000000000000000000000000000000000000..064fd7583c94c60077384d9110a0e74f6e0b72d0 --- /dev/null +++ b/src/adp/adp.c @@ -0,0 +1,2114 @@ +#include "adp.h" + +const border_t border_null = {.density = -1.0, .error = 0, .idx = NOBORDER}; +const sparse_border_t sparse_border_null = {.density = -1.0, .error = 0, .idx = NOBORDER, .i = NOBORDER, .j = NOBORDER}; + +float_t mEst2(float_t * x, float_t *y, idx_t n) +{ + + /* + * Estimate the m coefficient of a straight + * line passing through the origin + * params: + * - x: x values of the points + * - y: y values of the points + * - n: size of the arrays + */ + + float_t num = 0; + float_t den = 0; + float_t dd; + for(idx_t i = 0; i < n; ++i) + { + float_t xx = x[i]; + float_t yy = y[i]; + + dd = xx; + num += dd*yy; + den += dd*dd; + + } + + return num/den; +} + +float_t compute_ID_two_NN_ML(global_context_t* ctx, datapoint_info_t* dp_info, idx_t n, int verbose) +{ + + /* + * Estimation of the intrinsic dimension of a dataset + * args: + * - dp_info: array of structs + * - n: number of dp_info + * intrinsic_dim = (N - 1) / np.sum(log_mus) + */ + + struct timespec start_tot, finish_tot; + double elapsed_tot; + + if(verbose) + { + printf("ID estimation:\n"); + clock_gettime(CLOCK_MONOTONIC, &start_tot); + } + + float_t log_mus = 0; + for(idx_t i = 0; i < n; ++i) + { + log_mus += 0.5 * log(dp_info[i].ngbh.data[2].value/dp_info[i].ngbh.data[1].value); + } + + float_t d = 0; + MPI_Allreduce(&log_mus, &d, 1, MPI_MY_FLOAT, MPI_SUM, ctx -> mpi_communicator); + d = (ctx -> n_points - 1)/d; + if(verbose) + { + clock_gettime(CLOCK_MONOTONIC, &finish_tot); + elapsed_tot = (finish_tot.tv_sec - start_tot.tv_sec); + elapsed_tot += (finish_tot.tv_nsec - start_tot.tv_nsec) / 1000000000.0; + printf("\tID value: %.6lf\n", d); + printf("\tTotal time: %.3lfs\n\n", elapsed_tot); + } + + return d; + +} + +float_t get_j_ksel_dist(global_context_t* ctx, idx_t j, idx_t ksel, MPI_Win exposed_ngbh) +{ + int owner = foreign_owner(ctx, j); + idx_t k = ctx -> k; + /* find if datapoint is halo or not */ + if(owner == ctx -> mpi_rank) + { + idx_t pos = j - ctx -> idx_start; + return ctx -> local_datapoints[pos].ngbh.data[ksel].value; + } + else + { + heap_node el; + idx_t pos = j - ctx -> rank_idx_start[owner]; + MPI_Request request; + int err = MPI_Rget(&el, sizeof(heap_node), MPI_BYTE, owner, (MPI_Aint)((pos * k + ksel) * sizeof(heap_node)), sizeof(heap_node), MPI_BYTE, exposed_ngbh, &request); + MPI_Wait(&request,MPI_STATUS_IGNORE); + return el.value; + } +} + +idx_t get_j_ksel_idx(global_context_t* ctx, idx_t j, idx_t ksel, MPI_Win exposed_ngbh) 
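Note on compute_ID_two_NN_ML above: it implements the TwoNN intrinsic-dimension estimator. With mu_i = r_2(i)/r_1(i), the ratio of the second to the first neighbour distance of point i, the maximum-likelihood estimate is d = (N - 1) / sum_i log(mu_i); the 0.5 factor in the per-point term suggests the heap holds squared distances at this stage, and the MPI_Allreduce turns the per-rank partial sums into the global one. A minimal serial sketch of the same estimator, with hypothetical arrays of squared first/second neighbour distances as input:

#include <math.h>
#include <stddef.h>

/* Serial TwoNN sketch: r1_sq[i] and r2_sq[i] are the squared distances from
 * point i to its first and second neighbour (self excluded). These input
 * arrays are assumptions of the sketch, not buffers of this repository. */
static double twonn_id(const double *r1_sq, const double *r2_sq, size_t n)
{
    double sum_log_mu = 0.0;
    for (size_t i = 0; i < n; ++i)
        sum_log_mu += 0.5 * log(r2_sq[i] / r1_sq[i]);   /* log(r2/r1) */
    return ((double)n - 1.0) / sum_log_mu;              /* ML estimate of d */
}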
+{ + int owner = foreign_owner(ctx, j); + idx_t k = ctx -> k; + /* find if datapoint is halo or not */ + if(owner == ctx -> mpi_rank) + { + idx_t pos = j - ctx -> idx_start; + return ctx -> local_datapoints[pos].ngbh.data[ksel].array_idx; + } + else + { + heap_node el; + idx_t pos = j - ctx -> rank_idx_start[owner]; + MPI_Request request; + int err = MPI_Rget(&el, sizeof(heap_node), MPI_BYTE, owner, (MPI_Aint)((pos * k + ksel) * sizeof(heap_node)), sizeof(heap_node), MPI_BYTE, exposed_ngbh, &request); + MPI_Wait(&request,MPI_STATUS_IGNORE); + return el.array_idx; + } +} + +void compute_density_kstarnn_rma(global_context_t* ctx, const float_t d, int verbose){ + + /* + * Point density computation: + * args: + * - paricles: array of structs + * - d : intrinsic dimension of the dataset + * - points : number of points in the dataset + */ + + + MPI_Info info; + + + MPI_Barrier(ctx -> mpi_communicator); + idx_t k = ctx -> local_datapoints[0].ngbh.N; + + struct timespec start_tot, finish_tot; + double elapsed_tot; + + datapoint_info_t* local_datapoints = ctx -> local_datapoints; + + if(verbose) + { + printf("Density and k* estimation:\n"); + clock_gettime(CLOCK_MONOTONIC, &start_tot); + } + + idx_t kMAX = ctx -> local_datapoints[0].ngbh.N - 1; + + float_t omega = 0.; + if(sizeof(float_t) == sizeof(float)){ omega = powf(PI_F,d/2)/tgammaf(d/2.0f + 1.0f);} + else{omega = pow(M_PI,d/2.)/tgamma(d/2.0 + 1.0);} + + + MPI_Win exposed_ngbh; + MPI_Win_create( ctx -> __local_heap_buffers, + ctx -> local_n_points * k * sizeof(heap_node), + 1, MPI_INFO_NULL, + ctx -> mpi_communicator, + &exposed_ngbh); + + MPI_Win_fence(0, exposed_ngbh); + MPI_Win_lock_all(0, exposed_ngbh); + + MPI_Barrier(ctx -> mpi_communicator); + + #pragma omp parallel for + for(idx_t i = 0; i < ctx -> local_n_points; ++i) + { + for(idx_t k = 0; k <= kMAX; ++k) + { + local_datapoints[i].ngbh.data[k].value = omega * pow(local_datapoints[i].ngbh.data[k].value, d/2.); + } + } + + for(idx_t i = 0; i < ctx -> local_n_points; ++i) + { + + idx_t j = 4; + idx_t k; + float_t dL = 0.; + float_t vvi = 0.; + float_t vvj = 0.; + float_t vp = 0.; + while(j < kMAX && dL < DTHR) + { + idx_t ksel = j - 1; + //vvi = omega * pow(local_datapoints[i].ngbh.data[ksel].value,d/2.); + vvi = local_datapoints[i].ngbh.data[ksel].value; + + idx_t jj = local_datapoints[i].ngbh.data[j].array_idx; + + /* + * note jj can be an halo point + * need to search maybe for it in foreign nodes + * */ + //float_t dist_jj = get_j_ksel_dist(ctx, jj, ksel, exposed_ngbh); + //vvj = omega * pow(dist_jj,d/2.); + vvj = get_j_ksel_dist(ctx, jj, ksel, exposed_ngbh); + + vp = (vvi + vvj)*(vvi + vvj); + dL = -2.0 * ksel * log(4.*vvi*vvj/vp); + j = j + 1; + } + if(j == kMAX) + { + k = j - 1; + //vvi = omega * pow(ctx -> local_datapoints[i].ngbh.data[k].value,d/2.); + vvi = ctx -> local_datapoints[i].ngbh.data[k].value; + } + else + { + k = j - 2; + } + local_datapoints[i].kstar = k; + local_datapoints[i].log_rho = log((float_t)(k)/vvi/((float_t)(ctx -> n_points))); + //dp_info[i].log_rho = log((float_t)(k)) - log(vvi) -log((float_t)(points)); + local_datapoints[i].log_rho_err = 1.0/sqrt((float_t)k); //(float_t)(-Q_rsqrt((float)k)); + local_datapoints[i].g = local_datapoints[i].log_rho - local_datapoints[i].log_rho_err; + } + + if(verbose) + { + clock_gettime(CLOCK_MONOTONIC, &finish_tot); + elapsed_tot = (finish_tot.tv_sec - start_tot.tv_sec); + elapsed_tot += (finish_tot.tv_nsec - start_tot.tv_nsec) / 1000000000.0; + printf("\tTotal time: %.3lfs\n\n", elapsed_tot); + } + + 
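The statistic driving the k* search above is a likelihood ratio between the density measured around point i with its first ksel neighbours (ball volume v_i = omega * r_ksel^d, with omega = pi^(d/2)/Gamma(d/2 + 1) the unit-ball volume) and the one measured around its j-th neighbour: D = -2 * ksel * log(4 v_i v_j / (v_i + v_j)^2), and k* is the largest neighbourhood for which D stays below DTHR. A serial sketch of the same search, assuming dense vol/ngbh arrays (a hypothetical layout; the real code walks the per-point heaps and fetches v_j over RMA):

#include <math.h>
#include <stddef.h>

/* Serial reference of the k* search for one point i.
 * vol[p][k]  : omega * r_k(p)^d, volume of the ball around p touching its
 *              k-th neighbour (what the parallel loop pre-computes in place)
 * ngbh[p][k] : global index of the k-th neighbour of p
 * dthr       : the DTHR threshold defined in this repository's headers     */
static size_t kstar_select(size_t i, size_t kmax, double **vol,
                           size_t **ngbh, double dthr)
{
    size_t j = 4;
    double dL = 0.0;
    while (j < kmax && dL < dthr) {
        size_t ksel = j - 1;
        double vvi = vol[i][ksel];              /* volume around i          */
        double vvj = vol[ngbh[i][j]][ksel];     /* same order, around jj    */
        double vp  = (vvi + vvj) * (vvi + vvj);
        dL = -2.0 * (double)ksel * log(4.0 * vvi * vvj / vp);
        ++j;
    }
    return (j == kmax) ? j - 1 : j - 2;         /* k*, as in the code above */
}

Once the loop stops, the code above sets log_rho = log(k*) - log(v_{k*}) - log(N) and log_rho_err = 1/sqrt(k*).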
MPI_Win_unlock_all(exposed_ngbh); + MPI_Win_fence(0, exposed_ngbh); + MPI_Win_free(&exposed_ngbh); + + #if defined(WRITE_DENSITY) + /* densities */ + float_t* den = (float_t*)MY_MALLOC(ctx -> local_n_points * sizeof(float_t)); + float_t* gs = (float_t*)MY_MALLOC(ctx -> local_n_points * sizeof(float_t)); + idx_t* ks = (idx_t*)MY_MALLOC(ctx -> local_n_points * sizeof(idx_t)); + + for(int i = 0; i < ctx -> local_n_points; ++i) den[i] = ctx -> local_datapoints[i].log_rho; + for(int i = 0; i < ctx -> local_n_points; ++i) ks[i] = ctx -> local_datapoints[i].kstar; + for(int i = 0; i < ctx -> local_n_points; ++i) gs[i] = ctx -> local_datapoints[i].g; + + ordered_buffer_to_file(ctx, den, sizeof(float_t), ctx -> local_n_points, "bb/ordered_density.npy"); + ordered_buffer_to_file(ctx, ks, sizeof(idx_t), ctx -> local_n_points, "bb/ks.npy"); + ordered_buffer_to_file(ctx, gs, sizeof(float_t), ctx -> local_n_points, "bb/g.npy"); + + ordered_data_to_file(ctx); + free(den); + free(ks); + #endif + return; + + +} + +void compute_density_kstarnn_rma_v2(global_context_t* ctx, const float_t d, int verbose){ + + /* + * Point density computation: + * args: + * - paricles: array of structs + * - d : intrinsic dimension of the dataset + * - points : number of points in the dataset + */ + + + MPI_Info info; + + + MPI_Barrier(ctx -> mpi_communicator); + idx_t k = ctx -> local_datapoints[0].ngbh.N; + + struct timespec start_tot, finish_tot; + double elapsed_tot; + + datapoint_info_t* local_datapoints = ctx -> local_datapoints; + + if(verbose) + { + printf("Density and k* estimation:\n"); + clock_gettime(CLOCK_MONOTONIC, &start_tot); + } + + idx_t kMAX = ctx -> local_datapoints[0].ngbh.N - 1; + + float_t omega = 0.; + if(sizeof(float_t) == sizeof(float)){ omega = powf(PI_F,d/2)/tgammaf(d/2.0f + 1.0f);} + else{omega = pow(M_PI,d/2.)/tgamma(d/2.0 + 1.0);} + + + MPI_Win exposed_ngbh; + MPI_Win_create( ctx -> __local_heap_buffers, + ctx -> local_n_points * k * sizeof(heap_node), + 1, MPI_INFO_NULL, + ctx -> mpi_communicator, + &exposed_ngbh); + + MPI_Win_fence(MPI_MODE_NOPUT, exposed_ngbh); + + MPI_Barrier(ctx -> mpi_communicator); + + #pragma omp parallel for + for(idx_t i = 0; i < ctx -> local_n_points; ++i) + { + for(idx_t k = 0; k <= kMAX; ++k) + { + local_datapoints[i].ngbh.data[k].value = omega * pow(local_datapoints[i].ngbh.data[k].value, d/2.); + } + //initialize kstar at 0 + local_datapoints[i].kstar = 0; + } + + int i_have_finished = 0; + int all_have_finished = 0; + int finished_points = 0; + heap_node* scratch_heap_nodes = (heap_node*)MY_MALLOC(ctx -> local_n_points * sizeof(heap_node)); + + for(idx_t j = 4; j < kMAX - 1; ++j) + { + i_have_finished = 1; + //request data + idx_t ksel = j - 1; + +#if defined(THREAD_FUNNELED) +#else + #pragma omp parallel for +#endif + + for(idx_t i = 0; i < ctx -> local_n_points; ++i) + { + + if(ctx -> local_datapoints[i].kstar == 0) + { + //vvi = omega * pow(local_datapoints[i].ngbh.data[ksel].value,d/2.); + + idx_t jj = local_datapoints[i].ngbh.data[j].array_idx; + + /* + * note jj can be an halo point + * need to search maybe for it in foreign nodes + * */ + + int owner = foreign_owner(ctx, jj); + idx_t pos = jj - ctx -> rank_idx_start[owner]; + + if(owner == ctx -> mpi_rank) + { + scratch_heap_nodes[i] = ctx -> local_datapoints[pos].ngbh.data[ksel]; + } + else + { + MPI_Get(scratch_heap_nodes + i, + sizeof(heap_node), + MPI_BYTE, + owner, + (MPI_Aint)((pos * (ctx -> k) + ksel) * sizeof(heap_node)), + sizeof(heap_node), + MPI_BYTE, + exposed_ngbh); + } + } + + } + + 
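compute_density_kstarnn_rma_v2 below restructures the remote reads: instead of one MPI_Rget + MPI_Wait per neighbour lookup, every iteration over j posts all its MPI_Get calls into a scratch buffer, closes the access epoch with a collective MPI_Win_fence, and only then consumes the values; an MPI_Allreduce with MPI_LAND decides when all ranks are done. A minimal self-contained sketch of that fence-bounded get pattern (communicator, buffer, and function names are illustrative only):

#include <mpi.h>

/* Minimal fence-bounded get epoch: every rank reads the first double of the
 * "right" neighbour's exposed buffer. Must be called collectively. */
static double fetch_from_right(MPI_Comm comm, double *local, int count)
{
    int rank, size;
    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);

    MPI_Win win;
    MPI_Win_create(local, count * sizeof(double), sizeof(double),
                   MPI_INFO_NULL, comm, &win);

    double remote = 0.0;
    MPI_Win_fence(MPI_MODE_NOPUT, win);                  /* open access epoch */
    MPI_Get(&remote, 1, MPI_DOUBLE, (rank + 1) % size,   /* target rank       */
            0, 1, MPI_DOUBLE, win);                      /* first element     */
    MPI_Win_fence(0, win);                               /* data now usable   */

    MPI_Win_free(&win);
    return remote;
}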
MPI_Win_fence(MPI_MODE_NOPUT,exposed_ngbh); + //process data +#if defined(THREAD_FUNNELED) +#else + #pragma omp parallel for +#endif + for(idx_t i = 0; i < ctx -> local_n_points; ++i) + { + if(ctx -> local_datapoints[i].kstar == 0) + { + float_t vvi, vvj, vp, dL; + vvi = local_datapoints[i].ngbh.data[ksel].value; + vvj = scratch_heap_nodes[i].value; + vp = (vvi + vvj)*(vvi + vvj); + dL = -2.0 * ksel * log(4.*vvi*vvj/vp); + + if(dL > DTHR) + { + idx_t k = j - 1; + local_datapoints[i].kstar = k; + local_datapoints[i].log_rho = log((float_t)(k)/vvi/((float_t)(ctx -> n_points))); + local_datapoints[i].log_rho_err = 1.0/sqrt((float_t)k); //(float_t)(-Q_rsqrt((float)k)); + local_datapoints[i].g = local_datapoints[i].log_rho - local_datapoints[i].log_rho_err; + + #pragma omp atomic update + finished_points++; + } + } + + } + + + i_have_finished = (finished_points == ctx -> local_n_points); + MPI_Allreduce(&i_have_finished, &all_have_finished, 1, MPI_INT, MPI_LAND, ctx -> mpi_communicator); + + if(all_have_finished) break; + } + + + #pragma omp parallel for + for(idx_t i = 0; i < ctx -> local_n_points; ++i) + { + if(ctx -> local_datapoints[i].kstar == 0) + { + idx_t k = kMAX - 1; + + float_t vvi = ctx -> local_datapoints[i].ngbh.data[k].value; + + local_datapoints[i].kstar = k; + local_datapoints[i].log_rho = log((float_t)(k)/vvi/((float_t)(ctx -> n_points))); + local_datapoints[i].log_rho_err = 1.0/sqrt((float_t)k); //(float_t)(-Q_rsqrt((float)k)); + local_datapoints[i].g = local_datapoints[i].log_rho - local_datapoints[i].log_rho_err; + } + } + + + MPI_Win_fence(0, exposed_ngbh); + MPI_Win_free(&exposed_ngbh); + + free(scratch_heap_nodes); + + #if defined(WRITE_DENSITY) + /* densities */ + float_t* den = (float_t*)MY_MALLOC(ctx -> local_n_points * sizeof(float_t)); + float_t* gs = (float_t*)MY_MALLOC(ctx -> local_n_points * sizeof(float_t)); + idx_t* ks = (idx_t*)MY_MALLOC(ctx -> local_n_points * sizeof(idx_t)); + + for(int i = 0; i < ctx -> local_n_points; ++i) den[i] = ctx -> local_datapoints[i].log_rho; + for(int i = 0; i < ctx -> local_n_points; ++i) ks[i] = ctx -> local_datapoints[i].kstar; + for(int i = 0; i < ctx -> local_n_points; ++i) gs[i] = ctx -> local_datapoints[i].g; + + ordered_buffer_to_file(ctx, den, sizeof(float_t), ctx -> local_n_points, "bb/ordered_density.npy"); + ordered_buffer_to_file(ctx, ks, sizeof(idx_t), ctx -> local_n_points, "bb/ks.npy"); + ordered_buffer_to_file(ctx, gs, sizeof(float_t), ctx -> local_n_points, "bb/g.npy"); + + ordered_data_to_file(ctx); + free(den); + free(ks); + #endif + return; + + +} + +float_t get_j_ksel_dist_v2(global_context_t* ctx, idx_t i, idx_t j, idx_t ksel, int* flags, heap_node* tmp_heap_nodes, MPI_Win* exposed_ngbh) +{ + if(flags[i]) + { + int owner = foreign_owner(ctx, j); + idx_t k = ctx -> k; + /* find if datapoint is halo or not */ + if(owner == ctx -> mpi_rank) + { + idx_t pos = j - ctx -> idx_start; + return ctx -> local_datapoints[pos].ngbh.data[ksel].value; + } + else + { + //RMA + flags[i] = 0; + idx_t pos = j - ctx -> rank_idx_start[owner]; + MPI_Get(tmp_heap_nodes + i, sizeof(heap_node), MPI_BYTE, owner, (MPI_Aint)((pos * k + ksel) * sizeof(heap_node)), sizeof(heap_node), MPI_BYTE, *exposed_ngbh); + //if(ctx -> mpi_rank == 0) DB_PRINT("rvcd %lu %lf\n", el.array_idx, el.value); + return 0; + } + } + else + { + flags[i] = 1; + return tmp_heap_nodes[i].value; + } +} + +void clusters_allocate(clusters_t * c, int s) +{ + /* + * Helper function for handling allocation of resources + */ + if(c -> centers.count == 0) + { + 
printf("Provide a valid cluster centers list\n"); + return; + } + + idx_t nclus = c -> centers.count; + + if(s) + { + //printf("Using sparse implementation\n"); + c -> use_sparse_borders = 1; + c -> sparse_borders = (adj_list_t*)MY_MALLOC(nclus*sizeof(adj_list_t)); + for(idx_t i = 0; i < nclus; ++i) + { + c -> sparse_borders[i].count = 0; + c -> sparse_borders[i].size = PREALLOC_BORDERS; + c -> sparse_borders[i].data = (sparse_border_t*)MY_MALLOC(PREALLOC_BORDERS*sizeof(sparse_border_t)); + } + + } + else + { + //printf("Using dense implementation\n"); + c -> use_sparse_borders = 0; + c -> __borders_data = (border_t*)MY_MALLOC(nclus*nclus*sizeof(border_t)); + c -> borders = (border_t**)MY_MALLOC(nclus*sizeof(border_t*)); + + #pragma omp parallel for + + for(idx_t i = 0; i < nclus; ++i) + { + c -> borders[i] = c -> __borders_data + i*nclus; + for(idx_t j = 0; j < nclus; ++j) + { + c -> borders[i][j] = border_null; + } + } + } +} + + +void adj_list_insert(adj_list_t* l, sparse_border_t b) +{ + /* + * Handling of sparse border implementation as an adjecency list + */ + if(l -> count < l -> size) + { + l -> data[l -> count] = b; + l -> count++; + } + else + { + l -> size += PREALLOC_BORDERS; + l -> data = realloc( l -> data, sizeof(sparse_border_t) * ( l -> size)); + l -> data[l -> count] = b; + l -> count++; + } +} + +void adj_list_reset(adj_list_t* l) +{ + /* + * Handling of sparse border implementation as an adjecency list + */ + if(l -> data) free(l -> data); + l -> count = 0; + l -> size = 0; + l -> data = NULL; +} + +void clusters_reset(clusters_t * c) +{ + /* + * Handling reset of clusters object + */ + if(c -> use_sparse_borders) + { + for(idx_t i = 0; i < c -> centers.count; ++i) + { + adj_list_reset((c -> sparse_borders) + i); + + } + free(c -> sparse_borders); + c -> sparse_borders = NULL; + } + else + { + if(c -> __borders_data) free(c -> __borders_data); + if(c -> borders) free(c -> borders); + } + if(c -> centers.data) free(c -> centers.data); +} + +void clusters_free(clusters_t * c) +{ + /* + * Free cluster object + */ + clusters_reset(c); +} + + +void sparse_border_insert(clusters_t *c, sparse_border_t b) +{ + /* + * Insert a border element in the sparse implementation + */ + + idx_t i = b.i; + adj_list_t l = c -> sparse_borders[i]; + int check = 1; + for(idx_t k = 0; k < l.count; ++k) + { + sparse_border_t p = l.data[k]; + if(p.i == b.i && p.j == b.j) + { + if( b.density > p.density) + { + l.data[k] = b; + } + check = 0; + } + } + if(check) adj_list_insert(c -> sparse_borders + i, b); + return; +} + +sparse_border_t sparse_border_get(clusters_t* c, idx_t i, idx_t j) +{ + /* + * Get a border element in the sparse implementation + * - i,j: cluster to search for borders + * return border_null if not found + */ + + sparse_border_t b = sparse_border_null; + adj_list_t l = c -> sparse_borders[i]; + for(idx_t el = 0; el < l.count; ++el) + { + sparse_border_t candidate = l.data[el]; + if(candidate.i == i && candidate.j == j) + { + b = candidate; + } + } + return b; +} + +datapoint_info_t find_possibly_halo_datapoint_rma(global_context_t* ctx, idx_t idx, MPI_Win win_datapoints) +{ + int owner = foreign_owner(ctx, idx); + /* find if datapoint is halo or not */ + if(owner == ctx -> mpi_rank) + { + idx_t i = idx - ctx -> idx_start; + return ctx -> local_datapoints[i]; + } + else + { + datapoint_info_t tmp_dp; + #pragma omp critical + { + idx_t i = idx - ctx -> rank_idx_start[owner]; + MPI_Request request; + MPI_Status status; + + MPI_Rget(&tmp_dp, sizeof(datapoint_info_t), MPI_BYTE, 
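The sparse border bookkeeping below keeps, for each ordered cluster pair (i, j), only the densest border point seen so far, and the caller is responsible for inserting the symmetric (j, i) entry as well. A usage sketch, assuming clusters_allocate(c, 1) has already been called and repeating local prototypes in case the two helpers are not exported by a header:

#include "adp.h"

/* these helpers live in adp.c; prototypes repeated here for the sketch */
void sparse_border_insert(clusters_t *c, sparse_border_t b);
sparse_border_t sparse_border_get(clusters_t *c, idx_t i, idx_t j);

void example_borders(clusters_t *c)
{
    sparse_border_t b    = {.i = 2, .j = 5, .idx = 42,
                            .density = -1.3, .error = 0.1};
    sparse_border_t bsym = {.i = 5, .j = 2, .idx = 42,
                            .density = -1.3, .error = 0.1};

    sparse_border_insert(c, b);      /* adjacency list of cluster 2          */
    sparse_border_insert(c, bsym);   /* symmetric entry, list of cluster 5   */

    /* a denser border point between the same pair replaces the old one      */
    b.density = -0.7; b.idx = 99;
    sparse_border_insert(c, b);

    sparse_border_t got = sparse_border_get(c, 2, 5);  /* density == -0.7    */
    (void)got;
}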
owner, + i * sizeof(datapoint_info_t), sizeof(datapoint_info_t), MPI_BYTE, win_datapoints, &request); + MPI_Wait(&request, MPI_STATUS_IGNORE); + + } + + return tmp_dp; + } +} + +int cmpPP(const void* p1, const void *p2) +{ + /* + * Utility function to perform quicksort + * when clustering assignment is performed + */ + datapoint_info_t* pp1 = *(datapoint_info_t**)p1; + datapoint_info_t* pp2 = *(datapoint_info_t**)p2; + //return 2*(pp1 -> g < pp2 -> g) - 1; + int v = (pp1 -> g < pp2 -> g) - (pp1 -> g > pp2 -> g); + int a = (pp1 -> array_idx < pp2 -> array_idx) - (pp1 -> array_idx > pp2 -> array_idx); + return v == 0 ? a : v; + //return v; +} + +inline int its_mine(global_context_t* ctx, idx_t idx) +{ + return idx >= ctx -> idx_start && idx < ctx -> idx_start + ctx -> local_n_points; +} + +int compare_center(const void* a, const void* b) +{ + center_t* aa = (center_t*)a; + center_t* bb = (center_t*)b; + return (aa -> density < bb -> density) - (aa -> density > bb -> density); +} + + +void compute_correction(global_context_t* ctx, float_t Z) +{ + /* + * Utility function, find the minimum value of the density of the datapoints + * and shift them up in order to further work with values greater than 0 + */ + float_t min_log_rho = 999999.9; + + datapoint_info_t* dp_info = ctx -> local_datapoints; + idx_t n = ctx -> local_n_points; + + + #pragma omp parallel + { + float_t thread_min_log_rho = 9999999.; + #pragma omp for + for(idx_t i = 0; i < n; ++i) + { + float_t tmp = dp_info[i].log_rho - Z*dp_info[i].log_rho_err; + if(tmp < thread_min_log_rho){ + thread_min_log_rho = tmp; + } + } + #pragma omp critical + if(thread_min_log_rho < min_log_rho) min_log_rho = thread_min_log_rho; + } + + MPI_Allreduce(MPI_IN_PLACE, &min_log_rho, 1, MPI_MY_FLOAT, MPI_MIN, ctx -> mpi_communicator); + + #pragma omp parallel + { + #pragma omp for + for(idx_t i = 0; i < n; ++i) + { + dp_info[i].log_rho_c = dp_info[i].log_rho - min_log_rho + 1; + dp_info[i].g = dp_info[i].log_rho_c - dp_info[i].log_rho_err; + } + + } + + //printf("%lf\n",min_log_rho); +} + +clusters_t Heuristic1(global_context_t *ctx, int verbose) +{ + /* + * Heurisitc 1, from paper of Errico, Facco, Laio & Rodriguez + * ( https://doi.org/10.1016/j.ins.2021.01.010 ) + * + * args: + */ + + datapoint_info_t* dp_info = ctx -> local_datapoints; + idx_t n = ctx -> local_n_points; + + struct timespec start_tot, finish_tot; + double elapsed_tot; + + if(verbose) + { + printf("H1: Preliminary cluster assignment\n"); + clock_gettime(CLOCK_MONOTONIC, &start_tot); + } + + TIME_DEF; + + + //idx_t ncenters = 0; + //idx_t putativeCenters = n; + lu_dynamic_array_t all_centers, removed_centers, actual_centers, max_rho; + lu_dynamic_array_allocate(&all_centers); + lu_dynamic_array_allocate(&removed_centers); + lu_dynamic_array_allocate(&actual_centers); + lu_dynamic_array_allocate(&max_rho); + + datapoint_info_t** dp_info_ptrs = (datapoint_info_t**)MY_MALLOC(n*sizeof(datapoint_info_t*)); + + struct timespec start, finish; + double elapsed; + + + if(verbose) + { + clock_gettime(CLOCK_MONOTONIC, &start); + } + + /* proceed */ + + MPI_Win win_datapoints; + MPI_Win_create(ctx -> local_datapoints, ctx -> local_n_points * sizeof(datapoint_info_t), + 1, MPI_INFO_NULL, ctx -> mpi_communicator, &win_datapoints); + MPI_Win_fence(0, win_datapoints); + MPI_Win_lock_all(0, win_datapoints); + +#if defined(THREAD_FUNNELED) +#else + #pragma omp parallel for +#endif + + for(idx_t i = 0; i < n; ++i) + { + /* + * Find the centers of the clusters as the points of higher density in their 
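compute_correction above shifts every log-density so that even the most pessimistic lower bound log_rho - Z*err ends up at 1: the minimum of that bound is found locally with OpenMP and globally with an MPI_Allreduce(MIN), then subtracted from all points. A serial equivalent, for reference (n > 0 assumed):

#include <stddef.h>

/* Serial version of the density shift done in compute_correction(). */
static void shift_densities(const double *log_rho, const double *log_rho_err,
                            double *log_rho_c, size_t n, double Z)
{
    double min_lb = log_rho[0] - Z * log_rho_err[0];
    for (size_t i = 1; i < n; ++i) {
        double lb = log_rho[i] - Z * log_rho_err[i];
        if (lb < min_lb) min_lb = lb;        /* global MIN in the MPI code  */
    }
    for (size_t i = 0; i < n; ++i)
        log_rho_c[i] = log_rho[i] - min_lb + 1.0;   /* corrected density    */
}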
neighborhoods + * A point is tagged as a putative center if it is the point of higer density of its neighborhood + */ + + dp_info_ptrs[i] = dp_info + i; + idx_t maxk = dp_info[i].kstar + 1; + float_t gi = dp_info[i].g; + dp_info[i].is_center = 1; + dp_info[i].cluster_idx = -1; + //printf("%lf\n",p -> g); + heap i_ngbh = dp_info[i].ngbh; + for(idx_t k = 1; k < maxk; ++k) + { + idx_t ngbh_index = i_ngbh.data[k].array_idx; + datapoint_info_t dj = find_possibly_halo_datapoint_rma(ctx, ngbh_index, win_datapoints); + float_t gj = dj.g; + if(gj > gi){ + dp_info[i].is_center = 0; + break; + } + } + if(dp_info[i].is_center) + { + //lu_dynamic_array_pushBack(&all_centers, dp_info[i].array_idx); + #pragma omp critical + { + lu_dynamic_array_pushBack(&all_centers, i); + } + } + } + + if(verbose) + { + clock_gettime(CLOCK_MONOTONIC, &finish); + elapsed = (finish.tv_sec - start.tv_sec); + elapsed += (finish.tv_nsec - start.tv_nsec) / 1000000000.0; + printf("\tFinding putative centers: %.3lfs\n",elapsed); + clock_gettime(CLOCK_MONOTONIC, &start); + } + + + /* + * optimized version + * + * Generate a mask that keeps track of the point has been eliminating the + * point considered. Each thread updates this mask, then after the procedure + * ends, center, removed centers, and max_rho arrays are populated + */ + + heap_node* to_remove_mask = (heap_node*)MY_MALLOC(n*sizeof(heap_node)); + for(idx_t p = 0; p < n; ++p) + { + to_remove_mask[p].array_idx = MY_SIZE_MAX; + to_remove_mask[p].value = 9999999; + } + qsort(dp_info_ptrs, n, sizeof(datapoint_info_t*), cmpPP); + + + MPI_Win win_to_remove_mask; + MPI_Win_create(to_remove_mask, n * sizeof(heap_node), 1, MPI_INFO_NULL, ctx -> mpi_communicator, &win_to_remove_mask); + MPI_Win_fence(0, win_to_remove_mask); + + + + /* check into internal nodes */ + + /* + * to remove + * and to reimplement it using rma + * for dp in datapoints: + * for nghb in neighbors: + * if ngbh its_mine + * no_problems, do it as always + * else: + * ngbh is an external node + * do rma things, + * in particular, acquire a lock on the window, read and write things + * then close + */ + +#if defined(THREAD_FUNNELED) +#else + #pragma omp parallel for +#endif + for(idx_t p = 0; p < n; ++p) + { + datapoint_info_t i_point = *(dp_info_ptrs[p]); + + for(idx_t j = 1; j < i_point.kstar + 1; ++j) + { + idx_t jidx = i_point.ngbh.data[j].array_idx; + + datapoint_info_t j_point = find_possibly_halo_datapoint_rma(ctx, jidx, win_datapoints); + + if(j_point.is_center && i_point.g > j_point.g) + { + #pragma omp critical + { + /* + * + * THIS PART CAN BE SUBSTITUTED + * we have something like (value, idx) pairs and if we have less than + * MAX_UINT32 particles we can manipulate value and idx to fit into a single + * UINT64 and then perform a single atomic max operation + * + * */ + int owner = foreign_owner(ctx, jidx); + idx_t jpos = jidx - ctx -> rank_idx_start[owner]; + + MPI_Win_lock(MPI_LOCK_EXCLUSIVE, owner, 0, win_to_remove_mask); + heap_node mask_element; + MPI_Request request; + MPI_Rget( &mask_element, sizeof(heap_node), MPI_BYTE, + owner, jpos * sizeof(heap_node) , sizeof(heap_node), MPI_BYTE, win_to_remove_mask, &request); + MPI_Wait(&request, MPI_STATUS_IGNORE); + + int flag = mask_element.array_idx == MY_SIZE_MAX; + if(flag || i_point.g > mask_element.value ) + { + heap_node tmp_mask_element = {.array_idx = i_point.array_idx, .value = i_point.g}; + MPI_Request request; + MPI_Rput(&tmp_mask_element, sizeof(heap_node), MPI_BYTE, owner, + jpos*sizeof(heap_node), sizeof(heap_node), MPI_BYTE, 
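The critical section above performs a lock / Rget / compare / Rput round trip per removal candidate; the in-code comment points out that, as long as indices fit in 32 bits, a (density, index) pair fits in one 64-bit word and the whole exchange could collapse into a single one-sided accumulate with MPI_MAX. A sketch of that packing idea, which is not what the current code does: it assumes non-negative densities (so the IEEE-754 bit pattern orders like the value), a window exposing a uint64_t mask, and an open passive-target epoch (e.g. MPI_Win_lock_all) with an MPI_Win_flush before the result is read back.

#include <mpi.h>
#include <stdint.h>
#include <string.h>

/* Pack a non-negative float density and a 32-bit index into one word whose
 * unsigned ordering matches "higher density wins, ties broken by index". */
static uint64_t pack_density_idx(float density, uint32_t idx)
{
    uint32_t dbits;
    memcpy(&dbits, &density, sizeof(dbits));   /* density assumed >= 0 */
    return ((uint64_t)dbits << 32) | (uint64_t)idx;
}

/* One-sided "atomic max" on the remote mask slot. */
static void atomic_max_update(MPI_Win win, int owner, MPI_Aint slot,
                              float density, uint32_t idx)
{
    uint64_t packed = pack_density_idx(density, idx);
    MPI_Accumulate(&packed, 1, MPI_UINT64_T, owner,
                   slot, 1, MPI_UINT64_T, MPI_MAX, win);
}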
win_to_remove_mask, &request); + MPI_Wait(&request, MPI_STATUS_IGNORE); + + } + + MPI_Win_unlock(owner, win_to_remove_mask); + } + } + } + } + + MPI_Win_fence(0, win_to_remove_mask); + MPI_Barrier(ctx -> mpi_communicator); + + /* populate the usual arrays */ + for(idx_t p = 0; p < all_centers.count; ++p) + { + idx_t i = all_centers.data[p]; + int e = 0; + //float_t gi = dp_info[i].g; + idx_t mr = to_remove_mask[i].array_idx; + if(mr != MY_SIZE_MAX) + { + //if(dp_info[mr].g > gi) e = 1; + e = 1; + } + switch (e) + { + case 1: + { + lu_dynamic_array_pushBack(&removed_centers,i); + dp_info[i].is_center = 0; + for(idx_t c = 0; c < removed_centers.count - 1; ++c) + { + if(mr == removed_centers.data[c]) + { + mr = max_rho.data[c]; + } + } + lu_dynamic_array_pushBack(&max_rho,mr); + + } + break; + + case 0: + { + lu_dynamic_array_pushBack(&actual_centers,i); + dp_info[i].cluster_idx = actual_centers.count - 1; + } + break; + + default: + break; + } + } + + MPI_Win_free(&win_to_remove_mask); + free(to_remove_mask); + + if(verbose) + { + clock_gettime(CLOCK_MONOTONIC, &finish); + elapsed = (finish.tv_sec - start.tv_sec); + elapsed += (finish.tv_nsec - start.tv_nsec) / 1000000000.0; + printf("\tFinding actual centers: %.3lfs\n",elapsed); + clock_gettime(CLOCK_MONOTONIC, &start); + } + + int n_centers = (int)actual_centers.count; + int tot_centers; + MPI_Allreduce(&n_centers, &tot_centers, 1, MPI_INT, MPI_SUM, ctx -> mpi_communicator); + // MPI_Allreduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm) + /* + DB_PRINT("rank %d got %d heheh\n", ctx -> mpi_rank, (int)actual_centers.count); + DB_PRINT("rank %d tc %d rc %d\n", ctx -> mpi_rank, (int)all_centers.count, (int)removed_centers.count); + */ + MPI_DB_PRINT("Found %d temporary centers\n", tot_centers); + + + /* bring on master all centers + * order them in ascending order of density, + * then re-scatter them around to get unique cluster labels */ + + + center_t* private_centers_buffer = (center_t*)MY_MALLOC(actual_centers.count * sizeof(center_t)); + + center_t* global_centers_buffer = (center_t*)MY_MALLOC(tot_centers * sizeof(center_t)); + + for(int i = 0; i < actual_centers.count; ++i) + { + idx_t idx = actual_centers.data[i] ; + private_centers_buffer[i].density = dp_info[idx].g; + private_centers_buffer[i].idx = dp_info[idx].array_idx; + } + MPI_Datatype mpi_center_type; + + MPI_Type_contiguous(sizeof(center_t), MPI_BYTE, &mpi_center_type); + MPI_Type_commit(&mpi_center_type); + + int* center_counts = (int*)MY_MALLOC(ctx -> world_size * sizeof(int)); + int* center_displs = (int*)MY_MALLOC(ctx -> world_size * sizeof(int)); + + int cc = (int)actual_centers.count; + MPI_Allgather(&cc, 1, MPI_INT, center_counts, 1, MPI_INT, ctx -> mpi_communicator); + + center_displs[0] = 0; + for(int i = 1; i < ctx -> world_size; ++i) center_displs[i] = center_displs[i - 1] + center_counts[i - 1]; + + /* alternative to check if something breaks */ + for(int i = 0; i < actual_centers.count; ++i) + { + idx_t idx = actual_centers.data[i]; + dp_info[idx].cluster_idx += center_displs[ctx -> mpi_rank]; + actual_centers.data[i] += ctx -> idx_start; + } + + idx_t* all_center_idx = (idx_t*)MY_MALLOC(tot_centers * sizeof(idx_t)); + + MPI_Allgatherv(actual_centers.data, actual_centers.count, MPI_UINT64_T, all_center_idx, center_counts, center_displs, MPI_UINT64_T, ctx -> mpi_communicator); + + free(center_counts); + free(center_displs); + + /* + * Sort all the dp_info based on g and then perform the cluster assignment + * in 
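Unique global cluster labels are obtained here by offsetting each rank's local center indices with the number of centers owned by all lower ranks; the code derives that offset from the MPI_Allgather'ed counts and the displacement scan it needs anyway for the Allgatherv. An equivalent, more compact way to obtain just the offset (a sketch, not what the code does) is an exclusive prefix sum:

#include <mpi.h>

/* Rank r then owns the global labels [offset, offset + n_local_centers). */
static int cluster_label_offset(int n_local_centers, MPI_Comm comm)
{
    int offset = 0;
    MPI_Exscan(&n_local_centers, &offset, 1, MPI_INT, MPI_SUM, comm);
    /* MPI_Exscan leaves the receive buffer undefined on rank 0 */
    int rank;
    MPI_Comm_rank(comm, &rank);
    return rank == 0 ? 0 : offset;
}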
asceding order + */ + + /* + * Sort all the dp_info based on g and then perform the cluster assignment + * in asceding order + */ + + int completed = 0; + + while(!completed) + { + completed = 1; + + + int proc_points = 0; + /* retrieve assignment + * if some point remain uncompleted then get assignment from external points */ + + for(idx_t i = 0; i < n; ++i) + { + int wait_for_comms = 0; + datapoint_info_t* p = dp_info_ptrs[i]; + if(!(p -> is_center) && p -> cluster_idx < 0) + { + int cluster = -1; + idx_t k = 0; + idx_t p_idx; + idx_t max_k = p -> ngbh.N; + /*assign each particle at the same cluster as the nearest particle of higher density*/ + datapoint_info_t p_retrieved; + while( k < max_k - 1 && cluster == -1) + { + + + ++k; + p_idx = p -> ngbh.data[k].array_idx; + p_retrieved = find_possibly_halo_datapoint_rma(ctx, p_idx, win_datapoints); + + int flag = p_retrieved.g > p -> g; + + if(p_retrieved.g > p -> g) + { + cluster = p_retrieved.cluster_idx; + wait_for_comms = 1; + break; + } + //if(p -> array_idx == 1587636) printf("%lu k %d p_idx %lu %lf %lf\n", k, cluster, p_idx, p_retrieved.g, p -> g ); + } + + + if(cluster == -1 && !wait_for_comms) + { + float_t gmax = -99999.; + idx_t gm_index = SIZE_MAX; + for(idx_t k = 0; k < max_k; ++k) + { + idx_t ngbh_index = p -> ngbh.data[k].array_idx; + for(idx_t m = 0; m < removed_centers.count; ++m) + { + idx_t max_rho_idx = max_rho.data[m]; + datapoint_info_t dp_max_rho = find_possibly_halo_datapoint_rma(ctx, max_rho_idx, win_datapoints); + //datapoint_info_t dp_max_rho = dp_info[max_rho_idx]; + float_t gcand = dp_max_rho.g; + if(ngbh_index == removed_centers.data[m] && gcand > gmax) + { + //printf("%lu -- %lu\n", ele, m); + gmax = gcand; + gm_index = max_rho.data[m]; + } + } + } + if(gm_index != SIZE_MAX) + { + datapoint_info_t dp_gm = find_possibly_halo_datapoint_rma(ctx, gm_index, win_datapoints); + //datapoint_info_t dp_gm = dp_info[gm_index]; + cluster = dp_gm.cluster_idx; + } + + } + p -> cluster_idx = cluster; + + } + completed = completed && p -> cluster_idx != -1; + proc_points += p -> cluster_idx != -1 ? 1 : 0; + + } + + //DB_PRINT("rank %d proc %d\n", ctx -> mpi_rank, proc_points); + MPI_Allreduce(MPI_IN_PLACE, &completed, 1, MPI_INT, MPI_SUM, ctx -> mpi_communicator); + completed = completed == ctx -> world_size ? 
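The assignment rule applied in the loop below is the usual density-peaks one: every non-center point inherits the label of its nearest neighbour with larger g. Serially this resolves in a single pass if points are visited in order of decreasing g; the distributed loop cannot guarantee that ordering across ranks, hence the repeated sweeps plus the MPI_Allreduce until every rank reports completion. A serial reference of the core rule (it leaves out the removed-center fallback handled via max_rho below):

#include <stddef.h>

/* order[]   : point indices sorted by decreasing g
 * ngbh[p][k]: k-th neighbour of p, k = 0 is p itself
 * cluster[] : initialised to -1 everywhere except at the centers */
static void assign_serial(const size_t *order, size_t n, size_t kmax,
                          size_t **ngbh, const double *g, int *cluster)
{
    for (size_t r = 0; r < n; ++r) {
        size_t p = order[r];
        if (cluster[p] != -1) continue;          /* center, already labelled */
        for (size_t k = 1; k < kmax; ++k) {
            size_t q = ngbh[p][k];
            if (g[q] > g[p]) {                   /* nearest higher-density pt,
                                                    visited earlier, so its
                                                    label is already set     */
                cluster[p] = cluster[q];
                break;
            }
        }
    }
}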
1 : 0; + + } + + MPI_Win_unlock_all(win_datapoints); + MPI_Win_fence(0, win_datapoints); + MPI_Win_free(&win_datapoints); + + MPI_Barrier(ctx -> mpi_communicator); + + if(verbose) + { + clock_gettime(CLOCK_MONOTONIC, &finish); + elapsed = (finish.tv_sec - start.tv_sec); + elapsed += (finish.tv_nsec - start.tv_nsec) / 1000000000.0; + printf("\tTentative clustering: %.3lfs\n",elapsed); + clock_gettime(CLOCK_MONOTONIC, &start); + } + + free(dp_info_ptrs); + free(max_rho.data); + free(removed_centers.data); + free(all_centers.data); + + #if defined(WRITE_CLUSTER_ASSIGN_H1) + /* densities */ + int* ks = (int*)MY_MALLOC(ctx -> local_n_points * sizeof(int)); + for(int i = 0; i < ctx -> local_n_points; ++i) ks[i] = ctx -> local_datapoints[i].cluster_idx; + + ordered_buffer_to_file(ctx, ks, sizeof(int), ctx -> local_n_points, "bb/cl.npy"); + ordered_data_to_file(ctx); + free(ks); + #endif + + + free(actual_centers.data); + actual_centers.size = tot_centers; + actual_centers.count = tot_centers; + actual_centers.data = all_center_idx; + + + clusters_t c_all; + c_all.centers = actual_centers; + + + + if(verbose) + { + clock_gettime(CLOCK_MONOTONIC, &finish); + elapsed = (finish.tv_sec - start.tv_sec); + elapsed += (finish.tv_nsec - start.tv_nsec) / 1000000000.0; + printf("\tFinalizing clustering: %.3lfs\n",elapsed); + printf("\n"); + } + + clock_gettime(CLOCK_MONOTONIC, &finish_tot); + elapsed_tot = (finish_tot.tv_sec - start_tot.tv_sec); + elapsed_tot += (finish_tot.tv_nsec - start_tot.tv_nsec) / 1000000000.0; + + + if(verbose) + { + printf("\tFound %lu clusters\n",(uint64_t)actual_centers.count); + printf("\tTotal time: %.3lfs\n\n", elapsed_tot); + } + + c_all.n = n; + return c_all; +} + + +void Heuristic2(global_context_t* ctx, clusters_t* cluster) +{ + /* + * Port the current implementation to this using rma + * then perform the matrix reduction + * + * Each one computes its borders, then the borders are shared, and the matrix is + * reduced to a single huge matrix of borders + * + */ + + int verbose = 0; + + datapoint_info_t* dp_info = ctx -> local_datapoints; + + MPI_Win dp_info_win, ngbh_win; + MPI_Win_create(ctx -> local_datapoints, ctx -> local_n_points * sizeof(datapoint_info_t), 1, MPI_INFO_NULL, ctx -> mpi_communicator, &dp_info_win); + MPI_Win_create(ctx -> __local_heap_buffers, ctx -> local_n_points * ctx -> k * sizeof(heap_node), 1, MPI_INFO_NULL, ctx -> mpi_communicator, &ngbh_win); + + MPI_Win_fence(0, dp_info_win); + MPI_Win_fence(0, ngbh_win); + + MPI_Win_lock_all(0, dp_info_win); + MPI_Win_lock_all(0, ngbh_win); + MPI_Barrier(ctx -> mpi_communicator); + + + + #define borders cluster->borders + + struct timespec start_tot, finish_tot; + double elapsed_tot; + idx_t n = ctx -> local_n_points; + + if(verbose) + { + printf("H2: Finding border points\n"); + clock_gettime(CLOCK_MONOTONIC, &start_tot); + } + + idx_t nclus = cluster->centers.count; + idx_t max_k = dp_info[0].ngbh.N; + + for(idx_t i = 0; i < n; ++i) + { + idx_t pp = NOBORDER; + /*loop over n neighbors*/ + int c = dp_info[i].cluster_idx; + if(!dp_info[i].is_center) + { + for(idx_t k = 1; k < dp_info[i].kstar + 1; ++k) + { + /*index of the kth ngbh of n*/ + idx_t j = dp_info[i].ngbh.data[k].array_idx; + pp = NOBORDER; + /*Loop over kn neigbhours to find if n is the nearest*/ + /*if cluster of the particle in nbhg is c then check is neighborhood*/ + + //DB_PRINT("my rank %d l %lu %lu %d\n", ctx -> mpi_rank, k, j, foreign_owner(ctx, j)); + datapoint_info_t j_dp = find_possibly_halo_datapoint_rma(ctx, j, dp_info_win); + 
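The border test that starts here (and is completed just below) marks point i of cluster c as a border candidate towards another cluster when its first foreign neighbour pp, found within k*_i, sees i before any other member of c in its own neighbour list; among all candidates for a given cluster pair the densest one survives, courtesy of sparse_border_insert. A serial sketch of the per-point test, assuming dense ngbh/cluster arrays:

#include <stddef.h>

/* Returns 1 if point i of cluster c is a border candidate. */
static int is_border_point(size_t i, int c, size_t kstar_i, size_t kmax,
                           size_t **ngbh, const int *cluster)
{
    size_t pp = (size_t)-1;
    for (size_t k = 1; k < kstar_i + 1; ++k) {
        size_t j = ngbh[i][k];
        if (cluster[j] != c) { pp = j; break; }   /* first foreign neighbour */
    }
    if (pp == (size_t)-1) return 0;               /* interior point          */

    for (size_t k = 1; k < kmax; ++k) {
        size_t q = ngbh[pp][k];
        if (q == i) return 1;            /* i is pp's closest point of c     */
        if (cluster[q] == c) return 0;   /* another point of c comes first   */
    }
    return 1;                            /* no point of c among pp's ngbh    */
}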
if(j_dp.cluster_idx != c) + { + pp = j; + break; + } + + } + } + + if(pp != NOBORDER) + { + datapoint_info_t pp_dp = find_possibly_halo_datapoint_rma(ctx, pp, dp_info_win); + for(idx_t k = 1; k < max_k; ++k) + { + idx_t pp_ngbh_idx = get_j_ksel_idx(ctx, pp_dp.array_idx, k, ngbh_win); + datapoint_info_t pp_ngbh_dp = find_possibly_halo_datapoint_rma(ctx, pp_ngbh_idx, dp_info_win); + //TODO: can optimize it can retrieve the whole ngbh in one shot + + if(pp_ngbh_idx == dp_info[i].array_idx) + { + break; + } + + + if(pp_ngbh_dp.cluster_idx == c) + { + pp = NOBORDER; + break; + } + } + } + + /*if it is the maximum one add it to the cluster*/ + if(pp != NOBORDER) + { + datapoint_info_t j_dp = find_possibly_halo_datapoint_rma(ctx, pp, dp_info_win); + int ppc = j_dp.cluster_idx; + //insert one and symmetric one + sparse_border_t b = {.i = c, .j = ppc, .idx = ctx -> local_datapoints[i].array_idx, .density = ctx -> local_datapoints[i].g, .error = ctx -> local_datapoints[i].log_rho_err}; + sparse_border_insert(cluster, b); + ////get symmetric border + sparse_border_t bsym = {.i = ppc, .j = c, .idx = ctx -> local_datapoints[i].array_idx, .density = ctx -> local_datapoints[i].g, .error = ctx -> local_datapoints[i].log_rho_err}; + sparse_border_insert(cluster, bsym); + } + + + } + + MPI_Barrier(ctx -> mpi_communicator); + + idx_t num_border_el = 0; + for(idx_t i = 0; i < nclus; ++i) + { + num_border_el += cluster -> sparse_borders[i].count; + } + + int i_have_sent = 0; + int level = 1; + int ranks = ctx -> world_size; + + #define SEND 1 + #define RECV 0 + #define DO_NOTHING -1 + + MPI_Barrier(ctx -> mpi_communicator); + + while(ranks > 1) + { + int dp = ranks % 2; + ranks = ranks / 2 + dp; + int send_rcv = ctx -> mpi_rank >= ranks; + + if(dp && ctx -> mpi_rank == ranks - 1) send_rcv = DO_NOTHING; + + switch (send_rcv) + { + case SEND: + if(!i_have_sent) + { + idx_t num_border_el = 0; + for(idx_t i = 0; i < nclus; ++i) + { + num_border_el += cluster -> sparse_borders[i].count; + } + sparse_border_t* borders_to_send = (sparse_border_t*)MY_MALLOC(num_border_el * sizeof(sparse_border_t)); + + idx_t count = 0; + for(idx_t i = 0; i < nclus; ++i) + { + for(idx_t j = 0; j < cluster -> sparse_borders[i].count; ++j) + { + + borders_to_send[count] = cluster -> sparse_borders[i].data[j]; + count++; + } + } + + int rank_to_send = ctx -> mpi_rank - ranks; + + #if defined(PRINT_H2_COMM_SCHEME) + DB_PRINT("-- Rank %d sending to %d\n", ctx -> mpi_rank, rank_to_send); + #endif + + MPI_Send(&num_border_el, 1, MPI_UINT64_T, rank_to_send, rank_to_send, ctx -> mpi_communicator); + + MPI_Send(borders_to_send, num_border_el * sizeof(sparse_border_t), MPI_BYTE , rank_to_send, rank_to_send, ctx -> mpi_communicator); + + i_have_sent = 1; + free(borders_to_send); + } + break; + case RECV: + { + #if defined(PRINT_H2_COMM_SCHEME) + DB_PRINT("** Rank %d recieving\n", ctx -> mpi_rank); + #endif + idx_t num_borders_recv = 0; + MPI_Recv(&num_borders_recv, 1, MPI_UINT64_T, MPI_ANY_SOURCE, ctx -> mpi_rank, ctx -> mpi_communicator, MPI_STATUS_IGNORE); + + sparse_border_t* borders_to_recv = (sparse_border_t*)MY_MALLOC(num_borders_recv * sizeof(sparse_border_t)); + + MPI_Recv(borders_to_recv, num_borders_recv * sizeof(sparse_border_t), MPI_BYTE , MPI_ANY_SOURCE, ctx -> mpi_rank, ctx -> mpi_communicator, MPI_STATUS_IGNORE); + + for(int i = 0; i < num_borders_recv; ++i) + { + sparse_border_insert(cluster, borders_to_recv[i]); + } + free(borders_to_recv); + } + break; + default: + #if defined(PRINT_H2_COMM_SCHEME) + DB_PRINT(".. 
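The border lists are then folded onto rank 0 with the pairwise reduction implemented in the while loop: at every round the count of surviving ranks is halved (rounded up), the upper half ships its sparse borders to rank - ranks, and each receiver merges them via sparse_border_insert, so the whole exchange takes about log2(P) rounds. A small stand-alone helper to print that schedule for a given world size (pure bookkeeping, no MPI involved):

#include <stdio.h>

static void print_reduction_schedule(int world_size)
{
    int ranks = world_size;
    int round = 0;
    while (ranks > 1) {
        int odd = ranks % 2;
        ranks = ranks / 2 + odd;                 /* surviving ranks          */
        printf("round %d:\n", round++);
        for (int r = ranks; r < 2 * ranks - odd; ++r)
            printf("  rank %d -> rank %d\n", r, r - ranks);
        if (odd) printf("  rank %d idle\n", ranks - 1);
    }
}

For 6 ranks this prints 3->0, 4->1, 5->2 in round 0, then 2->0 with rank 1 idle, and finally 1->0, matching the SEND/RECV/DO_NOTHING cases above.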
Rank %d doing nothing\n", ctx -> mpi_rank); + #endif + break; + } + MPI_Barrier(ctx -> mpi_communicator); + #if defined(PRINT_H2_COMM_SCHEME) + MPI_DB_PRINT("-----------------\n"); + #endif + + } + + num_border_el = 0; + for(idx_t i = 0; i < nclus; ++i) + { + num_border_el += cluster -> sparse_borders[i].count; + } + + MPI_DB_PRINT("Master final %lu border elements\n", num_border_el); + + //correction of the density at the borders to be compliant with dadadpy + if(I_AM_MASTER) + { + for(idx_t c = 0; c < nclus; ++c) + { + for(idx_t el = 0; el < cluster -> sparse_borders[c].count; ++el) + { + //fix border density, write log rho c + idx_t idx = cluster -> sparse_borders[c].data[el].idx; + datapoint_info_t idx_dp = find_possibly_halo_datapoint_rma(ctx, idx, dp_info_win); + cluster -> sparse_borders[c].data[el].density = idx_dp.log_rho_c; + } + } + } + + MPI_Barrier(ctx -> mpi_communicator); + MPI_Win_unlock_all(ngbh_win); + MPI_Win_unlock_all(dp_info_win); + + MPI_Win_free(&ngbh_win); + MPI_Win_free(&dp_info_win); + + #undef SEND + #undef RECV + #undef DO_NOTHING + + #ifdef WRITE_BORDERS + if(I_AM_MASTER) + { + printf("[MASTER] Writing borders to bb/borders.csv\n"); + FILE* f = fopen("bb/borders.csv", "w"); + for(idx_t i = 0; i < nclus; ++i) + { + for(int j = 0; j < cluster -> sparse_borders[i].count; ++j) + { + sparse_border_t b = cluster -> sparse_borders[i].data[j]; + fprintf(f, "%lu,%lu,%lu,%lf\n", b.i, b.j, b.idx, b.density); + } + } + fclose(f); + } + + + #endif + + return; + #undef borders + + } + + +int compare_merging_density( const void *A, const void *B) +{ + /* + * Utility function + * comparision between two merging + * elements + */ + + float_t DensA = ((merge_t*)A)->density; + float_t DensB = ((merge_t*)B)->density; + + return - ( DensA > DensB) + (DensA < DensB); +} + + +static inline int is_a_merging( float_t dens1, float_t dens1_err, + float_t dens2, float_t dens2_err, + float_t dens_border, float_t dens_border_err, + float_t Z) +{ + /* + * dens1 : the density of the particle that is the center of the first cluster + * dens2 : the density of the particle that is the center of the second cluster + * dens_border : the density of the border btw the cluster 1 and the cluster 2 + * border_err : the errors on the densities + * Z : the desired accuracy + */ + + /* in the original code it was: + * + float_t a1 = dp_info[cluster->centers.data[i]].log_rho_c - border_density[i][j]; + float_t a2 = dp_info[cluster->centers.data[j]].log_rho_c - border_density[i][j]; + + float_t e1 = Z*(dp_info[cluster->centers.data[i]].log_rho_err + border_err[i][j]); + float_t e2 = Z*(dp_info[cluster->centers.data[j]].log_rho_err + border_err[i][j]); + */ + + float_t a1 = dens1 - dens_border; + float_t a2 = dens2 - dens_border; + + float_t e1 = Z*(dens1_err + dens_border_err); + float_t e2 = Z*(dens2_err + dens_border_err); + + return (a1 < e1 || a2 < e2); +} + + +static inline int merging_roles( float_t dens1, float_t dens1_err, + float_t dens2, float_t dens2_err, + float_t dens_border, float_t dens_border_err ) +{ + /* + * Utility function + * Retrieve if cluster 1 is merged into 2 or + * vice versa + */ + + float_t c1 = (dens1 - dens_border) / (dens1_err + dens_border_err); + float_t c2 = (dens2 - dens_border) / (dens2_err + dens_border_err); + //printf("%.10lf %.10lf %d\n",c1,c2, c1 > c2); + + return ( c1 < c2 ); // if 1, this signal to swap 1 and 2 +} + +void fix_borders_A_into_B(idx_t A, idx_t B, border_t** borders, idx_t n) +{ + /* + * Dense border implementation + * - idx_t A : cluster A the one 
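A tiny worked example of the merge test encoded by is_a_merging() and merging_roles() below (the numbers are made up): two peaks whose corrected log densities are 5.0 and 4.2, a saddle (border) at 3.9, errors 0.2 / 0.25 / 0.3 and confidence parameter Z = 1. The pair merges because the second peak does not rise significantly above the saddle, and the cluster with the smaller normalised contrast is the one absorbed.

#include <stdio.h>

int main(void)
{
    double d1 = 5.0, e1 = 0.20;     /* center of cluster 1                  */
    double d2 = 4.2, e2 = 0.25;     /* center of cluster 2                  */
    double db = 3.9, eb = 0.30;     /* border (saddle) between them         */
    double Z  = 1.0;

    /* is_a_merging(): merge if either peak is within Z error bars of the
     * saddle; here 4.2 - 3.9 = 0.3 < 1.0 * (0.25 + 0.30) = 0.55            */
    int merge = (d1 - db < Z * (e1 + eb)) || (d2 - db < Z * (e2 + eb));
    printf("merge: %d\n", merge);                        /* prints 1        */

    /* merging_roles(): the cluster with the smaller normalised contrast
     * (d - db)/(e + eb) is the one merged into the other                   */
    double c1 = (d1 - db) / (e1 + eb);   /* 2.2                             */
    double c2 = (d2 - db) / (e2 + eb);   /* ~0.55                           */
    printf("cluster %d merges into cluster %d\n",
           c1 < c2 ? 1 : 2, c1 < c2 ? 2 : 1);            /* 2 into 1        */
    return 0;
}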
which goes into the other + * - idx_t B : cluster B the one that recieves the merging + * - border_t** borders : whole border matrix + * - idx_t n : number of clusters + * + */ + + #pragma omp parallel for if(n > MAX_SERIAL_MERGING) + for(idx_t i = 0; i < n; ++i) + { + if(borders[A][i].idx != NOBORDER ) + { + if(borders[B][i].idx != NOBORDER) + { + int mb = (borders[A][i].density > borders[B][i].density); + + borders[B][i] = mb ? borders[A][i] : borders[B][i]; + borders[i][B] = borders[B][i]; + } + else + { + borders[B][i] = borders[A][i]; + borders[i][B] = borders[B][i]; + } + } + borders[A][i] = border_null; + borders[i][A] = border_null; + } +} + +static inline void delete_adj_list_element(clusters_t * c, const idx_t list_idx, const idx_t el) +{ + /* + * Utility function + * Deletes an element into an adjecency list, + * representing the borders in the cluster topology + */ + + idx_t count = c -> sparse_borders[list_idx].count; + c -> sparse_borders[list_idx].data[el] = c -> sparse_borders[list_idx].data[count-1]; + c -> sparse_borders[list_idx].data[count-1] = sparse_border_null; + c -> sparse_borders[list_idx].count -= 1; +} + +void fix_sparse_borders_A_into_B(idx_t s,idx_t t, clusters_t* c) +{ + /* + * Handle borders after two clusters are merged + * - idx_t s : source cluster, the one has to be merged + * - idx_t t : target cluster, the one recieves the merge + * - clusters* c : object containing all the data + * + * When s goes into t all the clusters which had a border with s now they must have + * a border with t. If t already has a border like that, the border with higher + * density is kept + */ + + { + { + for(idx_t el = 0; el < c -> sparse_borders[t].count; ++el) + { + sparse_border_t b = c -> sparse_borders[t].data[el]; + if(b.i == t && b.j == s) + { + //delete the border src trg + delete_adj_list_element(c, t, el); + } + } + } + //find the border and delete it, other insert them in correct place + for(idx_t el = 0; el < c -> sparse_borders[s].count; ++el) + { + sparse_border_t b = c -> sparse_borders[s].data[el]; + // idx_t ii = b.i; + if(b.j != t) + { + //insert these borders as trg -> j and j -> trg + b.i = t; + sparse_border_insert(c, b); + sparse_border_t bsym = b; + bsym.i = b.j; + bsym.j = b.i; + sparse_border_insert(c, bsym); + for(idx_t dl = 0; dl < c -> sparse_borders[b.j].count; ++dl) + { + sparse_border_t b_del = c -> sparse_borders[b.j].data[dl]; + if(b_del.j == s) + { + //delete the border src trg + delete_adj_list_element(c, b.j, dl); + } + } + + } + } + //clean up all borders + //delete the src list + { + adj_list_reset((c->sparse_borders) + s); + } + //delete all borders containing src + // for(idx_t i = 0; i < nclus; ++i) + // { + // for(idx_t el = 0; el < c -> sparse_borders[i].count; ++el) + // { + // sparse_border_t b = c -> sparse_borders[i].data[el]; + // if(b.j == s) + // { + // //delete the border src trg + // delete_adj_list_element(c, i, el); + // } + // } + // + // } + } + + +} + +void merge_A_into_B(idx_t* who_amI, idx_t cluster_A, idx_t cluster_B, idx_t n) +{ + /* + * Utility function + * Performs correctino of the labels in the array that + * keep tracks of what cluster is after a merging + */ + + #pragma omp parallel if(n > MAX_SERIAL_MERGING) + { + idx_t tmp; + #pragma omp for + for(idx_t i = 0; i < n; ++i) + { + //substitute occurencies of b with a + tmp = who_amI[i] == cluster_A ? 
cluster_B : who_amI[i]; + who_amI[i] = tmp; + } + } + return; +} + +void master_finds_borders(global_context_t* ctx, clusters_t* cluster, float_t Z, idx_t* surviving_clusters, datapoint_info_t* centers_dp) +{ + datapoint_info_t* dp_info = ctx -> local_datapoints; + idx_t nclus = cluster -> centers.count; + + idx_t merge_count = 0; + idx_t merging_table_size = 1000; + merge_t *merging_table = (merge_t*)malloc(sizeof(merge_t)*merging_table_size); + + /* + * Find clusters to be merged + * Loop over borders and find which ones will generate a merge, + * store them later in the merging table + **/ + + + /* center density + * need to retrieve center density */ + + for(idx_t i = 0; i < nclus - 1; ++i) + { + idx_t count = cluster -> sparse_borders[i].count; + for(idx_t el = 0; el < count; ++el) + { + sparse_border_t b = cluster -> sparse_borders[i].data[el]; + if( b.j > b.i) + { + float_t dens1 = centers_dp[b.i].log_rho_c; + float_t dens1_err = centers_dp[b.i].log_rho_err; + float_t dens2 = centers_dp[b.j].log_rho_c; + float_t dens2_err = centers_dp[b.j].log_rho_err; + float_t dens_border = b.density; + float_t dens_border_err = b.error; + + if ( is_a_merging( dens1, dens1_err, dens2, dens2_err, dens_border, dens_border_err, Z ) ) + { + if ( merge_count == merging_table_size ) { + merging_table_size *= 1.1; + merging_table = (merge_t*)realloc( merging_table, sizeof(merge_t) * merging_table_size ); } + + idx_t src = b.j; + idx_t trg = b.i; + + merging_table[merge_count].source = src; + merging_table[merge_count].target = trg; + merging_table[merge_count].density = b.density; + ++merge_count; + } + } + } + } + + qsort( (void*)merging_table, merge_count, sizeof(merge_t), compare_merging_density); + + MPI_DB_PRINT("Found %lu merges\n", merge_count); + + #ifdef WRITE_MERGING_TABLE + if(I_AM_MASTER) + { + printf("[MASTER] Writing merging table to bb/merging_table.csv\n"); + FILE* f = fopen("bb/merging_table.csv", "w"); + for(idx_t i = 0; i < merge_count; ++i) + { + merge_t b = merging_table[i]; + fprintf(f, "%lu,%lu,%lf\n", b.source, b.target, b.density); + } + fclose(f); + } + + #endif + + for( idx_t m = 0; m < merge_count; m++ ) + { + + #define src surviving_clusters[merging_table[m].source] + #define trg surviving_clusters[merging_table[m].target] + + //int re_check = ( (src != merging_table[m].source) || (trg != merging_table[m].target) ); + //if(re_check) + { + /* + * Enforce a that in case of symmetric merging condition the lowest idx cluster + * is merged into the higher idx cluster, only to preserve compatibility with + * original ADP implementation + * + * Process each element in the merging table + */ + idx_t new_src = (src < trg) ? src : trg; + idx_t new_trg = (src < trg) ? 
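The merging table is applied through the surviving_clusters indirection (the src/trg macros above): each entry records which cluster currently absorbs it, and merge_A_into_B() rewrites every occurrence of the source label, so a later merge that names an already-absorbed cluster is transparently redirected to its survivor. A tiny trace with made-up labels:

#include <stdio.h>

int main(void)
{
    enum { NCLUS = 4 };
    int surviving[NCLUS] = {0, 1, 2, 3};

    /* merge cluster 3 into cluster 1 */
    for (int i = 0; i < NCLUS; ++i)
        if (surviving[i] == 3) surviving[i] = 1;      /* {0, 1, 2, 1}        */

    /* a later table entry says "merge 3 into 2": through the surviving[]
     * indirection it is resolved as "merge 1 into 2"                       */
    int src = surviving[3], trg = surviving[2];       /* 1 and 2             */
    for (int i = 0; i < NCLUS; ++i)
        if (surviving[i] == src) surviving[i] = trg;  /* {0, 2, 2, 2}        */

    for (int i = 0; i < NCLUS; ++i) printf("%d ", surviving[i]);
    printf("\n");                                     /* prints: 0 2 2 2     */
    return 0;
}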
trg : src; + + /* + * pick who am I and retrieve all needed data from the + * border matrices + */ + + float_t dens1 = centers_dp[new_src].log_rho_c; + float_t dens1_err = centers_dp[new_src].log_rho_err; + float_t dens2 = centers_dp[new_trg].log_rho_c; + float_t dens2_err = centers_dp[new_trg].log_rho_err; + + //borders get + sparse_border_t b = sparse_border_get(cluster, new_src, new_trg); + float_t dens_border = b.density; + float_t dens_border_err = b.error; + + int i_have_to_merge = is_a_merging(dens1,dens1_err,dens2,dens2_err,dens_border,dens_border_err,Z); + switch (i_have_to_merge && src != trg) + { + case 1: + { + int side = merging_roles(dens1,dens1_err,dens2,dens2_err,dens_border,dens_border_err); + if(!side) + { + idx_t tmp; + tmp = new_src; + new_src = new_trg; + new_trg = tmp; + } + + + /* + * Perform the actual meriging, + * first -> fix the borders, delete old ones and spawn new one in the correct position + * second -> update the surviving_clusters buffer + */ + fix_sparse_borders_A_into_B(new_src, new_trg, cluster); + merge_A_into_B(surviving_clusters, new_src, new_trg, nclus ); + } + break; + + default: + break; + } + } + + #undef src + #undef trg + } + + free(merging_table); +} + +void master_fixes_border_matrix_and_centers(global_context_t* ctx, clusters_t* cluster, float_t Z, idx_t* old_to_new, idx_t* surviving_clusters, idx_t nclus) +{ + /*finalize clustering*/ + /*acutally copying */ + lu_dynamic_array_t tmp_centers; + lu_dynamic_array_t tmp_cluster_idx; + + + lu_dynamic_array_init(&tmp_centers); + lu_dynamic_array_init(&tmp_cluster_idx); + + lu_dynamic_array_reserve(&tmp_centers, nclus); + lu_dynamic_array_reserve(&tmp_cluster_idx, nclus); + + idx_t final_cluster_count = 0; + + idx_t incremental_k = 0; + for(idx_t i = 0; i < nclus; ++i) + { + + if(surviving_clusters[i] == i){ + lu_dynamic_array_pushBack(&tmp_centers, cluster->centers.data[i]); + lu_dynamic_array_pushBack(&tmp_cluster_idx, i); + old_to_new[i] = incremental_k; + ++incremental_k; + ++final_cluster_count; + } + } + + //fill the rest of old_to_new + for(idx_t i = 0; i < nclus; ++i) + { + if(surviving_clusters[i] != i){ + idx_t cidx_to_copy_from = surviving_clusters[i]; + old_to_new[i] = old_to_new[cidx_to_copy_from]; + } + } + + /*allocate auxiliary pointers to store results of the finalization of the procedure*/ + + adj_list_t* tmp_borders = (adj_list_t*)malloc(final_cluster_count*sizeof(adj_list_t)); + + //initialize temporary borders + for(idx_t i = 0; i < final_cluster_count; ++i) + { + tmp_borders[i].count = 0; + tmp_borders[i].size = PREALLOC_BORDERS; + tmp_borders[i].data = (sparse_border_t*)malloc(PREALLOC_BORDERS*sizeof(sparse_border_t)); + } + + #pragma omp parallel for + for(idx_t c = 0; c < final_cluster_count; ++c) + { + idx_t c_idx = tmp_cluster_idx.data[c]; + for(idx_t el = 0; el < cluster -> sparse_borders[c_idx].count; ++el) + { + //retrieve border + sparse_border_t b = cluster -> sparse_borders[c_idx].data[el]; + //change idexes of clusters + b.i = old_to_new[b.i]; + b.j = old_to_new[b.j]; + + adj_list_insert(tmp_borders + c, b); + } + } + + clusters_reset(cluster); + /*pay attention to the defined borders*/ + /*copy into members*/ + cluster -> sparse_borders = tmp_borders; + cluster -> centers = tmp_centers; + free(tmp_cluster_idx.data); + +} + +int compare_dp_by_cidx(const void* a, const void* b) +{ + int aa = ((datapoint_info_t*)a) -> cluster_idx; + int bb = ((datapoint_info_t*)b) -> cluster_idx; + return (aa > bb) - (aa < bb); +} + +void Heuristic3(global_context_t* ctx, 
clusters_t* cluster, float_t Z, int halo) +{ + + int verbose = 0; + + /* + * Heurisitc 3, from paper of Errico, Facco, Laio & Rodriguez + * ( https://doi.org/10.1016/j.ins.2021.01.010 ) + * Dense implementation, makes use of a dense matrix to store + * borders between clusters, so it is more performant when the number of clusters is low + * + * args: + * - clusters* cluster : cluster object storing border info and cluster centers + * - datapoint_info* dp_info : array of Datapoint structures + * - float_t Z : parameter to assess density peak significance + * - halo : flag to set if you want to compute also the halo points + */ + + #define borders cluster->borders + + + struct timespec start_tot, finish_tot; + double elapsed_tot; + + struct timespec start, finish; + double elapsed; + + clock_gettime(CLOCK_MONOTONIC, &start_tot); + + + datapoint_info_t* dp_info = ctx -> local_datapoints; + idx_t nclus = cluster -> centers.count; + idx_t* surviving_clusters = (idx_t*)MY_MALLOC(nclus*sizeof(idx_t)); + idx_t* old_to_new = (idx_t*)MY_MALLOC(nclus*sizeof(idx_t)); + for(idx_t i = 0; i < nclus; ++i) + { + surviving_clusters[i] = i; + } + + MPI_Win dp_info_win; + MPI_Win_create(ctx -> local_datapoints, ctx -> local_n_points * sizeof(datapoint_info_t), 1, MPI_INFO_NULL, ctx -> mpi_communicator, &dp_info_win); + + MPI_Win_fence(0, dp_info_win); + + if(I_AM_MASTER) + { + datapoint_info_t* centers_dp = (datapoint_info_t*)MY_MALLOC(cluster -> centers.count * sizeof(datapoint_info_t)); + for(idx_t i = 0; i < cluster -> centers.count; ++i) + { + idx_t cidx = cluster -> centers.data[i]; + + int owner = foreign_owner(ctx, cidx); + idx_t pos = cidx - ctx -> rank_idx_start[owner]; + if(owner == ctx -> mpi_rank) + { + centers_dp[i] = ctx -> local_datapoints[pos]; + } + else + { + MPI_Get(centers_dp + i, + sizeof(datapoint_info_t), + MPI_BYTE, + owner, + (MPI_Aint)(pos * sizeof(datapoint_info_t)), + sizeof(datapoint_info_t), + MPI_BYTE, + dp_info_win); + } + } + MPI_Win_fence(0, dp_info_win); + + qsort(centers_dp, cluster -> centers.count, sizeof(datapoint_info_t), compare_dp_by_cidx); + + printf("Centers\n"); + + master_finds_borders(ctx, cluster, Z, surviving_clusters, centers_dp); + master_fixes_border_matrix_and_centers(ctx, cluster, Z, old_to_new, surviving_clusters, nclus); + free(centers_dp); + } + else + { + MPI_Win_fence(0, dp_info_win); + } + MPI_Win_free(&dp_info_win); + + /* at this point master has the final border matrix + * with the final list of surviving clusters + */ + + /* copy centers */ + + if(!(I_AM_MASTER)) + { + clusters_reset(cluster); + } + + MPI_Bcast(&(cluster -> centers.count), 1, MPI_UINT64_T, 0, ctx -> mpi_communicator); + MPI_Bcast(&(cluster -> centers.size), 1, MPI_UINT64_T, 0, ctx -> mpi_communicator); + + if(!(I_AM_MASTER)) + { + cluster -> centers.data = (idx_t*)MY_MALLOC(cluster -> centers.size * sizeof(idx_t)); + } + + MPI_Bcast(cluster -> centers.data, cluster -> centers.size, MPI_UINT64_T, 0, ctx -> mpi_communicator); + + /* copy borders */ + + idx_t final_cluster_count = cluster -> centers.count; + + if(!(I_AM_MASTER)) + { + cluster -> sparse_borders = (adj_list_t*)MY_MALLOC(final_cluster_count * sizeof(adj_list_t)); + } + + MPI_Bcast(cluster -> sparse_borders, final_cluster_count * sizeof(adj_list_t), MPI_BYTE, 0, ctx -> mpi_communicator); + + for(int i = 0; i < final_cluster_count; ++i) + { + if(!(I_AM_MASTER)) + { + cluster -> sparse_borders[i].data = (sparse_border_t*)MY_MALLOC(cluster -> sparse_borders[i].size * sizeof(sparse_border_t)); + } + MPI_Bcast(cluster -> 
sparse_borders[i].data, + cluster -> sparse_borders[i].size * sizeof(sparse_border_t), + MPI_BYTE, 0, ctx -> mpi_communicator); + } + + + MPI_Bcast(surviving_clusters, nclus, MPI_UINT64_T, 0, ctx -> mpi_communicator); + MPI_Bcast(old_to_new, nclus, MPI_UINT64_T, 0, ctx -> mpi_communicator); + + /*fix cluster assignment*/ + #pragma omp parallel for + for(idx_t i = 0; i < ctx -> local_n_points; ++i) + { + dp_info[i].is_center = 0; + int old_cidx = dp_info[i].cluster_idx; + dp_info[i].cluster_idx = old_to_new[old_cidx]; + } + + //reset centers + for(idx_t i = 0; i < cluster -> centers.count; ++i) + { + idx_t idx = cluster -> centers.data[i]; + int owner = foreign_owner(ctx, idx); + if(owner == ctx -> mpi_rank) + { + idx_t pos = idx - ctx -> idx_start; + dp_info[pos].is_center = 1; + } + } + + /*Halo*/ + + switch (halo) + { + case 1: + { + float_t* max_border_den_array = (float_t*)malloc(final_cluster_count*sizeof(float_t)); + #pragma omp parallel + { + #pragma omp for + for(idx_t c = 0; c < final_cluster_count; ++c) + { + float_t max_border_den = -2.; + for(idx_t el = 0; el < cluster -> sparse_borders[c].count; ++el) + { + sparse_border_t b = cluster -> sparse_borders[c].data[el]; + if(b.density > max_border_den) + { + max_border_den = b.density; + } + } + + max_border_den_array[c] = max_border_den; + + } + + #pragma omp barrier + + #pragma omp for + for(idx_t i = 0; i < cluster -> n; ++i) + { + int cidx = dp_info[i].cluster_idx; + int halo_flag = dp_info[i].log_rho_c < max_border_den_array[cidx]; + //int halo_flag = max_border_den_array[cidx] > dp_info[i].log_rho_c ; + dp_info[i].cluster_idx = halo_flag ? -1 : cidx; + } + } + free(max_border_den_array); + } + break; + + default: + break; + } + + MPI_DB_PRINT("--> final cluster count %lu\n", final_cluster_count); + free(surviving_clusters); + free(old_to_new); + + /*free memory and put the correct arrays into place*/ + //free(ipos.data); + //free(jpos.data); + // + + #ifdef WRITE_FINAL_ASSIGNMENT + int* cl = (int*)MY_MALLOC(ctx -> local_n_points * sizeof(int)); + + for(int i = 0; i < ctx -> local_n_points; ++i) cl[i] = ctx -> local_datapoints[i].cluster_idx; + + ordered_buffer_to_file(ctx, cl, sizeof(int), ctx -> local_n_points, "bb/final_assignment.npy"); + + free(cl); + + #endif + + #undef borders +} + diff --git a/src/adp/adp.h b/src/adp/adp.h new file mode 100644 index 0000000000000000000000000000000000000000..5da69a035bec46c0aaac8825e6f344a0cda335aa --- /dev/null +++ b/src/adp/adp.h @@ -0,0 +1,58 @@ +#pragma once +#include "../tree/tree.h" + +#define PREALLOC_BORDERS 100 +#define MAX_SERIAL_MERGING 100 +#define PRINT_H2_COMM_SCHEME + +typedef struct border_t +{ + float_t density; + float_t error; + idx_t idx; +} border_t; + +typedef struct sparse_border_t +{ + idx_t i; + idx_t j; + idx_t idx; + float_t density; + float_t error; +} sparse_border_t; + +typedef struct adj_list_t +{ + idx_t count; + idx_t size; + struct sparse_border_t* data; +} adj_list_t; + + +typedef struct clusters_t +{ + int use_sparse_borders; + struct adj_list_t *sparse_borders; + struct lu_dynamic_array_t centers; + struct border_t **borders; + struct border_t *__borders_data; + idx_t n; +} clusters_t; + +typedef struct merge_t +{ + idx_t source; + idx_t target; + float_t density; +} merge_t; + + + +void compute_density_kstarnn_rma(global_context_t* ctx, const float_t d, int verbose); +void compute_density_kstarnn_rma_v2(global_context_t* ctx, const float_t d, int verbose); +float_t compute_ID_two_NN_ML(global_context_t* ctx, datapoint_info_t* dp_info, idx_t n, int 
verbose); +void clusters_allocate(clusters_t * c, int s); + +clusters_t Heuristic1(global_context_t *ctx, int verbose); +void Heuristic2(global_context_t* ctx, clusters_t* cluster); +void Heuristic3(global_context_t* ctx, clusters_t* cluster, float_t Z, int halo); diff --git a/src/common/common.c b/src/common/common.c index 2a437caffa5896e455f85dc169729fe6717a7e40..1dcfc06b8e0dc2d82af3adb8160e209d9c1950d4 100644 --- a/src/common/common.c +++ b/src/common/common.c @@ -202,3 +202,54 @@ void lu_dynamic_array_init(lu_dynamic_array_t * a) a -> size = 0; } +void ordered_buffer_to_file(global_context_t* ctx, void* buffer, size_t el_size, uint64_t n, const char* fname) +{ + //MPI_Barrier(ctx -> mpi_communicator); + MPI_DB_PRINT("[MASTER] writing to file %s\n", fname); + void* tmp_data; + int* ppp; + int* displs; + + MPI_Barrier(ctx -> mpi_communicator); + + uint64_t tot_n = 0; + MPI_Reduce(&n, &tot_n, 1, MPI_UINT64_T , MPI_SUM, 0, ctx -> mpi_communicator); + + if(I_AM_MASTER) + { + tmp_data = (void*)MY_MALLOC(el_size * tot_n ); + ppp = (int*)MY_MALLOC(ctx -> world_size * sizeof(int)); + displs = (int*)MY_MALLOC(ctx -> world_size * sizeof(int)); + + } + + int nn = (int)n; + MPI_Gather(&nn, 1, MPI_INT, ppp, 1, MPI_INT, 0, ctx -> mpi_communicator); + + if(I_AM_MASTER) + { + displs[0] = 0; + for(int i = 0; i < ctx -> world_size; ++i) ppp[i] = el_size * ppp[i]; + for(int i = 1; i < ctx -> world_size; ++i) displs[i] = displs[i - 1] + ppp[i - 1]; + + } + + MPI_Gatherv(buffer, (int)(el_size * n), + MPI_BYTE, tmp_data, ppp, displs, MPI_BYTE, 0, ctx -> mpi_communicator); + + if(I_AM_MASTER) + { + FILE* file = fopen(fname,"w"); + if(!file) + { + printf("Cannot open file %s ! Aborting \n", fname); + } + fwrite(tmp_data, 1, el_size * tot_n, file); + fclose(file); + free(tmp_data); + free(ppp); + free(displs); + + } + MPI_Barrier(ctx -> mpi_communicator); +} diff --git a/src/common/common.h b/src/common/common.h index 3bd19562a400f2111b25cc10ec8a08c872d6c6dd..6fa6856975b5249e2e0c83eb3a870496824d9f04 100644 --- a/src/common/common.h +++ b/src/common/common.h @@ -29,6 +29,15 @@ typedef struct datapoint_info_t { #define float_t double #endif +#ifdef USE_FLOAT32 +#define MPI_MY_FLOAT MPI_FLOAT +#else +#define MPI_MY_FLOAT MPI_DOUBLE +#endif + + +#define I_AM_MASTER ctx->mpi_rank == 0 + #define MY_TRUE 1 #define MY_FALSE 0 @@ -173,4 +182,4 @@ void lu_dynamic_array_reserve(lu_dynamic_array_t * a, idx_t n); void lu_dynamic_array_init(lu_dynamic_array_t * a); void print_error_code(int err); - +void ordered_data_to_file(global_context_t* ctx); diff --git a/src/main/main.c b/src/main/main.c index 0a698424c28fea2b0e5f13228544e09d1fe885e2..243683164014e9f6a22ea7ce4e51d6a32aad8114 100644 --- a/src/main/main.c +++ b/src/main/main.c @@ -3,6 +3,7 @@ #include <stdio.h> #include "../common/common.h" #include "../tree/tree.h" +#include "../adp/adp.h" // @@ -89,3 +90,206 @@ int main(int argc, char** argv) { +void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx) +{ + float_t *data; + TIME_DEF + double elapsed_time; + + if (ctx->mpi_rank == 0) + { + //data = read_data_file(ctx, "../norm_data/50_blobs_more_var.npy", MY_TRUE); + //ctx->dims = 2; + //data = read_data_file(ctx, "../norm_data/50_blobs.npy", MY_TRUE); + // std_g0163178_Me14_091_0000 + + // 190M points + // std_g2980844_091_0000 + // data = read_data_file(ctx,"../norm_data/std_g2980844_091_0000",MY_TRUE); + + /* 1M points ca.*/ + data = read_data_file(ctx,"../norm_data/std_LR_091_0001",MY_TRUE); + + /* BOX */ + // data = 
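One fragile spot in the ordered_buffer_to_file helper added above: when fopen fails, the error is printed but execution still falls through to fwrite and fclose on a NULL FILE*. A self-contained sketch of a guarded write that the master-side branch could delegate to (write_blob is a hypothetical name, not part of this patch):

#include <stdio.h>

/* write nbytes from buf to path; skip cleanly if the file cannot be opened */
static int write_blob(const char *path, const void *buf, size_t nbytes)
{
    FILE *f = fopen(path, "wb");
    if (!f) {
        fprintf(stderr, "Cannot open file %s, skipping write\n", path);
        return -1;
    }
    size_t written = fwrite(buf, 1, nbytes, f);
    fclose(f);
    return written == nbytes ? 0 : -1;
}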
read_data_file(ctx,"../norm_data/std_Box_256_30_092_0000",MY_TRUE); + + /* 8M points */ + + // data = read_data_file(ctx,"../norm_data/std_g0144846_Me14_091_0001",MY_TRUE); + + //88M + //data = read_data_file(ctx,"../norm_data/std_g5503149_091_0000",MY_TRUE); + + // + //34 M + //data = read_data_file(ctx,"../norm_data/std_g1212639_091_0001",MY_TRUE); + ctx->dims = 5; + + // ctx -> n_points = 5 * 100000; + ctx->n_points = ctx->n_points / ctx->dims; + //ctx->n_points = (ctx->n_points * 5) / 10; + // ctx -> n_points = ctx -> world_size * 1000; + + //ctx -> n_points = 10000000 * ctx -> world_size; + //generate_random_matrix(&data, ctx -> dims, ctx -> n_points, ctx); + //mpi_printf(ctx, "Read %lu points in %u dims\n", ctx->n_points, ctx->dims); + } + //MPI_DB_PRINT("[MASTER] Reading file and scattering\n"); + + /* communicate the total number of points*/ + MPI_Bcast(&(ctx->dims), 1, MPI_UINT32_T, 0, ctx->mpi_communicator); + MPI_Bcast(&(ctx->n_points), 1, MPI_UINT64_T, 0, ctx->mpi_communicator); + + /* compute the number of elements to recieve for each processor */ + int *send_counts = (int *)MY_MALLOC(ctx->world_size * sizeof(int)); + int *displacements = (int *)MY_MALLOC(ctx->world_size * sizeof(int)); + + displacements[0] = 0; + send_counts[0] = ctx->n_points / ctx->world_size; + send_counts[0] += (ctx->n_points % ctx->world_size) > 0 ? 1 : 0; + send_counts[0] = send_counts[0] * ctx->dims; + + for (int p = 1; p < ctx->world_size; ++p) + { + send_counts[p] = (ctx->n_points / ctx->world_size); + send_counts[p] += (ctx->n_points % ctx->world_size) > p ? 1 : 0; + send_counts[p] = send_counts[p] * ctx->dims; + displacements[p] = displacements[p - 1] + send_counts[p - 1]; + } + + + ctx->local_n_points = send_counts[ctx->mpi_rank] / ctx->dims; + + float_t *pvt_data = (float_t *)MY_MALLOC(send_counts[ctx->mpi_rank] * sizeof(float_t)); + + MPI_Scatterv(data, send_counts, displacements, MPI_MY_FLOAT, pvt_data, send_counts[ctx->mpi_rank], MPI_MY_FLOAT, 0, ctx->mpi_communicator); + + ctx->local_data = pvt_data; + + int k_local = 20; + int k_global = 20; + + uint64_t *global_bin_counts_int = (uint64_t *)MY_MALLOC(k_global * sizeof(uint64_t)); + + pointset_t original_ps; + original_ps.data = ctx->local_data; + original_ps.dims = ctx->dims; + original_ps.n_points = ctx->local_n_points; + original_ps.lb_box = (float_t*)MY_MALLOC(ctx -> dims * sizeof(float_t)); + original_ps.ub_box = (float_t*)MY_MALLOC(ctx -> dims * sizeof(float_t)); + + float_t tol = 0.002; + + top_kdtree_t tree; + TIME_START; + top_tree_init(ctx, &tree); + elapsed_time = TIME_STOP; + LOG_WRITE("Initializing global kdtree", elapsed_time); + + TIME_START; + build_top_kdtree(ctx, &original_ps, &tree, k_global, tol); + exchange_points(ctx, &tree); + elapsed_time = TIME_STOP; + LOG_WRITE("Top kdtree build and domain decomposition", elapsed_time); + //test_the_idea(ctx); + + TIME_START; + kdtree_v2 local_tree; + kdtree_v2_init( &local_tree, ctx -> local_data, ctx -> local_n_points, (unsigned int)ctx -> dims); + int k = 300; + //int k = 30; + + datapoint_info_t* dp_info = (datapoint_info_t*)MY_MALLOC(ctx -> local_n_points * sizeof(datapoint_info_t)); + /* initialize, to cope with valgrind */ + for(uint64_t i = 0; i < ctx -> local_n_points; ++i) + { + dp_info[i].ngbh.data = NULL; + dp_info[i].ngbh.N = 0; + dp_info[i].ngbh.count = 0; + dp_info[i].g = 0.f; + dp_info[i].log_rho = 0.f; + dp_info[i].log_rho_c = 0.f; + dp_info[i].log_rho_err = 0.f; + dp_info[i].array_idx = -1; + dp_info[i].kstar = -1; + dp_info[i].is_center = -1; + 
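The send_counts / displacements block above splits n_points as evenly as possible, giving one extra point to the first (n_points mod world_size) ranks and then scaling everything by dims, since MPI_Scatterv counts scalars rather than points. A small standalone sketch of the same arithmetic with toy sizes (no MPI involved):

#include <stdio.h>

int main(void)
{
    const long n_points = 10, dims = 5;   /* toy sizes, illustration only */
    const int  world_size = 4;
    long counts[4], displs[4];

    for (int p = 0; p < world_size; ++p) {
        counts[p]  = n_points / world_size + ((n_points % world_size) > p ? 1 : 0);
        counts[p] *= dims;                             /* scalars, not points */
        displs[p]  = p ? displs[p - 1] + counts[p - 1] : 0;
    }
    for (int p = 0; p < world_size; ++p)
        printf("rank %d: count %ld displ %ld\n", p, counts[p], displs[p]);
    /* prints 15/0, 15/15, 10/30, 10/40: ranks 0-1 own 3 points, ranks 2-3 own 2 */
    return 0;
}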
dp_info[i].cluster_idx = -1; + } + + build_local_tree(ctx, &local_tree); + elapsed_time = TIME_STOP; + LOG_WRITE("Local trees init and build", elapsed_time); + + TIME_START; + MPI_DB_PRINT("----- Performing ngbh search -----\n"); + MPI_Barrier(ctx -> mpi_communicator); + + mpi_ngbh_search(ctx, dp_info, &tree, &local_tree, ctx -> local_data, k); + + MPI_Barrier(ctx -> mpi_communicator); + elapsed_time = TIME_STOP; + LOG_WRITE("Total time for all knn search", elapsed_time) + + + + TIME_START; + //float_t id = id_estimate(ctx, dp_info, ctx -> local_n_points, 0.9, MY_FALSE); + float_t id = compute_ID_two_NN_ML(ctx, dp_info, ctx -> local_n_points, MY_FALSE); + elapsed_time = TIME_STOP; + LOG_WRITE("ID estimate", elapsed_time) + + MPI_DB_PRINT("ID %lf \n",id); + + TIME_START; + + float_t z = 2; + + ctx -> local_datapoints = dp_info; + //compute_density_kstarnn_rma(ctx, id, MY_FALSE); + compute_density_kstarnn_rma_v2(ctx, id, MY_FALSE); + compute_correction(ctx, z); + elapsed_time = TIME_STOP; + LOG_WRITE("Density estimate", elapsed_time) + + TIME_START; + clusters_t clusters = Heuristic1(ctx, MY_FALSE); + elapsed_time = TIME_STOP; + LOG_WRITE("H1", elapsed_time) + + TIME_START; + clusters_allocate(&clusters, 1); + Heuristic2(ctx, &clusters); + elapsed_time = TIME_STOP; + LOG_WRITE("H2", elapsed_time) + + + TIME_START; + int halo = 0; + Heuristic3(ctx, &clusters, z, halo); + elapsed_time = TIME_STOP; + LOG_WRITE("H3", elapsed_time) + + + /* find density */ + #if defined (WRITE_NGBH) + ordered_data_to_file(ctx); + #endif + + /* + free(foreign_dp_info); + */ + + + top_tree_free(ctx, &tree); + kdtree_v2_free(&local_tree); + + free(send_counts); + free(displacements); + //free(dp_info); + + if (ctx->mpi_rank == 0) free(data); + + original_ps.data = NULL; + free_pointset(&original_ps); + free(global_bin_counts_int); +} diff --git a/src/tree/heap.h b/src/tree/heap.h index 513170efc9b5ca225934dc444de99e409f09d809..84fc8f49da59904ed862efeed36d6ae7471f5406 100644 --- a/src/tree/heap.h +++ b/src/tree/heap.h @@ -8,7 +8,7 @@ #define T double #define DATA_DIMS 0 -#define HERE printf("%d reached line %d\n", ctx -> mpi_rank, __LINE__); +#define HERE printf("%d reached line %d\n", ctx -> mpi_rank, __LINE__); MPI_Barrier(ctx -> mpi_communicator); #ifdef USE_FLOAT32 #define FLOAT_TYPE float diff --git a/src/tree/tree.c b/src/tree/tree.c index 167791e35541cce1c6dafcdf7137a8a0c0883130..2ecac4eb5ad28f94170771af0803967f1bd73ead 100644 --- a/src/tree/tree.c +++ b/src/tree/tree.c @@ -21,9 +21,12 @@ //#define WRITE_NGBH //#define WRITE_TOP_NODES -//#define WRITE_DENSITY -#define WRITE_CLUSTER_ASSIGN_H1 -#define WRITE_BORDERS +#define WRITE_DENSITY +//#define WRITE_CLUSTER_ASSIGN_H1 +//#define WRITE_BORDERS +//#define WRITE_MERGING_TABLE +#define WRITE_FINAL_ASSIGNMENT + /* * Maximum bytes to send with a single mpi send/recv, used @@ -36,24 +39,12 @@ /* Used slices of 10 mb ? Really good? 
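A side note on the HERE macro change in heap.h above: expanding to two bare statements, it can silently change control flow when used under an unbraced if or else. If that ever matters, the conventional remedy is the do { } while(0) wrapper; a sketch of an equivalent form (same print and barrier, just packaged as a single statement):

#define HERE                                                          \
    do {                                                              \
        printf("%d reached line %d\n", ctx->mpi_rank, __LINE__);      \
        MPI_Barrier(ctx->mpi_communicator);                           \
    } while (0)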
Maybe at the cause of TID thing */ #define MAX_MSG_SIZE 1000000000 -#ifdef USE_FLOAT32 -#define MPI_MY_FLOAT MPI_FLOAT -#else -#define MPI_MY_FLOAT MPI_DOUBLE -#endif - - -#define I_AM_MASTER ctx->mpi_rank == 0 #define TOP_TREE_RCH 1 #define TOP_TREE_LCH 0 #define NO_CHILD -1 unsigned int data_dims; -const border_t border_null = {.density = -1.0, .error = 0, .idx = NOBORDER}; -const sparse_border_t sparse_border_null = {.density = -1.0, .error = 0, .idx = NOBORDER, .i = NOBORDER, .j = NOBORDER}; - -#define PREALLOC_BORDERS 100 int cmp_float_t(const void* a, const void* b) @@ -64,7 +55,8 @@ int cmp_float_t(const void* a, const void* b) } -float_t *read_data_file(global_context_t *ctx, const char *fname, + +float_t* read_data_file(global_context_t *ctx, const char *fname, const int file_in_float32) { @@ -1841,6 +1833,7 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre req_array = (MPI_Request*)MY_MALLOC(req_num * sizeof(MPI_Request)); + for(int i = 0; i < ctx -> world_size; ++i) { already_sent_points[i] = 0; @@ -1849,6 +1842,7 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre int req_idx = 0; + for(int i = 0; i < ctx -> world_size; ++i) { int count = 0; @@ -1866,11 +1860,11 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre } } } + MPI_Barrier(ctx -> mpi_communicator); MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, ctx -> mpi_communicator, &flag, &status); //DB_PRINT("%d %p %p\n",ctx -> mpi_rank, &flag, &status); - //HERE while(flag) { MPI_Request request; @@ -1987,34 +1981,6 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre } -void test_the_idea(global_context_t* ctx) -{ - int* num = (int*)MY_MALLOC(ctx -> world_size * sizeof(int)); - for(int i = 0; i < ctx -> world_size; ++i) num[i] = i; - - for(int i = 0; i < ctx -> world_size; ++i) - { - MPI_Request request; - MPI_Isend(num + i, 1, MPI_INT, i, 0, ctx -> mpi_communicator, &request); - } - MPI_Barrier(ctx -> mpi_communicator); - MPI_Status status; - int flag; - int cc = 0; - MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_SOURCE, ctx -> mpi_communicator, &flag, &status); - while(flag) - { - cc++; - MPI_Request request; - MPI_Recv(num + status.MPI_SOURCE, 1, MPI_INT, status.MPI_SOURCE, MPI_ANY_TAG, ctx -> mpi_communicator, &status); - MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_SOURCE, ctx -> mpi_communicator, &flag, &status); - - } - MPI_DB_PRINT("Recieved %d msgs\n",cc); - free(num); - -} - void build_local_tree(global_context_t* ctx, kdtree_v2* local_tree) { local_tree -> root = build_tree_kdtree_v2(local_tree -> _nodes, local_tree -> n_nodes, ctx -> dims); @@ -2061,59 +2027,8 @@ void ordered_data_to_file(global_context_t* ctx) MPI_Barrier(ctx -> mpi_communicator); } -void ordered_buffer_to_file(global_context_t* ctx, void* buffer, size_t el_size, uint64_t n, const char* fname) -{ - //MPI_Barrier(ctx -> mpi_communicator); - MPI_DB_PRINT("[MASTER] writing to file %s\n", fname); - void* tmp_data; - int* ppp; - int* displs; - - MPI_Barrier(ctx -> mpi_communicator); - - uint64_t tot_n = 0; - MPI_Reduce(&n, &tot_n, 1, MPI_UINT64_T , MPI_SUM, 0, ctx -> mpi_communicator); - - if(I_AM_MASTER) - { - tmp_data = (void*)MY_MALLOC(el_size * tot_n ); - ppp = (int*)MY_MALLOC(ctx -> world_size * sizeof(int)); - displs = (int*)MY_MALLOC(ctx -> world_size * sizeof(int)); - - } - - int nn = (int)n; - MPI_Gather(&nn, 1, MPI_INT, ppp, 1, MPI_INT, 0, ctx -> mpi_communicator); - - if(I_AM_MASTER) - { - displs[0] = 0; - for(int i = 0; i < ctx -> world_size; 
++i) ppp[i] = el_size * ppp[i]; - for(int i = 1; i < ctx -> world_size; ++i) displs[i] = displs[i - 1] + ppp[i - 1]; - - } - - MPI_Gatherv(buffer, (int)(el_size * n), - MPI_BYTE, tmp_data, ppp, displs, MPI_BYTE, 0, ctx -> mpi_communicator); - if(I_AM_MASTER) - { - FILE* file = fopen(fname,"w"); - if(!file) - { - printf("Cannot open file %s ! Aborting \n", fname); - } - fwrite(tmp_data, 1, el_size * tot_n, file); - fclose(file); - free(tmp_data); - free(ppp); - free(displs); - - } - MPI_Barrier(ctx -> mpi_communicator); -} - -static inline int foreign_owner(global_context_t* ctx, idx_t idx) +int foreign_owner(global_context_t* ctx, idx_t idx) { int owner = ctx -> mpi_rank; if( idx >= ctx -> idx_start && idx < ctx -> idx_start + ctx -> local_n_points) @@ -2129,2822 +2044,13 @@ static inline int foreign_owner(global_context_t* ctx, idx_t idx) return owner; } -static inline void append_foreign_idx_list(idx_t element, int owner, int* counts, idx_t** lists) -{ - - - /* find the plausible place */ - int idx_to_insert; - - #pragma omp atomic capture - idx_to_insert = counts[owner]++; - - lists[owner][idx_to_insert] = element; -} - -int cmp_idx(const void* a, const void* b) -{ - idx_t aa = *((idx_t*)a); - idx_t bb = *((idx_t*)b); - return (aa > bb) - (aa < bb); -} - -void find_foreign_nodes(global_context_t* ctx, datapoint_info_t* dp, datapoint_info_t** foreign_dp) -{ - int k = dp[0].ngbh.count; - - idx_t** array_indexes_to_request = (idx_t**)MY_MALLOC(ctx -> world_size * sizeof(idx_t*)); - int* count_to_request = (int*)MY_MALLOC(ctx -> world_size * sizeof(int)); - int* capacities = (int*)MY_MALLOC(ctx -> world_size * sizeof(int)); - - - for(int i = 0; i < ctx -> world_size; ++i) - { - capacities[i] = 0; - count_to_request[i] = 0; - } - - /* count them */ - - #pragma omp parallel for - for(uint32_t i = 0; i < ctx -> local_n_points; ++i) - { - for(int j = 0; j < k; ++j) - { - idx_t element = dp[i].ngbh.data[j].array_idx; - int owner = foreign_owner(ctx, element); - //DB_PRINT("%lu %d\n", element, owner); - - if(owner != ctx -> mpi_rank) - { - #pragma omp atomic update - capacities[owner]++; - } - } - } - - /* alloc */ - - for(int i = 0; i < ctx -> world_size; ++i) - { - array_indexes_to_request[i] = (idx_t*)MY_MALLOC(capacities[i] * sizeof(idx_t)); - } - - /* append them */ - - #pragma omp parallel for - for(uint32_t i = 0; i < ctx -> local_n_points; ++i) - { - for(int j = 0; j < k; ++j) - { - idx_t element = dp[i].ngbh.data[j].array_idx; - int owner = foreign_owner(ctx, element); - //DB_PRINT("%lu %d\n", element, owner); - - if(owner != ctx -> mpi_rank) - { - append_foreign_idx_list(element, owner, count_to_request, array_indexes_to_request); - } - } - } - - /* prune them */ - int* unique_count = (int*)MY_MALLOC(ctx -> world_size * sizeof(int)); - - /* - if(I_AM_MASTER) - { - FILE* f = fopen("uniq","w"); - fwrite(array_indexes_to_request[1], sizeof(idx_t), capacities[1],f); - fclose(f); - } - */ - - #pragma omp paralell for - for(int i = 0; i < ctx -> world_size; ++i) - { - unique_count[i] = capacities[i] > 0; //init unique count - qsort(array_indexes_to_request[i], capacities[i], sizeof(idx_t), cmp_idx); - uint32_t prev = array_indexes_to_request[i][0]; - for(int el = 1; el < capacities[i]; ++el) - { - int flag = prev == array_indexes_to_request[i][el]; - if(!flag) - { - /* in place subsitution - * if a different element is found then - * copy in at the next free place - * */ - array_indexes_to_request[i][unique_count[i]] = array_indexes_to_request[i][el]; - unique_count[i]++; - } - prev = 
array_indexes_to_request[i][el]; - } - } - - for(int i = 0; i < ctx -> world_size; ++i) - { - array_indexes_to_request[i] = (idx_t*)realloc(array_indexes_to_request[i], unique_count[i] * sizeof(idx_t)); - } - - for(int i = 0; i < ctx -> world_size; ++i) - { - foreign_dp[i] = (datapoint_info_t*)calloc(sizeof(datapoint_info_t), unique_count[i]); - } - - /* alias for unique counts */ - int* n_heap_to_recv = unique_count; - int* n_heap_to_send = (int*)MY_MALLOC(ctx -> world_size * sizeof(int)); - - /* exchange how many to recv how many to send */ - - MPI_Alltoall(n_heap_to_recv, 1, MPI_INT, n_heap_to_send, 1, MPI_INT , ctx -> mpi_communicator); - /* compute displacements and yada yada */ - int* sdispls = (int*)calloc(ctx -> world_size , sizeof(int)); - int* rdispls = (int*)calloc(ctx -> world_size , sizeof(int)); - - int tot_count_send = n_heap_to_send[0]; - int tot_count_recv = n_heap_to_recv[0]; - for(int i = 1; i < ctx -> world_size; ++i) - { - sdispls[i] = sdispls[i - 1] + n_heap_to_send[i - 1]; - rdispls[i] = rdispls[i - 1] + n_heap_to_recv[i - 1]; - - tot_count_send += n_heap_to_send[i]; - tot_count_recv += n_heap_to_recv[i]; - } - idx_t* idx_buffer_to_send = (idx_t*)MY_MALLOC(tot_count_send * sizeof(idx_t)); - idx_t* idx_buffer_to_recv = (idx_t*)MY_MALLOC(tot_count_recv * sizeof(idx_t)); - - /* copy indexes on send buffer */ - for(int i = 0; i < ctx -> world_size; ++i) - { - memcpy(idx_buffer_to_recv + rdispls[i], array_indexes_to_request[i], n_heap_to_recv[i] * sizeof(idx_t)); - } - - MPI_Alltoallv(idx_buffer_to_recv, n_heap_to_recv, rdispls, MPI_UINT64_T, idx_buffer_to_send, n_heap_to_send, sdispls, MPI_UINT64_T, ctx -> mpi_communicator); - - - /* allocate foreign dp */ - heap_node* heap_buffer_to_send = (heap_node*)MY_MALLOC(tot_count_send * k * sizeof(heap_node)); - heap_node* heap_buffer_to_recv = (heap_node*)MY_MALLOC(tot_count_recv * k * sizeof(heap_node)); - - if(!heap_buffer_to_send) - { - printf("Rank %d cannot allocate heap to send aborting\n",ctx -> mpi_rank); - printf("Required %d for %d bytes\n", tot_count_send, tot_count_send * 16); - exit(1); - } - - if(!heap_buffer_to_recv) - { - printf("Rank %d cannot allocate heap to recieve aborting\n",ctx -> mpi_rank); - printf("Required %d for %d bytes\n", tot_count_recv, tot_count_recv * 16); - exit(1); - } - - for(int i = 0; i < tot_count_send; ++i) - { - idx_t idx = idx_buffer_to_send[i] - ctx -> idx_start; - memcpy(heap_buffer_to_send + i * k, dp[idx].ngbh.data, k * sizeof(heap_node)); - } - /* exchange heaps */ - - - MPI_Barrier(ctx -> mpi_communicator); - int default_msg_len = MAX_MSG_SIZE / (k * sizeof(heap_node)); - - int* already_sent_points = (int*)MY_MALLOC(ctx -> world_size * sizeof(int)); - int* already_rcvd_points = (int*)MY_MALLOC(ctx -> world_size * sizeof(int)); - - /* allocate a request array to keep track of all requests going out*/ - MPI_Request* req_array; - int req_num = 0; - for(int i = 0; i < ctx -> world_size; ++i) - { - req_num += n_heap_to_send[i] > 0 ? 
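The pruning step in find_foreign_nodes above is the usual sort-then-compact idiom for deduplicating indices in place; a self-contained sketch of the same idea:

#include <stdlib.h>
#include <stdint.h>

static int cmp_u64(const void *a, const void *b)
{
    uint64_t x = *(const uint64_t *)a, y = *(const uint64_t *)b;
    return (x > y) - (x < y);
}

/* sort v, keep the first occurrence of each value, return the new count */
static size_t sort_unique(uint64_t *v, size_t n)
{
    if (n == 0) return 0;
    qsort(v, n, sizeof(uint64_t), cmp_u64);
    size_t m = 1;
    for (size_t i = 1; i < n; ++i)
        if (v[i] != v[m - 1]) v[m++] = v[i];
    return m;
}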
n_heap_to_send[i]/default_msg_len + 1 : 0; - } - - req_array = (MPI_Request*)MY_MALLOC(req_num * sizeof(MPI_Request)); - - for(int i = 0; i < ctx -> world_size; ++i) - { - already_sent_points[i] = 0; - already_rcvd_points[i] = 0; - } - - int req_idx = 0; - - MPI_Datatype MPI_my_heap; - MPI_Type_contiguous(k * sizeof(heap_node), MPI_BYTE, &MPI_my_heap); - MPI_Barrier(ctx -> mpi_communicator); - MPI_Type_commit(&MPI_my_heap); - - for(int i = 0; i < ctx -> world_size; ++i) - { - int count = 0; - if(n_heap_to_send[i] > 0) - { - while(already_sent_points[i] < n_heap_to_send[i]) - { - MPI_Request request; - count = MIN(default_msg_len, n_heap_to_send[i] - already_sent_points[i] ); - MPI_Isend( heap_buffer_to_send + k * (already_sent_points[i] + sdispls[i]), count, - MPI_my_heap, i, 0, ctx -> mpi_communicator, &request); - already_sent_points[i] += count; - req_array[req_idx] = request; - ++req_idx; - } - } - } - int flag; - MPI_Status status; - MPI_Barrier(ctx -> mpi_communicator); - MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, ctx -> mpi_communicator, &flag, &status); - //DB_PRINT("%d %p %p\n",ctx -> mpi_rank, &flag, &status); - while(flag) - { - MPI_Request request; - int count; - int source = status.MPI_SOURCE; - MPI_Get_count(&status, MPI_my_heap, &count); - /* recieve each slice */ - - MPI_Recv(heap_buffer_to_recv + k * (already_rcvd_points[source] + rdispls[source]), - count, MPI_my_heap, source, MPI_ANY_TAG, ctx -> mpi_communicator, &status); - - already_rcvd_points[source] += count; - MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, ctx -> mpi_communicator, &flag, &status); - - } - MPI_Barrier(ctx -> mpi_communicator); - - MPI_Type_free(&MPI_my_heap); - - MPI_Testall(req_num, req_array, &flag, MPI_STATUSES_IGNORE); - - if(flag == 0) - { - DB_PRINT("[!!!] Rank %d has unfinished communications\n", ctx -> mpi_rank); - exit(1); - } - free(req_array); - free(already_sent_points); - free(already_rcvd_points); - - - /* copy results where needed */ - for(int i = 0; i < ctx -> world_size; ++i) - { - for(int j = 0; j < n_heap_to_recv[i]; ++j) - { - /* - foreign_dp[i][j].array_idx = array_indexes_to_request[i][j]; - init_heap(&(foreign_dp[i][j].ngbh)); - allocate_heap(&(foreign_dp[i][j].ngbh), k); - foreign_dp[i][j].ngbh.N = k; - foreign_dp[i][j].ngbh.count = k; - memcpy(foreign_dp[i][j].ngbh.data, heap_buffer_to_recv + k * (j + rdispls[i]), k * sizeof(heap_node)); - */ - - foreign_dp[i][j].cluster_idx = -1; - - foreign_dp[i][j].array_idx = array_indexes_to_request[i][j]; - //init_heap(&(foreign_dp[i][j].ngbh)); - foreign_dp[i][j].ngbh.N = k; - foreign_dp[i][j].ngbh.count = k; - foreign_dp[i][j].ngbh.data = heap_buffer_to_recv + k * (j + rdispls[i]); - - if(foreign_dp[i][j].ngbh.data[0].array_idx != array_indexes_to_request[i][j]) - { - printf("Error on %lu\n",array_indexes_to_request[i][j]); - } - } - } - - /* put back indexes in the context */ - ctx -> idx_halo_points_recv = array_indexes_to_request; - ctx -> n_halo_points_recv = n_heap_to_recv; - ctx -> n_halo_points_send = n_heap_to_send; - ctx -> idx_halo_points_send = (idx_t**)MY_MALLOC(ctx -> world_size * sizeof(idx_t*)); - for(int i = 0; i < ctx -> world_size; ++i) ctx -> idx_halo_points_send[i] = NULL; - for(int i = 0; i < ctx -> world_size; ++i) - { - ctx -> idx_halo_points_send[i] = (idx_t*)MY_MALLOC( n_heap_to_send[i] * sizeof(idx_t)); - memcpy(ctx -> idx_halo_points_send[i], idx_buffer_to_send + sdispls[i], n_heap_to_send[i] * sizeof(idx_t) ); - } - - ctx -> halo_datapoints = foreign_dp; - ctx -> local_datapoints = dp; - - free(count_to_request); - 
free(capacities); - - /* free(heap_buffer_to_recv); this needs to be preserved*/ - ctx -> __recv_heap_buffers = heap_buffer_to_recv; - free(heap_buffer_to_send); - free(idx_buffer_to_send); - free(idx_buffer_to_recv); - - free(sdispls); - free(rdispls); -} -float_t mEst2(float_t * x, float_t *y, idx_t n) -{ - - /* - * Estimate the m coefficient of a straight - * line passing through the origin - * params: - * - x: x values of the points - * - y: y values of the points - * - n: size of the arrays - */ - - float_t num = 0; - float_t den = 0; - float_t dd; - for(idx_t i = 0; i < n; ++i) - { - float_t xx = x[i]; - float_t yy = y[i]; - dd = xx; - num += dd*yy; - den += dd*dd; - } - - return num/den; -} -float_t compute_ID_two_NN_ML(global_context_t* ctx, datapoint_info_t* dp_info, idx_t n, int verbose) -{ - /* - * Estimation of the intrinsic dimension of a dataset - * args: - * - dp_info: array of structs - * - n: number of dp_info - * intrinsic_dim = (N - 1) / np.sum(log_mus) - */ - - struct timespec start_tot, finish_tot; - double elapsed_tot; - - if(verbose) - { - printf("ID estimation:\n"); - clock_gettime(CLOCK_MONOTONIC, &start_tot); - } - - float_t log_mus = 0; - for(idx_t i = 0; i < n; ++i) - { - log_mus += 0.5 * log(dp_info[i].ngbh.data[2].value/dp_info[i].ngbh.data[1].value); - } - - float_t d = 0; - MPI_Allreduce(&log_mus, &d, 1, MPI_MY_FLOAT, MPI_SUM, ctx -> mpi_communicator); - d = (ctx -> n_points - 1)/d; - if(verbose) - { - clock_gettime(CLOCK_MONOTONIC, &finish_tot); - elapsed_tot = (finish_tot.tv_sec - start_tot.tv_sec); - elapsed_tot += (finish_tot.tv_nsec - start_tot.tv_nsec) / 1000000000.0; - printf("\tID value: %.6lf\n", d); - printf("\tTotal time: %.3lfs\n\n", elapsed_tot); - } - - return d; - -} - - -float_t id_estimate(global_context_t* ctx, datapoint_info_t* dp_info, idx_t n, float_t fraction, int verbose) -{ - - /* - * Estimation of the intrinsic dimension of a dataset - * args: - * - dp_info: array of structs - * - n: number of dp_info - * Estimates the id via 2NN method. Computation of the log ratio of the - * distances of the first 2 neighbors of each point. 
Then compute the empirical distribution - * of these log ratii - * The intrinsic dimension is found by fitting with a straight line passing through the - * origin - */ - - struct timespec start_tot, finish_tot; - double elapsed_tot; - - if(verbose) - { - printf("ID estimation:\n"); - clock_gettime(CLOCK_MONOTONIC, &start_tot); - } - - //float_t fraction = 0.7; - float_t* r = (float_t*)MY_MALLOC(n*sizeof(float_t)); - float_t* Pemp = (float_t*)MY_MALLOC(n*sizeof(float_t)); - - for(idx_t i = 0; i < n; ++i) - { - r[i] = 0.5 * log(dp_info[i].ngbh.data[2].value/dp_info[i].ngbh.data[1].value); - Pemp[i] = -log(1 - (float_t)(i + 1)/(float_t)n); - } - qsort(r,n,sizeof(float_t),cmp_float_t); - - idx_t Neff = (idx_t)(n*fraction); - - float_t d = mEst2(r,Pemp,Neff); - free(r); - free(Pemp); - - if(verbose) - { - clock_gettime(CLOCK_MONOTONIC, &finish_tot); - elapsed_tot = (finish_tot.tv_sec - start_tot.tv_sec); - elapsed_tot += (finish_tot.tv_nsec - start_tot.tv_nsec) / 1000000000.0; - printf("\tID value: %.6lf\n", d); - printf("\tTotal time: %.3lfs\n\n", elapsed_tot); - } - - float_t rd = 0; - MPI_Allreduce(&d, &rd, 1, MPI_MY_FLOAT, MPI_SUM, ctx -> mpi_communicator); - rd = rd / ctx -> world_size; - - return rd; - -} - -int binary_search_on_idxs(idx_t* idxs, idx_t key, int n) -{ - #define LEFT 1 - #define RIGHT 0 - - int l = 0; - int r = n - 1; - int center = (r - l)/2; - while(idxs[center] != key && l < r) - { - int lr = key < idxs[center]; - /* if key < place */ - switch (lr) - { - case LEFT: - { - l = l; - r = center - 1; - center = l + (r - l) / 2; - } - break; - - case RIGHT: - { - l = center + 1; - r = r; - center = l + (r - l) / 2; - } - break; - - default: - break; - } - - } - - return center; - - #undef LEFT - #undef RIGHT -} - -datapoint_info_t find_possibly_halo_datapoint(global_context_t* ctx, idx_t idx) -{ - int owner = foreign_owner(ctx, idx); - /* find if datapoint is halo or not */ - if(owner == ctx -> mpi_rank) - { - idx_t i = idx - ctx -> idx_start; - return ctx -> local_datapoints[i]; - } - else - { - datapoint_info_t* halo_dp = ctx -> halo_datapoints[owner]; - idx_t* halo_idxs = ctx -> idx_halo_points_recv[owner]; - int n = ctx -> n_halo_points_recv[owner]; - int i = binary_search_on_idxs(halo_idxs, idx, n); - - if( idx != halo_dp[i].ngbh.data[0].array_idx && idx != MY_SIZE_MAX) - // if( idx != halo_idxs[i]) - { - printf("Osti %lu\n", idx); - } - return halo_dp[i]; - } -} - -datapoint_info_t find_possibly_halo_datapoint_rma(global_context_t* ctx, idx_t idx, MPI_Win win_datapoints) -{ - int owner = foreign_owner(ctx, idx); - /* find if datapoint is halo or not */ - if(owner == ctx -> mpi_rank) - { - idx_t i = idx - ctx -> idx_start; - return ctx -> local_datapoints[i]; - } - else - { - datapoint_info_t tmp_dp; - #pragma omp critical - { - idx_t i = idx - ctx -> rank_idx_start[owner]; - MPI_Request request; - MPI_Status status; - - MPI_Rget(&tmp_dp, sizeof(datapoint_info_t), MPI_BYTE, owner, - i * sizeof(datapoint_info_t), sizeof(datapoint_info_t), MPI_BYTE, win_datapoints, &request); - MPI_Wait(&request, MPI_STATUS_IGNORE); - - } - - return tmp_dp; - } -} - -datapoint_info_t find_possibly_halo_datapoint_rma_db(global_context_t* ctx, idx_t idx, MPI_Win win_datapoints) -{ - int owner = foreign_owner(ctx, idx); - /* find if datapoint is halo or not */ - if(owner == ctx -> mpi_rank) - { - idx_t i = idx - ctx -> idx_start; - return ctx -> local_datapoints[i]; - } - else - { - idx_t i = idx - ctx -> rank_idx_start[owner]; - datapoint_info_t tmp_dp; - MPI_Request request; - MPI_Status 
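The two estimators removed above (id_estimate and the compute_ID_two_NN_ML that replaces it in adp.c) both build on the 2NN statistic log(r2/r1); the 0.5 factor appears because the neighbour heaps seem to hold squared distances. A serial sketch of the maximum-likelihood form, d = (N - 1) / sum_i log(r2_i / r1_i), on plain arrays:

#include <math.h>
#include <stddef.h>

/* r1_sq, r2_sq: squared distances to the first and second neighbour of each point */
static double two_nn_id(const double *r1_sq, const double *r2_sq, size_t n)
{
    double sum_log_mu = 0.0;
    for (size_t i = 0; i < n; ++i)
        sum_log_mu += 0.5 * log(r2_sq[i] / r1_sq[i]);   /* log(r2/r1) */
    return (double)(n - 1) / sum_log_mu;
}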
status; - - MPI_Rget(&tmp_dp, sizeof(datapoint_info_t), MPI_BYTE, owner, - i * sizeof(datapoint_info_t), sizeof(datapoint_info_t), MPI_BYTE, win_datapoints, &request); - MPI_Wait(&request, MPI_STATUS_IGNORE); - - return tmp_dp; - } -} - -void compute_density_kstarnn(global_context_t* ctx, const float_t d, int verbose){ - - /* - * Point density computation: - * args: - * - paricles: array of structs - * - d : intrinsic dimension of the dataset - * - points : number of points in the dataset - */ - - struct timespec start_tot, finish_tot; - double elapsed_tot; - - datapoint_info_t* local_datapoints = ctx -> local_datapoints; - - if(verbose) - { - printf("Density and k* estimation:\n"); - clock_gettime(CLOCK_MONOTONIC, &start_tot); - } - - idx_t kMAX = ctx -> local_datapoints[0].ngbh.N - 1; - - float_t omega = 0.; - if(sizeof(float_t) == sizeof(float)){ omega = powf(PI_F,d/2)/tgammaf(d/2.0f + 1.0f);} - else{omega = pow(M_PI,d/2.)/tgamma(d/2.0 + 1.0);} - - #pragma omp parallel for - for(idx_t i = 0; i < ctx -> local_n_points; ++i) - { - - idx_t j = 4; - idx_t k; - float_t dL = 0.; - float_t vvi = 0.; - float_t vvj = 0.; - float_t vp = 0.; - while(j < kMAX && dL < DTHR) - { - idx_t ksel = j - 1; - vvi = omega * pow(local_datapoints[i].ngbh.data[ksel].value,d/2.); - - idx_t jj = local_datapoints[i].ngbh.data[j].array_idx; - - /* - * note jj can be an halo point - * need to search maybe for it in foreign nodes - * */ - - datapoint_info_t tmp_dp = find_possibly_halo_datapoint(ctx, jj); - - vvj = omega * pow(tmp_dp.ngbh.data[ksel].value,d/2.); - - /* TO REMOVE - if(local_datapoints[i].array_idx == 17734) printf("%lu ksel i %lu j %lu tmp_dp %lu di %lf fj %lf vvi %lf vvj %lf\n", ksel, i, jj, tmp_dp.array_idx, - sqrt(local_datapoints[i].ngbh.data[ksel].value), sqrt(tmp_dp.ngbh.data[ksel].value), vvi, vvj); - */ - - vp = (vvi + vvj)*(vvi + vvj); - dL = -2.0 * ksel * log(4.*vvi*vvj/vp); - j = j + 1; - } - if(j == kMAX) - { - k = j - 1; - vvi = omega * pow(ctx -> local_datapoints[i].ngbh.data[k].value,d/2.); - } - else - { - k = j - 2; - } - local_datapoints[i].kstar = k; - local_datapoints[i].log_rho = log((float_t)(k)/vvi/((float_t)(ctx -> n_points))); - //dp_info[i].log_rho = log((float_t)(k)) - log(vvi) -log((float_t)(points)); - local_datapoints[i].log_rho_err = 1.0/sqrt((float_t)k); //(float_t)(-Q_rsqrt((float)k)); - local_datapoints[i].g = local_datapoints[i].log_rho - local_datapoints[i].log_rho_err; - } - - if(verbose) - { - clock_gettime(CLOCK_MONOTONIC, &finish_tot); - elapsed_tot = (finish_tot.tv_sec - start_tot.tv_sec); - elapsed_tot += (finish_tot.tv_nsec - start_tot.tv_nsec) / 1000000000.0; - printf("\tTotal time: %.3lfs\n\n", elapsed_tot); - } - - #if defined(WRITE_DENSITY) - /* densities */ - float_t* den = (float_t*)MY_MALLOC(ctx -> local_n_points * sizeof(float_t)); - float_t* gs = (float_t*)MY_MALLOC(ctx -> local_n_points * sizeof(float_t)); - idx_t* ks = (idx_t*)MY_MALLOC(ctx -> local_n_points * sizeof(idx_t)); - - for(int i = 0; i < ctx -> local_n_points; ++i) den[i] = ctx -> local_datapoints[i].log_rho; - for(int i = 0; i < ctx -> local_n_points; ++i) ks[i] = ctx -> local_datapoints[i].kstar; - for(int i = 0; i < ctx -> local_n_points; ++i) gs[i] = ctx -> local_datapoints[i].g; - - ordered_buffer_to_file(ctx, den, sizeof(float_t), ctx -> local_n_points, "bb/ordered_density.npy"); - ordered_buffer_to_file(ctx, ks, sizeof(idx_t), ctx -> local_n_points, "bb/ks.npy"); - ordered_buffer_to_file(ctx, gs, sizeof(float_t), ctx -> local_n_points, "bb/g.npy"); - - ordered_data_to_file(ctx); - 
- free(den); - free(ks); - free(gs); - #endif - return; - - -} - -float_t get_j_ksel_dist(global_context_t* ctx, idx_t j, idx_t ksel, MPI_Win exposed_ngbh) -{ - int owner = foreign_owner(ctx, j); - idx_t k = ctx -> k; - /* find if datapoint is halo or not */ - if(owner == ctx -> mpi_rank) - { - idx_t pos = j - ctx -> idx_start; - return ctx -> local_datapoints[pos].ngbh.data[ksel].value; - } - else - { - heap_node el; - idx_t pos = j - ctx -> rank_idx_start[owner]; - MPI_Request request; - int err = MPI_Rget(&el, sizeof(heap_node), MPI_BYTE, owner, (MPI_Aint)((pos * k + ksel) * sizeof(heap_node)), sizeof(heap_node), MPI_BYTE, exposed_ngbh, &request); - MPI_Wait(&request,MPI_STATUS_IGNORE); - return el.value; - } -} - -idx_t get_j_ksel_idx(global_context_t* ctx, idx_t j, idx_t ksel, MPI_Win exposed_ngbh) -{ - int owner = foreign_owner(ctx, j); - idx_t k = ctx -> k; - /* find if datapoint is halo or not */ - if(owner == ctx -> mpi_rank) - { - idx_t pos = j - ctx -> idx_start; - return ctx -> local_datapoints[pos].ngbh.data[ksel].array_idx; - } - else - { - heap_node el; - idx_t pos = j - ctx -> rank_idx_start[owner]; - MPI_Request request; - int err = MPI_Rget(&el, sizeof(heap_node), MPI_BYTE, owner, (MPI_Aint)((pos * k + ksel) * sizeof(heap_node)), sizeof(heap_node), MPI_BYTE, exposed_ngbh, &request); - MPI_Wait(&request,MPI_STATUS_IGNORE); - return el.array_idx; - } -} - -void compute_density_kstarnn_rma(global_context_t* ctx, const float_t d, int verbose){ - - /* - * Point density computation: - * args: - * - paricles: array of structs - * - d : intrinsic dimension of the dataset - * - points : number of points in the dataset - */ - - - MPI_Info info; - - - MPI_Barrier(ctx -> mpi_communicator); - idx_t k = ctx -> local_datapoints[0].ngbh.N; - - struct timespec start_tot, finish_tot; - double elapsed_tot; - - datapoint_info_t* local_datapoints = ctx -> local_datapoints; - - if(verbose) - { - printf("Density and k* estimation:\n"); - clock_gettime(CLOCK_MONOTONIC, &start_tot); - } - - idx_t kMAX = ctx -> local_datapoints[0].ngbh.N - 1; - - float_t omega = 0.; - if(sizeof(float_t) == sizeof(float)){ omega = powf(PI_F,d/2)/tgammaf(d/2.0f + 1.0f);} - else{omega = pow(M_PI,d/2.)/tgamma(d/2.0 + 1.0);} - - - MPI_Win exposed_ngbh; - MPI_Win_create( ctx -> __local_heap_buffers, - ctx -> local_n_points * k * sizeof(heap_node), - 1, MPI_INFO_NULL, - ctx -> mpi_communicator, - &exposed_ngbh); - - MPI_Win_fence(0, exposed_ngbh); - MPI_Win_lock_all(0, exposed_ngbh); - - MPI_Barrier(ctx -> mpi_communicator); - - #pragma omp parallel for - for(idx_t i = 0; i < ctx -> local_n_points; ++i) - { - for(idx_t k = 0; k <= kMAX; ++k) - { - local_datapoints[i].ngbh.data[k].value = omega * pow(local_datapoints[i].ngbh.data[k].value, d/2.); - } - } - - for(idx_t i = 0; i < ctx -> local_n_points; ++i) - { - - idx_t j = 4; - idx_t k; - float_t dL = 0.; - float_t vvi = 0.; - float_t vvj = 0.; - float_t vp = 0.; - while(j < kMAX && dL < DTHR) - { - idx_t ksel = j - 1; - //vvi = omega * pow(local_datapoints[i].ngbh.data[ksel].value,d/2.); - vvi = local_datapoints[i].ngbh.data[ksel].value; - - idx_t jj = local_datapoints[i].ngbh.data[j].array_idx; - - /* - * note jj can be an halo point - * need to search maybe for it in foreign nodes - * */ - //float_t dist_jj = get_j_ksel_dist(ctx, jj, ksel, exposed_ngbh); - //vvj = omega * pow(dist_jj,d/2.); - vvj = get_j_ksel_dist(ctx, jj, ksel, exposed_ngbh); - - vp = (vvi + vvj)*(vvi + vvj); - dL = -2.0 * ksel * log(4.*vvi*vvj/vp); - j = j + 1; - } - if(j == kMAX) - { - k = j - 1; - 
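The k* selection loop used by compute_density_kstarnn and its RMA variants above grows the neighbourhood of each point until the density seen by the point and by its j-th neighbour stop being statistically compatible, i.e. the likelihood-ratio statistic dL crosses DTHR. A serial sketch of the per-point rule, where ball_volume(p, k) is a hypothetical helper standing in for the heap lookups and returning omega * r_pk^d:

#include <math.h>
#include <stddef.h>

/* ngbh_idx: neighbour ids of point i, nearest first (entry 0 is i itself) */
static size_t kstar_for_point(size_t i, size_t kmax, double dthr,
                              double (*ball_volume)(size_t point, size_t k),
                              const size_t *ngbh_idx)
{
    size_t j = 4;
    double dL = 0.0;
    while (j < kmax && dL < dthr) {
        size_t ksel = j - 1;
        double vvi = ball_volume(i, ksel);              /* volume around i             */
        double vvj = ball_volume(ngbh_idx[j], ksel);    /* volume around its j-th ngbh */
        double vp  = (vvi + vvj) * (vvi + vvj);
        dL = -2.0 * (double)ksel * log(4.0 * vvi * vvj / vp);
        ++j;
    }
    return (j == kmax) ? j - 1 : j - 2;   /* same exit convention as the loop above */
}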
//vvi = omega * pow(ctx -> local_datapoints[i].ngbh.data[k].value,d/2.); - vvi = ctx -> local_datapoints[i].ngbh.data[k].value; - } - else - { - k = j - 2; - } - local_datapoints[i].kstar = k; - local_datapoints[i].log_rho = log((float_t)(k)/vvi/((float_t)(ctx -> n_points))); - //dp_info[i].log_rho = log((float_t)(k)) - log(vvi) -log((float_t)(points)); - local_datapoints[i].log_rho_err = 1.0/sqrt((float_t)k); //(float_t)(-Q_rsqrt((float)k)); - local_datapoints[i].g = local_datapoints[i].log_rho - local_datapoints[i].log_rho_err; - } - - if(verbose) - { - clock_gettime(CLOCK_MONOTONIC, &finish_tot); - elapsed_tot = (finish_tot.tv_sec - start_tot.tv_sec); - elapsed_tot += (finish_tot.tv_nsec - start_tot.tv_nsec) / 1000000000.0; - printf("\tTotal time: %.3lfs\n\n", elapsed_tot); - } - - MPI_Win_unlock_all(exposed_ngbh); - MPI_Win_fence(0, exposed_ngbh); - MPI_Win_free(&exposed_ngbh); - - #if defined(WRITE_DENSITY) - /* densities */ - float_t* den = (float_t*)MY_MALLOC(ctx -> local_n_points * sizeof(float_t)); - float_t* gs = (float_t*)MY_MALLOC(ctx -> local_n_points * sizeof(float_t)); - idx_t* ks = (idx_t*)MY_MALLOC(ctx -> local_n_points * sizeof(idx_t)); - - for(int i = 0; i < ctx -> local_n_points; ++i) den[i] = ctx -> local_datapoints[i].log_rho; - for(int i = 0; i < ctx -> local_n_points; ++i) ks[i] = ctx -> local_datapoints[i].kstar; - for(int i = 0; i < ctx -> local_n_points; ++i) gs[i] = ctx -> local_datapoints[i].g; - - ordered_buffer_to_file(ctx, den, sizeof(float_t), ctx -> local_n_points, "bb/ordered_density.npy"); - ordered_buffer_to_file(ctx, ks, sizeof(idx_t), ctx -> local_n_points, "bb/ks.npy"); - ordered_buffer_to_file(ctx, gs, sizeof(float_t), ctx -> local_n_points, "bb/g.npy"); - - ordered_data_to_file(ctx); - free(den); - free(ks); - #endif - return; - - -} - -void compute_density_kstarnn_rma_v2(global_context_t* ctx, const float_t d, int verbose){ - - /* - * Point density computation: - * args: - * - paricles: array of structs - * - d : intrinsic dimension of the dataset - * - points : number of points in the dataset - */ - - - MPI_Info info; - - - MPI_Barrier(ctx -> mpi_communicator); - idx_t k = ctx -> local_datapoints[0].ngbh.N; - - struct timespec start_tot, finish_tot; - double elapsed_tot; - - datapoint_info_t* local_datapoints = ctx -> local_datapoints; - - if(verbose) - { - printf("Density and k* estimation:\n"); - clock_gettime(CLOCK_MONOTONIC, &start_tot); - } - - idx_t kMAX = ctx -> local_datapoints[0].ngbh.N - 1; - - float_t omega = 0.; - if(sizeof(float_t) == sizeof(float)){ omega = powf(PI_F,d/2)/tgammaf(d/2.0f + 1.0f);} - else{omega = pow(M_PI,d/2.)/tgamma(d/2.0 + 1.0);} - - - MPI_Win exposed_ngbh; - MPI_Win_create( ctx -> __local_heap_buffers, - ctx -> local_n_points * k * sizeof(heap_node), - 1, MPI_INFO_NULL, - ctx -> mpi_communicator, - &exposed_ngbh); - - MPI_Win_fence(MPI_MODE_NOPUT, exposed_ngbh); - - MPI_Barrier(ctx -> mpi_communicator); - - #pragma omp parallel for - for(idx_t i = 0; i < ctx -> local_n_points; ++i) - { - for(idx_t k = 0; k <= kMAX; ++k) - { - local_datapoints[i].ngbh.data[k].value = omega * pow(local_datapoints[i].ngbh.data[k].value, d/2.); - } - //initialize kstar at 0 - local_datapoints[i].kstar = 0; - } - - int i_have_finished = 0; - int all_have_finished = 0; - heap_node* scratch_heap_nodes = (heap_node*)MY_MALLOC(ctx -> local_n_points * sizeof(heap_node)); - - for(idx_t j = 4; j < kMAX - 1; ++j) - { - i_have_finished = 1; - //request data - idx_t ksel = j - 1; - -#if defined(THREAD_FUNNELED) -#else - #pragma omp parallel for 
-#endif - - for(idx_t i = 0; i < ctx -> local_n_points; ++i) - { - - if(ctx -> local_datapoints[i].kstar == 0) - { - //vvi = omega * pow(local_datapoints[i].ngbh.data[ksel].value,d/2.); - - idx_t jj = local_datapoints[i].ngbh.data[j].array_idx; - - /* - * note jj can be an halo point - * need to search maybe for it in foreign nodes - * */ - - int owner = foreign_owner(ctx, jj); - idx_t pos = jj - ctx -> rank_idx_start[owner]; - - if(owner == ctx -> mpi_rank) - { - scratch_heap_nodes[i] = ctx -> local_datapoints[pos].ngbh.data[ksel]; - } - else - { - MPI_Get(scratch_heap_nodes + i, - sizeof(heap_node), - MPI_BYTE, - owner, - (MPI_Aint)((pos * (ctx -> k) + ksel) * sizeof(heap_node)), - sizeof(heap_node), - MPI_BYTE, - exposed_ngbh); - } - } - - } - - MPI_Win_fence(MPI_MODE_NOPUT,exposed_ngbh); - //process data -#if defined(THREAD_FUNNELED) -#else - #pragma omp parallel for -#endif - for(idx_t i = 0; i < ctx -> local_n_points; ++i) - { - if(ctx -> local_datapoints[i].kstar == 0) - { - float_t vvi, vvj, vp, dL; - vvi = local_datapoints[i].ngbh.data[ksel].value; - vvj = scratch_heap_nodes[i].value; - vp = (vvi + vvj)*(vvi + vvj); - dL = -2.0 * ksel * log(4.*vvi*vvj/vp); - - if(dL > DTHR) - { - idx_t k = j - 1; - local_datapoints[i].kstar = k; - local_datapoints[i].log_rho = log((float_t)(k)/vvi/((float_t)(ctx -> n_points))); - local_datapoints[i].log_rho_err = 1.0/sqrt((float_t)k); //(float_t)(-Q_rsqrt((float)k)); - local_datapoints[i].g = local_datapoints[i].log_rho - local_datapoints[i].log_rho_err; - } - else - { - #pragma omp atomic write - i_have_finished = 0; - } - } - - } - - - MPI_Allreduce(&i_have_finished, &all_have_finished, 1, MPI_INT, MPI_LAND, ctx -> mpi_communicator); - - if(all_have_finished) break; - } - - -#if defined(THREAD_FUNNELED) -#else - #pragma omp parallel for -#endif - for(idx_t i = 0; i < ctx -> local_n_points; ++i) - { - if(ctx -> local_datapoints[i].kstar == 0) - { - idx_t k = kMAX - 1; - - float_t vvi = ctx -> local_datapoints[i].ngbh.data[k].value; - - local_datapoints[i].kstar = k; - local_datapoints[i].log_rho = log((float_t)(k)/vvi/((float_t)(ctx -> n_points))); - local_datapoints[i].log_rho_err = 1.0/sqrt((float_t)k); //(float_t)(-Q_rsqrt((float)k)); - local_datapoints[i].g = local_datapoints[i].log_rho - local_datapoints[i].log_rho_err; - } - } - - - MPI_Win_fence(0, exposed_ngbh); - MPI_Win_free(&exposed_ngbh); - - free(scratch_heap_nodes); - - #if defined(WRITE_DENSITY) - /* densities */ - float_t* den = (float_t*)MY_MALLOC(ctx -> local_n_points * sizeof(float_t)); - float_t* gs = (float_t*)MY_MALLOC(ctx -> local_n_points * sizeof(float_t)); - idx_t* ks = (idx_t*)MY_MALLOC(ctx -> local_n_points * sizeof(idx_t)); - - for(int i = 0; i < ctx -> local_n_points; ++i) den[i] = ctx -> local_datapoints[i].log_rho; - for(int i = 0; i < ctx -> local_n_points; ++i) ks[i] = ctx -> local_datapoints[i].kstar; - for(int i = 0; i < ctx -> local_n_points; ++i) gs[i] = ctx -> local_datapoints[i].g; - - ordered_buffer_to_file(ctx, den, sizeof(float_t), ctx -> local_n_points, "bb/ordered_density.npy"); - ordered_buffer_to_file(ctx, ks, sizeof(idx_t), ctx -> local_n_points, "bb/ks.npy"); - ordered_buffer_to_file(ctx, gs, sizeof(float_t), ctx -> local_n_points, "bb/g.npy"); - - ordered_data_to_file(ctx); - free(den); - free(ks); - #endif - return; - - -} - -float_t get_j_ksel_dist_v2(global_context_t* ctx, idx_t i, idx_t j, idx_t ksel, int* flags, heap_node* tmp_heap_nodes, MPI_Win* exposed_ngbh) -{ - if(flags[i]) - { - int owner = foreign_owner(ctx, j); - idx_t k = ctx -> k; - 
/* find if datapoint is halo or not */ - if(owner == ctx -> mpi_rank) - { - idx_t pos = j - ctx -> idx_start; - return ctx -> local_datapoints[pos].ngbh.data[ksel].value; - } - else - { - //RMA - flags[i] = 0; - idx_t pos = j - ctx -> rank_idx_start[owner]; - MPI_Get(tmp_heap_nodes + i, sizeof(heap_node), MPI_BYTE, owner, (MPI_Aint)((pos * k + ksel) * sizeof(heap_node)), sizeof(heap_node), MPI_BYTE, *exposed_ngbh); - //if(ctx -> mpi_rank == 0) DB_PRINT("rvcd %lu %lf\n", el.array_idx, el.value); - return 0; - } - } - else - { - flags[i] = 1; - return tmp_heap_nodes[i].value; - } -} - -void clusters_allocate(clusters_t * c, int s) -{ - /* - * Helper function for handling allocation of resources - */ - if(c -> centers.count == 0) - { - printf("Provide a valid cluster centers list\n"); - return; - } - - idx_t nclus = c -> centers.count; - - - if(s) - { - //printf("Using sparse implementation\n"); - c -> use_sparse_borders = 1; - c -> sparse_borders = (adj_list_t*)MY_MALLOC(nclus*sizeof(adj_list_t)); - for(idx_t i = 0; i < nclus; ++i) - { - c -> sparse_borders[i].count = 0; - c -> sparse_borders[i].size = PREALLOC_BORDERS; - c -> sparse_borders[i].data = (sparse_border_t*)MY_MALLOC(PREALLOC_BORDERS*sizeof(sparse_border_t)); - } - - } - else - { - //printf("Using dense implementation\n"); - c -> use_sparse_borders = 0; - c -> __borders_data = (border_t*)MY_MALLOC(nclus*nclus*sizeof(border_t)); - c -> borders = (border_t**)MY_MALLOC(nclus*sizeof(border_t*)); - - #pragma omp parallel for - - for(idx_t i = 0; i < nclus; ++i) - { - c -> borders[i] = c -> __borders_data + i*nclus; - for(idx_t j = 0; j < nclus; ++j) - { - c -> borders[i][j] = border_null; - } - } - } -} - - -void adj_list_insert(adj_list_t* l, sparse_border_t b) -{ - /* - * Handling of sparse border implementation as an adjecency list - */ - if(l -> count < l -> size) - { - l -> data[l -> count] = b; - l -> count++; - } - else - { - l -> size += PREALLOC_BORDERS; - l -> data = realloc( l -> data, sizeof(sparse_border_t) * ( l -> size)); - l -> data[l -> count] = b; - l -> count++; - } -} - -void adj_list_reset(adj_list_t* l) -{ - /* - * Handling of sparse border implementation as an adjecency list - */ - if(l -> data) free(l -> data); - l -> count = 0; - l -> size = 0; - l -> data = NULL; -} - -void clusters_reset(clusters_t * c) -{ - /* - * Handling reset of clusters object - */ - if(c -> use_sparse_borders) - { - for(idx_t i = 0; i < c -> centers.count; ++i) - { - adj_list_reset((c -> sparse_borders) + i); - - } - free(c -> sparse_borders); - c -> sparse_borders = NULL; - } - else - { - if(c -> __borders_data) free(c -> __borders_data); - if(c -> borders) free(c -> borders); - } - if(c -> centers.data) free(c -> centers.data); -} - -void clusters_free(clusters_t * c) -{ - /* - * Free cluster object - */ - clusters_reset(c); -} - - -void sparse_border_insert(clusters_t *c, sparse_border_t b) -{ - /* - * Insert a border element in the sparse implementation - */ - - idx_t i = b.i; - adj_list_t l = c -> sparse_borders[i]; - int check = 1; - for(idx_t k = 0; k < l.count; ++k) - { - sparse_border_t p = l.data[k]; - if(p.i == b.i && p.j == b.j) - { - if( b.density > p.density) - { - l.data[k] = b; - } - check = 0; - } - } - if(check) adj_list_insert(c -> sparse_borders + i, b); - return; -} - -sparse_border_t sparse_border_get(clusters_t* c, idx_t i, idx_t j) -{ - /* - * Get a border element in the sparse implementation - * - i,j: cluster to search for borders - * return border_null if not found - */ - - sparse_border_t b = 
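A quick size estimate shows why clusters_allocate above offers the sparse adjacency-list branch: with 8-byte float_t and idx_t (a packed border_t of 24 bytes, padding aside), the dense nclus x nclus matrix grows quadratically. Illustration with a hypothetical cluster count:

#include <stdio.h>

int main(void)
{
    const double border_bytes = 24.0;      /* 2 x double + 1 x 64-bit index (assumed) */
    const double nclus = 10000.0;          /* hypothetical number of clusters         */
    printf("dense border matrix: %.2f GB\n",
           nclus * nclus * border_bytes / 1e9);   /* ~2.40 GB for 10^4 clusters */
    return 0;
}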
sparse_border_null; - adj_list_t l = c -> sparse_borders[i]; - for(idx_t el = 0; el < l.count; ++el) - { - sparse_border_t candidate = l.data[el]; - if(candidate.i == i && candidate.j == j) - { - b = candidate; - } - } - return b; -} - - -int cmpPP(const void* p1, const void *p2) -{ - /* - * Utility function to perform quicksort - * when clustering assignment is performed - */ - datapoint_info_t* pp1 = *(datapoint_info_t**)p1; - datapoint_info_t* pp2 = *(datapoint_info_t**)p2; - //return 2*(pp1 -> g < pp2 -> g) - 1; - int v = (pp1 -> g < pp2 -> g) - (pp1 -> g > pp2 -> g); - int a = (pp1 -> array_idx < pp2 -> array_idx) - (pp1 -> array_idx > pp2 -> array_idx); - return v == 0 ? a : v; - //return v; -} - -inline int its_mine(global_context_t* ctx, idx_t idx) -{ - return idx >= ctx -> idx_start && idx < ctx -> idx_start + ctx -> local_n_points; -} - -int compare_center(const void* a, const void* b) -{ - center_t* aa = (center_t*)a; - center_t* bb = (center_t*)b; - return (aa -> density < bb -> density) - (aa -> density > bb -> density); -} - - -void compute_correction(global_context_t* ctx, float_t Z) -{ - /* - * Utility function, find the minimum value of the density of the datapoints - * and shift them up in order to further work with values greater than 0 - */ - float_t min_log_rho = 999999.9; - - datapoint_info_t* dp_info = ctx -> local_datapoints; - idx_t n = ctx -> local_n_points; - - - #pragma omp parallel - { - float_t thread_min_log_rho = 9999999.; - #pragma omp for - for(idx_t i = 0; i < n; ++i) - { - float_t tmp = dp_info[i].log_rho - Z*dp_info[i].log_rho_err; - if(tmp < thread_min_log_rho){ - thread_min_log_rho = tmp; - } - } - #pragma omp critical - if(thread_min_log_rho < min_log_rho) min_log_rho = thread_min_log_rho; - } - - MPI_Allreduce(MPI_IN_PLACE, &min_log_rho, 1, MPI_MY_FLOAT, MPI_MIN, ctx -> mpi_communicator); - - #pragma omp parallel - { - #pragma omp for - for(idx_t i = 0; i < n; ++i) - { - dp_info[i].log_rho_c = dp_info[i].log_rho - min_log_rho + 1; - dp_info[i].g = dp_info[i].log_rho_c - dp_info[i].log_rho_err; - } - - } - - //printf("%lf\n",min_log_rho); -} - -clusters_t Heuristic1_rma(global_context_t *ctx, int verbose) -{ - /* - * Heurisitc 1, from paper of Errico, Facco, Laio & Rodriguez - * ( https://doi.org/10.1016/j.ins.2021.01.010 ) - * - * args: - */ - - HERE - datapoint_info_t* dp_info = ctx -> local_datapoints; - idx_t n = ctx -> local_n_points; - - struct timespec start_tot, finish_tot; - double elapsed_tot; - - if(verbose) - { - printf("H1: Preliminary cluster assignment\n"); - clock_gettime(CLOCK_MONOTONIC, &start_tot); - } - - TIME_DEF; - - - //idx_t ncenters = 0; - //idx_t putativeCenters = n; - lu_dynamic_array_t all_centers, removed_centers, actual_centers, max_rho; - lu_dynamic_array_allocate(&all_centers); - lu_dynamic_array_allocate(&removed_centers); - lu_dynamic_array_allocate(&actual_centers); - lu_dynamic_array_allocate(&max_rho); - - datapoint_info_t** dp_info_ptrs = (datapoint_info_t**)MY_MALLOC(n*sizeof(datapoint_info_t*)); - - struct timespec start, finish; - double elapsed; - - - if(verbose) - { - clock_gettime(CLOCK_MONOTONIC, &start); - } - - /* proceed */ - - MPI_Win win_datapoints; - MPI_Win_create(ctx -> local_datapoints, ctx -> local_n_points * sizeof(datapoint_info_t), - 1, MPI_INFO_NULL, ctx -> mpi_communicator, &win_datapoints); - MPI_Win_fence(0, win_datapoints); - MPI_Win_lock_all(0, win_datapoints); - HERE - -#if defined(THREAD_FUNNELED) -#else - #pragma omp parallel for -#endif - - for(idx_t i = 0; i < n; ++i) - { - /* - * 
Find the centers of the clusters as the points of higher density in their neighborhoods - * A point is tagged as a putative center if it is the point of higer density of its neighborhood - */ - - dp_info_ptrs[i] = dp_info + i; - idx_t maxk = dp_info[i].kstar + 1; - float_t gi = dp_info[i].g; - dp_info[i].is_center = 1; - dp_info[i].cluster_idx = -1; - //printf("%lf\n",p -> g); - heap i_ngbh = dp_info[i].ngbh; - for(idx_t k = 1; k < maxk; ++k) - { - idx_t ngbh_index = i_ngbh.data[k].array_idx; - datapoint_info_t dj = find_possibly_halo_datapoint_rma(ctx, ngbh_index, win_datapoints); - float_t gj = dj.g; - if(gj > gi){ - dp_info[i].is_center = 0; - break; - } - } - if(dp_info[i].is_center) - { - //lu_dynamic_array_pushBack(&all_centers, dp_info[i].array_idx); - #pragma omp critical - { - lu_dynamic_array_pushBack(&all_centers, i); - } - } - } - - HERE - - if(verbose) - { - clock_gettime(CLOCK_MONOTONIC, &finish); - elapsed = (finish.tv_sec - start.tv_sec); - elapsed += (finish.tv_nsec - start.tv_nsec) / 1000000000.0; - printf("\tFinding putative centers: %.3lfs\n",elapsed); - clock_gettime(CLOCK_MONOTONIC, &start); - } - - - /* - * optimized version - * - * Generate a mask that keeps track of the point has been eliminating the - * point considered. Each thread updates this mask, then after the procedure - * ends, center, removed centers, and max_rho arrays are populated - */ - - heap_node* to_remove_mask = (heap_node*)MY_MALLOC(n*sizeof(heap_node)); - for(idx_t p = 0; p < n; ++p) - { - to_remove_mask[p].array_idx = MY_SIZE_MAX; - to_remove_mask[p].value = 9999999; - } - qsort(dp_info_ptrs, n, sizeof(datapoint_info_t*), cmpPP); - - - MPI_Win win_to_remove_mask; - MPI_Win_create(to_remove_mask, n * sizeof(heap_node), 1, MPI_INFO_NULL, ctx -> mpi_communicator, &win_to_remove_mask); - MPI_Win_fence(0, win_to_remove_mask); - - - - /* check into internal nodes */ - - /* - * to remove - * and to reimplement it using rma - * for dp in datapoints: - * for nghb in neighbors: - * if ngbh its_mine - * no_problems, do it as always - * else: - * ngbh is an external node - * do rma things, - * in particular, acquire a lock on the window, read and write things - * then close - */ - -#if defined(THREAD_FUNNELED) -#else - #pragma omp parallel for -#endif - for(idx_t p = 0; p < n; ++p) - { - datapoint_info_t i_point = *(dp_info_ptrs[p]); - - for(idx_t j = 1; j < i_point.kstar + 1; ++j) - { - idx_t jidx = i_point.ngbh.data[j].array_idx; - - datapoint_info_t j_point = find_possibly_halo_datapoint_rma(ctx, jidx, win_datapoints); - - if(j_point.is_center && i_point.g > j_point.g) - { - #pragma omp critical - { - int owner = foreign_owner(ctx, jidx); - idx_t jpos = jidx - ctx -> rank_idx_start[owner]; - - MPI_Win_lock(MPI_LOCK_EXCLUSIVE, owner, 0, win_to_remove_mask); - heap_node mask_element; - MPI_Request request; - MPI_Rget( &mask_element, sizeof(heap_node), MPI_BYTE, - owner, jpos * sizeof(heap_node) , sizeof(heap_node), MPI_BYTE, win_to_remove_mask, &request); - MPI_Wait(&request, MPI_STATUS_IGNORE); - - int flag = mask_element.array_idx == MY_SIZE_MAX; - if(flag || i_point.g > mask_element.value ) - { - heap_node tmp_mask_element = {.array_idx = i_point.array_idx, .value = i_point.g}; - MPI_Request request; - MPI_Rput(&tmp_mask_element, sizeof(heap_node), MPI_BYTE, owner, - jpos*sizeof(heap_node), sizeof(heap_node), MPI_BYTE, win_to_remove_mask, &request); - MPI_Wait(&request, MPI_STATUS_IGNORE); - - } - - MPI_Win_unlock(owner, win_to_remove_mask); - } - } - } - } - - MPI_Win_fence(0, win_to_remove_mask); - 
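Stripped of the RMA window traffic, the putative-centre rule implemented above is: a point is a candidate centre iff no point in its k*-neighbourhood has a larger g. A serial sketch on flat arrays (row-major neighbour table, illustrative names):

#include <stddef.h>

/* ngbh[i*kmax + k] is the k-th nearest neighbour of i (k = 0 is i itself) */
static void find_putative_centers(const double *g, const size_t *ngbh,
                                  const size_t *kstar, size_t n, size_t kmax,
                                  int *is_center /* out, size n */)
{
    for (size_t i = 0; i < n; ++i) {
        is_center[i] = 1;
        for (size_t k = 1; k < kstar[i] + 1; ++k) {
            if (g[ngbh[i * kmax + k]] > g[i]) { is_center[i] = 0; break; }
        }
    }
}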
MPI_Barrier(ctx -> mpi_communicator); - HERE - - /* populate the usual arrays */ - for(idx_t p = 0; p < all_centers.count; ++p) - { - idx_t i = all_centers.data[p]; - int e = 0; - //float_t gi = dp_info[i].g; - idx_t mr = to_remove_mask[i].array_idx; - if(mr != MY_SIZE_MAX) - { - //if(dp_info[mr].g > gi) e = 1; - e = 1; - } - switch (e) - { - case 1: - { - lu_dynamic_array_pushBack(&removed_centers,i); - dp_info[i].is_center = 0; - for(idx_t c = 0; c < removed_centers.count - 1; ++c) - { - if(mr == removed_centers.data[c]) - { - mr = max_rho.data[c]; - } - } - lu_dynamic_array_pushBack(&max_rho,mr); - - } - break; - - case 0: - { - lu_dynamic_array_pushBack(&actual_centers,i); - dp_info[i].cluster_idx = actual_centers.count - 1; - } - break; - - default: - break; - } - } - - HERE - MPI_Win_free(&win_to_remove_mask); - free(to_remove_mask); - - if(verbose) - { - clock_gettime(CLOCK_MONOTONIC, &finish); - elapsed = (finish.tv_sec - start.tv_sec); - elapsed += (finish.tv_nsec - start.tv_nsec) / 1000000000.0; - printf("\tFinding actual centers: %.3lfs\n",elapsed); - clock_gettime(CLOCK_MONOTONIC, &start); - } - - int n_centers = (int)actual_centers.count; - int tot_centers; - MPI_Allreduce(&n_centers, &tot_centers, 1, MPI_INT, MPI_SUM, ctx -> mpi_communicator); - // MPI_Allreduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm) - /* - DB_PRINT("rank %d got %d heheh\n", ctx -> mpi_rank, (int)actual_centers.count); - DB_PRINT("rank %d tc %d rc %d\n", ctx -> mpi_rank, (int)all_centers.count, (int)removed_centers.count); - */ - MPI_DB_PRINT("%d bbb\n", tot_centers); - - - /* bring on master all centers - * order them in ascending order of density, - * then re-scatter them around to get unique cluster labels */ - - - center_t* private_centers_buffer = (center_t*)MY_MALLOC(actual_centers.count * sizeof(center_t)); - - center_t* global_centers_buffer = (center_t*)MY_MALLOC(tot_centers * sizeof(center_t)); - - for(int i = 0; i < actual_centers.count; ++i) - { - idx_t idx = actual_centers.data[i] ; - private_centers_buffer[i].density = dp_info[idx].g; - private_centers_buffer[i].idx = dp_info[idx].array_idx; - } - MPI_Datatype mpi_center_type; - - MPI_Type_contiguous(sizeof(center_t), MPI_BYTE, &mpi_center_type); - MPI_Type_commit(&mpi_center_type); - - int* center_counts = (int*)MY_MALLOC(ctx -> world_size * sizeof(int)); - int* center_displs = (int*)MY_MALLOC(ctx -> world_size * sizeof(int)); - - int cc = (int)actual_centers.count; - MPI_Allgather(&cc, 1, MPI_INT, center_counts, 1, MPI_INT, ctx -> mpi_communicator); - - center_displs[0] = 0; - for(int i = 1; i < ctx -> world_size; ++i) center_displs[i] = center_displs[i - 1] + center_counts[i - 1]; - - /* alternative to check if something breaks */ - for(int i = 0; i < actual_centers.count; ++i) - { - idx_t idx = actual_centers.data[i]; - dp_info[idx].cluster_idx += center_displs[ctx -> mpi_rank]; - - } - - idx_t* all_center_idx = (idx_t*)MY_MALLOC(tot_centers * sizeof(idx_t)); - - MPI_Allgatherv(actual_centers.data, actual_centers.count, MPI_UINT64_T, all_center_idx, center_counts, center_displs, MPI_UINT64_T, ctx -> mpi_communicator); - - free(center_counts); - free(center_displs); - - /* - * Sort all the dp_info based on g and then perform the cluster assignment - * in asceding order - */ - - /* - * Sort all the dp_info based on g and then perform the cluster assignment - * in asceding order - */ - - int completed = 0; - - while(!completed) - { - completed = 1; - - - int proc_points = 0; - /* retrieve 
assignment - * if some point remain uncompleted then get assignment from external points */ - - for(idx_t i = 0; i < n; ++i) - { - int wait_for_comms = 0; - datapoint_info_t* p = dp_info_ptrs[i]; - if(!(p -> is_center) && p -> cluster_idx < 0) - { - int cluster = -1; - idx_t k = 0; - idx_t p_idx; - idx_t max_k = p -> ngbh.N; - /*assign each particle at the same cluster as the nearest particle of higher density*/ - datapoint_info_t p_retrieved; - while( k < max_k - 1 && cluster == -1) - { - - - ++k; - p_idx = p -> ngbh.data[k].array_idx; - p_retrieved = find_possibly_halo_datapoint_rma(ctx, p_idx, win_datapoints); - - int flag = p_retrieved.g > p -> g; - - if(p_retrieved.g > p -> g) - { - cluster = p_retrieved.cluster_idx; - wait_for_comms = 1; - break; - } - //if(p -> array_idx == 1587636) printf("%lu k %d p_idx %lu %lf %lf\n", k, cluster, p_idx, p_retrieved.g, p -> g ); - } - - - if(cluster == -1 && !wait_for_comms) - { - float_t gmax = -99999.; - idx_t gm_index = SIZE_MAX; - for(idx_t k = 0; k < max_k; ++k) - { - idx_t ngbh_index = p -> ngbh.data[k].array_idx; - for(idx_t m = 0; m < removed_centers.count; ++m) - { - idx_t max_rho_idx = max_rho.data[m]; - datapoint_info_t dp_max_rho = find_possibly_halo_datapoint_rma(ctx, max_rho_idx, win_datapoints); - //datapoint_info_t dp_max_rho = dp_info[max_rho_idx]; - float_t gcand = dp_max_rho.g; - if(ngbh_index == removed_centers.data[m] && gcand > gmax) - { - //printf("%lu -- %lu\n", ele, m); - gmax = gcand; - gm_index = max_rho.data[m]; - } - } - } - if(gm_index != SIZE_MAX) - { - datapoint_info_t dp_gm = find_possibly_halo_datapoint_rma(ctx, gm_index, win_datapoints); - //datapoint_info_t dp_gm = dp_info[gm_index]; - cluster = dp_gm.cluster_idx; - } - - } - p -> cluster_idx = cluster; - - } - completed = completed && p -> cluster_idx != -1; - proc_points += p -> cluster_idx != -1 ? 1 : 0; - - } - - //DB_PRINT("rank %d proc %d\n", ctx -> mpi_rank, proc_points); - MPI_Allreduce(MPI_IN_PLACE, &completed, 1, MPI_INT, MPI_SUM, ctx -> mpi_communicator); - completed = completed == ctx -> world_size ? 
1 : 0; - - } - - MPI_Win_unlock_all(win_datapoints); - MPI_Win_fence(0, win_datapoints); - MPI_Win_free(&win_datapoints); - - MPI_Barrier(ctx -> mpi_communicator); - - if(verbose) - { - clock_gettime(CLOCK_MONOTONIC, &finish); - elapsed = (finish.tv_sec - start.tv_sec); - elapsed += (finish.tv_nsec - start.tv_nsec) / 1000000000.0; - printf("\tTentative clustering: %.3lfs\n",elapsed); - clock_gettime(CLOCK_MONOTONIC, &start); - } - - free(dp_info_ptrs); - free(max_rho.data); - free(removed_centers.data); - free(all_centers.data); - - #if defined(WRITE_CLUSTER_ASSIGN_H1) - /* densities */ - int* ks = (int*)MY_MALLOC(ctx -> local_n_points * sizeof(int)); - for(int i = 0; i < ctx -> local_n_points; ++i) ks[i] = ctx -> local_datapoints[i].cluster_idx; - - ordered_buffer_to_file(ctx, ks, sizeof(int), ctx -> local_n_points, "bb/cl.npy"); - ordered_data_to_file(ctx); - free(ks); - #endif - - - free(actual_centers.data); - actual_centers.size = tot_centers; - actual_centers.count = tot_centers; - actual_centers.data = all_center_idx; - - - clusters_t c_all; - c_all.centers = actual_centers; - - - - if(verbose) - { - clock_gettime(CLOCK_MONOTONIC, &finish); - elapsed = (finish.tv_sec - start.tv_sec); - elapsed += (finish.tv_nsec - start.tv_nsec) / 1000000000.0; - printf("\tFinalizing clustering: %.3lfs\n",elapsed); - printf("\n"); - } - - clock_gettime(CLOCK_MONOTONIC, &finish_tot); - elapsed_tot = (finish_tot.tv_sec - start_tot.tv_sec); - elapsed_tot += (finish_tot.tv_nsec - start_tot.tv_nsec) / 1000000000.0; - - - if(verbose) - { - printf("\tFound %lu clusters\n",(uint64_t)actual_centers.count); - printf("\tTotal time: %.3lfs\n\n", elapsed_tot); - } - - c_all.n = n; - return c_all; -} - -clusters_t Heuristic1(global_context_t *ctx, int verbose) -{ - /* - * Heurisitc 1, from paper of Errico, Facco, Laio & Rodriguez - * ( https://doi.org/10.1016/j.ins.2021.01.010 ) - * - * args: - */ - datapoint_info_t* dp_info = ctx -> local_datapoints; - idx_t n = ctx -> local_n_points; - - struct timespec start_tot, finish_tot; - double elapsed_tot; - - if(verbose) - { - printf("H1: Preliminary cluster assignment\n"); - clock_gettime(CLOCK_MONOTONIC, &start_tot); - } - - TIME_DEF; - - - //idx_t ncenters = 0; - //idx_t putativeCenters = n; - lu_dynamic_array_t all_centers, removed_centers, actual_centers, max_rho; - lu_dynamic_array_allocate(&all_centers); - lu_dynamic_array_allocate(&removed_centers); - lu_dynamic_array_allocate(&actual_centers); - lu_dynamic_array_allocate(&max_rho); - - datapoint_info_t** dp_info_ptrs = (datapoint_info_t**)MY_MALLOC(n*sizeof(datapoint_info_t*)); - - struct timespec start, finish; - double elapsed; - - - if(verbose) - { - clock_gettime(CLOCK_MONOTONIC, &start); - } - - - /* exchage halo G */ - - /* compute displs */ - int* sdispls = (int*)calloc(ctx -> world_size , sizeof(int)); - int* rdispls = (int*)calloc(ctx -> world_size , sizeof(int)); - - int tot_count_send = ctx -> n_halo_points_send[0]; - int tot_count_recv = ctx -> n_halo_points_recv[0]; - for(int i = 1; i < ctx -> world_size; ++i) - { - sdispls[i] = sdispls[i - 1] + ctx -> n_halo_points_send[i - 1]; - rdispls[i] = rdispls[i - 1] + ctx -> n_halo_points_recv[i - 1]; - - tot_count_send += ctx -> n_halo_points_send[i]; - tot_count_recv += ctx -> n_halo_points_recv[i]; - } - - float_t* gs_send = (float_t*)MY_MALLOC(tot_count_send * sizeof(float_t)); - float_t* gs_recv = (float_t*)MY_MALLOC(tot_count_recv * sizeof(float_t)); - idx_t* ks_send = (idx_t*)MY_MALLOC(tot_count_send * sizeof(idx_t)); - idx_t* ks_recv = 
(idx_t*)MY_MALLOC(tot_count_recv * sizeof(idx_t)); - - /* pack it in gsend */ - for(int i = 0; i < ctx -> world_size; ++i) - { - for(int j = 0; j < ctx -> n_halo_points_send[i]; ++j) - { - idx_t idx = ctx -> idx_halo_points_send[i][j] - ctx -> idx_start; - gs_send[j + sdispls[i]] = ctx -> local_datapoints[idx].g; - ks_send[j + sdispls[i]] = ctx -> local_datapoints[idx].kstar; - } - } - - /* - printf("Rank %d", ctx -> mpi_rank); - //for(int i = 0; i < ctx -> world_size; ++i) printf("i %d [ s: %d r: %d ] ", i, ctx -> n_halo_points_send[i], ctx -> n_halo_points_recv[i]); - for(int i = 0; i < ctx -> world_size; ++i) printf("i %d [ s: %d r: %d ] ", i, sdispls[i], rdispls[i]); - printf("\n"); - */ - - - MPI_Barrier(ctx -> mpi_communicator); - /* exchange */ - MPI_Alltoallv(gs_send, ctx -> n_halo_points_send, sdispls, MPI_MY_FLOAT, gs_recv, ctx -> n_halo_points_recv, rdispls, MPI_MY_FLOAT, ctx -> mpi_communicator); - MPI_Alltoallv(ks_send, ctx -> n_halo_points_send, sdispls, MPI_UINT64_T, ks_recv, ctx -> n_halo_points_recv, rdispls, MPI_UINT64_T, ctx -> mpi_communicator); - - /* unpack */ - - for(int i = 0; i < ctx -> world_size; ++i) - { - for(int j = 0; j < ctx -> n_halo_points_recv[i]; ++j) - { - ctx -> halo_datapoints[i][j].g = gs_recv[rdispls[i] + j]; - ctx -> halo_datapoints[i][j].kstar = ks_recv[rdispls[i] + j]; - ctx -> halo_datapoints[i][j].array_idx = ctx -> idx_halo_points_recv[i][j]; - } - } - - /* proceed */ - - - for(idx_t i = 0; i < n; ++i) - { - /* - * Find the centers of the clusters as the points of higher density in their neighborhoods - * A point is tagged as a putative center if it is the point of higer density of its neighborhood - */ - - dp_info_ptrs[i] = dp_info + i; - idx_t maxk = dp_info[i].kstar + 1; - float_t gi = dp_info[i].g; - dp_info[i].is_center = 1; - dp_info[i].cluster_idx = -1; - //printf("%lf\n",p -> g); - heap i_ngbh = dp_info[i].ngbh; - for(idx_t k = 1; k < maxk; ++k) - { - idx_t ngbh_index = i_ngbh.data[k].array_idx; - datapoint_info_t dj = find_possibly_halo_datapoint(ctx, ngbh_index); - float_t gj = dj.g; - if(gj > gi){ - dp_info[i].is_center = 0; - break; - } - } - if(dp_info[i].is_center) - { - //lu_dynamic_array_pushBack(&all_centers, dp_info[i].array_idx); - lu_dynamic_array_pushBack(&all_centers, i); - } - - - } - - - if(verbose) - { - clock_gettime(CLOCK_MONOTONIC, &finish); - elapsed = (finish.tv_sec - start.tv_sec); - elapsed += (finish.tv_nsec - start.tv_nsec) / 1000000000.0; - printf("\tFinding putative centers: %.3lfs\n",elapsed); - clock_gettime(CLOCK_MONOTONIC, &start); - } - - - /* - * optimized version - * - * Generate a mask that keeps track of the point has been eliminating the - * point considered. 
Each thread updates this mask, then after the procedure - * ends, center, removed centers, and max_rho arrays are populated - */ - - idx_t* to_remove_mask = (idx_t*)MY_MALLOC(n*sizeof(idx_t)); - for(idx_t p = 0; p < n; ++p) {to_remove_mask[p] = MY_SIZE_MAX;} - qsort(dp_info_ptrs, n, sizeof(datapoint_info_t*), cmpPP); - - /* check into internal nodes */ - - /* - * to remove - * and to reimplement it using rma - * for dp in datapoints: - * for nghb in neighbors: - * if ngbh its_mine - * no_problems, do it as always - * else: - * ngbh is an external node - * do rma things, - * in particular, acquire a lock on the window, read and write things - * then close - */ - #pragma omp parallel shared(to_remove_mask) - { - #pragma omp for - for(idx_t p = 0; p < n; ++p) - { - datapoint_info_t pp = *(dp_info_ptrs[p]); - int flag = 0; - idx_t ppp = 0; - - for(idx_t j = 1; j < pp.kstar + 1; ++j) - { - idx_t jidx = pp.ngbh.data[j].array_idx; - idx_t jpos = jidx - ctx -> idx_start; - - if(its_mine(ctx, jidx)) - { - if(dp_info[jpos].is_center && pp.g > dp_info[jpos].g) - { - - #pragma omp critical - { - ppp = to_remove_mask[jpos]; - datapoint_info_t ppp_datapoint; - flag = ppp != MY_SIZE_MAX; - if(flag) - { - ppp_datapoint = find_possibly_halo_datapoint(ctx, ppp); - to_remove_mask[jpos] = pp.g > ppp_datapoint.g ? pp.array_idx : ppp_datapoint.array_idx; - } - else - { - to_remove_mask[jpos] = pp.array_idx; - } - } - } - } - } - } - } - - /* check external nodes */ - for(int i = 0; i < ctx -> world_size; ++i) - { - for(int j = 0; j < ctx -> n_halo_points_recv[i]; ++j) - { - datapoint_info_t ext_dp = ctx -> halo_datapoints[i][j]; - - int flag = 0; - idx_t ppp = 0; - - for(idx_t k = 0; k < ext_dp.kstar + 1; ++k) - { - idx_t kidx = ext_dp.ngbh.data[k].array_idx; - idx_t kpos = kidx - ctx -> idx_start; - - if(its_mine(ctx, kidx)) - { - if(dp_info[kpos].is_center && ext_dp.g > dp_info[kpos].g) - { - #pragma omp critical - { - ppp = to_remove_mask[kpos]; - datapoint_info_t ppp_datapoint; - flag = ppp != MY_SIZE_MAX; - if(flag) - { - ppp_datapoint = find_possibly_halo_datapoint(ctx, ppp); - to_remove_mask[kpos] = ext_dp.g > ppp_datapoint.g ? 
ext_dp.array_idx : ppp_datapoint.array_idx; - } - else - { - to_remove_mask[kpos] = ext_dp.array_idx; - } - } - } - } - } - } - } - - /* populate the usual arrays */ - for(idx_t p = 0; p < all_centers.count; ++p) - { - idx_t i = all_centers.data[p]; - int e = 0; - //float_t gi = dp_info[i].g; - idx_t mr = to_remove_mask[i]; - if(mr != MY_SIZE_MAX) - { - //if(dp_info[mr].g > gi) e = 1; - e = 1; - } - switch (e) - { - case 1: - { - lu_dynamic_array_pushBack(&removed_centers,i); - dp_info[i].is_center = 0; - for(idx_t c = 0; c < removed_centers.count - 1; ++c) - { - if(mr == removed_centers.data[c]) - { - mr = max_rho.data[c]; - } - } - lu_dynamic_array_pushBack(&max_rho,mr); - - } - break; - - case 0: - { - lu_dynamic_array_pushBack(&actual_centers,i); - dp_info[i].cluster_idx = actual_centers.count - 1; - } - break; - - default: - break; - } - } - free(to_remove_mask); - - if(verbose) - { - clock_gettime(CLOCK_MONOTONIC, &finish); - elapsed = (finish.tv_sec - start.tv_sec); - elapsed += (finish.tv_nsec - start.tv_nsec) / 1000000000.0; - printf("\tFinding actual centers: %.3lfs\n",elapsed); - clock_gettime(CLOCK_MONOTONIC, &start); - } - - int n_centers = (int)actual_centers.count; - int tot_centers; - MPI_Allreduce(&n_centers, &tot_centers, 1, MPI_INT, MPI_SUM, ctx -> mpi_communicator); - // MPI_Allreduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm) - /* - DB_PRINT("rank %d got %d heheh\n", ctx -> mpi_rank, (int)actual_centers.count); - DB_PRINT("rank %d tc %d rc %d\n", ctx -> mpi_rank, (int)all_centers.count, (int)removed_centers.count); - */ - - /* bring on master all centers - * order them in ascending order of density, - * then re-scatter them around to get unique cluster labels */ - - - center_t* private_centers_buffer = (center_t*)MY_MALLOC(actual_centers.count * sizeof(center_t)); - - center_t* global_centers_buffer = (center_t*)MY_MALLOC(tot_centers * sizeof(center_t)); - - for(int i = 0; i < actual_centers.count; ++i) - { - idx_t idx = actual_centers.data[i] ; - private_centers_buffer[i].density = dp_info[idx].g; - private_centers_buffer[i].idx = dp_info[idx].array_idx; - } - MPI_Datatype mpi_center_type; - - HERE - - MPI_Type_contiguous(sizeof(center_t), MPI_BYTE, &mpi_center_type); - MPI_Type_commit(&mpi_center_type); - - int* center_counts = (int*)MY_MALLOC(ctx -> world_size * sizeof(int)); - int* center_displs = (int*)MY_MALLOC(ctx -> world_size * sizeof(int)); - - int cc = (int)actual_centers.count; - MPI_Allgather(&cc, 1, MPI_INT, center_counts, 1, MPI_INT, ctx -> mpi_communicator); - - center_displs[0] = 0; - for(int i = 1; i < ctx -> world_size; ++i) center_displs[i] = center_displs[i - 1] + center_counts[i - 1]; - - /* alternative to check if something breaks */ - for(int i = 0; i < actual_centers.count; ++i) - { - idx_t idx = actual_centers.data[i]; - dp_info[idx].cluster_idx += center_displs[ctx -> mpi_rank]; - - } - - - MPI_Gatherv(private_centers_buffer, (int)actual_centers.count, mpi_center_type, global_centers_buffer, center_counts, center_displs , mpi_center_type, 0, ctx -> mpi_communicator); - - - if(I_AM_MASTER) - { - qsort(global_centers_buffer, (size_t)tot_centers, sizeof(center_t), compare_center); - //qsort(void *base, size_t nmemb, size_t size, __compar_fn_t compar) - for(int i = 0; i < tot_centers; ++i) global_centers_buffer[i].cluster_idx = i; - } - - MPI_Barrier(ctx -> mpi_communicator); - MPI_Bcast(global_centers_buffer, tot_centers, mpi_center_type, 0, ctx -> mpi_communicator); - - - 
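    /* Note on the step just above (descriptive comment, not present in the original):
     * the qsort-then-broadcast is what makes cluster labels globally unique and
     * consistent. Rank 0 orders the gathered centers by density, overwrites each
     * cluster_idx with its position in that ordering, and the MPI_Bcast hands the
     * same table back to every rank; the loop below then copies the global label
     * onto each locally owned center. compare_center itself is defined elsewhere in
     * the code base and is not shown in this hunk -- the contract it is assumed to
     * satisfy (ascending density, as the comment above states) is sketched here
     * purely for illustration and may differ from the real definition:
     *
     *     int compare_center_sketch(const void *a, const void *b)
     *     {
     *         const center_t *ca = (const center_t*)a;
     *         const center_t *cb = (const center_t*)b;
     *         if(ca -> density < cb -> density) return -1;   // ascending density
     *         if(ca -> density > cb -> density) return  1;
     *         return 0;
     *     }
     */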
MPI_Type_free(&mpi_center_type); - - for(int i = 0; i < tot_centers; ++i) - { - if(its_mine(ctx, global_centers_buffer[i].idx)) - { - idx_t idx = global_centers_buffer[i].idx - ctx -> idx_start; - dp_info[idx].cluster_idx = global_centers_buffer[i].cluster_idx; - } - } - - free(private_centers_buffer); - free(global_centers_buffer); - - free(center_counts); - free(center_displs); - - HERE - - /* - * Sort all the dp_info based on g and then perform the cluster assignment - * in asceding order - */ - - /* - * Sort all the dp_info based on g and then perform the cluster assignment - * in asceding order - */ - - int completed = 0; - - int* cl_send = (int*)MY_MALLOC(tot_count_send * sizeof(int)); - int* cl_recv = (int*)MY_MALLOC(tot_count_recv * sizeof(int)); - - - - while(!completed) - { - completed = 1; - for(int i = 0; i < ctx -> world_size; ++i) - { - for(int j = 0; j < ctx -> n_halo_points_send[i]; ++j) - { - idx_t idx = ctx -> idx_halo_points_send[i][j] - ctx -> idx_start; - cl_send[j + sdispls[i]] = ctx -> local_datapoints[idx].cluster_idx; - } - } - - - - MPI_Barrier(ctx -> mpi_communicator); - /* exchange */ - MPI_Alltoallv(cl_send, ctx -> n_halo_points_send, sdispls, MPI_INT, cl_recv, ctx -> n_halo_points_recv, rdispls, MPI_INT, ctx -> mpi_communicator); - - /* unpack */ - - for(int i = 0; i < ctx -> world_size; ++i) - { - for(int j = 0; j < ctx -> n_halo_points_recv[i]; ++j) - { - ctx -> halo_datapoints[i][j].cluster_idx = cl_recv[rdispls[i] + j]; - } - } - int proc_points = 0; - /* retrieve assignment - * if some point remain uncompleted then get assignment from external points */ - - for(idx_t i = 0; i < n; ++i) - { - int wait_for_comms = 0; - datapoint_info_t* p = dp_info_ptrs[i]; - if(!(p -> is_center) && p -> cluster_idx < 0) - { - int cluster = -1; - idx_t k = 0; - idx_t p_idx; - idx_t max_k = p -> ngbh.N; - /*assign each particle at the same cluster as the nearest particle of higher density*/ - while( k < max_k - 1 && cluster == -1) - { - - - ++k; - p_idx = p -> ngbh.data[k].array_idx; - datapoint_info_t p_retrieved = find_possibly_halo_datapoint(ctx, p_idx); - - int flag = p_retrieved.g > p -> g; - - if(p_retrieved.g > p -> g) - { - cluster = p_retrieved.cluster_idx; - wait_for_comms = 1; - break; - } - } - - - if(cluster == -1 && !wait_for_comms) - { - float_t gmax = -99999.; - idx_t gm_index = SIZE_MAX; - for(idx_t k = 0; k < max_k; ++k) - { - idx_t ngbh_index = p -> ngbh.data[k].array_idx; - for(idx_t m = 0; m < removed_centers.count; ++m) - { - idx_t max_rho_idx = max_rho.data[m]; - datapoint_info_t dp_max_rho = find_possibly_halo_datapoint(ctx, max_rho_idx); - //datapoint_info_t dp_max_rho = dp_info[max_rho_idx]; - float_t gcand = dp_max_rho.g; - if(ngbh_index == removed_centers.data[m] && gcand > gmax) - { - //printf("%lu -- %lu\n", ele, m); - gmax = gcand; - gm_index = max_rho.data[m]; - } - } - } - if(gm_index != SIZE_MAX) - { - datapoint_info_t dp_gm = find_possibly_halo_datapoint(ctx, gm_index); - //datapoint_info_t dp_gm = dp_info[gm_index]; - cluster = dp_gm.cluster_idx; - } - - } - p -> cluster_idx = cluster; - - } - completed = completed && p -> cluster_idx != -1; - proc_points += p -> cluster_idx != -1 ? 
1 : 0; - - } - - /* - for(idx_t i = 0; i < n; ++i) - { - if(dp_info[i].cluster_idx > tot_centers) DB_PRINT("rank %d found %lu cidx %d \n", ctx -> mpi_rank, dp_info[i].array_idx, dp_info[i].cluster_idx); - } - */ - - - //DB_PRINT("rank %d proc %d\n", ctx -> mpi_rank, proc_points); - MPI_Allreduce(MPI_IN_PLACE, &completed, 1, MPI_INT, MPI_SUM, ctx -> mpi_communicator); - completed = completed == ctx -> world_size ? 1 : 0; - /* copy cluster idx into buffer */ - - - } - - free(cl_send); - free(cl_recv); - - if(verbose) - { - clock_gettime(CLOCK_MONOTONIC, &finish); - elapsed = (finish.tv_sec - start.tv_sec); - elapsed += (finish.tv_nsec - start.tv_nsec) / 1000000000.0; - printf("\tTentative clustering: %.3lfs\n",elapsed); - clock_gettime(CLOCK_MONOTONIC, &start); - } - - free(sdispls); - free(rdispls); - free(dp_info_ptrs); - free(max_rho.data); - free(removed_centers.data); - free(all_centers.data); - - #if defined(WRITE_CLUSTER_ASSIGN_H1) - /* densities */ - int* ks = (int*)MY_MALLOC(ctx -> local_n_points * sizeof(int)); - for(int i = 0; i < ctx -> local_n_points; ++i) ks[i] = ctx -> local_datapoints[i].cluster_idx; - - ordered_buffer_to_file(ctx, ks, sizeof(int), ctx -> local_n_points, "bb/cl.npy"); - ordered_data_to_file(ctx); - free(ks); - #endif - - - clusters_t c_all; - c_all.centers = actual_centers; - - - if(verbose) - { - clock_gettime(CLOCK_MONOTONIC, &finish); - elapsed = (finish.tv_sec - start.tv_sec); - elapsed += (finish.tv_nsec - start.tv_nsec) / 1000000000.0; - printf("\tFinalizing clustering: %.3lfs\n",elapsed); - printf("\n"); - } - - clock_gettime(CLOCK_MONOTONIC, &finish_tot); - elapsed_tot = (finish_tot.tv_sec - start_tot.tv_sec); - elapsed_tot += (finish_tot.tv_nsec - start_tot.tv_nsec) / 1000000000.0; - - - if(verbose) - { - printf("\tFound %lu clusters\n",(uint64_t)actual_centers.count); - printf("\tTotal time: %.3lfs\n\n", elapsed_tot); - } - - c_all.n = n; - return c_all; -} - -void Heuristic2_rma(global_context_t* ctx, clusters_t* cluster) -{ - /* - * Port the current implementation to this using rma - * then perform the matrix reduction - * - * Each one computes its borders, then the borders are shared, and the matrix is - * reduced to a single huge matrix of borders - * - */ - - int verbose = 0; - - datapoint_info_t* dp_info = ctx -> local_datapoints; - - MPI_Win dp_info_win, ngbh_win; - MPI_Win_create(ctx -> local_datapoints, ctx -> local_n_points * sizeof(datapoint_info_t), 1, MPI_INFO_NULL, ctx -> mpi_communicator, &dp_info_win); - MPI_Win_create(ctx -> __local_heap_buffers, ctx -> local_n_points * ctx -> k * sizeof(heap_node), 1, MPI_INFO_NULL, ctx -> mpi_communicator, &ngbh_win); - - MPI_Win_fence(0, dp_info_win); - MPI_Win_fence(0, ngbh_win); - - MPI_Win_lock_all(0, dp_info_win); - MPI_Win_lock_all(0, ngbh_win); - MPI_Barrier(ctx -> mpi_communicator); - - - - #define borders cluster->borders - - struct timespec start_tot, finish_tot; - double elapsed_tot; - idx_t n = ctx -> local_n_points; - - if(verbose) - { - printf("H2: Finding border points\n"); - clock_gettime(CLOCK_MONOTONIC, &start_tot); - } - - idx_t nclus = cluster->centers.count; - idx_t max_k = dp_info[0].ngbh.N; - - for(idx_t i = 0; i < n; ++i) - { - idx_t pp = NOBORDER; - /*loop over n neighbors*/ - int c = dp_info[i].cluster_idx; - if(!dp_info[i].is_center) - { - for(idx_t k = 1; k < dp_info[i].kstar + 1; ++k) - { - /*index of the kth ngbh of n*/ - idx_t j = dp_info[i].ngbh.data[k].array_idx; - pp = NOBORDER; - /*Loop over kn neigbhours to find if n is the nearest*/ - /*if cluster of the 
particle in nbhg is c then check is neighborhood*/ - - //DB_PRINT("my rank %d l %lu %lu %d\n", ctx -> mpi_rank, k, j, foreign_owner(ctx, j)); - datapoint_info_t j_dp = find_possibly_halo_datapoint_rma(ctx, j, dp_info_win); - if(j_dp.cluster_idx != c) - { - pp = j; - break; - } - - } - } - - if(pp != NOBORDER) - { - datapoint_info_t pp_dp = find_possibly_halo_datapoint_rma(ctx, pp, dp_info_win); - for(idx_t k = 1; k < max_k; ++k) - { - //TODO: can optimize it can retrieve the whole ngbh in one shot - idx_t pp_ngbh_idx = get_j_ksel_idx(ctx, pp_dp.array_idx, k, ngbh_win); - if(pp_ngbh_idx == dp_info[i].array_idx) - { - break; - } - - datapoint_info_t pp_ngbh_dp = find_possibly_halo_datapoint_rma(ctx, pp_ngbh_idx, ngbh_win); - if(pp_ngbh_dp.cluster_idx == c) - { - pp = NOBORDER; - break; - } - } - } - - /*if it is the maximum one add it to the cluster*/ - if(pp != NOBORDER) - { - datapoint_info_t j_dp = find_possibly_halo_datapoint_rma(ctx, pp, dp_info_win); - int ppc = j_dp.cluster_idx; - //insert one and symmetric one - sparse_border_t b = {.i = c, .j = ppc, .idx = ctx -> local_datapoints[i].array_idx, .density = ctx -> local_datapoints[i].g, .error = ctx -> local_datapoints[i].log_rho_err}; - sparse_border_insert(cluster, b); - ////get symmetric border - sparse_border_t bsym = {.i = ppc, .j = c, .idx = ctx -> local_datapoints[i].array_idx, .density = ctx -> local_datapoints[i].g, .error = ctx -> local_datapoints[i].log_rho_err}; - sparse_border_insert(cluster, bsym); - } - - - } - - - for(idx_t c = 0; c < nclus; ++c) - { - for(idx_t el = 0; el < cluster -> sparse_borders[c].count; ++el) - { - //fix border density, write log rho c - idx_t idx = cluster -> sparse_borders[c].data[el].idx; - datapoint_info_t idx_dp = find_possibly_halo_datapoint_rma(ctx, idx, dp_info_win); - cluster -> sparse_borders[c].data[el].density = idx_dp.log_rho_c; - } - } - - - if(verbose) - { - clock_gettime(CLOCK_MONOTONIC, &finish_tot); - elapsed_tot = (finish_tot.tv_sec - start_tot.tv_sec); - elapsed_tot += (finish_tot.tv_nsec - start_tot.tv_nsec) / 1000000000.0; - printf("\tTotal time: %.3lfs\n\n", elapsed_tot); - } - - MPI_Barrier(ctx -> mpi_communicator); - MPI_Win_unlock_all(ngbh_win); - MPI_Win_unlock_all(dp_info_win); - - - /* Border reduce */ - - idx_t num_border_el = 0; - for(idx_t i = 0; i < nclus; ++i) - { - num_border_el += cluster -> sparse_borders[i].count; - } - - MPI_DB_PRINT("Master found %lu border elements\n", num_border_el); - - int i_have_sent = 0; - int level = 1; - int ranks = ctx -> world_size; - - #define SEND 1 - #define RECV 0 - #define DO_NOTHING -1 - - MPI_Barrier(ctx -> mpi_communicator); - - while(ranks > 1) - { - int dp = ranks % 2; - ranks = ranks / 2 + dp; - int send_rcv = ctx -> mpi_rank >= ranks; - - if(dp && ctx -> mpi_rank == ranks - 1) send_rcv = DO_NOTHING; - - switch (send_rcv) - { - case SEND: - if(!i_have_sent) - { - idx_t num_border_el = 0; - for(idx_t i = 0; i < nclus; ++i) - { - num_border_el += cluster -> sparse_borders[i].count; - } - sparse_border_t* borders_to_send = (sparse_border_t*)MY_MALLOC(num_border_el * sizeof(sparse_border_t)); - - idx_t count = 0; - for(idx_t i = 0; i < nclus; ++i) - { - for(idx_t j = 0; j < cluster -> sparse_borders[i].count; ++j) - { - borders_to_send[count] = cluster -> sparse_borders[i].data[j]; - count++; - } - } - - int rank_to_send = ctx -> mpi_rank - ranks; - DB_PRINT("-- Rank %d sending to %d\n", ctx -> mpi_rank, rank_to_send); - - MPI_Send(&num_border_el, 1, MPI_UINT64_T, rank_to_send, rank_to_send, ctx -> mpi_communicator); - - 
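    /* Descriptive note on the handshake here: the element count is sent first so the
     * receiver can size its buffer, then the borders follow as a raw byte stream.
     * The tag is the destination rank, which pairs with the MPI_Recv calls in the
     * RECV branch below (MPI_ANY_SOURCE, tag equal to the receiving rank). Across
     * the whole while(ranks > 1) loop the upper half of the ranks ships its borders
     * to the lower half, each rank sending at most once (i_have_sent), so when the
     * loop ends rank 0 holds the union of all sparse borders, merged through
     * sparse_border_insert(). */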
MPI_Send(borders_to_send, num_border_el * sizeof(sparse_border_t), MPI_BYTE , rank_to_send, rank_to_send, ctx -> mpi_communicator); - - i_have_sent = 1; - free(borders_to_send); - } - break; - case RECV: - { - DB_PRINT("** Rank %d recieving\n", ctx -> mpi_rank); - idx_t num_borders_recv = 0; - MPI_Recv(&num_borders_recv, 1, MPI_UINT64_T, MPI_ANY_SOURCE, ctx -> mpi_rank, ctx -> mpi_communicator, MPI_STATUS_IGNORE); - - sparse_border_t* borders_to_recv = (sparse_border_t*)MY_MALLOC(num_borders_recv * sizeof(sparse_border_t)); - - MPI_Recv(borders_to_recv, num_borders_recv * sizeof(sparse_border_t), MPI_BYTE , MPI_ANY_SOURCE, ctx -> mpi_rank, ctx -> mpi_communicator, MPI_STATUS_IGNORE); - - for(int i = 0; i < num_borders_recv; ++i) - { - sparse_border_insert(cluster, borders_to_recv[i]); - } - free(borders_to_recv); - } - break; - default: - DB_PRINT(".. Rank %d doing nothing\n", ctx -> mpi_rank); - break; - } - MPI_Barrier(ctx -> mpi_communicator); - MPI_DB_PRINT("-----------------\n"); - - } - - num_border_el = 0; - for(idx_t i = 0; i < nclus; ++i) - { - num_border_el += cluster -> sparse_borders[i].count; - } - - MPI_DB_PRINT("Master final %lu border elements\n", num_border_el); - - #undef SEND - #undef RECV - - #ifdef WRITE_BORDERS - if(I_AM_MASTER) - { - FILE* f = fopen("bb/borders.csv", "w"); - for(idx_t i = 0; i < nclus; ++i) - { - for(int j = 0; j < cluster -> sparse_borders[i].count; ++j) - { - sparse_border_t b = cluster -> sparse_borders[i].data[j]; - fprintf(f, "%lu,%lu,%lu,%lf\n", b.i, b.j, b.idx, b.density); - } - } - fclose(f); - } - - - #endif - - return; - #undef borders - - } - -void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx) -{ - float_t *data; - TIME_DEF - double elapsed_time; - - if (ctx->mpi_rank == 0) - { - //data = read_data_file(ctx, "../norm_data/50_blobs_more_var.npy", MY_TRUE); - //ctx->dims = 2; - //data = read_data_file(ctx, "../norm_data/50_blobs.npy", MY_TRUE); - // std_g0163178_Me14_091_0000 - - // 190M points - // std_g2980844_091_0000 - // data = read_data_file(ctx,"../norm_data/std_g2980844_091_0000",MY_TRUE); - - /* 1M points ca.*/ - // data = read_data_file(ctx,"../norm_data/std_LR_091_0001",MY_TRUE); - - /* BOX */ - // data = read_data_file(ctx,"../norm_data/std_Box_256_30_092_0000",MY_TRUE); - - /* 8M points */ - - // data = read_data_file(ctx,"../norm_data/std_g0144846_Me14_091_0001",MY_TRUE); - - //88M - data = read_data_file(ctx,"../norm_data/std_g5503149_091_0000",MY_TRUE); - - // - //34 M - // data = read_data_file(ctx,"../norm_data/std_g1212639_091_0001",MY_TRUE); - ctx->dims = 5; - - //ctx -> n_points = 5 * 100000; - ctx->n_points = ctx->n_points / ctx->dims; - //ctx->n_points = (ctx->n_points * 5) / 10; - // ctx -> n_points = ctx -> world_size * 1000; - - //ctx -> n_points = 10000000 * ctx -> world_size; - //generate_random_matrix(&data, ctx -> dims, ctx -> n_points, ctx); - //mpi_printf(ctx, "Read %lu points in %u dims\n", ctx->n_points, ctx->dims); - } - //MPI_DB_PRINT("[MASTER] Reading file and scattering\n"); - - /* communicate the total number of points*/ - MPI_Bcast(&(ctx->dims), 1, MPI_UINT32_T, 0, ctx->mpi_communicator); - MPI_Bcast(&(ctx->n_points), 1, MPI_UINT64_T, 0, ctx->mpi_communicator); - - /* compute the number of elements to recieve for each processor */ - int *send_counts = (int *)MY_MALLOC(ctx->world_size * sizeof(int)); - int *displacements = (int *)MY_MALLOC(ctx->world_size * sizeof(int)); - - displacements[0] = 0; - send_counts[0] = ctx->n_points / ctx->world_size; - send_counts[0] += 
(ctx->n_points % ctx->world_size) > 0 ? 1 : 0; - send_counts[0] = send_counts[0] * ctx->dims; - - for (int p = 1; p < ctx->world_size; ++p) - { - send_counts[p] = (ctx->n_points / ctx->world_size); - send_counts[p] += (ctx->n_points % ctx->world_size) > p ? 1 : 0; - send_counts[p] = send_counts[p] * ctx->dims; - displacements[p] = displacements[p - 1] + send_counts[p - 1]; - } - - - ctx->local_n_points = send_counts[ctx->mpi_rank] / ctx->dims; - - float_t *pvt_data = (float_t *)MY_MALLOC(send_counts[ctx->mpi_rank] * sizeof(float_t)); - - MPI_Scatterv(data, send_counts, displacements, MPI_MY_FLOAT, pvt_data, send_counts[ctx->mpi_rank], MPI_MY_FLOAT, 0, ctx->mpi_communicator); - - ctx->local_data = pvt_data; - - int k_local = 20; - int k_global = 20; - - uint64_t *global_bin_counts_int = (uint64_t *)MY_MALLOC(k_global * sizeof(uint64_t)); - - pointset_t original_ps; - original_ps.data = ctx->local_data; - original_ps.dims = ctx->dims; - original_ps.n_points = ctx->local_n_points; - original_ps.lb_box = (float_t*)MY_MALLOC(ctx -> dims * sizeof(float_t)); - original_ps.ub_box = (float_t*)MY_MALLOC(ctx -> dims * sizeof(float_t)); - - float_t tol = 0.002; - - top_kdtree_t tree; - TIME_START; - top_tree_init(ctx, &tree); - elapsed_time = TIME_STOP; - LOG_WRITE("Initializing global kdtree", elapsed_time); - - TIME_START; - build_top_kdtree(ctx, &original_ps, &tree, k_global, tol); - exchange_points(ctx, &tree); - elapsed_time = TIME_STOP; - LOG_WRITE("Top kdtree build and domain decomposition", elapsed_time); - //test_the_idea(ctx); - - TIME_START; - kdtree_v2 local_tree; - kdtree_v2_init( &local_tree, ctx -> local_data, ctx -> local_n_points, (unsigned int)ctx -> dims); - int k = 300; - //int k = 30; - - datapoint_info_t* dp_info = (datapoint_info_t*)MY_MALLOC(ctx -> local_n_points * sizeof(datapoint_info_t)); - /* initialize, to cope with valgrind */ - for(uint64_t i = 0; i < ctx -> local_n_points; ++i) - { - dp_info[i].ngbh.data = NULL; - dp_info[i].ngbh.N = 0; - dp_info[i].ngbh.count = 0; - dp_info[i].g = 0.f; - dp_info[i].log_rho = 0.f; - dp_info[i].log_rho_c = 0.f; - dp_info[i].log_rho_err = 0.f; - dp_info[i].array_idx = -1; - dp_info[i].kstar = -1; - dp_info[i].is_center = -1; - dp_info[i].cluster_idx = -1; - } - - build_local_tree(ctx, &local_tree); - elapsed_time = TIME_STOP; - LOG_WRITE("Local trees init and build", elapsed_time); - - TIME_START; - MPI_DB_PRINT("----- Performing ngbh search -----\n"); - MPI_Barrier(ctx -> mpi_communicator); - - mpi_ngbh_search(ctx, dp_info, &tree, &local_tree, ctx -> local_data, k); - - MPI_Barrier(ctx -> mpi_communicator); - elapsed_time = TIME_STOP; - LOG_WRITE("Total time for all knn search", elapsed_time) - - - - TIME_START; - //float_t id = id_estimate(ctx, dp_info, ctx -> local_n_points, 0.9, MY_FALSE); - float_t id = compute_ID_two_NN_ML(ctx, dp_info, ctx -> local_n_points, MY_FALSE); - elapsed_time = TIME_STOP; - LOG_WRITE("ID estimate", elapsed_time) - - MPI_DB_PRINT("ID %lf \n",id); - - - //TIME_START; - //datapoint_info_t** foreign_dp_info = (datapoint_info_t**)MY_MALLOC(ctx -> world_size * sizeof(datapoint_info_t*)); - //find_foreign_nodes(ctx, dp_info, foreign_dp_info); - //elapsed_time = TIME_STOP; - //LOG_WRITE("Finding points to request the ngbh", elapsed_time) - - TIME_START; - - ctx -> local_datapoints = dp_info; - //compute_density_kstarnn_rma(ctx, id, MY_FALSE); - compute_density_kstarnn_rma_v2(ctx, id, MY_FALSE); - //compute_density_kstarnn(ctx, id, MY_FALSE); - compute_correction(ctx, 2); - elapsed_time = TIME_STOP; - 
LOG_WRITE("Density estimate", elapsed_time) - - TIME_START; - clusters_t clusters = Heuristic1_rma(ctx, MY_FALSE); - elapsed_time = TIME_STOP; - LOG_WRITE("H1", elapsed_time) - - TIME_START; - clusters_allocate(&clusters, 1); - Heuristic2_rma(ctx, &clusters); - elapsed_time = TIME_STOP; - LOG_WRITE("H2", elapsed_time) - - - /* find density */ - - - - - #if defined (WRITE_NGBH) - ordered_data_to_file(ctx); - #endif - - /* - free(foreign_dp_info); - */ - - - top_tree_free(ctx, &tree); - kdtree_v2_free(&local_tree); - - free(send_counts); - free(displacements); - //free(dp_info); - - if (ctx->mpi_rank == 0) free(data); - - original_ps.data = NULL; - free_pointset(&original_ps); - free(global_bin_counts_int); -} diff --git a/src/tree/tree.h b/src/tree/tree.h index 9725a38336880b98884d646c272276bbbf18488f..1ee44f847f926b3713114ba76165f2aab8795a6d 100644 --- a/src/tree/tree.h +++ b/src/tree/tree.h @@ -2,6 +2,7 @@ #include "../common/common.h" #include "heap.h" #include "kdtreeV2.h" + #include <stdlib.h> #include <stdio.h> #include <memory.h> @@ -84,41 +85,24 @@ typedef struct top_kdtree_t size_t _capacity; struct top_kdtree_node_t* _nodes; struct top_kdtree_node_t* root; - } top_kdtree_t; -typedef struct border_t { - float_t density; - float_t error; - idx_t idx; -} border_t; - -typedef struct sparse_border_t { - idx_t i; - idx_t j; - idx_t idx; - float_t density; - float_t error; -} sparse_border_t; - -typedef struct adj_list_t { - idx_t count; - idx_t size; - struct sparse_border_t* data; -} adj_list_t; - - -typedef struct clusters_t { - int use_sparse_borders; - struct adj_list_t *sparse_borders; - struct lu_dynamic_array_t centers; - struct border_t **borders; - struct border_t *__borders_data; - idx_t n; -} clusters_t; - - -void simulate_master_read_and_scatter(int, size_t, global_context_t* ); + + +void simulate_master_read_and_scatter(int, size_t, global_context_t* ); +float_t* read_data_file(global_context_t *ctx, const char *fname, const int file_in_float32); +void top_tree_init(global_context_t *ctx, top_kdtree_t *tree); +void build_top_kdtree(global_context_t *ctx, pointset_t *og_pointset, top_kdtree_t *tree, int n_bins, float_t tolerance); +void exchange_points(global_context_t* ctx, top_kdtree_t* tree); +void build_local_tree(global_context_t* ctx, kdtree_v2* local_tree); +void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtree_t* top_tree, kdtree_v2* local_tree, float_t* data, int k); +float_t compute_ID_two_NN_ML(global_context_t* ctx, datapoint_info_t* dp_info, idx_t n, int verbose); +void compute_density_kstarnn_rma_v2(global_context_t* ctx, const float_t d, int verbose); +void compute_correction(global_context_t* ctx, float_t Z); +int foreign_owner(global_context_t*, idx_t); +void top_tree_free(global_context_t *ctx, top_kdtree_t *tree); + +