diff --git a/.ipynb_checkpoints/var-checkpoint.py b/.ipynb_checkpoints/var-checkpoint.py
new file mode 100644
index 0000000000000000000000000000000000000000..708cc54871c1a4c9d1ce62bd903ec4960a193898
--- /dev/null
+++ b/.ipynb_checkpoints/var-checkpoint.py
@@ -0,0 +1,6 @@
+import numpy as np
+
+d = np.fromfile("../norm_data/std_LR_091_0000", dtype=np.float32)
+print(d.shape)
+d = d.reshape((d.shape[0]//5,5))
+print(np.cov(d.T))
diff --git a/Makefile b/Makefile
index 8df32476d9ac91b9171f382b9ea62f1c606e8b33..9c34660ebdf14daa9a557904972344b9d18bc740 100644
--- a/Makefile
+++ b/Makefile
@@ -4,7 +4,7 @@ LDFLAGS=-lm
 
 all: main
 
-obj=src/main/main.c src/tree/tree.c src/common/common.c src/tree/kdtreeV2.c src/tree/heap.c
+obj=src/main/main.c src/tree/tree.c src/common/common.c src/tree/kdtreeV2.c src/tree/heap.c src/adp/adp.c
 
 main: ${obj} 
 	${CC} ${CFLAGS} ${LDFLAGS} ${obj} -o $@
diff --git a/src/adp/adp.c b/src/adp/adp.c
new file mode 100644
index 0000000000000000000000000000000000000000..064fd7583c94c60077384d9110a0e74f6e0b72d0
--- /dev/null
+++ b/src/adp/adp.c
@@ -0,0 +1,2114 @@
+#include "adp.h"
+
+const border_t border_null = {.density = -1.0, .error = 0, .idx = NOBORDER};
+const sparse_border_t sparse_border_null = {.density = -1.0, .error = 0, .idx = NOBORDER, .i = NOBORDER, .j = NOBORDER};
+
+float_t mEst2(float_t * x, float_t *y, idx_t n)
+{
+
+    /*
+     * Estimate the m coefficient of a straight 
+     * line passing through the origin          
+     * params:                                  
+     * - x: x values of the points              
+     * - y: y values of the points              
+     * - n: size of the arrays                  
+     */
+     
+    float_t num = 0;
+    float_t den = 0;
+    for(idx_t i = 0; i < n; ++i)
+    {
+        float_t xx = x[i];
+        float_t yy = y[i];
+
+        num += xx*yy;
+        den += xx*xx;
+    }
+  
+    return num/den;
+}
+
+float_t compute_ID_two_NN_ML(global_context_t* ctx, datapoint_info_t* dp_info, idx_t n, int verbose)
+{
+
+    /*
+     * Estimation of the intrinsic dimension of a dataset (TWO-NN estimator)
+     * args:
+     * - dp_info: array of local datapoint structs
+     * - n: number of local datapoints
+     * intrinsic_dim = (N - 1) / sum(log_mus)
+     */
+
+    struct timespec start_tot, finish_tot;
+    double elapsed_tot;
+
+	if(verbose) 
+    {
+		printf("ID estimation:\n");
+		clock_gettime(CLOCK_MONOTONIC, &start_tot);
+	}
+    
+    float_t log_mus = 0;
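+    /* TWO-NN: mu_i = r2/r1 for each point; the ngbh heap stores squared
+     * distances (cf. the pow(value, d/2) volume computation in the density
+     * kernels), hence the 0.5 factor: 0.5 * log(r2^2/r1^2) = log(r2/r1) */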
+    for(idx_t i = 0; i < n; ++i)
+    {
+        log_mus += 0.5 * log(dp_info[i].ngbh.data[2].value/dp_info[i].ngbh.data[1].value);
+    }
+
+    float_t d = 0;
+    MPI_Allreduce(&log_mus, &d, 1, MPI_MY_FLOAT, MPI_SUM, ctx -> mpi_communicator);
+    d = (ctx -> n_points - 1)/d;
+	if(verbose)
+	{
+		clock_gettime(CLOCK_MONOTONIC, &finish_tot);
+		elapsed_tot = (finish_tot.tv_sec - start_tot.tv_sec);
+		elapsed_tot += (finish_tot.tv_nsec - start_tot.tv_nsec) / 1000000000.0;
+		printf("\tID value: %.6lf\n", d);
+		printf("\tTotal time: %.3lfs\n\n", elapsed_tot);
+	}
+
+    return d;
+
+}
+
+float_t get_j_ksel_dist(global_context_t* ctx, idx_t j, idx_t ksel, MPI_Win exposed_ngbh)
+{
+    int owner = foreign_owner(ctx, j);
+    idx_t k = ctx -> k;
+    /* find if datapoint is halo or not */
+    if(owner == ctx -> mpi_rank)
+    {
+        idx_t pos = j - ctx -> idx_start;
+        return ctx -> local_datapoints[pos].ngbh.data[ksel].value;
+    }
+    else
+    {
+        heap_node el;
+        idx_t pos  = j - ctx -> rank_idx_start[owner];
+        MPI_Request request;
+        MPI_Rget(&el, sizeof(heap_node), MPI_BYTE, owner, (MPI_Aint)((pos * k + ksel) * sizeof(heap_node)), sizeof(heap_node), MPI_BYTE, exposed_ngbh, &request);
+        MPI_Wait(&request,MPI_STATUS_IGNORE);
+        return el.value;
+    }                 
+}
+
+idx_t get_j_ksel_idx(global_context_t* ctx, idx_t j, idx_t ksel, MPI_Win exposed_ngbh)
+{
+    int owner = foreign_owner(ctx, j);
+    idx_t k = ctx -> k;
+    /* find if datapoint is halo or not */
+    if(owner == ctx -> mpi_rank)
+    {
+        idx_t pos = j - ctx -> idx_start;
+        return ctx -> local_datapoints[pos].ngbh.data[ksel].array_idx;
+    }
+    else
+    {
+        heap_node el;
+        idx_t pos  = j - ctx -> rank_idx_start[owner];
+        MPI_Request request;
+        MPI_Rget(&el, sizeof(heap_node), MPI_BYTE, owner, (MPI_Aint)((pos * k + ksel) * sizeof(heap_node)), sizeof(heap_node), MPI_BYTE, exposed_ngbh, &request);
+        MPI_Wait(&request,MPI_STATUS_IGNORE);
+        return el.array_idx;
+    }                 
+}
+
+void compute_density_kstarnn_rma(global_context_t* ctx, const float_t d, int verbose){
+
+    /*
+     * Point density computation:
+     * args:
+     * - ctx     : global context holding the local datapoints
+     * - d       : intrinsic dimension of the dataset
+     * - verbose : print timing information
+     */
+
+    MPI_Barrier(ctx -> mpi_communicator);
+    idx_t k = ctx -> local_datapoints[0].ngbh.N;
+
+    struct timespec start_tot, finish_tot;
+    double elapsed_tot;
+
+    datapoint_info_t* local_datapoints = ctx -> local_datapoints;
+
+	if(verbose)
+	{
+		printf("Density and k* estimation:\n");
+		clock_gettime(CLOCK_MONOTONIC, &start_tot);
+	}
+
+    idx_t kMAX = ctx -> local_datapoints[0].ngbh.N - 1;   
+
+    float_t omega = 0.;  
+    if(sizeof(float_t) == sizeof(float)){ omega = powf(PI_F,d/2)/tgammaf(d/2.0f + 1.0f);}  
+    else{omega = pow(M_PI,d/2.)/tgamma(d/2.0 + 1.0);}
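+    /* omega is the volume of the unit d-ball, so a neighborhood of radius r
+     * has volume omega * r^d; since ngbh stores squared distances, this is
+     * computed below as omega * pow(r^2, d/2) */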
+
+
+    MPI_Win exposed_ngbh;
+    MPI_Win_create( ctx -> __local_heap_buffers, 
+                    ctx -> local_n_points * k * sizeof(heap_node), 
+                    1, MPI_INFO_NULL, 
+                    ctx -> mpi_communicator, 
+                    &exposed_ngbh);
+
+    MPI_Win_fence(0, exposed_ngbh);
+    MPI_Win_lock_all(0, exposed_ngbh);
+
+    MPI_Barrier(ctx -> mpi_communicator);
+
+    #pragma omp parallel for
+    for(idx_t i = 0; i < ctx -> local_n_points; ++i)
+    {
+        for(idx_t k = 0; k <= kMAX; ++k)
+        {
+            local_datapoints[i].ngbh.data[k].value = omega * pow(local_datapoints[i].ngbh.data[k].value, d/2.);  
+        }
+    }
+
+    for(idx_t i = 0; i < ctx -> local_n_points; ++i)
+    {
+
+        idx_t j = 4;
+        idx_t k;
+        float_t dL  = 0.;
+        float_t vvi = 0.;
+		float_t vvj = 0.;
+		float_t vp  = 0.;
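+        /* grow the neighborhood while the volume around i and the volume
+         * around its j-th neighbor remain statistically compatible:
+         * dL is a likelihood-ratio-like statistic tested against DTHR */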
+        while(j < kMAX  && dL < DTHR)
+        {
+            idx_t ksel = j - 1;
+            //vvi = omega * pow(local_datapoints[i].ngbh.data[ksel].value,d/2.);
+            vvi = local_datapoints[i].ngbh.data[ksel].value;
+
+            idx_t jj = local_datapoints[i].ngbh.data[j].array_idx;
+
+            /*
+             * note: jj can be a halo point, so its ksel-th distance
+             * may have to be fetched from a foreign rank
+             */
+            //float_t dist_jj = get_j_ksel_dist(ctx, jj, ksel, exposed_ngbh);
+            //vvj = omega * pow(dist_jj,d/2.);
+            vvj = get_j_ksel_dist(ctx, jj, ksel, exposed_ngbh);
+
+            vp = (vvi + vvj)*(vvi + vvj);
+            dL = -2.0 * ksel * log(4.*vvi*vvj/vp);
+            j = j + 1;
+        }
+        if(j == kMAX)
+        {
+            k = j - 1;
+            //vvi = omega * pow(ctx -> local_datapoints[i].ngbh.data[k].value,d/2.);
+            vvi = ctx -> local_datapoints[i].ngbh.data[k].value;
+        }
+        else
+        {
+            k = j - 2;
+        }
+        local_datapoints[i].kstar = k;
+        local_datapoints[i].log_rho = log((float_t)(k)/vvi/((float_t)(ctx -> n_points)));
+        //dp_info[i].log_rho = log((float_t)(k)) - log(vvi) -log((float_t)(points));
+        local_datapoints[i].log_rho_err =   1.0/sqrt((float_t)k); //(float_t)(-Q_rsqrt((float)k));
+        local_datapoints[i].g = local_datapoints[i].log_rho - local_datapoints[i].log_rho_err;
+    }
+
+	if(verbose)
+	{
+		clock_gettime(CLOCK_MONOTONIC, &finish_tot);
+		elapsed_tot = (finish_tot.tv_sec - start_tot.tv_sec);
+		elapsed_tot += (finish_tot.tv_nsec - start_tot.tv_nsec) / 1000000000.0;
+		printf("\tTotal time: %.3lfs\n\n", elapsed_tot);
+	}
+
+    MPI_Win_unlock_all(exposed_ngbh);
+    MPI_Win_fence(0, exposed_ngbh);
+    MPI_Win_free(&exposed_ngbh);
+
+    #if defined(WRITE_DENSITY)
+        /* densities */
+        float_t* den = (float_t*)MY_MALLOC(ctx -> local_n_points * sizeof(float_t));
+        float_t* gs = (float_t*)MY_MALLOC(ctx -> local_n_points * sizeof(float_t));
+        idx_t* ks = (idx_t*)MY_MALLOC(ctx -> local_n_points * sizeof(idx_t));
+
+        for(int i = 0; i < ctx -> local_n_points; ++i) den[i] = ctx -> local_datapoints[i].log_rho;
+        for(int i = 0; i < ctx -> local_n_points; ++i) ks[i] = ctx -> local_datapoints[i].kstar;
+        for(int i = 0; i < ctx -> local_n_points; ++i) gs[i] = ctx -> local_datapoints[i].g;
+
+        ordered_buffer_to_file(ctx, den, sizeof(float_t), ctx -> local_n_points, "bb/ordered_density.npy");
+        ordered_buffer_to_file(ctx, ks, sizeof(idx_t), ctx -> local_n_points, "bb/ks.npy");
+        ordered_buffer_to_file(ctx, gs, sizeof(float_t), ctx -> local_n_points, "bb/g.npy");
+
+        ordered_data_to_file(ctx);
+        free(den);
+        free(ks);
+        free(gs);
+    #endif
+    return;
+
+
+}
+
+void compute_density_kstarnn_rma_v2(global_context_t* ctx, const float_t d, int verbose){
+
+    /*
+     * Point density computation (fenced-RMA variant):
+     * args:
+     * - ctx     : global context holding the local datapoints
+     * - d       : intrinsic dimension of the dataset
+     * - verbose : print timing information
+     */
+
+    MPI_Barrier(ctx -> mpi_communicator);
+    idx_t k = ctx -> local_datapoints[0].ngbh.N;
+
+    struct timespec start_tot, finish_tot;
+    double elapsed_tot;
+
+    datapoint_info_t* local_datapoints = ctx -> local_datapoints;
+
+	if(verbose)
+	{
+		printf("Density and k* estimation:\n");
+		clock_gettime(CLOCK_MONOTONIC, &start_tot);
+	}
+
+    idx_t kMAX = ctx -> local_datapoints[0].ngbh.N - 1;   
+
+    float_t omega = 0.;  
+    if(sizeof(float_t) == sizeof(float)){ omega = powf(PI_F,d/2)/tgammaf(d/2.0f + 1.0f);}  
+    else{omega = pow(M_PI,d/2.)/tgamma(d/2.0 + 1.0);}
+
+
+    MPI_Win exposed_ngbh;
+    MPI_Win_create( ctx -> __local_heap_buffers, 
+                    ctx -> local_n_points * k * sizeof(heap_node), 
+                    1, MPI_INFO_NULL, 
+                    ctx -> mpi_communicator, 
+                    &exposed_ngbh);
+
+    MPI_Win_fence(MPI_MODE_NOPUT, exposed_ngbh);
+
+    MPI_Barrier(ctx -> mpi_communicator);
+
+    #pragma omp parallel for
+    for(idx_t i = 0; i < ctx -> local_n_points; ++i)
+    {
+        for(idx_t k = 0; k <= kMAX; ++k)
+        {
+            local_datapoints[i].ngbh.data[k].value = omega * pow(local_datapoints[i].ngbh.data[k].value, d/2.);  
+        }
+        //initialize kstar at 0
+        local_datapoints[i].kstar = 0;
+    }
+
+    int i_have_finished = 0;
+    int all_have_finished = 0;
+    int finished_points = 0;
+    heap_node* scratch_heap_nodes = (heap_node*)MY_MALLOC(ctx -> local_n_points * sizeof(heap_node));  
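+    /* v2 strategy: instead of one blocking MPI_Rget per neighbor lookup (as in
+     * compute_density_kstarnn_rma), post one MPI_Get per still-unfinished point
+     * for each j, then synchronize them all with a single fence per iteration */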
+
+        for(idx_t j = 4; j < kMAX - 1; ++j)
+        {
+            //request data
+            idx_t ksel = j - 1;
+
+#if defined(THREAD_FUNNELED)
+#else
+            #pragma omp parallel for
+#endif
+
+            for(idx_t i = 0; i < ctx -> local_n_points; ++i)
+            {
+
+                if(ctx -> local_datapoints[i].kstar == 0)
+                {
+                    //vvi = omega * pow(local_datapoints[i].ngbh.data[ksel].value,d/2.);
+
+                    idx_t jj = local_datapoints[i].ngbh.data[j].array_idx;
+
+                    /*
+                     * note: jj can be a halo point, so its ksel-th heap node
+                     * may have to be fetched from a foreign rank
+                     */
+
+                    int owner = foreign_owner(ctx, jj);
+                    idx_t pos = jj - ctx -> rank_idx_start[owner];
+
+                    if(owner == ctx -> mpi_rank)
+                    {
+                        scratch_heap_nodes[i] = ctx -> local_datapoints[pos].ngbh.data[ksel];
+                    }
+                    else
+                    {
+                        MPI_Get(scratch_heap_nodes + i, 
+                                sizeof(heap_node), 
+                                MPI_BYTE, 
+                                owner, 
+                                (MPI_Aint)((pos * (ctx -> k) + ksel) * sizeof(heap_node)), 
+                                sizeof(heap_node), 
+                                MPI_BYTE, 
+                                exposed_ngbh);
+                    }
+                }
+
+            }
+
+            MPI_Win_fence(MPI_MODE_NOPUT,exposed_ngbh); 
+            //process data
+#if defined(THREAD_FUNNELED)
+#else
+            #pragma omp parallel for
+#endif
+            for(idx_t i = 0; i < ctx -> local_n_points; ++i)
+            {
+                if(ctx -> local_datapoints[i].kstar == 0)
+                {
+                    float_t vvi, vvj, vp, dL;
+                    vvi = local_datapoints[i].ngbh.data[ksel].value;
+                    vvj = scratch_heap_nodes[i].value;
+                    vp = (vvi + vvj)*(vvi + vvj);
+                    dL = -2.0 * ksel * log(4.*vvi*vvj/vp);
+
+                    if(dL > DTHR)
+                    {
+                        idx_t k = j - 1;
+                        local_datapoints[i].kstar = k;
+                        local_datapoints[i].log_rho = log((float_t)(k)/vvi/((float_t)(ctx -> n_points)));
+                        local_datapoints[i].log_rho_err =   1.0/sqrt((float_t)k); //(float_t)(-Q_rsqrt((float)k));
+                        local_datapoints[i].g = local_datapoints[i].log_rho - local_datapoints[i].log_rho_err;
+
+                        #pragma omp atomic update
+                        finished_points++;
+                    }
+                }
+
+            }
+
+
+            i_have_finished = (finished_points == ctx -> local_n_points);
+            MPI_Allreduce(&i_have_finished, &all_have_finished, 1, MPI_INT, MPI_LAND, ctx -> mpi_communicator);
+
+            if(all_have_finished) break;
+        }
+
+
+        #pragma omp parallel for
+        for(idx_t i = 0; i < ctx -> local_n_points; ++i)
+        {
+            if(ctx -> local_datapoints[i].kstar == 0)
+            {
+                idx_t k = kMAX - 1;
+
+                float_t vvi = ctx -> local_datapoints[i].ngbh.data[k].value;
+
+                local_datapoints[i].kstar = k;
+                local_datapoints[i].log_rho = log((float_t)(k)/vvi/((float_t)(ctx -> n_points)));
+                local_datapoints[i].log_rho_err =   1.0/sqrt((float_t)k); //(float_t)(-Q_rsqrt((float)k));
+                local_datapoints[i].g = local_datapoints[i].log_rho - local_datapoints[i].log_rho_err;
+            }
+        }
+
+
+    MPI_Win_fence(0, exposed_ngbh);
+    MPI_Win_free(&exposed_ngbh);
+
+    free(scratch_heap_nodes);
+
+    #if defined(WRITE_DENSITY)
+        /* densities */
+        float_t* den = (float_t*)MY_MALLOC(ctx -> local_n_points * sizeof(float_t));
+        float_t* gs = (float_t*)MY_MALLOC(ctx -> local_n_points * sizeof(float_t));
+        idx_t* ks = (idx_t*)MY_MALLOC(ctx -> local_n_points * sizeof(idx_t));
+
+        for(int i = 0; i < ctx -> local_n_points; ++i) den[i] = ctx -> local_datapoints[i].log_rho;
+        for(int i = 0; i < ctx -> local_n_points; ++i) ks[i] = ctx -> local_datapoints[i].kstar;
+        for(int i = 0; i < ctx -> local_n_points; ++i) gs[i] = ctx -> local_datapoints[i].g;
+
+        ordered_buffer_to_file(ctx, den, sizeof(float_t), ctx -> local_n_points, "bb/ordered_density.npy");
+        ordered_buffer_to_file(ctx, ks, sizeof(idx_t), ctx -> local_n_points, "bb/ks.npy");
+        ordered_buffer_to_file(ctx, gs, sizeof(float_t), ctx -> local_n_points, "bb/g.npy");
+
+        ordered_data_to_file(ctx);
+        free(den);
+        free(ks);
+        free(gs);
+    #endif
+    return;
+
+
+}
+
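+/*
+ * Two-phase variant of get_j_ksel_dist: on the first call for point i
+ * (flags[i] set) it either returns the local value directly or posts a
+ * non-blocking MPI_Get and returns 0; on the matching second call
+ * (flags[i] cleared by the first one) it returns the value fetched into
+ * tmp_heap_nodes[i]
+ */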
+float_t get_j_ksel_dist_v2(global_context_t* ctx, idx_t i, idx_t j, idx_t ksel, int* flags, heap_node* tmp_heap_nodes, MPI_Win* exposed_ngbh)
+{
+    if(flags[i])
+    {
+        int owner = foreign_owner(ctx, j);
+        idx_t k = ctx -> k;
+        /* find if datapoint is halo or not */
+        if(owner == ctx -> mpi_rank)
+        {
+            idx_t pos = j - ctx -> idx_start;
+            return ctx -> local_datapoints[pos].ngbh.data[ksel].value;
+        }
+        else
+        {
+            //RMA
+            flags[i] = 0;
+            idx_t pos  = j - ctx -> rank_idx_start[owner];
+            MPI_Get(tmp_heap_nodes + i, sizeof(heap_node), MPI_BYTE, owner, (MPI_Aint)((pos * k + ksel) * sizeof(heap_node)), sizeof(heap_node), MPI_BYTE, *exposed_ngbh);
+            //if(ctx -> mpi_rank == 0) DB_PRINT("rvcd %lu %lf\n", el.array_idx, el.value);
+            return 0;
+        }                 
+    }
+    else
+    {
+        flags[i] = 1;
+        return tmp_heap_nodes[i].value;
+    }
+}
+
+void clusters_allocate(clusters_t * c, int s)
+{
+	/*
+	 * Helper function for handling allocation of resources 
+	 */ 
+    if(c -> centers.count == 0)
+    {
+        printf("Provide a valid cluster centers list\n");
+        return;
+    }
+
+    idx_t nclus = c -> centers.count;
+    
+    if(s)
+    {
+	    //printf("Using sparse implementation\n");
+	    c -> use_sparse_borders = 1;
+	    c -> sparse_borders = (adj_list_t*)MY_MALLOC(nclus*sizeof(adj_list_t));
+	    for(idx_t i = 0; i < nclus; ++i)
+	    {
+		    c -> sparse_borders[i].count = 0;
+		    c -> sparse_borders[i].size  = PREALLOC_BORDERS;
+		    c -> sparse_borders[i].data  = (sparse_border_t*)MY_MALLOC(PREALLOC_BORDERS*sizeof(sparse_border_t));
+	    }
+
+    }
+    else
+    {
+	    //printf("Using dense implementation\n");
+	    c -> use_sparse_borders = 0;
+	    c -> __borders_data         = (border_t*)MY_MALLOC(nclus*nclus*sizeof(border_t)); 
+	    c -> borders                = (border_t**)MY_MALLOC(nclus*sizeof(border_t*));
+
+	    #pragma omp parallel for
+
+	    for(idx_t i = 0; i < nclus; ++i)
+	    {
+			c -> borders[i]         = c -> __borders_data + i*nclus;
+			for(idx_t j = 0; j < nclus; ++j)
+			{
+				c -> borders[i][j] = border_null;
+			}
+	    }
+    }
+}
+
+
+void adj_list_insert(adj_list_t* l, sparse_border_t b)
+{
+	/*
+	 * Handling of the sparse border implementation as an adjacency list
+	 */
+	if(l -> count < l -> size)
+	{
+		l -> data[l -> count] = b;
+		l -> count++;
+	}
+	else
+	{
+		l -> size += PREALLOC_BORDERS; 
+		l -> data = realloc( l -> data, sizeof(sparse_border_t) * ( l -> size));
+		l -> data[l -> count] = b;
+		l -> count++;
+	}
+}
+
+void adj_list_reset(adj_list_t* l)
+{
+	/*
+	 * Handling of the sparse border implementation as an adjacency list
+	 */
+	if(l -> data) free(l -> data);
+	l -> count = 0;
+	l -> size  = 0;
+	l -> data  = NULL;
+}
+
+void clusters_reset(clusters_t * c)
+{
+	/* 
+	 * Handling reset of clusters object 
+	 */
+	if(c -> use_sparse_borders)
+	{
+		for(idx_t i = 0; i < c -> centers.count; ++i)
+		{
+			adj_list_reset((c -> sparse_borders) + i);
+		
+		}
+		free(c -> sparse_borders);
+		c -> sparse_borders = NULL;
+	}
+	else
+	{
+		if(c -> __borders_data)  free(c -> __borders_data);
+		if(c -> borders) free(c -> borders);
+	}
+	if(c -> centers.data) free(c -> centers.data);
+}
+
+void clusters_free(clusters_t * c)
+{
+	/*
+	 * Free cluster object
+	 */
+    clusters_reset(c);
+}
+
+
+void sparse_border_insert(clusters_t *c, sparse_border_t b)
+{
+	/*
+	 * Insert a border element in the sparse implementation
+	 */
+
+	idx_t i = b.i;
+	adj_list_t l = c -> sparse_borders[i];
+	int check = 1;
+	for(idx_t k = 0; k < l.count; ++k)
+	{
+		sparse_border_t p = l.data[k];
+		if(p.i == b.i && p.j == b.j)
+		{
+			if( b.density > p.density)
+			{
+				l.data[k] = b;
+			}
+			check = 0;
+		}
+	}
+	if(check) adj_list_insert(c -> sparse_borders + i, b);
+	return;
+}
+
+sparse_border_t sparse_border_get(clusters_t* c, idx_t i, idx_t j)
+{
+	/*
+	 * Get a border element in the sparse implementation
+	 * - i,j: cluster to search for borders
+	 * return border_null if not found
+	 */
+
+	sparse_border_t b = sparse_border_null;
+	adj_list_t l = c -> sparse_borders[i];
+	for(idx_t el = 0; el < l.count; ++el)
+	{
+		sparse_border_t candidate = l.data[el];
+		if(candidate.i == i && candidate.j == j)
+		{
+			b = candidate;
+		}
+	}
+	return b;
+}
+
+datapoint_info_t find_possibly_halo_datapoint_rma(global_context_t* ctx, idx_t idx, MPI_Win win_datapoints)
+{
+    int owner = foreign_owner(ctx, idx);
+    /* find if datapoint is halo or not */
+    if(owner == ctx -> mpi_rank)
+    {
+        idx_t i = idx - ctx -> idx_start;
+        return ctx -> local_datapoints[i];
+    }
+    else
+    {
+        datapoint_info_t tmp_dp;
+        #pragma omp critical
+        {
+            idx_t i = idx - ctx -> rank_idx_start[owner];
+            MPI_Request request;
+            MPI_Status status;
+
+            MPI_Rget(&tmp_dp, sizeof(datapoint_info_t), MPI_BYTE, owner,
+                    i * sizeof(datapoint_info_t), sizeof(datapoint_info_t), MPI_BYTE, win_datapoints, &request);
+            MPI_Wait(&request, MPI_STATUS_IGNORE);
+
+        }
+
+        return tmp_dp;         
+    }                 
+}
+
+int cmpPP(const void* p1, const void *p2)
+{
+    /*
+     * Comparison function used by qsort during cluster assignment:
+     * sorts by decreasing g, breaking ties by array_idx
+     */
+    datapoint_info_t* pp1 = *(datapoint_info_t**)p1;
+    datapoint_info_t* pp2 = *(datapoint_info_t**)p2;
+    int v = (pp1 -> g < pp2 -> g) - (pp1 -> g > pp2 -> g);
+    int a = (pp1 -> array_idx < pp2 -> array_idx) - (pp1 -> array_idx > pp2 -> array_idx);
+    return v == 0 ? a : v;
+}
+
+inline int its_mine(global_context_t* ctx, idx_t idx)
+{
+    return idx >= ctx -> idx_start && idx < ctx -> idx_start + ctx -> local_n_points;
+}
+
+int compare_center(const void* a, const void* b)
+{
+    center_t* aa = (center_t*)a;
+    center_t* bb = (center_t*)b;
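+    /* sort centers in order of decreasing density */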
+    return (aa -> density < bb -> density) - (aa -> density > bb -> density);
+}
+
+
+void compute_correction(global_context_t* ctx, float_t Z)
+{
+    /*
+     * Utility function: find the minimum density over all datapoints
+     * and shift the densities up so that later stages work with values greater than 0
+     */
+    float_t min_log_rho = 999999.9;
+
+    datapoint_info_t* dp_info = ctx -> local_datapoints;
+    idx_t n = ctx -> local_n_points;
+    
+
+    #pragma omp parallel
+    {
+        float_t thread_min_log_rho = 9999999.;
+        #pragma omp for
+        for(idx_t i = 0; i < n; ++i)
+        {
+            float_t tmp = dp_info[i].log_rho - Z*dp_info[i].log_rho_err;
+            if(tmp < thread_min_log_rho){
+                thread_min_log_rho = tmp;
+            }
+        }
+        #pragma omp critical
+        if(thread_min_log_rho < min_log_rho) min_log_rho = thread_min_log_rho;
+    }
+
+    MPI_Allreduce(MPI_IN_PLACE, &min_log_rho, 1, MPI_MY_FLOAT, MPI_MIN, ctx -> mpi_communicator);
+
+    #pragma omp parallel
+    {
+        #pragma omp for
+        for(idx_t i = 0; i < n; ++i)
+        {
+            dp_info[i].log_rho_c = dp_info[i].log_rho - min_log_rho + 1;
+            dp_info[i].g = dp_info[i].log_rho_c - dp_info[i].log_rho_err;
+        }
+
+    }
+
+    //printf("%lf\n",min_log_rho);
+}
+
+clusters_t Heuristic1(global_context_t *ctx, int verbose)
+{
+    /*
+     * Heuristic 1, from the paper of d'Errico, Facco, Laio & Rodriguez
+     * ( https://doi.org/10.1016/j.ins.2021.01.010 )
+     *
+     * args:
+     * - ctx     : global context
+     * - verbose : print timing information
+     */
+
+    datapoint_info_t* dp_info = ctx -> local_datapoints;
+    idx_t n = ctx -> local_n_points; 
+
+    struct timespec start_tot, finish_tot;
+    double elapsed_tot;
+
+	if(verbose)
+	{
+		printf("H1: Preliminary cluster assignment\n");
+		clock_gettime(CLOCK_MONOTONIC, &start_tot);
+	}
+    
+    TIME_DEF;
+
+
+    //idx_t ncenters = 0;
+    //idx_t putativeCenters = n;
+    lu_dynamic_array_t all_centers, removed_centers, actual_centers, max_rho;
+    lu_dynamic_array_allocate(&all_centers);
+    lu_dynamic_array_allocate(&removed_centers);
+    lu_dynamic_array_allocate(&actual_centers);
+    lu_dynamic_array_allocate(&max_rho);
+
+    datapoint_info_t** dp_info_ptrs = (datapoint_info_t**)MY_MALLOC(n*sizeof(datapoint_info_t*));
+
+    struct timespec start, finish;
+    double elapsed;
+
+
+    if(verbose)
+	{
+		clock_gettime(CLOCK_MONOTONIC, &start);
+	}
+
+    /* proceed */
+
+    MPI_Win win_datapoints;
+    MPI_Win_create(ctx -> local_datapoints, ctx -> local_n_points * sizeof(datapoint_info_t), 
+                   1, MPI_INFO_NULL, ctx -> mpi_communicator, &win_datapoints);
+    MPI_Win_fence(0, win_datapoints);
+    MPI_Win_lock_all(0,  win_datapoints);
+
+#if defined(THREAD_FUNNELED)
+#else
+    #pragma omp parallel for
+#endif
+
+    for(idx_t i = 0; i < n; ++i)
+    {   
+        /*
+         * Find the centers of the clusters as the points of highest density in their neighborhoods:
+         * a point is tagged as a putative center if it is the point of highest density of its neighborhood
+         */
+
+        dp_info_ptrs[i] = dp_info + i;
+        idx_t maxk = dp_info[i].kstar + 1;
+        float_t gi = dp_info[i].g;
+        dp_info[i].is_center = 1;
+        dp_info[i].cluster_idx = -1;
+        //printf("%lf\n",p -> g);
+        heap i_ngbh = dp_info[i].ngbh;
+        for(idx_t k = 1; k < maxk; ++k)
+        {
+            idx_t ngbh_index = i_ngbh.data[k].array_idx;
+            datapoint_info_t dj = find_possibly_halo_datapoint_rma(ctx, ngbh_index, win_datapoints);
+            float_t gj = dj.g;
+            if(gj > gi){
+                dp_info[i].is_center = 0;
+                break;
+            }
+        }
+        if(dp_info[i].is_center)
+        {
+            //lu_dynamic_array_pushBack(&all_centers, dp_info[i].array_idx);
+            #pragma omp critical
+            {
+                lu_dynamic_array_pushBack(&all_centers, i);
+            }
+        }
+    }
+
+    if(verbose)
+	{
+		clock_gettime(CLOCK_MONOTONIC, &finish);
+		elapsed = (finish.tv_sec - start.tv_sec);
+		elapsed += (finish.tv_nsec - start.tv_nsec) / 1000000000.0;
+		printf("\tFinding putative centers: %.3lfs\n",elapsed);
+		clock_gettime(CLOCK_MONOTONIC, &start);
+	}
+
+
+	/* 
+	 * optimized version
+	 *
+	 * Generate a mask that keeps track of which point eliminated the
+	 * point under consideration. Each thread updates this mask; after the procedure
+	 * ends, the centers, removed_centers, and max_rho arrays are populated
+	 */
+		
+	heap_node* to_remove_mask = (heap_node*)MY_MALLOC(n*sizeof(heap_node));
+    for(idx_t p = 0; p < n; ++p) 
+    {
+        to_remove_mask[p].array_idx = MY_SIZE_MAX;
+        to_remove_mask[p].value = 9999999;
+    }
+    qsort(dp_info_ptrs, n, sizeof(datapoint_info_t*), cmpPP);
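+    /* dp_info_ptrs is now ordered by decreasing g (cmpPP), so candidate
+     * centers are examined from the densest point downwards */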
+
+
+    MPI_Win win_to_remove_mask;
+    MPI_Win_create(to_remove_mask, n * sizeof(heap_node), 1, MPI_INFO_NULL, ctx -> mpi_communicator, &win_to_remove_mask);
+    MPI_Win_fence(0, win_to_remove_mask);
+
+
+
+    /* check internal nodes */
+
+    /*
+     * RMA implementation of the center-removal pass:
+     * for dp in datapoints:
+     *  for ngbh in neighbors:
+     *   if ngbh is mine:
+     *      no problem, proceed as in the serial case
+     *   else:
+     *      ngbh is an external node:
+     *      acquire a lock on the remote window, read the mask element,
+     *      possibly overwrite it, then unlock
+     */
+
+#if defined(THREAD_FUNNELED)
+#else
+    #pragma omp parallel for
+#endif
+    for(idx_t p = 0; p < n; ++p)
+    {
+        datapoint_info_t i_point = *(dp_info_ptrs[p]);
+
+        for(idx_t j = 1; j < i_point.kstar + 1; ++j)
+        {
+            idx_t jidx = i_point.ngbh.data[j].array_idx; 
+
+            datapoint_info_t j_point = find_possibly_halo_datapoint_rma(ctx, jidx,  win_datapoints);
+
+            if(j_point.is_center && i_point.g > j_point.g)
+            {
+                #pragma omp critical
+                {
+                    /*
+                     *
+                     * THIS PART CAN BE SUBSTITUTED
+                     * we have something like (value, idx) pairs and if we have less than 
+                     * MAX_UINT32 particles we can manipulate value and idx to fit into a single
+                     * UINT64 and then perform a single atomic max operation
+                     *
+                     * */
+                    int owner = foreign_owner(ctx, jidx);
+                    idx_t jpos = jidx - ctx -> rank_idx_start[owner];
+
+                    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, owner, 0, win_to_remove_mask);
+                    heap_node mask_element;
+                    MPI_Request request;
+                    MPI_Rget(   &mask_element, sizeof(heap_node), MPI_BYTE, 
+                            owner, jpos * sizeof(heap_node) , sizeof(heap_node), MPI_BYTE, win_to_remove_mask, &request);
+                    MPI_Wait(&request, MPI_STATUS_IGNORE);
+
+                    int flag = mask_element.array_idx == MY_SIZE_MAX;							
+                    if(flag || i_point.g > mask_element.value )
+                    {
+                        heap_node tmp_mask_element = {.array_idx = i_point.array_idx, .value = i_point.g};
+                        MPI_Request request;
+                        MPI_Rput(&tmp_mask_element, sizeof(heap_node), MPI_BYTE, owner, 
+                                jpos*sizeof(heap_node), sizeof(heap_node), MPI_BYTE, win_to_remove_mask, &request);
+                        MPI_Wait(&request, MPI_STATUS_IGNORE);
+
+                    }
+
+                    MPI_Win_unlock(owner, win_to_remove_mask);
+                }
+            }
+        }
+    }
+    
+    MPI_Win_fence(0, win_to_remove_mask);
+    MPI_Barrier(ctx -> mpi_communicator);
+
+	/* populate the usual arrays */
+    for(idx_t p = 0; p < all_centers.count; ++p)
+    {
+        idx_t i = all_centers.data[p];
+        int e = 0;
+        //float_t gi = dp_info[i].g;
+        idx_t mr = to_remove_mask[i].array_idx;
+        if(mr != MY_SIZE_MAX)
+        {
+            //if(dp_info[mr].g > gi) e = 1;
+			e = 1;
+        }
+        switch (e)
+        {
+            case 1:
+                {
+                    lu_dynamic_array_pushBack(&removed_centers,i);
+                    dp_info[i].is_center = 0;
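+                    /* if the center that eliminated i was itself removed,
+                     * follow the chain down to the surviving center that
+                     * achieved the maximum density */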
+                    for(idx_t c = 0; c < removed_centers.count - 1; ++c)
+                    {
+                        if(mr == removed_centers.data[c])
+                        {
+                            mr = max_rho.data[c];
+                        }
+                    }
+                    lu_dynamic_array_pushBack(&max_rho,mr);
+                    
+                }
+                break;
+
+            case 0:
+                {
+                    lu_dynamic_array_pushBack(&actual_centers,i);
+                    dp_info[i].cluster_idx = actual_centers.count - 1;
+                }
+                break;
+
+            default:
+                break;
+        }
+    }
+
+    MPI_Win_free(&win_to_remove_mask);
+	free(to_remove_mask);
+
+    if(verbose)
+	{
+		clock_gettime(CLOCK_MONOTONIC, &finish);
+		elapsed = (finish.tv_sec - start.tv_sec);
+		elapsed += (finish.tv_nsec - start.tv_nsec) / 1000000000.0;
+		printf("\tFinding actual centers:   %.3lfs\n",elapsed);
+		clock_gettime(CLOCK_MONOTONIC, &start);
+	}
+
+    int n_centers = (int)actual_centers.count;
+    int tot_centers;
+    MPI_Allreduce(&n_centers, &tot_centers, 1, MPI_INT, MPI_SUM, ctx -> mpi_communicator);
+    MPI_DB_PRINT("Found %d temporary centers\n", tot_centers);
+
+
+    /* bring on master all centers 
+     * order them in ascending order of density, 
+     * then re-scatter them around to get unique cluster labels */ 
+
+
+    center_t* private_centers_buffer = (center_t*)MY_MALLOC(actual_centers.count * sizeof(center_t));
+
+    center_t* global_centers_buffer  = (center_t*)MY_MALLOC(tot_centers * sizeof(center_t));
+
+    for(int i = 0; i < actual_centers.count; ++i)
+    {
+        idx_t idx = actual_centers.data[i] ;
+        private_centers_buffer[i].density = dp_info[idx].g;
+        private_centers_buffer[i].idx     = dp_info[idx].array_idx;
+    }
+    MPI_Datatype mpi_center_type;
+    
+    MPI_Type_contiguous(sizeof(center_t), MPI_BYTE, &mpi_center_type);
+    MPI_Type_commit(&mpi_center_type);
+
+    int* center_counts = (int*)MY_MALLOC(ctx -> world_size * sizeof(int));
+    int* center_displs = (int*)MY_MALLOC(ctx -> world_size * sizeof(int));
+
+    int cc = (int)actual_centers.count;
+    MPI_Allgather(&cc, 1, MPI_INT, center_counts, 1, MPI_INT, ctx -> mpi_communicator);
+
+    center_displs[0] = 0;
+    for(int i = 1; i < ctx -> world_size; ++i) center_displs[i] = center_displs[i - 1] + center_counts[i - 1];
+
+    /* offset local cluster indices by this rank's displacement so that labels
+     * are globally unique, and convert center indices to global point indices */
+    for(int i = 0; i < actual_centers.count; ++i)
+    {
+        idx_t idx = actual_centers.data[i];
+        dp_info[idx].cluster_idx += center_displs[ctx -> mpi_rank];
+        actual_centers.data[i] += ctx -> idx_start;
+    }
+
+    idx_t* all_center_idx = (idx_t*)MY_MALLOC(tot_centers * sizeof(idx_t));
+    
+    MPI_Allgatherv(actual_centers.data, actual_centers.count, MPI_UINT64_T, all_center_idx, center_counts, center_displs, MPI_UINT64_T, ctx -> mpi_communicator);
+
+    free(center_counts);
+    free(center_displs);
+
+    /*
+     * Sort all the dp_info based on g and then perform the cluster assignment
+     * in order of decreasing g
+     */
+
+    int completed = 0;
+
+    while(!completed)
+    {
+        completed = 1;
+
+
+        int proc_points = 0;
+        /* retrieve assignment 
+         * if some point remain uncompleted then get assignment from external points */
+
+        for(idx_t i = 0; i < n; ++i)
+        {   
+            int wait_for_comms = 0;
+            datapoint_info_t* p = dp_info_ptrs[i];
+            if(!(p -> is_center) && p -> cluster_idx < 0)
+            {
+                int cluster = -1;
+                idx_t k = 0;
+                idx_t p_idx;
+                idx_t max_k = p -> ngbh.N;
+                /* assign each particle to the same cluster as the nearest particle of higher density */
+                datapoint_info_t p_retrieved;
+                while( k < max_k - 1 && cluster == -1)
+                {
+
+
+                    ++k;
+                    p_idx = p -> ngbh.data[k].array_idx;
+                    p_retrieved = find_possibly_halo_datapoint_rma(ctx, p_idx, win_datapoints);
+
+                    if(p_retrieved.g > p -> g)
+                    {
+                        cluster = p_retrieved.cluster_idx; 
+                        wait_for_comms = 1;
+                        break;
+                    }
+                    //if(p -> array_idx == 1587636) printf("%lu k %d p_idx %lu %lf %lf\n", k, cluster, p_idx, p_retrieved.g, p -> g );
+                }
+
+
+                if(cluster == -1 && !wait_for_comms)
+                {
+                    float_t gmax = -99999.;               
+                    idx_t gm_index = SIZE_MAX;
+                    for(idx_t k = 0; k < max_k; ++k)
+                    {
+                        idx_t ngbh_index = p -> ngbh.data[k].array_idx;
+                        for(idx_t m = 0; m < removed_centers.count; ++m)
+                        {
+                            idx_t max_rho_idx = max_rho.data[m];
+                            datapoint_info_t dp_max_rho = find_possibly_halo_datapoint_rma(ctx, max_rho_idx, win_datapoints);
+                            //datapoint_info_t dp_max_rho = dp_info[max_rho_idx];
+                            float_t gcand = dp_max_rho.g;
+                            if(ngbh_index == removed_centers.data[m] && gcand > gmax)
+                            {   
+                                //printf("%lu -- %lu\n", ele, m);
+                                gmax = gcand;
+                                gm_index = max_rho.data[m];
+                            }
+                        }
+                    }
+                    if(gm_index != SIZE_MAX)
+                    {
+                        datapoint_info_t dp_gm = find_possibly_halo_datapoint_rma(ctx, gm_index, win_datapoints);
+                        //datapoint_info_t dp_gm = dp_info[gm_index];
+                        cluster = dp_gm.cluster_idx;
+                    }
+
+                }
+                p -> cluster_idx = cluster;
+
+            }
+            completed = completed && p -> cluster_idx != -1; 
+            proc_points += p -> cluster_idx != -1 ?  1 : 0;
+
+        }
+
+        //DB_PRINT("rank %d proc %d\n", ctx -> mpi_rank, proc_points);
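+        /* each rank contributes 0 or 1; the sum equals world_size
+         * iff every rank has assigned all of its points */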
+        MPI_Allreduce(MPI_IN_PLACE, &completed, 1, MPI_INT, MPI_SUM, ctx -> mpi_communicator);
+        completed = completed == ctx -> world_size ? 1 : 0;
+
+    }
+
+    MPI_Win_unlock_all(win_datapoints);
+    MPI_Win_fence(0, win_datapoints);
+    MPI_Win_free(&win_datapoints);
+
+    MPI_Barrier(ctx -> mpi_communicator);
+
+    if(verbose)
+	{
+		clock_gettime(CLOCK_MONOTONIC, &finish);
+		elapsed = (finish.tv_sec - start.tv_sec);
+		elapsed += (finish.tv_nsec - start.tv_nsec) / 1000000000.0;
+		printf("\tTentative clustering:     %.3lfs\n",elapsed);
+		clock_gettime(CLOCK_MONOTONIC, &start);
+	}
+
+    free(dp_info_ptrs);
+    free(max_rho.data);
+    free(removed_centers.data);
+    free(all_centers.data);
+
+    #if defined(WRITE_CLUSTER_ASSIGN_H1)
+        /* densities */
+        int* ks = (int*)MY_MALLOC(ctx -> local_n_points * sizeof(int));
+        for(int i = 0; i < ctx -> local_n_points; ++i) ks[i] = ctx -> local_datapoints[i].cluster_idx;
+
+        ordered_buffer_to_file(ctx, ks, sizeof(int), ctx -> local_n_points, "bb/cl.npy");
+        ordered_data_to_file(ctx);
+        free(ks);
+    #endif
+
+
+    free(actual_centers.data);
+    actual_centers.size  = tot_centers;
+    actual_centers.count = tot_centers;
+    actual_centers.data  = all_center_idx;
+
+
+    clusters_t c_all;
+    c_all.centers = actual_centers;
+
+
+
+    if(verbose)
+	{
+		clock_gettime(CLOCK_MONOTONIC, &finish);
+		elapsed = (finish.tv_sec - start.tv_sec);
+		elapsed += (finish.tv_nsec - start.tv_nsec) / 1000000000.0;
+		printf("\tFinalizing clustering:    %.3lfs\n",elapsed);
+		printf("\n");
+	}
+
+    clock_gettime(CLOCK_MONOTONIC, &finish_tot);
+    elapsed_tot = (finish_tot.tv_sec - start_tot.tv_sec);
+    elapsed_tot += (finish_tot.tv_nsec - start_tot.tv_nsec) / 1000000000.0;
+
+
+	if(verbose)
+	{
+		printf("\tFound %lu clusters\n",(uint64_t)actual_centers.count);
+		printf("\tTotal time: %.3lfs\n\n", elapsed_tot);
+	}
+
+    c_all.n = n;
+    return c_all;
+}
+
+
+void Heuristic2(global_context_t* ctx, clusters_t* cluster)
+{
+    /*
+     * Border finding, implemented with RMA:
+     * each rank computes the borders of its local points, then the borders
+     * are shared and reduced to a single global set of borders
+     */
+
+    int verbose = 0;
+
+    datapoint_info_t* dp_info = ctx -> local_datapoints;
+
+    MPI_Win dp_info_win, ngbh_win;
+    MPI_Win_create(ctx -> local_datapoints, ctx -> local_n_points * sizeof(datapoint_info_t), 1, MPI_INFO_NULL, ctx -> mpi_communicator, &dp_info_win);
+    MPI_Win_create(ctx -> __local_heap_buffers, ctx -> local_n_points * ctx -> k * sizeof(heap_node), 1, MPI_INFO_NULL, ctx -> mpi_communicator, &ngbh_win);
+
+    MPI_Win_fence(0, dp_info_win);
+    MPI_Win_fence(0, ngbh_win);
+
+    MPI_Win_lock_all(0, dp_info_win);
+    MPI_Win_lock_all(0, ngbh_win);
+    MPI_Barrier(ctx -> mpi_communicator);
+
+
+
+    #define borders cluster->borders
+
+    struct timespec start_tot, finish_tot;
+    double elapsed_tot;
+    idx_t n = ctx -> local_n_points;
+
+	if(verbose)
+	{
+		printf("H2: Finding border points\n");
+		clock_gettime(CLOCK_MONOTONIC, &start_tot);
+	}
+
+    idx_t nclus = cluster->centers.count; 
+    idx_t max_k = dp_info[0].ngbh.N;
+
+    for(idx_t i = 0; i < n; ++i)
+    {
+        idx_t pp = NOBORDER;
+        /*loop over n neighbors*/
+        int c = dp_info[i].cluster_idx;
+        if(!dp_info[i].is_center)
+        {
+            for(idx_t k = 1; k < dp_info[i].kstar + 1; ++k)
+            {
+                /*index of the kth ngbh of n*/
+                idx_t j = dp_info[i].ngbh.data[k].array_idx;
+                pp = NOBORDER;
+                /* if the k-th neighbour belongs to a cluster other than c,
+                 * i is a candidate border point towards that cluster */
+
+                //DB_PRINT("my rank %d l %lu %lu %d\n", ctx -> mpi_rank, k, j, foreign_owner(ctx, j));
+                datapoint_info_t j_dp = find_possibly_halo_datapoint_rma(ctx, j, dp_info_win);
+                if(j_dp.cluster_idx != c)
+                {
+                    pp = j;
+                    break;
+                }
+
+            }
+        }
+
+		if(pp != NOBORDER)
+		{
+            datapoint_info_t pp_dp = find_possibly_halo_datapoint_rma(ctx, pp, dp_info_win);
+			for(idx_t k = 1; k < max_k; ++k)
+			{
+                idx_t pp_ngbh_idx = get_j_ksel_idx(ctx, pp_dp.array_idx, k, ngbh_win);
+                datapoint_info_t pp_ngbh_dp = find_possibly_halo_datapoint_rma(ctx, pp_ngbh_idx, dp_info_win);
+                // TODO: could be optimized by retrieving the whole ngbh in one shot
+
+				if(pp_ngbh_idx == dp_info[i].array_idx)
+				{
+					break;
+				}
+
+
+				if(pp_ngbh_dp.cluster_idx == c)
+				{
+					pp = NOBORDER;
+					break;
+				}
+			}
+		}
+
+		/*if it is the maximum one add it to the cluster*/
+		if(pp != NOBORDER)
+		{
+            datapoint_info_t j_dp = find_possibly_halo_datapoint_rma(ctx, pp, dp_info_win);
+			int ppc = j_dp.cluster_idx;
+            //insert one and symmetric one
+            sparse_border_t b = {.i = c, .j = ppc, .idx = ctx -> local_datapoints[i].array_idx, .density = ctx -> local_datapoints[i].g, .error = ctx -> local_datapoints[i].log_rho_err}; 
+            sparse_border_insert(cluster, b);
+            ////get symmetric border
+            sparse_border_t bsym = {.i = ppc, .j = c, .idx = ctx -> local_datapoints[i].array_idx, .density = ctx -> local_datapoints[i].g, .error = ctx -> local_datapoints[i].log_rho_err}; 
+            sparse_border_insert(cluster, bsym);
+		}
+
+
+    }
+
+    MPI_Barrier(ctx -> mpi_communicator);
+
+    idx_t num_border_el = 0;
+    for(idx_t i = 0; i < nclus; ++i)
+    {
+        num_border_el += cluster -> sparse_borders[i].count;
+    }
+
+    int i_have_sent = 0;
+    int level = 1;
+    int ranks = ctx -> world_size;
+
+    #define SEND 1 
+    #define RECV 0
+    #define DO_NOTHING -1
+
+    MPI_Barrier(ctx -> mpi_communicator);
+
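+    /* recursive-halving reduction: at each step the upper half of the ranks
+     * sends its border lists to the lower half; after ~log2(world_size) steps
+     * rank 0 holds the union of all borders */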
+    while(ranks > 1)
+    {
+        int dp = ranks % 2;
+        ranks = ranks / 2 + dp;
+        int send_rcv = ctx -> mpi_rank >= ranks;
+
+        if(dp && ctx -> mpi_rank == ranks - 1) send_rcv = DO_NOTHING;
+
+        switch (send_rcv) 
+        {
+            case SEND:
+                if(!i_have_sent)
+                {
+                    idx_t num_border_el = 0;
+                    for(idx_t i = 0; i < nclus; ++i)
+                    {
+                        num_border_el += cluster -> sparse_borders[i].count;
+                    }
+                    sparse_border_t* borders_to_send = (sparse_border_t*)MY_MALLOC(num_border_el * sizeof(sparse_border_t)); 
+
+                    idx_t count = 0;
+                    for(idx_t i = 0; i < nclus; ++i)
+                    {
+                        for(idx_t j = 0; j < cluster -> sparse_borders[i].count; ++j)
+                        {
+
+                            borders_to_send[count] = cluster -> sparse_borders[i].data[j];
+                            count++;
+                        }
+                    }
+
+                    int rank_to_send = ctx -> mpi_rank - ranks;
+
+                    #if defined(PRINT_H2_COMM_SCHEME)
+                    DB_PRINT("-- Rank %d sending to %d\n", ctx -> mpi_rank, rank_to_send);
+                    #endif
+
+                    MPI_Send(&num_border_el, 1, MPI_UINT64_T, rank_to_send, rank_to_send, ctx -> mpi_communicator);
+
+                    MPI_Send(borders_to_send, num_border_el * sizeof(sparse_border_t), MPI_BYTE , rank_to_send, rank_to_send, ctx -> mpi_communicator);
+                                        
+                    i_have_sent = 1;
+                    free(borders_to_send);
+                }
+                break;
+            case RECV:
+                {
+                    #if defined(PRINT_H2_COMM_SCHEME)
+                    DB_PRINT("** Rank %d receiving\n", ctx -> mpi_rank);
+                    #endif
+                    idx_t num_borders_recv = 0;
+                    MPI_Recv(&num_borders_recv, 1, MPI_UINT64_T, MPI_ANY_SOURCE, ctx -> mpi_rank, ctx -> mpi_communicator, MPI_STATUS_IGNORE);
+
+                    sparse_border_t* borders_to_recv = (sparse_border_t*)MY_MALLOC(num_borders_recv * sizeof(sparse_border_t)); 
+
+                    MPI_Recv(borders_to_recv, num_borders_recv * sizeof(sparse_border_t), MPI_BYTE , MPI_ANY_SOURCE, ctx -> mpi_rank, ctx -> mpi_communicator, MPI_STATUS_IGNORE);
+
+                    for(idx_t i = 0; i < num_borders_recv; ++i)
+                    {
+                        sparse_border_insert(cluster, borders_to_recv[i]);
+                    }
+                    free(borders_to_recv);
+                }
+                break;
+            default:
+                #if defined(PRINT_H2_COMM_SCHEME)
+                DB_PRINT(".. Rank %d doing nothing\n", ctx -> mpi_rank);
+                #endif
+                break;
+        }
+        MPI_Barrier(ctx -> mpi_communicator);
+        #if defined(PRINT_H2_COMM_SCHEME)
+        MPI_DB_PRINT("-----------------\n");
+        #endif
+
+    }
+
+    num_border_el = 0;
+    for(idx_t i = 0; i < nclus; ++i)
+    {
+        num_border_el += cluster -> sparse_borders[i].count;
+    }
+
+    MPI_DB_PRINT("Master final %lu border elements\n", num_border_el);
+
+    // correction of the density at the borders to be compliant with dadapy
+    if(I_AM_MASTER)
+    {
+        for(idx_t c = 0; c < nclus; ++c)
+        {
+            for(idx_t el = 0; el < cluster -> sparse_borders[c].count; ++el)
+            {
+                //fix border density, write log rho c
+                idx_t idx = cluster -> sparse_borders[c].data[el].idx; 
+                datapoint_info_t idx_dp = find_possibly_halo_datapoint_rma(ctx, idx, dp_info_win);
+                cluster -> sparse_borders[c].data[el].density = idx_dp.log_rho_c;
+            }
+        }
+    }
+
+    MPI_Barrier(ctx -> mpi_communicator);
+    MPI_Win_unlock_all(ngbh_win);
+    MPI_Win_unlock_all(dp_info_win);
+
+    MPI_Win_free(&ngbh_win);
+    MPI_Win_free(&dp_info_win);
+
+    #undef SEND
+    #undef RECV
+    #undef DO_NOTHING
+
+    #ifdef WRITE_BORDERS
+        if(I_AM_MASTER)
+        {
+            printf("[MASTER] Writing borders to bb/borders.csv\n");
+            FILE* f = fopen("bb/borders.csv", "w");
+            for(idx_t i = 0; i < nclus; ++i)
+            {
+                for(idx_t j = 0; j < cluster -> sparse_borders[i].count; ++j)
+                {
+                    sparse_border_t b = cluster -> sparse_borders[i].data[j];
+                    fprintf(f, "%lu,%lu,%lu,%lf\n", b.i, b.j, b.idx, b.density);
+                }
+            }
+            fclose(f);
+        }
+    #endif
+
+    return;
+    #undef borders
+}
+
+
+int compare_merging_density( const void *A, const void *B)
+{
+	/*
+	 * Utility function:
+	 * comparison between two merging table elements
+	 */
+	
+	float_t DensA = ((merge_t*)A)->density;
+	float_t DensB = ((merge_t*)B)->density;
+
+	return - ( DensA > DensB) + (DensA < DensB);
+}
+
+
+static inline int is_a_merging(  float_t dens1, float_t dens1_err,
+								 float_t dens2, float_t dens2_err,
+								 float_t dens_border, float_t dens_border_err,
+								 float_t Z)
+{
+	/*
+	 * dens1	   : the density of the particle that is the center of the first cluster
+	 * dens2	   : the density of the particle that is the center of the second cluster
+	 * dens_border : the density of the border between cluster 1 and cluster 2
+	 * *_err       : the errors on the corresponding densities
+	 * Z     	   : the desired accuracy
+	 */
+
+	/* in the original code it was:
+	 *
+	 float_t a1 = dp_info[cluster->centers.data[i]].log_rho_c - border_density[i][j];
+	 float_t a2 = dp_info[cluster->centers.data[j]].log_rho_c - border_density[i][j];
+
+	 float_t e1 = Z*(dp_info[cluster->centers.data[i]].log_rho_err + border_err[i][j]);
+	 float_t e2 = Z*(dp_info[cluster->centers.data[j]].log_rho_err + border_err[i][j]);
+	 */
+
+	float_t a1 = dens1 - dens_border;
+	float_t a2 = dens2 - dens_border;
+
+	float_t e1 = Z*(dens1_err + dens_border_err);
+	float_t e2 = Z*(dens2_err + dens_border_err);
+
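+	/* the two clusters are merged if at least one of the two peaks does not
+	 * rise significantly (at confidence Z) above the border density */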
+	return (a1 < e1 || a2 < e2);
+}
+
+
+static inline int merging_roles(  float_t dens1, float_t dens1_err,
+								  float_t dens2, float_t dens2_err,
+								  float_t dens_border, float_t dens_border_err )
+{
+	/*
+	 * Utility function 
+	 * Retrieve if cluster 1 is merged into 2 or
+	 * vice versa
+	 */
+      
+	float_t c1 = (dens1 - dens_border) / (dens1_err + dens_border_err); 
+	float_t c2 = (dens2 - dens_border) / (dens2_err + dens_border_err);
+	//printf("%.10lf %.10lf %d\n",c1,c2, c1 > c2);
+
+	return ( c1 < c2 );     // if 1, this signals to swap 1 and 2
+}
+
+void fix_borders_A_into_B(idx_t A, idx_t B, border_t** borders, idx_t n)
+{
+	/*
+	 * Dense border implementation
+	 * - idx_t A 			: cluster A the one which goes into the other 
+	 * - idx_t B 			: cluster B the one that receives the merging
+	 * - border_t** borders : whole border matrix
+	 * - idx_t n 			: number of clusters
+	 *
+	 */
+
+	#pragma omp parallel for if(n > MAX_SERIAL_MERGING)
+	for(idx_t i = 0; i < n; ++i) 
+	{
+		if(borders[A][i].idx != NOBORDER )
+		{
+			if(borders[B][i].idx != NOBORDER)
+			{
+				int mb = (borders[A][i].density > borders[B][i].density); 
+
+				borders[B][i] = mb ? borders[A][i] : borders[B][i];
+				borders[i][B] = borders[B][i];
+			}
+			else
+			{
+				borders[B][i] = borders[A][i];
+				borders[i][B] = borders[B][i];
+			}
+		} 
+		borders[A][i] = border_null;
+		borders[i][A] = border_null;
+	}
+}
+
+static inline void delete_adj_list_element(clusters_t * c, const idx_t list_idx, const idx_t el)
+{
+	/*
+	 * Utility function
+	 * Deletes an element from an adjacency list
+	 * representing the borders in the cluster topology
+	 */
+
+	idx_t count = c -> sparse_borders[list_idx].count;
+	c -> sparse_borders[list_idx].data[el] = c -> sparse_borders[list_idx].data[count-1];
+	c -> sparse_borders[list_idx].data[count-1] = sparse_border_null;
+	c -> sparse_borders[list_idx].count -= 1;
+}
+
+void fix_sparse_borders_A_into_B(idx_t s,idx_t t, clusters_t* c)
+{
+	/*
+	 * Handle borders after two clusters are merged
+	 * - idx_t s 	 : source cluster, the one that has to be merged
+	 * - idx_t t 	 : target cluster, the one that receives the merge
+	 * - clusters* c : object containing all the data
+	 *
+	 * When s goes into t, all the clusters that had a border with s must now have
+	 * a border with t. If t already has such a border, the border with the higher
+	 * density is kept
+	 */
+	
+	{
+		{
+			for(idx_t el = 0; el < c -> sparse_borders[t].count; ++el)
+			{
+				sparse_border_t b = c -> sparse_borders[t].data[el];
+				if(b.i == t && b.j == s)
+				{
+					//delete the border src trg
+					delete_adj_list_element(c, t, el);
+				}
+			}
+		}
+		// find the borders of s: delete the ones towards t, re-insert the others in the correct place
+		for(idx_t el = 0; el < c -> sparse_borders[s].count; ++el)
+		{
+			sparse_border_t b = c -> sparse_borders[s].data[el];
+		//	idx_t ii = b.i;
+			if(b.j != t)
+			{
+				//insert these borders as trg -> j and j -> trg
+				b.i = t;
+				sparse_border_insert(c, b);
+				sparse_border_t bsym = b;
+				bsym.i = b.j;
+				bsym.j = b.i;
+				sparse_border_insert(c, bsym);
+				for(idx_t dl = 0; dl < c -> sparse_borders[b.j].count; ++dl)
+				{
+					sparse_border_t b_del = c -> sparse_borders[b.j].data[dl];
+					if(b_del.j == s)
+					{
+						//delete the border src trg
+						delete_adj_list_element(c, b.j, dl);
+					}
+				}
+						
+			}
+		}
+		//clean up all borders
+		//delete the src list
+		{
+			adj_list_reset((c->sparse_borders) + s);
+		}
+		//delete all borders containing src
+	//	for(idx_t i = 0; i < nclus; ++i)
+	//	{
+	//		for(idx_t el = 0; el < c -> sparse_borders[i].count; ++el)
+	//		{
+	//			sparse_border_t b = c -> sparse_borders[i].data[el];
+	//			if(b.j == s)
+	//			{
+	//				//delete the border src trg
+	//				delete_adj_list_element(c, i, el);
+	//			}
+	//		}
+	//			
+	//	}
+	}
+
+
+}
+
+void merge_A_into_B(idx_t* who_amI, idx_t cluster_A, idx_t cluster_B, idx_t n)
+{
+	/*
+	 * Utility function
+	 * Performs correction of the labels in the array that
+	 * keeps track of which cluster each cluster maps to after a merging
+	 */
+
+    #pragma omp parallel if(n > MAX_SERIAL_MERGING)
+    {
+	    idx_t tmp;
+	    #pragma omp for
+	    for(idx_t i = 0; i < n; ++i)
+	    {   
+			// substitute occurrences of cluster_A with cluster_B
+			tmp = who_amI[i] == cluster_A ? cluster_B : who_amI[i];
+			who_amI[i] = tmp;
+	    }
+    }
+    return;
+}
+
+void master_finds_borders(global_context_t* ctx, clusters_t* cluster, float_t Z, idx_t* surviving_clusters, datapoint_info_t* centers_dp)
+{
+    datapoint_info_t* dp_info   = ctx -> local_datapoints;
+	idx_t nclus                 = cluster -> centers.count;  
+
+	idx_t   merge_count         = 0;
+	idx_t   merging_table_size  = 1000;
+	merge_t *merging_table      = (merge_t*)malloc(sizeof(merge_t)*merging_table_size);
+  
+	/*
+	 * Find clusters to be merged
+	 * Loop over borders and find which ones will generate a merge,
+	 * store them later in the merging table
+	 **/
+
+
+    /* center density 
+     * need to retrieve center density */
+
+	for(idx_t i = 0; i < nclus - 1; ++i)   
+	{
+		idx_t count = cluster -> sparse_borders[i].count;
+		for(idx_t el = 0; el < count; ++el)   
+		{
+			sparse_border_t b = cluster -> sparse_borders[i].data[el];
+			if( b.j > b.i)
+			{
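+				/* borders are stored symmetrically as (i,j) and (j,i):
+				 * process each pair only once */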
+				float_t dens1           = centers_dp[b.i].log_rho_c;
+				float_t dens1_err       = centers_dp[b.i].log_rho_err;
+				float_t dens2           = centers_dp[b.j].log_rho_c;
+				float_t dens2_err       = centers_dp[b.j].log_rho_err;
+				float_t dens_border     = b.density;
+				float_t dens_border_err = b.error;
+
+				if ( is_a_merging( dens1, dens1_err, dens2, dens2_err, dens_border, dens_border_err, Z ) )
+				{
+					if ( merge_count == merging_table_size )
+					{
+						merging_table_size *= 1.1;
+						merging_table = (merge_t*)realloc( merging_table, sizeof(merge_t) * merging_table_size );
+					}
+
+					idx_t src = b.j;
+					idx_t trg = b.i;
+
+					merging_table[merge_count].source = src;
+					merging_table[merge_count].target = trg;
+					merging_table[merge_count].density = b.density;
+					++merge_count;
+				}
+			}
+		}
+	}
+
+	qsort( (void*)merging_table, merge_count, sizeof(merge_t), compare_merging_density);
+    
+    MPI_DB_PRINT("Found %lu merges\n", merge_count);
+
+    #ifdef WRITE_MERGING_TABLE
+        if(I_AM_MASTER)
+        {
+            printf("[MASTER] Writing merging table to bb/merging_table.csv\n");
+            FILE* f = fopen("bb/merging_table.csv", "w");
+            for(idx_t i = 0; i < merge_count; ++i)
+            {
+                merge_t b = merging_table[i];
+                fprintf(f, "%lu,%lu,%lf\n", b.source, b.target, b.density);
+            }
+            fclose(f);
+        }
+            
+    #endif
+
+    for( idx_t m = 0; m < merge_count; m++ )
+    {
+      
+        #define src surviving_clusters[merging_table[m].source]
+        #define trg surviving_clusters[merging_table[m].target]
+
+        //int re_check = ( (src != merging_table[m].source) || (trg != merging_table[m].target) );
+		//if(re_check)
+		{
+			/* 
+			 * Enforce that, in case of a symmetric merging condition, the lowest-idx
+			 * cluster is merged into the highest-idx cluster, only to preserve
+			 * compatibility with the original ADP implementation
+			 *
+			 * Process each element in the merging table
+			 */
+			idx_t new_src = (src < trg) ? src : trg;
+			idx_t new_trg = (src < trg) ? trg : src;
+
+			/*
+			 * pick who am I and retrieve all needed data from the 
+			 * border matrices
+			 */
+
+			float_t dens1           = centers_dp[new_src].log_rho_c;
+			float_t dens1_err       = centers_dp[new_src].log_rho_err;
+			float_t dens2           = centers_dp[new_trg].log_rho_c;
+			float_t dens2_err       = centers_dp[new_trg].log_rho_err;
+
+			//borders get
+			sparse_border_t b 	   	= sparse_border_get(cluster, new_src, new_trg);
+			float_t dens_border     = b.density;
+			float_t dens_border_err = b.error;
+
+			int i_have_to_merge = is_a_merging(dens1,dens1_err,dens2,dens2_err,dens_border,dens_border_err,Z);            
+			switch (i_have_to_merge && src != trg)
+			{
+				case 1:
+				{
+					int side = merging_roles(dens1,dens1_err,dens2,dens2_err,dens_border,dens_border_err);
+					if(!side)
+					{
+						idx_t tmp;
+						tmp = new_src;
+						new_src = new_trg;
+						new_trg = tmp;
+					}
+
+						
+					/* 
+					 * Perform the actual merging,
+					 * first  -> fix the borders, delete old ones and spawn new ones in the correct positions
+					 * second -> update the surviving_clusters buffer
+					 */
+					fix_sparse_borders_A_into_B(new_src, new_trg, cluster);
+					merge_A_into_B(surviving_clusters, new_src, new_trg, nclus );	  
+				}
+				break;
+			
+			default:
+				break;
+			}
+		}
+        
+        #undef src
+        #undef trg
+    }
+
+    free(merging_table);
+}
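+
+/*
+ * For reference, a sketch of the merge test used above, in the spirit of the
+ * ADP paper: a border removes the separation between two peaks when the
+ * density gap between a center and the saddle is not significant at level Z.
+ * This is an assumption-laden illustration, not the authoritative
+ * implementation, which is is_a_merging() defined elsewhere in this file.
+ */
+#if 0
+static int is_a_merging_sketch(float_t rho_center, float_t err_center,
+                               float_t rho_border, float_t err_border, float_t Z)
+{
+    /* merge when rho_center - rho_border < Z * (err_center + err_border) */
+    return (rho_center - rho_border) < Z * (err_center + err_border);
+}
+#endif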
+
+void master_fixes_border_matrix_and_centers(global_context_t* ctx, clusters_t* cluster, float_t Z, idx_t* old_to_new, idx_t* surviving_clusters, idx_t nclus)
+{
+    /*finalize clustering*/
+    /*actually copying surviving centers into compacted arrays*/
+    lu_dynamic_array_t tmp_centers;
+    lu_dynamic_array_t tmp_cluster_idx;
+
+
+    lu_dynamic_array_init(&tmp_centers);
+    lu_dynamic_array_init(&tmp_cluster_idx);
+
+    lu_dynamic_array_reserve(&tmp_centers, nclus);
+    lu_dynamic_array_reserve(&tmp_cluster_idx, nclus);
+
+    idx_t final_cluster_count = 0;
+
+    idx_t incremental_k = 0;
+    for(idx_t i = 0; i < nclus; ++i)
+    {
+        
+        if(surviving_clusters[i] == i){
+            lu_dynamic_array_pushBack(&tmp_centers, cluster->centers.data[i]);
+            lu_dynamic_array_pushBack(&tmp_cluster_idx, i);
+            old_to_new[i] = incremental_k;
+            ++incremental_k;
+            ++final_cluster_count;
+        }
+    }
+
+    //fill the rest of old_to_new
+    for(idx_t i = 0; i < nclus; ++i)
+    {
+        if(surviving_clusters[i] != i){
+            idx_t cidx_to_copy_from = surviving_clusters[i];
+            old_to_new[i] = old_to_new[cidx_to_copy_from];
+        }
+    }
+
+    /*allocate auxiliary pointers to store results of the finalization of the procedure*/
+
+    adj_list_t* tmp_borders = (adj_list_t*)malloc(final_cluster_count*sizeof(adj_list_t));
+
+    //initialize temporary borders
+    for(idx_t i = 0; i < final_cluster_count; ++i)
+    {
+	    tmp_borders[i].count = 0;
+	    tmp_borders[i].size  = PREALLOC_BORDERS;
+	    tmp_borders[i].data  = (sparse_border_t*)malloc(PREALLOC_BORDERS*sizeof(sparse_border_t));
+    }
+
+    #pragma omp parallel for
+    for(idx_t c = 0; c < final_cluster_count; ++c)
+    {
+        idx_t c_idx = tmp_cluster_idx.data[c];
+		for(idx_t el = 0; el < cluster -> sparse_borders[c_idx].count; ++el)
+		{
+			//retrieve border
+			sparse_border_t b = cluster -> sparse_borders[c_idx].data[el];
+			//change indexes of clusters to the compacted labels
+			b.i = old_to_new[b.i];
+			b.j = old_to_new[b.j];
+
+			adj_list_insert(tmp_borders + c, b);
+		}
+    }
+
+    clusters_reset(cluster);
+    /*pay attention to the defined borders*/
+    /*copy into members*/
+    cluster -> sparse_borders = tmp_borders;
+    cluster -> centers = tmp_centers;
+    free(tmp_cluster_idx.data);
+
+}
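+
+/*
+ * Sketch of the compaction performed above (values made up for illustration):
+ * with nclus = 5 and surviving_clusters = {0, 0, 2, 2, 4}, only clusters
+ * 0, 2 and 4 survive, so old_to_new becomes {0, 0, 1, 1, 2} and the final
+ * cluster count is 3; merged clusters inherit the compacted index of the
+ * cluster that absorbed them.
+ */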
+
+int compare_dp_by_cidx(const void* a, const void* b)
+{
+    int aa = ((datapoint_info_t*)a) -> cluster_idx;
+    int bb = ((datapoint_info_t*)b) -> cluster_idx;
+    return (aa > bb) - (aa < bb);
+}
+
+void Heuristic3(global_context_t* ctx, clusters_t* cluster, float_t Z, int halo)
+{
+
+    int verbose = 0;
+
+    /*
+     * Heuristic 3, from the paper of Errico, Facco, Laio & Rodriguez 
+     * ( https://doi.org/10.1016/j.ins.2021.01.010 )              
+	 * This implementation stores borders between clusters in sparse adjacency
+	 * lists, so memory scales with the number of borders rather than with the square of the number of clusters
+     *                                                            
+     * args:                                                      
+	 * - clusters* cluster 			: cluster object storing border info and cluster centers                 
+     * - datapoint_info* dp_info 	: array of datapoint structures                             
+	 * - float_t Z 					: parameter to assess density peak significance
+     * - halo 					    : flag to set if you also want to compute the halo points                               
+     */
+
+	#define borders cluster->borders
+
+
+	struct timespec start_tot, finish_tot;
+	double elapsed_tot;
+
+	struct timespec start, finish;
+	double elapsed;
+
+	clock_gettime(CLOCK_MONOTONIC, &start_tot);
+
+
+    datapoint_info_t* dp_info   = ctx -> local_datapoints;
+	idx_t  nclus                = cluster -> centers.count;  
+	idx_t* surviving_clusters   = (idx_t*)MY_MALLOC(nclus*sizeof(idx_t));
+    idx_t* old_to_new           = (idx_t*)MY_MALLOC(nclus*sizeof(idx_t));
+	for(idx_t i = 0; i < nclus; ++i)
+	{ 
+		surviving_clusters[i] = i; 
+	}
+
+    MPI_Win dp_info_win;
+    MPI_Win_create(ctx -> local_datapoints, ctx -> local_n_points * sizeof(datapoint_info_t), 1, MPI_INFO_NULL, ctx -> mpi_communicator, &dp_info_win);
+
+    MPI_Win_fence(0, dp_info_win);
+
+    if(I_AM_MASTER)
+    {
+        datapoint_info_t* centers_dp = (datapoint_info_t*)MY_MALLOC(cluster -> centers.count * sizeof(datapoint_info_t));
+        for(idx_t i = 0; i < cluster -> centers.count; ++i)
+        {
+            idx_t cidx = cluster -> centers.data[i];
+            
+            int owner = foreign_owner(ctx, cidx);
+            idx_t pos = cidx - ctx -> rank_idx_start[owner];
+            if(owner == ctx -> mpi_rank)
+            {
+                centers_dp[i] = ctx -> local_datapoints[pos];
+            }
+            else
+            {
+                MPI_Get(centers_dp + i, 
+                        sizeof(datapoint_info_t), 
+                        MPI_BYTE, 
+                        owner, 
+                        (MPI_Aint)(pos * sizeof(datapoint_info_t)), 
+                        sizeof(datapoint_info_t), 
+                        MPI_BYTE, 
+                        dp_info_win);
+            }
+        }
+        MPI_Win_fence(0, dp_info_win);
+
+        qsort(centers_dp, cluster -> centers.count, sizeof(datapoint_info_t), compare_dp_by_cidx);
+
+        printf("Centers\n");
+
+        master_finds_borders(ctx, cluster, Z, surviving_clusters, centers_dp);
+        master_fixes_border_matrix_and_centers(ctx, cluster, Z, old_to_new, surviving_clusters, nclus);
+        free(centers_dp);
+    }
+    else
+    {
+        MPI_Win_fence(0, dp_info_win);
+    }
+    MPI_Win_free(&dp_info_win);
+
+    /* at this point master has the final border matrix 
+     * with the final list of surviving clusters
+     */
+
+    /* copy centers */ 
+
+    if(!(I_AM_MASTER))
+    {
+        clusters_reset(cluster);
+    }
+
+    MPI_Bcast(&(cluster -> centers.count), 1, MPI_UINT64_T, 0, ctx -> mpi_communicator);
+    MPI_Bcast(&(cluster -> centers.size), 1, MPI_UINT64_T, 0, ctx -> mpi_communicator);
+
+    if(!(I_AM_MASTER))
+    {
+        cluster -> centers.data = (idx_t*)MY_MALLOC(cluster -> centers.size * sizeof(idx_t));
+    }
+
+    MPI_Bcast(cluster -> centers.data, cluster -> centers.size, MPI_UINT64_T, 0, ctx -> mpi_communicator);
+
+    /* copy borders */
+
+    idx_t final_cluster_count = cluster -> centers.count;
+
+    if(!(I_AM_MASTER))
+    {
+        cluster -> sparse_borders = (adj_list_t*)MY_MALLOC(final_cluster_count * sizeof(adj_list_t));
+    }
+
+    MPI_Bcast(cluster -> sparse_borders, final_cluster_count * sizeof(adj_list_t), MPI_BYTE, 0, ctx -> mpi_communicator);
+
+    for(int i = 0; i < final_cluster_count; ++i)
+    {
+        if(!(I_AM_MASTER))
+        {
+            cluster -> sparse_borders[i].data = (sparse_border_t*)MY_MALLOC(cluster -> sparse_borders[i].size * sizeof(sparse_border_t));
+        }
+        MPI_Bcast(cluster -> sparse_borders[i].data,
+                  cluster -> sparse_borders[i].size * sizeof(sparse_border_t),
+                  MPI_BYTE, 0, ctx -> mpi_communicator);
+    }
+
+
+    MPI_Bcast(surviving_clusters, nclus, MPI_UINT64_T, 0, ctx -> mpi_communicator);
+    MPI_Bcast(old_to_new, nclus, MPI_UINT64_T, 0, ctx -> mpi_communicator);
+
+    /*fix cluster assignment*/
+    #pragma omp parallel for
+    for(idx_t i = 0; i < ctx -> local_n_points; ++i)
+    {
+        dp_info[i].is_center = 0;
+        int old_cidx = dp_info[i].cluster_idx;
+        dp_info[i].cluster_idx = old_to_new[old_cidx];
+    }
+
+    //reset centers
+    for(idx_t i = 0; i < cluster -> centers.count; ++i)
+    {
+        idx_t idx = cluster -> centers.data[i];
+        int owner = foreign_owner(ctx, idx);
+        if(owner == ctx -> mpi_rank)
+        {
+            idx_t pos = idx - ctx -> idx_start;
+            dp_info[pos].is_center = 1;
+        }
+    }
+
+    /*Halo*/
+
+    switch (halo)
+    {
+		case 1:
+		{
+			float_t* max_border_den_array = (float_t*)malloc(final_cluster_count*sizeof(float_t));
+			#pragma omp parallel
+			{
+				#pragma omp for
+				for(idx_t c = 0; c < final_cluster_count; ++c)
+				{
+					float_t max_border_den = -2.;
+					for(idx_t el = 0; el < cluster -> sparse_borders[c].count; ++el)
+					{
+						sparse_border_t b = cluster -> sparse_borders[c].data[el];
+						if(b.density > max_border_den)
+						{
+							max_border_den = b.density;
+						}
+					}
+
+					max_border_den_array[c] = max_border_den;
+
+				}
+
+				#pragma omp barrier
+
+				#pragma omp for
+				for(idx_t i = 0; i < cluster -> n; ++i)
+				{
+					int cidx = dp_info[i].cluster_idx;
+					int halo_flag = dp_info[i].log_rho_c < max_border_den_array[cidx]; 
+					//int halo_flag = max_border_den_array[cidx] > dp_info[i].log_rho_c  ; 
+					dp_info[i].cluster_idx = halo_flag ? -1 : cidx;
+				}
+			}
+			free(max_border_den_array);
+		}
+			break;
+		
+		default:
+			break;
+    }    
+
+    MPI_DB_PRINT("--> final cluster count %lu\n", final_cluster_count);
+    free(surviving_clusters);
+    free(old_to_new);
+
+    /*free memory and put the correct arrays into place*/
+    //free(ipos.data);
+    //free(jpos.data);
+    //
+
+    #ifdef WRITE_FINAL_ASSIGNMENT
+        int* cl = (int*)MY_MALLOC(ctx -> local_n_points * sizeof(int));
+
+        for(int i = 0; i < ctx -> local_n_points; ++i) cl[i] = ctx -> local_datapoints[i].cluster_idx;
+
+        ordered_buffer_to_file(ctx, cl, sizeof(int), ctx -> local_n_points, "bb/final_assignment.npy");
+
+        free(cl);
+        
+    #endif
+
+  #undef  borders  
+}
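+
+/*
+ * Sketch of the halo rule applied above (illustration only, compiled out):
+ * a point is demoted to the halo of its cluster when its density lies below
+ * the highest border density of that cluster, and its cluster_idx is then
+ * set to -1.
+ */
+#if 0
+static int is_halo_point_sketch(float_t log_rho_c, float_t max_border_density)
+{
+    return log_rho_c < max_border_density;
+}
+#endif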
+
diff --git a/src/adp/adp.h b/src/adp/adp.h
new file mode 100644
index 0000000000000000000000000000000000000000..5da69a035bec46c0aaac8825e6f344a0cda335aa
--- /dev/null
+++ b/src/adp/adp.h
@@ -0,0 +1,58 @@
+#pragma once
+#include "../tree/tree.h"
+
+#define PREALLOC_BORDERS 100
+#define MAX_SERIAL_MERGING 100
+#define PRINT_H2_COMM_SCHEME
+
+typedef struct border_t 
+{
+    float_t density;
+    float_t error;
+    idx_t idx;
+} border_t;
+
+typedef struct sparse_border_t 
+{
+    idx_t i;
+    idx_t j;
+    idx_t idx;
+    float_t density;
+    float_t error;
+} sparse_border_t;
+
+typedef struct adj_list_t 
+{
+    idx_t count;
+    idx_t size;
+    struct sparse_border_t* data;
+} adj_list_t;
+
+
+typedef struct clusters_t 
+{
+    int use_sparse_borders;
+    struct adj_list_t *sparse_borders;
+    struct lu_dynamic_array_t centers;
+    struct border_t **borders;
+    struct border_t *__borders_data;
+    idx_t n;
+} clusters_t;
+
+typedef struct merge_t 
+{
+    idx_t source;
+    idx_t target;
+    float_t density;
+} merge_t;
+
+
+
+void compute_density_kstarnn_rma(global_context_t* ctx, const float_t d, int verbose);
+void compute_density_kstarnn_rma_v2(global_context_t* ctx, const float_t d, int verbose);
+float_t compute_ID_two_NN_ML(global_context_t* ctx, datapoint_info_t* dp_info, idx_t n, int verbose);
+void clusters_allocate(clusters_t * c, int s);
+
+clusters_t Heuristic1(global_context_t *ctx, int verbose);
+void Heuristic2(global_context_t* ctx, clusters_t* cluster);
+void Heuristic3(global_context_t* ctx, clusters_t* cluster, float_t Z, int halo);
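+
+/*
+ * Typical call order, as wired up in main.c; the parameter values are just the
+ * ones used there, not requirements:
+ *
+ *     float_t id = compute_ID_two_NN_ML(ctx, dp_info, ctx->local_n_points, MY_FALSE);
+ *     compute_density_kstarnn_rma_v2(ctx, id, MY_FALSE);
+ *     clusters_t clusters = Heuristic1(ctx, MY_FALSE);
+ *     clusters_allocate(&clusters, 1);
+ *     Heuristic2(ctx, &clusters);
+ *     Heuristic3(ctx, &clusters, 2.0, 0);    // Z = 2, halo computation off
+ */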
diff --git a/src/common/common.c b/src/common/common.c
index 2a437caffa5896e455f85dc169729fe6717a7e40..1dcfc06b8e0dc2d82af3adb8160e209d9c1950d4 100644
--- a/src/common/common.c
+++ b/src/common/common.c
@@ -202,3 +202,54 @@ void lu_dynamic_array_init(lu_dynamic_array_t * a)
     a -> size = 0;
 }
 
+void ordered_buffer_to_file(global_context_t* ctx, void* buffer, size_t el_size, uint64_t n, const char* fname)
+{
+    //MPI_Barrier(ctx -> mpi_communicator);
+    MPI_DB_PRINT("[MASTER] writing to file %s\n", fname);
+    void* tmp_data; 
+    int* ppp; 
+    int* displs;
+
+    MPI_Barrier(ctx -> mpi_communicator);
+    
+    uint64_t tot_n = 0;
+    MPI_Reduce(&n, &tot_n, 1, MPI_UINT64_T , MPI_SUM, 0, ctx -> mpi_communicator);
+
+    if(I_AM_MASTER) 
+    {
+        tmp_data = (void*)MY_MALLOC(el_size * tot_n );
+        ppp      = (int*)MY_MALLOC(ctx -> world_size * sizeof(int));
+        displs   = (int*)MY_MALLOC(ctx -> world_size * sizeof(int));
+
+    }
+    
+    int nn = (int)n;
+    MPI_Gather(&nn, 1, MPI_INT, ppp, 1, MPI_INT, 0, ctx -> mpi_communicator);
+
+    if(I_AM_MASTER)
+    {
+        displs[0] = 0;
+        for(int i = 0; i < ctx -> world_size; ++i) ppp[i]    = el_size  * ppp[i];
+        for(int i = 1; i < ctx -> world_size; ++i) displs[i] = displs[i - 1] + ppp[i - 1];
+            
+    }
+
+    MPI_Gatherv(buffer, (int)(el_size * n), 
+            MPI_BYTE, tmp_data, ppp, displs, MPI_BYTE, 0, ctx -> mpi_communicator);
+
+    if(I_AM_MASTER)
+    {
+        FILE* file = fopen(fname,"w");
+        if(!file)
+        {
+            printf("Cannot open file %s ! Aborting \n", fname);
+            exit(1);
+        }
+        fwrite(tmp_data, 1, el_size * tot_n, file);
+        fclose(file);
+        free(tmp_data);
+        free(ppp);
+        free(displs);
+
+    }
+    MPI_Barrier(ctx -> mpi_communicator);
+}
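+
+/*
+ * Usage sketch for ordered_buffer_to_file, mirroring the WRITE_FINAL_ASSIGNMENT
+ * block in adp.c: contributions are gathered rank by rank, so the output file
+ * is ordered by MPI rank.
+ *
+ *     int* cl = (int*)MY_MALLOC(ctx -> local_n_points * sizeof(int));
+ *     for(int i = 0; i < ctx -> local_n_points; ++i)
+ *         cl[i] = ctx -> local_datapoints[i].cluster_idx;
+ *     ordered_buffer_to_file(ctx, cl, sizeof(int), ctx -> local_n_points, "bb/final_assignment.npy");
+ *     free(cl);
+ */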
diff --git a/src/common/common.h b/src/common/common.h
index 3bd19562a400f2111b25cc10ec8a08c872d6c6dd..6fa6856975b5249e2e0c83eb3a870496824d9f04 100644
--- a/src/common/common.h
+++ b/src/common/common.h
@@ -29,6 +29,15 @@ typedef struct datapoint_info_t {
 	#define float_t double
 #endif
 
+#ifdef USE_FLOAT32
+#define MPI_MY_FLOAT MPI_FLOAT
+#else
+#define MPI_MY_FLOAT MPI_DOUBLE
+#endif
+
+
+#define I_AM_MASTER (ctx->mpi_rank == 0)
+
 
 #define MY_TRUE  1
 #define MY_FALSE 0
@@ -173,4 +182,4 @@ void lu_dynamic_array_reserve(lu_dynamic_array_t * a, idx_t n);
 void lu_dynamic_array_init(lu_dynamic_array_t * a);
 void print_error_code(int err);
 
-
+void ordered_data_to_file(global_context_t* ctx);
+void ordered_buffer_to_file(global_context_t* ctx, void* buffer, size_t el_size, uint64_t n, const char* fname);
diff --git a/src/main/main.c b/src/main/main.c
index 0a698424c28fea2b0e5f13228544e09d1fe885e2..243683164014e9f6a22ea7ce4e51d6a32aad8114 100644
--- a/src/main/main.c
+++ b/src/main/main.c
@@ -3,6 +3,7 @@
 #include <stdio.h>
 #include "../common/common.h"
 #include "../tree/tree.h"
+#include "../adp/adp.h"
 
 //
 
@@ -89,3 +90,206 @@ int main(int argc, char** argv) {
 
 
 
+void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx) 
+{
+    float_t *data;
+    TIME_DEF
+    double elapsed_time;
+
+    if (ctx->mpi_rank == 0) 
+    {
+        //data = read_data_file(ctx, "../norm_data/50_blobs_more_var.npy", MY_TRUE);
+        //ctx->dims = 2;
+        //data = read_data_file(ctx, "../norm_data/50_blobs.npy", MY_TRUE);
+        // std_g0163178_Me14_091_0000
+    
+        // 190M points
+        // std_g2980844_091_0000
+        // data = read_data_file(ctx,"../norm_data/std_g2980844_091_0000",MY_TRUE);
+        
+        /* 1M points ca.*/
+        data = read_data_file(ctx,"../norm_data/std_LR_091_0001",MY_TRUE);
+
+        /* BOX */
+        // data = read_data_file(ctx,"../norm_data/std_Box_256_30_092_0000",MY_TRUE);
+
+        /* 8M points */
+        
+        // data = read_data_file(ctx,"../norm_data/std_g0144846_Me14_091_0001",MY_TRUE);
+
+        //88M 
+        //data = read_data_file(ctx,"../norm_data/std_g5503149_091_0000",MY_TRUE);
+
+        //
+        //34 M
+        //data = read_data_file(ctx,"../norm_data/std_g1212639_091_0001",MY_TRUE);
+        ctx->dims = 5;
+
+        // ctx -> n_points = 5 * 100000;
+        ctx->n_points = ctx->n_points / ctx->dims;
+        //ctx->n_points = (ctx->n_points * 5) / 10;
+        // ctx -> n_points = ctx -> world_size * 1000;
+
+        //ctx -> n_points = 10000000 * ctx -> world_size;
+        //generate_random_matrix(&data, ctx -> dims, ctx -> n_points, ctx);
+        //mpi_printf(ctx, "Read %lu points in %u dims\n", ctx->n_points, ctx->dims);
+    }
+    //MPI_DB_PRINT("[MASTER] Reading file and scattering\n");
+    
+    /* communicate the total number of points*/
+    MPI_Bcast(&(ctx->dims), 1, MPI_UINT32_T, 0, ctx->mpi_communicator);
+    MPI_Bcast(&(ctx->n_points), 1, MPI_UINT64_T, 0, ctx->mpi_communicator);
+
+    /* compute the number of elements to receive for each processor */
+    int *send_counts = (int *)MY_MALLOC(ctx->world_size * sizeof(int));
+    int *displacements = (int *)MY_MALLOC(ctx->world_size * sizeof(int));
+
+    displacements[0] = 0;
+    send_counts[0] = ctx->n_points / ctx->world_size;
+    send_counts[0] += (ctx->n_points % ctx->world_size) > 0 ? 1 : 0;
+    send_counts[0] = send_counts[0] * ctx->dims;
+
+    for (int p = 1; p < ctx->world_size; ++p) 
+    {
+        send_counts[p] = (ctx->n_points / ctx->world_size);
+        send_counts[p] += (ctx->n_points % ctx->world_size) > p ? 1 : 0;
+        send_counts[p] = send_counts[p] * ctx->dims;
+        displacements[p] = displacements[p - 1] + send_counts[p - 1];
+    }
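+
+    /*
+     * Worked example of the split above (sizes made up for illustration): with
+     * n_points = 10, world_size = 4 and dims = 5, the remainder 10 % 4 = 2 is
+     * spread over the first two ranks, so points per rank are {3, 3, 2, 2},
+     * send_counts (in floats) are {15, 15, 10, 10} and displacements are
+     * {0, 15, 30, 40}.
+     */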
+
+
+    ctx->local_n_points = send_counts[ctx->mpi_rank] / ctx->dims;
+
+    float_t *pvt_data = (float_t *)MY_MALLOC(send_counts[ctx->mpi_rank] * sizeof(float_t));
+
+    MPI_Scatterv(data, send_counts, displacements, MPI_MY_FLOAT, pvt_data, send_counts[ctx->mpi_rank], MPI_MY_FLOAT, 0, ctx->mpi_communicator);
+
+    ctx->local_data = pvt_data;
+
+    int k_local = 20;
+    int k_global = 20;
+
+    uint64_t *global_bin_counts_int = (uint64_t *)MY_MALLOC(k_global * sizeof(uint64_t));
+
+    pointset_t original_ps;
+    original_ps.data = ctx->local_data;
+    original_ps.dims = ctx->dims;
+    original_ps.n_points = ctx->local_n_points;
+    original_ps.lb_box = (float_t*)MY_MALLOC(ctx -> dims * sizeof(float_t));
+    original_ps.ub_box = (float_t*)MY_MALLOC(ctx -> dims * sizeof(float_t));
+
+    float_t tol = 0.002;
+
+    top_kdtree_t tree;
+    TIME_START;
+    top_tree_init(ctx, &tree);
+    elapsed_time = TIME_STOP;
+    LOG_WRITE("Initializing global kdtree", elapsed_time);
+
+    TIME_START;
+    build_top_kdtree(ctx, &original_ps, &tree, k_global, tol);
+    exchange_points(ctx, &tree);
+    elapsed_time = TIME_STOP;
+    LOG_WRITE("Top kdtree build and domain decomposition", elapsed_time);
+    //test_the_idea(ctx);
+
+    TIME_START;
+    kdtree_v2 local_tree;
+    kdtree_v2_init( &local_tree, ctx -> local_data, ctx -> local_n_points, (unsigned int)ctx -> dims);
+    int k = 300;
+    //int k = 30;
+
+    datapoint_info_t* dp_info = (datapoint_info_t*)MY_MALLOC(ctx -> local_n_points * sizeof(datapoint_info_t));            
+    /* initialize, to cope with valgrind */
+    for(uint64_t i = 0; i < ctx -> local_n_points; ++i)
+    {
+        dp_info[i].ngbh.data = NULL;
+        dp_info[i].ngbh.N = 0;
+        dp_info[i].ngbh.count = 0;
+        dp_info[i].g = 0.f;
+        dp_info[i].log_rho = 0.f;
+        dp_info[i].log_rho_c = 0.f;
+        dp_info[i].log_rho_err = 0.f;
+        dp_info[i].array_idx = -1;
+        dp_info[i].kstar = -1;
+        dp_info[i].is_center = -1;
+        dp_info[i].cluster_idx = -1;
+    }
+
+    build_local_tree(ctx, &local_tree);
+    elapsed_time = TIME_STOP;
+    LOG_WRITE("Local trees init and build", elapsed_time);
+
+    TIME_START;
+    MPI_DB_PRINT("----- Performing ngbh search -----\n");
+    MPI_Barrier(ctx -> mpi_communicator);
+
+    mpi_ngbh_search(ctx, dp_info, &tree, &local_tree, ctx -> local_data, k);
+
+    MPI_Barrier(ctx -> mpi_communicator);
+    elapsed_time = TIME_STOP;
+    LOG_WRITE("Total time for all knn search", elapsed_time)
+
+
+
+    TIME_START;
+    //float_t id = id_estimate(ctx, dp_info, ctx -> local_n_points, 0.9, MY_FALSE);
+    float_t id = compute_ID_two_NN_ML(ctx, dp_info, ctx -> local_n_points, MY_FALSE);
+    elapsed_time = TIME_STOP;
+    LOG_WRITE("ID estimate", elapsed_time)
+
+    MPI_DB_PRINT("ID %lf \n",id);
+
+    TIME_START;
+
+    float_t  z = 2;
+
+    ctx -> local_datapoints = dp_info;
+    //compute_density_kstarnn_rma(ctx, id, MY_FALSE);
+    compute_density_kstarnn_rma_v2(ctx, id, MY_FALSE);
+    compute_correction(ctx, z);
+    elapsed_time = TIME_STOP;
+    LOG_WRITE("Density estimate", elapsed_time)
+
+    TIME_START;
+    clusters_t clusters = Heuristic1(ctx, MY_FALSE);
+    elapsed_time = TIME_STOP;
+    LOG_WRITE("H1", elapsed_time)
+
+    TIME_START;
+    clusters_allocate(&clusters, 1);
+    Heuristic2(ctx, &clusters);
+    elapsed_time = TIME_STOP;
+    LOG_WRITE("H2", elapsed_time)
+
+
+    TIME_START;
+    int halo = 0;
+    Heuristic3(ctx, &clusters, z, halo);
+    elapsed_time = TIME_STOP;
+    LOG_WRITE("H3", elapsed_time)
+
+
+    /* find density */ 
+    #if defined (WRITE_NGBH)
+        ordered_data_to_file(ctx);
+    #endif
+
+    /*
+    free(foreign_dp_info);
+    */
+    
+    
+    top_tree_free(ctx, &tree);
+    kdtree_v2_free(&local_tree);
+
+    free(send_counts);
+    free(displacements);
+    //free(dp_info);
+
+    if (ctx->mpi_rank == 0) free(data);
+
+    original_ps.data = NULL;
+    free_pointset(&original_ps);
+    free(global_bin_counts_int);
+}
diff --git a/src/tree/heap.h b/src/tree/heap.h
index 513170efc9b5ca225934dc444de99e409f09d809..84fc8f49da59904ed862efeed36d6ae7471f5406 100644
--- a/src/tree/heap.h
+++ b/src/tree/heap.h
@@ -8,7 +8,7 @@
 #define T double
 #define DATA_DIMS 0 
 
-#define HERE printf("%d reached line %d\n", ctx -> mpi_rank, __LINE__);
+#define HERE { printf("%d reached line %d\n", ctx -> mpi_rank, __LINE__); MPI_Barrier(ctx -> mpi_communicator); }
 
 #ifdef USE_FLOAT32
     #define FLOAT_TYPE float
diff --git a/src/tree/tree.c b/src/tree/tree.c
index 167791e35541cce1c6dafcdf7137a8a0c0883130..2ecac4eb5ad28f94170771af0803967f1bd73ead 100644
--- a/src/tree/tree.c
+++ b/src/tree/tree.c
@@ -21,9 +21,12 @@
 
 //#define WRITE_NGBH
 //#define WRITE_TOP_NODES
-//#define WRITE_DENSITY
-#define WRITE_CLUSTER_ASSIGN_H1
-#define WRITE_BORDERS
+#define WRITE_DENSITY
+//#define WRITE_CLUSTER_ASSIGN_H1
+//#define WRITE_BORDERS
+//#define WRITE_MERGING_TABLE
+#define WRITE_FINAL_ASSIGNMENT
+
 
 /* 
  * Maximum bytes to send with a single mpi send/recv, used 
@@ -36,24 +39,12 @@
 /* Used slices of 10 mb ? Really good? Maybe at the cause of TID thing */
 #define MAX_MSG_SIZE 1000000000 
 
-#ifdef USE_FLOAT32
-#define MPI_MY_FLOAT MPI_FLOAT
-#else
-#define MPI_MY_FLOAT MPI_DOUBLE
-#endif
-
-
-#define I_AM_MASTER ctx->mpi_rank == 0
 
 #define TOP_TREE_RCH 1
 #define TOP_TREE_LCH 0
 #define NO_CHILD -1
 
 unsigned int data_dims;
-const border_t border_null = {.density = -1.0, .error = 0, .idx = NOBORDER};
-const sparse_border_t sparse_border_null = {.density = -1.0, .error = 0, .idx = NOBORDER, .i = NOBORDER, .j = NOBORDER};
-
-#define PREALLOC_BORDERS 100
 
 
 int cmp_float_t(const void* a, const void* b)
@@ -64,7 +55,8 @@ int cmp_float_t(const void* a, const void* b)
 }
 
 
-float_t *read_data_file(global_context_t *ctx, const char *fname,
+
+float_t* read_data_file(global_context_t *ctx, const char *fname,
                         const int file_in_float32) 
 {
 
@@ -1841,6 +1833,7 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre
 
     req_array = (MPI_Request*)MY_MALLOC(req_num * sizeof(MPI_Request));
 
+
     for(int i = 0; i < ctx -> world_size; ++i)
     {
         already_sent_points[i] = 0;
@@ -1849,6 +1842,7 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre
 
     int req_idx = 0;
 
+
     for(int i = 0; i < ctx -> world_size; ++i)
     {
         int count = 0;
@@ -1866,11 +1860,11 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre
             }
         }
     }
+
     
     MPI_Barrier(ctx -> mpi_communicator);
     MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, ctx -> mpi_communicator, &flag, &status);
     //DB_PRINT("%d %p %p\n",ctx -> mpi_rank, &flag, &status);
-    //HERE
     while(flag)
     {
         MPI_Request request;
@@ -1987,34 +1981,6 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre
 
 }
 
-void test_the_idea(global_context_t* ctx)
-{
-    int* num = (int*)MY_MALLOC(ctx -> world_size * sizeof(int));
-    for(int i = 0; i < ctx -> world_size; ++i) num[i] = i;
-
-    for(int i = 0; i < ctx -> world_size; ++i)
-    {
-        MPI_Request request;
-        MPI_Isend(num + i, 1, MPI_INT, i, 0, ctx -> mpi_communicator, &request);
-    }
-    MPI_Barrier(ctx -> mpi_communicator);
-    MPI_Status status;
-    int flag;
-    int cc = 0;
-    MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_SOURCE, ctx -> mpi_communicator, &flag, &status);
-    while(flag)
-    {
-        cc++;
-        MPI_Request request;
-        MPI_Recv(num + status.MPI_SOURCE, 1, MPI_INT, status.MPI_SOURCE, MPI_ANY_TAG, ctx -> mpi_communicator, &status);
-        MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_SOURCE, ctx -> mpi_communicator, &flag, &status);
-
-    }
-    MPI_DB_PRINT("Recieved %d msgs\n",cc);
-    free(num);
-
-}
-
 void build_local_tree(global_context_t* ctx, kdtree_v2* local_tree)
 {
     local_tree -> root = build_tree_kdtree_v2(local_tree -> _nodes, local_tree -> n_nodes, ctx -> dims);
@@ -2061,59 +2027,8 @@ void ordered_data_to_file(global_context_t* ctx)
     MPI_Barrier(ctx -> mpi_communicator);
 }
 
-void ordered_buffer_to_file(global_context_t* ctx, void* buffer, size_t el_size, uint64_t n, const char* fname)
-{
-    //MPI_Barrier(ctx -> mpi_communicator);
-    MPI_DB_PRINT("[MASTER] writing to file %s\n", fname);
-    void* tmp_data; 
-    int* ppp; 
-    int* displs;
-
-    MPI_Barrier(ctx -> mpi_communicator);
-    
-    uint64_t tot_n = 0;
-    MPI_Reduce(&n, &tot_n, 1, MPI_UINT64_T , MPI_SUM, 0, ctx -> mpi_communicator);
-
-    if(I_AM_MASTER) 
-    {
-        tmp_data = (void*)MY_MALLOC(el_size * tot_n );
-        ppp      = (int*)MY_MALLOC(ctx -> world_size * sizeof(int));
-        displs   = (int*)MY_MALLOC(ctx -> world_size * sizeof(int));
-
-    }
-    
-    int nn = (int)n;
-    MPI_Gather(&nn, 1, MPI_INT, ppp, 1, MPI_INT, 0, ctx -> mpi_communicator);
-
-    if(I_AM_MASTER)
-    {
-        displs[0] = 0;
-        for(int i = 0; i < ctx -> world_size; ++i) ppp[i]    = el_size  * ppp[i];
-        for(int i = 1; i < ctx -> world_size; ++i) displs[i] = displs[i - 1] + ppp[i - 1];
-            
-    }
-
-    MPI_Gatherv(buffer, (int)(el_size * n), 
-            MPI_BYTE, tmp_data, ppp, displs, MPI_BYTE, 0, ctx -> mpi_communicator);
 
-    if(I_AM_MASTER)
-    {
-        FILE* file = fopen(fname,"w");
-        if(!file)
-        {
-            printf("Cannot open file %s ! Aborting \n", fname);
-        }
-        fwrite(tmp_data, 1, el_size * tot_n, file);
-        fclose(file);
-        free(tmp_data);
-        free(ppp);
-        free(displs);
-
-    }
-    MPI_Barrier(ctx -> mpi_communicator);
-}
-
-static inline int foreign_owner(global_context_t* ctx, idx_t idx)
+int foreign_owner(global_context_t* ctx, idx_t idx)
 {
     int owner = ctx -> mpi_rank;
     if( idx >= ctx -> idx_start && idx < ctx -> idx_start + ctx -> local_n_points) 
@@ -2129,2822 +2044,13 @@ static inline int foreign_owner(global_context_t* ctx, idx_t idx)
     return owner;
 }
 
-static inline void append_foreign_idx_list(idx_t element, int owner, int* counts, idx_t** lists)
-{
-
-
-    /* find the plausible place */
-    int idx_to_insert;
-
-    #pragma omp atomic capture
-    idx_to_insert = counts[owner]++; 
-    
-    lists[owner][idx_to_insert] = element;
-}
-
-int cmp_idx(const void* a, const void* b)
-{
-    idx_t aa = *((idx_t*)a);
-    idx_t bb = *((idx_t*)b);
-    return (aa > bb) - (aa < bb);
-}
-
-void find_foreign_nodes(global_context_t* ctx, datapoint_info_t* dp, datapoint_info_t** foreign_dp)
-{
-    int k = dp[0].ngbh.count;
-    
-    idx_t** array_indexes_to_request = (idx_t**)MY_MALLOC(ctx -> world_size * sizeof(idx_t*));
-    int*    count_to_request         = (int*)MY_MALLOC(ctx -> world_size * sizeof(int));
-    int*    capacities               = (int*)MY_MALLOC(ctx -> world_size * sizeof(int));
-
-
-    for(int i = 0; i < ctx -> world_size; ++i)
-    {
-        capacities[i] = 0;
-        count_to_request[i] = 0;
-    }
-
-    /* count them */
-    
-    #pragma omp parallel for
-    for(uint32_t i = 0; i < ctx -> local_n_points; ++i)
-    {
-        for(int j = 0; j < k; ++j)
-        {
-            idx_t element = dp[i].ngbh.data[j].array_idx;        
-            int owner = foreign_owner(ctx, element);
-            //DB_PRINT("%lu %d\n", element, owner);
-            
-            if(owner != ctx -> mpi_rank)
-            {
-                #pragma  omp atomic update
-                capacities[owner]++;
-            }
-        }
-    }
-
-    /* alloc */
-
-    for(int i = 0; i < ctx -> world_size; ++i)
-    {
-        array_indexes_to_request[i] = (idx_t*)MY_MALLOC(capacities[i] * sizeof(idx_t));
-    }
-
-    /* append them */
-    
-    #pragma omp parallel for
-    for(uint32_t i = 0; i < ctx -> local_n_points; ++i)
-    {
-        for(int j = 0; j < k; ++j)
-        {
-            idx_t element = dp[i].ngbh.data[j].array_idx;        
-            int owner = foreign_owner(ctx, element);
-            //DB_PRINT("%lu %d\n", element, owner);
-            
-            if(owner != ctx -> mpi_rank)
-            {
-                append_foreign_idx_list(element, owner, count_to_request, array_indexes_to_request); 
-            }
-        }
-    }
-
-    /* prune them */
-    int* unique_count = (int*)MY_MALLOC(ctx -> world_size * sizeof(int));
-
-    /*
-    if(I_AM_MASTER)
-    {
-        FILE* f = fopen("uniq","w");
-        fwrite(array_indexes_to_request[1], sizeof(idx_t), capacities[1],f);
-        fclose(f);
-    }
-    */
-
-    #pragma omp paralell for
-    for(int i = 0; i < ctx -> world_size; ++i)
-    {
-       unique_count[i] = capacities[i] > 0; //init unique count 
-       qsort(array_indexes_to_request[i], capacities[i], sizeof(idx_t), cmp_idx); 
-       uint32_t prev = array_indexes_to_request[i][0];
-       for(int el = 1; el < capacities[i]; ++el)
-       {
-            int flag = prev == array_indexes_to_request[i][el];
-            if(!flag)
-            {
-                /* in place subsitution 
-                 * if a different element is found then 
-                 * copy in at the next free place
-                 * */
-                array_indexes_to_request[i][unique_count[i]] = array_indexes_to_request[i][el];
-                unique_count[i]++;
-            }
-            prev = array_indexes_to_request[i][el];
-       }
-    }
-
-    for(int i = 0; i < ctx -> world_size; ++i)
-    {
-        array_indexes_to_request[i] = (idx_t*)realloc(array_indexes_to_request[i], unique_count[i] * sizeof(idx_t));
-    }
-
-    for(int i = 0; i < ctx -> world_size; ++i)
-    {
-        foreign_dp[i] = (datapoint_info_t*)calloc(sizeof(datapoint_info_t), unique_count[i]);
-    }
-    
-    /* alias for unique counts */
-    int* n_heap_to_recv = unique_count;
-    int* n_heap_to_send = (int*)MY_MALLOC(ctx -> world_size * sizeof(int));
-
-    /* exchange how many to recv how many to send */
-
-    MPI_Alltoall(n_heap_to_recv, 1, MPI_INT, n_heap_to_send, 1, MPI_INT , ctx -> mpi_communicator);
 
-    /* compute displacements and yada yada */
-    int* sdispls = (int*)calloc(ctx -> world_size , sizeof(int));
-    int* rdispls = (int*)calloc(ctx -> world_size , sizeof(int));
-    
-    int tot_count_send = n_heap_to_send[0];
-    int tot_count_recv = n_heap_to_recv[0];
-    for(int i = 1; i < ctx -> world_size; ++i)
-    {
-        sdispls[i] = sdispls[i - 1] + n_heap_to_send[i - 1];
-        rdispls[i] = rdispls[i - 1] + n_heap_to_recv[i - 1];
-
-        tot_count_send += n_heap_to_send[i];
-        tot_count_recv += n_heap_to_recv[i];
-    }
-    idx_t* idx_buffer_to_send = (idx_t*)MY_MALLOC(tot_count_send * sizeof(idx_t));
-    idx_t* idx_buffer_to_recv = (idx_t*)MY_MALLOC(tot_count_recv * sizeof(idx_t));
-    
-    /* copy indexes on send buffer */
-    for(int i = 0; i < ctx -> world_size; ++i)
-    {
-        memcpy(idx_buffer_to_recv + rdispls[i], array_indexes_to_request[i], n_heap_to_recv[i] * sizeof(idx_t));
-    }
-    
-    MPI_Alltoallv(idx_buffer_to_recv, n_heap_to_recv, rdispls, MPI_UINT64_T, idx_buffer_to_send, n_heap_to_send, sdispls, MPI_UINT64_T, ctx -> mpi_communicator);
-
-
-    /* allocate foreign dp */ 
-    heap_node* heap_buffer_to_send = (heap_node*)MY_MALLOC(tot_count_send * k * sizeof(heap_node));
-    heap_node* heap_buffer_to_recv = (heap_node*)MY_MALLOC(tot_count_recv * k * sizeof(heap_node));
-
-    if(!heap_buffer_to_send)
-    {
-        printf("Rank %d cannot allocate heap to send aborting\n",ctx -> mpi_rank);
-        printf("Required %d for %d bytes\n", tot_count_send, tot_count_send * 16);
-        exit(1);
-    }
-
-    if(!heap_buffer_to_recv)
-    {
-        printf("Rank %d cannot allocate heap to recieve aborting\n",ctx -> mpi_rank);
-        printf("Required %d for %d bytes\n", tot_count_recv, tot_count_recv * 16);
-        exit(1);
-    }
-
-    for(int i = 0; i < tot_count_send; ++i)
-    {
-        idx_t idx = idx_buffer_to_send[i] - ctx -> idx_start;
-        memcpy(heap_buffer_to_send + i * k, dp[idx].ngbh.data, k * sizeof(heap_node));
-    }
-    /* exchange heaps */
-
-
-    MPI_Barrier(ctx -> mpi_communicator);
-    int default_msg_len = MAX_MSG_SIZE / (k * sizeof(heap_node));
-
-    int* already_sent_points = (int*)MY_MALLOC(ctx -> world_size * sizeof(int));
-    int* already_rcvd_points = (int*)MY_MALLOC(ctx -> world_size * sizeof(int));
-
-    /* allocate a request array to keep track of all requests going out*/
-    MPI_Request* req_array;
-    int req_num = 0;
-    for(int i = 0; i < ctx -> world_size; ++i)
-    {
-        req_num += n_heap_to_send[i] > 0 ? n_heap_to_send[i]/default_msg_len + 1 : 0;
-    }
-
-    req_array = (MPI_Request*)MY_MALLOC(req_num * sizeof(MPI_Request));
-
-    for(int i = 0; i < ctx -> world_size; ++i)
-    {
-        already_sent_points[i] = 0;
-        already_rcvd_points[i] = 0;
-    }
-
-    int req_idx = 0;
-    
-    MPI_Datatype MPI_my_heap;
-    MPI_Type_contiguous(k * sizeof(heap_node), MPI_BYTE, &MPI_my_heap);
-    MPI_Barrier(ctx -> mpi_communicator);
-    MPI_Type_commit(&MPI_my_heap);
-
-    for(int i = 0; i < ctx -> world_size; ++i)
-    {
-        int count = 0;
-        if(n_heap_to_send[i] > 0)
-        {
-            while(already_sent_points[i] < n_heap_to_send[i])
-            {
-                MPI_Request request;
-                count = MIN(default_msg_len, n_heap_to_send[i] - already_sent_points[i] );
-                MPI_Isend(  heap_buffer_to_send + k * (already_sent_points[i] + sdispls[i]), count,  
-                        MPI_my_heap, i, 0, ctx -> mpi_communicator, &request);
-                already_sent_points[i] += count;
-                req_array[req_idx] = request;
-                ++req_idx;
-            }
-        }
-    }
-    int flag; 
-    MPI_Status status;
-    MPI_Barrier(ctx -> mpi_communicator);
-    MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, ctx -> mpi_communicator, &flag, &status);
-    //DB_PRINT("%d %p %p\n",ctx -> mpi_rank, &flag, &status);
-    while(flag)
-    {
-        MPI_Request request;
-        int count; 
-        int source = status.MPI_SOURCE;
-        MPI_Get_count(&status, MPI_my_heap, &count);
-        /* recieve each slice */
-
-        MPI_Recv(heap_buffer_to_recv + k * (already_rcvd_points[source] + rdispls[source]), 
-                count, MPI_my_heap, source, MPI_ANY_TAG, ctx -> mpi_communicator, &status);
-
-        already_rcvd_points[source] += count;
-        MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, ctx -> mpi_communicator, &flag, &status);
-
-    }
-    MPI_Barrier(ctx -> mpi_communicator);
-
-    MPI_Type_free(&MPI_my_heap);
-
-    MPI_Testall(req_num, req_array, &flag, MPI_STATUSES_IGNORE);
-
-    if(flag == 0)
-    {
-        DB_PRINT("[!!!] Rank %d has unfinished communications\n", ctx -> mpi_rank);
-        exit(1);
-    }
-    free(req_array);
-    free(already_sent_points);
-    free(already_rcvd_points);
-
-
-    /* copy results where needed */
-    for(int i = 0; i < ctx -> world_size; ++i)
-    {
-        for(int j = 0; j < n_heap_to_recv[i]; ++j)
-        {
-            /*
-            foreign_dp[i][j].array_idx = array_indexes_to_request[i][j];
-            init_heap(&(foreign_dp[i][j].ngbh));
-            allocate_heap(&(foreign_dp[i][j].ngbh), k);
-            foreign_dp[i][j].ngbh.N     = k;
-            foreign_dp[i][j].ngbh.count = k;
-            memcpy(foreign_dp[i][j].ngbh.data, heap_buffer_to_recv + k * (j + rdispls[i]), k * sizeof(heap_node));
-            */
-
-            foreign_dp[i][j].cluster_idx = -1;
-
-            foreign_dp[i][j].array_idx = array_indexes_to_request[i][j];
-            //init_heap(&(foreign_dp[i][j].ngbh));
-            foreign_dp[i][j].ngbh.N     = k;
-            foreign_dp[i][j].ngbh.count = k;
-            foreign_dp[i][j].ngbh.data =  heap_buffer_to_recv + k * (j + rdispls[i]);
-
-            if(foreign_dp[i][j].ngbh.data[0].array_idx != array_indexes_to_request[i][j])
-            {
-                printf("Error on %lu\n",array_indexes_to_request[i][j]);
-            }
-        }
-    }
-    
 
-    /* put back indexes in the context */
 
-    ctx -> idx_halo_points_recv = array_indexes_to_request; 
-    ctx -> n_halo_points_recv   = n_heap_to_recv;
 
-    ctx -> n_halo_points_send = n_heap_to_send;
-    ctx -> idx_halo_points_send = (idx_t**)MY_MALLOC(ctx -> world_size * sizeof(idx_t*));
-    for(int i = 0; i < ctx -> world_size; ++i) ctx -> idx_halo_points_send[i] = NULL;
 
-    for(int i = 0; i < ctx -> world_size; ++i)
-    {
-        ctx -> idx_halo_points_send[i] = (idx_t*)MY_MALLOC( n_heap_to_send[i] * sizeof(idx_t));
-        memcpy(ctx -> idx_halo_points_send[i], idx_buffer_to_send + sdispls[i], n_heap_to_send[i] * sizeof(idx_t) ); 
-    }
-
-    ctx -> halo_datapoints = foreign_dp;
-    ctx -> local_datapoints = dp;
-
-    free(count_to_request);
-    free(capacities);
-
-    /* free(heap_buffer_to_recv); this needs to be preserved*/
-    ctx -> __recv_heap_buffers = heap_buffer_to_recv;
-    free(heap_buffer_to_send);
-    free(idx_buffer_to_send);
-    free(idx_buffer_to_recv);
-
-    free(sdispls);
-    free(rdispls);
-}
 
-float_t mEst2(float_t * x, float_t *y, idx_t n)
-{
-
-    /*
-     * Estimate the m coefficient of a straight 
-     * line passing through the origin          
-     * params:                                  
-     * - x: x values of the points              
-     * - y: y values of the points              
-     * - n: size of the arrays                  
-     */
-     
-    float_t num = 0;
-    float_t den = 0;
-    float_t dd;
-    for(idx_t i = 0; i < n; ++i)
-    {
-        float_t xx = x[i];
-        float_t yy = y[i];
 
-        dd = xx;
-        num += dd*yy;
-        den += dd*dd;
 
-    }
-  
-    return num/den;
-}
 
-float_t compute_ID_two_NN_ML(global_context_t* ctx, datapoint_info_t* dp_info, idx_t n, int verbose)
-{
 
-    /*
-     * Estimation of the intrinsic dimension of a dataset                                       
-     * args:                                                                                    
-     * - dp_info: array of structs                                                             
-     * - n: number of dp_info                                                                  
-     * intrinsic_dim = (N - 1) / np.sum(log_mus)
-     */
-
-    struct timespec start_tot, finish_tot;
-    double elapsed_tot;
-
-	if(verbose) 
-    {
-		printf("ID estimation:\n");
-		clock_gettime(CLOCK_MONOTONIC, &start_tot);
-	}
-    
-    float_t log_mus = 0;
-    for(idx_t i = 0; i < n; ++i)
-    {
-        log_mus += 0.5 * log(dp_info[i].ngbh.data[2].value/dp_info[i].ngbh.data[1].value);
-    }
-
-    float_t d = 0;
-    MPI_Allreduce(&log_mus, &d, 1, MPI_MY_FLOAT, MPI_SUM, ctx -> mpi_communicator);
-    d = (ctx -> n_points - 1)/d;
-	if(verbose)
-	{
-		clock_gettime(CLOCK_MONOTONIC, &finish_tot);
-		elapsed_tot = (finish_tot.tv_sec - start_tot.tv_sec);
-		elapsed_tot += (finish_tot.tv_nsec - start_tot.tv_nsec) / 1000000000.0;
-		printf("\tID value: %.6lf\n", d);
-		printf("\tTotal time: %.3lfs\n\n", elapsed_tot);
-	}
-
-    return d;
-
-}
-
-
-float_t id_estimate(global_context_t* ctx, datapoint_info_t* dp_info, idx_t n, float_t fraction, int verbose)
-{
-
-    /*
-     * Estimation of the intrinsic dimension of a dataset                                       
-     * args:                                                                                    
-     * - dp_info: array of structs                                                             
-     * - n: number of dp_info                                                                  
-     * Estimates the id via 2NN method. Computation of the log ratio of the                      
-     * distances of the first 2 neighbors of each point. Then compute the empirical distribution 
-     * of these log ratii                                                                        
-     * The intrinsic dimension is found by fitting with a straight line passing through the      
-     * origin                                                                                    
-     */
-
-    struct timespec start_tot, finish_tot;
-    double elapsed_tot;
-
-	if(verbose)
-	{
-		printf("ID estimation:\n");
-		clock_gettime(CLOCK_MONOTONIC, &start_tot);
-	}
-
-    //float_t fraction = 0.7;
-    float_t* r = (float_t*)MY_MALLOC(n*sizeof(float_t));
-    float_t* Pemp = (float_t*)MY_MALLOC(n*sizeof(float_t));
-
-    for(idx_t i = 0; i < n; ++i)
-    {
-        r[i] = 0.5 * log(dp_info[i].ngbh.data[2].value/dp_info[i].ngbh.data[1].value);
-        Pemp[i] = -log(1 - (float_t)(i + 1)/(float_t)n);
-    }
-    qsort(r,n,sizeof(float_t),cmp_float_t);
-
-    idx_t Neff = (idx_t)(n*fraction);
-
-    float_t d = mEst2(r,Pemp,Neff); 
-    free(r);
-    free(Pemp);
-
-	if(verbose)
-	{
-		clock_gettime(CLOCK_MONOTONIC, &finish_tot);
-		elapsed_tot = (finish_tot.tv_sec - start_tot.tv_sec);
-		elapsed_tot += (finish_tot.tv_nsec - start_tot.tv_nsec) / 1000000000.0;
-		printf("\tID value: %.6lf\n", d);
-		printf("\tTotal time: %.3lfs\n\n", elapsed_tot);
-	}
-
-    float_t rd = 0;
-    MPI_Allreduce(&d, &rd, 1, MPI_MY_FLOAT, MPI_SUM, ctx -> mpi_communicator);
-    rd = rd / ctx -> world_size;
-
-    return rd;
-
-}
-
-int binary_search_on_idxs(idx_t* idxs, idx_t key, int n)
-{
-    #define LEFT  1
-    #define RIGHT 0
-
-    int l = 0;
-    int r = n - 1;
-    int center = (r - l)/2;
-    while(idxs[center] != key && l < r)
-    {
-        int lr = key < idxs[center];
-        /* if key < place */
-        switch (lr)
-        {
-            case LEFT:
-                {
-                    l = l;
-                    r = center - 1;
-                    center = l + (r - l) / 2;
-                }
-                break;
-
-            case RIGHT:
-                {
-                    l = center + 1;
-                    r = r;
-                    center = l + (r - l) / 2;
-                }
-                break;
-
-            default:
-                break;
-        }
-
-    }
-     
-    return center;
-
-    #undef LEFT
-    #undef RIGHT
-}
-
-datapoint_info_t find_possibly_halo_datapoint(global_context_t* ctx, idx_t idx)
-{
-    int owner = foreign_owner(ctx, idx);
-    /* find if datapoint is halo or not */
-    if(owner == ctx -> mpi_rank)
-    {
-        idx_t i = idx - ctx -> idx_start;
-        return ctx -> local_datapoints[i];
-    }
-    else
-    {
-        datapoint_info_t* halo_dp = ctx -> halo_datapoints[owner]; 
-        idx_t* halo_idxs = ctx -> idx_halo_points_recv[owner];
-        int n = ctx -> n_halo_points_recv[owner];
-        int i = binary_search_on_idxs(halo_idxs, idx, n);
-        
-        if( idx != halo_dp[i].ngbh.data[0].array_idx && idx != MY_SIZE_MAX)
-        // if( idx != halo_idxs[i])
-        {
-            printf("Osti %lu\n", idx);
-        }
-        return halo_dp[i];         
-    }                 
-}
-
-datapoint_info_t find_possibly_halo_datapoint_rma(global_context_t* ctx, idx_t idx, MPI_Win win_datapoints)
-{
-    int owner = foreign_owner(ctx, idx);
-    /* find if datapoint is halo or not */
-    if(owner == ctx -> mpi_rank)
-    {
-        idx_t i = idx - ctx -> idx_start;
-        return ctx -> local_datapoints[i];
-    }
-    else
-    {
-        datapoint_info_t tmp_dp;
-        #pragma omp critical
-        {
-            idx_t i = idx - ctx -> rank_idx_start[owner];
-            MPI_Request request;
-            MPI_Status status;
-
-            MPI_Rget(&tmp_dp, sizeof(datapoint_info_t), MPI_BYTE, owner,
-                    i * sizeof(datapoint_info_t), sizeof(datapoint_info_t), MPI_BYTE, win_datapoints, &request);
-            MPI_Wait(&request, MPI_STATUS_IGNORE);
-
-        }
-
-        return tmp_dp;         
-    }                 
-}
-
-datapoint_info_t find_possibly_halo_datapoint_rma_db(global_context_t* ctx, idx_t idx, MPI_Win win_datapoints)
-{
-    int owner = foreign_owner(ctx, idx);
-    /* find if datapoint is halo or not */
-    if(owner == ctx -> mpi_rank)
-    {
-        idx_t i = idx - ctx -> idx_start;
-        return ctx -> local_datapoints[i];
-    }
-    else
-    {
-        idx_t i = idx - ctx -> rank_idx_start[owner];
-        datapoint_info_t tmp_dp;
-        MPI_Request request;
-        MPI_Status status;
-
-        MPI_Rget(&tmp_dp, sizeof(datapoint_info_t), MPI_BYTE, owner,
-                i * sizeof(datapoint_info_t), sizeof(datapoint_info_t), MPI_BYTE, win_datapoints, &request);
-        MPI_Wait(&request, MPI_STATUS_IGNORE);
-
-        return tmp_dp;         
-    }                 
-}
-
-void compute_density_kstarnn(global_context_t* ctx, const float_t d, int verbose){
-
-    /*
-     * Point density computation:                       
-     * args:                                            
-     * - paricles: array of structs                   
-     * - d       : intrinsic dimension of the dataset 
-     * - points  : number of points in the dataset    
-     */
-
-    struct timespec start_tot, finish_tot;
-    double elapsed_tot;
-
-    datapoint_info_t* local_datapoints = ctx -> local_datapoints;
-
-	if(verbose)
-	{
-		printf("Density and k* estimation:\n");
-		clock_gettime(CLOCK_MONOTONIC, &start_tot);
-	}
-
-    idx_t kMAX = ctx -> local_datapoints[0].ngbh.N - 1;   
-
-    float_t omega = 0.;  
-    if(sizeof(float_t) == sizeof(float)){ omega = powf(PI_F,d/2)/tgammaf(d/2.0f + 1.0f);}  
-    else{omega = pow(M_PI,d/2.)/tgamma(d/2.0 + 1.0);}
-
-    #pragma omp parallel for
-    for(idx_t i = 0; i < ctx -> local_n_points; ++i)
-    {
-
-        idx_t j = 4;
-        idx_t k;
-        float_t dL  = 0.;
-        float_t vvi = 0.;
-		float_t vvj = 0.;
-		float_t vp  = 0.;
-        while(j < kMAX  && dL < DTHR)
-        {
-            idx_t ksel = j - 1;
-            vvi = omega * pow(local_datapoints[i].ngbh.data[ksel].value,d/2.);
-
-            idx_t jj = local_datapoints[i].ngbh.data[j].array_idx;
-
-            /* 
-             * note jj can be an halo point 
-             * need to search maybe for it in foreign nodes
-             * */
-
-            datapoint_info_t tmp_dp = find_possibly_halo_datapoint(ctx, jj);
-
-            vvj = omega * pow(tmp_dp.ngbh.data[ksel].value,d/2.);
-
-            /* TO REMOVE
-            if(local_datapoints[i].array_idx == 17734) printf("%lu ksel i %lu j %lu tmp_dp %lu di %lf fj %lf vvi %lf vvj %lf\n", ksel, i, jj, tmp_dp.array_idx, 
-                        sqrt(local_datapoints[i].ngbh.data[ksel].value), sqrt(tmp_dp.ngbh.data[ksel].value), vvi, vvj);
-            */
-
-            vp = (vvi + vvj)*(vvi + vvj);
-            dL = -2.0 * ksel * log(4.*vvi*vvj/vp);
-            j = j + 1;
-        }
-        if(j == kMAX)
-        {
-            k = j - 1;
-            vvi = omega * pow(ctx -> local_datapoints[i].ngbh.data[k].value,d/2.);
-        }
-        else
-        {
-            k = j - 2;
-        }
-        local_datapoints[i].kstar = k;
-        local_datapoints[i].log_rho = log((float_t)(k)/vvi/((float_t)(ctx -> n_points)));
-        //dp_info[i].log_rho = log((float_t)(k)) - log(vvi) -log((float_t)(points));
-        local_datapoints[i].log_rho_err =   1.0/sqrt((float_t)k); //(float_t)(-Q_rsqrt((float)k));
-        local_datapoints[i].g = local_datapoints[i].log_rho - local_datapoints[i].log_rho_err;
-    }
-
-	if(verbose)
-	{
-		clock_gettime(CLOCK_MONOTONIC, &finish_tot);
-		elapsed_tot = (finish_tot.tv_sec - start_tot.tv_sec);
-		elapsed_tot += (finish_tot.tv_nsec - start_tot.tv_nsec) / 1000000000.0;
-		printf("\tTotal time: %.3lfs\n\n", elapsed_tot);
-	}
-
-    #if defined(WRITE_DENSITY)
-        /* densities */
-        float_t* den = (float_t*)MY_MALLOC(ctx -> local_n_points * sizeof(float_t));
-        float_t* gs = (float_t*)MY_MALLOC(ctx -> local_n_points * sizeof(float_t));
-        idx_t* ks = (idx_t*)MY_MALLOC(ctx -> local_n_points * sizeof(idx_t));
-
-        for(int i = 0; i < ctx -> local_n_points; ++i) den[i] = ctx -> local_datapoints[i].log_rho;
-        for(int i = 0; i < ctx -> local_n_points; ++i) ks[i] = ctx -> local_datapoints[i].kstar;
-        for(int i = 0; i < ctx -> local_n_points; ++i) gs[i] = ctx -> local_datapoints[i].g;
-
-        ordered_buffer_to_file(ctx, den, sizeof(float_t), ctx -> local_n_points, "bb/ordered_density.npy");
-        ordered_buffer_to_file(ctx, ks, sizeof(idx_t), ctx -> local_n_points, "bb/ks.npy");
-        ordered_buffer_to_file(ctx, gs, sizeof(float_t), ctx -> local_n_points, "bb/g.npy");
-
-        ordered_data_to_file(ctx);
-
-        free(den);
-        free(ks);
-        free(gs);
-    #endif
-    return;
-
-
-}
-
-float_t get_j_ksel_dist(global_context_t* ctx, idx_t j, idx_t ksel, MPI_Win exposed_ngbh)
-{
-    int owner = foreign_owner(ctx, j);
-    idx_t k = ctx -> k;
-    /* find if datapoint is halo or not */
-    if(owner == ctx -> mpi_rank)
-    {
-        idx_t pos = j - ctx -> idx_start;
-        return ctx -> local_datapoints[pos].ngbh.data[ksel].value;
-    }
-    else
-    {
-        heap_node el;
-        idx_t pos  = j - ctx -> rank_idx_start[owner];
-        MPI_Request request;
-        int err = MPI_Rget(&el, sizeof(heap_node), MPI_BYTE, owner, (MPI_Aint)((pos * k + ksel) * sizeof(heap_node)), sizeof(heap_node), MPI_BYTE, exposed_ngbh, &request);
-        MPI_Wait(&request,MPI_STATUS_IGNORE);
-        return el.value;
-    }                 
-}
-
-idx_t get_j_ksel_idx(global_context_t* ctx, idx_t j, idx_t ksel, MPI_Win exposed_ngbh)
-{
-    int owner = foreign_owner(ctx, j);
-    idx_t k = ctx -> k;
-    /* find if datapoint is halo or not */
-    if(owner == ctx -> mpi_rank)
-    {
-        idx_t pos = j - ctx -> idx_start;
-        return ctx -> local_datapoints[pos].ngbh.data[ksel].array_idx;
-    }
-    else
-    {
-        heap_node el;
-        idx_t pos  = j - ctx -> rank_idx_start[owner];
-        MPI_Request request;
-        int err = MPI_Rget(&el, sizeof(heap_node), MPI_BYTE, owner, (MPI_Aint)((pos * k + ksel) * sizeof(heap_node)), sizeof(heap_node), MPI_BYTE, exposed_ngbh, &request);
-        MPI_Wait(&request,MPI_STATUS_IGNORE);
-        return el.array_idx;
-    }                 
-}
-
-void compute_density_kstarnn_rma(global_context_t* ctx, const float_t d, int verbose){
-
-    /*
-     * Point density computation:                       
-     * args:                                            
-     * - paricles: array of structs                   
-     * - d       : intrinsic dimension of the dataset 
-     * - points  : number of points in the dataset    
-     */
-
-
-    MPI_Info info;
-
-
-    MPI_Barrier(ctx -> mpi_communicator);
-    idx_t k = ctx -> local_datapoints[0].ngbh.N;
-
-    struct timespec start_tot, finish_tot;
-    double elapsed_tot;
-
-    datapoint_info_t* local_datapoints = ctx -> local_datapoints;
-
-	if(verbose)
-	{
-		printf("Density and k* estimation:\n");
-		clock_gettime(CLOCK_MONOTONIC, &start_tot);
-	}
-
-    idx_t kMAX = ctx -> local_datapoints[0].ngbh.N - 1;   
-
-    float_t omega = 0.;  
-    if(sizeof(float_t) == sizeof(float)){ omega = powf(PI_F,d/2)/tgammaf(d/2.0f + 1.0f);}  
-    else{omega = pow(M_PI,d/2.)/tgamma(d/2.0 + 1.0);}
-
-
-    MPI_Win exposed_ngbh;
-    MPI_Win_create( ctx -> __local_heap_buffers, 
-                    ctx -> local_n_points * k * sizeof(heap_node), 
-                    1, MPI_INFO_NULL, 
-                    ctx -> mpi_communicator, 
-                    &exposed_ngbh);
-
-    MPI_Win_fence(0, exposed_ngbh);
-    MPI_Win_lock_all(0, exposed_ngbh);
-
-    MPI_Barrier(ctx -> mpi_communicator);
-
-    #pragma omp parallel for
-    for(idx_t i = 0; i < ctx -> local_n_points; ++i)
-    {
-        for(idx_t k = 0; k <= kMAX; ++k)
-        {
-            local_datapoints[i].ngbh.data[k].value = omega * pow(local_datapoints[i].ngbh.data[k].value, d/2.);  
-        }
-    }
-
-    for(idx_t i = 0; i < ctx -> local_n_points; ++i)
-    {
-
-        idx_t j = 4;
-        idx_t k;
-        float_t dL  = 0.;
-        float_t vvi = 0.;
-		float_t vvj = 0.;
-		float_t vp  = 0.;
-        while(j < kMAX  && dL < DTHR)
-        {
-            idx_t ksel = j - 1;
-            //vvi = omega * pow(local_datapoints[i].ngbh.data[ksel].value,d/2.);
-            vvi = local_datapoints[i].ngbh.data[ksel].value;
-
-            idx_t jj = local_datapoints[i].ngbh.data[j].array_idx;
-
-            /* 
-             * note jj can be an halo point 
-             * need to search maybe for it in foreign nodes
-             * */
-            //float_t dist_jj = get_j_ksel_dist(ctx, jj, ksel, exposed_ngbh);
-            //vvj = omega * pow(dist_jj,d/2.);
-            vvj = get_j_ksel_dist(ctx, jj, ksel, exposed_ngbh);
-
-            vp = (vvi + vvj)*(vvi + vvj);
-            dL = -2.0 * ksel * log(4.*vvi*vvj/vp);
-            j = j + 1;
-        }
-        if(j == kMAX)
-        {
-            k = j - 1;
-            //vvi = omega * pow(ctx -> local_datapoints[i].ngbh.data[k].value,d/2.);
-            vvi = ctx -> local_datapoints[i].ngbh.data[k].value;
-        }
-        else
-        {
-            k = j - 2;
-        }
-        local_datapoints[i].kstar = k;
-        local_datapoints[i].log_rho = log((float_t)(k)/vvi/((float_t)(ctx -> n_points)));
-        //dp_info[i].log_rho = log((float_t)(k)) - log(vvi) -log((float_t)(points));
-        local_datapoints[i].log_rho_err =   1.0/sqrt((float_t)k); //(float_t)(-Q_rsqrt((float)k));
-        local_datapoints[i].g = local_datapoints[i].log_rho - local_datapoints[i].log_rho_err;
-    }
-
-	if(verbose)
-	{
-		clock_gettime(CLOCK_MONOTONIC, &finish_tot);
-		elapsed_tot = (finish_tot.tv_sec - start_tot.tv_sec);
-		elapsed_tot += (finish_tot.tv_nsec - start_tot.tv_nsec) / 1000000000.0;
-		printf("\tTotal time: %.3lfs\n\n", elapsed_tot);
-	}
-
-    MPI_Win_unlock_all(exposed_ngbh);
-    MPI_Win_fence(0, exposed_ngbh);
-    MPI_Win_free(&exposed_ngbh);
-
-    #if defined(WRITE_DENSITY)
-        /* densities */
-        float_t* den = (float_t*)MY_MALLOC(ctx -> local_n_points * sizeof(float_t));
-        float_t* gs = (float_t*)MY_MALLOC(ctx -> local_n_points * sizeof(float_t));
-        idx_t* ks = (idx_t*)MY_MALLOC(ctx -> local_n_points * sizeof(idx_t));
-
-        for(int i = 0; i < ctx -> local_n_points; ++i) den[i] = ctx -> local_datapoints[i].log_rho;
-        for(int i = 0; i < ctx -> local_n_points; ++i) ks[i] = ctx -> local_datapoints[i].kstar;
-        for(int i = 0; i < ctx -> local_n_points; ++i) gs[i] = ctx -> local_datapoints[i].g;
-
-        ordered_buffer_to_file(ctx, den, sizeof(float_t), ctx -> local_n_points, "bb/ordered_density.npy");
-        ordered_buffer_to_file(ctx, ks, sizeof(idx_t), ctx -> local_n_points, "bb/ks.npy");
-        ordered_buffer_to_file(ctx, gs, sizeof(float_t), ctx -> local_n_points, "bb/g.npy");
-
-        ordered_data_to_file(ctx);
-        free(den);
-        free(ks);
-        free(gs);
-    #endif
-    return;
-
-
-}
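-
-/*
- * The _v2 variant below computes the same estimate but restructures the
- * communication: instead of one passive-target MPI_Get per remote lookup,
- * each round j batches the remote requests of all still-active points and
- * separates request and use with an MPI_Win_fence
- */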
-
-void compute_density_kstarnn_rma_v2(global_context_t* ctx, const float_t d, int verbose){
-
-    /*
-     * Point density computation                        
-     * args:                                            
-     * - ctx     : global context (datapoints, neighborhoods, MPI communicator)
-     * - d       : intrinsic dimension of the dataset 
-     * - verbose : enable timing output
-     */
-
-
-    MPI_Barrier(ctx -> mpi_communicator);
-    idx_t k = ctx -> local_datapoints[0].ngbh.N;
-
-    struct timespec start_tot, finish_tot;
-    double elapsed_tot;
-
-    datapoint_info_t* local_datapoints = ctx -> local_datapoints;
-
-	if(verbose)
-	{
-		printf("Density and k* estimation:\n");
-		clock_gettime(CLOCK_MONOTONIC, &start_tot);
-	}
-
-    idx_t kMAX = ctx -> local_datapoints[0].ngbh.N - 1;   
-
-    float_t omega = 0.;  
-    if(sizeof(float_t) == sizeof(float)){ omega = powf(PI_F,d/2)/tgammaf(d/2.0f + 1.0f);}  
-    else{omega = pow(M_PI,d/2.)/tgamma(d/2.0 + 1.0);}
-
-
-    MPI_Win exposed_ngbh;
-    MPI_Win_create( ctx -> __local_heap_buffers, 
-                    ctx -> local_n_points * k * sizeof(heap_node), 
-                    1, MPI_INFO_NULL, 
-                    ctx -> mpi_communicator, 
-                    &exposed_ngbh);
-
-    MPI_Win_fence(MPI_MODE_NOPUT, exposed_ngbh);
-
-    MPI_Barrier(ctx -> mpi_communicator);
-
-    #pragma omp parallel for
-    for(idx_t i = 0; i < ctx -> local_n_points; ++i)
-    {
-        for(idx_t k = 0; k <= kMAX; ++k)
-        {
-            local_datapoints[i].ngbh.data[k].value = omega * pow(local_datapoints[i].ngbh.data[k].value, d/2.);  
-        }
-        //initialize kstar at 0
-        local_datapoints[i].kstar = 0;
-    }
-
-    int i_have_finished = 0;
-    int all_have_finished = 0;
-    heap_node* scratch_heap_nodes = (heap_node*)MY_MALLOC(ctx -> local_n_points * sizeof(heap_node));  
-
-        for(idx_t j = 4; j < kMAX - 1; ++j)
-        {
-            i_have_finished = 1;
-            //request data
-            idx_t ksel = j - 1;
-
-#if defined(THREAD_FUNNELED)
-#else
-            #pragma omp parallel for
-#endif
-
-            for(idx_t i = 0; i < ctx -> local_n_points; ++i)
-            {
-
-                if(ctx -> local_datapoints[i].kstar == 0)
-                {
-                    //vvi = omega * pow(local_datapoints[i].ngbh.data[ksel].value,d/2.);
-
-                    idx_t jj = local_datapoints[i].ngbh.data[j].array_idx;
-
-                    /* 
-                     * note: jj can be a halo point, so it may have to be
-                     * fetched from the rank that owns it
-                     */
-
-                    int owner = foreign_owner(ctx, jj);
-                    idx_t pos = jj - ctx -> rank_idx_start[owner];
-
-                    if(owner == ctx -> mpi_rank)
-                    {
-                        scratch_heap_nodes[i] = ctx -> local_datapoints[pos].ngbh.data[ksel];
-                    }
-                    else
-                    {
-                        MPI_Get(scratch_heap_nodes + i, 
-                                sizeof(heap_node), 
-                                MPI_BYTE, 
-                                owner, 
-                                (MPI_Aint)((pos * (ctx -> k) + ksel) * sizeof(heap_node)), 
-                                sizeof(heap_node), 
-                                MPI_BYTE, 
-                                exposed_ngbh);
-                    }
-                }
-
-            }
-
-            MPI_Win_fence(MPI_MODE_NOPUT,exposed_ngbh); 
-            //process data
-#if defined(THREAD_FUNNELED)
-#else
-            #pragma omp parallel for
-#endif
-            for(idx_t i = 0; i < ctx -> local_n_points; ++i)
-            {
-                if(ctx -> local_datapoints[i].kstar == 0)
-                {
-                    float_t vvi, vvj, vp, dL;
-                    vvi = local_datapoints[i].ngbh.data[ksel].value;
-                    vvj = scratch_heap_nodes[i].value;
-                    vp = (vvi + vvj)*(vvi + vvj);
-                    dL = -2.0 * ksel * log(4.*vvi*vvj/vp);
-
-                    if(dL > DTHR)
-                    {
-                        idx_t k = j - 1;
-                        local_datapoints[i].kstar = k;
-                        local_datapoints[i].log_rho = log((float_t)(k)/vvi/((float_t)(ctx -> n_points)));
-                        local_datapoints[i].log_rho_err =   1.0/sqrt((float_t)k); //(float_t)(-Q_rsqrt((float)k));
-                        local_datapoints[i].g = local_datapoints[i].log_rho - local_datapoints[i].log_rho_err;
-                    }
-                    else
-                    {
-                        #pragma omp atomic write
-                        i_have_finished = 0;
-                    }
-                }
-
-            }
-
-
-            MPI_Allreduce(&i_have_finished, &all_have_finished, 1, MPI_INT, MPI_LAND, ctx -> mpi_communicator);
-
-            if(all_have_finished) break;
-        }
-
-
-#if defined(THREAD_FUNNELED)
-#else
-        #pragma omp parallel for
-#endif
-        for(idx_t i = 0; i < ctx -> local_n_points; ++i)
-        {
-            if(ctx -> local_datapoints[i].kstar == 0)
-            {
-                idx_t k = kMAX - 1;
-
-                float_t vvi = ctx -> local_datapoints[i].ngbh.data[k].value;
-
-                local_datapoints[i].kstar = k;
-                local_datapoints[i].log_rho = log((float_t)(k)/vvi/((float_t)(ctx -> n_points)));
-                local_datapoints[i].log_rho_err =   1.0/sqrt((float_t)k); //(float_t)(-Q_rsqrt((float)k));
-                local_datapoints[i].g = local_datapoints[i].log_rho - local_datapoints[i].log_rho_err;
-            }
-        }
-
-
-    MPI_Win_fence(0, exposed_ngbh);
-    MPI_Win_free(&exposed_ngbh);
-
-    free(scratch_heap_nodes);
-
-    #if defined(WRITE_DENSITY)
-        /* densities */
-        float_t* den = (float_t*)MY_MALLOC(ctx -> local_n_points * sizeof(float_t));
-        float_t* gs = (float_t*)MY_MALLOC(ctx -> local_n_points * sizeof(float_t));
-        idx_t* ks = (idx_t*)MY_MALLOC(ctx -> local_n_points * sizeof(idx_t));
-
-        for(int i = 0; i < ctx -> local_n_points; ++i) den[i] = ctx -> local_datapoints[i].log_rho;
-        for(int i = 0; i < ctx -> local_n_points; ++i) ks[i] = ctx -> local_datapoints[i].kstar;
-        for(int i = 0; i < ctx -> local_n_points; ++i) gs[i] = ctx -> local_datapoints[i].g;
-
-        ordered_buffer_to_file(ctx, den, sizeof(float_t), ctx -> local_n_points, "bb/ordered_density.npy");
-        ordered_buffer_to_file(ctx, ks, sizeof(idx_t), ctx -> local_n_points, "bb/ks.npy");
-        ordered_buffer_to_file(ctx, gs, sizeof(float_t), ctx -> local_n_points, "bb/g.npy");
-
-        ordered_data_to_file(ctx);
-        free(den);
-        free(ks);
-        free(gs);
-    #endif
-    return;
-
-
-}
-
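-/*
- * get_j_ksel_dist_v2 below is a split-phase helper: on the first call for a
- * remote point it starts an MPI_Get into tmp_heap_nodes[i], clears flags[i]
- * and returns 0; once the caller has completed the epoch (e.g. with a fence)
- * a second call returns the fetched value and re-arms the flag. Local points
- * are answered immediately. A sketch of the intended pattern, assuming
- * flags[] starts at 1:
- *
- *     v = get_j_ksel_dist_v2(ctx, i, j, ksel, flags, tmp, &win); // maybe 0
- *     MPI_Win_fence(MPI_MODE_NOPUT, win);
- *     if(!flags[i]) v = get_j_ksel_dist_v2(ctx, i, j, ksel, flags, tmp, &win);
- */
-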
-float_t get_j_ksel_dist_v2(global_context_t* ctx, idx_t i, idx_t j, idx_t ksel, int* flags, heap_node* tmp_heap_nodes, MPI_Win* exposed_ngbh)
-{
-    if(flags[i])
-    {
-        int owner = foreign_owner(ctx, j);
-        idx_t k = ctx -> k;
-        /* find if datapoint is halo or not */
-        if(owner == ctx -> mpi_rank)
-        {
-            idx_t pos = j - ctx -> idx_start;
-            return ctx -> local_datapoints[pos].ngbh.data[ksel].value;
-        }
-        else
-        {
-            //RMA
-            flags[i] = 0;
-            idx_t pos  = j - ctx -> rank_idx_start[owner];
-            MPI_Get(tmp_heap_nodes + i, sizeof(heap_node), MPI_BYTE, owner, (MPI_Aint)((pos * k + ksel) * sizeof(heap_node)), sizeof(heap_node), MPI_BYTE, *exposed_ngbh);
-            //if(ctx -> mpi_rank == 0) DB_PRINT("rvcd %lu %lf\n", el.array_idx, el.value);
-            return 0;
-        }                 
-    }
-    else
-    {
-        flags[i] = 1;
-        return tmp_heap_nodes[i].value;
-    }
-}
-
-void clusters_allocate(clusters_t * c, int s)
-{
-	/*
-	 * Helper function for handling allocation of resources 
-	 */ 
-    if(c -> centers.count == 0)
-    {
-        printf("Provide a valid cluster centers list\n");
-        return;
-    }
-
-    idx_t nclus = c -> centers.count;
-
-    
-    if(s)
-    {
-	    //printf("Using sparse implementation\n");
-	    c -> use_sparse_borders = 1;
-	    c -> sparse_borders = (adj_list_t*)MY_MALLOC(nclus*sizeof(adj_list_t));
-	    for(idx_t i = 0; i < nclus; ++i)
-	    {
-		    c -> sparse_borders[i].count = 0;
-		    c -> sparse_borders[i].size  = PREALLOC_BORDERS;
-		    c -> sparse_borders[i].data  = (sparse_border_t*)MY_MALLOC(PREALLOC_BORDERS*sizeof(sparse_border_t));
-	    }
-
-    }
-    else
-    {
-	    //printf("Using dense implementation\n");
-	    c -> use_sparse_borders = 0;
-	    c -> __borders_data         = (border_t*)MY_MALLOC(nclus*nclus*sizeof(border_t)); 
-	    c -> borders                = (border_t**)MY_MALLOC(nclus*sizeof(border_t*));
-
-	    #pragma omp parallel for
-
-	    for(idx_t i = 0; i < nclus; ++i)
-	    {
-			c -> borders[i]         = c -> __borders_data + i*nclus;
-			for(idx_t j = 0; j < nclus; ++j)
-			{
-				c -> borders[i][j] = border_null;
-			}
-	    }
-    }
-}
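-
-/*
- * Memory trade-off between the two layouts above: the dense one allocates
- * nclus * nclus border_t entries, which becomes prohibitive for many
- * clusters, while the sparse one keeps one adjacency list per cluster,
- * growing in PREALLOC_BORDERS increments
- */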
-
-
-void adj_list_insert(adj_list_t* l, sparse_border_t b)
-{
-	/*
-	 * Handling of the sparse border implementation as an adjacency list
-	 */
-	if(l -> count < l -> size)
-	{
-		l -> data[l -> count] = b;
-		l -> count++;
-	}
-	else
-	{
-		l -> size += PREALLOC_BORDERS; 
-		sparse_border_t* tmp = realloc(l -> data, sizeof(sparse_border_t) * (l -> size));
-		if(!tmp)
-		{
-			/* allocation failed: abort rather than corrupt the list */
-			perror("realloc");
-			exit(EXIT_FAILURE);
-		}
-		l -> data = tmp;
-		l -> data[l -> count] = b;
-		l -> count++;
-	}
-}
-
-void adj_list_reset(adj_list_t* l)
-{
-	/*
-	 * Handling of the sparse border implementation as an adjacency list
-	 */
-	if(l -> data) free(l -> data);
-	l -> count = 0;
-	l -> size  = 0;
-	l -> data  = NULL;
-}
-
-void clusters_reset(clusters_t * c)
-{
-	/* 
-	 * Handling reset of clusters object 
-	 */
-	if(c -> use_sparse_borders)
-	{
-		for(idx_t i = 0; i < c -> centers.count; ++i)
-		{
-			adj_list_reset((c -> sparse_borders) + i);
-		
-		}
-		free(c -> sparse_borders);
-		c -> sparse_borders = NULL;
-	}
-	else
-	{
-		if(c -> __borders_data)  free(c -> __borders_data);
-		if(c -> borders) free(c -> borders);
-	}
-	if(c -> centers.data) free(c -> centers.data);
-}
-
-void clusters_free(clusters_t * c)
-{
-	/*
-	 * Free cluster object
-	 */
-    clusters_reset(c);
-}
-
-
-void sparse_border_insert(clusters_t *c, sparse_border_t b)
-{
-	/*
-	 * Insert a border element in the sparse implementation
-	 */
-
-	idx_t i = b.i;
-	adj_list_t l = c -> sparse_borders[i];
-	int check = 1;
-	for(idx_t k = 0; k < l.count; ++k)
-	{
-		sparse_border_t p = l.data[k];
-		if(p.i == b.i && p.j == b.j)
-		{
-			if( b.density > p.density)
-			{
-				l.data[k] = b;
-			}
-			check = 0;
-		}
-	}
-	if(check) adj_list_insert(c -> sparse_borders + i, b);
-	return;
-}
-
-sparse_border_t sparse_border_get(clusters_t* c, idx_t i, idx_t j)
-{
-	/*
-	 * Get a border element in the sparse implementation
-	 * - i,j: cluster to search for borders
-	 * return border_null if not found
-	 */
-
-	sparse_border_t b = sparse_border_null;
-	adj_list_t l = c -> sparse_borders[i];
-	for(idx_t el = 0; el < l.count; ++el)
-	{
-		sparse_border_t candidate = l.data[el];
-		if(candidate.i == i && candidate.j == j)
-		{
-			b = candidate;
-		}
-	}
-	return b;
-}
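-
-/*
- * The lookup above is linear in the adjacency list of cluster i; a missing
- * border is signalled by the sparse_border_null sentinel, e.g.
- *
- *     sparse_border_t b = sparse_border_get(c, i, j);
- *     if(b.idx != NOBORDER) { ... }
- */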
-
-
-int cmpPP(const void* p1, const void *p2)
-{
-    /*
-     * Comparator used by qsort during cluster assignment:
-     * sorts datapoints by g in descending order, breaking
-     * ties on array_idx to keep the ordering deterministic
-     */
-    datapoint_info_t* pp1 = *(datapoint_info_t**)p1;
-    datapoint_info_t* pp2 = *(datapoint_info_t**)p2;
-    //return 2*(pp1 -> g < pp2 -> g) - 1;
-    int v = (pp1 -> g < pp2 -> g) - (pp1 -> g > pp2 -> g);
-    int a = (pp1 -> array_idx < pp2 -> array_idx) - (pp1 -> array_idx > pp2 -> array_idx);
-    return v == 0 ? a : v;
-    //return v;
-}
-
-inline int its_mine(global_context_t* ctx, idx_t idx)
-{
-    return idx >= ctx -> idx_start && idx < ctx -> idx_start + ctx -> local_n_points;
-}
-
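-/*
- * Comparator for qsort on center_t: orders centers by decreasing density,
- * so that after the master's sort the cluster label 0 belongs to the
- * highest density peak
- */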
-int compare_center(const void* a, const void* b)
-{
-    center_t* aa = (center_t*)a;
-    center_t* bb = (center_t*)b;
-    return (aa -> density < bb -> density) - (aa -> density > bb -> density);
-}
-
-
-void compute_correction(global_context_t* ctx, float_t Z)
-{
-    /*
-     * Utility function: find the minimum value of the density over the
-     * datapoints and shift all densities up, so that later stages work
-     * with values greater than 0
-     */
-    float_t min_log_rho = 999999.9;
-
-    datapoint_info_t* dp_info = ctx -> local_datapoints;
-    idx_t n = ctx -> local_n_points;
-    
-
-    #pragma omp parallel
-    {
-        float_t thread_min_log_rho = 9999999.;
-        #pragma omp for
-        for(idx_t i = 0; i < n; ++i)
-        {
-            float_t tmp = dp_info[i].log_rho - Z*dp_info[i].log_rho_err;
-            if(tmp < thread_min_log_rho){
-                thread_min_log_rho = tmp;
-            }
-        }
-        #pragma omp critical
-        if(thread_min_log_rho < min_log_rho) min_log_rho = thread_min_log_rho;
-    }
-
-    MPI_Allreduce(MPI_IN_PLACE, &min_log_rho, 1, MPI_MY_FLOAT, MPI_MIN, ctx -> mpi_communicator);
-
-    #pragma omp parallel
-    {
-        #pragma omp for
-        for(idx_t i = 0; i < n; ++i)
-        {
-            dp_info[i].log_rho_c = dp_info[i].log_rho - min_log_rho + 1;
-            dp_info[i].g = dp_info[i].log_rho_c - dp_info[i].log_rho_err;
-        }
-
-    }
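-
-    /*
-     * Worked example with Z = 1: densities {2.0, 5.0} and errors
-     * {0.5, 0.5} give min_log_rho = 1.5, so log_rho_c becomes
-     * {1.5, 4.5} and every corrected density stays strictly positive
-     */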
-
-    //printf("%lf\n",min_log_rho);
-}
-
-clusters_t Heuristic1_rma(global_context_t *ctx, int verbose)
-{
-    /*
-     * Heuristic 1, from the paper of d'Errico, Facco, Laio & Rodriguez 
-     * ( https://doi.org/10.1016/j.ins.2021.01.010 )              
-     *                                                            
-     * args:                                                      
-     * - ctx     : global context
-     * - verbose : enable timing output
-     */
-
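-    /*
-     * Outline:
-     *  1. tag as putative centers the points that maximize g within their
-     *     own k*-neighborhood;
-     *  2. demote putative centers that fall inside the neighborhood of a
-     *     point with higher g (tracked through to_remove_mask);
-     *  3. assign every remaining point to the cluster of its nearest
-     *     neighbor with higher g, sweeping in order of decreasing g
-     */
-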
-    datapoint_info_t* dp_info = ctx -> local_datapoints;
-    idx_t n = ctx -> local_n_points; 
-
-    struct timespec start_tot, finish_tot;
-    double elapsed_tot;
-
-	if(verbose)
-	{
-		printf("H1: Preliminary cluster assignment\n");
-		clock_gettime(CLOCK_MONOTONIC, &start_tot);
-	}
-    
-    TIME_DEF;
-
-
-    //idx_t ncenters = 0;
-    //idx_t putativeCenters = n;
-    lu_dynamic_array_t all_centers, removed_centers, actual_centers, max_rho;
-    lu_dynamic_array_allocate(&all_centers);
-    lu_dynamic_array_allocate(&removed_centers);
-    lu_dynamic_array_allocate(&actual_centers);
-    lu_dynamic_array_allocate(&max_rho);
-
-    datapoint_info_t** dp_info_ptrs = (datapoint_info_t**)MY_MALLOC(n*sizeof(datapoint_info_t*));
-
-    struct timespec start, finish;
-    double elapsed;
-
-
-    if(verbose)
-	{
-		clock_gettime(CLOCK_MONOTONIC, &start);
-	}
-
-    /* proceed */
-
-    MPI_Win win_datapoints;
-    MPI_Win_create(ctx -> local_datapoints, ctx -> local_n_points * sizeof(datapoint_info_t), 
-                   1, MPI_INFO_NULL, ctx -> mpi_communicator, &win_datapoints);
-    MPI_Win_fence(0, win_datapoints);
-    MPI_Win_lock_all(0,  win_datapoints);
-
-#if defined(THREAD_FUNNELED)
-#else
-    #pragma omp parallel for
-#endif
-
-    for(idx_t i = 0; i < n; ++i)
-    {   
-        /*
-         * Find the centers of the clusters as the points of highest density in their neighborhoods:
-         * a point is tagged as a putative center if it is the point of highest density of its neighborhood 
-         */
-
-        dp_info_ptrs[i] = dp_info + i;
-        idx_t maxk = dp_info[i].kstar + 1;
-        float_t gi = dp_info[i].g;
-        dp_info[i].is_center = 1;
-        dp_info[i].cluster_idx = -1;
-        //printf("%lf\n",p -> g);
-        heap i_ngbh = dp_info[i].ngbh;
-        for(idx_t k = 1; k < maxk; ++k)
-        {
-            idx_t ngbh_index = i_ngbh.data[k].array_idx;
-            datapoint_info_t dj = find_possibly_halo_datapoint_rma(ctx, ngbh_index, win_datapoints);
-            float_t gj = dj.g;
-            if(gj > gi){
-                dp_info[i].is_center = 0;
-                break;
-            }
-        }
-        if(dp_info[i].is_center)
-        {
-            //lu_dynamic_array_pushBack(&all_centers, dp_info[i].array_idx);
-            #pragma omp critical
-            {
-                lu_dynamic_array_pushBack(&all_centers, i);
-            }
-        }
-    }
-
-
-    if(verbose)
-	{
-		clock_gettime(CLOCK_MONOTONIC, &finish);
-		elapsed = (finish.tv_sec - start.tv_sec);
-		elapsed += (finish.tv_nsec - start.tv_nsec) / 1000000000.0;
-		printf("\tFinding putative centers: %.3lfs\n",elapsed);
-		clock_gettime(CLOCK_MONOTONIC, &start);
-	}
-
-
-	/* 
-	 * optimized version
-	 *
-	 * Generate a mask that keeps track of which point eliminated the point
-	 * under consideration. Each thread updates this mask; once the procedure
-	 * ends, the actual centers, removed centers, and max_rho arrays are
-	 * populated from it
-	 */
-		
-	heap_node* to_remove_mask = (heap_node*)MY_MALLOC(n*sizeof(heap_node));
-    for(idx_t p = 0; p < n; ++p) 
-    {
-        to_remove_mask[p].array_idx = MY_SIZE_MAX;
-        to_remove_mask[p].value = 9999999;
-    }
-    qsort(dp_info_ptrs, n, sizeof(datapoint_info_t*), cmpPP);
-
-
-    MPI_Win win_to_remove_mask;
-    MPI_Win_create(to_remove_mask, n * sizeof(heap_node), 1, MPI_INFO_NULL, ctx -> mpi_communicator, &win_to_remove_mask);
-    MPI_Win_fence(0, win_to_remove_mask);
-
-
-
-    /* prune putative centers */
-
-    /* 
-     * RMA version of the pruning step:
-     * for dp in datapoints:
-     *  for ngbh in neighbors:
-     *   if ngbh is mine:
-     *      no problem, handle it locally as usual
-     *   else:
-     *      ngbh is an external node:
-     *      acquire an exclusive lock on the owner's window, read and
-     *      possibly update the mask element, then release the lock
-     */
-
-#if defined(THREAD_FUNNELED)
-#else
-    #pragma omp parallel for
-#endif
-    for(idx_t p = 0; p < n; ++p)
-    {
-        datapoint_info_t i_point = *(dp_info_ptrs[p]);
-
-        for(idx_t j = 1; j < i_point.kstar + 1; ++j)
-        {
-            idx_t jidx = i_point.ngbh.data[j].array_idx; 
-
-            datapoint_info_t j_point = find_possibly_halo_datapoint_rma(ctx, jidx,  win_datapoints);
-
-            if(j_point.is_center && i_point.g > j_point.g)
-            {
-                #pragma omp critical
-                {
-                    int owner = foreign_owner(ctx, jidx);
-                    idx_t jpos = jidx - ctx -> rank_idx_start[owner];
-
-                    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, owner, 0, win_to_remove_mask);
-                    heap_node mask_element;
-                    MPI_Request request;
-                    MPI_Rget(   &mask_element, sizeof(heap_node), MPI_BYTE, 
-                            owner, jpos * sizeof(heap_node) , sizeof(heap_node), MPI_BYTE, win_to_remove_mask, &request);
-                    MPI_Wait(&request, MPI_STATUS_IGNORE);
-
-                    int flag = mask_element.array_idx == MY_SIZE_MAX;							
-                    if(flag || i_point.g > mask_element.value )
-                    {
-                        heap_node tmp_mask_element = {.array_idx = i_point.array_idx, .value = i_point.g};
-                        MPI_Request request;
-                        MPI_Rput(&tmp_mask_element, sizeof(heap_node), MPI_BYTE, owner, 
-                                jpos*sizeof(heap_node), sizeof(heap_node), MPI_BYTE, win_to_remove_mask, &request);
-                        MPI_Wait(&request, MPI_STATUS_IGNORE);
-
-                    }
-
-                    MPI_Win_unlock(owner, win_to_remove_mask);
-                }
-            }
-        }
-    }
-    
-    MPI_Win_fence(0, win_to_remove_mask);
-    MPI_Barrier(ctx -> mpi_communicator);
-
-	/* populate the usual arrays */
-    for(idx_t p = 0; p < all_centers.count; ++p)
-    {
-        idx_t i = all_centers.data[p];
-        int e = 0;
-        //float_t gi = dp_info[i].g;
-        idx_t mr = to_remove_mask[i].array_idx;
-        if(mr != MY_SIZE_MAX)
-        {
-            //if(dp_info[mr].g > gi) e = 1;
-			e = 1;
-        }
-        switch (e)
-        {
-            case 1:
-                {
-                    lu_dynamic_array_pushBack(&removed_centers,i);
-                    dp_info[i].is_center = 0;
-                    for(idx_t c = 0; c < removed_centers.count - 1; ++c)
-                    {
-                        if(mr == removed_centers.data[c])
-                        {
-                            mr = max_rho.data[c];
-                        }
-                    }
-                    lu_dynamic_array_pushBack(&max_rho,mr);
-                    
-                }
-                break;
-
-            case 0:
-                {
-                    lu_dynamic_array_pushBack(&actual_centers,i);
-                    dp_info[i].cluster_idx = actual_centers.count - 1;
-                }
-                break;
-
-            default:
-                break;
-        }
-    }
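-
-    /*
-     * note: when the eliminating point mr was itself a removed center, the
-     * switch above follows the chain through removed_centers so that
-     * max_rho records the surviving maximum responsible for the removal
-     */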
-
-    MPI_Win_free(&win_to_remove_mask);
-	free(to_remove_mask);
-
-    if(verbose)
-	{
-		clock_gettime(CLOCK_MONOTONIC, &finish);
-		elapsed = (finish.tv_sec - start.tv_sec);
-		elapsed += (finish.tv_nsec - start.tv_nsec) / 1000000000.0;
-		printf("\tFinding actual centers:   %.3lfs\n",elapsed);
-		clock_gettime(CLOCK_MONOTONIC, &start);
-	}
-
-    int n_centers = (int)actual_centers.count;
-    int tot_centers;
-    MPI_Allreduce(&n_centers, &tot_centers, 1, MPI_INT, MPI_SUM, ctx -> mpi_communicator);
-    MPI_DB_PRINT("Found %d centers globally\n", tot_centers);
-
-
-    /* 
-     * make cluster labels globally unique: gather the per-rank center
-     * counts, then offset each local cluster index by the number of
-     * centers found on lower ranks
-     */
-
-    int* center_counts = (int*)MY_MALLOC(ctx -> world_size * sizeof(int));
-    int* center_displs = (int*)MY_MALLOC(ctx -> world_size * sizeof(int));
-
-    int cc = (int)actual_centers.count;
-    MPI_Allgather(&cc, 1, MPI_INT, center_counts, 1, MPI_INT, ctx -> mpi_communicator);
-
-    center_displs[0] = 0;
-    for(int i = 1; i < ctx -> world_size; ++i) center_displs[i] = center_displs[i - 1] + center_counts[i - 1];
-
-    /* alternative to check if something breaks */
-    for(int i = 0; i < actual_centers.count; ++i)
-    {
-        idx_t idx = actual_centers.data[i];
-        dp_info[idx].cluster_idx += center_displs[ctx -> mpi_rank];
-
-    }
-
-    idx_t* all_center_idx = (idx_t*)MY_MALLOC(tot_centers * sizeof(idx_t));
-    
-    MPI_Allgatherv(actual_centers.data, actual_centers.count, MPI_UINT64_T, all_center_idx, center_counts, center_displs, MPI_UINT64_T, ctx -> mpi_communicator);
-
-    free(center_counts);
-    free(center_displs);
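-
-    /*
-     * at this point every local cluster label has been offset by the
-     * prefix sum of the center counts on lower ranks, so labels are
-     * globally unique, and all_center_idx holds the centers gathered in
-     * rank order
-     */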
-
-    /*
-     * Sort all the dp_info based on g, then perform the cluster
-     * assignment in order of decreasing g
-     */
-
-    int completed = 0;
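-
-    /*
-     * iterative label propagation: a point whose nearest higher-g neighbor
-     * is not assigned yet keeps cluster_idx == -1 and is retried on the
-     * next sweep; the per-rank completion flags are summed with an
-     * Allreduce and compared against world_size to detect termination
-     */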
-
-    while(!completed)
-    {
-        completed = 1;
-
-
-        int proc_points = 0;
-        /* retrieve assignment 
-         * if some point remain uncompleted then get assignment from external points */
-
-        for(idx_t i = 0; i < n; ++i)
-        {   
-            int wait_for_comms = 0;
-            datapoint_info_t* p = dp_info_ptrs[i];
-            if(!(p -> is_center) && p -> cluster_idx < 0)
-            {
-                int cluster = -1;
-                idx_t k = 0;
-                idx_t p_idx;
-                idx_t max_k = p -> ngbh.N;
-                /*assign each particle at the same cluster as the nearest particle of higher density*/
-                datapoint_info_t p_retrieved;
-                while( k < max_k - 1 && cluster == -1)
-                {
-
-
-                    ++k;
-                    p_idx = p -> ngbh.data[k].array_idx;
-                    p_retrieved = find_possibly_halo_datapoint_rma(ctx, p_idx, win_datapoints);
-
-
-                    if(p_retrieved.g > p -> g)
-                    {
-                        cluster = p_retrieved.cluster_idx; 
-                        wait_for_comms = 1;
-                        break;
-                    }
-                    //if(p -> array_idx == 1587636) printf("%lu k %d p_idx %lu %lf %lf\n", k, cluster, p_idx, p_retrieved.g, p -> g );
-                }
-
-
-                if(cluster == -1 && !wait_for_comms)
-                {
-                    float_t gmax = -99999.;               
-                    idx_t gm_index = SIZE_MAX;
-                    for(idx_t k = 0; k < max_k; ++k)
-                    {
-                        idx_t ngbh_index = p -> ngbh.data[k].array_idx;
-                        for(idx_t m = 0; m < removed_centers.count; ++m)
-                        {
-                            idx_t max_rho_idx = max_rho.data[m];
-                            datapoint_info_t dp_max_rho = find_possibly_halo_datapoint_rma(ctx, max_rho_idx, win_datapoints);
-                            //datapoint_info_t dp_max_rho = dp_info[max_rho_idx];
-                            float_t gcand = dp_max_rho.g;
-                            if(ngbh_index == removed_centers.data[m] && gcand > gmax)
-                            {   
-                                //printf("%lu -- %lu\n", ele, m);
-                                gmax = gcand;
-                                gm_index = max_rho.data[m];
-                            }
-                        }
-                    }
-                    if(gm_index != SIZE_MAX)
-                    {
-                        datapoint_info_t dp_gm = find_possibly_halo_datapoint_rma(ctx, gm_index, win_datapoints);
-                        //datapoint_info_t dp_gm = dp_info[gm_index];
-                        cluster = dp_gm.cluster_idx;
-                    }
-
-                }
-                p -> cluster_idx = cluster;
-
-            }
-            completed = completed && p -> cluster_idx != -1; 
-            proc_points += p -> cluster_idx != -1 ?  1 : 0;
-
-        }
-
-        //DB_PRINT("rank %d proc %d\n", ctx -> mpi_rank, proc_points);
-        MPI_Allreduce(MPI_IN_PLACE, &completed, 1, MPI_INT, MPI_SUM, ctx -> mpi_communicator);
-        completed = completed == ctx -> world_size ? 1 : 0;
-
-    }
-
-    MPI_Win_unlock_all(win_datapoints);
-    MPI_Win_fence(0, win_datapoints);
-    MPI_Win_free(&win_datapoints);
-
-    MPI_Barrier(ctx -> mpi_communicator);
-
-    if(verbose)
-	{
-		clock_gettime(CLOCK_MONOTONIC, &finish);
-		elapsed = (finish.tv_sec - start.tv_sec);
-		elapsed += (finish.tv_nsec - start.tv_nsec) / 1000000000.0;
-		printf("\tTentative clustering:     %.3lfs\n",elapsed);
-		clock_gettime(CLOCK_MONOTONIC, &start);
-	}
-
-    free(dp_info_ptrs);
-    free(max_rho.data);
-    free(removed_centers.data);
-    free(all_centers.data);
-
-    #if defined(WRITE_CLUSTER_ASSIGN_H1)
-        /* densities */
-        int* ks = (int*)MY_MALLOC(ctx -> local_n_points * sizeof(int));
-        for(int i = 0; i < ctx -> local_n_points; ++i) ks[i] = ctx -> local_datapoints[i].cluster_idx;
-
-        ordered_buffer_to_file(ctx, ks, sizeof(int), ctx -> local_n_points, "bb/cl.npy");
-        ordered_data_to_file(ctx);
-        free(ks);
-    #endif
-
-
-    free(actual_centers.data);
-    actual_centers.size  = tot_centers;
-    actual_centers.count = tot_centers;
-    actual_centers.data  = all_center_idx;
-
-
-    clusters_t c_all;
-    c_all.centers = actual_centers;
-
-
-
-    if(verbose)
-	{
-		clock_gettime(CLOCK_MONOTONIC, &finish);
-		elapsed = (finish.tv_sec - start.tv_sec);
-		elapsed += (finish.tv_nsec - start.tv_nsec) / 1000000000.0;
-		printf("\tFinalizing clustering:    %.3lfs\n",elapsed);
-		printf("\n");
-	}
-
-    clock_gettime(CLOCK_MONOTONIC, &finish_tot);
-    elapsed_tot = (finish_tot.tv_sec - start_tot.tv_sec);
-    elapsed_tot += (finish_tot.tv_nsec - start_tot.tv_nsec) / 1000000000.0;
-
-
-	if(verbose)
-	{
-		printf("\tFound %lu clusters\n",(uint64_t)actual_centers.count);
-		printf("\tTotal time: %.3lfs\n\n", elapsed_tot);
-	}
-
-    c_all.n = n;
-    return c_all;
-}
-
-clusters_t Heuristic1(global_context_t *ctx, int verbose)
-{
-    /*
-     * Heuristic 1, from the paper of d'Errico, Facco, Laio & Rodriguez 
-     * ( https://doi.org/10.1016/j.ins.2021.01.010 )              
-     *                                                            
-     * args:                                                      
-     * - ctx     : global context
-     * - verbose : enable timing output
-     */
-    datapoint_info_t* dp_info = ctx -> local_datapoints;
-    idx_t n = ctx -> local_n_points; 
-
-    struct timespec start_tot, finish_tot;
-    double elapsed_tot;
-
-	if(verbose)
-	{
-		printf("H1: Preliminary cluster assignment\n");
-		clock_gettime(CLOCK_MONOTONIC, &start_tot);
-	}
-    
-    TIME_DEF;
-
-
-    //idx_t ncenters = 0;
-    //idx_t putativeCenters = n;
-    lu_dynamic_array_t all_centers, removed_centers, actual_centers, max_rho;
-    lu_dynamic_array_allocate(&all_centers);
-    lu_dynamic_array_allocate(&removed_centers);
-    lu_dynamic_array_allocate(&actual_centers);
-    lu_dynamic_array_allocate(&max_rho);
-
-    datapoint_info_t** dp_info_ptrs = (datapoint_info_t**)MY_MALLOC(n*sizeof(datapoint_info_t*));
-
-    struct timespec start, finish;
-    double elapsed;
-
-
-    if(verbose)
-	{
-		clock_gettime(CLOCK_MONOTONIC, &start);
-	}
-
-
-    /* exchange halo g and kstar values */
-
-    /* compute displs */
-    int* sdispls = (int*)calloc(ctx -> world_size , sizeof(int));
-    int* rdispls = (int*)calloc(ctx -> world_size , sizeof(int));
-
-    int tot_count_send = ctx -> n_halo_points_send[0];
-    int tot_count_recv = ctx -> n_halo_points_recv[0];
-    for(int i = 1; i < ctx -> world_size; ++i)
-    {
-        sdispls[i] = sdispls[i - 1] + ctx -> n_halo_points_send[i - 1];
-        rdispls[i] = rdispls[i - 1] + ctx -> n_halo_points_recv[i - 1];
-
-        tot_count_send += ctx -> n_halo_points_send[i];
-        tot_count_recv += ctx -> n_halo_points_recv[i];
-    }
-
-    float_t* gs_send = (float_t*)MY_MALLOC(tot_count_send * sizeof(float_t));
-    float_t* gs_recv = (float_t*)MY_MALLOC(tot_count_recv * sizeof(float_t));
-    idx_t* ks_send = (idx_t*)MY_MALLOC(tot_count_send * sizeof(idx_t));
-    idx_t* ks_recv = (idx_t*)MY_MALLOC(tot_count_recv * sizeof(idx_t));
-
-    /* pack it in gsend */
-    for(int i = 0; i < ctx -> world_size; ++i)
-    {
-        for(int j = 0; j < ctx -> n_halo_points_send[i]; ++j)
-        {
-            idx_t idx = ctx -> idx_halo_points_send[i][j] - ctx -> idx_start;
-            gs_send[j + sdispls[i]] = ctx -> local_datapoints[idx].g;
-            ks_send[j + sdispls[i]] = ctx -> local_datapoints[idx].kstar;
-        }
-    }
-
-
-
-    MPI_Barrier(ctx -> mpi_communicator);
-    /* exchange */ 
-    MPI_Alltoallv(gs_send, ctx -> n_halo_points_send, sdispls, MPI_MY_FLOAT, gs_recv, ctx -> n_halo_points_recv, rdispls, MPI_MY_FLOAT, ctx -> mpi_communicator);
-    MPI_Alltoallv(ks_send, ctx -> n_halo_points_send, sdispls, MPI_UINT64_T, ks_recv, ctx -> n_halo_points_recv, rdispls, MPI_UINT64_T, ctx -> mpi_communicator);
-
-    /* unpack */
-
-    for(int i = 0; i < ctx -> world_size; ++i)
-    {
-        for(int j = 0; j < ctx -> n_halo_points_recv[i]; ++j)
-        {
-            ctx -> halo_datapoints[i][j].g = gs_recv[rdispls[i] + j];
-            ctx -> halo_datapoints[i][j].kstar = ks_recv[rdispls[i] + j];
-            ctx -> halo_datapoints[i][j].array_idx = ctx -> idx_halo_points_recv[i][j];
-        }
-    }
-
-    free(gs_send);
-    free(gs_recv);
-    free(ks_send);
-    free(ks_recv);
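-
-    /*
-     * the halo mirror is now consistent: g and k* of every remote neighbor
-     * this rank may touch were packed on the owner side, exchanged with one
-     * MPI_Alltoallv per field, and unpacked into halo_datapoints
-     */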
-
-    /* proceed */
-
-
-    for(idx_t i = 0; i < n; ++i)
-    {   
-        /*
-         * Find the centers of the clusters as the points of highest density in their neighborhoods:
-         * a point is tagged as a putative center if it is the point of highest density of its neighborhood 
-         */
-
-        dp_info_ptrs[i] = dp_info + i;
-        idx_t maxk = dp_info[i].kstar + 1;
-        float_t gi = dp_info[i].g;
-        dp_info[i].is_center = 1;
-        dp_info[i].cluster_idx = -1;
-        //printf("%lf\n",p -> g);
-        heap i_ngbh = dp_info[i].ngbh;
-        for(idx_t k = 1; k < maxk; ++k)
-        {
-            idx_t ngbh_index = i_ngbh.data[k].array_idx;
-            datapoint_info_t dj = find_possibly_halo_datapoint(ctx, ngbh_index);
-            float_t gj = dj.g;
-            if(gj > gi){
-                dp_info[i].is_center = 0;
-                break;
-            }
-        }
-        if(dp_info[i].is_center)
-        {
-            //lu_dynamic_array_pushBack(&all_centers, dp_info[i].array_idx);
-            lu_dynamic_array_pushBack(&all_centers, i);
-        }
-
-
-    }
-
-
-    if(verbose)
-	{
-		clock_gettime(CLOCK_MONOTONIC, &finish);
-		elapsed = (finish.tv_sec - start.tv_sec);
-		elapsed += (finish.tv_nsec - start.tv_nsec) / 1000000000.0;
-		printf("\tFinding putative centers: %.3lfs\n",elapsed);
-		clock_gettime(CLOCK_MONOTONIC, &start);
-	}
-
-
-	/* 
-	 * optimized version
-	 *
-	 * Generate a mask that keeps track of which point eliminated the point
-	 * under consideration. Each thread updates this mask; once the procedure
-	 * ends, the actual centers, removed centers, and max_rho arrays are
-	 * populated from it
-	 */
-		
-	idx_t* to_remove_mask = (idx_t*)MY_MALLOC(n*sizeof(idx_t));
-    for(idx_t p = 0; p < n; ++p) {to_remove_mask[p] = MY_SIZE_MAX;}
-    qsort(dp_info_ptrs, n, sizeof(datapoint_info_t*), cmpPP);
-
-    /* check internal nodes */
-
-    /* 
-     * TODO: reimplement this step using RMA:
-     * for dp in datapoints:
-     *  for ngbh in neighbors:
-     *   if ngbh is mine:
-     *      no problem, handle it locally as usual
-     *   else:
-     *      ngbh is an external node:
-     *      acquire a lock on the owner's window, read and write the
-     *      mask element, then release the lock
-     */
-    #pragma omp parallel shared(to_remove_mask)
-    {
-		#pragma omp for
-		for(idx_t p = 0; p < n; ++p)
-		{
-			datapoint_info_t pp = *(dp_info_ptrs[p]);
-			int flag = 0;
-			idx_t ppp = 0;
-
-			for(idx_t j = 1; j < pp.kstar + 1; ++j)
-			{
-				idx_t jidx = pp.ngbh.data[j].array_idx; 
-                idx_t jpos = jidx - ctx -> idx_start;
-
-                if(its_mine(ctx, jidx))
-                {
-                    if(dp_info[jpos].is_center && pp.g > dp_info[jpos].g)
-                    {
-
-                        #pragma omp critical 
-                        {
-                            ppp = to_remove_mask[jpos];
-                            datapoint_info_t ppp_datapoint; 
-                            flag = ppp != MY_SIZE_MAX;							
-                            if(flag) 
-                            {
-                                ppp_datapoint = find_possibly_halo_datapoint(ctx, ppp);
-                                to_remove_mask[jpos] = pp.g > ppp_datapoint.g ? pp.array_idx : ppp_datapoint.array_idx; 
-                            }
-                            else
-                            {
-                                to_remove_mask[jpos] = pp.array_idx; 
-                            }
-                        }
-                    }
-                }
-            }
-		}
-	}
-
-    /* check external nodes */
-    for(int i = 0; i < ctx -> world_size; ++i)
-    {
-        for(int j = 0; j < ctx -> n_halo_points_recv[i]; ++j)
-        {
-            datapoint_info_t ext_dp = ctx -> halo_datapoints[i][j];
-
-			int flag = 0;
-			idx_t ppp = 0;
-
-            for(idx_t k = 0; k < ext_dp.kstar + 1; ++k)
-            {
-				idx_t kidx = ext_dp.ngbh.data[k].array_idx; 
-                idx_t kpos = kidx - ctx -> idx_start;
-
-                if(its_mine(ctx, kidx))
-                {
-                    if(dp_info[kpos].is_center && ext_dp.g > dp_info[kpos].g)
-                    {
-                        #pragma omp critical 
-                        {
-                            ppp = to_remove_mask[kpos];
-                            datapoint_info_t ppp_datapoint; 
-                            flag = ppp != MY_SIZE_MAX;							
-                            if(flag) 
-                            {
-                                ppp_datapoint = find_possibly_halo_datapoint(ctx, ppp);
-                                to_remove_mask[kpos] = ext_dp.g > ppp_datapoint.g ? ext_dp.array_idx : ppp_datapoint.array_idx; 
-                            }
-                            else
-                            {
-                                to_remove_mask[kpos] = ext_dp.array_idx; 
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    }
-
-	/* populate the usual arrays */
-    for(idx_t p = 0; p < all_centers.count; ++p)
-    {
-        idx_t i = all_centers.data[p];
-        int e = 0;
-        //float_t gi = dp_info[i].g;
-        idx_t mr = to_remove_mask[i];
-        if(mr != MY_SIZE_MAX)
-        {
-            //if(dp_info[mr].g > gi) e = 1;
-			e = 1;
-        }
-        switch (e)
-        {
-            case 1:
-                {
-                    lu_dynamic_array_pushBack(&removed_centers,i);
-                    dp_info[i].is_center = 0;
-                    for(idx_t c = 0; c < removed_centers.count - 1; ++c)
-                    {
-                        if(mr == removed_centers.data[c])
-                        {
-                            mr = max_rho.data[c];
-                        }
-                    }
-                    lu_dynamic_array_pushBack(&max_rho,mr);
-                    
-                }
-                break;
-
-            case 0:
-                {
-                    lu_dynamic_array_pushBack(&actual_centers,i);
-                    dp_info[i].cluster_idx = actual_centers.count - 1;
-                }
-                break;
-
-            default:
-                break;
-        }
-    }
-	free(to_remove_mask);
-
-    if(verbose)
-	{
-		clock_gettime(CLOCK_MONOTONIC, &finish);
-		elapsed = (finish.tv_sec - start.tv_sec);
-		elapsed += (finish.tv_nsec - start.tv_nsec) / 1000000000.0;
-		printf("\tFinding actual centers:   %.3lfs\n",elapsed);
-		clock_gettime(CLOCK_MONOTONIC, &start);
-	}
-
-    int n_centers = (int)actual_centers.count;
-    int tot_centers;
-    MPI_Allreduce(&n_centers, &tot_centers, 1, MPI_INT, MPI_SUM, ctx -> mpi_communicator);
-
-    /* bring on master all centers 
-     * order them in ascending order of density, 
-     * then re-scatter them around to get unique cluster labels */ 
-
-
-    center_t* private_centers_buffer = (center_t*)MY_MALLOC(actual_centers.count * sizeof(center_t));
-
-    center_t* global_centers_buffer  = (center_t*)MY_MALLOC(tot_centers * sizeof(center_t));
-
-    for(int i = 0; i < actual_centers.count; ++i)
-    {
-        idx_t idx = actual_centers.data[i] ;
-        private_centers_buffer[i].density = dp_info[idx].g;
-        private_centers_buffer[i].idx     = dp_info[idx].array_idx;
-    }
-    MPI_Datatype mpi_center_type;
-    
-
-    MPI_Type_contiguous(sizeof(center_t), MPI_BYTE, &mpi_center_type);
-    MPI_Type_commit(&mpi_center_type);
-
-    int* center_counts = (int*)MY_MALLOC(ctx -> world_size * sizeof(int));
-    int* center_displs = (int*)MY_MALLOC(ctx -> world_size * sizeof(int));
-
-    int cc = (int)actual_centers.count;
-    MPI_Allgather(&cc, 1, MPI_INT, center_counts, 1, MPI_INT, ctx -> mpi_communicator);
-
-    center_displs[0] = 0;
-    for(int i = 1; i < ctx -> world_size; ++i) center_displs[i] = center_displs[i - 1] + center_counts[i - 1];
-
-    /* alternative to check if something breaks */
-    for(int i = 0; i < actual_centers.count; ++i)
-    {
-        idx_t idx = actual_centers.data[i];
-        dp_info[idx].cluster_idx += center_displs[ctx -> mpi_rank];
-
-    }
-
-    
-    MPI_Gatherv(private_centers_buffer, (int)actual_centers.count, mpi_center_type, global_centers_buffer, center_counts, center_displs , mpi_center_type, 0, ctx -> mpi_communicator);
-
-
-    if(I_AM_MASTER)
-    {
-        qsort(global_centers_buffer, (size_t)tot_centers, sizeof(center_t), compare_center);
-        //qsort(void *base, size_t nmemb, size_t size, __compar_fn_t compar)
-        for(int i = 0; i < tot_centers; ++i) global_centers_buffer[i].cluster_idx = i;
-    }
-
-    MPI_Barrier(ctx -> mpi_communicator);
-    MPI_Bcast(global_centers_buffer, tot_centers, mpi_center_type, 0, ctx -> mpi_communicator);
-
-
-    MPI_Type_free(&mpi_center_type);
-
-    for(int i = 0; i < tot_centers; ++i)
-    {
-       if(its_mine(ctx, global_centers_buffer[i].idx))
-       {
-           idx_t idx = global_centers_buffer[i].idx - ctx -> idx_start;
-           dp_info[idx].cluster_idx = global_centers_buffer[i].cluster_idx;
-       }
-    }
-
-    free(private_centers_buffer);
-    free(global_centers_buffer);
-
-    free(center_counts);
-    free(center_displs);
-
-
-    /*
-     * Sort all the dp_info based on g, then perform the cluster
-     * assignment in order of decreasing g
-     */
-
-    int completed = 0;
-
-    int* cl_send = (int*)MY_MALLOC(tot_count_send * sizeof(int));
-    int* cl_recv = (int*)MY_MALLOC(tot_count_recv * sizeof(int));
-    
-
-
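-    /*
-     * iterative label propagation, halo-exchange version: each sweep first
-     * re-exchanges the halo cluster labels, then every unassigned point
-     * tries to adopt the label of its nearest higher-g neighbor; sweeps
-     * repeat until every rank reports completion
-     */
-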
-    while(!completed)
-    {
-        completed = 1;
-        for(int i = 0; i < ctx -> world_size; ++i)
-        {
-            for(int j = 0; j < ctx -> n_halo_points_send[i]; ++j)
-            {
-                idx_t idx = ctx -> idx_halo_points_send[i][j] - ctx -> idx_start;
-                cl_send[j + sdispls[i]] = ctx -> local_datapoints[idx].cluster_idx;
-            }
-        }
-
-
-
-        MPI_Barrier(ctx -> mpi_communicator);
-        /* exchange */ 
-        MPI_Alltoallv(cl_send, ctx -> n_halo_points_send, sdispls, MPI_INT, cl_recv, ctx -> n_halo_points_recv, rdispls, MPI_INT, ctx -> mpi_communicator);
-
-        /* unpack */
-
-        for(int i = 0; i < ctx -> world_size; ++i)
-        {
-            for(int j = 0; j < ctx -> n_halo_points_recv[i]; ++j)
-            {
-                ctx -> halo_datapoints[i][j].cluster_idx = cl_recv[rdispls[i] + j];
-            }
-        }
-        int proc_points = 0;
-        /* retrieve assignment 
-         * if some point remain uncompleted then get assignment from external points */
-
-        for(idx_t i = 0; i < n; ++i)
-        {   
-            int wait_for_comms = 0;
-            datapoint_info_t* p = dp_info_ptrs[i];
-            if(!(p -> is_center) && p -> cluster_idx < 0)
-            {
-                int cluster = -1;
-                idx_t k = 0;
-                idx_t p_idx;
-                idx_t max_k = p -> ngbh.N;
-                /*assign each particle at the same cluster as the nearest particle of higher density*/
-                while( k < max_k - 1 && cluster == -1)
-                {
-
-
-                    ++k;
-                    p_idx = p -> ngbh.data[k].array_idx;
-                    datapoint_info_t p_retrieved = find_possibly_halo_datapoint(ctx, p_idx);
-
-
-                    if(p_retrieved.g > p -> g)
-                    {
-                        cluster = p_retrieved.cluster_idx; 
-                        wait_for_comms = 1;
-                        break;
-                    }
-                }
-
-
-                if(cluster == -1 && !wait_for_comms)
-                {
-                    float_t gmax = -99999.;               
-                    idx_t gm_index = SIZE_MAX;
-                    for(idx_t k = 0; k < max_k; ++k)
-                    {
-                        idx_t ngbh_index = p -> ngbh.data[k].array_idx;
-                        for(idx_t m = 0; m < removed_centers.count; ++m)
-                        {
-                            idx_t max_rho_idx = max_rho.data[m];
-                            datapoint_info_t dp_max_rho = find_possibly_halo_datapoint(ctx, max_rho_idx);
-                            //datapoint_info_t dp_max_rho = dp_info[max_rho_idx];
-                            float_t gcand = dp_max_rho.g;
-                            if(ngbh_index == removed_centers.data[m] && gcand > gmax)
-                            {   
-                                //printf("%lu -- %lu\n", ele, m);
-                                gmax = gcand;
-                                gm_index = max_rho.data[m];
-                            }
-                        }
-                    }
-                    if(gm_index != SIZE_MAX)
-                    {
-                        datapoint_info_t dp_gm = find_possibly_halo_datapoint(ctx, gm_index);
-                        //datapoint_info_t dp_gm = dp_info[gm_index];
-                        cluster = dp_gm.cluster_idx;
-                    }
-
-                }
-                p -> cluster_idx = cluster;
-
-            }
-            completed = completed && p -> cluster_idx != -1; 
-            proc_points += p -> cluster_idx != -1 ?  1 : 0;
-
-        }
-
-
-
-        //DB_PRINT("rank %d proc %d\n", ctx -> mpi_rank, proc_points);
-        MPI_Allreduce(MPI_IN_PLACE, &completed, 1, MPI_INT, MPI_SUM, ctx -> mpi_communicator);
-        completed = completed == ctx -> world_size ? 1 : 0;
-        /* copy cluster idx into buffer */
-
-
-    }
-
-    free(cl_send);
-    free(cl_recv);
-
-    if(verbose)
-	{
-		clock_gettime(CLOCK_MONOTONIC, &finish);
-		elapsed = (finish.tv_sec - start.tv_sec);
-		elapsed += (finish.tv_nsec - start.tv_nsec) / 1000000000.0;
-		printf("\tTentative clustering:     %.3lfs\n",elapsed);
-		clock_gettime(CLOCK_MONOTONIC, &start);
-	}
-
-    free(sdispls);
-    free(rdispls);
-    free(dp_info_ptrs);
-    free(max_rho.data);
-    free(removed_centers.data);
-    free(all_centers.data);
-
-    #if defined(WRITE_CLUSTER_ASSIGN_H1)
-        /* densities */
-        int* ks = (int*)MY_MALLOC(ctx -> local_n_points * sizeof(int));
-        for(int i = 0; i < ctx -> local_n_points; ++i) ks[i] = ctx -> local_datapoints[i].cluster_idx;
-
-        ordered_buffer_to_file(ctx, ks, sizeof(int), ctx -> local_n_points, "bb/cl.npy");
-        ordered_data_to_file(ctx);
-        free(ks);
-    #endif
-
-
-    clusters_t c_all;
-    c_all.centers = actual_centers;
-
-
-    if(verbose)
-	{
-		clock_gettime(CLOCK_MONOTONIC, &finish);
-		elapsed = (finish.tv_sec - start.tv_sec);
-		elapsed += (finish.tv_nsec - start.tv_nsec) / 1000000000.0;
-		printf("\tFinalizing clustering:    %.3lfs\n",elapsed);
-		printf("\n");
-	}
-
-    clock_gettime(CLOCK_MONOTONIC, &finish_tot);
-    elapsed_tot = (finish_tot.tv_sec - start_tot.tv_sec);
-    elapsed_tot += (finish_tot.tv_nsec - start_tot.tv_nsec) / 1000000000.0;
-
-
-	if(verbose)
-	{
-		printf("\tFound %lu clusters\n",(uint64_t)actual_centers.count);
-		printf("\tTotal time: %.3lfs\n\n", elapsed_tot);
-	}
-
-    c_all.n = n;
-    return c_all;
-}
-
-void Heuristic2_rma(global_context_t* ctx, clusters_t* cluster)
-{
-    /*
-     * RMA version of Heuristic 2 (border finding):
-     * each rank computes the borders of its local points, then the
-     * per-rank border lists are reduced into a single global set
-     */
-
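-    /*
-     * border rule: a non-center point i of cluster c is a border candidate
-     * towards cluster c' if its nearest neighbor pp belonging to c' lists i
-     * as the first point of cluster c in pp's own neighborhood; the border
-     * density of the pair (c, c') is the maximum g among such candidates,
-     * which sparse_border_insert keeps for both (c, c') and (c', c)
-     */
-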
-    int verbose = 0;
-
-    datapoint_info_t* dp_info = ctx -> local_datapoints;
-
-    MPI_Win dp_info_win, ngbh_win;
-    MPI_Win_create(ctx -> local_datapoints, ctx -> local_n_points * sizeof(datapoint_info_t), 1, MPI_INFO_NULL, ctx -> mpi_communicator, &dp_info_win);
-    MPI_Win_create(ctx -> __local_heap_buffers, ctx -> local_n_points * ctx -> k * sizeof(heap_node), 1, MPI_INFO_NULL, ctx -> mpi_communicator, &ngbh_win);
-
-    MPI_Win_fence(0, dp_info_win);
-    MPI_Win_fence(0, ngbh_win);
-
-    MPI_Win_lock_all(0, dp_info_win);
-    MPI_Win_lock_all(0, ngbh_win);
-    MPI_Barrier(ctx -> mpi_communicator);
-
-
-
-    #define borders cluster->borders
-
-    struct timespec start_tot, finish_tot;
-    double elapsed_tot;
-    idx_t n = ctx -> local_n_points;
-
-	if(verbose)
-	{
-		printf("H2: Finding border points\n");
-		clock_gettime(CLOCK_MONOTONIC, &start_tot);
-	}
-
-    idx_t nclus = cluster->centers.count; 
-    idx_t max_k = dp_info[0].ngbh.N;
-
-    for(idx_t i = 0; i < n; ++i)
-    {
-        idx_t pp = NOBORDER;
-        /*loop over n neighbors*/
-        int c = dp_info[i].cluster_idx;
-        if(!dp_info[i].is_center)
-        {
-            for(idx_t k = 1; k < dp_info[i].kstar + 1; ++k)
-            {
-                /*index of the kth ngbh of n*/
-                idx_t j = dp_info[i].ngbh.data[k].array_idx;
-                pp = NOBORDER;
-                /* loop over the k neighbours to find whether i is the nearest */
-                /* if the cluster of the point in the neighborhood is not c, then check its neighborhood */
-
-                //DB_PRINT("my rank %d l %lu %lu %d\n", ctx -> mpi_rank, k, j, foreign_owner(ctx, j));
-                datapoint_info_t j_dp = find_possibly_halo_datapoint_rma(ctx, j, dp_info_win);
-                if(j_dp.cluster_idx != c)
-                {
-                    pp = j;
-                    break;
-                }
-
-            }
-        }
-
-		if(pp != NOBORDER)
-		{
-            datapoint_info_t pp_dp = find_possibly_halo_datapoint_rma(ctx, pp, dp_info_win);
-			for(idx_t k = 1; k < max_k; ++k)
-			{
-                //TODO: possible optimization: retrieve the whole neighborhood in a single RMA get
-				idx_t pp_ngbh_idx = get_j_ksel_idx(ctx, pp_dp.array_idx, k, ngbh_win);
-				if(pp_ngbh_idx == dp_info[i].array_idx)
-				{
-					break;
-				}
-
-                /* datapoints are exposed through dp_info_win, not ngbh_win */
-                datapoint_info_t pp_ngbh_dp = find_possibly_halo_datapoint_rma(ctx, pp_ngbh_idx, dp_info_win);
-				if(pp_ngbh_dp.cluster_idx == c)
-				{
-					pp = NOBORDER;
-					break;
-				}
-			}
-		}
-
-		/*if it is the maximum one add it to the cluster*/
-		if(pp != NOBORDER)
-		{
-            datapoint_info_t j_dp = find_possibly_halo_datapoint_rma(ctx, pp, dp_info_win);
-			int ppc = j_dp.cluster_idx;
-            /* insert the border and its symmetric counterpart */
-            sparse_border_t b = {.i = c, .j = ppc, .idx = ctx -> local_datapoints[i].array_idx, .density = ctx -> local_datapoints[i].g, .error = ctx -> local_datapoints[i].log_rho_err}; 
-            sparse_border_insert(cluster, b);
-            sparse_border_t bsym = {.i = ppc, .j = c, .idx = ctx -> local_datapoints[i].array_idx, .density = ctx -> local_datapoints[i].g, .error = ctx -> local_datapoints[i].log_rho_err}; 
-            sparse_border_insert(cluster, bsym);
-		}
-
-
-    }
-
-
-    for(idx_t c = 0; c < nclus; ++c)
-    {
-        for(idx_t el = 0; el < cluster -> sparse_borders[c].count; ++el)
-        {
-            //fix border density, write log rho c
-            idx_t idx = cluster -> sparse_borders[c].data[el].idx; 
-            datapoint_info_t idx_dp = find_possibly_halo_datapoint_rma(ctx, idx, dp_info_win);
-            cluster -> sparse_borders[c].data[el].density = idx_dp.log_rho_c;
-        }
-    }
-
-
-	if(verbose)
-	{
-		clock_gettime(CLOCK_MONOTONIC, &finish_tot);
-		elapsed_tot = (finish_tot.tv_sec - start_tot.tv_sec);
-		elapsed_tot += (finish_tot.tv_nsec - start_tot.tv_nsec) / 1000000000.0;
-		printf("\tTotal time: %.3lfs\n\n", elapsed_tot);
-	}
-
-    MPI_Barrier(ctx -> mpi_communicator);
-    MPI_Win_unlock_all(ngbh_win);
-    MPI_Win_unlock_all(dp_info_win);
-    MPI_Win_free(&ngbh_win);
-    MPI_Win_free(&dp_info_win);
-
-
-    /* Border reduce */
-
-    idx_t num_border_el = 0;
-    for(idx_t i = 0; i < nclus; ++i)
-    {
-        num_border_el += cluster -> sparse_borders[i].count;
-    }
-
-    MPI_DB_PRINT("Master found %lu border elements\n", num_border_el);
-
-    int i_have_sent = 0;
-    int level = 1;
-    int ranks = ctx -> world_size;
-
-    #define SEND 1 
-    #define RECV 0
-    #define DO_NOTHING -1
-
-    MPI_Barrier(ctx -> mpi_communicator);
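-
-    /*
-     * binary reduction of the border lists: each round the upper half of
-     * the ranks sends its borders to (rank - ranks); with an odd count the
-     * middle rank sits the round out. E.g. with 5 ranks: 3->0 and 4->1,
-     * then 2->0, then 1->0, leaving the full set on rank 0
-     */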
-
-    while(ranks > 1)
-    {
-        int dp = ranks % 2;
-        ranks = ranks / 2 + dp;
-        int send_rcv = ctx -> mpi_rank >= ranks;
-
-        if(dp && ctx -> mpi_rank == ranks - 1) send_rcv = DO_NOTHING;
-
-        switch (send_rcv) 
-        {
-            case SEND:
-                if(!i_have_sent)
-                {
-                    idx_t num_border_el = 0;
-                    for(idx_t i = 0; i < nclus; ++i)
-                    {
-                        num_border_el += cluster -> sparse_borders[i].count;
-                    }
-                    sparse_border_t* borders_to_send = (sparse_border_t*)MY_MALLOC(num_border_el * sizeof(sparse_border_t)); 
-
-                    idx_t count = 0;
-                    for(idx_t i = 0; i < nclus; ++i)
-                    {
-                        for(idx_t j = 0; j < cluster -> sparse_borders[i].count; ++j)
-                        {
-                            borders_to_send[count] = cluster -> sparse_borders[i].data[j];
-                            count++;
-                        }
-                    }
-
-                    int rank_to_send = ctx -> mpi_rank - ranks;
-                    DB_PRINT("-- Rank %d sending to %d\n", ctx -> mpi_rank, rank_to_send);
-
-                    MPI_Send(&num_border_el, 1, MPI_UINT64_T, rank_to_send, rank_to_send, ctx -> mpi_communicator);
-
-                    MPI_Send(borders_to_send, num_border_el * sizeof(sparse_border_t), MPI_BYTE , rank_to_send, rank_to_send, ctx -> mpi_communicator);
-                                        
-                    i_have_sent = 1;
-                    free(borders_to_send);
-                }
-                break;
-            case RECV:
-                {
-                    DB_PRINT("** Rank %d recieving\n", ctx -> mpi_rank);
-                    idx_t num_borders_recv = 0;
-                    MPI_Recv(&num_borders_recv, 1, MPI_UINT64_T, MPI_ANY_SOURCE, ctx -> mpi_rank, ctx -> mpi_communicator, MPI_STATUS_IGNORE);
-
-                    sparse_border_t* borders_to_recv = (sparse_border_t*)MY_MALLOC(num_borders_recv * sizeof(sparse_border_t)); 
-
-                    MPI_Recv(borders_to_recv, num_borders_recv * sizeof(sparse_border_t), MPI_BYTE , MPI_ANY_SOURCE, ctx -> mpi_rank, ctx -> mpi_communicator, MPI_STATUS_IGNORE);
-
-                    for(idx_t i = 0; i < num_borders_recv; ++i)
-                    {
-                        sparse_border_insert(cluster, borders_to_recv[i]);
-                    }
-                    free(borders_to_recv);
-                }
-                break;
-            default:
-                DB_PRINT(".. Rank %d doing nothing\n", ctx -> mpi_rank);
-                break;
-        }
-        MPI_Barrier(ctx -> mpi_communicator);
-        MPI_DB_PRINT("-----------------\n");
-
-    }
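-    /*
-     * note: the merge relies on sparse_border_insert() to resolve
-     * duplicates; presumably, when the same (i, j) pair arrives from
-     * several ranks, only the densest candidate is kept
-     */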
-
-    num_border_el = 0;
-    for(idx_t i = 0; i < nclus; ++i)
-    {
-        num_border_el += cluster -> sparse_borders[i].count;
-    }
-
-    MPI_DB_PRINT("Master final %lu border elements\n", num_border_el);
-
-    #undef SEND
-    #undef RECV
-    #undef DO_NOTHING
-
-    #ifdef WRITE_BORDERS
-        if(I_AM_MASTER)
-        {
-            FILE* f = fopen("bb/borders.csv", "w");
-            for(idx_t i = 0; i < nclus; ++i)
-            {
-                for(idx_t j = 0; j < cluster -> sparse_borders[i].count; ++j)
-                {
-                    sparse_border_t b = cluster -> sparse_borders[i].data[j];
-                    fprintf(f, "%lu,%lu,%lu,%lf\n", b.i, b.j, b.idx, b.density);
-                }
-            }
-            fclose(f);
-        }
-    #endif
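-    /*
-     * borders.csv columns: cluster i, cluster j, border point index,
-     * border (log-)density; fopen() does not create directories, so bb/
-     * must exist before the run
-     */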
-
-    #undef borders
-
-    return;
-}
-
-void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx) 
-{
-    float_t *data;
-    TIME_DEF
-    double elapsed_time;
-
-    if (ctx->mpi_rank == 0) 
-    {
-        /* dataset selection: enable exactly one read_data_file() call */
-
-        /* 2D blob test sets */
-        // data = read_data_file(ctx, "../norm_data/50_blobs_more_var.npy", MY_TRUE);
-        // ctx->dims = 2;
-        // data = read_data_file(ctx, "../norm_data/50_blobs.npy", MY_TRUE);
-        // std_g0163178_Me14_091_0000
-
-        /* 190M points */
-        // data = read_data_file(ctx, "../norm_data/std_g2980844_091_0000", MY_TRUE);
-
-        /* ca. 1M points */
-        // data = read_data_file(ctx, "../norm_data/std_LR_091_0001", MY_TRUE);
-
-        /* box dataset */
-        // data = read_data_file(ctx, "../norm_data/std_Box_256_30_092_0000", MY_TRUE);
-
-        /* 8M points */
-        // data = read_data_file(ctx, "../norm_data/std_g0144846_Me14_091_0001", MY_TRUE);
-
-        /* 34M points */
-        // data = read_data_file(ctx, "../norm_data/std_g1212639_091_0001", MY_TRUE);
-
-        /* 88M points (currently active) */
-        data = read_data_file(ctx, "../norm_data/std_g5503149_091_0000", MY_TRUE);
-
-        ctx->dims = 5;
-
-        /* the file is read as a flat float32 array: convert the raw value
-         * count into a point count */
-        ctx->n_points = ctx->n_points / ctx->dims;
-
-        /* debugging overrides, kept for reference: */
-        // ctx->n_points = 5 * 100000;
-        // ctx->n_points = (ctx->n_points * 5) / 10;
-        // ctx->n_points = ctx->world_size * 1000;
-        // ctx->n_points = 10000000 * ctx->world_size;
-        // generate_random_matrix(&data, ctx->dims, ctx->n_points, ctx);
-        // mpi_printf(ctx, "Read %lu points in %u dims\n", ctx->n_points, ctx->dims);
-    }
-    //MPI_DB_PRINT("[MASTER] Reading file and scattering\n");
-    
-    /* communicate the total number of points*/
-    MPI_Bcast(&(ctx->dims), 1, MPI_UINT32_T, 0, ctx->mpi_communicator);
-    MPI_Bcast(&(ctx->n_points), 1, MPI_UINT64_T, 0, ctx->mpi_communicator);
-
-    /* compute the number of elements to receive for each processor */
-    int *send_counts = (int *)MY_MALLOC(ctx->world_size * sizeof(int));
-    int *displacements = (int *)MY_MALLOC(ctx->world_size * sizeof(int));
-
-    displacements[0] = 0;
-    send_counts[0] = ctx->n_points / ctx->world_size;
-    send_counts[0] += (ctx->n_points % ctx->world_size) > 0 ? 1 : 0;
-    send_counts[0] = send_counts[0] * ctx->dims;
-
-    for (int p = 1; p < ctx->world_size; ++p) 
-    {
-        send_counts[p] = (ctx->n_points / ctx->world_size);
-        send_counts[p] += (ctx->n_points % ctx->world_size) > p ? 1 : 0;
-        send_counts[p] = send_counts[p] * ctx->dims;
-        displacements[p] = displacements[p - 1] + send_counts[p - 1];
-    }
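-    /*
-     * worked example (hypothetical n_points = 10, world_size = 4, dims = 5):
-     * ranks receive 3, 3, 2, 2 points, so send_counts = {15, 15, 10, 10}
-     * and displacements = {0, 15, 30, 40}; both are expressed in floats,
-     * not points, so every rank gets whole dims-sized rows; a quick sanity
-     * check (sketch):
-     *   int tot = 0;
-     *   for(int p = 0; p < ctx->world_size; ++p) tot += send_counts[p];
-     *   assert((uint64_t)tot == ctx->n_points * ctx->dims);
-     */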
-
-
-    ctx->local_n_points = send_counts[ctx->mpi_rank] / ctx->dims;
-
-    float_t *pvt_data = (float_t *)MY_MALLOC(send_counts[ctx->mpi_rank] * sizeof(float_t));
-
-    MPI_Scatterv(data, send_counts, displacements, MPI_MY_FLOAT, pvt_data, send_counts[ctx->mpi_rank], MPI_MY_FLOAT, 0, ctx->mpi_communicator);
-
-    ctx->local_data = pvt_data;
-
-    int k_local = 20;
-    int k_global = 20;
-
-    uint64_t *global_bin_counts_int = (uint64_t *)MY_MALLOC(k_global * sizeof(uint64_t));
-
-    pointset_t original_ps;
-    original_ps.data = ctx->local_data;
-    original_ps.dims = ctx->dims;
-    original_ps.n_points = ctx->local_n_points;
-    original_ps.lb_box = (float_t*)MY_MALLOC(ctx -> dims * sizeof(float_t));
-    original_ps.ub_box = (float_t*)MY_MALLOC(ctx -> dims * sizeof(float_t));
-
-    float_t tol = 0.002;
-
-    top_kdtree_t tree;
-    TIME_START;
-    top_tree_init(ctx, &tree);
-    elapsed_time = TIME_STOP;
-    LOG_WRITE("Initializing global kdtree", elapsed_time);
-
-    TIME_START;
-    build_top_kdtree(ctx, &original_ps, &tree, k_global, tol);
-    exchange_points(ctx, &tree);
-    elapsed_time = TIME_STOP;
-    LOG_WRITE("Top kdtree build and domain decomposition", elapsed_time);
-    //test_the_idea(ctx);
-
-    TIME_START;
-    kdtree_v2 local_tree;
-    kdtree_v2_init( &local_tree, ctx -> local_data, ctx -> local_n_points, (unsigned int)ctx -> dims);
-    int k = 300;
-    //int k = 30;
-
-    datapoint_info_t* dp_info = (datapoint_info_t*)MY_MALLOC(ctx -> local_n_points * sizeof(datapoint_info_t));            
-    /* initialize, to cope with valgrind */
-    for(uint64_t i = 0; i < ctx -> local_n_points; ++i)
-    {
-        dp_info[i].ngbh.data = NULL;
-        dp_info[i].ngbh.N = 0;
-        dp_info[i].ngbh.count = 0;
-        dp_info[i].g = 0.f;
-        dp_info[i].log_rho = 0.f;
-        dp_info[i].log_rho_c = 0.f;
-        dp_info[i].log_rho_err = 0.f;
-        dp_info[i].array_idx = -1;
-        dp_info[i].kstar = -1;
-        dp_info[i].is_center = -1;
-        dp_info[i].cluster_idx = -1;
-    }
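-    /* the -1 assigned to unsigned fields wraps to the maximum representable
-     * value, which doubles as the "not set" sentinel */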
-
-    build_local_tree(ctx, &local_tree);
-    elapsed_time = TIME_STOP;
-    LOG_WRITE("Local trees init and build", elapsed_time);
-
-    TIME_START;
-    MPI_DB_PRINT("----- Performing ngbh search -----\n");
-    MPI_Barrier(ctx -> mpi_communicator);
-
-    mpi_ngbh_search(ctx, dp_info, &tree, &local_tree, ctx -> local_data, k);
-
-    MPI_Barrier(ctx -> mpi_communicator);
-    elapsed_time = TIME_STOP;
-    LOG_WRITE("Total time for all knn search", elapsed_time)
-
-
-
-    TIME_START;
-    //float_t id = id_estimate(ctx, dp_info, ctx -> local_n_points, 0.9, MY_FALSE);
-    float_t id = compute_ID_two_NN_ML(ctx, dp_info, ctx -> local_n_points, MY_FALSE);
-    elapsed_time = TIME_STOP;
-    LOG_WRITE("ID estimate", elapsed_time)
-
-    MPI_DB_PRINT("ID %lf \n",id);
-
-
-    //TIME_START;
-    //datapoint_info_t** foreign_dp_info = (datapoint_info_t**)MY_MALLOC(ctx -> world_size * sizeof(datapoint_info_t*));
-    //find_foreign_nodes(ctx, dp_info, foreign_dp_info);
-    //elapsed_time = TIME_STOP;
-    //LOG_WRITE("Finding points to request the ngbh", elapsed_time)
-
-    TIME_START;
-
-    ctx -> local_datapoints = dp_info;
-    //compute_density_kstarnn_rma(ctx, id, MY_FALSE);
-    compute_density_kstarnn_rma_v2(ctx, id, MY_FALSE);
-    //compute_density_kstarnn(ctx, id, MY_FALSE);
-    compute_correction(ctx, 2);
-    elapsed_time = TIME_STOP;
-    LOG_WRITE("Density estimate", elapsed_time)
-
-    TIME_START;
-    clusters_t clusters = Heuristic1_rma(ctx, MY_FALSE);
-    elapsed_time = TIME_STOP;
-    LOG_WRITE("H1", elapsed_time)
-
-    TIME_START;
-    clusters_allocate(&clusters, 1);
-    Heuristic2_rma(ctx, &clusters);
-    elapsed_time = TIME_STOP;
-    LOG_WRITE("H2", elapsed_time)
-    
-
-    #if defined (WRITE_NGBH)
-        ordered_data_to_file(ctx);
-    #endif
-
-    /*
-    free(foreign_dp_info);
-    */
-    
-    
-    top_tree_free(ctx, &tree);
-    kdtree_v2_free(&local_tree);
-
-    free(send_counts);
-    free(displacements);
-    //free(dp_info);
-
-    if (ctx->mpi_rank == 0) free(data);
-
-    original_ps.data = NULL;
-    free_pointset(&original_ps);
-    free(global_bin_counts_int);
-}
diff --git a/src/tree/tree.h b/src/tree/tree.h
index 9725a38336880b98884d646c272276bbbf18488f..1ee44f847f926b3713114ba76165f2aab8795a6d 100644
--- a/src/tree/tree.h
+++ b/src/tree/tree.h
@@ -2,6 +2,7 @@
 #include "../common/common.h"
 #include "heap.h"
 #include "kdtreeV2.h"
+
 #include <stdlib.h>
 #include <stdio.h>
 #include <memory.h>
@@ -84,41 +85,24 @@ typedef struct top_kdtree_t
 	size_t _capacity;
 	struct top_kdtree_node_t* _nodes;
 	struct top_kdtree_node_t* root;
-
 } top_kdtree_t;
 
-typedef struct border_t {
-  float_t density;
-  float_t error;
-  idx_t idx;
-} border_t;
-
-typedef struct sparse_border_t {
-  idx_t i;
-  idx_t j;
-  idx_t idx;
-  float_t density;
-  float_t error;
-} sparse_border_t;
-
-typedef struct adj_list_t {
-  idx_t count;
-  idx_t size;
-  struct sparse_border_t* data;
-} adj_list_t;
-
-
-typedef struct clusters_t {
-  int use_sparse_borders;
-  struct adj_list_t *sparse_borders;
-  struct lu_dynamic_array_t centers;
-  struct border_t **borders;
-  struct border_t *__borders_data;
-  idx_t n;
-} clusters_t;
-
-
-void simulate_master_read_and_scatter(int, size_t, global_context_t* ); 
+
+
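+/* prototypes shared with the distributed ADP driver (src/adp/adp.c, added
+ * in this change) */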
+void     simulate_master_read_and_scatter(int, size_t, global_context_t*);
+float_t* read_data_file(global_context_t *ctx, const char *fname, const int file_in_float32);
+void     top_tree_init(global_context_t *ctx, top_kdtree_t *tree); 
+void     build_top_kdtree(global_context_t *ctx, pointset_t *og_pointset, top_kdtree_t *tree, int n_bins, float_t tolerance);
+void     exchange_points(global_context_t* ctx, top_kdtree_t* tree);
+void     build_local_tree(global_context_t* ctx, kdtree_v2* local_tree);
+void     mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtree_t* top_tree, kdtree_v2* local_tree, float_t* data, int k);
+float_t  compute_ID_two_NN_ML(global_context_t* ctx, datapoint_info_t* dp_info, idx_t n, int verbose);
+void     compute_density_kstarnn_rma_v2(global_context_t* ctx, const float_t d, int verbose);
+void     compute_correction(global_context_t* ctx, float_t Z);
+int      foreign_owner(global_context_t*, idx_t);
+void     top_tree_free(global_context_t *ctx, top_kdtree_t *tree);
+
+