#include "adp.h"
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

/* Sentinel entry for the dense border matrix: density -1.0, error 0 and idx NOBORDER.
 * clusters_allocate() fills every cell of the dense matrix with this value. */
const border_t border_null = {.density = -1.0, .error = 0, .idx = NOBORDER};
/* Sentinel entry for the sparse (adjacency list) borders; in addition to the fields above
 * both cluster indices i and j are NOBORDER. sparse_border_get() returns it when no entry matches. */
const sparse_border_t sparse_border_null = {.density = -1.0, .error = 0, .idx = NOBORDER, .i = NOBORDER, .j = NOBORDER};

float_t mEst2(float_t * x, float_t *y, idx_t n)
{
    /*
     * Slope of the straight line through the origin that best fits
     * the points (x[p], y[p]):  m = sum(x*y) / sum(x*x)
     * params:
     * - x: abscissas of the points
     * - y: ordinates of the points
     * - n: number of points in both arrays
     * returns: the estimated slope; no check is made on the
     *          denominator, so n == 0 or all-zero x yields 0/0
     */

    float_t sum_xy = 0;
    float_t sum_xx = 0;

    idx_t p = 0;
    while(p < n)
    {
        const float_t xp = x[p];
        const float_t yp = y[p];

        sum_xy += xp*yp;
        sum_xx += xp*xp;

        ++p;
    }

    return sum_xy/sum_xx;
}

float_t compute_ID_two_NN_ML(global_context_t* ctx, datapoint_info_t* dp_info, idx_t n, int verbose)
{
    /*
     * Intrinsic dimension estimate from the first two neighbours of each point.
     * args:
     * - ctx:     global context; provides the communicator and the total
     *            number of points (n_points)
     * - dp_info: local datapoints; entries 1 and 2 of each ngbh array are read
     * - n:       number of local datapoints
     * - verbose: when non zero print the estimate and the elapsed time
     *
     * Each rank accumulates 0.5 * log(ngbh[2].value / ngbh[1].value) over its
     * points, the partial sums are added across ranks and the result is
     *     ID = (n_points - 1) / sum
     */

    struct timespec t_begin, t_end;

    if(verbose)
    {
        printf("ID estimation:\n");
        clock_gettime(CLOCK_MONOTONIC, &t_begin);
    }

    /* rank-local part of the sum of log ratios */
    float_t local_sum = 0.;
    idx_t p = 0;
    while(p < n)
    {
        local_sum += 0.5 * log(dp_info[p].ngbh[2].value/dp_info[p].ngbh[1].value);
        ++p;
    }

    /* global sum, then turn it into the dimension estimate */
    float_t id_estimate = 0;
    MPI_Allreduce(&local_sum, &id_estimate, 1, MPI_MY_FLOAT, MPI_SUM, ctx -> mpi_communicator);
    id_estimate = (ctx -> n_points - 1)/id_estimate;

    if(verbose)
    {
        clock_gettime(CLOCK_MONOTONIC, &t_end);
        double elapsed = (t_end.tv_sec - t_begin.tv_sec);
        elapsed += (t_end.tv_nsec - t_begin.tv_nsec) / 1000000000.0;
        printf("\tID value: %.6lf\n", id_estimate);
        printf("\tTotal time: %.3lfs\n\n", elapsed);
    }

    return id_estimate;
}

float_t get_j_ksel_dist(global_context_t* ctx, idx_t j, idx_t ksel, MPI_Win exposed_ngbh)
{
    /*
     * Value stored in the ksel-th neighbour slot of global point j.
     * If j belongs to this rank the value is read from local memory,
     * otherwise the heap node is fetched from the owner through the
     * exposed_ngbh window (blocking: the request is waited on).
     */
    const int owner_rank = foreign_owner(ctx, j);

    /* local point: no communication needed */
    if(owner_rank == ctx -> mpi_rank)
    {
        return ctx -> local_datapoints[j - ctx -> idx_start].ngbh[ksel].value;
    }

    /* halo point: the window is addressed in bytes, k nodes per point */
    const idx_t nodes_per_point = ctx -> k;
    const idx_t remote_pos      = j - ctx -> rank_idx_start[owner_rank];
    const MPI_Aint byte_offset  = (MPI_Aint)((remote_pos * nodes_per_point + ksel) * sizeof(heap_node));

    heap_node fetched;
    MPI_Request req;
    MPI_Rget(&fetched, sizeof(heap_node), MPI_BYTE, owner_rank, byte_offset, sizeof(heap_node), MPI_BYTE, exposed_ngbh, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    return fetched.value;
}

idx_t get_j_ksel_idx(global_context_t* ctx, idx_t j, idx_t ksel, MPI_Win exposed_ngbh)
{
    /*
     * Index (array_idx) stored in the ksel-th neighbour slot of global point j.
     * If j belongs to this rank the index is read from local memory,
     * otherwise the heap node is fetched from the owner through the
     * exposed_ngbh window (blocking: the request is waited on).
     */
    const int owner_rank = foreign_owner(ctx, j);

    /* local point: no communication needed */
    if(owner_rank == ctx -> mpi_rank)
    {
        return ctx -> local_datapoints[j - ctx -> idx_start].ngbh[ksel].array_idx;
    }

    /* halo point: the window is addressed in bytes, k nodes per point */
    const idx_t nodes_per_point = ctx -> k;
    const idx_t remote_pos      = j - ctx -> rank_idx_start[owner_rank];
    const MPI_Aint byte_offset  = (MPI_Aint)((remote_pos * nodes_per_point + ksel) * sizeof(heap_node));

    heap_node fetched;
    MPI_Request req;
    MPI_Rget(&fetched, sizeof(heap_node), MPI_BYTE, owner_rank, byte_offset, sizeof(heap_node), MPI_BYTE, exposed_ngbh, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    return fetched.array_idx;
}


void compute_density_kstarnn_rma_v2(global_context_t* ctx, const float_t d, int verbose){

    /*
     * Point density and k* computation using one-sided MPI communication.
     * args:
     * - ctx     : global context (local datapoints, neighbour heaps, communicator)
     * - d       : intrinsic dimension of the dataset
     * - verbose : when non zero print a header and the total elapsed time
     *
     * Steps:
     * 1. every stored neighbour value v of every local point is replaced by
     *    omega * v^(d/2), with omega = pi^(d/2) / Gamma(d/2 + 1), and kstar
     *    is reset to 0;
     * 2. the neighbour heaps are exposed in an MPI window;
     * 3. for j = 4 .. kMAX - 2 (ksel = j - 1) every point that has no kstar
     *    yet compares its own ksel-th value with the ksel-th value of its
     *    j-th neighbour (fetched with MPI_Get when the neighbour lives on
     *    another rank); when the test statistic dL exceeds DTHR the point
     *    gets kstar = ksel and its log density, error and g are computed;
     * 4. points that never exceeded the threshold get kstar = kMAX - 1.
     *
     * The transformation of step 1 is performed BEFORE the window is created
     * and synchronised: remote ranks read the transformed values, so every
     * rank must have finished rewriting its heap before any MPI_Get can start.
     * (This relies on ngbh pointing into __local_heap_buffers, as the window
     * offsets used below imply.)
     */

    MPI_Barrier(ctx -> mpi_communicator);
    idx_t k = ctx -> k;

    struct timespec start_tot, finish_tot;
    double elapsed_tot;

    datapoint_info_t* local_datapoints = ctx -> local_datapoints;

    if(verbose)
    {
        printf("Density and k* estimation:\n");
        clock_gettime(CLOCK_MONOTONIC, &start_tot);
    }

    idx_t kMAX = ctx -> k - 1;

    /* volume prefactor, computed in the precision selected for float_t */
    float_t omega = 0.;
    if(sizeof(float_t) == sizeof(float)){ omega = powf(PI_F,d/2)/tgammaf(d/2.0f + 1.0f);}
    else{omega = pow(M_PI,d/2.)/tgamma(d/2.0 + 1.0);}

    /* step 1: local rewrite of the heaps, done before any remote access is possible */
    #pragma omp parallel for
    for(idx_t i = 0; i < ctx -> local_n_points; ++i)
    {
        for(idx_t kk = 0; kk <= kMAX; ++kk)
        {
            local_datapoints[i].ngbh[kk].value = omega * pow(local_datapoints[i].ngbh[kk].value, d/2.);
        }
        //initialize kstar at 0
        local_datapoints[i].kstar = 0;
    }

    // [LT comment] it may be the case to promote this win to a global variable
    //              and to leave it open until it is needed (then also for the next
    //              heuristics)
    //              Also, shall we give some info argument to it ?
    //

    /* step 2: expose the (already transformed) heaps, byte addressed */
    MPI_Win exposed_ngbh;
    MPI_Win_create( ctx -> __local_heap_buffers,
                    ctx -> local_n_points * k * sizeof(heap_node),
                    1, MPI_INFO_NULL,
                    ctx -> mpi_communicator,
                    &exposed_ngbh);

    MPI_Win_fence(MPI_MODE_NOPUT, exposed_ngbh);

    /* explicit barrier: the first fence is not guaranteed to synchronise */
    MPI_Barrier(ctx -> mpi_communicator);

    int i_have_finished = 0;
    int all_have_finished = 0;
    idx_t finished_points = 0;
    /* one slot per local point, receives the neighbour's heap node for the current ksel */
    heap_node* scratch_heap_nodes = (heap_node*)MY_MALLOC(ctx -> local_n_points * sizeof(heap_node));

    /* step 3 */
    for(idx_t j = 4; j < kMAX - 1; ++j)
    {
        //request data
        idx_t ksel = j - 1;

#if !defined(THREAD_FUNNELED)
        #pragma omp parallel for
#endif
        for(idx_t i = 0; i < ctx -> local_n_points; ++i)
        {
            if(ctx -> local_datapoints[i].kstar == 0)
            {
                idx_t jj = local_datapoints[i].ngbh[j].array_idx;

                /*
                 * note jj can be a halo point
                 * it may have to be fetched from a foreign rank
                 */

                int owner = foreign_owner(ctx, jj);
                idx_t pos = jj - ctx -> rank_idx_start[owner];

                if(owner == ctx -> mpi_rank)
                {
                    scratch_heap_nodes[i] = ctx -> local_datapoints[pos].ngbh[ksel];
                }
                else
                {
                    MPI_Get(scratch_heap_nodes + i,
                            sizeof(heap_node),
                            MPI_BYTE,
                            owner,
                            (MPI_Aint)((pos * (ctx -> k) + ksel) * sizeof(heap_node)),
                            sizeof(heap_node),
                            MPI_BYTE,
                            exposed_ngbh);
                }
            }
        }

        /* completes all the MPI_Get issued above */
        MPI_Win_fence(MPI_MODE_NOPUT,exposed_ngbh);

        //process data
#if !defined(THREAD_FUNNELED)
        #pragma omp parallel for
#endif
        for(idx_t i = 0; i < ctx -> local_n_points; ++i)
        {
            if(ctx -> local_datapoints[i].kstar == 0)
            {
                float_t vvi, vvj, vp, dL;
                vvi = local_datapoints[i].ngbh[ksel].value;
                vvj = scratch_heap_nodes[i].value;
                vp = (vvi + vvj)*(vvi + vvj);
                dL = -2.0 * ksel * log(4.*vvi*vvj/vp);

                if(dL > DTHR)
                {
                    idx_t kstar = j - 1;
                    local_datapoints[i].kstar = kstar;
                    local_datapoints[i].log_rho = log((float_t)(kstar)/vvi/((float_t)(ctx -> n_points)));
                    local_datapoints[i].log_rho_err =   1.0/sqrt((float_t)kstar); //(float_t)(-Q_rsqrt((float)k));
                    local_datapoints[i].g = local_datapoints[i].log_rho - local_datapoints[i].log_rho_err;

                    #pragma omp atomic update
                    finished_points++;
                }
            }
        }

        /* stop as soon as every point on every rank has its kstar */
        i_have_finished = (finished_points == ctx -> local_n_points);
        MPI_Allreduce(&i_have_finished, &all_have_finished, 1, MPI_INT, MPI_LAND, ctx -> mpi_communicator);

        if(all_have_finished) break;
    }

    /* step 4: points that never passed the test use the largest admissible k */
    #pragma omp parallel for
    for(idx_t i = 0; i < ctx -> local_n_points; ++i)
    {
        if(ctx -> local_datapoints[i].kstar == 0)
        {
            idx_t kstar = kMAX - 1;

            float_t vvi = ctx -> local_datapoints[i].ngbh[kstar].value;

            local_datapoints[i].kstar = kstar;
            local_datapoints[i].log_rho = log((float_t)(kstar)/vvi/((float_t)(ctx -> n_points)));
            local_datapoints[i].log_rho_err =   1.0/sqrt((float_t)kstar); //(float_t)(-Q_rsqrt((float)k));
            local_datapoints[i].g = local_datapoints[i].log_rho - local_datapoints[i].log_rho_err;
        }
    }

    MPI_Win_fence(0, exposed_ngbh);
    MPI_Win_free(&exposed_ngbh);

    free(scratch_heap_nodes);

    #if defined(WRITE_DENSITY)
        /* densities */
        float_t* den = (float_t*)MY_MALLOC(ctx -> local_n_points * sizeof(float_t));
        float_t* gs = (float_t*)MY_MALLOC(ctx -> local_n_points * sizeof(float_t));
        idx_t* ks = (idx_t*)MY_MALLOC(ctx -> local_n_points * sizeof(idx_t));

        for(idx_t i = 0; i < ctx -> local_n_points; ++i) den[i] = ctx -> local_datapoints[i].log_rho;
        for(idx_t i = 0; i < ctx -> local_n_points; ++i) ks[i]  = ctx -> local_datapoints[i].kstar;
        for(idx_t i = 0; i < ctx -> local_n_points; ++i) gs[i]  = ctx -> local_datapoints[i].g;

        ordered_buffer_to_file(ctx, den, sizeof(float_t), ctx -> local_n_points, "bb/ordered_density.npy");
        ordered_buffer_to_file(ctx, ks, sizeof(idx_t), ctx -> local_n_points, "bb/ks.npy");
        ordered_buffer_to_file(ctx, gs, sizeof(float_t), ctx -> local_n_points, "bb/g.npy");

        ordered_data_to_file(ctx, "bb/ordered_data.npy");
        free(den);
        free(ks);
        free(gs);
    #endif

    if(verbose)
    {
        /* report the elapsed time in the same format used by compute_ID_two_NN_ML */
        clock_gettime(CLOCK_MONOTONIC, &finish_tot);
        elapsed_tot = (finish_tot.tv_sec - start_tot.tv_sec);
        elapsed_tot += (finish_tot.tv_nsec - start_tot.tv_nsec) / 1000000000.0;
        printf("\tTotal time: %.3lfs\n\n", elapsed_tot);
    }

    return;
}

float_t get_j_ksel_dist_v2(global_context_t* ctx, idx_t i, idx_t j, idx_t ksel, int* flags, heap_node* tmp_heap_nodes, MPI_Win* exposed_ngbh)
{
    /*
     * Two-phase variant of get_j_ksel_dist, driven by flags[i]:
     * - flags[i] != 0: look up the ksel-th neighbour value of point j.
     *   A local point is answered immediately. For a foreign point an
     *   MPI_Get into tmp_heap_nodes[i] is issued, flags[i] is cleared and
     *   0 is returned as a placeholder (the get is NOT completed here).
     * - flags[i] == 0: a get was issued by a previous call; flags[i] is
     *   set again and the value found in tmp_heap_nodes[i] is returned.
     */

    /* second phase: consume what the previous call requested */
    if(!flags[i])
    {
        flags[i] = 1;
        return tmp_heap_nodes[i].value;
    }

    const int owner_rank = foreign_owner(ctx, j);

    /* local point: answer right away */
    if(owner_rank == ctx -> mpi_rank)
    {
        return ctx -> local_datapoints[j - ctx -> idx_start].ngbh[ksel].value;
    }

    /* foreign point: start the RMA read and mark the slot as pending */
    flags[i] = 0;
    const idx_t nodes_per_point = ctx -> k;
    const idx_t remote_pos      = j - ctx -> rank_idx_start[owner_rank];
    const MPI_Aint byte_offset  = (MPI_Aint)((remote_pos * nodes_per_point + ksel) * sizeof(heap_node));
    MPI_Get(tmp_heap_nodes + i, sizeof(heap_node), MPI_BYTE, owner_rank, byte_offset, sizeof(heap_node), MPI_BYTE, *exposed_ngbh);
    return 0;
}

void clusters_allocate(clusters_t * c, int s)
{
    /*
     * Allocate the border storage of a clusters object.
     * - c: clusters object; c -> centers must already hold the centers
     * - s: non zero selects the sparse (adjacency list) representation,
     *      zero selects the dense nclus x nclus matrix
     * Nothing is allocated (a message is printed) when there are no centers.
     */
    const idx_t n_clusters = c -> centers.count;

    if(n_clusters == 0)
    {
        printf("Provide a valid cluster centers list\n");
        return;
    }

    if(!s)
    {
        /* dense implementation: one contiguous buffer plus an array of row pointers */
        c -> use_sparse_borders = 0;
        c -> __borders_data     = (border_t*)MY_MALLOC(n_clusters*n_clusters*sizeof(border_t));
        c -> borders            = (border_t**)MY_MALLOC(n_clusters*sizeof(border_t*));

        #pragma omp parallel for
        for(idx_t row = 0; row < n_clusters; ++row)
        {
            border_t* row_start = c -> __borders_data + row*n_clusters;
            c -> borders[row] = row_start;
            /* every cell starts as "no border" */
            for(idx_t col = 0; col < n_clusters; ++col)
            {
                row_start[col] = border_null;
            }
        }
        return;
    }

    /* sparse implementation: one empty, pre-sized list per cluster */
    c -> use_sparse_borders = 1;
    c -> sparse_borders = (adj_list_t*)MY_MALLOC(n_clusters*sizeof(adj_list_t));
    for(idx_t row = 0; row < n_clusters; ++row)
    {
        adj_list_t* lst = c -> sparse_borders + row;
        lst -> count = 0;
        lst -> size  = PREALLOC_BORDERS;
        lst -> data  = (sparse_border_t*)MY_MALLOC(PREALLOC_BORDERS*sizeof(sparse_border_t));
    }
}


void adj_list_insert(adj_list_t* l, sparse_border_t b)
{
    /*
     * Append border b to the adjacency list l (sparse border implementation).
     * When the list is full its capacity grows by PREALLOC_BORDERS elements.
     *
     * The result of realloc is checked before it replaces l -> data: on
     * failure the old buffer is still owned by the list, an error is
     * reported and the program terminates instead of dereferencing NULL.
     */
    if(l -> count >= l -> size)
    {
        sparse_border_t* grown = realloc(l -> data, sizeof(sparse_border_t) * (l -> size + PREALLOC_BORDERS));
        if(grown == NULL)
        {
            fprintf(stderr, "adj_list_insert: out of memory while growing a border list, aborting\n");
            exit(1);
        }
        l -> data  = grown;
        l -> size += PREALLOC_BORDERS;
    }

    l -> data[l -> count] = b;
    l -> count++;
}

void adj_list_reset(adj_list_t* l)
{
    /*
     * Release the storage of an adjacency list and leave it empty
     * (no buffer, zero capacity, zero elements).
     */
    free(l -> data);    /* free(NULL) is a no-op, no guard needed */
    l -> data  = NULL;
    l -> size  = 0;
    l -> count = 0;
}

void clusters_reset(clusters_t * c)
{
    /*
     * Release every buffer owned by a clusters object: the borders (sparse
     * lists or dense matrix, depending on use_sparse_borders) and the
     * centers array.
     *
     * Every freed pointer is set to NULL so that the object never holds
     * dangling pointers and a second reset (clusters_free calls this
     * function too) is harmless instead of a double free.
     */
    if(c -> use_sparse_borders)
    {
        /* the lists exist only while sparse_borders is non-NULL */
        if(c -> sparse_borders)
        {
            for(idx_t i = 0; i < c -> centers.count; ++i)
            {
                adj_list_reset((c -> sparse_borders) + i);
            }
            free(c -> sparse_borders);
            c -> sparse_borders = NULL;
        }
    }
    else
    {
        free(c -> __borders_data);
        c -> __borders_data = NULL;

        free(c -> borders);
        c -> borders = NULL;
    }

    free(c -> centers.data);
    c -> centers.data = NULL;
}

void clusters_free(clusters_t * c)
{
    /*
     * Dispose of a clusters object.
     * All the work is delegated to clusters_reset; the struct pointed
     * to by c itself is not freed here.
     */
    clusters_reset(c);
}


void sparse_border_insert(clusters_t *c, sparse_border_t b)
{
    /*
     * Insert border b in the sparse implementation.
     * The list of cluster b.i is scanned: every stored entry with the same
     * (i, j) pair is overwritten when b has a larger density; b is appended
     * only if no entry with that pair exists.
     */
    adj_list_t* row = c -> sparse_borders + b.i;
    const idx_t stored = row -> count;
    int already_present = 0;

    for(idx_t pos = 0; pos < stored; ++pos)
    {
        sparse_border_t* entry = row -> data + pos;

        /* skip entries that describe a different pair of clusters */
        if(entry -> i != b.i || entry -> j != b.j) continue;

        already_present = 1;
        if(b.density > entry -> density)
        {
            *entry = b;
        }
    }

    if(!already_present) adj_list_insert(row, b);
}

sparse_border_t sparse_border_get(clusters_t* c, idx_t i, idx_t j)
{
    /*
     * Look up the border between clusters i and j in the sparse implementation.
     * - i, j: clusters whose border is searched (the list of cluster i is scanned)
     * returns sparse_border_null when no entry matches; if several entries
     * match, the last one in the list is returned
     */
    const adj_list_t* row = c -> sparse_borders + i;
    sparse_border_t found = sparse_border_null;

    idx_t pos = 0;
    while(pos < row -> count)
    {
        const sparse_border_t* entry = row -> data + pos;
        if(entry -> i == i && entry -> j == j)
        {
            found = *entry;
        }
        ++pos;
    }

    return found;
}

datapoint_info_t find_possibly_halo_datapoint_rma(global_context_t* ctx, idx_t idx, MPI_Win win_datapoints)
{
    /*
     * Return a copy of the datapoint with global index idx.
     * Points owned by this rank are copied from local memory; points owned
     * by another rank are read through win_datapoints with a blocking
     * MPI_Rget. The remote read is serialised among OpenMP threads.
     */
    const int owner_rank = foreign_owner(ctx, idx);

    if(owner_rank != ctx -> mpi_rank)
    {
        /* halo point: fetch the whole struct from its owner */
        datapoint_info_t remote_copy;
        #pragma omp critical
        {
            MPI_Request req;
            const idx_t remote_pos = idx - ctx -> rank_idx_start[owner_rank];

            MPI_Rget(&remote_copy, sizeof(datapoint_info_t), MPI_BYTE, owner_rank,
                     remote_pos * sizeof(datapoint_info_t), sizeof(datapoint_info_t), MPI_BYTE, win_datapoints, &req);
            MPI_Wait(&req, MPI_STATUS_IGNORE);
        }

        return remote_copy;
    }

    /* local point */
    return ctx -> local_datapoints[idx - ctx -> idx_start];
}

int cmpPP(const void* p1, const void *p2)
{
    /*
     * Utility function to perform quicksort   
     * when clustering assignment is performed    
     */
    datapoint_info_t* pp1 = *(datapoint_info_t**)p1;
    datapoint_info_t* pp2 = *(datapoint_info_t**)p2;
    //return 2*(pp1 -> g < pp2 -> g) - 1;
    int v = (pp1 -> g < pp2 -> g) - (pp1 -> g > pp2 -> g);
    int a = (pp1 -> array_idx < pp2 -> array_idx) - (pp1 -> array_idx > pp2 -> array_idx);
    return v == 0 ? a : v;
    //return v;
}

/*
 * Tell whether the global index idx falls in the range of points owned by
 * this rank, i.e. [idx_start, idx_start + local_n_points).
 * Returns 1 when it does, 0 otherwise.
 *
 * Deliberately NOT declared `inline`: in C99/C11 a plain `inline` definition
 * does not emit an external definition, so any call the compiler decides not
 * to inline (e.g. at -O0) would fail at link time. As an ordinary function
 * it can still be inlined by the optimiser.
 */
int its_mine(global_context_t* ctx, idx_t idx)
{
    return idx >= ctx -> idx_start && idx < ctx -> idx_start + ctx -> local_n_points;
}

int compare_center(const void* a, const void* b)
{
    /*
     * qsort comparator for center_t elements: decreasing density.
     */
    const center_t* lhs = a;
    const center_t* rhs = b;

    if(lhs -> density < rhs -> density) return  1;
    if(lhs -> density > rhs -> density) return -1;
    return 0;
}


void compute_correction(global_context_t* ctx, float_t Z)
{
    /*
     * Find the global minimum of (log_rho - Z * log_rho_err) over all the
     * datapoints of all ranks and shift the log densities so that the
     * corrected values start at 1:
     *     log_rho_c = log_rho - min + 1
     *     g         = log_rho_c - log_rho_err
     * args:
     * - ctx: global context (local datapoints and communicator)
     * - Z:   weight of the density error in the quantity being minimised
     *
     * The running minimum starts from +infinity, the identity of the min
     * operation, so a rank with no points does not influence the result.
     */
    float_t min_log_rho = INFINITY;

    datapoint_info_t* dp_info = ctx -> local_datapoints;
    idx_t n = ctx -> local_n_points;

    /* rank-local minimum */
    #pragma omp parallel for reduction(min:min_log_rho)
    for(idx_t i = 0; i < n; ++i)
    {
        float_t tmp = dp_info[i].log_rho - Z*dp_info[i].log_rho_err;
        if(tmp < min_log_rho) min_log_rho = tmp;
    }

    /* global minimum across ranks */
    MPI_Allreduce(MPI_IN_PLACE, &min_log_rho, 1, MPI_MY_FLOAT, MPI_MIN, ctx -> mpi_communicator);

    /* apply the shift */
    #pragma omp parallel for
    for(idx_t i = 0; i < n; ++i)
    {
        dp_info[i].log_rho_c = dp_info[i].log_rho - min_log_rho + 1;
        dp_info[i].g = dp_info[i].log_rho_c - dp_info[i].log_rho_err;
    }
}

/* maybe this should return an error?*/

/* Values a lock slot can hold; used by h1_lock_acquire / h1_lock_free. */
#define LOCK_ACQUIRED 1
#define LOCK_FREE     0

/* C type of a lock slot and the MPI datatype passed to the RMA calls that
 * operate on it; the two definitions must be kept in agreement. */
#define lock_t     int
#define MPI_LOCK_T MPI_INT

lock_t h1_lock_acquire(global_context_t* ctx, MPI_Win lock_window, int owner, idx_t pos, lock_t state)
{
    /*
     * Spin until the lock stored at displacement pos of lock_window on rank
     * owner is acquired.
     * - ctx:         global context (only used to report the rank on error)
     * - lock_window: window exposing the lock array
     * - owner, pos:  target rank and displacement of the lock slot
     * - state:       current state held by the caller; nothing is done
     *                unless it is LOCK_FREE
     * returns the new state (LOCK_ACQUIRED once the lock has been taken).
     *
     * The lock is taken with an atomic compare-and-swap FREE -> ACQUIRED;
     * the value previously stored at the target tells whether the swap won.
     * The caller sleeps only when the lock was busy, and every MPI return
     * code (swap and flush) is checked; on error the process exits.
     */
    if(state == LOCK_FREE)
    {
        state = LOCK_ACQUIRED;

        lock_t compare = LOCK_FREE;
        lock_t result  = LOCK_ACQUIRED;

        int err = MPI_SUCCESS;

        for(;;)
        {
            err = MPI_Compare_and_swap(&state, &compare, &result, MPI_LOCK_T, owner, pos, lock_window);
            if(err != MPI_SUCCESS) break;

            /* result is meaningful only once the operation has completed */
            err = MPI_Win_flush(owner, lock_window);
            if(err != MPI_SUCCESS) break;

            /* the slot was not held by anyone: the lock is ours */
            if(result != LOCK_ACQUIRED) break;

            /* lock busy: back off a little before retrying */
            usleep(100);
        }

        if(err != MPI_SUCCESS)
        {
            printf("/!\\ Rank %d at line %d\n encountered an error while using MPI_RMA, aborting\n", ctx -> mpi_rank, __LINE__);
            print_error_code(err);
            exit(1);
        }

    }
    return state;
}

lock_t h1_lock_free(global_context_t* ctx, MPI_Win lock_window, int owner, idx_t pos, lock_t state)
{
    /*
     * Release the lock stored at displacement pos of lock_window on rank owner.
     * - ctx:         global context (unused here, kept for symmetry with
     *                h1_lock_acquire)
     * - lock_window: window exposing the lock array
     * - owner, pos:  target rank and displacement of the lock slot
     * - state:       current state held by the caller; nothing is done
     *                unless it is LOCK_ACQUIRED
     * returns the new state (LOCK_FREE once the lock has been released).
     *
     * The release is an atomic replace of the slot with LOCK_FREE. The
     * operation is flushed before returning: the origin buffer is a local
     * variable that must stay alive until the accumulate has completed, and
     * ranks spinning on the lock must see the release without waiting for
     * the end of the access epoch.
     */
    if(state == LOCK_ACQUIRED)
    {
        state = LOCK_FREE;

        MPI_Accumulate(&state, 1, MPI_LOCK_T, owner,
                        pos,   1, MPI_LOCK_T, MPI_REPLACE, lock_window);
        MPI_Win_flush(owner, lock_window);

    }
    return state;
}

clusters_t Heuristic1(global_context_t *ctx)
{
    /*
     * Heuristic 1, from the paper of Errico, Facco, Laio & Rodriguez
     * ( https://doi.org/10.1016/j.ins.2021.01.010 )
     *
     * Phases:
     * 1. putative centers: a local point is tagged as center when none of
     *    its first kstar neighbours has a larger g;
     * 2. center elimination: every point i looks at its first kstar
     *    neighbours; a neighbour j that is a center with g smaller than
     *    i's is marked for removal in to_remove_mask (kept on the rank
     *    owning j), remembering the eliminating point with the largest g;
     * 3. the surviving centers get a cluster label, labels are made
     *    globally unique through the per-rank center counts and the global
     *    indices of all centers are gathered on every rank;
     * 4. cluster assignment: every remaining point takes the label of its
     *    nearest neighbour with larger g, iterating until every point on
     *    every rank has a label.
     *
     * Returns a clusters_t whose centers array holds the global indices of
     * all centers and whose n is the number of local points; every other
     * field is zero initialised.
     */

    datapoint_info_t* dp_info = ctx -> local_datapoints;
    idx_t n = ctx -> local_n_points;

    struct timespec start_tot, finish_tot;
    double elapsed_tot;

    TIME_DEF;

    /* taken here so that the elapsed time computed at the end is well defined */
    clock_gettime(CLOCK_MONOTONIC, &start_tot);

    lu_dynamic_array_t all_centers, removed_centers, actual_centers, max_rho;

    lu_dynamic_array_allocate(&all_centers);
    lu_dynamic_array_allocate(&removed_centers);
    lu_dynamic_array_allocate(&actual_centers);
    lu_dynamic_array_allocate(&max_rho);

    datapoint_info_t** dp_info_ptrs = (datapoint_info_t**)MY_MALLOC(n*sizeof(datapoint_info_t*));

    /* proceed */

    MPI_Win win_datapoints;
    MPI_Win_create(ctx -> local_datapoints, ctx -> local_n_points * sizeof(datapoint_info_t),
                   1, MPI_INFO_NULL, ctx -> mpi_communicator, &win_datapoints);
    MPI_Win_fence(0, win_datapoints);
    MPI_Win_lock_all(0,  win_datapoints);

#if !defined(THREAD_FUNNELED)
    #pragma omp parallel for
#endif
    for(idx_t i = 0; i < n; ++i)
    {
        /*
         * Find the centers of the clusters as the points of higher density in their neighborhoods
         * A point is tagged as a putative center if it is the point of higher density of its neighborhood
         */

        dp_info_ptrs[i] = dp_info + i;
        idx_t maxk = dp_info[i].kstar + 1;
        float_t gi = dp_info[i].g;
        dp_info[i].is_center = 1;
        dp_info[i].cluster_idx = -1;
        for(idx_t k = 1; k < maxk; ++k)
        {
            idx_t ngbh_index = dp_info[i].ngbh[k].array_idx;
            datapoint_info_t dj = find_possibly_halo_datapoint_rma(ctx, ngbh_index, win_datapoints);
            float_t gj = dj.g;
            if(gj > gi){
                dp_info[i].is_center = 0;
                break;
            }
        }
        if(dp_info[i].is_center)
        {
            #pragma omp critical (push_candidate_center)
            {
                lu_dynamic_array_pushBack(&all_centers, i);
            }
        }
    }

    /*
     * optimized version
     *
     * Generate a mask that keeps track of which point has eliminated the
     * point considered. Each thread updates this mask, then after the procedure
     * ends, center, removed centers, and max_rho arrays are populated
     */

    lock_t*    lock_array     = (lock_t*)MY_MALLOC(n * sizeof(lock_t));
    heap_node* to_remove_mask = (heap_node*)MY_MALLOC(n*sizeof(heap_node));

    for(idx_t p = 0; p < n; ++p)
    {
        to_remove_mask[p].array_idx = MY_SIZE_MAX;
        to_remove_mask[p].value = 9999999;
        lock_array[p] = LOCK_FREE;
    }
    qsort(dp_info_ptrs, n, sizeof(datapoint_info_t*), cmpPP);


    MPI_Win win_to_remove_mask;
    MPI_Win_create(to_remove_mask, n * sizeof(heap_node), 1, MPI_INFO_NULL, ctx -> mpi_communicator, &win_to_remove_mask);
    MPI_Win_fence(0, win_to_remove_mask);

    MPI_Win win_locks;
    MPI_Win_create(lock_array, n * sizeof(lock_t), sizeof(lock_t), MPI_INFO_NULL, ctx -> mpi_communicator, &win_locks);
    MPI_Win_fence(0, win_locks);

#ifdef EXP_H1
    MPI_Win_lock_all(0, win_to_remove_mask);
    MPI_Win_lock_all(0, win_locks);
#endif

#ifdef EXP_H1
    printf("Using experimental h1\n");
#endif

    /*
     * The elimination loop reads is_center of points owned by other ranks:
     * make sure every rank has finished tagging its putative centers.
     */
    MPI_Barrier(ctx -> mpi_communicator);

#if !defined(THREAD_FUNNELED)
    #pragma omp parallel for schedule(dynamic)
#endif
    for(idx_t p = 0; p < n; ++p)
    {
        datapoint_info_t i_point = *(dp_info_ptrs[p]);

        for(idx_t j = 1; j < i_point.kstar + 1; ++j)
        {
            idx_t jidx = i_point.ngbh[j].array_idx;

            datapoint_info_t j_point = find_possibly_halo_datapoint_rma(ctx, jidx,  win_datapoints);

            if(j_point.is_center && i_point.g > j_point.g)
            {
                /*
                 *
                 * TODO: Implement it without this but using private locks
                 * use an array of locks, and compare and swap to actually gain control of the thing
                 *
                 * */

#ifdef EXP_H1
                #pragma omp critical (h1_exp)
                {
                    int owner = foreign_owner(ctx, jidx);
                    idx_t jpos = jidx - ctx -> rank_idx_start[owner];

                    lock_t state = LOCK_FREE;

                    state = h1_lock_acquire(ctx, win_locks, owner, jpos, state);

                    heap_node mask_element;
                    MPI_Request request;

                    MPI_Rget(&mask_element, sizeof(heap_node), MPI_BYTE,
                            owner, jpos * sizeof(heap_node), sizeof(heap_node), MPI_BYTE, win_to_remove_mask, &request);
                    MPI_Wait(&request, MPI_STATUS_IGNORE);

                    int flag = mask_element.array_idx == MY_SIZE_MAX;
                    if(flag || i_point.g > mask_element.value )
                    {
                        heap_node tmp_mask_element = {.array_idx = i_point.array_idx, .value = i_point.g};
                        MPI_Request put_request;
                        MPI_Rput(&tmp_mask_element, sizeof(heap_node), MPI_BYTE, owner,
                                jpos*sizeof(heap_node), sizeof(heap_node), MPI_BYTE, win_to_remove_mask, &put_request);
                        MPI_Wait(&put_request, MPI_STATUS_IGNORE);

                        /*
                         * waiting on the request only gives local completion:
                         * flush so that the new mask is visible at the target
                         * before the lock is released to another rank
                         */
                        MPI_Win_flush(owner, win_to_remove_mask);
                    }

                    state = h1_lock_free(ctx, win_locks, owner, jpos, state);
                }
#else
                #pragma omp critical (centers_elimination)
                {
                    int owner = foreign_owner(ctx, jidx);
                    idx_t jpos = jidx - ctx -> rank_idx_start[owner];

                    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, owner, 0, win_to_remove_mask);
                    heap_node mask_element;
                    MPI_Request request;
                    MPI_Rget(&mask_element, sizeof(heap_node), MPI_BYTE,
                             owner, jpos * sizeof(heap_node), sizeof(heap_node), MPI_BYTE, win_to_remove_mask, &request);
                    MPI_Wait(&request, MPI_STATUS_IGNORE);

                    int flag = mask_element.array_idx == MY_SIZE_MAX;
                    if(flag || i_point.g > mask_element.value )
                    {
                        heap_node tmp_mask_element = {.array_idx = i_point.array_idx, .value = i_point.g};
                        MPI_Request put_request;
                        MPI_Rput(&tmp_mask_element, sizeof(heap_node), MPI_BYTE, owner,
                                jpos*sizeof(heap_node), sizeof(heap_node), MPI_BYTE, win_to_remove_mask, &put_request);
                        MPI_Wait(&put_request, MPI_STATUS_IGNORE);

                    }

                    /* the unlock completes the put at the target */
                    MPI_Win_unlock(owner, win_to_remove_mask);
                }
#endif
            }
        }
    }

#ifdef EXP_H1
    MPI_Win_unlock_all(win_to_remove_mask);
    MPI_Win_unlock_all(win_locks);
#endif

    MPI_Win_fence(0, win_to_remove_mask);
    MPI_Win_fence(0, win_locks);
    MPI_Barrier(ctx -> mpi_communicator);

    /* populate the usual arrays */
    for(idx_t p = 0; p < all_centers.count; ++p)
    {
        idx_t i = all_centers.data[p];
        int e = 0;
        idx_t mr = to_remove_mask[i].array_idx;
        if(mr != MY_SIZE_MAX)
        {
            e = 1;
        }
        switch (e)
        {
            case 1:
                {
                    lu_dynamic_array_pushBack(&removed_centers,dp_info[i].array_idx);
                    //here it sets is_center to 0
                    dp_info[i].is_center = 0;
                    /* if the eliminating point was itself removed, follow it to its own eliminator */
                    for(idx_t c = 0; c < removed_centers.count - 1; ++c)
                    {
                        if(mr == removed_centers.data[c])
                        {
                            mr = max_rho.data[c];
                        }
                    }
                    lu_dynamic_array_pushBack(&max_rho,mr);

                }
                break;

            case 0:
                {
                    lu_dynamic_array_pushBack(&actual_centers,i);
                    dp_info[i].cluster_idx = actual_centers.count - 1;
                }
                break;

            default:
                break;
        }
    }

    MPI_Win_free(&win_to_remove_mask);
    free(to_remove_mask);

    MPI_Win_free(&win_locks);
    free(lock_array);

    int n_centers = (int)actual_centers.count;
    int tot_centers;
    MPI_Allreduce(&n_centers, &tot_centers, 1, MPI_INT, MPI_SUM, ctx -> mpi_communicator);

    MPI_DB_PRINT("Found %d temporary centers\n", tot_centers);

    /*
     * gather the number of centers found by every rank and use the
     * exclusive prefix sum as offset to get unique cluster labels
     */

    int* center_counts = (int*)MY_MALLOC(ctx -> world_size * sizeof(int));
    int* center_displs = (int*)MY_MALLOC(ctx -> world_size * sizeof(int));

    int cc = (int)actual_centers.count;
    MPI_Allgather(&cc, 1, MPI_INT, center_counts, 1, MPI_INT, ctx -> mpi_communicator);

    center_displs[0] = 0;
    for(int i = 1; i < ctx -> world_size; ++i) center_displs[i] = center_displs[i - 1] + center_counts[i - 1];

    for(idx_t i = 0; i < actual_centers.count; ++i)
    {
        idx_t idx = actual_centers.data[i];
        dp_info[idx].cluster_idx += center_displs[ctx -> mpi_rank];

        //this translates them to global indexing
        actual_centers.data[i] += ctx -> idx_start;
    }

    idx_t* all_center_idx = (idx_t*)MY_MALLOC(tot_centers * sizeof(idx_t));

    MPI_Allgatherv(actual_centers.data, actual_centers.count, MPI_UINT64_T, all_center_idx, center_counts, center_displs, MPI_UINT64_T, ctx -> mpi_communicator);

    free(center_counts);
    free(center_displs);

    /*
     * dp_info_ptrs is sorted by g (see cmpPP): walk it and perform the
     * cluster assignment, repeating until no point is left unassigned
     */

    int completed = 0;

    while(!completed)
    {
        completed = 1;


        int proc_points = 0;
        /* retrieve assignment
         * if some point remain uncompleted then get assignment from external points */

        for(idx_t i = 0; i < n; ++i)
        {
            int wait_for_comms = 0;
            datapoint_info_t* p = dp_info_ptrs[i];
            if(!(p -> is_center) && p -> cluster_idx < 0)
            {
                int cluster = -1;
                idx_t k = 0;
                idx_t p_idx;
                idx_t max_k = ctx -> k;
                /*assign each particle at the same cluster as the nearest particle of higher density*/
                datapoint_info_t p_retrieved;
                while( k < max_k - 1 && cluster == -1)
                {
                    ++k;
                    p_idx = p -> ngbh[k].array_idx;
                    p_retrieved = find_possibly_halo_datapoint_rma(ctx, p_idx, win_datapoints);

                    if(p_retrieved.g > p -> g)
                    {
                        /* may still be -1 if that point has not been assigned yet */
                        cluster = p_retrieved.cluster_idx;
                        wait_for_comms = 1;
                        break;
                    }
                }

                if(cluster == -1 && !wait_for_comms)
                {
                    /*
                     * no neighbour of higher g: look for removed centers in
                     * the neighbourhood and use the eliminator of largest g
                     */
                    float_t gmax = -99999.;
                    idx_t gm_index = SIZE_MAX;
                    for(idx_t kk = 0; kk < max_k; ++kk)
                    {
                        idx_t ngbh_index = p -> ngbh[kk].array_idx;
                        for(idx_t m = 0; m < removed_centers.count; ++m)
                        {
                            /* cheap test first: fetch the eliminator only for matching removed centers */
                            if(ngbh_index != removed_centers.data[m]) continue;

                            idx_t max_rho_idx = max_rho.data[m];
                            datapoint_info_t dp_max_rho = find_possibly_halo_datapoint_rma(ctx, max_rho_idx, win_datapoints);
                            float_t gcand = dp_max_rho.g;
                            if(gcand > gmax)
                            {
                                gmax = gcand;
                                gm_index = max_rho.data[m];
                            }
                        }
                    }
                    if(gm_index != SIZE_MAX)
                    {
                        datapoint_info_t dp_gm = find_possibly_halo_datapoint_rma(ctx, gm_index, win_datapoints);
                        cluster = dp_gm.cluster_idx;
                    }

                }
                p -> cluster_idx = cluster;

            }
            completed = completed && p -> cluster_idx != -1;
            proc_points += p -> cluster_idx != -1 ?  1 : 0;

        }

        #ifdef PRINT_H1_CLUSTER_ASSIGN_COMPLETION
        DB_PRINT("[RANK %d] proc points %d completed %d %lu\n", ctx -> mpi_rank, proc_points, completed, ctx -> local_n_points);
        MPI_Barrier(ctx -> mpi_communicator);
        #endif

        /* finished only when every rank reports completion */
        MPI_Allreduce(MPI_IN_PLACE, &completed, 1, MPI_INT, MPI_SUM, ctx -> mpi_communicator);
        completed = completed == ctx -> world_size ? 1 : 0;

    }

    MPI_Win_unlock_all(win_datapoints);
    MPI_Win_fence(0, win_datapoints);
    MPI_Win_free(&win_datapoints);

    MPI_Barrier(ctx -> mpi_communicator);

    free(dp_info_ptrs);
    free(max_rho.data);
    free(removed_centers.data);
    free(all_centers.data);

    #if defined(WRITE_CLUSTER_ASSIGN_H1)
        /* cluster labels */
        int* ks = (int*)MY_MALLOC(ctx -> local_n_points * sizeof(int));
        for(idx_t i = 0; i < ctx -> local_n_points; ++i) ks[i] = ctx -> local_datapoints[i].cluster_idx;

        ordered_buffer_to_file(ctx, ks, sizeof(int), ctx -> local_n_points, "bb/cl.npy");
        ordered_data_to_file(ctx, "bb/ordered_data.npy");
        free(ks);
    #endif


    /* the returned centers array is the gathered list of global center indices */
    free(actual_centers.data);
    actual_centers.size  = tot_centers;
    actual_centers.count = tot_centers;
    actual_centers.data  = all_center_idx;


    /* zero initialised: border pointers and flags are not set by this function */
    clusters_t c_all = {0};
    c_all.centers = actual_centers;


    clock_gettime(CLOCK_MONOTONIC, &finish_tot);
    elapsed_tot = (finish_tot.tv_sec - start_tot.tv_sec);
    elapsed_tot += (finish_tot.tv_nsec - start_tot.tv_nsec) / 1000000000.0;
    (void)elapsed_tot;

    c_all.n = n;
    return c_all;
}


void Heuristic2(global_context_t* ctx, clusters_t* cluster)
{
    /*
     * Heuristic 2: find the border points between clusters.
     *
     * Each rank computes the borders seen from its own local points, then the
     * per-rank border lists are merged pairwise, round after round, until
     * rank 0 holds all of them in cluster -> sparse_borders.
     *
     * params:
     * - ctx     : global context; local_datapoints, __local_heap_buffers, k,
     *             local_n_points and the MPI rank/size/communicator are read
     * - cluster : centers.count is read, sparse_borders is filled through
     *             sparse_border_insert
     */

    /* NOTE(review): verbose is hard-wired to 0, so the timing branch below never runs */
    int verbose = 0;

    datapoint_info_t* dp_info = ctx -> local_datapoints;

    /* expose the local datapoints and the local neighbour heaps to the other ranks;
     * the displacement unit is 1, i.e. offsets are expressed in bytes */
    MPI_Win dp_info_win, ngbh_win;
    MPI_Win_create(ctx -> local_datapoints, ctx -> local_n_points * sizeof(datapoint_info_t), 1, MPI_INFO_NULL, ctx -> mpi_communicator, &dp_info_win);
    MPI_Win_create(ctx -> __local_heap_buffers, ctx -> local_n_points * ctx -> k * sizeof(heap_node), 1, MPI_INFO_NULL, ctx -> mpi_communicator, &ngbh_win);

    MPI_Win_fence(0, dp_info_win);
    MPI_Win_fence(0, ngbh_win);

    /* both windows stay locked until the matching MPI_Win_unlock_all calls at the end */
    MPI_Win_lock_all(0, dp_info_win);
    MPI_Win_lock_all(0, ngbh_win);
    MPI_Barrier(ctx -> mpi_communicator);

    /* NOTE(review): this macro is never used between here and its #undef */
    #define borders cluster->borders

    /* NOTE(review): finish_tot and elapsed_tot are never used in this function */
    struct timespec start_tot, finish_tot;
    double elapsed_tot;
    idx_t n = ctx -> local_n_points;

	if(verbose)
	{
		printf("H2: Finding border points\n");
		clock_gettime(CLOCK_MONOTONIC, &start_tot);
	}

    idx_t nclus = cluster->centers.count; 
    idx_t max_k = ctx -> k;

    /* scan every local point, looking for a border it may define */
    for(idx_t i = 0; i < n; ++i)
    {
        /* pp: index of the candidate neighbour on the other side of the border,
         * NOBORDER when there is none */
        idx_t pp = NOBORDER;
        /* cluster of point i; cluster centers are never border points */
        int c = dp_info[i].cluster_idx;
        if(!dp_info[i].is_center)
        {
            /* step 1: among neighbours 1..kstar of point i, take the first one
             * that is assigned to a cluster different from c */
            for(idx_t k = 1; k < dp_info[i].kstar + 1; ++k)
            {
                /* index of the k-th neighbour of point i */
                idx_t j = dp_info[i].ngbh[k].array_idx;
                pp = NOBORDER;

                /* the neighbour may not be local: it is obtained through the helper,
                 * which receives the datapoint window */
                datapoint_info_t j_dp = find_possibly_halo_datapoint_rma(ctx, j, dp_info_win);
                if(j_dp.cluster_idx != c)
                {
                    pp = j;
                    break;
                }

            }
        }

		/* step 2: walk the neighbour list of the candidate (entries 1..max_k-1).
		 * The candidate is kept if point i is met before any other point of cluster c,
		 * or if neither is met at all; it is discarded otherwise */
		if(pp != NOBORDER)
		{
            datapoint_info_t pp_dp = find_possibly_halo_datapoint_rma(ctx, pp, dp_info_win);
			for(idx_t k = 1; k < max_k; ++k)
			{
                /* presumably the index of the k-th neighbour of pp, read through
                 * the neighbour window -- TODO confirm against get_j_ksel_idx */
                idx_t pp_ngbh_idx = get_j_ksel_idx(ctx, pp_dp.array_idx, k, ngbh_win);
                datapoint_info_t pp_ngbh_dp = find_possibly_halo_datapoint_rma(ctx, pp_ngbh_idx, dp_info_win);
                //TODO: can optimize it can retrieve the whole ngbh in one shot

				/* point i reached first: keep the candidate */
				if(pp_ngbh_idx == dp_info[i].array_idx)
				{
					break;
				}


				/* another point of cluster c comes before point i: drop the candidate */
				if(pp_ngbh_dp.cluster_idx == c)
				{
					pp = NOBORDER;
					break;
				}
			}
		}

		/* step 3: the candidate survived, register point i as a border between
		 * its cluster and the cluster of the candidate, in both directions */
		if(pp != NOBORDER)
		{
            datapoint_info_t j_dp = find_possibly_halo_datapoint_rma(ctx, pp, dp_info_win);
			int ppc = j_dp.cluster_idx;

            /* the border stores the index of point i, its g value as density and its log_rho_err as error */
            sparse_border_t b = {.i = c, .j = ppc, .idx = ctx -> local_datapoints[i].array_idx, .density = ctx -> local_datapoints[i].g, .error = ctx -> local_datapoints[i].log_rho_err}; 
            sparse_border_insert(cluster, b);
            sparse_border_t bsym = {.i = ppc, .j = c, .idx = ctx -> local_datapoints[i].array_idx, .density = ctx -> local_datapoints[i].g, .error = ctx -> local_datapoints[i].log_rho_err}; 
            sparse_border_insert(cluster, bsym);
		}


    }

    MPI_Barrier(ctx -> mpi_communicator);

    /* NOTE(review): this total is recomputed after the reduction and is not read before that */
    idx_t num_border_el = 0;
    for(idx_t i = 0; i < nclus; ++i)
    {
        num_border_el += cluster -> sparse_borders[i].count;
    }

    /*
     * Pairwise reduction of the border lists towards rank 0.
     * At every round `ranks` becomes the number of ranks that still hold data
     * (half of the previous value, rounded up): every rank r >= ranks sends its
     * whole list to rank r - ranks, once. When the previous value was odd,
     * rank ranks - 1 has no partner and skips the round.
     */
    int i_have_sent = 0;
    /* NOTE(review): level is never read */
    int level = 1;
    int ranks = ctx -> world_size;

    #define SEND 1 
    #define RECV 0
    #define DO_NOTHING -1

    while(ranks > 1)
    {
        /* dp is 1 when the number of active ranks is odd */
        int dp = ranks % 2;
        ranks = ranks / 2 + dp;
        /* 1 (SEND) for the upper ranks, 0 (RECV) for the lower ones */
        int send_rcv = (ctx -> mpi_rank >= ranks);

        MPI_Barrier(ctx -> mpi_communicator);

        if(dp && ctx -> mpi_rank == (ranks - 1)) send_rcv = DO_NOTHING;

        switch (send_rcv) 
        {
            case SEND:
                /* a rank that already sent in a previous round still lands here: it must not send again */
                if(!i_have_sent)
                {
                    /* flatten all the adjacency lists into one contiguous buffer */
                    idx_t num_border_el = 0;
                    for(idx_t i = 0; i < nclus; ++i)
                    {
                        num_border_el += cluster -> sparse_borders[i].count;
                    }
                    sparse_border_t* borders_to_send = (sparse_border_t*)MY_MALLOC(num_border_el * sizeof(sparse_border_t)); 

                    idx_t count = 0;
                    for(idx_t i = 0; i < nclus; ++i)
                    {
                        for(idx_t j = 0; j < cluster -> sparse_borders[i].count; ++j)
                        {

                            borders_to_send[count] = cluster -> sparse_borders[i].data[j];
                            count++;
                        }
                    }

                    int rank_to_send = ctx -> mpi_rank - ranks;

                    #if defined(PRINT_H2_COMM_SCHEME)
                    DB_PRINT("-- Rank %d sending to %d\n", ctx -> mpi_rank, rank_to_send);
                    #endif

                    /* first the number of borders, then the borders as raw bytes;
                     * the message tag is the rank of the receiver */
                    MPI_Send(&num_border_el, 1, MPI_UINT64_T, rank_to_send, rank_to_send, ctx -> mpi_communicator);

                    MPI_Send(borders_to_send, num_border_el * sizeof(sparse_border_t), MPI_BYTE , rank_to_send, rank_to_send, ctx -> mpi_communicator);
                                        
                    i_have_sent = 1;
                    free(borders_to_send);
                }
                break;
            case RECV:
                {
                    #if defined(PRINT_H2_COMM_SCHEME)
                    DB_PRINT("** Rank %d recieving\n", ctx -> mpi_rank);
                    #endif
                    /* messages are matched on the tag (this rank), from any source */
                    idx_t num_borders_recv = 0;
                    MPI_Recv(&num_borders_recv, 1, MPI_UINT64_T, MPI_ANY_SOURCE, ctx -> mpi_rank, ctx -> mpi_communicator, MPI_STATUS_IGNORE);

                    sparse_border_t* borders_to_recv = (sparse_border_t*)MY_MALLOC(num_borders_recv * sizeof(sparse_border_t)); 

                    MPI_Recv(borders_to_recv, num_borders_recv * sizeof(sparse_border_t), MPI_BYTE , MPI_ANY_SOURCE, ctx -> mpi_rank, ctx -> mpi_communicator, MPI_STATUS_IGNORE);

                    /* merge the received borders into the local lists */
                    for(int i = 0; i < num_borders_recv; ++i)
                    {
                        sparse_border_insert(cluster, borders_to_recv[i]);
                    }
                    free(borders_to_recv);
                }
                break;
            default:
                #if defined(PRINT_H2_COMM_SCHEME)
                DB_PRINT(".. Rank %d doing nothing\n", ctx -> mpi_rank);
                #endif
                break;
        }
        #if defined(PRINT_H2_COMM_SCHEME)
        MPI_DB_PRINT("-----------------\n");
        MPI_Barrier(ctx -> mpi_communicator);
        #endif

    }

    /* number of borders held locally once the reduction is over */
    num_border_el = 0;
    for(idx_t i = 0; i < nclus; ++i)
    {
        num_border_el += cluster -> sparse_borders[i].count;
    }

    MPI_DB_PRINT("Master final %lu border elements\n", num_border_el);

    //correction of the density at the borders to be compliant with dadadpy:
    //on the master only, the density stored at insertion time (field g) is
    //replaced by log_rho_c of the border point
    if(I_AM_MASTER)
    {
        for(idx_t c = 0; c < nclus; ++c)
        {
            for(idx_t el = 0; el < cluster -> sparse_borders[c].count; ++el)
            {
                //fix border density, write log rho c
                idx_t idx = cluster -> sparse_borders[c].data[el].idx; 
                datapoint_info_t idx_dp = find_possibly_halo_datapoint_rma(ctx, idx, dp_info_win);
                cluster -> sparse_borders[c].data[el].density = idx_dp.log_rho_c;
            }
        }
    }

    /* close the lock_all epochs opened at the top and release the windows */
    MPI_Barrier(ctx -> mpi_communicator);
    MPI_Win_unlock_all(ngbh_win);
    MPI_Win_unlock_all(dp_info_win);

    MPI_Win_free(&ngbh_win);
    MPI_Win_free(&dp_info_win);

    #undef SEND
    #undef RECV
    #undef DO_NOTHING

    /* optional dump of the borders held by the master: one "i,j,idx,density" row per border */
    #ifdef WRITE_BORDERS
        if(I_AM_MASTER)
        {
            printf("[MASTER] Writing borders to bb/borders.csv\n");
            /* NOTE(review): the result of fopen is not checked */
            FILE* f = fopen("bb/borders.csv", "w");
            for(idx_t i = 0; i < nclus; ++i)
            {
                for(int j = 0; j < cluster -> sparse_borders[i].count; ++j)
                {
                    sparse_border_t b = cluster -> sparse_borders[i].data[j];
                    fprintf(f, "%lu,%lu,%lu,%lf\n", b.i, b.j, b.idx, b.density);
                }
            }
            fclose(f);
        }
        
    
    #endif

    return;
    #undef borders

   }


int compare_merging_density( const void *A, const void *B)
{
	/*
	 * Utility function 
	 * comparision between two merging
	 * elements
	 */
	
	float_t DensA = ((merge_t*)A)->density;
	float_t DensB = ((merge_t*)B)->density;

	return - ( DensA > DensB) + (DensA < DensB);
}


static inline int is_a_merging(  float_t dens1, float_t dens1_err,
								 float_t dens2, float_t dens2_err,
								 float_t dens_border, float_t dens_border_err,
								 float_t Z)
{
	/*
	 * Tell whether two clusters sharing a border have to be merged.
	 * params:
	 * - dens1, dens1_err             : density of the center of the first cluster, and its error
	 * - dens2, dens2_err             : density of the center of the second cluster, and its error
	 * - dens_border, dens_border_err : density of the border between the two clusters, and its error
	 * - Z                            : factor multiplying the summed errors
	 *
	 * returns 1 when at least one of the two centers exceeds the border density
	 * by less than Z * (error of the center + error of the border), 0 otherwise
	 */

	/* how much each center rises above the border */
	const float_t height1 = dens1 - dens_border;
	const float_t height2 = dens2 - dens_border;

	/* minimum rise required for each center to stand on its own */
	const float_t threshold1 = Z*(dens1_err + dens_border_err);
	const float_t threshold2 = Z*(dens2_err + dens_border_err);

	if(height1 < threshold1)
	{
		return 1;
	}
	return height2 < threshold2;
}


static inline int merging_roles(  float_t dens1, float_t dens1_err,
								  float_t dens2, float_t dens2_err,
								  float_t dens_border, float_t dens_border_err )
{
	/*
	 * Compare how far the two centers rise above their common border,
	 * each rise being divided by the sum of the center and border errors.
	 *
	 * returns 1 when the scaled rise of cluster 1 is strictly smaller than
	 * the one of cluster 2, 0 otherwise (ties included).
	 * master_finds_borders swaps source and target when 0 is returned.
	 */

	const float_t scaled_rise1 = (dens1 - dens_border) / (dens1_err + dens_border_err);
	const float_t scaled_rise2 = (dens2 - dens_border) / (dens2_err + dens_border_err);

	if(scaled_rise1 < scaled_rise2)
	{
		return 1;
	}
	return 0;
}

void fix_borders_A_into_B(idx_t A, idx_t B, border_t** borders, idx_t n)
{
	/*
	 * Dense border implementation: move the borders of cluster A onto cluster B
	 * - idx_t A            : cluster that is merged into the other one
	 * - idx_t B            : cluster that receives the merge
	 * - border_t** borders : whole border matrix (n x n)
	 * - idx_t n            : number of clusters
	 *
	 * For every cluster i having a border with A, B inherits that border unless
	 * B already has one with i of equal or higher density; the entry [i][B] is
	 * aligned with [B][i]. Row A and column A are then set to border_null.
	 */

	#pragma omp parallel for if(n > MAX_SERIAL_MERGING)
	for(idx_t other = 0; other < n; ++other)
	{
		border_t* from_A = &borders[A][other];
		border_t* into_B = &borders[B][other];

		if(from_A -> idx != NOBORDER)
		{
			/* B takes the border of A when it has none, or when the one of A is denser */
			const int B_has_none = (into_B -> idx == NOBORDER);
			if(B_has_none || from_A -> density > into_B -> density)
			{
				*into_B = *from_A;
			}
			borders[other][B] = *into_B;
		}

		/* A does not exist anymore: wipe its row and its column */
		*from_A = border_null;
		borders[other][A] = border_null;
	}
}

static inline void delete_adj_list_element(clusters_t * c, const idx_t list_idx, const idx_t el)
{
	/*
	 * Utility function
	 * Removes the element at position el from the adjacency list list_idx
	 * (the lists represent the borders in the cluster topology).
	 * The last element of the list is moved into the freed slot, so the order
	 * of the list is not preserved; the vacated last slot is overwritten with
	 * sparse_border_null and the element count is decreased by one.
	 */

	const idx_t last = c -> sparse_borders[list_idx].count - 1;

	c -> sparse_borders[list_idx].data[el]   = c -> sparse_borders[list_idx].data[last];
	c -> sparse_borders[list_idx].data[last] = sparse_border_null;
	c -> sparse_borders[list_idx].count      = last;
}

void fix_sparse_borders_A_into_B(idx_t s,idx_t t, clusters_t* c)
{
	/*
	 * Handle borders after two clusters are merged
	 * - idx_t s 	 : source cluster, the one that has to be merged
	 * - idx_t t 	 : target cluster, the one that receives the merge
	 * - clusters* c : object containing all the data 
	 *
	 * When s goes into t, all the clusters which had a border with s must now
	 * have a border with t; the insertion is delegated to sparse_border_insert.
	 * The border between s and t, and every border still pointing to s, is removed.
	 *
	 * delete_adj_list_element fills the freed slot with the last element of the
	 * list, so after a deletion the same position is examined again instead of
	 * being skipped.
	 */

	/* remove the border t -> s from the list of the target */
	for(idx_t el = 0; el < c -> sparse_borders[t].count; )
	{
		sparse_border_t b = c -> sparse_borders[t].data[el];
		if(b.i == t && b.j == s)
		{
			/* slot el now holds a different element: do not advance */
			delete_adj_list_element(c, t, el);
		}
		else
		{
			++el;
		}
	}

	/* move every other border of the source onto the target */
	for(idx_t el = 0; el < c -> sparse_borders[s].count; ++el)
	{
		sparse_border_t b = c -> sparse_borders[s].data[el];
		if(b.j == t)
		{
			/* the border s -> t disappears with the merge */
			continue;
		}

		/* insert the border as t -> j and as j -> t */
		b.i = t;
		sparse_border_insert(c, b);
		sparse_border_t bsym = b;
		bsym.i = b.j;
		bsym.j = b.i;
		sparse_border_insert(c, bsym);

		/* cluster j must not reference the source anymore */
		for(idx_t dl = 0; dl < c -> sparse_borders[b.j].count; )
		{
			sparse_border_t b_del = c -> sparse_borders[b.j].data[dl];
			if(b_del.j == s)
			{
				/* slot dl now holds a different element: do not advance */
				delete_adj_list_element(c, b.j, dl);
			}
			else
			{
				++dl;
			}
		}
	}

	/* the source has no borders left: empty its list */
	adj_list_reset((c->sparse_borders) + s);
}

void merge_A_into_B(idx_t* who_amI, idx_t cluster_A, idx_t cluster_B, idx_t n)
{
	/*
	 * Utility function
	 * Corrects the labels in the array that keeps track of which cluster
	 * each cluster has become after a merge: every entry of who_amI equal
	 * to cluster_A is replaced with cluster_B.
	 * - idx_t* who_amI  : array of n labels, modified in place
	 * - idx_t cluster_A : label to be replaced
	 * - idx_t cluster_B : label that takes its place
	 * - idx_t n         : number of entries of who_amI
	 */

    #pragma omp parallel for if(n > MAX_SERIAL_MERGING)
    for(idx_t pos = 0; pos < n; ++pos)
    {
        if(who_amI[pos] == cluster_A)
        {
            who_amI[pos] = cluster_B;
        }
    }
}

void master_finds_borders(global_context_t* ctx, clusters_t* cluster, float_t Z, idx_t* surviving_clusters, datapoint_info_t* centers_dp)
{
    /*
     * Build the table of candidate merges from the sparse borders, then apply it.
     * Heuristic3 calls this function inside its I_AM_MASTER branch.
     * params:
     * - ctx                : global context, used by the printing / master macros
     *                        and to abort the run when memory cannot be allocated
     * - cluster            : centers.count and sparse_borders are read; the borders
     *                        are rewritten by fix_sparse_borders_A_into_B at each merge
     * - Z                  : factor forwarded to is_a_merging
     * - surviving_clusters : one label per cluster, updated in place by
     *                        merge_A_into_B so that a merged cluster carries the
     *                        label of the cluster which absorbed it
     * - centers_dp         : datapoint of the center of each cluster, accessed by
     *                        cluster index
     */

    idx_t nclus = cluster -> centers.count;

    idx_t    merge_count        = 0;
    idx_t    merging_table_size = 1000;
    merge_t* merging_table      = (merge_t*)malloc(sizeof(merge_t)*merging_table_size);
    if(!merging_table)
    {
        fprintf(stderr, "[MASTER] cannot allocate the merging table\n");
        MPI_Abort(ctx -> mpi_communicator, EXIT_FAILURE);
    }

    /*
     * Find clusters to be merged
     * Loop over borders and find which ones will generate a merge,
     * store them in the merging table.
     * Each pair is considered once, from the list of its lower index.
     * The bound is written as i + 1 < nclus because idx_t is unsigned:
     * nclus - 1 would wrap around when there are no clusters.
     */
    for(idx_t i = 0; i + 1 < nclus; ++i)
    {
        idx_t count = cluster -> sparse_borders[i].count;
        for(idx_t el = 0; el < count; ++el)
        {
            sparse_border_t b = cluster -> sparse_borders[i].data[el];
            if(b.j <= b.i)
            {
                continue;
            }

            float_t dens1           = centers_dp[b.i].log_rho_c;
            float_t dens1_err       = centers_dp[b.i].log_rho_err;
            float_t dens2           = centers_dp[b.j].log_rho_c;
            float_t dens2_err       = centers_dp[b.j].log_rho_err;
            float_t dens_border     = b.density;
            float_t dens_border_err = b.error;

            if(!is_a_merging(dens1, dens1_err, dens2, dens2_err, dens_border, dens_border_err, Z))
            {
                continue;
            }

            if(merge_count == merging_table_size)
            {
                /* grow by about 10%; the old table is kept until the new one is known to be valid */
                idx_t    grown_size  = merging_table_size + merging_table_size / 10 + 1;
                merge_t* grown_table = (merge_t*)realloc(merging_table, sizeof(merge_t) * grown_size);
                if(!grown_table)
                {
                    fprintf(stderr, "[MASTER] cannot grow the merging table\n");
                    free(merging_table);
                    MPI_Abort(ctx -> mpi_communicator, EXIT_FAILURE);
                }
                merging_table      = grown_table;
                merging_table_size = grown_size;
            }

            /* the higher index is recorded as source, the lower one as target */
            merging_table[merge_count].source  = b.j;
            merging_table[merge_count].target  = b.i;
            merging_table[merge_count].density = b.density;
            ++merge_count;
        }
    }

    /* process the merges following the order given by compare_merging_density */
    qsort( (void*)merging_table, merge_count, sizeof(merge_t), compare_merging_density);

    MPI_DB_PRINT("Found %lu merges\n", merge_count);

    #ifdef WRITE_MERGING_TABLE
        if(I_AM_MASTER)
        {
            printf("[MASTER] Writing merging table to bb/merging_table.csv\n");
            FILE* f = fopen("bb/merging_table.csv", "w");
            if(f)
            {
                for(idx_t i = 0; i < merge_count; ++i)
                {
                    merge_t row = merging_table[i];
                    fprintf(f, "%lu,%lu,%lf\n", row.source, row.target, row.density);
                }
                fclose(f);
            }
            else
            {
                perror("[MASTER] cannot open bb/merging_table.csv");
            }
        }
    #endif

    for( idx_t m = 0; m < merge_count; m++ )
    {
        /* clusters currently standing for the two ends of this table entry */
        const idx_t src = surviving_clusters[merging_table[m].source];
        const idx_t trg = surviving_clusters[merging_table[m].target];

        /*
         * Start with the lower index as source and the higher one as target
         * (kept for compatibility with the original ADP implementation);
         * the roles may be swapped below according to merging_roles
         */
        idx_t new_src = (src < trg) ? src : trg;
        idx_t new_trg = (src < trg) ? trg : src;

        /* densities of the two centers and of the border they currently share */
        float_t dens1           = centers_dp[new_src].log_rho_c;
        float_t dens1_err       = centers_dp[new_src].log_rho_err;
        float_t dens2           = centers_dp[new_trg].log_rho_c;
        float_t dens2_err       = centers_dp[new_trg].log_rho_err;

        sparse_border_t b       = sparse_border_get(cluster, new_src, new_trg);
        float_t dens_border     = b.density;
        float_t dens_border_err = b.error;

        /* the condition is checked again: previous merges may have changed the border */
        int i_have_to_merge = is_a_merging(dens1,dens1_err,dens2,dens2_err,dens_border,dens_border_err,Z);
        if(i_have_to_merge && src != trg)
        {
            /* merging_roles returns 0 (ties included) when the roles have to be swapped */
            int side = merging_roles(dens1,dens1_err,dens2,dens2_err,dens_border,dens_border_err);
            if(!side)
            {
                idx_t tmp = new_src;
                new_src   = new_trg;
                new_trg   = tmp;
            }

            /*
             * Perform the actual merging,
             * first  -> fix the borders, delete old ones and spawn new ones in the correct position
             * second -> update the surviving_clusters buffer
             */
            fix_sparse_borders_A_into_B(new_src, new_trg, cluster);
            merge_A_into_B(surviving_clusters, new_src, new_trg, nclus );
        }
    }

    free(merging_table);
}

void master_fixes_border_matrix_and_centers(global_context_t* ctx, clusters_t* cluster, float_t Z, idx_t* old_to_new, idx_t* surviving_clusters, idx_t nclus)
{
    /*
     * Finalize the clustering after the merges: keep only the surviving
     * clusters, renumber them consecutively and rebuild centers and borders.
     * params:
     * - ctx, Z             : not used here
     * - cluster            : centers and sparse_borders are replaced
     * - old_to_new         : output, new index of every old cluster (nclus entries)
     * - surviving_clusters : label of every old cluster after the merges; a
     *                        cluster survives when its label is its own index
     * - nclus              : number of clusters before the merges
     */
    lu_dynamic_array_t kept_centers;
    lu_dynamic_array_t kept_old_idx;

    lu_dynamic_array_init(&kept_centers);
    lu_dynamic_array_init(&kept_old_idx);

    lu_dynamic_array_reserve(&kept_centers, nclus);
    lu_dynamic_array_reserve(&kept_old_idx, nclus);

    /* survivors get consecutive new indexes, in order of old index */
    idx_t n_survivors = 0;
    for(idx_t old = 0; old < nclus; ++old)
    {
        if(surviving_clusters[old] != old)
        {
            continue;
        }
        lu_dynamic_array_pushBack(&kept_centers, cluster->centers.data[old]);
        lu_dynamic_array_pushBack(&kept_old_idx, old);
        old_to_new[old] = n_survivors;
        ++n_survivors;
    }

    /* merged clusters take the new index of the cluster they ended up in */
    for(idx_t old = 0; old < nclus; ++old)
    {
        const idx_t absorbed_by = surviving_clusters[old];
        if(absorbed_by != old)
        {
            old_to_new[old] = old_to_new[absorbed_by];
        }
    }

    /* fresh adjacency lists, one per surviving cluster */
    adj_list_t* new_borders = (adj_list_t*)malloc(n_survivors*sizeof(adj_list_t));
    for(idx_t c = 0; c < n_survivors; ++c)
    {
        new_borders[c].count = 0;
        new_borders[c].size  = PREALLOC_BORDERS;
        new_borders[c].data  = (sparse_border_t*)malloc(PREALLOC_BORDERS*sizeof(sparse_border_t));
    }

    /* copy the borders of each survivor, translating the cluster indexes */
    #pragma omp parallel for
    for(idx_t c = 0; c < n_survivors; ++c)
    {
        const idx_t old = kept_old_idx.data[c];
        for(idx_t el = 0; el < cluster -> sparse_borders[old].count; ++el)
        {
            sparse_border_t translated = cluster -> sparse_borders[old].data[el];
            translated.i = old_to_new[translated.i];
            translated.j = old_to_new[translated.j];

            adj_list_insert(new_borders + c, translated);
        }
    }

    /* drop the old content and put the rebuilt members in place */
    clusters_reset(cluster);
    cluster -> sparse_borders = new_borders;
    cluster -> centers = kept_centers;
    free(kept_old_idx.data);
}

int compare_dp_by_cidx(const void* a, const void* b)
{
    int aa = ((datapoint_info_t*)a) -> cluster_idx;
    int bb = ((datapoint_info_t*)b) -> cluster_idx;
    return (aa > bb) - (aa < bb);
}

void Heuristic3(global_context_t* ctx, clusters_t* cluster, float_t Z, int halo)
{

    /* NOTE(review): verbose is never read in this function */
    int verbose = 0;

    /*
     * Heuristic 3, from the paper of Errico, Facco, Laio & Rodriguez 
     * ( https://doi.org/10.1016/j.ins.2021.01.010 )              
     *
     * The master collects the datapoints of the cluster centers, merges the
     * clusters (master_finds_borders) and rebuilds centers and borders
     * (master_fixes_border_matrix_and_centers). The result is then broadcast
     * to all the ranks, which relabel their local points.
     *
     * params:
     * - ctx     : global context (local datapoints, MPI info)
     * - cluster : centers and sparse_borders, replaced with the merged ones
     * - Z       : factor forwarded to master_finds_borders
     * - halo    : when 1, points below the highest border density of their
     *             cluster are flagged as halo (negative cluster_idx)
     */

	/* NOTE(review): this macro is never used between here and its #undef */
	#define borders cluster->borders


	/* NOTE(review): apart from start_tot, which is filled below, these timers are never used */
	struct timespec start_tot, finish_tot;
	double elapsed_tot;

	struct timespec start, finish;
	double elapsed;

	clock_gettime(CLOCK_MONOTONIC, &start_tot);


    datapoint_info_t* dp_info   = ctx -> local_datapoints;
	idx_t  nclus                = cluster -> centers.count;  
	/* surviving_clusters[i]: label of cluster i after the merges, initially i itself */
	idx_t* surviving_clusters   = (idx_t*)MY_MALLOC(nclus*sizeof(idx_t));
    /* old_to_new[i]: index of cluster i in the final numbering, filled by the master */
    idx_t* old_to_new           = (idx_t*)MY_MALLOC(nclus*sizeof(idx_t));
	for(idx_t i = 0; i < nclus; ++i)
	{ 
		surviving_clusters[i] = i; 
	}

    /* window over the local datapoints, used by the master to read the centers */
    MPI_Win dp_info_win;
    MPI_Win_create(ctx -> local_datapoints, ctx -> local_n_points * sizeof(datapoint_info_t), 1, MPI_INFO_NULL, ctx -> mpi_communicator, &dp_info_win);

    MPI_Win_fence(0, dp_info_win);

    if(I_AM_MASTER)
    {
        /* gather the datapoint of every center: local copy when the master owns it, MPI_Get otherwise */
        datapoint_info_t* centers_dp = (datapoint_info_t*)MY_MALLOC(cluster -> centers.count * sizeof(datapoint_info_t));
        for(idx_t i = 0; i < cluster -> centers.count; ++i)
        {
            idx_t cidx = cluster -> centers.data[i];
            
            /* rank owning the center and position of the center inside that rank */
            int owner = foreign_owner(ctx, cidx);
            idx_t pos = cidx - ctx -> rank_idx_start[owner];
            if(owner == ctx -> mpi_rank)
            {
                centers_dp[i] = ctx -> local_datapoints[pos];
            }
            else
            {
                MPI_Get(centers_dp + i, 
                        sizeof(datapoint_info_t), 
                        MPI_BYTE, 
                        owner, 
                        (MPI_Aint)(pos * sizeof(datapoint_info_t)), 
                        sizeof(datapoint_info_t), 
                        MPI_BYTE, 
                        dp_info_win);
            }
        }
        /* the gets are complete only after this fence */
        MPI_Win_fence(0, dp_info_win);

        /* sort by cluster_idx: master_finds_borders accesses centers_dp by cluster index
         * (presumably cluster indexes are 0..count-1, one per center -- TODO confirm) */
        qsort(centers_dp, cluster -> centers.count, sizeof(datapoint_info_t), compare_dp_by_cidx);

        master_finds_borders(ctx, cluster, Z, surviving_clusters, centers_dp);
        master_fixes_border_matrix_and_centers(ctx, cluster, Z, old_to_new, surviving_clusters, nclus);
        free(centers_dp);
    }
    else
    {
        /* matches the second fence called by the master */
        MPI_Win_fence(0, dp_info_win);
    }
    MPI_Win_free(&dp_info_win);

    /* at this point master has the final border matrix 
     * with the final list of surviving clusters
     */

    /* copy centers */ 

    /* the other ranks drop their content, it is replaced by what the master broadcasts */
    if(!(I_AM_MASTER))
    {
        clusters_reset(cluster);
    }

    /* broadcast the number of elements on those lists */

    MPI_Bcast(&(cluster -> centers.count), 1, MPI_UINT64_T, 0, ctx -> mpi_communicator);
    MPI_Bcast(&(cluster -> centers.size), 1, MPI_UINT64_T, 0, ctx -> mpi_communicator);

    if(!(I_AM_MASTER))
    {
        cluster -> centers.data = (idx_t*)MY_MALLOC(cluster -> centers.size * sizeof(idx_t));
    }

    /* size elements are sent, not only the first count */
    MPI_Bcast(cluster -> centers.data, cluster -> centers.size, MPI_UINT64_T, 0, ctx -> mpi_communicator);

    /* copy borders */

    idx_t final_cluster_count = cluster -> centers.count;

    if(!(I_AM_MASTER))
    {
        cluster -> sparse_borders = (adj_list_t*)MY_MALLOC(final_cluster_count * sizeof(adj_list_t));
    }

    /* the list headers travel as raw bytes: count and size are valid everywhere, while
     * the data pointers received by the other ranks are replaced right below */
    MPI_Bcast(cluster -> sparse_borders, final_cluster_count * sizeof(adj_list_t), MPI_BYTE, 0, ctx -> mpi_communicator);

    /* one broadcast per list, size elements each */
    for(int i = 0; i < final_cluster_count; ++i)
    {
        if(!(I_AM_MASTER))
        {
            cluster -> sparse_borders[i].data = (sparse_border_t*)MY_MALLOC(cluster -> sparse_borders[i].size * sizeof(sparse_border_t));
        }
        MPI_Bcast(cluster -> sparse_borders[i].data,
                  cluster -> sparse_borders[i].size * sizeof(sparse_border_t),
                  MPI_BYTE, 0, ctx -> mpi_communicator);
    }


    MPI_Bcast(surviving_clusters, nclus, MPI_UINT64_T, 0, ctx -> mpi_communicator);
    MPI_Bcast(old_to_new, nclus, MPI_UINT64_T, 0, ctx -> mpi_communicator);

    /* fix cluster assignment: every local point moves to the final numbering
     * and loses its center flag, which is set again right below */
    #pragma omp parallel for
    for(idx_t i = 0; i < ctx -> local_n_points; ++i)
    {
        dp_info[i].is_center = 0;
        int old_cidx = dp_info[i].cluster_idx;
        dp_info[i].cluster_idx = old_to_new[old_cidx];
    }

    //reset centers: each rank flags the final centers it owns
    for(idx_t i = 0; i < cluster -> centers.count; ++i)
    {
        idx_t idx = cluster -> centers.data[i];
        int owner = foreign_owner(ctx, idx);
        if(owner == ctx -> mpi_rank)
        {
            idx_t pos = idx - ctx -> idx_start;
            dp_info[pos].is_center = 1;
        }
    }

    /*Halo*/

    switch (halo)
    {
		case 1:
		{
			/* highest border density of every cluster; -2. when the cluster has no
			 * border with a density above that value */
			float_t* max_border_den_array = (float_t*)malloc(final_cluster_count*sizeof(float_t));
			#pragma omp parallel
			{
				#pragma omp for
				for(idx_t c = 0; c < final_cluster_count; ++c)
				{
					float_t max_border_den = -2.;
					for(idx_t el = 0; el < cluster -> sparse_borders[c].count; ++el)
					{
						sparse_border_t b = cluster -> sparse_borders[c].data[el];
						if(b.density > max_border_den)
						{
							max_border_den = b.density;
						}
					}

					max_border_den_array[c] = max_border_den;

				}

				/* NOTE(review): the omp for above has no nowait clause, so this
				 * explicit barrier looks redundant */
				#pragma omp barrier

				/* NOTE(review): the bound is cluster -> n while the relabelling loop above
				 * uses ctx -> local_n_points -- confirm the two always agree */
				#pragma omp for
				for(idx_t i = 0; i < cluster -> n; ++i)
				{
					int cidx = dp_info[i].cluster_idx;
					/* a point is halo when its log_rho_c is below the highest border of its cluster */
					int halo_flag = dp_info[i].log_rho_c < max_border_den_array[cidx]; 
                    //changed_here

                    //doing this we can have both the assignment and if it is
                    //part of the halo, without the need for storing other info
                    //halo points have cidx < 0 (stored value = (c + 1) * -1, c being the cluster)
					dp_info[i].cluster_idx = halo_flag ? (cidx * (-1)) - 1 : cidx;
				}
			}
			free(max_border_den_array);
		}
			break;
		
		default:
			break;
    }    

    MPI_DB_PRINT("--> final cluster count %lu\n", final_cluster_count);
    free(surviving_clusters);
    free(old_to_new);

    /* optional dump of the final cluster_idx of the local points, together with the ordered data */

    #ifdef WRITE_FINAL_ASSIGNMENT
        int* cl = (int*)MY_MALLOC(ctx -> local_n_points * sizeof(int));

        for(int i = 0; i < ctx -> local_n_points; ++i) cl[i] = ctx -> local_datapoints[i].cluster_idx;

        ordered_buffer_to_file(ctx, cl, sizeof(int), ctx -> local_n_points, "bb/final_assignment.npy");
        ordered_data_to_file(ctx, "bb/ordered_data.npy");

        free(cl);
        
    #endif

  #undef  borders  
}

