diff --git a/src/tree/heap.c b/src/tree/heap.c
index abe8e8945f7b0728375240ef284516105d927105..67d38e59f186bd90b3b58d22dbead27b953c3d17 100644
--- a/src/tree/heap.c
+++ b/src/tree/heap.c
@@ -9,7 +9,7 @@ void swap_heap_node(heap_node* a, heap_node* b){
 }
 
 void allocate_heap(heap* H, idx_t n){
-	H -> data = (heap_node*)malloc(n*sizeof(heap_node));
+    H -> data = (heap_node*)malloc(n*sizeof(heap_node));
     H -> N = n;
     H -> count = 0;
     return;
@@ -28,39 +28,39 @@ void free_heap(heap * H){ free(H -> data);}
 
 
 void heapify_max_heap(heap* H, idx_t node){
-	idx_t nn = node;
+    idx_t nn = node;
     idx_t largest = nn; 
     /*
     Find the greatest among the children of the node; the bound check also covers the case where the node is a leaf
     */
     
-	while(1)
-	{
-		largest = 	(HEAP_LCH(nn) < H -> N) && 
-					(H -> data[HEAP_LCH(nn)].value > H -> data[largest].value ) ? HEAP_LCH(nn) : largest;
-
-		largest = 	(HEAP_RCH(nn) < H -> N) && 
-					(H -> data[HEAP_RCH(nn)].value > H -> data[largest].value ) ? HEAP_RCH(nn) : largest;
-		if(largest != nn) 
-		{
-			swap_heap_node(H -> data + nn, H -> data + largest);
-			nn = largest;
-		}
-		else
-		{
-			break;
-		}
-	}
+    while(1)
+    {
+        largest =   (HEAP_LCH(nn) < H -> N) && 
+                    (H -> data[HEAP_LCH(nn)].value > H -> data[largest].value ) ? HEAP_LCH(nn) : largest;
+
+        largest =   (HEAP_RCH(nn) < H -> N) && 
+                    (H -> data[HEAP_RCH(nn)].value > H -> data[largest].value ) ? HEAP_RCH(nn) : largest;
+        if(largest != nn) 
+        {
+            swap_heap_node(H -> data + nn, H -> data + largest);
+            nn = largest;
+        }
+        else
+        {
+            break;
+        }
+    }
     
 
     /*
     if(HEAP_LCH(node) < H -> N){
         //if(H -> data[HEAP_LCH(node)].value > H -> data[largest].value ) largest = HEAP_LCH(node);
-		largest = (H -> data[HEAP_LCH(nn)].value > H -> data[largest].value ) ? HEAP_LCH(nn) : largest;
+        largest = (H -> data[HEAP_LCH(nn)].value > H -> data[largest].value ) ? HEAP_LCH(nn) : largest;
     }
     if(HEAP_RCH(node) < H -> N){
         //if(H -> data[HEAP_RCH(node)].value > H -> data[largest].value ) largest = HEAP_RCH(node);
-		largest = (H -> data[HEAP_RCH(nn)].value > H -> data[largest].value ) ? HEAP_RCH(nn) : largest;
+        largest = (H -> data[HEAP_RCH(nn)].value > H -> data[largest].value ) ? HEAP_RCH(nn) : largest;
     }
     if(largest == node){
         return;
@@ -81,55 +81,55 @@ void set_root_max_heap(heap * H, FLOAT_TYPE val, idx_t array_idx){
 }
 
 void insert_max_heap_insertion_sort(heap * H,const FLOAT_TYPE val,const idx_t array_idx){
-	heap_node tmpNode = {.value = val, .array_idx = array_idx};
-	if(H -> count < H -> N)
-	{
-		idx_t idx = H -> count;
-		H -> data[idx] = tmpNode;
-		++(H -> count);
-		while(idx >= 1)
-		{
-			if(H -> data[idx].value < H -> data[idx - 1].value)
-			{
-				swap_heap_node((H -> data) + idx, (H -> data) + idx - 1);
-				idx--;
-			}
-			else
-			{
-				break;
-			}
-		}
-
-	}
-	else
-	{
-		if(H -> data[H -> count - 1].value > val)
-		{
-			idx_t idx = H -> count - 1;
-			H -> data[idx] = tmpNode;
-			while(idx >= 1)
-			{
-				if(H -> data[idx].value < H -> data[idx - 1].value)
-				{
-					swap_heap_node(H -> data + idx, H -> data + idx - 1);
-					idx--;
-				}
-				else
-				{
-					break;
-				}
-			}
-		}
-	}
-	return;
+    heap_node tmpNode = {.value = val, .array_idx = array_idx};
+    if(H -> count < H -> N)
+    {
+        idx_t idx = H -> count;
+        H -> data[idx] = tmpNode;
+        ++(H -> count);
+        while(idx >= 1)
+        {
+            if(H -> data[idx].value < H -> data[idx - 1].value)
+            {
+                swap_heap_node((H -> data) + idx, (H -> data) + idx - 1);
+                idx--;
+            }
+            else
+            {
+                break;
+            }
+        }
+
+    }
+    else
+    {
+        if(H -> data[H -> count - 1].value > val)
+        {
+            idx_t idx = H -> count - 1;
+            H -> data[idx] = tmpNode;
+            while(idx >= 1)
+            {
+                if(H -> data[idx].value < H -> data[idx - 1].value)
+                {
+                    swap_heap_node(H -> data + idx, H -> data + idx - 1);
+                    idx--;
+                }
+                else
+                {
+                    break;
+                }
+            }
+        }
+    }
+    return;
 }
 
 void insert_max_heap(heap * H,const FLOAT_TYPE val,const idx_t array_idx){
-	int c1 = H -> count < H -> N;
-	int c2 = (val < H -> data[0].value) && (!c1);
-	int ctot = c1 + 2*c2;
-	switch (ctot) {
-		case 1:
+    int c1 = H -> count < H -> N;
+    int c2 = (!c1) && (val < H -> data[0].value); /* test !c1 first: data[0] is uninitialized while the heap is empty */
+    int ctot = c1 + 2*c2;
+    switch (ctot) {
+        case 1:
             {
                 idx_t node = H->count;
                 ++(H -> count);
@@ -145,17 +145,17 @@ void insert_max_heap(heap * H,const FLOAT_TYPE val,const idx_t array_idx){
                     //if(node == 0) break;
                 }
             }
-	    	break;
+            break;
 
-		case 2: 
+        case 2: 
             {
                 set_root_max_heap(H,val,array_idx);
             }
-			break;
-		default:
-			break;
-	}
-	/* alternative solution, left here if something breaks*/
+            break;
+        default:
+            break;
+    }
+    /* alternative solution, left here if something breaks*/
 
     //if(H -> count < H -> N){
     //    idx_t node = H->count;
@@ -182,23 +182,23 @@ void insert_max_heap(heap * H,const FLOAT_TYPE val,const idx_t array_idx){
     //{
     //    return;
     //}
-	
+    
     
 }
 
 #ifdef USE_FLOAT32
-	#define EPS 5.96e-08
+    #define EPS 5.96e-08
 #else
-	#define EPS 2.11e-16
+    #define EPS 2.11e-16
 #endif
 
 int cmp_heap_nodes(const void* a, const void* b)
 {
-	const heap_node* aa = (const heap_node*)a;
-	const heap_node* bb = (const heap_node*)b;
-	int val =  (aa -> value > bb -> value) - (aa -> value < bb -> value); 
-	//return vv; 
-	return val;
+    const heap_node* aa = (const heap_node*)a;
+    const heap_node* bb = (const heap_node*)b;
+    int val =  (aa -> value > bb -> value) - (aa -> value < bb -> value); 
+    //return vv; 
+    return val;
 
 
 }
@@ -206,8 +206,8 @@ int cmp_heap_nodes(const void* a, const void* b)
 
 void heap_sort(heap* H){
     idx_t n = H -> N;
-	qsort(H -> data, n, sizeof(heap_node),cmp_heap_nodes);
-	//for(idx_t i= (H -> N) - 1; i > 0; --i)
+    qsort(H -> data, n, sizeof(heap_node),cmp_heap_nodes);
+    //for(idx_t i= (H -> N) - 1; i > 0; --i)
     //{
     //    swap_heap_node(H -> data, H -> data + i);
     //    H -> N = i;
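Review note (not part of the patch): the heap above is a bounded max-heap used to keep the k smallest values seen so far; once full, insert_max_heap replaces the root (the current maximum) whenever a smaller value arrives. A minimal usage sketch, assuming n >= k so that heap_sort (which sorts all H.N slots) only touches initialized entries; k_smallest is a hypothetical name:

    #include "heap.h"

    /* track the k smallest of n candidate values, with their source indices */
    void k_smallest(const FLOAT_TYPE *values, idx_t n, idx_t k)
    {
        heap H;
        allocate_heap(&H, k);                  /* H.N = k, H.count = 0 */
        for (idx_t i = 0; i < n; ++i)
            insert_max_heap(&H, values[i], i); /* fills, then replaces the root */
        heap_sort(&H);                         /* ascending: H.data[0] is the minimum */
        free_heap(&H);
    }
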
diff --git a/src/tree/heap.h b/src/tree/heap.h
index 9982a796d1718c6b19d5238a44994253e93101c4..3ecd6e184edebfec0ceab72d3f815f3da328bd4c 100644
--- a/src/tree/heap.h
+++ b/src/tree/heap.h
@@ -9,17 +9,17 @@
 #define DATA_DIMS 0 
 
 #ifdef USE_FLOAT32
-	#define FLOAT_TYPE float
+    #define FLOAT_TYPE float
 #else
-	#define FLOAT_TYPE double 
+    #define FLOAT_TYPE double 
 #endif
 
 #ifdef USE_INT32
-	#define MY_SIZE_MAX UINT32_MAX
-	#define idx_t uint32_t
+    #define MY_SIZE_MAX UINT32_MAX
+    #define idx_t uint32_t
 #else
-	#define MY_SIZE_MAX UINT64_MAX
-	#define idx_t uint64_t
+    #define MY_SIZE_MAX UINT64_MAX
+    #define idx_t uint64_t
 #endif
 
 #define HEAP_LCH(x) (2*x + 1)
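Review note (not part of the patch): this macro block is duplicated verbatim in kdtreeV2.h; hoisting it into one shared header would keep the two copies from drifting. Precision and index width are fixed at compile time; a minimal sketch of the effect, assuming the -D flags are supplied by the build:

    #include <stdio.h>
    #include "heap.h"

    /* prints "4 4" when compiled with -DUSE_FLOAT32 -DUSE_INT32,
     * "8 8" with the defaults (double data, uint64_t indices) */
    int main(void)
    {
        printf("%zu %zu\n", sizeof(FLOAT_TYPE), sizeof(idx_t));
        return 0;
    }
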
diff --git a/src/tree/kdtreeV2.c b/src/tree/kdtreeV2.c
index 51a88c375488038d2a57460ec6018463e7fa4922..5f129ce77f060828d6da053dc655116da75b2ce0 100644
--- a/src/tree/kdtreeV2.c
+++ b/src/tree/kdtreeV2.c
@@ -15,14 +15,14 @@ FLOAT_TYPE eud_kdtree_v2(FLOAT_TYPE* restrict p1, FLOAT_TYPE* restrict p2){
         register FLOAT_TYPE dd = (p1[i] - p2[i]);
         d += dd*dd;
     }
-	return d;
-	//return sqrt(d);
+    return d;
+    //return sqrt(d);
 }
 
 #ifdef USE_FLOAT32 
-	typedef float v4f __attribute__ ((vector_size (16)));
+    typedef float v4f __attribute__ ((vector_size (16)));
 #else
-	typedef double v4f __attribute__ ((vector_size (32)));
+    typedef double v4f __attribute__ ((vector_size (32)));
 #endif
 
 FLOAT_TYPE eud_kdtree_v2_2(FLOAT_TYPE* restrict u, FLOAT_TYPE* restrict v)
@@ -35,7 +35,7 @@ FLOAT_TYPE eud_kdtree_v2_2(FLOAT_TYPE* restrict u, FLOAT_TYPE* restrict v)
         register v4f _u = {u[i], u[i + 1], u[i + 2], u[i + 3]};
         register v4f _v = {v[i], v[i + 1], v[i + 2], v[i + 3]};
         register v4f _diff = _u - _v;
-		acc = _diff*_diff;
+        acc += _diff*_diff; /* accumulate: plain '=' discarded all but the last 4-wide chunk */
     }
 
     s = acc[0] + acc[1] + acc[2] + acc[3];
@@ -56,14 +56,14 @@ void swap_kdnode_v2(kdnode_v2 *x, kdnode_v2 *y) {
     *x = *y;
     *y = tmp;
 
-	#ifdef SWMEM
-		memcpy(swapMem_kdv2, x -> data, sizeof(FLOAT_TYPE)*data_dims);
-		memcpy(x -> data, y -> data, sizeof(FLOAT_TYPE)*data_dims);
-		memcpy(y -> data, swapMem_kdv2, sizeof(FLOAT_TYPE)*data_dims);
-		FLOAT_TYPE* tmpPtr = x -> data;
-		x -> data = y -> data;
-		y -> data = tmpPtr;
-	#endif
+    #ifdef SWMEM
+        memcpy(swapMem_kdv2, x -> data, sizeof(FLOAT_TYPE)*data_dims);
+        memcpy(x -> data, y -> data, sizeof(FLOAT_TYPE)*data_dims);
+        memcpy(y -> data, swapMem_kdv2, sizeof(FLOAT_TYPE)*data_dims);
+        FLOAT_TYPE* tmpPtr = x -> data;
+        x -> data = y -> data;
+        y -> data = tmpPtr;
+    #endif
 }
 
 
@@ -86,8 +86,8 @@ void initialize_kdnodes_v2(kdnode_v2* node_array, FLOAT_TYPE* d, idx_t n )
         node_array[i].level = -1;
         node_array[i].is_leaf = 0;
         node_array[i].split_var = -1;
-		node_array[i].node_list.data = NULL;
-		node_array[i].node_list.count = 0;
+        node_array[i].node_list.data = NULL;
+        node_array[i].node_list.count = 0;
     }
 } 
 /*
@@ -178,61 +178,61 @@ kdnode_v2* make_tree_kdnode_v2(kdnode_v2* t, int start, int end, kdnode_v2* pare
 {
     kdnode_v2 *n = NULL;
     int split_var = level % data_dims; 
-	FLOAT_TYPE max_diff = -999999.;
-	for(unsigned int v = 0; v < data_dims; ++v)
-	{
-		FLOAT_TYPE max_v = -9999999.;
-		FLOAT_TYPE min_v = 9999999.;
-		for(int i = start; i <= end; ++i)
-		{
-			max_v = t[i].data[v] > max_v ? t[i].data[v] : max_v;
-			min_v = t[i].data[v] < min_v ? t[i].data[v] : min_v;
-		}
-		if((max_v - min_v) > max_diff)
-		{
-			max_diff = max_v - min_v;
-			split_var = v;
-		}
-	}
-	
+    FLOAT_TYPE max_diff = -999999.;
+    for(unsigned int v = 0; v < data_dims; ++v)
+    {
+        FLOAT_TYPE max_v = -9999999.;
+        FLOAT_TYPE min_v = 9999999.;
+        for(int i = start; i <= end; ++i)
+        {
+            max_v = t[i].data[v] > max_v ? t[i].data[v] : max_v;
+            min_v = t[i].data[v] < min_v ? t[i].data[v] : min_v;
+        }
+        if((max_v - min_v) > max_diff)
+        {
+            max_diff = max_v - min_v;
+            split_var = v;
+        }
+    }
+    
     /*
-	#ifdef SWMEM	
-		if(parent == NULL)
-		{
-			swapMem_kdv2 = (FLOAT_TYPE*)malloc(sizeof(FLOAT_TYPE)*data_dims);
-		}
-	#endif
+    #ifdef SWMEM    
+        if(parent == NULL)
+        {
+            swapMem_kdv2 = (FLOAT_TYPE*)malloc(sizeof(FLOAT_TYPE)*data_dims);
+        }
+    #endif
     */
-	
-	
-	
-	if(end - start < DEFAULT_LEAF_SIZE)
-	{
-		n =  t + start;
-		n -> is_leaf = 1;
+    
+    
+    
+    if(end - start < DEFAULT_LEAF_SIZE)
+    {
+        n =  t + start;
+        n -> is_leaf = 1;
         n -> parent = parent;
         n -> lch = NULL;
         n -> rch = NULL;
-		size_t j = 0;
-		n -> node_list.count = (size_t)(end - start + 1);
-		n -> node_list.data = (kdnode_v2**)malloc(n -> node_list.count * sizeof(kdnode_v2*));
-		for(int i = start; i <= end; ++i){
-			t[i].parent = n;
-			t[i].is_leaf = 1;
-			t[i].lch = NULL;
-			t[i].rch = NULL;
-			n -> node_list.data[j] = t + i;
-			++j;
-		}
-		return n;
-		
-	}
-	
-	
+        size_t j = 0;
+        n -> node_list.count = (size_t)(end - start + 1);
+        n -> node_list.data = (kdnode_v2**)malloc(n -> node_list.count * sizeof(kdnode_v2*));
+        for(int i = start; i <= end; ++i){
+            t[i].parent = n;
+            t[i].is_leaf = 1;
+            t[i].lch = NULL;
+            t[i].rch = NULL;
+            n -> node_list.data[j] = t + i;
+            ++j;
+        }
+        return n;
+        
+    }
+    
+    
 
 
     int median_idx = -1;
-	
+    
     //if ((end - start) < 0) return 0;
     //if (end  == start) {
     //    n = t + start;
@@ -247,25 +247,25 @@ kdnode_v2* make_tree_kdnode_v2(kdnode_v2* t, int start, int end, kdnode_v2* pare
     median_idx = median_of_nodes_kdnode_v2(t, start, end, split_var);
     //printf("%d median idx\n", median_idx);
     if(median_idx > -1){
-		swap_kdnode_v2(t + start, t + median_idx);
+        swap_kdnode_v2(t + start, t + median_idx);
         //n = t + median_idx;
         n = t + start;
-				//n->lch  = make_tree_kdnode_v2(t, start, median_idx - 1, n, level + 1);
-				n->lch  = make_tree_kdnode_v2(t, start + 1, median_idx, n, level + 1);
-				//n->rch = make_tree_kdnode_v2(t, median_idx + 1, end, n, level + 1);
-				n->rch = make_tree_kdnode_v2(t, median_idx + 1, end, n, level + 1);
+                //n->lch  = make_tree_kdnode_v2(t, start, median_idx - 1, n, level + 1);
+                n->lch  = make_tree_kdnode_v2(t, start + 1, median_idx, n, level + 1);
+                //n->rch = make_tree_kdnode_v2(t, median_idx + 1, end, n, level + 1);
+                n->rch = make_tree_kdnode_v2(t, median_idx + 1, end, n, level + 1);
         n -> split_var = split_var;
         n->parent = parent;
         n->level = level;
     }
-	
+    
     /*
-	#ifdef SWMEM
-		if(parent == NULL)
-		{
-			swapMem_kdv2 = malloc(sizeof(FLOAT_TYPE)*data_dims);
-		}
-	#endif
+    #ifdef SWMEM
+        if(parent == NULL)
+        {
+            swapMem_kdv2 = malloc(sizeof(FLOAT_TYPE)*data_dims);
+        }
+    #endif
     */
 
     return n;
@@ -283,23 +283,23 @@ static inline int hyper_plane_side(FLOAT_TYPE* p1, FLOAT_TYPE* p2, int var)
 
 void knn_sub_tree_search_kdtree_v2(FLOAT_TYPE* point, kdnode_v2* root, heap * H)
 {
-	if(root -> is_leaf)
-	{
-		for(size_t i = 0; i < root -> node_list.count; ++i)
-		{
-			kdnode_v2* n = root -> node_list.data[i];
-			//__builtin_prefetch(root -> node_list.data + i + 1, 0, 3);
-			FLOAT_TYPE distance = eud_kdtree_v2(point, n -> data);
-			insert_max_heap(H, distance,n -> array_idx);
-		}
-		return;
-	}
+    if(root -> is_leaf)
+    {
+        for(size_t i = 0; i < root -> node_list.count; ++i)
+        {
+            kdnode_v2* n = root -> node_list.data[i];
+            //__builtin_prefetch(root -> node_list.data + i + 1, 0, 3);
+            FLOAT_TYPE distance = eud_kdtree_v2(point, n -> data);
+            insert_max_heap(H, distance,n -> array_idx);
+        }
+        return;
+    }
 
     FLOAT_TYPE current_distance = eud_kdtree_v2(point, root -> data);
     FLOAT_TYPE hp_distance = hyper_plane_dist(point, root -> data, root -> split_var);
     insert_max_heap(H, current_distance, root -> array_idx);
-	__builtin_prefetch(root -> lch, 0, 3);
-	__builtin_prefetch(root -> rch, 0, 3);
+    __builtin_prefetch(root -> lch, 0, 3);
+    __builtin_prefetch(root -> rch, 0, 3);
 
     int side = hp_distance > 0.f;
 
@@ -307,16 +307,16 @@ void knn_sub_tree_search_kdtree_v2(FLOAT_TYPE* point, kdnode_v2* root, heap * H)
     {
         case HP_LEFT_SIDE:
             if(root -> lch)
-			{
-				knn_sub_tree_search_kdtree_v2(point, root -> lch, H);
-			}
+            {
+                knn_sub_tree_search_kdtree_v2(point, root -> lch, H);
+            }
             break;
         
         case HP_RIGHT_SIDE:
-			if(root -> rch)
-			{
-				knn_sub_tree_search_kdtree_v2(point, root -> rch, H);
-			}
+            if(root -> rch)
+            {
+                knn_sub_tree_search_kdtree_v2(point, root -> rch, H);
+            }
             break;
 
         default:
@@ -333,16 +333,16 @@ void knn_sub_tree_search_kdtree_v2(FLOAT_TYPE* point, kdnode_v2* root, heap * H)
         {
             case HP_LEFT_SIDE:
                 if(root -> rch) 
-				{
-					knn_sub_tree_search_kdtree_v2(point, root -> rch, H);
-				}
+                {
+                    knn_sub_tree_search_kdtree_v2(point, root -> rch, H);
+                }
                 break;
             
             case HP_RIGHT_SIDE:
                 if(root -> lch) 
-				{
-					knn_sub_tree_search_kdtree_v2(point, root -> lch, H);
-				}
+                {
+                    knn_sub_tree_search_kdtree_v2(point, root -> lch, H);
+                }
                 break;
 
             default:
@@ -354,11 +354,11 @@ void knn_sub_tree_search_kdtree_v2(FLOAT_TYPE* point, kdnode_v2* root, heap * H)
 
 void kdtree_v2_init(kdtree_v2* tree, FLOAT_TYPE* data, size_t n_nodes, unsigned int dims)
 {
-	data_dims = dims;
-	tree -> n_nodes = n_nodes;
-	tree -> _nodes = (kdnode_v2*)malloc(n_nodes * sizeof(kdnode_v2));
-	initialize_kdnodes_v2(tree -> _nodes, data, n_nodes);
-	tree -> root = NULL;
+    data_dims = dims;
+    tree -> n_nodes = n_nodes;
+    tree -> _nodes = (kdnode_v2*)malloc(n_nodes * sizeof(kdnode_v2));
+    initialize_kdnodes_v2(tree -> _nodes, data, n_nodes);
+    tree -> root = NULL;
 }
 
 void kdtree_v2_free(kdtree_v2* tree)
@@ -366,7 +366,7 @@ void kdtree_v2_free(kdtree_v2* tree)
     for(uint64_t i = 0; i < tree->n_nodes; ++i) 
         if(tree -> _nodes[i].node_list.data) free(tree -> _nodes[i].node_list.data);
 
-	free(tree -> _nodes);
+    free(tree -> _nodes);
 }
 
 
@@ -395,7 +395,7 @@ kdnode_v2 * build_tree_kdtree_v2(kdnode_v2* kd_ptrs, size_t n, size_t dimensions
      * Wrapper for make_tree function.               *
      * Simplifies interfaces and takes time measures *
      *************************************************/
-   	data_dims = dimensions; 
+    data_dims = dimensions; 
     kdnode_v2* root = make_tree_kdnode_v2(kd_ptrs, 0, n-1, NULL ,0);
     return root;
 
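Review note (not part of the patch): a sketch of the intended call sequence for a k-NN query, assuming the entry points shown in this diff are the public API and that the tree holds at least k points. knn_example is a hypothetical name; note that eud_kdtree_v2 returns squared distances, so the heap holds squared values:

    #include "heap.h"
    #include "kdtreeV2.h"

    void knn_example(FLOAT_TYPE *data, size_t n, unsigned int dims,
                     FLOAT_TYPE *query, idx_t k)
    {
        kdtree_v2 tree;
        kdtree_v2_init(&tree, data, n, dims);  /* wraps each point in a kdnode_v2 */
        tree.root = build_tree_kdtree_v2(tree._nodes, n, dims);

        heap H;
        allocate_heap(&H, k);                  /* filled by insert_max_heap during the walk */
        knn_sub_tree_search_kdtree_v2(query, tree.root, &H);
        heap_sort(&H);                         /* ascending: nearest neighbour first */

        free_heap(&H);
        kdtree_v2_free(&tree);
    }
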
diff --git a/src/tree/kdtreeV2.h b/src/tree/kdtreeV2.h
index 60bdfab0c916504e9588923cfc18ca737a85d097..78d7e3a4be7ede3a6558c43e10bc5019a0a4724c 100644
--- a/src/tree/kdtreeV2.h
+++ b/src/tree/kdtreeV2.h
@@ -10,23 +10,23 @@
 #define DATA_DIMS 0 
 
 #ifdef USE_FLOAT32
-	#define FLOAT_TYPE float
+    #define FLOAT_TYPE float
 #else
-	#define FLOAT_TYPE double 
+    #define FLOAT_TYPE double 
 #endif
 
 #ifdef USE_INT32
-	#define MY_SIZE_MAX UINT32_MAX
-	#define idx_t uint32_t
+    #define MY_SIZE_MAX UINT32_MAX
+    #define idx_t uint32_t
 #else
-	#define MY_SIZE_MAX UINT64_MAX
-	#define idx_t uint64_t
+    #define MY_SIZE_MAX UINT64_MAX
+    #define idx_t uint64_t
 #endif
 
 struct kdnode_v2_list
 {
-	size_t count;
-	struct kdnode_v2** data;
+    size_t count;
+    struct kdnode_v2** data;
 };
 
 
@@ -45,9 +45,9 @@ struct kdnode_v2
 
 struct kdtree_v2
 {
-	size_t n_nodes;
-	struct kdnode_v2* _nodes;
-	struct kdnode_v2* root;
+    size_t n_nodes;
+    struct kdnode_v2* _nodes;
+    struct kdnode_v2* root;
 };
 
 
diff --git a/src/tree/tree.c b/src/tree/tree.c
index 63a98c72cbf897ce54c7a4d5a6b579b3a5b25c4b..bca429f4891b3fe6b914243ccc683a70707da4a8 100644
--- a/src/tree/tree.c
+++ b/src/tree/tree.c
@@ -53,192 +53,192 @@ float_t *read_data_file(global_context_t *ctx, const char *fname,
                         const int file_in_float32) 
 {
 
-	FILE *f = fopen(fname, "r");
-	if (!f) 
-	{
-		printf("Nope\n");
-		exit(1);
-	}
-	fseek(f, 0, SEEK_END);
-	size_t n = ftell(f);
-	rewind(f);
-
-	int InputFloatSize = file_in_float32 ? 4 : 8;
-
-	n = n / (InputFloatSize);
-
-	float_t *data = (float_t *)malloc(n * sizeof(float_t));
-
-	if (file_in_float32) 
-	{
-		float *df = (float *)malloc(n * sizeof(float));
-		size_t fff = fread(df, sizeof(float), n, f);
-		mpi_printf(ctx, "Read %luB\n", fff);
-		fclose(f);
-
-		for (uint64_t i = 0; i < n; ++i) data[i] = (float_t)(df[i]);
-
-		free(df);
-	} 
-	else 
-	{
-		double *df = (double *)malloc(n * sizeof(double));
-		size_t fff = fread(df, sizeof(double), n, f);
-		mpi_printf(ctx, "Read %luB\n", fff);
-		fclose(f);
-
-		for (uint64_t i = 0; i < n; ++i) data[i] = (float_t)(df[i]);
-
-		free(df);
-	}
-	ctx->n_points = n;
-	return data;
+    FILE *f = fopen(fname, "r");
+    if (!f) 
+    {
+        printf("Error: could not open file %s\n", fname);
+        exit(1);
+    }
+    fseek(f, 0, SEEK_END);
+    size_t n = ftell(f);
+    rewind(f);
+
+    int InputFloatSize = file_in_float32 ? 4 : 8;
+
+    n = n / (InputFloatSize);
+
+    float_t *data = (float_t *)malloc(n * sizeof(float_t));
+
+    if (file_in_float32) 
+    {
+        float *df = (float *)malloc(n * sizeof(float));
+        size_t fff = fread(df, sizeof(float), n, f);
+        mpi_printf(ctx, "Read %lu elements\n", fff);
+        fclose(f);
+
+        for (uint64_t i = 0; i < n; ++i) data[i] = (float_t)(df[i]);
+
+        free(df);
+    } 
+    else 
+    {
+        double *df = (double *)malloc(n * sizeof(double));
+        size_t fff = fread(df, sizeof(double), n, f);
+        mpi_printf(ctx, "Read %lu elements\n", fff);
+        fclose(f);
+
+        for (uint64_t i = 0; i < n; ++i) data[i] = (float_t)(df[i]);
+
+        free(df);
+    }
+    ctx->n_points = n;
+    return data;
 }
 
 /* quickselect for an element along a dimension */
 
 void swap_data_element(float_t *a, float_t *b, int vec_len) {
-	float_t tmp;
-	for (int i = 0; i < vec_len; ++i) 
-	{
-		tmp = a[i];
-		a[i] = b[i];
-		b[i] = tmp;
-	}
+    float_t tmp;
+    for (int i = 0; i < vec_len; ++i) 
+    {
+        tmp = a[i];
+        a[i] = b[i];
+        b[i] = tmp;
+    }
 }
 
 int compare_data_element(float_t *a, float_t *b, int compare_dim) {
-	return -((a[compare_dim] - b[compare_dim]) > 0.) + ((a[compare_dim] - b[compare_dim]) < 0.);
+    return -((a[compare_dim] - b[compare_dim]) > 0.) + ((a[compare_dim] - b[compare_dim]) < 0.);
 }
 
 int partition_data_element(float_t *array, int vec_len, int compare_dim,
                            int left, int right, int pivot_index) 
 {
-	int store_index = left;
-	int i;
-	/* Move pivot to end */
-	swap_data_element(array + pivot_index * vec_len, array + right * vec_len, vec_len);
-	for (i = left; i < right; ++i) 
-	{
-		// if(compare_data_element(array + i*vec_len, array + pivot_index*vec_len,
-		// compare_dim ) >= 0){
-		if (array[i * vec_len + compare_dim] < array[right * vec_len + compare_dim]) 
-		{
-			swap_data_element(array + store_index * vec_len, array + i * vec_len, vec_len);
-			store_index += 1;
-		}
-	}
-	/* Move pivot to its final place */
-	swap_data_element(array + (store_index)*vec_len, array + right * vec_len, vec_len);
-
-	return store_index;
+    int store_index = left;
+    int i;
+    /* Move pivot to end */
+    swap_data_element(array + pivot_index * vec_len, array + right * vec_len, vec_len);
+    for (i = left; i < right; ++i) 
+    {
+        // if(compare_data_element(array + i*vec_len, array + pivot_index*vec_len,
+        // compare_dim ) >= 0){
+        if (array[i * vec_len + compare_dim] < array[right * vec_len + compare_dim]) 
+        {
+            swap_data_element(array + store_index * vec_len, array + i * vec_len, vec_len);
+            store_index += 1;
+        }
+    }
+    /* Move pivot to its final place */
+    swap_data_element(array + (store_index)*vec_len, array + right * vec_len, vec_len);
+
+    return store_index;
 }
 
 int qselect_data_element(float_t *array, int vec_len, int compare_dim, int left, int right, int n) 
 {
-	int pivot_index;
-	if (left == right) 
-	{
-		return left;
-	}
-	pivot_index = left; // + (rand() % (right-left + 1)); /* random int left <= x <= right */
-	pivot_index = partition_data_element(array, vec_len, compare_dim, left, right, pivot_index);
-	/* The pivot is in its final sorted position */
-	if (n == pivot_index) 
-	{
-		return pivot_index;
-	} 
-	else if (n < pivot_index) 
-	{
-		return qselect_data_element(array, vec_len, compare_dim, left, pivot_index - 1, n);
-	} 
-	else 
-	{
-		return qselect_data_element(array, vec_len, compare_dim, pivot_index + 1, right, n);
-	}
+    int pivot_index;
+    if (left == right) 
+    {
+        return left;
+    }
+    pivot_index = left; // + (rand() % (right-left + 1)); /* random int left <= x <= right */
+    pivot_index = partition_data_element(array, vec_len, compare_dim, left, right, pivot_index);
+    /* The pivot is in its final sorted position */
+    if (n == pivot_index) 
+    {
+        return pivot_index;
+    } 
+    else if (n < pivot_index) 
+    {
+        return qselect_data_element(array, vec_len, compare_dim, left, pivot_index - 1, n);
+    } 
+    else 
+    {
+        return qselect_data_element(array, vec_len, compare_dim, pivot_index + 1, right, n);
+    }
 }
 
 int quickselect_data_element(float_t *array, int vec_len, int array_size, int compare_dim, int k) 
 {
-	return qselect_data_element(array, vec_len, compare_dim, 0, array_size - 1, k - 1);
+    return qselect_data_element(array, vec_len, compare_dim, 0, array_size - 1, k - 1);
 }
 
 int CMP_DIM;
 int compare_data_element_sort(const void *a, const void *b) {
-	float_t aa = *((float_t *)a + CMP_DIM);
-	float_t bb = *((float_t *)b + CMP_DIM);
-	return ((aa - bb) > 0.) - ((aa - bb) < 0.);
+    float_t aa = *((float_t *)a + CMP_DIM);
+    float_t bb = *((float_t *)b + CMP_DIM);
+    return ((aa - bb) > 0.) - ((aa - bb) < 0.);
 }
 
 void compute_bounding_box(global_context_t *ctx) {
-	ctx->lb_box = (float_t *)malloc(ctx->dims * sizeof(float_t));
-	ctx->ub_box = (float_t *)malloc(ctx->dims * sizeof(float_t));
-
-	for (size_t d = 0; d < ctx->dims; ++d) {
-	ctx->lb_box[d] = 99999999.;
-	ctx->ub_box[d] = -99999999.;
-	}
-
-	#define local_data ctx->local_data
-	#define lb ctx->lb_box
-	#define ub ctx->ub_box
-
-	/* compute minimum and maximum for each dimensions, store them in local bb */
-	/* each processor on its own */
-	for (size_t i = 0; i < ctx->local_n_points; ++i) {
-	for (size_t d = 0; d < ctx->dims; ++d) {
-	  lb[d] = MIN(local_data[i * ctx->dims + d], lb[d]);
-	  ub[d] = MAX(local_data[i * ctx->dims + d], ub[d]);
-	}
-	}
-
-	/* Reduce to obtain bb */
-	/*
-	MPI_Allreduce(  const void *sendbuf,
-								  void *recvbuf,
-								  int count,
-								  MPI_Datatype datatype,
-								  MPI_Op op,
-								  MPI_Comm comm)
-	*/
-
-	/*get the bounding box */
-
-	MPI_Allreduce(MPI_IN_PLACE, lb, ctx->dims, MPI_MY_FLOAT, MPI_MIN, ctx->mpi_communicator);
-	MPI_Allreduce(MPI_IN_PLACE, ub, ctx->dims, MPI_MY_FLOAT, MPI_MAX, ctx->mpi_communicator);
-
-	/*
-	DB_PRINT("[RANK %d]:", ctx -> mpi_rank);
-	for(size_t d = 0; d < ctx -> dims; ++d)
-	{
-		  DB_PRINT("%lf ", ctx -> ub_box[d]);
-	}
-	DB_PRINT("\n");
-	*/
-
-	/*
-	MPI_DB_PRINT("[BOUNDING BOX]: ");
-	for(size_t d = 0; d < ctx -> dims; ++d) MPI_DB_PRINT("d%d:[%lf, %lf] ",(int)d,
-	lb[d], ub[d]); MPI_DB_PRINT("\n");
-	*/
-
-	#undef local_data
-	#undef lb
-	#undef ub
+    ctx->lb_box = (float_t *)malloc(ctx->dims * sizeof(float_t));
+    ctx->ub_box = (float_t *)malloc(ctx->dims * sizeof(float_t));
+
+    for (size_t d = 0; d < ctx->dims; ++d) {
+        ctx->lb_box[d] = 99999999.;
+        ctx->ub_box[d] = -99999999.;
+    }
+
+    #define local_data ctx->local_data
+    #define lb ctx->lb_box
+    #define ub ctx->ub_box
+
+    /* compute minimum and maximum for each dimensions, store them in local bb */
+    /* each processor on its own */
+    for (size_t i = 0; i < ctx->local_n_points; ++i) {
+        for (size_t d = 0; d < ctx->dims; ++d) {
+            lb[d] = MIN(local_data[i * ctx->dims + d], lb[d]);
+            ub[d] = MAX(local_data[i * ctx->dims + d], ub[d]);
+        }
+    }
+
+    /* Reduce to obtain bb */
+    /*
+    MPI_Allreduce(  const void *sendbuf,
+                                  void *recvbuf,
+                                  int count,
+                                  MPI_Datatype datatype,
+                                  MPI_Op op,
+                                  MPI_Comm comm)
+    */
+
+    /*get the bounding box */
+
+    MPI_Allreduce(MPI_IN_PLACE, lb, ctx->dims, MPI_MY_FLOAT, MPI_MIN, ctx->mpi_communicator);
+    MPI_Allreduce(MPI_IN_PLACE, ub, ctx->dims, MPI_MY_FLOAT, MPI_MAX, ctx->mpi_communicator);
+
+    /*
+    DB_PRINT("[RANK %d]:", ctx -> mpi_rank);
+    for(size_t d = 0; d < ctx -> dims; ++d)
+    {
+          DB_PRINT("%lf ", ctx -> ub_box[d]);
+    }
+    DB_PRINT("\n");
+    */
+
+    /*
+    MPI_DB_PRINT("[BOUNDING BOX]: ");
+    for(size_t d = 0; d < ctx -> dims; ++d) MPI_DB_PRINT("d%d:[%lf, %lf] ",(int)d,
+    lb[d], ub[d]); MPI_DB_PRINT("\n");
+    */
+
+    #undef local_data
+    #undef lb
+    #undef ub
 }
 
 /* i want a queue to enqueue the partitions to deal with */
 void enqueue_partition(partition_queue_t *queue, partition_t p) 
 {
-	if (queue->count == queue->_capacity) 
-	{
-		queue->_capacity = queue->_capacity * 1.10;
-		queue->data = realloc(queue->data, queue->_capacity);
-	}
-	/* insert point */
-	memmove(queue->data + 1, queue->data, queue->count * sizeof(partition_t));
-	queue->data[0] = p;
-	queue->count++;
+    if (queue->count == queue->_capacity) 
+    {
+        queue->_capacity = queue->_capacity * 1.10;
+        queue->data = realloc(queue->data, queue->_capacity * sizeof(partition_t)); /* capacity counts elements, realloc wants bytes */
+    }
+    /* insert point */
+    memmove(queue->data + 1, queue->data, queue->count * sizeof(partition_t));
+    queue->data[0] = p;
+    queue->count++;
 }
 
 partition_t dequeue_partition(partition_queue_t *queue) 
@@ -247,148 +247,148 @@ partition_t dequeue_partition(partition_queue_t *queue)
 }
 
 void compute_medians_and_check(global_context_t *ctx, float_t *data) {
-	float_t prop = 0.5;
-	int k = (int)(ctx->local_n_points * prop);
-	int d = 1;
-
-	/*quick select on a particular dimension */
-	CMP_DIM = d;
-	int kk = (k - 1) * ctx->dims;
-
-	int count = 0;
-	// idx = idx - 1;
-	//
-	int aaa = quickselect_data_element(ctx->local_data, (int)(ctx->dims), (int)(ctx->local_n_points), d, k);
-	/*
-	* sanity check
-	* check if the median found in each node is
-	* a median
-	*/
-
-	float_t *medians_rcv = (float_t *)malloc(ctx->dims * ctx->world_size * sizeof(float_t));
-
-	/*
-	* MPI_Allgather( 	const void *sendbuf,
-	* 					int sendcount,
-	* 					MPI_Datatype sendtype,
-	* 					void *recvbuf,
-	* 					int recvcount,
-	* 					MPI_Datatype recvtype,
-	* 					MPI_Comm comm)
-	*/
-
-	/* Exchange medians */
-
-	MPI_Allgather(ctx->local_data + kk, ctx->dims, MPI_MY_FLOAT, medians_rcv, ctx->dims, MPI_MY_FLOAT, ctx->mpi_communicator);
-
-	/* sort medians on each node */
-
-	CMP_DIM = d;
-	qsort(medians_rcv, ctx->world_size, ctx->dims * sizeof(float_t), compare_data_element_sort);
-
-	/*
-	* Evaluate goodness of the median on master which has whole dataset
-	*/
-
-	if (ctx->mpi_rank == 0) {
-	int count = 0;
-	int idx = (int)(prop * (ctx->world_size));
-	// idx = idx - 1;
-	for (int i = 0; i < ctx->n_points; ++i) 
-	{
-		count += data[i * ctx->dims + d] <= medians_rcv[idx * ctx->dims + d];
-	}
-	mpi_printf(ctx, "Choosing %lf percentile on dimension %d: empirical prop %lf\n", prop, d, (float_t)count / (float_t)(ctx->n_points));
-	}
-	free(medians_rcv);
+    float_t prop = 0.5;
+    int k = (int)(ctx->local_n_points * prop);
+    int d = 1;
+
+    /*quick select on a particular dimension */
+    CMP_DIM = d;
+    int kk = (k - 1) * ctx->dims;
+
+    /* places the k-th smallest element along d at position k - 1 */
+    quickselect_data_element(ctx->local_data, (int)(ctx->dims), (int)(ctx->local_n_points), d, k);
+    /*
+    * sanity check
+    * check if the median found in each node is
+    * a median
+    */
+
+    float_t *medians_rcv = (float_t *)malloc(ctx->dims * ctx->world_size * sizeof(float_t));
+
+    /*
+    * MPI_Allgather(const void *sendbuf,
+    *               int sendcount,
+    *               MPI_Datatype sendtype,
+    *               void *recvbuf,
+    *               int recvcount,
+    *               MPI_Datatype recvtype,
+    *               MPI_Comm comm)
+    */
+
+    /* Exchange medians */
+
+    MPI_Allgather(ctx->local_data + kk, ctx->dims, MPI_MY_FLOAT, medians_rcv, ctx->dims, MPI_MY_FLOAT, ctx->mpi_communicator);
+
+    /* sort medians on each node */
+
+    CMP_DIM = d;
+    qsort(medians_rcv, ctx->world_size, ctx->dims * sizeof(float_t), compare_data_element_sort);
+
+    /*
+    * Evaluate goodness of the median on master which has whole dataset
+    */
+
+    if (ctx->mpi_rank == 0) {
+        int count = 0;
+        int idx = (int)(prop * (ctx->world_size));
+        // idx = idx - 1;
+        for (int i = 0; i < ctx->n_points; ++i)
+        {
+            count += data[i * ctx->dims + d] <= medians_rcv[idx * ctx->dims + d];
+        }
+        mpi_printf(ctx, "Choosing %lf percentile on dimension %d: empirical prop %lf\n", prop, d, (float_t)count / (float_t)(ctx->n_points));
+    }
+    free(medians_rcv);
 }
 
 float_t check_pc_pointset_parallel(global_context_t *ctx, pointset_t *ps, guess_t g, int d, float_t prop) {
-	/*
-	 * ONLY FOR TEST PURPOSES
-	 * gather on master all data
-	 * perform the count on master
-	 */
-	int pvt_count = 0;
-	for (int i = 0; i < ps->n_points; ++i) 
-	{
-		pvt_count += ps->data[i * ps->dims + d] <= g.x_guess;
-	}
-
-	int pvt_n_and_tot[2] = {pvt_count, ps->n_points};
-	int tot_count[2];
-	MPI_Allreduce(pvt_n_and_tot, tot_count, 2, MPI_INT, MPI_SUM, ctx->mpi_communicator);
-
-	float_t ep = (float_t)tot_count[0] / (float_t)(tot_count[1]);
-	/*
-	mpi_printf(ctx,"[PS TEST PARALLEL]: ");
-	mpi_printf(ctx,"Condsidering %d points, searching for %lf percentile on
-	dimension %d: empirical measure %lf\n",tot_count[1],prop, d, ep);
-	*/
-	return ep;
+    /*
+     * ONLY FOR TEST PURPOSES
+     * gather all data on master
+     * and perform the count there
+     */
+    int pvt_count = 0;
+    for (int i = 0; i < ps->n_points; ++i) 
+    {
+        pvt_count += ps->data[i * ps->dims + d] <= g.x_guess;
+    }
+
+    int pvt_n_and_tot[2] = {pvt_count, ps->n_points};
+    int tot_count[2];
+    MPI_Allreduce(pvt_n_and_tot, tot_count, 2, MPI_INT, MPI_SUM, ctx->mpi_communicator);
+
+    float_t ep = (float_t)tot_count[0] / (float_t)(tot_count[1]);
+    /*
+    mpi_printf(ctx,"[PS TEST PARALLEL]: ");
+    mpi_printf(ctx,"Considering %d points, searching for %lf percentile on
+    dimension %d: empirical measure %lf\n",tot_count[1],prop, d, ep);
+    */
+    return ep;
 }
 
 void compute_bounding_box_pointset(global_context_t *ctx, pointset_t *ps) {
 
-	for (size_t d = 0; d < ps->dims; ++d)
-	{
-		ps->lb_box[d] = 99999999.;
-		ps->ub_box[d] = -99999999.;
-	}
+    for (size_t d = 0; d < ps->dims; ++d)
+    {
+        ps->lb_box[d] = 99999999.;
+        ps->ub_box[d] = -99999999.;
+    }
 
-	#define local_data ps->data
-	#define lb ps->lb_box
-	#define ub ps->ub_box
+    #define local_data ps->data
+    #define lb ps->lb_box
+    #define ub ps->ub_box
 
-	/* compute minimum and maximum for each dimensions, store them in local bb */
-	/* each processor on its own */
+    /* compute minimum and maximum for each dimensions, store them in local bb */
+    /* each processor on its own */
 
     /* TODO: reduction using omp directive */
 
-	for (size_t i = 0; i < ps->n_points; ++i) 
-	{
-		for (size_t d = 0; d < ps->dims; ++d) 
-		{
-			lb[d] = MIN(local_data[i * ps->dims + d], lb[d]);
-			ub[d] = MAX(local_data[i * ps->dims + d], ub[d]);
-    	}
-  	}
-
-	/* Reduce to obtain bb */
-	/*
-	MPI_Allreduce(  const void *sendbuf,
-								  void *recvbuf,
-								  int count,
-								  MPI_Datatype datatype,
-								  MPI_Op op,
-								  MPI_Comm comm)
-	*/
-
-	/*get the bounding box */
-
-	MPI_Allreduce(MPI_IN_PLACE, lb, ps->dims, MPI_MY_FLOAT, MPI_MIN, ctx->mpi_communicator);
-	MPI_Allreduce(MPI_IN_PLACE, ub, ps->dims, MPI_MY_FLOAT, MPI_MAX, ctx->mpi_communicator);
-
-	/*
-	DB_PRINT("[RANK %d]:", ctx -> mpi_rank);
-	for(size_t d = 0; d < ctx -> dims; ++d)
-	{
-		  DB_PRINT("%lf ", ctx -> ub_box[d]);
-	}
-	DB_PRINT("\n");
-	*/
-
-	/*
-	MPI_DB_PRINT("[PS BOUNDING BOX]: ");
-	for(size_t d = 0; d < ps -> dims; ++d) MPI_DB_PRINT("d%d:[%lf, %lf] ",(int)d,
-	lb[d], ub[d]); MPI_DB_PRINT("\n");
-	*/
-
-
-
-	#undef local_data
-	#undef lb
-	#undef ub
+    for (size_t i = 0; i < ps->n_points; ++i) 
+    {
+        for (size_t d = 0; d < ps->dims; ++d) 
+        {
+            lb[d] = MIN(local_data[i * ps->dims + d], lb[d]);
+            ub[d] = MAX(local_data[i * ps->dims + d], ub[d]);
+        }
+    }
+
+    /* Reduce to obtain bb */
+    /*
+    MPI_Allreduce(const void *sendbuf,
+                  void *recvbuf,
+                  int count,
+                  MPI_Datatype datatype,
+                  MPI_Op op,
+                  MPI_Comm comm)
+    */
+
+    /*get the bounding box */
+
+    MPI_Allreduce(MPI_IN_PLACE, lb, ps->dims, MPI_MY_FLOAT, MPI_MIN, ctx->mpi_communicator);
+    MPI_Allreduce(MPI_IN_PLACE, ub, ps->dims, MPI_MY_FLOAT, MPI_MAX, ctx->mpi_communicator);
+
+    /*
+    DB_PRINT("[RANK %d]:", ctx -> mpi_rank);
+    for(size_t d = 0; d < ctx -> dims; ++d)
+    {
+          DB_PRINT("%lf ", ctx -> ub_box[d]);
+    }
+    DB_PRINT("\n");
+    */
+
+    /*
+    MPI_DB_PRINT("[PS BOUNDING BOX]: ");
+    for(size_t d = 0; d < ps -> dims; ++d) MPI_DB_PRINT("d%d:[%lf, %lf] ",(int)d,
+    lb[d], ub[d]); MPI_DB_PRINT("\n");
+    */
+
+
+
+    #undef local_data
+    #undef lb
+    #undef ub
 }
 
 
@@ -397,317 +397,317 @@ guess_t retrieve_guess_pure(global_context_t *ctx, pointset_t *ps,
                         int k_global, int d, float_t pc)
 {
 
-	/*
-	* retrieving the best median guess from pure binning
-	*/
-
-	float_t total_count = 0.;
-	for (int i = 0; i < k_global; ++i) total_count += (float_t)global_bin_counts[i];
-
-	/*
-	MPI_DB_PRINT("[ ");
-	for(int i = 0; i < k_global; ++i)
-	{
-		  MPI_DB_PRINT("%lu %lf --- ", global_bin_counts[i],
-	(float_t)global_bin_counts[i]/(float_t)total_count);
-	}
-	MPI_DB_PRINT("\n");
-	*/
-
-	float_t cumulative_count = 0;
-	int idx = 0;
-	while ((cumulative_count + (float_t)global_bin_counts[idx]) / total_count < pc) 
-	{
-		cumulative_count += (float_t)global_bin_counts[idx];
-		idx++;
-	}
-	/* find best spot in the bin */
-	float_t box_lb = ps->lb_box[d];
-	float_t box_ub = ps->ub_box[d];
-	float_t box_width = box_ub - box_lb;
-	float_t global_bin_width = box_width / (float_t)k_global;
-
-	float_t x0 = box_lb + (global_bin_width * (idx));
-	float_t x1 = box_lb + (global_bin_width * (idx + 1));
-
-	float_t y0 = (cumulative_count) / total_count;
-	float_t y1 = (cumulative_count + global_bin_counts[idx]) / total_count;
-
-	float_t x_guess = (pc - y0) / (y1 - y0) * (x1 - x0) + x0;
-
-		
-	/*
-	MPI_DB_PRINT("[MASTER] best guess @ %lf is %lf on bin %d on dimension %d --- x0 %lf x1 %lf y0 %lf y1 %lf\n",pc, x_guess,idx, d, x0, x1, y0, y1);
-	*/
-	
-
-
-	guess_t g = {.bin_idx = idx, .x_guess = x_guess};
-	return g;
+    /*
+    * retrieving the best median guess from pure binning
+    */
+
+    float_t total_count = 0.;
+    for (int i = 0; i < k_global; ++i) total_count += (float_t)global_bin_counts[i];
+
+    /*
+    MPI_DB_PRINT("[ ");
+    for(int i = 0; i < k_global; ++i)
+    {
+          MPI_DB_PRINT("%lu %lf --- ", global_bin_counts[i],
+    (float_t)global_bin_counts[i]/(float_t)total_count);
+    }
+    MPI_DB_PRINT("\n");
+    */
+
+    float_t cumulative_count = 0;
+    int idx = 0;
+    while ((cumulative_count + (float_t)global_bin_counts[idx]) / total_count < pc) 
+    {
+        cumulative_count += (float_t)global_bin_counts[idx];
+        idx++;
+    }
+    /* find best spot in the bin */
+    float_t box_lb = ps->lb_box[d];
+    float_t box_ub = ps->ub_box[d];
+    float_t box_width = box_ub - box_lb;
+    float_t global_bin_width = box_width / (float_t)k_global;
+
+    float_t x0 = box_lb + (global_bin_width * (idx));
+    float_t x1 = box_lb + (global_bin_width * (idx + 1));
+
+    float_t y0 = (cumulative_count) / total_count;
+    float_t y1 = (cumulative_count + global_bin_counts[idx]) / total_count;
+
+    float_t x_guess = (pc - y0) / (y1 - y0) * (x1 - x0) + x0;
+
+    /*
+    MPI_DB_PRINT("[MASTER] best guess @ %lf is %lf on bin %d on dimension %d --- x0 %lf x1 %lf y0 %lf y1 %lf\n",pc, x_guess,idx, d, x0, x1, y0, y1);
+    */
+
+    guess_t g = {.bin_idx = idx, .x_guess = x_guess};
+    return g;
 }
 
 
 void global_binning_check(global_context_t *ctx, float_t *data, int d, int k) 
 {
-	/*
-	* sanity check
-	* find if global bins are somehow similar to acutal binning on master
-	*/
-
-	if (I_AM_MASTER) 
-	{
-		int *counts = (int *)malloc(k * sizeof(int));
-		for (int bin_idx = 0; bin_idx < k; ++bin_idx) counts[bin_idx] = 0;
-
-		float_t box_lb = ctx->lb_box[d];
-		float_t box_ub = ctx->ub_box[d];
-		float_t box_width = box_ub - box_lb;
-		float_t bin_width = box_width / (float_t)k;
-
-		for (int i = 0; i < ctx->n_points; ++i) 
-		{
-			int bin_idx = (int)((data[i * ctx->dims + d] - box_lb) / bin_width);
-			if (bin_idx < k) counts[bin_idx]++;
-			// counts[bin_idx]++
-		}
-		int cc = 0;
-
-		/*
-		MPI_DB_PRINT("Actual bins: [");
-		for(int bin_idx = 0; bin_idx < k; ++bin_idx)
-		{
-				MPI_DB_PRINT("%d ", counts[bin_idx]);
-				cc += counts[bin_idx];
-		}
-		MPI_DB_PRINT("] tot %d\n",cc);
-		*/
-
-		free(counts);
-	}
+    /*
+    * sanity check
+    * check whether the global bins are similar to the actual binning on master
+    */
+
+    if (I_AM_MASTER) 
+    {
+        int *counts = (int *)malloc(k * sizeof(int));
+        for (int bin_idx = 0; bin_idx < k; ++bin_idx) counts[bin_idx] = 0;
+
+        float_t box_lb = ctx->lb_box[d];
+        float_t box_ub = ctx->ub_box[d];
+        float_t box_width = box_ub - box_lb;
+        float_t bin_width = box_width / (float_t)k;
+
+        for (int i = 0; i < ctx->n_points; ++i) 
+        {
+            int bin_idx = (int)((data[i * ctx->dims + d] - box_lb) / bin_width);
+            if (bin_idx < k) counts[bin_idx]++;
+            // counts[bin_idx]++
+        }
+        int cc = 0;
+
+        /*
+        MPI_DB_PRINT("Actual bins: [");
+        for(int bin_idx = 0; bin_idx < k; ++bin_idx)
+        {
+                MPI_DB_PRINT("%d ", counts[bin_idx]);
+                cc += counts[bin_idx];
+        }
+        MPI_DB_PRINT("] tot %d\n",cc);
+        */
+
+        free(counts);
+    }
 }
 
 void compute_pure_global_binning(global_context_t *ctx, pointset_t *ps,
                                  uint64_t *global_bin_counts, int k_global,
                                  int d) 
 {
-	/* compute binning of data along dimension d */
-	uint64_t *local_bin_count = (uint64_t *)malloc(k_global * sizeof(uint64_t));
-	for (size_t k = 0; k < k_global; ++k) 
-	{
-		local_bin_count[k] = 0;
-		global_bin_counts[k] = 0;
-	}
-
-	/*
-	MPI_DB_PRINT("[PS BOUNDING BOX %d]: ", ctx -> mpi_rank);
-	for(size_t d = 0; d < ps -> dims; ++d) MPI_DB_PRINT("d%d:[%lf, %lf] ",(int)d, ps -> lb_box[d], ps -> ub_box[d]); MPI_DB_PRINT("\n");
-	MPI_DB_PRINT("\n");
-	*/
-
-	float_t bin_w = (ps-> ub_box[d] - ps->lb_box[d]) / (float_t)k_global;
+    /* compute binning of data along dimension d */
+    uint64_t *local_bin_count = (uint64_t *)malloc(k_global * sizeof(uint64_t));
+    for (size_t k = 0; k < k_global; ++k) 
+    {
+        local_bin_count[k] = 0;
+        global_bin_counts[k] = 0;
+    }
+
+    /*
+    MPI_DB_PRINT("[PS BOUNDING BOX %d]: ", ctx -> mpi_rank);
+    for(size_t d = 0; d < ps -> dims; ++d) MPI_DB_PRINT("d%d:[%lf, %lf] ",(int)d, ps -> lb_box[d], ps -> ub_box[d]); MPI_DB_PRINT("\n");
+    MPI_DB_PRINT("\n");
+    */
+
+    float_t bin_w = (ps-> ub_box[d] - ps->lb_box[d]) / (float_t)k_global;
 
     #pragma omp parallel for
-	for (size_t i = 0; i < ps->n_points; ++i) 
-	{
-		float_t p = ps->data[i * ps->dims + d];
-		/* to prevent the border point in the box to have bin_idx == k_global causing invalid memory access */
-		int bin_idx = MIN((int)((p - ps->lb_box[d]) / bin_w), k_global - 1);
+    for (size_t i = 0; i < ps->n_points; ++i) 
+    {
+        float_t p = ps->data[i * ps->dims + d];
+        /* to prevent the border point in the box to have bin_idx == k_global causing invalid memory access */
+        int bin_idx = MIN((int)((p - ps->lb_box[d]) / bin_w), k_global - 1);
         
         #pragma omp atomic update
-		local_bin_count[bin_idx]++;
-	}
+        local_bin_count[bin_idx]++;
+    }
 
-	MPI_Allreduce(local_bin_count, global_bin_counts, k_global, MPI_UNSIGNED_LONG, MPI_SUM, ctx->mpi_communicator);
-	free(local_bin_count);
+    MPI_Allreduce(local_bin_count, global_bin_counts, k_global, MPI_UNSIGNED_LONG, MPI_SUM, ctx->mpi_communicator);
+    free(local_bin_count);
 }
 
 int partition_data_around_value(float_t *array, int vec_len, int compare_dim,
-								int left, int right, float_t pivot_value) 
+                                int left, int right, float_t pivot_value) 
 {
-	/*
-	* returns the number of elements less than the pivot
-	*/
-	int store_index = left;
-	int i;
-	/* Move pivot to end */
-	for (i = left; i < right; ++i) 
-	{
-		// if(compare_data_element(array + i*vec_len, array + pivot_index*vec_len, compare_dim ) >= 0){
-		if (array[i * vec_len + compare_dim] < pivot_value) 
-		{
-			swap_data_element(array + store_index * vec_len, array + i * vec_len, vec_len);
-			store_index += 1;
-		}
-	}
-	/* Move pivot to its final place */
-	// swap_data_element(array + (store_index)*vec_len , array + right*vec_len,
-	// vec_len);
-
-	return store_index; // maybe, update, it works :)
+    /*
+    * returns the number of elements less than the pivot
+    */
+    int store_index = left;
+    int i;
+    /* sweep: move elements smaller than pivot_value to the front */
+    for (i = left; i < right; ++i) 
+    {
+        // if(compare_data_element(array + i*vec_len, array + pivot_index*vec_len, compare_dim ) >= 0){
+        if (array[i * vec_len + compare_dim] < pivot_value) 
+        {
+            swap_data_element(array + store_index * vec_len, array + i * vec_len, vec_len);
+            store_index += 1;
+        }
+    }
+    /* no pivot element to place here: we partition around a value, not an index */
+    // swap_data_element(array + (store_index)*vec_len , array + right*vec_len,
+    // vec_len);
+
+    return store_index; /* index of the first element not less than pivot_value */
 }
 
 guess_t refine_pure_binning(global_context_t *ctx, pointset_t *ps,
                             guess_t best_guess, uint64_t *global_bin_count,
                             int k_global, int d, float_t f, float_t tolerance)
 {
-	/* refine process from obtained binning */
-	if (fabs(best_guess.ep - f) < tolerance) 
-	{
-		/*
-		MPI_DB_PRINT("[MASTER] No need to refine, finishing\n");
-		*/
-		return best_guess;
-	}
-	float_t total_count = 0;
-	float_t starting_cumulative = 0;
-
-	for (int i = 0; i < best_guess.bin_idx; ++i) starting_cumulative += global_bin_count[i];
-	for (int i = 0; i < k_global; ++i) total_count += global_bin_count[i];
-
-	float_t bin_w = (ps->ub_box[d] - ps->lb_box[d]) / k_global;
-	float_t bin_lb = ps->lb_box[d] + (bin_w * (best_guess.bin_idx));
-	float_t bin_ub = ps->lb_box[d] + (bin_w * (best_guess.bin_idx + 1));
-
-	uint64_t *tmp_global_bins = (uint64_t *)malloc(sizeof(uint64_t) * k_global);
-	for (int i = 0; i < k_global; ++i) tmp_global_bins[i] = global_bin_count[i];
-
-	/*
-	MPI_DB_PRINT("STARTING REFINE global bins: ");
-	for(int i = 0; i < k_global; ++i)
-	{
-		  MPI_DB_PRINT("%lf ", global_bin_count[i]);
-	}
-	MPI_DB_PRINT("\n");
-	*/
-
-	guess_t g;
-	while (fabs(best_guess.ep - f) > tolerance) {
-		/* compute the target */
-		float_t ff, b0, b1;
-		ff = -1;
-		b0 = starting_cumulative;
-		b1 = tmp_global_bins[best_guess.bin_idx];
-		ff = (f * total_count - b0) / ((float_t)tmp_global_bins[best_guess.bin_idx]);
-
-		/*
-		 * generate a partset of points in the bin considered
-		 * each one has to partition its dataset according to the
-		 * fact that points on dimension d has to be in the bin
-		 *
-		 * then make into in place alg for now, copy data in another pointer
-		 * will be done in place
-		 * */
-
-		
-		/*
-		MPI_DB_PRINT("---- ---- ----\n");
-		MPI_DB_PRINT("[MASTER] Refining on bin %d lb %lf ub %lf starting c %lf %lf\n", 
-				best_guess.bin_idx, bin_lb, bin_ub, starting_cumulative/total_count,
-				(tmp_global_bins[best_guess.bin_idx] + starting_cumulative)/total_count);
-		*/
-	
-
-		for (int i = 0; i < k_global; ++i)  tmp_global_bins[i] = 0;
-
-		pointset_t tmp_ps;
-
-		int end_idx = partition_data_around_value(ps->data, (int)ps->dims, d, 0, (int)ps->n_points, bin_ub);
-		int start_idx = partition_data_around_value(ps->data, (int)ps->dims, d, 0,end_idx, bin_lb);
-
-		tmp_ps.n_points = end_idx - start_idx;
-		tmp_ps.data = ps->data + start_idx * ps->dims;
-		tmp_ps.dims = ps->dims;
-		tmp_ps.lb_box = (float_t*)malloc(ctx -> dims * sizeof(float_t));
-		tmp_ps.ub_box = (float_t*)malloc(ctx -> dims * sizeof(float_t));
-
-		compute_bounding_box_pointset(ctx, &tmp_ps);
-
-		/*
-		MPI_DB_PRINT("[MASTER] searching for %lf of the bin considered\n",ff);
-		*/
-
-		// DB_PRINT("%lu\n",tmp_ps.n_points );
-		MPI_Barrier(ctx->mpi_communicator);
-		compute_pure_global_binning(ctx, &tmp_ps, tmp_global_bins, k_global, d);
-
-		/* sum to global bins */
-		// for(int i = 0; i < k_global; ++i) tmp_global_bins[i] +=
-		// starting_cumulative;
-
-		best_guess = retrieve_guess_pure(ctx, &tmp_ps, tmp_global_bins, k_global, d, ff);
-
-		best_guess.ep = check_pc_pointset_parallel(ctx, ps, best_guess, d, f);
-		// ep = check_pc_pointset_parallel(ctx, &tmp_ps, best_guess, d, f);
-
-		bin_w = (tmp_ps.ub_box[d] - tmp_ps.lb_box[d]) / k_global;
-		bin_lb = tmp_ps.lb_box[d] + (bin_w * (best_guess.bin_idx));
-		bin_ub = tmp_ps.lb_box[d] + (bin_w * (best_guess.bin_idx + 1));
-
-		for (int i = 0; i < best_guess.bin_idx; ++i) starting_cumulative += tmp_global_bins[i];
-
-		// free(tmp_ps.data);
-		free(tmp_ps.lb_box);
-		free(tmp_ps.ub_box);
-	}
-
-	/*
-	MPI_DB_PRINT("SUCCESS!!! \n");
-	*/
-
-	free(tmp_global_bins);
-
-	return best_guess;
+    /* refine process from obtained binning */
+    if (fabs(best_guess.ep - f) < tolerance) 
+    {
+        /*
+        MPI_DB_PRINT("[MASTER] No need to refine, finishing\n");
+        */
+        return best_guess;
+    }
+    float_t total_count = 0;
+    float_t starting_cumulative = 0;
+
+    for (int i = 0; i < best_guess.bin_idx; ++i) starting_cumulative += global_bin_count[i];
+    for (int i = 0; i < k_global; ++i) total_count += global_bin_count[i];
+
+    float_t bin_w = (ps->ub_box[d] - ps->lb_box[d]) / k_global;
+    float_t bin_lb = ps->lb_box[d] + (bin_w * (best_guess.bin_idx));
+    float_t bin_ub = ps->lb_box[d] + (bin_w * (best_guess.bin_idx + 1));
+
+    uint64_t *tmp_global_bins = (uint64_t *)malloc(sizeof(uint64_t) * k_global);
+    for (int i = 0; i < k_global; ++i) tmp_global_bins[i] = global_bin_count[i];
+
+    /*
+    MPI_DB_PRINT("STARTING REFINE global bins: ");
+    for(int i = 0; i < k_global; ++i)
+    {
+          MPI_DB_PRINT("%lf ", global_bin_count[i]);
+    }
+    MPI_DB_PRINT("\n");
+    */
+
+    while (fabs(best_guess.ep - f) > tolerance) {
+        /* compute the target */
+        float_t b0 = starting_cumulative;
+        float_t ff = (f * total_count - b0) / ((float_t)tmp_global_bins[best_guess.bin_idx]);
+
+        /*
+         * restrict the pointset to the bin under consideration:
+         * each rank partitions its local data so that points whose
+         * coordinate on dimension d falls inside the bin end up in a
+         * contiguous range (done in place on ps->data)
+         */
+
+        
+        /*
+        MPI_DB_PRINT("---- ---- ----\n");
+        MPI_DB_PRINT("[MASTER] Refining on bin %d lb %lf ub %lf starting c %lf %lf\n", 
+                best_guess.bin_idx, bin_lb, bin_ub, starting_cumulative/total_count,
+                (tmp_global_bins[best_guess.bin_idx] + starting_cumulative)/total_count);
+        */
+    
+
+        for (int i = 0; i < k_global; ++i)  tmp_global_bins[i] = 0;
+
+        pointset_t tmp_ps;
+
+        int end_idx = partition_data_around_value(ps->data, (int)ps->dims, d, 0, (int)ps->n_points, bin_ub);
+        int start_idx = partition_data_around_value(ps->data, (int)ps->dims, d, 0,end_idx, bin_lb);
+
+        tmp_ps.n_points = end_idx - start_idx;
+        tmp_ps.data = ps->data + start_idx * ps->dims;
+        tmp_ps.dims = ps->dims;
+        tmp_ps.lb_box = (float_t*)malloc(ctx -> dims * sizeof(float_t));
+        tmp_ps.ub_box = (float_t*)malloc(ctx -> dims * sizeof(float_t));
+
+        compute_bounding_box_pointset(ctx, &tmp_ps);
+
+        /*
+        MPI_DB_PRINT("[MASTER] searching for %lf of the bin considered\n",ff);
+        */
+
+        // DB_PRINT("%lu\n",tmp_ps.n_points );
+        MPI_Barrier(ctx->mpi_communicator);
+        compute_pure_global_binning(ctx, &tmp_ps, tmp_global_bins, k_global, d);
+
+        /* sum to global bins */
+        // for(int i = 0; i < k_global; ++i) tmp_global_bins[i] +=
+        // starting_cumulative;
+
+        best_guess = retrieve_guess_pure(ctx, &tmp_ps, tmp_global_bins, k_global, d, ff);
+
+        best_guess.ep = check_pc_pointset_parallel(ctx, ps, best_guess, d, f);
+        // ep = check_pc_pointset_parallel(ctx, &tmp_ps, best_guess, d, f);
+
+        bin_w = (tmp_ps.ub_box[d] - tmp_ps.lb_box[d]) / k_global;
+        bin_lb = tmp_ps.lb_box[d] + (bin_w * (best_guess.bin_idx));
+        bin_ub = tmp_ps.lb_box[d] + (bin_w * (best_guess.bin_idx + 1));
+
+        for (int i = 0; i < best_guess.bin_idx; ++i) starting_cumulative += tmp_global_bins[i];
+
+        // free(tmp_ps.data);
+        free(tmp_ps.lb_box);
+        free(tmp_ps.ub_box);
+    }
+
+    /*
+    MPI_DB_PRINT("SUCCESS!!! \n");
+    */
+
+    free(tmp_global_bins);
+
+    return best_guess;
 }
 
 void init_queue(partition_queue_t *pq) 
 {
-	pq->count = 0;
-	pq->_capacity = 1000;
-	pq->data = (partition_t *)malloc(pq->_capacity * sizeof(partition_t));
+    pq->count = 0;
+    pq->_capacity = 1000;
+    pq->data = (partition_t *)malloc(pq->_capacity * sizeof(partition_t));
 }
 
 void free_queue(partition_queue_t *pq) { free(pq->data); }
 
 void get_pointset_from_partition(pointset_t *ps, partition_t *part) 
 {
-	ps->n_points = part->n_points;
-	ps->data 	 = part->base_ptr;
-	ps->n_points = part->n_points;
+    ps->n_points = part->n_points;
+    ps->data     = part->base_ptr;
 }
 
 guess_t compute_median_pure_binning(global_context_t *ctx, pointset_t *ps, float_t fraction, int selected_dim, int n_bins, float_t tolerance)
 {
-	int best_bin_idx;
-	float_t ep;
-
-	uint64_t *global_bin_counts_int = (uint64_t *)malloc(n_bins * sizeof(uint64_t));
-
-	compute_bounding_box_pointset(ctx, ps);
-	compute_pure_global_binning(ctx, ps, global_bin_counts_int, n_bins, selected_dim);
-	guess_t g = retrieve_guess_pure(ctx, ps, global_bin_counts_int, n_bins, selected_dim, fraction);
-	// check_pc_pointset(ctx, ps, best_guess, d, f);
-	g.ep = check_pc_pointset_parallel(ctx, ps, g, selected_dim, fraction);
-	g = refine_pure_binning(ctx, ps, g, global_bin_counts_int, n_bins, selected_dim, fraction, tolerance);
-	free(global_bin_counts_int);
-	return g;
+
+    uint64_t *global_bin_counts_int = (uint64_t *)malloc(n_bins * sizeof(uint64_t));
+
+    compute_bounding_box_pointset(ctx, ps);
+    compute_pure_global_binning(ctx, ps, global_bin_counts_int, n_bins, selected_dim);
+    guess_t g = retrieve_guess_pure(ctx, ps, global_bin_counts_int, n_bins, selected_dim, fraction);
+    // check_pc_pointset(ctx, ps, best_guess, d, f);
+    g.ep = check_pc_pointset_parallel(ctx, ps, g, selected_dim, fraction);
+    g = refine_pure_binning(ctx, ps, g, global_bin_counts_int, n_bins, selected_dim, fraction, tolerance);
+    free(global_bin_counts_int);
+    return g;
 }
 
 int compute_n_nodes(int n)
 {
-	if(n == 1) return 1;
-	int nl = n/2;
-	int nr = n - nl;
-	return 1 + compute_n_nodes(nl) + compute_n_nodes(nr);
+    if(n == 1) return 1;
+    int nl = n/2;
+    int nr = n - nl;
+    return 1 + compute_n_nodes(nl) + compute_n_nodes(nr);
 }
 
 void top_tree_init(global_context_t *ctx, top_kdtree_t *tree) 
 {
-	/* we want procs leaves */
-    int l = (int)(ceil(log2((float_t)ctx -> world_size)));	
+    /* we want one leaf per MPI rank */
+    int l = (int)(ceil(log2((float_t)ctx -> world_size)));
     int tree_nodes = (1 << (l + 1)) - 1;
-	//int tree_nodes = compute_n_nodes(ctx -> world_size);	
-	//MPI_DB_PRINT("Tree nodes %d %d %d %d\n", ctx -> world_size,l, tree_nodes, compute_n_nodes(ctx -> world_size));
-	tree->_nodes  	= (top_kdtree_node_t*)malloc(tree_nodes * sizeof(top_kdtree_node_t));
+    //int tree_nodes = compute_n_nodes(ctx -> world_size);
+    //MPI_DB_PRINT("Tree nodes %d %d %d %d\n", ctx -> world_size,l, tree_nodes, compute_n_nodes(ctx -> world_size));
+    tree->_nodes    = (top_kdtree_node_t*)malloc(tree_nodes * sizeof(top_kdtree_node_t));
     for(int i = 0; i < tree_nodes; ++i)
     {
         tree -> _nodes[i].lch = NULL;
@@ -721,275 +721,275 @@ void top_tree_init(global_context_t *ctx, top_kdtree_t *tree)
         tree -> _nodes[i].ub_node_box = NULL;
 
     }
-	tree->_capacity = tree_nodes;
-	tree->dims 		= ctx->dims;
-	tree->count 	= 0;
-	return;
+    tree->_capacity = tree_nodes;
+    tree->dims      = ctx->dims;
+    tree->count     = 0;
+    return;
 }
 
 void top_tree_free(global_context_t *ctx, top_kdtree_t *tree) 
 {
-	for(int i = 0; i < tree -> count; ++i)
-	{
-		if(tree -> _nodes[i].lb_node_box) free(tree -> _nodes[i].lb_node_box);
-		if(tree -> _nodes[i].ub_node_box) free(tree -> _nodes[i].ub_node_box);
-	}
-	free(tree->_nodes);
-	return;
+    for(int i = 0; i < tree -> count; ++i)
+    {
+        if(tree -> _nodes[i].lb_node_box) free(tree -> _nodes[i].lb_node_box);
+        if(tree -> _nodes[i].ub_node_box) free(tree -> _nodes[i].ub_node_box);
+    }
+    free(tree->_nodes);
+    return;
 }
 
 top_kdtree_node_t* top_tree_generate_node(global_context_t* ctx, top_kdtree_t* tree)
 {
-	top_kdtree_node_t* ptr = tree -> _nodes + tree -> count;
-	ptr -> lch = NULL;
-	ptr -> rch = NULL;
+    top_kdtree_node_t* ptr = tree -> _nodes + tree -> count;
+    ptr -> lch = NULL;
+    ptr -> rch = NULL;
     ptr -> parent = NULL;
-	ptr -> lb_node_box = (float_t*)malloc(ctx -> dims * sizeof(float_t));
-	ptr -> ub_node_box = (float_t*)malloc(ctx -> dims * sizeof(float_t));
-	ptr -> owner 	   = -1;
-	ptr -> split_dim   = 0.;
-	++tree -> count;
-	return ptr;
+    ptr -> lb_node_box = (float_t*)malloc(ctx -> dims * sizeof(float_t));
+    ptr -> ub_node_box = (float_t*)malloc(ctx -> dims * sizeof(float_t));
+    ptr -> owner       = -1;
+    ptr -> split_dim   = 0;
+    ++tree -> count;
+    return ptr;
  
 }
 
 void tree_print(global_context_t* ctx, top_kdtree_node_t* root)
 {
-	MPI_DB_PRINT("Node %p: \n\tsplit_dim %d \n\tsplit_val %lf", root, root -> split_dim, root -> split_val);
-	MPI_DB_PRINT("\n\tparent %p", root -> parent);
-	MPI_DB_PRINT("\n\towner  %d", root -> owner);
-	MPI_DB_PRINT("\n\tbox");
-	MPI_DB_PRINT("\n\tlch %p", root -> lch);
-	MPI_DB_PRINT("\n\trch %p\n", root -> rch);
-	for(size_t d = 0; d < ctx -> dims; ++d) MPI_DB_PRINT("\n\t  d%d:[%lf, %lf]",(int)d, root -> lb_node_box[d], root -> ub_node_box[d]); 
-	MPI_DB_PRINT("\n");
-	if(root -> lch) tree_print(ctx, root -> lch);
-	if(root -> rch) tree_print(ctx, root -> rch);
+    MPI_DB_PRINT("Node %p: \n\tsplit_dim %d \n\tsplit_val %lf", root, root -> split_dim, root -> split_val);
+    MPI_DB_PRINT("\n\tparent %p", root -> parent);
+    MPI_DB_PRINT("\n\towner  %d", root -> owner);
+    MPI_DB_PRINT("\n\tbox");
+    MPI_DB_PRINT("\n\tlch %p", root -> lch);
+    MPI_DB_PRINT("\n\trch %p\n", root -> rch);
+    for(size_t d = 0; d < ctx -> dims; ++d) MPI_DB_PRINT("\n\t  d%d:[%lf, %lf]",(int)d, root -> lb_node_box[d], root -> ub_node_box[d]); 
+    MPI_DB_PRINT("\n");
+    if(root -> lch) tree_print(ctx, root -> lch);
+    if(root -> rch) tree_print(ctx, root -> rch);
 }
 void _recursive_nodes_to_file(global_context_t* ctx, FILE* nodes_file, top_kdtree_node_t* root, int level)
 {
-	fprintf(nodes_file, "%d,", level);
-	fprintf(nodes_file, "%d,", root -> owner);
-	fprintf(nodes_file, "%d,", root -> split_dim);
-	fprintf(nodes_file, "%lf,", root -> split_val);
-	for(int i = 0; i < ctx -> dims; ++i)
-	{
-		fprintf(nodes_file,"%lf,",root -> lb_node_box[i]);
-	}
-	for(int i = 0; i < ctx -> dims - 1; ++i)
-	{
-		fprintf(nodes_file,"%lf,",root -> ub_node_box[i]);
-	}
-	fprintf(nodes_file,"%lf\n",root -> ub_node_box[ctx -> dims - 1]);
-	if(root -> lch) _recursive_nodes_to_file(ctx, nodes_file, root -> lch, level + 1);
-	if(root -> rch) _recursive_nodes_to_file(ctx, nodes_file, root -> rch, level + 1);
+    fprintf(nodes_file, "%d,", level);
+    fprintf(nodes_file, "%d,", root -> owner);
+    fprintf(nodes_file, "%d,", root -> split_dim);
+    fprintf(nodes_file, "%lf,", root -> split_val);
+    for(int i = 0; i < ctx -> dims; ++i)
+    {
+        fprintf(nodes_file,"%lf,",root -> lb_node_box[i]);
+    }
+    for(int i = 0; i < ctx -> dims - 1; ++i)
+    {
+        fprintf(nodes_file,"%lf,",root -> ub_node_box[i]);
+    }
+    fprintf(nodes_file,"%lf\n",root -> ub_node_box[ctx -> dims - 1]);
+    if(root -> lch) _recursive_nodes_to_file(ctx, nodes_file, root -> lch, level + 1);
+    if(root -> rch) _recursive_nodes_to_file(ctx, nodes_file, root -> rch, level + 1);
 }
 void write_nodes_to_file( global_context_t* ctx,top_kdtree_t* tree, 
-						const char* nodes_path) 
+                        const char* nodes_path) 
 {
-	FILE* nodes_file  = fopen(nodes_path,"w");
+    FILE* nodes_file  = fopen(nodes_path,"w");
 
-	if(!nodes_file) 
-	{
-		printf("Cannot open hp file\n");
-		return;
-	}
-	_recursive_nodes_to_file(ctx, nodes_file, tree -> root, 0);
-	fclose(nodes_file);
+    if(!nodes_file) 
+    {
+        printf("Cannot open hp file\n");
+        return;
+    }
+    _recursive_nodes_to_file(ctx, nodes_file, tree -> root, 0);
+    fclose(nodes_file);
 
-	
+    
 }
 
 void tree_print_leaves(global_context_t* ctx, top_kdtree_node_t* root)
 {
-	if(root -> owner != -1)
-	{
-		MPI_DB_PRINT("Node %p: \n\tsplit_dim %d \n\tsplit_val %lf", root, root -> split_dim, root -> split_val);
-		MPI_DB_PRINT("\n\tparent %p", root -> parent);
-		MPI_DB_PRINT("\n\towner  %d", root -> owner);
-		MPI_DB_PRINT("\n\tbox");
-		MPI_DB_PRINT("\n\tlch %p", root -> lch);
-		MPI_DB_PRINT("\n\trch %p\n", root -> rch);
-		for(size_t d = 0; d < ctx -> dims; ++d) MPI_DB_PRINT("\n\t  d%d:[%lf, %lf]",(int)d, root -> lb_node_box[d], root -> ub_node_box[d]); 
-		MPI_DB_PRINT("\n");
-	}
-	if(root -> lch) tree_print_leaves(ctx, root -> lch);
-	if(root -> rch) tree_print_leaves(ctx, root -> rch);
+    if(root -> owner != -1)
+    {
+        MPI_DB_PRINT("Node %p: \n\tsplit_dim %d \n\tsplit_val %lf", root, root -> split_dim, root -> split_val);
+        MPI_DB_PRINT("\n\tparent %p", root -> parent);
+        MPI_DB_PRINT("\n\towner  %d", root -> owner);
+        MPI_DB_PRINT("\n\tbox");
+        MPI_DB_PRINT("\n\tlch %p", root -> lch);
+        MPI_DB_PRINT("\n\trch %p\n", root -> rch);
+        for(size_t d = 0; d < ctx -> dims; ++d) MPI_DB_PRINT("\n\t  d%d:[%lf, %lf]",(int)d, root -> lb_node_box[d], root -> ub_node_box[d]); 
+        MPI_DB_PRINT("\n");
+    }
+    if(root -> lch) tree_print_leaves(ctx, root -> lch);
+    if(root -> rch) tree_print_leaves(ctx, root -> rch);
 }
 
 void build_top_kdtree(global_context_t *ctx, pointset_t *og_pointset, top_kdtree_t *tree, int n_bins, float_t tolerance) 
 {
-	size_t tot_n_points = 0;
-	MPI_Allreduce(&(og_pointset->n_points), &tot_n_points, 1, MPI_UINT64_T, MPI_SUM, ctx->mpi_communicator);
+    size_t tot_n_points = 0;
+    MPI_Allreduce(&(og_pointset->n_points), &tot_n_points, 1, MPI_UINT64_T, MPI_SUM, ctx->mpi_communicator);
 
     /*
-	MPI_DB_PRINT("[MASTER] Top tree builder invoked\n");
+    MPI_DB_PRINT("[MASTER] Top tree builder invoked\n");
     */
     MPI_DB_PRINT("\n");
     MPI_DB_PRINT("Building top tree on %lu points with %d processors\n", tot_n_points, ctx->world_size);
     MPI_DB_PRINT("\n");
 
-	size_t current_partition_n_points = tot_n_points;
-	size_t expected_points_per_node = tot_n_points / ctx->world_size;
-
-	/* enqueue the two partitions */
-
-	compute_bounding_box_pointset(ctx, og_pointset);
-
-	partition_queue_t queue;
-	init_queue(&queue);
-
-	int selected_dim = 0;
-	partition_t current_partition = {  .d 		 = selected_dim,
-									   .base_ptr = og_pointset->data,
-									   .n_points = og_pointset->n_points,
-									   .n_procs = ctx->world_size,
-									   .parent  = NULL,
-									   .lr 		= NO_CHILD 
-									};
-
-	enqueue_partition(&queue, current_partition);
-	pointset_t current_pointset;
-	current_pointset.lb_box = (float_t*)malloc(ctx -> dims * sizeof(float_t));
-	current_pointset.ub_box = (float_t*)malloc(ctx -> dims * sizeof(float_t));
-
-	while (queue.count) 
-	{
-		/*dequeue the partition to process */
-		current_partition = dequeue_partition(&queue);
-
-		/* generate e pointset for that partition */
-
-		get_pointset_from_partition(&current_pointset, &current_partition);
-		current_pointset.dims = ctx->dims;
-
-		/*generate a tree node */
-
-		top_kdtree_node_t* current_node  = top_tree_generate_node(ctx, tree);
-		/* insert node */
-		
-		/*
-		MPI_DB_PRINT("Handling partition: \n\tcurrent_node %p, \n\tdim %d, \n\tn_points %d, \n\tstart_proc %d, \n\tn_procs %d, \n\tparent %p\n", 
-				current_node,
-				current_partition.d,
-				current_partition.n_points,
-				current_partition.start_proc,
-				current_partition.n_procs,
-				current_partition.parent);
-		*/
-
-		switch (current_partition.lr) {
-			case TOP_TREE_LCH:
-				if(current_partition.parent)
-				{
-					current_node -> parent 		  = current_partition.parent;
-					current_node -> parent -> lch = current_node;
-					/* compute the box */
-					/*
-					 * left child has lb equal to parent
-					 * ub equal to parent except for the dim of splitting 
-					 */
-					int parent_split_dim = current_node -> parent -> split_dim;
-					float_t parent_hp 	 = current_node -> parent -> split_val;
-
-					memcpy(current_node -> lb_node_box, current_node -> parent -> lb_node_box, ctx -> dims * sizeof(float_t));
-					memcpy(current_node -> ub_node_box, current_node -> parent -> ub_node_box, ctx -> dims * sizeof(float_t));
-
-					current_node -> ub_node_box[parent_split_dim] = parent_hp;
-				}
-				break;
-
-			case TOP_TREE_RCH:
-				if(current_partition.parent)
-				{
-					current_node -> parent 		  = current_partition.parent;
-					current_node -> parent -> rch = current_node;
-
-					int parent_split_dim = current_node -> parent -> split_dim;
-					float_t parent_hp 	 = current_node -> parent -> split_val;
-
-					/*
-					 * right child has ub equal to parent
-					 * lb equal to parent except for the dim of splitting 
-					 */
-
-					memcpy(current_node -> lb_node_box, current_node -> parent -> lb_node_box, ctx -> dims * sizeof(float_t));
-					memcpy(current_node -> ub_node_box, current_node -> parent -> ub_node_box, ctx -> dims * sizeof(float_t));
-
-					current_node -> lb_node_box[parent_split_dim] = parent_hp;
-				}
-				break;
-			case NO_CHILD:
-				{
-					tree -> root = current_node;
-					memcpy(current_node -> lb_node_box, og_pointset -> lb_box, ctx -> dims * sizeof(float_t));
-					memcpy(current_node -> ub_node_box, og_pointset -> ub_box, ctx -> dims * sizeof(float_t));
-				}
-				break;
-		}
-
-		current_node -> split_dim = current_partition.d;
-		current_node -> parent = current_partition.parent;
-		current_node -> lch = NULL;
-		current_node -> rch = NULL;
-
-
-		/* handle partition */
-		if(current_partition.n_procs > 1)
-		{
-			float_t fraction = (current_partition.n_procs / 2) / (float_t)current_partition.n_procs;
-			guess_t g = compute_median_pure_binning(ctx, &current_pointset, fraction, current_partition.d, n_bins, tolerance);
-			int pv = partition_data_around_value(current_pointset.data, ctx->dims, current_partition.d, 0, current_pointset.n_points, g.x_guess);
-
-			current_node -> split_val = g.x_guess;
-
-			size_t points_left = (size_t)pv;
-			size_t points_right = current_partition.n_points - points_left;
-
-			int procs_left = current_partition.n_procs * fraction;
-			int procs_right = current_partition.n_procs - procs_left;
-
-
-			/*
-			MPI_DB_PRINT("Chosing as guess: %lf, seareching for %lf, obtained %lf\n", g.x_guess, fraction, g.ep);
-			MPI_DB_PRINT("-------------------\n\n");
-			*/
-	
-
-
-			int next_dimension = (++selected_dim) % (ctx->dims);
-			partition_t left_partition = {
-				.n_points 	= points_left, 
-				.n_procs 	= procs_left,
-				.start_proc = current_partition.start_proc,
-				.parent 	= current_node,
-				.lr 		= TOP_TREE_LCH,
-				.base_ptr 	= current_pointset.data,
-				.d 			= next_dimension,
-			};
-
-			partition_t right_partition = {
-				.n_points 	= points_right, 
-				.n_procs 	= procs_right,
-				.start_proc = current_partition.start_proc + procs_left,
-				.parent 	= current_node,
-				.lr 		= TOP_TREE_RCH,
-				.base_ptr 	= current_pointset.data + pv * current_pointset.dims,
-				.d 			= next_dimension
-			};
-
-			enqueue_partition(&queue, left_partition);
-			enqueue_partition(&queue, right_partition);
-		}
-		else
-		{
-			current_node -> owner = current_partition.start_proc;
-		}
-	}
-	tree -> root = tree -> _nodes;
+    size_t current_partition_n_points = tot_n_points;
+    size_t expected_points_per_node = tot_n_points / ctx->world_size;
+
+    /* enqueue the initial partition spanning the whole pointset */
+
+    compute_bounding_box_pointset(ctx, og_pointset);
+
+    partition_queue_t queue;
+    init_queue(&queue);
+
+    int selected_dim = 0;
+    partition_t current_partition = { .d        = selected_dim,
+                                      .base_ptr = og_pointset->data,
+                                      .n_points = og_pointset->n_points,
+                                      .n_procs  = ctx->world_size,
+                                      .parent   = NULL,
+                                      .lr       = NO_CHILD };
+
+    enqueue_partition(&queue, current_partition);
+    pointset_t current_pointset;
+    current_pointset.lb_box = (float_t*)malloc(ctx -> dims * sizeof(float_t));
+    current_pointset.ub_box = (float_t*)malloc(ctx -> dims * sizeof(float_t));
+
+    while (queue.count) 
+    {
+        /* dequeue the partition to process */
+        current_partition = dequeue_partition(&queue);
+
+        /* generate a pointset for that partition */
+
+        get_pointset_from_partition(&current_pointset, &current_partition);
+        current_pointset.dims = ctx->dims;
+
+        /* generate a tree node */
+
+        top_kdtree_node_t* current_node  = top_tree_generate_node(ctx, tree);
+        /* insert node */
+        
+        /*
+        MPI_DB_PRINT("Handling partition: \n\tcurrent_node %p, \n\tdim %d, \n\tn_points %d, \n\tstart_proc %d, \n\tn_procs %d, \n\tparent %p\n", 
+                current_node,
+                current_partition.d,
+                current_partition.n_points,
+                current_partition.start_proc,
+                current_partition.n_procs,
+                current_partition.parent);
+        */
+
+        switch (current_partition.lr) {
+            case TOP_TREE_LCH:
+                if(current_partition.parent)
+                {
+                    current_node -> parent        = current_partition.parent;
+                    current_node -> parent -> lch = current_node;
+                    /* compute the box */
+                    /*
+                     * left child has lb equal to parent
+                     * ub equal to parent except for the dim of splitting 
+                     */
+                    int parent_split_dim = current_node -> parent -> split_dim;
+                    float_t parent_hp    = current_node -> parent -> split_val;
+
+                    memcpy(current_node -> lb_node_box, current_node -> parent -> lb_node_box, ctx -> dims * sizeof(float_t));
+                    memcpy(current_node -> ub_node_box, current_node -> parent -> ub_node_box, ctx -> dims * sizeof(float_t));
+
+                    current_node -> ub_node_box[parent_split_dim] = parent_hp;
+                }
+                break;
+
+            case TOP_TREE_RCH:
+                if(current_partition.parent)
+                {
+                    current_node -> parent        = current_partition.parent;
+                    current_node -> parent -> rch = current_node;
+
+                    int parent_split_dim = current_node -> parent -> split_dim;
+                    float_t parent_hp    = current_node -> parent -> split_val;
+
+                    /*
+                     * right child has ub equal to parent
+                     * lb equal to parent except for the dim of splitting 
+                     */
+
+                    memcpy(current_node -> lb_node_box, current_node -> parent -> lb_node_box, ctx -> dims * sizeof(float_t));
+                    memcpy(current_node -> ub_node_box, current_node -> parent -> ub_node_box, ctx -> dims * sizeof(float_t));
+
+                    current_node -> lb_node_box[parent_split_dim] = parent_hp;
+                }
+                break;
+            case NO_CHILD:
+                {
+                    tree -> root = current_node;
+                    memcpy(current_node -> lb_node_box, og_pointset -> lb_box, ctx -> dims * sizeof(float_t));
+                    memcpy(current_node -> ub_node_box, og_pointset -> ub_box, ctx -> dims * sizeof(float_t));
+                }
+                break;
+        }
+
+        current_node -> split_dim = current_partition.d;
+        current_node -> parent = current_partition.parent;
+        current_node -> lch = NULL;
+        current_node -> rch = NULL;
+
+
+        /* handle partition */
+        if(current_partition.n_procs > 1)
+        {
+            float_t fraction = (current_partition.n_procs / 2) / (float_t)current_partition.n_procs;
+            guess_t g = compute_median_pure_binning(ctx, &current_pointset, fraction, current_partition.d, n_bins, tolerance);
+            int pv = partition_data_around_value(current_pointset.data, ctx->dims, current_partition.d, 0, current_pointset.n_points, g.x_guess);
+
+            current_node -> split_val = g.x_guess;
+
+            size_t points_left = (size_t)pv;
+            size_t points_right = current_partition.n_points - points_left;
+
+            /* integer division avoids float truncation surprises in the proc split */
+            int procs_left  = current_partition.n_procs / 2;
+            int procs_right = current_partition.n_procs - procs_left;
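+            /* worked example: n_procs = 5 -> fraction = 2/5 = 0.4, the
+               median search targets the 40th percentile, and the split is
+               2 procs / ~40% of points left, 3 procs / ~60% right */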
+
+
+            /*
+            MPI_DB_PRINT("Chosing as guess: %lf, seareching for %lf, obtained %lf\n", g.x_guess, fraction, g.ep);
+            MPI_DB_PRINT("-------------------\n\n");
+            */
+
+            /* round-robin over dimensions: derive the children's dim from
+               this partition's dim; a shared counter would drift across
+               siblings in BFS order */
+            int next_dimension = (current_partition.d + 1) % (ctx->dims);
+            partition_t left_partition = {
+                .n_points   = points_left,
+                .n_procs    = procs_left,
+                .start_proc = current_partition.start_proc,
+                .parent     = current_node,
+                .lr         = TOP_TREE_LCH,
+                .base_ptr   = current_pointset.data,
+                .d          = next_dimension,
+            };
+
+            partition_t right_partition = {
+                .n_points   = points_right,
+                .n_procs    = procs_right,
+                .start_proc = current_partition.start_proc + procs_left,
+                .parent     = current_node,
+                .lr         = TOP_TREE_RCH,
+                .base_ptr   = current_pointset.data + pv * current_pointset.dims,
+                .d          = next_dimension
+            };
+
+            enqueue_partition(&queue, left_partition);
+            enqueue_partition(&queue, right_partition);
+        }
+        else
+        {
+            current_node -> owner = current_partition.start_proc;
+        }
+    }
+    tree -> root = tree -> _nodes;
 
     #if defined(WRITE_TOP_NODES)
-	MPI_DB_PRINT("Root is %p\n", tree -> root);
+    MPI_DB_PRINT("Root is %p\n", tree -> root);
         if(I_AM_MASTER)
         {
             //tree_print(ctx, tree -> root);
@@ -997,117 +997,117 @@ void build_top_kdtree(global_context_t *ctx, pointset_t *og_pointset, top_kdtree
         }
     #endif
 
-	
-	free(current_pointset.lb_box);
-	free(current_pointset.ub_box);
-	free_queue(&queue);
+    
+    free(current_pointset.lb_box);
+    free(current_pointset.ub_box);
+    free_queue(&queue);
 
 }
 
 int compute_point_owner(global_context_t* ctx, top_kdtree_t* tree, float_t* data)
 {
-	top_kdtree_node_t* current_node = tree -> root;
-	int owner = current_node -> owner;
-	while(owner == -1)
-	{
-		/* compute side */
-		int split_dim = current_node -> split_dim;
-		int side = data[split_dim] > current_node -> split_val;
-		switch (side) 
-		{
-			case TOP_TREE_RCH:
-				{
-					current_node = current_node -> rch;					
-				}
-				break;
-
-			case TOP_TREE_LCH:
-				{
-					current_node = current_node -> lch;					
-				}
-				break;
-			default:
-				break;
-		}
-		owner = current_node -> owner;
-	}
-	return owner;
+    top_kdtree_node_t* current_node = tree -> root;
+    int owner = current_node -> owner;
+    while(owner == -1)
+    {
+        /* compute side */
+        int split_dim = current_node -> split_dim;
+        int side = data[split_dim] > current_node -> split_val;
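+        /* the comparison doubles as a child selector; this assumes
+           TOP_TREE_LCH == 0 and TOP_TREE_RCH == 1 */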
+        switch (side) 
+        {
+            case TOP_TREE_RCH:
+                {
+                    current_node = current_node -> rch;                    
+                }
+                break;
+
+            case TOP_TREE_LCH:
+                {
+                    current_node = current_node -> lch;                    
+                }
+                break;
+            default:
+                break;
+        }
+        owner = current_node -> owner;
+    }
+    return owner;
 }
 
 /* to partition points around owners */
 int partition_data_around_key(int* key, float_t *val, int vec_len, int ref_key , int left, int right) 
 {
-	/*
-	* returns the number of elements less than the pivot
-	*/
-	int store_index = left;
-	int i;
-	/* Move pivot to end */
-	for (i = left; i < right; ++i) 
-	{
-		// if(compare_data_element(array + i*vec_len, array + pivot_index*vec_len, compare_dim ) >= 0){
-		if (key[i] < ref_key) 
-		{
-			swap_data_element(val + store_index * vec_len, val + i * vec_len, vec_len);
-			/* swap keys */
-			int tmp = key[i];
-			key[i] = key[store_index];
-			key[store_index] = tmp;
-			
-			store_index += 1;
-		}
-	}
-	/* Move pivot to its final place */
-	// swap_data_element(array + (store_index)*vec_len , array + right*vec_len,
-	// vec_len);
-
-	return store_index; // maybe, update, it works :)
+    /*
+     * returns the number of elements with key strictly less than ref_key
+     */
+    int store_index = left;
+    int i;
+    /* move every element whose key is < ref_key to the front of the range */
+    for (i = left; i < right; ++i) 
+    {
+        // if(compare_data_element(array + i*vec_len, array + pivot_index*vec_len, compare_dim ) >= 0){
+        if (key[i] < ref_key) 
+        {
+            swap_data_element(val + store_index * vec_len, val + i * vec_len, vec_len);
+            /* swap keys */
+            int tmp = key[i];
+            key[i] = key[store_index];
+            key[store_index] = tmp;
+            
+            store_index += 1;
+        }
+    }
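+    /* worked example: key = {2, 0, 1, 0}, ref_key = 1 -> keys end up as
+       {0, 0, 1, 2} (values swapped alongside) and store_index = 2, the
+       number of keys < 1 */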
+
+    return store_index;
 }
 
 
 
 void exchange_points(global_context_t* ctx, top_kdtree_t* tree)
 {
-	int* points_per_proc = (int*)malloc(ctx -> world_size * sizeof(int));	
-	int* points_owners 	 = (int*)malloc(ctx -> dims * ctx -> local_n_points * sizeof(float_t));
-	int* partition_offset = (int*)malloc(ctx -> world_size * sizeof(int));	
+    int* points_per_proc  = (int*)malloc(ctx -> world_size * sizeof(int));
+    /* one owner per local point (was over-allocated as dims * n * sizeof(float_t)) */
+    int* points_owners    = (int*)malloc(ctx -> local_n_points * sizeof(int));
+    int* partition_offset = (int*)malloc(ctx -> world_size * sizeof(int));
 
-	/* compute owner */
+    /* compute owner */
     #pragma omp parallel for
-	for(size_t i = 0; i < ctx -> local_n_points; ++i)
-	{
-		/* tree walk */
-		points_owners[i] = compute_point_owner(ctx, tree, ctx -> local_data + (i * ctx -> dims));
-	}
-		
-	
-	int last_idx = 0;
-	int len 	 = ctx -> local_n_points;
-	float_t* curr_data = ctx -> local_data;
-
-	partition_offset[0] = 0;
-	for(int owner = 1; owner < ctx -> world_size; ++owner)
-	{
-		last_idx = partition_data_around_key(points_owners, ctx -> local_data, ctx -> dims, owner, last_idx, ctx -> local_n_points);	
-		partition_offset[owner] = last_idx;
-		points_per_proc[owner - 1] = last_idx;
-	}
-
-	points_per_proc[ctx -> world_size - 1] = ctx -> local_n_points;
-	
-	
-	for(int i = ctx -> world_size - 1; i > 0; --i)
-	{
-		points_per_proc[i] = points_per_proc[i] - points_per_proc[i - 1];
-	}
-
-	int* rcv_count      = (int*)malloc(ctx -> world_size * sizeof(int));
-	int* rcv_displs     = (int*)malloc(ctx -> world_size * sizeof(int));
+    for(size_t i = 0; i < ctx -> local_n_points; ++i)
+    {
+        /* tree walk */
+        points_owners[i] = compute_point_owner(ctx, tree, ctx -> local_data + (i * ctx -> dims));
+    }
+
+    int last_idx = 0;
+
+    partition_offset[0] = 0;
+    for(int owner = 1; owner < ctx -> world_size; ++owner)
+    {
+        last_idx = partition_data_around_key(points_owners, ctx -> local_data, ctx -> dims, owner, last_idx, ctx -> local_n_points);    
+        partition_offset[owner] = last_idx;
+        points_per_proc[owner - 1] = last_idx;
+    }
+
+    points_per_proc[ctx -> world_size - 1] = ctx -> local_n_points;
+
+    for(int i = ctx -> world_size - 1; i > 0; --i)
+    {
+        points_per_proc[i] = points_per_proc[i] - points_per_proc[i - 1];
+    }
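+    /* partition_offset now holds the start of each owner's slice and
+       points_per_proc the per-owner counts; e.g. prefix ends {4, 9, 12}
+       over 12 local points become counts {4, 5, 3} */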
+
+    int* rcv_count      = (int*)malloc(ctx -> world_size * sizeof(int));
+    int* rcv_displs     = (int*)malloc(ctx -> world_size * sizeof(int));
     int* send_displs    = (int*)malloc(ctx -> world_size * sizeof(int)); 
     int* send_count     = points_per_proc;
 
-	float_t* rcvbuffer = NULL;
-	int tot_count = 0;
+    float_t* rcvbuffer = NULL;
+    int tot_count = 0;
 
     //[-8.33416939 -8.22858047]
     
@@ -1143,70 +1143,70 @@ void exchange_points(global_context_t* ctx, top_kdtree_t* tree)
                     rcvbuffer, rcv_count, rcv_displs, MPI_MY_FLOAT, 
                     ctx -> mpi_communicator);
 
-	ctx -> local_n_points = tot_count / ctx -> dims; 
+    ctx -> local_n_points = tot_count / ctx -> dims; 
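+    /* tot_count is a number of floats, hence the division by dims */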
     int* ppp = (int*)malloc(ctx -> world_size * sizeof(int));
 
     MPI_Allgather(&(ctx -> local_n_points), 1, MPI_INT, ppp, 1, MPI_INT, ctx -> mpi_communicator);
-	ctx -> idx_start = 0;
-	for(int i = 0; i < ctx -> mpi_rank; ++i)
-	{
-		ctx -> idx_start += ppp[i];
-	}
+    ctx -> idx_start = 0;
+    for(int i = 0; i < ctx -> mpi_rank; ++i)
+    {
+        ctx -> idx_start += ppp[i];
+    }
     //DB_PRINT("rank %d start %lu\n", ctx -> mpi_rank, ctx -> idx_start);
 
-	/* free prv pointer */
+    /* free the previous local buffer and adopt the received one */
     free(ppp);
-	free(ctx -> local_data);
-	ctx -> local_data = rcvbuffer;
+    free(ctx -> local_data);
+    ctx -> local_data = rcvbuffer;
 
-	/* check exchange */
+    /* check exchange */
     
-	for(size_t i = 0; i < ctx -> local_n_points; ++i)
-	{
-		int o = compute_point_owner(ctx, tree, ctx -> local_data + (i * ctx -> dims));
-		if(o != ctx -> mpi_rank) DB_PRINT("rank %d got an error\n",ctx -> mpi_rank);
-	}
-
-	free(points_owners);
-	free(points_per_proc);
-	free(partition_offset);
-	free(rcv_count);
-	free(rcv_displs);
+    for(size_t i = 0; i < ctx -> local_n_points; ++i)
+    {
+        int o = compute_point_owner(ctx, tree, ctx -> local_data + (i * ctx -> dims));
+        if(o != ctx -> mpi_rank) DB_PRINT("rank %d got an error\n",ctx -> mpi_rank);
+    }
+
+    free(points_owners);
+    free(points_per_proc);
+    free(partition_offset);
+    free(rcv_count);
+    free(rcv_displs);
     free(send_displs);
 }
 
 static inline size_t local_to_global_idx(global_context_t* ctx, size_t local_idx)
 {
-	return local_idx + ctx -> idx_start; 
+    return local_idx + ctx -> idx_start; 
 }
 
 void translate_tree_idx_to_global(global_context_t* ctx, kdtree_v2* local_tree) 
 {
-	for(size_t i = 0; i < ctx -> local_n_points; ++i)		
-	{
-		local_tree -> _nodes[i].array_idx = local_to_global_idx(ctx, local_tree -> _nodes[i].array_idx); 
-	}
+    for(size_t i = 0; i < ctx -> local_n_points; ++i)        
+    {
+        local_tree -> _nodes[i].array_idx = local_to_global_idx(ctx, local_tree -> _nodes[i].array_idx); 
+    }
 }
 
 heap ngbh_search_kdtree(global_context_t* ctx, kdtree_v2* local_tree, float_t* data, int k)
 {
-	data_dims = ctx -> dims;
-	return knn_kdtree_v2(data, local_tree -> root, k);
+    data_dims = ctx -> dims;
+    return knn_kdtree_v2(data, local_tree -> root, k);
 }
 
 void tree_walk(
-		global_context_t* ctx, 
-		top_kdtree_node_t* root, 
-		int point_idx,
-		float_t max_dist,
-		float_t* point,
-		float_t** data_to_send_per_proc, 
-		int** local_idx_of_the_point, 
-		int* point_to_send_count, 
-		int* point_to_send_capacity)
+        global_context_t* ctx, 
+        top_kdtree_node_t* root, 
+        int point_idx,
+        float_t max_dist,
+        float_t* point,
+        float_t** data_to_send_per_proc, 
+        int** local_idx_of_the_point, 
+        int* point_to_send_count, 
+        int* point_to_send_capacity)
 {
-	if(root -> owner != -1 && root -> owner != ctx -> mpi_rank)
-	{
+    if(root -> owner != -1 && root -> owner != ctx -> mpi_rank)
+    {
         
         #pragma omp critical
         {
@@ -1235,76 +1235,76 @@ void tree_walk(
             point_to_send_count[owner]++;
         }
 
-	}
-	else
-	{
-		/* tree walk */
-		int split_var = root -> split_dim;
-		float_t hp_distance = point[split_var] - root -> split_val;
-		__builtin_prefetch(root -> lch, 0, 3);
-		__builtin_prefetch(root -> rch, 0, 3);
-
-		int side = hp_distance > 0.f;
-
-		switch (side)
-		{
-			case TOP_TREE_LCH:
-				if(root -> lch)
-				{
-					/* walk on the left */
-					tree_walk(ctx, root -> lch, point_idx, max_dist, point, 
-							data_to_send_per_proc, local_idx_of_the_point, 
-							point_to_send_count, point_to_send_capacity);
-				}
-				break;
-			
-			case TOP_TREE_RCH:
-				if(root -> rch)
-				{
-					/* walk on the right */
-					tree_walk(ctx, root -> rch, point_idx, max_dist, point, 
-							data_to_send_per_proc, local_idx_of_the_point, 
-							point_to_send_count, point_to_send_capacity);
-				}
-				break;
-
-			default:
-				break;
-		}
-
-		int c   = max_dist > (hp_distance * hp_distance);
-
-		//if(c || (H -> count) < (H -> N))
-		if(c)
-		{
-
-			switch (side)
-			{
-				case HP_LEFT_SIDE:
-					if(root -> rch) 
-					{
-						/* walk on the right */
-						tree_walk(ctx, root -> rch, point_idx, max_dist, point, 
-								data_to_send_per_proc, local_idx_of_the_point, 
-								point_to_send_count, point_to_send_capacity);
-					}
-					break;
-				
-				case HP_RIGHT_SIDE:
-					if(root -> lch) 
-					{
-						/* walk on the left */
-						tree_walk(ctx, root -> lch, point_idx, max_dist, point, 
-								data_to_send_per_proc, local_idx_of_the_point, 
-								point_to_send_count, point_to_send_capacity);
-					}
-					break;
-
-				default:
-					break;
-			}
-		}
-	}
+    }
+    else
+    {
+        /* tree walk */
+        int split_var = root -> split_dim;
+        float_t hp_distance = point[split_var] - root -> split_val;
+        __builtin_prefetch(root -> lch, 0, 3);
+        __builtin_prefetch(root -> rch, 0, 3);
+
+        int side = hp_distance > 0.f;
+
+        switch (side)
+        {
+            case TOP_TREE_LCH:
+                if(root -> lch)
+                {
+                    /* walk on the left */
+                    tree_walk(ctx, root -> lch, point_idx, max_dist, point, 
+                            data_to_send_per_proc, local_idx_of_the_point, 
+                            point_to_send_count, point_to_send_capacity);
+                }
+                break;
+            
+            case TOP_TREE_RCH:
+                if(root -> rch)
+                {
+                    /* walk on the right */
+                    tree_walk(ctx, root -> rch, point_idx, max_dist, point, 
+                            data_to_send_per_proc, local_idx_of_the_point, 
+                            point_to_send_count, point_to_send_capacity);
+                }
+                break;
+
+            default:
+                break;
+        }
+
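+        /* prune check: max_dist is presumably the squared radius of the
+           current k-th neighbour (heap root), so it is compared against
+           the squared distance to the splitting hyperplane */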
+        int c   = max_dist > (hp_distance * hp_distance);
+
+        //if(c || (H -> count) < (H -> N))
+        if(c)
+        {
+
+            switch (side)
+            {
+                case HP_LEFT_SIDE:
+                    if(root -> rch) 
+                    {
+                        /* walk on the right */
+                        tree_walk(ctx, root -> rch, point_idx, max_dist, point, 
+                                data_to_send_per_proc, local_idx_of_the_point, 
+                                point_to_send_count, point_to_send_capacity);
+                    }
+                    break;
+                
+                case HP_RIGHT_SIDE:
+                    if(root -> lch) 
+                    {
+                        /* walk on the left */
+                        tree_walk(ctx, root -> lch, point_idx, max_dist, point, 
+                                data_to_send_per_proc, local_idx_of_the_point, 
+                                point_to_send_count, point_to_send_capacity);
+                    }
+                    break;
+
+                default:
+                    break;
+            }
+        }
+    }
 
 }
 
@@ -1347,7 +1347,7 @@ void print_diagnositcs(global_context_t* ctx, int k)
 
 void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtree_t* top_tree, kdtree_v2* local_tree, float_t* data, int k)
 {
-	/* local search */
+    /* local search */
     /* print diagnostics */
     print_diagnositcs(ctx, k);
     
@@ -1357,45 +1357,45 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre
     TIME_START;
     MPI_Barrier(ctx -> mpi_communicator);
     #pragma omp parallel for
-	for(int p = 0; p < ctx -> local_n_points; ++p)
-	{
-		idx_t idx = local_tree -> _nodes[p].array_idx;
-		/* actually we want to preserve the heap to then insert guesses from other nodes */
-		dp_info[idx].ngbh = knn_kdtree_v2_no_heapsort(local_tree -> _nodes[p].data, local_tree -> root, k);
+    for(int p = 0; p < ctx -> local_n_points; ++p)
+    {
+        idx_t idx = local_tree -> _nodes[p].array_idx;
+        /* keep the heap unsorted so that guesses from other ranks can be merged in later */
+        dp_info[idx].ngbh = knn_kdtree_v2_no_heapsort(local_tree -> _nodes[p].data, local_tree -> root, k);
         convert_heap_idx_to_global(ctx, &(dp_info[idx].ngbh));
-		dp_info[idx].cluster_idx = -1;
-		dp_info[idx].is_center = 0;
-		dp_info[idx].array_idx = idx;
-	}
+        dp_info[idx].cluster_idx = -1;
+        dp_info[idx].is_center = 0;
+        dp_info[idx].array_idx = idx;
+    }
     elapsed_time = TIME_STOP;
     LOG_WRITE("Local neighborhood search", elapsed_time);
 
 
     TIME_START;
-	/* find if a points needs a refine on the global tree */
-	float_t** data_to_send_per_proc  	= (float_t**)malloc(ctx -> world_size * sizeof(float_t*));
-	int** 	  local_idx_of_the_point 	= (int**)malloc(ctx -> world_size * sizeof(int*));
-	int* 	  point_to_snd_count       = (int*)malloc(ctx -> world_size * sizeof(int));
-	int* 	  point_to_snd_capacity    = (int*)malloc(ctx -> world_size * sizeof(int));
-
-	for(int i = 0; i < ctx -> world_size; ++i)
-	{
-		//data_to_send_per_proc[i]  = (float_t*)malloc(100 * (1 + ctx -> dims) * sizeof(float_t));	
-		data_to_send_per_proc[i]  = (float_t*)malloc(100 * (ctx -> dims) * sizeof(float_t));	
-		local_idx_of_the_point[i] = (int*)malloc(100 * sizeof(int));	
-		point_to_snd_capacity[i] = 100;
-		point_to_snd_count[i]    = 0;
-	}
-
-	/* for each point walk the tree and find to which proc send data */
-	/* actually compute intersection of ngbh radius of each point to node box */
+    /* find which points need a refinement on the global tree */
+    float_t** data_to_send_per_proc  = (float_t**)malloc(ctx -> world_size * sizeof(float_t*));
+    int**     local_idx_of_the_point = (int**)malloc(ctx -> world_size * sizeof(int*));
+    int*      point_to_snd_count     = (int*)malloc(ctx -> world_size * sizeof(int));
+    int*      point_to_snd_capacity  = (int*)malloc(ctx -> world_size * sizeof(int));
+
+    for(int i = 0; i < ctx -> world_size; ++i)
+    {
+        //data_to_send_per_proc[i]  = (float_t*)malloc(100 * (1 + ctx -> dims) * sizeof(float_t));    
+        data_to_send_per_proc[i]  = (float_t*)malloc(100 * (ctx -> dims) * sizeof(float_t));    
+        local_idx_of_the_point[i] = (int*)malloc(100 * sizeof(int));    
+        point_to_snd_capacity[i] = 100;
+        point_to_snd_count[i]    = 0;
+    }
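+    /* each destination starts with room for 100 candidate points; the tree
+       walk presumably grows these buffers when a capacity is exceeded */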
+
+    /* for each point, walk the tree to find which procs its data must be sent to,
+       i.e. intersect each point's ngbh radius with the leaf node boxes */
 
     #pragma omp parallel for
-	for(int i = 0; i < ctx -> local_n_points; ++i)
-	{
+    for(int i = 0; i < ctx -> local_n_points; ++i)
+    {
         /*
-		MPI_DB_PRINT("%lu\n",dp_info[i].array_idx);
-		if(i > 10) break;
+        MPI_DB_PRINT("%lu\n",dp_info[i].array_idx);
+        if(i > 10) break;
         */
         float_t max_dist = dp_info[i].ngbh.data[0].value;
         float_t* point   = ctx -> local_data + (i * ctx -> dims);
@@ -1403,7 +1403,7 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre
         tree_walk(ctx, top_tree -> root, i, max_dist, 
                   point, data_to_send_per_proc, local_idx_of_the_point, 
                   point_to_snd_count, point_to_snd_capacity);
-	}
+    }
 
     elapsed_time = TIME_STOP;
     LOG_WRITE("Finding points to refine", elapsed_time);
@@ -1466,8 +1466,8 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre
         rcv_work_batches[i]       = __rcv_points + rcv_displ[i];
     }
 
-	MPI_Status status;
-	int flag;
+    MPI_Status status;
+    int flag;
     /* prepare heap batches */
 
     //int work_batch_stride = 1 + ctx -> dims;
@@ -1607,8 +1607,8 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre
 
     int req_idx = 0;
 
-	for(int i = 0; i < ctx -> world_size; ++i)
-	{
+    for(int i = 0; i < ctx -> world_size; ++i)
+    {
         int count = 0;
         if(ngbh_to_send[i] > 0)
         {
@@ -1623,27 +1623,27 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre
                 ++req_idx;
             }
         }
-	}
+    }
     
-	MPI_Barrier(ctx -> mpi_communicator);
-	MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, ctx -> mpi_communicator, &flag, &status);
+    MPI_Barrier(ctx -> mpi_communicator);
+    MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, ctx -> mpi_communicator, &flag, &status);
     //DB_PRINT("%d %p %p\n",ctx -> mpi_rank, &flag, &status);
     //HERE
-	while(flag)
-	{
-		MPI_Request request;
+    while(flag)
+    {
+        MPI_Request request;
         int count; 
         int source = status.MPI_SOURCE;
         MPI_Get_count(&status, MPI_my_heap, &count);
         /* receive each slice */
 
-		MPI_Recv(rcv_heap_batches[source] + k * already_rcvd_points[source], 
+        MPI_Recv(rcv_heap_batches[source] + k * already_rcvd_points[source], 
                 count, MPI_my_heap, source, MPI_ANY_TAG, ctx -> mpi_communicator, &status);
 
         already_rcvd_points[source] += count;
-		MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, ctx -> mpi_communicator, &flag, &status);
+        MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, ctx -> mpi_communicator, &flag, &status);
 
-	}
+    }
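+    /* caveat: MPI_Barrier does not order point-to-point traffic, so an
+       Iprobe loop entered right after it may in principle miss messages
+       still in flight; this presumably relies on all sends having
+       completed locally before the barrier */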
     MPI_Barrier(ctx -> mpi_communicator);
 
 
@@ -1663,11 +1663,11 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre
 
     /* merge old with new heaps */
 
-	MPI_Barrier(ctx -> mpi_communicator);
+    MPI_Barrier(ctx -> mpi_communicator);
 
     TIME_START;
 
-	for(int i = 0; i < ctx -> world_size; ++i)
+    for(int i = 0; i < ctx -> world_size; ++i)
     {
         #pragma omp parallel for
         for(int b = 0; b < ngbh_to_recv[i]; ++b)
@@ -1722,11 +1722,11 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre
 
 
     
-	for(int i = 0; i < ctx -> world_size; ++i)
-	{
-		if(data_to_send_per_proc[i])  free(data_to_send_per_proc[i]);
-		if(local_idx_of_the_point[i]) free(local_idx_of_the_point[i]);
-	}
+    for(int i = 0; i < ctx -> world_size; ++i)
+    {
+        if(data_to_send_per_proc[i])  free(data_to_send_per_proc[i]);
+        if(local_idx_of_the_point[i]) free(local_idx_of_the_point[i]);
+    }
 
     for(int i = 0; i < ctx -> local_n_points; ++i)
     {
@@ -1739,8 +1739,8 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre
     free(rcv_heap_batches);
     free(rcv_work_batches);
     free(point_to_rcv_count);
-	free(point_to_snd_count);
-	free(point_to_snd_capacity);
+    free(point_to_snd_count);
+    free(point_to_snd_capacity);
 
     free(rcv_count);
     free(snd_count);
@@ -1755,35 +1755,35 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre
 
 void test_the_idea(global_context_t* ctx)
 {
-	int* num = (int*)malloc(ctx -> world_size * sizeof(int));
-	for(int i = 0; i < ctx -> world_size; ++i) num[i] = i;
-
-	for(int i = 0; i < ctx -> world_size; ++i)
-	{
-		MPI_Request request;
-		MPI_Isend(num + i, 1, MPI_INT, i, 0, ctx -> mpi_communicator, &request);
-	}
-	MPI_Barrier(ctx -> mpi_communicator);
-	MPI_Status status;
-	int flag;
-	int cc = 0;
-	MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_SOURCE, ctx -> mpi_communicator, &flag, &status);
-	while(flag)
-	{
-		cc++;
-		MPI_Request request;
-		MPI_Recv(num + status.MPI_SOURCE, 1, MPI_INT, status.MPI_SOURCE, MPI_ANY_TAG, ctx -> mpi_communicator, &status);
-		MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_SOURCE, ctx -> mpi_communicator, &flag, &status);
-
-	}
-	MPI_DB_PRINT("Recieved %d msgs\n",cc);
-	free(num);
+    int* num = (int*)malloc(ctx -> world_size * sizeof(int));
+    for(int i = 0; i < ctx -> world_size; ++i) num[i] = i;
+
+    for(int i = 0; i < ctx -> world_size; ++i)
+    {
+        MPI_Request request;
+        MPI_Isend(num + i, 1, MPI_INT, i, 0, ctx -> mpi_communicator, &request);
+    }
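+    /* note: these MPI_Isend requests are never completed; strictly they
+       should be finished with MPI_Waitall or MPI_Request_free */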
+    MPI_Barrier(ctx -> mpi_communicator);
+    MPI_Status status;
+    int flag;
+    int cc = 0;
+    /* MPI_ANY_TAG, not MPI_ANY_SOURCE, belongs in the tag slot */
+    MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, ctx -> mpi_communicator, &flag, &status);
+    while(flag)
+    {
+        cc++;
+        MPI_Recv(num + status.MPI_SOURCE, 1, MPI_INT, status.MPI_SOURCE, MPI_ANY_TAG, ctx -> mpi_communicator, &status);
+        MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, ctx -> mpi_communicator, &flag, &status);
+
+    }
+    MPI_DB_PRINT("Recieved %d msgs\n",cc);
+    free(num);
 
 }
 
 void build_local_tree(global_context_t* ctx, kdtree_v2* local_tree)
 {
-	local_tree -> root = build_tree_kdtree_v2(local_tree -> _nodes, local_tree -> n_nodes, ctx -> dims);
+    local_tree -> root = build_tree_kdtree_v2(local_tree -> _nodes, local_tree -> n_nodes, ctx -> dims);
 }
 
 void ordered_data_to_file(global_context_t* ctx)
@@ -1826,116 +1826,118 @@ void ordered_data_to_file(global_context_t* ctx)
 
     }
     MPI_Barrier(ctx -> mpi_communicator);
 }
 
 
+
 void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx) 
 {
-	float_t *data;
+    float_t *data = NULL; /* allocated on rank 0 only; NULL elsewhere for MPI_Scatterv */
     TIME_DEF
     double elapsed_time;
 
-	if (ctx->mpi_rank == 0) 
-	{
-		//data = read_data_file(ctx, "../norm_data/50_blobs_more_var.npy", MY_TRUE);
+    if (ctx->mpi_rank == 0) 
+    {
+        //data = read_data_file(ctx, "../norm_data/50_blobs_more_var.npy", MY_TRUE);
         //ctx->dims = 2;
-		//data = read_data_file(ctx, "../norm_data/50_blobs.npy", MY_TRUE);
-		// std_g0163178_Me14_091_0000
+        //data = read_data_file(ctx, "../norm_data/50_blobs.npy", MY_TRUE);
+        // std_g0163178_Me14_091_0000
     
         // 190M points
         // std_g2980844_091_0000
-		data = read_data_file(ctx,"../norm_data/std_g2980844_091_0000",MY_TRUE);
+        data = read_data_file(ctx,"../norm_data/std_g2980844_091_0000",MY_TRUE);
         
         /* 1M points ca.*/
-		// data = read_data_file(ctx,"../norm_data/std_LR_091_0001",MY_TRUE);
+        // data = read_data_file(ctx,"../norm_data/std_LR_091_0001",MY_TRUE);
 
         /* 8M points */
         
         //data = read_data_file(ctx,"../norm_data/std_g0144846_Me14_091_0001",MY_TRUE);
 
         //88M 
-		//data = read_data_file(ctx,"../norm_data/std_g5503149_091_0000",MY_TRUE);
+        //data = read_data_file(ctx,"../norm_data/std_g5503149_091_0000",MY_TRUE);
 
         //
         //34 M
-		// data = read_data_file(ctx,"../norm_data/std_g1212639_091_0001",MY_TRUE);
+        // data = read_data_file(ctx,"../norm_data/std_g1212639_091_0001",MY_TRUE);
         ctx->dims = 5;
 
-		//ctx -> n_points = 48*5*2000;
-		ctx->n_points = ctx->n_points / ctx->dims;
-		ctx->n_points = (ctx->n_points * 10) / 10;
+        //ctx -> n_points = 48*5*2000;
+        ctx->n_points = ctx->n_points / ctx->dims;
+        ctx->n_points = (ctx->n_points * 6) / 10;
         // ctx -> n_points = ctx -> world_size * 1000;
 
         //ctx -> n_points = 10000000 * ctx -> world_size;
         //generate_random_matrix(&data, ctx -> dims, ctx -> n_points, ctx);
-		//mpi_printf(ctx, "Read %lu points in %u dims\n", ctx->n_points, ctx->dims);
-	}
+        //mpi_printf(ctx, "Read %lu points in %u dims\n", ctx->n_points, ctx->dims);
+    }
     //MPI_DB_PRINT("[MASTER] Reading file and scattering\n");
-	
-	/* communicate the total number of points*/
-	MPI_Bcast(&(ctx->dims), 1, MPI_UINT32_T, 0, ctx->mpi_communicator);
-	MPI_Bcast(&(ctx->n_points), 1, MPI_UINT64_T, 0, ctx->mpi_communicator);
+    
+    /* communicate the total number of points */
+    MPI_Bcast(&(ctx->dims), 1, MPI_UINT32_T, 0, ctx->mpi_communicator);
+    MPI_Bcast(&(ctx->n_points), 1, MPI_UINT64_T, 0, ctx->mpi_communicator);
 
-	/* compute the number of elements to recieve for each processor */
-	int *send_counts = (int *)malloc(ctx->world_size * sizeof(int));
-	int *displacements = (int *)malloc(ctx->world_size * sizeof(int));
+    /* compute the number of elements to receive for each processor */
+    int *send_counts = (int *)malloc(ctx->world_size * sizeof(int));
+    int *displacements = (int *)malloc(ctx->world_size * sizeof(int));
 
-	displacements[0] = 0;
-	send_counts[0] = ctx->n_points / ctx->world_size;
-	send_counts[0] += (ctx->n_points % ctx->world_size) > 0 ? 1 : 0;
-	send_counts[0] = send_counts[0] * ctx->dims;
+    displacements[0] = 0;
+    send_counts[0] = ctx->n_points / ctx->world_size;
+    send_counts[0] += (ctx->n_points % ctx->world_size) > 0 ? 1 : 0;
+    send_counts[0] = send_counts[0] * ctx->dims;
 
-	for (int p = 1; p < ctx->world_size; ++p) 
-	{
-		send_counts[p] = (ctx->n_points / ctx->world_size);
-		send_counts[p] += (ctx->n_points % ctx->world_size) > p ? 1 : 0;
-		send_counts[p] = send_counts[p] * ctx->dims;
-		displacements[p] = displacements[p - 1] + send_counts[p - 1];
-	}
+    for (int p = 1; p < ctx->world_size; ++p) 
+    {
+        send_counts[p] = (ctx->n_points / ctx->world_size);
+        send_counts[p] += (ctx->n_points % ctx->world_size) > p ? 1 : 0;
+        send_counts[p] = send_counts[p] * ctx->dims;
+        displacements[p] = displacements[p - 1] + send_counts[p - 1];
+    }
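+    /* worked example: n_points = 10, world_size = 3, dims = 2 -> base is
+       3 points per rank, the remainder 1 goes to rank 0, so send_counts =
+       {8, 6, 6} floats and displacements = {0, 8, 14} */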
 
 
-	ctx->local_n_points = send_counts[ctx->mpi_rank] / ctx->dims;
+    ctx->local_n_points = send_counts[ctx->mpi_rank] / ctx->dims;
 
-	float_t *pvt_data = (float_t *)malloc(send_counts[ctx->mpi_rank] * sizeof(float_t));
+    float_t *pvt_data = (float_t *)malloc(send_counts[ctx->mpi_rank] * sizeof(float_t));
 
-	MPI_Scatterv(data, send_counts, displacements, MPI_MY_FLOAT, pvt_data, send_counts[ctx->mpi_rank], MPI_MY_FLOAT, 0, ctx->mpi_communicator);
+    MPI_Scatterv(data, send_counts, displacements, MPI_MY_FLOAT, pvt_data, send_counts[ctx->mpi_rank], MPI_MY_FLOAT, 0, ctx->mpi_communicator);
 
-	ctx->local_data = pvt_data;
+    ctx->local_data = pvt_data;
 
-	int k_local = 20;
-	int k_global = 20;
+    int k_local = 20;
+    int k_global = 20;
 
-	uint64_t *global_bin_counts_int = (uint64_t *)malloc(k_global * sizeof(uint64_t));
+    uint64_t *global_bin_counts_int = (uint64_t *)malloc(k_global * sizeof(uint64_t));
 
-	pointset_t original_ps;
-	original_ps.data = ctx->local_data;
-	original_ps.dims = ctx->dims;
-	original_ps.n_points = ctx->local_n_points;
-	original_ps.lb_box = (float_t*)malloc(ctx -> dims * sizeof(float_t));
-	original_ps.ub_box = (float_t*)malloc(ctx -> dims * sizeof(float_t));
+    pointset_t original_ps;
+    original_ps.data = ctx->local_data;
+    original_ps.dims = ctx->dims;
+    original_ps.n_points = ctx->local_n_points;
+    original_ps.lb_box = (float_t*)malloc(ctx -> dims * sizeof(float_t));
+    original_ps.ub_box = (float_t*)malloc(ctx -> dims * sizeof(float_t));
 
-	float_t tol = 0.002;
+    float_t tol = 0.002;
 
-	top_kdtree_t tree;
+    top_kdtree_t tree;
     TIME_START;
-	top_tree_init(ctx, &tree);
+    top_tree_init(ctx, &tree);
     elapsed_time = TIME_STOP;
     LOG_WRITE("Initializing global kdtree", elapsed_time);
 
     TIME_START;
-	build_top_kdtree(ctx, &original_ps, &tree, k_global, tol);
-	exchange_points(ctx, &tree);
+    build_top_kdtree(ctx, &original_ps, &tree, k_global, tol);
+    exchange_points(ctx, &tree);
     elapsed_time = TIME_STOP;
     LOG_WRITE("Top kdtree build and domain decomposition", elapsed_time);
-	//test_the_idea(ctx);
+    //test_the_idea(ctx);
 
     TIME_START;
-	kdtree_v2 local_tree;
-	kdtree_v2_init( &local_tree, ctx -> local_data, ctx -> local_n_points, (unsigned int)ctx -> dims);
-	int k = 300;
-	//int k = 30;
+    kdtree_v2 local_tree;
+    kdtree_v2_init( &local_tree, ctx -> local_data, ctx -> local_n_points, (unsigned int)ctx -> dims);
+    int k = 300;
+    //int k = 30;
 
-	datapoint_info_t* dp_info = (datapoint_info_t*)malloc(ctx -> local_n_points * sizeof(datapoint_info_t));			
+    datapoint_info_t* dp_info = (datapoint_info_t*)malloc(ctx -> local_n_points * sizeof(datapoint_info_t));            
     /* initialize, to cope with valgrind */
     for(uint64_t i = 0; i < ctx -> local_n_points; ++i)
     {
@@ -1952,7 +1954,7 @@ void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx)
         dp_info[i].cluster_idx = -1;
     }
 
-	build_local_tree(ctx, &local_tree);
+    build_local_tree(ctx, &local_tree);
     elapsed_time = TIME_STOP;
     LOG_WRITE("Local trees init and build", elapsed_time);
 
@@ -1960,28 +1962,28 @@ void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx)
     MPI_DB_PRINT("----- Performing ngbh search -----\n");
     MPI_Barrier(ctx -> mpi_communicator);
 
-	mpi_ngbh_search(ctx, dp_info, &tree, &local_tree, ctx -> local_data, k);
+    mpi_ngbh_search(ctx, dp_info, &tree, &local_tree, ctx -> local_data, k);
 
     MPI_Barrier(ctx -> mpi_communicator);
     elapsed_time = TIME_STOP;
     LOG_WRITE("Total time for all knn search", elapsed_time)
-		
+        
 
     #if defined (WRITE_NGBH)
         ordered_data_to_file(ctx);
     #endif
     
 
-	top_tree_free(ctx, &tree);
-	kdtree_v2_free(&local_tree);
+    top_tree_free(ctx, &tree);
+    kdtree_v2_free(&local_tree);
 
-	free(send_counts);
-	free(displacements);
-	free(dp_info);
+    free(send_counts);
+    free(displacements);
+    free(dp_info);
 
-	if (ctx->mpi_rank == 0) free(data);
+    if (ctx->mpi_rank == 0) free(data);
 
-	original_ps.data = NULL;
-	free_pointset(&original_ps);
-  	free(global_bin_counts_int);
+    original_ps.data = NULL;
+    free_pointset(&original_ps);
+    free(global_bin_counts_int);
 }
diff --git a/src/tree/tree.h b/src/tree/tree.h
index 7439b1594c558e5caffd118a5a6df26b593e7cb5..6b38c79bc63669c35cef78c7c0e9bcbfe8f4eef8 100644
--- a/src/tree/tree.h
+++ b/src/tree/tree.h
@@ -83,9 +83,9 @@ typedef struct top_kdtree_t
 
 
 typedef struct datapoint_info_t {
-  float_t g;
   heap ngbh;
   idx_t array_idx;
+  float_t g;
   float_t log_rho;
   float_t log_rho_c;
   float_t log_rho_err;