diff --git a/src/common/common.c b/src/common/common.c
index 452e8df912ab2a4b1f50f8cc9dc47136b749feb3..5da59fa0e040c87f336ab7e253cd91397411d52d 100644
--- a/src/common/common.c
+++ b/src/common/common.c
@@ -10,6 +10,8 @@ void get_context(global_context_t* ctx)
 	ctx -> local_data = NULL;
 	ctx -> lb_box 	  = NULL;
 	ctx -> ub_box 	  = NULL;
+    ctx -> rank_n_points  = (int*)malloc(ctx -> world_size * sizeof(int));
+    ctx -> rank_idx_start = (int*)malloc(ctx -> world_size * sizeof(int));
 }
 
 void free_context(global_context_t* ctx)
@@ -31,7 +33,8 @@ void free_context(global_context_t* ctx)
 		free(ctx -> lb_box);
 		ctx -> lb_box = NULL;	
 	}
-
+    free(ctx -> rank_n_points);
+    free(ctx -> rank_idx_start);
 }
 
 void free_pointset(pointset_t* ps)
diff --git a/src/common/common.h b/src/common/common.h
index ded8368aa0d8d1c38eb1c2fdd92e9484b5c894aa..6b84cde46d1f180879fdd2fcdd6f921f4ffdd282 100644
--- a/src/common/common.h
+++ b/src/common/common.h
@@ -92,16 +92,18 @@
 
 struct global_context_t 
 {
-	size_t n_points;
-	size_t idx_start;
-	size_t local_n_points;
-	uint32_t dims;
+    int world_size; 
+    int mpi_rank;
+    int __processor_name_len;
 	float_t* local_data;
 	float_t* lb_box;
 	float_t* ub_box;
-	int world_size; 
-	int mpi_rank;
-	int __processor_name_len;
+    size_t n_points;
+    size_t idx_start;
+    size_t local_n_points;
+    uint32_t dims;
+    int* rank_idx_start;
+    int* rank_n_points;
 	char processor_mame[MPI_MAX_PROCESSOR_NAME];
 	MPI_Comm mpi_communicator;
 };
diff --git a/src/tree/tree.c b/src/tree/tree.c
index bca429f4891b3fe6b914243ccc683a70707da4a8..bd1b165b7a36b76006393a07f179b850ee9fb0e1 100644
--- a/src/tree/tree.c
+++ b/src/tree/tree.c
@@ -1152,7 +1152,12 @@ void exchange_points(global_context_t* ctx, top_kdtree_t* tree)
     {
         ctx -> idx_start += ppp[i];
     }
-    //DB_PRINT("rank %d start %lu\n", ctx -> mpi_rank, ctx -> idx_start);
+
+    /* find slices of indices */
+    for(int i = 0; i < ctx -> world_size; ++i) ctx -> rank_n_points[i] = ppp[i]; 
+
+    ctx -> rank_idx_start[0] = 0;
+    for(int i = 1; i < ctx -> world_size; ++i) ctx -> rank_idx_start[i] = ppp[i - 1] + ctx -> rank_idx_start[i - 1]; 
 
     /* free prv pointer */
     free(ppp);
@@ -1167,6 +1172,8 @@ void exchange_points(global_context_t* ctx, top_kdtree_t* tree)
         if(o != ctx -> mpi_rank) DB_PRINT("rank %d got an error\n",ctx -> mpi_rank);
     }
 
+
+
     free(points_owners);
     free(points_per_proc);
     free(partition_offset);
@@ -1718,20 +1725,12 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre
 
     MPI_Barrier(ctx -> mpi_communicator);
 
-    
-
-
-    
     for(int i = 0; i < ctx -> world_size; ++i)
     {
         if(data_to_send_per_proc[i])  free(data_to_send_per_proc[i]);
         if(local_idx_of_the_point[i]) free(local_idx_of_the_point[i]);
     }
 
-    for(int i = 0; i < ctx -> local_n_points; ++i)
-    {
-        free(dp_info[i].ngbh.data);
-    }
 
     free(data_to_send_per_proc);
     free(local_idx_of_the_point);
@@ -1826,10 +1825,104 @@ void ordered_data_to_file(global_context_t* ctx)
 
     }
     MPI_Barrier(ctx -> mpi_communicator);
-    gg
 }
 
+static inline int foreign_owner(global_context_t* ctx, idx_t idx)
+{
+    int owner;
+    if( idx >= ctx -> idx_start && idx < ctx -> idx_start + ctx -> local_n_points) 
+    {
+        return ctx -> mpi_rank;
+    }
+
+    for(int i = 0; i < ctx -> world_size; ++i)
+    {
+        owner = i;    
+        if( idx >= ctx -> rank_idx_start[i] && idx < ctx -> rank_idx_start[i] + ctx -> rank_n_points[i]) break;
+    }
+    return owner;
+}
+
+static inline void push_on_foreign_idx_list(idx_t element, int owner, int* counts, int* capacities, idx_t** lists)
+{
+
+    if(capacities[owner] == counts[owner])
+    {
+        int new_cap = capacities[owner] * 1.1;
+        lists[owner] = (idx_t*)realloc(lists[owner], new_cap * sizeof(idx_t));
+        capacities[owner] = new_cap;
+    }
+   
+
+    /* find the plausible place */
+    int idx_to_insert = counts[owner]; 
+    
+    int flag = 1;
+    /* check if point is already inserted */
+    for(int i = 0; i < idx_to_insert; ++i)
+    {
+        flag = flag && lists[owner][i] != element; 
+    }
 
+    /* if this is the case insert it */ 
+    
+    if(flag)
+    {
+        counts[owner]++;
+        lists[owner][idx_to_insert] = element;
+    }
+
+}
+
+void find_foreign_nodes(global_context_t* ctx, datapoint_info_t* dp)
+{
+    int k = dp[0].ngbh.count;
+    
+    idx_t** array_indexes_to_request = (idx_t**)malloc(ctx -> world_size * sizeof(idx_t*));
+    int*    count_to_request         = (int*)malloc(ctx -> world_size * sizeof(int));
+    int*    capacities               = (int*)malloc(ctx -> world_size * sizeof(int));
+
+    for(int i = 0; i < ctx -> world_size; ++i)
+    {
+        array_indexes_to_request[i] = (idx_t*)malloc(100 * sizeof(idx_t));
+        capacities[i] = 100;
+        count_to_request[i] = 0;
+    }
+    
+    #pragma omp parallel for
+    for(uint32_t i = 0; i < ctx -> local_n_points; ++i)
+    {
+        for(int j = 0; j < k; ++j)
+        {
+            idx_t element = dp[i].ngbh.data[j].array_idx;        
+            int owner = foreign_owner(ctx, element);
+            //DB_PRINT("%lu %d\n", element, owner);
+            
+            if(owner != ctx -> mpi_rank)
+            {
+                //DB_PRINT("Hehe");
+                #pragma omp critical
+                {
+                    push_on_foreign_idx_list(element, owner, count_to_request, capacities, array_indexes_to_request);
+                }
+            }
+        }
+    }
+    
+    DB_PRINT("[RANK %d elements to request]", ctx -> mpi_rank);
+    for(int i = 0; i < ctx -> world_size; ++i)
+    {
+        DB_PRINT("%d\t", count_to_request[i]);
+    }
+    DB_PRINT("\n");
+
+
+    
+    for(int i = 0; i < ctx -> world_size; ++i) free(array_indexes_to_request[i]);
+    free(array_indexes_to_request);
+    free(count_to_request);
+    free(capacities);
+}
 
 void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx) 
 {
@@ -1846,10 +1939,10 @@ void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx)
     
         // 190M points
         // std_g2980844_091_0000
-        data = read_data_file(ctx,"../norm_data/std_g2980844_091_0000",MY_TRUE);
+        // data = read_data_file(ctx,"../norm_data/std_g2980844_091_0000",MY_TRUE);
         
         /* 1M points ca.*/
-        // data = read_data_file(ctx,"../norm_data/std_LR_091_0001",MY_TRUE);
+        data = read_data_file(ctx,"../norm_data/std_LR_091_0001",MY_TRUE);
 
         /* 8M points */
         
@@ -1865,7 +1958,7 @@ void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx)
 
         //ctx -> n_points = 48*5*2000;
         ctx->n_points = ctx->n_points / ctx->dims;
-        ctx->n_points = (ctx->n_points * 6) / 10;
+        ctx->n_points = (ctx->n_points * 5) / 10;
         // ctx -> n_points = ctx -> world_size * 1000;
 
         //ctx -> n_points = 10000000 * ctx -> world_size;
@@ -1967,11 +2060,18 @@ void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx)
     MPI_Barrier(ctx -> mpi_communicator);
     elapsed_time = TIME_STOP;
     LOG_WRITE("Total time for all knn search", elapsed_time)
-        
+
+    find_foreign_nodes(ctx, dp_info);
+
 
     #if defined (WRITE_NGBH)
         ordered_data_to_file(ctx);
     #endif
+
+    for(int i = 0; i < ctx -> local_n_points; ++i)
+    {
+        free(dp_info[i].ngbh.data);
+    }
     
 
     top_tree_free(ctx, &tree);
@@ -1986,4 +2086,5 @@ void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx)
     original_ps.data = NULL;
     free_pointset(&original_ps);
     free(global_bin_counts_int);
+
 }
diff --git a/src/tree/tree.h b/src/tree/tree.h
index 6b38c79bc63669c35cef78c7c0e9bcbfe8f4eef8..39086a27ac49a21375c2042c21cd5a6575c75565 100644
--- a/src/tree/tree.h
+++ b/src/tree/tree.h
@@ -83,8 +83,8 @@ typedef struct top_kdtree_t
 
 
 typedef struct datapoint_info_t {
+    idx_t array_idx;
   heap ngbh;
-  idx_t array_idx;
   float_t g;
   float_t log_rho;
   float_t log_rho_c;