diff --git a/src/tree/tree.c b/src/tree/tree.c
index bd1b165b7a36b76006393a07f179b850ee9fb0e1..01b51527b95ad496d649af7b88d8adbe5a2aefcc 100644
--- a/src/tree/tree.c
+++ b/src/tree/tree.c
@@ -19,8 +19,8 @@
 #include <omp.h>
 #include <sys/sysinfo.h>
 
-//#define WRITE_NGBH
-//#define WRITE_TOP_NODES
+#define WRITE_NGBH
+#define WRITE_TOP_NODES
 
 /* 
  * Maximum bytes to send with a single mpi send/recv, used 
@@ -1315,6 +1315,180 @@ void tree_walk(
 
 }
 
+void tree_walk_v2_find_n_points( /* counting walk: adds 1 to the counter of every foreign owner whose node the walk reaches for this point */
+        global_context_t* ctx,          /* only mpi_rank is read here */
+        top_kdtree_node_t* root,        /* current top-tree node; dereferenced unconditionally, so it must not be NULL */
+        int point_idx,                  /* only forwarded to the recursive calls; not otherwise used in this function */
+        float_t max_dist,               /* compared against the SQUARED hyperplane distance -- presumably a squared radius, TODO confirm */
+        float_t* point,                 /* coordinates of the query point; indexed by the node's split_dim */
+        int* point_to_send_capacity)    /* per-owner counters, indexed by root->owner; updated atomically */
+{
+    if(root -> owner != -1 && root -> owner != ctx -> mpi_rank)
+    {
+        #pragma omp atomic update 
+        point_to_send_capacity[root -> owner]++; /* node has an owner (!= -1) other than this rank: count it and do not descend further */
+    }
+    else
+    {
+        /* no foreign owner on this node: first descend into the child on the point's side of the split */
+        int split_var = root -> split_dim;
+        float_t hp_distance = point[split_var] - root -> split_val; /* signed offset of the point from the splitting value */
+        __builtin_prefetch(root -> lch, 0, 3); /* read prefetch (rw = 0) with highest temporal locality (3) */
+        __builtin_prefetch(root -> rch, 0, 3);
+
+        int side = hp_distance > 0.f; /* 1 when the point is strictly above split_val along split_dim, otherwise 0 */
+
+        switch (side)
+        {
+            case TOP_TREE_LCH:
+                if(root -> lch)
+                {
+                    /* walk on the left */
+                    tree_walk_v2_find_n_points(ctx, root -> lch, point_idx, max_dist, point, point_to_send_capacity);
+                }
+                break;
+            
+            case TOP_TREE_RCH:
+                if(root -> rch)
+                {
+                    /* walk on the right */
+                    tree_walk_v2_find_n_points(ctx, root -> rch, point_idx, max_dist, point, point_to_send_capacity);
+                }
+                break;
+
+            default:
+                break;
+        }
+
+        int c   = max_dist > (hp_distance * hp_distance); /* non-zero when max_dist exceeds the squared distance to the splitting plane */
+
+        //if(c || (H -> count) < (H -> N))
+        if(c)
+        {
+            /* NOTE(review): this switch labels 'side' with HP_*_SIDE while the one above uses TOP_TREE_*CH; both are defined outside this view -- confirm the values agree */
+            switch (side)
+            {
+                case HP_LEFT_SIDE:
+                    if(root -> rch) 
+                    {
+                        /* walk on the right */
+                        tree_walk_v2_find_n_points(ctx, root -> rch, point_idx, max_dist, point, point_to_send_capacity);
+                    }
+                    break;
+                
+                case HP_RIGHT_SIDE:
+                    if(root -> lch) 
+                    {
+                        /* walk on the left */
+                        tree_walk_v2_find_n_points(ctx, root -> lch, point_idx, max_dist, point, point_to_send_capacity);
+                    }
+                    break;
+
+                default:
+                    break;
+            }
+        }
+    }
+
+}
+
+void tree_walk_v2_append_points( /* fill walk: copies the point and its local index into the buffer of every foreign owner the walk reaches */
+        global_context_t* ctx,              /* mpi_rank and dims are read here */
+        top_kdtree_node_t* root,            /* current top-tree node; dereferenced unconditionally, so it must not be NULL */
+        int point_idx,                      /* value stored in local_idx_of_the_point for each appended copy */
+        float_t max_dist,                   /* compared against the SQUARED hyperplane distance -- presumably a squared radius, TODO confirm */
+        float_t* point,                     /* source of the ctx->dims coordinates that get copied */
+        float_t** data_to_send_per_proc,    /* per-owner coordinate buffers; slot idx starts at element dims * idx */
+        int** local_idx_of_the_point,       /* per-owner index arrays, filled at the same slot as the coordinates */
+        int* point_to_send_count)           /* per-owner next free slot; post-incremented atomically */
+{
+    if(root -> owner != -1 && root -> owner != ctx -> mpi_rank)
+    {
+        /* node has an owner (!= -1) other than this rank: append the point to that owner's buffers and stop descending */
+        int owner = root -> owner;
+
+        /* claim a slot: the atomic capture returns the old counter value and bumps it in one step */
+        int idx;
+
+        #pragma omp atomic capture
+        idx = point_to_send_count[owner]++;
+        /* NOTE: there is no capacity check here -- the buffers must already be large enough for every slot claimed */
+        int len = ctx -> dims;
+        /* NOTE(review): len * idx below is an int multiplication; confirm it cannot overflow for the largest buffers */
+        float_t* base = data_to_send_per_proc[owner] + (len * idx); 
+
+        memcpy(base, point, ctx -> dims * sizeof(float_t));
+        local_idx_of_the_point[owner][idx] = point_idx;
+    }
+    else
+    {
+        /* no foreign owner on this node: same traversal as tree_walk_v2_find_n_points, near side first */
+        int split_var = root -> split_dim;
+        float_t hp_distance = point[split_var] - root -> split_val; /* signed offset of the point from the splitting value */
+        __builtin_prefetch(root -> lch, 0, 3); /* read prefetch (rw = 0) with highest temporal locality (3) */
+        __builtin_prefetch(root -> rch, 0, 3);
+
+        int side = hp_distance > 0.f; /* 1 when the point is strictly above split_val along split_dim, otherwise 0 */
+
+        switch (side)
+        {
+            case TOP_TREE_LCH:
+                if(root -> lch)
+                {
+                    /* walk on the left */
+                    tree_walk_v2_append_points(ctx, root -> lch, point_idx, max_dist, point, 
+                            data_to_send_per_proc, local_idx_of_the_point, point_to_send_count);
+                }
+                break;
+            
+            case TOP_TREE_RCH:
+                if(root -> rch)
+                {
+                    /* walk on the right */
+                    tree_walk_v2_append_points(ctx, root -> rch, point_idx, max_dist, point, 
+                            data_to_send_per_proc, local_idx_of_the_point, point_to_send_count);
+                }
+                break;
+
+            default:
+                break;
+        }
+
+        int c   = max_dist > (hp_distance * hp_distance); /* non-zero when max_dist exceeds the squared distance to the splitting plane */
+
+        //if(c || (H -> count) < (H -> N))
+        if(c)
+        {
+            /* NOTE(review): this switch labels 'side' with HP_*_SIDE while the one above uses TOP_TREE_*CH; both are defined outside this view -- confirm the values agree */
+            switch (side)
+            {
+                case HP_LEFT_SIDE:
+                    if(root -> rch) 
+                    {
+                        /* walk on the right */
+                        tree_walk_v2_append_points(ctx, root -> rch, point_idx, max_dist, point, 
+                                data_to_send_per_proc, local_idx_of_the_point, point_to_send_count);
+                    }
+                    break;
+                
+                case HP_RIGHT_SIDE:
+                    if(root -> lch) 
+                    {
+                        /* walk on the left */
+                        tree_walk_v2_append_points(ctx, root -> lch, point_idx, max_dist, point, 
+                                data_to_send_per_proc, local_idx_of_the_point, point_to_send_count);
+                    }
+                    break;
+
+                default:
+                    break;
+            }
+        }
+    }
+
+}
+
+
 void convert_heap_idx_to_global(global_context_t* ctx, heap* H)
 {
     for(uint64_t i = 0; i < H -> count; ++i)
@@ -1387,23 +1561,27 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre
 
     for(int i = 0; i < ctx -> world_size; ++i)
     {
-        //data_to_send_per_proc[i]  = (float_t*)malloc(100 * (1 + ctx -> dims) * sizeof(float_t));    
+        /* the per-rank send buffers are allocated further down, once the counting walk has sized them */
+
+        /* OLD VERSION 
         data_to_send_per_proc[i]  = (float_t*)malloc(100 * (ctx -> dims) * sizeof(float_t));    
         local_idx_of_the_point[i] = (int*)malloc(100 * sizeof(int));    
         point_to_snd_capacity[i] = 100;
+        */
+
+        /* NEW VERSION with double tree walk */
+        point_to_snd_capacity[i] = 0;
         point_to_snd_count[i]    = 0;
     }
 
     /* for each point walk the tree and find to which proc send data */
     /* actually compute intersection of ngbh radius of each point to node box */
 
+    /* OLD VERSION SINGLE TREE WALK */
+    /*
     #pragma omp parallel for
     for(int i = 0; i < ctx -> local_n_points; ++i)
     {
-        /*
-        MPI_DB_PRINT("%lu\n",dp_info[i].array_idx);
-        if(i > 10) break;
-        */
         float_t max_dist = dp_info[i].ngbh.data[0].value;
         float_t* point   = ctx -> local_data + (i * ctx -> dims);
 
@@ -1411,6 +1589,38 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre
                   point, data_to_send_per_proc, local_idx_of_the_point, 
                   point_to_snd_count, point_to_snd_capacity);
     }
+    */
+
+    /* NEW VERSION double tree walk */
+    #pragma omp parallel for
+    for(int i = 0; i < ctx -> local_n_points; ++i)
+    {
+        float_t max_dist = dp_info[i].ngbh.data[0].value;
+        float_t* point   = ctx -> local_data + (i * ctx -> dims);
+        
+        tree_walk_v2_find_n_points(ctx, top_tree -> root, i, max_dist, point, point_to_snd_capacity);
+
+    }
+
+    /* allocate needed space */
+    for(int i = 0; i < ctx -> world_size; ++i)
+    {
+        int np = point_to_snd_capacity[i];
+        data_to_send_per_proc[i]  = (float_t*)malloc(np * (ctx -> dims) * sizeof(float_t));    
+        local_idx_of_the_point[i] = (int*)malloc(np * sizeof(int));    
+
+    }
+
+
+    #pragma omp parallel for
+    for(int i = 0; i < ctx -> local_n_points; ++i)
+    {
+        float_t max_dist = dp_info[i].ngbh.data[0].value;
+        float_t* point   = ctx -> local_data + (i * ctx -> dims);
+
+        tree_walk_v2_append_points(ctx, top_tree -> root, i, max_dist, point, data_to_send_per_proc, local_idx_of_the_point, point_to_snd_count);
+    }
+
 
     elapsed_time = TIME_STOP;
     LOG_WRITE("Finding points to refine", elapsed_time);
@@ -1958,7 +2168,7 @@ void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx)
 
         //ctx -> n_points = 48*5*2000;
         ctx->n_points = ctx->n_points / ctx->dims;
-        ctx->n_points = (ctx->n_points * 5) / 10;
+        ctx->n_points = (ctx->n_points * 6) / 10;
         // ctx -> n_points = ctx -> world_size * 1000;
 
         //ctx -> n_points = 10000000 * ctx -> world_size;
@@ -2061,7 +2271,7 @@ void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx)
     elapsed_time = TIME_STOP;
     LOG_WRITE("Total time for all knn search", elapsed_time)
 
-    find_foreign_nodes(ctx, dp_info);
+    //find_foreign_nodes(ctx, dp_info);
 
 
     #if defined (WRITE_NGBH)