diff --git a/src/adp/adp.c b/src/adp/adp.c
index 35f61e2a3a87f8b9a524414147619c8894bfb350..e796b8db7dc6a0ca93d79525026c54920b05850e 100644
--- a/src/adp/adp.c
+++ b/src/adp/adp.c
@@ -1,4 +1,5 @@
 #include "adp.h"
+#include <unistd.h> /* usleep() for lock back-off */
 
 const border_t border_null = {.density = -1.0, .error = 0, .idx = NOBORDER};
 const sparse_border_t sparse_border_null = {.density = -1.0, .error = 0, .idx = NOBORDER, .i = NOBORDER, .j = NOBORDER};
@@ -298,8 +299,8 @@ void compute_density_kstarnn_rma_v2(global_context_t* ctx, const float_t d, int
         idx_t* ks = (idx_t*)MY_MALLOC(ctx -> local_n_points * sizeof(idx_t));
 
         for(int i = 0; i < ctx -> local_n_points; ++i) den[i] = ctx -> local_datapoints[i].log_rho;
-        for(int i = 0; i < ctx -> local_n_points; ++i) ks[i] = ctx -> local_datapoints[i].kstar;
-        for(int i = 0; i < ctx -> local_n_points; ++i) gs[i] = ctx -> local_datapoints[i].g;
+        for(int i = 0; i < ctx -> local_n_points; ++i) ks[i]  = ctx -> local_datapoints[i].kstar;
+        for(int i = 0; i < ctx -> local_n_points; ++i) gs[i]  = ctx -> local_datapoints[i].g;
 
         ordered_buffer_to_file(ctx, den, sizeof(float_t), ctx -> local_n_points, "bb/ordered_density.npy");
         ordered_buffer_to_file(ctx, ks, sizeof(idx_t), ctx -> local_n_points, "bb/ks.npy");
@@ -617,6 +618,8 @@ lock_t h1_lock_acquire(global_context_t* ctx, MPI_Win lock_window, int owner, id
         while(result == LOCK_ACQUIRED && err == MPI_SUCCESS)
         {
             err = MPI_Compare_and_swap(&state, &compare, &result, MPI_LOCK_T, owner, pos, lock_window);
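+            /* flush completes the CAS at the target so 'result' is valid
+             * before the next test; the short sleep backs off under contention */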
+            MPI_Win_flush(owner, lock_window);
+            usleep(100);
         }
 
         if(err != MPI_SUCCESS)
@@ -677,8 +680,7 @@ clusters_t Heuristic1(global_context_t *ctx)
     MPI_Win_fence(0, win_datapoints);
     MPI_Win_lock_all(0,  win_datapoints);
 
-#if defined(THREAD_FUNNELED)
-#else
+#if !defined(THREAD_FUNNELED)
     #pragma omp parallel for
 #endif
     for(idx_t i = 0; i < n; ++i)
@@ -741,14 +743,17 @@ clusters_t Heuristic1(global_context_t *ctx)
     MPI_Win_create(lock_array, n * sizeof(lock_t), sizeof(lock_t), MPI_INFO_NULL, ctx -> mpi_communicator, &win_locks);
     MPI_Win_fence(0, win_locks);
 
+#ifdef EXP_H1
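+    /* EXP_H1: hold passive-target access epochs on both windows for the
+     * whole center-elimination loop */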
     MPI_Win_lock_all(0, win_to_remove_mask);
     MPI_Win_lock_all(0, win_locks);
+    if(ctx -> mpi_rank == 0) printf("Using experimental h1\n");
+#endif
-
 
-
-#if defined(THREAD_FUNNELED)
-#else
-    #pragma omp parallel for
+#if !defined(THREAD_FUNNELED)
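+    /* dynamic schedule: per-point work in this loop is uneven */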
+    #pragma omp parallel for schedule(dynamic)
 #endif
     for(idx_t p = 0; p < n; ++p)
     {
@@ -769,36 +774,38 @@ clusters_t Heuristic1(global_context_t *ctx)
                  *
                  * */
 
-#ifdef EXPERIMENTAL_H1
-                int owner = foreign_owner(ctx, jidx);
-                idx_t jpos = jidx - ctx -> rank_idx_start[owner];
-
-                lock_t state = LOCK_FREE;
-
-                state = h1_lock_acquire(ctx, win_locks, owner, jpos, state);
+#ifdef EXP_H1
+                #pragma omp critical (h1_exp)
+                {
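+                    /* one thread at a time per rank: take the remote spin
+                     * lock, then read-modify-write the removal mask */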
+                    int owner = foreign_owner(ctx, jidx);
+                    idx_t jpos = jidx - ctx -> rank_idx_start[owner];
 
-                heap_node mask_element;
-                MPI_Request request;
+                    lock_t state = LOCK_FREE;
 
-                MPI_Rget(&mask_element, sizeof(heap_node), MPI_BYTE, 
-                        owner, jpos * sizeof(heap_node), sizeof(heap_node), MPI_BYTE, win_to_remove_mask, &request);
-                MPI_Wait(&request, MPI_STATUS_IGNORE);
+                    state = h1_lock_acquire(ctx, win_locks, owner, jpos, state);
 
-                int flag = mask_element.array_idx == MY_SIZE_MAX;							
-                if(flag || i_point.g > mask_element.value )
-                {
-                    heap_node tmp_mask_element = {.array_idx = i_point.array_idx, .value = i_point.g};
+                    heap_node mask_element;
                     MPI_Request request;
-                    MPI_Rput(&tmp_mask_element, sizeof(heap_node), MPI_BYTE, owner, 
-                            jpos*sizeof(heap_node), sizeof(heap_node), MPI_BYTE, win_to_remove_mask, &request);
+
+                    MPI_Rget(&mask_element, sizeof(heap_node), MPI_BYTE, 
+                            owner, jpos * sizeof(heap_node), sizeof(heap_node), MPI_BYTE, win_to_remove_mask, &request);
                     MPI_Wait(&request, MPI_STATUS_IGNORE);
 
-                }
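+                    /* claim the slot if it is still free (array_idx ==
+                     * MY_SIZE_MAX) or this point carries a larger g */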
+                    int flag = mask_element.array_idx == MY_SIZE_MAX;
+                    if(flag || i_point.g > mask_element.value)
+                    {
+                        heap_node tmp_mask_element = {.array_idx = i_point.array_idx, .value = i_point.g};
+                        MPI_Request request;
+                        MPI_Rput(&tmp_mask_element, sizeof(heap_node), MPI_BYTE, owner, 
+                                jpos*sizeof(heap_node), sizeof(heap_node), MPI_BYTE, win_to_remove_mask, &request);
+                        MPI_Wait(&request, MPI_STATUS_IGNORE);
 
-                state = h1_lock_free(ctx, win_locks, owner, jpos, state);
-#else
+                    }
 
-                #pragma omp critical (h1_centers_elimination)
+                    state = h1_lock_free(ctx, win_locks, owner, jpos, state);
+                }
+#else
+                #pragma omp critical (centers_elimination)
                 {
                     int owner = foreign_owner(ctx, jidx);
                     idx_t jpos = jidx - ctx -> rank_idx_start[owner];
@@ -822,13 +829,16 @@ clusters_t Heuristic1(global_context_t *ctx)
                     }
 
                     MPI_Win_unlock(owner, win_to_remove_mask);
+                }
 #endif
             }
         }
     }
 
+#ifdef EXP_H1
     MPI_Win_unlock_all(win_to_remove_mask);
     MPI_Win_unlock_all(win_locks);
+#endif
     
     MPI_Win_fence(0, win_to_remove_mask);
     MPI_Win_fence(0, win_locks);
@@ -1815,8 +1825,6 @@ void Heuristic3(global_context_t* ctx, clusters_t* cluster, float_t Z, int halo)
 
         qsort(centers_dp, cluster -> centers.count, sizeof(datapoint_info_t), compare_dp_by_cidx);
 
-        printf("Centers\n");
-
         master_finds_borders(ctx, cluster, Z, surviving_clusters, centers_dp);
         master_fixes_border_matrix_and_centers(ctx, cluster, Z, old_to_new, surviving_clusters, nclus);
         free(centers_dp);
diff --git a/src/main/main.c b/src/main/main.c
index 8c90cfee2667a35126dd0b26b865511c4dd3f6d5..ab54c20845aaac44fd54f539341929b38759f381 100644
--- a/src/main/main.c
+++ b/src/main/main.c
@@ -232,52 +232,7 @@ void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx)
     if (ctx->mpi_rank == 0) 
     {
         data = read_data_file(ctx,ctx -> input_data_file, ctx -> dims, ctx -> input_data_in_float32);
-        
-        //data = read_data_file(ctx, "../norm_data/50_blobs_more_var.npy", MY_TRUE);
-        //ctx->dims = 2;
-        //data = read_data_file(ctx, "../norm_data/blobs_small.npy", MY_FALSE);
-        //data = read_data_file(ctx, "../norm_data/blobs_small.npy", MY_FALSE);
-        // std_g0163178_Me14_091_0000
-    
-        // 100M points
-        // 2D
-        // std_g2980844_091_0000
-        //data = read_data_file(ctx,"../norm_data/huge_blobs.npy",MY_FALSE);
-        // 2B points
-        // data = read_data_file(ctx,"../norm_data/very_huge_blobs.npy",MY_FALSE);
-        // data = read_data_file(ctx,"../norm_data/hd_blobs.npy",5,MY_FALSE);
-        
-        //1B points
-        // data = read_data_file(ctx,"../norm_data/eds_box_acc_normalized",5,MY_FALSE);
-        // data = read_data_file(ctx,"../norm_data/eds_box_6d",6,MY_FALSE);
-
-        // 190M points
-        // std_g2980844_091_0000
-        // data = read_data_file(ctx,"../norm_data/std_g2980844_091_0000",5,MY_TRUE);
-        
-        /* 1M points ca.*/
-        //data = read_data_file(ctx,"../norm_data/std_LR_091_0001",5,MY_TRUE);
-
-        /* BOX */
-        // data = read_data_file(ctx,"../norm_data/std_Box_256_30_092_0000",MY_TRUE);
-
-        /* 8M points */
-        
-        // data = read_data_file(ctx,"../norm_data/std_g0144846_Me14_091_0001",5,MY_TRUE);
-
-        //88M 
-        // data = read_data_file(ctx,"../norm_data/std_g5503149_091_0000",MY_TRUE);
-
-        //
-        //34 M
-        //data = read_data_file(ctx,"../norm_data/std_g1212639_091_0001",MY_TRUE);
-        
-        //for weak scalability 
-        //ctx->n_points = ctx->n_points / 4;
-        //ctx->n_points = (ctx->n_points / 32) * ctx -> world_size;
-
         get_dataset_diagnostics(ctx, data);
-
     }
     
     /* communicate the total number of points*/
@@ -320,9 +275,7 @@ void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx)
                 int count_send = MIN(default_msg_len, send_counts[i] - already_sent_points); 
                 MPI_Send(data + displacements[i] + already_sent_points, count_send, MPI_MY_FLOAT, i, ctx -> mpi_rank, ctx -> mpi_communicator);
                 already_sent_points += count_send;
-                //DB_PRINT("[RANK 0] has sent to rank %d %d elements out of %lu\n",i, already_sent_points, send_counts[i]);
             }
-            //DB_PRINT("------------------------------------------------\n");
         }
     }
     else
diff --git a/src/tree/kdtreeV2.c b/src/tree/kdtreeV2.c
index b3b634a3db12cdbe0b43180b4161bf3ef203269a..3547975ea8c13d49c359583429cc3b9384b57577 100644
--- a/src/tree/kdtreeV2.c
+++ b/src/tree/kdtreeV2.c
@@ -228,9 +228,6 @@ kdnode_v2* make_tree_kdnode_v2(kdnode_v2* t, int start, int end, kdnode_v2* pare
         
     }
     
-    
-
-
     int median_idx = -1;
     
     //if ((end - start) < 0) return 0;
diff --git a/src/tree/tree.c b/src/tree/tree.c
index 12f181e3ec0ba4068f08e027c594c145df53622a..876dd85c5c9ebe40f0093ee2f5d2d095c97bd4da 100644
--- a/src/tree/tree.c
+++ b/src/tree/tree.c
@@ -1478,7 +1478,7 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre
     //ctx -> __local_heap_buffers = (heap_node*)MY_MALLOC(ctx -> local_n_points * k * sizeof(heap_node));
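+    /* MPI_Alloc_mem may return memory better optimized for MPI traffic than malloc */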
     MPI_Alloc_mem(ctx -> local_n_points * k * sizeof(heap_node), MPI_INFO_NULL, &(ctx -> __local_heap_buffers));
 
-    #pragma omp parallel for
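+    /* dynamic schedule: kNN search cost varies from point to point */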
+    #pragma omp parallel for schedule(dynamic)
     for(int p = 0; p < ctx -> local_n_points; ++p)
     {
         idx_t idx = local_tree -> _nodes[p].array_idx;
@@ -1663,7 +1663,7 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre
         //if(count_rcv_work_batches[p] > 0)
         {
             //heap_batches_per_node[p] = (heap_node*)MY_MALLOC(k * point_to_rcv_count[p] * sizeof(heap_node));
-            #pragma omp parallel for
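+            /* dynamic schedule: received batches have uneven search cost */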
+            #pragma omp parallel for schedule(dynamic)
             for(int batch = 0; batch < point_to_rcv_count[p]; ++batch)
             {
                 heap H;
@@ -1822,7 +1822,7 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre
         }
 
         /* merge lists */
-        #pragma omp paralell for
+        #pragma omp parallel for
         for(int b = 0; b < ngbh_to_recv[rank_to_recv]; ++b)
         {
             int idx = local_idx_of_the_point[rank_to_recv][b];
@@ -1843,6 +1843,7 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre
         }
 
 
+
         MPI_Barrier(ctx -> mpi_communicator);
     }