From 792e25120f96af89711e8c4a373b9bab192c3e95 Mon Sep 17 00:00:00 2001
From: lykos98
Date: Mon, 28 Apr 2025 15:17:56 +0200
Subject: [PATCH] added implementation of centers elimination queues

---
 README.md       |  3 +-
 run_leo         |  3 +-
 src/adp/adp.c   | 75 +++++++++++++++++++++++++++++++++++++++++++------
 src/main/main.c |  6 ++--
 4 files changed, 74 insertions(+), 13 deletions(-)

diff --git a/README.md b/README.md
index 7949127..7fd3b04 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,8 @@ The suggestion is to run it with one mpi task per socket.
 
 # Todo
 
- - [ ] H1: implementation of lock free centers elimination
+ - [ ] argument parsing: find an elegant way to pass parameters and file (maybe a config file?)
+ - [~] H1: implementation of lock free centers elimination (*work in progress*)
  - [ ] context: open all windows in a single shot, close them all togheter
  - [ ] io: curation of IO using mpi IO or other solutions
  - [ ] kdtree: optimization an profiling
diff --git a/run_leo b/run_leo
index db5382a..9a835c4 100644
--- a/run_leo
+++ b/run_leo
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-#SBATCH --nodes=1
+#SBATCH --nodes=2
 #SBATCH --ntasks-per-node=2
 #SBATCH --cpus-per-task=56
 #SBATCH --time=04:00:00
@@ -37,6 +37,7 @@ IN_DATA=/leonardo_work/IscrC_dadp
 
 #10^6 points
 time mpirun -n ${SLURM_NTASKS} --map-by ppr:1:socket:PE=${SLURM_CPUS_PER_TASK} ./main -t f32 -i ${IN_DATA}/norm_data/std_LR_091_0001 -d 5 -a ${OUT_ASSIGNMENT} -o ${OUT_DATA}
+#time mpirun -n ${SLURM_NTASKS} --map-by core ./main -t f32 -i ${IN_DATA}/norm_data/std_LR_091_0001 -d 5 -a ${OUT_ASSIGNMENT} -o ${OUT_DATA}
 
 #34 * 10^6 points
 #time mpirun -n ${SLURM_NTASKS} --map-by ppr:1:socket:PE=${SLURM_CPUS_PER_TASK} ./main -t f32 -i ${IN_DATA}/norm_data/std_g1212639_091_0001 -d 5 -a ${OUT_ASSIGNMENT} -o ${OUT_DATA}
diff --git a/src/adp/adp.c b/src/adp/adp.c
index 700c43c..014f3d0 100644
--- a/src/adp/adp.c
+++ b/src/adp/adp.c
@@ -750,7 +750,7 @@ clusters_t Heuristic1(global_context_t *ctx)
     for(idx_t p = 0; p < n; ++p)
     {
         to_remove_mask[p].array_idx = MY_SIZE_MAX;
-        to_remove_mask[p].value = 9999999;
+        to_remove_mask[p].value = -9999999;
     }
 
     qsort(dp_info_ptrs, n, sizeof(datapoint_info_t*), cmpPP);
@@ -796,14 +796,13 @@ clusters_t Heuristic1(global_context_t *ctx)
             if(owner == ctx -> mpi_rank)
             {
                 idx_t jpos = jidx - ctx -> idx_start;
+                omp_set_lock(lock_array + jpos);
                 if(i_point.g > to_remove_mask[jpos].value)
                 {
-                    omp_set_lock(lock_array + jpos);
                     to_remove_mask[jpos].array_idx = i_point.array_idx;
                     to_remove_mask[jpos].value = i_point.g;
-                    omp_unset_lock(lock_array + jpos);
                 }
-
+                omp_unset_lock(lock_array + jpos);
             }
             //otherwise enqueue for sending
             else
@@ -840,6 +839,13 @@ clusters_t Heuristic1(global_context_t *ctx)
 
     qsort(removal_buffer, tot_removal, sizeof(center_removal_t), compare_removal_by_target);
 
+    // remove
+    for(int i = 1; i < tot_removal; ++i)
+    {
+        if(removal_buffer[i - 1].rank > removal_buffer[i].rank)
+            printf("Unsorted removal buffer");
+    }
+
     //prepare for the sendrcv
 
     int* recv_counts = (int*)MY_MALLOC(ctx -> world_size * sizeof(int));
@@ -880,17 +886,65 @@ clusters_t Heuristic1(global_context_t *ctx)
 
     idx_t tot_recv_counts = 0;
     // count how many elements to recieve
+    MPI_DB_PRINT("Using centers elimination queue experiment\n");
     for(int i = 0; i < ctx -> world_size; ++i) tot_recv_counts += recv_counts[i];
 
+    /*
     if(ctx -> mpi_rank == 0){
         for(int i = 0; i < ctx -> world_size; ++i){
             DB_PRINT("%d mpi rank recv_count %d to %d\n", ctx -> mpi_rank, recv_counts[i], i);
             DB_PRINT("%d mpi rank send_count %d to %d\n", ctx -> mpi_rank, send_counts[i], i);
         }
     }
-
DB_PRINT("rank %d: %lu recv counts\n", ctx -> mpi_rank, tot_recv_counts); + */ + /*DB_PRINT("rank %d: %lu recv counts\n", ctx -> mpi_rank, tot_recv_counts);*/ // change dimensions to bytes + // + + //comm matrices + + /* DEBUG PRINT + * COMMUNICATION MATRICES */ + + /* + int* all_send = (int*)MY_MALLOC(ctx -> world_size * ctx -> world_size * sizeof(int)); + int* all_recv = (int*)MY_MALLOC(ctx -> world_size * ctx -> world_size * sizeof(int)); + + MPI_Gather(send_counts, ctx -> world_size, MPI_INT, + all_send, ctx -> world_size , MPI_INT, 0, ctx -> mpi_communicator); + + MPI_Gather(recv_counts, ctx -> world_size, MPI_INT, + all_recv, ctx -> world_size , MPI_INT, 0, ctx -> mpi_communicator); + + if(I_AM_MASTER) + { + printf("Send matrix\n"); + for(int i = 0; i < ctx -> world_size; ++i) + { + for(int j = 0; j < ctx -> world_size; ++j) + { + printf("%4d ", all_send[i*ctx -> world_size + j]); + } + printf("\n"); + } + + printf("Recv matrix\n"); + for(int i = 0; i < ctx -> world_size; ++i) + { + for(int j = 0; j < ctx -> world_size; ++j) + { + printf("%4d ", all_recv[i*ctx -> world_size + j]); + } + printf("\n"); + } + } + + free(all_recv); + free(all_send); + + */ + for(int i = 0; i < ctx -> world_size; ++i) { @@ -914,13 +968,18 @@ clusters_t Heuristic1(global_context_t *ctx) for(idx_t i = 0; i < tot_recv_counts; ++i) { idx_t el_pos = recv_removals[i].target_id - ctx -> idx_start; + int owner = foreign_owner(ctx, recv_removals[i].target_id); + if(owner != ctx -> mpi_rank){ + printf("Error here\n"); + exit(1); + } + omp_set_lock(lock_array + el_pos); if(recv_removals[i].source_density > to_remove_mask[el_pos].value) { - omp_set_lock(lock_array + el_pos); to_remove_mask[el_pos].array_idx = recv_removals[i].source_id; to_remove_mask[el_pos].value = recv_removals[i].source_density; - omp_unset_lock(lock_array + el_pos); } + omp_unset_lock(lock_array + el_pos); } @@ -974,7 +1033,7 @@ clusters_t Heuristic1(global_context_t *ctx) { removal_queues[i].count = 0; removal_queues[i].size = 0; - removal_queues[i].data = NULL; + free(removal_queues[i].data); omp_destroy_lock(lock_array+ i); } diff --git a/src/main/main.c b/src/main/main.c index 3f3d839..21cde33 100644 --- a/src/main/main.c +++ b/src/main/main.c @@ -212,7 +212,7 @@ void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx) double elapsed_time; float_t z = 3; - int halo = MY_TRUE; + int halo = MY_FALSE; float_t tol = 0.002; int k = 300; @@ -343,7 +343,7 @@ void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx) dp_info[i].array_idx = -1; dp_info[i].kstar = -1; dp_info[i].is_center = -1; - dp_info[i].cluster_idx = -1; + dp_info[i].cluster_idx = -1717171717; //dp_info[i].halo_flag = 0; } ctx -> local_datapoints = dp_info; @@ -402,7 +402,7 @@ void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx) int* cl = (int*)MY_MALLOC(ctx -> local_n_points * sizeof(int)); for(int i = 0; i < ctx -> local_n_points; ++i) cl[i] = ctx -> local_datapoints[i].cluster_idx; - if(ctx -> world_size <= 6) + if(ctx -> world_size <= 32) { big_ordered_buffer_to_file(ctx, cl, sizeof(int), ctx -> local_n_points, ctx -> output_assignment_file); big_ordered_buffer_to_file(ctx, ctx -> local_data, sizeof(double), ctx -> local_n_points * ctx -> dims, ctx -> output_data_file); -- GitLab