Luca Tornatore / dADP · Commit 43b6d769

Commit 43b6d769, authored 2 months ago by lykos98

added experimental h1 optimization

Parent: 9b4a2351

Showing 3 changed files:

run_leo          +13  −4
src/adp/adp.c    +185 −90
src/adp/adp.h    +15  −0

213 additions and 94 deletions in total.
run_leo  +13 −4
#!/bin/bash
-#SBATCH --nodes=6
+#SBATCH --nodes=1
#SBATCH --ntasks-per-node=2
#SBATCH --cpus-per-task=56
#SBATCH --time=04:00:00
#SBATCH --job-name=dadp_test
#SBATCH --partition=dcgp_usr_prod
-#SBATCH --account=IscrC_dadp
+#SBATCH --account=EUHPC_D18_045
#SBATCH --output=out_leo
#SBATCH --error=err_leo
#SBATCH --mem=480G
...
...
@@ -14,8 +14,10 @@
cd $SLURM_SUBMIT_DIR
module restore my_gcc
#module restore my_intel
module load gcc
module load openmpi
make clean
make
ulimit -s unlimited
...
...
@@ -36,5 +38,12 @@ IN_DATA=/leonardo_work/IscrC_dadp
#10^6 points
time mpirun -n ${SLURM_NTASKS} --map-by ppr:1:socket:PE=${SLURM_CPUS_PER_TASK} ./main -t f32 -i ${IN_DATA}/norm_data/std_LR_091_0001 -d 5 -a ${OUT_ASSIGNMENT} -o ${OUT_DATA}
#time mpirun -n ${SLURM_NTASKS} --map-by ppr:1:socket:PE=${SLURM_CPUS_PER_TASK} ./main -t f32 -i ${IN_DATA}/norm_data/std_g1212639_091_0001 -d 5 -a ${OUT_ASSIGNMENT} -o ${OUT_DATA}
#88 * 10^6 points
#time mpirun -n ${SLURM_NTASKS} --map-by ppr:1:socket:PE=${SLURM_CPUS_PER_TASK} ./main -t f32 -i ${IN_DATA}/norm_data/std_g5503149_091_0000 -d 5 -a ${OUT_ASSIGNMENT} -o ${OUT_DATA}
#200 * 10^6 points
#time mpirun -n ${SLURM_NTASKS} --map-by ppr:1:socket:PE=${SLURM_CPUS_PER_TASK} ./main -t f32 -i ${IN_DATA}/norm_data/std_g2980844_091_0000 -d 5 -a ${OUT_ASSIGNMENT} -o ${OUT_DATA}
src/adp/adp.c  +185 −90
...
...
@@ -118,8 +118,8 @@ idx_t get_j_ksel_idx(global_context_t* ctx, idx_t j, idx_t ksel, MPI_Win exposed
}
void compute_density_kstarnn_rma_v2(global_context_t* ctx, const float_t d, int verbose)
{
    /*
     * Point density computation:
     * args:
...
...
@@ -648,12 +648,32 @@ lock_t h1_lock_free(global_context_t* ctx, MPI_Win lock_window, int owner, idx_t
}
void enqueue_center_removal(center_removal_queue_t* queue, center_removal_t element)
{
    int default_incr_size = 100;
    if (queue->count >= queue->size)
    {
        queue->size += default_incr_size;
        queue->data  = realloc(queue->data, queue->size * sizeof(center_removal_t));
    }
    queue->data[queue->count] = element;
    queue->count++;
}

int compare_removal_by_target(const void* a, const void* b)
{
    idx_t arg1 = (*(const center_removal_t*)a).target_id;
    idx_t arg2 = (*(const center_removal_t*)b).target_id;
    return (arg1 > arg2) - (arg1 < arg2);
}
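The two helpers above are self-contained: a growable queue with a fixed 100-element increment and a branch-free three-way comparator on target_id. A minimal standalone sketch of how they are meant to be used (not part of the commit; the idx_t/density stand-in types and local copies of the helpers are assumptions, since MY_MALLOC and the project headers are not shown here):

    /* Standalone sketch: grow a removal queue, then sort it by target id. */
    #include <stdio.h>
    #include <stdlib.h>
    #include <stdint.h>

    typedef uint64_t idx_t;        /* stand-in for the project's idx_t   */
    typedef double   density_t;    /* stand-in for the project's float_t */

    typedef struct { int rank; idx_t source_id; idx_t target_id; density_t source_density; } removal_t;
    typedef struct { removal_t *data; idx_t count; idx_t size; } removal_queue_t;

    static void enqueue(removal_queue_t *q, removal_t el)
    {
        if (q->count >= q->size) {                 /* grow in fixed steps, as in the diff */
            q->size += 100;
            q->data  = realloc(q->data, q->size * sizeof(removal_t));
        }
        q->data[q->count++] = el;
    }

    static int cmp_by_target(const void *a, const void *b)
    {
        idx_t t1 = ((const removal_t *)a)->target_id;
        idx_t t2 = ((const removal_t *)b)->target_id;
        return (t1 > t2) - (t1 < t2);              /* branch-free three-way compare */
    }

    int main(void)
    {
        removal_queue_t q = { .data = NULL, .count = 0, .size = 0 };
        enqueue(&q, (removal_t){ .rank = 1, .source_id = 7, .target_id = 42, .source_density = 3.5 });
        enqueue(&q, (removal_t){ .rank = 0, .source_id = 2, .target_id = 11, .source_density = 1.2 });

        /* sort by target id before packing/exchanging, as Heuristic1 does below */
        qsort(q.data, q.count, sizeof(removal_t), cmp_by_target);

        for (idx_t i = 0; i < q.count; ++i)
            printf("target %llu <- source %llu (g=%.2f, rank %d)\n",
                   (unsigned long long)q.data[i].target_id,
                   (unsigned long long)q.data[i].source_id,
                   q.data[i].source_density, q.data[i].rank);

        free(q.data);
        return 0;
    }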
clusters_t Heuristic1(global_context_t *ctx)
{
    /*
     * Heuristic 1, from paper of Errico, Facco, Laio & Rodriguez
     * ( https://doi.org/10.1016/j.ins.2021.01.010 )
     */

    datapoint_info_t* dp_info = ctx->local_datapoints;
    idx_t n = ctx->local_n_points;
...
...
@@ -721,36 +741,38 @@ clusters_t Heuristic1(global_context_t *ctx)
     * Generate a mask that keeps track of which point has been eliminating the
     * point considered. Each thread updates this mask, then after the procedure
     * ends, the center, removed centers, and max_rho arrays are populated
     *
     * optimized v2 uses a queue of center removals and then exchanges them
     */

    lock_t*    lock_array     = (lock_t*)MY_MALLOC(n * sizeof(lock_t));
    heap_node* to_remove_mask = (heap_node*)MY_MALLOC(n * sizeof(heap_node));

    for (idx_t p = 0; p < n; ++p)
    {
        to_remove_mask[p].array_idx = MY_SIZE_MAX;
        to_remove_mask[p].value     = 9999999;
        lock_array[p] = LOCK_FREE;
    }
    qsort(dp_info_ptrs, n, sizeof(datapoint_info_t*), cmpPP);

    /**
     * workflow
     * find the elements we need to eliminate
     * compact them to a single array
     * send/receive
     **/

    MPI_Win win_to_remove_mask;
    MPI_Win_create(to_remove_mask, n * sizeof(heap_node), 1, MPI_INFO_NULL,
                   ctx->mpi_communicator, &win_to_remove_mask);
    MPI_Win_fence(0, win_to_remove_mask);

    MPI_Win win_locks;
    MPI_Win_create(lock_array, n * sizeof(lock_t), sizeof(lock_t), MPI_INFO_NULL,
                   ctx->mpi_communicator, &win_locks);
    MPI_Win_fence(0, win_locks);

    center_removal_queue_t* removal_queues = (center_removal_queue_t*)MY_MALLOC(n * sizeof(center_removal_queue_t));
    omp_lock_t* lock_array = (omp_lock_t*)MY_MALLOC(n * sizeof(omp_lock_t));

#ifdef EXP_H1
    MPI_Win_lock_all(0, win_to_remove_mask);
    MPI_Win_lock_all(0, win_locks);
#endif

    //zero out all queues
#ifdef EXP_H1
    printf("Using experimental h1\n");
#endif
    for (idx_t i = 0; i < n; ++i)
    {
        removal_queues[i].count = 0;
        removal_queues[i].size  = 0;
        removal_queues[i].data  = NULL;
        omp_init_lock(lock_array + i);
    }

#if !defined(THREAD_FUNNELED)
    #pragma omp parallel for schedule(dynamic)
...
...
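The loop above gives each local point its own omp_lock_t so that threads can update individual to_remove_mask entries concurrently instead of serialising on one critical section. A minimal sketch of that per-element locking pattern (not the project's code; the array, values and names are hypothetical, and the re-check under the lock is added here for safety):

    /* Per-element omp_lock_t guarding a "keep the larger value" update.
     * Compile with: gcc -fopenmp lock_sketch.c */
    #include <omp.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define N 1000

    int main(void)
    {
        double     *best  = malloc(N * sizeof(double));
        omp_lock_t *locks = malloc(N * sizeof(omp_lock_t));

        for (int i = 0; i < N; ++i) {
            best[i] = -1.0;                 /* sentinel, like the mask initialisation above */
            omp_init_lock(locks + i);
        }

        /* many threads propose candidate values for different slots */
        #pragma omp parallel for schedule(dynamic)
        for (int k = 0; k < 100000; ++k) {
            int    slot      = k % N;
            double candidate = (double)(k % 997);

            if (candidate > best[slot]) {       /* cheap pre-check without the lock */
                omp_set_lock(locks + slot);     /* then update under the slot's lock */
                if (candidate > best[slot])
                    best[slot] = candidate;
                omp_unset_lock(locks + slot);
            }
        }

        for (int i = 0; i < N; ++i)
            omp_destroy_lock(locks + i);

        printf("best[0] = %g\n", best[0]);
        free(best);
        free(locks);
        return 0;
    }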
@@ -767,82 +789,140 @@ clusters_t Heuristic1(global_context_t *ctx)
        if (j_point.is_center && i_point.g > j_point.g)
        {
            /*
             *
             * TODO: Implement it without this but using private locks
             * use an array of locks, and compare and swap to actually gain control of the thing
             *
             * */
#ifdef EXP_H1
            #pragma omp critical (h1_exp)
            {
                // set the lock at position i
                // actually is the p-th point
                int owner = foreign_owner(ctx, jidx);
                idx_t jpos = jidx - ctx->rank_idx_start[owner];

                //if local process it
                if (owner == ctx->mpi_rank)
                {
                    idx_t jpos = jidx - ctx->idx_start;
                    if (i_point.g > to_remove_mask[jpos].value)
                    {
                        omp_set_lock(lock_array + jpos);
                        to_remove_mask[jpos].array_idx = i_point.array_idx;
                        to_remove_mask[jpos].value     = i_point.g;
                        omp_unset_lock(lock_array + jpos);
                    }
                    lock_t state = LOCK_FREE;
                }
                //otherwise enqueue for sending
                else
                {
                    center_removal_t element = {.rank           = owner,
                                                .source_id      = i_point.array_idx,
                                                .target_id      = j_point.array_idx,
                                                .source_density = i_point.g};
                    enqueue_center_removal(removal_queues + p, element);
                }
            }
        }
    }
}
                state = h1_lock_acquire(ctx, win_locks, owner, jpos, state);

    //assemble arrays into a single buffer
                heap_node mask_element;
                MPI_Request request;
    idx_t tot_removal = 0;
    for (idx_t p = 0; p < n; ++p)
    {
        tot_removal += removal_queues[p].count;
    }
                MPI_Rget(&mask_element, sizeof(heap_node), MPI_BYTE, owner,
                         jpos * sizeof(heap_node), sizeof(heap_node), MPI_BYTE,
                         win_to_remove_mask, &request);
                MPI_Wait(&request, MPI_STATUS_IGNORE);

    idx_t buffer_idx = 0;
    center_removal_t* removal_buffer = (center_removal_t*)MY_MALLOC(tot_removal * sizeof(center_removal_t));

                int flag = mask_element.array_idx == MY_SIZE_MAX;
                if (flag || i_point.g > mask_element.value)
    for (idx_t p = 0; p < n; ++p)
    {
                    heap_node tmp_mask_element = {.array_idx = i_point.array_idx, .value = i_point.g};
                    MPI_Request request;
                    MPI_Rput(&tmp_mask_element, sizeof(heap_node), MPI_BYTE, owner,
                             jpos * sizeof(heap_node), sizeof(heap_node), MPI_BYTE,
                             win_to_remove_mask, &request);
                    MPI_Wait(&request, MPI_STATUS_IGNORE);
        if (removal_queues[p].count > 0)
        {
            memcpy(removal_buffer + buffer_idx, removal_queues[p].data,
                   removal_queues[p].count * sizeof(center_removal_t));
            buffer_idx += removal_queues[p].count;
        }
                state = h1_lock_free(ctx, win_locks, owner, jpos, state);
    }
#else
            #pragma omp critical (centers_elimination)
            {
                int owner = foreign_owner(ctx, jidx);
                idx_t jpos = jidx - ctx->rank_idx_start[owner];

                MPI_Win_lock(MPI_LOCK_EXCLUSIVE, owner, 0, win_to_remove_mask);

                heap_node mask_element;
                MPI_Request request;
                MPI_Rget(&mask_element, sizeof(heap_node), MPI_BYTE, owner,
                         jpos * sizeof(heap_node), sizeof(heap_node), MPI_BYTE,
                         win_to_remove_mask, &request);
                MPI_Wait(&request, MPI_STATUS_IGNORE);

    //sort by array idx (it sorts also by rank)
    qsort(removal_buffer, tot_removal, sizeof(center_removal_t), compare_removal_by_target);

    //prepare for the sendrcv
    int* recv_counts = (int*)MY_MALLOC(ctx->world_size * sizeof(int));
    int* send_counts = (int*)MY_MALLOC(ctx->world_size * sizeof(int));

                int flag = mask_element.array_idx == MY_SIZE_MAX;
                if (flag || i_point.g > mask_element.value)

    int* recv_displs = (int*)MY_MALLOC(ctx->world_size * sizeof(int));
    int* send_displs = (int*)MY_MALLOC(ctx->world_size * sizeof(int));

    //zero_out_all
    for (int i = 0; i < ctx->mpi_rank; ++i)
    {
                    heap_node tmp_mask_element = {.array_idx = i_point.array_idx, .value = i_point.g};
                    MPI_Request request;
                    MPI_Rput(&tmp_mask_element, sizeof(heap_node), MPI_BYTE, owner,
                             jpos * sizeof(heap_node), sizeof(heap_node), MPI_BYTE,
                             win_to_remove_mask, &request);
                    MPI_Wait(&request, MPI_STATUS_IGNORE);
        recv_counts[i] = 0;
        send_counts[i] = 0;
        recv_displs[i] = 0;
        send_displs[i] = 0;
    }
    for (idx_t i = 0; i < tot_removal; ++i)
    {
        int rank_idx = removal_buffer[i].rank;
        send_counts[rank_idx]++;
    }
                MPI_Win_unlock(owner, win_to_remove_mask);
    // all to all to receive counts
    MPI_Alltoall(send_counts, 1, MPI_INT, recv_counts, 1, MPI_INT, ctx->mpi_communicator);

    // compute displacements
    for (int i = 1; i < ctx->world_size; ++i)
    {
        recv_displs[i] = recv_displs[i - 1] + recv_counts[i - 1];
        send_displs[i] = send_displs[i - 1] + send_counts[i - 1];
    }
#endif
    idx_t tot_recv_counts = 0;
    // count how many elements to receive
    for (int i = 0; i < ctx->world_size; ++i) tot_recv_counts += recv_counts[i];

    if (ctx->mpi_rank == 0){
        for (int i = 0; i < ctx->world_size; ++i){
            DB_PRINT("%d mpi rank recv_count %d to %d\n", ctx->mpi_rank, recv_counts[i], i);
            DB_PRINT("%d mpi rank send_count %d to %d\n", ctx->mpi_rank, send_counts[i], i);
        }
    }
    DB_PRINT("rank %d: %lu recv counts\n", ctx->mpi_rank, tot_recv_counts);

    // change dimensions to bytes
    for (int i = 0; i < ctx->world_size; ++i)
    {
        recv_counts[i] = recv_counts[i] * sizeof(center_removal_t);
        send_counts[i] = send_counts[i] * sizeof(center_removal_t);
        recv_displs[i] = recv_displs[i] * sizeof(center_removal_t);
        send_displs[i] = send_displs[i] * sizeof(center_removal_t);
    }
#ifdef EXP_H1
    MPI_Win_unlock_all(win_to_remove_mask);
    MPI_Win_unlock_all(win_locks);
#endif

    //allocate buffer to receive center eliminations
    center_removal_t* recv_removals = (center_removal_t*)MY_MALLOC(tot_recv_counts * sizeof(center_removal_t));

    // all to all
    MPI_Alltoallv(removal_buffer, send_counts, send_displs, MPI_BYTE,
                  recv_removals, recv_counts, recv_displs, MPI_BYTE,
                  ctx->mpi_communicator);
    // merge into the mask
    #pragma omp parallel for
    for (idx_t i = 0; i < tot_recv_counts; ++i)
    {
        idx_t el_pos = recv_removals[i].target_id - ctx->idx_start;
        if (recv_removals[i].source_density > to_remove_mask[el_pos].value)
        {
            omp_set_lock(lock_array + el_pos);
            to_remove_mask[el_pos].array_idx = recv_removals[i].source_id;
            to_remove_mask[el_pos].value     = recv_removals[i].source_density;
            omp_unset_lock(lock_array + el_pos);
        }
    }

    MPI_Win_fence(0, win_to_remove_mask);
    MPI_Win_fence(0, win_locks);
    MPI_Barrier(ctx->mpi_communicator);

    /* populate the usual arrays */
    for (idx_t p = 0; p < all_centers.count; ++p)
...
...
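The heart of the experimental path in the hunk above is a standard MPI exchange: group the queued removals by destination, count how many go to each rank, exchange the counts with MPI_Alltoall, convert counts and displacements to bytes, and ship the structs with MPI_Alltoallv over MPI_BYTE. A minimal self-contained sketch of that pattern (not the project's code; record_t and all names here are hypothetical):

    /* Pack-count-exchange sketch. Compile with: mpicc exchange_sketch.c && mpirun -n 4 ./a.out */
    #include <mpi.h>
    #include <stdio.h>
    #include <stdlib.h>

    typedef struct { int dest; long payload; } record_t;   /* stand-in for center_removal_t */

    static int cmp_dest(const void *a, const void *b)
    {
        int d1 = ((const record_t *)a)->dest, d2 = ((const record_t *)b)->dest;
        return (d1 > d2) - (d1 < d2);
    }

    int main(int argc, char **argv)
    {
        MPI_Init(&argc, &argv);
        int rank, size;
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        MPI_Comm_size(MPI_COMM_WORLD, &size);

        /* every rank produces one record for every rank, including itself */
        int n_local = size;
        record_t *send_buf = malloc(n_local * sizeof(record_t));
        for (int i = 0; i < n_local; ++i)
            send_buf[i] = (record_t){ .dest = i, .payload = 100L * rank + i };

        qsort(send_buf, n_local, sizeof(record_t), cmp_dest);   /* group records by destination */

        int *send_counts = calloc(size, sizeof(int));
        int *recv_counts = calloc(size, sizeof(int));
        int *send_displs = calloc(size, sizeof(int));
        int *recv_displs = calloc(size, sizeof(int));

        for (int i = 0; i < n_local; ++i) send_counts[send_buf[i].dest]++;

        /* tell every rank how many records it will get from us */
        MPI_Alltoall(send_counts, 1, MPI_INT, recv_counts, 1, MPI_INT, MPI_COMM_WORLD);

        int tot_recv = 0;
        for (int i = 0; i < size; ++i) tot_recv += recv_counts[i];

        for (int i = 1; i < size; ++i) {
            send_displs[i] = send_displs[i - 1] + send_counts[i - 1];
            recv_displs[i] = recv_displs[i - 1] + recv_counts[i - 1];
        }

        /* switch counts/displacements to bytes so MPI_BYTE can carry the structs */
        for (int i = 0; i < size; ++i) {
            send_counts[i] *= sizeof(record_t);
            recv_counts[i] *= sizeof(record_t);
            send_displs[i] *= sizeof(record_t);
            recv_displs[i] *= sizeof(record_t);
        }

        record_t *recv_buf = malloc(tot_recv * sizeof(record_t));
        MPI_Alltoallv(send_buf, send_counts, send_displs, MPI_BYTE,
                      recv_buf, recv_counts, recv_displs, MPI_BYTE, MPI_COMM_WORLD);

        for (int i = 0; i < tot_recv; ++i)
            printf("rank %d received payload %ld\n", rank, recv_buf[i].payload);

        free(send_buf); free(recv_buf);
        free(send_counts); free(recv_counts); free(send_displs); free(recv_displs);
        MPI_Finalize();
        return 0;
    }

Shipping raw struct bytes assumes every rank uses the same struct layout, which the commit implicitly assumes for the homogeneous partition targeted by run_leo.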
@@ -888,11 +968,24 @@ clusters_t Heuristic1(global_context_t *ctx)
}
}
    MPI_Win_free(&win_to_remove_mask);
    free(to_remove_mask);
    MPI_Win_free(&win_locks);

    for (idx_t i = 0; i < n; ++i)
    {
        removal_queues[i].count = 0;
        removal_queues[i].size  = 0;
        removal_queues[i].data  = NULL;
        omp_destroy_lock(lock_array + i);
    }

    free(removal_queues);
    free(removal_buffer);
    free(send_counts);
    free(send_displs);
    free(recv_counts);
    free(recv_displs);
    free(lock_array);
    free(recv_removals);

    int n_centers = (int)actual_centers.count;
    int tot_centers;
...
...
@@ -900,9 +993,11 @@ clusters_t Heuristic1(global_context_t *ctx)
    MPI_DB_PRINT("Found %d temporary centers\n", tot_centers);

    /**
     * bring on master all centers
     * order them in ascending order of density,
     * then re-scatter them around to get unique cluster labels
     **/
    center_t* private_centers_buffer = (center_t*)MY_MALLOC(actual_centers.count * sizeof(center_t));
    center_t* global_centers_buffer  = (center_t*)MY_MALLOC(tot_centers * sizeof(center_t));
...
...
src/adp/adp.h  +15 −0
...
...
@@ -45,6 +45,21 @@ typedef struct merge_t
    float_t density;
} merge_t;

typedef struct center_removal_t
{
    int     rank;
    idx_t   source_id;
    idx_t   target_id;
    float_t source_density;
} center_removal_t;

typedef struct center_removal_queue_t
{
    center_removal_t* data;
    idx_t count;
    idx_t size;
} center_removal_queue_t;

void compute_density_kstarnn_rma(global_context_t* ctx, const float_t d, int verbose);
...
...
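The Alltoallv in adp.c moves these new structs as MPI_BYTE with byte-sized counts and displacements. An alternative, not used in this commit, is a committed MPI struct datatype so that counts stay in elements; a sketch under assumed field widths for idx_t and float_t:

    /* Sketch of an MPI datatype for a struct shaped like center_removal_t.
     * Field types (int, two uint64_t ids, one double) are assumptions. */
    #include <mpi.h>
    #include <stddef.h>
    #include <stdint.h>

    typedef struct {
        int      rank;
        uint64_t source_id;       /* assumed width of idx_t   */
        uint64_t target_id;
        double   source_density;  /* assumed width of float_t */
    } removal_example_t;

    MPI_Datatype make_removal_type(void)
    {
        MPI_Datatype t;
        int          lengths[4] = { 1, 1, 1, 1 };
        MPI_Aint     displs[4]  = { offsetof(removal_example_t, rank),
                                    offsetof(removal_example_t, source_id),
                                    offsetof(removal_example_t, target_id),
                                    offsetof(removal_example_t, source_density) };
        MPI_Datatype types[4]   = { MPI_INT, MPI_UINT64_T, MPI_UINT64_T, MPI_DOUBLE };

        MPI_Type_create_struct(4, lengths, displs, types, &t);

        /* pad the extent to sizeof(removal_example_t) so arrays stride correctly */
        MPI_Datatype resized;
        MPI_Type_create_resized(t, 0, (MPI_Aint)sizeof(removal_example_t), &resized);
        MPI_Type_free(&t);
        MPI_Type_commit(&resized);
        return resized;
    }

With such a type, the counts and displacements exchanged in adp.c could stay in elements instead of being rescaled to bytes.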