Skip to content
Snippets Groups Projects
Commit 8507be87 authored by lykos98's avatar lykos98
Browse files

added density estimation, preliminarly tested, working on full tests

parent 97cd94f0
No related branches found
No related tags found
No related merge requests found
%% Cell type:code id:d1f5f17d-9949-4707-b553-e03c15bc59ee tags:
``` python
import numpy as np
import matplotlib.pyplot as plt
import dadac
```
%% Cell type:code id:aeb8a19b-be6d-48d8-86e0-479b4d6d99cf tags:
``` python
x = np.fromfile("bb/ordered_data.npy", dtype = np.float64)
x = x.reshape((x.shape[0]//5, 5))
x.shape
```
%% Output
(1842842, 5)
%% Cell type:code id:550c24bc-062c-4e9a-83ab-5fb51aa44dc4 tags:
``` python
data = dadac.Data(x)
```
%% Output
You are running in a notebook maybe the timing output will break, but everything should be fine
%% Cell type:code id:83e48b9f-1739-4b4c-aeea-87fb732c02e2 tags:
``` python
data.compute_distances(299)
data.compute_id_2NN()
#data.id = 4
data.compute_density_kstarNN()
```
%% Output
Building the KDtree v2:
Total time: 1.741s
knn search:
Total time: 18.676s
ID estimation:
ID value: 3.920865
Total time: 0.391s
Density and k* estimation:
Total time: 5.015s
%% Cell type:code id:314b516a-02ec-412a-bc7e-1de8f790e429 tags:
``` python
den_gt = data.log_den
den_comp = np.fromfile("bb/ordered_density.npy", np.float64)
print(data.id)
```
%% Output
3.920865231328582
%% Cell type:code id:92225324-e798-4388-a022-b6bb8906768c tags:
``` python
np.average(np.abs(den_comp - den_gt))
```
%% Output
0.0001499206205206319
%% Cell type:code id:67a8bfdf-f1c1-421a-ab06-0ed9dc57b1b5 tags:
``` python
den_gt
```
%% Output
array([-3.64725383, -4.53602916, -4.30573383, ..., 7.18148125,
-6.25416517, -3.54944958])
%% Cell type:code id:a7093d5f-d40b-43d9-a4d0-1f70638d9bbd tags:
``` python
den_comp
```
%% Output
array([-3.64714384, -4.53592253, -4.30561972, ..., 7.18179655,
-6.25407934, -3.54934192])
%% Cell type:code id:f02c64d5-18b8-4b88-a756-e00fa9295e64 tags:
``` python
a = np.argmax(np.abs(den_comp - den_gt))
print(den_comp[a], den_gt[a])
a
```
%% Output
-0.5825421810150146 -1.2760780561214355
1789174
%% Cell type:code id:10dcc6ee-af0f-412d-8be9-d7500be6cd08 tags:
``` python
np.abs(den_comp - den_gt)[a]
```
%% Output
0.6935358751064209
%% Cell type:code id:19b1c9e6-bf00-44f9-822f-f13412263ce5 tags:
``` python
data.kstar[a]
```
%% Output
130
%% Cell type:code id:cd441b5f-f7d8-476c-9b8f-2c1a7f92380f tags:
``` python
```
......@@ -19,6 +19,7 @@ void get_context(global_context_t* ctx)
ctx -> n_halo_points_recv = NULL;
ctx -> n_halo_points_send = NULL;
ctx -> halo_datapoints = NULL;
ctx -> local_datapoints = NULL;
}
void free_context(global_context_t* ctx)
......@@ -28,9 +29,22 @@ void free_context(global_context_t* ctx)
FREE_NOT_NULL(ctx -> ub_box);
FREE_NOT_NULL(ctx -> lb_box);
if(ctx -> local_datapoints)
{
for(int i = 0; i < ctx -> local_n_points; ++i) FREE_NOT_NULL(ctx -> local_datapoints[i].ngbh.data);
}
FREE_NOT_NULL(ctx -> local_datapoints);
if(ctx -> halo_datapoints)
{
for(int i = 0; i < ctx -> world_size; ++i) FREE_NOT_NULL(ctx -> halo_datapoints[i]);
for(int i = 0; i < ctx -> world_size; ++i)
{
for(int j = 0; j < ctx -> n_halo_points_recv[i]; ++j)
{
FREE_NOT_NULL(ctx -> halo_datapoints[i][j].ngbh.data);
}
FREE_NOT_NULL(ctx -> halo_datapoints[i]);
}
}
FREE_NOT_NULL(ctx -> halo_datapoints);
......
......@@ -21,6 +21,7 @@
//#define WRITE_NGBH
//#define WRITE_TOP_NODES
#define WRITE_DENSITY
/*
* Maximum bytes to send with a single mpi send/recv, used
......@@ -49,6 +50,15 @@
unsigned int data_dims;
int cmp_float_t(const void* a, const void* b)
{
float_t aa = *((float_t*)a);
float_t bb = *((float_t*)b);
return (aa > bb) - (aa < bb);
}
float_t *read_data_file(global_context_t *ctx, const char *fname,
const int file_in_float32)
{
......@@ -1546,7 +1556,7 @@ void mpi_ngbh_search(global_context_t* ctx, datapoint_info_t* dp_info, top_kdtre
convert_heap_idx_to_global(ctx, &(dp_info[idx].ngbh));
dp_info[idx].cluster_idx = -1;
dp_info[idx].is_center = 0;
dp_info[idx].array_idx = idx;
dp_info[idx].array_idx = idx + ctx -> idx_start;
}
elapsed_time = TIME_STOP;
LOG_WRITE("Local neighborhood search", elapsed_time);
......@@ -2032,6 +2042,53 @@ void ordered_data_to_file(global_context_t* ctx)
free(tmp_data);
free(ppp);
free(displs);
}
MPI_Barrier(ctx -> mpi_communicator);
}
void ordered_buffer_to_file(global_context_t* ctx, void* buffer, size_t el_size, uint64_t n, const char* fname)
{
//MPI_Barrier(ctx -> mpi_communicator);
MPI_DB_PRINT("[MASTER] writing to file\n");
void* tmp_data;
int* ppp;
int* displs;
MPI_Barrier(ctx -> mpi_communicator);
uint64_t tot_n = 0;
MPI_Reduce(&n, &tot_n, 1, MPI_UINT64_T , MPI_SUM, 0, ctx -> mpi_communicator);
if(I_AM_MASTER)
{
tmp_data = (void*)malloc(el_size * tot_n );
ppp = (int*)malloc(ctx -> world_size * sizeof(int));
displs = (int*)malloc(ctx -> world_size * sizeof(int));
}
int nn = (int)n;
MPI_Gather(&nn, 1, MPI_INT, ppp, 1, MPI_INT, 0, ctx -> mpi_communicator);
if(I_AM_MASTER)
{
displs[0] = 0;
for(int i = 0; i < ctx -> world_size; ++i) ppp[i] = el_size * ppp[i];
for(int i = 1; i < ctx -> world_size; ++i) displs[i] = displs[i - 1] + ppp[i - 1];
}
MPI_Gatherv(buffer, (int)(el_size * n),
MPI_CHAR, tmp_data, ppp, displs, MPI_CHAR, 0, ctx -> mpi_communicator);
if(I_AM_MASTER)
{
FILE* file = fopen(fname,"w");
fwrite(tmp_data, 1, el_size * tot_n, file);
fclose(file);
free(tmp_data);
free(ppp);
free(displs);
}
MPI_Barrier(ctx -> mpi_communicator);
......@@ -2101,8 +2158,6 @@ void find_foreign_nodes(global_context_t* ctx, datapoint_info_t* dp, datapoint_i
if(owner != ctx -> mpi_rank)
{
//DB_PRINT("Hehe");
/* TODO: change this to a series of locks */
#pragma omp atomic update
capacities[owner]++;
}
......@@ -2129,8 +2184,6 @@ void find_foreign_nodes(global_context_t* ctx, datapoint_info_t* dp, datapoint_i
if(owner != ctx -> mpi_rank)
{
//DB_PRINT("Hehe");
/* TODO: change this to a series of locks */
append_foreign_idx_list(element, owner, count_to_request, array_indexes_to_request);
}
}
......@@ -2175,24 +2228,6 @@ void find_foreign_nodes(global_context_t* ctx, datapoint_info_t* dp, datapoint_i
array_indexes_to_request[i] = (idx_t*)realloc(array_indexes_to_request[i], unique_count[i] * sizeof(idx_t));
}
DB_PRINT("[RANK %d elements to request]", ctx -> mpi_rank);
for(int i = 0; i < ctx -> world_size; ++i)
{
DB_PRINT("%d\t", unique_count[i]);
}
DB_PRINT("\n");
/*
if(I_AM_MASTER)
{
FILE* f = fopen("uniq2","w");
fwrite(array_indexes_to_request[1], sizeof(idx_t), unique_count[1],f);
fclose(f);
}
*/
for(int i = 0; i < ctx -> world_size; ++i)
{
foreign_dp[i] = (datapoint_info_t*)calloc(sizeof(datapoint_info_t), unique_count[i]);
......@@ -2340,7 +2375,7 @@ void find_foreign_nodes(global_context_t* ctx, datapoint_info_t* dp, datapoint_i
if(foreign_dp[i][j].ngbh.data[0].array_idx != array_indexes_to_request[i][j])
{
printf("Chahah %lu\n",array_indexes_to_request[i][j]);
printf("Error on %lu\n",array_indexes_to_request[i][j]);
}
}
}
......@@ -2361,6 +2396,9 @@ void find_foreign_nodes(global_context_t* ctx, datapoint_info_t* dp, datapoint_i
memcpy(ctx -> idx_halo_points_send[i], idx_buffer_to_send + sdispls[i], n_heap_to_send[i] * sizeof(idx_t) );
}
ctx -> halo_datapoints = foreign_dp;
ctx -> local_datapoints = dp;
free(count_to_request);
free(capacities);
......@@ -2368,6 +2406,265 @@ void find_foreign_nodes(global_context_t* ctx, datapoint_info_t* dp, datapoint_i
free(heap_buffer_to_send);
free(idx_buffer_to_send);
free(idx_buffer_to_recv);
free(sdispls);
free(rdispls);
}
float_t mEst2(float_t * x, float_t *y, idx_t n)
{
/*
* Estimate the m coefficient of a straight
* line passing through the origin
* params:
* - x: x values of the points
* - y: y values of the points
* - n: size of the arrays
*/
float_t num = 0;
float_t den = 0;
float_t dd;
for(idx_t i = 0; i < n; ++i)
{
float_t xx = x[i];
float_t yy = y[i];
dd = xx;
num += dd*yy;
den += dd*dd;
}
return num/den;
}
float_t id_estimate(global_context_t* ctx, datapoint_info_t* dp_info, idx_t n, float_t fraction, int verbose)
{
/*
* Estimation of the intrinsic dimension of a dataset
* args:
* - dp_info: array of structs
* - n: number of dp_info
* Estimates the id via 2NN method. Computation of the log ratio of the
* distances of the first 2 neighbors of each point. Then compute the empirical distribution
* of these log ratii
* The intrinsic dimension is found by fitting with a straight line passing through the
* origin
*/
struct timespec start_tot, finish_tot;
double elapsed_tot;
if(verbose)
{
printf("ID estimation:\n");
clock_gettime(CLOCK_MONOTONIC, &start_tot);
}
//float_t fraction = 0.7;
float_t* r = (float_t*)malloc(n*sizeof(float_t));
float_t* Pemp = (float_t*)malloc(n*sizeof(float_t));
for(idx_t i = 0; i < n; ++i)
{
r[i] = 0.5 * log(dp_info[i].ngbh.data[2].value/dp_info[i].ngbh.data[1].value);
Pemp[i] = -log(1 - (float_t)(i + 1)/(float_t)n);
}
qsort(r,n,sizeof(float_t),cmp_float_t);
idx_t Neff = (idx_t)(n*fraction);
float_t d = mEst2(r,Pemp,Neff);
free(r);
free(Pemp);
if(verbose)
{
clock_gettime(CLOCK_MONOTONIC, &finish_tot);
elapsed_tot = (finish_tot.tv_sec - start_tot.tv_sec);
elapsed_tot += (finish_tot.tv_nsec - start_tot.tv_nsec) / 1000000000.0;
printf("\tID value: %.6lf\n", d);
printf("\tTotal time: %.3lfs\n\n", elapsed_tot);
}
float_t rd = 0;
MPI_Allreduce(&d, &rd, 1, MPI_MY_FLOAT, MPI_SUM, ctx -> mpi_communicator);
rd = rd / ctx -> world_size;
return rd;
}
int binary_search_on_idxs(idx_t* idxs, idx_t key, int n)
{
#define LEFT 1
#define RIGHT 0
int l = 0;
int r = n - 1;
int center = (r - l)/2;
while(idxs[center] != key && l < r)
{
int lr = key < idxs[center];
/* if key < place */
switch (lr)
{
case LEFT:
{
l = l;
r = center - 1;
center = l + (r - l) / 2;
}
break;
case RIGHT:
{
l = center + 1;
r = r;
center = l + (r - l) / 2;
}
break;
default:
break;
}
}
return center;
#undef LEFT
#undef RIGHT
}
datapoint_info_t find_possibly_halo_datapoint(global_context_t* ctx, idx_t idx)
{
int owner = foreign_owner(ctx, idx);
/* find if datapoint is halo or not */
if(owner == ctx -> mpi_rank)
{
idx_t i = idx - ctx -> idx_start;
return ctx -> local_datapoints[i];
}
else
{
datapoint_info_t* halo_dp = ctx -> halo_datapoints[owner];
idx_t* halo_idxs = ctx -> idx_halo_points_recv[owner];
int n = ctx -> n_halo_points_recv[owner];
int i = binary_search_on_idxs(halo_idxs, idx, n);
if( idx != halo_dp[i].ngbh.data[0].array_idx)
// if( idx != halo_idxs[i])
{
printf("Osti %lu\n", idx);
}
return halo_dp[i];
}
}
void compute_density_kstarnn(global_context_t* ctx, const float_t d, int verbose){
/*
* Point density computation:
* args:
* - paricles: array of structs
* - d : intrinsic dimension of the dataset
* - points : number of points in the dataset
*/
struct timespec start_tot, finish_tot;
double elapsed_tot;
datapoint_info_t* local_datapoints = ctx -> local_datapoints;
if(verbose)
{
printf("Density and k* estimation:\n");
clock_gettime(CLOCK_MONOTONIC, &start_tot);
}
idx_t kMAX = ctx -> local_datapoints[0].ngbh.N - 1;
float_t omega = 0.;
if(sizeof(float_t) == sizeof(float)){ omega = powf(PI_F,d/2)/tgammaf(d/2.0f + 1.0f);}
else{omega = pow(M_PI,d/2.)/tgamma(d/2.0 + 1.0);}
#pragma omp parallel for
for(idx_t i = 0; i < ctx -> local_n_points; ++i)
{
idx_t j = 4;
idx_t k;
float_t dL = 0.;
float_t vvi = 0.;
float_t vvj = 0.;
float_t vp = 0.;
while(j < kMAX && dL < DTHR)
{
idx_t ksel = j - 1;
vvi = omega * pow(local_datapoints[i].ngbh.data[ksel].value,d/2.);
idx_t jj = local_datapoints[i].ngbh.data[j].array_idx;
/*
* note jj can be an halo point
* need to search maybe for it in foreign nodes
* */
datapoint_info_t tmp_dp = find_possibly_halo_datapoint(ctx, jj);
vvj = omega * pow(tmp_dp.ngbh.data[ksel].value,d/2.);
/* TO REMOVE
if(local_datapoints[i].array_idx == 17734) printf("%lu ksel i %lu j %lu tmp_dp %lu di %lf fj %lf vvi %lf vvj %lf\n", ksel, i, jj, tmp_dp.array_idx,
sqrt(local_datapoints[i].ngbh.data[ksel].value), sqrt(tmp_dp.ngbh.data[ksel].value), vvi, vvj);
*/
vp = (vvi + vvj)*(vvi + vvj);
dL = -2.0 * ksel * log(4.*vvi*vvj/vp);
j = j + 1;
}
if(j == kMAX)
{
k = j - 1;
vvi = omega * pow(ctx -> local_datapoints[i].ngbh.data[k].value,d/2.);
}
else
{
k = j - 2;
}
local_datapoints[i].kstar = k;
local_datapoints[i].log_rho = log((float_t)(k)/vvi/((float_t)(ctx -> n_points)));
//dp_info[i].log_rho = log((float_t)(k)) - log(vvi) -log((float_t)(points));
local_datapoints[i].log_rho_err = 1.0/sqrt((float_t)k); //(float_t)(-Q_rsqrt((float)k));
local_datapoints[i].g = local_datapoints[i].log_rho - local_datapoints[i].log_rho_err;
}
if(verbose)
{
clock_gettime(CLOCK_MONOTONIC, &finish_tot);
elapsed_tot = (finish_tot.tv_sec - start_tot.tv_sec);
elapsed_tot += (finish_tot.tv_nsec - start_tot.tv_nsec) / 1000000000.0;
printf("\tTotal time: %.3lfs\n\n", elapsed_tot);
}
#if defined(WRITE_DENSITY)
/* densities */
float_t* den = (float_t*)malloc(ctx -> local_n_points * sizeof(float_t));
for(int i = 0; i < ctx -> local_n_points; ++i) den[i] = ctx -> local_datapoints[i].log_rho;
ordered_buffer_to_file(ctx, den, sizeof(float_t), ctx -> local_n_points, "bb/ordered_density.npy");
ordered_data_to_file(ctx);
free(den);
#endif
return;
}
void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx)
......@@ -2404,7 +2701,7 @@ void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx)
//ctx -> n_points = 48*5*2000;
ctx->n_points = ctx->n_points / ctx->dims;
ctx->n_points = (ctx->n_points * 0.1) / 10;
// ctx->n_points = (ctx->n_points * 0.5) / 10;
// ctx -> n_points = ctx -> world_size * 1000;
//ctx -> n_points = 10000000 * ctx -> world_size;
......@@ -2507,6 +2804,7 @@ void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx)
elapsed_time = TIME_STOP;
LOG_WRITE("Total time for all knn search", elapsed_time)
TIME_START;
datapoint_info_t** foreign_dp_info = (datapoint_info_t**)malloc(ctx -> world_size * sizeof(datapoint_info_t*));
......@@ -2514,11 +2812,33 @@ void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx)
elapsed_time = TIME_STOP;
LOG_WRITE("Finding points to request the ngbh", elapsed_time)
TIME_START;
float_t id = id_estimate(ctx, dp_info, ctx -> local_n_points, 0.9, MY_FALSE);
elapsed_time = TIME_STOP;
//id = 3.920865231328582;
//id = 4.008350298212649;
//id = 4.;
LOG_WRITE("ID estimate", elapsed_time)
MPI_DB_PRINT("ID %lf \n",id);
TIME_START;
compute_density_kstarnn(ctx, id, MY_FALSE);
elapsed_time = TIME_STOP;
LOG_WRITE("Density estimate", elapsed_time)
/* find density */
#if defined (WRITE_NGBH)
ordered_data_to_file(ctx);
#endif
/*
for(int i = 0; i < ctx -> local_n_points; ++i)
{
free(dp_info[i].ngbh.data);
......@@ -2534,6 +2854,7 @@ void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx)
}
free(foreign_dp_info);
*/
top_tree_free(ctx, &tree);
......@@ -2541,7 +2862,7 @@ void simulate_master_read_and_scatter(int dims, size_t n, global_context_t *ctx)
free(send_counts);
free(displacements);
free(dp_info);
//free(dp_info);
if (ctx->mpi_rank == 0) free(data);
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment