PostGIS 3.7.0dev-r@@SVN_REVISION@@
Loading...
Searching...
No Matches
lwkmeans.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * Copyright (c) 2018-2021, Darafei Praliaskouski <me@komzpa.net>
4 * Copyright (c) 2016, Paul Ramsey <pramsey@cleverelephant.ca>
5 *
6 *------------------------------------------------------------------------*/
7
9
10/*
11 * When clustering lists with NULL or EMPTY elements, they will get this as
12 * their cluster number. (All the other clusters will be non-negative)
13 */
14#define KMEANS_NULL_CLUSTER -1
15
16/*
17 * If the algorithm doesn't converge within this number of iterations,
18 * it will return with a failure error code.
19 */
20#define KMEANS_MAX_ITERATIONS 1000
21
22static uint32_t kmeans(POINT4D *objs,
23 uint32_t *clusters,
24 uint32_t n,
25 POINT4D *centers,
26 double *radii,
27 uint32_t min_k,
28 double max_radius);
29
30inline static double
32{
33 double hside = p2->x - p1->x;
34 double vside = p2->y - p1->y;
35 double zside = p2->z - p1->z;
36
37 return hside * hside + vside * vside + zside * zside;
38}
39
40/* Split the clusters that need to be split */
41static uint32_t
43 uint32_t *clusters,
44 uint32_t n,
45 POINT4D *centers,
46 double *radii,
47 uint32_t k,
48 double max_radius)
49{
50 /* Input check: radius limit should be measurable */
51 if (max_radius <= 0)
52 return k;
53
54 double max_radius_sq = max_radius * max_radius;
55
56 /* Do we have the big clusters to split at all? */
57 uint32_t first_cluster_to_split = 0;
58 for (; first_cluster_to_split < k; first_cluster_to_split++)
59 if (radii[first_cluster_to_split] > max_radius_sq)
60 break;
61 if (first_cluster_to_split == k)
62 return k;
63
64 POINT4D *temp_objs = lwalloc(sizeof(POINT4D) * n);
65 uint32_t *temp_clusters = lwalloc(sizeof(uint32_t) * n);
66 double *temp_radii = lwalloc(sizeof(double) * n);
67 POINT4D *temp_centers = lwalloc(sizeof(POINT4D) * n);
68
69 uint32_t new_k = k;
70
71 for (uint32_t cluster = first_cluster_to_split; cluster < k; cluster++)
72 {
73 if (radii[cluster] <= max_radius_sq)
74 continue;
75
76 /* copy cluster alone */
77 uint32_t cluster_size = 0;
78 for (uint32_t i = 0; i < n; i++)
79 if (clusters[i] == cluster)
80 temp_objs[cluster_size++] = objs[i];
81 if (cluster_size <= 1)
82 continue;
83
84 /* run 2-means on the cluster */
85 kmeans(temp_objs, temp_clusters, cluster_size, temp_centers, temp_radii, 2, 0);
86
87 /* replace cluster with split */
88 uint32_t d = 0;
89 for (uint32_t i = 0; i < n; i++)
90 if (clusters[i] == cluster)
91 if (temp_clusters[d++])
92 clusters[i] = new_k;
93
94 centers[cluster] = temp_centers[0];
95 centers[new_k] = temp_centers[1];
96 radii[cluster] = temp_radii[0];
97 radii[new_k] = temp_radii[1];
98 new_k++;
99 }
100 lwfree(temp_centers);
101 lwfree(temp_radii);
102 lwfree(temp_clusters);
103 lwfree(temp_objs);
104 return new_k;
105}
106
107/* Refresh mapping of point to closest cluster */
108static uint8_t
109update_r(POINT4D *objs, uint32_t *clusters, uint32_t n, POINT4D *centers, double *radii, uint32_t k)
110{
111 uint8_t converged = LW_TRUE;
112 if (radii)
113 memset(radii, 0, sizeof(double) * k);
114
115 for (uint32_t i = 0; i < n; i++)
116 {
117 POINT4D obj = objs[i];
118
119 /* Initialize with distance to first cluster */
120 double curr_distance = distance3d_sqr_pt4d_pt4d(&obj, &centers[0]);
121 uint32_t curr_cluster = 0;
122
123 /* Check all other cluster centers and find the nearest */
124 for (uint32_t cluster = 1; cluster < k; cluster++)
125 {
126 double distance = distance3d_sqr_pt4d_pt4d(&obj, &centers[cluster]);
127 if (distance < curr_distance)
128 {
129 curr_distance = distance;
130 curr_cluster = cluster;
131 }
132 }
133
134 /* Store the nearest cluster this object is in */
135 if (clusters[i] != curr_cluster)
136 {
137 converged = LW_FALSE;
138 clusters[i] = curr_cluster;
139 }
140 if (radii)
141 if (radii[curr_cluster] < curr_distance)
142 radii[curr_cluster] = curr_distance;
143 }
144 return converged;
145}
146
147/* Refresh cluster centroids based on all of their objects */
148static void
149update_means(POINT4D *objs, uint32_t *clusters, uint32_t n, POINT4D *centers, uint32_t k)
150{
151 memset(centers, 0, sizeof(POINT4D) * k);
152 /* calculate weighted sum */
153 for (uint32_t i = 0; i < n; i++)
154 {
155 uint32_t cluster = clusters[i];
156 centers[cluster].x += objs[i].x * objs[i].m;
157 centers[cluster].y += objs[i].y * objs[i].m;
158 centers[cluster].z += objs[i].z * objs[i].m;
159 centers[cluster].m += objs[i].m;
160 }
161 /* divide by weight to get average */
162 for (uint32_t i = 0; i < k; i++)
163 {
164 if (centers[i].m)
165 {
166 centers[i].x /= centers[i].m;
167 centers[i].y /= centers[i].m;
168 centers[i].z /= centers[i].m;
169 }
170 }
171}
172
173/* Assign initial clusters centroids heuristically */
174static void
175kmeans_init(POINT4D *objs, uint32_t n, POINT4D *centers, uint32_t k)
176{
177 double *distances;
178 uint32_t p1 = 0, p2 = 0;
179 uint32_t duplicate_count = 1; /* a point is a duplicate of itself */
180 double max_dst = -1;
181
182 /* k=0, k=1: any point will do */
183 assert(n > 0);
184 if (k < 2)
185 {
186 centers[0] = objs[0];
187 return;
188 }
189
190 /* k >= 2: find two distant points greedily */
191 for (uint32_t i = 1; i < n; i++)
192 {
193 /* if we found a larger distance, replace our choice */
194 double dst_p1 = distance3d_sqr_pt4d_pt4d(&objs[i], &objs[p1]);
195 double dst_p2 = distance3d_sqr_pt4d_pt4d(&objs[i], &objs[p2]);
196 if ((dst_p1 > max_dst) || (dst_p2 > max_dst))
197 {
198 if (dst_p1 > dst_p2)
199 {
200 max_dst = dst_p1;
201 p2 = i;
202 }
203 else
204 {
205 max_dst = dst_p2;
206 p1 = i;
207 }
208 }
209 if ((dst_p1 == 0) || (dst_p2 == 0))
210 duplicate_count++;
211 }
212 if (duplicate_count > 1)
213 lwnotice(
214 "%s: there are at least %u duplicate inputs, number of output clusters may be less than you requested",
215 __func__,
216 duplicate_count);
217
218 /* by now two points should be found and non-same */
219 assert(p1 != p2 && max_dst >= 0);
220
221 /* accept these two points */
222 centers[0] = objs[p1];
223 centers[1] = objs[p2];
224
225 if (k > 2)
226 {
227 /* array of minimum distance to a point from accepted cluster centers */
228 distances = lwalloc(sizeof(double) * n);
229
230 /* initialize array with distance to first object */
231 for (uint32_t j = 0; j < n; j++)
232 distances[j] = distance3d_sqr_pt4d_pt4d(&objs[j], &centers[0]);
233 distances[p1] = -1;
234 distances[p2] = -1;
235
236 /* loop i on clusters, skip 0 and 1 as found already */
237 for (uint32_t i = 2; i < k; i++)
238 {
239 uint32_t candidate_center = 0;
240 double max_distance = -DBL_MAX;
241
242 /* loop j on objs */
243 for (uint32_t j = 0; j < n; j++)
244 {
245 /* empty objs and accepted clusters are already marked with distance = -1 */
246 if (distances[j] < 0)
247 continue;
248
249 /* update minimal distance with previously accepted cluster */
250 double current_distance = distance3d_sqr_pt4d_pt4d(&objs[j], &centers[i - 1]);
251 if (current_distance < distances[j])
252 distances[j] = current_distance;
253
254 /* greedily take a point that's farthest from any of accepted clusters */
255 if (distances[j] > max_distance)
256 {
257 candidate_center = j;
258 max_distance = distances[j];
259 }
260 }
261
262 /* Checked earlier by counting entries on input, just in case */
263 assert(max_distance >= 0);
264
265 /* accept candidate to centers */
266 distances[candidate_center] = -1;
267 /* Copy the point coordinates into the initial centers array
268 * Centers array is an array of pointers to points, not an array of points */
269 centers[i] = objs[candidate_center];
270 }
271 lwfree(distances);
272 }
273}
274
275static uint32_t
277 uint32_t *clusters,
278 uint32_t n,
279 POINT4D *centers,
280 double *radii,
281 uint32_t min_k,
282 double max_radius)
283{
284 uint8_t converged = LW_FALSE;
285 uint32_t cur_k = min_k;
286
287 kmeans_init(objs, n, centers, cur_k);
288 /* One iteration of kmeans needs to happen without shortcuts to fully initialize structures */
289 update_r(objs, clusters, n, centers, radii, cur_k);
290 update_means(objs, clusters, n, centers, cur_k);
291 for (uint32_t t = 0; t < KMEANS_MAX_ITERATIONS; t++)
292 {
293 /* Standard KMeans loop */
294 for (uint32_t i = 0; i < KMEANS_MAX_ITERATIONS; i++)
295 {
296 LW_ON_INTERRUPT(break);
297 converged = update_r(objs, clusters, n, centers, radii, cur_k);
298 if (converged)
299 break;
300 update_means(objs, clusters, n, centers, cur_k);
301 }
302 if (!converged || !max_radius)
303 break;
304
305 /* XMeans-inspired improve_structure pass to split clusters bigger than limit into 2 */
306 uint32_t new_k = improve_structure(objs, clusters, n, centers, radii, cur_k, max_radius);
307 if (new_k == cur_k)
308 break;
309 cur_k = new_k;
310 }
311
312 if (!converged)
313 {
314 lwerror("%s did not converge after %d iterations", __func__, KMEANS_MAX_ITERATIONS);
315 return 0;
316 }
317 return cur_k;
318}
319
320int *
321lwgeom_cluster_kmeans(const LWGEOM **geoms, uint32_t n, uint32_t k, double max_radius)
322{
323 uint32_t num_non_empty = 0;
324
325 assert(k > 0);
326 assert(n > 0);
327 assert(max_radius >= 0);
328 assert(geoms);
329
330 if (n < k)
331 {
332 lwerror(
333 "%s: number of geometries is less than the number of clusters requested, not all clusters will get data",
334 __func__);
335 k = n;
336 }
337
338 /* An array of objects to be analyzed. */
339 POINT4D *objs_dense = lwalloc(sizeof(POINT4D) * n);
340
341 /* Array to mark unclusterable objects. Will be returned as KMEANS_NULL_CLUSTER. */
342 uint8_t *geom_valid = lwalloc(sizeof(uint8_t) * n);
343 memset(geom_valid, 0, sizeof(uint8_t) * n);
344
345 /* Array to fill in with cluster numbers. */
346 int *clusters = lwalloc(sizeof(int) * n);
347 for (uint32_t i = 0; i < n; i++)
348 clusters[i] = KMEANS_NULL_CLUSTER;
349
350 /* An array of clusters centers for the algorithm. */
351 POINT4D *centers = lwalloc(sizeof(POINT4D) * n);
352 memset(centers, 0, sizeof(POINT4D) * n);
353
354 /* An array of clusters radii for the algorithm. */
355 double *radii = lwalloc(sizeof(double) * n);
356 memset(radii, 0, sizeof(double) * n);
357
358 /* Prepare the list of object pointers for K-means */
359 for (uint32_t i = 0; i < n; i++)
360 {
361 const LWGEOM *geom = geoms[i];
362 /* Unset M values will be 1 */
363 POINT4D out = {0, 0, 0, 1};
364
365 /* Null/empty geometries get geom_valid=LW_FALSE set earlier with memset */
366 if ((!geom) || lwgeom_is_empty(geom))
367 continue;
368
369 /* If the input is a point, use its coordinates */
370 if (lwgeom_get_type(geom) == POINTTYPE)
371 {
372 out.x = lwpoint_get_x(lwgeom_as_lwpoint(geom));
373 out.y = lwpoint_get_y(lwgeom_as_lwpoint(geom));
374 if (lwgeom_has_z(geom))
375 out.z = lwpoint_get_z(lwgeom_as_lwpoint(geom));
376 if (lwgeom_has_m(geom))
377 {
378 out.m = lwpoint_get_m(lwgeom_as_lwpoint(geom));
379 if (out.m <= 0)
380 lwerror("%s has an input point geometry with weight in M less or equal to 0",
381 __func__);
382 }
383 }
384 else if (!lwgeom_has_z(geom))
385 {
386 /* For 2D, we can take a centroid */
388 if (!centroid)
389 continue;
391 {
393 continue;
394 }
398 }
399 else
400 {
401 /* For 3D non-point, we can have a box center */
402 const GBOX *box = lwgeom_get_bbox(geom);
403 if (!gbox_is_valid(box))
404 continue;
405 out.x = (box->xmax + box->xmin) / 2;
406 out.y = (box->ymax + box->ymin) / 2;
407 out.z = (box->zmax + box->zmin) / 2;
408 }
409 geom_valid[i] = LW_TRUE;
410 objs_dense[num_non_empty++] = out;
411 }
412
413 if (num_non_empty < k)
414 {
415 lwnotice(
416 "%s: number of non-empty geometries (%d) is less than the number of clusters (%d) requested, not all clusters will get data",
417 __func__,
418 num_non_empty,
419 k);
420 k = num_non_empty;
421 }
422
423 uint8_t converged = LW_TRUE;
424
425 if (num_non_empty > 0)
426 {
427 uint32_t *clusters_dense = lwalloc(sizeof(uint32_t) * num_non_empty);
428 memset(clusters_dense, 0, sizeof(uint32_t) * num_non_empty);
429 uint32_t output_cluster_count = kmeans(objs_dense, clusters_dense, num_non_empty, centers, radii, k, max_radius);
430
431 uint32_t d = 0;
432 for (uint32_t i = 0; i < n; i++)
433 if (geom_valid[i])
434 clusters[i] = (int)clusters_dense[d++];
435
436 converged = output_cluster_count > 0;
437 lwfree(clusters_dense);
438 }
439
440 /* Before error handling, might as well clean up all the inputs */
441 lwfree(objs_dense);
442 lwfree(centers);
443 lwfree(geom_valid);
444 lwfree(radii);
445
446 /* Good result */
447 if (converged)
448 return clusters;
449
450 /* Bad result, not going to need the answer */
451 lwfree(clusters);
452 return NULL;
453}
int gbox_is_valid(const GBOX *gbox)
Return false if any of the dimensions is NaN or infinite.
Definition gbox.c:197
#define LW_FALSE
Definition liblwgeom.h:94
double lwpoint_get_m(const LWPOINT *point)
Definition lwpoint.c:107
void lwgeom_free(LWGEOM *geom)
Definition lwgeom.c:1246
LWGEOM * lwgeom_centroid(const LWGEOM *geom)
double lwpoint_get_x(const LWPOINT *point)
Definition lwpoint.c:63
int lwgeom_has_z(const LWGEOM *geom)
Return LW_TRUE if geometry has Z ordinates.
Definition lwgeom.c:962
#define POINTTYPE
LWTYPE numbers, used internally by PostGIS.
Definition liblwgeom.h:102
void * lwalloc(size_t size)
Definition lwutil.c:227
void lwfree(void *mem)
Definition lwutil.c:248
double lwpoint_get_z(const LWPOINT *point)
Definition lwpoint.c:89
#define LW_TRUE
Return types for functions with status returns.
Definition liblwgeom.h:93
const GBOX * lwgeom_get_bbox(const LWGEOM *lwgeom)
Get a non-empty geometry bounding box, computing and caching it if not already there.
Definition lwgeom.c:771
int lwgeom_has_m(const LWGEOM *geom)
Return LW_TRUE if geometry has M ordinates.
Definition lwgeom.c:969
double lwpoint_get_y(const LWPOINT *point)
Definition lwpoint.c:76
#define LW_ON_INTERRUPT(x)
void lwnotice(const char *fmt,...) __attribute__((format(printf
Write a notice out to the notice handler.
void void lwerror(const char *fmt,...) __attribute__((format(printf
Write a notice out to the error handler.
static uint32_t lwgeom_get_type(const LWGEOM *geom)
Return LWTYPE number.
Definition lwinline.h:141
static LWPOINT * lwgeom_as_lwpoint(const LWGEOM *lwgeom)
Definition lwinline.h:127
static int lwgeom_is_empty(const LWGEOM *geom)
Return true or false depending on whether a geometry is an "empty" geometry (no vertices members)
Definition lwinline.h:199
#define KMEANS_MAX_ITERATIONS
Definition lwkmeans.c:20
static uint32_t kmeans(POINT4D *objs, uint32_t *clusters, uint32_t n, POINT4D *centers, double *radii, uint32_t min_k, double max_radius)
Definition lwkmeans.c:276
static void kmeans_init(POINT4D *objs, uint32_t n, POINT4D *centers, uint32_t k)
Definition lwkmeans.c:175
static uint8_t update_r(POINT4D *objs, uint32_t *clusters, uint32_t n, POINT4D *centers, double *radii, uint32_t k)
Definition lwkmeans.c:109
int * lwgeom_cluster_kmeans(const LWGEOM **geoms, uint32_t n, uint32_t k, double max_radius)
Take a list of LWGEOMs and a number of clusters and return an integer array indicating which cluster ...
Definition lwkmeans.c:321
static void update_means(POINT4D *objs, uint32_t *clusters, uint32_t n, POINT4D *centers, uint32_t k)
Definition lwkmeans.c:149
#define KMEANS_NULL_CLUSTER
Definition lwkmeans.c:14
static double distance3d_sqr_pt4d_pt4d(const POINT4D *p1, const POINT4D *p2)
Definition lwkmeans.c:31
static uint32_t improve_structure(POINT4D *objs, uint32_t *clusters, uint32_t n, POINT4D *centers, double *radii, uint32_t k, double max_radius)
Definition lwkmeans.c:42
static double distance(double x1, double y1, double x2, double y2)
Definition lwtree.c:1032
Datum centroid(PG_FUNCTION_ARGS)
double ymax
Definition liblwgeom.h:357
double zmax
Definition liblwgeom.h:359
double xmax
Definition liblwgeom.h:355
double zmin
Definition liblwgeom.h:358
double ymin
Definition liblwgeom.h:356
double xmin
Definition liblwgeom.h:354
double m
Definition liblwgeom.h:414
double x
Definition liblwgeom.h:414
double z
Definition liblwgeom.h:414
double y
Definition liblwgeom.h:414