PostGIS  3.4.0dev-r@@SVN_REVISION@@
lwkmeans.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * Copyright (c) 2018-2021, Darafei Praliaskouski <me@komzpa.net>
4  * Copyright (c) 2016, Paul Ramsey <pramsey@cleverelephant.ca>
5  *
6  *------------------------------------------------------------------------*/
7 
8 #include "liblwgeom_internal.h"
9 
10 /*
11  * When clustering lists with NULL or EMPTY elements, they will get this as
12  * their cluster number. (All the other clusters will be non-negative)
13  */
14 #define KMEANS_NULL_CLUSTER -1
15 
16 /*
17  * If the algorithm doesn't converge within this number of iterations,
18  * it will return with a failure error code.
19  */
20 #define KMEANS_MAX_ITERATIONS 1000
21 
22 static uint32_t kmeans(POINT4D *objs,
23  uint32_t *clusters,
24  uint32_t n,
25  POINT4D *centers,
26  double *radii,
27  uint32_t min_k,
28  double max_radius);
29 
30 inline static double
32 {
33  double hside = p2->x - p1->x;
34  double vside = p2->y - p1->y;
35  double zside = p2->z - p1->z;
36 
37  return hside * hside + vside * vside + zside * zside;
38 }
39 
40 /* Split the clusters that need to be split */
41 static uint32_t
43  uint32_t *clusters,
44  uint32_t n,
45  POINT4D *centers,
46  double *radii,
47  uint32_t k,
48  double max_radius)
49 {
50  /* Input check: radius limit should be measurable */
51  if (max_radius <= 0)
52  return k;
53 
54  double max_radius_sq = max_radius * max_radius;
55 
56  /* Do we have the big clusters to split at all? */
57  uint32_t first_cluster_to_split = 0;
58  for (; first_cluster_to_split < k; first_cluster_to_split++)
59  if (radii[first_cluster_to_split] > max_radius_sq)
60  break;
61  if (first_cluster_to_split == k)
62  return k;
63 
64  POINT4D *temp_objs = lwalloc(sizeof(POINT4D) * n);
65  uint32_t *temp_clusters = lwalloc(sizeof(uint32_t) * n);
66  double *temp_radii = lwalloc(sizeof(double) * n);
67  POINT4D *temp_centers = lwalloc(sizeof(POINT4D) * n);
68 
69  uint32_t new_k = k;
70 
71  for (uint32_t cluster = first_cluster_to_split; cluster < k; cluster++)
72  {
73  if (radii[cluster] <= max_radius_sq)
74  continue;
75 
76  /* copy cluster alone */
77  uint32_t cluster_size = 0;
78  for (uint32_t i = 0; i < n; i++)
79  if (clusters[i] == cluster)
80  temp_objs[cluster_size++] = objs[i];
81  if (cluster_size <= 1)
82  continue;
83 
84  /* run 2-means on the cluster */
85  kmeans(temp_objs, temp_clusters, cluster_size, temp_centers, temp_radii, 2, 0);
86 
87  /* replace cluster with split */
88  uint32_t d = 0;
89  for (uint32_t i = 0; i < n; i++)
90  if (clusters[i] == cluster)
91  if (temp_clusters[d++])
92  clusters[i] = new_k;
93 
94  centers[cluster] = temp_centers[0];
95  centers[new_k] = temp_centers[1];
96  radii[cluster] = temp_radii[0];
97  radii[new_k] = temp_radii[1];
98  new_k++;
99  }
100  lwfree(temp_centers);
101  lwfree(temp_radii);
102  lwfree(temp_clusters);
103  lwfree(temp_objs);
104  return new_k;
105 }
106 
107 /* Refresh mapping of point to closest cluster */
108 static uint8_t
109 update_r(POINT4D *objs, uint32_t *clusters, uint32_t n, POINT4D *centers, double *radii, uint32_t k)
110 {
111  uint8_t converged = LW_TRUE;
112  if (radii)
113  memset(radii, 0, sizeof(double) * k);
114 
115  for (uint32_t i = 0; i < n; i++)
116  {
117  POINT4D obj = objs[i];
118 
119  /* Initialize with distance to first cluster */
120  double curr_distance = distance3d_sqr_pt4d_pt4d(&obj, &centers[0]);
121  uint32_t curr_cluster = 0;
122 
123  /* Check all other cluster centers and find the nearest */
124  for (uint32_t cluster = 1; cluster < k; cluster++)
125  {
126  double distance = distance3d_sqr_pt4d_pt4d(&obj, &centers[cluster]);
127  if (distance < curr_distance)
128  {
129  curr_distance = distance;
130  curr_cluster = cluster;
131  }
132  }
133 
134  /* Store the nearest cluster this object is in */
135  if (clusters[i] != curr_cluster)
136  {
137  converged = LW_FALSE;
138  clusters[i] = curr_cluster;
139  }
140  if (radii)
141  if (radii[curr_cluster] < curr_distance)
142  radii[curr_cluster] = curr_distance;
143  }
144  return converged;
145 }
146 
147 /* Refresh cluster centroids based on all of their objects */
148 static void
149 update_means(POINT4D *objs, uint32_t *clusters, uint32_t n, POINT4D *centers, uint32_t k)
150 {
151  memset(centers, 0, sizeof(POINT4D) * k);
152  /* calculate weighted sum */
153  for (uint32_t i = 0; i < n; i++)
154  {
155  uint32_t cluster = clusters[i];
156  centers[cluster].x += objs[i].x * objs[i].m;
157  centers[cluster].y += objs[i].y * objs[i].m;
158  centers[cluster].z += objs[i].z * objs[i].m;
159  centers[cluster].m += objs[i].m;
160  }
161  /* divide by weight to get average */
162  for (uint32_t i = 0; i < k; i++)
163  {
164  if (centers[i].m)
165  {
166  centers[i].x /= centers[i].m;
167  centers[i].y /= centers[i].m;
168  centers[i].z /= centers[i].m;
169  }
170  }
171 }
172 
173 /* Assign initial clusters centroids heuristically */
174 static void
175 kmeans_init(POINT4D *objs, uint32_t n, POINT4D *centers, uint32_t k)
176 {
177  double *distances;
178  uint32_t p1 = 0, p2 = 0;
179  uint32_t duplicate_count = 1; /* a point is a duplicate of itself */
180  double max_dst = -1;
181 
182  /* k=0, k=1: any point will do */
183  assert(n > 0);
184  if (k < 2)
185  {
186  centers[0] = objs[0];
187  return;
188  }
189 
190  /* k >= 2: find two distant points greedily */
191  for (uint32_t i = 1; i < n; i++)
192  {
193  /* if we found a larger distance, replace our choice */
194  double dst_p1 = distance3d_sqr_pt4d_pt4d(&objs[i], &objs[p1]);
195  double dst_p2 = distance3d_sqr_pt4d_pt4d(&objs[i], &objs[p2]);
196  if ((dst_p1 > max_dst) || (dst_p2 > max_dst))
197  {
198  if (dst_p1 > dst_p2)
199  {
200  max_dst = dst_p1;
201  p2 = i;
202  }
203  else
204  {
205  max_dst = dst_p2;
206  p1 = i;
207  }
208  }
209  if ((dst_p1 == 0) || (dst_p2 == 0))
210  duplicate_count++;
211  }
212  if (duplicate_count > 1)
213  lwnotice(
214  "%s: there are at least %u duplicate inputs, number of output clusters may be less than you requested",
215  __func__,
216  duplicate_count);
217 
218  /* by now two points should be found and non-same */
219  assert(p1 != p2 && max_dst >= 0);
220 
221  /* accept these two points */
222  centers[0] = objs[p1];
223  centers[1] = objs[p2];
224 
225  if (k > 2)
226  {
227  /* array of minimum distance to a point from accepted cluster centers */
228  distances = lwalloc(sizeof(double) * n);
229 
230  /* initialize array with distance to first object */
231  for (uint32_t j = 0; j < n; j++)
232  distances[j] = distance3d_sqr_pt4d_pt4d(&objs[j], &centers[0]);
233  distances[p1] = -1;
234  distances[p2] = -1;
235 
236  /* loop i on clusters, skip 0 and 1 as found already */
237  for (uint32_t i = 2; i < k; i++)
238  {
239  uint32_t candidate_center = 0;
240  double max_distance = -DBL_MAX;
241 
242  /* loop j on objs */
243  for (uint32_t j = 0; j < n; j++)
244  {
245  /* empty objs and accepted clusters are already marked with distance = -1 */
246  if (distances[j] < 0)
247  continue;
248 
249  /* update minimal distance with previosuly accepted cluster */
250  double current_distance = distance3d_sqr_pt4d_pt4d(&objs[j], &centers[i - 1]);
251  if (current_distance < distances[j])
252  distances[j] = current_distance;
253 
254  /* greedily take a point that's farthest from any of accepted clusters */
255  if (distances[j] > max_distance)
256  {
257  candidate_center = j;
258  max_distance = distances[j];
259  }
260  }
261 
262  /* Checked earlier by counting entries on input, just in case */
263  assert(max_distance >= 0);
264 
265  /* accept candidate to centers */
266  distances[candidate_center] = -1;
267  /* Copy the point coordinates into the initial centers array
268  * Centers array is an array of pointers to points, not an array of points */
269  centers[i] = objs[candidate_center];
270  }
271  lwfree(distances);
272  }
273 }
274 
275 static uint32_t
277  uint32_t *clusters,
278  uint32_t n,
279  POINT4D *centers,
280  double *radii,
281  uint32_t min_k,
282  double max_radius)
283 {
284  uint8_t converged = LW_FALSE;
285  uint32_t cur_k = min_k;
286 
287  kmeans_init(objs, n, centers, cur_k);
288  /* One iteration of kmeans needs to happen without shortcuts to fully initialize structures */
289  update_r(objs, clusters, n, centers, radii, cur_k);
290  update_means(objs, clusters, n, centers, cur_k);
291  for (uint32_t t = 0; t < KMEANS_MAX_ITERATIONS; t++)
292  {
293  /* Standard KMeans loop */
294  for (uint32_t i = 0; i < KMEANS_MAX_ITERATIONS; i++)
295  {
296  LW_ON_INTERRUPT(break);
297  converged = update_r(objs, clusters, n, centers, radii, cur_k);
298  if (converged)
299  break;
300  update_means(objs, clusters, n, centers, cur_k);
301  }
302  if (!converged || !max_radius)
303  break;
304 
305  /* XMeans-inspired improve_structure pass to split clusters bigger than limit into 2 */
306  uint32_t new_k = improve_structure(objs, clusters, n, centers, radii, cur_k, max_radius);
307  if (new_k == cur_k)
308  break;
309  cur_k = new_k;
310  }
311 
312  if (!converged)
313  {
314  lwerror("%s did not converge after %d iterations", __func__, KMEANS_MAX_ITERATIONS);
315  return 0;
316  }
317  return cur_k;
318 }
319 
320 int *
321 lwgeom_cluster_kmeans(const LWGEOM **geoms, uint32_t n, uint32_t k, double max_radius)
322 {
323  uint32_t num_non_empty = 0;
324 
325  assert(k > 0);
326  assert(n > 0);
327  assert(max_radius >= 0);
328  assert(geoms);
329 
330  if (n < k)
331  {
332  lwerror(
333  "%s: number of geometries is less than the number of clusters requested, not all clusters will get data",
334  __func__);
335  k = n;
336  }
337 
338  /* An array of objects to be analyzed. */
339  POINT4D *objs_dense = lwalloc(sizeof(POINT4D) * n);
340 
341  /* Array to mark unclusterable objects. Will be returned as KMEANS_NULL_CLUSTER. */
342  uint8_t *geom_valid = lwalloc(sizeof(uint8_t) * n);
343  memset(geom_valid, 0, sizeof(uint8_t) * n);
344 
345  /* Array to fill in with cluster numbers. */
346  int *clusters = lwalloc(sizeof(int) * n);
347  for (uint32_t i = 0; i < n; i++)
348  clusters[i] = KMEANS_NULL_CLUSTER;
349 
350  /* An array of clusters centers for the algorithm. */
351  POINT4D *centers = lwalloc(sizeof(POINT4D) * n);
352  memset(centers, 0, sizeof(POINT4D) * n);
353 
354  /* An array of clusters radii for the algorithm. */
355  double *radii = lwalloc(sizeof(double) * n);
356  memset(radii, 0, sizeof(double) * n);
357 
358  /* Prepare the list of object pointers for K-means */
359  for (uint32_t i = 0; i < n; i++)
360  {
361  const LWGEOM *geom = geoms[i];
362  /* Unset M values will be 1 */
363  POINT4D out = {0, 0, 0, 1};
364 
365  /* Null/empty geometries get geom_valid=LW_FALSE set earlier with memset */
366  if ((!geom) || lwgeom_is_empty(geom))
367  continue;
368 
369  /* If the input is a point, use its coordinates */
370  if (lwgeom_get_type(geom) == POINTTYPE)
371  {
372  out.x = lwpoint_get_x(lwgeom_as_lwpoint(geom));
373  out.y = lwpoint_get_y(lwgeom_as_lwpoint(geom));
374  if (lwgeom_has_z(geom))
375  out.z = lwpoint_get_z(lwgeom_as_lwpoint(geom));
376  if (lwgeom_has_m(geom))
377  {
378  out.m = lwpoint_get_m(lwgeom_as_lwpoint(geom));
379  if (out.m <= 0)
380  lwerror("%s has an input point geometry with weight in M less or equal to 0",
381  __func__);
382  }
383  }
384  else if (!lwgeom_has_z(geom))
385  {
386  /* For 2D, we can take a centroid */
388  if (!centroid)
389  continue;
391  {
393  continue;
394  }
398  }
399  else
400  {
401  /* For 3D non-point, we can have a box center */
402  const GBOX *box = lwgeom_get_bbox(geom);
403  if (!gbox_is_valid(box))
404  continue;
405  out.x = (box->xmax + box->xmin) / 2;
406  out.y = (box->ymax + box->ymin) / 2;
407  out.z = (box->zmax + box->zmin) / 2;
408  }
409  geom_valid[i] = LW_TRUE;
410  objs_dense[num_non_empty++] = out;
411  }
412 
413  if (num_non_empty < k)
414  {
415  lwnotice(
416  "%s: number of non-empty geometries (%d) is less than the number of clusters (%d) requested, not all clusters will get data",
417  __func__,
418  num_non_empty,
419  k);
420  k = num_non_empty;
421  }
422 
423  uint8_t converged = LW_TRUE;
424 
425  if (num_non_empty > 0)
426  {
427  uint32_t *clusters_dense = lwalloc(sizeof(uint32_t) * num_non_empty);
428  memset(clusters_dense, 0, sizeof(uint32_t) * num_non_empty);
429  uint32_t output_cluster_count = kmeans(objs_dense, clusters_dense, num_non_empty, centers, radii, k, max_radius);
430 
431  uint32_t d = 0;
432  for (uint32_t i = 0; i < n; i++)
433  if (geom_valid[i])
434  clusters[i] = (int)clusters_dense[d++];
435 
436  converged = output_cluster_count > 0;
437  lwfree(clusters_dense);
438  }
439 
440  /* Before error handling, might as well clean up all the inputs */
441  lwfree(objs_dense);
442  lwfree(centers);
443  lwfree(geom_valid);
444  lwfree(radii);
445 
446  /* Good result */
447  if (converged)
448  return clusters;
449 
450  /* Bad result, not going to need the answer */
451  lwfree(clusters);
452  return NULL;
453 }
int gbox_is_valid(const GBOX *gbox)
Return false if any of the dimensions is NaN or infinite.
Definition: gbox.c:197
LWGEOM * lwgeom_centroid(const LWGEOM *geom)
#define LW_FALSE
Definition: liblwgeom.h:94
double lwpoint_get_m(const LWPOINT *point)
Definition: lwpoint.c:107
void lwgeom_free(LWGEOM *geom)
Definition: lwgeom.c:1155
double lwpoint_get_x(const LWPOINT *point)
Definition: lwpoint.c:63
int lwgeom_has_z(const LWGEOM *geom)
Return LW_TRUE if geometry has Z ordinates.
Definition: lwgeom.c:934
#define POINTTYPE
LWTYPE numbers, used internally by PostGIS.
Definition: liblwgeom.h:102
void lwfree(void *mem)
Definition: lwutil.c:242
const GBOX * lwgeom_get_bbox(const LWGEOM *lwgeom)
Get a non-empty geometry bounding box, computing and caching it if not already there.
Definition: lwgeom.c:743
void * lwalloc(size_t size)
Definition: lwutil.c:227
double lwpoint_get_z(const LWPOINT *point)
Definition: lwpoint.c:89
#define LW_TRUE
Return types for functions with status returns.
Definition: liblwgeom.h:93
int lwgeom_has_m(const LWGEOM *geom)
Return LW_TRUE if geometry has M ordinates.
Definition: lwgeom.c:941
double lwpoint_get_y(const LWPOINT *point)
Definition: lwpoint.c:76
#define LW_ON_INTERRUPT(x)
void lwerror(const char *fmt,...)
Write a notice out to the error handler.
Definition: lwutil.c:190
void lwnotice(const char *fmt,...)
Write a notice out to the notice handler.
Definition: lwutil.c:177
static uint32_t lwgeom_get_type(const LWGEOM *geom)
Return LWTYPE number.
Definition: lwinline.h:145
static int lwgeom_is_empty(const LWGEOM *geom)
Return true or false depending on whether a geometry is an "empty" geometry (no vertices members)
Definition: lwinline.h:203
static LWPOINT * lwgeom_as_lwpoint(const LWGEOM *lwgeom)
Definition: lwinline.h:131
#define KMEANS_MAX_ITERATIONS
Definition: lwkmeans.c:20
static uint32_t kmeans(POINT4D *objs, uint32_t *clusters, uint32_t n, POINT4D *centers, double *radii, uint32_t min_k, double max_radius)
Definition: lwkmeans.c:276
static void kmeans_init(POINT4D *objs, uint32_t n, POINT4D *centers, uint32_t k)
Definition: lwkmeans.c:175
static uint8_t update_r(POINT4D *objs, uint32_t *clusters, uint32_t n, POINT4D *centers, double *radii, uint32_t k)
Definition: lwkmeans.c:109
int * lwgeom_cluster_kmeans(const LWGEOM **geoms, uint32_t n, uint32_t k, double max_radius)
Take a list of LWGEOMs and a number of clusters and return an integer array indicating which cluster ...
Definition: lwkmeans.c:321
static void update_means(POINT4D *objs, uint32_t *clusters, uint32_t n, POINT4D *centers, uint32_t k)
Definition: lwkmeans.c:149
#define KMEANS_NULL_CLUSTER
Definition: lwkmeans.c:14
static double distance3d_sqr_pt4d_pt4d(const POINT4D *p1, const POINT4D *p2)
Definition: lwkmeans.c:31
static uint32_t improve_structure(POINT4D *objs, uint32_t *clusters, uint32_t n, POINT4D *centers, double *radii, uint32_t k, double max_radius)
Definition: lwkmeans.c:42
static double distance(double x1, double y1, double x2, double y2)
Definition: lwtree.c:1032
Datum centroid(PG_FUNCTION_ARGS)
double ymax
Definition: liblwgeom.h:357
double zmax
Definition: liblwgeom.h:359
double xmax
Definition: liblwgeom.h:355
double zmin
Definition: liblwgeom.h:358
double ymin
Definition: liblwgeom.h:356
double xmin
Definition: liblwgeom.h:354
double m
Definition: liblwgeom.h:414
double x
Definition: liblwgeom.h:414
double z
Definition: liblwgeom.h:414
double y
Definition: liblwgeom.h:414