PostGIS  3.1.6dev-r@@SVN_REVISION@@
lwkmeans.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * Copyright (c) 2018-2020, Darafei Praliaskouski <me@komzpa.net>
4  * Copyright (c) 2016, Paul Ramsey <pramsey@cleverelephant.ca>
5  *
6  *------------------------------------------------------------------------*/
7 
8 #include "liblwgeom_internal.h"
9 
10 /*
11  * When clustering lists with NULL or EMPTY elements, they will get this as
12  * their cluster number. (All the other clusters will be non-negative)
13  */
14 #define KMEANS_NULL_CLUSTER -1
15 
16 /*
17  * If the algorithm doesn't converge within this number of iterations,
18  * it will return with a failure error code.
19  */
20 #define KMEANS_MAX_ITERATIONS 1000
21 
22 inline static double
24 {
25  double hside = p2->x - p1->x;
26  double vside = p2->y - p1->y;
27  double zside = p2->z - p1->z;
28 
29  return hside * hside + vside * vside + zside * zside;
30 }
31 
32 static uint8_t
33 update_r(POINT4D *objs, int *clusters, uint32_t n, POINT4D *centers, uint32_t k)
34 {
35  uint8_t converged = LW_TRUE;
36 
37  for (uint32_t i = 0; i < n; i++)
38  {
39  POINT4D obj = objs[i];
40 
41  /* Initialize with distance to first cluster */
42  double curr_distance = distance3d_sqr_pt4d_pt4d(&obj, &centers[0]);
43  int curr_cluster = 0;
44 
45  /* Check all other cluster centers and find the nearest */
46  for (int cluster = 1; cluster < (int)k; cluster++)
47  {
48  double distance = distance3d_sqr_pt4d_pt4d(&obj, &centers[cluster]);
49  if (distance < curr_distance)
50  {
51  curr_distance = distance;
52  curr_cluster = cluster;
53  }
54  }
55 
56  /* Store the nearest cluster this object is in */
57  if (clusters[i] != curr_cluster)
58  {
59  converged = LW_FALSE;
60  clusters[i] = curr_cluster;
61  }
62  }
63  return converged;
64 }
65 
66 static void
67 update_means(POINT4D *objs, int *clusters, uint32_t n, POINT4D *centers, uint32_t k)
68 {
69  memset(centers, 0, sizeof(POINT4D) * k);
70  for (uint32_t i = 0; i < n; i++)
71  {
72  int cluster = clusters[i];
73  centers[cluster].x += objs[i].x * objs[i].m;
74  centers[cluster].y += objs[i].y * objs[i].m;
75  centers[cluster].z += objs[i].z * objs[i].m;
76  centers[cluster].m += objs[i].m;
77  }
78  for (uint32_t i = 0; i < k; i++)
79  {
80  if (centers[i].m)
81  {
82  centers[i].x /= centers[i].m;
83  centers[i].y /= centers[i].m;
84  centers[i].z /= centers[i].m;
85  }
86  }
87 }
88 
89 static uint8_t
90 kmeans(POINT4D *objs, int *clusters, uint32_t n, POINT4D *centers, uint32_t k)
91 {
92  uint8_t converged = LW_FALSE;
93 
94  for (uint32_t i = 0; i < KMEANS_MAX_ITERATIONS; i++)
95  {
96  LW_ON_INTERRUPT(break);
97  converged = update_r(objs, clusters, n, centers, k);
98  if (converged)
99  break;
100  update_means(objs, clusters, n, centers, k);
101  }
102  if (!converged)
103  lwerror("%s did not converge after %d iterations", __func__, KMEANS_MAX_ITERATIONS);
104  return converged;
105 }
106 
107 static void
108 kmeans_init(POINT4D *objs, uint32_t n, POINT4D *centers, uint32_t k)
109 {
110  double *distances;
111  uint32_t p1 = 0, p2 = 0;
112  uint32_t i, j;
113  uint32_t duplicate_count = 1; /* a point is a duplicate of itself */
114  double max_dst = -1, current_distance;
115  double dst_p1, dst_p2;
116 
117  /* k=0, k=1: "clustering" is just input validation */
118  assert(k > 1);
119 
120  /* k >= 2: find two distant points greedily */
121  for (i = 1; i < n; i++)
122  {
123  /* if we found a larger distance, replace our choice */
124  dst_p1 = distance3d_sqr_pt4d_pt4d(&objs[i], &objs[p1]);
125  dst_p2 = distance3d_sqr_pt4d_pt4d(&objs[i], &objs[p2]);
126  if ((dst_p1 > max_dst) || (dst_p2 > max_dst))
127  {
128  if (dst_p1 > dst_p2)
129  {
130  max_dst = dst_p1;
131  p2 = i;
132  }
133  else
134  {
135  max_dst = dst_p2;
136  p1 = i;
137  }
138  }
139  if ((dst_p1 == 0) || (dst_p2 == 0))
140  duplicate_count++;
141  }
142  if (duplicate_count > 1)
143  lwnotice(
144  "%s: there are at least %u duplicate inputs, number of output clusters may be less than you requested",
145  __func__,
146  duplicate_count);
147 
148  /* by now two points should be found and non-same */
149  assert(p1 != p2 && max_dst >= 0);
150 
151  /* accept these two points */
152  centers[0] = objs[p1];
153  centers[1] = objs[p2];
154 
155  if (k > 2)
156  {
157  /* array of minimum distance to a point from accepted cluster centers */
158  distances = lwalloc(sizeof(double) * n);
159 
160  /* initialize array with distance to first object */
161  for (j = 0; j < n; j++)
162  distances[j] = distance3d_sqr_pt4d_pt4d(&objs[j], &centers[0]);
163  distances[p1] = -1;
164  distances[p2] = -1;
165 
166  /* loop i on clusters, skip 0 and 1 as found already */
167  for (i = 2; i < k; i++)
168  {
169  uint32_t candidate_center = 0;
170  double max_distance = -DBL_MAX;
171 
172  /* loop j on objs */
173  for (j = 0; j < n; j++)
174  {
175  /* empty objs and accepted clusters are already marked with distance = -1 */
176  if (distances[j] < 0)
177  continue;
178 
179  /* update minimal distance with previosuly accepted cluster */
180  current_distance = distance3d_sqr_pt4d_pt4d(&objs[j], &centers[i - 1]);
181  if (current_distance < distances[j])
182  distances[j] = current_distance;
183 
184  /* greedily take a point that's farthest from any of accepted clusters */
185  if (distances[j] > max_distance)
186  {
187  candidate_center = j;
188  max_distance = distances[j];
189  }
190  }
191 
192  /* Checked earlier by counting entries on input, just in case */
193  assert(max_distance >= 0);
194 
195  /* accept candidate to centers */
196  distances[candidate_center] = -1;
197  /* Copy the point coordinates into the initial centers array
198  * Centers array is an array of pointers to points, not an array of points */
199  centers[i] = objs[candidate_center];
200  }
201  lwfree(distances);
202  }
203 }
204 
205 int *
206 lwgeom_cluster_kmeans(const LWGEOM **geoms, uint32_t n, uint32_t k)
207 {
208  uint32_t num_non_empty = 0;
209  uint8_t converged = LW_FALSE;
210 
211  assert(k > 0);
212  assert(n > 0);
213  assert(geoms);
214 
215  if (n < k)
216  {
217  lwerror(
218  "%s: number of geometries is less than the number of clusters requested, not all clusters will get data",
219  __func__);
220  k = n;
221  }
222 
223  /* An array of objects to be analyzed. */
224  POINT4D *objs = lwalloc(sizeof(POINT4D) * n);
225 
226  /* Array to mark unclusterable objects. Will be returned as KMEANS_NULL_CLUSTER. */
227  uint8_t *geom_valid = lwalloc(sizeof(uint8_t) * n);
228  memset(geom_valid, 0, sizeof(uint8_t) * n);
229 
230  /* Array to fill in with cluster numbers. */
231  int *clusters = lwalloc(sizeof(int) * n);
232  memset(clusters, 0, sizeof(int) * n);
233 
234  /* An array of clusters centers for the algorithm. */
235  POINT4D *centers = lwalloc(sizeof(POINT4D) * k);
236  memset(centers, 0, sizeof(POINT4D) * k);
237 
238  /* Prepare the list of object pointers for K-means */
239  for (uint32_t i = 0; i < n; i++)
240  {
241  const LWGEOM *geom = geoms[i];
242  /* Unset M values will be 1 */
243  POINT4D out = {0, 0, 0, 1};
244 
245  /* Null/empty geometries get geom_valid=LW_FALSE set earlier with memset */
246  if ((!geom) || lwgeom_is_empty(geom))
247  continue;
248 
249  /* If the input is a point, use its coordinates */
250  if (lwgeom_get_type(geom) == POINTTYPE)
251  {
252  out.x = lwpoint_get_x(lwgeom_as_lwpoint(geom));
253  out.y = lwpoint_get_y(lwgeom_as_lwpoint(geom));
254  if (lwgeom_has_z(geom))
255  out.z = lwpoint_get_z(lwgeom_as_lwpoint(geom));
256  if (lwgeom_has_m(geom))
257  {
258  out.m = lwpoint_get_m(lwgeom_as_lwpoint(geom));
259  if (out.m <= 0)
260  lwerror("%s has an input point geometry with weight in M less or equal to 0",
261  __func__);
262  }
263  }
264  else if (!lwgeom_has_z(geom))
265  {
266  /* For 2D, we can take a centroid*/
268  if (!centroid)
269  continue;
271  {
273  continue;
274  }
278  }
279  else
280  {
281  /* For 3D non-point, we can have a box center */
282  const GBOX *box = lwgeom_get_bbox(geom);
283  if (!gbox_is_valid(box))
284  continue;
285  out.x = (box->xmax + box->xmin) / 2;
286  out.y = (box->ymax + box->ymin) / 2;
287  out.z = (box->zmax + box->zmin) / 2;
288  }
289  geom_valid[i] = LW_TRUE;
290  objs[num_non_empty++] = out;
291  }
292 
293  if (num_non_empty < k)
294  {
295  lwnotice(
296  "%s: number of non-empty geometries (%d) is less than the number of clusters (%d) requested, not all clusters will get data",
297  __func__,
298  num_non_empty,
299  k);
300  k = num_non_empty;
301  }
302 
303  if (k > 1)
304  {
305  int *clusters_dense = lwalloc(sizeof(int) * num_non_empty);
306  memset(clusters_dense, 0, sizeof(int) * num_non_empty);
307 
308  kmeans_init(objs, num_non_empty, centers, k);
309  converged = kmeans(objs, clusters_dense, num_non_empty, centers, k);
310 
311  if (converged)
312  {
313  uint32_t d = 0;
314  for (uint32_t i = 0; i < n; i++)
315  if (geom_valid[i])
316  clusters[i] = clusters_dense[d++];
317  else
318  clusters[i] = KMEANS_NULL_CLUSTER;
319  }
320  lwfree(clusters_dense);
321  }
322  else if (k == 0)
323  {
324  /* k=0: everything is unclusterable */
325  for (uint32_t i = 0; i < n; i++)
326  clusters[i] = KMEANS_NULL_CLUSTER;
327  converged = LW_TRUE;
328  }
329  else
330  {
331  /* k=1: mark up NULL and non-NULL */
332  for (uint32_t i = 0; i < n; i++)
333  {
334  if (!geom_valid[i])
335  clusters[i] = KMEANS_NULL_CLUSTER;
336  else
337  clusters[i] = 0;
338  }
339  converged = LW_TRUE;
340  }
341 
342  /* Before error handling, might as well clean up all the inputs */
343  lwfree(objs);
344  lwfree(centers);
345  lwfree(geom_valid);
346 
347  /* Good result */
348  if (converged)
349  return clusters;
350 
351  /* Bad result, not going to need the answer */
352  lwfree(clusters);
353  return NULL;
354 }
int gbox_is_valid(const GBOX *gbox)
Return false if any of the dimensions is NaN or infinite.
Definition: gbox.c:197
LWGEOM * lwgeom_centroid(const LWGEOM *geom)
#define LW_FALSE
Definition: liblwgeom.h:108
double lwpoint_get_m(const LWPOINT *point)
Definition: lwpoint.c:107
void lwgeom_free(LWGEOM *geom)
Definition: lwgeom.c:1138
double lwpoint_get_x(const LWPOINT *point)
Definition: lwpoint.c:63
int lwgeom_has_z(const LWGEOM *geom)
Return LW_TRUE if geometry has Z ordinates.
Definition: lwgeom.c:917
#define POINTTYPE
LWTYPE numbers, used internally by PostGIS.
Definition: liblwgeom.h:116
void lwfree(void *mem)
Definition: lwutil.c:242
const GBOX * lwgeom_get_bbox(const LWGEOM *lwgeom)
Get a non-empty geometry bounding box, computing and caching it if not already there.
Definition: lwgeom.c:726
void * lwalloc(size_t size)
Definition: lwutil.c:227
double lwpoint_get_z(const LWPOINT *point)
Definition: lwpoint.c:89
#define LW_TRUE
Return types for functions with status returns.
Definition: liblwgeom.h:107
int lwgeom_has_m(const LWGEOM *geom)
Return LW_TRUE if geometry has M ordinates.
Definition: lwgeom.c:924
double lwpoint_get_y(const LWPOINT *point)
Definition: lwpoint.c:76
#define LW_ON_INTERRUPT(x)
void lwerror(const char *fmt,...)
Write a notice out to the error handler.
Definition: lwutil.c:190
void lwnotice(const char *fmt,...)
Write a notice out to the notice handler.
Definition: lwutil.c:177
static uint32_t lwgeom_get_type(const LWGEOM *geom)
Return LWTYPE number.
Definition: lwinline.h:145
static int lwgeom_is_empty(const LWGEOM *geom)
Return true or false depending on whether a geometry is an "empty" geometry (no vertices members)
Definition: lwinline.h:203
static LWPOINT * lwgeom_as_lwpoint(const LWGEOM *lwgeom)
Definition: lwinline.h:131
static uint8_t kmeans(POINT4D *objs, int *clusters, uint32_t n, POINT4D *centers, uint32_t k)
Definition: lwkmeans.c:90
#define KMEANS_MAX_ITERATIONS
Definition: lwkmeans.c:20
static void kmeans_init(POINT4D *objs, uint32_t n, POINT4D *centers, uint32_t k)
Definition: lwkmeans.c:108
static void update_means(POINT4D *objs, int *clusters, uint32_t n, POINT4D *centers, uint32_t k)
Definition: lwkmeans.c:67
int * lwgeom_cluster_kmeans(const LWGEOM **geoms, uint32_t n, uint32_t k)
Take a list of LWGEOMs and a number of clusters and return an integer array indicating which cluster ...
Definition: lwkmeans.c:206
static uint8_t update_r(POINT4D *objs, int *clusters, uint32_t n, POINT4D *centers, uint32_t k)
Definition: lwkmeans.c:33
#define KMEANS_NULL_CLUSTER
Definition: lwkmeans.c:14
static double distance3d_sqr_pt4d_pt4d(const POINT4D *p1, const POINT4D *p2)
Definition: lwkmeans.c:23
static double distance(double x1, double y1, double x2, double y2)
Definition: lwtree.c:1032
Datum centroid(PG_FUNCTION_ARGS)
double ymax
Definition: liblwgeom.h:371
double zmax
Definition: liblwgeom.h:373
double xmax
Definition: liblwgeom.h:369
double zmin
Definition: liblwgeom.h:372
double ymin
Definition: liblwgeom.h:370
double xmin
Definition: liblwgeom.h:368
double m
Definition: liblwgeom.h:428
double x
Definition: liblwgeom.h:428
double z
Definition: liblwgeom.h:428
double y
Definition: liblwgeom.h:428