PostGIS  2.5.7dev-r@@SVN_REVISION@@
lwkmeans.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * Copyright (c) 2018, Darafei Praliaskouski <me@komzpa.net>
4  * Copyright (c) 2016, Paul Ramsey <pramsey@cleverelephant.ca>
5  *
6  *------------------------------------------------------------------------*/
7 
8 #include <float.h>
9 #include <math.h>
10 
11 #include "liblwgeom_internal.h"
12 
13 /*
14  * When clustering lists with NULL or EMPTY elements, they will get this as
15  * their cluster number. (All the other clusters will be non-negative)
16  */
17 #define KMEANS_NULL_CLUSTER -1
18 
19 /*
20  * If the algorithm doesn't converge within this number of iterations,
21  * it will return with a failure error code.
22  */
23 #define KMEANS_MAX_ITERATIONS 1000
24 
25 static void
26 update_r(POINT2D** objs, int* clusters, uint32_t n, POINT2D** centers, uint32_t k)
27 {
28  POINT2D* obj;
29  unsigned int i;
30  double distance, curr_distance;
31  uint32_t cluster, curr_cluster;
32 
33  for (i = 0; i < n; i++)
34  {
35  obj = objs[i];
36 
37  /* Don't try to cluster NULL objects, just add them to the "unclusterable cluster" */
38  if (!obj)
39  {
40  clusters[i] = KMEANS_NULL_CLUSTER;
41  continue;
42  }
43 
44  /* Initialize with distance to first cluster */
45  curr_distance = distance2d_sqr_pt_pt(obj, centers[0]);
46  curr_cluster = 0;
47 
48  /* Check all other cluster centers and find the nearest */
49  for (cluster = 1; cluster < k; cluster++)
50  {
51  distance = distance2d_sqr_pt_pt(obj, centers[cluster]);
52  if (distance < curr_distance)
53  {
54  curr_distance = distance;
55  curr_cluster = cluster;
56  }
57  }
58 
59  /* Store the nearest cluster this object is in */
60  clusters[i] = (int) curr_cluster;
61  }
62 }
63 
64 static void
65 update_means(POINT2D** objs, int* clusters, uint32_t n, POINT2D** centers, uint32_t* weights, uint32_t k)
66 {
67  uint32_t i;
68  int cluster;
69 
70  memset(weights, 0, sizeof(uint32_t) * k);
71  for (i = 0; i < k; i++)
72  {
73  centers[i]->x = 0.0;
74  centers[i]->y = 0.0;
75  }
76  for (i = 0; i < n; i++)
77  {
78  cluster = clusters[i];
79  if (cluster != KMEANS_NULL_CLUSTER)
80  {
81  centers[cluster]->x += objs[i]->x;
82  centers[cluster]->y += objs[i]->y;
83  weights[cluster] += 1;
84  }
85  }
86  for (i = 0; i < k; i++)
87  {
88  if (weights[i])
89  {
90  centers[i]->x /= weights[i];
91  centers[i]->y /= weights[i];
92  }
93  }
94 }
95 
96 static int
97 kmeans(POINT2D** objs, int* clusters, uint32_t n, POINT2D** centers, uint32_t k)
98 {
99  uint32_t i = 0;
100  int* clusters_last;
101  int converged = LW_FALSE;
102  size_t clusters_sz = sizeof(int) * n;
103  uint32_t* weights;
104 
105  weights = lwalloc(sizeof(int) * k);
106 
107  /* previous cluster state array */
108  clusters_last = lwalloc(clusters_sz);
109 
110  for (i = 0; i < KMEANS_MAX_ITERATIONS && !converged; i++)
111  {
112  LW_ON_INTERRUPT(break);
113 
114  /* store the previous state of the clustering */
115  memcpy(clusters_last, clusters, clusters_sz);
116 
117  update_r(objs, clusters, n, centers, k);
118  update_means(objs, clusters, n, centers, weights, k);
119 
120  /* if all the cluster numbers are unchanged, we are at a stable solution */
121  converged = memcmp(clusters_last, clusters, clusters_sz) == 0;
122  }
123 
124  lwfree(clusters_last);
125  lwfree(weights);
126  if (!converged)
127  lwerror("%s did not converge after %d iterations", __func__, i);
128  return converged;
129 }
130 
131 static void
132 kmeans_init(POINT2D** objs, int* clusters, uint32_t n, POINT2D** centers, POINT2D* centers_raw, uint32_t k)
133 {
134  double* distances;
135  uint32_t p1 = 0, p2 = 0;
136  uint32_t i, j;
137  uint32_t duplicate_count = 1; /* a point is a duplicate of itself */
138  double max_dst = -1;
139  double dst_p1, dst_p2;
140 
141  /* k=0, k=1: "clustering" is just input validation */
142  assert(k > 1);
143 
144  /* k >= 2: find two distant points greedily */
145  for (i = 1; i < n; i++)
146  {
147  /* skip null */
148  if (!objs[i]) continue;
149 
150  /* reinit if first element happened to be null */
151  if (!objs[p1] && !objs[p2])
152  {
153  p1 = i;
154  p2 = i;
155  continue;
156  }
157 
158  /* if we found a larger distance, replace our choice */
159  dst_p1 = distance2d_sqr_pt_pt(objs[i], objs[p1]);
160  dst_p2 = distance2d_sqr_pt_pt(objs[i], objs[p2]);
161  if ((dst_p1 > max_dst) || (dst_p2 > max_dst))
162  {
163  max_dst = fmax(dst_p1, dst_p2);
164  if (dst_p1 > dst_p2)
165  p2 = i;
166  else
167  p1 = i;
168  }
169  if ((dst_p1 == 0) || (dst_p2 == 0)) duplicate_count++;
170  }
171  if (duplicate_count > 1)
172  lwnotice(
173  "%s: there are at least %u duplicate inputs, number of output clusters may be less than you requested",
174  __func__,
175  duplicate_count);
176 
177  /* by now two points should be found and non-same */
178  assert(p1 != p2 && objs[p1] && objs[p2] && max_dst >= 0);
179 
180  /* accept these two points */
181  centers_raw[0] = *((POINT2D *)objs[p1]);
182  centers[0] = &(centers_raw[0]);
183  centers_raw[1] = *((POINT2D *)objs[p2]);
184  centers[1] = &(centers_raw[1]);
185 
186  if (k > 2)
187  {
188  /* array of minimum distance to a point from accepted cluster centers */
189  distances = lwalloc(sizeof(double) * n);
190 
191  /* initialize array with distance to first object */
192  for (j = 0; j < n; j++)
193  {
194  if (objs[j])
195  distances[j] = distance2d_sqr_pt_pt(objs[j], centers[0]);
196  else
197  distances[j] = -1;
198  }
199  distances[p1] = -1;
200  distances[p2] = -1;
201 
202  /* loop i on clusters, skip 0 and 1 as found already */
203  for (i = 2; i < k; i++)
204  {
205  uint32_t candidate_center = 0;
206  double max_distance = -DBL_MAX;
207 
208  /* loop j on objs */
209  for (j = 0; j < n; j++)
210  {
211  /* empty objs and accepted clusters are already marked with distance = -1 */
212  if (distances[j] < 0) continue;
213 
214  /* update minimal distance with previosuly accepted cluster */
215  distances[j] = fmin(distance2d_sqr_pt_pt(objs[j], centers[i - 1]), distances[j]);
216 
217  /* greedily take a point that's farthest from any of accepted clusters */
218  if (distances[j] > max_distance)
219  {
220  candidate_center = j;
221  max_distance = distances[j];
222  }
223  }
224 
225  /* Checked earlier by counting entries on input, just in case */
226  assert(max_distance >= 0);
227 
228  /* accept candidate to centers */
229  distances[candidate_center] = -1;
230  /* Copy the point coordinates into the initial centers array
231  * Centers array is an array of pointers to points, not an array of points */
232  centers_raw[i] = *((POINT2D *)objs[candidate_center]);
233  centers[i] = &(centers_raw[i]);
234  }
235  lwfree(distances);
236  }
237 }
238 
239 int*
241 {
242  uint32_t i;
243  uint32_t num_centroids = 0;
244  uint32_t num_non_empty = 0;
245  LWGEOM** centroids;
246  POINT2D* centers_raw;
247  const POINT2D* cp;
248  int result = LW_FALSE;
249 
250  /* An array of objects to be analyzed.
251  * All NULL values will be returned in the KMEANS_NULL_CLUSTER. */
252  POINT2D** objs;
253 
254  /* An array of centers for the algorithm. */
255  POINT2D** centers;
256 
257  /* Array to fill in with cluster numbers. */
258  int* clusters;
259 
260  assert(k > 0);
261  assert(n > 0);
262  assert(geoms);
263 
264  if (n < k)
265  {
266  lwerror("%s: number of geometries is less than the number of clusters requested, not all clusters will get data", __func__);
267  k = n;
268  }
269 
270  /* We'll hold the temporary centroid objects here */
271  centroids = lwalloc(sizeof(LWGEOM*) * n);
272  memset(centroids, 0, sizeof(LWGEOM*) * n);
273 
274  /* The vector of cluster means. We have to allocate a chunk of memory for these because we'll be mutating them
275  * in the kmeans algorithm */
276  centers_raw = lwalloc(sizeof(POINT2D) * k);
277  memset(centers_raw, 0, sizeof(POINT2D) * k);
278 
279  /* K-means configuration setup */
280  objs = lwalloc(sizeof(POINT2D*) * n);
281  clusters = lwalloc(sizeof(int) * n);
282  centers = lwalloc(sizeof(POINT2D*) * k);
283 
284  /* Clean the memory */
285  memset(objs, 0, sizeof(POINT2D*) * n);
286  memset(clusters, 0, sizeof(int) * n);
287  memset(centers, 0, sizeof(POINT2D*) * k);
288 
289  /* Prepare the list of object pointers for K-means */
290  for (i = 0; i < n; i++)
291  {
292  const LWGEOM* geom = geoms[i];
293  LWPOINT* lwpoint;
294 
295  /* Null/empty geometries get a NULL pointer, set earlier with memset */
296  if ((!geom) || lwgeom_is_empty(geom)) continue;
297 
298  /* If the input is a point, use its coordinates */
299  /* If its not a point, convert it to one via centroid */
300  if (lwgeom_get_type(geom) != POINTTYPE)
301  {
303  if ((!centroid)) continue;
305  {
307  continue;
308  }
309  centroids[num_centroids++] = centroid;
310  lwpoint = lwgeom_as_lwpoint(centroid);
311  }
312  else
313  lwpoint = lwgeom_as_lwpoint(geom);
314 
315  /* Store a pointer to the POINT2D we are interested in */
316  cp = getPoint2d_cp(lwpoint->point, 0);
317  objs[i] = (POINT2D*)cp;
318  num_non_empty++;
319  }
320 
321  if (num_non_empty < k)
322  {
323  lwnotice("%s: number of non-empty geometries is less than the number of clusters requested, not all clusters will get data", __func__);
324  k = num_non_empty;
325  }
326 
327  if (k > 1)
328  {
329  kmeans_init(objs, clusters, n, centers, centers_raw, k);
330  result = kmeans(objs, clusters, n, centers, k);
331  }
332  else
333  {
334  /* k=0: everythong is unclusterable
335  * k=1: mark up NULL and non-NULL */
336  for (i = 0; i < n; i++)
337  {
338  if (k == 0 || !objs[i])
339  clusters[i] = KMEANS_NULL_CLUSTER;
340  else
341  clusters[i] = 0;
342  }
343  result = LW_TRUE;
344  }
345 
346  /* Before error handling, might as well clean up all the inputs */
347  lwfree(objs);
348  lwfree(centers);
349  lwfree(centers_raw);
350  lwfree(centroids);
351 
352  /* Good result */
353  if (result) return clusters;
354 
355  /* Bad result, not going to need the answer */
356  lwfree(clusters);
357  return NULL;
358 }
LWGEOM * lwgeom_centroid(const LWGEOM *geom)
#define LW_FALSE
Definition: liblwgeom.h:77
void lwgeom_free(LWGEOM *geom)
Definition: lwgeom.c:1144
double distance2d_sqr_pt_pt(const POINT2D *p1, const POINT2D *p2)
Definition: measures.c:2323
uint32_t lwgeom_get_type(const LWGEOM *geom)
Return LWTYPE number.
Definition: lwgeom.c:923
#define POINTTYPE
LWTYPE numbers, used internally by PostGIS.
Definition: liblwgeom.h:85
LWPOINT * lwgeom_as_lwpoint(const LWGEOM *lwgeom)
Definition: lwgeom.c:161
void lwfree(void *mem)
Definition: lwutil.c:244
int lwgeom_is_empty(const LWGEOM *geom)
Return true or false depending on whether a geometry is an "empty" geometry (no vertices members)
Definition: lwgeom.c:1393
void * lwalloc(size_t size)
Definition: lwutil.c:229
#define LW_TRUE
Return types for functions with status returns.
Definition: liblwgeom.h:76
const POINT2D * getPoint2d_cp(const POINTARRAY *pa, uint32_t n)
Returns a POINT2D pointer into the POINTARRAY serialized_ptlist, suitable for reading from.
Definition: lwgeom_api.c:374
#define LW_ON_INTERRUPT(x)
Datum distance(PG_FUNCTION_ARGS)
void lwerror(const char *fmt,...)
Write a notice out to the error handler.
Definition: lwutil.c:190
void lwnotice(const char *fmt,...)
Write a notice out to the notice handler.
Definition: lwutil.c:177
#define KMEANS_MAX_ITERATIONS
Definition: lwkmeans.c:23
int * lwgeom_cluster_2d_kmeans(const LWGEOM **geoms, uint32_t n, uint32_t k)
Take a list of LWGEOMs and a number of clusters and return an integer array indicating which cluster ...
Definition: lwkmeans.c:240
static void kmeans_init(POINT2D **objs, int *clusters, uint32_t n, POINT2D **centers, POINT2D *centers_raw, uint32_t k)
Definition: lwkmeans.c:132
static void update_r(POINT2D **objs, int *clusters, uint32_t n, POINT2D **centers, uint32_t k)
Definition: lwkmeans.c:26
#define KMEANS_NULL_CLUSTER
Definition: lwkmeans.c:17
static void update_means(POINT2D **objs, int *clusters, uint32_t n, POINT2D **centers, uint32_t *weights, uint32_t k)
Definition: lwkmeans.c:65
static int kmeans(POINT2D **objs, int *clusters, uint32_t n, POINT2D **centers, uint32_t k)
Definition: lwkmeans.c:97
Datum centroid(PG_FUNCTION_ARGS)
POINTARRAY * point
Definition: liblwgeom.h:414
double y
Definition: liblwgeom.h:331
double x
Definition: liblwgeom.h:331
unsigned int uint32_t
Definition: uthash.h:78