PostGIS  3.0.0dev-r@@SVN_REVISION@@
lwkmeans.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * Copyright (c) 2018, Darafei Praliaskouski <me@komzpa.net>
4  * Copyright (c) 2016, Paul Ramsey <pramsey@cleverelephant.ca>
5  *
6  *------------------------------------------------------------------------*/
7 
8 #include "liblwgeom_internal.h"
9 
10 /*
11  * When clustering lists with NULL or EMPTY elements, they will get this as
12  * their cluster number. (All the other clusters will be non-negative)
13  */
14 #define KMEANS_NULL_CLUSTER -1
15 
16 /*
17  * If the algorithm doesn't converge within this number of iterations,
18  * it will return with a failure error code.
19  */
20 #define KMEANS_MAX_ITERATIONS 1000
21 
22 static void
23 update_r(POINT2D** objs, int* clusters, uint32_t n, POINT2D** centers, uint32_t k)
24 {
25  POINT2D* obj;
26  unsigned int i;
27  double distance, curr_distance;
28  uint32_t cluster, curr_cluster;
29 
30  for (i = 0; i < n; i++)
31  {
32  obj = objs[i];
33 
34  /* Don't try to cluster NULL objects, just add them to the "unclusterable cluster" */
35  if (!obj)
36  {
37  clusters[i] = KMEANS_NULL_CLUSTER;
38  continue;
39  }
40 
41  /* Initialize with distance to first cluster */
42  curr_distance = distance2d_sqr_pt_pt(obj, centers[0]);
43  curr_cluster = 0;
44 
45  /* Check all other cluster centers and find the nearest */
46  for (cluster = 1; cluster < k; cluster++)
47  {
48  distance = distance2d_sqr_pt_pt(obj, centers[cluster]);
49  if (distance < curr_distance)
50  {
51  curr_distance = distance;
52  curr_cluster = cluster;
53  }
54  }
55 
56  /* Store the nearest cluster this object is in */
57  clusters[i] = (int) curr_cluster;
58  }
59 }
60 
61 static void
62 update_means(POINT2D** objs, int* clusters, uint32_t n, POINT2D** centers, uint32_t* weights, uint32_t k)
63 {
64  uint32_t i;
65  int cluster;
66 
67  memset(weights, 0, sizeof(uint32_t) * k);
68  for (i = 0; i < k; i++)
69  {
70  centers[i]->x = 0.0;
71  centers[i]->y = 0.0;
72  }
73  for (i = 0; i < n; i++)
74  {
75  cluster = clusters[i];
76  if (cluster != KMEANS_NULL_CLUSTER)
77  {
78  centers[cluster]->x += objs[i]->x;
79  centers[cluster]->y += objs[i]->y;
80  weights[cluster] += 1;
81  }
82  }
83  for (i = 0; i < k; i++)
84  {
85  centers[i]->x /= weights[i];
86  centers[i]->y /= weights[i];
87  }
88 }
89 
90 static int
91 kmeans(POINT2D** objs, int* clusters, uint32_t n, POINT2D** centers, uint32_t k)
92 {
93  uint32_t i = 0;
94  int* clusters_last;
95  int converged = LW_FALSE;
96  size_t clusters_sz = sizeof(int) * n;
97  uint32_t* weights;
98 
99  weights = lwalloc(sizeof(int) * k);
100 
101  /* previous cluster state array */
102  clusters_last = lwalloc(clusters_sz);
103 
104  for (i = 0; i < KMEANS_MAX_ITERATIONS && !converged; i++)
105  {
106  LW_ON_INTERRUPT(break);
107 
108  /* store the previous state of the clustering */
109  memcpy(clusters_last, clusters, clusters_sz);
110 
111  update_r(objs, clusters, n, centers, k);
112  update_means(objs, clusters, n, centers, weights, k);
113 
114  /* if all the cluster numbers are unchanged, we are at a stable solution */
115  converged = memcmp(clusters_last, clusters, clusters_sz) == 0;
116  }
117 
118  lwfree(clusters_last);
119  lwfree(weights);
120  if (!converged)
121  lwerror("%s did not converge after %d iterations", __func__, i);
122  return converged;
123 }
124 
125 static void
126 kmeans_init(POINT2D **objs, uint32_t n, POINT2D **centers, POINT2D *centers_raw, uint32_t k)
127 {
128  double* distances;
129  uint32_t p1 = 0, p2 = 0;
130  uint32_t i, j;
131  uint32_t duplicate_count = 1; /* a point is a duplicate of itself */
132  double max_dst = -1, current_distance;
133  double dst_p1, dst_p2;
134 
135  /* k=0, k=1: "clustering" is just input validation */
136  assert(k > 1);
137 
138  /* k >= 2: find two distant points greedily */
139  for (i = 1; i < n; i++)
140  {
141  /* skip null */
142  if (!objs[i]) continue;
143 
144  /* reinit if first element happened to be null */
145  if (!objs[p1] && !objs[p2])
146  {
147  p1 = i;
148  p2 = i;
149  continue;
150  }
151 
152  /* if we found a larger distance, replace our choice */
153  dst_p1 = distance2d_sqr_pt_pt(objs[i], objs[p1]);
154  dst_p2 = distance2d_sqr_pt_pt(objs[i], objs[p2]);
155  if ((dst_p1 > max_dst) || (dst_p2 > max_dst))
156  {
157  if (dst_p1 > dst_p2)
158  {
159  max_dst = dst_p1;
160  p2 = i;
161  }
162  else
163  {
164  max_dst = dst_p2;
165  p1 = i;
166  }
167  }
168  if ((dst_p1 == 0) || (dst_p2 == 0)) duplicate_count++;
169  }
170  if (duplicate_count > 1)
171  lwnotice(
172  "%s: there are at least %u duplicate inputs, number of output clusters may be less than you requested",
173  __func__,
174  duplicate_count);
175 
176  /* by now two points should be found and non-same */
177  assert(p1 != p2 && objs[p1] && objs[p2] && max_dst >= 0);
178 
179  /* accept these two points */
180  centers_raw[0] = *((POINT2D *)objs[p1]);
181  centers[0] = &(centers_raw[0]);
182  centers_raw[1] = *((POINT2D *)objs[p2]);
183  centers[1] = &(centers_raw[1]);
184 
185  if (k > 2)
186  {
187  /* array of minimum distance to a point from accepted cluster centers */
188  distances = lwalloc(sizeof(double) * n);
189 
190  /* initialize array with distance to first object */
191  for (j = 0; j < n; j++)
192  {
193  if (objs[j])
194  distances[j] = distance2d_sqr_pt_pt(objs[j], centers[0]);
195  else
196  distances[j] = -1;
197  }
198  distances[p1] = -1;
199  distances[p2] = -1;
200 
201  /* loop i on clusters, skip 0 and 1 as found already */
202  for (i = 2; i < k; i++)
203  {
204  uint32_t candidate_center = 0;
205  double max_distance = -DBL_MAX;
206 
207  /* loop j on objs */
208  for (j = 0; j < n; j++)
209  {
210  /* empty objs and accepted clusters are already marked with distance = -1 */
211  if (distances[j] < 0) continue;
212 
213  /* update minimal distance with previosuly accepted cluster */
214  current_distance = distance2d_sqr_pt_pt(objs[j], centers[i - 1]);
215  if (current_distance < distances[j])
216  distances[j] = current_distance;
217 
218  /* greedily take a point that's farthest from any of accepted clusters */
219  if (distances[j] > max_distance)
220  {
221  candidate_center = j;
222  max_distance = distances[j];
223  }
224  }
225 
226  /* Checked earlier by counting entries on input, just in case */
227  assert(max_distance >= 0);
228 
229  /* accept candidate to centers */
230  distances[candidate_center] = -1;
231  /* Copy the point coordinates into the initial centers array
232  * Centers array is an array of pointers to points, not an array of points */
233  centers_raw[i] = *((POINT2D *)objs[candidate_center]);
234  centers[i] = &(centers_raw[i]);
235  }
236  lwfree(distances);
237  }
238 }
239 
240 int*
242 {
243  uint32_t i;
244  uint32_t num_centroids = 0;
245  uint32_t num_non_empty = 0;
246  LWGEOM** centroids;
247  POINT2D* centers_raw;
248  const POINT2D* cp;
249  int result = LW_FALSE;
250 
251  /* An array of objects to be analyzed.
252  * All NULL values will be returned in the KMEANS_NULL_CLUSTER. */
253  POINT2D** objs;
254 
255  /* An array of centers for the algorithm. */
256  POINT2D** centers;
257 
258  /* Array to fill in with cluster numbers. */
259  int* clusters;
260 
261  assert(k > 0);
262  assert(n > 0);
263  assert(geoms);
264 
265  if (n < k)
266  {
267  lwerror("%s: number of geometries is less than the number of clusters requested, not all clusters will get data", __func__);
268  k = n;
269  }
270 
271  /* We'll hold the temporary centroid objects here */
272  centroids = lwalloc(sizeof(LWGEOM*) * n);
273  memset(centroids, 0, sizeof(LWGEOM*) * n);
274 
275  /* The vector of cluster means. We have to allocate a chunk of memory for these because we'll be mutating them
276  * in the kmeans algorithm */
277  centers_raw = lwalloc(sizeof(POINT2D) * k);
278  memset(centers_raw, 0, sizeof(POINT2D) * k);
279 
280  /* K-means configuration setup */
281  objs = lwalloc(sizeof(POINT2D*) * n);
282  clusters = lwalloc(sizeof(int) * n);
283  centers = lwalloc(sizeof(POINT2D*) * k);
284 
285  /* Clean the memory */
286  memset(objs, 0, sizeof(POINT2D*) * n);
287  memset(clusters, 0, sizeof(int) * n);
288  memset(centers, 0, sizeof(POINT2D*) * k);
289 
290  /* Prepare the list of object pointers for K-means */
291  for (i = 0; i < n; i++)
292  {
293  const LWGEOM* geom = geoms[i];
294  LWPOINT* lwpoint;
295 
296  /* Null/empty geometries get a NULL pointer, set earlier with memset */
297  if ((!geom) || lwgeom_is_empty(geom)) continue;
298 
299  /* If the input is a point, use its coordinates */
300  /* If its not a point, convert it to one via centroid */
301  if (lwgeom_get_type(geom) != POINTTYPE)
302  {
304  if ((!centroid)) continue;
305  if (lwgeom_is_empty(centroid))
306  {
307  lwgeom_free(centroid);
308  continue;
309  }
310  centroids[num_centroids++] = centroid;
311  lwpoint = lwgeom_as_lwpoint(centroid);
312  }
313  else
314  lwpoint = lwgeom_as_lwpoint(geom);
315 
316  /* Store a pointer to the POINT2D we are interested in */
317  cp = getPoint2d_cp(lwpoint->point, 0);
318  objs[i] = (POINT2D*)cp;
319  num_non_empty++;
320  }
321 
322  if (num_non_empty < k)
323  {
324  lwnotice("%s: number of non-empty geometries is less than the number of clusters requested, not all clusters will get data", __func__);
325  k = num_non_empty;
326  }
327 
328  if (k > 1)
329  {
330  kmeans_init(objs, n, centers, centers_raw, k);
331  result = kmeans(objs, clusters, n, centers, k);
332  }
333  else
334  {
335  /* k=0: everything is unclusterable
336  * k=1: mark up NULL and non-NULL */
337  for (i = 0; i < n; i++)
338  {
339  if (k == 0 || !objs[i])
340  clusters[i] = KMEANS_NULL_CLUSTER;
341  else
342  clusters[i] = 0;
343  }
344  result = LW_TRUE;
345  }
346 
347  /* Before error handling, might as well clean up all the inputs */
348  lwfree(objs);
349  lwfree(centers);
350  lwfree(centers_raw);
351  lwfree(centroids);
352 
353  /* Good result */
354  if (result) return clusters;
355 
356  /* Bad result, not going to need the answer */
357  lwfree(clusters);
358  return NULL;
359 }
static LWPOINT * lwgeom_as_lwpoint(const LWGEOM *lwgeom)
Definition: lwinline.h:100
#define KMEANS_NULL_CLUSTER
Definition: lwkmeans.c:14
void lwnotice(const char *fmt,...)
Write a notice out to the notice handler.
Definition: lwutil.c:177
void lwfree(void *mem)
Definition: lwutil.c:242
void lwgeom_free(LWGEOM *geom)
Definition: lwgeom.c:1128
Datum centroid(PG_FUNCTION_ARGS)
LWGEOM * lwgeom_centroid(const LWGEOM *geom)
#define LW_ON_INTERRUPT(x)
static int kmeans(POINT2D **objs, int *clusters, uint32_t n, POINT2D **centers, uint32_t k)
Definition: lwkmeans.c:91
static void update_means(POINT2D **objs, int *clusters, uint32_t n, POINT2D **centers, uint32_t *weights, uint32_t k)
Definition: lwkmeans.c:62
POINTARRAY * point
Definition: liblwgeom.h:413
static int lwgeom_is_empty(const LWGEOM *geom)
Return true or false depending on whether a geometry is an "empty" geometry (no vertices members) ...
Definition: lwinline.h:172
static const POINT2D * getPoint2d_cp(const POINTARRAY *pa, uint32_t n)
Returns a POINT2D pointer into the POINTARRAY serialized_ptlist, suitable for reading from...
Definition: lwinline.h:91
unsigned int uint32_t
Definition: uthash.h:78
double x
Definition: liblwgeom.h:330
int * lwgeom_cluster_2d_kmeans(const LWGEOM **geoms, uint32_t n, uint32_t k)
Take a list of LWGEOMs and a number of clusters and return an integer array indicating which cluster ...
Definition: lwkmeans.c:241
#define LW_FALSE
Definition: liblwgeom.h:76
#define LW_TRUE
Return types for functions with status returns.
Definition: liblwgeom.h:75
#define KMEANS_MAX_ITERATIONS
Definition: lwkmeans.c:20
double y
Definition: liblwgeom.h:330
static void update_r(POINT2D **objs, int *clusters, uint32_t n, POINT2D **centers, uint32_t k)
Definition: lwkmeans.c:23
static double distance2d_sqr_pt_pt(const POINT2D *p1, const POINT2D *p2)
Definition: lwinline.h:35
Datum distance(PG_FUNCTION_ARGS)
#define POINTTYPE
LWTYPE numbers, used internally by PostGIS.
Definition: liblwgeom.h:84
static uint32_t lwgeom_get_type(const LWGEOM *geom)
Return LWTYPE number.
Definition: lwinline.h:114
void * lwalloc(size_t size)
Definition: lwutil.c:227
static void kmeans_init(POINT2D **objs, uint32_t n, POINT2D **centers, POINT2D *centers_raw, uint32_t k)
Definition: lwkmeans.c:126
void lwerror(const char *fmt,...)
Write a notice out to the error handler.
Definition: lwutil.c:190