PostGIS  2.5.0dev-r@@SVN_REVISION@@
lwkmeans.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * Copyright (c) 2018, Darafei Praliaskouski <me@komzpa.net>
4  * Copyright (c) 2016, Paul Ramsey <pramsey@cleverelephant.ca>
5  *
6  *------------------------------------------------------------------------*/
7 
8 #include <float.h>
9 #include <math.h>
10 
11 #include "liblwgeom_internal.h"
12 
13 /*
14  * When clustering lists with NULL elements, they will get this as
15  * their cluster number. (All the other clusters will be non-negative)
16  */
17 #define KMEANS_NULL_CLUSTER -1
18 
19 /*
20  * If the algorithm doesn't converge within this number of iterations,
21  * it will return with a failure error code.
22  */
23 #define KMEANS_MAX_ITERATIONS 1000
24 
25 static void
26 update_r(POINT2D** objs, int* clusters, uint32_t n, POINT2D** centers, uint32_t k)
27 {
28  POINT2D* obj;
29  unsigned int i;
30  double distance, curr_distance;
31  uint32_t cluster, curr_cluster;
32 
33  for (i = 0; i < n; i++)
34  {
35  obj = objs[i];
36 
37  /* Don't try to cluster NULL objects, just add them to the "unclusterable cluster" */
38  if (!obj)
39  {
40  clusters[i] = KMEANS_NULL_CLUSTER;
41  continue;
42  }
43 
44  /* Initialize with distance to first cluster */
45  curr_distance = distance2d_sqr_pt_pt(obj, centers[0]);
46  curr_cluster = 0;
47 
48  /* Check all other cluster centers and find the nearest */
49  for (cluster = 1; cluster < k; cluster++)
50  {
51  distance = distance2d_sqr_pt_pt(obj, centers[cluster]);
52  if (distance < curr_distance)
53  {
54  curr_distance = distance;
55  curr_cluster = cluster;
56  }
57  }
58 
59  /* Store the nearest cluster this object is in */
60  clusters[i] = (int) curr_cluster;
61  }
62 }
63 
64 static void
65 update_means(POINT2D** objs, int* clusters, uint32_t n, POINT2D** centers, uint32_t* weights, uint32_t k)
66 {
67  uint32_t i;
68 
69  memset(weights, 0, sizeof(int) * k);
70  for (i = 0; i < k; i++)
71  {
72  centers[i]->x = 0.0;
73  centers[i]->y = 0.0;
74  }
75  for (i = 0; i < n; i++)
76  {
77  centers[clusters[i]]->x += objs[i]->x;
78  centers[clusters[i]]->y += objs[i]->y;
79  weights[clusters[i]] += 1;
80  }
81  for (i = 0; i < k; i++)
82  {
83  centers[i]->x /= weights[i];
84  centers[i]->y /= weights[i];
85  }
86 }
87 
88 static int
89 kmeans(POINT2D** objs, int* clusters, uint32_t n, POINT2D** centers, uint32_t k)
90 {
91  uint32_t i = 0;
92  int* clusters_last;
93  int converged = LW_FALSE;
94  size_t clusters_sz = sizeof(int) * n;
95  uint32_t* weights;
96 
97  weights = lwalloc(sizeof(int) * k);
98 
99  /*
100  * Previous cluster state array. At this time, r doesn't mean anything
101  * but it's ok
102  */
103  clusters_last = lwalloc(clusters_sz);
104 
105  for (i = 0; i < KMEANS_MAX_ITERATIONS && !converged; i++)
106  {
107  LW_ON_INTERRUPT(break);
108 
109  /* Store the previous state of the clustering */
110  memcpy(clusters_last, clusters, clusters_sz);
111 
112  update_r(objs, clusters, n, centers, k);
113  update_means(objs, clusters, n, centers, weights, k);
114 
115  /* if all the cluster numbers are unchanged, we are at a stable solution */
116  converged = memcmp(clusters_last, clusters, clusters_sz) == 0;
117  }
118 
119  lwfree(clusters_last);
120  lwfree(weights);
121  if (!converged)
122  lwerror("%s did not converge after %d iterations", __func__, i);
123  return converged;
124 }
125 
126 int*
128 {
129  uint32_t i;
130  uint32_t num_centroids = 0;
131  LWGEOM** centroids;
132  POINT2D* centers_raw;
133  double* distances;
134  const POINT2D* cp;
135  int result = LW_FALSE;
136  uint32_t boundary_point_idx = 0;
137  double max_norm = -DBL_MAX;
138  double curr_norm;
139 
140  /* An array of objects to be analyzed.
141  * All NULL values will be returned in the KMEANS_NULL_CLUSTER. */
142  POINT2D** objs;
143 
144  /* An array of centers for the algorithm. */
145  POINT2D** centers;
146 
147  /* Array to fill in with cluster numbers. */
148  int* clusters;
149 
150  assert(k > 0);
151  assert(n > 0);
152  assert(geoms);
153 
154  if (n < k)
155  lwerror("%s: number of geometries is less than the number of clusters requested", __func__);
156 
157  /* We'll hold the temporary centroid objects here */
158  centroids = lwalloc(sizeof(LWGEOM*) * n);
159  memset(centroids, 0, sizeof(LWGEOM*) * n);
160 
161  /* The vector of cluster means. We have to allocate a chunk of memory for these because we'll be mutating them
162  * in the kmeans algorithm */
163  centers_raw = lwalloc(sizeof(POINT2D) * k);
164  memset(centers_raw, 0, sizeof(POINT2D) * k);
165 
166  /* K-means configuration setup */
167  objs = lwalloc(sizeof(POINT2D*) * n);
168  clusters = lwalloc(sizeof(int) * n);
169  centers = lwalloc(sizeof(POINT2D*) * k);
170 
171  /* Clean the memory */
172  memset(objs, 0, sizeof(POINT2D*) * n);
173  memset(clusters, 0, sizeof(int) * n);
174  memset(centers, 0, sizeof(POINT2D*) * k);
175 
176  /* Array of sums of distances to a point from accepted cluster centers */
177  distances = lwalloc(sizeof(double) * n);
178 
179  /* Prepare the list of object pointers for K-means */
180  for (i = 0; i < n; i++)
181  {
182  const LWGEOM* geom = geoms[i];
183  LWPOINT* lwpoint;
184 
185  /* Null/empty geometries get a NULL pointer */
186  if ((!geom) || lwgeom_is_empty(geom))
187  {
188  objs[i] = NULL;
189  /* mark as unreachable */
190  distances[i] = -1;
191  continue;
192  }
193 
194  distances[i] = DBL_MAX;
195 
196  /* If the input is a point, use its coordinates */
197  /* If its not a point, convert it to one via centroid */
198  if (lwgeom_get_type(geom) != POINTTYPE)
199  {
201  if ((!centroid) || lwgeom_is_empty(centroid))
202  {
203  objs[i] = NULL;
204  continue;
205  }
206  centroids[num_centroids++] = centroid;
207  lwpoint = lwgeom_as_lwpoint(centroid);
208  }
209  else
210  {
211  lwpoint = lwgeom_as_lwpoint(geom);
212  }
213 
214  /* Store a pointer to the POINT2D we are interested in */
215  cp = getPoint2d_cp(lwpoint->point, 0);
216  objs[i] = (POINT2D*)cp;
217 
218  /* Find the point with largest Euclidean norm to use as seed */
219  curr_norm = (cp->x) * (cp->x) + (cp->y) * (cp->y);
220  if (curr_norm > max_norm)
221  {
222  boundary_point_idx = i;
223  max_norm = curr_norm;
224  }
225  }
226 
227  if (max_norm == -DBL_MAX)
228  {
229  lwerror("unable to calculate any cluster seed point, too many NULLs or empties?");
230  }
231 
232  /* start with point on boundary */
233  distances[boundary_point_idx] = -1;
234  centers_raw[0] = *((POINT2D*)objs[boundary_point_idx]);
235  centers[0] = &(centers_raw[0]);
236  /* loop i on clusters, skip 0 as it's found already */
237  for (i = 1; i < k; i++)
238  {
239  uint32_t j;
240  double max_distance = -DBL_MAX;
241  double curr_distance;
242  uint32_t candidate_center = 0;
243 
244  /* loop j on objs */
245  for (j = 0; j < n; j++)
246  {
247  /* empty objs and accepted clusters are already marked with distance = -1 */
248  if (distances[j] < 0)
249  continue;
250 
251  /* update distance to closest cluster */
252  curr_distance = distance2d_sqr_pt_pt(objs[j], centers[i - 1]);
253  distances[j] = fmin(curr_distance, distances[j]);
254 
255  /* greedily take a point that's farthest from any of accepted clusters */
256  if (distances[j] > max_distance)
257  {
258  candidate_center = j;
259  max_distance = distances[j];
260  }
261  }
262 
263  /* something is wrong with data, cannot find a candidate */
264  if (max_distance < 0)
265  lwerror("unable to calculate cluster seed points, too many NULLs or empties?");
266 
267  /* accept candidtate to centers */
268  distances[candidate_center] = -1;
269  /* Copy the point coordinates into the initial centers array
270  * This is ugly, but the centers array is an array of pointers to points, not an array of points */
271  centers_raw[i] = *((POINT2D*)objs[candidate_center]);
272  centers[i] = &(centers_raw[i]);
273  }
274 
275  result = kmeans(objs, clusters, n, centers, k);
276 
277  /* Before error handling, might as well clean up all the inputs */
278  lwfree(objs);
279  lwfree(centers);
280  lwfree(centers_raw);
281  lwfree(centroids);
282  lwfree(distances);
283 
284  /* Good result */
285  if (result)
286  return clusters;
287 
288  /* Bad result, not going to need the answer */
289  lwfree(clusters);
290  return NULL;
291 }
#define KMEANS_NULL_CLUSTER
Definition: lwkmeans.c:17
void lwfree(void *mem)
Definition: lwutil.c:244
uint32_t lwgeom_get_type(const LWGEOM *geom)
Return LWTYPE number.
Definition: lwgeom.c:916
Datum centroid(PG_FUNCTION_ARGS)
LWGEOM * lwgeom_centroid(const LWGEOM *geom)
#define LW_ON_INTERRUPT(x)
static int kmeans(POINT2D **objs, int *clusters, uint32_t n, POINT2D **centers, uint32_t k)
Definition: lwkmeans.c:89
static void update_means(POINT2D **objs, int *clusters, uint32_t n, POINT2D **centers, uint32_t *weights, uint32_t k)
Definition: lwkmeans.c:65
LWPOINT * lwgeom_as_lwpoint(const LWGEOM *lwgeom)
Definition: lwgeom.c:160
POINTARRAY * point
Definition: liblwgeom.h:410
unsigned int uint32_t
Definition: uthash.h:78
double x
Definition: liblwgeom.h:327
int * lwgeom_cluster_2d_kmeans(const LWGEOM **geoms, uint32_t n, uint32_t k)
Take a list of LWGEOMs and a number of clusters and return an integer array indicating which cluster ...
Definition: lwkmeans.c:127
#define LW_FALSE
Definition: liblwgeom.h:76
#define KMEANS_MAX_ITERATIONS
Definition: lwkmeans.c:23
double y
Definition: liblwgeom.h:327
static void update_r(POINT2D **objs, int *clusters, uint32_t n, POINT2D **centers, uint32_t k)
Definition: lwkmeans.c:26
Datum distance(PG_FUNCTION_ARGS)
#define POINTTYPE
LWTYPE numbers, used internally by PostGIS.
Definition: liblwgeom.h:84
double distance2d_sqr_pt_pt(const POINT2D *p1, const POINT2D *p2)
Definition: measures.c:2322
void * lwalloc(size_t size)
Definition: lwutil.c:229
int lwgeom_is_empty(const LWGEOM *geom)
Return true or false depending on whether a geometry is an "empty" geometry (no vertices members) ...
Definition: lwgeom.c:1386
void lwerror(const char *fmt,...)
Write a notice out to the error handler.
Definition: lwutil.c:190
const POINT2D * getPoint2d_cp(const POINTARRAY *pa, uint32_t n)
Returns a POINT2D pointer into the POINTARRAY serialized_ptlist, suitable for reading from...
Definition: lwgeom_api.c:364