PostGIS  2.5.0dev-r@@SVN_REVISION@@

◆ lwgeom_cluster_2d_kmeans()

int* lwgeom_cluster_2d_kmeans ( const LWGEOM **  geoms,
uint32_t  ngeoms,
uint32_t  k 
)

Take a list of LWGEOMs and a number of clusters and return an integer array indicating which cluster each geometry is in.

Parameters
geomsthe input array of LWGEOM pointers
ngeomsthe number of elements in the array
kthe number of clusters to calculate

Definition at line 127 of file lwkmeans.c.

References centroid(), distance2d_sqr_pt_pt(), getPoint2d_cp(), kmeans(), LW_FALSE, lwalloc(), lwerror(), lwfree(), lwgeom_as_lwpoint(), lwgeom_centroid(), lwgeom_get_type(), lwgeom_is_empty(), LWPOINT::point, POINTTYPE, POINT2D::x, and POINT2D::y.

Referenced by ST_ClusterKMeans(), and test_kmeans().

128 {
129  uint32_t i;
130  uint32_t num_centroids = 0;
131  LWGEOM** centroids;
132  POINT2D* centers_raw;
133  double* distances;
134  const POINT2D* cp;
135  int result = LW_FALSE;
136  uint32_t boundary_point_idx = 0;
137  double max_norm = -DBL_MAX;
138  double curr_norm;
139 
140  /* An array of objects to be analyzed.
141  * All NULL values will be returned in the KMEANS_NULL_CLUSTER. */
142  POINT2D** objs;
143 
144  /* An array of centers for the algorithm. */
145  POINT2D** centers;
146 
147  /* Array to fill in with cluster numbers. */
148  int* clusters;
149 
150  assert(k > 0);
151  assert(n > 0);
152  assert(geoms);
153 
154  if (n < k)
155  lwerror("%s: number of geometries is less than the number of clusters requested", __func__);
156 
157  /* We'll hold the temporary centroid objects here */
158  centroids = lwalloc(sizeof(LWGEOM*) * n);
159  memset(centroids, 0, sizeof(LWGEOM*) * n);
160 
161  /* The vector of cluster means. We have to allocate a chunk of memory for these because we'll be mutating them
162  * in the kmeans algorithm */
163  centers_raw = lwalloc(sizeof(POINT2D) * k);
164  memset(centers_raw, 0, sizeof(POINT2D) * k);
165 
166  /* K-means configuration setup */
167  objs = lwalloc(sizeof(POINT2D*) * n);
168  clusters = lwalloc(sizeof(int) * n);
169  centers = lwalloc(sizeof(POINT2D*) * k);
170 
171  /* Clean the memory */
172  memset(objs, 0, sizeof(POINT2D*) * n);
173  memset(clusters, 0, sizeof(int) * n);
174  memset(centers, 0, sizeof(POINT2D*) * k);
175 
176  /* Array of sums of distances to a point from accepted cluster centers */
177  distances = lwalloc(sizeof(double) * n);
178 
179  /* Prepare the list of object pointers for K-means */
180  for (i = 0; i < n; i++)
181  {
182  const LWGEOM* geom = geoms[i];
183  LWPOINT* lwpoint;
184 
185  /* Null/empty geometries get a NULL pointer */
186  if ((!geom) || lwgeom_is_empty(geom))
187  {
188  objs[i] = NULL;
189  /* mark as unreachable */
190  distances[i] = -1;
191  continue;
192  }
193 
194  distances[i] = DBL_MAX;
195 
196  /* If the input is a point, use its coordinates */
197  /* If its not a point, convert it to one via centroid */
198  if (lwgeom_get_type(geom) != POINTTYPE)
199  {
201  if ((!centroid) || lwgeom_is_empty(centroid))
202  {
203  objs[i] = NULL;
204  continue;
205  }
206  centroids[num_centroids++] = centroid;
207  lwpoint = lwgeom_as_lwpoint(centroid);
208  }
209  else
210  {
211  lwpoint = lwgeom_as_lwpoint(geom);
212  }
213 
214  /* Store a pointer to the POINT2D we are interested in */
215  cp = getPoint2d_cp(lwpoint->point, 0);
216  objs[i] = (POINT2D*)cp;
217 
218  /* Find the point with largest Euclidean norm to use as seed */
219  curr_norm = (cp->x) * (cp->x) + (cp->y) * (cp->y);
220  if (curr_norm > max_norm)
221  {
222  boundary_point_idx = i;
223  max_norm = curr_norm;
224  }
225  }
226 
227  if (max_norm == -DBL_MAX)
228  {
229  lwerror("unable to calculate any cluster seed point, too many NULLs or empties?");
230  }
231 
232  /* start with point on boundary */
233  distances[boundary_point_idx] = -1;
234  centers_raw[0] = *((POINT2D*)objs[boundary_point_idx]);
235  centers[0] = &(centers_raw[0]);
236  /* loop i on clusters, skip 0 as it's found already */
237  for (i = 1; i < k; i++)
238  {
239  uint32_t j;
240  double max_distance = -DBL_MAX;
241  double curr_distance;
242  uint32_t candidate_center = 0;
243 
244  /* loop j on objs */
245  for (j = 0; j < n; j++)
246  {
247  /* empty objs and accepted clusters are already marked with distance = -1 */
248  if (distances[j] < 0)
249  continue;
250 
251  /* update distance to closest cluster */
252  curr_distance = distance2d_sqr_pt_pt(objs[j], centers[i - 1]);
253  distances[j] = fmin(curr_distance, distances[j]);
254 
255  /* greedily take a point that's farthest from any of accepted clusters */
256  if (distances[j] > max_distance)
257  {
258  candidate_center = j;
259  max_distance = distances[j];
260  }
261  }
262 
263  /* something is wrong with data, cannot find a candidate */
264  if (max_distance < 0)
265  lwerror("unable to calculate cluster seed points, too many NULLs or empties?");
266 
267  /* accept candidtate to centers */
268  distances[candidate_center] = -1;
269  /* Copy the point coordinates into the initial centers array
270  * This is ugly, but the centers array is an array of pointers to points, not an array of points */
271  centers_raw[i] = *((POINT2D*)objs[candidate_center]);
272  centers[i] = &(centers_raw[i]);
273  }
274 
275  result = kmeans(objs, clusters, n, centers, k);
276 
277  /* Before error handling, might as well clean up all the inputs */
278  lwfree(objs);
279  lwfree(centers);
280  lwfree(centers_raw);
281  lwfree(centroids);
282  lwfree(distances);
283 
284  /* Good result */
285  if (result)
286  return clusters;
287 
288  /* Bad result, not going to need the answer */
289  lwfree(clusters);
290  return NULL;
291 }
void lwfree(void *mem)
Definition: lwutil.c:244
uint32_t lwgeom_get_type(const LWGEOM *geom)
Return LWTYPE number.
Definition: lwgeom.c:916
Datum centroid(PG_FUNCTION_ARGS)
LWGEOM * lwgeom_centroid(const LWGEOM *geom)
static int kmeans(POINT2D **objs, int *clusters, uint32_t n, POINT2D **centers, uint32_t k)
Definition: lwkmeans.c:89
LWPOINT * lwgeom_as_lwpoint(const LWGEOM *lwgeom)
Definition: lwgeom.c:160
POINTARRAY * point
Definition: liblwgeom.h:410
unsigned int uint32_t
Definition: uthash.h:78
double x
Definition: liblwgeom.h:327
#define LW_FALSE
Definition: liblwgeom.h:76
double y
Definition: liblwgeom.h:327
#define POINTTYPE
LWTYPE numbers, used internally by PostGIS.
Definition: liblwgeom.h:84
double distance2d_sqr_pt_pt(const POINT2D *p1, const POINT2D *p2)
Definition: measures.c:2326
void * lwalloc(size_t size)
Definition: lwutil.c:229
int lwgeom_is_empty(const LWGEOM *geom)
Return true or false depending on whether a geometry is an "empty" geometry (no vertices members) ...
Definition: lwgeom.c:1386
void lwerror(const char *fmt,...)
Write a notice out to the error handler.
Definition: lwutil.c:190
const POINT2D * getPoint2d_cp(const POINTARRAY *pa, uint32_t n)
Returns a POINT2D pointer into the POINTARRAY serialized_ptlist, suitable for reading from...
Definition: lwgeom_api.c:364
Here is the call graph for this function:
Here is the caller graph for this function: