◆ nd_box_array_distribution()

static int nd_box_array_distribution	(	const ND_BOX **	nd_boxes,
		int	num_boxes,
		const ND_BOX *	extent,
		int	ndims,
		double *	distribution
	)

static

Calculate how homogeneously distributed or concentrated a set of boxes is within each dimension, storing in distribution[] the range_quintile of the overlap counts per cell in a uniform partition of the extent of that dimension.

A uniform distribution of counts will have a small range and will require few cells in a selectivity histogram. A diverse distribution of counts will have a larger range and require more cells in a selectivity histogram (to distinguish between areas of feature density and areas of feature sparseness). This measurement should help us identify cases like X/Y/Z data where there is lots of variability in density in X/Y (diversely in a multi-kilometer range) and far less in Z (in a few-hundred meter range).

Definition at line 775 of file gserialized_estimate.c.

 {
         int d, i, k, range;
         int counts[NUM_BINS];
         double smin, smax;   /* Spatial min, spatial max */
         double swidth;       /* Spatial width of dimension */
 #if POSTGIS_DEBUG_LEVEL >= 3
         double average, sdev, sdev_ratio;
 #endif
         int   bmin, bmax;   /* Bin min, bin max */
         const ND_BOX *ndb;
  
         /* For each dimension... */
         for ( d = 0; d < ndims; d++ )
         {
                 /* Initialize counts for this dimension */
                 memset(counts, 0, sizeof(counts));
  
                 smin = extent->min[d];
                 smax = extent->max[d];
                 swidth = smax - smin;
  
                 /* Don't try and calculate distribution of overly narrow */
                 /* or overly wide dimensions. Here we're being pretty geographical, */
                 /* expecting "normal" planar or geographic coordinates. */
                 /* Otherwise we have to "handle" +/- Inf bounded features and */
                 /* the assumptions needed for that are as bad as this hack. */
                 if ( swidth < MIN_DIMENSION_WIDTH || swidth > MAX_DIMENSION_WIDTH )
                 {
                         distribution[d] = 0;
                         continue;
                 }
  
                 /* Sum up the overlaps of each feature with the dimensional bins */
                 for ( i = 0; i < num_boxes; i++ )
                 {
                         double minoffset, maxoffset;
  
                         /* Skip null entries */
                         ndb = nd_boxes[i];
                         if ( ! ndb ) continue;
  
                         /* Where does box fall relative to the working range */
                         minoffset = ndb->min[d] - smin;
                         maxoffset = ndb->max[d] - smin;
  
                         /* Skip boxes that are outside our working range */
                         if ( minoffset < 0 || minoffset > swidth ||
                              maxoffset < 0 || maxoffset > swidth )
                         {
                                 continue;
                         }
  
                         /* What bins does this range correspond to? */
                         bmin = floor(NUM_BINS * minoffset / swidth);
                         bmax = floor(NUM_BINS * maxoffset / swidth);
  
                         /* Should only happen when maxoffset==swidth */
                         bmax = bmax >= NUM_BINS ? NUM_BINS-1 : bmax;
  
                         POSTGIS_DEBUGF(4, " dimension %d, feature %d: bin %d to bin %d", d, i, bmin, bmax);
  
                         /* Increment the counts in all the bins this feature overlaps */
                         for ( k = bmin; k <= bmax; k++ )
                         {
                                 counts[k] += 1;
                         }
  
                 }
  
                 /* How dispersed is the distribution of features across bins? */
                 range = range_quintile(counts, NUM_BINS);
  
 #if POSTGIS_DEBUG_LEVEL >= 3
                 average = avg(counts, NUM_BINS);
                 sdev = stddev(counts, NUM_BINS);
                 sdev_ratio = sdev/average;
  
                 POSTGIS_DEBUGF(3, " dimension %d: range = %d", d, range);
                 POSTGIS_DEBUGF(3, " dimension %d: average = %.6g", d, average);
                 POSTGIS_DEBUGF(3, " dimension %d: stddev = %.6g", d, sdev);
                 POSTGIS_DEBUGF(3, " dimension %d: stddev_ratio = %.6g", d, sdev_ratio);
 #endif
  
                 distribution[d] = range;
         }
  
         return true;
 }

References ND_BOX_T::max, MAX_DIMENSION_WIDTH, ND_BOX_T::min, NUM_BINS, and range_quintile().

Referenced by compute_gserialized_stats_mode().

Here is the call graph for this function:

Here is the caller graph for this function: