PostGIS  2.2.7dev-r@@SVN_REVISION@@
static int nd_box_array_distribution ( const ND_BOX **  nd_boxes,
int  num_boxes,
const ND_BOX *  extent,
int  ndims,
double *  distribution 
)
static

Calculate how much a set of boxes is homogeneously distributed or concentrated within one dimension, returning the range_quintile of the overlap counts per cell in a uniform partition of the extent of the dimension.

A uniform distribution of counts will have a small range and will require few cells in a selectivity histogram. A diverse distribution of counts will have a larger range and require more cells in a selectivity histogram (to distinguish between areas of feature density and areas of feature sparseness). This measurement should help us identify cases like X/Y/Z data where there is lots of variability in density in X/Y (diversely in a multi-kilometer range) and far less in Z (in a few-hundred meter range).

Definition at line 690 of file gserialized_estimate.c.

References ND_BOX_T::max, ND_BOX_T::min, MIN_DIMENSION_WIDTH, range_quintile(), and TRUE.

Referenced by compute_gserialized_stats_mode().

691 {
692  /* Number of equal-width bins each dimension's extent is partitioned into */
693  static int num_bins = 50;
694  int d, i, k, range; /* d: dimension index, i: box index, k: bin index, range: range_quintile() result */
695  int counts[num_bins]; /* Per-bin overlap counts; num_bins is not a constant expression, so this is a variable-length array */
696  double smin, smax; /* Spatial min, spatial max of the extent in the current dimension */
697  double swidth; /* Spatial width of dimension (smax - smin) */
698 #if POSTGIS_DEBUG_LEVEL >= 3
699  double average, sdev, sdev_ratio;
700 #endif
701  int bmin, bmax; /* First and last bin index touched by the current box */
702  const ND_BOX *ndb;
703 
704  /* For each dimension, build a histogram of box overlaps and record its spread */
705  for ( d = 0; d < ndims; d++ )
706  {
707  /* Reset the per-bin counts before processing this dimension */
708  memset(counts, 0, sizeof(int)*num_bins);
709 
710  smin = extent->min[d];
711  smax = extent->max[d];
712  swidth = smax - smin;
713 
714  /* Don't try to calculate the distribution of overly narrow dimensions; report 0 for them */
715  if ( swidth < MIN_DIMENSION_WIDTH )
716  {
717  distribution[d] = 0;
718  continue;
719  }
720 
721  /* Sum up the overlaps of each feature with the dimensional bins */
722  for ( i = 0; i < num_boxes; i++ )
723  {
724  double minoffset, maxoffset;
725 
726  /* Skip null entries */
727  ndb = nd_boxes[i];
728  if ( ! ndb ) continue;
729 
730  /* Where does the box fall relative to the start of the working range? */
731  minoffset = ndb->min[d] - smin;
732  maxoffset = ndb->max[d] - smin;
733 
734  /* Skip boxes that are not entirely inside our working range (either edge outside [0, swidth]) */
735  if ( minoffset < 0 || minoffset > swidth ||
736  maxoffset < 0 || maxoffset > swidth )
737  {
738  continue;
739  }
740 
741  /* What bins does this range correspond to? The double result is truncated when stored in the int. NOTE(review): the test above allows maxoffset == swidth, which makes bmax == num_bins, so the loop below writes counts[num_bins], one element past the end of the array; bmax should be clamped to num_bins - 1. */
742  bmin = num_bins * (minoffset) / swidth;
743  bmax = num_bins * (maxoffset) / swidth;
744 
745  POSTGIS_DEBUGF(4, " dimension %d, feature %d: bin %d to bin %d", d, i, bmin, bmax);
746 
747  /* Increment the counts in all the bins this feature overlaps (bmin through bmax inclusive) */
748  for ( k = bmin; k <= bmax; k++ )
749  {
750  counts[k] += 1;
751  }
752 
753  }
754 
755  /* How dispersed is the distribution of features across bins? */
756  range = range_quintile(counts, num_bins);
757 
758 #if POSTGIS_DEBUG_LEVEL >= 3
759  average = avg(counts, num_bins);
760  sdev = stddev(counts, num_bins);
761  sdev_ratio = sdev/average;
762 
763  POSTGIS_DEBUGF(3, " dimension %d: range = %d", d, range);
764  POSTGIS_DEBUGF(3, " dimension %d: average = %.6g", d, average);
765  POSTGIS_DEBUGF(3, " dimension %d: stddev = %.6g", d, sdev);
766  POSTGIS_DEBUGF(3, " dimension %d: stddev_ratio = %.6g", d, sdev_ratio);
767 #endif
768 
769  distribution[d] = range; /* Result for this dimension, stored as a double */
770  }
771 
772  return TRUE; /* The only return statement: every path reports TRUE */
773 }
#define MIN_DIMENSION_WIDTH
Minimum width of a dimension that we'll bother trying to compute statistics on.
static int range_quintile(int *vals, int nvals)
The difference between the fourth and first quintile values, the "inter-quintile range".
float4 max[ND_DIMS]
float4 min[ND_DIMS]
#define TRUE
Definition: dbfopen.c:169
N-dimensional box type for calculations, to avoid doing explicit axis conversions from GBOX in all ca...

Here is the call graph for this function:

Here is the caller graph for this function: