PostGIS  3.6.1dev-r@@SVN_REVISION@@

◆ nd_box_array_distribution()

static int nd_box_array_distribution ( const ND_BOX **  nd_boxes,
int  num_boxes,
const ND_BOX *  extent,
int  ndims,
double *  distribution 
)
static

Calculate how much a set of boxes is homogeneously distributed or concentrated within one dimension, returning the range of the overlap counts per cell in a uniform partition of the extent of the dimension. (The range is currently computed with range_full(); the earlier range_quintile() call is commented out in the code.)

A uniform distribution of counts will have a small range and will require few cells in a selectivity histogram. A diverse distribution of counts will have a larger range and require more cells in a selectivity histogram (to distinguish between areas of feature density and areas of feature sparseness). This measurement should help us identify cases like X/Y/Z data where there is lots of variability in density in X/Y (diversely in a multi-kilometer range) and far less in Z (in a few-hundred meter range).

Definition at line 789 of file gserialized_estimate.c.

790 {
791  int d, i, k, range;
792  int *counts;
793  double smin, smax; /* Spatial min, spatial max */
794  double swidth; /* Spatial width of dimension */
795 #if POSTGIS_DEBUG_LEVEL >= 3
796  double average, sdev, sdev_ratio;
797 #endif
798  int bmin, bmax; /* Bin min, bin max */
799  const ND_BOX *ndb;
800 
801  int num_bins = Min(Max(2, num_boxes/BIN_MIN_SIZE), MAX_NUM_BINS);
802  counts = palloc0(num_bins * sizeof(int));
803 
804  /* For each dimension... */
805  for ( d = 0; d < ndims; d++ )
806  {
807  /* Initialize counts for this dimension */
808  memset(counts, 0, num_bins * sizeof(int));
809 
810 
811  smin = extent->min[d];
812  smax = extent->max[d];
813  swidth = smax - smin;
814 
815  /* Don't try and calculate distribution of overly narrow */
816  /* or overly wide dimensions. Here we're being pretty geographical, */
817  /* expecting "normal" planar or geographic coordinates. */
818  /* Otherwise we have to "handle" +/- Inf bounded features and */
819  /* the assumptions needed for that are as bad as this hack. */
820  if ( swidth < MIN_DIMENSION_WIDTH || swidth > MAX_DIMENSION_WIDTH )
821  {
822  distribution[d] = 0;
823  continue;
824  }
825 
826  /* Sum up the overlaps of each feature with the dimensional bins */
827  for ( i = 0; i < num_boxes; i++ )
828  {
829  double minoffset, maxoffset;
830 
831  /* Skip null entries */
832  ndb = nd_boxes[i];
833  if ( ! ndb ) continue;
834 
835  /* Where does box fall relative to the working range */
836  minoffset = ndb->min[d] - smin;
837  maxoffset = ndb->max[d] - smin;
838 
839  /* Skip boxes that our outside our working range */
840  if ( minoffset < 0 || minoffset > swidth ||
841  maxoffset < 0 || maxoffset > swidth )
842  {
843  continue;
844  }
845 
846  /* What bins does this range correspond to? */
847  bmin = floor(num_bins * minoffset / swidth);
848  bmax = floor(num_bins * maxoffset / swidth);
849 
850  /* Should only happen when maxoffset==swidth */
851  if (bmax >= num_bins)
852  bmax = num_bins-1;
853 
854  POSTGIS_DEBUGF(4, " dimension %d, feature %d: bin %d to bin %d", d, i, bmin, bmax);
855 
856  /* Increment the counts in all the bins this feature overlaps */
857  for ( k = bmin; k <= bmax; k++ )
858  {
859  counts[k] += 1;
860  }
861 
862  }
863 
864  /* How dispersed is the distribution of features across bins? */
865  // range = range_quintile(counts, num_bins);
866  range = range_full(counts, num_bins);
867 
868 #if POSTGIS_DEBUG_LEVEL >= 3
869  average = avg(counts, num_bins);
870  sdev = stddev(counts, num_bins);
871  sdev_ratio = sdev/average;
872 
873  POSTGIS_DEBUGF(3, " dimension %d: range = %d", d, range);
874  POSTGIS_DEBUGF(3, " dimension %d: average = %.6g", d, average);
875  POSTGIS_DEBUGF(3, " dimension %d: stddev = %.6g", d, sdev);
876  POSTGIS_DEBUGF(3, " dimension %d: stddev_ratio = %.6g", d, sdev_ratio);
877 #endif
878 
879  distribution[d] = range;
880  }
881 
882  pfree(counts);
883 
884  return true;
885 }
static int range_full(int *vals, int nvals)
The difference between the fourth and first quintile values, the "inter-quintile range".
#define MAX_NUM_BINS
#define MAX_DIMENSION_WIDTH
Maximum width of a dimension that we'll bother trying to compute statistics on.
#define BIN_MIN_SIZE
float4 max[ND_DIMS]
float4 min[ND_DIMS]
N-dimensional box type for calculations, to avoid doing explicit axis conversions from GBOX in all ca...

References BIN_MIN_SIZE, ND_BOX_T::max, MAX_DIMENSION_WIDTH, MAX_NUM_BINS, ND_BOX_T::min, and range_full().

Referenced by compute_gserialized_stats_mode().

Here is the call graph for this function:
Here is the caller graph for this function: