PostGIS 3.6.2dev-r@@SVN_REVISION@@
Loading...
Searching...
No Matches

◆ nd_box_array_distribution()

static int nd_box_array_distribution ( const ND_BOX **  nd_boxes,
int  num_boxes,
const ND_BOX *  extent,
int  ndims,
double *  distribution 
)
static

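Calculate how much a set of boxes is homogeneously distributed or concentrated within one dimension, storing in distribution, for each dimension, the range of the overlap counts per cell in a uniform partition of the extent of the dimension. (The description originally named range_quintile; the listing below currently calls range_full() and leaves the range_quintile() call commented out.)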

A uniform distribution of counts will have a small range and will require few cells in a selectivity histogram. A diverse distribution of counts will have a larger range and require more cells in a selectivity histogram (to distinguish between areas of feature density and areas of feature sparseness). This measurement should help us identify cases like X/Y/Z data where there is lots of variability in density in X/Y (diversely in a multi-kilometer range) and far less in Z (in a few-hundred meter range).

Definition at line 648 of file gserialized_estimate.c.

649{
650 int d, i, k, range; /* d: dimension index, i: box index, k: bin index, range: result of range_full() */
651 int *counts; /* Per-bin overlap counts; one array reused for every dimension */
652 double smin, smax; /* Spatial min, spatial max */
653 double swidth; /* Spatial width of dimension */
654#if POSTGIS_DEBUG_LEVEL >= 3
655 double average, sdev, sdev_ratio;
656#endif
657 int bmin, bmax; /* Bin min, bin max */
658 const ND_BOX *ndb; /* Box currently being examined */
659
660 int num_bins = Min(Max(2, num_boxes/BIN_MIN_SIZE), MAX_NUM_BINS); /* num_boxes/BIN_MIN_SIZE raised to at least 2, then capped at MAX_NUM_BINS */
661 counts = palloc0(num_bins * sizeof(int));
662
663 /* For each dimension... */
664 for ( d = 0; d < ndims; d++ )
665 {
666 /* Reset counts for this dimension (the array is shared across dimensions) */
667 memset(counts, 0, num_bins * sizeof(int));
668
669
670 smin = extent->min[d];
671 smax = extent->max[d];
672 swidth = smax - smin;
673
674 /* Don't try and calculate distribution of overly narrow */
675 /* or overly wide dimensions. Here we're being pretty geographical, */
676 /* expecting "normal" planar or geographic coordinates. */
677 /* Otherwise we have to "handle" +/- Inf bounded features and */
678 /* the assumptions needed for that are as bad as this hack. */
679 if ( swidth < MIN_DIMENSION_WIDTH || swidth > MAX_DIMENSION_WIDTH )
680 {
681 distribution[d] = 0; /* Width out of bounds: report 0 for this dimension and move on */
682 continue;
683 }
684
685 /* Sum up the overlaps of each feature with the dimensional bins */
686 for ( i = 0; i < num_boxes; i++ )
687 {
688 double minoffset, maxoffset;
689
690 /* Skip null entries */
691 ndb = nd_boxes[i];
692 if ( ! ndb ) continue;
693
694 /* Where does box fall relative to the working range */
695 minoffset = ndb->min[d] - smin;
696 maxoffset = ndb->max[d] - smin;
697
698 /* Skip boxes whose min or max falls outside our working range */
699 if ( minoffset < 0 || minoffset > swidth ||
700 maxoffset < 0 || maxoffset > swidth )
701 {
702 continue;
703 }
704
705 /* What bins does this range correspond to? */
706 bmin = floor(num_bins * minoffset / swidth);
707 bmax = floor(num_bins * maxoffset / swidth);
708
709 /* Should only happen when maxoffset==swidth. Only bmax is clamped; */
/* if bmin also reaches num_bins the counting loop below runs zero times. */
710 if (bmax >= num_bins)
711 bmax = num_bins-1;
712
713 POSTGIS_DEBUGF(4, " dimension %d, feature %d: bin %d to bin %d", d, i, bmin, bmax);
714
715 /* Increment the counts in all the bins this feature overlaps */
716 for ( k = bmin; k <= bmax; k++ )
717 {
718 counts[k] += 1;
719 }
720
721 }
722
723 /* How dispersed is the distribution of features across bins? */
724 /* Alternative measure, currently disabled: range = range_quintile(counts, num_bins); */
725 range = range_full(counts, num_bins);
726
727#if POSTGIS_DEBUG_LEVEL >= 3
728 average = avg(counts, num_bins);
729 sdev = stddev(counts, num_bins);
730 sdev_ratio = sdev/average; /* NOTE(review): debug-only; average is presumably 0 when no box was counted - confirm the resulting ratio is acceptable in debug output */
731
732 POSTGIS_DEBUGF(3, " dimension %d: range = %d", d, range);
733 POSTGIS_DEBUGF(3, " dimension %d: average = %.6g", d, average);
734 POSTGIS_DEBUGF(3, " dimension %d: stddev = %.6g", d, sdev);
735 POSTGIS_DEBUGF(3, " dimension %d: stddev_ratio = %.6g", d, sdev_ratio);
736#endif
737
738 distribution[d] = range; /* int range stored into the caller's double array */
739 }
740
741 pfree(counts);
742
743 return true; /* Unconditional; no failure path returns a different value */
744}
static int range_full(int *vals, int nvals)
The difference between the fourth and first quintile values, the "inter-quintile range".
#define MAX_NUM_BINS
#define MAX_DIMENSION_WIDTH
Maximum width of a dimension that we'll bother trying to compute statistics on.
#define BIN_MIN_SIZE

References BIN_MIN_SIZE, ND_BOX_T::max, MAX_DIMENSION_WIDTH, MAX_NUM_BINS, ND_BOX_T::min, and range_full().

Referenced by compute_gserialized_stats_mode().

Here is the call graph for this function:
Here is the caller graph for this function: