PostGIS 3.7.0dev-r@@SVN_REVISION@@
Loading...
Searching...
No Matches

◆ nd_box_array_distribution()

static int nd_box_array_distribution ( const ND_BOX **  nd_boxes,
int  num_boxes,
const ND_BOX *  extent,
int  ndims,
double *  distribution 
)
static

Calculate how homogeneously distributed or concentrated a set of boxes is within each dimension, writing to distribution[d] the range (as computed by range_full(); the range_quintile() call is currently commented out) of the overlap counts per cell in a uniform partition of the extent of that dimension.

A uniform distribution of counts will have a small range and will require few cells in a selectivity histogram. A diverse distribution of counts will have a larger range and require more cells in a selectivity histogram (to distinguish between areas of feature density and areas of feature sparseness). This measurement should help us identify cases like X/Y/Z data where there is lots of variability in density in X/Y (diversely in a multi-kilometer range) and far less in Z (in a few-hundred meter range).

Definition at line 644 of file gserialized_estimate.c.

645{
646 int d, i, k, range;
647 int *counts;
648 double smin, smax; /* Spatial min, spatial max */
649 double swidth; /* Spatial width of dimension */
650#if POSTGIS_DEBUG_LEVEL >= 3
651 double average, sdev, sdev_ratio;
652#endif
653 int bmin, bmax; /* Bin min, bin max */
654 const ND_BOX *ndb;
655 /* Bin count is num_boxes/BIN_MIN_SIZE, raised to at least 2 and then capped at MAX_NUM_BINS (assuming Min/Max are the usual macros) */
656 int num_bins = Min(Max(2, num_boxes/BIN_MIN_SIZE), MAX_NUM_BINS);
657 counts = palloc0(num_bins * sizeof(int));
658 /* The single counts array allocated above is shared by every dimension */
659 /* For each dimension... */
660 for ( d = 0; d < ndims; d++ )
661 {
662 /* Reset the shared counts array so the previous dimension's tallies do not leak in */
663 memset(counts, 0, num_bins * sizeof(int));
664
665 /* Working range of this dimension, taken from the overall extent */
666 smin = extent->min[d];
667 smax = extent->max[d];
668 swidth = smax - smin;
669
670 /* Don't try and calculate distribution of overly narrow */
671 /* or overly wide dimensions. Here we're being pretty geographical, */
672 /* expecting "normal" planar or geographic coordinates. */
673 /* Otherwise we have to "handle" +/- Inf bounded features and */
674 /* the assumptions needed for that are as bad as this hack. */
675 if ( swidth < MIN_DIMENSION_WIDTH || swidth > MAX_DIMENSION_WIDTH )
676 {
677 distribution[d] = 0; /* Such dimensions are reported with a range of zero */
678 continue;
679 }
680
681 /* Sum up the overlaps of each feature with the dimensional bins */
682 for ( i = 0; i < num_boxes; i++ )
683 {
684 double minoffset, maxoffset;
685
686 /* Skip null entries */
687 ndb = nd_boxes[i];
688 if ( ! ndb ) continue;
689
690 /* Where does box fall relative to the working range */
691 minoffset = ndb->min[d] - smin;
692 maxoffset = ndb->max[d] - smin;
693
694 /* Skip boxes that are not entirely inside our working range. */
694 /* NOTE(review): a NaN offset fails every comparison below, so such a box is not skipped -- confirm callers never pass NaN bounds */
695 if ( minoffset < 0 || minoffset > swidth ||
696 maxoffset < 0 || maxoffset > swidth )
697 {
698 continue;
699 }
700
701 /* What bins does this range correspond to? */
702 bmin = floor(num_bins * minoffset / swidth);
703 bmax = floor(num_bins * maxoffset / swidth);
704
705 /* Should only happen when maxoffset==swidth. */
705 /* NOTE(review): bmin is not clamped; when minoffset==swidth as well, bmin appears to stay at num_bins and the loop below counts nothing for that box */
706 if (bmax >= num_bins)
707 bmax = num_bins-1;
708
709 POSTGIS_DEBUGF(4, " dimension %d, feature %d: bin %d to bin %d", d, i, bmin, bmax);
710
711 /* Increment the counts in all the bins this feature overlaps */
712 for ( k = bmin; k <= bmax; k++ )
713 {
714 counts[k] += 1;
715 }
716
717 }
718
719 /* How dispersed is the distribution of features across bins? */
720 // range = range_quintile(counts, num_bins);
721 range = range_full(counts, num_bins);
722
723#if POSTGIS_DEBUG_LEVEL >= 3
724 average = avg(counts, num_bins);
725 sdev = stddev(counts, num_bins);
726 sdev_ratio = sdev/average;
727 /* NOTE(review): presumably average is 0 when no box was counted, making sdev_ratio non-finite; it is only used in the debug output below */
728 POSTGIS_DEBUGF(3, " dimension %d: range = %d", d, range);
729 POSTGIS_DEBUGF(3, " dimension %d: average = %.6g", d, average);
730 POSTGIS_DEBUGF(3, " dimension %d: stddev = %.6g", d, sdev);
731 POSTGIS_DEBUGF(3, " dimension %d: stddev_ratio = %.6g", d, sdev_ratio);
732#endif
733 /* Hand the per-dimension range back through the caller's output array */
734 distribution[d] = range;
735 }
736
737 pfree(counts);
738
739 return true; /* No failure path: the result is always true */
740}
static int range_full(int *vals, int nvals)
The difference between the fourth and first quintile values, the "inter-quintile range".
#define MAX_NUM_BINS
#define MAX_DIMENSION_WIDTH
Maximum width of a dimension that we'll bother trying to compute statistics on.
#define BIN_MIN_SIZE

References BIN_MIN_SIZE, ND_BOX_T::max, MAX_DIMENSION_WIDTH, MAX_NUM_BINS, ND_BOX_T::min, and range_full().

Referenced by compute_gserialized_stats_mode().

Here is the call graph for this function:
Here is the caller graph for this function: