Calculate how much a set of boxes is homogeneously distributed or concentrated within one dimension, returning the range_quintile of the overlap counts per cell in a uniform partition of the extent of the dimension.
A uniform distribution of counts will have a small range and will require few cells in a selectivity histogram. A diverse distribution of counts will have a larger range and require more cells in a selectivity histogram (to distinguish between areas of feature density and areas of feature sparseness). This measurement should help us identify cases like X/Y/Z data where there is lots of variability in density in X/Y (diversely in a multi-kilometer range) and far less in Z (in a few-hundred meter range).
645{
646 int d, i, k, range;
647 int *counts;
648 double smin, smax;
649 double swidth;
650#if POSTGIS_DEBUG_LEVEL >= 3
651 double average, sdev, sdev_ratio;
652#endif
653 int bmin, bmax;
655
657 counts = palloc0(num_bins * sizeof(int));
658
659
660 for ( d = 0; d < ndims; d++ )
661 {
662
663 memset(counts, 0, num_bins * sizeof(int));
664
665
666 smin = extent->
min[d];
667 smax = extent->
max[d];
668 swidth = smax - smin;
669
670
671
672
673
674
676 {
677 distribution[d] = 0;
678 continue;
679 }
680
681
682 for ( i = 0; i < num_boxes; i++ )
683 {
684 double minoffset, maxoffset;
685
686
687 ndb = nd_boxes[i];
688 if ( ! ndb ) continue;
689
690
691 minoffset = ndb->
min[d] - smin;
692 maxoffset = ndb->
max[d] - smin;
693
694
695 if ( minoffset < 0 || minoffset > swidth ||
696 maxoffset < 0 || maxoffset > swidth )
697 {
698 continue;
699 }
700
701
702 bmin = floor(num_bins * minoffset / swidth);
703 bmax = floor(num_bins * maxoffset / swidth);
704
705
706 if (bmax >= num_bins)
707 bmax = num_bins-1;
708
709 POSTGIS_DEBUGF(4, " dimension %d, feature %d: bin %d to bin %d", d, i, bmin, bmax);
710
711
712 for ( k = bmin; k <= bmax; k++ )
713 {
714 counts[k] += 1;
715 }
716
717 }
718
719
720
722
723#if POSTGIS_DEBUG_LEVEL >= 3
724 average = avg(counts, num_bins);
725 sdev = stddev(counts, num_bins);
726 sdev_ratio = sdev/average;
727
728 POSTGIS_DEBUGF(3, " dimension %d: range = %d", d, range);
729 POSTGIS_DEBUGF(3, " dimension %d: average = %.6g", d, average);
730 POSTGIS_DEBUGF(3, " dimension %d: stddev = %.6g", d, sdev);
731 POSTGIS_DEBUGF(3, " dimension %d: stddev_ratio = %.6g", d, sdev_ratio);
732#endif
733
734 distribution[d] = range;
735 }
736
737 pfree(counts);
738
739 return true;
740}
static int range_full(int *vals, int nvals)
The difference between the fourth and first quintile values, the "inter-quintile range".
#define MAX_DIMENSION_WIDTH
Maximum width of a dimension that we'll bother trying to compute statistics on.