PostGIS 3.6.2dev-r@@SVN_REVISION@@
Loading...
Searching...
No Matches

◆ estimate_join_selectivity()

static float8 estimate_join_selectivity ( const ND_STATS s1,
const ND_STATS s2 
)
static

Given two statistics histograms, what is the selectivity of a join driven by the && or &&& operator?

Join selectivity is defined as the number of rows returned by the join operator divided by the number of rows that an unconstrained join would return (nrows1*nrows2).

To get the estimate of join rows, we walk through the cells of one histogram, and multiply the cell value by the proportion of the cells in the other histogram the cell overlaps: val += val1 * ( val2 * overlap_ratio )

Definition at line 894 of file gserialized_estimate.c.

895{
896 int ncells1, ncells2;
897 int ndims1, ndims2, ndims;
898 double ntuples_max;
899 double ntuples_not_null1, ntuples_not_null2;
900
901 ND_BOX extent1, extent2;
902 ND_IBOX ibox1, ibox2;
903 int at1[ND_DIMS];
904 int at2[ND_DIMS];
905 double min1[ND_DIMS];
906 double width1[ND_DIMS];
907 double cellsize1[ND_DIMS];
908 int size2[ND_DIMS];
909 double min2[ND_DIMS];
910 double width2[ND_DIMS];
911 double cellsize2[ND_DIMS];
912 int size1[ND_DIMS];
913 int d;
914 double val = 0;
915 float8 selectivity;
916
917 /* Drop out on null inputs */
918 if ( ! ( s1 && s2 ) )
919 {
920 elog(NOTICE, " estimate_join_selectivity called with null inputs");
921 return FALLBACK_ND_SEL;
922 }
923
924 /* We need to know how many cells each side has... */
925 ncells1 = (int)roundf(s1->histogram_cells);
926 ncells2 = (int)roundf(s2->histogram_cells);
927
928 /* ...so that we can drive the summation loop with the smaller histogram. */
929 if ( ncells1 > ncells2 )
930 {
931 const ND_STATS *stats_tmp = s1;
932 s1 = s2;
933 s2 = stats_tmp;
934 }
935
936 POSTGIS_DEBUGF(3, "s1: %s", nd_stats_to_json(s1));
937 POSTGIS_DEBUGF(3, "s2: %s", nd_stats_to_json(s2));
938
939 /* Re-read that info after the swap */
940 ncells1 = (int)roundf(s1->histogram_cells);
941 ncells2 = (int)roundf(s2->histogram_cells);
942
943 /* Q: What's the largest possible join size these relations can create? */
944 /* A: The product of the # of non-null rows in each relation. */
945 ntuples_not_null1 = s1->table_features * ((double)s1->not_null_features / s1->sample_features);
946 ntuples_not_null2 = s2->table_features * ((double)s2->not_null_features / s2->sample_features);
947 ntuples_max = ntuples_not_null1 * ntuples_not_null2;
948
949 /* Get the ndims as ints */
950 ndims1 = (int)roundf(s1->ndims);
951 ndims2 = (int)roundf(s2->ndims);
952 ndims = Max(ndims1, ndims2);
953
954 /* Get the extents */
955 extent1 = s1->extent;
956 extent2 = s2->extent;
957
958 /* If relation stats do not intersect, join is very very selective. */
959 if ( ! nd_box_intersects(&extent1, &extent2, ndims) )
960 {
961 POSTGIS_DEBUG(3, "relation stats do not intersect, returning 0");
962 PG_RETURN_FLOAT8(0.0);
963 }
964
965 /*
966 * First find the index range of the part of the smaller
967 * histogram that overlaps the larger one.
968 */
969 if ( ! nd_box_overlap(s1, &extent2, &ibox1) )
970 {
971 POSTGIS_DEBUG(3, "could not calculate overlap of relations");
972 PG_RETURN_FLOAT8(FALLBACK_ND_JOINSEL);
973 }
974
975 /* Initialize counters / constants on s1 */
976 for ( d = 0; d < ndims1; d++ )
977 {
978 at1[d] = ibox1.min[d];
979 min1[d] = s1->extent.min[d];
980 width1[d] = s1->extent.max[d] - s1->extent.min[d];
981 size1[d] = (int)roundf(s1->size[d]);
982 cellsize1[d] = width1[d] / size1[d];
983 }
984
985 /* Initialize counters / constants on s2 */
986 for ( d = 0; d < ndims2; d++ )
987 {
988 min2[d] = s2->extent.min[d];
989 width2[d] = s2->extent.max[d] - s2->extent.min[d];
990 size2[d] = (int)roundf(s2->size[d]);
991 cellsize2[d] = width2[d] / size2[d];
992 }
993
994 /* For each affected cell of s1... */
995 do
996 {
997 double val1;
998 /* Construct the bounds of this cell */
999 ND_BOX nd_cell1;
1000 nd_box_init(&nd_cell1);
1001 for ( d = 0; d < ndims1; d++ )
1002 {
1003 nd_cell1.min[d] = min1[d] + (at1[d]+0) * cellsize1[d];
1004 nd_cell1.max[d] = min1[d] + (at1[d]+1) * cellsize1[d];
1005 }
1006
1007 /* Find the cells of s2 that cell1 overlaps.. */
1008 nd_box_overlap(s2, &nd_cell1, &ibox2);
1009
1010 /* Initialize counter */
1011 for ( d = 0; d < ndims2; d++ )
1012 {
1013 at2[d] = ibox2.min[d];
1014 }
1015
1016 POSTGIS_DEBUGF(3, "at1 %d,%d %s", at1[0], at1[1], nd_box_to_json(&nd_cell1, ndims1));
1017
1018 /* Get the value at this cell */
1019 val1 = s1->value[nd_stats_value_index(s1, at1)];
1020
1021 /* For each overlapped cell of s2... */
1022 do
1023 {
1024 double ratio2;
1025 double val2;
1026
1027 /* Construct the bounds of this cell */
1028 ND_BOX nd_cell2;
1029 nd_box_init(&nd_cell2);
1030 for ( d = 0; d < ndims2; d++ )
1031 {
1032 nd_cell2.min[d] = min2[d] + (at2[d]+0) * cellsize2[d];
1033 nd_cell2.max[d] = min2[d] + (at2[d]+1) * cellsize2[d];
1034 }
1035
1036 POSTGIS_DEBUGF(3, " at2 %d,%d %s", at2[0], at2[1], nd_box_to_json(&nd_cell2, ndims2));
1037
1038 /* Calculate overlap ratio of the cells */
1039 ratio2 = nd_box_ratio(&nd_cell1, &nd_cell2, Max(ndims1, ndims2));
1040
1041 /* Multiply the cell counts, scaled by overlap ratio */
1042 val2 = s2->value[nd_stats_value_index(s2, at2)];
1043 POSTGIS_DEBUGF(3, " val1 %.6g val2 %.6g ratio %.6g", val1, val2, ratio2);
1044 val += val1 * (val2 * ratio2);
1045 }
1046 while ( nd_increment(&ibox2, ndims2, at2) );
1047
1048 }
1049 while( nd_increment(&ibox1, ndims1, at1) );
1050
1051 POSTGIS_DEBUGF(3, "val of histogram = %g", val);
1052
1053 /*
1054 * In order to compare our total cell count "val" to the
1055 * ntuples_max, we need to scale val up to reflect a full
1056 * table estimate. So, multiply by ratio of table size to
1057 * sample size.
1058 */
1059 val *= (s1->table_features / s1->sample_features);
1060 val *= (s2->table_features / s2->sample_features);
1061
1062 POSTGIS_DEBUGF(3, "val scaled to full table size = %g", val);
1063
1064 /*
1065 * Because the cell counts are over-determined due to
1066 * double counting of features that overlap multiple cells
1067 * (see the compute_gserialized_stats routine)
1068 * we also have to scale our cell count "val" *down*
1069 * to adjust for the double counting.
1070 */
1071// val /= (s1->cells_covered / s1->histogram_features);
1072// val /= (s2->cells_covered / s2->histogram_features);
1073
1074 /*
1075 * Finally, the selectivity is the estimated number of
1076 * rows to be returned divided by the maximum possible
1077 * number of rows that can be returned.
1078 */
1079 selectivity = val / ntuples_max;
1080
1081 /* Guard against over-estimates and crazy numbers :) */
1082 if ( isnan(selectivity) || ! isfinite(selectivity) || selectivity < 0.0 )
1083 {
1084 selectivity = DEFAULT_ND_JOINSEL;
1085 }
1086 else if ( selectivity > 1.0 )
1087 {
1088 selectivity = 1.0;
1089 }
1090
1091 return selectivity;
1092}
static int nd_box_intersects(const ND_BOX *a, const ND_BOX *b, int ndims)
Return true if ND_BOX a overlaps b, false otherwise.
static int nd_increment(ND_IBOX *ibox, int ndims, int *counter)
Given an n-d index array (counter), and a domain to increment it in (ibox) increment it by one,...
static char * nd_box_to_json(const ND_BOX *nd_box, int ndims)
Convert an ND_BOX to a JSON string for printing.
static char * nd_stats_to_json(const ND_STATS *nd_stats)
Convert an ND_STATS to a JSON representation for external use.
#define DEFAULT_ND_JOINSEL
#define FALLBACK_ND_SEL
More modest fallback selectivity factor.
#define FALLBACK_ND_JOINSEL
static int nd_box_init(ND_BOX *a)
Zero out an ND_BOX.
static int nd_box_overlap(const ND_STATS *nd_stats, const ND_BOX *nd_box, ND_IBOX *nd_ibox)
What stats cells overlap with this ND_BOX? Put the lowest cell addresses in ND_IBOX->min and the high...
static double nd_box_ratio(const ND_BOX *cover, const ND_BOX *target, int ndims)
static int nd_stats_value_index(const ND_STATS *stats, const int *indexes)

References DEFAULT_ND_JOINSEL, ND_STATS_T::extent, FALLBACK_ND_JOINSEL, FALLBACK_ND_SEL, ND_STATS_T::histogram_cells, ND_BOX_T::max, ND_BOX_T::min, ND_IBOX_T::min, nd_box_init(), nd_box_intersects(), nd_box_overlap(), nd_box_ratio(), nd_box_to_json(), ND_DIMS, nd_increment(), nd_stats_to_json(), nd_stats_value_index(), ND_STATS_T::ndims, ND_STATS_T::not_null_features, ND_STATS_T::sample_features, ND_STATS_T::size, ND_STATS_T::table_features, and ND_STATS_T::value.

Referenced by _postgis_gserialized_joinsel(), and gserialized_joinsel_internal().

Here is the call graph for this function:
Here is the caller graph for this function: