PostGIS 3.7.0dev-r@@SVN_REVISION@@
Loading...
Searching...
No Matches

◆ estimate_join_selectivity()

static float8 estimate_join_selectivity ( const ND_STATS s1,
const ND_STATS s2 
)
static

Given two statistics histograms, what is the selectivity of a join driven by the && or &&& operator?

Join selectivity is defined as the number of rows returned by the join operator divided by the number of rows that an unconstrained join would return (nrows1*nrows2).

To get the estimate of join rows, we walk through the cells of one histogram, and multiply the cell value by the proportion of the cells in the other histogram the cell overlaps: val += val1 * ( val2 * overlap_ratio )

Definition at line 890 of file gserialized_estimate.c.

891{
892 int ncells1, ncells2;
893 int ndims1, ndims2, ndims;
894 double ntuples_max;
895 double ntuples_not_null1, ntuples_not_null2;
896
897 ND_BOX extent1, extent2;
898 ND_IBOX ibox1, ibox2;
899 int at1[ND_DIMS];
900 int at2[ND_DIMS];
901 double min1[ND_DIMS];
902 double width1[ND_DIMS];
903 double cellsize1[ND_DIMS];
904 int size2[ND_DIMS];
905 double min2[ND_DIMS];
906 double width2[ND_DIMS];
907 double cellsize2[ND_DIMS];
908 int size1[ND_DIMS];
909 int d;
910 double val = 0;
911 float8 selectivity;
912
913 /* Drop out on null inputs */
914 if ( ! ( s1 && s2 ) )
915 {
916 elog(NOTICE, " estimate_join_selectivity called with null inputs");
917 return FALLBACK_ND_SEL;
918 }
919
920 /* We need to know how many cells each side has... */
921 ncells1 = (int)roundf(s1->histogram_cells);
922 ncells2 = (int)roundf(s2->histogram_cells);
923
924 /* ...so that we can drive the summation loop with the smaller histogram. */
925 if ( ncells1 > ncells2 )
926 {
927 const ND_STATS *stats_tmp = s1;
928 s1 = s2;
929 s2 = stats_tmp;
930 }
931
932 POSTGIS_DEBUGF(3, "s1: %s", nd_stats_to_json(s1));
933 POSTGIS_DEBUGF(3, "s2: %s", nd_stats_to_json(s2));
934
935 /* Re-read that info after the swap */
936 ncells1 = (int)roundf(s1->histogram_cells);
937 ncells2 = (int)roundf(s2->histogram_cells);
938
939 /* Q: What's the largest possible join size these relations can create? */
940 /* A: The product of the # of non-null rows in each relation. */
941 ntuples_not_null1 = s1->table_features * ((double)s1->not_null_features / s1->sample_features);
942 ntuples_not_null2 = s2->table_features * ((double)s2->not_null_features / s2->sample_features);
943 ntuples_max = ntuples_not_null1 * ntuples_not_null2;
944
945 /* Get the ndims as ints */
946 ndims1 = (int)roundf(s1->ndims);
947 ndims2 = (int)roundf(s2->ndims);
948 ndims = Max(ndims1, ndims2);
949
950 /* Get the extents */
951 extent1 = s1->extent;
952 extent2 = s2->extent;
953
954 /* If relation stats do not intersect, join is very very selective. */
955 if ( ! nd_box_intersects(&extent1, &extent2, ndims) )
956 {
957 POSTGIS_DEBUG(3, "relation stats do not intersect, returning 0");
958 PG_RETURN_FLOAT8(0.0);
959 }
960
961 /*
962 * First find the index range of the part of the smaller
963 * histogram that overlaps the larger one.
964 */
965 if ( ! nd_box_overlap(s1, &extent2, &ibox1) )
966 {
967 POSTGIS_DEBUG(3, "could not calculate overlap of relations");
968 PG_RETURN_FLOAT8(FALLBACK_ND_JOINSEL);
969 }
970
971 /* Initialize counters / constants on s1 */
972 for ( d = 0; d < ndims1; d++ )
973 {
974 at1[d] = ibox1.min[d];
975 min1[d] = s1->extent.min[d];
976 width1[d] = s1->extent.max[d] - s1->extent.min[d];
977 size1[d] = (int)roundf(s1->size[d]);
978 cellsize1[d] = width1[d] / size1[d];
979 }
980
981 /* Initialize counters / constants on s2 */
982 for ( d = 0; d < ndims2; d++ )
983 {
984 min2[d] = s2->extent.min[d];
985 width2[d] = s2->extent.max[d] - s2->extent.min[d];
986 size2[d] = (int)roundf(s2->size[d]);
987 cellsize2[d] = width2[d] / size2[d];
988 }
989
990 /* For each affected cell of s1... */
991 do
992 {
993 double val1;
994 /* Construct the bounds of this cell */
995 ND_BOX nd_cell1;
996 nd_box_init(&nd_cell1);
997 for ( d = 0; d < ndims1; d++ )
998 {
999 nd_cell1.min[d] = min1[d] + (at1[d]+0) * cellsize1[d];
1000 nd_cell1.max[d] = min1[d] + (at1[d]+1) * cellsize1[d];
1001 }
1002
1003 /* Find the cells of s2 that cell1 overlaps.. */
1004 nd_box_overlap(s2, &nd_cell1, &ibox2);
1005
1006 /* Initialize counter */
1007 for ( d = 0; d < ndims2; d++ )
1008 {
1009 at2[d] = ibox2.min[d];
1010 }
1011
1012 POSTGIS_DEBUGF(3, "at1 %d,%d %s", at1[0], at1[1], nd_box_to_json(&nd_cell1, ndims1));
1013
1014 /* Get the value at this cell */
1015 val1 = s1->value[nd_stats_value_index(s1, at1)];
1016
1017 /* For each overlapped cell of s2... */
1018 do
1019 {
1020 double ratio2;
1021 double val2;
1022
1023 /* Construct the bounds of this cell */
1024 ND_BOX nd_cell2;
1025 nd_box_init(&nd_cell2);
1026 for ( d = 0; d < ndims2; d++ )
1027 {
1028 nd_cell2.min[d] = min2[d] + (at2[d]+0) * cellsize2[d];
1029 nd_cell2.max[d] = min2[d] + (at2[d]+1) * cellsize2[d];
1030 }
1031
1032 POSTGIS_DEBUGF(3, " at2 %d,%d %s", at2[0], at2[1], nd_box_to_json(&nd_cell2, ndims2));
1033
1034 /* Calculate overlap ratio of the cells */
1035 ratio2 = nd_box_ratio(&nd_cell1, &nd_cell2, Max(ndims1, ndims2));
1036
1037 /* Multiply the cell counts, scaled by overlap ratio */
1038 val2 = s2->value[nd_stats_value_index(s2, at2)];
1039 POSTGIS_DEBUGF(3, " val1 %.6g val2 %.6g ratio %.6g", val1, val2, ratio2);
1040 val += val1 * (val2 * ratio2);
1041 }
1042 while ( nd_increment(&ibox2, ndims2, at2) );
1043
1044 }
1045 while( nd_increment(&ibox1, ndims1, at1) );
1046
1047 POSTGIS_DEBUGF(3, "val of histogram = %g", val);
1048
1049 /*
1050 * In order to compare our total cell count "val" to the
1051 * ntuples_max, we need to scale val up to reflect a full
1052 * table estimate. So, multiply by ratio of table size to
1053 * sample size.
1054 */
1055 val *= (s1->table_features / s1->sample_features);
1056 val *= (s2->table_features / s2->sample_features);
1057
1058 POSTGIS_DEBUGF(3, "val scaled to full table size = %g", val);
1059
1060 /*
1061 * Because the cell counts are over-determined due to
1062 * double counting of features that overlap multiple cells
1063 * (see the compute_gserialized_stats routine)
1064 * we also have to scale our cell count "val" *down*
1065 * to adjust for the double counting.
1066 */
1067// val /= (s1->cells_covered / s1->histogram_features);
1068// val /= (s2->cells_covered / s2->histogram_features);
1069
1070 /*
1071 * Finally, the selectivity is the estimated number of
1072 * rows to be returned divided by the maximum possible
1073 * number of rows that can be returned.
1074 */
1075 selectivity = val / ntuples_max;
1076
1077 /* Guard against over-estimates and crazy numbers :) */
1078 if ( isnan(selectivity) || ! isfinite(selectivity) || selectivity < 0.0 )
1079 {
1080 selectivity = DEFAULT_ND_JOINSEL;
1081 }
1082 else if ( selectivity > 1.0 )
1083 {
1084 selectivity = 1.0;
1085 }
1086
1087 return selectivity;
1088}
static int nd_box_intersects(const ND_BOX *a, const ND_BOX *b, int ndims)
Return true if ND_BOX a overlaps b, false otherwise.
static int nd_increment(ND_IBOX *ibox, int ndims, int *counter)
Given an n-d index array (counter), and a domain to increment it in (ibox) increment it by one,...
static char * nd_box_to_json(const ND_BOX *nd_box, int ndims)
Convert an ND_BOX to a JSON string for printing.
static char * nd_stats_to_json(const ND_STATS *nd_stats)
Convert an ND_STATS to a JSON representation for external use.
#define DEFAULT_ND_JOINSEL
#define FALLBACK_ND_SEL
More modest fallback selectivity factor.
#define FALLBACK_ND_JOINSEL
static int nd_box_init(ND_BOX *a)
Zero out an ND_BOX.
static int nd_box_overlap(const ND_STATS *nd_stats, const ND_BOX *nd_box, ND_IBOX *nd_ibox)
What stats cells overlap with this ND_BOX? Put the lowest cell addresses in ND_IBOX->min and the high...
static double nd_box_ratio(const ND_BOX *cover, const ND_BOX *target, int ndims)
static int nd_stats_value_index(const ND_STATS *stats, const int *indexes)

References DEFAULT_ND_JOINSEL, ND_STATS_T::extent, FALLBACK_ND_JOINSEL, FALLBACK_ND_SEL, ND_STATS_T::histogram_cells, ND_BOX_T::max, ND_BOX_T::min, ND_IBOX_T::min, nd_box_init(), nd_box_intersects(), nd_box_overlap(), nd_box_ratio(), nd_box_to_json(), ND_DIMS, nd_increment(), nd_stats_to_json(), nd_stats_value_index(), ND_STATS_T::ndims, ND_STATS_T::not_null_features, ND_STATS_T::sample_features, ND_STATS_T::size, ND_STATS_T::table_features, and ND_STATS_T::value.

Referenced by _postgis_gserialized_joinsel(), and gserialized_joinsel_internal().

Here is the call graph for this function:
Here is the caller graph for this function: