Given two statistics histograms, what is the selectivity of a join driven by the && or &&& operator?
Join selectivity is defined as the number of rows returned by the join operator divided by the number of rows that an unconstrained join would return (nrows1*nrows2).
To get the estimate of join rows, we walk through the cells of one histogram, and multiply the cell value by the proportion of the cells in the other histogram the cell overlaps: val += val1 * ( val2 * overlap_ratio )
895{
896 int ncells1, ncells2;
897 int ndims1, ndims2, ndims;
898 double ntuples_max;
899 double ntuples_not_null1, ntuples_not_null2;
900
913 int d;
914 double val = 0;
915 float8 selectivity;
916
917
918 if ( ! ( s1 && s2 ) )
919 {
920 elog(NOTICE, " estimate_join_selectivity called with null inputs");
922 }
923
924
927
928
929 if ( ncells1 > ncells2 )
930 {
932 s1 = s2;
933 s2 = stats_tmp;
934 }
935
938
939
942
943
944
947 ntuples_max = ntuples_not_null1 * ntuples_not_null2;
948
949
950 ndims1 = (int)roundf(s1->
ndims);
951 ndims2 = (int)roundf(s2->
ndims);
952 ndims = Max(ndims1, ndims2);
953
954
957
958
960 {
961 POSTGIS_DEBUG(3, "relation stats do not intersect, returning 0");
962 PG_RETURN_FLOAT8(0.0);
963 }
964
965
966
967
968
970 {
971 POSTGIS_DEBUG(3, "could not calculate overlap of relations");
973 }
974
975
976 for ( d = 0; d < ndims1; d++ )
977 {
978 at1[d] = ibox1.
min[d];
981 size1[d] = (int)roundf(s1->
size[d]);
982 cellsize1[d] = width1[d] / size1[d];
983 }
984
985
986 for ( d = 0; d < ndims2; d++ )
987 {
990 size2[d] = (int)roundf(s2->
size[d]);
991 cellsize2[d] = width2[d] / size2[d];
992 }
993
994
995 do
996 {
997 double val1;
998
1001 for ( d = 0; d < ndims1; d++ )
1002 {
1003 nd_cell1.
min[d] = min1[d] + (at1[d]+0) * cellsize1[d];
1004 nd_cell1.
max[d] = min1[d] + (at1[d]+1) * cellsize1[d];
1005 }
1006
1007
1009
1010
1011 for ( d = 0; d < ndims2; d++ )
1012 {
1013 at2[d] = ibox2.
min[d];
1014 }
1015
1016 POSTGIS_DEBUGF(3,
"at1 %d,%d %s", at1[0], at1[1],
nd_box_to_json(&nd_cell1, ndims1));
1017
1018
1020
1021
1022 do
1023 {
1024 double ratio2;
1025 double val2;
1026
1027
1030 for ( d = 0; d < ndims2; d++ )
1031 {
1032 nd_cell2.
min[d] = min2[d] + (at2[d]+0) * cellsize2[d];
1033 nd_cell2.
max[d] = min2[d] + (at2[d]+1) * cellsize2[d];
1034 }
1035
1036 POSTGIS_DEBUGF(3,
" at2 %d,%d %s", at2[0], at2[1],
nd_box_to_json(&nd_cell2, ndims2));
1037
1038
1039 ratio2 =
nd_box_ratio(&nd_cell1, &nd_cell2, Max(ndims1, ndims2));
1040
1041
1043 POSTGIS_DEBUGF(3, " val1 %.6g val2 %.6g ratio %.6g", val1, val2, ratio2);
1044 val += val1 * (val2 * ratio2);
1045 }
1047
1048 }
1050
1051 POSTGIS_DEBUGF(3, "val of histogram = %g", val);
1052
1053
1054
1055
1056
1057
1058
1061
1062 POSTGIS_DEBUGF(3, "val scaled to full table size = %g", val);
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079 selectivity = val / ntuples_max;
1080
1081
1082 if ( isnan(selectivity) || ! isfinite(selectivity) || selectivity < 0.0 )
1083 {
1085 }
1086 else if ( selectivity > 1.0 )
1087 {
1088 selectivity = 1.0;
1089 }
1090
1091 return selectivity;
1092}
static int nd_box_intersects(const ND_BOX *a, const ND_BOX *b, int ndims)
Return true if ND_BOX a overlaps b, false otherwise.
static int nd_increment(ND_IBOX *ibox, int ndims, int *counter)
Given an n-d index array (counter), and a domain to increment it in (ibox) increment it by one,...
static char * nd_box_to_json(const ND_BOX *nd_box, int ndims)
Convert an ND_BOX to a JSON string for printing.
static char * nd_stats_to_json(const ND_STATS *nd_stats)
Convert an ND_STATS to a JSON representation for external use.
#define DEFAULT_ND_JOINSEL
#define FALLBACK_ND_SEL
More modest fallback selectivity factor.
#define FALLBACK_ND_JOINSEL
static int nd_box_init(ND_BOX *a)
Zero out an ND_BOX.
static int nd_box_overlap(const ND_STATS *nd_stats, const ND_BOX *nd_box, ND_IBOX *nd_ibox)
What stats cells overlap with this ND_BOX? Put the lowest cell addresses in ND_IBOX->min and the high...
static double nd_box_ratio(const ND_BOX *cover, const ND_BOX *target, int ndims)
static int nd_stats_value_index(const ND_STATS *stats, const int *indexes)