PostGIS  2.3.8dev-r@@SVN_REVISION@@

◆ estimate_join_selectivity()

static float8 estimate_join_selectivity ( const ND_STATS s1,
const ND_STATS s2 
)
static

Given two statistics histograms, what is the selectivity of a join driven by the && or &&& operator?

Join selectivity is defined as the number of rows returned by the join operator divided by the number of rows that an unconstrained join would return (nrows1*nrows2).

To get the estimate of join rows, we walk through the cells of one histogram, and multiply the cell value by the proportion of the cells in the other histogram the cell overlaps: val += val1 * ( val2 * overlap_ratio )

Definition at line 973 of file gserialized_estimate.c.

References DEFAULT_ND_JOINSEL, ND_STATS_T::extent, FALLBACK_ND_JOINSEL, FALLBACK_ND_SEL, gserialized_gist_joinsel_nd(), ND_STATS_T::histogram_cells, ND_BOX_T::max, ND_BOX_T::min, ND_IBOX_T::min, nd_box_init(), nd_box_intersects(), nd_box_overlap(), nd_box_ratio(), nd_box_to_json(), ND_DIMS, nd_increment(), nd_stats_to_json(), nd_stats_value_index(), ND_STATS_T::ndims, ND_STATS_T::not_null_features, PG_FUNCTION_INFO_V1(), ND_STATS_T::sample_features, ND_STATS_T::size, ND_STATS_T::table_features, and ND_STATS_T::value.

Referenced by _postgis_gserialized_joinsel(), and gserialized_gist_joinsel().

974 {
975  int ncells1, ncells2;
976  int ndims1, ndims2, ndims;
977  double ntuples_max;
978  double ntuples_not_null1, ntuples_not_null2;
979 
980  ND_BOX extent1, extent2;
981  ND_IBOX ibox1, ibox2;
982  int at1[ND_DIMS];
983  int at2[ND_DIMS];
984  double min1[ND_DIMS];
985  double width1[ND_DIMS];
986  double cellsize1[ND_DIMS];
987  int size2[ND_DIMS];
988  double min2[ND_DIMS];
989  double width2[ND_DIMS];
990  double cellsize2[ND_DIMS];
991  int size1[ND_DIMS];
992  int d;
993  double val = 0;
994  float8 selectivity;
995 
996  /* Drop out on null inputs */
997  if ( ! ( s1 && s2 ) )
998  {
999  elog(NOTICE, " estimate_join_selectivity called with null inputs");
1000  return FALLBACK_ND_SEL;
1001  }
1002 
1003  /* We need to know how many cells each side has... */
1004  ncells1 = (int)roundf(s1->histogram_cells);
1005  ncells2 = (int)roundf(s2->histogram_cells);
1006 
1007  /* ...so that we can drive the summation loop with the smaller histogram. */
1008  if ( ncells1 > ncells2 )
1009  {
1010  const ND_STATS *stats_tmp = s1;
1011  s1 = s2;
1012  s2 = stats_tmp;
1013  }
1014 
1015  POSTGIS_DEBUGF(3, "s1: %s", nd_stats_to_json(s1));
1016  POSTGIS_DEBUGF(3, "s2: %s", nd_stats_to_json(s2));
1017 
1018  /* Re-read that info after the swap */
1019  ncells1 = (int)roundf(s1->histogram_cells);
1020  ncells2 = (int)roundf(s2->histogram_cells);
1021 
1022  /* Q: What's the largest possible join size these relations can create? */
1023  /* A: The product of the # of non-null rows in each relation. */
1024  ntuples_not_null1 = s1->table_features * (s1->not_null_features / s1->sample_features);
1025  ntuples_not_null2 = s2->table_features * (s2->not_null_features / s2->sample_features);
1026  ntuples_max = ntuples_not_null1 * ntuples_not_null2;
1027 
1028  /* Get the ndims as ints */
1029  ndims1 = (int)roundf(s1->ndims);
1030  ndims2 = (int)roundf(s2->ndims);
1031  ndims = Max(ndims1, ndims2);
1032 
1033  /* Get the extents */
1034  extent1 = s1->extent;
1035  extent2 = s2->extent;
1036 
1037  /* If relation stats do not intersect, join is very very selective. */
1038  if ( ! nd_box_intersects(&extent1, &extent2, ndims) )
1039  {
1040  POSTGIS_DEBUG(3, "relation stats do not intersect, returning 0");
1041  PG_RETURN_FLOAT8(0.0);
1042  }
1043 
1044  /*
1045  * First find the index range of the part of the smaller
1046  * histogram that overlaps the larger one.
1047  */
1048  if ( ! nd_box_overlap(s1, &extent2, &ibox1) )
1049  {
1050  POSTGIS_DEBUG(3, "could not calculate overlap of relations");
1051  PG_RETURN_FLOAT8(FALLBACK_ND_JOINSEL);
1052  }
1053 
1054  /* Initialize counters / constants on s1 */
1055  for ( d = 0; d < ndims1; d++ )
1056  {
1057  at1[d] = ibox1.min[d];
1058  min1[d] = s1->extent.min[d];
1059  width1[d] = s1->extent.max[d] - s1->extent.min[d];
1060  size1[d] = (int)roundf(s1->size[d]);
1061  cellsize1[d] = width1[d] / size1[d];
1062  }
1063 
1064  /* Initialize counters / constants on s2 */
1065  for ( d = 0; d < ndims2; d++ )
1066  {
1067  min2[d] = s2->extent.min[d];
1068  width2[d] = s2->extent.max[d] - s2->extent.min[d];
1069  size2[d] = (int)roundf(s2->size[d]);
1070  cellsize2[d] = width2[d] / size2[d];
1071  }
1072 
1073  /* For each affected cell of s1... */
1074  do
1075  {
1076  double val1;
1077  /* Construct the bounds of this cell */
1078  ND_BOX nd_cell1;
1079  nd_box_init(&nd_cell1);
1080  for ( d = 0; d < ndims1; d++ )
1081  {
1082  nd_cell1.min[d] = min1[d] + (at1[d]+0) * cellsize1[d];
1083  nd_cell1.max[d] = min1[d] + (at1[d]+1) * cellsize1[d];
1084  }
1085 
1086  /* Find the cells of s2 that cell1 overlaps.. */
1087  nd_box_overlap(s2, &nd_cell1, &ibox2);
1088 
1089  /* Initialize counter */
1090  for ( d = 0; d < ndims2; d++ )
1091  {
1092  at2[d] = ibox2.min[d];
1093  }
1094 
1095  POSTGIS_DEBUGF(3, "at1 %d,%d %s", at1[0], at1[1], nd_box_to_json(&nd_cell1, ndims1));
1096 
1097  /* Get the value at this cell */
1098  val1 = s1->value[nd_stats_value_index(s1, at1)];
1099 
1100  /* For each overlapped cell of s2... */
1101  do
1102  {
1103  double ratio2;
1104  double val2;
1105 
1106  /* Construct the bounds of this cell */
1107  ND_BOX nd_cell2;
1108  nd_box_init(&nd_cell2);
1109  for ( d = 0; d < ndims2; d++ )
1110  {
1111  nd_cell2.min[d] = min2[d] + (at2[d]+0) * cellsize2[d];
1112  nd_cell2.max[d] = min2[d] + (at2[d]+1) * cellsize2[d];
1113  }
1114 
1115  POSTGIS_DEBUGF(3, " at2 %d,%d %s", at2[0], at2[1], nd_box_to_json(&nd_cell2, ndims2));
1116 
1117  /* Calculate overlap ratio of the cells */
1118  ratio2 = nd_box_ratio(&nd_cell1, &nd_cell2, Max(ndims1, ndims2));
1119 
1120  /* Multiply the cell counts, scaled by overlap ratio */
1121  val2 = s2->value[nd_stats_value_index(s2, at2)];
1122  POSTGIS_DEBUGF(3, " val1 %.6g val2 %.6g ratio %.6g", val1, val2, ratio2);
1123  val += val1 * (val2 * ratio2);
1124  }
1125  while ( nd_increment(&ibox2, ndims2, at2) );
1126 
1127  }
1128  while( nd_increment(&ibox1, ndims1, at1) );
1129 
1130  POSTGIS_DEBUGF(3, "val of histogram = %g", val);
1131 
1132  /*
1133  * In order to compare our total cell count "val" to the
1134  * ntuples_max, we need to scale val up to reflect a full
1135  * table estimate. So, multiply by ratio of table size to
1136  * sample size.
1137  */
1138  val *= (s1->table_features / s1->sample_features);
1139  val *= (s2->table_features / s2->sample_features);
1140 
1141  POSTGIS_DEBUGF(3, "val scaled to full table size = %g", val);
1142 
1143  /*
1144  * Because the cell counts are over-determined due to
1145  * double counting of features that overlap multiple cells
1146  * (see the compute_gserialized_stats routine)
1147  * we also have to scale our cell count "val" *down*
1148  * to adjust for the double counting.
1149  */
1150 // val /= (s1->cells_covered / s1->histogram_features);
1151 // val /= (s2->cells_covered / s2->histogram_features);
1152 
1153  /*
1154  * Finally, the selectivity is the estimated number of
1155  * rows to be returned divided by the maximum possible
1156  * number of rows that can be returned.
1157  */
1158  selectivity = val / ntuples_max;
1159 
1160  /* Guard against over-estimates and crazy numbers :) */
1161  if ( isnan(selectivity) || ! isfinite(selectivity) || selectivity < 0.0 )
1162  {
1163  selectivity = DEFAULT_ND_JOINSEL;
1164  }
1165  else if ( selectivity > 1.0 )
1166  {
1167  selectivity = 1.0;
1168  }
1169 
1170  return selectivity;
1171 }
static int nd_increment(ND_IBOX *ibox, int ndims, int *counter)
Given an n-d index array (counter), and a domain to increment it in (ibox) increment it by one...
static int nd_box_overlap(const ND_STATS *nd_stats, const ND_BOX *nd_box, ND_IBOX *nd_ibox)
What stats cells overlap with this ND_BOX? Put the lowest cell addresses in ND_IBOX->min and the high...
#define DEFAULT_ND_JOINSEL
#define ND_DIMS
The maximum number of dimensions our code can handle.
#define FALLBACK_ND_JOINSEL
#define FALLBACK_ND_SEL
More modest fallback selectivity factor.
static int nd_box_init(ND_BOX *a)
Zero out an ND_BOX.
static int nd_stats_value_index(const ND_STATS *stats, int *indexes)
Given a position in the n-d histogram (i,j,k) return the position in the 1-d values array...
float4 size[ND_DIMS]
static int nd_box_intersects(const ND_BOX *a, const ND_BOX *b, int ndims)
Return TRUE if ND_BOX a overlaps b, false otherwise.
int min[ND_DIMS]
N-dimensional box index type.
static char * nd_box_to_json(const ND_BOX *nd_box, int ndims)
Convert an ND_BOX to a JSON string for printing.
float4 max[ND_DIMS]
float4 min[ND_DIMS]
static char * nd_stats_to_json(const ND_STATS *nd_stats)
Convert an ND_STATS to a JSON representation for external use.
static double nd_box_ratio(const ND_BOX *b1, const ND_BOX *b2, int ndims)
Returns the proportion of b2 that is covered by b1.
N-dimensional statistics structure.
N-dimensional box type for calculations, to avoid doing explicit axis conversions from GBOX in all ca...
Here is the call graph for this function:
Here is the caller graph for this function: