PostGIS  2.4.9dev-r@@SVN_REVISION@@

◆ estimate_join_selectivity()

static float8 estimate_join_selectivity ( const ND_STATS s1,
const ND_STATS s2 
)
static

Given two statistics histograms, what is the selectivity of a join driven by the && or &&& operator?

Join selectivity is defined as the number of rows returned by the join operator divided by the number of rows that an unconstrained join would return (nrows1*nrows2).

To get the estimate of join rows, we walk through the cells of one histogram, and multiply the cell value by the proportion of the cells in the other histogram the cell overlaps: val += val1 * ( val2 * overlap_ratio )

Definition at line 997 of file gserialized_estimate.c.

References DEFAULT_ND_JOINSEL, ND_STATS_T::extent, FALLBACK_ND_JOINSEL, FALLBACK_ND_SEL, gserialized_gist_joinsel_nd(), ND_STATS_T::histogram_cells, ND_BOX_T::max, ND_BOX_T::min, ND_IBOX_T::min, nd_box_init(), nd_box_intersects(), nd_box_overlap(), nd_box_ratio(), nd_box_to_json(), ND_DIMS, nd_increment(), nd_stats_to_json(), nd_stats_value_index(), ND_STATS_T::ndims, ND_STATS_T::not_null_features, PG_FUNCTION_INFO_V1(), ND_STATS_T::sample_features, ND_STATS_T::size, ND_STATS_T::table_features, and ND_STATS_T::value.

Referenced by _postgis_gserialized_joinsel(), and gserialized_gist_joinsel().

998 {
999  int ncells1, ncells2;
1000  int ndims1, ndims2, ndims;
1001  double ntuples_max;
1002  double ntuples_not_null1, ntuples_not_null2;
1003 
1004  ND_BOX extent1, extent2;
1005  ND_IBOX ibox1, ibox2;
1006  int at1[ND_DIMS];
1007  int at2[ND_DIMS];
1008  double min1[ND_DIMS];
1009  double width1[ND_DIMS];
1010  double cellsize1[ND_DIMS];
1011  int size2[ND_DIMS];
1012  double min2[ND_DIMS];
1013  double width2[ND_DIMS];
1014  double cellsize2[ND_DIMS];
1015  int size1[ND_DIMS];
1016  int d;
1017  double val = 0;
1018  float8 selectivity;
1019 
1020  /* Drop out on null inputs */
1021  if ( ! ( s1 && s2 ) )
1022  {
1023  elog(NOTICE, " estimate_join_selectivity called with null inputs");
1024  return FALLBACK_ND_SEL;
1025  }
1026 
1027  /* We need to know how many cells each side has... */
1028  ncells1 = (int)roundf(s1->histogram_cells);
1029  ncells2 = (int)roundf(s2->histogram_cells);
1030 
1031  /* ...so that we can drive the summation loop with the smaller histogram. */
1032  if ( ncells1 > ncells2 )
1033  {
1034  const ND_STATS *stats_tmp = s1;
1035  s1 = s2;
1036  s2 = stats_tmp;
1037  }
1038 
1039  POSTGIS_DEBUGF(3, "s1: %s", nd_stats_to_json(s1));
1040  POSTGIS_DEBUGF(3, "s2: %s", nd_stats_to_json(s2));
1041 
1042  /* Re-read that info after the swap */
1043  ncells1 = (int)roundf(s1->histogram_cells);
1044  ncells2 = (int)roundf(s2->histogram_cells);
1045 
1046  /* Q: What's the largest possible join size these relations can create? */
1047  /* A: The product of the # of non-null rows in each relation. */
1048  ntuples_not_null1 = s1->table_features * (s1->not_null_features / s1->sample_features);
1049  ntuples_not_null2 = s2->table_features * (s2->not_null_features / s2->sample_features);
1050  ntuples_max = ntuples_not_null1 * ntuples_not_null2;
1051 
1052  /* Get the ndims as ints */
1053  ndims1 = (int)roundf(s1->ndims);
1054  ndims2 = (int)roundf(s2->ndims);
1055  ndims = Max(ndims1, ndims2);
1056 
1057  /* Get the extents */
1058  extent1 = s1->extent;
1059  extent2 = s2->extent;
1060 
1061  /* If relation stats do not intersect, join is very very selective. */
1062  if ( ! nd_box_intersects(&extent1, &extent2, ndims) )
1063  {
1064  POSTGIS_DEBUG(3, "relation stats do not intersect, returning 0");
1065  PG_RETURN_FLOAT8(0.0);
1066  }
1067 
1068  /*
1069  * First find the index range of the part of the smaller
1070  * histogram that overlaps the larger one.
1071  */
1072  if ( ! nd_box_overlap(s1, &extent2, &ibox1) )
1073  {
1074  POSTGIS_DEBUG(3, "could not calculate overlap of relations");
1075  PG_RETURN_FLOAT8(FALLBACK_ND_JOINSEL);
1076  }
1077 
1078  /* Initialize counters / constants on s1 */
1079  for ( d = 0; d < ndims1; d++ )
1080  {
1081  at1[d] = ibox1.min[d];
1082  min1[d] = s1->extent.min[d];
1083  width1[d] = s1->extent.max[d] - s1->extent.min[d];
1084  size1[d] = (int)roundf(s1->size[d]);
1085  cellsize1[d] = width1[d] / size1[d];
1086  }
1087 
1088  /* Initialize counters / constants on s2 */
1089  for ( d = 0; d < ndims2; d++ )
1090  {
1091  min2[d] = s2->extent.min[d];
1092  width2[d] = s2->extent.max[d] - s2->extent.min[d];
1093  size2[d] = (int)roundf(s2->size[d]);
1094  cellsize2[d] = width2[d] / size2[d];
1095  }
1096 
1097  /* For each affected cell of s1... */
1098  do
1099  {
1100  double val1;
1101  /* Construct the bounds of this cell */
1102  ND_BOX nd_cell1;
1103  nd_box_init(&nd_cell1);
1104  for ( d = 0; d < ndims1; d++ )
1105  {
1106  nd_cell1.min[d] = min1[d] + (at1[d]+0) * cellsize1[d];
1107  nd_cell1.max[d] = min1[d] + (at1[d]+1) * cellsize1[d];
1108  }
1109 
1110  /* Find the cells of s2 that cell1 overlaps.. */
1111  nd_box_overlap(s2, &nd_cell1, &ibox2);
1112 
1113  /* Initialize counter */
1114  for ( d = 0; d < ndims2; d++ )
1115  {
1116  at2[d] = ibox2.min[d];
1117  }
1118 
1119  POSTGIS_DEBUGF(3, "at1 %d,%d %s", at1[0], at1[1], nd_box_to_json(&nd_cell1, ndims1));
1120 
1121  /* Get the value at this cell */
1122  val1 = s1->value[nd_stats_value_index(s1, at1)];
1123 
1124  /* For each overlapped cell of s2... */
1125  do
1126  {
1127  double ratio2;
1128  double val2;
1129 
1130  /* Construct the bounds of this cell */
1131  ND_BOX nd_cell2;
1132  nd_box_init(&nd_cell2);
1133  for ( d = 0; d < ndims2; d++ )
1134  {
1135  nd_cell2.min[d] = min2[d] + (at2[d]+0) * cellsize2[d];
1136  nd_cell2.max[d] = min2[d] + (at2[d]+1) * cellsize2[d];
1137  }
1138 
1139  POSTGIS_DEBUGF(3, " at2 %d,%d %s", at2[0], at2[1], nd_box_to_json(&nd_cell2, ndims2));
1140 
1141  /* Calculate overlap ratio of the cells */
1142  ratio2 = nd_box_ratio(&nd_cell1, &nd_cell2, Max(ndims1, ndims2));
1143 
1144  /* Multiply the cell counts, scaled by overlap ratio */
1145  val2 = s2->value[nd_stats_value_index(s2, at2)];
1146  POSTGIS_DEBUGF(3, " val1 %.6g val2 %.6g ratio %.6g", val1, val2, ratio2);
1147  val += val1 * (val2 * ratio2);
1148  }
1149  while ( nd_increment(&ibox2, ndims2, at2) );
1150 
1151  }
1152  while( nd_increment(&ibox1, ndims1, at1) );
1153 
1154  POSTGIS_DEBUGF(3, "val of histogram = %g", val);
1155 
1156  /*
1157  * In order to compare our total cell count "val" to the
1158  * ntuples_max, we need to scale val up to reflect a full
1159  * table estimate. So, multiply by ratio of table size to
1160  * sample size.
1161  */
1162  val *= (s1->table_features / s1->sample_features);
1163  val *= (s2->table_features / s2->sample_features);
1164 
1165  POSTGIS_DEBUGF(3, "val scaled to full table size = %g", val);
1166 
1167  /*
1168  * Because the cell counts are over-determined due to
1169  * double counting of features that overlap multiple cells
1170  * (see the compute_gserialized_stats routine)
1171  * we also have to scale our cell count "val" *down*
1172  * to adjust for the double counting.
1173  */
1174 // val /= (s1->cells_covered / s1->histogram_features);
1175 // val /= (s2->cells_covered / s2->histogram_features);
1176 
1177  /*
1178  * Finally, the selectivity is the estimated number of
1179  * rows to be returned divided by the maximum possible
1180  * number of rows that can be returned.
1181  */
1182  selectivity = val / ntuples_max;
1183 
1184  /* Guard against over-estimates and crazy numbers :) */
1185  if ( isnan(selectivity) || ! isfinite(selectivity) || selectivity < 0.0 )
1186  {
1187  selectivity = DEFAULT_ND_JOINSEL;
1188  }
1189  else if ( selectivity > 1.0 )
1190  {
1191  selectivity = 1.0;
1192  }
1193 
1194  return selectivity;
1195 }
static int nd_increment(ND_IBOX *ibox, int ndims, int *counter)
Given an n-d index array (counter), and a domain to increment it in (ibox) increment it by one...
static int nd_box_overlap(const ND_STATS *nd_stats, const ND_BOX *nd_box, ND_IBOX *nd_ibox)
What stats cells overlap with this ND_BOX? Put the lowest cell addresses in ND_IBOX->min and the high...
#define DEFAULT_ND_JOINSEL
#define ND_DIMS
The maximum number of dimensions our code can handle.
#define FALLBACK_ND_JOINSEL
#define FALLBACK_ND_SEL
More modest fallback selectivity factor.
static int nd_box_init(ND_BOX *a)
Zero out an ND_BOX.
static int nd_stats_value_index(const ND_STATS *stats, int *indexes)
Given a position in the n-d histogram (i,j,k) return the position in the 1-d values array...
float4 size[ND_DIMS]
static int nd_box_intersects(const ND_BOX *a, const ND_BOX *b, int ndims)
Return TRUE if ND_BOX a overlaps b, false otherwise.
int min[ND_DIMS]
N-dimensional box index type.
static char * nd_box_to_json(const ND_BOX *nd_box, int ndims)
Convert an ND_BOX to a JSON string for printing.
float4 max[ND_DIMS]
float4 min[ND_DIMS]
static char * nd_stats_to_json(const ND_STATS *nd_stats)
Convert an ND_STATS to a JSON representation for external use.
static double nd_box_ratio(const ND_BOX *b1, const ND_BOX *b2, int ndims)
Returns the proportion of b2 that is covered by b1.
N-dimensional statistics structure.
N-dimensional box type for calculations, to avoid doing explicit axis conversions from GBOX in all ca...
Here is the call graph for this function:
Here is the caller graph for this function: