PostGIS  3.3.9dev-r@@SVN_REVISION@@

◆ estimate_join_selectivity()

static float8 estimate_join_selectivity ( const ND_STATS s1,
const ND_STATS s2 
)
static

Given two statistics histograms, what is the selectivity of a join driven by the && or &&& operator?

Join selectivity is defined as the number of rows returned by the join operator divided by the number of rows that an unconstrained join would return (nrows1*nrows2).

To get the estimate of join rows, we walk through the cells of one histogram, and multiply the cell value by the proportion of the cells in the other histogram the cell overlaps: val += val1 * ( val2 * overlap_ratio )

Definition at line 1044 of file gserialized_estimate.c.

1045 {
1046  int ncells1, ncells2;
1047  int ndims1, ndims2, ndims;
1048  double ntuples_max;
1049  double ntuples_not_null1, ntuples_not_null2;
1050 
1051  ND_BOX extent1, extent2;
1052  ND_IBOX ibox1, ibox2;
1053  int at1[ND_DIMS];
1054  int at2[ND_DIMS];
1055  double min1[ND_DIMS];
1056  double width1[ND_DIMS];
1057  double cellsize1[ND_DIMS];
1058  int size2[ND_DIMS];
1059  double min2[ND_DIMS];
1060  double width2[ND_DIMS];
1061  double cellsize2[ND_DIMS];
1062  int size1[ND_DIMS];
1063  int d;
1064  double val = 0;
1065  float8 selectivity;
1066 
1067  /* Drop out on null inputs */
1068  if ( ! ( s1 && s2 ) )
1069  {
1070  elog(NOTICE, " estimate_join_selectivity called with null inputs");
1071  return FALLBACK_ND_SEL;
1072  }
1073 
1074  /* We need to know how many cells each side has... */
1075  ncells1 = (int)roundf(s1->histogram_cells);
1076  ncells2 = (int)roundf(s2->histogram_cells);
1077 
1078  /* ...so that we can drive the summation loop with the smaller histogram. */
1079  if ( ncells1 > ncells2 )
1080  {
1081  const ND_STATS *stats_tmp = s1;
1082  s1 = s2;
1083  s2 = stats_tmp;
1084  }
1085 
1086  POSTGIS_DEBUGF(3, "s1: %s", nd_stats_to_json(s1));
1087  POSTGIS_DEBUGF(3, "s2: %s", nd_stats_to_json(s2));
1088 
1089  /* Re-read that info after the swap */
1090  ncells1 = (int)roundf(s1->histogram_cells);
1091  ncells2 = (int)roundf(s2->histogram_cells);
1092 
1093  /* Q: What's the largest possible join size these relations can create? */
1094  /* A: The product of the # of non-null rows in each relation. */
1095  ntuples_not_null1 = s1->table_features * (s1->not_null_features / s1->sample_features);
1096  ntuples_not_null2 = s2->table_features * (s2->not_null_features / s2->sample_features);
1097  ntuples_max = ntuples_not_null1 * ntuples_not_null2;
1098 
1099  /* Get the ndims as ints */
1100  ndims1 = (int)roundf(s1->ndims);
1101  ndims2 = (int)roundf(s2->ndims);
1102  ndims = Max(ndims1, ndims2);
1103 
1104  /* Get the extents */
1105  extent1 = s1->extent;
1106  extent2 = s2->extent;
1107 
1108  /* If relation stats do not intersect, join is very very selective. */
1109  if ( ! nd_box_intersects(&extent1, &extent2, ndims) )
1110  {
1111  POSTGIS_DEBUG(3, "relation stats do not intersect, returning 0");
1112  PG_RETURN_FLOAT8(0.0);
1113  }
1114 
1115  /*
1116  * First find the index range of the part of the smaller
1117  * histogram that overlaps the larger one.
1118  */
1119  if ( ! nd_box_overlap(s1, &extent2, &ibox1) )
1120  {
1121  POSTGIS_DEBUG(3, "could not calculate overlap of relations");
1122  PG_RETURN_FLOAT8(FALLBACK_ND_JOINSEL);
1123  }
1124 
1125  /* Initialize counters / constants on s1 */
1126  for ( d = 0; d < ndims1; d++ )
1127  {
1128  at1[d] = ibox1.min[d];
1129  min1[d] = s1->extent.min[d];
1130  width1[d] = s1->extent.max[d] - s1->extent.min[d];
1131  size1[d] = (int)roundf(s1->size[d]);
1132  cellsize1[d] = width1[d] / size1[d];
1133  }
1134 
1135  /* Initialize counters / constants on s2 */
1136  for ( d = 0; d < ndims2; d++ )
1137  {
1138  min2[d] = s2->extent.min[d];
1139  width2[d] = s2->extent.max[d] - s2->extent.min[d];
1140  size2[d] = (int)roundf(s2->size[d]);
1141  cellsize2[d] = width2[d] / size2[d];
1142  }
1143 
1144  /* For each affected cell of s1... */
1145  do
1146  {
1147  double val1;
1148  /* Construct the bounds of this cell */
1149  ND_BOX nd_cell1;
1150  nd_box_init(&nd_cell1);
1151  for ( d = 0; d < ndims1; d++ )
1152  {
1153  nd_cell1.min[d] = min1[d] + (at1[d]+0) * cellsize1[d];
1154  nd_cell1.max[d] = min1[d] + (at1[d]+1) * cellsize1[d];
1155  }
1156 
1157  /* Find the cells of s2 that cell1 overlaps.. */
1158  nd_box_overlap(s2, &nd_cell1, &ibox2);
1159 
1160  /* Initialize counter */
1161  for ( d = 0; d < ndims2; d++ )
1162  {
1163  at2[d] = ibox2.min[d];
1164  }
1165 
1166  POSTGIS_DEBUGF(3, "at1 %d,%d %s", at1[0], at1[1], nd_box_to_json(&nd_cell1, ndims1));
1167 
1168  /* Get the value at this cell */
1169  val1 = s1->value[nd_stats_value_index(s1, at1)];
1170 
1171  /* For each overlapped cell of s2... */
1172  do
1173  {
1174  double ratio2;
1175  double val2;
1176 
1177  /* Construct the bounds of this cell */
1178  ND_BOX nd_cell2;
1179  nd_box_init(&nd_cell2);
1180  for ( d = 0; d < ndims2; d++ )
1181  {
1182  nd_cell2.min[d] = min2[d] + (at2[d]+0) * cellsize2[d];
1183  nd_cell2.max[d] = min2[d] + (at2[d]+1) * cellsize2[d];
1184  }
1185 
1186  POSTGIS_DEBUGF(3, " at2 %d,%d %s", at2[0], at2[1], nd_box_to_json(&nd_cell2, ndims2));
1187 
1188  /* Calculate overlap ratio of the cells */
1189  ratio2 = nd_box_ratio(&nd_cell1, &nd_cell2, Max(ndims1, ndims2));
1190 
1191  /* Multiply the cell counts, scaled by overlap ratio */
1192  val2 = s2->value[nd_stats_value_index(s2, at2)];
1193  POSTGIS_DEBUGF(3, " val1 %.6g val2 %.6g ratio %.6g", val1, val2, ratio2);
1194  val += val1 * (val2 * ratio2);
1195  }
1196  while ( nd_increment(&ibox2, ndims2, at2) );
1197 
1198  }
1199  while( nd_increment(&ibox1, ndims1, at1) );
1200 
1201  POSTGIS_DEBUGF(3, "val of histogram = %g", val);
1202 
1203  /*
1204  * In order to compare our total cell count "val" to the
1205  * ntuples_max, we need to scale val up to reflect a full
1206  * table estimate. So, multiply by ratio of table size to
1207  * sample size.
1208  */
1209  val *= (s1->table_features / s1->sample_features);
1210  val *= (s2->table_features / s2->sample_features);
1211 
1212  POSTGIS_DEBUGF(3, "val scaled to full table size = %g", val);
1213 
1214  /*
1215  * Because the cell counts are over-determined due to
1216  * double counting of features that overlap multiple cells
1217  * (see the compute_gserialized_stats routine)
1218  * we also have to scale our cell count "val" *down*
1219  * to adjust for the double counting.
1220  */
1221 // val /= (s1->cells_covered / s1->histogram_features);
1222 // val /= (s2->cells_covered / s2->histogram_features);
1223 
1224  /*
1225  * Finally, the selectivity is the estimated number of
1226  * rows to be returned divided by the maximum possible
1227  * number of rows that can be returned.
1228  */
1229  selectivity = val / ntuples_max;
1230 
1231  /* Guard against over-estimates and crazy numbers :) */
1232  if ( isnan(selectivity) || ! isfinite(selectivity) || selectivity < 0.0 )
1233  {
1234  selectivity = DEFAULT_ND_JOINSEL;
1235  }
1236  else if ( selectivity > 1.0 )
1237  {
1238  selectivity = 1.0;
1239  }
1240 
1241  return selectivity;
1242 }
static char * nd_stats_to_json(const ND_STATS *nd_stats)
Convert an ND_STATS to a JSON representation for external use.
static int nd_box_intersects(const ND_BOX *a, const ND_BOX *b, int ndims)
Return true if ND_BOX a overlaps b, false otherwise.
static int nd_increment(ND_IBOX *ibox, int ndims, int *counter)
Given an n-d index array (counter), and a domain to increment it in (ibox) increment it by one,...
#define ND_DIMS
The maximum number of dimensions our code can handle.
#define DEFAULT_ND_JOINSEL
#define FALLBACK_ND_SEL
More modest fallback selectivity factor.
#define FALLBACK_ND_JOINSEL
static int nd_box_init(ND_BOX *a)
Zero out an ND_BOX.
static int nd_box_overlap(const ND_STATS *nd_stats, const ND_BOX *nd_box, ND_IBOX *nd_ibox)
What stats cells overlap with this ND_BOX? Put the lowest cell addresses in ND_IBOX->min and the high...
static double nd_box_ratio(const ND_BOX *b1, const ND_BOX *b2, int ndims)
Returns the proportion of b2 that is covered by b1.
static int nd_stats_value_index(const ND_STATS *stats, int *indexes)
Given a position in the n-d histogram (i,j,k) return the position in the 1-d values array.
static char * nd_box_to_json(const ND_BOX *nd_box, int ndims)
Convert an ND_BOX to a JSON string for printing.
float4 max[ND_DIMS]
float4 min[ND_DIMS]
N-dimensional box type for calculations, to avoid doing explicit axis conversions from GBOX in all ca...
int min[ND_DIMS]
N-dimensional box index type.
float4 size[ND_DIMS]
N-dimensional statistics structure.

References DEFAULT_ND_JOINSEL, ND_STATS_T::extent, FALLBACK_ND_JOINSEL, FALLBACK_ND_SEL, ND_STATS_T::histogram_cells, ND_BOX_T::max, ND_BOX_T::min, ND_IBOX_T::min, nd_box_init(), nd_box_intersects(), nd_box_overlap(), nd_box_ratio(), nd_box_to_json(), ND_DIMS, nd_increment(), nd_stats_to_json(), nd_stats_value_index(), ND_STATS_T::ndims, ND_STATS_T::not_null_features, ND_STATS_T::sample_features, ND_STATS_T::size, ND_STATS_T::table_features, and ND_STATS_T::value.

Referenced by _postgis_gserialized_joinsel(), and gserialized_joinsel_internal().

Here is the call graph for this function:
Here is the caller graph for this function: