PostGIS  3.1.6dev-r@@SVN_REVISION@@

◆ estimate_join_selectivity()

static float8 estimate_join_selectivity ( const ND_STATS s1,
const ND_STATS s2 
)
static

Given two statistics histograms, what is the selectivity of a join driven by the && or &&& operator?

Join selectivity is defined as the number of rows returned by the join operator divided by the number of rows that an unconstrained join would return (nrows1*nrows2).

To get the estimate of join rows, we walk through the cells of one histogram, and multiply the cell value by the proportion of the cells in the other histogram the cell overlaps: val += val1 * ( val2 * overlap_ratio )

Definition at line 1049 of file gserialized_estimate.c.

1050 {
1051  int ncells1, ncells2;
1052  int ndims1, ndims2, ndims;
1053  double ntuples_max;
1054  double ntuples_not_null1, ntuples_not_null2;
1055 
1056  ND_BOX extent1, extent2;
1057  ND_IBOX ibox1, ibox2;
1058  int at1[ND_DIMS];
1059  int at2[ND_DIMS];
1060  double min1[ND_DIMS];
1061  double width1[ND_DIMS];
1062  double cellsize1[ND_DIMS];
1063  int size2[ND_DIMS];
1064  double min2[ND_DIMS];
1065  double width2[ND_DIMS];
1066  double cellsize2[ND_DIMS];
1067  int size1[ND_DIMS];
1068  int d;
1069  double val = 0;
1070  float8 selectivity;
1071 
1072  /* Drop out on null inputs */
1073  if ( ! ( s1 && s2 ) )
1074  {
1075  elog(NOTICE, " estimate_join_selectivity called with null inputs");
1076  return FALLBACK_ND_SEL;
1077  }
1078 
1079  /* We need to know how many cells each side has... */
1080  ncells1 = (int)roundf(s1->histogram_cells);
1081  ncells2 = (int)roundf(s2->histogram_cells);
1082 
1083  /* ...so that we can drive the summation loop with the smaller histogram. */
1084  if ( ncells1 > ncells2 )
1085  {
1086  const ND_STATS *stats_tmp = s1;
1087  s1 = s2;
1088  s2 = stats_tmp;
1089  }
1090 
1091  POSTGIS_DEBUGF(3, "s1: %s", nd_stats_to_json(s1));
1092  POSTGIS_DEBUGF(3, "s2: %s", nd_stats_to_json(s2));
1093 
1094  /* Re-read that info after the swap */
1095  ncells1 = (int)roundf(s1->histogram_cells);
1096  ncells2 = (int)roundf(s2->histogram_cells);
1097 
1098  /* Q: What's the largest possible join size these relations can create? */
1099  /* A: The product of the # of non-null rows in each relation. */
1100  ntuples_not_null1 = s1->table_features * (s1->not_null_features / s1->sample_features);
1101  ntuples_not_null2 = s2->table_features * (s2->not_null_features / s2->sample_features);
1102  ntuples_max = ntuples_not_null1 * ntuples_not_null2;
1103 
1104  /* Get the ndims as ints */
1105  ndims1 = (int)roundf(s1->ndims);
1106  ndims2 = (int)roundf(s2->ndims);
1107  ndims = Max(ndims1, ndims2);
1108 
1109  /* Get the extents */
1110  extent1 = s1->extent;
1111  extent2 = s2->extent;
1112 
1113  /* If relation stats do not intersect, join is very very selective. */
1114  if ( ! nd_box_intersects(&extent1, &extent2, ndims) )
1115  {
1116  POSTGIS_DEBUG(3, "relation stats do not intersect, returning 0");
1117  PG_RETURN_FLOAT8(0.0);
1118  }
1119 
1120  /*
1121  * First find the index range of the part of the smaller
1122  * histogram that overlaps the larger one.
1123  */
1124  if ( ! nd_box_overlap(s1, &extent2, &ibox1) )
1125  {
1126  POSTGIS_DEBUG(3, "could not calculate overlap of relations");
1127  PG_RETURN_FLOAT8(FALLBACK_ND_JOINSEL);
1128  }
1129 
1130  /* Initialize counters / constants on s1 */
1131  for ( d = 0; d < ndims1; d++ )
1132  {
1133  at1[d] = ibox1.min[d];
1134  min1[d] = s1->extent.min[d];
1135  width1[d] = s1->extent.max[d] - s1->extent.min[d];
1136  size1[d] = (int)roundf(s1->size[d]);
1137  cellsize1[d] = width1[d] / size1[d];
1138  }
1139 
1140  /* Initialize counters / constants on s2 */
1141  for ( d = 0; d < ndims2; d++ )
1142  {
1143  min2[d] = s2->extent.min[d];
1144  width2[d] = s2->extent.max[d] - s2->extent.min[d];
1145  size2[d] = (int)roundf(s2->size[d]);
1146  cellsize2[d] = width2[d] / size2[d];
1147  }
1148 
1149  /* For each affected cell of s1... */
1150  do
1151  {
1152  double val1;
1153  /* Construct the bounds of this cell */
1154  ND_BOX nd_cell1;
1155  nd_box_init(&nd_cell1);
1156  for ( d = 0; d < ndims1; d++ )
1157  {
1158  nd_cell1.min[d] = min1[d] + (at1[d]+0) * cellsize1[d];
1159  nd_cell1.max[d] = min1[d] + (at1[d]+1) * cellsize1[d];
1160  }
1161 
1162  /* Find the cells of s2 that cell1 overlaps.. */
1163  nd_box_overlap(s2, &nd_cell1, &ibox2);
1164 
1165  /* Initialize counter */
1166  for ( d = 0; d < ndims2; d++ )
1167  {
1168  at2[d] = ibox2.min[d];
1169  }
1170 
1171  POSTGIS_DEBUGF(3, "at1 %d,%d %s", at1[0], at1[1], nd_box_to_json(&nd_cell1, ndims1));
1172 
1173  /* Get the value at this cell */
1174  val1 = s1->value[nd_stats_value_index(s1, at1)];
1175 
1176  /* For each overlapped cell of s2... */
1177  do
1178  {
1179  double ratio2;
1180  double val2;
1181 
1182  /* Construct the bounds of this cell */
1183  ND_BOX nd_cell2;
1184  nd_box_init(&nd_cell2);
1185  for ( d = 0; d < ndims2; d++ )
1186  {
1187  nd_cell2.min[d] = min2[d] + (at2[d]+0) * cellsize2[d];
1188  nd_cell2.max[d] = min2[d] + (at2[d]+1) * cellsize2[d];
1189  }
1190 
1191  POSTGIS_DEBUGF(3, " at2 %d,%d %s", at2[0], at2[1], nd_box_to_json(&nd_cell2, ndims2));
1192 
1193  /* Calculate overlap ratio of the cells */
1194  ratio2 = nd_box_ratio(&nd_cell1, &nd_cell2, Max(ndims1, ndims2));
1195 
1196  /* Multiply the cell counts, scaled by overlap ratio */
1197  val2 = s2->value[nd_stats_value_index(s2, at2)];
1198  POSTGIS_DEBUGF(3, " val1 %.6g val2 %.6g ratio %.6g", val1, val2, ratio2);
1199  val += val1 * (val2 * ratio2);
1200  }
1201  while ( nd_increment(&ibox2, ndims2, at2) );
1202 
1203  }
1204  while( nd_increment(&ibox1, ndims1, at1) );
1205 
1206  POSTGIS_DEBUGF(3, "val of histogram = %g", val);
1207 
1208  /*
1209  * In order to compare our total cell count "val" to the
1210  * ntuples_max, we need to scale val up to reflect a full
1211  * table estimate. So, multiply by ratio of table size to
1212  * sample size.
1213  */
1214  val *= (s1->table_features / s1->sample_features);
1215  val *= (s2->table_features / s2->sample_features);
1216 
1217  POSTGIS_DEBUGF(3, "val scaled to full table size = %g", val);
1218 
1219  /*
1220  * Because the cell counts are over-determined due to
1221  * double counting of features that overlap multiple cells
1222  * (see the compute_gserialized_stats routine)
1223  * we also have to scale our cell count "val" *down*
1224  * to adjust for the double counting.
1225  */
1226 // val /= (s1->cells_covered / s1->histogram_features);
1227 // val /= (s2->cells_covered / s2->histogram_features);
1228 
1229  /*
1230  * Finally, the selectivity is the estimated number of
1231  * rows to be returned divided by the maximum possible
1232  * number of rows that can be returned.
1233  */
1234  selectivity = val / ntuples_max;
1235 
1236  /* Guard against over-estimates and crazy numbers :) */
1237  if ( isnan(selectivity) || ! isfinite(selectivity) || selectivity < 0.0 )
1238  {
1239  selectivity = DEFAULT_ND_JOINSEL;
1240  }
1241  else if ( selectivity > 1.0 )
1242  {
1243  selectivity = 1.0;
1244  }
1245 
1246  return selectivity;
1247 }
static char * nd_stats_to_json(const ND_STATS *nd_stats)
Convert an ND_STATS to a JSON representation for external use.
static int nd_box_intersects(const ND_BOX *a, const ND_BOX *b, int ndims)
Return true if ND_BOX a overlaps b, false otherwise.
static int nd_increment(ND_IBOX *ibox, int ndims, int *counter)
Given an n-d index array (counter), and a domain to increment it in (ibox) increment it by one,...
#define ND_DIMS
The maximum number of dimensions our code can handle.
#define DEFAULT_ND_JOINSEL
#define FALLBACK_ND_SEL
More modest fallback selectivity factor.
#define FALLBACK_ND_JOINSEL
static int nd_box_init(ND_BOX *a)
Zero out an ND_BOX.
static int nd_box_overlap(const ND_STATS *nd_stats, const ND_BOX *nd_box, ND_IBOX *nd_ibox)
What stats cells overlap with this ND_BOX? Put the lowest cell addresses in ND_IBOX->min and the high...
static double nd_box_ratio(const ND_BOX *b1, const ND_BOX *b2, int ndims)
Returns the proportion of b2 that is covered by b1.
static int nd_stats_value_index(const ND_STATS *stats, int *indexes)
Given a position in the n-d histogram (i,j,k) return the position in the 1-d values array.
static char * nd_box_to_json(const ND_BOX *nd_box, int ndims)
Convert an ND_BOX to a JSON string for printing.
float4 max[ND_DIMS]
float4 min[ND_DIMS]
N-dimensional box type for calculations, to avoid doing explicit axis conversions from GBOX in all ca...
int min[ND_DIMS]
N-dimensional box index type.
float4 size[ND_DIMS]
N-dimensional statistics structure.

References DEFAULT_ND_JOINSEL, ND_STATS_T::extent, FALLBACK_ND_JOINSEL, FALLBACK_ND_SEL, ND_STATS_T::histogram_cells, ND_BOX_T::max, ND_BOX_T::min, ND_IBOX_T::min, nd_box_init(), nd_box_intersects(), nd_box_overlap(), nd_box_ratio(), nd_box_to_json(), ND_DIMS, nd_increment(), nd_stats_to_json(), nd_stats_value_index(), ND_STATS_T::ndims, ND_STATS_T::not_null_features, ND_STATS_T::sample_features, ND_STATS_T::size, ND_STATS_T::table_features, and ND_STATS_T::value.

Referenced by _postgis_gserialized_joinsel(), and gserialized_joinsel_internal().

Here is the call graph for this function:
Here is the caller graph for this function: