PostGIS  3.2.2dev-r@@SVN_REVISION@@

◆ estimate_join_selectivity()

static float8 estimate_join_selectivity ( const ND_STATS s1,
const ND_STATS s2 
)
static

Given two statistics histograms, what is the selectivity of a join driven by the && or &&& operator?

Join selectivity is defined as the number of rows returned by the join operator divided by the number of rows that an unconstrained join would return (nrows1*nrows2).

To get the estimate of join rows, we walk through the cells of one histogram, and multiply the cell value by the proportion of the cells in the other histogram the cell overlaps: val += val1 * ( val2 * overlap_ratio )

Definition at line 1047 of file gserialized_estimate.c.

1048 {
1049  int ncells1, ncells2;
1050  int ndims1, ndims2, ndims;
1051  double ntuples_max;
1052  double ntuples_not_null1, ntuples_not_null2;
1053 
1054  ND_BOX extent1, extent2;
1055  ND_IBOX ibox1, ibox2;
1056  int at1[ND_DIMS];
1057  int at2[ND_DIMS];
1058  double min1[ND_DIMS];
1059  double width1[ND_DIMS];
1060  double cellsize1[ND_DIMS];
1061  int size2[ND_DIMS];
1062  double min2[ND_DIMS];
1063  double width2[ND_DIMS];
1064  double cellsize2[ND_DIMS];
1065  int size1[ND_DIMS];
1066  int d;
1067  double val = 0;
1068  float8 selectivity;
1069 
1070  /* Drop out on null inputs */
1071  if ( ! ( s1 && s2 ) )
1072  {
1073  elog(NOTICE, " estimate_join_selectivity called with null inputs");
1074  return FALLBACK_ND_SEL;
1075  }
1076 
1077  /* We need to know how many cells each side has... */
1078  ncells1 = (int)roundf(s1->histogram_cells);
1079  ncells2 = (int)roundf(s2->histogram_cells);
1080 
1081  /* ...so that we can drive the summation loop with the smaller histogram. */
1082  if ( ncells1 > ncells2 )
1083  {
1084  const ND_STATS *stats_tmp = s1;
1085  s1 = s2;
1086  s2 = stats_tmp;
1087  }
1088 
1089  POSTGIS_DEBUGF(3, "s1: %s", nd_stats_to_json(s1));
1090  POSTGIS_DEBUGF(3, "s2: %s", nd_stats_to_json(s2));
1091 
1092  /* Re-read that info after the swap */
1093  ncells1 = (int)roundf(s1->histogram_cells);
1094  ncells2 = (int)roundf(s2->histogram_cells);
1095 
1096  /* Q: What's the largest possible join size these relations can create? */
1097  /* A: The product of the # of non-null rows in each relation. */
1098  ntuples_not_null1 = s1->table_features * (s1->not_null_features / s1->sample_features);
1099  ntuples_not_null2 = s2->table_features * (s2->not_null_features / s2->sample_features);
1100  ntuples_max = ntuples_not_null1 * ntuples_not_null2;
1101 
1102  /* Get the ndims as ints */
1103  ndims1 = (int)roundf(s1->ndims);
1104  ndims2 = (int)roundf(s2->ndims);
1105  ndims = Max(ndims1, ndims2);
1106 
1107  /* Get the extents */
1108  extent1 = s1->extent;
1109  extent2 = s2->extent;
1110 
1111  /* If relation stats do not intersect, join is very very selective. */
1112  if ( ! nd_box_intersects(&extent1, &extent2, ndims) )
1113  {
1114  POSTGIS_DEBUG(3, "relation stats do not intersect, returning 0");
1115  PG_RETURN_FLOAT8(0.0);
1116  }
1117 
1118  /*
1119  * First find the index range of the part of the smaller
1120  * histogram that overlaps the larger one.
1121  */
1122  if ( ! nd_box_overlap(s1, &extent2, &ibox1) )
1123  {
1124  POSTGIS_DEBUG(3, "could not calculate overlap of relations");
1125  PG_RETURN_FLOAT8(FALLBACK_ND_JOINSEL);
1126  }
1127 
1128  /* Initialize counters / constants on s1 */
1129  for ( d = 0; d < ndims1; d++ )
1130  {
1131  at1[d] = ibox1.min[d];
1132  min1[d] = s1->extent.min[d];
1133  width1[d] = s1->extent.max[d] - s1->extent.min[d];
1134  size1[d] = (int)roundf(s1->size[d]);
1135  cellsize1[d] = width1[d] / size1[d];
1136  }
1137 
1138  /* Initialize counters / constants on s2 */
1139  for ( d = 0; d < ndims2; d++ )
1140  {
1141  min2[d] = s2->extent.min[d];
1142  width2[d] = s2->extent.max[d] - s2->extent.min[d];
1143  size2[d] = (int)roundf(s2->size[d]);
1144  cellsize2[d] = width2[d] / size2[d];
1145  }
1146 
1147  /* For each affected cell of s1... */
1148  do
1149  {
1150  double val1;
1151  /* Construct the bounds of this cell */
1152  ND_BOX nd_cell1;
1153  nd_box_init(&nd_cell1);
1154  for ( d = 0; d < ndims1; d++ )
1155  {
1156  nd_cell1.min[d] = min1[d] + (at1[d]+0) * cellsize1[d];
1157  nd_cell1.max[d] = min1[d] + (at1[d]+1) * cellsize1[d];
1158  }
1159 
1160  /* Find the cells of s2 that cell1 overlaps.. */
1161  nd_box_overlap(s2, &nd_cell1, &ibox2);
1162 
1163  /* Initialize counter */
1164  for ( d = 0; d < ndims2; d++ )
1165  {
1166  at2[d] = ibox2.min[d];
1167  }
1168 
1169  POSTGIS_DEBUGF(3, "at1 %d,%d %s", at1[0], at1[1], nd_box_to_json(&nd_cell1, ndims1));
1170 
1171  /* Get the value at this cell */
1172  val1 = s1->value[nd_stats_value_index(s1, at1)];
1173 
1174  /* For each overlapped cell of s2... */
1175  do
1176  {
1177  double ratio2;
1178  double val2;
1179 
1180  /* Construct the bounds of this cell */
1181  ND_BOX nd_cell2;
1182  nd_box_init(&nd_cell2);
1183  for ( d = 0; d < ndims2; d++ )
1184  {
1185  nd_cell2.min[d] = min2[d] + (at2[d]+0) * cellsize2[d];
1186  nd_cell2.max[d] = min2[d] + (at2[d]+1) * cellsize2[d];
1187  }
1188 
1189  POSTGIS_DEBUGF(3, " at2 %d,%d %s", at2[0], at2[1], nd_box_to_json(&nd_cell2, ndims2));
1190 
1191  /* Calculate overlap ratio of the cells */
1192  ratio2 = nd_box_ratio(&nd_cell1, &nd_cell2, Max(ndims1, ndims2));
1193 
1194  /* Multiply the cell counts, scaled by overlap ratio */
1195  val2 = s2->value[nd_stats_value_index(s2, at2)];
1196  POSTGIS_DEBUGF(3, " val1 %.6g val2 %.6g ratio %.6g", val1, val2, ratio2);
1197  val += val1 * (val2 * ratio2);
1198  }
1199  while ( nd_increment(&ibox2, ndims2, at2) );
1200 
1201  }
1202  while( nd_increment(&ibox1, ndims1, at1) );
1203 
1204  POSTGIS_DEBUGF(3, "val of histogram = %g", val);
1205 
1206  /*
1207  * In order to compare our total cell count "val" to the
1208  * ntuples_max, we need to scale val up to reflect a full
1209  * table estimate. So, multiply by ratio of table size to
1210  * sample size.
1211  */
1212  val *= (s1->table_features / s1->sample_features);
1213  val *= (s2->table_features / s2->sample_features);
1214 
1215  POSTGIS_DEBUGF(3, "val scaled to full table size = %g", val);
1216 
1217  /*
1218  * Because the cell counts are over-determined due to
1219  * double counting of features that overlap multiple cells
1220  * (see the compute_gserialized_stats routine)
1221  * we also have to scale our cell count "val" *down*
1222  * to adjust for the double counting.
1223  */
1224 // val /= (s1->cells_covered / s1->histogram_features);
1225 // val /= (s2->cells_covered / s2->histogram_features);
1226 
1227  /*
1228  * Finally, the selectivity is the estimated number of
1229  * rows to be returned divided by the maximum possible
1230  * number of rows that can be returned.
1231  */
1232  selectivity = val / ntuples_max;
1233 
1234  /* Guard against over-estimates and crazy numbers :) */
1235  if ( isnan(selectivity) || ! isfinite(selectivity) || selectivity < 0.0 )
1236  {
1237  selectivity = DEFAULT_ND_JOINSEL;
1238  }
1239  else if ( selectivity > 1.0 )
1240  {
1241  selectivity = 1.0;
1242  }
1243 
1244  return selectivity;
1245 }
static char * nd_stats_to_json(const ND_STATS *nd_stats)
Convert an ND_STATS to a JSON representation for external use.
static int nd_box_intersects(const ND_BOX *a, const ND_BOX *b, int ndims)
Return true if ND_BOX a overlaps b, false otherwise.
static int nd_increment(ND_IBOX *ibox, int ndims, int *counter)
Given an n-d index array (counter), and a domain to increment it in (ibox) increment it by one,...
#define ND_DIMS
The maximum number of dimensions our code can handle.
#define DEFAULT_ND_JOINSEL
#define FALLBACK_ND_SEL
More modest fallback selectivity factor.
#define FALLBACK_ND_JOINSEL
static int nd_box_init(ND_BOX *a)
Zero out an ND_BOX.
static int nd_box_overlap(const ND_STATS *nd_stats, const ND_BOX *nd_box, ND_IBOX *nd_ibox)
What stats cells overlap with this ND_BOX? Put the lowest cell addresses in ND_IBOX->min and the high...
static double nd_box_ratio(const ND_BOX *b1, const ND_BOX *b2, int ndims)
Returns the proportion of b2 that is covered by b1.
static int nd_stats_value_index(const ND_STATS *stats, int *indexes)
Given a position in the n-d histogram (i,j,k) return the position in the 1-d values array.
static char * nd_box_to_json(const ND_BOX *nd_box, int ndims)
Convert an ND_BOX to a JSON string for printing.
float4 max[ND_DIMS]
float4 min[ND_DIMS]
N-dimensional box type for calculations, to avoid doing explicit axis conversions from GBOX in all ca...
int min[ND_DIMS]
N-dimensional box index type.
float4 size[ND_DIMS]
N-dimensional statistics structure.

References DEFAULT_ND_JOINSEL, ND_STATS_T::extent, FALLBACK_ND_JOINSEL, FALLBACK_ND_SEL, ND_STATS_T::histogram_cells, ND_BOX_T::max, ND_BOX_T::min, ND_IBOX_T::min, nd_box_init(), nd_box_intersects(), nd_box_overlap(), nd_box_ratio(), nd_box_to_json(), ND_DIMS, nd_increment(), nd_stats_to_json(), nd_stats_value_index(), ND_STATS_T::ndims, ND_STATS_T::not_null_features, ND_STATS_T::sample_features, ND_STATS_T::size, ND_STATS_T::table_features, and ND_STATS_T::value.

Referenced by _postgis_gserialized_joinsel(), and gserialized_joinsel_internal().

Here is the call graph for this function:
Here is the caller graph for this function: