PostGIS  3.4.0dev-r@@SVN_REVISION@@

◆ estimate_join_selectivity()

static float8 estimate_join_selectivity ( const ND_STATS s1,
const ND_STATS s2 
)
static

Given two statistics histograms, what is the selectivity of a join driven by the && or &&& operator?

Join selectivity is defined as the number of rows returned by the join operator divided by the number of rows that an unconstrained join would return (nrows1*nrows2).

To get the estimate of join rows, we walk through the cells of one histogram, and multiply the cell value by the proportion of the cells in the other histogram the cell overlaps: val += val1 * ( val2 * overlap_ratio )

Definition at line 1038 of file gserialized_estimate.c.

1039 {
1040  int ncells1, ncells2;
1041  int ndims1, ndims2, ndims;
1042  double ntuples_max;
1043  double ntuples_not_null1, ntuples_not_null2;
1044 
1045  ND_BOX extent1, extent2;
1046  ND_IBOX ibox1, ibox2;
1047  int at1[ND_DIMS];
1048  int at2[ND_DIMS];
1049  double min1[ND_DIMS];
1050  double width1[ND_DIMS];
1051  double cellsize1[ND_DIMS];
1052  int size2[ND_DIMS];
1053  double min2[ND_DIMS];
1054  double width2[ND_DIMS];
1055  double cellsize2[ND_DIMS];
1056  int size1[ND_DIMS];
1057  int d;
1058  double val = 0;
1059  float8 selectivity;
1060 
1061  /* Drop out on null inputs */
1062  if ( ! ( s1 && s2 ) )
1063  {
1064  elog(NOTICE, " estimate_join_selectivity called with null inputs");
1065  return FALLBACK_ND_SEL;
1066  }
1067 
1068  /* We need to know how many cells each side has... */
1069  ncells1 = (int)roundf(s1->histogram_cells);
1070  ncells2 = (int)roundf(s2->histogram_cells);
1071 
1072  /* ...so that we can drive the summation loop with the smaller histogram. */
1073  if ( ncells1 > ncells2 )
1074  {
1075  const ND_STATS *stats_tmp = s1;
1076  s1 = s2;
1077  s2 = stats_tmp;
1078  }
1079 
1080  POSTGIS_DEBUGF(3, "s1: %s", nd_stats_to_json(s1));
1081  POSTGIS_DEBUGF(3, "s2: %s", nd_stats_to_json(s2));
1082 
1083  /* Re-read that info after the swap */
1084  ncells1 = (int)roundf(s1->histogram_cells);
1085  ncells2 = (int)roundf(s2->histogram_cells);
1086 
1087  /* Q: What's the largest possible join size these relations can create? */
1088  /* A: The product of the # of non-null rows in each relation. */
1089  ntuples_not_null1 = s1->table_features * (s1->not_null_features / s1->sample_features);
1090  ntuples_not_null2 = s2->table_features * (s2->not_null_features / s2->sample_features);
1091  ntuples_max = ntuples_not_null1 * ntuples_not_null2;
1092 
1093  /* Get the ndims as ints */
1094  ndims1 = (int)roundf(s1->ndims);
1095  ndims2 = (int)roundf(s2->ndims);
1096  ndims = Max(ndims1, ndims2);
1097 
1098  /* Get the extents */
1099  extent1 = s1->extent;
1100  extent2 = s2->extent;
1101 
1102  /* If relation stats do not intersect, join is very very selective. */
1103  if ( ! nd_box_intersects(&extent1, &extent2, ndims) )
1104  {
1105  POSTGIS_DEBUG(3, "relation stats do not intersect, returning 0");
1106  PG_RETURN_FLOAT8(0.0);
1107  }
1108 
1109  /*
1110  * First find the index range of the part of the smaller
1111  * histogram that overlaps the larger one.
1112  */
1113  if ( ! nd_box_overlap(s1, &extent2, &ibox1) )
1114  {
1115  POSTGIS_DEBUG(3, "could not calculate overlap of relations");
1116  PG_RETURN_FLOAT8(FALLBACK_ND_JOINSEL);
1117  }
1118 
1119  /* Initialize counters / constants on s1 */
1120  for ( d = 0; d < ndims1; d++ )
1121  {
1122  at1[d] = ibox1.min[d];
1123  min1[d] = s1->extent.min[d];
1124  width1[d] = s1->extent.max[d] - s1->extent.min[d];
1125  size1[d] = (int)roundf(s1->size[d]);
1126  cellsize1[d] = width1[d] / size1[d];
1127  }
1128 
1129  /* Initialize counters / constants on s2 */
1130  for ( d = 0; d < ndims2; d++ )
1131  {
1132  min2[d] = s2->extent.min[d];
1133  width2[d] = s2->extent.max[d] - s2->extent.min[d];
1134  size2[d] = (int)roundf(s2->size[d]);
1135  cellsize2[d] = width2[d] / size2[d];
1136  }
1137 
1138  /* For each affected cell of s1... */
1139  do
1140  {
1141  double val1;
1142  /* Construct the bounds of this cell */
1143  ND_BOX nd_cell1;
1144  nd_box_init(&nd_cell1);
1145  for ( d = 0; d < ndims1; d++ )
1146  {
1147  nd_cell1.min[d] = min1[d] + (at1[d]+0) * cellsize1[d];
1148  nd_cell1.max[d] = min1[d] + (at1[d]+1) * cellsize1[d];
1149  }
1150 
1151  /* Find the cells of s2 that cell1 overlaps.. */
1152  nd_box_overlap(s2, &nd_cell1, &ibox2);
1153 
1154  /* Initialize counter */
1155  for ( d = 0; d < ndims2; d++ )
1156  {
1157  at2[d] = ibox2.min[d];
1158  }
1159 
1160  POSTGIS_DEBUGF(3, "at1 %d,%d %s", at1[0], at1[1], nd_box_to_json(&nd_cell1, ndims1));
1161 
1162  /* Get the value at this cell */
1163  val1 = s1->value[nd_stats_value_index(s1, at1)];
1164 
1165  /* For each overlapped cell of s2... */
1166  do
1167  {
1168  double ratio2;
1169  double val2;
1170 
1171  /* Construct the bounds of this cell */
1172  ND_BOX nd_cell2;
1173  nd_box_init(&nd_cell2);
1174  for ( d = 0; d < ndims2; d++ )
1175  {
1176  nd_cell2.min[d] = min2[d] + (at2[d]+0) * cellsize2[d];
1177  nd_cell2.max[d] = min2[d] + (at2[d]+1) * cellsize2[d];
1178  }
1179 
1180  POSTGIS_DEBUGF(3, " at2 %d,%d %s", at2[0], at2[1], nd_box_to_json(&nd_cell2, ndims2));
1181 
1182  /* Calculate overlap ratio of the cells */
1183  ratio2 = nd_box_ratio(&nd_cell1, &nd_cell2, Max(ndims1, ndims2));
1184 
1185  /* Multiply the cell counts, scaled by overlap ratio */
1186  val2 = s2->value[nd_stats_value_index(s2, at2)];
1187  POSTGIS_DEBUGF(3, " val1 %.6g val2 %.6g ratio %.6g", val1, val2, ratio2);
1188  val += val1 * (val2 * ratio2);
1189  }
1190  while ( nd_increment(&ibox2, ndims2, at2) );
1191 
1192  }
1193  while( nd_increment(&ibox1, ndims1, at1) );
1194 
1195  POSTGIS_DEBUGF(3, "val of histogram = %g", val);
1196 
1197  /*
1198  * In order to compare our total cell count "val" to the
1199  * ntuples_max, we need to scale val up to reflect a full
1200  * table estimate. So, multiply by ratio of table size to
1201  * sample size.
1202  */
1203  val *= (s1->table_features / s1->sample_features);
1204  val *= (s2->table_features / s2->sample_features);
1205 
1206  POSTGIS_DEBUGF(3, "val scaled to full table size = %g", val);
1207 
1208  /*
1209  * Because the cell counts are over-determined due to
1210  * double counting of features that overlap multiple cells
1211  * (see the compute_gserialized_stats routine)
1212  * we also have to scale our cell count "val" *down*
1213  * to adjust for the double counting.
1214  */
1215 // val /= (s1->cells_covered / s1->histogram_features);
1216 // val /= (s2->cells_covered / s2->histogram_features);
1217 
1218  /*
1219  * Finally, the selectivity is the estimated number of
1220  * rows to be returned divided by the maximum possible
1221  * number of rows that can be returned.
1222  */
1223  selectivity = val / ntuples_max;
1224 
1225  /* Guard against over-estimates and crazy numbers :) */
1226  if ( isnan(selectivity) || ! isfinite(selectivity) || selectivity < 0.0 )
1227  {
1228  selectivity = DEFAULT_ND_JOINSEL;
1229  }
1230  else if ( selectivity > 1.0 )
1231  {
1232  selectivity = 1.0;
1233  }
1234 
1235  return selectivity;
1236 }
static char * nd_stats_to_json(const ND_STATS *nd_stats)
Convert an ND_STATS to a JSON representation for external use.
static int nd_box_intersects(const ND_BOX *a, const ND_BOX *b, int ndims)
Return true if ND_BOX a overlaps b, false otherwise.
static int nd_increment(ND_IBOX *ibox, int ndims, int *counter)
Given an n-d index array (counter), and a domain to increment it in (ibox) increment it by one,...
#define ND_DIMS
The maximum number of dimensions our code can handle.
#define DEFAULT_ND_JOINSEL
#define FALLBACK_ND_SEL
More modest fallback selectivity factor.
#define FALLBACK_ND_JOINSEL
static int nd_box_init(ND_BOX *a)
Zero out an ND_BOX.
static int nd_box_overlap(const ND_STATS *nd_stats, const ND_BOX *nd_box, ND_IBOX *nd_ibox)
What stats cells overlap with this ND_BOX? Put the lowest cell addresses in ND_IBOX->min and the high...
static double nd_box_ratio(const ND_BOX *b1, const ND_BOX *b2, int ndims)
Returns the proportion of b2 that is covered by b1.
static int nd_stats_value_index(const ND_STATS *stats, int *indexes)
Given a position in the n-d histogram (i,j,k) return the position in the 1-d values array.
static char * nd_box_to_json(const ND_BOX *nd_box, int ndims)
Convert an ND_BOX to a JSON string for printing.
float4 max[ND_DIMS]
float4 min[ND_DIMS]
N-dimensional box type for calculations, to avoid doing explicit axis conversions from GBOX in all ca...
int min[ND_DIMS]
N-dimensional box index type.
float4 size[ND_DIMS]
N-dimensional statistics structure.

References DEFAULT_ND_JOINSEL, ND_STATS_T::extent, FALLBACK_ND_JOINSEL, FALLBACK_ND_SEL, ND_STATS_T::histogram_cells, ND_BOX_T::max, ND_BOX_T::min, ND_IBOX_T::min, nd_box_init(), nd_box_intersects(), nd_box_overlap(), nd_box_ratio(), nd_box_to_json(), ND_DIMS, nd_increment(), nd_stats_to_json(), nd_stats_value_index(), ND_STATS_T::ndims, ND_STATS_T::not_null_features, ND_STATS_T::sample_features, ND_STATS_T::size, ND_STATS_T::table_features, and ND_STATS_T::value.

Referenced by _postgis_gserialized_joinsel(), and gserialized_joinsel_internal().

Here is the call graph for this function:
Here is the caller graph for this function: