PostGIS  3.0.6dev-r@@SVN_REVISION@@

◆ estimate_join_selectivity()

static float8 estimate_join_selectivity ( const ND_STATS s1,
const ND_STATS s2 
)
static

Given two statistics histograms, what is the selectivity of a join driven by the && or &&& operator?

Join selectivity is defined as the number of rows returned by the join operator divided by the number of rows that an unconstrained join would return (nrows1*nrows2).

To get the estimate of join rows, we walk through the cells of one histogram, and multiply the cell value by the proportion of the cells in the other histogram the cell overlaps: val += val1 * ( val2 * overlap_ratio )

Definition at line 1030 of file gserialized_estimate.c.

1031 {
1032  int ncells1, ncells2;
1033  int ndims1, ndims2, ndims;
1034  double ntuples_max;
1035  double ntuples_not_null1, ntuples_not_null2;
1036 
1037  ND_BOX extent1, extent2;
1038  ND_IBOX ibox1, ibox2;
1039  int at1[ND_DIMS];
1040  int at2[ND_DIMS];
1041  double min1[ND_DIMS];
1042  double width1[ND_DIMS];
1043  double cellsize1[ND_DIMS];
1044  int size2[ND_DIMS];
1045  double min2[ND_DIMS];
1046  double width2[ND_DIMS];
1047  double cellsize2[ND_DIMS];
1048  int size1[ND_DIMS];
1049  int d;
1050  double val = 0;
1051  float8 selectivity;
1052 
1053  /* Drop out on null inputs */
1054  if ( ! ( s1 && s2 ) )
1055  {
1056  elog(NOTICE, " estimate_join_selectivity called with null inputs");
1057  return FALLBACK_ND_SEL;
1058  }
1059 
1060  /* We need to know how many cells each side has... */
1061  ncells1 = (int)roundf(s1->histogram_cells);
1062  ncells2 = (int)roundf(s2->histogram_cells);
1063 
1064  /* ...so that we can drive the summation loop with the smaller histogram. */
1065  if ( ncells1 > ncells2 )
1066  {
1067  const ND_STATS *stats_tmp = s1;
1068  s1 = s2;
1069  s2 = stats_tmp;
1070  }
1071 
1072  POSTGIS_DEBUGF(3, "s1: %s", nd_stats_to_json(s1));
1073  POSTGIS_DEBUGF(3, "s2: %s", nd_stats_to_json(s2));
1074 
1075  /* Re-read that info after the swap */
1076  ncells1 = (int)roundf(s1->histogram_cells);
1077  ncells2 = (int)roundf(s2->histogram_cells);
1078 
1079  /* Q: What's the largest possible join size these relations can create? */
1080  /* A: The product of the # of non-null rows in each relation. */
1081  ntuples_not_null1 = s1->table_features * (s1->not_null_features / s1->sample_features);
1082  ntuples_not_null2 = s2->table_features * (s2->not_null_features / s2->sample_features);
1083  ntuples_max = ntuples_not_null1 * ntuples_not_null2;
1084 
1085  /* Get the ndims as ints */
1086  ndims1 = (int)roundf(s1->ndims);
1087  ndims2 = (int)roundf(s2->ndims);
1088  ndims = Max(ndims1, ndims2);
1089 
1090  /* Get the extents */
1091  extent1 = s1->extent;
1092  extent2 = s2->extent;
1093 
1094  /* If relation stats do not intersect, join is very very selective. */
1095  if ( ! nd_box_intersects(&extent1, &extent2, ndims) )
1096  {
1097  POSTGIS_DEBUG(3, "relation stats do not intersect, returning 0");
1098  PG_RETURN_FLOAT8(0.0);
1099  }
1100 
1101  /*
1102  * First find the index range of the part of the smaller
1103  * histogram that overlaps the larger one.
1104  */
1105  if ( ! nd_box_overlap(s1, &extent2, &ibox1) )
1106  {
1107  POSTGIS_DEBUG(3, "could not calculate overlap of relations");
1108  PG_RETURN_FLOAT8(FALLBACK_ND_JOINSEL);
1109  }
1110 
1111  /* Initialize counters / constants on s1 */
1112  for ( d = 0; d < ndims1; d++ )
1113  {
1114  at1[d] = ibox1.min[d];
1115  min1[d] = s1->extent.min[d];
1116  width1[d] = s1->extent.max[d] - s1->extent.min[d];
1117  size1[d] = (int)roundf(s1->size[d]);
1118  cellsize1[d] = width1[d] / size1[d];
1119  }
1120 
1121  /* Initialize counters / constants on s2 */
1122  for ( d = 0; d < ndims2; d++ )
1123  {
1124  min2[d] = s2->extent.min[d];
1125  width2[d] = s2->extent.max[d] - s2->extent.min[d];
1126  size2[d] = (int)roundf(s2->size[d]);
1127  cellsize2[d] = width2[d] / size2[d];
1128  }
1129 
1130  /* For each affected cell of s1... */
1131  do
1132  {
1133  double val1;
1134  /* Construct the bounds of this cell */
1135  ND_BOX nd_cell1;
1136  nd_box_init(&nd_cell1);
1137  for ( d = 0; d < ndims1; d++ )
1138  {
1139  nd_cell1.min[d] = min1[d] + (at1[d]+0) * cellsize1[d];
1140  nd_cell1.max[d] = min1[d] + (at1[d]+1) * cellsize1[d];
1141  }
1142 
1143  /* Find the cells of s2 that cell1 overlaps.. */
1144  nd_box_overlap(s2, &nd_cell1, &ibox2);
1145 
1146  /* Initialize counter */
1147  for ( d = 0; d < ndims2; d++ )
1148  {
1149  at2[d] = ibox2.min[d];
1150  }
1151 
1152  POSTGIS_DEBUGF(3, "at1 %d,%d %s", at1[0], at1[1], nd_box_to_json(&nd_cell1, ndims1));
1153 
1154  /* Get the value at this cell */
1155  val1 = s1->value[nd_stats_value_index(s1, at1)];
1156 
1157  /* For each overlapped cell of s2... */
1158  do
1159  {
1160  double ratio2;
1161  double val2;
1162 
1163  /* Construct the bounds of this cell */
1164  ND_BOX nd_cell2;
1165  nd_box_init(&nd_cell2);
1166  for ( d = 0; d < ndims2; d++ )
1167  {
1168  nd_cell2.min[d] = min2[d] + (at2[d]+0) * cellsize2[d];
1169  nd_cell2.max[d] = min2[d] + (at2[d]+1) * cellsize2[d];
1170  }
1171 
1172  POSTGIS_DEBUGF(3, " at2 %d,%d %s", at2[0], at2[1], nd_box_to_json(&nd_cell2, ndims2));
1173 
1174  /* Calculate overlap ratio of the cells */
1175  ratio2 = nd_box_ratio(&nd_cell1, &nd_cell2, Max(ndims1, ndims2));
1176 
1177  /* Multiply the cell counts, scaled by overlap ratio */
1178  val2 = s2->value[nd_stats_value_index(s2, at2)];
1179  POSTGIS_DEBUGF(3, " val1 %.6g val2 %.6g ratio %.6g", val1, val2, ratio2);
1180  val += val1 * (val2 * ratio2);
1181  }
1182  while ( nd_increment(&ibox2, ndims2, at2) );
1183 
1184  }
1185  while( nd_increment(&ibox1, ndims1, at1) );
1186 
1187  POSTGIS_DEBUGF(3, "val of histogram = %g", val);
1188 
1189  /*
1190  * In order to compare our total cell count "val" to the
1191  * ntuples_max, we need to scale val up to reflect a full
1192  * table estimate. So, multiply by ratio of table size to
1193  * sample size.
1194  */
1195  val *= (s1->table_features / s1->sample_features);
1196  val *= (s2->table_features / s2->sample_features);
1197 
1198  POSTGIS_DEBUGF(3, "val scaled to full table size = %g", val);
1199 
1200  /*
1201  * Because the cell counts are over-determined due to
1202  * double counting of features that overlap multiple cells
1203  * (see the compute_gserialized_stats routine)
1204  * we also have to scale our cell count "val" *down*
1205  * to adjust for the double counting.
1206  */
1207 // val /= (s1->cells_covered / s1->histogram_features);
1208 // val /= (s2->cells_covered / s2->histogram_features);
1209 
1210  /*
1211  * Finally, the selectivity is the estimated number of
1212  * rows to be returned divided by the maximum possible
1213  * number of rows that can be returned.
1214  */
1215  selectivity = val / ntuples_max;
1216 
1217  /* Guard against over-estimates and crazy numbers :) */
1218  if ( isnan(selectivity) || ! isfinite(selectivity) || selectivity < 0.0 )
1219  {
1220  selectivity = DEFAULT_ND_JOINSEL;
1221  }
1222  else if ( selectivity > 1.0 )
1223  {
1224  selectivity = 1.0;
1225  }
1226 
1227  return selectivity;
1228 }
static char * nd_stats_to_json(const ND_STATS *nd_stats)
Convert an ND_STATS to a JSON representation for external use.
static int nd_box_intersects(const ND_BOX *a, const ND_BOX *b, int ndims)
Return true if ND_BOX a overlaps b, false otherwise.
static int nd_increment(ND_IBOX *ibox, int ndims, int *counter)
Given an n-d index array (counter), and a domain to increment it in (ibox) increment it by one,...
#define ND_DIMS
The maximum number of dimensions our code can handle.
#define DEFAULT_ND_JOINSEL
#define FALLBACK_ND_SEL
More modest fallback selectivity factor.
#define FALLBACK_ND_JOINSEL
static int nd_box_init(ND_BOX *a)
Zero out an ND_BOX.
static int nd_box_overlap(const ND_STATS *nd_stats, const ND_BOX *nd_box, ND_IBOX *nd_ibox)
What stats cells overlap with this ND_BOX? Put the lowest cell addresses in ND_IBOX->min and the high...
static double nd_box_ratio(const ND_BOX *b1, const ND_BOX *b2, int ndims)
Returns the proportion of b2 that is covered by b1.
static int nd_stats_value_index(const ND_STATS *stats, int *indexes)
Given a position in the n-d histogram (i,j,k) return the position in the 1-d values array.
static char * nd_box_to_json(const ND_BOX *nd_box, int ndims)
Convert an ND_BOX to a JSON string for printing.
float4 max[ND_DIMS]
float4 min[ND_DIMS]
N-dimensional box type for calculations, to avoid doing explicit axis conversions from GBOX in all ca...
int min[ND_DIMS]
N-dimensional box index type.
float4 size[ND_DIMS]
N-dimensional statistics structure.

References DEFAULT_ND_JOINSEL, ND_STATS_T::extent, FALLBACK_ND_JOINSEL, FALLBACK_ND_SEL, ND_STATS_T::histogram_cells, ND_BOX_T::max, ND_BOX_T::min, ND_IBOX_T::min, nd_box_init(), nd_box_intersects(), nd_box_overlap(), nd_box_ratio(), nd_box_to_json(), ND_DIMS, nd_increment(), nd_stats_to_json(), nd_stats_value_index(), ND_STATS_T::ndims, ND_STATS_T::not_null_features, ND_STATS_T::sample_features, ND_STATS_T::size, ND_STATS_T::table_features, and ND_STATS_T::value.

Referenced by _postgis_gserialized_joinsel(), and gserialized_joinsel_internal().

Here is the call graph for this function:
Here is the caller graph for this function: