PostGIS  3.6.1dev-r@@SVN_REVISION@@

◆ estimate_join_selectivity()

static float8 estimate_join_selectivity ( const ND_STATS s1,
const ND_STATS s2 
)
static

Given two statistics histograms, what is the selectivity of a join driven by the && or &&& operator?

Join selectivity is defined as the number of rows returned by the join operator divided by the number of rows that an unconstrained join would return (nrows1*nrows2).

To get the estimate of join rows, we walk through the cells of one histogram, and multiply the cell value by the proportion of the cells in the other histogram the cell overlaps: val += val1 * ( val2 * overlap_ratio )

Definition at line 1035 of file gserialized_estimate.c.

1036 {
1037  int ncells1, ncells2;
1038  int ndims1, ndims2, ndims;
1039  double ntuples_max;
1040  double ntuples_not_null1, ntuples_not_null2;
1041 
1042  ND_BOX extent1, extent2;
1043  ND_IBOX ibox1, ibox2;
1044  int at1[ND_DIMS];
1045  int at2[ND_DIMS];
1046  double min1[ND_DIMS];
1047  double width1[ND_DIMS];
1048  double cellsize1[ND_DIMS];
1049  int size2[ND_DIMS];
1050  double min2[ND_DIMS];
1051  double width2[ND_DIMS];
1052  double cellsize2[ND_DIMS];
1053  int size1[ND_DIMS];
1054  int d;
1055  double val = 0;
1056  float8 selectivity;
1057 
1058  /* Drop out on null inputs */
1059  if ( ! ( s1 && s2 ) )
1060  {
1061  elog(NOTICE, " estimate_join_selectivity called with null inputs");
1062  return FALLBACK_ND_SEL;
1063  }
1064 
1065  /* We need to know how many cells each side has... */
1066  ncells1 = (int)roundf(s1->histogram_cells);
1067  ncells2 = (int)roundf(s2->histogram_cells);
1068 
1069  /* ...so that we can drive the summation loop with the smaller histogram. */
1070  if ( ncells1 > ncells2 )
1071  {
1072  const ND_STATS *stats_tmp = s1;
1073  s1 = s2;
1074  s2 = stats_tmp;
1075  }
1076 
1077  POSTGIS_DEBUGF(3, "s1: %s", nd_stats_to_json(s1));
1078  POSTGIS_DEBUGF(3, "s2: %s", nd_stats_to_json(s2));
1079 
1080  /* Re-read that info after the swap */
1081  ncells1 = (int)roundf(s1->histogram_cells);
1082  ncells2 = (int)roundf(s2->histogram_cells);
1083 
1084  /* Q: What's the largest possible join size these relations can create? */
1085  /* A: The product of the # of non-null rows in each relation. */
1086  ntuples_not_null1 = s1->table_features * ((double)s1->not_null_features / s1->sample_features);
1087  ntuples_not_null2 = s2->table_features * ((double)s2->not_null_features / s2->sample_features);
1088  ntuples_max = ntuples_not_null1 * ntuples_not_null2;
1089 
1090  /* Get the ndims as ints */
1091  ndims1 = (int)roundf(s1->ndims);
1092  ndims2 = (int)roundf(s2->ndims);
1093  ndims = Max(ndims1, ndims2);
1094 
1095  /* Get the extents */
1096  extent1 = s1->extent;
1097  extent2 = s2->extent;
1098 
1099  /* If relation stats do not intersect, join is very very selective. */
1100  if ( ! nd_box_intersects(&extent1, &extent2, ndims) )
1101  {
1102  POSTGIS_DEBUG(3, "relation stats do not intersect, returning 0");
1103  PG_RETURN_FLOAT8(0.0);
1104  }
1105 
1106  /*
1107  * First find the index range of the part of the smaller
1108  * histogram that overlaps the larger one.
1109  */
1110  if ( ! nd_box_overlap(s1, &extent2, &ibox1) )
1111  {
1112  POSTGIS_DEBUG(3, "could not calculate overlap of relations");
1113  PG_RETURN_FLOAT8(FALLBACK_ND_JOINSEL);
1114  }
1115 
1116  /* Initialize counters / constants on s1 */
1117  for ( d = 0; d < ndims1; d++ )
1118  {
1119  at1[d] = ibox1.min[d];
1120  min1[d] = s1->extent.min[d];
1121  width1[d] = s1->extent.max[d] - s1->extent.min[d];
1122  size1[d] = (int)roundf(s1->size[d]);
1123  cellsize1[d] = width1[d] / size1[d];
1124  }
1125 
1126  /* Initialize counters / constants on s2 */
1127  for ( d = 0; d < ndims2; d++ )
1128  {
1129  min2[d] = s2->extent.min[d];
1130  width2[d] = s2->extent.max[d] - s2->extent.min[d];
1131  size2[d] = (int)roundf(s2->size[d]);
1132  cellsize2[d] = width2[d] / size2[d];
1133  }
1134 
1135  /* For each affected cell of s1... */
1136  do
1137  {
1138  double val1;
1139  /* Construct the bounds of this cell */
1140  ND_BOX nd_cell1;
1141  nd_box_init(&nd_cell1);
1142  for ( d = 0; d < ndims1; d++ )
1143  {
1144  nd_cell1.min[d] = min1[d] + (at1[d]+0) * cellsize1[d];
1145  nd_cell1.max[d] = min1[d] + (at1[d]+1) * cellsize1[d];
1146  }
1147 
1148  /* Find the cells of s2 that cell1 overlaps.. */
1149  nd_box_overlap(s2, &nd_cell1, &ibox2);
1150 
1151  /* Initialize counter */
1152  for ( d = 0; d < ndims2; d++ )
1153  {
1154  at2[d] = ibox2.min[d];
1155  }
1156 
1157  POSTGIS_DEBUGF(3, "at1 %d,%d %s", at1[0], at1[1], nd_box_to_json(&nd_cell1, ndims1));
1158 
1159  /* Get the value at this cell */
1160  val1 = s1->value[nd_stats_value_index(s1, at1)];
1161 
1162  /* For each overlapped cell of s2... */
1163  do
1164  {
1165  double ratio2;
1166  double val2;
1167 
1168  /* Construct the bounds of this cell */
1169  ND_BOX nd_cell2;
1170  nd_box_init(&nd_cell2);
1171  for ( d = 0; d < ndims2; d++ )
1172  {
1173  nd_cell2.min[d] = min2[d] + (at2[d]+0) * cellsize2[d];
1174  nd_cell2.max[d] = min2[d] + (at2[d]+1) * cellsize2[d];
1175  }
1176 
1177  POSTGIS_DEBUGF(3, " at2 %d,%d %s", at2[0], at2[1], nd_box_to_json(&nd_cell2, ndims2));
1178 
1179  /* Calculate overlap ratio of the cells */
1180  ratio2 = nd_box_ratio(&nd_cell1, &nd_cell2, Max(ndims1, ndims2));
1181 
1182  /* Multiply the cell counts, scaled by overlap ratio */
1183  val2 = s2->value[nd_stats_value_index(s2, at2)];
1184  POSTGIS_DEBUGF(3, " val1 %.6g val2 %.6g ratio %.6g", val1, val2, ratio2);
1185  val += val1 * (val2 * ratio2);
1186  }
1187  while ( nd_increment(&ibox2, ndims2, at2) );
1188 
1189  }
1190  while( nd_increment(&ibox1, ndims1, at1) );
1191 
1192  POSTGIS_DEBUGF(3, "val of histogram = %g", val);
1193 
1194  /*
1195  * In order to compare our total cell count "val" to the
1196  * ntuples_max, we need to scale val up to reflect a full
1197  * table estimate. So, multiply by ratio of table size to
1198  * sample size.
1199  */
1200  val *= (s1->table_features / s1->sample_features);
1201  val *= (s2->table_features / s2->sample_features);
1202 
1203  POSTGIS_DEBUGF(3, "val scaled to full table size = %g", val);
1204 
1205  /*
1206  * Because the cell counts are over-determined due to
1207  * double counting of features that overlap multiple cells
1208  * (see the compute_gserialized_stats routine)
1209  * we also have to scale our cell count "val" *down*
1210  * to adjust for the double counting.
1211  */
1212 // val /= (s1->cells_covered / s1->histogram_features);
1213 // val /= (s2->cells_covered / s2->histogram_features);
1214 
1215  /*
1216  * Finally, the selectivity is the estimated number of
1217  * rows to be returned divided by the maximum possible
1218  * number of rows that can be returned.
1219  */
1220  selectivity = val / ntuples_max;
1221 
1222  /* Guard against over-estimates and crazy numbers :) */
1223  if ( isnan(selectivity) || ! isfinite(selectivity) || selectivity < 0.0 )
1224  {
1225  selectivity = DEFAULT_ND_JOINSEL;
1226  }
1227  else if ( selectivity > 1.0 )
1228  {
1229  selectivity = 1.0;
1230  }
1231 
1232  return selectivity;
1233 }
static char * nd_stats_to_json(const ND_STATS *nd_stats)
Convert an ND_STATS to a JSON representation for external use.
static int nd_box_intersects(const ND_BOX *a, const ND_BOX *b, int ndims)
Return true if ND_BOX a overlaps b, false otherwise.
static int nd_increment(ND_IBOX *ibox, int ndims, int *counter)
Given an n-d index array (counter), and a domain to increment it in (ibox) increment it by one,...
#define ND_DIMS
The maximum number of dimensions our code can handle.
#define DEFAULT_ND_JOINSEL
#define FALLBACK_ND_SEL
More modest fallback selectivity factor.
#define FALLBACK_ND_JOINSEL
static int nd_box_init(ND_BOX *a)
Zero out an ND_BOX.
static int nd_box_overlap(const ND_STATS *nd_stats, const ND_BOX *nd_box, ND_IBOX *nd_ibox)
What stats cells overlap with this ND_BOX? Put the lowest cell addresses in ND_IBOX->min and the high...
static double nd_box_ratio(const ND_BOX *b1, const ND_BOX *b2, int ndims)
Returns the proportion of b2 that is covered by b1.
static int nd_stats_value_index(const ND_STATS *stats, int *indexes)
Given a position in the n-d histogram (i,j,k) return the position in the 1-d values array.
static char * nd_box_to_json(const ND_BOX *nd_box, int ndims)
Convert an ND_BOX to a JSON string for printing.
float4 max[ND_DIMS]
float4 min[ND_DIMS]
N-dimensional box type for calculations, to avoid doing explicit axis conversions from GBOX in all ca...
int min[ND_DIMS]
N-dimensional box index type.
float4 size[ND_DIMS]
N-dimensional statistics structure.

References DEFAULT_ND_JOINSEL, ND_STATS_T::extent, FALLBACK_ND_JOINSEL, FALLBACK_ND_SEL, ND_STATS_T::histogram_cells, ND_BOX_T::max, ND_BOX_T::min, ND_IBOX_T::min, nd_box_init(), nd_box_intersects(), nd_box_overlap(), nd_box_ratio(), nd_box_to_json(), ND_DIMS, nd_increment(), nd_stats_to_json(), nd_stats_value_index(), ND_STATS_T::ndims, ND_STATS_T::not_null_features, ND_STATS_T::sample_features, ND_STATS_T::size, ND_STATS_T::table_features, and ND_STATS_T::value.

Referenced by _postgis_gserialized_joinsel(), and gserialized_joinsel_internal().

Here is the call graph for this function:
Here is the caller graph for this function: