PostGIS  3.0.0dev-r@@SVN_REVISION@@

◆ estimate_join_selectivity()

static float8 estimate_join_selectivity ( const ND_STATS s1,
const ND_STATS s2 
)
static

Given two statistics histograms, what is the selectivity of a join driven by the && or &&& operator?

Join selectivity is defined as the number of rows returned by the join operator divided by the number of rows that an unconstrained join would return (nrows1*nrows2).

To get the estimate of join rows, we walk through the cells of one histogram, and multiply the cell value by the proportion of the cells in the other histogram the cell overlaps: val += val1 * ( val2 * overlap_ratio )

Definition at line 1031 of file gserialized_estimate.c.

References DEFAULT_ND_JOINSEL, ND_STATS_T::extent, FALLBACK_ND_JOINSEL, FALLBACK_ND_SEL, gserialized_gist_joinsel_nd(), ND_STATS_T::histogram_cells, ND_BOX_T::max, ND_BOX_T::min, ND_IBOX_T::min, nd_box_init(), nd_box_intersects(), nd_box_overlap(), nd_box_ratio(), nd_box_to_json(), ND_DIMS, nd_increment(), nd_stats_to_json(), nd_stats_value_index(), ND_STATS_T::ndims, ND_STATS_T::not_null_features, PG_FUNCTION_INFO_V1(), ND_STATS_T::sample_features, ND_STATS_T::size, ND_STATS_T::table_features, and ND_STATS_T::value.

Referenced by _postgis_gserialized_joinsel(), and gserialized_gist_joinsel().

1032 {
1033  int ncells1, ncells2;
1034  int ndims1, ndims2, ndims;
1035  double ntuples_max;
1036  double ntuples_not_null1, ntuples_not_null2;
1037 
1038  ND_BOX extent1, extent2;
1039  ND_IBOX ibox1, ibox2;
1040  int at1[ND_DIMS];
1041  int at2[ND_DIMS];
1042  double min1[ND_DIMS];
1043  double width1[ND_DIMS];
1044  double cellsize1[ND_DIMS];
1045  int size2[ND_DIMS];
1046  double min2[ND_DIMS];
1047  double width2[ND_DIMS];
1048  double cellsize2[ND_DIMS];
1049  int size1[ND_DIMS];
1050  int d;
1051  double val = 0;
1052  float8 selectivity;
1053 
1054  /* Drop out on null inputs */
1055  if ( ! ( s1 && s2 ) )
1056  {
1057  elog(NOTICE, " estimate_join_selectivity called with null inputs");
1058  return FALLBACK_ND_SEL;
1059  }
1060 
1061  /* We need to know how many cells each side has... */
1062  ncells1 = (int)roundf(s1->histogram_cells);
1063  ncells2 = (int)roundf(s2->histogram_cells);
1064 
1065  /* ...so that we can drive the summation loop with the smaller histogram. */
1066  if ( ncells1 > ncells2 )
1067  {
1068  const ND_STATS *stats_tmp = s1;
1069  s1 = s2;
1070  s2 = stats_tmp;
1071  }
1072 
1073  POSTGIS_DEBUGF(3, "s1: %s", nd_stats_to_json(s1));
1074  POSTGIS_DEBUGF(3, "s2: %s", nd_stats_to_json(s2));
1075 
1076  /* Re-read that info after the swap */
1077  ncells1 = (int)roundf(s1->histogram_cells);
1078  ncells2 = (int)roundf(s2->histogram_cells);
1079 
1080  /* Q: What's the largest possible join size these relations can create? */
1081  /* A: The product of the # of non-null rows in each relation. */
1082  ntuples_not_null1 = s1->table_features * (s1->not_null_features / s1->sample_features);
1083  ntuples_not_null2 = s2->table_features * (s2->not_null_features / s2->sample_features);
1084  ntuples_max = ntuples_not_null1 * ntuples_not_null2;
1085 
1086  /* Get the ndims as ints */
1087  ndims1 = (int)roundf(s1->ndims);
1088  ndims2 = (int)roundf(s2->ndims);
1089  ndims = Max(ndims1, ndims2);
1090 
1091  /* Get the extents */
1092  extent1 = s1->extent;
1093  extent2 = s2->extent;
1094 
1095  /* If relation stats do not intersect, join is very very selective. */
1096  if ( ! nd_box_intersects(&extent1, &extent2, ndims) )
1097  {
1098  POSTGIS_DEBUG(3, "relation stats do not intersect, returning 0");
1099  PG_RETURN_FLOAT8(0.0);
1100  }
1101 
1102  /*
1103  * First find the index range of the part of the smaller
1104  * histogram that overlaps the larger one.
1105  */
1106  if ( ! nd_box_overlap(s1, &extent2, &ibox1) )
1107  {
1108  POSTGIS_DEBUG(3, "could not calculate overlap of relations");
1109  PG_RETURN_FLOAT8(FALLBACK_ND_JOINSEL);
1110  }
1111 
1112  /* Initialize counters / constants on s1 */
1113  for ( d = 0; d < ndims1; d++ )
1114  {
1115  at1[d] = ibox1.min[d];
1116  min1[d] = s1->extent.min[d];
1117  width1[d] = s1->extent.max[d] - s1->extent.min[d];
1118  size1[d] = (int)roundf(s1->size[d]);
1119  cellsize1[d] = width1[d] / size1[d];
1120  }
1121 
1122  /* Initialize counters / constants on s2 */
1123  for ( d = 0; d < ndims2; d++ )
1124  {
1125  min2[d] = s2->extent.min[d];
1126  width2[d] = s2->extent.max[d] - s2->extent.min[d];
1127  size2[d] = (int)roundf(s2->size[d]);
1128  cellsize2[d] = width2[d] / size2[d];
1129  }
1130 
1131  /* For each affected cell of s1... */
1132  do
1133  {
1134  double val1;
1135  /* Construct the bounds of this cell */
1136  ND_BOX nd_cell1;
1137  nd_box_init(&nd_cell1);
1138  for ( d = 0; d < ndims1; d++ )
1139  {
1140  nd_cell1.min[d] = min1[d] + (at1[d]+0) * cellsize1[d];
1141  nd_cell1.max[d] = min1[d] + (at1[d]+1) * cellsize1[d];
1142  }
1143 
1144  /* Find the cells of s2 that cell1 overlaps.. */
1145  nd_box_overlap(s2, &nd_cell1, &ibox2);
1146 
1147  /* Initialize counter */
1148  for ( d = 0; d < ndims2; d++ )
1149  {
1150  at2[d] = ibox2.min[d];
1151  }
1152 
1153  POSTGIS_DEBUGF(3, "at1 %d,%d %s", at1[0], at1[1], nd_box_to_json(&nd_cell1, ndims1));
1154 
1155  /* Get the value at this cell */
1156  val1 = s1->value[nd_stats_value_index(s1, at1)];
1157 
1158  /* For each overlapped cell of s2... */
1159  do
1160  {
1161  double ratio2;
1162  double val2;
1163 
1164  /* Construct the bounds of this cell */
1165  ND_BOX nd_cell2;
1166  nd_box_init(&nd_cell2);
1167  for ( d = 0; d < ndims2; d++ )
1168  {
1169  nd_cell2.min[d] = min2[d] + (at2[d]+0) * cellsize2[d];
1170  nd_cell2.max[d] = min2[d] + (at2[d]+1) * cellsize2[d];
1171  }
1172 
1173  POSTGIS_DEBUGF(3, " at2 %d,%d %s", at2[0], at2[1], nd_box_to_json(&nd_cell2, ndims2));
1174 
1175  /* Calculate overlap ratio of the cells */
1176  ratio2 = nd_box_ratio(&nd_cell1, &nd_cell2, Max(ndims1, ndims2));
1177 
1178  /* Multiply the cell counts, scaled by overlap ratio */
1179  val2 = s2->value[nd_stats_value_index(s2, at2)];
1180  POSTGIS_DEBUGF(3, " val1 %.6g val2 %.6g ratio %.6g", val1, val2, ratio2);
1181  val += val1 * (val2 * ratio2);
1182  }
1183  while ( nd_increment(&ibox2, ndims2, at2) );
1184 
1185  }
1186  while( nd_increment(&ibox1, ndims1, at1) );
1187 
1188  POSTGIS_DEBUGF(3, "val of histogram = %g", val);
1189 
1190  /*
1191  * In order to compare our total cell count "val" to the
1192  * ntuples_max, we need to scale val up to reflect a full
1193  * table estimate. So, multiply by ratio of table size to
1194  * sample size.
1195  */
1196  val *= (s1->table_features / s1->sample_features);
1197  val *= (s2->table_features / s2->sample_features);
1198 
1199  POSTGIS_DEBUGF(3, "val scaled to full table size = %g", val);
1200 
1201  /*
1202  * Because the cell counts are over-determined due to
1203  * double counting of features that overlap multiple cells
1204  * (see the compute_gserialized_stats routine)
1205  * we also have to scale our cell count "val" *down*
1206  * to adjust for the double counting.
1207  */
1208 // val /= (s1->cells_covered / s1->histogram_features);
1209 // val /= (s2->cells_covered / s2->histogram_features);
1210 
1211  /*
1212  * Finally, the selectivity is the estimated number of
1213  * rows to be returned divided by the maximum possible
1214  * number of rows that can be returned.
1215  */
1216  selectivity = val / ntuples_max;
1217 
1218  /* Guard against over-estimates and crazy numbers :) */
1219  if ( isnan(selectivity) || ! isfinite(selectivity) || selectivity < 0.0 )
1220  {
1221  selectivity = DEFAULT_ND_JOINSEL;
1222  }
1223  else if ( selectivity > 1.0 )
1224  {
1225  selectivity = 1.0;
1226  }
1227 
1228  return selectivity;
1229 }
static int nd_increment(ND_IBOX *ibox, int ndims, int *counter)
Given an n-d index array (counter), and a domain to increment it in (ibox) increment it by one...
static int nd_box_overlap(const ND_STATS *nd_stats, const ND_BOX *nd_box, ND_IBOX *nd_ibox)
What stats cells overlap with this ND_BOX? Put the lowest cell addresses in ND_IBOX->min and the high...
#define DEFAULT_ND_JOINSEL
#define ND_DIMS
The maximum number of dimensions our code can handle.
#define FALLBACK_ND_JOINSEL
#define FALLBACK_ND_SEL
More modest fallback selectivity factor.
static int nd_box_init(ND_BOX *a)
Zero out an ND_BOX.
static int nd_stats_value_index(const ND_STATS *stats, int *indexes)
Given a position in the n-d histogram (i,j,k) return the position in the 1-d values array...
float4 size[ND_DIMS]
static int nd_box_intersects(const ND_BOX *a, const ND_BOX *b, int ndims)
Return true if ND_BOX a overlaps b, false otherwise.
int min[ND_DIMS]
N-dimensional box index type.
static char * nd_box_to_json(const ND_BOX *nd_box, int ndims)
Convert an ND_BOX to a JSON string for printing.
float4 max[ND_DIMS]
float4 min[ND_DIMS]
static char * nd_stats_to_json(const ND_STATS *nd_stats)
Convert an ND_STATS to a JSON representation for external use.
static double nd_box_ratio(const ND_BOX *b1, const ND_BOX *b2, int ndims)
Returns the proportion of b2 that is covered by b1.
N-dimensional statistics structure.
N-dimensional box type for calculations, to avoid doing explicit axis conversions from GBOX in all ca...
Here is the call graph for this function:
Here is the caller graph for this function: