PostGIS  2.5.2dev-r@@SVN_REVISION@@

◆ estimate_join_selectivity()

static float8 estimate_join_selectivity ( const ND_STATS s1,
const ND_STATS s2 
)
static

Given two statistics histograms, what is the selectivity of a join driven by the && or &&& operator?

Join selectivity is defined as the number of rows returned by the join operator divided by the number of rows that an unconstrained join would return (nrows1*nrows2).

To get the estimate of join rows, we walk through the cells of one histogram, and multiply the cell value by the proportion of the cells in the other histogram the cell overlaps: val += val1 * ( val2 * overlap_ratio )

Definition at line 1041 of file gserialized_estimate.c.

References DEFAULT_ND_JOINSEL, ND_STATS_T::extent, FALLBACK_ND_JOINSEL, FALLBACK_ND_SEL, gserialized_gist_joinsel_nd(), ND_STATS_T::histogram_cells, ND_BOX_T::max, ND_BOX_T::min, ND_IBOX_T::min, nd_box_init(), nd_box_intersects(), nd_box_overlap(), nd_box_ratio(), nd_box_to_json(), ND_DIMS, nd_increment(), nd_stats_to_json(), nd_stats_value_index(), ND_STATS_T::ndims, ND_STATS_T::not_null_features, PG_FUNCTION_INFO_V1(), ND_STATS_T::sample_features, ND_STATS_T::size, ND_STATS_T::table_features, and ND_STATS_T::value.

Referenced by _postgis_gserialized_joinsel(), and gserialized_gist_joinsel().

1042 {
1043  int ncells1, ncells2;
1044  int ndims1, ndims2, ndims;
1045  double ntuples_max;
1046  double ntuples_not_null1, ntuples_not_null2;
1047 
1048  ND_BOX extent1, extent2;
1049  ND_IBOX ibox1, ibox2;
1050  int at1[ND_DIMS];
1051  int at2[ND_DIMS];
1052  double min1[ND_DIMS];
1053  double width1[ND_DIMS];
1054  double cellsize1[ND_DIMS];
1055  int size2[ND_DIMS];
1056  double min2[ND_DIMS];
1057  double width2[ND_DIMS];
1058  double cellsize2[ND_DIMS];
1059  int size1[ND_DIMS];
1060  int d;
1061  double val = 0;
1062  float8 selectivity;
1063 
1064  /* Drop out on null inputs */
1065  if ( ! ( s1 && s2 ) )
1066  {
1067  elog(NOTICE, " estimate_join_selectivity called with null inputs");
1068  return FALLBACK_ND_SEL;
1069  }
1070 
1071  /* We need to know how many cells each side has... */
1072  ncells1 = (int)roundf(s1->histogram_cells);
1073  ncells2 = (int)roundf(s2->histogram_cells);
1074 
1075  /* ...so that we can drive the summation loop with the smaller histogram. */
1076  if ( ncells1 > ncells2 )
1077  {
1078  const ND_STATS *stats_tmp = s1;
1079  s1 = s2;
1080  s2 = stats_tmp;
1081  }
1082 
1083  POSTGIS_DEBUGF(3, "s1: %s", nd_stats_to_json(s1));
1084  POSTGIS_DEBUGF(3, "s2: %s", nd_stats_to_json(s2));
1085 
1086  /* Re-read that info after the swap */
1087  ncells1 = (int)roundf(s1->histogram_cells);
1088  ncells2 = (int)roundf(s2->histogram_cells);
1089 
1090  /* Q: What's the largest possible join size these relations can create? */
1091  /* A: The product of the # of non-null rows in each relation. */
1092  ntuples_not_null1 = s1->table_features * (s1->not_null_features / s1->sample_features);
1093  ntuples_not_null2 = s2->table_features * (s2->not_null_features / s2->sample_features);
1094  ntuples_max = ntuples_not_null1 * ntuples_not_null2;
1095 
1096  /* Get the ndims as ints */
1097  ndims1 = (int)roundf(s1->ndims);
1098  ndims2 = (int)roundf(s2->ndims);
1099  ndims = Max(ndims1, ndims2);
1100 
1101  /* Get the extents */
1102  extent1 = s1->extent;
1103  extent2 = s2->extent;
1104 
1105  /* If relation stats do not intersect, join is very very selective. */
1106  if ( ! nd_box_intersects(&extent1, &extent2, ndims) )
1107  {
1108  POSTGIS_DEBUG(3, "relation stats do not intersect, returning 0");
1109  PG_RETURN_FLOAT8(0.0);
1110  }
1111 
1112  /*
1113  * First find the index range of the part of the smaller
1114  * histogram that overlaps the larger one.
1115  */
1116  if ( ! nd_box_overlap(s1, &extent2, &ibox1) )
1117  {
1118  POSTGIS_DEBUG(3, "could not calculate overlap of relations");
1119  PG_RETURN_FLOAT8(FALLBACK_ND_JOINSEL);
1120  }
1121 
1122  /* Initialize counters / constants on s1 */
1123  for ( d = 0; d < ndims1; d++ )
1124  {
1125  at1[d] = ibox1.min[d];
1126  min1[d] = s1->extent.min[d];
1127  width1[d] = s1->extent.max[d] - s1->extent.min[d];
1128  size1[d] = (int)roundf(s1->size[d]);
1129  cellsize1[d] = width1[d] / size1[d];
1130  }
1131 
1132  /* Initialize counters / constants on s2 */
1133  for ( d = 0; d < ndims2; d++ )
1134  {
1135  min2[d] = s2->extent.min[d];
1136  width2[d] = s2->extent.max[d] - s2->extent.min[d];
1137  size2[d] = (int)roundf(s2->size[d]);
1138  cellsize2[d] = width2[d] / size2[d];
1139  }
1140 
1141  /* For each affected cell of s1... */
1142  do
1143  {
1144  double val1;
1145  /* Construct the bounds of this cell */
1146  ND_BOX nd_cell1;
1147  nd_box_init(&nd_cell1);
1148  for ( d = 0; d < ndims1; d++ )
1149  {
1150  nd_cell1.min[d] = min1[d] + (at1[d]+0) * cellsize1[d];
1151  nd_cell1.max[d] = min1[d] + (at1[d]+1) * cellsize1[d];
1152  }
1153 
1154  /* Find the cells of s2 that cell1 overlaps.. */
1155  nd_box_overlap(s2, &nd_cell1, &ibox2);
1156 
1157  /* Initialize counter */
1158  for ( d = 0; d < ndims2; d++ )
1159  {
1160  at2[d] = ibox2.min[d];
1161  }
1162 
1163  POSTGIS_DEBUGF(3, "at1 %d,%d %s", at1[0], at1[1], nd_box_to_json(&nd_cell1, ndims1));
1164 
1165  /* Get the value at this cell */
1166  val1 = s1->value[nd_stats_value_index(s1, at1)];
1167 
1168  /* For each overlapped cell of s2... */
1169  do
1170  {
1171  double ratio2;
1172  double val2;
1173 
1174  /* Construct the bounds of this cell */
1175  ND_BOX nd_cell2;
1176  nd_box_init(&nd_cell2);
1177  for ( d = 0; d < ndims2; d++ )
1178  {
1179  nd_cell2.min[d] = min2[d] + (at2[d]+0) * cellsize2[d];
1180  nd_cell2.max[d] = min2[d] + (at2[d]+1) * cellsize2[d];
1181  }
1182 
1183  POSTGIS_DEBUGF(3, " at2 %d,%d %s", at2[0], at2[1], nd_box_to_json(&nd_cell2, ndims2));
1184 
1185  /* Calculate overlap ratio of the cells */
1186  ratio2 = nd_box_ratio(&nd_cell1, &nd_cell2, Max(ndims1, ndims2));
1187 
1188  /* Multiply the cell counts, scaled by overlap ratio */
1189  val2 = s2->value[nd_stats_value_index(s2, at2)];
1190  POSTGIS_DEBUGF(3, " val1 %.6g val2 %.6g ratio %.6g", val1, val2, ratio2);
1191  val += val1 * (val2 * ratio2);
1192  }
1193  while ( nd_increment(&ibox2, ndims2, at2) );
1194 
1195  }
1196  while( nd_increment(&ibox1, ndims1, at1) );
1197 
1198  POSTGIS_DEBUGF(3, "val of histogram = %g", val);
1199 
1200  /*
1201  * In order to compare our total cell count "val" to the
1202  * ntuples_max, we need to scale val up to reflect a full
1203  * table estimate. So, multiply by ratio of table size to
1204  * sample size.
1205  */
1206  val *= (s1->table_features / s1->sample_features);
1207  val *= (s2->table_features / s2->sample_features);
1208 
1209  POSTGIS_DEBUGF(3, "val scaled to full table size = %g", val);
1210 
1211  /*
1212  * Because the cell counts are over-determined due to
1213  * double counting of features that overlap multiple cells
1214  * (see the compute_gserialized_stats routine)
1215  * we also have to scale our cell count "val" *down*
1216  * to adjust for the double counting.
1217  */
1218 // val /= (s1->cells_covered / s1->histogram_features);
1219 // val /= (s2->cells_covered / s2->histogram_features);
1220 
1221  /*
1222  * Finally, the selectivity is the estimated number of
1223  * rows to be returned divided by the maximum possible
1224  * number of rows that can be returned.
1225  */
1226  selectivity = val / ntuples_max;
1227 
1228  /* Guard against over-estimates and crazy numbers :) */
1229  if ( isnan(selectivity) || ! isfinite(selectivity) || selectivity < 0.0 )
1230  {
1231  selectivity = DEFAULT_ND_JOINSEL;
1232  }
1233  else if ( selectivity > 1.0 )
1234  {
1235  selectivity = 1.0;
1236  }
1237 
1238  return selectivity;
1239 }
static int nd_increment(ND_IBOX *ibox, int ndims, int *counter)
Given an n-d index array (counter), and a domain to increment it in (ibox) increment it by one...
static int nd_box_overlap(const ND_STATS *nd_stats, const ND_BOX *nd_box, ND_IBOX *nd_ibox)
What stats cells overlap with this ND_BOX? Put the lowest cell addresses in ND_IBOX->min and the high...
#define DEFAULT_ND_JOINSEL
#define ND_DIMS
The maximum number of dimensions our code can handle.
#define FALLBACK_ND_JOINSEL
#define FALLBACK_ND_SEL
More modest fallback selectivity factor.
static int nd_box_init(ND_BOX *a)
Zero out an ND_BOX.
static int nd_stats_value_index(const ND_STATS *stats, int *indexes)
Given a position in the n-d histogram (i,j,k) return the position in the 1-d values array...
float4 size[ND_DIMS]
static int nd_box_intersects(const ND_BOX *a, const ND_BOX *b, int ndims)
Return true if ND_BOX a overlaps b, false otherwise.
int min[ND_DIMS]
N-dimensional box index type.
static char * nd_box_to_json(const ND_BOX *nd_box, int ndims)
Convert an ND_BOX to a JSON string for printing.
float4 max[ND_DIMS]
float4 min[ND_DIMS]
static char * nd_stats_to_json(const ND_STATS *nd_stats)
Convert an ND_STATS to a JSON representation for external use.
static double nd_box_ratio(const ND_BOX *b1, const ND_BOX *b2, int ndims)
Returns the proportion of b2 that is covered by b1.
N-dimensional statistics structure.
N-dimensional box type for calculations, to avoid doing explicit axis conversions from GBOX in all ca...
Here is the call graph for this function:
Here is the caller graph for this function: