PostGIS  2.5.7dev-r@@SVN_REVISION@@

◆ estimate_join_selectivity()

static float8 estimate_join_selectivity ( const ND_STATS s1,
const ND_STATS s2 
)
static

Given two statistics histograms, what is the selectivity of a join driven by the && or &&& operator?

Join selectivity is defined as the number of rows returned by the join operator divided by the number of rows that an unconstrained join would return (nrows1*nrows2).

To get the estimate of join rows, we walk through the cells of one histogram, and multiply the cell value by the proportion of the cells in the other histogram the cell overlaps: val += val1 * ( val2 * overlap_ratio )

Definition at line 1037 of file gserialized_estimate.c.

1038 {
1039  int ncells1, ncells2;
1040  int ndims1, ndims2, ndims;
1041  double ntuples_max;
1042  double ntuples_not_null1, ntuples_not_null2;
1043 
1044  ND_BOX extent1, extent2;
1045  ND_IBOX ibox1, ibox2;
1046  int at1[ND_DIMS];
1047  int at2[ND_DIMS];
1048  double min1[ND_DIMS];
1049  double width1[ND_DIMS];
1050  double cellsize1[ND_DIMS];
1051  int size2[ND_DIMS];
1052  double min2[ND_DIMS];
1053  double width2[ND_DIMS];
1054  double cellsize2[ND_DIMS];
1055  int size1[ND_DIMS];
1056  int d;
1057  double val = 0;
1058  float8 selectivity;
1059 
1060  /* Drop out on null inputs */
1061  if ( ! ( s1 && s2 ) )
1062  {
1063  elog(NOTICE, " estimate_join_selectivity called with null inputs");
1064  return FALLBACK_ND_SEL;
1065  }
1066 
1067  /* We need to know how many cells each side has... */
1068  ncells1 = (int)roundf(s1->histogram_cells);
1069  ncells2 = (int)roundf(s2->histogram_cells);
1070 
1071  /* ...so that we can drive the summation loop with the smaller histogram. */
1072  if ( ncells1 > ncells2 )
1073  {
1074  const ND_STATS *stats_tmp = s1;
1075  s1 = s2;
1076  s2 = stats_tmp;
1077  }
1078 
1079  POSTGIS_DEBUGF(3, "s1: %s", nd_stats_to_json(s1));
1080  POSTGIS_DEBUGF(3, "s2: %s", nd_stats_to_json(s2));
1081 
1082  /* Re-read that info after the swap */
1083  ncells1 = (int)roundf(s1->histogram_cells);
1084  ncells2 = (int)roundf(s2->histogram_cells);
1085 
1086  /* Q: What's the largest possible join size these relations can create? */
1087  /* A: The product of the # of non-null rows in each relation. */
1088  ntuples_not_null1 = s1->table_features * (s1->not_null_features / s1->sample_features);
1089  ntuples_not_null2 = s2->table_features * (s2->not_null_features / s2->sample_features);
1090  ntuples_max = ntuples_not_null1 * ntuples_not_null2;
1091 
1092  /* Get the ndims as ints */
1093  ndims1 = (int)roundf(s1->ndims);
1094  ndims2 = (int)roundf(s2->ndims);
1095  ndims = Max(ndims1, ndims2);
1096 
1097  /* Get the extents */
1098  extent1 = s1->extent;
1099  extent2 = s2->extent;
1100 
1101  /* If relation stats do not intersect, join is very very selective. */
1102  if ( ! nd_box_intersects(&extent1, &extent2, ndims) )
1103  {
1104  POSTGIS_DEBUG(3, "relation stats do not intersect, returning 0");
1105  PG_RETURN_FLOAT8(0.0);
1106  }
1107 
1108  /*
1109  * First find the index range of the part of the smaller
1110  * histogram that overlaps the larger one.
1111  */
1112  if ( ! nd_box_overlap(s1, &extent2, &ibox1) )
1113  {
1114  POSTGIS_DEBUG(3, "could not calculate overlap of relations");
1115  PG_RETURN_FLOAT8(FALLBACK_ND_JOINSEL);
1116  }
1117 
1118  /* Initialize counters / constants on s1 */
1119  for ( d = 0; d < ndims1; d++ )
1120  {
1121  at1[d] = ibox1.min[d];
1122  min1[d] = s1->extent.min[d];
1123  width1[d] = s1->extent.max[d] - s1->extent.min[d];
1124  size1[d] = (int)roundf(s1->size[d]);
1125  cellsize1[d] = width1[d] / size1[d];
1126  }
1127 
1128  /* Initialize counters / constants on s2 */
1129  for ( d = 0; d < ndims2; d++ )
1130  {
1131  min2[d] = s2->extent.min[d];
1132  width2[d] = s2->extent.max[d] - s2->extent.min[d];
1133  size2[d] = (int)roundf(s2->size[d]);
1134  cellsize2[d] = width2[d] / size2[d];
1135  }
1136 
1137  /* For each affected cell of s1... */
1138  do
1139  {
1140  double val1;
1141  /* Construct the bounds of this cell */
1142  ND_BOX nd_cell1;
1143  nd_box_init(&nd_cell1);
1144  for ( d = 0; d < ndims1; d++ )
1145  {
1146  nd_cell1.min[d] = min1[d] + (at1[d]+0) * cellsize1[d];
1147  nd_cell1.max[d] = min1[d] + (at1[d]+1) * cellsize1[d];
1148  }
1149 
1150  /* Find the cells of s2 that cell1 overlaps.. */
1151  nd_box_overlap(s2, &nd_cell1, &ibox2);
1152 
1153  /* Initialize counter */
1154  for ( d = 0; d < ndims2; d++ )
1155  {
1156  at2[d] = ibox2.min[d];
1157  }
1158 
1159  POSTGIS_DEBUGF(3, "at1 %d,%d %s", at1[0], at1[1], nd_box_to_json(&nd_cell1, ndims1));
1160 
1161  /* Get the value at this cell */
1162  val1 = s1->value[nd_stats_value_index(s1, at1)];
1163 
1164  /* For each overlapped cell of s2... */
1165  do
1166  {
1167  double ratio2;
1168  double val2;
1169 
1170  /* Construct the bounds of this cell */
1171  ND_BOX nd_cell2;
1172  nd_box_init(&nd_cell2);
1173  for ( d = 0; d < ndims2; d++ )
1174  {
1175  nd_cell2.min[d] = min2[d] + (at2[d]+0) * cellsize2[d];
1176  nd_cell2.max[d] = min2[d] + (at2[d]+1) * cellsize2[d];
1177  }
1178 
1179  POSTGIS_DEBUGF(3, " at2 %d,%d %s", at2[0], at2[1], nd_box_to_json(&nd_cell2, ndims2));
1180 
1181  /* Calculate overlap ratio of the cells */
1182  ratio2 = nd_box_ratio(&nd_cell1, &nd_cell2, Max(ndims1, ndims2));
1183 
1184  /* Multiply the cell counts, scaled by overlap ratio */
1185  val2 = s2->value[nd_stats_value_index(s2, at2)];
1186  POSTGIS_DEBUGF(3, " val1 %.6g val2 %.6g ratio %.6g", val1, val2, ratio2);
1187  val += val1 * (val2 * ratio2);
1188  }
1189  while ( nd_increment(&ibox2, ndims2, at2) );
1190 
1191  }
1192  while( nd_increment(&ibox1, ndims1, at1) );
1193 
1194  POSTGIS_DEBUGF(3, "val of histogram = %g", val);
1195 
1196  /*
1197  * In order to compare our total cell count "val" to the
1198  * ntuples_max, we need to scale val up to reflect a full
1199  * table estimate. So, multiply by ratio of table size to
1200  * sample size.
1201  */
1202  val *= (s1->table_features / s1->sample_features);
1203  val *= (s2->table_features / s2->sample_features);
1204 
1205  POSTGIS_DEBUGF(3, "val scaled to full table size = %g", val);
1206 
1207  /*
1208  * Because the cell counts are over-determined due to
1209  * double counting of features that overlap multiple cells
1210  * (see the compute_gserialized_stats routine)
1211  * we also have to scale our cell count "val" *down*
1212  * to adjust for the double counting.
1213  */
1214 // val /= (s1->cells_covered / s1->histogram_features);
1215 // val /= (s2->cells_covered / s2->histogram_features);
1216 
1217  /*
1218  * Finally, the selectivity is the estimated number of
1219  * rows to be returned divided by the maximum possible
1220  * number of rows that can be returned.
1221  */
1222  selectivity = val / ntuples_max;
1223 
1224  /* Guard against over-estimates and crazy numbers :) */
1225  if ( isnan(selectivity) || ! isfinite(selectivity) || selectivity < 0.0 )
1226  {
1227  selectivity = DEFAULT_ND_JOINSEL;
1228  }
1229  else if ( selectivity > 1.0 )
1230  {
1231  selectivity = 1.0;
1232  }
1233 
1234  return selectivity;
1235 }
static char * nd_stats_to_json(const ND_STATS *nd_stats)
Convert an ND_STATS to a JSON representation for external use.
static int nd_box_intersects(const ND_BOX *a, const ND_BOX *b, int ndims)
Return true if ND_BOX a overlaps b, false otherwise.
static int nd_increment(ND_IBOX *ibox, int ndims, int *counter)
Given an n-d index array (counter), and a domain to increment it in (ibox) increment it by one,...
#define ND_DIMS
The maximum number of dimensions our code can handle.
#define DEFAULT_ND_JOINSEL
#define FALLBACK_ND_SEL
More modest fallback selectivity factor.
#define FALLBACK_ND_JOINSEL
static int nd_box_init(ND_BOX *a)
Zero out an ND_BOX.
static int nd_box_overlap(const ND_STATS *nd_stats, const ND_BOX *nd_box, ND_IBOX *nd_ibox)
What stats cells overlap with this ND_BOX? Put the lowest cell addresses in ND_IBOX->min and the high...
static double nd_box_ratio(const ND_BOX *b1, const ND_BOX *b2, int ndims)
Returns the proportion of b2 that is covered by b1.
static int nd_stats_value_index(const ND_STATS *stats, int *indexes)
Given a position in the n-d histogram (i,j,k) return the position in the 1-d values array.
static char * nd_box_to_json(const ND_BOX *nd_box, int ndims)
Convert an ND_BOX to a JSON string for printing.
float4 max[ND_DIMS]
float4 min[ND_DIMS]
N-dimensional box type for calculations, to avoid doing explicit axis conversions from GBOX in all ca...
int min[ND_DIMS]
N-dimensional box index type.
float4 size[ND_DIMS]
N-dimensional statistics structure.

References DEFAULT_ND_JOINSEL, ND_STATS_T::extent, FALLBACK_ND_JOINSEL, FALLBACK_ND_SEL, ND_STATS_T::histogram_cells, ND_BOX_T::max, ND_BOX_T::min, ND_IBOX_T::min, nd_box_init(), nd_box_intersects(), nd_box_overlap(), nd_box_ratio(), nd_box_to_json(), ND_DIMS, nd_increment(), nd_stats_to_json(), nd_stats_value_index(), ND_STATS_T::ndims, ND_STATS_T::not_null_features, ND_STATS_T::sample_features, ND_STATS_T::size, ND_STATS_T::table_features, and ND_STATS_T::value.

Referenced by _postgis_gserialized_joinsel(), and gserialized_gist_joinsel().

Here is the call graph for this function:
Here is the caller graph for this function: