PostGIS  2.5.0beta1dev-r@@SVN_REVISION@@

◆ estimate_join_selectivity()

static float8 estimate_join_selectivity ( const ND_STATS s1,
const ND_STATS s2 
)
static

Given two statistics histograms, what is the selectivity of a join driven by the && or &&& operator?

Join selectivity is defined as the number of rows returned by the join operator divided by the number of rows that an unconstrained join would return (nrows1*nrows2).

To get the estimate of join rows, we walk through the cells of one histogram, and multiply the cell value by the proportion of the cells in the other histogram the cell overlaps: val += val1 * ( val2 * overlap_ratio )

Definition at line 1018 of file gserialized_estimate.c.

References DEFAULT_ND_JOINSEL, ND_STATS_T::extent, FALLBACK_ND_JOINSEL, FALLBACK_ND_SEL, gserialized_gist_joinsel_nd(), ND_STATS_T::histogram_cells, ND_BOX_T::max, ND_BOX_T::min, ND_IBOX_T::min, nd_box_init(), nd_box_intersects(), nd_box_overlap(), nd_box_ratio(), nd_box_to_json(), ND_DIMS, nd_increment(), nd_stats_to_json(), nd_stats_value_index(), ND_STATS_T::ndims, ND_STATS_T::not_null_features, PG_FUNCTION_INFO_V1(), ND_STATS_T::sample_features, ND_STATS_T::size, ND_STATS_T::table_features, and ND_STATS_T::value.

Referenced by _postgis_gserialized_joinsel(), and gserialized_gist_joinsel().

1019 {
1020  int ncells1, ncells2;
1021  int ndims1, ndims2, ndims;
1022  double ntuples_max;
1023  double ntuples_not_null1, ntuples_not_null2;
1024 
1025  ND_BOX extent1, extent2;
1026  ND_IBOX ibox1, ibox2;
1027  int at1[ND_DIMS];
1028  int at2[ND_DIMS];
1029  double min1[ND_DIMS];
1030  double width1[ND_DIMS];
1031  double cellsize1[ND_DIMS];
1032  int size2[ND_DIMS];
1033  double min2[ND_DIMS];
1034  double width2[ND_DIMS];
1035  double cellsize2[ND_DIMS];
1036  int size1[ND_DIMS];
1037  int d;
1038  double val = 0;
1039  float8 selectivity;
1040 
1041  /* Drop out on null inputs */
1042  if ( ! ( s1 && s2 ) )
1043  {
1044  elog(NOTICE, " estimate_join_selectivity called with null inputs");
1045  return FALLBACK_ND_SEL;
1046  }
1047 
1048  /* We need to know how many cells each side has... */
1049  ncells1 = (int)roundf(s1->histogram_cells);
1050  ncells2 = (int)roundf(s2->histogram_cells);
1051 
1052  /* ...so that we can drive the summation loop with the smaller histogram. */
1053  if ( ncells1 > ncells2 )
1054  {
1055  const ND_STATS *stats_tmp = s1;
1056  s1 = s2;
1057  s2 = stats_tmp;
1058  }
1059 
1060  POSTGIS_DEBUGF(3, "s1: %s", nd_stats_to_json(s1));
1061  POSTGIS_DEBUGF(3, "s2: %s", nd_stats_to_json(s2));
1062 
1063  /* Re-read that info after the swap */
1064  ncells1 = (int)roundf(s1->histogram_cells);
1065  ncells2 = (int)roundf(s2->histogram_cells);
1066 
1067  /* Q: What's the largest possible join size these relations can create? */
1068  /* A: The product of the # of non-null rows in each relation. */
1069  ntuples_not_null1 = s1->table_features * (s1->not_null_features / s1->sample_features);
1070  ntuples_not_null2 = s2->table_features * (s2->not_null_features / s2->sample_features);
1071  ntuples_max = ntuples_not_null1 * ntuples_not_null2;
1072 
1073  /* Get the ndims as ints */
1074  ndims1 = (int)roundf(s1->ndims);
1075  ndims2 = (int)roundf(s2->ndims);
1076  ndims = Max(ndims1, ndims2);
1077 
1078  /* Get the extents */
1079  extent1 = s1->extent;
1080  extent2 = s2->extent;
1081 
1082  /* If relation stats do not intersect, join is very very selective. */
1083  if ( ! nd_box_intersects(&extent1, &extent2, ndims) )
1084  {
1085  POSTGIS_DEBUG(3, "relation stats do not intersect, returning 0");
1086  PG_RETURN_FLOAT8(0.0);
1087  }
1088 
1089  /*
1090  * First find the index range of the part of the smaller
1091  * histogram that overlaps the larger one.
1092  */
1093  if ( ! nd_box_overlap(s1, &extent2, &ibox1) )
1094  {
1095  POSTGIS_DEBUG(3, "could not calculate overlap of relations");
1096  PG_RETURN_FLOAT8(FALLBACK_ND_JOINSEL);
1097  }
1098 
1099  /* Initialize counters / constants on s1 */
1100  for ( d = 0; d < ndims1; d++ )
1101  {
1102  at1[d] = ibox1.min[d];
1103  min1[d] = s1->extent.min[d];
1104  width1[d] = s1->extent.max[d] - s1->extent.min[d];
1105  size1[d] = (int)roundf(s1->size[d]);
1106  cellsize1[d] = width1[d] / size1[d];
1107  }
1108 
1109  /* Initialize counters / constants on s2 */
1110  for ( d = 0; d < ndims2; d++ )
1111  {
1112  min2[d] = s2->extent.min[d];
1113  width2[d] = s2->extent.max[d] - s2->extent.min[d];
1114  size2[d] = (int)roundf(s2->size[d]);
1115  cellsize2[d] = width2[d] / size2[d];
1116  }
1117 
1118  /* For each affected cell of s1... */
1119  do
1120  {
1121  double val1;
1122  /* Construct the bounds of this cell */
1123  ND_BOX nd_cell1;
1124  nd_box_init(&nd_cell1);
1125  for ( d = 0; d < ndims1; d++ )
1126  {
1127  nd_cell1.min[d] = min1[d] + (at1[d]+0) * cellsize1[d];
1128  nd_cell1.max[d] = min1[d] + (at1[d]+1) * cellsize1[d];
1129  }
1130 
1131  /* Find the cells of s2 that cell1 overlaps.. */
1132  nd_box_overlap(s2, &nd_cell1, &ibox2);
1133 
1134  /* Initialize counter */
1135  for ( d = 0; d < ndims2; d++ )
1136  {
1137  at2[d] = ibox2.min[d];
1138  }
1139 
1140  POSTGIS_DEBUGF(3, "at1 %d,%d %s", at1[0], at1[1], nd_box_to_json(&nd_cell1, ndims1));
1141 
1142  /* Get the value at this cell */
1143  val1 = s1->value[nd_stats_value_index(s1, at1)];
1144 
1145  /* For each overlapped cell of s2... */
1146  do
1147  {
1148  double ratio2;
1149  double val2;
1150 
1151  /* Construct the bounds of this cell */
1152  ND_BOX nd_cell2;
1153  nd_box_init(&nd_cell2);
1154  for ( d = 0; d < ndims2; d++ )
1155  {
1156  nd_cell2.min[d] = min2[d] + (at2[d]+0) * cellsize2[d];
1157  nd_cell2.max[d] = min2[d] + (at2[d]+1) * cellsize2[d];
1158  }
1159 
1160  POSTGIS_DEBUGF(3, " at2 %d,%d %s", at2[0], at2[1], nd_box_to_json(&nd_cell2, ndims2));
1161 
1162  /* Calculate overlap ratio of the cells */
1163  ratio2 = nd_box_ratio(&nd_cell1, &nd_cell2, Max(ndims1, ndims2));
1164 
1165  /* Multiply the cell counts, scaled by overlap ratio */
1166  val2 = s2->value[nd_stats_value_index(s2, at2)];
1167  POSTGIS_DEBUGF(3, " val1 %.6g val2 %.6g ratio %.6g", val1, val2, ratio2);
1168  val += val1 * (val2 * ratio2);
1169  }
1170  while ( nd_increment(&ibox2, ndims2, at2) );
1171 
1172  }
1173  while( nd_increment(&ibox1, ndims1, at1) );
1174 
1175  POSTGIS_DEBUGF(3, "val of histogram = %g", val);
1176 
1177  /*
1178  * In order to compare our total cell count "val" to the
1179  * ntuples_max, we need to scale val up to reflect a full
1180  * table estimate. So, multiply by ratio of table size to
1181  * sample size.
1182  */
1183  val *= (s1->table_features / s1->sample_features);
1184  val *= (s2->table_features / s2->sample_features);
1185 
1186  POSTGIS_DEBUGF(3, "val scaled to full table size = %g", val);
1187 
1188  /*
1189  * Because the cell counts are over-determined due to
1190  * double counting of features that overlap multiple cells
1191  * (see the compute_gserialized_stats routine)
1192  * we also have to scale our cell count "val" *down*
1193  * to adjust for the double counting.
1194  */
1195 // val /= (s1->cells_covered / s1->histogram_features);
1196 // val /= (s2->cells_covered / s2->histogram_features);
1197 
1198  /*
1199  * Finally, the selectivity is the estimated number of
1200  * rows to be returned divided by the maximum possible
1201  * number of rows that can be returned.
1202  */
1203  selectivity = val / ntuples_max;
1204 
1205  /* Guard against over-estimates and crazy numbers :) */
1206  if ( isnan(selectivity) || ! isfinite(selectivity) || selectivity < 0.0 )
1207  {
1208  selectivity = DEFAULT_ND_JOINSEL;
1209  }
1210  else if ( selectivity > 1.0 )
1211  {
1212  selectivity = 1.0;
1213  }
1214 
1215  return selectivity;
1216 }
static int nd_increment(ND_IBOX *ibox, int ndims, int *counter)
Given an n-d index array (counter), and a domain to increment it in (ibox) increment it by one...
static int nd_box_overlap(const ND_STATS *nd_stats, const ND_BOX *nd_box, ND_IBOX *nd_ibox)
What stats cells overlap with this ND_BOX? Put the lowest cell addresses in ND_IBOX->min and the high...
#define DEFAULT_ND_JOINSEL
#define ND_DIMS
The maximum number of dimensions our code can handle.
#define FALLBACK_ND_JOINSEL
#define FALLBACK_ND_SEL
More modest fallback selectivity factor.
static int nd_box_init(ND_BOX *a)
Zero out an ND_BOX.
static int nd_stats_value_index(const ND_STATS *stats, int *indexes)
Given a position in the n-d histogram (i,j,k) return the position in the 1-d values array...
float4 size[ND_DIMS]
static int nd_box_intersects(const ND_BOX *a, const ND_BOX *b, int ndims)
Return true if ND_BOX a overlaps b, false otherwise.
int min[ND_DIMS]
N-dimensional box index type.
static char * nd_box_to_json(const ND_BOX *nd_box, int ndims)
Convert an ND_BOX to a JSON string for printing.
float4 max[ND_DIMS]
float4 min[ND_DIMS]
static char * nd_stats_to_json(const ND_STATS *nd_stats)
Convert an ND_STATS to a JSON representation for external use.
static double nd_box_ratio(const ND_BOX *b1, const ND_BOX *b2, int ndims)
Returns the proportion of b2 that is covered by b1.
N-dimensional statistics structure.
N-dimensional box type for calculations, to avoid doing explicit axis conversions from GBOX in all ca...
Here is the call graph for this function:
Here is the caller graph for this function: