PostGIS  2.5.0dev-r@@SVN_REVISION@@
static float8 estimate_join_selectivity ( const ND_STATS s1,
const ND_STATS s2 
)
static

Given two statistics histograms, what is the selectivity of a join driven by the && or &&& operator?

Join selectivity is defined as the number of rows returned by the join operator divided by the number of rows that an unconstrained join would return (nrows1*nrows2).

To get the estimate of join rows, we walk through the cells of one histogram, and multiply the cell value by the proportion of the cells in the other histogram the cell overlaps: val += val1 * ( val2 * overlap_ratio )

Definition at line 1015 of file gserialized_estimate.c.

References DEFAULT_ND_JOINSEL, ND_STATS_T::extent, FALLBACK_ND_JOINSEL, FALLBACK_ND_SEL, ND_STATS_T::histogram_cells, ND_BOX_T::max, ND_BOX_T::min, ND_IBOX_T::min, nd_box_init(), nd_box_intersects(), nd_box_overlap(), nd_box_ratio(), nd_box_to_json(), ND_DIMS, nd_increment(), nd_stats_to_json(), nd_stats_value_index(), ND_STATS_T::ndims, ND_STATS_T::not_null_features, ND_STATS_T::sample_features, ND_STATS_T::size, ND_STATS_T::table_features, and ND_STATS_T::value.

Referenced by _postgis_gserialized_joinsel(), and gserialized_gist_joinsel().

1016 {
1017  int ncells1, ncells2;
1018  int ndims1, ndims2, ndims;
1019  double ntuples_max;
1020  double ntuples_not_null1, ntuples_not_null2;
1021 
1022  ND_BOX extent1, extent2;
1023  ND_IBOX ibox1, ibox2;
1024  int at1[ND_DIMS];
1025  int at2[ND_DIMS];
1026  double min1[ND_DIMS];
1027  double width1[ND_DIMS];
1028  double cellsize1[ND_DIMS];
1029  int size2[ND_DIMS];
1030  double min2[ND_DIMS];
1031  double width2[ND_DIMS];
1032  double cellsize2[ND_DIMS];
1033  int size1[ND_DIMS];
1034  int d;
1035  double val = 0;
1036  float8 selectivity;
1037 
1038  /* Drop out on null inputs */
1039  if ( ! ( s1 && s2 ) )
1040  {
1041  elog(NOTICE, " estimate_join_selectivity called with null inputs");
1042  return FALLBACK_ND_SEL;
1043  }
1044 
1045  /* We need to know how many cells each side has... */
1046  ncells1 = (int)roundf(s1->histogram_cells);
1047  ncells2 = (int)roundf(s2->histogram_cells);
1048 
1049  /* ...so that we can drive the summation loop with the smaller histogram. */
1050  if ( ncells1 > ncells2 )
1051  {
1052  const ND_STATS *stats_tmp = s1;
1053  s1 = s2;
1054  s2 = stats_tmp;
1055  }
1056 
1057  POSTGIS_DEBUGF(3, "s1: %s", nd_stats_to_json(s1));
1058  POSTGIS_DEBUGF(3, "s2: %s", nd_stats_to_json(s2));
1059 
1060  /* Re-read that info after the swap */
1061  ncells1 = (int)roundf(s1->histogram_cells);
1062  ncells2 = (int)roundf(s2->histogram_cells);
1063 
1064  /* Q: What's the largest possible join size these relations can create? */
1065  /* A: The product of the # of non-null rows in each relation. */
1066  ntuples_not_null1 = s1->table_features * (s1->not_null_features / s1->sample_features);
1067  ntuples_not_null2 = s2->table_features * (s2->not_null_features / s2->sample_features);
1068  ntuples_max = ntuples_not_null1 * ntuples_not_null2;
1069 
1070  /* Get the ndims as ints */
1071  ndims1 = (int)roundf(s1->ndims);
1072  ndims2 = (int)roundf(s2->ndims);
1073  ndims = Max(ndims1, ndims2);
1074 
1075  /* Get the extents */
1076  extent1 = s1->extent;
1077  extent2 = s2->extent;
1078 
1079  /* If relation stats do not intersect, join is very very selective. */
1080  if ( ! nd_box_intersects(&extent1, &extent2, ndims) )
1081  {
1082  POSTGIS_DEBUG(3, "relation stats do not intersect, returning 0");
1083  PG_RETURN_FLOAT8(0.0);
1084  }
1085 
1086  /*
1087  * First find the index range of the part of the smaller
1088  * histogram that overlaps the larger one.
1089  */
1090  if ( ! nd_box_overlap(s1, &extent2, &ibox1) )
1091  {
1092  POSTGIS_DEBUG(3, "could not calculate overlap of relations");
1093  PG_RETURN_FLOAT8(FALLBACK_ND_JOINSEL);
1094  }
1095 
1096  /* Initialize counters / constants on s1 */
1097  for ( d = 0; d < ndims1; d++ )
1098  {
1099  at1[d] = ibox1.min[d];
1100  min1[d] = s1->extent.min[d];
1101  width1[d] = s1->extent.max[d] - s1->extent.min[d];
1102  size1[d] = (int)roundf(s1->size[d]);
1103  cellsize1[d] = width1[d] / size1[d];
1104  }
1105 
1106  /* Initialize counters / constants on s2 */
1107  for ( d = 0; d < ndims2; d++ )
1108  {
1109  min2[d] = s2->extent.min[d];
1110  width2[d] = s2->extent.max[d] - s2->extent.min[d];
1111  size2[d] = (int)roundf(s2->size[d]);
1112  cellsize2[d] = width2[d] / size2[d];
1113  }
1114 
1115  /* For each affected cell of s1... */
1116  do
1117  {
1118  double val1;
1119  /* Construct the bounds of this cell */
1120  ND_BOX nd_cell1;
1121  nd_box_init(&nd_cell1);
1122  for ( d = 0; d < ndims1; d++ )
1123  {
1124  nd_cell1.min[d] = min1[d] + (at1[d]+0) * cellsize1[d];
1125  nd_cell1.max[d] = min1[d] + (at1[d]+1) * cellsize1[d];
1126  }
1127 
1128  /* Find the cells of s2 that cell1 overlaps.. */
1129  nd_box_overlap(s2, &nd_cell1, &ibox2);
1130 
1131  /* Initialize counter */
1132  for ( d = 0; d < ndims2; d++ )
1133  {
1134  at2[d] = ibox2.min[d];
1135  }
1136 
1137  POSTGIS_DEBUGF(3, "at1 %d,%d %s", at1[0], at1[1], nd_box_to_json(&nd_cell1, ndims1));
1138 
1139  /* Get the value at this cell */
1140  val1 = s1->value[nd_stats_value_index(s1, at1)];
1141 
1142  /* For each overlapped cell of s2... */
1143  do
1144  {
1145  double ratio2;
1146  double val2;
1147 
1148  /* Construct the bounds of this cell */
1149  ND_BOX nd_cell2;
1150  nd_box_init(&nd_cell2);
1151  for ( d = 0; d < ndims2; d++ )
1152  {
1153  nd_cell2.min[d] = min2[d] + (at2[d]+0) * cellsize2[d];
1154  nd_cell2.max[d] = min2[d] + (at2[d]+1) * cellsize2[d];
1155  }
1156 
1157  POSTGIS_DEBUGF(3, " at2 %d,%d %s", at2[0], at2[1], nd_box_to_json(&nd_cell2, ndims2));
1158 
1159  /* Calculate overlap ratio of the cells */
1160  ratio2 = nd_box_ratio(&nd_cell1, &nd_cell2, Max(ndims1, ndims2));
1161 
1162  /* Multiply the cell counts, scaled by overlap ratio */
1163  val2 = s2->value[nd_stats_value_index(s2, at2)];
1164  POSTGIS_DEBUGF(3, " val1 %.6g val2 %.6g ratio %.6g", val1, val2, ratio2);
1165  val += val1 * (val2 * ratio2);
1166  }
1167  while ( nd_increment(&ibox2, ndims2, at2) );
1168 
1169  }
1170  while( nd_increment(&ibox1, ndims1, at1) );
1171 
1172  POSTGIS_DEBUGF(3, "val of histogram = %g", val);
1173 
1174  /*
1175  * In order to compare our total cell count "val" to the
1176  * ntuples_max, we need to scale val up to reflect a full
1177  * table estimate. So, multiply by ratio of table size to
1178  * sample size.
1179  */
1180  val *= (s1->table_features / s1->sample_features);
1181  val *= (s2->table_features / s2->sample_features);
1182 
1183  POSTGIS_DEBUGF(3, "val scaled to full table size = %g", val);
1184 
1185  /*
1186  * Because the cell counts are over-determined due to
1187  * double counting of features that overlap multiple cells
1188  * (see the compute_gserialized_stats routine)
1189  * we also have to scale our cell count "val" *down*
1190  * to adjust for the double counting.
1191  */
1192 // val /= (s1->cells_covered / s1->histogram_features);
1193 // val /= (s2->cells_covered / s2->histogram_features);
1194 
1195  /*
1196  * Finally, the selectivity is the estimated number of
1197  * rows to be returned divided by the maximum possible
1198  * number of rows that can be returned.
1199  */
1200  selectivity = val / ntuples_max;
1201 
1202  /* Guard against over-estimates and crazy numbers :) */
1203  if ( isnan(selectivity) || ! isfinite(selectivity) || selectivity < 0.0 )
1204  {
1205  selectivity = DEFAULT_ND_JOINSEL;
1206  }
1207  else if ( selectivity > 1.0 )
1208  {
1209  selectivity = 1.0;
1210  }
1211 
1212  return selectivity;
1213 }
static int nd_increment(ND_IBOX *ibox, int ndims, int *counter)
Given an n-d index array (counter), and a domain to increment it in (ibox) increment it by one...
static int nd_box_overlap(const ND_STATS *nd_stats, const ND_BOX *nd_box, ND_IBOX *nd_ibox)
What stats cells overlap with this ND_BOX? Put the lowest cell addresses in ND_IBOX->min and the high...
#define DEFAULT_ND_JOINSEL
#define ND_DIMS
The maximum number of dimensions our code can handle.
#define FALLBACK_ND_JOINSEL
#define FALLBACK_ND_SEL
More modest fallback selectivity factor.
static int nd_box_init(ND_BOX *a)
Zero out an ND_BOX.
static int nd_stats_value_index(const ND_STATS *stats, int *indexes)
Given a position in the n-d histogram (i,j,k) return the position in the 1-d values array...
float4 size[ND_DIMS]
static int nd_box_intersects(const ND_BOX *a, const ND_BOX *b, int ndims)
Return true if ND_BOX a overlaps b, false otherwise.
int min[ND_DIMS]
N-dimensional box index type.
static char * nd_box_to_json(const ND_BOX *nd_box, int ndims)
Convert an ND_BOX to a JSON string for printing.
float4 max[ND_DIMS]
float4 min[ND_DIMS]
static char * nd_stats_to_json(const ND_STATS *nd_stats)
Convert an ND_STATS to a JSON representation for external use.
static double nd_box_ratio(const ND_BOX *b1, const ND_BOX *b2, int ndims)
Returns the proportion of b2 that is covered by b1.
N-dimensional statistics structure.
N-dimensional box type for calculations, to avoid doing explicit axis conversions from GBOX in all ca...

Here is the call graph for this function:

Here is the caller graph for this function: