PostGIS  2.3.8dev-r@@SVN_REVISION@@

◆ estimate_join_selectivity()

static float8 estimate_join_selectivity ( const ND_STATS s1,
const ND_STATS s2 
)
static

Given two statistics histograms, what is the selectivity of a join driven by the && or &&& operator?

Join selectivity is defined as the number of rows returned by the join operator divided by the number of rows that an unconstrained join would return (nrows1*nrows2).

To get the estimate of join rows, we walk through the cells of one histogram, and multiply the cell value by the proportion of the cells in the other histogram the cell overlaps: val += val1 * ( val2 * overlap_ratio )

Definition at line 969 of file gserialized_estimate.c.

References DEFAULT_ND_JOINSEL, ND_STATS_T::extent, FALLBACK_ND_JOINSEL, FALLBACK_ND_SEL, gserialized_gist_joinsel_nd(), ND_STATS_T::histogram_cells, ND_BOX_T::max, ND_BOX_T::min, ND_IBOX_T::min, nd_box_init(), nd_box_intersects(), nd_box_overlap(), nd_box_ratio(), nd_box_to_json(), ND_DIMS, nd_increment(), nd_stats_to_json(), nd_stats_value_index(), ND_STATS_T::ndims, ND_STATS_T::not_null_features, PG_FUNCTION_INFO_V1(), ND_STATS_T::sample_features, ND_STATS_T::size, ND_STATS_T::table_features, and ND_STATS_T::value.

Referenced by _postgis_gserialized_joinsel(), and gserialized_gist_joinsel().

970 {
971  int ncells1, ncells2;
972  int ndims1, ndims2, ndims;
973  double ntuples_max;
974  double ntuples_not_null1, ntuples_not_null2;
975 
976  ND_BOX extent1, extent2;
977  ND_IBOX ibox1, ibox2;
978  int at1[ND_DIMS];
979  int at2[ND_DIMS];
980  double min1[ND_DIMS];
981  double width1[ND_DIMS];
982  double cellsize1[ND_DIMS];
983  int size2[ND_DIMS];
984  double min2[ND_DIMS];
985  double width2[ND_DIMS];
986  double cellsize2[ND_DIMS];
987  int size1[ND_DIMS];
988  int d;
989  double val = 0;
990  float8 selectivity;
991 
992  /* Drop out on null inputs */
993  if ( ! ( s1 && s2 ) )
994  {
995  elog(NOTICE, " estimate_join_selectivity called with null inputs");
996  return FALLBACK_ND_SEL;
997  }
998 
999  /* We need to know how many cells each side has... */
1000  ncells1 = (int)roundf(s1->histogram_cells);
1001  ncells2 = (int)roundf(s2->histogram_cells);
1002 
1003  /* ...so that we can drive the summation loop with the smaller histogram. */
1004  if ( ncells1 > ncells2 )
1005  {
1006  const ND_STATS *stats_tmp = s1;
1007  s1 = s2;
1008  s2 = stats_tmp;
1009  }
1010 
1011  POSTGIS_DEBUGF(3, "s1: %s", nd_stats_to_json(s1));
1012  POSTGIS_DEBUGF(3, "s2: %s", nd_stats_to_json(s2));
1013 
1014  /* Re-read that info after the swap */
1015  ncells1 = (int)roundf(s1->histogram_cells);
1016  ncells2 = (int)roundf(s2->histogram_cells);
1017 
1018  /* Q: What's the largest possible join size these relations can create? */
1019  /* A: The product of the # of non-null rows in each relation. */
1020  ntuples_not_null1 = s1->table_features * (s1->not_null_features / s1->sample_features);
1021  ntuples_not_null2 = s2->table_features * (s2->not_null_features / s2->sample_features);
1022  ntuples_max = ntuples_not_null1 * ntuples_not_null2;
1023 
1024  /* Get the ndims as ints */
1025  ndims1 = (int)roundf(s1->ndims);
1026  ndims2 = (int)roundf(s2->ndims);
1027  ndims = Max(ndims1, ndims2);
1028 
1029  /* Get the extents */
1030  extent1 = s1->extent;
1031  extent2 = s2->extent;
1032 
1033  /* If relation stats do not intersect, join is very very selective. */
1034  if ( ! nd_box_intersects(&extent1, &extent2, ndims) )
1035  {
1036  POSTGIS_DEBUG(3, "relation stats do not intersect, returning 0");
1037  PG_RETURN_FLOAT8(0.0);
1038  }
1039 
1040  /*
1041  * First find the index range of the part of the smaller
1042  * histogram that overlaps the larger one.
1043  */
1044  if ( ! nd_box_overlap(s1, &extent2, &ibox1) )
1045  {
1046  POSTGIS_DEBUG(3, "could not calculate overlap of relations");
1047  PG_RETURN_FLOAT8(FALLBACK_ND_JOINSEL);
1048  }
1049 
1050  /* Initialize counters / constants on s1 */
1051  for ( d = 0; d < ndims1; d++ )
1052  {
1053  at1[d] = ibox1.min[d];
1054  min1[d] = s1->extent.min[d];
1055  width1[d] = s1->extent.max[d] - s1->extent.min[d];
1056  size1[d] = (int)roundf(s1->size[d]);
1057  cellsize1[d] = width1[d] / size1[d];
1058  }
1059 
1060  /* Initialize counters / constants on s2 */
1061  for ( d = 0; d < ndims2; d++ )
1062  {
1063  min2[d] = s2->extent.min[d];
1064  width2[d] = s2->extent.max[d] - s2->extent.min[d];
1065  size2[d] = (int)roundf(s2->size[d]);
1066  cellsize2[d] = width2[d] / size2[d];
1067  }
1068 
1069  /* For each affected cell of s1... */
1070  do
1071  {
1072  double val1;
1073  /* Construct the bounds of this cell */
1074  ND_BOX nd_cell1;
1075  nd_box_init(&nd_cell1);
1076  for ( d = 0; d < ndims1; d++ )
1077  {
1078  nd_cell1.min[d] = min1[d] + (at1[d]+0) * cellsize1[d];
1079  nd_cell1.max[d] = min1[d] + (at1[d]+1) * cellsize1[d];
1080  }
1081 
1082  /* Find the cells of s2 that cell1 overlaps.. */
1083  nd_box_overlap(s2, &nd_cell1, &ibox2);
1084 
1085  /* Initialize counter */
1086  for ( d = 0; d < ndims2; d++ )
1087  {
1088  at2[d] = ibox2.min[d];
1089  }
1090 
1091  POSTGIS_DEBUGF(3, "at1 %d,%d %s", at1[0], at1[1], nd_box_to_json(&nd_cell1, ndims1));
1092 
1093  /* Get the value at this cell */
1094  val1 = s1->value[nd_stats_value_index(s1, at1)];
1095 
1096  /* For each overlapped cell of s2... */
1097  do
1098  {
1099  double ratio2;
1100  double val2;
1101 
1102  /* Construct the bounds of this cell */
1103  ND_BOX nd_cell2;
1104  nd_box_init(&nd_cell2);
1105  for ( d = 0; d < ndims2; d++ )
1106  {
1107  nd_cell2.min[d] = min2[d] + (at2[d]+0) * cellsize2[d];
1108  nd_cell2.max[d] = min2[d] + (at2[d]+1) * cellsize2[d];
1109  }
1110 
1111  POSTGIS_DEBUGF(3, " at2 %d,%d %s", at2[0], at2[1], nd_box_to_json(&nd_cell2, ndims2));
1112 
1113  /* Calculate overlap ratio of the cells */
1114  ratio2 = nd_box_ratio(&nd_cell1, &nd_cell2, Max(ndims1, ndims2));
1115 
1116  /* Multiply the cell counts, scaled by overlap ratio */
1117  val2 = s2->value[nd_stats_value_index(s2, at2)];
1118  POSTGIS_DEBUGF(3, " val1 %.6g val2 %.6g ratio %.6g", val1, val2, ratio2);
1119  val += val1 * (val2 * ratio2);
1120  }
1121  while ( nd_increment(&ibox2, ndims2, at2) );
1122 
1123  }
1124  while( nd_increment(&ibox1, ndims1, at1) );
1125 
1126  POSTGIS_DEBUGF(3, "val of histogram = %g", val);
1127 
1128  /*
1129  * In order to compare our total cell count "val" to the
1130  * ntuples_max, we need to scale val up to reflect a full
1131  * table estimate. So, multiply by ratio of table size to
1132  * sample size.
1133  */
1134  val *= (s1->table_features / s1->sample_features);
1135  val *= (s2->table_features / s2->sample_features);
1136 
1137  POSTGIS_DEBUGF(3, "val scaled to full table size = %g", val);
1138 
1139  /*
1140  * Because the cell counts are over-determined due to
1141  * double counting of features that overlap multiple cells
1142  * (see the compute_gserialized_stats routine)
1143  * we also have to scale our cell count "val" *down*
1144  * to adjust for the double counting.
1145  */
1146 // val /= (s1->cells_covered / s1->histogram_features);
1147 // val /= (s2->cells_covered / s2->histogram_features);
1148 
1149  /*
1150  * Finally, the selectivity is the estimated number of
1151  * rows to be returned divided by the maximum possible
1152  * number of rows that can be returned.
1153  */
1154  selectivity = val / ntuples_max;
1155 
1156  /* Guard against over-estimates and crazy numbers :) */
1157  if ( isnan(selectivity) || ! isfinite(selectivity) || selectivity < 0.0 )
1158  {
1159  selectivity = DEFAULT_ND_JOINSEL;
1160  }
1161  else if ( selectivity > 1.0 )
1162  {
1163  selectivity = 1.0;
1164  }
1165 
1166  return selectivity;
1167 }
static int nd_increment(ND_IBOX *ibox, int ndims, int *counter)
Given an n-d index array (counter), and a domain to increment it in (ibox) increment it by one...
static int nd_box_overlap(const ND_STATS *nd_stats, const ND_BOX *nd_box, ND_IBOX *nd_ibox)
What stats cells overlap with this ND_BOX? Put the lowest cell addresses in ND_IBOX->min and the high...
#define DEFAULT_ND_JOINSEL
#define ND_DIMS
The maximum number of dimensions our code can handle.
#define FALLBACK_ND_JOINSEL
#define FALLBACK_ND_SEL
More modest fallback selectivity factor.
static int nd_box_init(ND_BOX *a)
Zero out an ND_BOX.
static int nd_stats_value_index(const ND_STATS *stats, int *indexes)
Given a position in the n-d histogram (i,j,k) return the position in the 1-d values array...
float4 size[ND_DIMS]
static int nd_box_intersects(const ND_BOX *a, const ND_BOX *b, int ndims)
Return TRUE if ND_BOX a overlaps b, false otherwise.
int min[ND_DIMS]
N-dimensional box index type.
static char * nd_box_to_json(const ND_BOX *nd_box, int ndims)
Convert an ND_BOX to a JSON string for printing.
float4 max[ND_DIMS]
float4 min[ND_DIMS]
static char * nd_stats_to_json(const ND_STATS *nd_stats)
Convert an ND_STATS to a JSON representation for external use.
static double nd_box_ratio(const ND_BOX *b1, const ND_BOX *b2, int ndims)
Returns the proportion of b2 that is covered by b1.
N-dimensional statistics structure.
N-dimensional box type for calculations, to avoid doing explicit axis conversions from GBOX in all ca...
Here is the call graph for this function:
Here is the caller graph for this function: