PostGIS  2.2.8dev-r@@SVN_REVISION@@

◆ estimate_join_selectivity()

static float8 estimate_join_selectivity ( const ND_STATS s1,
const ND_STATS s2 
)
static

Given two statistics histograms, what is the selectivity of a join driven by the && or &&& operator?

Join selectivity is defined as the number of rows returned by the join operator divided by the number of rows that an unconstrained join would return (nrows1*nrows2).

To get the estimate of join rows, we walk through the cells of one histogram, and multiply the cell value by the proportion of the cells in the other histogram the cell overlaps: val += val1 * ( val2 * overlap_ratio )

Definition at line 907 of file gserialized_estimate.c.

References DEFAULT_ND_JOINSEL, ND_STATS_T::extent, FALLBACK_ND_JOINSEL, FALLBACK_ND_SEL, gserialized_gist_joinsel_nd(), ND_STATS_T::histogram_cells, ND_BOX_T::max, ND_BOX_T::min, ND_IBOX_T::min, nd_box_init(), nd_box_intersects(), nd_box_overlap(), nd_box_ratio(), nd_box_to_json(), ND_DIMS, nd_increment(), nd_stats_to_json(), nd_stats_value_index(), ND_STATS_T::ndims, ND_STATS_T::not_null_features, PG_FUNCTION_INFO_V1(), ND_STATS_T::sample_features, ND_STATS_T::size, ND_STATS_T::table_features, and ND_STATS_T::value.

Referenced by _postgis_gserialized_joinsel(), and gserialized_gist_joinsel().

908 {
909  int ncells1, ncells2;
910  int ndims1, ndims2, ndims;
911  double ntuples_max;
912  double ntuples_not_null1, ntuples_not_null2;
913 
914  ND_BOX extent1, extent2;
915  ND_IBOX ibox1, ibox2;
916  int at1[ND_DIMS];
917  int at2[ND_DIMS];
918  double min1[ND_DIMS];
919  double width1[ND_DIMS];
920  double cellsize1[ND_DIMS];
921  int size2[ND_DIMS];
922  double min2[ND_DIMS];
923  double width2[ND_DIMS];
924  double cellsize2[ND_DIMS];
925  int size1[ND_DIMS];
926  int d;
927  double val = 0;
928  float8 selectivity;
929 
930  /* Drop out on null inputs */
931  if ( ! ( s1 && s2 ) )
932  {
933  elog(NOTICE, " estimate_join_selectivity called with null inputs");
934  return FALLBACK_ND_SEL;
935  }
936 
937  /* We need to know how many cells each side has... */
938  ncells1 = (int)roundf(s1->histogram_cells);
939  ncells2 = (int)roundf(s2->histogram_cells);
940 
941  /* ...so that we can drive the summation loop with the smaller histogram. */
942  if ( ncells1 > ncells2 )
943  {
944  const ND_STATS *stats_tmp = s1;
945  s1 = s2;
946  s2 = stats_tmp;
947  }
948 
949  POSTGIS_DEBUGF(3, "s1: %s", nd_stats_to_json(s1));
950  POSTGIS_DEBUGF(3, "s2: %s", nd_stats_to_json(s2));
951 
952  /* Re-read that info after the swap */
953  ncells1 = (int)roundf(s1->histogram_cells);
954  ncells2 = (int)roundf(s2->histogram_cells);
955 
956  /* Q: What's the largest possible join size these relations can create? */
957  /* A: The product of the # of non-null rows in each relation. */
958  ntuples_not_null1 = s1->table_features * (s1->not_null_features / s1->sample_features);
959  ntuples_not_null2 = s2->table_features * (s2->not_null_features / s2->sample_features);
960  ntuples_max = ntuples_not_null1 * ntuples_not_null2;
961 
962  /* Get the ndims as ints */
963  ndims1 = (int)roundf(s1->ndims);
964  ndims2 = (int)roundf(s2->ndims);
965  ndims = Max(ndims1, ndims2);
966 
967  /* Get the extents */
968  extent1 = s1->extent;
969  extent2 = s2->extent;
970 
971  /* If relation stats do not intersect, join is very very selective. */
972  if ( ! nd_box_intersects(&extent1, &extent2, ndims) )
973  {
974  POSTGIS_DEBUG(3, "relation stats do not intersect, returning 0");
975  PG_RETURN_FLOAT8(0.0);
976  }
977 
978  /*
979  * First find the index range of the part of the smaller
980  * histogram that overlaps the larger one.
981  */
982  if ( ! nd_box_overlap(s1, &extent2, &ibox1) )
983  {
984  POSTGIS_DEBUG(3, "could not calculate overlap of relations");
985  PG_RETURN_FLOAT8(FALLBACK_ND_JOINSEL);
986  }
987 
988  /* Initialize counters / constants on s1 */
989  for ( d = 0; d < ndims1; d++ )
990  {
991  at1[d] = ibox1.min[d];
992  min1[d] = s1->extent.min[d];
993  width1[d] = s1->extent.max[d] - s1->extent.min[d];
994  size1[d] = (int)roundf(s1->size[d]);
995  cellsize1[d] = width1[d] / size1[d];
996  }
997 
998  /* Initialize counters / constants on s2 */
999  for ( d = 0; d < ndims2; d++ )
1000  {
1001  min2[d] = s2->extent.min[d];
1002  width2[d] = s2->extent.max[d] - s2->extent.min[d];
1003  size2[d] = (int)roundf(s2->size[d]);
1004  cellsize2[d] = width2[d] / size2[d];
1005  }
1006 
1007  /* For each affected cell of s1... */
1008  do
1009  {
1010  double val1;
1011  /* Construct the bounds of this cell */
1012  ND_BOX nd_cell1;
1013  nd_box_init(&nd_cell1);
1014  for ( d = 0; d < ndims1; d++ )
1015  {
1016  nd_cell1.min[d] = min1[d] + (at1[d]+0) * cellsize1[d];
1017  nd_cell1.max[d] = min1[d] + (at1[d]+1) * cellsize1[d];
1018  }
1019 
1020  /* Find the cells of s2 that cell1 overlaps.. */
1021  nd_box_overlap(s2, &nd_cell1, &ibox2);
1022 
1023  /* Initialize counter */
1024  for ( d = 0; d < ndims2; d++ )
1025  {
1026  at2[d] = ibox2.min[d];
1027  }
1028 
1029  POSTGIS_DEBUGF(3, "at1 %d,%d %s", at1[0], at1[1], nd_box_to_json(&nd_cell1, ndims1));
1030 
1031  /* Get the value at this cell */
1032  val1 = s1->value[nd_stats_value_index(s1, at1)];
1033 
1034  /* For each overlapped cell of s2... */
1035  do
1036  {
1037  double ratio2;
1038  double val2;
1039 
1040  /* Construct the bounds of this cell */
1041  ND_BOX nd_cell2;
1042  nd_box_init(&nd_cell2);
1043  for ( d = 0; d < ndims2; d++ )
1044  {
1045  nd_cell2.min[d] = min2[d] + (at2[d]+0) * cellsize2[d];
1046  nd_cell2.max[d] = min2[d] + (at2[d]+1) * cellsize2[d];
1047  }
1048 
1049  POSTGIS_DEBUGF(3, " at2 %d,%d %s", at2[0], at2[1], nd_box_to_json(&nd_cell2, ndims2));
1050 
1051  /* Calculate overlap ratio of the cells */
1052  ratio2 = nd_box_ratio(&nd_cell1, &nd_cell2, Max(ndims1, ndims2));
1053 
1054  /* Multiply the cell counts, scaled by overlap ratio */
1055  val2 = s2->value[nd_stats_value_index(s2, at2)];
1056  POSTGIS_DEBUGF(3, " val1 %.6g val2 %.6g ratio %.6g", val1, val2, ratio2);
1057  val += val1 * (val2 * ratio2);
1058  }
1059  while ( nd_increment(&ibox2, ndims2, at2) );
1060 
1061  }
1062  while( nd_increment(&ibox1, ndims1, at1) );
1063 
1064  POSTGIS_DEBUGF(3, "val of histogram = %g", val);
1065 
1066  /*
1067  * In order to compare our total cell count "val" to the
1068  * ntuples_max, we need to scale val up to reflect a full
1069  * table estimate. So, multiply by ratio of table size to
1070  * sample size.
1071  */
1072  val *= (s1->table_features / s1->sample_features);
1073  val *= (s2->table_features / s2->sample_features);
1074 
1075  POSTGIS_DEBUGF(3, "val scaled to full table size = %g", val);
1076 
1077  /*
1078  * Because the cell counts are over-determined due to
1079  * double counting of features that overlap multiple cells
1080  * (see the compute_gserialized_stats routine)
1081  * we also have to scale our cell count "val" *down*
1082  * to adjust for the double counting.
1083  */
1084 // val /= (s1->cells_covered / s1->histogram_features);
1085 // val /= (s2->cells_covered / s2->histogram_features);
1086 
1087  /*
1088  * Finally, the selectivity is the estimated number of
1089  * rows to be returned divided by the maximum possible
1090  * number of rows that can be returned.
1091  */
1092  selectivity = val / ntuples_max;
1093 
1094  /* Guard against over-estimates and crazy numbers :) */
1095  if ( isnan(selectivity) || ! isfinite(selectivity) || selectivity < 0.0 )
1096  {
1097  selectivity = DEFAULT_ND_JOINSEL;
1098  }
1099  else if ( selectivity > 1.0 )
1100  {
1101  selectivity = 1.0;
1102  }
1103 
1104  return selectivity;
1105 }
static int nd_increment(ND_IBOX *ibox, int ndims, int *counter)
Given an n-d index array (counter), and a domain to increment it in (ibox) increment it by one...
static int nd_box_overlap(const ND_STATS *nd_stats, const ND_BOX *nd_box, ND_IBOX *nd_ibox)
What stats cells overlap with this ND_BOX? Put the lowest cell addresses in ND_IBOX->min and the high...
#define DEFAULT_ND_JOINSEL
#define ND_DIMS
The maximum number of dimensions our code can handle.
#define FALLBACK_ND_JOINSEL
#define FALLBACK_ND_SEL
More modest fallback selectivity factor.
static int nd_box_init(ND_BOX *a)
Zero out an ND_BOX.
static int nd_stats_value_index(const ND_STATS *stats, int *indexes)
Given a position in the n-d histogram (i,j,k) return the position in the 1-d values array...
float4 size[ND_DIMS]
static int nd_box_intersects(const ND_BOX *a, const ND_BOX *b, int ndims)
Return TRUE if ND_BOX a overlaps b, false otherwise.
int min[ND_DIMS]
N-dimensional box index type.
static char * nd_box_to_json(const ND_BOX *nd_box, int ndims)
Convert an ND_BOX to a JSON string for printing.
float4 max[ND_DIMS]
float4 min[ND_DIMS]
static char * nd_stats_to_json(const ND_STATS *nd_stats)
Convert an ND_STATS to a JSON representation for external use.
static double nd_box_ratio(const ND_BOX *b1, const ND_BOX *b2, int ndims)
Returns the proportion of b2 that is covered by b1.
N-dimensional statistics structure.
N-dimensional box type for calculations, to avoid doing explicit axis conversions from GBOX in all ca...
Here is the call graph for this function:
Here is the caller graph for this function: