PostGIS  2.1.10dev-r@@SVN_REVISION@@
 static float8 estimate_join_selectivity ( const ND_STATS * s1, const ND_STATS * s2 )
static

Given two statistics histograms, what is the selectivity of a join driven by the && or &&& operator?

Join selectivity is defined as the number of rows returned by the join operator divided by the number of rows that an unconstrained join would return (nrows1*nrows2).

To get the estimate of join rows, we walk through the cells of one histogram, and multiply the cell value by the proportion of the cells in the other histogram the cell overlaps: val += val1 * ( val2 * overlap_ratio )

Definition at line 893 of file gserialized_estimate.c.

Referenced by _postgis_gserialized_joinsel(), and gserialized_gist_joinsel().

894 {
895  int ncells1, ncells2;
896  int ndims1, ndims2, ndims;
897  double ntuples_max;
898  double ntuples_not_null1, ntuples_not_null2;
899
900  ND_BOX extent1, extent2;
901  ND_IBOX ibox1, ibox2;
902  int at1[ND_DIMS];
903  int at2[ND_DIMS];
904  double min1[ND_DIMS];
905  double width1[ND_DIMS];
906  double cellsize1[ND_DIMS];
907  int size2[ND_DIMS];
908  double min2[ND_DIMS];
909  double width2[ND_DIMS];
910  double cellsize2[ND_DIMS];
911  int size1[ND_DIMS];
912  int d;
913  double val = 0;
914  float8 selectivity;
915
916  /* Drop out on null inputs */
917  if ( ! ( s1 && s2 ) )
918  {
919  elog(NOTICE, " estimate_join_selectivity called with null inputs");
920  return FALLBACK_ND_SEL;
921  }
922
923  /* We need to know how many cells each side has... */
924  ncells1 = (int)roundf(s1->histogram_cells);
925  ncells2 = (int)roundf(s2->histogram_cells);
926
927  /* ...so that we can drive the summation loop with the smaller histogram. */
928  if ( ncells1 > ncells2 )
929  {
930  const ND_STATS *stats_tmp = s1;
931  s1 = s2;
932  s2 = stats_tmp;
933  }
934
935  POSTGIS_DEBUGF(3, "s1: %s", nd_stats_to_json(s1));
936  POSTGIS_DEBUGF(3, "s2: %s", nd_stats_to_json(s2));
937
938  /* Re-read that info after the swap */
939  ncells1 = (int)roundf(s1->histogram_cells);
940  ncells2 = (int)roundf(s2->histogram_cells);
941
942  /* Q: What's the largest possible join size these relations can create? */
943  /* A: The product of the # of non-null rows in each relation. */
944  ntuples_not_null1 = s1->table_features * (s1->not_null_features / s1->sample_features);
945  ntuples_not_null2 = s2->table_features * (s2->not_null_features / s2->sample_features);
946  ntuples_max = ntuples_not_null1 * ntuples_not_null2;
947
948  /* Get the ndims as ints */
949  ndims1 = (int)roundf(s1->ndims);
950  ndims2 = (int)roundf(s2->ndims);
951  ndims = Max(ndims1, ndims2);
952
953  /* Get the extents */
954  extent1 = s1->extent;
955  extent2 = s2->extent;
956
957  /* If relation stats do not intersect, join is very very selective. */
958  if ( ! nd_box_intersects(&extent1, &extent2, ndims) )
959  {
960  POSTGIS_DEBUG(3, "relation stats do not intersect, returning 0");
961  PG_RETURN_FLOAT8(0.0);
962  }
963
964  /*
965  * First find the index range of the part of the smaller
966  * histogram that overlaps the larger one.
967  */
968  if ( ! nd_box_overlap(s1, &extent2, &ibox1) )
969  {
970  POSTGIS_DEBUG(3, "could not calculate overlap of relations");
971  PG_RETURN_FLOAT8(FALLBACK_ND_JOINSEL);
972  }
973
974  /* Initialize counters / constants on s1 */
975  for ( d = 0; d < ndims1; d++ )
976  {
977  at1[d] = ibox1.min[d];
978  min1[d] = s1->extent.min[d];
979  width1[d] = s1->extent.max[d] - s1->extent.min[d];
980  size1[d] = (int)roundf(s1->size[d]);
981  cellsize1[d] = width1[d] / size1[d];
982  }
983
984  /* Initialize counters / constants on s2 */
985  for ( d = 0; d < ndims2; d++ )
986  {
987  min2[d] = s2->extent.min[d];
988  width2[d] = s2->extent.max[d] - s2->extent.min[d];
989  size2[d] = (int)roundf(s2->size[d]);
990  cellsize2[d] = width2[d] / size2[d];
991  }
992
993  /* For each affected cell of s1... */
994  do
995  {
996  double val1;
997  /* Construct the bounds of this cell */
998  ND_BOX nd_cell1;
999  nd_box_init(&nd_cell1);
1000  for ( d = 0; d < ndims1; d++ )
1001  {
1002  nd_cell1.min[d] = min1[d] + (at1[d]+0) * cellsize1[d];
1003  nd_cell1.max[d] = min1[d] + (at1[d]+1) * cellsize1[d];
1004  }
1005
1006  /* Find the cells of s2 that cell1 overlaps.. */
1007  nd_box_overlap(s2, &nd_cell1, &ibox2);
1008
1009  /* Initialize counter */
1010  for ( d = 0; d < ndims2; d++ )
1011  {
1012  at2[d] = ibox2.min[d];
1013  }
1014
1015  POSTGIS_DEBUGF(3, "at1 %d,%d %s", at1[0], at1[1], nd_box_to_json(&nd_cell1, ndims1));
1016
1017  /* Get the value at this cell */
1018  val1 = s1->value[nd_stats_value_index(s1, at1)];
1019
1020  /* For each overlapped cell of s2... */
1021  do
1022  {
1023  double ratio2;
1024  double val2;
1025
1026  /* Construct the bounds of this cell */
1027  ND_BOX nd_cell2;
1028  nd_box_init(&nd_cell2);
1029  for ( d = 0; d < ndims2; d++ )
1030  {
1031  nd_cell2.min[d] = min2[d] + (at2[d]+0) * cellsize2[d];
1032  nd_cell2.max[d] = min2[d] + (at2[d]+1) * cellsize2[d];
1033  }
1034
1035  POSTGIS_DEBUGF(3, " at2 %d,%d %s", at2[0], at2[1], nd_box_to_json(&nd_cell2, ndims2));
1036
1037  /* Calculate overlap ratio of the cells */
1038  ratio2 = nd_box_ratio(&nd_cell1, &nd_cell2, Max(ndims1, ndims2));
1039
1040  /* Multiply the cell counts, scaled by overlap ratio */
1041  val2 = s2->value[nd_stats_value_index(s2, at2)];
1042  POSTGIS_DEBUGF(3, " val1 %.6g val2 %.6g ratio %.6g", val1, val2, ratio2);
1043  val += val1 * (val2 * ratio2);
1044  }
1045  while ( nd_increment(&ibox2, ndims2, at2) );
1046
1047  }
1048  while( nd_increment(&ibox1, ndims1, at1) );
1049
1050  POSTGIS_DEBUGF(3, "val of histogram = %g", val);
1051
1052  /*
1053  * In order to compare our total cell count "val" to the
1054  * ntuples_max, we need to scale val up to reflect a full
1055  * table estimate. So, multiply by ratio of table size to
1056  * sample size.
1057  */
1058  val *= (s1->table_features / s1->sample_features);
1059  val *= (s2->table_features / s2->sample_features);
1060
1061  POSTGIS_DEBUGF(3, "val scaled to full table size = %g", val);
1062
1063  /*
1064  * Because the cell counts are over-determined due to
1065  * double counting of features that overlap multiple cells
1066  * (see the compute_gserialized_stats routine)
1067  * we also have to scale our cell count "val" *down*
1068  * to adjust for the double counting.
1069  */
1070 // val /= (s1->cells_covered / s1->histogram_features);
1071 // val /= (s2->cells_covered / s2->histogram_features);
1072
1073  /*
1074  * Finally, the selectivity is the estimated number of
1075  * rows to be returned divided by the maximum possible
1076  * number of rows that can be returned.
1077  */
1078  selectivity = val / ntuples_max;
1079
1080  /* Guard against over-estimates :) */
1081  if ( selectivity > 1.0 )
1082  selectivity = 1.0;
1083
1084  return selectivity;
1085 }
static int nd_increment(ND_IBOX *ibox, int ndims, int *counter)
Given an n-d index array (counter), and a domain to increment it in (ibox) increment it by one...
static int nd_box_overlap(const ND_STATS *nd_stats, const ND_BOX *nd_box, ND_IBOX *nd_ibox)
What stats cells overlap with this ND_BOX? Put the lowest cell addresses in ND_IBOX->min and the high...
#define ND_DIMS
The maximum number of dimensions our code can handle.
#define FALLBACK_ND_JOINSEL
#define FALLBACK_ND_SEL
More modest fallback selectivity factor.
static int nd_box_init(ND_BOX *a)
Zero out an ND_BOX.
static int nd_stats_value_index(const ND_STATS *stats, int *indexes)
Given a position in the n-d histogram (i,j,k) return the position in the 1-d values array...
float4 size[ND_DIMS]
static int nd_box_intersects(const ND_BOX *a, const ND_BOX *b, int ndims)
Return TRUE if ND_BOX a overlaps b, false otherwise.
int min[ND_DIMS]
N-dimensional box index type.
static char * nd_box_to_json(const ND_BOX *nd_box, int ndims)
Convert an ND_BOX to a JSON string for printing.
float4 max[ND_DIMS]
float4 min[ND_DIMS]
static char * nd_stats_to_json(const ND_STATS *nd_stats)
Convert an ND_STATS to a JSON representation for external use.
static double nd_box_ratio(const ND_BOX *b1, const ND_BOX *b2, int ndims)
Returns the proportion of b2 that is covered by b1.
N-dimensional statistics structure.
N-dimensional box type for calculations, to avoid doing explicit axis conversions from GBOX in all ca...

Here is the call graph for this function:

Here is the caller graph for this function: