When the ANALYZE command calls gserialized_analyze_nd, that function sets this function as a callback on the stats object.
ANALYZE then gathers the requisite number of sample rows and then calls this function.
We could also pass stats->extra_data in from gserialized_analyze_nd (things like the column type or other stuff from the system catalogs) but so far we don't use that capability.
Our job is to build some statistics on the sample data for use by operator estimators.
We will populate an n-d histogram using the provided sample rows. The selectivity estimators (sel and joinsel) can then use the histogram.
1328 MemoryContext old_context;
1330 int notnull_cnt = 0;
1332 int histogram_features = 0;
1335 size_t nd_stats_size;
1337 double total_width = 0;
1338 double total_sample_volume = 0;
1339 double total_cell_count = 0;
1345 const ND_BOX **sample_boxes;
1350 int histo_cells_target;
1352 int histo_cells_new = 1;
1355 int histo_ndims = 0;
1356 double sample_distribution[
ND_DIMS];
1357 double total_distribution;
1375 POSTGIS_DEBUG(2,
"compute_gserialized_stats called");
1376 POSTGIS_DEBUGF(3,
" # sample_rows: %d", sample_rows);
1377 POSTGIS_DEBUGF(3,
" estimate of total_rows: %.6g", total_rows);
1383 sample_boxes = palloc(
sizeof(
ND_BOX*) * sample_rows);
1395 for ( i = 0; i < sample_rows; i++ )
1404 datum = fetchfunc(stats, i, &is_null);
1409 POSTGIS_DEBUGF(4,
" skipped null geometry %d", i);
1416 is_copy = VARATT_IS_EXTENDED(datum);
1420 POSTGIS_DEBUGF(3,
" skipped empty geometry %d", i);
1431 POSTGIS_DEBUGF(3,
" skipped infinite/nan geometry %d", i);
1443 nd_box = palloc(
sizeof(
ND_BOX));
1447 sample_boxes[notnull_cnt] = nd_box;
1450 if ( ! notnull_cnt )
1457 total_width += VARSIZE(geom);
1460 for ( d = 0; d < ndims; d++ )
1462 sum.
min[d] += nd_box->
min[d];
1463 sum.
max[d] += nd_box->
max[d];
1474 vacuum_delay_point();
1485 histo_cells_target = (int)pow((
double)(stats->attr->attstattarget), (
double)ndims);
1486 histo_cells_target = Min(histo_cells_target, ndims * 10000);
1487 histo_cells_target = Min(histo_cells_target, (
int)(total_rows/5));
1488 POSTGIS_DEBUGF(3,
" stats->attr->attstattarget: %d", stats->attr->attstattarget);
1489 POSTGIS_DEBUGF(3,
" target # of histogram cells: %d", histo_cells_target);
1492 if ( ! notnull_cnt )
1494 elog(NOTICE,
"no non-null/empty features, unable to compute statistics");
1495 stats->stats_valid =
false;
1499 POSTGIS_DEBUGF(3,
" sample_extent: %s",
nd_box_to_json(&sample_extent, ndims));
1505 for ( d = 0; d < ndims; d++ )
1508 avg.
min[d] = sum.
min[d] / notnull_cnt;
1509 avg.
max[d] = sum.
max[d] / notnull_cnt;
1512 for ( i = 0; i < notnull_cnt; i++ )
1514 const ND_BOX *ndb = sample_boxes[i];
1515 stddev.
min[d] += (ndb->
min[d] - avg.
min[d]) * (ndb->
min[d] - avg.
min[d]);
1516 stddev.
max[d] += (ndb->
max[d] - avg.
max[d]) * (ndb->
max[d] - avg.
max[d]);
1518 stddev.
min[d] = sqrt(stddev.
min[d] / notnull_cnt);
1519 stddev.
max[d] = sqrt(stddev.
max[d] / notnull_cnt);
1532 for ( i = 0; i < notnull_cnt; i++ )
1534 const ND_BOX *ndb = sample_boxes[i];
1538 POSTGIS_DEBUGF(4,
" feature %d is a hard deviant, skipped", i);
1539 sample_boxes[i] = NULL;
1550 histo_extent = histo_extent_new;
1565 sample_distribution);
1581 for ( d = 0; d < ndims; d++ )
1583 if ( sample_distribution[d] > 0 )
1587 if ( histo_ndims == 0 )
1591 POSTGIS_DEBUG(3,
" special case: no axes have variability");
1592 histo_cells_new = 1;
1593 for ( d = 0; d < ndims; d++ )
1595 histo_size[d] = 1 + (int)pow((
double)histo_cells_target, 1/(double)ndims);
1596 POSTGIS_DEBUGF(3,
" histo_size[d]: %d", histo_size[d]);
1597 histo_cells_new *= histo_size[d];
1599 POSTGIS_DEBUGF(3,
" histo_cells_new: %d", histo_cells_new);
1608 POSTGIS_DEBUG(3,
" allocating histogram axes based on axis variability");
1609 total_distribution =
total_double(sample_distribution, ndims);
1610 POSTGIS_DEBUGF(3,
" total_distribution: %.8g", total_distribution);
1611 histo_cells_new = 1;
1612 for ( d = 0; d < ndims; d++ )
1614 if ( sample_distribution[d] == 0 )
1621 float edge_ratio = (float)sample_distribution[d] / (
float)total_distribution;
1627 histo_size[d] = (int)pow(histo_cells_target * histo_ndims * edge_ratio, 1/(
double)histo_ndims);
1629 if ( ! histo_size[d] )
1632 histo_cells_new *= histo_size[d];
1634 POSTGIS_DEBUGF(3,
" histo_cells_new: %d", histo_cells_new);
1638 histo_cells = histo_cells_new;
1639 POSTGIS_DEBUGF(3,
" histo_cells: %d", histo_cells);
1644 old_context = MemoryContextSwitchTo(stats->anl_context);
1645 nd_stats_size =
sizeof(
ND_STATS) + ((histo_cells - 1) *
sizeof(float4));
1646 nd_stats = palloc(nd_stats_size);
1647 memset(nd_stats, 0, nd_stats_size);
1648 MemoryContextSwitchTo(old_context);
1651 nd_stats->
ndims = ndims;
1652 nd_stats->
extent = histo_extent;
1657 for ( d = 0; d < ndims; d++ )
1658 nd_stats->
size[d] = histo_size[d];
1673 for ( i = 0; i < notnull_cnt; i++ )
1679 double num_cells = 0;
1680 double tmp_volume = 1.0;
1685 nd_box = sample_boxes[i];
1686 if ( ! nd_box )
continue;
1689 vacuum_delay_point();
1693 memset(at, 0,
sizeof(
int)*
ND_DIMS);
1695 POSTGIS_DEBUGF(3,
" feature %d: ibox (%d, %d, %d, %d) (%d, %d, %d, %d)", i,
1696 nd_ibox.
min[0], nd_ibox.
min[1], nd_ibox.
min[2], nd_ibox.
min[3],
1697 nd_ibox.
max[0], nd_ibox.
max[1], nd_ibox.
max[2], nd_ibox.
max[3]);
1699 for ( d = 0; d < nd_stats->
ndims; d++ )
1702 at[d] = nd_ibox.
min[d];
1705 cellsize[d] = (max[d] - min[d])/(nd_stats->
size[d]);
1708 tmp_volume *= (nd_box->
max[d] - nd_box->
min[d]);
1712 total_sample_volume += tmp_volume;
1723 for ( d = 0; d < nd_stats->
ndims; d++ )
1725 nd_cell.
min[d] = min[d] + (at[d]+0) * cellsize[d];
1726 nd_cell.
max[d] = min[d] + (at[d]+1) * cellsize[d];
1737 POSTGIS_DEBUGF(3,
" ratio (%.8g) num_cells (%.8g)", ratio, num_cells);
1738 POSTGIS_DEBUGF(3,
" at (%d, %d, %d, %d)", at[0], at[1], at[2], at[3]);
1743 total_cell_count += num_cells;
1745 histogram_features++;
1748 POSTGIS_DEBUGF(3,
" histogram_features: %d", histogram_features);
1749 POSTGIS_DEBUGF(3,
" sample_rows: %d", sample_rows);
1750 POSTGIS_DEBUGF(3,
" table_rows: %.6g", total_rows);
1753 if ( ! histogram_features )
1755 POSTGIS_DEBUG(3,
" no stats have been gathered");
1756 elog(NOTICE,
" no features lie in the stats histogram, invalid stats");
1757 stats->stats_valid =
false;
1778 stats->stakind[stats_slot] = stats_kind;
1779 stats->staop[stats_slot] = InvalidOid;
1780 stats->stanumbers[stats_slot] = (float4*)nd_stats;
1781 stats->numnumbers[stats_slot] = nd_stats_size/
sizeof(float4);
1782 stats->stanullfrac = (float4)null_cnt/sample_rows;
1783 stats->stawidth = total_width/notnull_cnt;
1784 stats->stadistinct = -1.0;
1785 stats->stats_valid =
true;
1787 POSTGIS_DEBUGF(3,
" out: slot 0: kind %d (STATISTIC_KIND_ND)", stats->stakind[0]);
1788 POSTGIS_DEBUGF(3,
" out: slot 0: op %d (InvalidOid)", stats->staop[0]);
1789 POSTGIS_DEBUGF(3,
" out: slot 0: numnumbers %d", stats->numnumbers[0]);
1790 POSTGIS_DEBUGF(3,
" out: null fraction: %f=%d/%d", stats->stanullfrac, null_cnt, sample_rows);
1791 POSTGIS_DEBUGF(3,
" out: average width: %d bytes", stats->stawidth);
1792 POSTGIS_DEBUG (3,
" out: distinct values: all (no check done)");
int gserialized_get_gbox_p(const GSERIALIZED *g, GBOX *box)
Read the bounding box off a serialization and calculate one if it is not already there.
static int nd_increment(ND_IBOX *ibox, int ndims, int *counter)
Given an n-d index array (counter), and a domain to increment it in (ibox) increment it by one...
static int nd_box_overlap(const ND_STATS *nd_stats, const ND_BOX *nd_box, ND_IBOX *nd_ibox)
What stats cells overlap with this ND_BOX? Put the lowest cell addresses in ND_IBOX->min and the high...
int gbox_is_valid(const GBOX *gbox)
Return false if any of the dimensions is NaN or infinite.
#define ND_DIMS
The maximum number of dimensions our code can handle.
#define STATISTIC_SLOT_2D
static void nd_box_from_gbox(const GBOX *gbox, ND_BOX *nd_box)
Set the values of an ND_BOX from a GBOX.
static int nd_box_init(ND_BOX *a)
Zero out an ND_BOX.
static int nd_box_array_distribution(const ND_BOX **nd_boxes, int num_boxes, const ND_BOX *extent, int ndims, double *distribution)
Calculate how much a set of boxes is homogeneously distributed or concentrated within one dimension...
static int nd_stats_value_index(const ND_STATS *stats, int *indexes)
Given a position in the n-d histogram (i,j,k) return the position in the 1-d values array...
static int nd_box_intersects(const ND_BOX *a, const ND_BOX *b, int ndims)
Return TRUE if ND_BOX a overlaps b, false otherwise.
static double total_double(const double *vals, int nvals)
Given double array, return sum of values.
static int nd_box_init_bounds(ND_BOX *a)
Prepare an ND_BOX for bounds calculation: set the maxes to the smallest thing possible and the mins t...
N-dimensional box index type.
float4 histogram_features
static char * nd_box_to_json(const ND_BOX *nd_box, int ndims)
Convert an ND_BOX to a JSON string for printing.
#define STATISTIC_KIND_2D
struct ND_STATS_T ND_STATS
N-dimensional statistics structure.
static int nd_box_merge(const ND_BOX *source, ND_BOX *target)
Create a printable view of the ND_STATS histogram.
static int nd_box_expand(ND_BOX *nd_box, double expansion_factor)
Expand an ND_BOX ever so slightly.
static char * nd_stats_to_json(const ND_STATS *nd_stats)
Convert an ND_STATS to a JSON representation for external use.
#define STATISTIC_SLOT_ND
static double nd_box_ratio(const ND_BOX *b1, const ND_BOX *b2, int ndims)
Returns the proportion of b2 that is covered by b1.
N-dimensional statistics structure.
N-dimensional box type for calculations, to avoid doing explicit axis conversions from GBOX in all ca...
static int gbox_ndims(const GBOX *gbox)
Given that geodetic boxes are X/Y/Z regardless of the underlying geometry dimensionality and other bo...
#define STATISTIC_KIND_ND
Assign a number to the n-dimensional statistics kind.