When invoked by the ANALYZE command, gserialized_analyze_nd sets this function as a callback on the stats object.
ANALYZE then gathers the requisite number of sample rows and then calls this function.
We could also pass stats->extra_data in from gserialized_analyze_nd (things like the column type or other stuff from the system catalogs) but so far we don't use that capability.
Our job is to build some statistics on the sample data for use by operator estimators.
We will populate an n-d histogram using the provided sample rows. The selectivity estimators (sel and joinsel) can then use the histogram.
1371 MemoryContext old_context;
1373 int notnull_cnt = 0;
1375 int histogram_features = 0;
1378 size_t nd_stats_size;
1380 double total_width = 0;
1381 double total_cell_count = 0;
1387 const ND_BOX **sample_boxes;
1392 int histo_cells_target;
1394 int histo_cells_new = 1;
1397 int histo_ndims = 0;
1398 double sample_distribution[
ND_DIMS];
1399 double total_distribution;
1417 POSTGIS_DEBUG(2,
"compute_gserialized_stats called");
1418 POSTGIS_DEBUGF(3,
" # sample_rows: %d", sample_rows);
1419 POSTGIS_DEBUGF(3,
" estimate of total_rows: %.6g", total_rows);
1425 sample_boxes = palloc(
sizeof(
ND_BOX*) * sample_rows);
1437 for ( i = 0; i < sample_rows; i++ )
1444 datum = fetchfunc(stats, i, &is_null);
1449 POSTGIS_DEBUGF(4,
" skipped null geometry %d", i);
1458 POSTGIS_DEBUGF(3,
" skipped empty geometry %d", i);
1469 POSTGIS_DEBUGF(3,
" skipped infinite/nan geometry %d", i);
1481 nd_box = palloc(
sizeof(
ND_BOX));
1485 sample_boxes[notnull_cnt] = nd_box;
1488 if ( ! notnull_cnt )
1495 total_width += toast_raw_datum_size(datum);
1498 for ( d = 0; d < ndims; d++ )
1500 sum.
min[d] += nd_box->
min[d];
1501 sum.
max[d] += nd_box->
max[d];
1508 #if POSTGIS_PGSQL_VERSION >= 180
1509 vacuum_delay_point(
true);
1511 vacuum_delay_point();
1522 #if POSTGIS_PGSQL_VERSION >= 170
1523 histo_cells_target = (int)pow((
double)(stats->attstattarget), (
double)ndims);
1524 POSTGIS_DEBUGF(3,
" stats->attstattarget: %d", stats->attstattarget);
1526 histo_cells_target = (int)pow((
double)(stats->attr->attstattarget), (
double)ndims);
1527 POSTGIS_DEBUGF(3,
" stats->attr->attstattarget: %d", stats->attr->attstattarget);
1529 histo_cells_target = Min(histo_cells_target, ndims * 100000);
1530 histo_cells_target = Min(histo_cells_target, (
int)(10 * ndims * total_rows));
1531 POSTGIS_DEBUGF(3,
" target # of histogram cells: %d", histo_cells_target);
1534 if ( ! notnull_cnt )
1536 stats->stats_valid =
false;
1540 POSTGIS_DEBUGF(3,
" sample_extent: %s",
nd_box_to_json(&sample_extent, ndims));
1546 for ( d = 0; d < ndims; d++ )
1549 avg.
min[d] = sum.
min[d] / notnull_cnt;
1550 avg.
max[d] = sum.
max[d] / notnull_cnt;
1553 for ( i = 0; i < notnull_cnt; i++ )
1555 const ND_BOX *ndb = sample_boxes[i];
1556 stddev.
min[d] += (ndb->
min[d] - avg.
min[d]) * (ndb->
min[d] - avg.
min[d]);
1557 stddev.
max[d] += (ndb->
max[d] - avg.
max[d]) * (ndb->
max[d] - avg.
max[d]);
1559 stddev.
min[d] = sqrt(stddev.
min[d] / notnull_cnt);
1560 stddev.
max[d] = sqrt(stddev.
max[d] / notnull_cnt);
1573 for ( i = 0; i < notnull_cnt; i++ )
1575 const ND_BOX *ndb = sample_boxes[i];
1579 POSTGIS_DEBUGF(4,
" feature %d is a hard deviant, skipped", i);
1580 sample_boxes[i] = NULL;
1591 histo_extent = histo_extent_new;
1606 sample_distribution);
1622 for ( d = 0; d < ndims; d++ )
1624 if ( sample_distribution[d] > 0 )
1628 if ( histo_ndims == 0 )
1632 POSTGIS_DEBUG(3,
" special case: no axes have variability");
1633 histo_cells_new = 1;
1634 for ( d = 0; d < ndims; d++ )
1636 histo_size[d] = (int)pow((
double)histo_cells_target, 1/(double)ndims);
1637 if ( ! histo_size[d] )
1639 POSTGIS_DEBUGF(3,
" histo_size[d]: %d", histo_size[d]);
1640 histo_cells_new *= histo_size[d];
1642 POSTGIS_DEBUGF(3,
" histo_cells_new: %d", histo_cells_new);
1651 POSTGIS_DEBUG(3,
" allocating histogram axes based on axis variability");
1652 total_distribution =
total_double(sample_distribution, ndims);
1653 POSTGIS_DEBUGF(3,
" total_distribution: %.8g", total_distribution);
1654 histo_cells_new = 1;
1655 for ( d = 0; d < ndims; d++ )
1657 if ( sample_distribution[d] == 0 )
1664 float edge_ratio = (float)sample_distribution[d] / (
float)total_distribution;
1670 histo_size[d] = (int)pow((
double)histo_cells_target * histo_ndims * edge_ratio, 1/(double)histo_ndims);
1672 if ( ! histo_size[d] )
1675 histo_cells_new *= histo_size[d];
1677 POSTGIS_DEBUGF(3,
" histo_cells_new: %d", histo_cells_new);
1681 histo_cells = histo_cells_new;
1682 POSTGIS_DEBUGF(3,
" histo_cells: %d", histo_cells);
1687 old_context = MemoryContextSwitchTo(stats->anl_context);
1688 nd_stats_size =
sizeof(
ND_STATS) + ((histo_cells - 1) *
sizeof(float4));
1689 nd_stats = palloc(nd_stats_size);
1690 memset(nd_stats, 0, nd_stats_size);
1691 MemoryContextSwitchTo(old_context);
1694 nd_stats->
ndims = ndims;
1695 nd_stats->
extent = histo_extent;
1700 for ( d = 0; d < ndims; d++ )
1701 nd_stats->
size[d] = histo_size[d];
1716 for ( i = 0; i < notnull_cnt; i++ )
1721 double num_cells = 0;
1722 double min[
ND_DIMS] = {0.0, 0.0, 0.0, 0.0};
1723 double max[
ND_DIMS] = {0.0, 0.0, 0.0, 0.0};
1724 double cellsize[
ND_DIMS] = {0.0, 0.0, 0.0, 0.0};
1726 nd_box = sample_boxes[i];
1727 if ( ! nd_box )
continue;
1730 #if POSTGIS_PGSQL_VERSION >= 180
1731 vacuum_delay_point(
true);
1733 vacuum_delay_point();
1738 memset(at, 0,
sizeof(
int)*
ND_DIMS);
1740 POSTGIS_DEBUGF(3,
" feature %d: ibox (%d, %d, %d, %d) (%d, %d, %d, %d)", i,
1741 nd_ibox.
min[0], nd_ibox.
min[1], nd_ibox.
min[2], nd_ibox.
min[3],
1742 nd_ibox.
max[0], nd_ibox.
max[1], nd_ibox.
max[2], nd_ibox.
max[3]);
1744 for ( d = 0; d < nd_stats->
ndims; d++ )
1747 at[d] = nd_ibox.
min[d];
1750 cellsize[d] = (max[d] - min[d])/(nd_stats->
size[d]);
1759 ND_BOX nd_cell = { {0.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 0.0} };
1762 for ( d = 0; d < nd_stats->
ndims; d++ )
1764 nd_cell.
min[d] = min[d] + (at[d]+0) * cellsize[d];
1765 nd_cell.
max[d] = min[d] + (at[d]+1) * cellsize[d];
1776 POSTGIS_DEBUGF(3,
" ratio (%.8g) num_cells (%.8g)", ratio, num_cells);
1777 POSTGIS_DEBUGF(3,
" at (%d, %d, %d, %d)", at[0], at[1], at[2], at[3]);
1782 total_cell_count += num_cells;
1784 histogram_features++;
1787 POSTGIS_DEBUGF(3,
" histogram_features: %d", histogram_features);
1788 POSTGIS_DEBUGF(3,
" sample_rows: %d", sample_rows);
1789 POSTGIS_DEBUGF(3,
" table_rows: %.6g", total_rows);
1792 if ( ! histogram_features )
1794 POSTGIS_DEBUG(3,
" no stats have been gathered");
1795 elog(NOTICE,
" no features lie in the stats histogram, invalid stats");
1796 stats->stats_valid =
false;
1817 stats->stakind[stats_slot] = stats_kind;
1818 stats->staop[stats_slot] = InvalidOid;
1819 stats->stanumbers[stats_slot] = (float4*)nd_stats;
1820 stats->numnumbers[stats_slot] = nd_stats_size/
sizeof(float4);
1821 stats->stanullfrac = (float4)null_cnt/sample_rows;
1822 stats->stawidth = total_width/notnull_cnt;
1823 stats->stadistinct = -1.0;
1824 stats->stats_valid =
true;
1826 POSTGIS_DEBUGF(3,
" out: slot 0: kind %d (STATISTIC_KIND_ND)", stats->stakind[0]);
1827 POSTGIS_DEBUGF(3,
" out: slot 0: op %d (InvalidOid)", stats->staop[0]);
1828 POSTGIS_DEBUGF(3,
" out: slot 0: numnumbers %d", stats->numnumbers[0]);
1829 POSTGIS_DEBUGF(3,
" out: null fraction: %f=%d/%d", stats->stanullfrac, null_cnt, sample_rows);
1830 POSTGIS_DEBUGF(3,
" out: average width: %d bytes", stats->stawidth);
1831 POSTGIS_DEBUG (3,
" out: distinct values: all (no check done)");
int gbox_is_valid(const GBOX *gbox)
Return false if any of the dimensions is NaN or infinite.
struct ND_STATS_T ND_STATS
N-dimensional statistics structure.
static char * nd_stats_to_json(const ND_STATS *nd_stats)
Convert an ND_STATS to a JSON representation for external use.
static int nd_box_intersects(const ND_BOX *a, const ND_BOX *b, int ndims)
Return true if ND_BOX a overlaps b, false otherwise.
static int nd_box_init_bounds(ND_BOX *a)
Prepare an ND_BOX for bounds calculation: set the maxes to the smallest thing possible and the mins t...
static int nd_increment(ND_IBOX *ibox, int ndims, int *counter)
Given an n-d index array (counter), and a domain to increment it in (ibox) increment it by one,...
#define STATISTIC_SLOT_ND
static int gbox_ndims(const GBOX *gbox)
Given that geodetic boxes are X/Y/Z regardless of the underlying geometry dimensionality and other bo...
#define ND_DIMS
The maximum number of dimensions our code can handle.
#define STATISTIC_KIND_2D
static int nd_box_merge(const ND_BOX *source, ND_BOX *target)
Expand the bounds of target to include source.
#define STATISTIC_KIND_ND
static double total_double(const double *vals, int nvals)
Given double array, return sum of values.
static void nd_box_from_gbox(const GBOX *gbox, ND_BOX *nd_box)
Set the values of an ND_BOX from a GBOX.
static int nd_box_init(ND_BOX *a)
Zero out an ND_BOX.
static int nd_box_expand(ND_BOX *nd_box, double expansion_factor)
Expand an ND_BOX ever so slightly.
static int nd_box_overlap(const ND_STATS *nd_stats, const ND_BOX *nd_box, ND_IBOX *nd_ibox)
What stats cells overlap with this ND_BOX? Put the lowest cell addresses in ND_IBOX->min and the high...
static double nd_box_ratio(const ND_BOX *b1, const ND_BOX *b2, int ndims)
Returns the proportion of b2 that is covered by b1.
static int nd_stats_value_index(const ND_STATS *stats, int *indexes)
Given a position in the n-d histogram (i,j,k) return the position in the 1-d values array.
static char * nd_box_to_json(const ND_BOX *nd_box, int ndims)
Convert an ND_BOX to a JSON string for printing.
#define STATISTIC_SLOT_2D
static int nd_box_array_distribution(const ND_BOX **nd_boxes, int num_boxes, const ND_BOX *extent, int ndims, double *distribution)
Calculate how much a set of boxes is homogeneously distributed or concentrated within one dimension,...
int gserialized_datum_get_gbox_p(Datum gsdatum, GBOX *gbox)
Given a GSERIALIZED datum, as quickly as possible (peeking into the top of the memory) return the gbo...
N-dimensional box type for calculations, to avoid doing explicit axis conversions from GBOX in all ca...
N-dimensional box index type.
float4 histogram_features
N-dimensional statistics structure.