The gserialized_analyze_nd function, when it is called by the ANALYZE command, sets this function as a callback on the stats object.
ANALYZE then gathers the requisite number of sample rows and then calls this function.
We could also pass stats->extra_data in from gserialized_analyze_nd (things like the column type or other stuff from the system catalogs) but so far we don't use that capability.
Our job is to build some statistics on the sample data for use by operator estimators.
We will populate an n-d histogram using the provided sample rows. The selectivity estimators (sel and joinsel) can then use the histogram.
1374 MemoryContext old_context;
1376 int notnull_cnt = 0;
1378 int histogram_features = 0;
1381 size_t nd_stats_size;
1383 double total_width = 0;
1384 double total_cell_count = 0;
1390 const ND_BOX **sample_boxes;
1395 int histo_cells_target;
1397 int histo_cells_new = 1;
1400 int histo_ndims = 0;
1401 double sample_distribution[
ND_DIMS];
1402 double total_distribution;
1420 POSTGIS_DEBUG(2,
"compute_gserialized_stats called");
1421 POSTGIS_DEBUGF(3,
" # sample_rows: %d", sample_rows);
1422 POSTGIS_DEBUGF(3,
" estimate of total_rows: %.6g", total_rows);
1428 sample_boxes = palloc(
sizeof(
ND_BOX*) * sample_rows);
1440 for ( i = 0; i < sample_rows; i++ )
1447 datum = fetchfunc(stats, i, &is_null);
1452 POSTGIS_DEBUGF(4,
" skipped null geometry %d", i);
1461 POSTGIS_DEBUGF(3,
" skipped empty geometry %d", i);
1472 POSTGIS_DEBUGF(3,
" skipped infinite/nan geometry %d", i);
1484 nd_box = palloc(
sizeof(
ND_BOX));
1488 sample_boxes[notnull_cnt] = nd_box;
1491 if ( ! notnull_cnt )
1498 total_width += toast_raw_datum_size(datum);
1501 for ( d = 0; d < ndims; d++ )
1503 sum.
min[d] += nd_box->
min[d];
1504 sum.
max[d] += nd_box->
max[d];
1511 vacuum_delay_point();
1521 #if POSTGIS_PGSQL_VERSION >= 170
1522 histo_cells_target = (int)pow((
double)(stats->attstattarget), (
double)ndims);
1523 POSTGIS_DEBUGF(3,
" stats->attstattarget: %d", stats->attstattarget);
1525 histo_cells_target = (int)pow((
double)(stats->attr->attstattarget), (
double)ndims);
1526 POSTGIS_DEBUGF(3,
" stats->attr->attstattarget: %d", stats->attr->attstattarget);
1528 histo_cells_target = Min(histo_cells_target, ndims * 100000);
1529 histo_cells_target = Min(histo_cells_target, (
int)(10 * ndims * total_rows));
1530 POSTGIS_DEBUGF(3,
" target # of histogram cells: %d", histo_cells_target);
1533 if ( ! notnull_cnt )
1535 stats->stats_valid =
false;
1539 POSTGIS_DEBUGF(3,
" sample_extent: %s",
nd_box_to_json(&sample_extent, ndims));
1545 for ( d = 0; d < ndims; d++ )
1548 avg.
min[d] = sum.
min[d] / notnull_cnt;
1549 avg.
max[d] = sum.
max[d] / notnull_cnt;
1552 for ( i = 0; i < notnull_cnt; i++ )
1554 const ND_BOX *ndb = sample_boxes[i];
1555 stddev.
min[d] += (ndb->
min[d] - avg.
min[d]) * (ndb->
min[d] - avg.
min[d]);
1556 stddev.
max[d] += (ndb->
max[d] - avg.
max[d]) * (ndb->
max[d] - avg.
max[d]);
1558 stddev.
min[d] = sqrt(stddev.
min[d] / notnull_cnt);
1559 stddev.
max[d] = sqrt(stddev.
max[d] / notnull_cnt);
1572 for ( i = 0; i < notnull_cnt; i++ )
1574 const ND_BOX *ndb = sample_boxes[i];
1578 POSTGIS_DEBUGF(4,
" feature %d is a hard deviant, skipped", i);
1579 sample_boxes[i] = NULL;
1590 histo_extent = histo_extent_new;
1605 sample_distribution);
1621 for ( d = 0; d < ndims; d++ )
1623 if ( sample_distribution[d] > 0 )
1627 if ( histo_ndims == 0 )
1631 POSTGIS_DEBUG(3,
" special case: no axes have variability");
1632 histo_cells_new = 1;
1633 for ( d = 0; d < ndims; d++ )
1635 histo_size[d] = (int)pow((
double)histo_cells_target, 1/(double)ndims);
1636 if ( ! histo_size[d] )
1638 POSTGIS_DEBUGF(3,
" histo_size[d]: %d", histo_size[d]);
1639 histo_cells_new *= histo_size[d];
1641 POSTGIS_DEBUGF(3,
" histo_cells_new: %d", histo_cells_new);
1650 POSTGIS_DEBUG(3,
" allocating histogram axes based on axis variability");
1651 total_distribution =
total_double(sample_distribution, ndims);
1652 POSTGIS_DEBUGF(3,
" total_distribution: %.8g", total_distribution);
1653 histo_cells_new = 1;
1654 for ( d = 0; d < ndims; d++ )
1656 if ( sample_distribution[d] == 0 )
1663 float edge_ratio = (float)sample_distribution[d] / (
float)total_distribution;
1669 histo_size[d] = (int)pow(histo_cells_target * histo_ndims * edge_ratio, 1/(
double)histo_ndims);
1671 if ( ! histo_size[d] )
1674 histo_cells_new *= histo_size[d];
1676 POSTGIS_DEBUGF(3,
" histo_cells_new: %d", histo_cells_new);
1680 histo_cells = histo_cells_new;
1681 POSTGIS_DEBUGF(3,
" histo_cells: %d", histo_cells);
1686 old_context = MemoryContextSwitchTo(stats->anl_context);
1687 nd_stats_size =
sizeof(
ND_STATS) + ((histo_cells - 1) *
sizeof(float4));
1688 nd_stats = palloc(nd_stats_size);
1689 memset(nd_stats, 0, nd_stats_size);
1690 MemoryContextSwitchTo(old_context);
1693 nd_stats->
ndims = ndims;
1694 nd_stats->
extent = histo_extent;
1699 for ( d = 0; d < ndims; d++ )
1700 nd_stats->
size[d] = histo_size[d];
1715 for ( i = 0; i < notnull_cnt; i++ )
1720 double num_cells = 0;
1721 double min[
ND_DIMS] = {0.0, 0.0, 0.0, 0.0};
1722 double max[
ND_DIMS] = {0.0, 0.0, 0.0, 0.0};
1723 double cellsize[
ND_DIMS] = {0.0, 0.0, 0.0, 0.0};
1725 nd_box = sample_boxes[i];
1726 if ( ! nd_box )
continue;
1729 vacuum_delay_point();
1733 memset(at, 0,
sizeof(
int)*
ND_DIMS);
1735 POSTGIS_DEBUGF(3,
" feature %d: ibox (%d, %d, %d, %d) (%d, %d, %d, %d)", i,
1736 nd_ibox.
min[0], nd_ibox.
min[1], nd_ibox.
min[2], nd_ibox.
min[3],
1737 nd_ibox.
max[0], nd_ibox.
max[1], nd_ibox.
max[2], nd_ibox.
max[3]);
1739 for ( d = 0; d < nd_stats->
ndims; d++ )
1742 at[d] = nd_ibox.
min[d];
1745 cellsize[d] = (max[d] - min[d])/(nd_stats->
size[d]);
1754 ND_BOX nd_cell = { {0.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 0.0} };
1757 for ( d = 0; d < nd_stats->
ndims; d++ )
1759 nd_cell.
min[d] = min[d] + (at[d]+0) * cellsize[d];
1760 nd_cell.
max[d] = min[d] + (at[d]+1) * cellsize[d];
1771 POSTGIS_DEBUGF(3,
" ratio (%.8g) num_cells (%.8g)", ratio, num_cells);
1772 POSTGIS_DEBUGF(3,
" at (%d, %d, %d, %d)", at[0], at[1], at[2], at[3]);
1777 total_cell_count += num_cells;
1779 histogram_features++;
1782 POSTGIS_DEBUGF(3,
" histogram_features: %d", histogram_features);
1783 POSTGIS_DEBUGF(3,
" sample_rows: %d", sample_rows);
1784 POSTGIS_DEBUGF(3,
" table_rows: %.6g", total_rows);
1787 if ( ! histogram_features )
1789 POSTGIS_DEBUG(3,
" no stats have been gathered");
1790 elog(NOTICE,
" no features lie in the stats histogram, invalid stats");
1791 stats->stats_valid =
false;
1812 stats->stakind[stats_slot] = stats_kind;
1813 stats->staop[stats_slot] = InvalidOid;
1814 stats->stanumbers[stats_slot] = (float4*)nd_stats;
1815 stats->numnumbers[stats_slot] = nd_stats_size/
sizeof(float4);
1816 stats->stanullfrac = (float4)null_cnt/sample_rows;
1817 stats->stawidth = total_width/notnull_cnt;
1818 stats->stadistinct = -1.0;
1819 stats->stats_valid =
true;
1821 POSTGIS_DEBUGF(3,
" out: slot 0: kind %d (STATISTIC_KIND_ND)", stats->stakind[0]);
1822 POSTGIS_DEBUGF(3,
" out: slot 0: op %d (InvalidOid)", stats->staop[0]);
1823 POSTGIS_DEBUGF(3,
" out: slot 0: numnumbers %d", stats->numnumbers[0]);
1824 POSTGIS_DEBUGF(3,
" out: null fraction: %f=%d/%d", stats->stanullfrac, null_cnt, sample_rows);
1825 POSTGIS_DEBUGF(3,
" out: average width: %d bytes", stats->stawidth);
1826 POSTGIS_DEBUG (3,
" out: distinct values: all (no check done)");
int gbox_is_valid(const GBOX *gbox)
Return false if any of the dimensions is NaN or infinite.
struct ND_STATS_T ND_STATS
N-dimensional statistics structure.
static char * nd_stats_to_json(const ND_STATS *nd_stats)
Convert an ND_STATS to a JSON representation for external use.
static int nd_box_intersects(const ND_BOX *a, const ND_BOX *b, int ndims)
Return true if ND_BOX a overlaps b, false otherwise.
static int nd_box_init_bounds(ND_BOX *a)
Prepare an ND_BOX for bounds calculation: set the maxes to the smallest thing possible and the mins to the largest.
static int nd_increment(ND_IBOX *ibox, int ndims, int *counter)
Given an n-d index array (counter), and a domain to increment it in (ibox) increment it by one,...
#define STATISTIC_SLOT_ND
static int gbox_ndims(const GBOX *gbox)
Given that geodetic boxes are X/Y/Z regardless of the underlying geometry dimensionality and other bo...
#define ND_DIMS
The maximum number of dimensions our code can handle.
#define STATISTIC_KIND_2D
static int nd_box_merge(const ND_BOX *source, ND_BOX *target)
Expand the bounds of target to include source.
#define STATISTIC_KIND_ND
static double total_double(const double *vals, int nvals)
Given double array, return sum of values.
static void nd_box_from_gbox(const GBOX *gbox, ND_BOX *nd_box)
Set the values of an ND_BOX from a GBOX.
static int nd_box_init(ND_BOX *a)
Zero out an ND_BOX.
static int nd_box_expand(ND_BOX *nd_box, double expansion_factor)
Expand an ND_BOX ever so slightly.
static int nd_box_overlap(const ND_STATS *nd_stats, const ND_BOX *nd_box, ND_IBOX *nd_ibox)
What stats cells overlap with this ND_BOX? Put the lowest cell addresses in ND_IBOX->min and the highest in ND_IBOX->max.
static double nd_box_ratio(const ND_BOX *b1, const ND_BOX *b2, int ndims)
Returns the proportion of b2 that is covered by b1.
static int nd_stats_value_index(const ND_STATS *stats, int *indexes)
Given a position in the n-d histogram (i,j,k) return the position in the 1-d values array.
static char * nd_box_to_json(const ND_BOX *nd_box, int ndims)
Convert an ND_BOX to a JSON string for printing.
#define STATISTIC_SLOT_2D
static int nd_box_array_distribution(const ND_BOX **nd_boxes, int num_boxes, const ND_BOX *extent, int ndims, double *distribution)
Calculate how much a set of boxes is homogeneously distributed or concentrated within one dimension,...
int gserialized_datum_get_gbox_p(Datum gsdatum, GBOX *gbox)
Given a GSERIALIZED datum, as quickly as possible (peeking into the top of the memory) return the gbo...
N-dimensional box type for calculations, to avoid doing explicit axis conversions from GBOX in all ca...
N-dimensional box index type.
float4 histogram_features
N-dimensional statistics structure.