The gserialized_analyze_nd function sets this function as a callback on the stats object when it is called by the ANALYZE command.
ANALYZE then gathers the requisite number of sample rows and then calls this function.
We could also pass stats->extra_data in from gserialized_analyze_nd (things like the column type or other stuff from the system catalogs) but so far we don't use that capability.
Our job is to build some statistics on the sample data for use by operator estimators.
We will populate an n-d histogram using the provided sample rows. The selectivity estimators (sel and joinsel) can then use the histogram.
1380 MemoryContext old_context;
1382 int notnull_cnt = 0;
1384 int histogram_features = 0;
1387 size_t nd_stats_size;
1389 double total_width = 0;
1390 double total_cell_count = 0;
1396 const ND_BOX **sample_boxes;
1401 int histo_cells_target;
1403 int histo_cells_new = 1;
1406 int histo_ndims = 0;
1407 double sample_distribution[
ND_DIMS];
1408 double total_distribution;
1426 POSTGIS_DEBUG(2,
"compute_gserialized_stats called");
1427 POSTGIS_DEBUGF(3,
" # sample_rows: %d", sample_rows);
1428 POSTGIS_DEBUGF(3,
" estimate of total_rows: %.6g", total_rows);
1434 sample_boxes = palloc(
sizeof(
ND_BOX*) * sample_rows);
1446 for ( i = 0; i < sample_rows; i++ )
1453 datum = fetchfunc(stats, i, &is_null);
1458 POSTGIS_DEBUGF(4,
" skipped null geometry %d", i);
1467 POSTGIS_DEBUGF(3,
" skipped empty geometry %d", i);
1478 POSTGIS_DEBUGF(3,
" skipped infinite/nan geometry %d", i);
1490 nd_box = palloc(
sizeof(
ND_BOX));
1494 sample_boxes[notnull_cnt] = nd_box;
1497 if ( ! notnull_cnt )
1504 total_width += toast_raw_datum_size(datum);
1507 for ( d = 0; d < ndims; d++ )
1509 sum.
min[d] += nd_box->
min[d];
1510 sum.
max[d] += nd_box->
max[d];
1517 vacuum_delay_point();
1528 #if POSTGIS_PGSQL_VERSION >= 170
1529 histo_cells_target = (int)pow((
double)(stats->attstattarget), (
double)ndims);
1531 histo_cells_target = (int)pow((
double)(stats->attr->attstattarget), (
double)ndims);
1533 histo_cells_target = Min(histo_cells_target, ndims * 10000);
1534 histo_cells_target = Min(histo_cells_target, (
int)(total_rows/5));
1535 POSTGIS_DEBUGF(3,
" stats->attr->attstattarget: %d", stats->attr->attstattarget);
1536 POSTGIS_DEBUGF(3,
" target # of histogram cells: %d", histo_cells_target);
1539 if ( ! notnull_cnt )
1541 stats->stats_valid =
false;
1545 POSTGIS_DEBUGF(3,
" sample_extent: %s",
nd_box_to_json(&sample_extent, ndims));
1551 for ( d = 0; d < ndims; d++ )
1554 avg.
min[d] = sum.
min[d] / notnull_cnt;
1555 avg.
max[d] = sum.
max[d] / notnull_cnt;
1558 for ( i = 0; i < notnull_cnt; i++ )
1560 const ND_BOX *ndb = sample_boxes[i];
1561 stddev.
min[d] += (ndb->
min[d] - avg.
min[d]) * (ndb->
min[d] - avg.
min[d]);
1562 stddev.
max[d] += (ndb->
max[d] - avg.
max[d]) * (ndb->
max[d] - avg.
max[d]);
1564 stddev.
min[d] = sqrt(stddev.
min[d] / notnull_cnt);
1565 stddev.
max[d] = sqrt(stddev.
max[d] / notnull_cnt);
1578 for ( i = 0; i < notnull_cnt; i++ )
1580 const ND_BOX *ndb = sample_boxes[i];
1584 POSTGIS_DEBUGF(4,
" feature %d is a hard deviant, skipped", i);
1585 sample_boxes[i] = NULL;
1596 histo_extent = histo_extent_new;
1611 sample_distribution);
1627 for ( d = 0; d < ndims; d++ )
1629 if ( sample_distribution[d] > 0 )
1633 if ( histo_ndims == 0 )
1637 POSTGIS_DEBUG(3,
" special case: no axes have variability");
1638 histo_cells_new = 1;
1639 for ( d = 0; d < ndims; d++ )
1641 histo_size[d] = (int)pow((
double)histo_cells_target, 1/(double)ndims);
1642 if ( ! histo_size[d] )
1644 POSTGIS_DEBUGF(3,
" histo_size[d]: %d", histo_size[d]);
1645 histo_cells_new *= histo_size[d];
1647 POSTGIS_DEBUGF(3,
" histo_cells_new: %d", histo_cells_new);
1656 POSTGIS_DEBUG(3,
" allocating histogram axes based on axis variability");
1657 total_distribution =
total_double(sample_distribution, ndims);
1658 POSTGIS_DEBUGF(3,
" total_distribution: %.8g", total_distribution);
1659 histo_cells_new = 1;
1660 for ( d = 0; d < ndims; d++ )
1662 if ( sample_distribution[d] == 0 )
1669 float edge_ratio = (float)sample_distribution[d] / (
float)total_distribution;
1675 histo_size[d] = (int)pow(histo_cells_target * histo_ndims * edge_ratio, 1/(
double)histo_ndims);
1677 if ( ! histo_size[d] )
1680 histo_cells_new *= histo_size[d];
1682 POSTGIS_DEBUGF(3,
" histo_cells_new: %d", histo_cells_new);
1686 histo_cells = histo_cells_new;
1687 POSTGIS_DEBUGF(3,
" histo_cells: %d", histo_cells);
1692 old_context = MemoryContextSwitchTo(stats->anl_context);
1693 nd_stats_size =
sizeof(
ND_STATS) + ((histo_cells - 1) *
sizeof(float4));
1694 nd_stats = palloc(nd_stats_size);
1695 memset(nd_stats, 0, nd_stats_size);
1696 MemoryContextSwitchTo(old_context);
1699 nd_stats->
ndims = ndims;
1700 nd_stats->
extent = histo_extent;
1705 for ( d = 0; d < ndims; d++ )
1706 nd_stats->
size[d] = histo_size[d];
1721 for ( i = 0; i < notnull_cnt; i++ )
1726 double num_cells = 0;
1727 double min[
ND_DIMS] = {0.0, 0.0, 0.0, 0.0};
1728 double max[
ND_DIMS] = {0.0, 0.0, 0.0, 0.0};
1729 double cellsize[
ND_DIMS] = {0.0, 0.0, 0.0, 0.0};
1731 nd_box = sample_boxes[i];
1732 if ( ! nd_box )
continue;
1735 vacuum_delay_point();
1739 memset(at, 0,
sizeof(
int)*
ND_DIMS);
1741 POSTGIS_DEBUGF(3,
" feature %d: ibox (%d, %d, %d, %d) (%d, %d, %d, %d)", i,
1742 nd_ibox.
min[0], nd_ibox.
min[1], nd_ibox.
min[2], nd_ibox.
min[3],
1743 nd_ibox.
max[0], nd_ibox.
max[1], nd_ibox.
max[2], nd_ibox.
max[3]);
1745 for ( d = 0; d < nd_stats->
ndims; d++ )
1748 at[d] = nd_ibox.
min[d];
1751 cellsize[d] = (max[d] - min[d])/(nd_stats->
size[d]);
1760 ND_BOX nd_cell = { {0.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 0.0} };
1763 for ( d = 0; d < nd_stats->
ndims; d++ )
1765 nd_cell.
min[d] = min[d] + (at[d]+0) * cellsize[d];
1766 nd_cell.
max[d] = min[d] + (at[d]+1) * cellsize[d];
1777 POSTGIS_DEBUGF(3,
" ratio (%.8g) num_cells (%.8g)", ratio, num_cells);
1778 POSTGIS_DEBUGF(3,
" at (%d, %d, %d, %d)", at[0], at[1], at[2], at[3]);
1783 total_cell_count += num_cells;
1785 histogram_features++;
1788 POSTGIS_DEBUGF(3,
" histogram_features: %d", histogram_features);
1789 POSTGIS_DEBUGF(3,
" sample_rows: %d", sample_rows);
1790 POSTGIS_DEBUGF(3,
" table_rows: %.6g", total_rows);
1793 if ( ! histogram_features )
1795 POSTGIS_DEBUG(3,
" no stats have been gathered");
1796 elog(NOTICE,
" no features lie in the stats histogram, invalid stats");
1797 stats->stats_valid =
false;
1818 stats->stakind[stats_slot] = stats_kind;
1819 stats->staop[stats_slot] = InvalidOid;
1820 stats->stanumbers[stats_slot] = (float4*)nd_stats;
1821 stats->numnumbers[stats_slot] = nd_stats_size/
sizeof(float4);
1822 stats->stanullfrac = (float4)null_cnt/sample_rows;
1823 stats->stawidth = total_width/notnull_cnt;
1824 stats->stadistinct = -1.0;
1825 stats->stats_valid =
true;
1827 POSTGIS_DEBUGF(3,
" out: slot 0: kind %d (STATISTIC_KIND_ND)", stats->stakind[0]);
1828 POSTGIS_DEBUGF(3,
" out: slot 0: op %d (InvalidOid)", stats->staop[0]);
1829 POSTGIS_DEBUGF(3,
" out: slot 0: numnumbers %d", stats->numnumbers[0]);
1830 POSTGIS_DEBUGF(3,
" out: null fraction: %f=%d/%d", stats->stanullfrac, null_cnt, sample_rows);
1831 POSTGIS_DEBUGF(3,
" out: average width: %d bytes", stats->stawidth);
1832 POSTGIS_DEBUG (3,
" out: distinct values: all (no check done)");
int gbox_is_valid(const GBOX *gbox)
Return false if any of the dimensions is NaN or infinite.
struct ND_STATS_T ND_STATS
N-dimensional statistics structure.
static char * nd_stats_to_json(const ND_STATS *nd_stats)
Convert an ND_STATS to a JSON representation for external use.
static int nd_box_intersects(const ND_BOX *a, const ND_BOX *b, int ndims)
Return true if ND_BOX a overlaps b, false otherwise.
static int nd_box_init_bounds(ND_BOX *a)
Prepare an ND_BOX for bounds calculation: set the maxes to the smallest thing possible and the mins t...
static int nd_increment(ND_IBOX *ibox, int ndims, int *counter)
Given an n-d index array (counter), and a domain to increment it in (ibox) increment it by one,...
#define STATISTIC_SLOT_ND
static int gbox_ndims(const GBOX *gbox)
Given that geodetic boxes are X/Y/Z regardless of the underlying geometry dimensionality and other bo...
#define ND_DIMS
The maximum number of dimensions our code can handle.
#define STATISTIC_KIND_2D
static int nd_box_merge(const ND_BOX *source, ND_BOX *target)
Expand the bounds of target to include source.
#define STATISTIC_KIND_ND
static double total_double(const double *vals, int nvals)
Given double array, return sum of values.
static void nd_box_from_gbox(const GBOX *gbox, ND_BOX *nd_box)
Set the values of an ND_BOX from a GBOX.
static int nd_box_init(ND_BOX *a)
Zero out an ND_BOX.
static int nd_box_expand(ND_BOX *nd_box, double expansion_factor)
Expand an ND_BOX ever so slightly.
static int nd_box_overlap(const ND_STATS *nd_stats, const ND_BOX *nd_box, ND_IBOX *nd_ibox)
What stats cells overlap with this ND_BOX? Put the lowest cell addresses in ND_IBOX->min and the high...
static double nd_box_ratio(const ND_BOX *b1, const ND_BOX *b2, int ndims)
Returns the proportion of b2 that is covered by b1.
static int nd_stats_value_index(const ND_STATS *stats, int *indexes)
Given a position in the n-d histogram (i,j,k) return the position in the 1-d values array.
static char * nd_box_to_json(const ND_BOX *nd_box, int ndims)
Convert an ND_BOX to a JSON string for printing.
#define STATISTIC_SLOT_2D
static int nd_box_array_distribution(const ND_BOX **nd_boxes, int num_boxes, const ND_BOX *extent, int ndims, double *distribution)
Calculate how much a set of boxes is homogeneously distributed or concentrated within one dimension,...
int gserialized_datum_get_gbox_p(Datum gsdatum, GBOX *gbox)
Given a GSERIALIZED datum, as quickly as possible (peeking into the top of the memory) return the gbo...
N-dimensional box type for calculations, to avoid doing explicit axis conversions from GBOX in all ca...
N-dimensional box index type.
float4 histogram_features
N-dimensional statistics structure.