gserialized_analyze_nd sets this function as a callback on the stats object when it is called by the ANALYZE command.
ANALYZE then gathers the requisite number of sample rows and then calls this function.
We could also pass stats->extra_data in from gserialized_analyze_nd (things like the column type or other stuff from the system catalogs) but so far we don't use that capability.
Our job is to build some statistics on the sample data for use by operator estimators.
We will populate an n-d histogram using the provided sample rows. The selectivity estimators (sel and joinsel) can then use the histogram.
1368 MemoryContext old_context;
1370 int notnull_cnt = 0;
1372 int histogram_features = 0;
1375 size_t nd_stats_size;
1377 double total_width = 0;
1378 double total_sample_volume = 0;
1379 double total_cell_count = 0;
1385 const ND_BOX **sample_boxes;
1390 int histo_cells_target;
1392 int histo_cells_new = 1;
1395 int histo_ndims = 0;
1396 double sample_distribution[
ND_DIMS];
1397 double total_distribution;
1415 POSTGIS_DEBUG(2,
"compute_gserialized_stats called");
1416 POSTGIS_DEBUGF(3,
" # sample_rows: %d", sample_rows);
1417 POSTGIS_DEBUGF(3,
" estimate of total_rows: %.6g", total_rows);
1423 sample_boxes = palloc(
sizeof(
ND_BOX*) * sample_rows);
1435 for ( i = 0; i < sample_rows; i++ )
1444 datum = fetchfunc(stats, i, &is_null);
1449 POSTGIS_DEBUGF(4,
" skipped null geometry %d", i);
1456 is_copy = VARATT_IS_EXTENDED(datum);
1460 POSTGIS_DEBUGF(3,
" skipped empty geometry %d", i);
1471 POSTGIS_DEBUGF(3,
" skipped infinite/nan geometry %d", i);
1483 nd_box = palloc(
sizeof(
ND_BOX));
1487 sample_boxes[notnull_cnt] = nd_box;
1490 if ( ! notnull_cnt )
1497 total_width += VARSIZE(geom);
1500 for ( d = 0; d < ndims; d++ )
1502 sum.
min[d] += nd_box->
min[d];
1503 sum.
max[d] += nd_box->
max[d];
1514 vacuum_delay_point();
1525 histo_cells_target = (int)pow((
double)(stats->attr->attstattarget), (
double)ndims);
1526 histo_cells_target = Min(histo_cells_target, ndims * 10000);
1527 histo_cells_target = Min(histo_cells_target, (
int)(total_rows/5));
1528 POSTGIS_DEBUGF(3,
" stats->attr->attstattarget: %d", stats->attr->attstattarget);
1529 POSTGIS_DEBUGF(3,
" target # of histogram cells: %d", histo_cells_target);
1532 if ( ! notnull_cnt )
1534 elog(NOTICE,
"no non-null/empty features, unable to compute statistics");
1535 stats->stats_valid =
false;
1539 POSTGIS_DEBUGF(3,
" sample_extent: %s",
nd_box_to_json(&sample_extent, ndims));
1545 for ( d = 0; d < ndims; d++ )
1548 avg.
min[d] = sum.
min[d] / notnull_cnt;
1549 avg.
max[d] = sum.
max[d] / notnull_cnt;
1552 for ( i = 0; i < notnull_cnt; i++ )
1554 const ND_BOX *ndb = sample_boxes[i];
1555 stddev.
min[d] += (ndb->
min[d] - avg.
min[d]) * (ndb->
min[d] - avg.
min[d]);
1556 stddev.
max[d] += (ndb->
max[d] - avg.
max[d]) * (ndb->
max[d] - avg.
max[d]);
1558 stddev.
min[d] = sqrt(stddev.
min[d] / notnull_cnt);
1559 stddev.
max[d] = sqrt(stddev.
max[d] / notnull_cnt);
1572 for ( i = 0; i < notnull_cnt; i++ )
1574 const ND_BOX *ndb = sample_boxes[i];
1578 POSTGIS_DEBUGF(4,
" feature %d is a hard deviant, skipped", i);
1579 sample_boxes[i] = NULL;
1590 histo_extent = histo_extent_new;
1605 sample_distribution);
1621 for ( d = 0; d < ndims; d++ )
1623 if ( sample_distribution[d] > 0 )
1627 if ( histo_ndims == 0 )
1631 POSTGIS_DEBUG(3,
" special case: no axes have variability");
1632 histo_cells_new = 1;
1633 for ( d = 0; d < ndims; d++ )
1635 histo_size[d] = 1 + (int)pow((
double)histo_cells_target, 1/(double)ndims);
1636 POSTGIS_DEBUGF(3,
" histo_size[d]: %d", histo_size[d]);
1637 histo_cells_new *= histo_size[d];
1639 POSTGIS_DEBUGF(3,
" histo_cells_new: %d", histo_cells_new);
1648 POSTGIS_DEBUG(3,
" allocating histogram axes based on axis variability");
1649 total_distribution =
total_double(sample_distribution, ndims);
1650 POSTGIS_DEBUGF(3,
" total_distribution: %.8g", total_distribution);
1651 histo_cells_new = 1;
1652 for ( d = 0; d < ndims; d++ )
1654 if ( sample_distribution[d] == 0 )
1661 float edge_ratio = (float)sample_distribution[d] / (
float)total_distribution;
1667 histo_size[d] = (int)pow(histo_cells_target * histo_ndims * edge_ratio, 1/(
double)histo_ndims);
1669 if ( ! histo_size[d] )
1672 histo_cells_new *= histo_size[d];
1674 POSTGIS_DEBUGF(3,
" histo_cells_new: %d", histo_cells_new);
1678 histo_cells = histo_cells_new;
1679 POSTGIS_DEBUGF(3,
" histo_cells: %d", histo_cells);
1684 old_context = MemoryContextSwitchTo(stats->anl_context);
1685 nd_stats_size =
sizeof(
ND_STATS) + ((histo_cells - 1) *
sizeof(float4));
1686 nd_stats = palloc(nd_stats_size);
1687 memset(nd_stats, 0, nd_stats_size);
1688 MemoryContextSwitchTo(old_context);
1691 nd_stats->
ndims = ndims;
1692 nd_stats->
extent = histo_extent;
1697 for ( d = 0; d < ndims; d++ )
1698 nd_stats->
size[d] = histo_size[d];
1713 for ( i = 0; i < notnull_cnt; i++ )
1719 double num_cells = 0;
1720 double tmp_volume = 1.0;
1721 double min[
ND_DIMS] = {0.0, 0.0, 0.0, 0.0};
1722 double max[
ND_DIMS] = {0.0, 0.0, 0.0, 0.0};
1723 double cellsize[
ND_DIMS] = {0.0, 0.0, 0.0, 0.0};
1725 nd_box = sample_boxes[i];
1726 if ( ! nd_box )
continue;
1729 vacuum_delay_point();
1733 memset(at, 0,
sizeof(
int)*
ND_DIMS);
1735 POSTGIS_DEBUGF(3,
" feature %d: ibox (%d, %d, %d, %d) (%d, %d, %d, %d)", i,
1736 nd_ibox.
min[0], nd_ibox.
min[1], nd_ibox.
min[2], nd_ibox.
min[3],
1737 nd_ibox.
max[0], nd_ibox.
max[1], nd_ibox.
max[2], nd_ibox.
max[3]);
1739 for ( d = 0; d < nd_stats->
ndims; d++ )
1742 at[d] = nd_ibox.
min[d];
1745 cellsize[d] = (max[d] - min[d])/(nd_stats->
size[d]);
1748 tmp_volume *= (nd_box->
max[d] - nd_box->
min[d]);
1752 total_sample_volume += tmp_volume;
1760 ND_BOX nd_cell = { {0.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 0.0} };
1763 for ( d = 0; d < nd_stats->
ndims; d++ )
1765 nd_cell.
min[d] = min[d] + (at[d]+0) * cellsize[d];
1766 nd_cell.
max[d] = min[d] + (at[d]+1) * cellsize[d];
1777 POSTGIS_DEBUGF(3,
" ratio (%.8g) num_cells (%.8g)", ratio, num_cells);
1778 POSTGIS_DEBUGF(3,
" at (%d, %d, %d, %d)", at[0], at[1], at[2], at[3]);
1783 total_cell_count += num_cells;
1785 histogram_features++;
1788 POSTGIS_DEBUGF(3,
" histogram_features: %d", histogram_features);
1789 POSTGIS_DEBUGF(3,
" sample_rows: %d", sample_rows);
1790 POSTGIS_DEBUGF(3,
" table_rows: %.6g", total_rows);
1793 if ( ! histogram_features )
1795 POSTGIS_DEBUG(3,
" no stats have been gathered");
1796 elog(NOTICE,
" no features lie in the stats histogram, invalid stats");
1797 stats->stats_valid =
false;
1818 stats->stakind[stats_slot] = stats_kind;
1819 stats->staop[stats_slot] = InvalidOid;
1820 stats->stanumbers[stats_slot] = (float4*)nd_stats;
1821 stats->numnumbers[stats_slot] = nd_stats_size/
sizeof(float4);
1822 stats->stanullfrac = (float4)null_cnt/sample_rows;
1823 stats->stawidth = total_width/notnull_cnt;
1824 stats->stadistinct = -1.0;
1825 stats->stats_valid =
true;
1827 POSTGIS_DEBUGF(3,
" out: slot 0: kind %d (STATISTIC_KIND_ND)", stats->stakind[0]);
1828 POSTGIS_DEBUGF(3,
" out: slot 0: op %d (InvalidOid)", stats->staop[0]);
1829 POSTGIS_DEBUGF(3,
" out: slot 0: numnumbers %d", stats->numnumbers[0]);
1830 POSTGIS_DEBUGF(3,
" out: null fraction: %f=%d/%d", stats->stanullfrac, null_cnt, sample_rows);
1831 POSTGIS_DEBUGF(3,
" out: average width: %d bytes", stats->stawidth);
1832 POSTGIS_DEBUG (3,
" out: distinct values: all (no check done)");
int gbox_is_valid(const GBOX *gbox)
Return false if any of the dimensions is NaN or infinite.
int gserialized_get_gbox_p(const GSERIALIZED *g, GBOX *box)
Read the bounding box off a serialization and calculate one if it is not already there.
struct ND_STATS_T ND_STATS
N-dimensional statistics structure.
static char * nd_stats_to_json(const ND_STATS *nd_stats)
Convert an ND_STATS to a JSON representation for external use.
static int nd_box_intersects(const ND_BOX *a, const ND_BOX *b, int ndims)
Return true if ND_BOX a overlaps b, false otherwise.
static int nd_box_init_bounds(ND_BOX *a)
Prepare an ND_BOX for bounds calculation: set the maxes to the smallest thing possible and the mins t...
static int nd_increment(ND_IBOX *ibox, int ndims, int *counter)
Given an n-d index array (counter), and a domain to increment it in (ibox) increment it by one,...
#define STATISTIC_SLOT_ND
static int gbox_ndims(const GBOX *gbox)
Given that geodetic boxes are X/Y/Z regardless of the underlying geometry dimensionality and other bo...
#define ND_DIMS
The maximum number of dimensions our code can handle.
#define STATISTIC_KIND_2D
static int nd_box_merge(const ND_BOX *source, ND_BOX *target)
Expand the bounds of target to include source.
#define STATISTIC_KIND_ND
static double total_double(const double *vals, int nvals)
Given double array, return sum of values.
static void nd_box_from_gbox(const GBOX *gbox, ND_BOX *nd_box)
Set the values of an ND_BOX from a GBOX.
static int nd_box_init(ND_BOX *a)
Zero out an ND_BOX.
static int nd_box_expand(ND_BOX *nd_box, double expansion_factor)
Expand an ND_BOX ever so slightly.
static int nd_box_overlap(const ND_STATS *nd_stats, const ND_BOX *nd_box, ND_IBOX *nd_ibox)
What stats cells overlap with this ND_BOX? Put the lowest cell addresses in ND_IBOX->min and the high...
static double nd_box_ratio(const ND_BOX *b1, const ND_BOX *b2, int ndims)
Returns the proportion of b2 that is covered by b1.
static int nd_stats_value_index(const ND_STATS *stats, int *indexes)
Given a position in the n-d histogram (i,j,k) return the position in the 1-d values array.
static char * nd_box_to_json(const ND_BOX *nd_box, int ndims)
Convert an ND_BOX to a JSON string for printing.
#define STATISTIC_SLOT_2D
static int nd_box_array_distribution(const ND_BOX **nd_boxes, int num_boxes, const ND_BOX *extent, int ndims, double *distribution)
Calculate how much a set of boxes is homogeneously distributed or concentrated within one dimension,...
N-dimensional box type for calculations, to avoid doing explicit axis conversions from GBOX in all ca...
N-dimensional box index type.
float4 histogram_features
N-dimensional statistics structure.