The gserialized_analyze_nd sets this function as a callback on the stats object when called by the ANALYZE command.
ANALYZE then gathers the requisite number of sample rows and then calls this function.
We could also pass stats->extra_data in from gserialized_analyze_nd (things like the column type or other stuff from the system catalogs) but so far we don't use that capability.
Our job is to build some statistics on the sample data for use by operator estimators.
We will populate an n-d histogram using the provided sample rows. The selectivity estimators (sel and joinsel) can then use the histogram.
1369 MemoryContext old_context;
1371 int notnull_cnt = 0;
1373 int histogram_features = 0;
1376 size_t nd_stats_size;
1378 double total_width = 0;
1379 double total_sample_volume = 0;
1380 double total_cell_count = 0;
1386 const ND_BOX **sample_boxes;
1391 int histo_cells_target;
1393 int histo_cells_new = 1;
1396 int histo_ndims = 0;
1397 double sample_distribution[
ND_DIMS];
1398 double total_distribution;
1416 POSTGIS_DEBUG(2,
"compute_gserialized_stats called");
1417 POSTGIS_DEBUGF(3,
" # sample_rows: %d", sample_rows);
1418 POSTGIS_DEBUGF(3,
" estimate of total_rows: %.6g", total_rows);
1424 sample_boxes = palloc(
sizeof(
ND_BOX*) * sample_rows);
1436 for ( i = 0; i < sample_rows; i++ )
1445 datum = fetchfunc(stats, i, &is_null);
1450 POSTGIS_DEBUGF(4,
" skipped null geometry %d", i);
1457 is_copy = VARATT_IS_EXTENDED(datum);
1461 POSTGIS_DEBUGF(3,
" skipped empty geometry %d", i);
1472 POSTGIS_DEBUGF(3,
" skipped infinite/nan geometry %d", i);
1484 nd_box = palloc(
sizeof(
ND_BOX));
1488 sample_boxes[notnull_cnt] = nd_box;
1491 if ( ! notnull_cnt )
1498 total_width += VARSIZE(geom);
1501 for ( d = 0; d < ndims; d++ )
1503 sum.
min[d] += nd_box->
min[d];
1504 sum.
max[d] += nd_box->
max[d];
1515 vacuum_delay_point();
1526 histo_cells_target = (int)pow((
double)(stats->attr->attstattarget), (
double)ndims);
1527 histo_cells_target = Min(histo_cells_target, ndims * 10000);
1528 histo_cells_target = Min(histo_cells_target, (
int)(total_rows/5));
1529 POSTGIS_DEBUGF(3,
" stats->attr->attstattarget: %d", stats->attr->attstattarget);
1530 POSTGIS_DEBUGF(3,
" target # of histogram cells: %d", histo_cells_target);
1533 if ( ! notnull_cnt )
1535 Oid relation_oid = stats->attr->attrelid;
1536 char *relation_name = get_rel_name(relation_oid);
1538 "PostGIS: Unable to compute statistics for \"%s.%s\": No non-null/empty features",
1539 relation_name ? relation_name :
"(NULL)",
1540 stats->attr->attname.data);
1541 stats->stats_valid =
false;
1545 POSTGIS_DEBUGF(3,
" sample_extent: %s",
nd_box_to_json(&sample_extent, ndims));
1551 for ( d = 0; d < ndims; d++ )
1554 avg.
min[d] = sum.
min[d] / notnull_cnt;
1555 avg.
max[d] = sum.
max[d] / notnull_cnt;
1558 for ( i = 0; i < notnull_cnt; i++ )
1560 const ND_BOX *ndb = sample_boxes[i];
1561 stddev.
min[d] += (ndb->
min[d] - avg.
min[d]) * (ndb->
min[d] - avg.
min[d]);
1562 stddev.
max[d] += (ndb->
max[d] - avg.
max[d]) * (ndb->
max[d] - avg.
max[d]);
1564 stddev.
min[d] = sqrt(stddev.
min[d] / notnull_cnt);
1565 stddev.
max[d] = sqrt(stddev.
max[d] / notnull_cnt);
1578 for ( i = 0; i < notnull_cnt; i++ )
1580 const ND_BOX *ndb = sample_boxes[i];
1584 POSTGIS_DEBUGF(4,
" feature %d is a hard deviant, skipped", i);
1585 sample_boxes[i] = NULL;
1596 histo_extent = histo_extent_new;
1611 sample_distribution);
1627 for ( d = 0; d < ndims; d++ )
1629 if ( sample_distribution[d] > 0 )
1633 if ( histo_ndims == 0 )
1637 POSTGIS_DEBUG(3,
" special case: no axes have variability");
1638 histo_cells_new = 1;
1639 for ( d = 0; d < ndims; d++ )
1641 histo_size[d] = (int)pow((
double)histo_cells_target, 1/(double)ndims);
1642 if ( ! histo_size[d] )
1644 POSTGIS_DEBUGF(3,
" histo_size[d]: %d", histo_size[d]);
1645 histo_cells_new *= histo_size[d];
1647 POSTGIS_DEBUGF(3,
" histo_cells_new: %d", histo_cells_new);
1656 POSTGIS_DEBUG(3,
" allocating histogram axes based on axis variability");
1657 total_distribution =
total_double(sample_distribution, ndims);
1658 POSTGIS_DEBUGF(3,
" total_distribution: %.8g", total_distribution);
1659 histo_cells_new = 1;
1660 for ( d = 0; d < ndims; d++ )
1662 if ( sample_distribution[d] == 0 )
1669 float edge_ratio = (float)sample_distribution[d] / (
float)total_distribution;
1675 histo_size[d] = (int)pow(histo_cells_target * histo_ndims * edge_ratio, 1/(
double)histo_ndims);
1677 if ( ! histo_size[d] )
1680 histo_cells_new *= histo_size[d];
1682 POSTGIS_DEBUGF(3,
" histo_cells_new: %d", histo_cells_new);
1686 histo_cells = histo_cells_new;
1687 POSTGIS_DEBUGF(3,
" histo_cells: %d", histo_cells);
1692 old_context = MemoryContextSwitchTo(stats->anl_context);
1693 nd_stats_size =
sizeof(
ND_STATS) + ((histo_cells - 1) *
sizeof(float4));
1694 nd_stats = palloc(nd_stats_size);
1695 memset(nd_stats, 0, nd_stats_size);
1696 MemoryContextSwitchTo(old_context);
1699 nd_stats->
ndims = ndims;
1700 nd_stats->
extent = histo_extent;
1705 for ( d = 0; d < ndims; d++ )
1706 nd_stats->
size[d] = histo_size[d];
1721 for ( i = 0; i < notnull_cnt; i++ )
1727 double num_cells = 0;
1728 double tmp_volume = 1.0;
1729 double min[
ND_DIMS] = {0.0, 0.0, 0.0, 0.0};
1730 double max[
ND_DIMS] = {0.0, 0.0, 0.0, 0.0};
1731 double cellsize[
ND_DIMS] = {0.0, 0.0, 0.0, 0.0};
1733 nd_box = sample_boxes[i];
1734 if ( ! nd_box )
continue;
1737 vacuum_delay_point();
1741 memset(at, 0,
sizeof(
int)*
ND_DIMS);
1743 POSTGIS_DEBUGF(3,
" feature %d: ibox (%d, %d, %d, %d) (%d, %d, %d, %d)", i,
1744 nd_ibox.
min[0], nd_ibox.
min[1], nd_ibox.
min[2], nd_ibox.
min[3],
1745 nd_ibox.
max[0], nd_ibox.
max[1], nd_ibox.
max[2], nd_ibox.
max[3]);
1747 for ( d = 0; d < nd_stats->
ndims; d++ )
1750 at[d] = nd_ibox.
min[d];
1753 cellsize[d] = (max[d] - min[d])/(nd_stats->
size[d]);
1756 tmp_volume *= (nd_box->
max[d] - nd_box->
min[d]);
1760 total_sample_volume += tmp_volume;
1768 ND_BOX nd_cell = { {0.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 0.0} };
1771 for ( d = 0; d < nd_stats->
ndims; d++ )
1773 nd_cell.
min[d] = min[d] + (at[d]+0) * cellsize[d];
1774 nd_cell.
max[d] = min[d] + (at[d]+1) * cellsize[d];
1785 POSTGIS_DEBUGF(3,
" ratio (%.8g) num_cells (%.8g)", ratio, num_cells);
1786 POSTGIS_DEBUGF(3,
" at (%d, %d, %d, %d)", at[0], at[1], at[2], at[3]);
1791 total_cell_count += num_cells;
1793 histogram_features++;
1796 POSTGIS_DEBUGF(3,
" histogram_features: %d", histogram_features);
1797 POSTGIS_DEBUGF(3,
" sample_rows: %d", sample_rows);
1798 POSTGIS_DEBUGF(3,
" table_rows: %.6g", total_rows);
1801 if ( ! histogram_features )
1803 POSTGIS_DEBUG(3,
" no stats have been gathered");
1804 elog(NOTICE,
" no features lie in the stats histogram, invalid stats");
1805 stats->stats_valid =
false;
1826 stats->stakind[stats_slot] = stats_kind;
1827 stats->staop[stats_slot] = InvalidOid;
1828 stats->stanumbers[stats_slot] = (float4*)nd_stats;
1829 stats->numnumbers[stats_slot] = nd_stats_size/
sizeof(float4);
1830 stats->stanullfrac = (float4)null_cnt/sample_rows;
1831 stats->stawidth = total_width/notnull_cnt;
1832 stats->stadistinct = -1.0;
1833 stats->stats_valid =
true;
1835 POSTGIS_DEBUGF(3,
" out: slot 0: kind %d (STATISTIC_KIND_ND)", stats->stakind[0]);
1836 POSTGIS_DEBUGF(3,
" out: slot 0: op %d (InvalidOid)", stats->staop[0]);
1837 POSTGIS_DEBUGF(3,
" out: slot 0: numnumbers %d", stats->numnumbers[0]);
1838 POSTGIS_DEBUGF(3,
" out: null fraction: %f=%d/%d", stats->stanullfrac, null_cnt, sample_rows);
1839 POSTGIS_DEBUGF(3,
" out: average width: %d bytes", stats->stawidth);
1840 POSTGIS_DEBUG (3,
" out: distinct values: all (no check done)");
int gbox_is_valid(const GBOX *gbox)
Return false if any of the dimensions is NaN or infinite.
int gserialized_get_gbox_p(const GSERIALIZED *g, GBOX *gbox)
Read the box from the GSERIALIZED or calculate it if necessary.
struct ND_STATS_T ND_STATS
N-dimensional statistics structure.
static char * nd_stats_to_json(const ND_STATS *nd_stats)
Convert an ND_STATS to a JSON representation for external use.
static int nd_box_intersects(const ND_BOX *a, const ND_BOX *b, int ndims)
Return true if ND_BOX a overlaps b, false otherwise.
static int nd_box_init_bounds(ND_BOX *a)
Prepare an ND_BOX for bounds calculation: set the maxes to the smallest thing possible and the mins t...
static int nd_increment(ND_IBOX *ibox, int ndims, int *counter)
Given an n-d index array (counter), and a domain to increment it in (ibox) increment it by one,...
#define STATISTIC_SLOT_ND
static int gbox_ndims(const GBOX *gbox)
Given that geodetic boxes are X/Y/Z regardless of the underlying geometry dimensionality and other bo...
#define ND_DIMS
The maximum number of dimensions our code can handle.
#define STATISTIC_KIND_2D
static int nd_box_merge(const ND_BOX *source, ND_BOX *target)
Expand the bounds of target to include source.
#define STATISTIC_KIND_ND
static double total_double(const double *vals, int nvals)
Given double array, return sum of values.
static void nd_box_from_gbox(const GBOX *gbox, ND_BOX *nd_box)
Set the values of an ND_BOX from a GBOX.
static int nd_box_init(ND_BOX *a)
Zero out an ND_BOX.
static int nd_box_expand(ND_BOX *nd_box, double expansion_factor)
Expand an ND_BOX ever so slightly.
static int nd_box_overlap(const ND_STATS *nd_stats, const ND_BOX *nd_box, ND_IBOX *nd_ibox)
What stats cells overlap with this ND_BOX? Put the lowest cell addresses in ND_IBOX->min and the high...
static double nd_box_ratio(const ND_BOX *b1, const ND_BOX *b2, int ndims)
Returns the proportion of b2 that is covered by b1.
static int nd_stats_value_index(const ND_STATS *stats, int *indexes)
Given a position in the n-d histogram (i,j,k) return the position in the 1-d values array.
static char * nd_box_to_json(const ND_BOX *nd_box, int ndims)
Convert an ND_BOX to a JSON string for printing.
#define STATISTIC_SLOT_2D
static int nd_box_array_distribution(const ND_BOX **nd_boxes, int num_boxes, const ND_BOX *extent, int ndims, double *distribution)
Calculate how much a set of boxes is homogeneously distributed or concentrated within one dimension,...
N-dimensional box type for calculations, to avoid doing explicit axis conversions from GBOX in all ca...
N-dimensional box index type.
float4 histogram_features
N-dimensional statistics structure.