gserialized_analyze_nd sets this function as a callback on the stats object when it is called by the ANALYZE command.
ANALYZE then gathers the requisite number of sample rows and then calls this function.
We could also pass stats->extra_data in from gserialized_analyze_nd (things like the column type or other stuff from the system catalogs) but so far we don't use that capability.
Our job is to build some statistics on the sample data for use by operator estimators.
We will populate an n-d histogram using the provided sample rows. The selectivity estimators (sel and joinsel) can then use the histogram.
1385 MemoryContext old_context;
1387 int notnull_cnt = 0;
1389 int histogram_features = 0;
1392 size_t nd_stats_size;
1394 double total_width = 0;
1395 double total_sample_volume = 0;
1396 double total_cell_count = 0;
1402 const ND_BOX **sample_boxes;
1407 int histo_cells_target;
1409 int histo_cells_new = 1;
1412 int histo_ndims = 0;
1413 double sample_distribution[
ND_DIMS];
1414 double total_distribution;
1419 (void)total_sample_volume;
1434 POSTGIS_DEBUG(2,
"compute_gserialized_stats called");
1435 POSTGIS_DEBUGF(3,
" # sample_rows: %d", sample_rows);
1436 POSTGIS_DEBUGF(3,
" estimate of total_rows: %.6g", total_rows);
1442 sample_boxes = palloc(
sizeof(
ND_BOX*) * sample_rows);
1454 for ( i = 0; i < sample_rows; i++ )
1461 datum = fetchfunc(stats, i, &is_null);
1466 POSTGIS_DEBUGF(4,
" skipped null geometry %d", i);
1475 POSTGIS_DEBUGF(3,
" skipped empty geometry %d", i);
1486 POSTGIS_DEBUGF(3,
" skipped infinite/nan geometry %d", i);
1498 nd_box = palloc(
sizeof(
ND_BOX));
1502 sample_boxes[notnull_cnt] = nd_box;
1505 if ( ! notnull_cnt )
1512 total_width += toast_raw_datum_size(datum);
1515 for ( d = 0; d < ndims; d++ )
1517 sum.
min[d] += nd_box->
min[d];
1518 sum.
max[d] += nd_box->
max[d];
1525 vacuum_delay_point();
1536 histo_cells_target = (int)pow((
double)(stats->attr->attstattarget), (
double)ndims);
1537 histo_cells_target = Min(histo_cells_target, ndims * 10000);
1538 histo_cells_target = Min(histo_cells_target, (
int)(total_rows/5));
1539 POSTGIS_DEBUGF(3,
" stats->attr->attstattarget: %d", stats->attr->attstattarget);
1540 POSTGIS_DEBUGF(3,
" target # of histogram cells: %d", histo_cells_target);
1543 if ( ! notnull_cnt )
1545 Oid relation_oid = stats->attr->attrelid;
1546 char *relation_name = get_rel_name(relation_oid);
1548 "PostGIS: Unable to compute statistics for \"%s.%s\": No non-null/empty features",
1549 relation_name ? relation_name :
"(NULL)",
1550 stats->attr->attname.data);
1551 stats->stats_valid =
false;
1555 POSTGIS_DEBUGF(3,
" sample_extent: %s",
nd_box_to_json(&sample_extent, ndims));
1561 for ( d = 0; d < ndims; d++ )
1564 avg.
min[d] = sum.
min[d] / notnull_cnt;
1565 avg.
max[d] = sum.
max[d] / notnull_cnt;
1568 for ( i = 0; i < notnull_cnt; i++ )
1570 const ND_BOX *ndb = sample_boxes[i];
1571 stddev.
min[d] += (ndb->
min[d] - avg.
min[d]) * (ndb->
min[d] - avg.
min[d]);
1572 stddev.
max[d] += (ndb->
max[d] - avg.
max[d]) * (ndb->
max[d] - avg.
max[d]);
1574 stddev.
min[d] = sqrt(stddev.
min[d] / notnull_cnt);
1575 stddev.
max[d] = sqrt(stddev.
max[d] / notnull_cnt);
1588 for ( i = 0; i < notnull_cnt; i++ )
1590 const ND_BOX *ndb = sample_boxes[i];
1594 POSTGIS_DEBUGF(4,
" feature %d is a hard deviant, skipped", i);
1595 sample_boxes[i] = NULL;
1606 histo_extent = histo_extent_new;
1621 sample_distribution);
1637 for ( d = 0; d < ndims; d++ )
1639 if ( sample_distribution[d] > 0 )
1643 if ( histo_ndims == 0 )
1647 POSTGIS_DEBUG(3,
" special case: no axes have variability");
1648 histo_cells_new = 1;
1649 for ( d = 0; d < ndims; d++ )
1651 histo_size[d] = (int)pow((
double)histo_cells_target, 1/(double)ndims);
1652 if ( ! histo_size[d] )
1654 POSTGIS_DEBUGF(3,
" histo_size[d]: %d", histo_size[d]);
1655 histo_cells_new *= histo_size[d];
1657 POSTGIS_DEBUGF(3,
" histo_cells_new: %d", histo_cells_new);
1666 POSTGIS_DEBUG(3,
" allocating histogram axes based on axis variability");
1667 total_distribution =
total_double(sample_distribution, ndims);
1668 POSTGIS_DEBUGF(3,
" total_distribution: %.8g", total_distribution);
1669 histo_cells_new = 1;
1670 for ( d = 0; d < ndims; d++ )
1672 if ( sample_distribution[d] == 0 )
1679 float edge_ratio = (float)sample_distribution[d] / (
float)total_distribution;
1685 histo_size[d] = (int)pow(histo_cells_target * histo_ndims * edge_ratio, 1/(
double)histo_ndims);
1687 if ( ! histo_size[d] )
1690 histo_cells_new *= histo_size[d];
1692 POSTGIS_DEBUGF(3,
" histo_cells_new: %d", histo_cells_new);
1696 histo_cells = histo_cells_new;
1697 POSTGIS_DEBUGF(3,
" histo_cells: %d", histo_cells);
1702 old_context = MemoryContextSwitchTo(stats->anl_context);
1703 nd_stats_size =
sizeof(
ND_STATS) + ((histo_cells - 1) *
sizeof(float4));
1704 nd_stats = palloc(nd_stats_size);
1705 memset(nd_stats, 0, nd_stats_size);
1706 MemoryContextSwitchTo(old_context);
1709 nd_stats->
ndims = ndims;
1710 nd_stats->
extent = histo_extent;
1715 for ( d = 0; d < ndims; d++ )
1716 nd_stats->
size[d] = histo_size[d];
1731 for ( i = 0; i < notnull_cnt; i++ )
1737 double num_cells = 0;
1738 double tmp_volume = 1.0;
1739 double min[
ND_DIMS] = {0.0, 0.0, 0.0, 0.0};
1740 double max[
ND_DIMS] = {0.0, 0.0, 0.0, 0.0};
1741 double cellsize[
ND_DIMS] = {0.0, 0.0, 0.0, 0.0};
1743 nd_box = sample_boxes[i];
1744 if ( ! nd_box )
continue;
1747 vacuum_delay_point();
1751 memset(at, 0,
sizeof(
int)*
ND_DIMS);
1753 POSTGIS_DEBUGF(3,
" feature %d: ibox (%d, %d, %d, %d) (%d, %d, %d, %d)", i,
1754 nd_ibox.
min[0], nd_ibox.
min[1], nd_ibox.
min[2], nd_ibox.
min[3],
1755 nd_ibox.
max[0], nd_ibox.
max[1], nd_ibox.
max[2], nd_ibox.
max[3]);
1757 for ( d = 0; d < nd_stats->
ndims; d++ )
1760 at[d] = nd_ibox.
min[d];
1763 cellsize[d] = (max[d] - min[d])/(nd_stats->
size[d]);
1766 tmp_volume *= (nd_box->
max[d] - nd_box->
min[d]);
1770 total_sample_volume += tmp_volume;
1778 ND_BOX nd_cell = { {0.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 0.0} };
1781 for ( d = 0; d < nd_stats->
ndims; d++ )
1783 nd_cell.
min[d] = min[d] + (at[d]+0) * cellsize[d];
1784 nd_cell.
max[d] = min[d] + (at[d]+1) * cellsize[d];
1795 POSTGIS_DEBUGF(3,
" ratio (%.8g) num_cells (%.8g)", ratio, num_cells);
1796 POSTGIS_DEBUGF(3,
" at (%d, %d, %d, %d)", at[0], at[1], at[2], at[3]);
1801 total_cell_count += num_cells;
1803 histogram_features++;
1806 POSTGIS_DEBUGF(3,
" histogram_features: %d", histogram_features);
1807 POSTGIS_DEBUGF(3,
" sample_rows: %d", sample_rows);
1808 POSTGIS_DEBUGF(3,
" table_rows: %.6g", total_rows);
1811 if ( ! histogram_features )
1813 POSTGIS_DEBUG(3,
" no stats have been gathered");
1814 elog(NOTICE,
" no features lie in the stats histogram, invalid stats");
1815 stats->stats_valid =
false;
1836 stats->stakind[stats_slot] = stats_kind;
1837 stats->staop[stats_slot] = InvalidOid;
1838 stats->stanumbers[stats_slot] = (float4*)nd_stats;
1839 stats->numnumbers[stats_slot] = nd_stats_size/
sizeof(float4);
1840 stats->stanullfrac = (float4)null_cnt/sample_rows;
1841 stats->stawidth = total_width/notnull_cnt;
1842 stats->stadistinct = -1.0;
1843 stats->stats_valid =
true;
1845 POSTGIS_DEBUGF(3,
" out: slot 0: kind %d (STATISTIC_KIND_ND)", stats->stakind[0]);
1846 POSTGIS_DEBUGF(3,
" out: slot 0: op %d (InvalidOid)", stats->staop[0]);
1847 POSTGIS_DEBUGF(3,
" out: slot 0: numnumbers %d", stats->numnumbers[0]);
1848 POSTGIS_DEBUGF(3,
" out: null fraction: %f=%d/%d", stats->stanullfrac, null_cnt, sample_rows);
1849 POSTGIS_DEBUGF(3,
" out: average width: %d bytes", stats->stawidth);
1850 POSTGIS_DEBUG (3,
" out: distinct values: all (no check done)");
int gbox_is_valid(const GBOX *gbox)
Return false if any of the dimensions is NaN or infinite.
struct ND_STATS_T ND_STATS
N-dimensional statistics structure.
static char * nd_stats_to_json(const ND_STATS *nd_stats)
Convert an ND_STATS to a JSON representation for external use.
static int nd_box_intersects(const ND_BOX *a, const ND_BOX *b, int ndims)
Return true if ND_BOX a overlaps b, false otherwise.
static int nd_box_init_bounds(ND_BOX *a)
Prepare an ND_BOX for bounds calculation: set the maxes to the smallest thing possible and the mins t...
static int nd_increment(ND_IBOX *ibox, int ndims, int *counter)
Given an n-d index array (counter), and a domain to increment it in (ibox) increment it by one,...
#define STATISTIC_SLOT_ND
static int gbox_ndims(const GBOX *gbox)
Given that geodetic boxes are X/Y/Z regardless of the underlying geometry dimensionality and other bo...
#define ND_DIMS
The maximum number of dimensions our code can handle.
#define STATISTIC_KIND_2D
static int nd_box_merge(const ND_BOX *source, ND_BOX *target)
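Expand the bounds of target to include source. (The description previously shown here, "Create a printable view of the ND_STATS histogram", appears to belong to a different function.)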
#define STATISTIC_KIND_ND
static double total_double(const double *vals, int nvals)
Given double array, return sum of values.
static void nd_box_from_gbox(const GBOX *gbox, ND_BOX *nd_box)
Set the values of an ND_BOX from a GBOX.
static int nd_box_init(ND_BOX *a)
Zero out an ND_BOX.
static int nd_box_expand(ND_BOX *nd_box, double expansion_factor)
Expand an ND_BOX ever so slightly.
static int nd_box_overlap(const ND_STATS *nd_stats, const ND_BOX *nd_box, ND_IBOX *nd_ibox)
What stats cells overlap with this ND_BOX? Put the lowest cell addresses in ND_IBOX->min and the high...
static double nd_box_ratio(const ND_BOX *b1, const ND_BOX *b2, int ndims)
Returns the proportion of b2 that is covered by b1.
static int nd_stats_value_index(const ND_STATS *stats, int *indexes)
Given a position in the n-d histogram (i,j,k) return the position in the 1-d values array.
static char * nd_box_to_json(const ND_BOX *nd_box, int ndims)
Convert an ND_BOX to a JSON string for printing.
#define STATISTIC_SLOT_2D
static int nd_box_array_distribution(const ND_BOX **nd_boxes, int num_boxes, const ND_BOX *extent, int ndims, double *distribution)
Calculate how much a set of boxes is homogeneously distributed or concentrated within one dimension,...
int gserialized_datum_get_gbox_p(Datum gsdatum, GBOX *gbox)
Given a GSERIALIZED datum, as quickly as possible (peeking into the top of the memory) return the gbo...
N-dimensional box type for calculations, to avoid doing explicit axis conversions from GBOX in all ca...
N-dimensional box index type.
float4 histogram_features
N-dimensional statistics structure.