gserialized_analyze_nd sets this function as a callback on the stats object when it is called by the ANALYZE command.
ANALYZE then gathers the requisite number of sample rows and calls this function.
We could also pass stats->extra_data in from gserialized_analyze_nd (things like the column type or other stuff from the system catalogs) but so far we don't use that capability.
Our job is to build some statistics on the sample data for use by operator estimators.
We will populate an n-d histogram using the provided sample rows. The selectivity estimators (sel and joinsel) can then use the histogram.
1229{
1230 MemoryContext old_context;
1231 int d, i;
1232 int notnull_cnt = 0;
1233 int null_cnt = 0;
1234 int histogram_features = 0;
1235
1237 size_t nd_stats_size;
1238
1239 double total_width = 0;
1240 double total_cell_count = 0;
1241
1245
1246 const ND_BOX **sample_boxes;
1251 int histo_cells_target;
1252 int histo_cells;
1253 int histo_cells_new = 1;
1254
1255 int ndims = 2;
1256 int histo_ndims = 0;
1257 double sample_distribution[
ND_DIMS];
1258 double total_distribution;
1259
1260 int stats_slot;
1261 int stats_kind;
1262
1263
1269
1270
1271
1272
1273
1274
1275
1276 POSTGIS_DEBUG(2, "compute_gserialized_stats called");
1277 POSTGIS_DEBUGF(3, " # sample_rows: %d", sample_rows);
1278 POSTGIS_DEBUGF(3, " estimate of total_rows: %.6g", total_rows);
1279
1280
1281
1282
1283
1284 sample_boxes = palloc(
sizeof(
ND_BOX*) * sample_rows);
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296 for ( i = 0; i < sample_rows; i++ )
1297 {
1298 Datum datum;
1301 bool is_null;
1302
1303 datum = fetchfunc(stats, i, &is_null);
1304
1305
1306 if ( is_null )
1307 {
1308 POSTGIS_DEBUGF(4, " skipped null geometry %d", i);
1309 null_cnt++;
1310 continue;
1311 }
1312
1313
1315 {
1316
1317 POSTGIS_DEBUGF(3, " skipped empty geometry %d", i);
1318 continue;
1319 }
1320
1321
1322 if ( mode == 2 )
1324
1325
1327 {
1328 POSTGIS_DEBUGF(3, " skipped infinite/nan geometry %d", i);
1329 continue;
1330 }
1331
1332
1333
1334
1335
1336 if ( mode != 2 )
1338
1339
1340 nd_box = palloc(
sizeof(
ND_BOX));
1342
1343
1344 sample_boxes[notnull_cnt] = nd_box;
1345
1346
1347 if ( ! notnull_cnt )
1349
1350
1352
1353
1354 total_width += toast_raw_datum_size(datum);
1355
1356
1357 for ( d = 0; d < ndims; d++ )
1358 {
1359 sum.
min[d] += nd_box->
min[d];
1360 sum.
max[d] += nd_box->
max[d];
1361 }
1362
1363
1364 notnull_cnt++;
1365
1366
1367#if POSTGIS_PGSQL_VERSION >= 180
1368 vacuum_delay_point(true);
1369#else
1370 vacuum_delay_point();
1371#endif
1372 }
1373
1374#if POSTGIS_PGSQL_VERSION >= 170
1375 POSTGIS_DEBUGF(3, " stats->attstattarget: %d", stats->attstattarget);
1377#else
1378 POSTGIS_DEBUGF(3, " stats->attr->attstattarget: %d", stats->attr->attstattarget);
1380#endif
1381 POSTGIS_DEBUGF(3, " target # of histogram cells: %d", histo_cells_target);
1382
1383
1384 if ( ! notnull_cnt )
1385 {
1386 stats->stats_valid = false;
1387 return;
1388 }
1389
1390 POSTGIS_DEBUGF(3,
" sample_extent: %s",
nd_box_to_json(&sample_extent, ndims));
1391
1392
1393
1394
1395
1396 for ( d = 0; d < ndims; d++ )
1397 {
1398
1399 avg.
min[d] = sum.
min[d] / notnull_cnt;
1400 avg.
max[d] = sum.
max[d] / notnull_cnt;
1401
1402
1403 for ( i = 0; i < notnull_cnt; i++ )
1404 {
1405 const ND_BOX *ndb = sample_boxes[i];
1406 stddev.
min[d] += (ndb->
min[d] - avg.
min[d]) * (ndb->
min[d] - avg.
min[d]);
1407 stddev.
max[d] += (ndb->
max[d] - avg.
max[d]) * (ndb->
max[d] - avg.
max[d]);
1408 }
1409 stddev.
min[d] = sqrt(stddev.
min[d] / notnull_cnt);
1410 stddev.
max[d] = sqrt(stddev.
max[d] / notnull_cnt);
1411
1412
1415 }
1416
1417
1418
1419
1420
1421
1423 for ( i = 0; i < notnull_cnt; i++ )
1424 {
1425 const ND_BOX *ndb = sample_boxes[i];
1426
1428 {
1429 POSTGIS_DEBUGF(4, " feature %d is a hard deviant, skipped", i);
1430 sample_boxes[i] = NULL;
1431 continue;
1432 }
1433
1435 }
1436
1437
1438
1439
1441 histo_extent = histo_extent_new;
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1456 sample_distribution);
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472 for ( d = 0; d < ndims; d++ )
1473 {
1474 if ( sample_distribution[d] > 0 )
1475 histo_ndims++;
1476 }
1477
1478 if ( histo_ndims == 0 )
1479 {
1480
1481
1482 POSTGIS_DEBUG(3, " special case: no axes have variability");
1483 histo_cells_new = 1;
1484 for ( d = 0; d < ndims; d++ )
1485 {
1486 histo_size[d] = (int)pow((double)histo_cells_target, 1/(double)ndims);
1487 if ( ! histo_size[d] )
1488 histo_size[d] = 1;
1489 POSTGIS_DEBUGF(3, " histo_size[d]: %d", histo_size[d]);
1490 histo_cells_new *= histo_size[d];
1491 }
1492 POSTGIS_DEBUGF(3, " histo_cells_new: %d", histo_cells_new);
1493 }
1494 else
1495 {
1496
1497
1498
1499
1500
1501 POSTGIS_DEBUG(3, " allocating histogram axes based on axis variability");
1502 total_distribution =
total_double(sample_distribution, ndims);
1503 POSTGIS_DEBUGF(3, " total_distribution: %.8g", total_distribution);
1504 histo_cells_new = 1;
1505 for ( d = 0; d < ndims; d++ )
1506 {
1507 if ( sample_distribution[d] == 0 )
1508 {
1509 histo_size[d] = 1;
1510 }
1511 else
1512 {
1513
1514 float edge_ratio = (float)sample_distribution[d] / (float)total_distribution;
1515
1516
1517
1518
1519
1520
1521
1523 }
1524 histo_cells_new *= histo_size[d];
1525 }
1526 POSTGIS_DEBUGF(3, " histo_cells_new: %d", histo_cells_new);
1527 }
1528
1529
1530 histo_cells = histo_cells_new;
1531 POSTGIS_DEBUGF(3, " histo_cells: %d", histo_cells);
1532
1533
1534
1535
1536 old_context = MemoryContextSwitchTo(stats->anl_context);
1537 nd_stats_size =
sizeof(
ND_STATS) + ((histo_cells - 1) *
sizeof(float4));
1538 nd_stats = palloc(nd_stats_size);
1539 memset(nd_stats, 0, nd_stats_size);
1540 MemoryContextSwitchTo(old_context);
1541
1542
1543 nd_stats->
ndims = ndims;
1544 nd_stats->
extent = histo_extent;
1548
1549 for ( d = 0; d < ndims; d++ )
1550 nd_stats->
size[d] = histo_size[d];
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565 for ( i = 0; i < notnull_cnt; i++ )
1566 {
1570 double num_cells = 0;
1571 double min[
ND_DIMS] = {0.0, 0.0, 0.0, 0.0};
1572 double max[
ND_DIMS] = {0.0, 0.0, 0.0, 0.0};
1573 double cellsize[
ND_DIMS] = {0.0, 0.0, 0.0, 0.0};
1574
1575 nd_box = sample_boxes[i];
1576 if ( ! nd_box ) continue;
1577
1578
1579#if POSTGIS_PGSQL_VERSION >= 180
1580 vacuum_delay_point(true);
1581#else
1582 vacuum_delay_point();
1583#endif
1584
1585
1587 memset(at, 0,
sizeof(
int)*
ND_DIMS);
1588
1589 POSTGIS_DEBUGF(3, " feature %d: ibox (%d, %d, %d, %d) (%d, %d, %d, %d)", i,
1590 nd_ibox.
min[0], nd_ibox.
min[1], nd_ibox.
min[2], nd_ibox.
min[3],
1591 nd_ibox.
max[0], nd_ibox.
max[1], nd_ibox.
max[2], nd_ibox.
max[3]);
1592
1593 for ( d = 0; d < nd_stats->
ndims; d++ )
1594 {
1595
1596 at[d] = nd_ibox.
min[d];
1599 cellsize[d] = (max[d] - min[d])/(nd_stats->
size[d]);
1600 }
1601
1602
1603
1604
1605
1606 do
1607 {
1608 ND_BOX nd_cell = { {0.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 0.0} };
1609 double ratio;
1610
1611 for ( d = 0; d < nd_stats->
ndims; d++ )
1612 {
1613 nd_cell.
min[d] = min[d] + (at[d]+0) * cellsize[d];
1614 nd_cell.
max[d] = min[d] + (at[d]+1) * cellsize[d];
1615 }
1616
1617
1618
1619
1620
1621
1624 num_cells += ratio;
1625 POSTGIS_DEBUGF(3, " ratio (%.8g) num_cells (%.8g)", ratio, num_cells);
1626 POSTGIS_DEBUGF(3, " at (%d, %d, %d, %d)", at[0], at[1], at[2], at[3]);
1627 }
1629
1630
1631 total_cell_count += num_cells;
1632
1633 histogram_features++;
1634 }
1635
1636 POSTGIS_DEBUGF(3, " histogram_features: %d", histogram_features);
1637 POSTGIS_DEBUGF(3, " sample_rows: %d", sample_rows);
1638 POSTGIS_DEBUGF(3, " table_rows: %.6g", total_rows);
1639
1640
1641 if ( ! histogram_features )
1642 {
1643 POSTGIS_DEBUG(3, " no stats have been gathered");
1644 elog(NOTICE, " no features lie in the stats histogram, invalid stats");
1645 stats->stats_valid = false;
1646 return;
1647 }
1648
1652
1653
1654 if ( mode == 2 )
1655 {
1658 }
1659 else
1660 {
1663 }
1664
1665
1666 stats->stakind[stats_slot] = stats_kind;
1667 stats->staop[stats_slot] = InvalidOid;
1668 stats->stanumbers[stats_slot] = (float4*)nd_stats;
1669 stats->numnumbers[stats_slot] = nd_stats_size/sizeof(float4);
1670 stats->stanullfrac = (float4)null_cnt/sample_rows;
1671 stats->stawidth = total_width/notnull_cnt;
1672 stats->stadistinct = -1.0;
1673 stats->stats_valid = true;
1674
1675 POSTGIS_DEBUGF(3, " out: slot 0: kind %d (STATISTIC_KIND_ND)", stats->stakind[0]);
1676 POSTGIS_DEBUGF(3, " out: slot 0: op %d (InvalidOid)", stats->staop[0]);
1677 POSTGIS_DEBUGF(3, " out: slot 0: numnumbers %d", stats->numnumbers[0]);
1678 POSTGIS_DEBUGF(3, " out: null fraction: %f=%d/%d", stats->stanullfrac, null_cnt, sample_rows);
1679 POSTGIS_DEBUGF(3, " out: average width: %d bytes", stats->stawidth);
1680 POSTGIS_DEBUG (3, " out: distinct values: all (no check done)");
1682
1683
1684
1685
1686 return;
1687}
int gbox_is_valid(const GBOX *gbox)
Return false if any of the dimensions is NaN or infinite.
static int nd_box_intersects(const ND_BOX *a, const ND_BOX *b, int ndims)
Return true if ND_BOX a overlaps b, false otherwise.
static int nd_box_init_bounds(ND_BOX *a)
Prepare an ND_BOX for bounds calculation: set the maxes to the smallest thing possible and the mins to the largest.
static int nd_increment(ND_IBOX *ibox, int ndims, int *counter)
Given an n-d index array (counter), and a domain to increment it in (ibox) increment it by one,...
#define STATISTIC_SLOT_ND
static int gbox_ndims(const GBOX *gbox)
Given that geodetic boxes are X/Y/Z regardless of the underlying geometry dimensionality and other bo...
static char * nd_box_to_json(const ND_BOX *nd_box, int ndims)
Convert an ND_BOX to a JSON string for printing.
static char * nd_stats_to_json(const ND_STATS *nd_stats)
Convert an ND_STATS to a JSON representation for external use.
#define STATISTIC_KIND_2D
static int nd_box_merge(const ND_BOX *source, ND_BOX *target)
Expand the bounds of target to include source.
#define STATISTIC_KIND_ND
static double total_double(const double *vals, int nvals)
Given double array, return sum of values.
static void nd_box_from_gbox(const GBOX *gbox, ND_BOX *nd_box)
Set the values of an ND_BOX from a GBOX.
static int nd_box_init(ND_BOX *a)
Zero out an ND_BOX.
static int nd_box_expand(ND_BOX *nd_box, double expansion_factor)
Expand an ND_BOX ever so slightly.
static int nd_box_overlap(const ND_STATS *nd_stats, const ND_BOX *nd_box, ND_IBOX *nd_ibox)
What stats cells overlap with this ND_BOX? Put the lowest cell addresses in ND_IBOX->min and the highest in ND_IBOX->max.
#define STATISTIC_SLOT_2D
static int nd_box_array_distribution(const ND_BOX **nd_boxes, int num_boxes, const ND_BOX *extent, int ndims, double *distribution)
Calculate how much a set of boxes is homogeneously distributed or concentrated within one dimension,...
struct ND_STATS_T ND_STATS
static int histogram_axis_cells(int histo_cells_target, int histo_ndims, double edge_ratio)
static double nd_box_ratio(const ND_BOX *cover, const ND_BOX *target, int ndims)
static int histogram_cell_budget(double total_rows, int ndims, int attstattarget)
static int nd_stats_value_index(const ND_STATS *stats, const int *indexes)
int gserialized_datum_get_gbox_p(Datum gsdatum, GBOX *gbox)
Given a GSERIALIZED datum, as quickly as possible (peeking into the top of the memory) return the gbo...
float4 histogram_features