The gserialized_analyze_nd function sets this function as a callback on the stats object when it is called by the ANALYZE command.
ANALYZE then gathers the requisite number of sample rows and then calls this function.
We could also pass stats->extra_data in from gserialized_analyze_nd (things like the column type or other stuff from the system catalogs) but so far we don't use that capability.
Our job is to build some statistics on the sample data for use by operator estimators.
We will populate an n-d histogram using the provided sample rows. The selectivity estimators (sel and joinsel) can then use the histogram.
1225{
1226 MemoryContext old_context;
1227 int d, i;
1228 int notnull_cnt = 0;
1229 int null_cnt = 0;
1230 int histogram_features = 0;
1231
1233 size_t nd_stats_size;
1234
1235 double total_width = 0;
1236 double total_cell_count = 0;
1237
1241
1242 const ND_BOX **sample_boxes;
1247 int histo_cells_target;
1248 int histo_cells;
1249 int histo_cells_new = 1;
1250
1251 int ndims = 2;
1252 int histo_ndims = 0;
1253 double sample_distribution[
ND_DIMS];
1254 double total_distribution;
1255
1256 int stats_slot;
1257 int stats_kind;
1258
1259
1265
1266
1267
1268
1269
1270
1271
1272 POSTGIS_DEBUG(2, "compute_gserialized_stats called");
1273 POSTGIS_DEBUGF(3, " # sample_rows: %d", sample_rows);
1274 POSTGIS_DEBUGF(3, " estimate of total_rows: %.6g", total_rows);
1275
1276
1277
1278
1279
1280 sample_boxes = palloc(
sizeof(
ND_BOX*) * sample_rows);
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292 for ( i = 0; i < sample_rows; i++ )
1293 {
1294 Datum datum;
1297 bool is_null;
1298
1299 datum = fetchfunc(stats, i, &is_null);
1300
1301
1302 if ( is_null )
1303 {
1304 POSTGIS_DEBUGF(4, " skipped null geometry %d", i);
1305 null_cnt++;
1306 continue;
1307 }
1308
1309
1311 {
1312
1313 POSTGIS_DEBUGF(3, " skipped empty geometry %d", i);
1314 continue;
1315 }
1316
1317
1318 if ( mode == 2 )
1320
1321
1323 {
1324 POSTGIS_DEBUGF(3, " skipped infinite/nan geometry %d", i);
1325 continue;
1326 }
1327
1328
1329
1330
1331
1332 if ( mode != 2 )
1334
1335
1336 nd_box = palloc(
sizeof(
ND_BOX));
1338
1339
1340 sample_boxes[notnull_cnt] = nd_box;
1341
1342
1343 if ( ! notnull_cnt )
1345
1346
1348
1349
1350 total_width += toast_raw_datum_size(datum);
1351
1352
1353 for ( d = 0; d < ndims; d++ )
1354 {
1355 sum.
min[d] += nd_box->
min[d];
1356 sum.
max[d] += nd_box->
max[d];
1357 }
1358
1359
1360 notnull_cnt++;
1361
1362
1363#if POSTGIS_PGSQL_VERSION >= 180
1364 vacuum_delay_point(true);
1365#else
1366 vacuum_delay_point();
1367#endif
1368 }
1369
1370#if POSTGIS_PGSQL_VERSION >= 170
1371 POSTGIS_DEBUGF(3, " stats->attstattarget: %d", stats->attstattarget);
1373#else
1374 POSTGIS_DEBUGF(3, " stats->attr->attstattarget: %d", stats->attr->attstattarget);
1376#endif
1377 POSTGIS_DEBUGF(3, " target # of histogram cells: %d", histo_cells_target);
1378
1379
1380 if ( ! notnull_cnt )
1381 {
1382 stats->stats_valid = false;
1383 return;
1384 }
1385
1386 POSTGIS_DEBUGF(3,
" sample_extent: %s",
nd_box_to_json(&sample_extent, ndims));
1387
1388
1389
1390
1391
1392 for ( d = 0; d < ndims; d++ )
1393 {
1394
1395 avg.
min[d] = sum.
min[d] / notnull_cnt;
1396 avg.
max[d] = sum.
max[d] / notnull_cnt;
1397
1398
1399 for ( i = 0; i < notnull_cnt; i++ )
1400 {
1401 const ND_BOX *ndb = sample_boxes[i];
1402 stddev.
min[d] += (ndb->
min[d] - avg.
min[d]) * (ndb->
min[d] - avg.
min[d]);
1403 stddev.
max[d] += (ndb->
max[d] - avg.
max[d]) * (ndb->
max[d] - avg.
max[d]);
1404 }
1405 stddev.
min[d] = sqrt(stddev.
min[d] / notnull_cnt);
1406 stddev.
max[d] = sqrt(stddev.
max[d] / notnull_cnt);
1407
1408
1411 }
1412
1413
1414
1415
1416
1417
1419 for ( i = 0; i < notnull_cnt; i++ )
1420 {
1421 const ND_BOX *ndb = sample_boxes[i];
1422
1424 {
1425 POSTGIS_DEBUGF(4, " feature %d is a hard deviant, skipped", i);
1426 sample_boxes[i] = NULL;
1427 continue;
1428 }
1429
1431 }
1432
1433
1434
1435
1437 histo_extent = histo_extent_new;
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1452 sample_distribution);
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468 for ( d = 0; d < ndims; d++ )
1469 {
1470 if ( sample_distribution[d] > 0 )
1471 histo_ndims++;
1472 }
1473
1474 if ( histo_ndims == 0 )
1475 {
1476
1477
1478 POSTGIS_DEBUG(3, " special case: no axes have variability");
1479 histo_cells_new = 1;
1480 for ( d = 0; d < ndims; d++ )
1481 {
1482 histo_size[d] = (int)pow((double)histo_cells_target, 1/(double)ndims);
1483 if ( ! histo_size[d] )
1484 histo_size[d] = 1;
1485 POSTGIS_DEBUGF(3, " histo_size[d]: %d", histo_size[d]);
1486 histo_cells_new *= histo_size[d];
1487 }
1488 POSTGIS_DEBUGF(3, " histo_cells_new: %d", histo_cells_new);
1489 }
1490 else
1491 {
1492
1493
1494
1495
1496
1497 POSTGIS_DEBUG(3, " allocating histogram axes based on axis variability");
1498 total_distribution =
total_double(sample_distribution, ndims);
1499 POSTGIS_DEBUGF(3, " total_distribution: %.8g", total_distribution);
1500 histo_cells_new = 1;
1501 for ( d = 0; d < ndims; d++ )
1502 {
1503 if ( sample_distribution[d] == 0 )
1504 {
1505 histo_size[d] = 1;
1506 }
1507 else
1508 {
1509
1510 float edge_ratio = (float)sample_distribution[d] / (float)total_distribution;
1511
1512
1513
1514
1515
1516
1517
1519 }
1520 histo_cells_new *= histo_size[d];
1521 }
1522 POSTGIS_DEBUGF(3, " histo_cells_new: %d", histo_cells_new);
1523 }
1524
1525
1526 histo_cells = histo_cells_new;
1527 POSTGIS_DEBUGF(3, " histo_cells: %d", histo_cells);
1528
1529
1530
1531
1532 old_context = MemoryContextSwitchTo(stats->anl_context);
1533 nd_stats_size =
sizeof(
ND_STATS) + ((histo_cells - 1) *
sizeof(float4));
1534 nd_stats = palloc(nd_stats_size);
1535 memset(nd_stats, 0, nd_stats_size);
1536 MemoryContextSwitchTo(old_context);
1537
1538
1539 nd_stats->
ndims = ndims;
1540 nd_stats->
extent = histo_extent;
1544
1545 for ( d = 0; d < ndims; d++ )
1546 nd_stats->
size[d] = histo_size[d];
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561 for ( i = 0; i < notnull_cnt; i++ )
1562 {
1566 double num_cells = 0;
1567 double min[
ND_DIMS] = {0.0, 0.0, 0.0, 0.0};
1568 double max[
ND_DIMS] = {0.0, 0.0, 0.0, 0.0};
1569 double cellsize[
ND_DIMS] = {0.0, 0.0, 0.0, 0.0};
1570
1571 nd_box = sample_boxes[i];
1572 if ( ! nd_box ) continue;
1573
1574
1575#if POSTGIS_PGSQL_VERSION >= 180
1576 vacuum_delay_point(true);
1577#else
1578 vacuum_delay_point();
1579#endif
1580
1581
1583 memset(at, 0,
sizeof(
int)*
ND_DIMS);
1584
1585 POSTGIS_DEBUGF(3, " feature %d: ibox (%d, %d, %d, %d) (%d, %d, %d, %d)", i,
1586 nd_ibox.
min[0], nd_ibox.
min[1], nd_ibox.
min[2], nd_ibox.
min[3],
1587 nd_ibox.
max[0], nd_ibox.
max[1], nd_ibox.
max[2], nd_ibox.
max[3]);
1588
1589 for ( d = 0; d < nd_stats->
ndims; d++ )
1590 {
1591
1592 at[d] = nd_ibox.
min[d];
1595 cellsize[d] = (max[d] - min[d])/(nd_stats->
size[d]);
1596 }
1597
1598
1599
1600
1601
1602 do
1603 {
1604 ND_BOX nd_cell = { {0.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 0.0} };
1605 double ratio;
1606
1607 for ( d = 0; d < nd_stats->
ndims; d++ )
1608 {
1609 nd_cell.
min[d] = min[d] + (at[d]+0) * cellsize[d];
1610 nd_cell.
max[d] = min[d] + (at[d]+1) * cellsize[d];
1611 }
1612
1613
1614
1615
1616
1617
1620 num_cells += ratio;
1621 POSTGIS_DEBUGF(3, " ratio (%.8g) num_cells (%.8g)", ratio, num_cells);
1622 POSTGIS_DEBUGF(3, " at (%d, %d, %d, %d)", at[0], at[1], at[2], at[3]);
1623 }
1625
1626
1627 total_cell_count += num_cells;
1628
1629 histogram_features++;
1630 }
1631
1632 POSTGIS_DEBUGF(3, " histogram_features: %d", histogram_features);
1633 POSTGIS_DEBUGF(3, " sample_rows: %d", sample_rows);
1634 POSTGIS_DEBUGF(3, " table_rows: %.6g", total_rows);
1635
1636
1637 if ( ! histogram_features )
1638 {
1639 POSTGIS_DEBUG(3, " no stats have been gathered");
1640 elog(NOTICE, " no features lie in the stats histogram, invalid stats");
1641 stats->stats_valid = false;
1642 return;
1643 }
1644
1648
1649
1650 if ( mode == 2 )
1651 {
1654 }
1655 else
1656 {
1659 }
1660
1661
1662 stats->stakind[stats_slot] = stats_kind;
1663 stats->staop[stats_slot] = InvalidOid;
1664 stats->stanumbers[stats_slot] = (float4*)nd_stats;
1665 stats->numnumbers[stats_slot] = nd_stats_size/sizeof(float4);
1666 stats->stanullfrac = (float4)null_cnt/sample_rows;
1667 stats->stawidth = total_width/notnull_cnt;
1668 stats->stadistinct = -1.0;
1669 stats->stats_valid = true;
1670
1671 POSTGIS_DEBUGF(3, " out: slot 0: kind %d (STATISTIC_KIND_ND)", stats->stakind[0]);
1672 POSTGIS_DEBUGF(3, " out: slot 0: op %d (InvalidOid)", stats->staop[0]);
1673 POSTGIS_DEBUGF(3, " out: slot 0: numnumbers %d", stats->numnumbers[0]);
1674 POSTGIS_DEBUGF(3, " out: null fraction: %f=%d/%d", stats->stanullfrac, null_cnt, sample_rows);
1675 POSTGIS_DEBUGF(3, " out: average width: %d bytes", stats->stawidth);
1676 POSTGIS_DEBUG (3, " out: distinct values: all (no check done)");
1678
1679
1680
1681
1682 return;
1683}
int gbox_is_valid(const GBOX *gbox)
Return false if any of the dimensions is NaN or infinite.
static int nd_box_intersects(const ND_BOX *a, const ND_BOX *b, int ndims)
Return true if ND_BOX a overlaps b, false otherwise.
static int nd_box_init_bounds(ND_BOX *a)
Prepare an ND_BOX for bounds calculation: set the maxes to the smallest thing possible and the mins t...
static int nd_increment(ND_IBOX *ibox, int ndims, int *counter)
Given an n-d index array (counter), and a domain to increment it in (ibox) increment it by one,...
#define STATISTIC_SLOT_ND
static int gbox_ndims(const GBOX *gbox)
Given that geodetic boxes are X/Y/Z regardless of the underlying geometry dimensionality and other bo...
static char * nd_box_to_json(const ND_BOX *nd_box, int ndims)
Convert an ND_BOX to a JSON string for printing.
static char * nd_stats_to_json(const ND_STATS *nd_stats)
Convert an ND_STATS to a JSON representation for external use.
#define STATISTIC_KIND_2D
static int nd_box_merge(const ND_BOX *source, ND_BOX *target)
Expand the bounds of target to include source.
#define STATISTIC_KIND_ND
static double total_double(const double *vals, int nvals)
Given double array, return sum of values.
static void nd_box_from_gbox(const GBOX *gbox, ND_BOX *nd_box)
Set the values of an ND_BOX from a GBOX.
static int nd_box_init(ND_BOX *a)
Zero out an ND_BOX.
static int nd_box_expand(ND_BOX *nd_box, double expansion_factor)
Expand an ND_BOX ever so slightly.
static int nd_box_overlap(const ND_STATS *nd_stats, const ND_BOX *nd_box, ND_IBOX *nd_ibox)
What stats cells overlap with this ND_BOX? Put the lowest cell addresses in ND_IBOX->min and the high...
#define STATISTIC_SLOT_2D
static int nd_box_array_distribution(const ND_BOX **nd_boxes, int num_boxes, const ND_BOX *extent, int ndims, double *distribution)
Calculate how much a set of boxes is homogeneously distributed or concentrated within one dimension,...
struct ND_STATS_T ND_STATS
static int histogram_axis_cells(int histo_cells_target, int histo_ndims, double edge_ratio)
static double nd_box_ratio(const ND_BOX *cover, const ND_BOX *target, int ndims)
static int histogram_cell_budget(double total_rows, int ndims, int attstattarget)
static int nd_stats_value_index(const ND_STATS *stats, const int *indexes)
int gserialized_datum_get_gbox_p(Datum gsdatum, GBOX *gbox)
Given a GSERIALIZED datum, as quickly as possible (peeking into the top of the memory) return the gbo...
float4 histogram_features