◆ compute_gserialized_stats_mode()

static void compute_gserialized_stats_mode	(	VacAttrStats *	stats,
		AnalyzeAttrFetchFunc	fetchfunc,
		int	sample_rows,
		double	total_rows,
		int	mode
	)
static
gserialized_analyze_nd sets this function as a callback on the stats object when it is called by the ANALYZE command.
ANALYZE then gathers the requisite number of sample rows and then calls this function.
We could also pass stats->extra_data in from gserialized_analyze_nd (things like the column type or other stuff from the system catalogs) but so far we don't use that capability.
Our job is to build some statistics on the sample data for use by operator estimators.
We will populate an n-d histogram using the provided sample rows. The selectivity estimators (sel and joinsel) can then use the histogram.
Definition at line 1223 of file gserialized_estimate.c.
{
        /*
         * Build an n-d histogram (ND_STATS) from the ANALYZE sample rows and
         * store it in the statistics slot selected by mode.
         *
         * stats       - statistics object to fill in. stats_valid is set to
         *               false (and nothing else is written) when no usable
         *               sample box is found, or when no box ends up counted
         *               in the histogram.
         * fetchfunc   - called as fetchfunc(stats, i, &is_null) to obtain
         *               sample row i.
         * sample_rows - number of sample rows to fetch.
         * total_rows  - estimated table row count; passed to
         *               histogram_cell_budget() and recorded as
         *               table_features.
         * mode        - 2 zeroes the Z/M bounds, keeps ndims at 2 and writes
         *               to the 2D slot/kind; any other value lets ndims grow
         *               to the highest dimensionality seen in the sample and
         *               writes to the ND slot/kind.
         */
        MemoryContext old_context;
        int d, i;                          /* Counters */
        int notnull_cnt = 0;               /* # not null rows in the sample */
        int null_cnt = 0;                  /* # null rows in the sample */
        int histogram_features = 0;        /* # rows that actually got counted in the histogram */
 
        ND_STATS *nd_stats;                /* Our histogram */
        size_t    nd_stats_size;           /* Size to allocate */
 
        double total_width = 0;            /* # of bytes used by sample */
        double total_cell_count = 0;       /* # of cells in histogram affected by sample */
 
        ND_BOX sum;                        /* Sum of extents of sample boxes */
        ND_BOX avg;                        /* Avg of extents of sample boxes */
        ND_BOX stddev;                     /* StdDev of extents of sample boxes */
 
        const ND_BOX **sample_boxes;       /* ND_BOXes for each of the sample features */
        ND_BOX sample_extent;              /* Extent of the raw sample */
        int    histo_size[ND_DIMS];        /* histogram nrows, ncols, etc */
        ND_BOX histo_extent;               /* Spatial extent of the histogram */
        ND_BOX histo_extent_new;           /* Temporary variable */
        int    histo_cells_target;         /* Number of cells we will shoot for, given the stats target */
        int    histo_cells;                /* Number of cells in the histogram */
        int    histo_cells_new = 1;        /* Temporary variable */
 
        int   ndims = 2;                    /* Dimensionality of the sample */
        int   histo_ndims = 0;              /* Dimensionality of the histogram */
        double sample_distribution[ND_DIMS]; /* How homogeneous is distribution of sample in each axis? */
        double total_distribution;           /* Total of sample_distribution */
 
        int stats_slot;                     /* What slot is this data going into? (2D vs ND) */
        int stats_kind;                     /* And this is what? (2D vs ND) */
 
        /* Initialize the accumulators (sum, stddev, avg) and both histogram extents */
        nd_box_init(&sum);
        nd_box_init(&stddev);
        nd_box_init(&avg);
        nd_box_init(&histo_extent);
        nd_box_init(&histo_extent_new);
 
        /*
         * This is where gserialized_analyze_nd
         * should put its custom parameters.
         */
        /* void *mystats = stats->extra_data; */
 
        POSTGIS_DEBUG(2, "compute_gserialized_stats called");
        POSTGIS_DEBUGF(3, " # sample_rows: %d", sample_rows);
        POSTGIS_DEBUGF(3, " estimate of total_rows: %.6g", total_rows);
 
        /*
         * We might need less space, but don't think
         * it's worth saving...
         * (One pointer per sample row; only the first notnull_cnt entries
         * end up being used.)
         */
        sample_boxes = palloc(sizeof(ND_BOX*) * sample_rows);
 
        /*
         * First scan:
         *  o read boxes
         *  o find dimensionality of the sample
         *  o find extent of the sample
         *  o count null-infinite/not-null values
         *  o compute total_width
         *  o compute total features' box area (for avgFeatureArea)
         *  o sum features box coordinates (for standard deviation)
         */
        for ( i = 0; i < sample_rows; i++ )
        {
                Datum datum;
                GBOX gbox = {0};
                ND_BOX *nd_box;
                bool is_null;
 
                datum = fetchfunc(stats, i, &is_null);
 
                /* Skip all NULLs. */
                if ( is_null )
                {
                        POSTGIS_DEBUGF(4, " skipped null geometry %d", i);
                        null_cnt++;
                        continue;
                }
 
                /* Read the bounds from the gserialized. */
                if (LW_FAILURE == gserialized_datum_get_gbox_p(datum, &gbox))
                {
                        /*
                         * Skip empties too.
                         * Note: these rows increment neither null_cnt nor
                         * notnull_cnt.
                         */
                        POSTGIS_DEBUGF(3, " skipped empty geometry %d", i);
                        continue;
                }
 
                /* If we're in 2D mode, zero out the higher dimensions for "safety" */
                if ( mode == 2 )
                        gbox.zmin = gbox.zmax = gbox.mmin = gbox.mmax = 0.0;
 
                /*
                 * Check bounds for validity (finite and not NaN).
                 * As with empties, rejected rows are counted in neither
                 * null_cnt nor notnull_cnt.
                 */
                if ( ! gbox_is_valid(&gbox) )
                {
                        POSTGIS_DEBUGF(3, " skipped infinite/nan geometry %d", i);
                        continue;
                }
 
                /*
                 * In N-D mode, set the ndims to the maximum dimensionality found
                 * in the sample. Otherwise, leave at ndims == 2.
                 */
                if ( mode != 2 )
                        ndims = Max(gbox_ndims(&gbox), ndims);
 
                /* Convert gbox to n-d box */
                nd_box = palloc(sizeof(ND_BOX));
                nd_box_from_gbox(&gbox, nd_box);
 
                /*
                 * Cache n-d bounding box. Indexed by notnull_cnt (not i) so
                 * the usable boxes are packed densely at the front of the array.
                 */
                sample_boxes[notnull_cnt] = nd_box;
 
                /* Initialize sample extent before merging first entry */
                if ( ! notnull_cnt )
                        nd_box_init_bounds(&sample_extent);
 
                /* Add current sample to overall sample extent */
                nd_box_merge(nd_box, &sample_extent);
 
                /* How many bytes does this sample use? */
                total_width += toast_raw_datum_size(datum);
 
                /*
                 * Add bounds coordinates to sums for stddev calculation.
                 * The loop bound is the dimensionality seen so far, since
                 * ndims can still grow later in this scan in N-D mode.
                 */
                for ( d = 0; d < ndims; d++ )
                {
                        sum.min[d] += nd_box->min[d];
                        sum.max[d] += nd_box->max[d];
                }
 
                /* Increment our "good feature" count */
                notnull_cnt++;
 
                /* Give backend a chance of interrupting us */
#if POSTGIS_PGSQL_VERSION >= 180
                vacuum_delay_point(true);
#else
                vacuum_delay_point();
#endif
        }
 
        /* The stats target lives in a different place depending on the PostgreSQL version */
#if POSTGIS_PGSQL_VERSION >= 170
        POSTGIS_DEBUGF(3, " stats->attstattarget: %d", stats->attstattarget);
        histo_cells_target = histogram_cell_budget(total_rows, ndims, stats->attstattarget);
#else
        POSTGIS_DEBUGF(3, " stats->attr->attstattarget: %d", stats->attr->attstattarget);
        histo_cells_target = histogram_cell_budget(total_rows, ndims, stats->attr->attstattarget);
#endif
        POSTGIS_DEBUGF(3, " target # of histogram cells: %d", histo_cells_target);
 
        /* If there's no useful features, we can't work out stats */
        if ( ! notnull_cnt )
        {
                stats->stats_valid = false;
                return;
        }
 
        POSTGIS_DEBUGF(3, " sample_extent: %s", nd_box_to_json(&sample_extent, ndims));
 
        /*
         * Second scan:
         *  o compute standard deviation
         */
        for ( d = 0; d < ndims; d++ )
        {
                /* Calculate average bounds values */
                avg.min[d] = sum.min[d] / notnull_cnt;
                avg.max[d] = sum.max[d] / notnull_cnt;
 
                /* Calculate standard deviation for this dimension bounds */
                for ( i = 0; i < notnull_cnt; i++ )
                {
                        const ND_BOX *ndb = sample_boxes[i];
                        stddev.min[d] += (ndb->min[d] - avg.min[d]) * (ndb->min[d] - avg.min[d]);
                        stddev.max[d] += (ndb->max[d] - avg.max[d]) * (ndb->max[d] - avg.max[d]);
                }
                stddev.min[d] = sqrt(stddev.min[d] / notnull_cnt);
                stddev.max[d] = sqrt(stddev.max[d] / notnull_cnt);
 
                /*
                 * Histogram bounds for this dimension bounds is avg +/- SDFACTOR * stdev,
                 * clamped so it never extends beyond the raw sample extent.
                 */
                histo_extent.min[d] = Max(avg.min[d] - SDFACTOR * stddev.min[d], sample_extent.min[d]);
                histo_extent.max[d] = Min(avg.max[d] + SDFACTOR * stddev.max[d], sample_extent.max[d]);
        }
 
        /*
         * Third scan:
         *   o skip hard deviants
         *   o compute new histogram box
         */
        nd_box_init_bounds(&histo_extent_new);
        for ( i = 0; i < notnull_cnt; i++ )
        {
                const ND_BOX *ndb = sample_boxes[i];
                /* Skip any hard deviants (boxes entirely outside our histo_extent) */
                if ( ! nd_box_intersects(&histo_extent, ndb, ndims) )
                {
                        POSTGIS_DEBUGF(4, " feature %d is a hard deviant, skipped", i);
                        /* NULL marks the entry so the fourth scan skips it */
                        sample_boxes[i] = NULL;
                        continue;
                }
                /* Expand our new box to fit all the other features. */
                nd_box_merge(ndb, &histo_extent_new);
        }
        /*
         * Expand the box slightly (1%) to avoid edge effects
         * with objects that are on the boundary
         */
        nd_box_expand(&histo_extent_new, 0.01);
        histo_extent = histo_extent_new;
 
        /*
         * How should we allocate our histogram cells to the
         * different dimensions? We can't do it by raw dimensional width,
         * because in x/y/z space, the z can have different units
         * from the x/y. Similarly for x/y/t space.
         * So, we instead calculate how much features overlap
         * each other in their dimension to figure out which
         *  dimensions have useful selectivity characteristics (more
         * variability in density) and therefore would find
         * more cells useful (to distinguish between dense places and
         * homogeneous places).
         *
         * NOTE(review): sample_boxes may contain NULL entries at this point
         * (hard deviants from the third scan); presumably
         * nd_box_array_distribution() tolerates them -- confirm.
         */
        nd_box_array_distribution(sample_boxes, notnull_cnt, &histo_extent, ndims,
                                  sample_distribution);
 
        /*
         * The sample_distribution array now tells us how spread out the
         * data is in each dimension, so we use that data to allocate
         * the histogram cells we have available.
         * At this point, histo_cells_target is the approximate target number
         * of cells.
         */
 
        /*
         * Some dimensions have basically a uniform distribution, we want
         * to allocate no cells to those dimensions, only to dimensions
         * that have some interesting differences in data distribution.
         * Here we count up the number of interesting dimensions
         */
        for ( d = 0; d < ndims; d++ )
        {
                if ( sample_distribution[d] > 0 )
                        histo_ndims++;
        }
 
        if ( histo_ndims == 0 )
        {
                /* Special case: all our dimensions had low variability! */
                /* We just divide the cells up evenly */
                POSTGIS_DEBUG(3, " special case: no axes have variability");
                histo_cells_new = 1;
                for ( d = 0; d < ndims; d++ )
                {
                        /* ndims-th root of the target, truncated, but never fewer than one cell per axis */
                        histo_size[d] = (int)pow((double)histo_cells_target, 1/(double)ndims);
                        if ( ! histo_size[d] )
                                histo_size[d] = 1;
                        POSTGIS_DEBUGF(3, "   histo_size[d]: %d", histo_size[d]);
                        histo_cells_new *= histo_size[d];
                }
                POSTGIS_DEBUGF(3, " histo_cells_new: %d", histo_cells_new);
        }
        else
        {
                /*
                 * We're going to express the amount of variability in each dimension
                 * as a proportion of the total variability and allocate cells in that
                 * dimension relative to that proportion.
                 */
                POSTGIS_DEBUG(3, " allocating histogram axes based on axis variability");
                total_distribution = total_double(sample_distribution, ndims); /* First get the total */
                POSTGIS_DEBUGF(3, " total_distribution: %.8g", total_distribution);
                histo_cells_new = 1; /* For the number of cells in the final histogram */
                for ( d = 0; d < ndims; d++ )
                {
                        if ( sample_distribution[d] == 0 ) /* Uninteresting dimensions don't get any room */
                        {
                                histo_size[d] = 1;
                        }
                        else /* Interesting dimension */
                        {
                                /* How does this dims variability compare to the total? */
                                float edge_ratio = (float)sample_distribution[d] / (float)total_distribution;
                                /*
                                 * Scale the target cells number by the # of dims and ratio,
                                 * then take the appropriate root to get the estimated number of cells
                                 * on this axis (eg, pow(0.5) for 2d, pow(0.333) for 3d, pow(0.25) for 4d)
                                 * The dedicated helper clamps pathological floating point inputs so we
                                 * do not resurrect the NaN propagation reported in #5959 on amd64.
                                 */
                                histo_size[d] = histogram_axis_cells(histo_cells_target, histo_ndims, edge_ratio);
                        }
                        histo_cells_new *= histo_size[d];
                }
                POSTGIS_DEBUGF(3, " histo_cells_new: %d", histo_cells_new);
        }
 
        /* Update histo_cells to the actual number of cells we need to allocate */
        histo_cells = histo_cells_new;
        POSTGIS_DEBUGF(3, " histo_cells: %d", histo_cells);
 
        /*
         * Create the histogram (ND_STATS) in the stats memory context.
         * The "histo_cells - 1" presumably accounts for one value entry
         * already being part of sizeof(ND_STATS) -- confirm against the
         * struct definition.
         */
        old_context = MemoryContextSwitchTo(stats->anl_context);
        nd_stats_size = sizeof(ND_STATS) + ((histo_cells - 1) * sizeof(float4));
        nd_stats = palloc(nd_stats_size);
        memset(nd_stats, 0, nd_stats_size); /* Initialize all values to 0 */
        MemoryContextSwitchTo(old_context);
 
        /* Initialize the #ND_STATS objects */
        nd_stats->ndims = ndims;
        nd_stats->extent = histo_extent;
        nd_stats->sample_features = sample_rows;
        nd_stats->table_features = total_rows;
        nd_stats->not_null_features = notnull_cnt;
        /* Copy in the histogram dimensions */
        for ( d = 0; d < ndims; d++ )
                nd_stats->size[d] = histo_size[d];
 
        /*
         * Fourth scan:
         *  o fill histogram values with the proportion of
         *    features' bbox overlaps: a feature's bvol
         *    can fully overlap (1) or partially overlap
         *    (fraction of 1) a histogram cell.
         *
         * Note that we are filling each cell with the "portion of
         * the feature's box that overlaps the cell". So, if we sum
         * up the values in the histogram, we could get the
         * histogram feature count.
         *
         */
        for ( i = 0; i < notnull_cnt; i++ )
        {
                const ND_BOX *nd_box;
                ND_IBOX nd_ibox;
                int at[ND_DIMS];
                double num_cells = 0;
                double min[ND_DIMS] = {0.0, 0.0, 0.0, 0.0};
                double max[ND_DIMS] = {0.0, 0.0, 0.0, 0.0};
                double cellsize[ND_DIMS] = {0.0, 0.0, 0.0, 0.0};
 
                nd_box = sample_boxes[i];
                if ( ! nd_box ) continue; /* Skip Null'ed out hard deviants */
 
                /* Give backend a chance of interrupting us */
#if POSTGIS_PGSQL_VERSION >= 180
                vacuum_delay_point(true);
#else
                vacuum_delay_point();
#endif
 
                /* Find the cells that overlap with this box and put them into the ND_IBOX */
                nd_box_overlap(nd_stats, nd_box, &nd_ibox);
                memset(at, 0, sizeof(int)*ND_DIMS);
 
                POSTGIS_DEBUGF(3, " feature %d: ibox (%d, %d, %d, %d) (%d, %d, %d, %d)", i,
                  nd_ibox.min[0], nd_ibox.min[1], nd_ibox.min[2], nd_ibox.min[3],
                  nd_ibox.max[0], nd_ibox.max[1], nd_ibox.max[2], nd_ibox.max[3]);
 
                for ( d = 0; d < nd_stats->ndims; d++ )
                {
                        /* Initialize the starting values */
                        at[d] = nd_ibox.min[d];
                        min[d] = nd_stats->extent.min[d];
                        max[d] = nd_stats->extent.max[d];
                        /* assumes nd_stats->size[d] >= 1; histogram_axis_cells() is not visible here -- TODO confirm */
                        cellsize[d] = (max[d] - min[d])/(nd_stats->size[d]);
                }
 
                /*
                 * Move through all the overlapped histogram cells values and
                 * add the box overlap proportion to them.
                 */
                do
                {
                        ND_BOX nd_cell = { {0.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 0.0} };
                        double ratio;
                        /* Create a box for this histogram cell */
                        for ( d = 0; d < nd_stats->ndims; d++ )
                        {
                                nd_cell.min[d] = min[d] + (at[d]+0) * cellsize[d];
                                nd_cell.max[d] = min[d] + (at[d]+1) * cellsize[d];
                        }
 
                        /*
                         * If a feature box is completely inside one cell the ratio will be
                         * 1.0. If a feature box is 50% in two cells, each cell will get
                         * 0.5 added on.
                         */
                        ratio = nd_box_ratio(&nd_cell, nd_box, nd_stats->ndims);
                        nd_stats->value[nd_stats_value_index(nd_stats, at)] += ratio;
                        num_cells += ratio;
                        POSTGIS_DEBUGF(3, "               ratio (%.8g)  num_cells (%.8g)", ratio, num_cells);
                        POSTGIS_DEBUGF(3, "               at (%d, %d, %d, %d)", at[0], at[1], at[2], at[3]);
                }
                while ( nd_increment(&nd_ibox, nd_stats->ndims, at) );
 
                /* Keep track of overall number of overlaps counted */
                total_cell_count += num_cells;
                /* How many features have we added to this histogram? */
                histogram_features++;
        }
 
        POSTGIS_DEBUGF(3, " histogram_features: %d", histogram_features);
        POSTGIS_DEBUGF(3, " sample_rows: %d", sample_rows);
        POSTGIS_DEBUGF(3, " table_rows: %.6g", total_rows);
 
        /* Error out if we got no sample information */
        if ( ! histogram_features )
        {
                POSTGIS_DEBUG(3, " no stats have been gathered");
                elog(NOTICE, " no features lie in the stats histogram, invalid stats");
                stats->stats_valid = false;
                return;
        }
 
        nd_stats->histogram_features = histogram_features;
        nd_stats->histogram_cells = histo_cells;
        nd_stats->cells_covered = total_cell_count;
 
        /* Put this histogram data into the right slot/kind */
        if ( mode == 2 )
        {
                stats_slot = STATISTIC_SLOT_2D;
                stats_kind = STATISTIC_KIND_2D;
        }
        else
        {
                stats_slot = STATISTIC_SLOT_ND;
                stats_kind = STATISTIC_KIND_ND;
        }
 
        /*
         * Write the statistics data.
         * The whole ND_STATS struct is handed over as an array of float4, so
         * numnumbers is the struct size expressed in float4 units.
         * stanullfrac uses all sample rows as its denominator; stawidth is
         * averaged over the usable (notnull_cnt) rows only.
         */
        stats->stakind[stats_slot] = stats_kind;
        stats->staop[stats_slot] = InvalidOid;
        stats->stanumbers[stats_slot] = (float4*)nd_stats;
        stats->numnumbers[stats_slot] = nd_stats_size/sizeof(float4);
        stats->stanullfrac = (float4)null_cnt/sample_rows;
        stats->stawidth = total_width/notnull_cnt;
        stats->stadistinct = -1.0;
        stats->stats_valid = true;
 
        /*
         * NOTE(review): the three debug lines below always read index 0 and
         * label the kind as STATISTIC_KIND_ND, regardless of the
         * stats_slot/stats_kind chosen above -- confirm whether that is
         * intended for 2D mode.
         */
        POSTGIS_DEBUGF(3, " out: slot 0: kind %d (STATISTIC_KIND_ND)", stats->stakind[0]);
        POSTGIS_DEBUGF(3, " out: slot 0: op %d (InvalidOid)", stats->staop[0]);
        POSTGIS_DEBUGF(3, " out: slot 0: numnumbers %d", stats->numnumbers[0]);
        POSTGIS_DEBUGF(3, " out: null fraction: %f=%d/%d", stats->stanullfrac, null_cnt, sample_rows);
        POSTGIS_DEBUGF(3, " out: average width: %d bytes", stats->stawidth);
        POSTGIS_DEBUG (3, " out: distinct values: all (no check done)");
        POSTGIS_DEBUGF(3, " out: %s", nd_stats_to_json(nd_stats));
        /*
        POSTGIS_DEBUGF(3, " out histogram:\n%s", nd_stats_to_grid(nd_stats));
        */
 
        return;
}
References ND_STATS_T::cells_covered, ND_STATS_T::extent, gbox_is_valid(), gbox_ndims(), gserialized_datum_get_gbox_p(), histogram_axis_cells(), histogram_cell_budget(), ND_STATS_T::histogram_cells, ND_STATS_T::histogram_features, LW_FAILURE, ND_BOX_T::max, ND_IBOX_T::max, ND_BOX_T::min, ND_IBOX_T::min, GBOX::mmax, GBOX::mmin, nd_box_array_distribution(), nd_box_expand(), nd_box_from_gbox(), nd_box_init(), nd_box_init_bounds(), nd_box_intersects(), nd_box_merge(), nd_box_overlap(), nd_box_ratio(), nd_box_to_json(), ND_DIMS, nd_increment(), nd_stats_to_json(), nd_stats_value_index(), ND_STATS_T::ndims, ND_STATS_T::not_null_features, ND_STATS_T::sample_features, SDFACTOR, ND_STATS_T::size, STATISTIC_KIND_2D, STATISTIC_KIND_ND, STATISTIC_SLOT_2D, STATISTIC_SLOT_ND, ND_STATS_T::table_features, total_double(), ND_STATS_T::value, GBOX::zmax, and GBOX::zmin.
Referenced by compute_gserialized_stats().
Here is the call graph for this function:
Here is the caller graph for this function: