PostGIS  2.3.8dev-r@@SVN_REVISION@@
gserialized_estimate.c
Go to the documentation of this file.
1 /**********************************************************************
2  *
3  * PostGIS - Spatial Types for PostgreSQL
4  * http://postgis.net
5  *
6  * PostGIS is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation, either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * PostGIS is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with PostGIS. If not, see <http://www.gnu.org/licenses/>.
18  *
19  **********************************************************************
20  *
21  * Copyright 2012 (C) Paul Ramsey <pramsey@cleverelephant.ca>
22  *
23  **********************************************************************/
24 
25 
26 
27 /**********************************************************************
28  THEORY OF OPERATION
29 
30 The ANALYZE command hooks to a callback (gserialized_analyze_nd) that
31 calculates (compute_gserialized_stats_mode) two histograms of occurances of
32 features, once for the 2D domain (and the && operator) one for the
33 ND domain (and the &&& operator).
34 
35 Queries in PostgreSQL call into the selectivity sub-system to find out
36 the relative effectiveness of different clauses in sub-setting
37 relations. Queries with constant arguments call gserialized_gist_sel,
38 queries with relations on both sides call gserialized_gist_joinsel.
39 
40 gserialized_gist_sel sums up the values in the histogram that overlap
41 the contant search box.
42 
43 gserialized_gist_joinsel sums up the product of the overlapping
44 cells in each relation's histogram.
45 
46 Depending on the operator and type, the mode of selectivity calculation
47 will be 2D or ND.
48 
49 - geometry && geometry ==> 2D
50 - geometry &&& geometry ==> ND
51 - geography && geography ==> ND
52 
53 The 2D mode is put in effect by retrieving the 2D histogram from the
54 statistics cache and then allowing the generic ND calculations to
55 go to work.
56 
57 TO DO: More testing and examination of the &&& operator and mixed
58 dimensionality cases. (2D geometry) &&& (3D column), etc.
59 
60 **********************************************************************/
61 
62 #include "postgres.h"
63 #include "executor/spi.h"
64 #include "fmgr.h"
65 #include "commands/vacuum.h"
66 #include "nodes/relation.h"
67 #include "parser/parsetree.h"
68 #include "utils/array.h"
69 #include "utils/lsyscache.h"
70 #include "utils/builtins.h"
71 #include "utils/syscache.h"
72 #include "utils/rel.h"
73 #include "utils/selfuncs.h"
74 
75 #include "../postgis_config.h"
76 
77 #if POSTGIS_PGSQL_VERSION >= 93
78  #include "access/htup_details.h"
79 #endif
80 
81 #include "stringbuffer.h"
82 #include "liblwgeom.h"
83 #include "lwgeom_pg.h" /* For debugging macros. */
84 #include "gserialized_gist.h" /* For index common functions */
85 
86 #include <math.h>
87 #if HAVE_IEEEFP_H
88 #include <ieeefp.h>
89 #endif
90 #include <float.h>
91 #include <string.h>
92 #include <stdio.h>
93 #include <errno.h>
94 #include <ctype.h>
95 
96 /* Fall back to older finite() if necessary */
97 #ifndef HAVE_ISFINITE
98 # ifdef HAVE_GNU_ISFINITE
99 # define _GNU_SOURCE
100 # else
101 # define isfinite finite
102 # endif
103 #endif
104 
105 
106 /* Prototypes */
107 Datum gserialized_gist_joinsel(PG_FUNCTION_ARGS);
108 Datum gserialized_gist_joinsel_2d(PG_FUNCTION_ARGS);
109 Datum gserialized_gist_joinsel_nd(PG_FUNCTION_ARGS);
110 Datum gserialized_gist_sel(PG_FUNCTION_ARGS);
111 Datum gserialized_gist_sel_2d(PG_FUNCTION_ARGS);
112 Datum gserialized_gist_sel_nd(PG_FUNCTION_ARGS);
113 Datum gserialized_analyze_nd(PG_FUNCTION_ARGS);
114 Datum gserialized_estimated_extent(PG_FUNCTION_ARGS);
115 Datum _postgis_gserialized_sel(PG_FUNCTION_ARGS);
116 Datum _postgis_gserialized_joinsel(PG_FUNCTION_ARGS);
117 Datum _postgis_gserialized_stats(PG_FUNCTION_ARGS);
118 
119 /* Old Prototype */
120 Datum geometry_estimated_extent(PG_FUNCTION_ARGS);
121 
132 #define STATISTIC_KIND_ND 102
133 #define STATISTIC_KIND_2D 103
134 #define STATISTIC_SLOT_ND 0
135 #define STATISTIC_SLOT_2D 1
136 
137 /*
138 * The SD factor restricts the side of the statistics histogram
139 * based on the standard deviation of the extent of the data.
140 * SDFACTOR is the number of standard deviations from the mean
141 * the histogram will extend.
142 */
143 #define SDFACTOR 3.25
144 
150 #define ND_DIMS 4
151 
158 #define MIN_DIMENSION_WIDTH 0.000000001
159 
163 #define DEFAULT_ND_SEL 0.0001
164 #define DEFAULT_ND_JOINSEL 0.001
165 
169 #define FALLBACK_ND_SEL 0.2
170 #define FALLBACK_ND_JOINSEL 0.3
171 
177 typedef struct ND_BOX_T
178 {
179  float4 min[ND_DIMS];
180  float4 max[ND_DIMS];
181 } ND_BOX;
182 
186 typedef struct ND_IBOX_T
187 {
188  int min[ND_DIMS];
189  int max[ND_DIMS];
190 } ND_IBOX;
191 
192 
199 typedef struct ND_STATS_T
200 {
201  /* Dimensionality of the histogram. */
202  float4 ndims;
203 
204  /* Size of n-d histogram in each dimension. */
205  float4 size[ND_DIMS];
206 
207  /* Lower-left (min) and upper-right (max) spatial bounds of histogram. */
209 
210  /* How many rows in the table itself? */
212 
213  /* How many rows were in the sample that built this histogram? */
215 
216  /* How many not-Null/Empty features were in the sample? */
218 
219  /* How many features actually got sampled in the histogram? */
221 
222  /* How many cells in histogram? (sizex*sizey*sizez*sizem) */
224 
225  /* How many cells did those histogram features cover? */
226  /* Since we are pro-rating coverage, this number should */
227  /* now always equal histogram_features */
229 
230  /* Variable length # of floats for histogram */
231  float4 value[1];
232 } ND_STATS;
233 
234 
235 
236 
243 static int
244 gbox_ndims(const GBOX* gbox)
245 {
246  int dims = 2;
247  if ( FLAGS_GET_GEODETIC(gbox->flags) )
248  return 3;
249  if ( FLAGS_GET_Z(gbox->flags) )
250  dims++;
251  if ( FLAGS_GET_M(gbox->flags) )
252  dims++;
253  return dims;
254 }
255 
261 static int
262 text_p_get_mode(const text *txt)
263 {
264  int mode = 2;
265  char *modestr = text2cstring(txt);
266  if ( modestr[0] == 'N' )
267  mode = 0;
268  pfree(modestr);
269  return mode;
270 }
271 
272 
276 static int
277 cmp_int (const void *a, const void *b)
278 {
279  int ia = *((const int*)a);
280  int ib = *((const int*)b);
281 
282  if ( ia == ib )
283  return 0;
284  else if ( ia > ib )
285  return 1;
286  else
287  return -1;
288 }
289 
294 static int
295 range_quintile(int *vals, int nvals)
296 {
297  qsort(vals, nvals, sizeof(int), cmp_int);
298  return vals[4*nvals/5] - vals[nvals/5];
299 }
300 
304 static double
305 total_double(const double *vals, int nvals)
306 {
307  int i;
308  float total = 0;
309  /* Calculate total */
310  for ( i = 0; i < nvals; i++ )
311  total += vals[i];
312 
313  return total;
314 }
315 
316 #if POSTGIS_DEBUG_LEVEL >= 3
317 
321 static int
322 total_int(const int *vals, int nvals)
323 {
324  int i;
325  int total = 0;
326  /* Calculate total */
327  for ( i = 0; i < nvals; i++ )
328  total += vals[i];
329 
330  return total;
331 }
332 
336 static double
337 avg(const int *vals, int nvals)
338 {
339  int t = total_int(vals, nvals);
340  return (double)t / (double)nvals;
341 }
342 
346 static double
347 stddev(const int *vals, int nvals)
348 {
349  int i;
350  double sigma2 = 0;
351  double mean = avg(vals, nvals);
352 
353  /* Calculate sigma2 */
354  for ( i = 0; i < nvals; i++ )
355  {
356  double v = (double)(vals[i]);
357  sigma2 += (mean - v) * (mean - v);
358  }
359  return sqrt(sigma2 / nvals);
360 }
361 #endif /* POSTGIS_DEBUG_LEVEL >= 3 */
362 
367 static int
368 nd_stats_value_index(const ND_STATS *stats, int *indexes)
369 {
370  int d;
371  int accum = 1, vdx = 0;
372 
373  /* Calculate the index into the 1-d values array that the (i,j,k,l) */
374  /* n-d histogram coordinate implies. */
375  /* index = x + y * sizex + z * sizex * sizey + m * sizex * sizey * sizez */
376  for ( d = 0; d < (int)(stats->ndims); d++ )
377  {
378  int size = (int)(stats->size[d]);
379  if ( indexes[d] < 0 || indexes[d] >= size )
380  {
381  POSTGIS_DEBUGF(3, " bad index at (%d, %d)", indexes[0], indexes[1]);
382  return -1;
383  }
384  vdx += indexes[d] * accum;
385  accum *= size;
386  }
387  return vdx;
388 }
389 
393 static char*
394 nd_box_to_json(const ND_BOX *nd_box, int ndims)
395 {
396  char *rv;
397  int i;
399 
400  stringbuffer_append(sb, "{\"min\":[");
401  for ( i = 0; i < ndims; i++ )
402  {
403  if ( i ) stringbuffer_append(sb, ",");
404  stringbuffer_aprintf(sb, "%.6g", nd_box->min[i]);
405  }
406  stringbuffer_append(sb, "],\"max\":[");
407  for ( i = 0; i < ndims; i++ )
408  {
409  if ( i ) stringbuffer_append(sb, ",");
410  stringbuffer_aprintf(sb, "%.6g", nd_box->max[i]);
411  }
412  stringbuffer_append(sb, "]}");
413 
416  return rv;
417 }
418 
419 
424 static char*
425 nd_stats_to_json(const ND_STATS *nd_stats)
426 {
427  char *json_extent, *str;
428  int d;
430  int ndims = (int)roundf(nd_stats->ndims);
431 
432  stringbuffer_append(sb, "{");
433  stringbuffer_aprintf(sb, "\"ndims\":%d,", ndims);
434 
435  /* Size */
436  stringbuffer_append(sb, "\"size\":[");
437  for ( d = 0; d < ndims; d++ )
438  {
439  if ( d ) stringbuffer_append(sb, ",");
440  stringbuffer_aprintf(sb, "%d", (int)roundf(nd_stats->size[d]));
441  }
442  stringbuffer_append(sb, "],");
443 
444  /* Extent */
445  json_extent = nd_box_to_json(&(nd_stats->extent), ndims);
446  stringbuffer_aprintf(sb, "\"extent\":%s,", json_extent);
447  pfree(json_extent);
448 
449  stringbuffer_aprintf(sb, "\"table_features\":%d,", (int)roundf(nd_stats->table_features));
450  stringbuffer_aprintf(sb, "\"sample_features\":%d,", (int)roundf(nd_stats->sample_features));
451  stringbuffer_aprintf(sb, "\"not_null_features\":%d,", (int)roundf(nd_stats->not_null_features));
452  stringbuffer_aprintf(sb, "\"histogram_features\":%d,", (int)roundf(nd_stats->histogram_features));
453  stringbuffer_aprintf(sb, "\"histogram_cells\":%d,", (int)roundf(nd_stats->histogram_cells));
454  stringbuffer_aprintf(sb, "\"cells_covered\":%d", (int)roundf(nd_stats->cells_covered));
455  stringbuffer_append(sb, "}");
456 
457  str = stringbuffer_getstringcopy(sb);
459  return str;
460 }
461 
462 
468 // static char*
469 // nd_stats_to_grid(const ND_STATS *stats)
470 // {
471 // char *rv;
472 // int j, k;
473 // int sizex = (int)roundf(stats->size[0]);
474 // int sizey = (int)roundf(stats->size[1]);
475 // stringbuffer_t *sb = stringbuffer_create();
476 //
477 // for ( k = 0; k < sizey; k++ )
478 // {
479 // for ( j = 0; j < sizex; j++ )
480 // {
481 // stringbuffer_aprintf(sb, "%3d ", (int)roundf(stats->value[j + k*sizex]));
482 // }
483 // stringbuffer_append(sb, "\n");
484 // }
485 //
486 // rv = stringbuffer_getstringcopy(sb);
487 // stringbuffer_destroy(sb);
488 // return rv;
489 // }
490 
491 
493 static int
494 nd_box_merge(const ND_BOX *source, ND_BOX *target)
495 {
496  int d;
497  for ( d = 0; d < ND_DIMS; d++ )
498  {
499  target->min[d] = Min(target->min[d], source->min[d]);
500  target->max[d] = Max(target->max[d], source->max[d]);
501  }
502  return TRUE;
503 }
504 
506 static int
508 {
509  memset(a, 0, sizeof(ND_BOX));
510  return TRUE;
511 }
512 
518 static int
520 {
521  int d;
522  for ( d = 0; d < ND_DIMS; d++ )
523  {
524  a->min[d] = FLT_MAX;
525  a->max[d] = -1 * FLT_MAX;
526  }
527  return TRUE;
528 }
529 
531 static void
532 nd_box_from_gbox(const GBOX *gbox, ND_BOX *nd_box)
533 {
534  int d = 0;
535  POSTGIS_DEBUGF(3, " %s", gbox_to_string(gbox));
536 
537  nd_box_init(nd_box);
538  nd_box->min[d] = gbox->xmin;
539  nd_box->max[d] = gbox->xmax;
540  d++;
541  nd_box->min[d] = gbox->ymin;
542  nd_box->max[d] = gbox->ymax;
543  d++;
544  if ( FLAGS_GET_GEODETIC(gbox->flags) )
545  {
546  nd_box->min[d] = gbox->zmin;
547  nd_box->max[d] = gbox->zmax;
548  return;
549  }
550  if ( FLAGS_GET_Z(gbox->flags) )
551  {
552  nd_box->min[d] = gbox->zmin;
553  nd_box->max[d] = gbox->zmax;
554  d++;
555  }
556  if ( FLAGS_GET_M(gbox->flags) )
557  {
558  nd_box->min[d] = gbox->mmin;
559  nd_box->max[d] = gbox->mmax;
560  d++;
561  }
562  return;
563 }
564 
568 static int
569 nd_box_intersects(const ND_BOX *a, const ND_BOX *b, int ndims)
570 {
571  int d;
572  for ( d = 0; d < ndims; d++ )
573  {
574  if ( (a->min[d] > b->max[d]) || (a->max[d] < b->min[d]) )
575  return FALSE;
576  }
577  return TRUE;
578 }
579 
583 static int
584 nd_box_contains(const ND_BOX *a, const ND_BOX *b, int ndims)
585 {
586  int d;
587  for ( d = 0; d < ndims; d++ )
588  {
589  if ( ! ((a->min[d] < b->min[d]) && (a->max[d] > b->max[d])) )
590  return FALSE;
591  }
592  return TRUE;
593 }
594 
599 static int
600 nd_box_expand(ND_BOX *nd_box, double expansion_factor)
601 {
602  int d;
603  double size;
604  for ( d = 0; d < ND_DIMS; d++ )
605  {
606  size = nd_box->max[d] - nd_box->min[d];
607  if ( size <= 0 ) continue;
608  nd_box->min[d] -= size * expansion_factor / 2;
609  nd_box->max[d] += size * expansion_factor / 2;
610  }
611  return TRUE;
612 }
613 
618 static inline int
619 nd_box_overlap(const ND_STATS *nd_stats, const ND_BOX *nd_box, ND_IBOX *nd_ibox)
620 {
621  int d;
622 
623  POSTGIS_DEBUGF(4, " nd_box: %s", nd_box_to_json(nd_box, nd_stats->ndims));
624 
625  /* Initialize ibox */
626  memset(nd_ibox, 0, sizeof(ND_IBOX));
627 
628  /* In each dimension... */
629  for ( d = 0; d < nd_stats->ndims; d++ )
630  {
631  double smin = nd_stats->extent.min[d];
632  double smax = nd_stats->extent.max[d];
633  double width = smax - smin;
634  int size = roundf(nd_stats->size[d]);
635 
636  /* ... find cells the box overlaps with in this dimension */
637  nd_ibox->min[d] = floor(size * (nd_box->min[d] - smin) / width);
638  nd_ibox->max[d] = floor(size * (nd_box->max[d] - smin) / width);
639 
640  POSTGIS_DEBUGF(5, " stats: dim %d: min %g: max %g: width %g", d, smin, smax, width);
641  POSTGIS_DEBUGF(5, " overlap: dim %d: (%d, %d)", d, nd_ibox->min[d], nd_ibox->max[d]);
642 
643  /* Push any out-of range values into range */
644  nd_ibox->min[d] = Max(nd_ibox->min[d], 0);
645  nd_ibox->max[d] = Min(nd_ibox->max[d], size-1);
646  }
647  return TRUE;
648 }
649 
653 static inline double
654 nd_box_ratio(const ND_BOX *b1, const ND_BOX *b2, int ndims)
655 {
656  int d;
657  bool covered = TRUE;
658  double ivol = 1.0;
659  double vol2 = 1.0;
660  double vol1 = 1.0;
661 
662  for ( d = 0 ; d < ndims; d++ )
663  {
664  if ( b1->max[d] <= b2->min[d] || b1->min[d] >= b2->max[d] )
665  return 0.0; /* Disjoint */
666 
667  if ( b1->min[d] > b2->min[d] || b1->max[d] < b2->max[d] )
668  covered = FALSE;
669  }
670 
671  if ( covered )
672  return 1.0;
673 
674  for ( d = 0; d < ndims; d++ )
675  {
676  double width1 = b1->max[d] - b1->min[d];
677  double width2 = b2->max[d] - b2->min[d];
678  double imin, imax, iwidth;
679 
680  vol1 *= width1;
681  vol2 *= width2;
682 
683  imin = Max(b1->min[d], b2->min[d]);
684  imax = Min(b1->max[d], b2->max[d]);
685  iwidth = imax - imin;
686  iwidth = Max(0.0, iwidth);
687 
688  ivol *= iwidth;
689  }
690 
691  if ( vol2 == 0.0 )
692  return vol2;
693 
694  return ivol / vol2;
695 }
696 
697 /* How many bins shall we use in figuring out the distribution? */
698 #define NUM_BINS 50
699 
715 static int
716 nd_box_array_distribution(const ND_BOX **nd_boxes, int num_boxes, const ND_BOX *extent, int ndims, double *distribution)
717 {
718  int d, i, k, range;
719  int counts[NUM_BINS];
720  double smin, smax; /* Spatial min, spatial max */
721  double swidth; /* Spatial width of dimension */
722 #if POSTGIS_DEBUG_LEVEL >= 3
723  double average, sdev, sdev_ratio;
724 #endif
725  int bmin, bmax; /* Bin min, bin max */
726  const ND_BOX *ndb;
727 
728  /* For each dimension... */
729  for ( d = 0; d < ndims; d++ )
730  {
731  /* Initialize counts for this dimension */
732  memset(counts, 0, sizeof(counts));
733 
734  smin = extent->min[d];
735  smax = extent->max[d];
736  swidth = smax - smin;
737 
738  /* Don't try and calculate distribution of overly narrow dimensions */
739  if ( swidth < MIN_DIMENSION_WIDTH )
740  {
741  distribution[d] = 0;
742  continue;
743  }
744 
745  /* Sum up the overlaps of each feature with the dimensional bins */
746  for ( i = 0; i < num_boxes; i++ )
747  {
748  double minoffset, maxoffset;
749 
750  /* Skip null entries */
751  ndb = nd_boxes[i];
752  if ( ! ndb ) continue;
753 
754  /* Where does box fall relative to the working range */
755  minoffset = ndb->min[d] - smin;
756  maxoffset = ndb->max[d] - smin;
757 
758  /* Skip boxes that are outside our working range */
759  if ( minoffset < 0 || minoffset > swidth ||
760  maxoffset < 0 || maxoffset > swidth )
761  {
762  continue;
763  }
764 
765  /* What bins does this range correspond to? */
766  bmin = floor(NUM_BINS * minoffset / swidth);
767  bmax = floor(NUM_BINS * maxoffset / swidth);
768 
769  /* Should only happen when maxoffset==swidth */
770  if (bmax >= NUM_BINS)
771  bmax = NUM_BINS-1;
772 
773  POSTGIS_DEBUGF(4, " dimension %d, feature %d: bin %d to bin %d", d, i, bmin, bmax);
774 
775  /* Increment the counts in all the bins this feature overlaps */
776  for ( k = bmin; k <= bmax; k++ )
777  {
778  counts[k] += 1;
779  }
780 
781  }
782 
783  /* How dispersed is the distribution of features across bins? */
784  range = range_quintile(counts, NUM_BINS);
785 
786 #if POSTGIS_DEBUG_LEVEL >= 3
787  average = avg(counts, NUM_BINS);
788  sdev = stddev(counts, NUM_BINS);
789  sdev_ratio = sdev/average;
790 
791  POSTGIS_DEBUGF(3, " dimension %d: range = %d", d, range);
792  POSTGIS_DEBUGF(3, " dimension %d: average = %.6g", d, average);
793  POSTGIS_DEBUGF(3, " dimension %d: stddev = %.6g", d, sdev);
794  POSTGIS_DEBUGF(3, " dimension %d: stddev_ratio = %.6g", d, sdev_ratio);
795 #endif
796 
797  distribution[d] = range;
798  }
799 
800  return TRUE;
801 }
802 
808 static inline int
809 nd_increment(ND_IBOX *ibox, int ndims, int *counter)
810 {
811  int d = 0;
812 
813  while ( d < ndims )
814  {
815  if ( counter[d] < ibox->max[d] )
816  {
817  counter[d] += 1;
818  break;
819  }
820  counter[d] = ibox->min[d];
821  d++;
822  }
823  /* That's it, cannot increment any more! */
824  if ( d == ndims )
825  return FALSE;
826 
827  /* Increment complete! */
828  return TRUE;
829 }
830 
831 static ND_STATS*
832 pg_nd_stats_from_tuple(HeapTuple stats_tuple, int mode)
833 {
834  int stats_kind = STATISTIC_KIND_ND;
835  int rv;
836  ND_STATS *nd_stats;
837 
838  /* If we're in 2D mode, set the kind appropriately */
839  if ( mode == 2 ) stats_kind = STATISTIC_KIND_2D;
840 
841  /* Then read the geom status histogram from that */
842 
843 #if POSTGIS_PGSQL_VERSION < 100
844  float4 *floatptr;
845  int nvalues;
846 
847  rv = get_attstatsslot(stats_tuple, 0, 0, stats_kind, InvalidOid,
848  NULL, NULL, NULL, &floatptr, &nvalues);
849 
850  if ( ! rv ) {
851  POSTGIS_DEBUGF(2,
852  "no slot of kind %d in stats tuple", stats_kind);
853  return NULL;
854  }
855 
856  /* Clone the stats here so we can release the attstatsslot immediately */
857  nd_stats = palloc(sizeof(float) * nvalues);
858  memcpy(nd_stats, floatptr, sizeof(float) * nvalues);
859 
860  /* Clean up */
861  free_attstatsslot(0, NULL, 0, floatptr, nvalues);
862 #else /* PostgreSQL 10 or higher */
863  AttStatsSlot sslot;
864  rv = get_attstatsslot(&sslot, stats_tuple, stats_kind, InvalidOid,
865  ATTSTATSSLOT_NUMBERS);
866  if ( ! rv ) {
867  POSTGIS_DEBUGF(2,
868  "no slot of kind %d in stats tuple", stats_kind);
869  return NULL;
870  }
871 
872  /* Clone the stats here so we can release the attstatsslot immediately */
873  nd_stats = palloc(sizeof(float4) * sslot.nnumbers);
874  memcpy(nd_stats, sslot.numbers, sizeof(float4) * sslot.nnumbers);
875 
876  free_attstatsslot(&sslot);
877 #endif
878 
879  return nd_stats;
880 }
881 
886 static ND_STATS*
887 pg_get_nd_stats(const Oid table_oid, AttrNumber att_num, int mode, bool only_parent)
888 {
889  HeapTuple stats_tuple = NULL;
890  ND_STATS *nd_stats;
891 
892  /* First pull the stats tuple for the whole tree */
893  if ( ! only_parent )
894  {
895  POSTGIS_DEBUGF(2, "searching whole tree stats for \"%s\"", get_rel_name(table_oid)? get_rel_name(table_oid) : "NULL");
896  stats_tuple = SearchSysCache3(STATRELATT, table_oid, att_num, TRUE);
897  if ( stats_tuple )
898  POSTGIS_DEBUGF(2, "found whole tree stats for \"%s\"", get_rel_name(table_oid)? get_rel_name(table_oid) : "NULL");
899  }
900  /* Fall-back to main table stats only, if not found for whole tree or explicitly ignored */
901  if ( only_parent || ! stats_tuple )
902  {
903  POSTGIS_DEBUGF(2, "searching parent table stats for \"%s\"", get_rel_name(table_oid)? get_rel_name(table_oid) : "NULL");
904  stats_tuple = SearchSysCache2(STATRELATT, table_oid, att_num);
905  if ( stats_tuple )
906  POSTGIS_DEBUGF(2, "found parent table stats for \"%s\"", get_rel_name(table_oid)? get_rel_name(table_oid) : "NULL");
907  }
908  if ( ! stats_tuple )
909  {
910  POSTGIS_DEBUGF(2, "stats for \"%s\" do not exist", get_rel_name(table_oid)? get_rel_name(table_oid) : "NULL");
911  return NULL;
912  }
913 
914  nd_stats = pg_nd_stats_from_tuple(stats_tuple, mode);
915  ReleaseSysCache(stats_tuple);
916  if ( ! nd_stats )
917  {
918  POSTGIS_DEBUGF(2,
919  "histogram for attribute %d of table \"%s\" does not exist?",
920  att_num, get_rel_name(table_oid));
921  }
922 
923  return nd_stats;
924 }
925 
934 static ND_STATS*
935 pg_get_nd_stats_by_name(const Oid table_oid, const text *att_text, int mode, bool only_parent)
936 {
937  const char *att_name = text2cstring(att_text);
938  AttrNumber att_num;
939 
940  /* We know the name? Look up the num */
941  if ( att_text )
942  {
943  /* Get the attribute number */
944  att_num = get_attnum(table_oid, att_name);
945  if ( ! att_num ) {
946  elog(ERROR, "attribute \"%s\" does not exist", att_name);
947  return NULL;
948  }
949  }
950  else
951  {
952  elog(ERROR, "attribute name is null");
953  return NULL;
954  }
955 
956  return pg_get_nd_stats(table_oid, att_num, mode, only_parent);
957 }
958 
972 static float8
974 {
975  int ncells1, ncells2;
976  int ndims1, ndims2, ndims;
977  double ntuples_max;
978  double ntuples_not_null1, ntuples_not_null2;
979 
980  ND_BOX extent1, extent2;
981  ND_IBOX ibox1, ibox2;
982  int at1[ND_DIMS];
983  int at2[ND_DIMS];
984  double min1[ND_DIMS];
985  double width1[ND_DIMS];
986  double cellsize1[ND_DIMS];
987  int size2[ND_DIMS];
988  double min2[ND_DIMS];
989  double width2[ND_DIMS];
990  double cellsize2[ND_DIMS];
991  int size1[ND_DIMS];
992  int d;
993  double val = 0;
994  float8 selectivity;
995 
996  /* Drop out on null inputs */
997  if ( ! ( s1 && s2 ) )
998  {
999  elog(NOTICE, " estimate_join_selectivity called with null inputs");
1000  return FALLBACK_ND_SEL;
1001  }
1002 
1003  /* We need to know how many cells each side has... */
1004  ncells1 = (int)roundf(s1->histogram_cells);
1005  ncells2 = (int)roundf(s2->histogram_cells);
1006 
1007  /* ...so that we can drive the summation loop with the smaller histogram. */
1008  if ( ncells1 > ncells2 )
1009  {
1010  const ND_STATS *stats_tmp = s1;
1011  s1 = s2;
1012  s2 = stats_tmp;
1013  }
1014 
1015  POSTGIS_DEBUGF(3, "s1: %s", nd_stats_to_json(s1));
1016  POSTGIS_DEBUGF(3, "s2: %s", nd_stats_to_json(s2));
1017 
1018  /* Re-read that info after the swap */
1019  ncells1 = (int)roundf(s1->histogram_cells);
1020  ncells2 = (int)roundf(s2->histogram_cells);
1021 
1022  /* Q: What's the largest possible join size these relations can create? */
1023  /* A: The product of the # of non-null rows in each relation. */
1024  ntuples_not_null1 = s1->table_features * (s1->not_null_features / s1->sample_features);
1025  ntuples_not_null2 = s2->table_features * (s2->not_null_features / s2->sample_features);
1026  ntuples_max = ntuples_not_null1 * ntuples_not_null2;
1027 
1028  /* Get the ndims as ints */
1029  ndims1 = (int)roundf(s1->ndims);
1030  ndims2 = (int)roundf(s2->ndims);
1031  ndims = Max(ndims1, ndims2);
1032 
1033  /* Get the extents */
1034  extent1 = s1->extent;
1035  extent2 = s2->extent;
1036 
1037  /* If relation stats do not intersect, join is very very selective. */
1038  if ( ! nd_box_intersects(&extent1, &extent2, ndims) )
1039  {
1040  POSTGIS_DEBUG(3, "relation stats do not intersect, returning 0");
1041  PG_RETURN_FLOAT8(0.0);
1042  }
1043 
1044  /*
1045  * First find the index range of the part of the smaller
1046  * histogram that overlaps the larger one.
1047  */
1048  if ( ! nd_box_overlap(s1, &extent2, &ibox1) )
1049  {
1050  POSTGIS_DEBUG(3, "could not calculate overlap of relations");
1051  PG_RETURN_FLOAT8(FALLBACK_ND_JOINSEL);
1052  }
1053 
1054  /* Initialize counters / constants on s1 */
1055  for ( d = 0; d < ndims1; d++ )
1056  {
1057  at1[d] = ibox1.min[d];
1058  min1[d] = s1->extent.min[d];
1059  width1[d] = s1->extent.max[d] - s1->extent.min[d];
1060  size1[d] = (int)roundf(s1->size[d]);
1061  cellsize1[d] = width1[d] / size1[d];
1062  }
1063 
1064  /* Initialize counters / constants on s2 */
1065  for ( d = 0; d < ndims2; d++ )
1066  {
1067  min2[d] = s2->extent.min[d];
1068  width2[d] = s2->extent.max[d] - s2->extent.min[d];
1069  size2[d] = (int)roundf(s2->size[d]);
1070  cellsize2[d] = width2[d] / size2[d];
1071  }
1072 
1073  /* For each affected cell of s1... */
1074  do
1075  {
1076  double val1;
1077  /* Construct the bounds of this cell */
1078  ND_BOX nd_cell1;
1079  nd_box_init(&nd_cell1);
1080  for ( d = 0; d < ndims1; d++ )
1081  {
1082  nd_cell1.min[d] = min1[d] + (at1[d]+0) * cellsize1[d];
1083  nd_cell1.max[d] = min1[d] + (at1[d]+1) * cellsize1[d];
1084  }
1085 
1086  /* Find the cells of s2 that cell1 overlaps.. */
1087  nd_box_overlap(s2, &nd_cell1, &ibox2);
1088 
1089  /* Initialize counter */
1090  for ( d = 0; d < ndims2; d++ )
1091  {
1092  at2[d] = ibox2.min[d];
1093  }
1094 
1095  POSTGIS_DEBUGF(3, "at1 %d,%d %s", at1[0], at1[1], nd_box_to_json(&nd_cell1, ndims1));
1096 
1097  /* Get the value at this cell */
1098  val1 = s1->value[nd_stats_value_index(s1, at1)];
1099 
1100  /* For each overlapped cell of s2... */
1101  do
1102  {
1103  double ratio2;
1104  double val2;
1105 
1106  /* Construct the bounds of this cell */
1107  ND_BOX nd_cell2;
1108  nd_box_init(&nd_cell2);
1109  for ( d = 0; d < ndims2; d++ )
1110  {
1111  nd_cell2.min[d] = min2[d] + (at2[d]+0) * cellsize2[d];
1112  nd_cell2.max[d] = min2[d] + (at2[d]+1) * cellsize2[d];
1113  }
1114 
1115  POSTGIS_DEBUGF(3, " at2 %d,%d %s", at2[0], at2[1], nd_box_to_json(&nd_cell2, ndims2));
1116 
1117  /* Calculate overlap ratio of the cells */
1118  ratio2 = nd_box_ratio(&nd_cell1, &nd_cell2, Max(ndims1, ndims2));
1119 
1120  /* Multiply the cell counts, scaled by overlap ratio */
1121  val2 = s2->value[nd_stats_value_index(s2, at2)];
1122  POSTGIS_DEBUGF(3, " val1 %.6g val2 %.6g ratio %.6g", val1, val2, ratio2);
1123  val += val1 * (val2 * ratio2);
1124  }
1125  while ( nd_increment(&ibox2, ndims2, at2) );
1126 
1127  }
1128  while( nd_increment(&ibox1, ndims1, at1) );
1129 
1130  POSTGIS_DEBUGF(3, "val of histogram = %g", val);
1131 
1132  /*
1133  * In order to compare our total cell count "val" to the
1134  * ntuples_max, we need to scale val up to reflect a full
1135  * table estimate. So, multiply by ratio of table size to
1136  * sample size.
1137  */
1138  val *= (s1->table_features / s1->sample_features);
1139  val *= (s2->table_features / s2->sample_features);
1140 
1141  POSTGIS_DEBUGF(3, "val scaled to full table size = %g", val);
1142 
1143  /*
1144  * Because the cell counts are over-determined due to
1145  * double counting of features that overlap multiple cells
1146  * (see the compute_gserialized_stats routine)
1147  * we also have to scale our cell count "val" *down*
1148  * to adjust for the double counting.
1149  */
1150 // val /= (s1->cells_covered / s1->histogram_features);
1151 // val /= (s2->cells_covered / s2->histogram_features);
1152 
1153  /*
1154  * Finally, the selectivity is the estimated number of
1155  * rows to be returned divided by the maximum possible
1156  * number of rows that can be returned.
1157  */
1158  selectivity = val / ntuples_max;
1159 
1160  /* Guard against over-estimates and crazy numbers :) */
1161  if ( isnan(selectivity) || ! isfinite(selectivity) || selectivity < 0.0 )
1162  {
1163  selectivity = DEFAULT_ND_JOINSEL;
1164  }
1165  else if ( selectivity > 1.0 )
1166  {
1167  selectivity = 1.0;
1168  }
1169 
1170  return selectivity;
1171 }
1172 
1178 Datum gserialized_gist_joinsel_nd(PG_FUNCTION_ARGS)
1179 {
1180  PG_RETURN_DATUM(DirectFunctionCall5(
1182  PG_GETARG_DATUM(0), PG_GETARG_DATUM(1),
1183  PG_GETARG_DATUM(2), PG_GETARG_DATUM(3),
1184  Int32GetDatum(0) /* ND mode */
1185  ));
1186 }
1187 
1193 Datum gserialized_gist_joinsel_2d(PG_FUNCTION_ARGS)
1194 {
1195  PG_RETURN_DATUM(DirectFunctionCall5(
1197  PG_GETARG_DATUM(0), PG_GETARG_DATUM(1),
1198  PG_GETARG_DATUM(2), PG_GETARG_DATUM(3),
1199  Int32GetDatum(2) /* 2D mode */
1200  ));
1201 }
1202 
1212 Datum gserialized_gist_joinsel(PG_FUNCTION_ARGS)
1213 {
1214  PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
1215  /* Oid operator = PG_GETARG_OID(1); */
1216  List *args = (List *) PG_GETARG_POINTER(2);
1217  JoinType jointype = (JoinType) PG_GETARG_INT16(3);
1218  int mode = PG_GETARG_INT32(4);
1219 
1220  Node *arg1, *arg2;
1221  Var *var1, *var2;
1222  Oid relid1, relid2;
1223 
1224  ND_STATS *stats1, *stats2;
1225  float8 selectivity;
1226 
1227  /* Only respond to an inner join/unknown context join */
1228  if (jointype != JOIN_INNER)
1229  {
1230  elog(DEBUG1, "%s: jointype %d not supported", __func__, jointype);
1231  PG_RETURN_FLOAT8(DEFAULT_ND_JOINSEL);
1232  }
1233 
1234  /* Find Oids of the geometry columns we are working with */
1235  arg1 = (Node*) linitial(args);
1236  arg2 = (Node*) lsecond(args);
1237  var1 = (Var*) arg1;
1238  var2 = (Var*) arg2;
1239 
1240  /* We only do column joins right now, no functional joins */
1241  /* TODO: handle g1 && ST_Expand(g2) */
1242  if (!IsA(arg1, Var) || !IsA(arg2, Var))
1243  {
1244  elog(DEBUG1, "%s called with arguments that are not column references", __func__);
1245  PG_RETURN_FLOAT8(DEFAULT_ND_JOINSEL);
1246  }
1247 
1248  /* What are the Oids of our tables/relations? */
1249  relid1 = getrelid(var1->varno, root->parse->rtable);
1250  relid2 = getrelid(var2->varno, root->parse->rtable);
1251 
1252  POSTGIS_DEBUGF(3, "using relations \"%s\" Oid(%d), \"%s\" Oid(%d)",
1253  get_rel_name(relid1) ? get_rel_name(relid1) : "NULL", relid1, get_rel_name(relid2) ? get_rel_name(relid2) : "NULL", relid2);
1254 
1255  /* Pull the stats from the stats system. */
1256  stats1 = pg_get_nd_stats(relid1, var1->varattno, mode, FALSE);
1257  stats2 = pg_get_nd_stats(relid2, var2->varattno, mode, FALSE);
1258 
1259  /* If we can't get stats, we have to stop here! */
1260  if ( ! stats1 )
1261  {
1262  POSTGIS_DEBUGF(3, "unable to retrieve stats for \"%s\" Oid(%d)", get_rel_name(relid1) ? get_rel_name(relid1) : "NULL" , relid1);
1263  PG_RETURN_FLOAT8(DEFAULT_ND_JOINSEL);
1264  }
1265  else if ( ! stats2 )
1266  {
1267  POSTGIS_DEBUGF(3, "unable to retrieve stats for \"%s\" Oid(%d)", get_rel_name(relid2) ? get_rel_name(relid2) : "NULL", relid2);
1268  PG_RETURN_FLOAT8(DEFAULT_ND_JOINSEL);
1269  }
1270 
1271  selectivity = estimate_join_selectivity(stats1, stats2);
1272  POSTGIS_DEBUGF(2, "got selectivity %g", selectivity);
1273 
1274  pfree(stats1);
1275  pfree(stats2);
1276  PG_RETURN_FLOAT8(selectivity);
1277 }
1278 
1279 
1280 
1281 
1300 static void
1301 compute_gserialized_stats_mode(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
1302  int sample_rows, double total_rows, int mode)
1303 {
1304  MemoryContext old_context;
1305  int d, i; /* Counters */
1306  int notnull_cnt = 0; /* # not null rows in the sample */
1307  int null_cnt = 0; /* # null rows in the sample */
1308  int histogram_features = 0; /* # rows that actually got counted in the histogram */
1309 
1310  ND_STATS *nd_stats; /* Our histogram */
1311  size_t nd_stats_size; /* Size to allocate */
1312 
1313  double total_width = 0; /* # of bytes used by sample */
1314  double total_sample_volume = 0; /* Area/volume coverage of the sample */
1315  double total_cell_count = 0; /* # of cells in histogram affected by sample */
1316 
1317  ND_BOX sum; /* Sum of extents of sample boxes */
1318  ND_BOX avg; /* Avg of extents of sample boxes */
1319  ND_BOX stddev; /* StdDev of extents of sample boxes */
1320 
1321  const ND_BOX **sample_boxes; /* ND_BOXes for each of the sample features */
1322  ND_BOX sample_extent; /* Extent of the raw sample */
1323  int histo_size[ND_DIMS]; /* histogram nrows, ncols, etc */
1324  ND_BOX histo_extent; /* Spatial extent of the histogram */
1325  ND_BOX histo_extent_new; /* Temporary variable */
1326  int histo_cells_target; /* Number of cells we will shoot for, given the stats target */
1327  int histo_cells; /* Number of cells in the histogram */
1328  int histo_cells_new = 1; /* Temporary variable */
1329 
1330  int ndims = 2; /* Dimensionality of the sample */
1331  int histo_ndims = 0; /* Dimensionality of the histogram */
1332  double sample_distribution[ND_DIMS]; /* How homogeneous is distribution of sample in each axis? */
1333  double total_distribution; /* Total of sample_distribution */
1334 
1335  int stats_slot; /* What slot is this data going into? (2D vs ND) */
1336  int stats_kind; /* And this is what? (2D vs ND) */
1337 
1338  /* Initialize sum and stddev */
1339  nd_box_init(&sum);
1340  nd_box_init(&stddev);
1341 
1342  /*
1343  * This is where gserialized_analyze_nd
1344  * should put its' custom parameters.
1345  */
1346  /* void *mystats = stats->extra_data; */
1347 
1348  POSTGIS_DEBUG(2, "compute_gserialized_stats called");
1349  POSTGIS_DEBUGF(3, " # sample_rows: %d", sample_rows);
1350  POSTGIS_DEBUGF(3, " estimate of total_rows: %.6g", total_rows);
1351 
1352  /*
1353  * We might need less space, but don't think
1354  * its worth saving...
1355  */
1356  sample_boxes = palloc(sizeof(ND_BOX*) * sample_rows);
1357 
1358  /*
1359  * First scan:
1360  * o read boxes
1361  * o find dimensionality of the sample
1362  * o find extent of the sample
1363  * o count null-infinite/not-null values
1364  * o compute total_width
1365  * o compute total features's box area (for avgFeatureArea)
1366  * o sum features box coordinates (for standard deviation)
1367  */
1368  for ( i = 0; i < sample_rows; i++ )
1369  {
1370  Datum datum;
1371  GSERIALIZED *geom;
1372  GBOX gbox;
1373  ND_BOX *nd_box;
1374  bool is_null;
1375  bool is_copy;
1376 
1377  datum = fetchfunc(stats, i, &is_null);
1378 
1379  /* Skip all NULLs. */
1380  if ( is_null )
1381  {
1382  POSTGIS_DEBUGF(4, " skipped null geometry %d", i);
1383  null_cnt++;
1384  continue;
1385  }
1386 
1387  /* Read the bounds from the gserialized. */
1388  geom = (GSERIALIZED *)PG_DETOAST_DATUM(datum);
1389  is_copy = VARATT_IS_EXTENDED(datum);
1390  if ( LW_FAILURE == gserialized_get_gbox_p(geom, &gbox) )
1391  {
1392  /* Skip empties too. */
1393  POSTGIS_DEBUGF(3, " skipped empty geometry %d", i);
1394  continue;
1395  }
1396 
1397  /* If we're in 2D mode, zero out the higher dimensions for "safety" */
1398  if ( mode == 2 )
1399  gbox.zmin = gbox.zmax = gbox.mmin = gbox.mmax = 0.0;
1400 
1401  /* Check bounds for validity (finite and not NaN) */
1402  if ( ! gbox_is_valid(&gbox) )
1403  {
1404  POSTGIS_DEBUGF(3, " skipped infinite/nan geometry %d", i);
1405  continue;
1406  }
1407 
1408  /*
1409  * In N-D mode, set the ndims to the maximum dimensionality found
1410  * in the sample. Otherwise, leave at ndims == 2.
1411  */
1412  if ( mode != 2 )
1413  ndims = Max(gbox_ndims(&gbox), ndims);
1414 
1415  /* Convert gbox to n-d box */
1416  nd_box = palloc(sizeof(ND_BOX));
1417  nd_box_from_gbox(&gbox, nd_box);
1418 
1419  /* Cache n-d bounding box */
1420  sample_boxes[notnull_cnt] = nd_box;
1421 
1422  /* Initialize sample extent before merging first entry */
1423  if ( ! notnull_cnt )
1424  nd_box_init_bounds(&sample_extent);
1425 
1426  /* Add current sample to overall sample extent */
1427  nd_box_merge(nd_box, &sample_extent);
1428 
1429  /* How many bytes does this sample use? */
1430  total_width += VARSIZE(geom);
1431 
1432  /* Add bounds coordinates to sums for stddev calculation */
1433  for ( d = 0; d < ndims; d++ )
1434  {
1435  sum.min[d] += nd_box->min[d];
1436  sum.max[d] += nd_box->max[d];
1437  }
1438 
1439  /* Increment our "good feature" count */
1440  notnull_cnt++;
1441 
1442  /* Free up memory if our sample geometry was copied */
1443  if ( is_copy )
1444  pfree(geom);
1445 
1446  /* Give backend a chance of interrupting us */
1447  vacuum_delay_point();
1448  }
1449 
1450  /*
1451  * We'll build a histogram having stats->attr->attstattarget cells
1452  * on each side, within reason... we'll use ndims*10000 as the
1453  * maximum number of cells.
1454  * Also, if we're sampling a relatively small table, we'll try to ensure that
1455  * we have an average of 5 features for each cell so the histogram isn't
1456  * so sparse.
1457  */
1458  histo_cells_target = (int)pow((double)(stats->attr->attstattarget), (double)ndims);
1459  histo_cells_target = Min(histo_cells_target, ndims * 10000);
1460  histo_cells_target = Min(histo_cells_target, (int)(total_rows/5));
1461  POSTGIS_DEBUGF(3, " stats->attr->attstattarget: %d", stats->attr->attstattarget);
1462  POSTGIS_DEBUGF(3, " target # of histogram cells: %d", histo_cells_target);
1463 
1464  /* If there's no useful features, we can't work out stats */
1465  if ( ! notnull_cnt )
1466  {
1467  elog(NOTICE, "no non-null/empty features, unable to compute statistics");
1468  stats->stats_valid = false;
1469  return;
1470  }
1471 
1472  POSTGIS_DEBUGF(3, " sample_extent: %s", nd_box_to_json(&sample_extent, ndims));
1473 
1474  /*
1475  * Second scan:
1476  * o compute standard deviation
1477  */
1478  for ( d = 0; d < ndims; d++ )
1479  {
1480  /* Calculate average bounds values */
1481  avg.min[d] = sum.min[d] / notnull_cnt;
1482  avg.max[d] = sum.max[d] / notnull_cnt;
1483 
1484  /* Calculate standard deviation for this dimension bounds */
1485  for ( i = 0; i < notnull_cnt; i++ )
1486  {
1487  const ND_BOX *ndb = sample_boxes[i];
1488  stddev.min[d] += (ndb->min[d] - avg.min[d]) * (ndb->min[d] - avg.min[d]);
1489  stddev.max[d] += (ndb->max[d] - avg.max[d]) * (ndb->max[d] - avg.max[d]);
1490  }
1491  stddev.min[d] = sqrt(stddev.min[d] / notnull_cnt);
1492  stddev.max[d] = sqrt(stddev.max[d] / notnull_cnt);
1493 
1494  /* Histogram bounds for this dimension bounds is avg +/- SDFACTOR * stdev */
1495  histo_extent.min[d] = Max(avg.min[d] - SDFACTOR * stddev.min[d], sample_extent.min[d]);
1496  histo_extent.max[d] = Min(avg.max[d] + SDFACTOR * stddev.max[d], sample_extent.max[d]);
1497  }
1498 
1499  /*
1500  * Third scan:
1501  * o skip hard deviants
1502  * o compute new histogram box
1503  */
1504  nd_box_init_bounds(&histo_extent_new);
1505  for ( i = 0; i < notnull_cnt; i++ )
1506  {
1507  const ND_BOX *ndb = sample_boxes[i];
1508  /* Skip any hard deviants (boxes entirely outside our histo_extent */
1509  if ( ! nd_box_intersects(&histo_extent, ndb, ndims) )
1510  {
1511  POSTGIS_DEBUGF(4, " feature %d is a hard deviant, skipped", i);
1512  sample_boxes[i] = NULL;
1513  continue;
1514  }
1515  /* Expand our new box to fit all the other features. */
1516  nd_box_merge(ndb, &histo_extent_new);
1517  }
1518  /*
1519  * Expand the box slightly (1%) to avoid edge effects
1520  * with objects that are on the boundary
1521  */
1522  nd_box_expand(&histo_extent_new, 0.01);
1523  histo_extent = histo_extent_new;
1524 
1525  /*
1526  * How should we allocate our histogram cells to the
1527  * different dimensions? We can't do it by raw dimensional width,
1528  * because in x/y/z space, the z can have different units
1529  * from the x/y. Similarly for x/y/t space.
1530  * So, we instead calculate how much features overlap
1531  * each other in their dimension to figure out which
1532  * dimensions have useful selectivity characteristics (more
1533  * variability in density) and therefor would find
1534  * more cells useful (to distinguish between dense places and
1535  * homogeneous places).
1536  */
1537  nd_box_array_distribution(sample_boxes, notnull_cnt, &histo_extent, ndims,
1538  sample_distribution);
1539 
1540  /*
1541  * The sample_distribution array now tells us how spread out the
1542  * data is in each dimension, so we use that data to allocate
1543  * the histogram cells we have available.
1544  * At this point, histo_cells_target is the approximate target number
1545  * of cells.
1546  */
1547 
1548  /*
1549  * Some dimensions have basically a uniform distribution, we want
1550  * to allocate no cells to those dimensions, only to dimensions
1551  * that have some interesting differences in data distribution.
1552  * Here we count up the number of interesting dimensions
1553  */
1554  for ( d = 0; d < ndims; d++ )
1555  {
1556  if ( sample_distribution[d] > 0 )
1557  histo_ndims++;
1558  }
1559 
1560  if ( histo_ndims == 0 )
1561  {
1562  /* Special case: all our dimensions had low variability! */
1563  /* We just divide the cells up evenly */
1564  POSTGIS_DEBUG(3, " special case: no axes have variability");
1565  histo_cells_new = 1;
1566  for ( d = 0; d < ndims; d++ )
1567  {
1568  histo_size[d] = 1 + (int)pow((double)histo_cells_target, 1/(double)ndims);
1569  POSTGIS_DEBUGF(3, " histo_size[d]: %d", histo_size[d]);
1570  histo_cells_new *= histo_size[d];
1571  }
1572  POSTGIS_DEBUGF(3, " histo_cells_new: %d", histo_cells_new);
1573  }
1574  else
1575  {
1576  /*
1577  * We're going to express the amount of variability in each dimension
1578  * as a proportion of the total variability and allocate cells in that
1579  * dimension relative to that proportion.
1580  */
1581  POSTGIS_DEBUG(3, " allocating histogram axes based on axis variability");
1582  total_distribution = total_double(sample_distribution, ndims); /* First get the total */
1583  POSTGIS_DEBUGF(3, " total_distribution: %.8g", total_distribution);
1584  histo_cells_new = 1; /* For the number of cells in the final histogram */
1585  for ( d = 0; d < ndims; d++ )
1586  {
1587  if ( sample_distribution[d] == 0 ) /* Uninteresting dimensions don't get any room */
1588  {
1589  histo_size[d] = 1;
1590  }
1591  else /* Interesting dimension */
1592  {
1593  /* How does this dims variability compare to the total? */
1594  float edge_ratio = (float)sample_distribution[d] / (float)total_distribution;
1595  /*
1596  * Scale the target cells number by the # of dims and ratio,
1597  * then take the appropriate root to get the estimated number of cells
1598  * on this axis (eg, pow(0.5) for 2d, pow(0.333) for 3d, pow(0.25) for 4d)
1599  */
1600  histo_size[d] = (int)pow(histo_cells_target * histo_ndims * edge_ratio, 1/(double)histo_ndims);
1601  /* If something goes awry, just give this dim one slot */
1602  if ( ! histo_size[d] )
1603  histo_size[d] = 1;
1604  }
1605  histo_cells_new *= histo_size[d];
1606  }
1607  POSTGIS_DEBUGF(3, " histo_cells_new: %d", histo_cells_new);
1608  }
1609 
1610  /* Update histo_cells to the actual number of cells we need to allocate */
1611  histo_cells = histo_cells_new;
1612  POSTGIS_DEBUGF(3, " histo_cells: %d", histo_cells);
1613 
1614  /*
1615  * Create the histogram (ND_STATS) in the stats memory context
1616  */
1617  old_context = MemoryContextSwitchTo(stats->anl_context);
1618  nd_stats_size = sizeof(ND_STATS) + ((histo_cells - 1) * sizeof(float4));
1619  nd_stats = palloc(nd_stats_size);
1620  memset(nd_stats, 0, nd_stats_size); /* Initialize all values to 0 */
1621  MemoryContextSwitchTo(old_context);
1622 
1623  /* Initialize the #ND_STATS objects */
1624  nd_stats->ndims = ndims;
1625  nd_stats->extent = histo_extent;
1626  nd_stats->sample_features = sample_rows;
1627  nd_stats->table_features = total_rows;
1628  nd_stats->not_null_features = notnull_cnt;
1629  /* Copy in the histogram dimensions */
1630  for ( d = 0; d < ndims; d++ )
1631  nd_stats->size[d] = histo_size[d];
1632 
1633  /*
1634  * Fourth scan:
1635  * o fill histogram values with the proportion of
1636  * features' bbox overlaps: a feature's bvol
1637  * can fully overlap (1) or partially overlap
1638  * (fraction of 1) an histogram cell.
1639  *
1640  * Note that we are filling each cell with the "portion of
1641  * the feature's box that overlaps the cell". So, if we sum
1642  * up the values in the histogram, we could get the
1643  * histogram feature count.
1644  *
1645  */
1646  for ( i = 0; i < notnull_cnt; i++ )
1647  {
1648  const ND_BOX *nd_box;
1649  ND_IBOX nd_ibox;
1650  int at[ND_DIMS];
1651  int d;
1652  double num_cells = 0;
1653  double tmp_volume = 1.0;
1654  double min[ND_DIMS];
1655  double max[ND_DIMS];
1656  double cellsize[ND_DIMS];
1657 
1658  nd_box = sample_boxes[i];
1659  if ( ! nd_box ) continue; /* Skip Null'ed out hard deviants */
1660 
1661  /* Give backend a chance of interrupting us */
1662  vacuum_delay_point();
1663 
1664  /* Find the cells that overlap with this box and put them into the ND_IBOX */
1665  nd_box_overlap(nd_stats, nd_box, &nd_ibox);
1666  memset(at, 0, sizeof(int)*ND_DIMS);
1667 
1668  POSTGIS_DEBUGF(3, " feature %d: ibox (%d, %d, %d, %d) (%d, %d, %d, %d)", i,
1669  nd_ibox.min[0], nd_ibox.min[1], nd_ibox.min[2], nd_ibox.min[3],
1670  nd_ibox.max[0], nd_ibox.max[1], nd_ibox.max[2], nd_ibox.max[3]);
1671 
1672  for ( d = 0; d < nd_stats->ndims; d++ )
1673  {
1674  /* Initialize the starting values */
1675  at[d] = nd_ibox.min[d];
1676  min[d] = nd_stats->extent.min[d];
1677  max[d] = nd_stats->extent.max[d];
1678  cellsize[d] = (max[d] - min[d])/(nd_stats->size[d]);
1679 
1680  /* What's the volume (area) of this feature's box? */
1681  tmp_volume *= (nd_box->max[d] - nd_box->min[d]);
1682  }
1683 
1684  /* Add feature volume (area) to our total */
1685  total_sample_volume += tmp_volume;
1686 
1687  /*
1688  * Move through all the overlaped histogram cells values and
1689  * add the box overlap proportion to them.
1690  */
1691  do
1692  {
1693  ND_BOX nd_cell;
1694  double ratio;
1695  /* Create a box for this histogram cell */
1696  for ( d = 0; d < nd_stats->ndims; d++ )
1697  {
1698  nd_cell.min[d] = min[d] + (at[d]+0) * cellsize[d];
1699  nd_cell.max[d] = min[d] + (at[d]+1) * cellsize[d];
1700  }
1701 
1702  /*
1703  * If a feature box is completely inside one cell the ratio will be
1704  * 1.0. If a feature box is 50% in two cells, each cell will get
1705  * 0.5 added on.
1706  */
1707  ratio = nd_box_ratio(&nd_cell, nd_box, nd_stats->ndims);
1708  nd_stats->value[nd_stats_value_index(nd_stats, at)] += ratio;
1709  num_cells += ratio;
1710  POSTGIS_DEBUGF(3, " ratio (%.8g) num_cells (%.8g)", ratio, num_cells);
1711  POSTGIS_DEBUGF(3, " at (%d, %d, %d, %d)", at[0], at[1], at[2], at[3]);
1712  }
1713  while ( nd_increment(&nd_ibox, nd_stats->ndims, at) );
1714 
1715  /* Keep track of overall number of overlaps counted */
1716  total_cell_count += num_cells;
1717  /* How many features have we added to this histogram? */
1718  histogram_features++;
1719  }
1720 
1721  POSTGIS_DEBUGF(3, " histogram_features: %d", histogram_features);
1722  POSTGIS_DEBUGF(3, " sample_rows: %d", sample_rows);
1723  POSTGIS_DEBUGF(3, " table_rows: %.6g", total_rows);
1724 
1725  /* Error out if we got no sample information */
1726  if ( ! histogram_features )
1727  {
1728  POSTGIS_DEBUG(3, " no stats have been gathered");
1729  elog(NOTICE, " no features lie in the stats histogram, invalid stats");
1730  stats->stats_valid = false;
1731  return;
1732  }
1733 
1734  nd_stats->histogram_features = histogram_features;
1735  nd_stats->histogram_cells = histo_cells;
1736  nd_stats->cells_covered = total_cell_count;
1737 
1738  /* Put this histogram data into the right slot/kind */
1739  if ( mode == 2 )
1740  {
1741  stats_slot = STATISTIC_SLOT_2D;
1742  stats_kind = STATISTIC_KIND_2D;
1743  }
1744  else
1745  {
1746  stats_slot = STATISTIC_SLOT_ND;
1747  stats_kind = STATISTIC_KIND_ND;
1748  }
1749 
1750  /* Write the statistics data */
1751  stats->stakind[stats_slot] = stats_kind;
1752  stats->staop[stats_slot] = InvalidOid;
1753  stats->stanumbers[stats_slot] = (float4*)nd_stats;
1754  stats->numnumbers[stats_slot] = nd_stats_size/sizeof(float4);
1755  stats->stanullfrac = (float4)null_cnt/sample_rows;
1756  stats->stawidth = total_width/notnull_cnt;
1757  stats->stadistinct = -1.0;
1758  stats->stats_valid = true;
1759 
1760  POSTGIS_DEBUGF(3, " out: slot 0: kind %d (STATISTIC_KIND_ND)", stats->stakind[0]);
1761  POSTGIS_DEBUGF(3, " out: slot 0: op %d (InvalidOid)", stats->staop[0]);
1762  POSTGIS_DEBUGF(3, " out: slot 0: numnumbers %d", stats->numnumbers[0]);
1763  POSTGIS_DEBUGF(3, " out: null fraction: %f=%d/%d", stats->stanullfrac, null_cnt, sample_rows);
1764  POSTGIS_DEBUGF(3, " out: average width: %d bytes", stats->stawidth);
1765  POSTGIS_DEBUG (3, " out: distinct values: all (no check done)");
1766  POSTGIS_DEBUGF(3, " out: %s", nd_stats_to_json(nd_stats));
1767  /*
1768  POSTGIS_DEBUGF(3, " out histogram:\n%s", nd_stats_to_grid(nd_stats));
1769  */
1770 
1771  return;
1772 }
1773 
1774 
1792 static void
1793 compute_gserialized_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
1794  int sample_rows, double total_rows)
1795 {
1796  /* 2D Mode */
1797  compute_gserialized_stats_mode(stats, fetchfunc, sample_rows, total_rows, 2);
1798  /* ND Mode */
1799  compute_gserialized_stats_mode(stats, fetchfunc, sample_rows, total_rows, 0);
1800 }
1801 
1802 
1831 Datum gserialized_analyze_nd(PG_FUNCTION_ARGS)
1832 {
1833  VacAttrStats *stats = (VacAttrStats *)PG_GETARG_POINTER(0);
1834  Form_pg_attribute attr = stats->attr;
1835 
1836  POSTGIS_DEBUG(2, "gserialized_analyze_nd called");
1837 
1838  /* If the attstattarget column is negative, use the default value */
1839  /* NB: it is okay to scribble on stats->attr since it's a copy */
1840  if (attr->attstattarget < 0)
1841  attr->attstattarget = default_statistics_target;
1842 
1843  POSTGIS_DEBUGF(3, " attribute stat target: %d", attr->attstattarget);
1844 
1845  /* Setup the minimum rows and the algorithm function */
1846  stats->minrows = 300 * stats->attr->attstattarget;
1847  stats->compute_stats = compute_gserialized_stats;
1848 
1849  POSTGIS_DEBUGF(3, " minrows: %d", stats->minrows);
1850 
1851  /* Indicate we are done successfully */
1852  PG_RETURN_BOOL(true);
1853 }
1854 
1867 static float8
1868 estimate_selectivity(const GBOX *box, const ND_STATS *nd_stats, int mode)
1869 {
1870  int d; /* counter */
1871  float8 selectivity;
1872  ND_BOX nd_box;
1873  ND_IBOX nd_ibox;
1874  int at[ND_DIMS];
1875  double cell_size[ND_DIMS];
1876  double min[ND_DIMS];
1877  double max[ND_DIMS];
1878  double total_count = 0.0;
1879  int ndims_max = Max(nd_stats->ndims, gbox_ndims(box));
1880 // int ndims_min = Min(nd_stats->ndims, gbox_ndims(box));
1881 
1882  /* Calculate the overlap of the box on the histogram */
1883  if ( ! nd_stats )
1884  {
1885  elog(NOTICE, " estimate_selectivity called with null input");
1886  return FALLBACK_ND_SEL;
1887  }
1888 
1889  /* Initialize nd_box. */
1890  nd_box_from_gbox(box, &nd_box);
1891 
1892  /*
1893  * To return 2D stats on an ND sample, we need to make the
1894  * 2D box cover the full range of the other dimensions in the
1895  * histogram.
1896  */
1897  POSTGIS_DEBUGF(3, " mode: %d", mode);
1898  if ( mode == 2 )
1899  {
1900  POSTGIS_DEBUG(3, " in 2d mode, stripping the computation down to 2d");
1901  ndims_max = 2;
1902  }
1903 
1904  POSTGIS_DEBUGF(3, " nd_stats->extent: %s", nd_box_to_json(&(nd_stats->extent), nd_stats->ndims));
1905  POSTGIS_DEBUGF(3, " nd_box: %s", nd_box_to_json(&(nd_box), gbox_ndims(box)));
1906 
1907  /*
1908  * Search box completely misses histogram extent?
1909  * We have to intersect in all N dimensions or else we have
1910  * zero interaction under the &&& operator. It's important
1911  * to short circuit in this case, as some of the tests below
1912  * will return junk results when run on non-intersecting inputs.
1913  */
1914  if ( ! nd_box_intersects(&nd_box, &(nd_stats->extent), ndims_max) )
1915  {
1916  POSTGIS_DEBUG(3, " search box does not overlap histogram, returning 0");
1917  return 0.0;
1918  }
1919 
1920  /* Search box completely contains histogram extent! */
1921  if ( nd_box_contains(&nd_box, &(nd_stats->extent), ndims_max) )
1922  {
1923  POSTGIS_DEBUG(3, " search box contains histogram, returning 1");
1924  return 1.0;
1925  }
1926 
1927  /* Calculate the overlap of the box on the histogram */
1928  if ( ! nd_box_overlap(nd_stats, &nd_box, &nd_ibox) )
1929  {
1930  POSTGIS_DEBUG(3, " search box overlap with stats histogram failed");
1931  return FALLBACK_ND_SEL;
1932  }
1933 
1934  /* Work out some measurements of the histogram */
1935  for ( d = 0; d < nd_stats->ndims; d++ )
1936  {
1937  /* Cell size in each dim */
1938  min[d] = nd_stats->extent.min[d];
1939  max[d] = nd_stats->extent.max[d];
1940  cell_size[d] = (max[d] - min[d]) / nd_stats->size[d];
1941  POSTGIS_DEBUGF(3, " cell_size[%d] : %.9g", d, cell_size[d]);
1942 
1943  /* Initialize the counter */
1944  at[d] = nd_ibox.min[d];
1945  }
1946 
1947  /* Move through all the overlap values and sum them */
1948  do
1949  {
1950  float cell_count, ratio;
1951  ND_BOX nd_cell;
1952 
1953  /* We have to pro-rate partially overlapped cells. */
1954  for ( d = 0; d < nd_stats->ndims; d++ )
1955  {
1956  nd_cell.min[d] = min[d] + (at[d]+0) * cell_size[d];
1957  nd_cell.max[d] = min[d] + (at[d]+1) * cell_size[d];
1958  }
1959 
1960  ratio = nd_box_ratio(&nd_box, &nd_cell, nd_stats->ndims);
1961  cell_count = nd_stats->value[nd_stats_value_index(nd_stats, at)];
1962 
1963  /* Add the pro-rated count for this cell to the overall total */
1964  total_count += cell_count * ratio;
1965  POSTGIS_DEBUGF(4, " cell (%d,%d), cell value %.6f, ratio %.6f", at[0], at[1], cell_count, ratio);
1966  }
1967  while ( nd_increment(&nd_ibox, nd_stats->ndims, at) );
1968 
1969  /* Scale by the number of features in our histogram to get the proportion */
1970  selectivity = total_count / nd_stats->histogram_features;
1971 
1972  POSTGIS_DEBUGF(3, " nd_stats->histogram_features = %f", nd_stats->histogram_features);
1973  POSTGIS_DEBUGF(3, " nd_stats->histogram_cells = %f", nd_stats->histogram_cells);
1974  POSTGIS_DEBUGF(3, " sum(overlapped histogram cells) = %f", total_count);
1975  POSTGIS_DEBUGF(3, " selectivity = %f", selectivity);
1976 
1977  /* Prevent rounding overflows */
1978  if (selectivity > 1.0) selectivity = 1.0;
1979  else if (selectivity < 0.0) selectivity = 0.0;
1980 
1981  return selectivity;
1982 }
1983 
1984 
1985 
1991 Datum _postgis_gserialized_stats(PG_FUNCTION_ARGS)
1992 {
1993  Oid table_oid = PG_GETARG_OID(0);
1994  text *att_text = PG_GETARG_TEXT_P(1);
1995  ND_STATS *nd_stats;
1996  char *str;
1997  text *json;
1998  int mode = 2; /* default to 2D mode */
1999  bool only_parent = FALSE; /* default to whole tree stats */
2000 
2001  /* Check if we've been asked to not use 2d mode */
2002  if ( ! PG_ARGISNULL(2) )
2003  mode = text_p_get_mode(PG_GETARG_TEXT_P(2));
2004 
2005  /* Check if we've been asked to only use stats from parent */
2006  if ( ! PG_ARGISNULL(3) )
2007  only_parent = PG_GETARG_BOOL(3);
2008 
2009  /* Retrieve the stats object */
2010  nd_stats = pg_get_nd_stats_by_name(table_oid, att_text, mode, only_parent);
2011  if ( ! nd_stats )
2012  elog(ERROR, "stats for \"%s.%s\" do not exist", get_rel_name(table_oid), text2cstring(att_text));
2013 
2014  /* Convert to JSON */
2015  str = nd_stats_to_json(nd_stats);
2016  json = cstring2text(str);
2017  pfree(str);
2018  pfree(nd_stats);
2019  PG_RETURN_TEXT_P(json);
2020 }
2021 
2022 
2028 Datum _postgis_gserialized_sel(PG_FUNCTION_ARGS)
2029 {
2030  Oid table_oid = PG_GETARG_OID(0);
2031  text *att_text = PG_GETARG_TEXT_P(1);
2032  Datum geom_datum = PG_GETARG_DATUM(2);
2033  GBOX gbox; /* search box read from gserialized datum */
2034  float8 selectivity = 0;
2035  ND_STATS *nd_stats;
2036  int mode = 2; /* 2D mode by default */
2037 
2038  /* Check if we've been asked to not use 2d mode */
2039  if ( ! PG_ARGISNULL(3) )
2040  mode = text_p_get_mode(PG_GETARG_TEXT_P(3));
2041 
2042  /* Retrieve the stats object */
2043  nd_stats = pg_get_nd_stats_by_name(table_oid, att_text, mode, FALSE);
2044 
2045  if ( ! nd_stats )
2046  elog(ERROR, "stats for \"%s.%s\" do not exist", get_rel_name(table_oid), text2cstring(att_text));
2047 
2048  /* Calculate the gbox */
2049  if ( ! gserialized_datum_get_gbox_p(geom_datum, &gbox) )
2050  elog(ERROR, "unable to calculate bounding box from geometry");
2051 
2052  POSTGIS_DEBUGF(3, " %s", gbox_to_string(&gbox));
2053 
2054  /* Do the estimation */
2055  selectivity = estimate_selectivity(&gbox, nd_stats, mode);
2056 
2057  pfree(nd_stats);
2058  PG_RETURN_FLOAT8(selectivity);
2059 }
2060 
2061 
2067 Datum _postgis_gserialized_joinsel(PG_FUNCTION_ARGS)
2068 {
2069  Oid table_oid1 = PG_GETARG_OID(0);
2070  text *att_text1 = PG_GETARG_TEXT_P(1);
2071  Oid table_oid2 = PG_GETARG_OID(2);
2072  text *att_text2 = PG_GETARG_TEXT_P(3);
2073  ND_STATS *nd_stats1, *nd_stats2;
2074  float8 selectivity = 0;
2075  int mode = 2; /* 2D mode by default */
2076 
2077 
2078  /* Retrieve the stats object */
2079  nd_stats1 = pg_get_nd_stats_by_name(table_oid1, att_text1, mode, FALSE);
2080  nd_stats2 = pg_get_nd_stats_by_name(table_oid2, att_text2, mode, FALSE);
2081 
2082  if ( ! nd_stats1 )
2083  elog(ERROR, "stats for \"%s.%s\" do not exist", get_rel_name(table_oid1), text2cstring(att_text1));
2084 
2085  if ( ! nd_stats2 )
2086  elog(ERROR, "stats for \"%s.%s\" do not exist", get_rel_name(table_oid2), text2cstring(att_text2));
2087 
2088  /* Check if we've been asked to not use 2d mode */
2089  if ( ! PG_ARGISNULL(4) )
2090  {
2091  text *modetxt = PG_GETARG_TEXT_P(4);
2092  char *modestr = text2cstring(modetxt);
2093  if ( modestr[0] == 'N' )
2094  mode = 0;
2095  }
2096 
2097  /* Do the estimation */
2098  selectivity = estimate_join_selectivity(nd_stats1, nd_stats2);
2099 
2100  pfree(nd_stats1);
2101  pfree(nd_stats2);
2102  PG_RETURN_FLOAT8(selectivity);
2103 }
2104 
2110 Datum gserialized_gist_sel_2d(PG_FUNCTION_ARGS)
2111 {
2112  PG_RETURN_DATUM(DirectFunctionCall5(
2114  PG_GETARG_DATUM(0), PG_GETARG_DATUM(1),
2115  PG_GETARG_DATUM(2), PG_GETARG_DATUM(3),
2116  Int32GetDatum(2) /* 2-D mode */
2117  ));
2118 }
2119 
2125 Datum gserialized_gist_sel_nd(PG_FUNCTION_ARGS)
2126 {
2127  PG_RETURN_DATUM(DirectFunctionCall5(
2129  PG_GETARG_DATUM(0), PG_GETARG_DATUM(1),
2130  PG_GETARG_DATUM(2), PG_GETARG_DATUM(3),
2131  Int32GetDatum(0) /* N-D mode */
2132  ));
2133 }
2134 
2149 Datum gserialized_gist_sel(PG_FUNCTION_ARGS)
2150 {
2151  PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
2152  /* Oid operator_oid = PG_GETARG_OID(1); */
2153  List *args = (List *) PG_GETARG_POINTER(2);
2154  /* int varRelid = PG_GETARG_INT32(3); */
2155  int mode = PG_GETARG_INT32(4);
2156 
2157  VariableStatData vardata;
2158  ND_STATS *nd_stats = NULL;
2159 
2160  Node *other;
2161  Var *self;
2162  GBOX search_box;
2163  float8 selectivity = 0;
2164 
2165  POSTGIS_DEBUG(2, "gserialized_gist_sel called");
2166 
2167  /*
2168  * TODO: This is a big one,
2169  * All this statistics code *only* tries to generate a valid
2170  * selectivity for && and &&&. That leaves all the other
2171  * geometry operators with bad stats! The selectivity
2172  * calculation should take account of the incoming operator
2173  * type and do the right thing.
2174  */
2175 
2176  /* Fail if not a binary opclause (probably shouldn't happen) */
2177  if (list_length(args) != 2)
2178  {
2179  POSTGIS_DEBUG(3, "gserialized_gist_sel: not a binary opclause");
2180  PG_RETURN_FLOAT8(DEFAULT_ND_SEL);
2181  }
2182 
2183  /* Find the constant part */
2184  other = (Node *) linitial(args);
2185  if ( ! IsA(other, Const) )
2186  {
2187  self = (Var *)other;
2188  other = (Node *) lsecond(args);
2189  }
2190  else
2191  {
2192  self = (Var *) lsecond(args);
2193  }
2194 
2195  if ( ! IsA(other, Const) )
2196  {
2197  POSTGIS_DEBUG(3, " no constant arguments - returning a default selectivity");
2198  PG_RETURN_FLOAT8(DEFAULT_ND_SEL);
2199  }
2200 
2201  /* Convert the constant to a BOX */
2202  if( ! gserialized_datum_get_gbox_p(((Const*)other)->constvalue, &search_box) )
2203  {
2204  POSTGIS_DEBUG(3, "search box is EMPTY");
2205  PG_RETURN_FLOAT8(0.0);
2206  }
2207  POSTGIS_DEBUGF(4, " requested search box is: %s", gbox_to_string(&search_box));
2208 
2209  /* Get pg_statistic row */
2210  examine_variable(root, (Node*)self, 0, &vardata);
2211  if ( vardata.statsTuple ) {
2212  nd_stats = pg_nd_stats_from_tuple(vardata.statsTuple, mode);
2213  }
2214  ReleaseVariableStats(vardata);
2215 
2216  if ( ! nd_stats )
2217  {
2218  POSTGIS_DEBUG(3, " unable to load stats from syscache, not analyzed yet?");
2219  PG_RETURN_FLOAT8(FALLBACK_ND_SEL);
2220  }
2221 
2222  POSTGIS_DEBUGF(4, " got stats:\n%s", nd_stats_to_json(nd_stats));
2223 
2224  /* Do the estimation! */
2225  selectivity = estimate_selectivity(&search_box, nd_stats, mode);
2226  POSTGIS_DEBUGF(3, " returning computed value: %f", selectivity);
2227 
2228  pfree(nd_stats);
2229  PG_RETURN_FLOAT8(selectivity);
2230 }
2231 
2232 
2233 
2240 Datum gserialized_estimated_extent(PG_FUNCTION_ARGS)
2241 {
2242  char *nsp = NULL;
2243  char *tbl = NULL;
2244  text *col = NULL;
2245  char *nsp_tbl = NULL;
2246  Oid tbl_oid;
2247  ND_STATS *nd_stats;
2248  GBOX *gbox;
2249  bool only_parent = FALSE;
2250 
2251  if ( PG_NARGS() == 4 )
2252  {
2253  nsp = text2cstring(PG_GETARG_TEXT_P(0));
2254  tbl = text2cstring(PG_GETARG_TEXT_P(1));
2255  col = PG_GETARG_TEXT_P(2);
2256  only_parent = PG_GETARG_BOOL(3);
2257  nsp_tbl = palloc(strlen(nsp) + strlen(tbl) + 6);
2258  sprintf(nsp_tbl, "\"%s\".\"%s\"", nsp, tbl);
2259  tbl_oid = DatumGetObjectId(DirectFunctionCall1(regclassin, CStringGetDatum(nsp_tbl)));
2260  pfree(nsp_tbl);
2261  }
2262  else if ( PG_NARGS() == 3 )
2263  {
2264  nsp = text2cstring(PG_GETARG_TEXT_P(0));
2265  tbl = text2cstring(PG_GETARG_TEXT_P(1));
2266  col = PG_GETARG_TEXT_P(2);
2267  nsp_tbl = palloc(strlen(nsp) + strlen(tbl) + 6);
2268  sprintf(nsp_tbl, "\"%s\".\"%s\"", nsp, tbl);
2269  tbl_oid = DatumGetObjectId(DirectFunctionCall1(regclassin, CStringGetDatum(nsp_tbl)));
2270  pfree(nsp_tbl);
2271  }
2272  else if ( PG_NARGS() == 2 )
2273  {
2274  tbl = text2cstring(PG_GETARG_TEXT_P(0));
2275  col = PG_GETARG_TEXT_P(1);
2276  nsp_tbl = palloc(strlen(tbl) + 3);
2277  sprintf(nsp_tbl, "\"%s\"", tbl);
2278  tbl_oid = DatumGetObjectId(DirectFunctionCall1(regclassin, CStringGetDatum(nsp_tbl)));
2279  pfree(nsp_tbl);
2280  }
2281  else
2282  {
2283  elog(ERROR, "estimated_extent() called with wrong number of arguments");
2284  PG_RETURN_NULL();
2285  }
2286 
2287  /* Estimated extent only returns 2D bounds, so use mode 2 */
2288  nd_stats = pg_get_nd_stats_by_name(tbl_oid, col, 2, only_parent);
2289 
2290  /* Error out on no stats */
2291  if ( ! nd_stats ) {
2292  elog(WARNING, "stats for \"%s.%s\" do not exist", tbl, text2cstring(col));
2293  PG_RETURN_NULL();
2294  }
2295 
2296  /* Construct the box */
2297  gbox = palloc(sizeof(GBOX));
2298  FLAGS_SET_GEODETIC(gbox->flags, 0);
2299  FLAGS_SET_Z(gbox->flags, 0);
2300  FLAGS_SET_M(gbox->flags, 0);
2301  gbox->xmin = nd_stats->extent.min[0];
2302  gbox->xmax = nd_stats->extent.max[0];
2303  gbox->ymin = nd_stats->extent.min[1];
2304  gbox->ymax = nd_stats->extent.max[1];
2305 
2306  pfree(nd_stats);
2307  PG_RETURN_POINTER(gbox);
2308 }
2309 
2317 Datum geometry_estimated_extent(PG_FUNCTION_ARGS)
2318 {
2319  if ( PG_NARGS() == 3 )
2320  {
2321  PG_RETURN_DATUM(
2322  DirectFunctionCall3(gserialized_estimated_extent,
2323  PG_GETARG_DATUM(0),
2324  PG_GETARG_DATUM(1),
2325  PG_GETARG_DATUM(2)));
2326  }
2327  else if ( PG_NARGS() == 2 )
2328  {
2329  PG_RETURN_DATUM(
2330  DirectFunctionCall2(gserialized_estimated_extent,
2331  PG_GETARG_DATUM(0),
2332  PG_GETARG_DATUM(1)));
2333  }
2334 
2335  elog(ERROR, "geometry_estimated_extent() called with wrong number of arguments");
2336  PG_RETURN_NULL();
2337 }
args
Definition: ovdump.py:44
int gserialized_get_gbox_p(const GSERIALIZED *g, GBOX *box)
Read the bounding box off a serialization and calculate one if it is not already there.
Definition: g_serialized.c:398
static int nd_increment(ND_IBOX *ibox, int ndims, int *counter)
Given an n-d index array (counter), and a domain to increment it in (ibox) increment it by one...
static ND_STATS * pg_get_nd_stats_by_name(const Oid table_oid, const text *att_text, int mode, bool only_parent)
Pull the stats object from the PgSQL system catalogs.
static int text_p_get_mode(const text *txt)
Utility function to see if the first letter of the mode argument is &#39;N&#39;.
static int nd_box_overlap(const ND_STATS *nd_stats, const ND_BOX *nd_box, ND_IBOX *nd_ibox)
What stats cells overlap with this ND_BOX? Put the lowest cell addresses in ND_IBOX->min and the high...
PG_FUNCTION_INFO_V1(gserialized_gist_joinsel_nd)
For (geometry &&& geometry) and (geography && geography) we call into the N-D mode.
#define DEFAULT_ND_JOINSEL
static void compute_gserialized_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc, int sample_rows, double total_rows)
In order to do useful selectivity calculations in both 2-D and N-D modes, we actually have to generat...
stringbuffer_t * stringbuffer_create(void)
Allocate a new stringbuffer_t.
Definition: stringbuffer.c:35
#define NUM_BINS
int gbox_is_valid(const GBOX *gbox)
Return false if any of the dimensions is NaN or infinite.
Definition: g_box.c:209
#define MIN_DIMENSION_WIDTH
Minimum width of a dimension that we&#39;ll bother trying to compute statistics on.
char * gbox_to_string(const GBOX *gbox)
Allocate a string representation of the GBOX, based on dimensionality of flags.
Definition: g_box.c:404
#define FLAGS_GET_GEODETIC(flags)
Definition: liblwgeom.h:142
char * stringbuffer_getstringcopy(stringbuffer_t *s)
Returns a newly allocated string large enough to contain the current state of the string...
Definition: stringbuffer.c:160
double xmax
Definition: liblwgeom.h:292
#define ND_DIMS
The maximum number of dimensions our code can handle.
static void compute_gserialized_stats_mode(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc, int sample_rows, double total_rows, int mode)
The gserialized_analyze_nd sets this function as a callback on the stats object when called by the AN...
Datum geometry_estimated_extent(PG_FUNCTION_ARGS)
static int nd_box_contains(const ND_BOX *a, const ND_BOX *b, int ndims)
Return TRUE if ND_BOX a contains b, false otherwise.
Datum _postgis_gserialized_stats(PG_FUNCTION_ARGS)
#define STATISTIC_SLOT_2D
static void nd_box_from_gbox(const GBOX *gbox, ND_BOX *nd_box)
Set the values of an ND_BOX from a GBOX.
struct ND_BOX_T ND_BOX
N-dimensional box type for calculations, to avoid doing explicit axis conversions from GBOX in all ca...
#define FALLBACK_ND_JOINSEL
Datum gserialized_estimated_extent(PG_FUNCTION_ARGS)
#define FALLBACK_ND_SEL
More modest fallback selectivity factor.
#define FLAGS_SET_GEODETIC(flags, value)
Definition: liblwgeom.h:148
Datum _postgis_gserialized_joinsel(PG_FUNCTION_ARGS)
static float8 estimate_selectivity(const GBOX *box, const ND_STATS *nd_stats, int mode)
This function returns an estimate of the selectivity of a search GBOX by looking at data in the ND_ST...
#define LW_FAILURE
Definition: liblwgeom.h:78
int stringbuffer_aprintf(stringbuffer_t *s, const char *fmt,...)
Appends a formatted string to the current string buffer, using the format and argument list provided...
Definition: stringbuffer.c:253
static int nd_box_init(ND_BOX *a)
Zero out an ND_BOX.
Datum gserialized_gist_joinsel(PG_FUNCTION_ARGS)
#define FLAGS_SET_Z(flags, value)
Definition: liblwgeom.h:145
double zmax
Definition: liblwgeom.h:296
double ymin
Definition: liblwgeom.h:293
Datum gserialized_gist_sel_2d(PG_FUNCTION_ARGS)
Datum gserialized_gist_joinsel_2d(PG_FUNCTION_ARGS)
Datum gserialized_gist_sel(PG_FUNCTION_ARGS)
static int nd_box_array_distribution(const ND_BOX **nd_boxes, int num_boxes, const ND_BOX *extent, int ndims, double *distribution)
Calculate how much a set of boxes is homogenously distributed or contentrated within one dimension...
double xmin
Definition: liblwgeom.h:291
static int nd_stats_value_index(const ND_STATS *stats, int *indexes)
Given a position in the n-d histogram (i,j,k) return the position in the 1-d values array...
float4 size[ND_DIMS]
Datum gserialized_gist_joinsel_nd(PG_FUNCTION_ARGS)
static int nd_box_intersects(const ND_BOX *a, const ND_BOX *b, int ndims)
Return TRUE if ND_BOX a overlaps b, false otherwise.
static double total_double(const double *vals, int nvals)
Given double array, return sum of values.
int min[ND_DIMS]
static ND_STATS * pg_nd_stats_from_tuple(HeapTuple stats_tuple, int mode)
char * text2cstring(const text *textptr)
Datum gserialized_gist_sel_nd(PG_FUNCTION_ARGS)
static int cmp_int(const void *a, const void *b)
Integer comparison function for qsort.
static int nd_box_init_bounds(ND_BOX *a)
Prepare an ND_BOX for bounds calculation: set the maxes to the smallest thing possible and the mins t...
double ymax
Definition: liblwgeom.h:294
N-dimensional box index type.
#define FLAGS_GET_Z(flags)
Macros for manipulating the &#39;flags&#39; byte.
Definition: liblwgeom.h:139
static int range_quintile(int *vals, int nvals)
The difference between the fourth and first quintile values, the "inter-quintile range".
Datum _postgis_gserialized_sel(PG_FUNCTION_ARGS)
static float8 estimate_join_selectivity(const ND_STATS *s1, const ND_STATS *s2)
Given two statistics histograms, what is the selectivity of a join driven by the && or &&& operator...
uint8_t flags
Definition: liblwgeom.h:290
static char * nd_box_to_json(const ND_BOX *nd_box, int ndims)
Convert an ND_BOX to a JSON string for printing.
float4 max[ND_DIMS]
void stringbuffer_append(stringbuffer_t *s, const char *a)
Append the specified string to the stringbuffer_t.
Definition: stringbuffer.c:134
#define STATISTIC_KIND_2D
struct ND_IBOX_T ND_IBOX
N-dimensional box index type.
void stringbuffer_destroy(stringbuffer_t *s)
Free the stringbuffer_t and all memory managed within it.
Definition: stringbuffer.c:78
int max[ND_DIMS]
struct ND_STATS_T ND_STATS
N-dimensional statistics structure.
float4 min[ND_DIMS]
#define FALSE
Definition: dbfopen.c:168
static int nd_box_merge(const ND_BOX *source, ND_BOX *target)
Create a printable view of the ND_STATS histogram.
static int nd_box_expand(ND_BOX *nd_box, double expansion_factor)
Expand an ND_BOX ever so slightly.
double mmin
Definition: liblwgeom.h:297
#define SDFACTOR
double zmin
Definition: liblwgeom.h:295
#define FLAGS_GET_M(flags)
Definition: liblwgeom.h:140
static char * nd_stats_to_json(const ND_STATS *nd_stats)
Convert an ND_STATS to a JSON representation for external use.
Datum gserialized_analyze_nd(PG_FUNCTION_ARGS)
double mmax
Definition: liblwgeom.h:298
#define STATISTIC_SLOT_ND
static double nd_box_ratio(const ND_BOX *b1, const ND_BOX *b2, int ndims)
Returns the proportion of b2 that is covered by b1.
int value
Definition: genraster.py:61
N-dimensional statistics structure.
static ND_STATS * pg_get_nd_stats(const Oid table_oid, AttrNumber att_num, int mode, bool only_parent)
Pull the stats object from the PgSQL system catalogs.
#define TRUE
Definition: dbfopen.c:169
N-dimensional box type for calculations, to avoid doing explicit axis conversions from GBOX in all ca...
static int gbox_ndims(const GBOX *gbox)
Given that geodetic boxes are X/Y/Z regardless of the underlying geometry dimensionality and other bo...
This library is the generic geometry handling section of PostGIS.
#define DEFAULT_ND_SEL
Default geometry selectivity factor.
#define STATISTIC_KIND_ND
Assign a number to the n-dimensional statistics kind.
#define FLAGS_SET_M(flags, value)
Definition: liblwgeom.h:146