PostGIS  2.4.9dev-r@@SVN_REVISION@@
gserialized_estimate.c
Go to the documentation of this file.
1 /**********************************************************************
2  *
3  * PostGIS - Spatial Types for PostgreSQL
4  * http://postgis.net
5  *
6  * PostGIS is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation, either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * PostGIS is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with PostGIS. If not, see <http://www.gnu.org/licenses/>.
18  *
19  **********************************************************************
20  *
21  * Copyright 2012 (C) Paul Ramsey <pramsey@cleverelephant.ca>
22  *
23  **********************************************************************/
24 
25 
26 
27 /**********************************************************************
28  THEORY OF OPERATION
29 
30 The ANALYZE command hooks to a callback (gserialized_analyze_nd) that
31 calculates (compute_gserialized_stats_mode) two histograms of occurances of
32 features, once for the 2D domain (and the && operator) one for the
33 ND domain (and the &&& operator).
34 
35 Queries in PostgreSQL call into the selectivity sub-system to find out
36 the relative effectiveness of different clauses in sub-setting
37 relations. Queries with constant arguments call gserialized_gist_sel,
38 queries with relations on both sides call gserialized_gist_joinsel.
39 
40 gserialized_gist_sel sums up the values in the histogram that overlap
41 the contant search box.
42 
43 gserialized_gist_joinsel sums up the product of the overlapping
44 cells in each relation's histogram.
45 
46 Depending on the operator and type, the mode of selectivity calculation
47 will be 2D or ND.
48 
49 - geometry && geometry ==> 2D
50 - geometry &&& geometry ==> ND
51 - geography && geography ==> ND
52 
53 The 2D mode is put in effect by retrieving the 2D histogram from the
54 statistics cache and then allowing the generic ND calculations to
55 go to work.
56 
57 TO DO: More testing and examination of the &&& operator and mixed
58 dimensionality cases. (2D geometry) &&& (3D column), etc.
59 
60 **********************************************************************/
61 
62 #include "postgres.h"
63 #include "executor/spi.h"
64 #include "fmgr.h"
65 #include "commands/vacuum.h"
66 #if PG_VERSION_NUM < 120000
67 #include "nodes/relation.h"
68 #else
69 #include "nodes/pathnodes.h"
70 #endif
71 #include "parser/parsetree.h"
72 #include "utils/array.h"
73 #include "utils/lsyscache.h"
74 #include "utils/builtins.h"
75 #include "utils/syscache.h"
76 #include "utils/rel.h"
77 #include "utils/selfuncs.h"
78 
79 #include "../postgis_config.h"
80 
81 #if POSTGIS_PGSQL_VERSION >= 93
82  #include "access/htup_details.h"
83 #endif
84 
85 #include "stringbuffer.h"
86 #include "liblwgeom.h"
87 #include "lwgeom_pg.h" /* For debugging macros. */
88 #include "gserialized_gist.h" /* For index common functions */
89 
90 #include <math.h>
91 #if HAVE_IEEEFP_H
92 #include <ieeefp.h>
93 #endif
94 #include <float.h>
95 #include <string.h>
96 #include <stdio.h>
97 #include <errno.h>
98 #include <ctype.h>
99 
100 /* Fall back to older finite() if necessary */
101 #ifndef HAVE_ISFINITE
102 # ifdef HAVE_GNU_ISFINITE
103 # define _GNU_SOURCE
104 # else
105 # define isfinite finite
106 # endif
107 #endif
108 
109 
110 /* Prototypes */
111 Datum gserialized_gist_joinsel(PG_FUNCTION_ARGS);
112 Datum gserialized_gist_joinsel_2d(PG_FUNCTION_ARGS);
113 Datum gserialized_gist_joinsel_nd(PG_FUNCTION_ARGS);
114 Datum gserialized_gist_sel(PG_FUNCTION_ARGS);
115 Datum gserialized_gist_sel_2d(PG_FUNCTION_ARGS);
116 Datum gserialized_gist_sel_nd(PG_FUNCTION_ARGS);
117 Datum gserialized_analyze_nd(PG_FUNCTION_ARGS);
118 Datum gserialized_estimated_extent(PG_FUNCTION_ARGS);
119 Datum _postgis_gserialized_sel(PG_FUNCTION_ARGS);
120 Datum _postgis_gserialized_joinsel(PG_FUNCTION_ARGS);
121 Datum _postgis_gserialized_stats(PG_FUNCTION_ARGS);
122 
123 /* Old Prototype */
124 Datum geometry_estimated_extent(PG_FUNCTION_ARGS);
125 
136 #define STATISTIC_KIND_ND 102
137 #define STATISTIC_KIND_2D 103
138 #define STATISTIC_SLOT_ND 0
139 #define STATISTIC_SLOT_2D 1
140 
141 /*
142 * The SD factor restricts the side of the statistics histogram
143 * based on the standard deviation of the extent of the data.
144 * SDFACTOR is the number of standard deviations from the mean
145 * the histogram will extend.
146 */
147 #define SDFACTOR 3.25
148 
154 #define ND_DIMS 4
155 
162 #define MIN_DIMENSION_WIDTH 0.000000001
163 
168 #define MAX_DIMENSION_WIDTH 1.0E+20
169 
173 #define DEFAULT_ND_SEL 0.0001
174 #define DEFAULT_ND_JOINSEL 0.001
175 
179 #define FALLBACK_ND_SEL 0.2
180 #define FALLBACK_ND_JOINSEL 0.3
181 
187 typedef struct ND_BOX_T
188 {
189  float4 min[ND_DIMS];
190  float4 max[ND_DIMS];
191 } ND_BOX;
192 
196 typedef struct ND_IBOX_T
197 {
198  int min[ND_DIMS];
199  int max[ND_DIMS];
200 } ND_IBOX;
201 
202 
209 typedef struct ND_STATS_T
210 {
211  /* Dimensionality of the histogram. */
212  float4 ndims;
213 
214  /* Size of n-d histogram in each dimension. */
215  float4 size[ND_DIMS];
216 
217  /* Lower-left (min) and upper-right (max) spatial bounds of histogram. */
219 
220  /* How many rows in the table itself? */
222 
223  /* How many rows were in the sample that built this histogram? */
225 
226  /* How many not-Null/Empty features were in the sample? */
228 
229  /* How many features actually got sampled in the histogram? */
231 
232  /* How many cells in histogram? (sizex*sizey*sizez*sizem) */
234 
235  /* How many cells did those histogram features cover? */
236  /* Since we are pro-rating coverage, this number should */
237  /* now always equal histogram_features */
239 
240  /* Variable length # of floats for histogram */
241  float4 value[1];
242 } ND_STATS;
243 
244 
245 
246 
253 static int
254 gbox_ndims(const GBOX* gbox)
255 {
256  int dims = 2;
257  if ( FLAGS_GET_GEODETIC(gbox->flags) )
258  return 3;
259  if ( FLAGS_GET_Z(gbox->flags) )
260  dims++;
261  if ( FLAGS_GET_M(gbox->flags) )
262  dims++;
263  return dims;
264 }
265 
271 static int
272 text_p_get_mode(const text *txt)
273 {
274  int mode = 2;
275  char *modestr = text2cstring(txt);
276  if ( modestr[0] == 'N' )
277  mode = 0;
278  pfree(modestr);
279  return mode;
280 }
281 
282 
286 static int
287 cmp_int (const void *a, const void *b)
288 {
289  int ia = *((const int*)a);
290  int ib = *((const int*)b);
291 
292  if ( ia == ib )
293  return 0;
294  else if ( ia > ib )
295  return 1;
296  else
297  return -1;
298 }
299 
304 static int
305 range_quintile(int *vals, int nvals)
306 {
307  qsort(vals, nvals, sizeof(int), cmp_int);
308  return vals[4*nvals/5] - vals[nvals/5];
309 }
310 
314 static double
315 total_double(const double *vals, int nvals)
316 {
317  int i;
318  float total = 0;
319  /* Calculate total */
320  for ( i = 0; i < nvals; i++ )
321  total += vals[i];
322 
323  return total;
324 }
325 
326 #if POSTGIS_DEBUG_LEVEL >= 3
327 
331 static int
332 total_int(const int *vals, int nvals)
333 {
334  int i;
335  int total = 0;
336  /* Calculate total */
337  for ( i = 0; i < nvals; i++ )
338  total += vals[i];
339 
340  return total;
341 }
342 
346 static double
347 avg(const int *vals, int nvals)
348 {
349  int t = total_int(vals, nvals);
350  return (double)t / (double)nvals;
351 }
352 
356 static double
357 stddev(const int *vals, int nvals)
358 {
359  int i;
360  double sigma2 = 0;
361  double mean = avg(vals, nvals);
362 
363  /* Calculate sigma2 */
364  for ( i = 0; i < nvals; i++ )
365  {
366  double v = (double)(vals[i]);
367  sigma2 += (mean - v) * (mean - v);
368  }
369  return sqrt(sigma2 / nvals);
370 }
371 #endif /* POSTGIS_DEBUG_LEVEL >= 3 */
372 
377 static int
378 nd_stats_value_index(const ND_STATS *stats, int *indexes)
379 {
380  int d;
381  int accum = 1, vdx = 0;
382 
383  /* Calculate the index into the 1-d values array that the (i,j,k,l) */
384  /* n-d histogram coordinate implies. */
385  /* index = x + y * sizex + z * sizex * sizey + m * sizex * sizey * sizez */
386  for ( d = 0; d < (int)(stats->ndims); d++ )
387  {
388  int size = (int)(stats->size[d]);
389  if ( indexes[d] < 0 || indexes[d] >= size )
390  {
391  POSTGIS_DEBUGF(3, " bad index at (%d, %d)", indexes[0], indexes[1]);
392  return -1;
393  }
394  vdx += indexes[d] * accum;
395  accum *= size;
396  }
397  return vdx;
398 }
399 
403 static char*
404 nd_box_to_json(const ND_BOX *nd_box, int ndims)
405 {
406  char *rv;
407  int i;
409 
410  stringbuffer_append(sb, "{\"min\":[");
411  for ( i = 0; i < ndims; i++ )
412  {
413  if ( i ) stringbuffer_append(sb, ",");
414  stringbuffer_aprintf(sb, "%.6g", nd_box->min[i]);
415  }
416  stringbuffer_append(sb, "],\"max\":[");
417  for ( i = 0; i < ndims; i++ )
418  {
419  if ( i ) stringbuffer_append(sb, ",");
420  stringbuffer_aprintf(sb, "%.6g", nd_box->max[i]);
421  }
422  stringbuffer_append(sb, "]}");
423 
426  return rv;
427 }
428 
429 
434 static char*
435 nd_stats_to_json(const ND_STATS *nd_stats)
436 {
437  char *json_extent, *str;
438  int d;
440  int ndims = (int)roundf(nd_stats->ndims);
441 
442  stringbuffer_append(sb, "{");
443  stringbuffer_aprintf(sb, "\"ndims\":%d,", ndims);
444 
445  /* Size */
446  stringbuffer_append(sb, "\"size\":[");
447  for ( d = 0; d < ndims; d++ )
448  {
449  if ( d ) stringbuffer_append(sb, ",");
450  stringbuffer_aprintf(sb, "%d", (int)roundf(nd_stats->size[d]));
451  }
452  stringbuffer_append(sb, "],");
453 
454  /* Extent */
455  json_extent = nd_box_to_json(&(nd_stats->extent), ndims);
456  stringbuffer_aprintf(sb, "\"extent\":%s,", json_extent);
457  pfree(json_extent);
458 
459  stringbuffer_aprintf(sb, "\"table_features\":%d,", (int)roundf(nd_stats->table_features));
460  stringbuffer_aprintf(sb, "\"sample_features\":%d,", (int)roundf(nd_stats->sample_features));
461  stringbuffer_aprintf(sb, "\"not_null_features\":%d,", (int)roundf(nd_stats->not_null_features));
462  stringbuffer_aprintf(sb, "\"histogram_features\":%d,", (int)roundf(nd_stats->histogram_features));
463  stringbuffer_aprintf(sb, "\"histogram_cells\":%d,", (int)roundf(nd_stats->histogram_cells));
464  stringbuffer_aprintf(sb, "\"cells_covered\":%d", (int)roundf(nd_stats->cells_covered));
465  stringbuffer_append(sb, "}");
466 
467  str = stringbuffer_getstringcopy(sb);
469  return str;
470 }
471 
472 
478 // static char*
479 // nd_stats_to_grid(const ND_STATS *stats)
480 // {
481 // char *rv;
482 // int j, k;
483 // int sizex = (int)roundf(stats->size[0]);
484 // int sizey = (int)roundf(stats->size[1]);
485 // stringbuffer_t *sb = stringbuffer_create();
486 //
487 // for ( k = 0; k < sizey; k++ )
488 // {
489 // for ( j = 0; j < sizex; j++ )
490 // {
491 // stringbuffer_aprintf(sb, "%3d ", (int)roundf(stats->value[j + k*sizex]));
492 // }
493 // stringbuffer_append(sb, "\n");
494 // }
495 //
496 // rv = stringbuffer_getstringcopy(sb);
497 // stringbuffer_destroy(sb);
498 // return rv;
499 // }
500 
501 
503 static int
504 nd_box_merge(const ND_BOX *source, ND_BOX *target)
505 {
506  int d;
507  for ( d = 0; d < ND_DIMS; d++ )
508  {
509  target->min[d] = Min(target->min[d], source->min[d]);
510  target->max[d] = Max(target->max[d], source->max[d]);
511  }
512  return TRUE;
513 }
514 
516 static int
518 {
519  memset(a, 0, sizeof(ND_BOX));
520  return TRUE;
521 }
522 
528 static int
530 {
531  int d;
532  for ( d = 0; d < ND_DIMS; d++ )
533  {
534  a->min[d] = FLT_MAX;
535  a->max[d] = -1 * FLT_MAX;
536  }
537  return TRUE;
538 }
539 
541 static void
542 nd_box_from_gbox(const GBOX *gbox, ND_BOX *nd_box)
543 {
544  int d = 0;
545  POSTGIS_DEBUGF(3, " %s", gbox_to_string(gbox));
546 
547  nd_box_init(nd_box);
548  nd_box->min[d] = gbox->xmin;
549  nd_box->max[d] = gbox->xmax;
550  d++;
551  nd_box->min[d] = gbox->ymin;
552  nd_box->max[d] = gbox->ymax;
553  d++;
554  if ( FLAGS_GET_GEODETIC(gbox->flags) )
555  {
556  nd_box->min[d] = gbox->zmin;
557  nd_box->max[d] = gbox->zmax;
558  return;
559  }
560  if ( FLAGS_GET_Z(gbox->flags) )
561  {
562  nd_box->min[d] = gbox->zmin;
563  nd_box->max[d] = gbox->zmax;
564  d++;
565  }
566  if ( FLAGS_GET_M(gbox->flags) )
567  {
568  nd_box->min[d] = gbox->mmin;
569  nd_box->max[d] = gbox->mmax;
570  d++;
571  }
572  return;
573 }
574 
578 static int
579 nd_box_intersects(const ND_BOX *a, const ND_BOX *b, int ndims)
580 {
581  int d;
582  for ( d = 0; d < ndims; d++ )
583  {
584  if ( (a->min[d] > b->max[d]) || (a->max[d] < b->min[d]) )
585  return FALSE;
586  }
587  return TRUE;
588 }
589 
593 static int
594 nd_box_contains(const ND_BOX *a, const ND_BOX *b, int ndims)
595 {
596  int d;
597  for ( d = 0; d < ndims; d++ )
598  {
599  if ( ! ((a->min[d] < b->min[d]) && (a->max[d] > b->max[d])) )
600  return FALSE;
601  }
602  return TRUE;
603 }
604 
609 static int
610 nd_box_expand(ND_BOX *nd_box, double expansion_factor)
611 {
612  int d;
613  double size;
614  for ( d = 0; d < ND_DIMS; d++ )
615  {
616  size = nd_box->max[d] - nd_box->min[d];
617  /* Avoid expanding boxes that are either too wide or too narrow*/
618  if (size < MIN_DIMENSION_WIDTH || size > MAX_DIMENSION_WIDTH)
619  continue;
620  nd_box->min[d] -= size * expansion_factor / 2;
621  nd_box->max[d] += size * expansion_factor / 2;
622  }
623  return TRUE;
624 }
625 
630 static inline int
631 nd_box_overlap(const ND_STATS *nd_stats, const ND_BOX *nd_box, ND_IBOX *nd_ibox)
632 {
633  int d;
634 
635  POSTGIS_DEBUGF(4, " nd_box: %s", nd_box_to_json(nd_box, nd_stats->ndims));
636 
637  /* Initialize ibox */
638  memset(nd_ibox, 0, sizeof(ND_IBOX));
639 
640  /* In each dimension... */
641  for ( d = 0; d < nd_stats->ndims; d++ )
642  {
643  double smin = nd_stats->extent.min[d];
644  double smax = nd_stats->extent.max[d];
645  double width = smax - smin;
646 
647  if (width < MIN_DIMENSION_WIDTH)
648  {
649  nd_ibox->min[d] = nd_ibox->max[d] = nd_stats->extent.min[d];
650  }
651  else
652  {
653  int size = (int)roundf(nd_stats->size[d]);
654 
655  /* ... find cells the box overlaps with in this dimension */
656  nd_ibox->min[d] = floor(size * (nd_box->min[d] - smin) / width);
657  nd_ibox->max[d] = floor(size * (nd_box->max[d] - smin) / width);
658 
659  POSTGIS_DEBUGF(5, " stats: dim %d: min %g: max %g: width %g", d, smin, smax, width);
660  POSTGIS_DEBUGF(5, " overlap: dim %d: (%d, %d)", d, nd_ibox->min[d], nd_ibox->max[d]);
661 
662  /* Push any out-of range values into range */
663  nd_ibox->min[d] = Max(nd_ibox->min[d], 0);
664  nd_ibox->max[d] = Min(nd_ibox->max[d], size - 1);
665  }
666  }
667  return TRUE;
668 }
669 
673 static inline double
674 nd_box_ratio(const ND_BOX *b1, const ND_BOX *b2, int ndims)
675 {
676  int d;
677  bool covered = TRUE;
678  double ivol = 1.0;
679  double vol2 = 1.0;
680  double vol1 = 1.0;
681 
682  for ( d = 0 ; d < ndims; d++ )
683  {
684  if ( b1->max[d] <= b2->min[d] || b1->min[d] >= b2->max[d] )
685  return 0.0; /* Disjoint */
686 
687  if ( b1->min[d] > b2->min[d] || b1->max[d] < b2->max[d] )
688  covered = FALSE;
689  }
690 
691  if ( covered )
692  return 1.0;
693 
694  for ( d = 0; d < ndims; d++ )
695  {
696  double width1 = b1->max[d] - b1->min[d];
697  double width2 = b2->max[d] - b2->min[d];
698  double imin, imax, iwidth;
699 
700  vol1 *= width1;
701  vol2 *= width2;
702 
703  imin = Max(b1->min[d], b2->min[d]);
704  imax = Min(b1->max[d], b2->max[d]);
705  iwidth = imax - imin;
706  iwidth = Max(0.0, iwidth);
707 
708  ivol *= iwidth;
709  }
710 
711  if ( vol2 == 0.0 )
712  return vol2;
713 
714  return ivol / vol2;
715 }
716 
717 /* How many bins shall we use in figuring out the distribution? */
718 #define NUM_BINS 50
719 
735 static int
736 nd_box_array_distribution(const ND_BOX **nd_boxes, int num_boxes, const ND_BOX *extent, int ndims, double *distribution)
737 {
738  int d, i, k, range;
739  int counts[NUM_BINS];
740  double smin, smax; /* Spatial min, spatial max */
741  double swidth; /* Spatial width of dimension */
742 #if POSTGIS_DEBUG_LEVEL >= 3
743  double average, sdev, sdev_ratio;
744 #endif
745  int bmin, bmax; /* Bin min, bin max */
746  const ND_BOX *ndb;
747 
748  /* For each dimension... */
749  for ( d = 0; d < ndims; d++ )
750  {
751  /* Initialize counts for this dimension */
752  memset(counts, 0, sizeof(counts));
753 
754  smin = extent->min[d];
755  smax = extent->max[d];
756  swidth = smax - smin;
757 
758  /* Don't try and calculate distribution of overly narrow */
759  /* or overly wide dimensions. Here we're being pretty geographical, */
760  /* expecting "normal" planar or geographic coordinates. */
761  /* Otherwise we have to "handle" +/- Inf bounded features and */
762  /* the assumptions needed for that are as bad as this hack. */
763  if ( swidth < MIN_DIMENSION_WIDTH || swidth > MAX_DIMENSION_WIDTH )
764  {
765  distribution[d] = 0;
766  continue;
767  }
768 
769  /* Sum up the overlaps of each feature with the dimensional bins */
770  for ( i = 0; i < num_boxes; i++ )
771  {
772  double minoffset, maxoffset;
773 
774  /* Skip null entries */
775  ndb = nd_boxes[i];
776  if ( ! ndb ) continue;
777 
778  /* Where does box fall relative to the working range */
779  minoffset = ndb->min[d] - smin;
780  maxoffset = ndb->max[d] - smin;
781 
782  /* Skip boxes that are outside our working range */
783  if ( minoffset < 0 || minoffset > swidth ||
784  maxoffset < 0 || maxoffset > swidth )
785  {
786  continue;
787  }
788 
789  /* What bins does this range correspond to? */
790  bmin = floor(NUM_BINS * minoffset / swidth);
791  bmax = floor(NUM_BINS * maxoffset / swidth);
792 
793  /* Should only happen when maxoffset==swidth */
794  if (bmax >= NUM_BINS)
795  bmax = NUM_BINS-1;
796 
797  POSTGIS_DEBUGF(4, " dimension %d, feature %d: bin %d to bin %d", d, i, bmin, bmax);
798 
799  /* Increment the counts in all the bins this feature overlaps */
800  for ( k = bmin; k <= bmax; k++ )
801  {
802  counts[k] += 1;
803  }
804 
805  }
806 
807  /* How dispersed is the distribution of features across bins? */
808  range = range_quintile(counts, NUM_BINS);
809 
810 #if POSTGIS_DEBUG_LEVEL >= 3
811  average = avg(counts, NUM_BINS);
812  sdev = stddev(counts, NUM_BINS);
813  sdev_ratio = sdev/average;
814 
815  POSTGIS_DEBUGF(3, " dimension %d: range = %d", d, range);
816  POSTGIS_DEBUGF(3, " dimension %d: average = %.6g", d, average);
817  POSTGIS_DEBUGF(3, " dimension %d: stddev = %.6g", d, sdev);
818  POSTGIS_DEBUGF(3, " dimension %d: stddev_ratio = %.6g", d, sdev_ratio);
819 #endif
820 
821  distribution[d] = range;
822  }
823 
824  return TRUE;
825 }
826 
832 static inline int
833 nd_increment(ND_IBOX *ibox, int ndims, int *counter)
834 {
835  int d = 0;
836 
837  while ( d < ndims )
838  {
839  if ( counter[d] < ibox->max[d] )
840  {
841  counter[d] += 1;
842  break;
843  }
844  counter[d] = ibox->min[d];
845  d++;
846  }
847  /* That's it, cannot increment any more! */
848  if ( d == ndims )
849  return FALSE;
850 
851  /* Increment complete! */
852  return TRUE;
853 }
854 
855 static ND_STATS*
856 pg_nd_stats_from_tuple(HeapTuple stats_tuple, int mode)
857 {
858  int stats_kind = STATISTIC_KIND_ND;
859  int rv;
860  ND_STATS *nd_stats;
861 
862  /* If we're in 2D mode, set the kind appropriately */
863  if ( mode == 2 ) stats_kind = STATISTIC_KIND_2D;
864 
865  /* Then read the geom status histogram from that */
866 
867 #if POSTGIS_PGSQL_VERSION < 100
868  float4 *floatptr;
869  int nvalues;
870 
871  rv = get_attstatsslot(stats_tuple, 0, 0, stats_kind, InvalidOid,
872  NULL, NULL, NULL, &floatptr, &nvalues);
873 
874  if ( ! rv ) {
875  POSTGIS_DEBUGF(2,
876  "no slot of kind %d in stats tuple", stats_kind);
877  return NULL;
878  }
879 
880  /* Clone the stats here so we can release the attstatsslot immediately */
881  nd_stats = palloc(sizeof(float) * nvalues);
882  memcpy(nd_stats, floatptr, sizeof(float) * nvalues);
883 
884  /* Clean up */
885  free_attstatsslot(0, NULL, 0, floatptr, nvalues);
886 #else /* PostgreSQL 10 or higher */
887  AttStatsSlot sslot;
888  rv = get_attstatsslot(&sslot, stats_tuple, stats_kind, InvalidOid,
889  ATTSTATSSLOT_NUMBERS);
890  if ( ! rv ) {
891  POSTGIS_DEBUGF(2,
892  "no slot of kind %d in stats tuple", stats_kind);
893  return NULL;
894  }
895 
896  /* Clone the stats here so we can release the attstatsslot immediately */
897  nd_stats = palloc(sizeof(float4) * sslot.nnumbers);
898  memcpy(nd_stats, sslot.numbers, sizeof(float4) * sslot.nnumbers);
899 
900  free_attstatsslot(&sslot);
901 #endif
902 
903  return nd_stats;
904 }
905 
910 static ND_STATS*
911 pg_get_nd_stats(const Oid table_oid, AttrNumber att_num, int mode, bool only_parent)
912 {
913  HeapTuple stats_tuple = NULL;
914  ND_STATS *nd_stats;
915 
916  /* First pull the stats tuple for the whole tree */
917  if ( ! only_parent )
918  {
919  POSTGIS_DEBUGF(2, "searching whole tree stats for \"%s\"", get_rel_name(table_oid)? get_rel_name(table_oid) : "NULL");
920  stats_tuple = SearchSysCache3(STATRELATTINH, ObjectIdGetDatum(table_oid), Int16GetDatum(att_num), BoolGetDatum(true));
921  if ( stats_tuple )
922  POSTGIS_DEBUGF(2, "found whole tree stats for \"%s\"", get_rel_name(table_oid)? get_rel_name(table_oid) : "NULL");
923  }
924  /* Fall-back to main table stats only, if not found for whole tree or explicitly ignored */
925  if ( only_parent || ! stats_tuple )
926  {
927  POSTGIS_DEBUGF(2, "searching parent table stats for \"%s\"", get_rel_name(table_oid)? get_rel_name(table_oid) : "NULL");
928  stats_tuple = SearchSysCache3(STATRELATTINH, ObjectIdGetDatum(table_oid), Int16GetDatum(att_num), BoolGetDatum(false));
929  if ( stats_tuple )
930  POSTGIS_DEBUGF(2, "found parent table stats for \"%s\"", get_rel_name(table_oid)? get_rel_name(table_oid) : "NULL");
931  }
932  if ( ! stats_tuple )
933  {
934  POSTGIS_DEBUGF(2, "stats for \"%s\" do not exist", get_rel_name(table_oid)? get_rel_name(table_oid) : "NULL");
935  return NULL;
936  }
937 
938  nd_stats = pg_nd_stats_from_tuple(stats_tuple, mode);
939  ReleaseSysCache(stats_tuple);
940  if ( ! nd_stats )
941  {
942  POSTGIS_DEBUGF(2,
943  "histogram for attribute %d of table \"%s\" does not exist?",
944  att_num, get_rel_name(table_oid));
945  }
946 
947  return nd_stats;
948 }
949 
958 static ND_STATS*
959 pg_get_nd_stats_by_name(const Oid table_oid, const text *att_text, int mode, bool only_parent)
960 {
961  const char *att_name = text2cstring(att_text);
962  AttrNumber att_num;
963 
964  /* We know the name? Look up the num */
965  if ( att_text )
966  {
967  /* Get the attribute number */
968  att_num = get_attnum(table_oid, att_name);
969  if ( ! att_num ) {
970  elog(ERROR, "attribute \"%s\" does not exist", att_name);
971  return NULL;
972  }
973  }
974  else
975  {
976  elog(ERROR, "attribute name is null");
977  return NULL;
978  }
979 
980  return pg_get_nd_stats(table_oid, att_num, mode, only_parent);
981 }
982 
996 static float8
998 {
999  int ncells1, ncells2;
1000  int ndims1, ndims2, ndims;
1001  double ntuples_max;
1002  double ntuples_not_null1, ntuples_not_null2;
1003 
1004  ND_BOX extent1, extent2;
1005  ND_IBOX ibox1, ibox2;
1006  int at1[ND_DIMS];
1007  int at2[ND_DIMS];
1008  double min1[ND_DIMS];
1009  double width1[ND_DIMS];
1010  double cellsize1[ND_DIMS];
1011  int size2[ND_DIMS];
1012  double min2[ND_DIMS];
1013  double width2[ND_DIMS];
1014  double cellsize2[ND_DIMS];
1015  int size1[ND_DIMS];
1016  int d;
1017  double val = 0;
1018  float8 selectivity;
1019 
1020  /* Drop out on null inputs */
1021  if ( ! ( s1 && s2 ) )
1022  {
1023  elog(NOTICE, " estimate_join_selectivity called with null inputs");
1024  return FALLBACK_ND_SEL;
1025  }
1026 
1027  /* We need to know how many cells each side has... */
1028  ncells1 = (int)roundf(s1->histogram_cells);
1029  ncells2 = (int)roundf(s2->histogram_cells);
1030 
1031  /* ...so that we can drive the summation loop with the smaller histogram. */
1032  if ( ncells1 > ncells2 )
1033  {
1034  const ND_STATS *stats_tmp = s1;
1035  s1 = s2;
1036  s2 = stats_tmp;
1037  }
1038 
1039  POSTGIS_DEBUGF(3, "s1: %s", nd_stats_to_json(s1));
1040  POSTGIS_DEBUGF(3, "s2: %s", nd_stats_to_json(s2));
1041 
1042  /* Re-read that info after the swap */
1043  ncells1 = (int)roundf(s1->histogram_cells);
1044  ncells2 = (int)roundf(s2->histogram_cells);
1045 
1046  /* Q: What's the largest possible join size these relations can create? */
1047  /* A: The product of the # of non-null rows in each relation. */
1048  ntuples_not_null1 = s1->table_features * (s1->not_null_features / s1->sample_features);
1049  ntuples_not_null2 = s2->table_features * (s2->not_null_features / s2->sample_features);
1050  ntuples_max = ntuples_not_null1 * ntuples_not_null2;
1051 
1052  /* Get the ndims as ints */
1053  ndims1 = (int)roundf(s1->ndims);
1054  ndims2 = (int)roundf(s2->ndims);
1055  ndims = Max(ndims1, ndims2);
1056 
1057  /* Get the extents */
1058  extent1 = s1->extent;
1059  extent2 = s2->extent;
1060 
1061  /* If relation stats do not intersect, join is very very selective. */
1062  if ( ! nd_box_intersects(&extent1, &extent2, ndims) )
1063  {
1064  POSTGIS_DEBUG(3, "relation stats do not intersect, returning 0");
1065  PG_RETURN_FLOAT8(0.0);
1066  }
1067 
1068  /*
1069  * First find the index range of the part of the smaller
1070  * histogram that overlaps the larger one.
1071  */
1072  if ( ! nd_box_overlap(s1, &extent2, &ibox1) )
1073  {
1074  POSTGIS_DEBUG(3, "could not calculate overlap of relations");
1075  PG_RETURN_FLOAT8(FALLBACK_ND_JOINSEL);
1076  }
1077 
1078  /* Initialize counters / constants on s1 */
1079  for ( d = 0; d < ndims1; d++ )
1080  {
1081  at1[d] = ibox1.min[d];
1082  min1[d] = s1->extent.min[d];
1083  width1[d] = s1->extent.max[d] - s1->extent.min[d];
1084  size1[d] = (int)roundf(s1->size[d]);
1085  cellsize1[d] = width1[d] / size1[d];
1086  }
1087 
1088  /* Initialize counters / constants on s2 */
1089  for ( d = 0; d < ndims2; d++ )
1090  {
1091  min2[d] = s2->extent.min[d];
1092  width2[d] = s2->extent.max[d] - s2->extent.min[d];
1093  size2[d] = (int)roundf(s2->size[d]);
1094  cellsize2[d] = width2[d] / size2[d];
1095  }
1096 
1097  /* For each affected cell of s1... */
1098  do
1099  {
1100  double val1;
1101  /* Construct the bounds of this cell */
1102  ND_BOX nd_cell1;
1103  nd_box_init(&nd_cell1);
1104  for ( d = 0; d < ndims1; d++ )
1105  {
1106  nd_cell1.min[d] = min1[d] + (at1[d]+0) * cellsize1[d];
1107  nd_cell1.max[d] = min1[d] + (at1[d]+1) * cellsize1[d];
1108  }
1109 
1110  /* Find the cells of s2 that cell1 overlaps.. */
1111  nd_box_overlap(s2, &nd_cell1, &ibox2);
1112 
1113  /* Initialize counter */
1114  for ( d = 0; d < ndims2; d++ )
1115  {
1116  at2[d] = ibox2.min[d];
1117  }
1118 
1119  POSTGIS_DEBUGF(3, "at1 %d,%d %s", at1[0], at1[1], nd_box_to_json(&nd_cell1, ndims1));
1120 
1121  /* Get the value at this cell */
1122  val1 = s1->value[nd_stats_value_index(s1, at1)];
1123 
1124  /* For each overlapped cell of s2... */
1125  do
1126  {
1127  double ratio2;
1128  double val2;
1129 
1130  /* Construct the bounds of this cell */
1131  ND_BOX nd_cell2;
1132  nd_box_init(&nd_cell2);
1133  for ( d = 0; d < ndims2; d++ )
1134  {
1135  nd_cell2.min[d] = min2[d] + (at2[d]+0) * cellsize2[d];
1136  nd_cell2.max[d] = min2[d] + (at2[d]+1) * cellsize2[d];
1137  }
1138 
1139  POSTGIS_DEBUGF(3, " at2 %d,%d %s", at2[0], at2[1], nd_box_to_json(&nd_cell2, ndims2));
1140 
1141  /* Calculate overlap ratio of the cells */
1142  ratio2 = nd_box_ratio(&nd_cell1, &nd_cell2, Max(ndims1, ndims2));
1143 
1144  /* Multiply the cell counts, scaled by overlap ratio */
1145  val2 = s2->value[nd_stats_value_index(s2, at2)];
1146  POSTGIS_DEBUGF(3, " val1 %.6g val2 %.6g ratio %.6g", val1, val2, ratio2);
1147  val += val1 * (val2 * ratio2);
1148  }
1149  while ( nd_increment(&ibox2, ndims2, at2) );
1150 
1151  }
1152  while( nd_increment(&ibox1, ndims1, at1) );
1153 
1154  POSTGIS_DEBUGF(3, "val of histogram = %g", val);
1155 
1156  /*
1157  * In order to compare our total cell count "val" to the
1158  * ntuples_max, we need to scale val up to reflect a full
1159  * table estimate. So, multiply by ratio of table size to
1160  * sample size.
1161  */
1162  val *= (s1->table_features / s1->sample_features);
1163  val *= (s2->table_features / s2->sample_features);
1164 
1165  POSTGIS_DEBUGF(3, "val scaled to full table size = %g", val);
1166 
1167  /*
1168  * Because the cell counts are over-determined due to
1169  * double counting of features that overlap multiple cells
1170  * (see the compute_gserialized_stats routine)
1171  * we also have to scale our cell count "val" *down*
1172  * to adjust for the double counting.
1173  */
1174 // val /= (s1->cells_covered / s1->histogram_features);
1175 // val /= (s2->cells_covered / s2->histogram_features);
1176 
1177  /*
1178  * Finally, the selectivity is the estimated number of
1179  * rows to be returned divided by the maximum possible
1180  * number of rows that can be returned.
1181  */
1182  selectivity = val / ntuples_max;
1183 
1184  /* Guard against over-estimates and crazy numbers :) */
1185  if ( isnan(selectivity) || ! isfinite(selectivity) || selectivity < 0.0 )
1186  {
1187  selectivity = DEFAULT_ND_JOINSEL;
1188  }
1189  else if ( selectivity > 1.0 )
1190  {
1191  selectivity = 1.0;
1192  }
1193 
1194  return selectivity;
1195 }
1196 
1202 Datum gserialized_gist_joinsel_nd(PG_FUNCTION_ARGS)
1203 {
1204  PG_RETURN_DATUM(DirectFunctionCall5(
1206  PG_GETARG_DATUM(0), PG_GETARG_DATUM(1),
1207  PG_GETARG_DATUM(2), PG_GETARG_DATUM(3),
1208  Int32GetDatum(0) /* ND mode */
1209  ));
1210 }
1211 
1217 Datum gserialized_gist_joinsel_2d(PG_FUNCTION_ARGS)
1218 {
1219  PG_RETURN_DATUM(DirectFunctionCall5(
1221  PG_GETARG_DATUM(0), PG_GETARG_DATUM(1),
1222  PG_GETARG_DATUM(2), PG_GETARG_DATUM(3),
1223  Int32GetDatum(2) /* 2D mode */
1224  ));
1225 }
1226 
1236 Datum gserialized_gist_joinsel(PG_FUNCTION_ARGS)
1237 {
1238  PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
1239  /* Oid operator = PG_GETARG_OID(1); */
1240  List *args = (List *) PG_GETARG_POINTER(2);
1241  JoinType jointype = (JoinType) PG_GETARG_INT16(3);
1242  int mode = PG_GETARG_INT32(4);
1243 
1244  Node *arg1, *arg2;
1245  Var *var1, *var2;
1246  Oid relid1, relid2;
1247 
1248  ND_STATS *stats1, *stats2;
1249  float8 selectivity;
1250 
1251  /* Only respond to an inner join/unknown context join */
1252  if (jointype != JOIN_INNER)
1253  {
1254  elog(DEBUG1, "%s: jointype %d not supported", __func__, jointype);
1255  PG_RETURN_FLOAT8(DEFAULT_ND_JOINSEL);
1256  }
1257 
1258  /* Find Oids of the geometry columns we are working with */
1259  arg1 = (Node*) linitial(args);
1260  arg2 = (Node*) lsecond(args);
1261  var1 = (Var*) arg1;
1262  var2 = (Var*) arg2;
1263 
1264  /* We only do column joins right now, no functional joins */
1265  /* TODO: handle g1 && ST_Expand(g2) */
1266  if (!IsA(arg1, Var) || !IsA(arg2, Var))
1267  {
1268  elog(DEBUG1, "%s called with arguments that are not column references", __func__);
1269  PG_RETURN_FLOAT8(DEFAULT_ND_JOINSEL);
1270  }
1271 
1272  /* What are the Oids of our tables/relations? */
1273  relid1 = rt_fetch(var1->varno, root->parse->rtable)->relid;
1274  relid2 = rt_fetch(var2->varno, root->parse->rtable)->relid;
1275 
1276  POSTGIS_DEBUGF(3, "using relations \"%s\" Oid(%d), \"%s\" Oid(%d)",
1277  get_rel_name(relid1) ? get_rel_name(relid1) : "NULL", relid1, get_rel_name(relid2) ? get_rel_name(relid2) : "NULL", relid2);
1278 
1279  /* Pull the stats from the stats system. */
1280  stats1 = pg_get_nd_stats(relid1, var1->varattno, mode, FALSE);
1281  stats2 = pg_get_nd_stats(relid2, var2->varattno, mode, FALSE);
1282 
1283  /* If we can't get stats, we have to stop here! */
1284  if ( ! stats1 )
1285  {
1286  POSTGIS_DEBUGF(3, "unable to retrieve stats for \"%s\" Oid(%d)", get_rel_name(relid1) ? get_rel_name(relid1) : "NULL" , relid1);
1287  PG_RETURN_FLOAT8(DEFAULT_ND_JOINSEL);
1288  }
1289  else if ( ! stats2 )
1290  {
1291  POSTGIS_DEBUGF(3, "unable to retrieve stats for \"%s\" Oid(%d)", get_rel_name(relid2) ? get_rel_name(relid2) : "NULL", relid2);
1292  PG_RETURN_FLOAT8(DEFAULT_ND_JOINSEL);
1293  }
1294 
1295  selectivity = estimate_join_selectivity(stats1, stats2);
1296  POSTGIS_DEBUGF(2, "got selectivity %g", selectivity);
1297 
1298  pfree(stats1);
1299  pfree(stats2);
1300  PG_RETURN_FLOAT8(selectivity);
1301 }
1302 
1303 
1304 
1305 
1324 static void
1325 compute_gserialized_stats_mode(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
1326  int sample_rows, double total_rows, int mode)
1327 {
1328  MemoryContext old_context;
1329  int d, i; /* Counters */
1330  int notnull_cnt = 0; /* # not null rows in the sample */
1331  int null_cnt = 0; /* # null rows in the sample */
1332  int histogram_features = 0; /* # rows that actually got counted in the histogram */
1333 
1334  ND_STATS *nd_stats; /* Our histogram */
1335  size_t nd_stats_size; /* Size to allocate */
1336 
1337  double total_width = 0; /* # of bytes used by sample */
1338  double total_sample_volume = 0; /* Area/volume coverage of the sample */
1339  double total_cell_count = 0; /* # of cells in histogram affected by sample */
1340 
1341  ND_BOX sum; /* Sum of extents of sample boxes */
1342  ND_BOX avg; /* Avg of extents of sample boxes */
1343  ND_BOX stddev; /* StdDev of extents of sample boxes */
1344 
1345  const ND_BOX **sample_boxes; /* ND_BOXes for each of the sample features */
1346  ND_BOX sample_extent; /* Extent of the raw sample */
1347  int histo_size[ND_DIMS]; /* histogram nrows, ncols, etc */
1348  ND_BOX histo_extent; /* Spatial extent of the histogram */
1349  ND_BOX histo_extent_new; /* Temporary variable */
1350  int histo_cells_target; /* Number of cells we will shoot for, given the stats target */
1351  int histo_cells; /* Number of cells in the histogram */
1352  int histo_cells_new = 1; /* Temporary variable */
1353 
1354  int ndims = 2; /* Dimensionality of the sample */
1355  int histo_ndims = 0; /* Dimensionality of the histogram */
1356  double sample_distribution[ND_DIMS]; /* How homogeneous is distribution of sample in each axis? */
1357  double total_distribution; /* Total of sample_distribution */
1358 
1359  int stats_slot; /* What slot is this data going into? (2D vs ND) */
1360  int stats_kind; /* And this is what? (2D vs ND) */
1361 
1362  /* Initialize sum and stddev */
1363  nd_box_init(&sum);
1364  nd_box_init(&stddev);
1365  nd_box_init(&avg);
1366  nd_box_init(&histo_extent);
1367  nd_box_init(&histo_extent_new);
1368 
1369  /*
1370  * This is where gserialized_analyze_nd
1371  * should put its' custom parameters.
1372  */
1373  /* void *mystats = stats->extra_data; */
1374 
1375  POSTGIS_DEBUG(2, "compute_gserialized_stats called");
1376  POSTGIS_DEBUGF(3, " # sample_rows: %d", sample_rows);
1377  POSTGIS_DEBUGF(3, " estimate of total_rows: %.6g", total_rows);
1378 
1379  /*
1380  * We might need less space, but don't think
1381  * its worth saving...
1382  */
1383  sample_boxes = palloc(sizeof(ND_BOX*) * sample_rows);
1384 
1385  /*
1386  * First scan:
1387  * o read boxes
1388  * o find dimensionality of the sample
1389  * o find extent of the sample
1390  * o count null-infinite/not-null values
1391  * o compute total_width
1392  * o compute total features's box area (for avgFeatureArea)
1393  * o sum features box coordinates (for standard deviation)
1394  */
1395  for ( i = 0; i < sample_rows; i++ )
1396  {
1397  Datum datum;
1398  GSERIALIZED *geom;
1399  GBOX gbox;
1400  ND_BOX *nd_box;
1401  bool is_null;
1402  bool is_copy;
1403 
1404  datum = fetchfunc(stats, i, &is_null);
1405 
1406  /* Skip all NULLs. */
1407  if ( is_null )
1408  {
1409  POSTGIS_DEBUGF(4, " skipped null geometry %d", i);
1410  null_cnt++;
1411  continue;
1412  }
1413 
1414  /* Read the bounds from the gserialized. */
1415  geom = (GSERIALIZED *)PG_DETOAST_DATUM(datum);
1416  is_copy = VARATT_IS_EXTENDED(datum);
1417  if ( LW_FAILURE == gserialized_get_gbox_p(geom, &gbox) )
1418  {
1419  /* Skip empties too. */
1420  POSTGIS_DEBUGF(3, " skipped empty geometry %d", i);
1421  continue;
1422  }
1423 
1424  /* If we're in 2D mode, zero out the higher dimensions for "safety" */
1425  if ( mode == 2 )
1426  gbox.zmin = gbox.zmax = gbox.mmin = gbox.mmax = 0.0;
1427 
1428  /* Check bounds for validity (finite and not NaN) */
1429  if ( ! gbox_is_valid(&gbox) )
1430  {
1431  POSTGIS_DEBUGF(3, " skipped infinite/nan geometry %d", i);
1432  continue;
1433  }
1434 
1435  /*
1436  * In N-D mode, set the ndims to the maximum dimensionality found
1437  * in the sample. Otherwise, leave at ndims == 2.
1438  */
1439  if ( mode != 2 )
1440  ndims = Max(gbox_ndims(&gbox), ndims);
1441 
1442  /* Convert gbox to n-d box */
1443  nd_box = palloc(sizeof(ND_BOX));
1444  nd_box_from_gbox(&gbox, nd_box);
1445 
1446  /* Cache n-d bounding box */
1447  sample_boxes[notnull_cnt] = nd_box;
1448 
1449  /* Initialize sample extent before merging first entry */
1450  if ( ! notnull_cnt )
1451  nd_box_init_bounds(&sample_extent);
1452 
1453  /* Add current sample to overall sample extent */
1454  nd_box_merge(nd_box, &sample_extent);
1455 
1456  /* How many bytes does this sample use? */
1457  total_width += VARSIZE(geom);
1458 
1459  /* Add bounds coordinates to sums for stddev calculation */
1460  for ( d = 0; d < ndims; d++ )
1461  {
1462  sum.min[d] += nd_box->min[d];
1463  sum.max[d] += nd_box->max[d];
1464  }
1465 
1466  /* Increment our "good feature" count */
1467  notnull_cnt++;
1468 
1469  /* Free up memory if our sample geometry was copied */
1470  if ( is_copy )
1471  pfree(geom);
1472 
1473  /* Give backend a chance of interrupting us */
1474  vacuum_delay_point();
1475  }
1476 
1477  /*
1478  * We'll build a histogram having stats->attr->attstattarget cells
1479  * on each side, within reason... we'll use ndims*10000 as the
1480  * maximum number of cells.
1481  * Also, if we're sampling a relatively small table, we'll try to ensure that
1482  * we have an average of 5 features for each cell so the histogram isn't
1483  * so sparse.
1484  */
1485  histo_cells_target = (int)pow((double)(stats->attr->attstattarget), (double)ndims);
1486  histo_cells_target = Min(histo_cells_target, ndims * 10000);
1487  histo_cells_target = Min(histo_cells_target, (int)(total_rows/5));
1488  POSTGIS_DEBUGF(3, " stats->attr->attstattarget: %d", stats->attr->attstattarget);
1489  POSTGIS_DEBUGF(3, " target # of histogram cells: %d", histo_cells_target);
1490 
1491  /* If there's no useful features, we can't work out stats */
1492  if ( ! notnull_cnt )
1493  {
1494  elog(NOTICE, "no non-null/empty features, unable to compute statistics");
1495  stats->stats_valid = false;
1496  return;
1497  }
1498 
1499  POSTGIS_DEBUGF(3, " sample_extent: %s", nd_box_to_json(&sample_extent, ndims));
1500 
1501  /*
1502  * Second scan:
1503  * o compute standard deviation
1504  */
1505  for ( d = 0; d < ndims; d++ )
1506  {
1507  /* Calculate average bounds values */
1508  avg.min[d] = sum.min[d] / notnull_cnt;
1509  avg.max[d] = sum.max[d] / notnull_cnt;
1510 
1511  /* Calculate standard deviation for this dimension bounds */
1512  for ( i = 0; i < notnull_cnt; i++ )
1513  {
1514  const ND_BOX *ndb = sample_boxes[i];
1515  stddev.min[d] += (ndb->min[d] - avg.min[d]) * (ndb->min[d] - avg.min[d]);
1516  stddev.max[d] += (ndb->max[d] - avg.max[d]) * (ndb->max[d] - avg.max[d]);
1517  }
1518  stddev.min[d] = sqrt(stddev.min[d] / notnull_cnt);
1519  stddev.max[d] = sqrt(stddev.max[d] / notnull_cnt);
1520 
1521  /* Histogram bounds for this dimension bounds is avg +/- SDFACTOR * stdev */
1522  histo_extent.min[d] = Max(avg.min[d] - SDFACTOR * stddev.min[d], sample_extent.min[d]);
1523  histo_extent.max[d] = Min(avg.max[d] + SDFACTOR * stddev.max[d], sample_extent.max[d]);
1524  }
1525 
1526  /*
1527  * Third scan:
1528  * o skip hard deviants
1529  * o compute new histogram box
1530  */
1531  nd_box_init_bounds(&histo_extent_new);
1532  for ( i = 0; i < notnull_cnt; i++ )
1533  {
1534  const ND_BOX *ndb = sample_boxes[i];
1535  /* Skip any hard deviants (boxes entirely outside our histo_extent */
1536  if ( ! nd_box_intersects(&histo_extent, ndb, ndims) )
1537  {
1538  POSTGIS_DEBUGF(4, " feature %d is a hard deviant, skipped", i);
1539  sample_boxes[i] = NULL;
1540  continue;
1541  }
1542  /* Expand our new box to fit all the other features. */
1543  nd_box_merge(ndb, &histo_extent_new);
1544  }
1545  /*
1546  * Expand the box slightly (1%) to avoid edge effects
1547  * with objects that are on the boundary
1548  */
1549  nd_box_expand(&histo_extent_new, 0.01);
1550  histo_extent = histo_extent_new;
1551 
1552  /*
1553  * How should we allocate our histogram cells to the
1554  * different dimensions? We can't do it by raw dimensional width,
1555  * because in x/y/z space, the z can have different units
1556  * from the x/y. Similarly for x/y/t space.
1557  * So, we instead calculate how much features overlap
1558  * each other in their dimension to figure out which
1559  * dimensions have useful selectivity characteristics (more
1560  * variability in density) and therefor would find
1561  * more cells useful (to distinguish between dense places and
1562  * homogeneous places).
1563  */
1564  nd_box_array_distribution(sample_boxes, notnull_cnt, &histo_extent, ndims,
1565  sample_distribution);
1566 
1567  /*
1568  * The sample_distribution array now tells us how spread out the
1569  * data is in each dimension, so we use that data to allocate
1570  * the histogram cells we have available.
1571  * At this point, histo_cells_target is the approximate target number
1572  * of cells.
1573  */
1574 
1575  /*
1576  * Some dimensions have basically a uniform distribution, we want
1577  * to allocate no cells to those dimensions, only to dimensions
1578  * that have some interesting differences in data distribution.
1579  * Here we count up the number of interesting dimensions
1580  */
1581  for ( d = 0; d < ndims; d++ )
1582  {
1583  if ( sample_distribution[d] > 0 )
1584  histo_ndims++;
1585  }
1586 
1587  if ( histo_ndims == 0 )
1588  {
1589  /* Special case: all our dimensions had low variability! */
1590  /* We just divide the cells up evenly */
1591  POSTGIS_DEBUG(3, " special case: no axes have variability");
1592  histo_cells_new = 1;
1593  for ( d = 0; d < ndims; d++ )
1594  {
1595  histo_size[d] = 1 + (int)pow((double)histo_cells_target, 1/(double)ndims);
1596  POSTGIS_DEBUGF(3, " histo_size[d]: %d", histo_size[d]);
1597  histo_cells_new *= histo_size[d];
1598  }
1599  POSTGIS_DEBUGF(3, " histo_cells_new: %d", histo_cells_new);
1600  }
1601  else
1602  {
1603  /*
1604  * We're going to express the amount of variability in each dimension
1605  * as a proportion of the total variability and allocate cells in that
1606  * dimension relative to that proportion.
1607  */
1608  POSTGIS_DEBUG(3, " allocating histogram axes based on axis variability");
1609  total_distribution = total_double(sample_distribution, ndims); /* First get the total */
1610  POSTGIS_DEBUGF(3, " total_distribution: %.8g", total_distribution);
1611  histo_cells_new = 1; /* For the number of cells in the final histogram */
1612  for ( d = 0; d < ndims; d++ )
1613  {
1614  if ( sample_distribution[d] == 0 ) /* Uninteresting dimensions don't get any room */
1615  {
1616  histo_size[d] = 1;
1617  }
1618  else /* Interesting dimension */
1619  {
1620  /* How does this dims variability compare to the total? */
1621  float edge_ratio = (float)sample_distribution[d] / (float)total_distribution;
1622  /*
1623  * Scale the target cells number by the # of dims and ratio,
1624  * then take the appropriate root to get the estimated number of cells
1625  * on this axis (eg, pow(0.5) for 2d, pow(0.333) for 3d, pow(0.25) for 4d)
1626  */
1627  histo_size[d] = (int)pow(histo_cells_target * histo_ndims * edge_ratio, 1/(double)histo_ndims);
1628  /* If something goes awry, just give this dim one slot */
1629  if ( ! histo_size[d] )
1630  histo_size[d] = 1;
1631  }
1632  histo_cells_new *= histo_size[d];
1633  }
1634  POSTGIS_DEBUGF(3, " histo_cells_new: %d", histo_cells_new);
1635  }
1636 
1637  /* Update histo_cells to the actual number of cells we need to allocate */
1638  histo_cells = histo_cells_new;
1639  POSTGIS_DEBUGF(3, " histo_cells: %d", histo_cells);
1640 
1641  /*
1642  * Create the histogram (ND_STATS) in the stats memory context
1643  */
1644  old_context = MemoryContextSwitchTo(stats->anl_context);
1645  nd_stats_size = sizeof(ND_STATS) + ((histo_cells - 1) * sizeof(float4));
1646  nd_stats = palloc(nd_stats_size);
1647  memset(nd_stats, 0, nd_stats_size); /* Initialize all values to 0 */
1648  MemoryContextSwitchTo(old_context);
1649 
1650  /* Initialize the #ND_STATS objects */
1651  nd_stats->ndims = ndims;
1652  nd_stats->extent = histo_extent;
1653  nd_stats->sample_features = sample_rows;
1654  nd_stats->table_features = total_rows;
1655  nd_stats->not_null_features = notnull_cnt;
1656  /* Copy in the histogram dimensions */
1657  for ( d = 0; d < ndims; d++ )
1658  nd_stats->size[d] = histo_size[d];
1659 
1660  /*
1661  * Fourth scan:
1662  * o fill histogram values with the proportion of
1663  * features' bbox overlaps: a feature's bvol
1664  * can fully overlap (1) or partially overlap
1665  * (fraction of 1) an histogram cell.
1666  *
1667  * Note that we are filling each cell with the "portion of
1668  * the feature's box that overlaps the cell". So, if we sum
1669  * up the values in the histogram, we could get the
1670  * histogram feature count.
1671  *
1672  */
1673  for ( i = 0; i < notnull_cnt; i++ )
1674  {
1675  const ND_BOX *nd_box;
1676  ND_IBOX nd_ibox;
1677  int at[ND_DIMS];
1678  int d;
1679  double num_cells = 0;
1680  double tmp_volume = 1.0;
1681  double min[ND_DIMS];
1682  double max[ND_DIMS];
1683  double cellsize[ND_DIMS];
1684 
1685  nd_box = sample_boxes[i];
1686  if ( ! nd_box ) continue; /* Skip Null'ed out hard deviants */
1687 
1688  /* Give backend a chance of interrupting us */
1689  vacuum_delay_point();
1690 
1691  /* Find the cells that overlap with this box and put them into the ND_IBOX */
1692  nd_box_overlap(nd_stats, nd_box, &nd_ibox);
1693  memset(at, 0, sizeof(int)*ND_DIMS);
1694 
1695  POSTGIS_DEBUGF(3, " feature %d: ibox (%d, %d, %d, %d) (%d, %d, %d, %d)", i,
1696  nd_ibox.min[0], nd_ibox.min[1], nd_ibox.min[2], nd_ibox.min[3],
1697  nd_ibox.max[0], nd_ibox.max[1], nd_ibox.max[2], nd_ibox.max[3]);
1698 
1699  for ( d = 0; d < nd_stats->ndims; d++ )
1700  {
1701  /* Initialize the starting values */
1702  at[d] = nd_ibox.min[d];
1703  min[d] = nd_stats->extent.min[d];
1704  max[d] = nd_stats->extent.max[d];
1705  cellsize[d] = (max[d] - min[d])/(nd_stats->size[d]);
1706 
1707  /* What's the volume (area) of this feature's box? */
1708  tmp_volume *= (nd_box->max[d] - nd_box->min[d]);
1709  }
1710 
1711  /* Add feature volume (area) to our total */
1712  total_sample_volume += tmp_volume;
1713 
1714  /*
1715  * Move through all the overlaped histogram cells values and
1716  * add the box overlap proportion to them.
1717  */
1718  do
1719  {
1720  ND_BOX nd_cell;
1721  double ratio;
1722  /* Create a box for this histogram cell */
1723  for ( d = 0; d < nd_stats->ndims; d++ )
1724  {
1725  nd_cell.min[d] = min[d] + (at[d]+0) * cellsize[d];
1726  nd_cell.max[d] = min[d] + (at[d]+1) * cellsize[d];
1727  }
1728 
1729  /*
1730  * If a feature box is completely inside one cell the ratio will be
1731  * 1.0. If a feature box is 50% in two cells, each cell will get
1732  * 0.5 added on.
1733  */
1734  ratio = nd_box_ratio(&nd_cell, nd_box, nd_stats->ndims);
1735  nd_stats->value[nd_stats_value_index(nd_stats, at)] += ratio;
1736  num_cells += ratio;
1737  POSTGIS_DEBUGF(3, " ratio (%.8g) num_cells (%.8g)", ratio, num_cells);
1738  POSTGIS_DEBUGF(3, " at (%d, %d, %d, %d)", at[0], at[1], at[2], at[3]);
1739  }
1740  while ( nd_increment(&nd_ibox, nd_stats->ndims, at) );
1741 
1742  /* Keep track of overall number of overlaps counted */
1743  total_cell_count += num_cells;
1744  /* How many features have we added to this histogram? */
1745  histogram_features++;
1746  }
1747 
1748  POSTGIS_DEBUGF(3, " histogram_features: %d", histogram_features);
1749  POSTGIS_DEBUGF(3, " sample_rows: %d", sample_rows);
1750  POSTGIS_DEBUGF(3, " table_rows: %.6g", total_rows);
1751 
1752  /* Error out if we got no sample information */
1753  if ( ! histogram_features )
1754  {
1755  POSTGIS_DEBUG(3, " no stats have been gathered");
1756  elog(NOTICE, " no features lie in the stats histogram, invalid stats");
1757  stats->stats_valid = false;
1758  return;
1759  }
1760 
1761  nd_stats->histogram_features = histogram_features;
1762  nd_stats->histogram_cells = histo_cells;
1763  nd_stats->cells_covered = total_cell_count;
1764 
1765  /* Put this histogram data into the right slot/kind */
1766  if ( mode == 2 )
1767  {
1768  stats_slot = STATISTIC_SLOT_2D;
1769  stats_kind = STATISTIC_KIND_2D;
1770  }
1771  else
1772  {
1773  stats_slot = STATISTIC_SLOT_ND;
1774  stats_kind = STATISTIC_KIND_ND;
1775  }
1776 
1777  /* Write the statistics data */
1778  stats->stakind[stats_slot] = stats_kind;
1779  stats->staop[stats_slot] = InvalidOid;
1780  stats->stanumbers[stats_slot] = (float4*)nd_stats;
1781  stats->numnumbers[stats_slot] = nd_stats_size/sizeof(float4);
1782  stats->stanullfrac = (float4)null_cnt/sample_rows;
1783  stats->stawidth = total_width/notnull_cnt;
1784  stats->stadistinct = -1.0;
1785  stats->stats_valid = true;
1786 
1787  POSTGIS_DEBUGF(3, " out: slot 0: kind %d (STATISTIC_KIND_ND)", stats->stakind[0]);
1788  POSTGIS_DEBUGF(3, " out: slot 0: op %d (InvalidOid)", stats->staop[0]);
1789  POSTGIS_DEBUGF(3, " out: slot 0: numnumbers %d", stats->numnumbers[0]);
1790  POSTGIS_DEBUGF(3, " out: null fraction: %f=%d/%d", stats->stanullfrac, null_cnt, sample_rows);
1791  POSTGIS_DEBUGF(3, " out: average width: %d bytes", stats->stawidth);
1792  POSTGIS_DEBUG (3, " out: distinct values: all (no check done)");
1793  POSTGIS_DEBUGF(3, " out: %s", nd_stats_to_json(nd_stats));
1794  /*
1795  POSTGIS_DEBUGF(3, " out histogram:\n%s", nd_stats_to_grid(nd_stats));
1796  */
1797 
1798  return;
1799 }
1800 
1801 
1819 static void
1820 compute_gserialized_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
1821  int sample_rows, double total_rows)
1822 {
1823  /* 2D Mode */
1824  compute_gserialized_stats_mode(stats, fetchfunc, sample_rows, total_rows, 2);
1825  /* ND Mode */
1826  compute_gserialized_stats_mode(stats, fetchfunc, sample_rows, total_rows, 0);
1827 }
1828 
1829 
1858 Datum gserialized_analyze_nd(PG_FUNCTION_ARGS)
1859 {
1860  VacAttrStats *stats = (VacAttrStats *)PG_GETARG_POINTER(0);
1861  Form_pg_attribute attr = stats->attr;
1862 
1863  POSTGIS_DEBUG(2, "gserialized_analyze_nd called");
1864 
1865  /* If the attstattarget column is negative, use the default value */
1866  /* NB: it is okay to scribble on stats->attr since it's a copy */
1867  if (attr->attstattarget < 0)
1868  attr->attstattarget = default_statistics_target;
1869 
1870  POSTGIS_DEBUGF(3, " attribute stat target: %d", attr->attstattarget);
1871 
1872  /* Setup the minimum rows and the algorithm function.
1873  * 300 matches the default value set in
1874  * postgresql/src/backend/commands/analyze.c */
1875  stats->minrows = 300 * stats->attr->attstattarget;
1876  stats->compute_stats = compute_gserialized_stats;
1877 
1878  POSTGIS_DEBUGF(3, " minrows: %d", stats->minrows);
1879 
1880  /* Indicate we are done successfully */
1881  PG_RETURN_BOOL(true);
1882 }
1883 
1896 static float8
1897 estimate_selectivity(const GBOX *box, const ND_STATS *nd_stats, int mode)
1898 {
1899  int d; /* counter */
1900  float8 selectivity;
1901  ND_BOX nd_box;
1902  ND_IBOX nd_ibox;
1903  int at[ND_DIMS];
1904  double cell_size[ND_DIMS];
1905  double min[ND_DIMS];
1906  double max[ND_DIMS];
1907  double total_count = 0.0;
1908  int ndims_max = Max(nd_stats->ndims, gbox_ndims(box));
1909 // int ndims_min = Min(nd_stats->ndims, gbox_ndims(box));
1910 
1911  /* Calculate the overlap of the box on the histogram */
1912  if ( ! nd_stats )
1913  {
1914  elog(NOTICE, " estimate_selectivity called with null input");
1915  return FALLBACK_ND_SEL;
1916  }
1917 
1918  /* Initialize nd_box. */
1919  nd_box_from_gbox(box, &nd_box);
1920 
1921  /*
1922  * To return 2D stats on an ND sample, we need to make the
1923  * 2D box cover the full range of the other dimensions in the
1924  * histogram.
1925  */
1926  POSTGIS_DEBUGF(3, " mode: %d", mode);
1927  if ( mode == 2 )
1928  {
1929  POSTGIS_DEBUG(3, " in 2d mode, stripping the computation down to 2d");
1930  ndims_max = 2;
1931  }
1932 
1933  POSTGIS_DEBUGF(3, " nd_stats->extent: %s", nd_box_to_json(&(nd_stats->extent), nd_stats->ndims));
1934  POSTGIS_DEBUGF(3, " nd_box: %s", nd_box_to_json(&(nd_box), gbox_ndims(box)));
1935 
1936  /*
1937  * Search box completely misses histogram extent?
1938  * We have to intersect in all N dimensions or else we have
1939  * zero interaction under the &&& operator. It's important
1940  * to short circuit in this case, as some of the tests below
1941  * will return junk results when run on non-intersecting inputs.
1942  */
1943  if ( ! nd_box_intersects(&nd_box, &(nd_stats->extent), ndims_max) )
1944  {
1945  POSTGIS_DEBUG(3, " search box does not overlap histogram, returning 0");
1946  return 0.0;
1947  }
1948 
1949  /* Search box completely contains histogram extent! */
1950  if ( nd_box_contains(&nd_box, &(nd_stats->extent), ndims_max) )
1951  {
1952  POSTGIS_DEBUG(3, " search box contains histogram, returning 1");
1953  return 1.0;
1954  }
1955 
1956  /* Calculate the overlap of the box on the histogram */
1957  if ( ! nd_box_overlap(nd_stats, &nd_box, &nd_ibox) )
1958  {
1959  POSTGIS_DEBUG(3, " search box overlap with stats histogram failed");
1960  return FALLBACK_ND_SEL;
1961  }
1962 
1963  /* Work out some measurements of the histogram */
1964  for ( d = 0; d < nd_stats->ndims; d++ )
1965  {
1966  /* Cell size in each dim */
1967  min[d] = nd_stats->extent.min[d];
1968  max[d] = nd_stats->extent.max[d];
1969  cell_size[d] = (max[d] - min[d]) / nd_stats->size[d];
1970  POSTGIS_DEBUGF(3, " cell_size[%d] : %.9g", d, cell_size[d]);
1971 
1972  /* Initialize the counter */
1973  at[d] = nd_ibox.min[d];
1974  }
1975 
1976  /* Move through all the overlap values and sum them */
1977  do
1978  {
1979  float cell_count, ratio;
1980  ND_BOX nd_cell;
1981 
1982  /* We have to pro-rate partially overlapped cells. */
1983  for ( d = 0; d < nd_stats->ndims; d++ )
1984  {
1985  nd_cell.min[d] = min[d] + (at[d]+0) * cell_size[d];
1986  nd_cell.max[d] = min[d] + (at[d]+1) * cell_size[d];
1987  }
1988 
1989  ratio = nd_box_ratio(&nd_box, &nd_cell, nd_stats->ndims);
1990  cell_count = nd_stats->value[nd_stats_value_index(nd_stats, at)];
1991 
1992  /* Add the pro-rated count for this cell to the overall total */
1993  total_count += cell_count * ratio;
1994  POSTGIS_DEBUGF(4, " cell (%d,%d), cell value %.6f, ratio %.6f", at[0], at[1], cell_count, ratio);
1995  }
1996  while ( nd_increment(&nd_ibox, nd_stats->ndims, at) );
1997 
1998  /* Scale by the number of features in our histogram to get the proportion */
1999  selectivity = total_count / nd_stats->histogram_features;
2000 
2001  POSTGIS_DEBUGF(3, " nd_stats->histogram_features = %f", nd_stats->histogram_features);
2002  POSTGIS_DEBUGF(3, " nd_stats->histogram_cells = %f", nd_stats->histogram_cells);
2003  POSTGIS_DEBUGF(3, " sum(overlapped histogram cells) = %f", total_count);
2004  POSTGIS_DEBUGF(3, " selectivity = %f", selectivity);
2005 
2006  /* Prevent rounding overflows */
2007  if (selectivity > 1.0) selectivity = 1.0;
2008  else if (selectivity < 0.0) selectivity = 0.0;
2009 
2010  return selectivity;
2011 }
2012 
2013 
2014 
2020 Datum _postgis_gserialized_stats(PG_FUNCTION_ARGS)
2021 {
2022  Oid table_oid = PG_GETARG_OID(0);
2023  text *att_text = PG_GETARG_TEXT_P(1);
2024  ND_STATS *nd_stats;
2025  char *str;
2026  text *json;
2027  int mode = 2; /* default to 2D mode */
2028  bool only_parent = FALSE; /* default to whole tree stats */
2029 
2030  /* Check if we've been asked to not use 2d mode */
2031  if ( ! PG_ARGISNULL(2) )
2032  mode = text_p_get_mode(PG_GETARG_TEXT_P(2));
2033 
2034  /* Retrieve the stats object */
2035  nd_stats = pg_get_nd_stats_by_name(table_oid, att_text, mode, only_parent);
2036  if ( ! nd_stats )
2037  elog(ERROR, "stats for \"%s.%s\" do not exist", get_rel_name(table_oid), text2cstring(att_text));
2038 
2039  /* Convert to JSON */
2040  str = nd_stats_to_json(nd_stats);
2041  json = cstring2text(str);
2042  pfree(str);
2043  pfree(nd_stats);
2044  PG_RETURN_TEXT_P(json);
2045 }
2046 
2047 
2053 Datum _postgis_gserialized_sel(PG_FUNCTION_ARGS)
2054 {
2055  Oid table_oid = PG_GETARG_OID(0);
2056  text *att_text = PG_GETARG_TEXT_P(1);
2057  Datum geom_datum = PG_GETARG_DATUM(2);
2058  GBOX gbox; /* search box read from gserialized datum */
2059  float8 selectivity = 0;
2060  ND_STATS *nd_stats;
2061  int mode = 2; /* 2D mode by default */
2062 
2063  /* Check if we've been asked to not use 2d mode */
2064  if ( ! PG_ARGISNULL(3) )
2065  mode = text_p_get_mode(PG_GETARG_TEXT_P(3));
2066 
2067  /* Retrieve the stats object */
2068  nd_stats = pg_get_nd_stats_by_name(table_oid, att_text, mode, FALSE);
2069 
2070  if ( ! nd_stats )
2071  elog(ERROR, "stats for \"%s.%s\" do not exist", get_rel_name(table_oid), text2cstring(att_text));
2072 
2073  /* Calculate the gbox */
2074  if ( ! gserialized_datum_get_gbox_p(geom_datum, &gbox) )
2075  elog(ERROR, "unable to calculate bounding box from geometry");
2076 
2077  POSTGIS_DEBUGF(3, " %s", gbox_to_string(&gbox));
2078 
2079  /* Do the estimation */
2080  selectivity = estimate_selectivity(&gbox, nd_stats, mode);
2081 
2082  pfree(nd_stats);
2083  PG_RETURN_FLOAT8(selectivity);
2084 }
2085 
2086 
2092 Datum _postgis_gserialized_joinsel(PG_FUNCTION_ARGS)
2093 {
2094  Oid table_oid1 = PG_GETARG_OID(0);
2095  text *att_text1 = PG_GETARG_TEXT_P(1);
2096  Oid table_oid2 = PG_GETARG_OID(2);
2097  text *att_text2 = PG_GETARG_TEXT_P(3);
2098  ND_STATS *nd_stats1, *nd_stats2;
2099  float8 selectivity = 0;
2100  int mode = 2; /* 2D mode by default */
2101 
2102 
2103  /* Retrieve the stats object */
2104  nd_stats1 = pg_get_nd_stats_by_name(table_oid1, att_text1, mode, FALSE);
2105  nd_stats2 = pg_get_nd_stats_by_name(table_oid2, att_text2, mode, FALSE);
2106 
2107  if ( ! nd_stats1 )
2108  elog(ERROR, "stats for \"%s.%s\" do not exist", get_rel_name(table_oid1), text2cstring(att_text1));
2109 
2110  if ( ! nd_stats2 )
2111  elog(ERROR, "stats for \"%s.%s\" do not exist", get_rel_name(table_oid2), text2cstring(att_text2));
2112 
2113  /* Check if we've been asked to not use 2d mode */
2114  if ( ! PG_ARGISNULL(4) )
2115  {
2116  text *modetxt = PG_GETARG_TEXT_P(4);
2117  char *modestr = text2cstring(modetxt);
2118  if ( modestr[0] == 'N' )
2119  mode = 0;
2120  }
2121 
2122  /* Do the estimation */
2123  selectivity = estimate_join_selectivity(nd_stats1, nd_stats2);
2124 
2125  pfree(nd_stats1);
2126  pfree(nd_stats2);
2127  PG_RETURN_FLOAT8(selectivity);
2128 }
2129 
2135 Datum gserialized_gist_sel_2d(PG_FUNCTION_ARGS)
2136 {
2137  PG_RETURN_DATUM(DirectFunctionCall5(
2139  PG_GETARG_DATUM(0), PG_GETARG_DATUM(1),
2140  PG_GETARG_DATUM(2), PG_GETARG_DATUM(3),
2141  Int32GetDatum(2) /* 2-D mode */
2142  ));
2143 }
2144 
2150 Datum gserialized_gist_sel_nd(PG_FUNCTION_ARGS)
2151 {
2152  PG_RETURN_DATUM(DirectFunctionCall5(
2154  PG_GETARG_DATUM(0), PG_GETARG_DATUM(1),
2155  PG_GETARG_DATUM(2), PG_GETARG_DATUM(3),
2156  Int32GetDatum(0) /* N-D mode */
2157  ));
2158 }
2159 
2174 Datum gserialized_gist_sel(PG_FUNCTION_ARGS)
2175 {
2176  PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
2177  /* Oid operator_oid = PG_GETARG_OID(1); */
2178  List *args = (List *) PG_GETARG_POINTER(2);
2179  /* int varRelid = PG_GETARG_INT32(3); */
2180  int mode = PG_GETARG_INT32(4);
2181 
2182  VariableStatData vardata;
2183  ND_STATS *nd_stats = NULL;
2184 
2185  Node *other;
2186  Var *self;
2187  GBOX search_box;
2188  float8 selectivity = 0;
2189 
2190  POSTGIS_DEBUG(2, "gserialized_gist_sel called");
2191 
2192  /*
2193  * TODO: This is a big one,
2194  * All this statistics code *only* tries to generate a valid
2195  * selectivity for && and &&&. That leaves all the other
2196  * geometry operators with bad stats! The selectivity
2197  * calculation should take account of the incoming operator
2198  * type and do the right thing.
2199  */
2200 
2201  /* Fail if not a binary opclause (probably shouldn't happen) */
2202  if (list_length(args) != 2)
2203  {
2204  POSTGIS_DEBUG(3, "gserialized_gist_sel: not a binary opclause");
2205  PG_RETURN_FLOAT8(DEFAULT_ND_SEL);
2206  }
2207 
2208  /* Find the constant part */
2209  other = (Node *) linitial(args);
2210  if ( ! IsA(other, Const) )
2211  {
2212  self = (Var *)other;
2213  other = (Node *) lsecond(args);
2214  }
2215  else
2216  {
2217  self = (Var *) lsecond(args);
2218  }
2219 
2220  if ( ! IsA(other, Const) )
2221  {
2222  POSTGIS_DEBUG(3, " no constant arguments - returning a default selectivity");
2223  PG_RETURN_FLOAT8(DEFAULT_ND_SEL);
2224  }
2225 
2226  /* Convert the constant to a BOX */
2227  if( ! gserialized_datum_get_gbox_p(((Const*)other)->constvalue, &search_box) )
2228  {
2229  POSTGIS_DEBUG(3, "search box is EMPTY");
2230  PG_RETURN_FLOAT8(0.0);
2231  }
2232  POSTGIS_DEBUGF(4, " requested search box is: %s", gbox_to_string(&search_box));
2233 
2234  /* Get pg_statistic row */
2235  examine_variable(root, (Node*)self, 0, &vardata);
2236  if ( vardata.statsTuple ) {
2237  nd_stats = pg_nd_stats_from_tuple(vardata.statsTuple, mode);
2238  }
2239  ReleaseVariableStats(vardata);
2240 
2241  if ( ! nd_stats )
2242  {
2243  POSTGIS_DEBUG(3, " unable to load stats from syscache, not analyzed yet?");
2244  PG_RETURN_FLOAT8(FALLBACK_ND_SEL);
2245  }
2246 
2247  POSTGIS_DEBUGF(4, " got stats:\n%s", nd_stats_to_json(nd_stats));
2248 
2249  /* Do the estimation! */
2250  selectivity = estimate_selectivity(&search_box, nd_stats, mode);
2251  POSTGIS_DEBUGF(3, " returning computed value: %f", selectivity);
2252 
2253  pfree(nd_stats);
2254  PG_RETURN_FLOAT8(selectivity);
2255 }
2256 
2257 
2258 
2265 Datum gserialized_estimated_extent(PG_FUNCTION_ARGS)
2266 {
2267  char *nsp = NULL;
2268  char *tbl = NULL;
2269  text *col = NULL;
2270  char *nsp_tbl = NULL;
2271  Oid tbl_oid;
2272  ND_STATS *nd_stats;
2273  GBOX *gbox;
2274  bool only_parent = FALSE;
2275 
2276  if ( PG_NARGS() == 4 )
2277  {
2278  nsp = text2cstring(PG_GETARG_TEXT_P(0));
2279  tbl = text2cstring(PG_GETARG_TEXT_P(1));
2280  col = PG_GETARG_TEXT_P(2);
2281  only_parent = PG_GETARG_BOOL(3);
2282  nsp_tbl = palloc(strlen(nsp) + strlen(tbl) + 6);
2283  sprintf(nsp_tbl, "\"%s\".\"%s\"", nsp, tbl);
2284  tbl_oid = DatumGetObjectId(DirectFunctionCall1(regclassin, CStringGetDatum(nsp_tbl)));
2285  pfree(nsp_tbl);
2286  }
2287  else if ( PG_NARGS() == 3 )
2288  {
2289  nsp = text2cstring(PG_GETARG_TEXT_P(0));
2290  tbl = text2cstring(PG_GETARG_TEXT_P(1));
2291  col = PG_GETARG_TEXT_P(2);
2292  nsp_tbl = palloc(strlen(nsp) + strlen(tbl) + 6);
2293  sprintf(nsp_tbl, "\"%s\".\"%s\"", nsp, tbl);
2294  tbl_oid = DatumGetObjectId(DirectFunctionCall1(regclassin, CStringGetDatum(nsp_tbl)));
2295  pfree(nsp_tbl);
2296  }
2297  else if ( PG_NARGS() == 2 )
2298  {
2299  tbl = text2cstring(PG_GETARG_TEXT_P(0));
2300  col = PG_GETARG_TEXT_P(1);
2301  nsp_tbl = palloc(strlen(tbl) + 3);
2302  sprintf(nsp_tbl, "\"%s\"", tbl);
2303  tbl_oid = DatumGetObjectId(DirectFunctionCall1(regclassin, CStringGetDatum(nsp_tbl)));
2304  pfree(nsp_tbl);
2305  }
2306  else
2307  {
2308  elog(ERROR, "estimated_extent() called with wrong number of arguments");
2309  PG_RETURN_NULL();
2310  }
2311 
2312  /* Estimated extent only returns 2D bounds, so use mode 2 */
2313  nd_stats = pg_get_nd_stats_by_name(tbl_oid, col, 2, only_parent);
2314 
2315  /* Error out on no stats */
2316  if ( ! nd_stats ) {
2317  elog(WARNING, "stats for \"%s.%s\" do not exist", tbl, text2cstring(col));
2318  PG_RETURN_NULL();
2319  }
2320 
2321  /* Construct the box */
2322  gbox = palloc(sizeof(GBOX));
2323  FLAGS_SET_GEODETIC(gbox->flags, 0);
2324  FLAGS_SET_Z(gbox->flags, 0);
2325  FLAGS_SET_M(gbox->flags, 0);
2326  gbox->xmin = nd_stats->extent.min[0];
2327  gbox->xmax = nd_stats->extent.max[0];
2328  gbox->ymin = nd_stats->extent.min[1];
2329  gbox->ymax = nd_stats->extent.max[1];
2330 
2331  pfree(nd_stats);
2332  PG_RETURN_POINTER(gbox);
2333 }
2334 
2342 Datum geometry_estimated_extent(PG_FUNCTION_ARGS)
2343 {
2344  if ( PG_NARGS() == 3 )
2345  {
2346  PG_RETURN_DATUM(
2347  DirectFunctionCall3(gserialized_estimated_extent,
2348  PG_GETARG_DATUM(0),
2349  PG_GETARG_DATUM(1),
2350  PG_GETARG_DATUM(2)));
2351  }
2352  else if ( PG_NARGS() == 2 )
2353  {
2354  PG_RETURN_DATUM(
2355  DirectFunctionCall2(gserialized_estimated_extent,
2356  PG_GETARG_DATUM(0),
2357  PG_GETARG_DATUM(1)));
2358  }
2359 
2360  elog(ERROR, "geometry_estimated_extent() called with wrong number of arguments");
2361  PG_RETURN_NULL();
2362 }
args
Definition: ovdump.py:44
int gserialized_get_gbox_p(const GSERIALIZED *g, GBOX *box)
Read the bounding box off a serialization and calculate one if it is not already there.
Definition: g_serialized.c:642
static int nd_increment(ND_IBOX *ibox, int ndims, int *counter)
Given an n-d index array (counter), and a domain to increment it in (ibox) increment it by one...
static ND_STATS * pg_get_nd_stats_by_name(const Oid table_oid, const text *att_text, int mode, bool only_parent)
Pull the stats object from the PgSQL system catalogs.
static int text_p_get_mode(const text *txt)
Utility function to see if the first letter of the mode argument is &#39;N&#39;.
static int nd_box_overlap(const ND_STATS *nd_stats, const ND_BOX *nd_box, ND_IBOX *nd_ibox)
What stats cells overlap with this ND_BOX? Put the lowest cell addresses in ND_IBOX->min and the high...
PG_FUNCTION_INFO_V1(gserialized_gist_joinsel_nd)
For (geometry &&& geometry) and (geography && geography) we call into the N-D mode.
#define DEFAULT_ND_JOINSEL
static void compute_gserialized_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc, int sample_rows, double total_rows)
In order to do useful selectivity calculations in both 2-D and N-D modes, we actually have to generat...
stringbuffer_t * stringbuffer_create(void)
Allocate a new stringbuffer_t.
Definition: stringbuffer.c:35
#define NUM_BINS
int gbox_is_valid(const GBOX *gbox)
Return false if any of the dimensions is NaN or infinite.
Definition: g_box.c:209
#define MIN_DIMENSION_WIDTH
Minimum width of a dimension that we&#39;ll bother trying to compute statistics on.
char * gbox_to_string(const GBOX *gbox)
Allocate a string representation of the GBOX, based on dimensionality of flags.
Definition: g_box.c:404
#define FLAGS_GET_GEODETIC(flags)
Definition: liblwgeom.h:143
char * stringbuffer_getstringcopy(stringbuffer_t *s)
Returns a newly allocated string large enough to contain the current state of the string...
Definition: stringbuffer.c:160
double xmax
Definition: liblwgeom.h:293
#define ND_DIMS
The maximum number of dimensions our code can handle.
static void compute_gserialized_stats_mode(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc, int sample_rows, double total_rows, int mode)
The gserialized_analyze_nd sets this function as a callback on the stats object when called by the AN...
Datum geometry_estimated_extent(PG_FUNCTION_ARGS)
static int nd_box_contains(const ND_BOX *a, const ND_BOX *b, int ndims)
Return TRUE if ND_BOX a contains b, false otherwise.
Datum _postgis_gserialized_stats(PG_FUNCTION_ARGS)
#define STATISTIC_SLOT_2D
static void nd_box_from_gbox(const GBOX *gbox, ND_BOX *nd_box)
Set the values of an ND_BOX from a GBOX.
struct ND_BOX_T ND_BOX
N-dimensional box type for calculations, to avoid doing explicit axis conversions from GBOX in all ca...
#define FALLBACK_ND_JOINSEL
Datum gserialized_estimated_extent(PG_FUNCTION_ARGS)
#define FALLBACK_ND_SEL
More modest fallback selectivity factor.
#define FLAGS_SET_GEODETIC(flags, value)
Definition: liblwgeom.h:149
Datum _postgis_gserialized_joinsel(PG_FUNCTION_ARGS)
static float8 estimate_selectivity(const GBOX *box, const ND_STATS *nd_stats, int mode)
This function returns an estimate of the selectivity of a search GBOX by looking at data in the ND_ST...
#define LW_FAILURE
Definition: liblwgeom.h:79
int stringbuffer_aprintf(stringbuffer_t *s, const char *fmt,...)
Appends a formatted string to the current string buffer, using the format and argument list provided...
Definition: stringbuffer.c:253
static int nd_box_init(ND_BOX *a)
Zero out an ND_BOX.
Datum gserialized_gist_joinsel(PG_FUNCTION_ARGS)
#define FLAGS_SET_Z(flags, value)
Definition: liblwgeom.h:146
double zmax
Definition: liblwgeom.h:297
double ymin
Definition: liblwgeom.h:294
Datum gserialized_gist_sel_2d(PG_FUNCTION_ARGS)
Datum gserialized_gist_joinsel_2d(PG_FUNCTION_ARGS)
Datum gserialized_gist_sel(PG_FUNCTION_ARGS)
static int nd_box_array_distribution(const ND_BOX **nd_boxes, int num_boxes, const ND_BOX *extent, int ndims, double *distribution)
Calculate how much a set of boxes is homogenously distributed or contentrated within one dimension...
double xmin
Definition: liblwgeom.h:292
static int nd_stats_value_index(const ND_STATS *stats, int *indexes)
Given a position in the n-d histogram (i,j,k) return the position in the 1-d values array...
float4 size[ND_DIMS]
Datum gserialized_gist_joinsel_nd(PG_FUNCTION_ARGS)
static int nd_box_intersects(const ND_BOX *a, const ND_BOX *b, int ndims)
Return TRUE if ND_BOX a overlaps b, false otherwise.
static double total_double(const double *vals, int nvals)
Given double array, return sum of values.
int min[ND_DIMS]
static ND_STATS * pg_nd_stats_from_tuple(HeapTuple stats_tuple, int mode)
char * text2cstring(const text *textptr)
Datum gserialized_gist_sel_nd(PG_FUNCTION_ARGS)
static int cmp_int(const void *a, const void *b)
Integer comparison function for qsort.
static int nd_box_init_bounds(ND_BOX *a)
Prepare an ND_BOX for bounds calculation: set the maxes to the smallest thing possible and the mins t...
double ymax
Definition: liblwgeom.h:295
N-dimensional box index type.
#define FLAGS_GET_Z(flags)
Macros for manipulating the &#39;flags&#39; byte.
Definition: liblwgeom.h:140
static int range_quintile(int *vals, int nvals)
The difference between the fourth and first quintile values, the "inter-quintile range".
Datum _postgis_gserialized_sel(PG_FUNCTION_ARGS)
static float8 estimate_join_selectivity(const ND_STATS *s1, const ND_STATS *s2)
Given two statistics histograms, what is the selectivity of a join driven by the && or &&& operator...
uint8_t flags
Definition: liblwgeom.h:291
static char * nd_box_to_json(const ND_BOX *nd_box, int ndims)
Convert an ND_BOX to a JSON string for printing.
float4 max[ND_DIMS]
void stringbuffer_append(stringbuffer_t *s, const char *a)
Append the specified string to the stringbuffer_t.
Definition: stringbuffer.c:134
#define STATISTIC_KIND_2D
struct ND_IBOX_T ND_IBOX
N-dimensional box index type.
void stringbuffer_destroy(stringbuffer_t *s)
Free the stringbuffer_t and all memory managed within it.
Definition: stringbuffer.c:78
int max[ND_DIMS]
struct ND_STATS_T ND_STATS
N-dimensional statistics structure.
float4 min[ND_DIMS]
#define FALSE
Definition: dbfopen.c:168
static int nd_box_merge(const ND_BOX *source, ND_BOX *target)
Create a printable view of the ND_STATS histogram.
static int nd_box_expand(ND_BOX *nd_box, double expansion_factor)
Expand an ND_BOX ever so slightly.
double mmin
Definition: liblwgeom.h:298
#define SDFACTOR
double zmin
Definition: liblwgeom.h:296
#define FLAGS_GET_M(flags)
Definition: liblwgeom.h:141
static char * nd_stats_to_json(const ND_STATS *nd_stats)
Convert an ND_STATS to a JSON representation for external use.
Datum gserialized_analyze_nd(PG_FUNCTION_ARGS)
double mmax
Definition: liblwgeom.h:299
#define STATISTIC_SLOT_ND
static double nd_box_ratio(const ND_BOX *b1, const ND_BOX *b2, int ndims)
Returns the proportion of b2 that is covered by b1.
int value
Definition: genraster.py:61
N-dimensional statistics structure.
static ND_STATS * pg_get_nd_stats(const Oid table_oid, AttrNumber att_num, int mode, bool only_parent)
Pull the stats object from the PgSQL system catalogs.
#define TRUE
Definition: dbfopen.c:169
#define MAX_DIMENSION_WIDTH
Maximum width of a dimension that we&#39;ll bother trying to compute statistics on.
N-dimensional box type for calculations, to avoid doing explicit axis conversions from GBOX in all ca...
static int gbox_ndims(const GBOX *gbox)
Given that geodetic boxes are X/Y/Z regardless of the underlying geometry dimensionality and other bo...
This library is the generic geometry handling section of PostGIS.
#define DEFAULT_ND_SEL
Default geometry selectivity factor.
#define STATISTIC_KIND_ND
Assign a number to the n-dimensional statistics kind.
#define FLAGS_SET_M(flags, value)
Definition: liblwgeom.h:147