PostGIS  2.3.8dev-r@@SVN_REVISION@@
gserialized_estimate.c
Go to the documentation of this file.
1 /**********************************************************************
2  *
3  * PostGIS - Spatial Types for PostgreSQL
4  * http://postgis.net
5  *
6  * PostGIS is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation, either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * PostGIS is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with PostGIS. If not, see <http://www.gnu.org/licenses/>.
18  *
19  **********************************************************************
20  *
21  * Copyright 2012 (C) Paul Ramsey <pramsey@cleverelephant.ca>
22  *
23  **********************************************************************/
24 
25 
26 
27 /**********************************************************************
28  THEORY OF OPERATION
29 
30 The ANALYZE command hooks to a callback (gserialized_analyze_nd) that
31 calculates (compute_gserialized_stats_mode) two histograms of occurances of
32 features, once for the 2D domain (and the && operator) one for the
33 ND domain (and the &&& operator).
34 
35 Queries in PostgreSQL call into the selectivity sub-system to find out
36 the relative effectiveness of different clauses in sub-setting
37 relations. Queries with constant arguments call gserialized_gist_sel,
38 queries with relations on both sides call gserialized_gist_joinsel.
39 
40 gserialized_gist_sel sums up the values in the histogram that overlap
41 the contant search box.
42 
43 gserialized_gist_joinsel sums up the product of the overlapping
44 cells in each relation's histogram.
45 
46 Depending on the operator and type, the mode of selectivity calculation
47 will be 2D or ND.
48 
49 - geometry && geometry ==> 2D
50 - geometry &&& geometry ==> ND
51 - geography && geography ==> ND
52 
53 The 2D mode is put in effect by retrieving the 2D histogram from the
54 statistics cache and then allowing the generic ND calculations to
55 go to work.
56 
57 TO DO: More testing and examination of the &&& operator and mixed
58 dimensionality cases. (2D geometry) &&& (3D column), etc.
59 
60 **********************************************************************/
61 
62 #include "postgres.h"
63 #include "executor/spi.h"
64 #include "fmgr.h"
65 #include "commands/vacuum.h"
66 #include "nodes/relation.h"
67 #include "parser/parsetree.h"
68 #include "utils/array.h"
69 #include "utils/lsyscache.h"
70 #include "utils/builtins.h"
71 #include "utils/syscache.h"
72 #include "utils/rel.h"
73 #include "utils/selfuncs.h"
74 
75 #include "../postgis_config.h"
76 
77 #if POSTGIS_PGSQL_VERSION >= 93
78  #include "access/htup_details.h"
79 #endif
80 
81 #include "stringbuffer.h"
82 #include "liblwgeom.h"
83 #include "lwgeom_pg.h" /* For debugging macros. */
84 #include "gserialized_gist.h" /* For index common functions */
85 
86 #include <math.h>
87 #if HAVE_IEEEFP_H
88 #include <ieeefp.h>
89 #endif
90 #include <float.h>
91 #include <string.h>
92 #include <stdio.h>
93 #include <errno.h>
94 #include <ctype.h>
95 
96 /* Fall back to older finite() if necessary */
97 #ifndef HAVE_ISFINITE
98 # ifdef HAVE_GNU_ISFINITE
99 # define _GNU_SOURCE
100 # else
101 # define isfinite finite
102 # endif
103 #endif
104 
105 
106 /* Prototypes */
107 Datum gserialized_gist_joinsel(PG_FUNCTION_ARGS);
108 Datum gserialized_gist_joinsel_2d(PG_FUNCTION_ARGS);
109 Datum gserialized_gist_joinsel_nd(PG_FUNCTION_ARGS);
110 Datum gserialized_gist_sel(PG_FUNCTION_ARGS);
111 Datum gserialized_gist_sel_2d(PG_FUNCTION_ARGS);
112 Datum gserialized_gist_sel_nd(PG_FUNCTION_ARGS);
113 Datum gserialized_analyze_nd(PG_FUNCTION_ARGS);
114 Datum gserialized_estimated_extent(PG_FUNCTION_ARGS);
115 Datum _postgis_gserialized_sel(PG_FUNCTION_ARGS);
116 Datum _postgis_gserialized_joinsel(PG_FUNCTION_ARGS);
117 Datum _postgis_gserialized_stats(PG_FUNCTION_ARGS);
118 
119 /* Old Prototype */
120 Datum geometry_estimated_extent(PG_FUNCTION_ARGS);
121 
132 #define STATISTIC_KIND_ND 102
133 #define STATISTIC_KIND_2D 103
134 #define STATISTIC_SLOT_ND 0
135 #define STATISTIC_SLOT_2D 1
136 
137 /*
138 * The SD factor restricts the side of the statistics histogram
139 * based on the standard deviation of the extent of the data.
140 * SDFACTOR is the number of standard deviations from the mean
141 * the histogram will extend.
142 */
143 #define SDFACTOR 3.25
144 
150 #define ND_DIMS 4
151 
158 #define MIN_DIMENSION_WIDTH 0.000000001
159 
163 #define DEFAULT_ND_SEL 0.0001
164 #define DEFAULT_ND_JOINSEL 0.001
165 
169 #define FALLBACK_ND_SEL 0.2
170 #define FALLBACK_ND_JOINSEL 0.3
171 
177 typedef struct ND_BOX_T
178 {
179  float4 min[ND_DIMS];
180  float4 max[ND_DIMS];
181 } ND_BOX;
182 
186 typedef struct ND_IBOX_T
187 {
188  int min[ND_DIMS];
189  int max[ND_DIMS];
190 } ND_IBOX;
191 
192 
199 typedef struct ND_STATS_T
200 {
201  /* Dimensionality of the histogram. */
202  float4 ndims;
203 
204  /* Size of n-d histogram in each dimension. */
205  float4 size[ND_DIMS];
206 
207  /* Lower-left (min) and upper-right (max) spatial bounds of histogram. */
209 
210  /* How many rows in the table itself? */
212 
213  /* How many rows were in the sample that built this histogram? */
215 
216  /* How many not-Null/Empty features were in the sample? */
218 
219  /* How many features actually got sampled in the histogram? */
221 
222  /* How many cells in histogram? (sizex*sizey*sizez*sizem) */
224 
225  /* How many cells did those histogram features cover? */
226  /* Since we are pro-rating coverage, this number should */
227  /* now always equal histogram_features */
229 
230  /* Variable length # of floats for histogram */
231  float4 value[1];
232 } ND_STATS;
233 
234 
235 
236 
243 static int
244 gbox_ndims(const GBOX* gbox)
245 {
246  int dims = 2;
247  if ( FLAGS_GET_GEODETIC(gbox->flags) )
248  return 3;
249  if ( FLAGS_GET_Z(gbox->flags) )
250  dims++;
251  if ( FLAGS_GET_M(gbox->flags) )
252  dims++;
253  return dims;
254 }
255 
261 static int
262 text_p_get_mode(const text *txt)
263 {
264  int mode = 2;
265  char *modestr = text2cstring(txt);
266  if ( modestr[0] == 'N' )
267  mode = 0;
268  pfree(modestr);
269  return mode;
270 }
271 
272 
276 static int
277 cmp_int (const void *a, const void *b)
278 {
279  int ia = *((const int*)a);
280  int ib = *((const int*)b);
281 
282  if ( ia == ib )
283  return 0;
284  else if ( ia > ib )
285  return 1;
286  else
287  return -1;
288 }
289 
294 static int
295 range_quintile(int *vals, int nvals)
296 {
297  qsort(vals, nvals, sizeof(int), cmp_int);
298  return vals[4*nvals/5] - vals[nvals/5];
299 }
300 
304 static double
305 total_double(const double *vals, int nvals)
306 {
307  int i;
308  float total = 0;
309  /* Calculate total */
310  for ( i = 0; i < nvals; i++ )
311  total += vals[i];
312 
313  return total;
314 }
315 
316 #if POSTGIS_DEBUG_LEVEL >= 3
317 
321 static int
322 total_int(const int *vals, int nvals)
323 {
324  int i;
325  int total = 0;
326  /* Calculate total */
327  for ( i = 0; i < nvals; i++ )
328  total += vals[i];
329 
330  return total;
331 }
332 
336 static double
337 avg(const int *vals, int nvals)
338 {
339  int t = total_int(vals, nvals);
340  return (double)t / (double)nvals;
341 }
342 
346 static double
347 stddev(const int *vals, int nvals)
348 {
349  int i;
350  double sigma2 = 0;
351  double mean = avg(vals, nvals);
352 
353  /* Calculate sigma2 */
354  for ( i = 0; i < nvals; i++ )
355  {
356  double v = (double)(vals[i]);
357  sigma2 += (mean - v) * (mean - v);
358  }
359  return sqrt(sigma2 / nvals);
360 }
361 #endif /* POSTGIS_DEBUG_LEVEL >= 3 */
362 
367 static int
368 nd_stats_value_index(const ND_STATS *stats, int *indexes)
369 {
370  int d;
371  int accum = 1, vdx = 0;
372 
373  /* Calculate the index into the 1-d values array that the (i,j,k,l) */
374  /* n-d histogram coordinate implies. */
375  /* index = x + y * sizex + z * sizex * sizey + m * sizex * sizey * sizez */
376  for ( d = 0; d < (int)(stats->ndims); d++ )
377  {
378  int size = (int)(stats->size[d]);
379  if ( indexes[d] < 0 || indexes[d] >= size )
380  {
381  POSTGIS_DEBUGF(3, " bad index at (%d, %d)", indexes[0], indexes[1]);
382  return -1;
383  }
384  vdx += indexes[d] * accum;
385  accum *= size;
386  }
387  return vdx;
388 }
389 
393 static char*
394 nd_box_to_json(const ND_BOX *nd_box, int ndims)
395 {
396  char *rv;
397  int i;
399 
400  stringbuffer_append(sb, "{\"min\":[");
401  for ( i = 0; i < ndims; i++ )
402  {
403  if ( i ) stringbuffer_append(sb, ",");
404  stringbuffer_aprintf(sb, "%.6g", nd_box->min[i]);
405  }
406  stringbuffer_append(sb, "],\"max\":[");
407  for ( i = 0; i < ndims; i++ )
408  {
409  if ( i ) stringbuffer_append(sb, ",");
410  stringbuffer_aprintf(sb, "%.6g", nd_box->max[i]);
411  }
412  stringbuffer_append(sb, "]}");
413 
416  return rv;
417 }
418 
419 
424 static char*
425 nd_stats_to_json(const ND_STATS *nd_stats)
426 {
427  char *json_extent, *str;
428  int d;
430  int ndims = (int)roundf(nd_stats->ndims);
431 
432  stringbuffer_append(sb, "{");
433  stringbuffer_aprintf(sb, "\"ndims\":%d,", ndims);
434 
435  /* Size */
436  stringbuffer_append(sb, "\"size\":[");
437  for ( d = 0; d < ndims; d++ )
438  {
439  if ( d ) stringbuffer_append(sb, ",");
440  stringbuffer_aprintf(sb, "%d", (int)roundf(nd_stats->size[d]));
441  }
442  stringbuffer_append(sb, "],");
443 
444  /* Extent */
445  json_extent = nd_box_to_json(&(nd_stats->extent), ndims);
446  stringbuffer_aprintf(sb, "\"extent\":%s,", json_extent);
447  pfree(json_extent);
448 
449  stringbuffer_aprintf(sb, "\"table_features\":%d,", (int)roundf(nd_stats->table_features));
450  stringbuffer_aprintf(sb, "\"sample_features\":%d,", (int)roundf(nd_stats->sample_features));
451  stringbuffer_aprintf(sb, "\"not_null_features\":%d,", (int)roundf(nd_stats->not_null_features));
452  stringbuffer_aprintf(sb, "\"histogram_features\":%d,", (int)roundf(nd_stats->histogram_features));
453  stringbuffer_aprintf(sb, "\"histogram_cells\":%d,", (int)roundf(nd_stats->histogram_cells));
454  stringbuffer_aprintf(sb, "\"cells_covered\":%d", (int)roundf(nd_stats->cells_covered));
455  stringbuffer_append(sb, "}");
456 
457  str = stringbuffer_getstringcopy(sb);
459  return str;
460 }
461 
462 
468 // static char*
469 // nd_stats_to_grid(const ND_STATS *stats)
470 // {
471 // char *rv;
472 // int j, k;
473 // int sizex = (int)roundf(stats->size[0]);
474 // int sizey = (int)roundf(stats->size[1]);
475 // stringbuffer_t *sb = stringbuffer_create();
476 //
477 // for ( k = 0; k < sizey; k++ )
478 // {
479 // for ( j = 0; j < sizex; j++ )
480 // {
481 // stringbuffer_aprintf(sb, "%3d ", (int)roundf(stats->value[j + k*sizex]));
482 // }
483 // stringbuffer_append(sb, "\n");
484 // }
485 //
486 // rv = stringbuffer_getstringcopy(sb);
487 // stringbuffer_destroy(sb);
488 // return rv;
489 // }
490 
491 
493 static int
494 nd_box_merge(const ND_BOX *source, ND_BOX *target)
495 {
496  int d;
497  for ( d = 0; d < ND_DIMS; d++ )
498  {
499  target->min[d] = Min(target->min[d], source->min[d]);
500  target->max[d] = Max(target->max[d], source->max[d]);
501  }
502  return TRUE;
503 }
504 
506 static int
508 {
509  memset(a, 0, sizeof(ND_BOX));
510  return TRUE;
511 }
512 
518 static int
520 {
521  int d;
522  for ( d = 0; d < ND_DIMS; d++ )
523  {
524  a->min[d] = FLT_MAX;
525  a->max[d] = -1 * FLT_MAX;
526  }
527  return TRUE;
528 }
529 
531 static void
532 nd_box_from_gbox(const GBOX *gbox, ND_BOX *nd_box)
533 {
534  int d = 0;
535  POSTGIS_DEBUGF(3, " %s", gbox_to_string(gbox));
536 
537  nd_box_init(nd_box);
538  nd_box->min[d] = gbox->xmin;
539  nd_box->max[d] = gbox->xmax;
540  d++;
541  nd_box->min[d] = gbox->ymin;
542  nd_box->max[d] = gbox->ymax;
543  d++;
544  if ( FLAGS_GET_GEODETIC(gbox->flags) )
545  {
546  nd_box->min[d] = gbox->zmin;
547  nd_box->max[d] = gbox->zmax;
548  return;
549  }
550  if ( FLAGS_GET_Z(gbox->flags) )
551  {
552  nd_box->min[d] = gbox->zmin;
553  nd_box->max[d] = gbox->zmax;
554  d++;
555  }
556  if ( FLAGS_GET_M(gbox->flags) )
557  {
558  nd_box->min[d] = gbox->mmin;
559  nd_box->max[d] = gbox->mmax;
560  d++;
561  }
562  return;
563 }
564 
568 static int
569 nd_box_intersects(const ND_BOX *a, const ND_BOX *b, int ndims)
570 {
571  int d;
572  for ( d = 0; d < ndims; d++ )
573  {
574  if ( (a->min[d] > b->max[d]) || (a->max[d] < b->min[d]) )
575  return FALSE;
576  }
577  return TRUE;
578 }
579 
583 static int
584 nd_box_contains(const ND_BOX *a, const ND_BOX *b, int ndims)
585 {
586  int d;
587  for ( d = 0; d < ndims; d++ )
588  {
589  if ( ! ((a->min[d] < b->min[d]) && (a->max[d] > b->max[d])) )
590  return FALSE;
591  }
592  return TRUE;
593 }
594 
599 static int
600 nd_box_expand(ND_BOX *nd_box, double expansion_factor)
601 {
602  int d;
603  double size;
604  for ( d = 0; d < ND_DIMS; d++ )
605  {
606  size = nd_box->max[d] - nd_box->min[d];
607  if ( size <= 0 ) continue;
608  nd_box->min[d] -= size * expansion_factor / 2;
609  nd_box->max[d] += size * expansion_factor / 2;
610  }
611  return TRUE;
612 }
613 
618 static inline int
619 nd_box_overlap(const ND_STATS *nd_stats, const ND_BOX *nd_box, ND_IBOX *nd_ibox)
620 {
621  int d;
622 
623  POSTGIS_DEBUGF(4, " nd_box: %s", nd_box_to_json(nd_box, nd_stats->ndims));
624 
625  /* Initialize ibox */
626  memset(nd_ibox, 0, sizeof(ND_IBOX));
627 
628  /* In each dimension... */
629  for ( d = 0; d < nd_stats->ndims; d++ )
630  {
631  double smin = nd_stats->extent.min[d];
632  double smax = nd_stats->extent.max[d];
633  double width = smax - smin;
634  int size = roundf(nd_stats->size[d]);
635 
636  /* ... find cells the box overlaps with in this dimension */
637  nd_ibox->min[d] = floor(size * (nd_box->min[d] - smin) / width);
638  nd_ibox->max[d] = floor(size * (nd_box->max[d] - smin) / width);
639 
640  POSTGIS_DEBUGF(5, " stats: dim %d: min %g: max %g: width %g", d, smin, smax, width);
641  POSTGIS_DEBUGF(5, " overlap: dim %d: (%d, %d)", d, nd_ibox->min[d], nd_ibox->max[d]);
642 
643  /* Push any out-of range values into range */
644  nd_ibox->min[d] = Max(nd_ibox->min[d], 0);
645  nd_ibox->max[d] = Min(nd_ibox->max[d], size-1);
646  }
647  return TRUE;
648 }
649 
653 static inline double
654 nd_box_ratio(const ND_BOX *b1, const ND_BOX *b2, int ndims)
655 {
656  int d;
657  bool covered = TRUE;
658  double ivol = 1.0;
659  double vol2 = 1.0;
660  double vol1 = 1.0;
661 
662  for ( d = 0 ; d < ndims; d++ )
663  {
664  if ( b1->max[d] <= b2->min[d] || b1->min[d] >= b2->max[d] )
665  return 0.0; /* Disjoint */
666 
667  if ( b1->min[d] > b2->min[d] || b1->max[d] < b2->max[d] )
668  covered = FALSE;
669  }
670 
671  if ( covered )
672  return 1.0;
673 
674  for ( d = 0; d < ndims; d++ )
675  {
676  double width1 = b1->max[d] - b1->min[d];
677  double width2 = b2->max[d] - b2->min[d];
678  double imin, imax, iwidth;
679 
680  vol1 *= width1;
681  vol2 *= width2;
682 
683  imin = Max(b1->min[d], b2->min[d]);
684  imax = Min(b1->max[d], b2->max[d]);
685  iwidth = imax - imin;
686  iwidth = Max(0.0, iwidth);
687 
688  ivol *= iwidth;
689  }
690 
691  if ( vol2 == 0.0 )
692  return vol2;
693 
694  return ivol / vol2;
695 }
696 
697 
713 static int
714 nd_box_array_distribution(const ND_BOX **nd_boxes, int num_boxes, const ND_BOX *extent, int ndims, double *distribution)
715 {
716  /* How many bins shall we use in figuring out the distribution? */
717  static int num_bins = 50;
718  int d, i, k, range;
719  int counts[num_bins];
720  double smin, smax; /* Spatial min, spatial max */
721  double swidth; /* Spatial width of dimension */
722 #if POSTGIS_DEBUG_LEVEL >= 3
723  double average, sdev, sdev_ratio;
724 #endif
725  int bmin, bmax; /* Bin min, bin max */
726  const ND_BOX *ndb;
727 
728  /* For each dimension... */
729  for ( d = 0; d < ndims; d++ )
730  {
731  /* Initialize counts for this dimension */
732  memset(counts, 0, sizeof(int)*num_bins);
733 
734  smin = extent->min[d];
735  smax = extent->max[d];
736  swidth = smax - smin;
737 
738  /* Don't try and calculate distribution of overly narrow dimensions */
739  if ( swidth < MIN_DIMENSION_WIDTH )
740  {
741  distribution[d] = 0;
742  continue;
743  }
744 
745  /* Sum up the overlaps of each feature with the dimensional bins */
746  for ( i = 0; i < num_boxes; i++ )
747  {
748  double minoffset, maxoffset;
749 
750  /* Skip null entries */
751  ndb = nd_boxes[i];
752  if ( ! ndb ) continue;
753 
754  /* Where does box fall relative to the working range */
755  minoffset = ndb->min[d] - smin;
756  maxoffset = ndb->max[d] - smin;
757 
758  /* Skip boxes that our outside our working range */
759  if ( minoffset < 0 || minoffset > swidth ||
760  maxoffset < 0 || maxoffset > swidth )
761  {
762  continue;
763  }
764 
765  /* What bins does this range correspond to? */
766  bmin = num_bins * (minoffset) / swidth;
767  bmax = num_bins * (maxoffset) / swidth;
768 
769  POSTGIS_DEBUGF(4, " dimension %d, feature %d: bin %d to bin %d", d, i, bmin, bmax);
770 
771  /* Increment the counts in all the bins this feature overlaps */
772  for ( k = bmin; k <= bmax; k++ )
773  {
774  counts[k] += 1;
775  }
776 
777  }
778 
779  /* How dispersed is the distribution of features across bins? */
780  range = range_quintile(counts, num_bins);
781 
782 #if POSTGIS_DEBUG_LEVEL >= 3
783  average = avg(counts, num_bins);
784  sdev = stddev(counts, num_bins);
785  sdev_ratio = sdev/average;
786 
787  POSTGIS_DEBUGF(3, " dimension %d: range = %d", d, range);
788  POSTGIS_DEBUGF(3, " dimension %d: average = %.6g", d, average);
789  POSTGIS_DEBUGF(3, " dimension %d: stddev = %.6g", d, sdev);
790  POSTGIS_DEBUGF(3, " dimension %d: stddev_ratio = %.6g", d, sdev_ratio);
791 #endif
792 
793  distribution[d] = range;
794  }
795 
796  return TRUE;
797 }
798 
804 static inline int
805 nd_increment(ND_IBOX *ibox, int ndims, int *counter)
806 {
807  int d = 0;
808 
809  while ( d < ndims )
810  {
811  if ( counter[d] < ibox->max[d] )
812  {
813  counter[d] += 1;
814  break;
815  }
816  counter[d] = ibox->min[d];
817  d++;
818  }
819  /* That's it, cannot increment any more! */
820  if ( d == ndims )
821  return FALSE;
822 
823  /* Increment complete! */
824  return TRUE;
825 }
826 
827 static ND_STATS*
828 pg_nd_stats_from_tuple(HeapTuple stats_tuple, int mode)
829 {
830  int stats_kind = STATISTIC_KIND_ND;
831  int rv;
832  ND_STATS *nd_stats;
833 
834  /* If we're in 2D mode, set the kind appropriately */
835  if ( mode == 2 ) stats_kind = STATISTIC_KIND_2D;
836 
837  /* Then read the geom status histogram from that */
838 
839 #if POSTGIS_PGSQL_VERSION < 100
840  float4 *floatptr;
841  int nvalues;
842 
843  rv = get_attstatsslot(stats_tuple, 0, 0, stats_kind, InvalidOid,
844  NULL, NULL, NULL, &floatptr, &nvalues);
845 
846  if ( ! rv ) {
847  POSTGIS_DEBUGF(2,
848  "no slot of kind %d in stats tuple", stats_kind);
849  return NULL;
850  }
851 
852  /* Clone the stats here so we can release the attstatsslot immediately */
853  nd_stats = palloc(sizeof(float) * nvalues);
854  memcpy(nd_stats, floatptr, sizeof(float) * nvalues);
855 
856  /* Clean up */
857  free_attstatsslot(0, NULL, 0, floatptr, nvalues);
858 #else /* PostgreSQL 10 or higher */
859  AttStatsSlot sslot;
860  rv = get_attstatsslot(&sslot, stats_tuple, stats_kind, InvalidOid,
861  ATTSTATSSLOT_NUMBERS);
862  if ( ! rv ) {
863  POSTGIS_DEBUGF(2,
864  "no slot of kind %d in stats tuple", stats_kind);
865  return NULL;
866  }
867 
868  /* Clone the stats here so we can release the attstatsslot immediately */
869  nd_stats = palloc(sizeof(float4) * sslot.nnumbers);
870  memcpy(nd_stats, sslot.numbers, sizeof(float4) * sslot.nnumbers);
871 
872  free_attstatsslot(&sslot);
873 #endif
874 
875  return nd_stats;
876 }
877 
882 static ND_STATS*
883 pg_get_nd_stats(const Oid table_oid, AttrNumber att_num, int mode, bool only_parent)
884 {
885  HeapTuple stats_tuple = NULL;
886  ND_STATS *nd_stats;
887 
888  /* First pull the stats tuple for the whole tree */
889  if ( ! only_parent )
890  {
891  POSTGIS_DEBUGF(2, "searching whole tree stats for \"%s\"", get_rel_name(table_oid)? get_rel_name(table_oid) : "NULL");
892  stats_tuple = SearchSysCache3(STATRELATT, table_oid, att_num, TRUE);
893  if ( stats_tuple )
894  POSTGIS_DEBUGF(2, "found whole tree stats for \"%s\"", get_rel_name(table_oid)? get_rel_name(table_oid) : "NULL");
895  }
896  /* Fall-back to main table stats only, if not found for whole tree or explicitly ignored */
897  if ( only_parent || ! stats_tuple )
898  {
899  POSTGIS_DEBUGF(2, "searching parent table stats for \"%s\"", get_rel_name(table_oid)? get_rel_name(table_oid) : "NULL");
900  stats_tuple = SearchSysCache2(STATRELATT, table_oid, att_num);
901  if ( stats_tuple )
902  POSTGIS_DEBUGF(2, "found parent table stats for \"%s\"", get_rel_name(table_oid)? get_rel_name(table_oid) : "NULL");
903  }
904  if ( ! stats_tuple )
905  {
906  POSTGIS_DEBUGF(2, "stats for \"%s\" do not exist", get_rel_name(table_oid)? get_rel_name(table_oid) : "NULL");
907  return NULL;
908  }
909 
910  nd_stats = pg_nd_stats_from_tuple(stats_tuple, mode);
911  ReleaseSysCache(stats_tuple);
912  if ( ! nd_stats )
913  {
914  POSTGIS_DEBUGF(2,
915  "histogram for attribute %d of table \"%s\" does not exist?",
916  att_num, get_rel_name(table_oid));
917  }
918 
919  return nd_stats;
920 }
921 
930 static ND_STATS*
931 pg_get_nd_stats_by_name(const Oid table_oid, const text *att_text, int mode, bool only_parent)
932 {
933  const char *att_name = text2cstring(att_text);
934  AttrNumber att_num;
935 
936  /* We know the name? Look up the num */
937  if ( att_text )
938  {
939  /* Get the attribute number */
940  att_num = get_attnum(table_oid, att_name);
941  if ( ! att_num ) {
942  elog(ERROR, "attribute \"%s\" does not exist", att_name);
943  return NULL;
944  }
945  }
946  else
947  {
948  elog(ERROR, "attribute name is null");
949  return NULL;
950  }
951 
952  return pg_get_nd_stats(table_oid, att_num, mode, only_parent);
953 }
954 
968 static float8
970 {
971  int ncells1, ncells2;
972  int ndims1, ndims2, ndims;
973  double ntuples_max;
974  double ntuples_not_null1, ntuples_not_null2;
975 
976  ND_BOX extent1, extent2;
977  ND_IBOX ibox1, ibox2;
978  int at1[ND_DIMS];
979  int at2[ND_DIMS];
980  double min1[ND_DIMS];
981  double width1[ND_DIMS];
982  double cellsize1[ND_DIMS];
983  int size2[ND_DIMS];
984  double min2[ND_DIMS];
985  double width2[ND_DIMS];
986  double cellsize2[ND_DIMS];
987  int size1[ND_DIMS];
988  int d;
989  double val = 0;
990  float8 selectivity;
991 
992  /* Drop out on null inputs */
993  if ( ! ( s1 && s2 ) )
994  {
995  elog(NOTICE, " estimate_join_selectivity called with null inputs");
996  return FALLBACK_ND_SEL;
997  }
998 
999  /* We need to know how many cells each side has... */
1000  ncells1 = (int)roundf(s1->histogram_cells);
1001  ncells2 = (int)roundf(s2->histogram_cells);
1002 
1003  /* ...so that we can drive the summation loop with the smaller histogram. */
1004  if ( ncells1 > ncells2 )
1005  {
1006  const ND_STATS *stats_tmp = s1;
1007  s1 = s2;
1008  s2 = stats_tmp;
1009  }
1010 
1011  POSTGIS_DEBUGF(3, "s1: %s", nd_stats_to_json(s1));
1012  POSTGIS_DEBUGF(3, "s2: %s", nd_stats_to_json(s2));
1013 
1014  /* Re-read that info after the swap */
1015  ncells1 = (int)roundf(s1->histogram_cells);
1016  ncells2 = (int)roundf(s2->histogram_cells);
1017 
1018  /* Q: What's the largest possible join size these relations can create? */
1019  /* A: The product of the # of non-null rows in each relation. */
1020  ntuples_not_null1 = s1->table_features * (s1->not_null_features / s1->sample_features);
1021  ntuples_not_null2 = s2->table_features * (s2->not_null_features / s2->sample_features);
1022  ntuples_max = ntuples_not_null1 * ntuples_not_null2;
1023 
1024  /* Get the ndims as ints */
1025  ndims1 = (int)roundf(s1->ndims);
1026  ndims2 = (int)roundf(s2->ndims);
1027  ndims = Max(ndims1, ndims2);
1028 
1029  /* Get the extents */
1030  extent1 = s1->extent;
1031  extent2 = s2->extent;
1032 
1033  /* If relation stats do not intersect, join is very very selective. */
1034  if ( ! nd_box_intersects(&extent1, &extent2, ndims) )
1035  {
1036  POSTGIS_DEBUG(3, "relation stats do not intersect, returning 0");
1037  PG_RETURN_FLOAT8(0.0);
1038  }
1039 
1040  /*
1041  * First find the index range of the part of the smaller
1042  * histogram that overlaps the larger one.
1043  */
1044  if ( ! nd_box_overlap(s1, &extent2, &ibox1) )
1045  {
1046  POSTGIS_DEBUG(3, "could not calculate overlap of relations");
1047  PG_RETURN_FLOAT8(FALLBACK_ND_JOINSEL);
1048  }
1049 
1050  /* Initialize counters / constants on s1 */
1051  for ( d = 0; d < ndims1; d++ )
1052  {
1053  at1[d] = ibox1.min[d];
1054  min1[d] = s1->extent.min[d];
1055  width1[d] = s1->extent.max[d] - s1->extent.min[d];
1056  size1[d] = (int)roundf(s1->size[d]);
1057  cellsize1[d] = width1[d] / size1[d];
1058  }
1059 
1060  /* Initialize counters / constants on s2 */
1061  for ( d = 0; d < ndims2; d++ )
1062  {
1063  min2[d] = s2->extent.min[d];
1064  width2[d] = s2->extent.max[d] - s2->extent.min[d];
1065  size2[d] = (int)roundf(s2->size[d]);
1066  cellsize2[d] = width2[d] / size2[d];
1067  }
1068 
1069  /* For each affected cell of s1... */
1070  do
1071  {
1072  double val1;
1073  /* Construct the bounds of this cell */
1074  ND_BOX nd_cell1;
1075  nd_box_init(&nd_cell1);
1076  for ( d = 0; d < ndims1; d++ )
1077  {
1078  nd_cell1.min[d] = min1[d] + (at1[d]+0) * cellsize1[d];
1079  nd_cell1.max[d] = min1[d] + (at1[d]+1) * cellsize1[d];
1080  }
1081 
1082  /* Find the cells of s2 that cell1 overlaps.. */
1083  nd_box_overlap(s2, &nd_cell1, &ibox2);
1084 
1085  /* Initialize counter */
1086  for ( d = 0; d < ndims2; d++ )
1087  {
1088  at2[d] = ibox2.min[d];
1089  }
1090 
1091  POSTGIS_DEBUGF(3, "at1 %d,%d %s", at1[0], at1[1], nd_box_to_json(&nd_cell1, ndims1));
1092 
1093  /* Get the value at this cell */
1094  val1 = s1->value[nd_stats_value_index(s1, at1)];
1095 
1096  /* For each overlapped cell of s2... */
1097  do
1098  {
1099  double ratio2;
1100  double val2;
1101 
1102  /* Construct the bounds of this cell */
1103  ND_BOX nd_cell2;
1104  nd_box_init(&nd_cell2);
1105  for ( d = 0; d < ndims2; d++ )
1106  {
1107  nd_cell2.min[d] = min2[d] + (at2[d]+0) * cellsize2[d];
1108  nd_cell2.max[d] = min2[d] + (at2[d]+1) * cellsize2[d];
1109  }
1110 
1111  POSTGIS_DEBUGF(3, " at2 %d,%d %s", at2[0], at2[1], nd_box_to_json(&nd_cell2, ndims2));
1112 
1113  /* Calculate overlap ratio of the cells */
1114  ratio2 = nd_box_ratio(&nd_cell1, &nd_cell2, Max(ndims1, ndims2));
1115 
1116  /* Multiply the cell counts, scaled by overlap ratio */
1117  val2 = s2->value[nd_stats_value_index(s2, at2)];
1118  POSTGIS_DEBUGF(3, " val1 %.6g val2 %.6g ratio %.6g", val1, val2, ratio2);
1119  val += val1 * (val2 * ratio2);
1120  }
1121  while ( nd_increment(&ibox2, ndims2, at2) );
1122 
1123  }
1124  while( nd_increment(&ibox1, ndims1, at1) );
1125 
1126  POSTGIS_DEBUGF(3, "val of histogram = %g", val);
1127 
1128  /*
1129  * In order to compare our total cell count "val" to the
1130  * ntuples_max, we need to scale val up to reflect a full
1131  * table estimate. So, multiply by ratio of table size to
1132  * sample size.
1133  */
1134  val *= (s1->table_features / s1->sample_features);
1135  val *= (s2->table_features / s2->sample_features);
1136 
1137  POSTGIS_DEBUGF(3, "val scaled to full table size = %g", val);
1138 
1139  /*
1140  * Because the cell counts are over-determined due to
1141  * double counting of features that overlap multiple cells
1142  * (see the compute_gserialized_stats routine)
1143  * we also have to scale our cell count "val" *down*
1144  * to adjust for the double counting.
1145  */
1146 // val /= (s1->cells_covered / s1->histogram_features);
1147 // val /= (s2->cells_covered / s2->histogram_features);
1148 
1149  /*
1150  * Finally, the selectivity is the estimated number of
1151  * rows to be returned divided by the maximum possible
1152  * number of rows that can be returned.
1153  */
1154  selectivity = val / ntuples_max;
1155 
1156  /* Guard against over-estimates and crazy numbers :) */
1157  if ( isnan(selectivity) || ! isfinite(selectivity) || selectivity < 0.0 )
1158  {
1159  selectivity = DEFAULT_ND_JOINSEL;
1160  }
1161  else if ( selectivity > 1.0 )
1162  {
1163  selectivity = 1.0;
1164  }
1165 
1166  return selectivity;
1167 }
1168 
1174 Datum gserialized_gist_joinsel_nd(PG_FUNCTION_ARGS)
1175 {
1176  PG_RETURN_DATUM(DirectFunctionCall5(
1178  PG_GETARG_DATUM(0), PG_GETARG_DATUM(1),
1179  PG_GETARG_DATUM(2), PG_GETARG_DATUM(3),
1180  Int32GetDatum(0) /* ND mode */
1181  ));
1182 }
1183 
1189 Datum gserialized_gist_joinsel_2d(PG_FUNCTION_ARGS)
1190 {
1191  PG_RETURN_DATUM(DirectFunctionCall5(
1193  PG_GETARG_DATUM(0), PG_GETARG_DATUM(1),
1194  PG_GETARG_DATUM(2), PG_GETARG_DATUM(3),
1195  Int32GetDatum(2) /* 2D mode */
1196  ));
1197 }
1198 
1208 Datum gserialized_gist_joinsel(PG_FUNCTION_ARGS)
1209 {
1210  PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
1211  /* Oid operator = PG_GETARG_OID(1); */
1212  List *args = (List *) PG_GETARG_POINTER(2);
1213  JoinType jointype = (JoinType) PG_GETARG_INT16(3);
1214  int mode = PG_GETARG_INT32(4);
1215 
1216  Node *arg1, *arg2;
1217  Var *var1, *var2;
1218  Oid relid1, relid2;
1219 
1220  ND_STATS *stats1, *stats2;
1221  float8 selectivity;
1222 
1223  /* Only respond to an inner join/unknown context join */
1224  if (jointype != JOIN_INNER)
1225  {
1226  elog(DEBUG1, "%s: jointype %d not supported", __func__, jointype);
1227  PG_RETURN_FLOAT8(DEFAULT_ND_JOINSEL);
1228  }
1229 
1230  /* Find Oids of the geometry columns we are working with */
1231  arg1 = (Node*) linitial(args);
1232  arg2 = (Node*) lsecond(args);
1233  var1 = (Var*) arg1;
1234  var2 = (Var*) arg2;
1235 
1236  /* We only do column joins right now, no functional joins */
1237  /* TODO: handle g1 && ST_Expand(g2) */
1238  if (!IsA(arg1, Var) || !IsA(arg2, Var))
1239  {
1240  elog(DEBUG1, "%s called with arguments that are not column references", __func__);
1241  PG_RETURN_FLOAT8(DEFAULT_ND_JOINSEL);
1242  }
1243 
1244  /* What are the Oids of our tables/relations? */
1245  relid1 = getrelid(var1->varno, root->parse->rtable);
1246  relid2 = getrelid(var2->varno, root->parse->rtable);
1247 
1248  POSTGIS_DEBUGF(3, "using relations \"%s\" Oid(%d), \"%s\" Oid(%d)",
1249  get_rel_name(relid1) ? get_rel_name(relid1) : "NULL", relid1, get_rel_name(relid2) ? get_rel_name(relid2) : "NULL", relid2);
1250 
1251  /* Pull the stats from the stats system. */
1252  stats1 = pg_get_nd_stats(relid1, var1->varattno, mode, FALSE);
1253  stats2 = pg_get_nd_stats(relid2, var2->varattno, mode, FALSE);
1254 
1255  /* If we can't get stats, we have to stop here! */
1256  if ( ! stats1 )
1257  {
1258  POSTGIS_DEBUGF(3, "unable to retrieve stats for \"%s\" Oid(%d)", get_rel_name(relid1) ? get_rel_name(relid1) : "NULL" , relid1);
1259  PG_RETURN_FLOAT8(DEFAULT_ND_JOINSEL);
1260  }
1261  else if ( ! stats2 )
1262  {
1263  POSTGIS_DEBUGF(3, "unable to retrieve stats for \"%s\" Oid(%d)", get_rel_name(relid2) ? get_rel_name(relid2) : "NULL", relid2);
1264  PG_RETURN_FLOAT8(DEFAULT_ND_JOINSEL);
1265  }
1266 
1267  selectivity = estimate_join_selectivity(stats1, stats2);
1268  POSTGIS_DEBUGF(2, "got selectivity %g", selectivity);
1269 
1270  pfree(stats1);
1271  pfree(stats2);
1272  PG_RETURN_FLOAT8(selectivity);
1273 }
1274 
1275 
1276 
1277 
1296 static void
1297 compute_gserialized_stats_mode(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
1298  int sample_rows, double total_rows, int mode)
1299 {
1300  MemoryContext old_context;
1301  int d, i; /* Counters */
1302  int notnull_cnt = 0; /* # not null rows in the sample */
1303  int null_cnt = 0; /* # null rows in the sample */
1304  int histogram_features = 0; /* # rows that actually got counted in the histogram */
1305 
1306  ND_STATS *nd_stats; /* Our histogram */
1307  size_t nd_stats_size; /* Size to allocate */
1308 
1309  double total_width = 0; /* # of bytes used by sample */
1310  double total_sample_volume = 0; /* Area/volume coverage of the sample */
1311  double total_cell_count = 0; /* # of cells in histogram affected by sample */
1312 
1313  ND_BOX sum; /* Sum of extents of sample boxes */
1314  ND_BOX avg; /* Avg of extents of sample boxes */
1315  ND_BOX stddev; /* StdDev of extents of sample boxes */
1316 
1317  const ND_BOX **sample_boxes; /* ND_BOXes for each of the sample features */
1318  ND_BOX sample_extent; /* Extent of the raw sample */
1319  int histo_size[ND_DIMS]; /* histogram nrows, ncols, etc */
1320  ND_BOX histo_extent; /* Spatial extent of the histogram */
1321  ND_BOX histo_extent_new; /* Temporary variable */
1322  int histo_cells_target; /* Number of cells we will shoot for, given the stats target */
1323  int histo_cells; /* Number of cells in the histogram */
1324  int histo_cells_new = 1; /* Temporary variable */
1325 
1326  int ndims = 2; /* Dimensionality of the sample */
1327  int histo_ndims = 0; /* Dimensionality of the histogram */
1328  double sample_distribution[ND_DIMS]; /* How homogeneous is distribution of sample in each axis? */
1329  double total_distribution; /* Total of sample_distribution */
1330 
1331  int stats_slot; /* What slot is this data going into? (2D vs ND) */
1332  int stats_kind; /* And this is what? (2D vs ND) */
1333 
1334  /* Initialize sum and stddev */
1335  nd_box_init(&sum);
1336  nd_box_init(&stddev);
1337 
1338  /*
1339  * This is where gserialized_analyze_nd
1340  * should put its' custom parameters.
1341  */
1342  /* void *mystats = stats->extra_data; */
1343 
1344  POSTGIS_DEBUG(2, "compute_gserialized_stats called");
1345  POSTGIS_DEBUGF(3, " # sample_rows: %d", sample_rows);
1346  POSTGIS_DEBUGF(3, " estimate of total_rows: %.6g", total_rows);
1347 
1348  /*
1349  * We might need less space, but don't think
1350  * its worth saving...
1351  */
1352  sample_boxes = palloc(sizeof(ND_BOX*) * sample_rows);
1353 
1354  /*
1355  * First scan:
1356  * o read boxes
1357  * o find dimensionality of the sample
1358  * o find extent of the sample
1359  * o count null-infinite/not-null values
1360  * o compute total_width
1361  * o compute total features's box area (for avgFeatureArea)
1362  * o sum features box coordinates (for standard deviation)
1363  */
1364  for ( i = 0; i < sample_rows; i++ )
1365  {
1366  Datum datum;
1367  GSERIALIZED *geom;
1368  GBOX gbox;
1369  ND_BOX *nd_box;
1370  bool is_null;
1371  bool is_copy;
1372 
1373  datum = fetchfunc(stats, i, &is_null);
1374 
1375  /* Skip all NULLs. */
1376  if ( is_null )
1377  {
1378  POSTGIS_DEBUGF(4, " skipped null geometry %d", i);
1379  null_cnt++;
1380  continue;
1381  }
1382 
1383  /* Read the bounds from the gserialized. */
1384  geom = (GSERIALIZED *)PG_DETOAST_DATUM(datum);
1385  is_copy = VARATT_IS_EXTENDED(datum);
1386  if ( LW_FAILURE == gserialized_get_gbox_p(geom, &gbox) )
1387  {
1388  /* Skip empties too. */
1389  POSTGIS_DEBUGF(3, " skipped empty geometry %d", i);
1390  continue;
1391  }
1392 
1393  /* If we're in 2D mode, zero out the higher dimensions for "safety" */
1394  if ( mode == 2 )
1395  gbox.zmin = gbox.zmax = gbox.mmin = gbox.mmax = 0.0;
1396 
1397  /* Check bounds for validity (finite and not NaN) */
1398  if ( ! gbox_is_valid(&gbox) )
1399  {
1400  POSTGIS_DEBUGF(3, " skipped infinite/nan geometry %d", i);
1401  continue;
1402  }
1403 
1404  /*
1405  * In N-D mode, set the ndims to the maximum dimensionality found
1406  * in the sample. Otherwise, leave at ndims == 2.
1407  */
1408  if ( mode != 2 )
1409  ndims = Max(gbox_ndims(&gbox), ndims);
1410 
1411  /* Convert gbox to n-d box */
1412  nd_box = palloc(sizeof(ND_BOX));
1413  nd_box_from_gbox(&gbox, nd_box);
1414 
1415  /* Cache n-d bounding box */
1416  sample_boxes[notnull_cnt] = nd_box;
1417 
1418  /* Initialize sample extent before merging first entry */
1419  if ( ! notnull_cnt )
1420  nd_box_init_bounds(&sample_extent);
1421 
1422  /* Add current sample to overall sample extent */
1423  nd_box_merge(nd_box, &sample_extent);
1424 
1425  /* How many bytes does this sample use? */
1426  total_width += VARSIZE(geom);
1427 
1428  /* Add bounds coordinates to sums for stddev calculation */
1429  for ( d = 0; d < ndims; d++ )
1430  {
1431  sum.min[d] += nd_box->min[d];
1432  sum.max[d] += nd_box->max[d];
1433  }
1434 
1435  /* Increment our "good feature" count */
1436  notnull_cnt++;
1437 
1438  /* Free up memory if our sample geometry was copied */
1439  if ( is_copy )
1440  pfree(geom);
1441 
1442  /* Give backend a chance of interrupting us */
1443  vacuum_delay_point();
1444  }
1445 
1446  /*
1447  * We'll build a histogram having stats->attr->attstattarget cells
1448  * on each side, within reason... we'll use ndims*10000 as the
1449  * maximum number of cells.
1450  * Also, if we're sampling a relatively small table, we'll try to ensure that
1451  * we have an average of 5 features for each cell so the histogram isn't
1452  * so sparse.
1453  */
1454  histo_cells_target = (int)pow((double)(stats->attr->attstattarget), (double)ndims);
1455  histo_cells_target = Min(histo_cells_target, ndims * 10000);
1456  histo_cells_target = Min(histo_cells_target, (int)(total_rows/5));
1457  POSTGIS_DEBUGF(3, " stats->attr->attstattarget: %d", stats->attr->attstattarget);
1458  POSTGIS_DEBUGF(3, " target # of histogram cells: %d", histo_cells_target);
1459 
1460  /* If there's no useful features, we can't work out stats */
1461  if ( ! notnull_cnt )
1462  {
1463  elog(NOTICE, "no non-null/empty features, unable to compute statistics");
1464  stats->stats_valid = false;
1465  return;
1466  }
1467 
1468  POSTGIS_DEBUGF(3, " sample_extent: %s", nd_box_to_json(&sample_extent, ndims));
1469 
1470  /*
1471  * Second scan:
1472  * o compute standard deviation
1473  */
1474  for ( d = 0; d < ndims; d++ )
1475  {
1476  /* Calculate average bounds values */
1477  avg.min[d] = sum.min[d] / notnull_cnt;
1478  avg.max[d] = sum.max[d] / notnull_cnt;
1479 
1480  /* Calculate standard deviation for this dimension bounds */
1481  for ( i = 0; i < notnull_cnt; i++ )
1482  {
1483  const ND_BOX *ndb = sample_boxes[i];
1484  stddev.min[d] += (ndb->min[d] - avg.min[d]) * (ndb->min[d] - avg.min[d]);
1485  stddev.max[d] += (ndb->max[d] - avg.max[d]) * (ndb->max[d] - avg.max[d]);
1486  }
1487  stddev.min[d] = sqrt(stddev.min[d] / notnull_cnt);
1488  stddev.max[d] = sqrt(stddev.max[d] / notnull_cnt);
1489 
1490  /* Histogram bounds for this dimension bounds is avg +/- SDFACTOR * stdev */
1491  histo_extent.min[d] = Max(avg.min[d] - SDFACTOR * stddev.min[d], sample_extent.min[d]);
1492  histo_extent.max[d] = Min(avg.max[d] + SDFACTOR * stddev.max[d], sample_extent.max[d]);
1493  }
1494 
1495  /*
1496  * Third scan:
1497  * o skip hard deviants
1498  * o compute new histogram box
1499  */
1500  nd_box_init_bounds(&histo_extent_new);
1501  for ( i = 0; i < notnull_cnt; i++ )
1502  {
1503  const ND_BOX *ndb = sample_boxes[i];
1504  /* Skip any hard deviants (boxes entirely outside our histo_extent */
1505  if ( ! nd_box_intersects(&histo_extent, ndb, ndims) )
1506  {
1507  POSTGIS_DEBUGF(4, " feature %d is a hard deviant, skipped", i);
1508  sample_boxes[i] = NULL;
1509  continue;
1510  }
1511  /* Expand our new box to fit all the other features. */
1512  nd_box_merge(ndb, &histo_extent_new);
1513  }
1514  /*
1515  * Expand the box slightly (1%) to avoid edge effects
1516  * with objects that are on the boundary
1517  */
1518  nd_box_expand(&histo_extent_new, 0.01);
1519  histo_extent = histo_extent_new;
1520 
1521  /*
1522  * How should we allocate our histogram cells to the
1523  * different dimensions? We can't do it by raw dimensional width,
1524  * because in x/y/z space, the z can have different units
1525  * from the x/y. Similarly for x/y/t space.
1526  * So, we instead calculate how much features overlap
1527  * each other in their dimension to figure out which
1528  * dimensions have useful selectivity characteristics (more
1529  * variability in density) and therefor would find
1530  * more cells useful (to distinguish between dense places and
1531  * homogeneous places).
1532  */
1533  nd_box_array_distribution(sample_boxes, notnull_cnt, &histo_extent, ndims,
1534  sample_distribution);
1535 
1536  /*
1537  * The sample_distribution array now tells us how spread out the
1538  * data is in each dimension, so we use that data to allocate
1539  * the histogram cells we have available.
1540  * At this point, histo_cells_target is the approximate target number
1541  * of cells.
1542  */
1543 
1544  /*
1545  * Some dimensions have basically a uniform distribution, we want
1546  * to allocate no cells to those dimensions, only to dimensions
1547  * that have some interesting differences in data distribution.
1548  * Here we count up the number of interesting dimensions
1549  */
1550  for ( d = 0; d < ndims; d++ )
1551  {
1552  if ( sample_distribution[d] > 0 )
1553  histo_ndims++;
1554  }
1555 
1556  if ( histo_ndims == 0 )
1557  {
1558  /* Special case: all our dimensions had low variability! */
1559  /* We just divide the cells up evenly */
1560  POSTGIS_DEBUG(3, " special case: no axes have variability");
1561  histo_cells_new = 1;
1562  for ( d = 0; d < ndims; d++ )
1563  {
1564  histo_size[d] = 1 + (int)pow((double)histo_cells_target, 1/(double)ndims);
1565  POSTGIS_DEBUGF(3, " histo_size[d]: %d", histo_size[d]);
1566  histo_cells_new *= histo_size[d];
1567  }
1568  POSTGIS_DEBUGF(3, " histo_cells_new: %d", histo_cells_new);
1569  }
1570  else
1571  {
1572  /*
1573  * We're going to express the amount of variability in each dimension
1574  * as a proportion of the total variability and allocate cells in that
1575  * dimension relative to that proportion.
1576  */
1577  POSTGIS_DEBUG(3, " allocating histogram axes based on axis variability");
1578  total_distribution = total_double(sample_distribution, ndims); /* First get the total */
1579  POSTGIS_DEBUGF(3, " total_distribution: %.8g", total_distribution);
1580  histo_cells_new = 1; /* For the number of cells in the final histogram */
1581  for ( d = 0; d < ndims; d++ )
1582  {
1583  if ( sample_distribution[d] == 0 ) /* Uninteresting dimensions don't get any room */
1584  {
1585  histo_size[d] = 1;
1586  }
1587  else /* Interesting dimension */
1588  {
1589  /* How does this dims variability compare to the total? */
1590  float edge_ratio = (float)sample_distribution[d] / (float)total_distribution;
1591  /*
1592  * Scale the target cells number by the # of dims and ratio,
1593  * then take the appropriate root to get the estimated number of cells
1594  * on this axis (eg, pow(0.5) for 2d, pow(0.333) for 3d, pow(0.25) for 4d)
1595  */
1596  histo_size[d] = (int)pow(histo_cells_target * histo_ndims * edge_ratio, 1/(double)histo_ndims);
1597  /* If something goes awry, just give this dim one slot */
1598  if ( ! histo_size[d] )
1599  histo_size[d] = 1;
1600  }
1601  histo_cells_new *= histo_size[d];
1602  }
1603  POSTGIS_DEBUGF(3, " histo_cells_new: %d", histo_cells_new);
1604  }
1605 
1606  /* Update histo_cells to the actual number of cells we need to allocate */
1607  histo_cells = histo_cells_new;
1608  POSTGIS_DEBUGF(3, " histo_cells: %d", histo_cells);
1609 
1610  /*
1611  * Create the histogram (ND_STATS) in the stats memory context
1612  */
1613  old_context = MemoryContextSwitchTo(stats->anl_context);
1614  nd_stats_size = sizeof(ND_STATS) + ((histo_cells - 1) * sizeof(float4));
1615  nd_stats = palloc(nd_stats_size);
1616  memset(nd_stats, 0, nd_stats_size); /* Initialize all values to 0 */
1617  MemoryContextSwitchTo(old_context);
1618 
1619  /* Initialize the #ND_STATS objects */
1620  nd_stats->ndims = ndims;
1621  nd_stats->extent = histo_extent;
1622  nd_stats->sample_features = sample_rows;
1623  nd_stats->table_features = total_rows;
1624  nd_stats->not_null_features = notnull_cnt;
1625  /* Copy in the histogram dimensions */
1626  for ( d = 0; d < ndims; d++ )
1627  nd_stats->size[d] = histo_size[d];
1628 
1629  /*
1630  * Fourth scan:
1631  * o fill histogram values with the proportion of
1632  * features' bbox overlaps: a feature's bvol
1633  * can fully overlap (1) or partially overlap
1634  * (fraction of 1) an histogram cell.
1635  *
1636  * Note that we are filling each cell with the "portion of
1637  * the feature's box that overlaps the cell". So, if we sum
1638  * up the values in the histogram, we could get the
1639  * histogram feature count.
1640  *
1641  */
1642  for ( i = 0; i < notnull_cnt; i++ )
1643  {
1644  const ND_BOX *nd_box;
1645  ND_IBOX nd_ibox;
1646  int at[ND_DIMS];
1647  int d;
1648  double num_cells = 0;
1649  double tmp_volume = 1.0;
1650  double min[ND_DIMS];
1651  double max[ND_DIMS];
1652  double cellsize[ND_DIMS];
1653 
1654  nd_box = sample_boxes[i];
1655  if ( ! nd_box ) continue; /* Skip Null'ed out hard deviants */
1656 
1657  /* Give backend a chance of interrupting us */
1658  vacuum_delay_point();
1659 
1660  /* Find the cells that overlap with this box and put them into the ND_IBOX */
1661  nd_box_overlap(nd_stats, nd_box, &nd_ibox);
1662  memset(at, 0, sizeof(int)*ND_DIMS);
1663 
1664  POSTGIS_DEBUGF(3, " feature %d: ibox (%d, %d, %d, %d) (%d, %d, %d, %d)", i,
1665  nd_ibox.min[0], nd_ibox.min[1], nd_ibox.min[2], nd_ibox.min[3],
1666  nd_ibox.max[0], nd_ibox.max[1], nd_ibox.max[2], nd_ibox.max[3]);
1667 
1668  for ( d = 0; d < nd_stats->ndims; d++ )
1669  {
1670  /* Initialize the starting values */
1671  at[d] = nd_ibox.min[d];
1672  min[d] = nd_stats->extent.min[d];
1673  max[d] = nd_stats->extent.max[d];
1674  cellsize[d] = (max[d] - min[d])/(nd_stats->size[d]);
1675 
1676  /* What's the volume (area) of this feature's box? */
1677  tmp_volume *= (nd_box->max[d] - nd_box->min[d]);
1678  }
1679 
1680  /* Add feature volume (area) to our total */
1681  total_sample_volume += tmp_volume;
1682 
1683  /*
1684  * Move through all the overlaped histogram cells values and
1685  * add the box overlap proportion to them.
1686  */
1687  do
1688  {
1689  ND_BOX nd_cell;
1690  double ratio;
1691  /* Create a box for this histogram cell */
1692  for ( d = 0; d < nd_stats->ndims; d++ )
1693  {
1694  nd_cell.min[d] = min[d] + (at[d]+0) * cellsize[d];
1695  nd_cell.max[d] = min[d] + (at[d]+1) * cellsize[d];
1696  }
1697 
1698  /*
1699  * If a feature box is completely inside one cell the ratio will be
1700  * 1.0. If a feature box is 50% in two cells, each cell will get
1701  * 0.5 added on.
1702  */
1703  ratio = nd_box_ratio(&nd_cell, nd_box, nd_stats->ndims);
1704  nd_stats->value[nd_stats_value_index(nd_stats, at)] += ratio;
1705  num_cells += ratio;
1706  POSTGIS_DEBUGF(3, " ratio (%.8g) num_cells (%.8g)", ratio, num_cells);
1707  POSTGIS_DEBUGF(3, " at (%d, %d, %d, %d)", at[0], at[1], at[2], at[3]);
1708  }
1709  while ( nd_increment(&nd_ibox, nd_stats->ndims, at) );
1710 
1711  /* Keep track of overall number of overlaps counted */
1712  total_cell_count += num_cells;
1713  /* How many features have we added to this histogram? */
1714  histogram_features++;
1715  }
1716 
1717  POSTGIS_DEBUGF(3, " histogram_features: %d", histogram_features);
1718  POSTGIS_DEBUGF(3, " sample_rows: %d", sample_rows);
1719  POSTGIS_DEBUGF(3, " table_rows: %.6g", total_rows);
1720 
1721  /* Error out if we got no sample information */
1722  if ( ! histogram_features )
1723  {
1724  POSTGIS_DEBUG(3, " no stats have been gathered");
1725  elog(NOTICE, " no features lie in the stats histogram, invalid stats");
1726  stats->stats_valid = false;
1727  return;
1728  }
1729 
1730  nd_stats->histogram_features = histogram_features;
1731  nd_stats->histogram_cells = histo_cells;
1732  nd_stats->cells_covered = total_cell_count;
1733 
1734  /* Put this histogram data into the right slot/kind */
1735  if ( mode == 2 )
1736  {
1737  stats_slot = STATISTIC_SLOT_2D;
1738  stats_kind = STATISTIC_KIND_2D;
1739  }
1740  else
1741  {
1742  stats_slot = STATISTIC_SLOT_ND;
1743  stats_kind = STATISTIC_KIND_ND;
1744  }
1745 
1746  /* Write the statistics data */
1747  stats->stakind[stats_slot] = stats_kind;
1748  stats->staop[stats_slot] = InvalidOid;
1749  stats->stanumbers[stats_slot] = (float4*)nd_stats;
1750  stats->numnumbers[stats_slot] = nd_stats_size/sizeof(float4);
1751  stats->stanullfrac = (float4)null_cnt/sample_rows;
1752  stats->stawidth = total_width/notnull_cnt;
1753  stats->stadistinct = -1.0;
1754  stats->stats_valid = true;
1755 
1756  POSTGIS_DEBUGF(3, " out: slot 0: kind %d (STATISTIC_KIND_ND)", stats->stakind[0]);
1757  POSTGIS_DEBUGF(3, " out: slot 0: op %d (InvalidOid)", stats->staop[0]);
1758  POSTGIS_DEBUGF(3, " out: slot 0: numnumbers %d", stats->numnumbers[0]);
1759  POSTGIS_DEBUGF(3, " out: null fraction: %f=%d/%d", stats->stanullfrac, null_cnt, sample_rows);
1760  POSTGIS_DEBUGF(3, " out: average width: %d bytes", stats->stawidth);
1761  POSTGIS_DEBUG (3, " out: distinct values: all (no check done)");
1762  POSTGIS_DEBUGF(3, " out: %s", nd_stats_to_json(nd_stats));
1763  /*
1764  POSTGIS_DEBUGF(3, " out histogram:\n%s", nd_stats_to_grid(nd_stats));
1765  */
1766 
1767  return;
1768 }
1769 
1770 
1788 static void
1789 compute_gserialized_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
1790  int sample_rows, double total_rows)
1791 {
1792  /* 2D Mode */
1793  compute_gserialized_stats_mode(stats, fetchfunc, sample_rows, total_rows, 2);
1794  /* ND Mode */
1795  compute_gserialized_stats_mode(stats, fetchfunc, sample_rows, total_rows, 0);
1796 }
1797 
1798 
1827 Datum gserialized_analyze_nd(PG_FUNCTION_ARGS)
1828 {
1829  VacAttrStats *stats = (VacAttrStats *)PG_GETARG_POINTER(0);
1830  Form_pg_attribute attr = stats->attr;
1831 
1832  POSTGIS_DEBUG(2, "gserialized_analyze_nd called");
1833 
1834  /* If the attstattarget column is negative, use the default value */
1835  /* NB: it is okay to scribble on stats->attr since it's a copy */
1836  if (attr->attstattarget < 0)
1837  attr->attstattarget = default_statistics_target;
1838 
1839  POSTGIS_DEBUGF(3, " attribute stat target: %d", attr->attstattarget);
1840 
1841  /* Setup the minimum rows and the algorithm function */
1842  stats->minrows = 300 * stats->attr->attstattarget;
1843  stats->compute_stats = compute_gserialized_stats;
1844 
1845  POSTGIS_DEBUGF(3, " minrows: %d", stats->minrows);
1846 
1847  /* Indicate we are done successfully */
1848  PG_RETURN_BOOL(true);
1849 }
1850 
1863 static float8
1864 estimate_selectivity(const GBOX *box, const ND_STATS *nd_stats, int mode)
1865 {
1866  int d; /* counter */
1867  float8 selectivity;
1868  ND_BOX nd_box;
1869  ND_IBOX nd_ibox;
1870  int at[ND_DIMS];
1871  double cell_size[ND_DIMS];
1872  double min[ND_DIMS];
1873  double max[ND_DIMS];
1874  double total_count = 0.0;
1875  int ndims_max = Max(nd_stats->ndims, gbox_ndims(box));
1876 // int ndims_min = Min(nd_stats->ndims, gbox_ndims(box));
1877 
1878  /* Calculate the overlap of the box on the histogram */
1879  if ( ! nd_stats )
1880  {
1881  elog(NOTICE, " estimate_selectivity called with null input");
1882  return FALLBACK_ND_SEL;
1883  }
1884 
1885  /* Initialize nd_box. */
1886  nd_box_from_gbox(box, &nd_box);
1887 
1888  /*
1889  * To return 2D stats on an ND sample, we need to make the
1890  * 2D box cover the full range of the other dimensions in the
1891  * histogram.
1892  */
1893  POSTGIS_DEBUGF(3, " mode: %d", mode);
1894  if ( mode == 2 )
1895  {
1896  POSTGIS_DEBUG(3, " in 2d mode, stripping the computation down to 2d");
1897  ndims_max = 2;
1898  }
1899 
1900  POSTGIS_DEBUGF(3, " nd_stats->extent: %s", nd_box_to_json(&(nd_stats->extent), nd_stats->ndims));
1901  POSTGIS_DEBUGF(3, " nd_box: %s", nd_box_to_json(&(nd_box), gbox_ndims(box)));
1902 
1903  /*
1904  * Search box completely misses histogram extent?
1905  * We have to intersect in all N dimensions or else we have
1906  * zero interaction under the &&& operator. It's important
1907  * to short circuit in this case, as some of the tests below
1908  * will return junk results when run on non-intersecting inputs.
1909  */
1910  if ( ! nd_box_intersects(&nd_box, &(nd_stats->extent), ndims_max) )
1911  {
1912  POSTGIS_DEBUG(3, " search box does not overlap histogram, returning 0");
1913  return 0.0;
1914  }
1915 
1916  /* Search box completely contains histogram extent! */
1917  if ( nd_box_contains(&nd_box, &(nd_stats->extent), ndims_max) )
1918  {
1919  POSTGIS_DEBUG(3, " search box contains histogram, returning 1");
1920  return 1.0;
1921  }
1922 
1923  /* Calculate the overlap of the box on the histogram */
1924  if ( ! nd_box_overlap(nd_stats, &nd_box, &nd_ibox) )
1925  {
1926  POSTGIS_DEBUG(3, " search box overlap with stats histogram failed");
1927  return FALLBACK_ND_SEL;
1928  }
1929 
1930  /* Work out some measurements of the histogram */
1931  for ( d = 0; d < nd_stats->ndims; d++ )
1932  {
1933  /* Cell size in each dim */
1934  min[d] = nd_stats->extent.min[d];
1935  max[d] = nd_stats->extent.max[d];
1936  cell_size[d] = (max[d] - min[d]) / nd_stats->size[d];
1937  POSTGIS_DEBUGF(3, " cell_size[%d] : %.9g", d, cell_size[d]);
1938 
1939  /* Initialize the counter */
1940  at[d] = nd_ibox.min[d];
1941  }
1942 
1943  /* Move through all the overlap values and sum them */
1944  do
1945  {
1946  float cell_count, ratio;
1947  ND_BOX nd_cell;
1948 
1949  /* We have to pro-rate partially overlapped cells. */
1950  for ( d = 0; d < nd_stats->ndims; d++ )
1951  {
1952  nd_cell.min[d] = min[d] + (at[d]+0) * cell_size[d];
1953  nd_cell.max[d] = min[d] + (at[d]+1) * cell_size[d];
1954  }
1955 
1956  ratio = nd_box_ratio(&nd_box, &nd_cell, nd_stats->ndims);
1957  cell_count = nd_stats->value[nd_stats_value_index(nd_stats, at)];
1958 
1959  /* Add the pro-rated count for this cell to the overall total */
1960  total_count += cell_count * ratio;
1961  POSTGIS_DEBUGF(4, " cell (%d,%d), cell value %.6f, ratio %.6f", at[0], at[1], cell_count, ratio);
1962  }
1963  while ( nd_increment(&nd_ibox, nd_stats->ndims, at) );
1964 
1965  /* Scale by the number of features in our histogram to get the proportion */
1966  selectivity = total_count / nd_stats->histogram_features;
1967 
1968  POSTGIS_DEBUGF(3, " nd_stats->histogram_features = %f", nd_stats->histogram_features);
1969  POSTGIS_DEBUGF(3, " nd_stats->histogram_cells = %f", nd_stats->histogram_cells);
1970  POSTGIS_DEBUGF(3, " sum(overlapped histogram cells) = %f", total_count);
1971  POSTGIS_DEBUGF(3, " selectivity = %f", selectivity);
1972 
1973  /* Prevent rounding overflows */
1974  if (selectivity > 1.0) selectivity = 1.0;
1975  else if (selectivity < 0.0) selectivity = 0.0;
1976 
1977  return selectivity;
1978 }
1979 
1980 
1981 
1987 Datum _postgis_gserialized_stats(PG_FUNCTION_ARGS)
1988 {
1989  Oid table_oid = PG_GETARG_OID(0);
1990  text *att_text = PG_GETARG_TEXT_P(1);
1991  ND_STATS *nd_stats;
1992  char *str;
1993  text *json;
1994  int mode = 2; /* default to 2D mode */
1995  bool only_parent = FALSE; /* default to whole tree stats */
1996 
1997  /* Check if we've been asked to not use 2d mode */
1998  if ( ! PG_ARGISNULL(2) )
1999  mode = text_p_get_mode(PG_GETARG_TEXT_P(2));
2000 
2001  /* Check if we've been asked to only use stats from parent */
2002  if ( ! PG_ARGISNULL(3) )
2003  only_parent = PG_GETARG_BOOL(3);
2004 
2005  /* Retrieve the stats object */
2006  nd_stats = pg_get_nd_stats_by_name(table_oid, att_text, mode, only_parent);
2007  if ( ! nd_stats )
2008  elog(ERROR, "stats for \"%s.%s\" do not exist", get_rel_name(table_oid), text2cstring(att_text));
2009 
2010  /* Convert to JSON */
2011  str = nd_stats_to_json(nd_stats);
2012  json = cstring2text(str);
2013  pfree(str);
2014  pfree(nd_stats);
2015  PG_RETURN_TEXT_P(json);
2016 }
2017 
2018 
2024 Datum _postgis_gserialized_sel(PG_FUNCTION_ARGS)
2025 {
2026  Oid table_oid = PG_GETARG_OID(0);
2027  text *att_text = PG_GETARG_TEXT_P(1);
2028  Datum geom_datum = PG_GETARG_DATUM(2);
2029  GBOX gbox; /* search box read from gserialized datum */
2030  float8 selectivity = 0;
2031  ND_STATS *nd_stats;
2032  int mode = 2; /* 2D mode by default */
2033 
2034  /* Check if we've been asked to not use 2d mode */
2035  if ( ! PG_ARGISNULL(3) )
2036  mode = text_p_get_mode(PG_GETARG_TEXT_P(3));
2037 
2038  /* Retrieve the stats object */
2039  nd_stats = pg_get_nd_stats_by_name(table_oid, att_text, mode, FALSE);
2040 
2041  if ( ! nd_stats )
2042  elog(ERROR, "stats for \"%s.%s\" do not exist", get_rel_name(table_oid), text2cstring(att_text));
2043 
2044  /* Calculate the gbox */
2045  if ( ! gserialized_datum_get_gbox_p(geom_datum, &gbox) )
2046  elog(ERROR, "unable to calculate bounding box from geometry");
2047 
2048  POSTGIS_DEBUGF(3, " %s", gbox_to_string(&gbox));
2049 
2050  /* Do the estimation */
2051  selectivity = estimate_selectivity(&gbox, nd_stats, mode);
2052 
2053  pfree(nd_stats);
2054  PG_RETURN_FLOAT8(selectivity);
2055 }
2056 
2057 
2063 Datum _postgis_gserialized_joinsel(PG_FUNCTION_ARGS)
2064 {
2065  Oid table_oid1 = PG_GETARG_OID(0);
2066  text *att_text1 = PG_GETARG_TEXT_P(1);
2067  Oid table_oid2 = PG_GETARG_OID(2);
2068  text *att_text2 = PG_GETARG_TEXT_P(3);
2069  ND_STATS *nd_stats1, *nd_stats2;
2070  float8 selectivity = 0;
2071  int mode = 2; /* 2D mode by default */
2072 
2073 
2074  /* Retrieve the stats object */
2075  nd_stats1 = pg_get_nd_stats_by_name(table_oid1, att_text1, mode, FALSE);
2076  nd_stats2 = pg_get_nd_stats_by_name(table_oid2, att_text2, mode, FALSE);
2077 
2078  if ( ! nd_stats1 )
2079  elog(ERROR, "stats for \"%s.%s\" do not exist", get_rel_name(table_oid1), text2cstring(att_text1));
2080 
2081  if ( ! nd_stats2 )
2082  elog(ERROR, "stats for \"%s.%s\" do not exist", get_rel_name(table_oid2), text2cstring(att_text2));
2083 
2084  /* Check if we've been asked to not use 2d mode */
2085  if ( ! PG_ARGISNULL(4) )
2086  {
2087  text *modetxt = PG_GETARG_TEXT_P(4);
2088  char *modestr = text2cstring(modetxt);
2089  if ( modestr[0] == 'N' )
2090  mode = 0;
2091  }
2092 
2093  /* Do the estimation */
2094  selectivity = estimate_join_selectivity(nd_stats1, nd_stats2);
2095 
2096  pfree(nd_stats1);
2097  pfree(nd_stats2);
2098  PG_RETURN_FLOAT8(selectivity);
2099 }
2100 
2106 Datum gserialized_gist_sel_2d(PG_FUNCTION_ARGS)
2107 {
2108  PG_RETURN_DATUM(DirectFunctionCall5(
2110  PG_GETARG_DATUM(0), PG_GETARG_DATUM(1),
2111  PG_GETARG_DATUM(2), PG_GETARG_DATUM(3),
2112  Int32GetDatum(2) /* 2-D mode */
2113  ));
2114 }
2115 
2121 Datum gserialized_gist_sel_nd(PG_FUNCTION_ARGS)
2122 {
2123  PG_RETURN_DATUM(DirectFunctionCall5(
2125  PG_GETARG_DATUM(0), PG_GETARG_DATUM(1),
2126  PG_GETARG_DATUM(2), PG_GETARG_DATUM(3),
2127  Int32GetDatum(0) /* N-D mode */
2128  ));
2129 }
2130 
2145 Datum gserialized_gist_sel(PG_FUNCTION_ARGS)
2146 {
2147  PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
2148  /* Oid operator_oid = PG_GETARG_OID(1); */
2149  List *args = (List *) PG_GETARG_POINTER(2);
2150  /* int varRelid = PG_GETARG_INT32(3); */
2151  int mode = PG_GETARG_INT32(4);
2152 
2153  VariableStatData vardata;
2154  ND_STATS *nd_stats = NULL;
2155 
2156  Node *other;
2157  Var *self;
2158  GBOX search_box;
2159  float8 selectivity = 0;
2160 
2161  POSTGIS_DEBUG(2, "gserialized_gist_sel called");
2162 
2163  /*
2164  * TODO: This is a big one,
2165  * All this statistics code *only* tries to generate a valid
2166  * selectivity for && and &&&. That leaves all the other
2167  * geometry operators with bad stats! The selectivity
2168  * calculation should take account of the incoming operator
2169  * type and do the right thing.
2170  */
2171 
2172  /* Fail if not a binary opclause (probably shouldn't happen) */
2173  if (list_length(args) != 2)
2174  {
2175  POSTGIS_DEBUG(3, "gserialized_gist_sel: not a binary opclause");
2176  PG_RETURN_FLOAT8(DEFAULT_ND_SEL);
2177  }
2178 
2179  /* Find the constant part */
2180  other = (Node *) linitial(args);
2181  if ( ! IsA(other, Const) )
2182  {
2183  self = (Var *)other;
2184  other = (Node *) lsecond(args);
2185  }
2186  else
2187  {
2188  self = (Var *) lsecond(args);
2189  }
2190 
2191  if ( ! IsA(other, Const) )
2192  {
2193  POSTGIS_DEBUG(3, " no constant arguments - returning a default selectivity");
2194  PG_RETURN_FLOAT8(DEFAULT_ND_SEL);
2195  }
2196 
2197  /* Convert the constant to a BOX */
2198  if( ! gserialized_datum_get_gbox_p(((Const*)other)->constvalue, &search_box) )
2199  {
2200  POSTGIS_DEBUG(3, "search box is EMPTY");
2201  PG_RETURN_FLOAT8(0.0);
2202  }
2203  POSTGIS_DEBUGF(4, " requested search box is: %s", gbox_to_string(&search_box));
2204 
2205  /* Get pg_statistic row */
2206  examine_variable(root, (Node*)self, 0, &vardata);
2207  if ( vardata.statsTuple ) {
2208  nd_stats = pg_nd_stats_from_tuple(vardata.statsTuple, mode);
2209  }
2210  ReleaseVariableStats(vardata);
2211 
2212  if ( ! nd_stats )
2213  {
2214  POSTGIS_DEBUG(3, " unable to load stats from syscache, not analyzed yet?");
2215  PG_RETURN_FLOAT8(FALLBACK_ND_SEL);
2216  }
2217 
2218  POSTGIS_DEBUGF(4, " got stats:\n%s", nd_stats_to_json(nd_stats));
2219 
2220  /* Do the estimation! */
2221  selectivity = estimate_selectivity(&search_box, nd_stats, mode);
2222  POSTGIS_DEBUGF(3, " returning computed value: %f", selectivity);
2223 
2224  pfree(nd_stats);
2225  PG_RETURN_FLOAT8(selectivity);
2226 }
2227 
2228 
2229 
2236 Datum gserialized_estimated_extent(PG_FUNCTION_ARGS)
2237 {
2238  char *nsp = NULL;
2239  char *tbl = NULL;
2240  text *col = NULL;
2241  char *nsp_tbl = NULL;
2242  Oid tbl_oid;
2243  ND_STATS *nd_stats;
2244  GBOX *gbox;
2245  bool only_parent = FALSE;
2246 
2247  if ( PG_NARGS() == 4 )
2248  {
2249  nsp = text2cstring(PG_GETARG_TEXT_P(0));
2250  tbl = text2cstring(PG_GETARG_TEXT_P(1));
2251  col = PG_GETARG_TEXT_P(2);
2252  only_parent = PG_GETARG_BOOL(3);
2253  nsp_tbl = palloc(strlen(nsp) + strlen(tbl) + 6);
2254  sprintf(nsp_tbl, "\"%s\".\"%s\"", nsp, tbl);
2255  tbl_oid = DatumGetObjectId(DirectFunctionCall1(regclassin, CStringGetDatum(nsp_tbl)));
2256  pfree(nsp_tbl);
2257  }
2258  else if ( PG_NARGS() == 3 )
2259  {
2260  nsp = text2cstring(PG_GETARG_TEXT_P(0));
2261  tbl = text2cstring(PG_GETARG_TEXT_P(1));
2262  col = PG_GETARG_TEXT_P(2);
2263  nsp_tbl = palloc(strlen(nsp) + strlen(tbl) + 6);
2264  sprintf(nsp_tbl, "\"%s\".\"%s\"", nsp, tbl);
2265  tbl_oid = DatumGetObjectId(DirectFunctionCall1(regclassin, CStringGetDatum(nsp_tbl)));
2266  pfree(nsp_tbl);
2267  }
2268  else if ( PG_NARGS() == 2 )
2269  {
2270  tbl = text2cstring(PG_GETARG_TEXT_P(0));
2271  col = PG_GETARG_TEXT_P(1);
2272  nsp_tbl = palloc(strlen(tbl) + 3);
2273  sprintf(nsp_tbl, "\"%s\"", tbl);
2274  tbl_oid = DatumGetObjectId(DirectFunctionCall1(regclassin, CStringGetDatum(nsp_tbl)));
2275  pfree(nsp_tbl);
2276  }
2277  else
2278  {
2279  elog(ERROR, "estimated_extent() called with wrong number of arguments");
2280  PG_RETURN_NULL();
2281  }
2282 
2283  /* Estimated extent only returns 2D bounds, so use mode 2 */
2284  nd_stats = pg_get_nd_stats_by_name(tbl_oid, col, 2, only_parent);
2285 
2286  /* Error out on no stats */
2287  if ( ! nd_stats ) {
2288  elog(WARNING, "stats for \"%s.%s\" do not exist", tbl, text2cstring(col));
2289  PG_RETURN_NULL();
2290  }
2291 
2292  /* Construct the box */
2293  gbox = palloc(sizeof(GBOX));
2294  FLAGS_SET_GEODETIC(gbox->flags, 0);
2295  FLAGS_SET_Z(gbox->flags, 0);
2296  FLAGS_SET_M(gbox->flags, 0);
2297  gbox->xmin = nd_stats->extent.min[0];
2298  gbox->xmax = nd_stats->extent.max[0];
2299  gbox->ymin = nd_stats->extent.min[1];
2300  gbox->ymax = nd_stats->extent.max[1];
2301 
2302  pfree(nd_stats);
2303  PG_RETURN_POINTER(gbox);
2304 }
2305 
2313 Datum geometry_estimated_extent(PG_FUNCTION_ARGS)
2314 {
2315  if ( PG_NARGS() == 3 )
2316  {
2317  PG_RETURN_DATUM(
2318  DirectFunctionCall3(gserialized_estimated_extent,
2319  PG_GETARG_DATUM(0),
2320  PG_GETARG_DATUM(1),
2321  PG_GETARG_DATUM(2)));
2322  }
2323  else if ( PG_NARGS() == 2 )
2324  {
2325  PG_RETURN_DATUM(
2326  DirectFunctionCall2(gserialized_estimated_extent,
2327  PG_GETARG_DATUM(0),
2328  PG_GETARG_DATUM(1)));
2329  }
2330 
2331  elog(ERROR, "geometry_estimated_extent() called with wrong number of arguments");
2332  PG_RETURN_NULL();
2333 }
args
Definition: ovdump.py:44
int gserialized_get_gbox_p(const GSERIALIZED *g, GBOX *box)
Read the bounding box off a serialization and calculate one if it is not already there.
Definition: g_serialized.c:398
static int nd_increment(ND_IBOX *ibox, int ndims, int *counter)
Given an n-d index array (counter), and a domain to increment it in (ibox) increment it by one...
static ND_STATS * pg_get_nd_stats_by_name(const Oid table_oid, const text *att_text, int mode, bool only_parent)
Pull the stats object from the PgSQL system catalogs.
static int text_p_get_mode(const text *txt)
Utility function to see if the first letter of the mode argument is &#39;N&#39;.
static int nd_box_overlap(const ND_STATS *nd_stats, const ND_BOX *nd_box, ND_IBOX *nd_ibox)
What stats cells overlap with this ND_BOX? Put the lowest cell addresses in ND_IBOX->min and the high...
PG_FUNCTION_INFO_V1(gserialized_gist_joinsel_nd)
For (geometry &&& geometry) and (geography && geography) we call into the N-D mode.
#define DEFAULT_ND_JOINSEL
static void compute_gserialized_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc, int sample_rows, double total_rows)
In order to do useful selectivity calculations in both 2-D and N-D modes, we actually have to generat...
stringbuffer_t * stringbuffer_create(void)
Allocate a new stringbuffer_t.
Definition: stringbuffer.c:35
int gbox_is_valid(const GBOX *gbox)
Return false if any of the dimensions is NaN or infinite.
Definition: g_box.c:209
#define MIN_DIMENSION_WIDTH
Minimum width of a dimension that we&#39;ll bother trying to compute statistics on.
char * gbox_to_string(const GBOX *gbox)
Allocate a string representation of the GBOX, based on dimensionality of flags.
Definition: g_box.c:404
#define FLAGS_GET_GEODETIC(flags)
Definition: liblwgeom.h:142
char * stringbuffer_getstringcopy(stringbuffer_t *s)
Returns a newly allocated string large enough to contain the current state of the string...
Definition: stringbuffer.c:160
double xmax
Definition: liblwgeom.h:292
#define ND_DIMS
The maximum number of dimensions our code can handle.
static void compute_gserialized_stats_mode(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc, int sample_rows, double total_rows, int mode)
The gserialized_analyze_nd sets this function as a callback on the stats object when called by the AN...
Datum geometry_estimated_extent(PG_FUNCTION_ARGS)
static int nd_box_contains(const ND_BOX *a, const ND_BOX *b, int ndims)
Return TRUE if ND_BOX a contains b, false otherwise.
Datum _postgis_gserialized_stats(PG_FUNCTION_ARGS)
#define STATISTIC_SLOT_2D
static void nd_box_from_gbox(const GBOX *gbox, ND_BOX *nd_box)
Set the values of an ND_BOX from a GBOX.
struct ND_BOX_T ND_BOX
N-dimensional box type for calculations, to avoid doing explicit axis conversions from GBOX in all ca...
#define FALLBACK_ND_JOINSEL
Datum gserialized_estimated_extent(PG_FUNCTION_ARGS)
#define FALLBACK_ND_SEL
More modest fallback selectivity factor.
#define FLAGS_SET_GEODETIC(flags, value)
Definition: liblwgeom.h:148
Datum _postgis_gserialized_joinsel(PG_FUNCTION_ARGS)
static float8 estimate_selectivity(const GBOX *box, const ND_STATS *nd_stats, int mode)
This function returns an estimate of the selectivity of a search GBOX by looking at data in the ND_ST...
#define LW_FAILURE
Definition: liblwgeom.h:78
int stringbuffer_aprintf(stringbuffer_t *s, const char *fmt,...)
Appends a formatted string to the current string buffer, using the format and argument list provided...
Definition: stringbuffer.c:253
static int nd_box_init(ND_BOX *a)
Zero out an ND_BOX.
Datum gserialized_gist_joinsel(PG_FUNCTION_ARGS)
#define FLAGS_SET_Z(flags, value)
Definition: liblwgeom.h:145
double zmax
Definition: liblwgeom.h:296
double ymin
Definition: liblwgeom.h:293
Datum gserialized_gist_sel_2d(PG_FUNCTION_ARGS)
Datum gserialized_gist_joinsel_2d(PG_FUNCTION_ARGS)
Datum gserialized_gist_sel(PG_FUNCTION_ARGS)
static int nd_box_array_distribution(const ND_BOX **nd_boxes, int num_boxes, const ND_BOX *extent, int ndims, double *distribution)
Calculate how much a set of boxes is homogenously distributed or contentrated within one dimension...
double xmin
Definition: liblwgeom.h:291
static int nd_stats_value_index(const ND_STATS *stats, int *indexes)
Given a position in the n-d histogram (i,j,k) return the position in the 1-d values array...
float4 size[ND_DIMS]
Datum gserialized_gist_joinsel_nd(PG_FUNCTION_ARGS)
static int nd_box_intersects(const ND_BOX *a, const ND_BOX *b, int ndims)
Return TRUE if ND_BOX a overlaps b, false otherwise.
static double total_double(const double *vals, int nvals)
Given double array, return sum of values.
int min[ND_DIMS]
static ND_STATS * pg_nd_stats_from_tuple(HeapTuple stats_tuple, int mode)
char * text2cstring(const text *textptr)
Datum gserialized_gist_sel_nd(PG_FUNCTION_ARGS)
static int cmp_int(const void *a, const void *b)
Integer comparison function for qsort.
static int nd_box_init_bounds(ND_BOX *a)
Prepare an ND_BOX for bounds calculation: set the maxes to the smallest thing possible and the mins t...
double ymax
Definition: liblwgeom.h:294
N-dimensional box index type.
#define FLAGS_GET_Z(flags)
Macros for manipulating the &#39;flags&#39; byte.
Definition: liblwgeom.h:139
static int range_quintile(int *vals, int nvals)
The difference between the fourth and first quintile values, the "inter-quintile range".
Datum _postgis_gserialized_sel(PG_FUNCTION_ARGS)
static float8 estimate_join_selectivity(const ND_STATS *s1, const ND_STATS *s2)
Given two statistics histograms, what is the selectivity of a join driven by the && or &&& operator...
uint8_t flags
Definition: liblwgeom.h:290
static char * nd_box_to_json(const ND_BOX *nd_box, int ndims)
Convert an ND_BOX to a JSON string for printing.
float4 max[ND_DIMS]
void stringbuffer_append(stringbuffer_t *s, const char *a)
Append the specified string to the stringbuffer_t.
Definition: stringbuffer.c:134
#define STATISTIC_KIND_2D
struct ND_IBOX_T ND_IBOX
N-dimensional box index type.
void stringbuffer_destroy(stringbuffer_t *s)
Free the stringbuffer_t and all memory managed within it.
Definition: stringbuffer.c:78
int max[ND_DIMS]
struct ND_STATS_T ND_STATS
N-dimensional statistics structure.
float4 min[ND_DIMS]
#define FALSE
Definition: dbfopen.c:168
static int nd_box_merge(const ND_BOX *source, ND_BOX *target)
Create a printable view of the ND_STATS histogram.
static int nd_box_expand(ND_BOX *nd_box, double expansion_factor)
Expand an ND_BOX ever so slightly.
double mmin
Definition: liblwgeom.h:297
#define SDFACTOR
double zmin
Definition: liblwgeom.h:295
#define FLAGS_GET_M(flags)
Definition: liblwgeom.h:140
static char * nd_stats_to_json(const ND_STATS *nd_stats)
Convert an ND_STATS to a JSON representation for external use.
Datum gserialized_analyze_nd(PG_FUNCTION_ARGS)
double mmax
Definition: liblwgeom.h:298
#define STATISTIC_SLOT_ND
static double nd_box_ratio(const ND_BOX *b1, const ND_BOX *b2, int ndims)
Returns the proportion of b2 that is covered by b1.
int value
Definition: genraster.py:61
N-dimensional statistics structure.
static ND_STATS * pg_get_nd_stats(const Oid table_oid, AttrNumber att_num, int mode, bool only_parent)
Pull the stats object from the PgSQL system catalogs.
#define TRUE
Definition: dbfopen.c:169
N-dimensional box type for calculations, to avoid doing explicit axis conversions from GBOX in all ca...
static int gbox_ndims(const GBOX *gbox)
Given that geodetic boxes are X/Y/Z regardless of the underlying geometry dimensionality and other bo...
This library is the generic geometry handling section of PostGIS.
#define DEFAULT_ND_SEL
Default geometry selectivity factor.
#define STATISTIC_KIND_ND
Assign a number to the n-dimensional statistics kind.
#define FLAGS_SET_M(flags, value)
Definition: liblwgeom.h:146