PostGIS  2.1.10dev-r@@SVN_REVISION@@
gserialized_estimate.c
Go to the documentation of this file.
1 /**********************************************************************
2  * PostGIS - Spatial Types for PostgreSQL
3  * http://postgis.net
4  *
5  * Copyright 2012 (C) Paul Ramsey <pramsey@cleverelephant.ca>
6  *
7  * This is free software; you can redistribute and/or modify it under
8  * the terms of the GNU General Public Licence. See the COPYING file.
9  *
10  **********************************************************************/
11 
12 
13 /**********************************************************************
14  THEORY OF OPERATION
15 
16 The ANALYZE command hooks to a callback (gserialized_analyze_nd) that
17 calculates (compute_gserialized_stats_mode) two histograms of occurances of
18 features, once for the 2D domain (and the && operator) one for the
19 ND domain (and the &&& operator).
20 
21 Queries in PostgreSQL call into the selectivity sub-system to find out
22 the relative effectiveness of different clauses in sub-setting
23 relations. Queries with constant arguments call gserialized_gist_sel,
24 queries with relations on both sides call gserialized_gist_joinsel.
25 
26 gserialized_gist_sel sums up the values in the histogram that overlap
27 the contant search box.
28 
29 gserialized_gist_joinsel sums up the product of the overlapping
30 cells in each relation's histogram.
31 
32 Depending on the operator and type, the mode of selectivity calculation
33 will be 2D or ND.
34 
35 - geometry && geometry ==> 2D
36 - geometry &&& geometry ==> ND
37 - geography && geography ==> ND
38 
39 The 2D mode is put in effect by retrieving the 2D histogram from the
40 statistics cache and then allowing the generic ND calculations to
41 go to work.
42 
43 TO DO: More testing and examination of the &&& operator and mixed
44 dimensionality cases. (2D geometry) &&& (3D column), etc.
45 
46 **********************************************************************/
47 
48 #include "postgres.h"
49 #include "executor/spi.h"
50 #include "fmgr.h"
51 #include "commands/vacuum.h"
52 #include "nodes/relation.h"
53 #include "parser/parsetree.h"
54 #include "utils/array.h"
55 #include "utils/lsyscache.h"
56 #include "utils/builtins.h"
57 #include "utils/syscache.h"
58 #include "utils/rel.h"
59 
60 #include "../postgis_config.h"
61 
62 #if POSTGIS_PGSQL_VERSION >= 93
63  #include "access/htup_details.h"
64 #endif
65 
66 #include "stringbuffer.h"
67 #include "liblwgeom.h"
68 #include "lwgeom_pg.h" /* For debugging macros. */
69 #include "gserialized_gist.h" /* For index common functions */
70 
71 #include <math.h>
72 #if HAVE_IEEEFP_H
73 #include <ieeefp.h>
74 #endif
75 #include <float.h>
76 #include <string.h>
77 #include <stdio.h>
78 #include <errno.h>
79 #include <ctype.h>
80 
81 /* Prototypes */
82 Datum gserialized_gist_joinsel(PG_FUNCTION_ARGS);
83 Datum gserialized_gist_joinsel_2d(PG_FUNCTION_ARGS);
84 Datum gserialized_gist_joinsel_nd(PG_FUNCTION_ARGS);
85 Datum gserialized_gist_sel(PG_FUNCTION_ARGS);
86 Datum gserialized_gist_sel_2d(PG_FUNCTION_ARGS);
87 Datum gserialized_gist_sel_nd(PG_FUNCTION_ARGS);
88 Datum gserialized_analyze_nd(PG_FUNCTION_ARGS);
89 Datum gserialized_estimated_extent(PG_FUNCTION_ARGS);
90 Datum _postgis_gserialized_sel(PG_FUNCTION_ARGS);
91 Datum _postgis_gserialized_joinsel(PG_FUNCTION_ARGS);
92 Datum _postgis_gserialized_stats(PG_FUNCTION_ARGS);
93 
94 /* Old Prototype */
95 Datum geometry_estimated_extent(PG_FUNCTION_ARGS);
96 
107 #define STATISTIC_KIND_ND 102
108 #define STATISTIC_KIND_2D 103
109 #define STATISTIC_SLOT_ND 0
110 #define STATISTIC_SLOT_2D 1
111 
112 /*
113 * The SD factor restricts the side of the statistics histogram
114 * based on the standard deviation of the extent of the data.
115 * SDFACTOR is the number of standard deviations from the mean
116 * the histogram will extend.
117 */
118 #define SDFACTOR 3.25
119 
125 #define ND_DIMS 4
126 
133 #define MIN_DIMENSION_WIDTH 0.000000001
134 
138 #define DEFAULT_ND_SEL 0.0001
139 #define DEFAULT_ND_JOINSEL 0.001
140 
144 #define FALLBACK_ND_SEL 0.2
145 #define FALLBACK_ND_JOINSEL 0.3
146 
152 typedef struct ND_BOX_T
153 {
154  float4 min[ND_DIMS];
155  float4 max[ND_DIMS];
156 } ND_BOX;
157 
161 typedef struct ND_IBOX_T
162 {
163  int min[ND_DIMS];
164  int max[ND_DIMS];
165 } ND_IBOX;
166 
167 
174 typedef struct ND_STATS_T
175 {
176  /* Dimensionality of the histogram. */
177  float4 ndims;
178 
179  /* Size of n-d histogram in each dimension. */
180  float4 size[ND_DIMS];
181 
182  /* Lower-left (min) and upper-right (max) spatial bounds of histogram. */
184 
185  /* How many rows in the table itself? */
187 
188  /* How many rows were in the sample that built this histogram? */
190 
191  /* How many not-Null/Empty features were in the sample? */
193 
194  /* How many features actually got sampled in the histogram? */
196 
197  /* How many cells in histogram? (sizex*sizey*sizez*sizem) */
199 
200  /* How many cells did those histogram features cover? */
201  /* Since we are pro-rating coverage, this number should */
202  /* now always equal histogram_features */
204 
205  /* Variable length # of floats for histogram */
206  float4 value[1];
207 } ND_STATS;
208 
209 
210 
211 
218 static int
219 gbox_ndims(const GBOX* gbox)
220 {
221  int dims = 2;
222  if ( FLAGS_GET_GEODETIC(gbox->flags) )
223  return 3;
224  if ( FLAGS_GET_Z(gbox->flags) )
225  dims++;
226  if ( FLAGS_GET_M(gbox->flags) )
227  dims++;
228  return dims;
229 }
230 
236 static int
237 text_p_get_mode(const text *txt)
238 {
239  int mode = 2;
240  char *modestr = text2cstring(txt);
241  if ( modestr[0] == 'N' )
242  mode = 0;
243  pfree(modestr);
244  return mode;
245 }
246 
247 
251 static int
252 cmp_int (const void *a, const void *b)
253 {
254  int ia = *((const int*)a);
255  int ib = *((const int*)b);
256 
257  if ( ia == ib )
258  return 0;
259  else if ( ia > ib )
260  return 1;
261  else
262  return -1;
263 }
264 
269 static int
270 range_quintile(int *vals, int nvals)
271 {
272  qsort(vals, nvals, sizeof(int), cmp_int);
273  return vals[4*nvals/5] - vals[nvals/5];
274 }
275 
279 static double
280 total_double(const double *vals, int nvals)
281 {
282  int i;
283  float total = 0;
284  /* Calculate total */
285  for ( i = 0; i < nvals; i++ )
286  total += vals[i];
287 
288  return total;
289 }
290 
291 #if POSTGIS_DEBUG_LEVEL >= 3
292 
296 static int
297 total_int(const int *vals, int nvals)
298 {
299  int i;
300  int total = 0;
301  /* Calculate total */
302  for ( i = 0; i < nvals; i++ )
303  total += vals[i];
304 
305  return total;
306 }
307 
311 static double
312 avg(const int *vals, int nvals)
313 {
314  int t = total_int(vals, nvals);
315  return (double)t / (double)nvals;
316 }
317 
321 static double
322 stddev(const int *vals, int nvals)
323 {
324  int i;
325  double sigma2 = 0;
326  double mean = avg(vals, nvals);
327 
328  /* Calculate sigma2 */
329  for ( i = 0; i < nvals; i++ )
330  {
331  double v = (double)(vals[i]);
332  sigma2 += (mean - v) * (mean - v);
333  }
334  return sqrt(sigma2 / nvals);
335 }
336 #endif /* POSTGIS_DEBUG_LEVEL >= 3 */
337 
342 static int
343 nd_stats_value_index(const ND_STATS *stats, int *indexes)
344 {
345  int d;
346  int accum = 1, vdx = 0;
347 
348  /* Calculate the index into the 1-d values array that the (i,j,k,l) */
349  /* n-d histogram coordinate implies. */
350  /* index = x + y * sizex + z * sizex * sizey + m * sizex * sizey * sizez */
351  for ( d = 0; d < (int)(stats->ndims); d++ )
352  {
353  int size = (int)(stats->size[d]);
354  if ( indexes[d] < 0 || indexes[d] >= size )
355  {
356  POSTGIS_DEBUGF(3, " bad index at (%d, %d)", indexes[0], indexes[1]);
357  return -1;
358  }
359  vdx += indexes[d] * accum;
360  accum *= size;
361  }
362  return vdx;
363 }
364 
368 static char*
369 nd_box_to_json(const ND_BOX *nd_box, int ndims)
370 {
371  char *rv;
372  int i;
374 
375  stringbuffer_append(sb, "{\"min\":[");
376  for ( i = 0; i < ndims; i++ )
377  {
378  if ( i ) stringbuffer_append(sb, ",");
379  stringbuffer_aprintf(sb, "%.6g", nd_box->min[i]);
380  }
381  stringbuffer_append(sb, "],\"max\":[");
382  for ( i = 0; i < ndims; i++ )
383  {
384  if ( i ) stringbuffer_append(sb, ",");
385  stringbuffer_aprintf(sb, "%.6g", nd_box->max[i]);
386  }
387  stringbuffer_append(sb, "]}");
388 
391  return rv;
392 }
393 
394 
399 static char*
400 nd_stats_to_json(const ND_STATS *nd_stats)
401 {
402  char *json_extent, *str;
403  int d;
405  int ndims = (int)roundf(nd_stats->ndims);
406 
407  stringbuffer_append(sb, "{");
408  stringbuffer_aprintf(sb, "\"ndims\":%d,", ndims);
409 
410  /* Size */
411  stringbuffer_append(sb, "\"size\":[");
412  for ( d = 0; d < ndims; d++ )
413  {
414  if ( d ) stringbuffer_append(sb, ",");
415  stringbuffer_aprintf(sb, "%d", (int)roundf(nd_stats->size[d]));
416  }
417  stringbuffer_append(sb, "],");
418 
419  /* Extent */
420  json_extent = nd_box_to_json(&(nd_stats->extent), ndims);
421  stringbuffer_aprintf(sb, "\"extent\":%s,", json_extent);
422  pfree(json_extent);
423 
424  stringbuffer_aprintf(sb, "\"table_features\":%d,", (int)roundf(nd_stats->table_features));
425  stringbuffer_aprintf(sb, "\"sample_features\":%d,", (int)roundf(nd_stats->sample_features));
426  stringbuffer_aprintf(sb, "\"not_null_features\":%d,", (int)roundf(nd_stats->not_null_features));
427  stringbuffer_aprintf(sb, "\"histogram_features\":%d,", (int)roundf(nd_stats->histogram_features));
428  stringbuffer_aprintf(sb, "\"histogram_cells\":%d,", (int)roundf(nd_stats->histogram_cells));
429  stringbuffer_aprintf(sb, "\"cells_covered\":%d", (int)roundf(nd_stats->cells_covered));
430  stringbuffer_append(sb, "}");
431 
432  str = stringbuffer_getstringcopy(sb);
434  return str;
435 }
436 
437 
443 // static char*
444 // nd_stats_to_grid(const ND_STATS *stats)
445 // {
446 // char *rv;
447 // int j, k;
448 // int sizex = (int)roundf(stats->size[0]);
449 // int sizey = (int)roundf(stats->size[1]);
450 // stringbuffer_t *sb = stringbuffer_create();
451 //
452 // for ( k = 0; k < sizey; k++ )
453 // {
454 // for ( j = 0; j < sizex; j++ )
455 // {
456 // stringbuffer_aprintf(sb, "%3d ", (int)roundf(stats->value[j + k*sizex]));
457 // }
458 // stringbuffer_append(sb, "\n");
459 // }
460 //
461 // rv = stringbuffer_getstringcopy(sb);
462 // stringbuffer_destroy(sb);
463 // return rv;
464 // }
465 
466 
468 static int
469 nd_box_merge(const ND_BOX *source, ND_BOX *target)
470 {
471  int d;
472  for ( d = 0; d < ND_DIMS; d++ )
473  {
474  target->min[d] = Min(target->min[d], source->min[d]);
475  target->max[d] = Max(target->max[d], source->max[d]);
476  }
477  return TRUE;
478 }
479 
481 static int
483 {
484  memset(a, 0, sizeof(ND_BOX));
485  return TRUE;
486 }
487 
493 static int
495 {
496  int d;
497  for ( d = 0; d < ND_DIMS; d++ )
498  {
499  a->min[d] = FLT_MAX;
500  a->max[d] = -1 * FLT_MAX;
501  }
502  return TRUE;
503 }
504 
506 static void
507 nd_box_from_gbox(const GBOX *gbox, ND_BOX *nd_box)
508 {
509  int d = 0;
510  POSTGIS_DEBUGF(3, " %s", gbox_to_string(gbox));
511 
512  nd_box_init(nd_box);
513  nd_box->min[d] = gbox->xmin;
514  nd_box->max[d] = gbox->xmax;
515  d++;
516  nd_box->min[d] = gbox->ymin;
517  nd_box->max[d] = gbox->ymax;
518  d++;
519  if ( FLAGS_GET_GEODETIC(gbox->flags) )
520  {
521  nd_box->min[d] = gbox->zmin;
522  nd_box->max[d] = gbox->zmax;
523  return;
524  }
525  if ( FLAGS_GET_Z(gbox->flags) )
526  {
527  nd_box->min[d] = gbox->zmin;
528  nd_box->max[d] = gbox->zmax;
529  d++;
530  }
531  if ( FLAGS_GET_M(gbox->flags) )
532  {
533  nd_box->min[d] = gbox->mmin;
534  nd_box->max[d] = gbox->mmax;
535  d++;
536  }
537  return;
538 }
539 
543 static int
544 nd_box_intersects(const ND_BOX *a, const ND_BOX *b, int ndims)
545 {
546  int d;
547  for ( d = 0; d < ndims; d++ )
548  {
549  if ( (a->min[d] > b->max[d]) || (a->max[d] < b->min[d]) )
550  return FALSE;
551  }
552  return TRUE;
553 }
554 
558 static int
559 nd_box_contains(const ND_BOX *a, const ND_BOX *b, int ndims)
560 {
561  int d;
562  for ( d = 0; d < ndims; d++ )
563  {
564  if ( ! ((a->min[d] < b->min[d]) && (a->max[d] > b->max[d])) )
565  return FALSE;
566  }
567  return TRUE;
568 }
569 
574 static int
575 nd_box_expand(ND_BOX *nd_box, double expansion_factor)
576 {
577  int d;
578  double size;
579  for ( d = 0; d < ND_DIMS; d++ )
580  {
581  size = nd_box->max[d] - nd_box->min[d];
582  if ( size <= 0 ) continue;
583  nd_box->min[d] -= size * expansion_factor / 2;
584  nd_box->max[d] += size * expansion_factor / 2;
585  }
586  return TRUE;
587 }
588 
593 static inline int
594 nd_box_overlap(const ND_STATS *nd_stats, const ND_BOX *nd_box, ND_IBOX *nd_ibox)
595 {
596  int d;
597 
598  POSTGIS_DEBUGF(4, " nd_box: %s", nd_box_to_json(nd_box, nd_stats->ndims));
599 
600  /* Initialize ibox */
601  memset(nd_ibox, 0, sizeof(ND_IBOX));
602 
603  /* In each dimension... */
604  for ( d = 0; d < nd_stats->ndims; d++ )
605  {
606  double smin = nd_stats->extent.min[d];
607  double smax = nd_stats->extent.max[d];
608  double width = smax - smin;
609  int size = roundf(nd_stats->size[d]);
610 
611  /* ... find cells the box overlaps with in this dimension */
612  nd_ibox->min[d] = floor(size * (nd_box->min[d] - smin) / width);
613  nd_ibox->max[d] = floor(size * (nd_box->max[d] - smin) / width);
614 
615  POSTGIS_DEBUGF(5, " stats: dim %d: min %g: max %g: width %g", d, smin, smax, width);
616  POSTGIS_DEBUGF(5, " overlap: dim %d: (%d, %d)", d, nd_ibox->min[d], nd_ibox->max[d]);
617 
618  /* Push any out-of range values into range */
619  nd_ibox->min[d] = Max(nd_ibox->min[d], 0);
620  nd_ibox->max[d] = Min(nd_ibox->max[d], size-1);
621  }
622  return TRUE;
623 }
624 
628 static inline double
629 nd_box_ratio(const ND_BOX *b1, const ND_BOX *b2, int ndims)
630 {
631  int d;
632  bool covered = TRUE;
633  double ivol = 1.0;
634  double vol2 = 1.0;
635  double vol1 = 1.0;
636 
637  for ( d = 0 ; d < ndims; d++ )
638  {
639  if ( b1->max[d] <= b2->min[d] || b1->min[d] >= b2->max[d] )
640  return 0.0; /* Disjoint */
641 
642  if ( b1->min[d] > b2->min[d] || b1->max[d] < b2->max[d] )
643  covered = FALSE;
644  }
645 
646  if ( covered )
647  return 1.0;
648 
649  for ( d = 0; d < ndims; d++ )
650  {
651  double width1 = b1->max[d] - b1->min[d];
652  double width2 = b2->max[d] - b2->min[d];
653  double imin, imax, iwidth;
654 
655  vol1 *= width1;
656  vol2 *= width2;
657 
658  imin = Max(b1->min[d], b2->min[d]);
659  imax = Min(b1->max[d], b2->max[d]);
660  iwidth = imax - imin;
661  iwidth = Max(0.0, iwidth);
662 
663  ivol *= iwidth;
664  }
665 
666  if ( vol2 == 0.0 )
667  return vol2;
668 
669  return ivol / vol2;
670 }
671 
672 
688 static int
689 nd_box_array_distribution(const ND_BOX **nd_boxes, int num_boxes, const ND_BOX *extent, int ndims, double *distribution)
690 {
691  /* How many bins shall we use in figuring out the distribution? */
692  static int num_bins = 50;
693  int d, i, k, range;
694  int counts[num_bins];
695  double smin, smax; /* Spatial min, spatial max */
696  double swidth; /* Spatial width of dimension */
697 #if POSTGIS_DEBUG_LEVEL >= 3
698  double average, sdev, sdev_ratio;
699 #endif
700  int bmin, bmax; /* Bin min, bin max */
701  const ND_BOX *ndb;
702 
703  /* For each dimension... */
704  for ( d = 0; d < ndims; d++ )
705  {
706  /* Initialize counts for this dimension */
707  memset(counts, 0, sizeof(int)*num_bins);
708 
709  smin = extent->min[d];
710  smax = extent->max[d];
711  swidth = smax - smin;
712 
713  /* Don't try and calculate distribution of overly narrow dimensions */
714  if ( swidth < MIN_DIMENSION_WIDTH )
715  {
716  distribution[d] = 0;
717  continue;
718  }
719 
720  /* Sum up the overlaps of each feature with the dimensional bins */
721  for ( i = 0; i < num_boxes; i++ )
722  {
723  double minoffset, maxoffset;
724 
725  /* Skip null entries */
726  ndb = nd_boxes[i];
727  if ( ! ndb ) continue;
728 
729  /* Where does box fall relative to the working range */
730  minoffset = ndb->min[d] - smin;
731  maxoffset = ndb->max[d] - smin;
732 
733  /* Skip boxes that our outside our working range */
734  if ( minoffset < 0 || minoffset > swidth ||
735  maxoffset < 0 || maxoffset > swidth )
736  {
737  continue;
738  }
739 
740  /* What bins does this range correspond to? */
741  bmin = num_bins * (minoffset) / swidth;
742  bmax = num_bins * (maxoffset) / swidth;
743 
744  POSTGIS_DEBUGF(4, " dimension %d, feature %d: bin %d to bin %d", d, i, bmin, bmax);
745 
746  /* Increment the counts in all the bins this feature overlaps */
747  for ( k = bmin; k <= bmax; k++ )
748  {
749  counts[k] += 1;
750  }
751 
752  }
753 
754  /* How dispersed is the distribution of features across bins? */
755  range = range_quintile(counts, num_bins);
756 
757 #if POSTGIS_DEBUG_LEVEL >= 3
758  average = avg(counts, num_bins);
759  sdev = stddev(counts, num_bins);
760  sdev_ratio = sdev/average;
761 
762  POSTGIS_DEBUGF(3, " dimension %d: range = %d", d, range);
763  POSTGIS_DEBUGF(3, " dimension %d: average = %.6g", d, average);
764  POSTGIS_DEBUGF(3, " dimension %d: stddev = %.6g", d, sdev);
765  POSTGIS_DEBUGF(3, " dimension %d: stddev_ratio = %.6g", d, sdev_ratio);
766 #endif
767 
768  distribution[d] = range;
769  }
770 
771  return TRUE;
772 }
773 
779 static inline int
780 nd_increment(ND_IBOX *ibox, int ndims, int *counter)
781 {
782  int d = 0;
783 
784  while ( d < ndims )
785  {
786  if ( counter[d] < ibox->max[d] )
787  {
788  counter[d] += 1;
789  break;
790  }
791  counter[d] = ibox->min[d];
792  d++;
793  }
794  /* That's it, cannot increment any more! */
795  if ( d == ndims )
796  return FALSE;
797 
798  /* Increment complete! */
799  return TRUE;
800 }
801 
802 
807 static ND_STATS*
808 pg_get_nd_stats(const Oid table_oid, AttrNumber att_num, int mode)
809 {
810  HeapTuple stats_tuple;
811  float4 *floatptr;
812  ND_STATS *nd_stats;
813  int rv, nvalues;
814  int stats_kind = STATISTIC_KIND_ND;
815 
816  /* First pull the stats tuple */
817  stats_tuple = SearchSysCache2(STATRELATT, table_oid, att_num);
818  if ( ! stats_tuple )
819  {
820  POSTGIS_DEBUGF(2, "stats for \"%s\" do not exist", get_rel_name(table_oid)? get_rel_name(table_oid) : "NULL");
821  return NULL;
822  }
823 
824  /* If we're in 2D mode, set the kind appropriately */
825  if ( mode == 2 )
826  stats_kind = STATISTIC_KIND_2D;
827 
828  /* Then read the geom status histogram from that */
829  rv = get_attstatsslot(stats_tuple, 0, 0, stats_kind, InvalidOid, NULL, NULL, NULL, &floatptr, &nvalues);
830  if ( ! rv )
831  {
832  ReleaseSysCache(stats_tuple);
833  POSTGIS_DEBUGF(2, "histogram for \"%s\" does not exist?", get_rel_name(table_oid));
834  return NULL;
835  }
836 
837  /* Clone the stats here so we can release the attstatsslot immediately */
838  nd_stats = palloc(sizeof(float) * nvalues);
839  memcpy(nd_stats, floatptr, sizeof(float) * nvalues);
840 
841  /* Clean up */
842  free_attstatsslot(0, NULL, 0, floatptr, nvalues);
843  ReleaseSysCache(stats_tuple);
844 
845  return nd_stats;
846 }
847 
848 
854 static ND_STATS*
855 pg_get_nd_stats_by_name(const Oid table_oid, const text *att_text, int mode)
856 {
857  const char *att_name = text2cstring(att_text);
858  AttrNumber att_num;
859 
860  /* We know the name? Look up the num */
861  if ( att_text )
862  {
863  /* Get the attribute number */
864  att_num = get_attnum(table_oid, att_name);
865  if ( ! att_num ) {
866  elog(ERROR, "attribute \"%s\" does not exist", att_name);
867  return NULL;
868  }
869  }
870  else
871  {
872  elog(ERROR, "attribute name is null");
873  return NULL;
874  }
875 
876  return pg_get_nd_stats(table_oid, att_num, mode);
877 }
878 
892 static float8
894 {
895  int ncells1, ncells2;
896  int ndims1, ndims2, ndims;
897  double ntuples_max;
898  double ntuples_not_null1, ntuples_not_null2;
899 
900  ND_BOX extent1, extent2;
901  ND_IBOX ibox1, ibox2;
902  int at1[ND_DIMS];
903  int at2[ND_DIMS];
904  double min1[ND_DIMS];
905  double width1[ND_DIMS];
906  double cellsize1[ND_DIMS];
907  int size2[ND_DIMS];
908  double min2[ND_DIMS];
909  double width2[ND_DIMS];
910  double cellsize2[ND_DIMS];
911  int size1[ND_DIMS];
912  int d;
913  double val = 0;
914  float8 selectivity;
915 
916  /* Drop out on null inputs */
917  if ( ! ( s1 && s2 ) )
918  {
919  elog(NOTICE, " estimate_join_selectivity called with null inputs");
920  return FALLBACK_ND_SEL;
921  }
922 
923  /* We need to know how many cells each side has... */
924  ncells1 = (int)roundf(s1->histogram_cells);
925  ncells2 = (int)roundf(s2->histogram_cells);
926 
927  /* ...so that we can drive the summation loop with the smaller histogram. */
928  if ( ncells1 > ncells2 )
929  {
930  const ND_STATS *stats_tmp = s1;
931  s1 = s2;
932  s2 = stats_tmp;
933  }
934 
935  POSTGIS_DEBUGF(3, "s1: %s", nd_stats_to_json(s1));
936  POSTGIS_DEBUGF(3, "s2: %s", nd_stats_to_json(s2));
937 
938  /* Re-read that info after the swap */
939  ncells1 = (int)roundf(s1->histogram_cells);
940  ncells2 = (int)roundf(s2->histogram_cells);
941 
942  /* Q: What's the largest possible join size these relations can create? */
943  /* A: The product of the # of non-null rows in each relation. */
944  ntuples_not_null1 = s1->table_features * (s1->not_null_features / s1->sample_features);
945  ntuples_not_null2 = s2->table_features * (s2->not_null_features / s2->sample_features);
946  ntuples_max = ntuples_not_null1 * ntuples_not_null2;
947 
948  /* Get the ndims as ints */
949  ndims1 = (int)roundf(s1->ndims);
950  ndims2 = (int)roundf(s2->ndims);
951  ndims = Max(ndims1, ndims2);
952 
953  /* Get the extents */
954  extent1 = s1->extent;
955  extent2 = s2->extent;
956 
957  /* If relation stats do not intersect, join is very very selective. */
958  if ( ! nd_box_intersects(&extent1, &extent2, ndims) )
959  {
960  POSTGIS_DEBUG(3, "relation stats do not intersect, returning 0");
961  PG_RETURN_FLOAT8(0.0);
962  }
963 
964  /*
965  * First find the index range of the part of the smaller
966  * histogram that overlaps the larger one.
967  */
968  if ( ! nd_box_overlap(s1, &extent2, &ibox1) )
969  {
970  POSTGIS_DEBUG(3, "could not calculate overlap of relations");
971  PG_RETURN_FLOAT8(FALLBACK_ND_JOINSEL);
972  }
973 
974  /* Initialize counters / constants on s1 */
975  for ( d = 0; d < ndims1; d++ )
976  {
977  at1[d] = ibox1.min[d];
978  min1[d] = s1->extent.min[d];
979  width1[d] = s1->extent.max[d] - s1->extent.min[d];
980  size1[d] = (int)roundf(s1->size[d]);
981  cellsize1[d] = width1[d] / size1[d];
982  }
983 
984  /* Initialize counters / constants on s2 */
985  for ( d = 0; d < ndims2; d++ )
986  {
987  min2[d] = s2->extent.min[d];
988  width2[d] = s2->extent.max[d] - s2->extent.min[d];
989  size2[d] = (int)roundf(s2->size[d]);
990  cellsize2[d] = width2[d] / size2[d];
991  }
992 
993  /* For each affected cell of s1... */
994  do
995  {
996  double val1;
997  /* Construct the bounds of this cell */
998  ND_BOX nd_cell1;
999  nd_box_init(&nd_cell1);
1000  for ( d = 0; d < ndims1; d++ )
1001  {
1002  nd_cell1.min[d] = min1[d] + (at1[d]+0) * cellsize1[d];
1003  nd_cell1.max[d] = min1[d] + (at1[d]+1) * cellsize1[d];
1004  }
1005 
1006  /* Find the cells of s2 that cell1 overlaps.. */
1007  nd_box_overlap(s2, &nd_cell1, &ibox2);
1008 
1009  /* Initialize counter */
1010  for ( d = 0; d < ndims2; d++ )
1011  {
1012  at2[d] = ibox2.min[d];
1013  }
1014 
1015  POSTGIS_DEBUGF(3, "at1 %d,%d %s", at1[0], at1[1], nd_box_to_json(&nd_cell1, ndims1));
1016 
1017  /* Get the value at this cell */
1018  val1 = s1->value[nd_stats_value_index(s1, at1)];
1019 
1020  /* For each overlapped cell of s2... */
1021  do
1022  {
1023  double ratio2;
1024  double val2;
1025 
1026  /* Construct the bounds of this cell */
1027  ND_BOX nd_cell2;
1028  nd_box_init(&nd_cell2);
1029  for ( d = 0; d < ndims2; d++ )
1030  {
1031  nd_cell2.min[d] = min2[d] + (at2[d]+0) * cellsize2[d];
1032  nd_cell2.max[d] = min2[d] + (at2[d]+1) * cellsize2[d];
1033  }
1034 
1035  POSTGIS_DEBUGF(3, " at2 %d,%d %s", at2[0], at2[1], nd_box_to_json(&nd_cell2, ndims2));
1036 
1037  /* Calculate overlap ratio of the cells */
1038  ratio2 = nd_box_ratio(&nd_cell1, &nd_cell2, Max(ndims1, ndims2));
1039 
1040  /* Multiply the cell counts, scaled by overlap ratio */
1041  val2 = s2->value[nd_stats_value_index(s2, at2)];
1042  POSTGIS_DEBUGF(3, " val1 %.6g val2 %.6g ratio %.6g", val1, val2, ratio2);
1043  val += val1 * (val2 * ratio2);
1044  }
1045  while ( nd_increment(&ibox2, ndims2, at2) );
1046 
1047  }
1048  while( nd_increment(&ibox1, ndims1, at1) );
1049 
1050  POSTGIS_DEBUGF(3, "val of histogram = %g", val);
1051 
1052  /*
1053  * In order to compare our total cell count "val" to the
1054  * ntuples_max, we need to scale val up to reflect a full
1055  * table estimate. So, multiply by ratio of table size to
1056  * sample size.
1057  */
1058  val *= (s1->table_features / s1->sample_features);
1059  val *= (s2->table_features / s2->sample_features);
1060 
1061  POSTGIS_DEBUGF(3, "val scaled to full table size = %g", val);
1062 
1063  /*
1064  * Because the cell counts are over-determined due to
1065  * double counting of features that overlap multiple cells
1066  * (see the compute_gserialized_stats routine)
1067  * we also have to scale our cell count "val" *down*
1068  * to adjust for the double counting.
1069  */
1070 // val /= (s1->cells_covered / s1->histogram_features);
1071 // val /= (s2->cells_covered / s2->histogram_features);
1072 
1073  /*
1074  * Finally, the selectivity is the estimated number of
1075  * rows to be returned divided by the maximum possible
1076  * number of rows that can be returned.
1077  */
1078  selectivity = val / ntuples_max;
1079 
1080  /* Guard against over-estimates :) */
1081  if ( selectivity > 1.0 )
1082  selectivity = 1.0;
1083 
1084  return selectivity;
1085 }
1086 
1092 Datum gserialized_gist_joinsel_nd(PG_FUNCTION_ARGS)
1093 {
1094  PG_RETURN_DATUM(DirectFunctionCall5(
1096  PG_GETARG_DATUM(0), PG_GETARG_DATUM(1),
1097  PG_GETARG_DATUM(2), PG_GETARG_DATUM(3),
1098  Int32GetDatum(0) /* ND mode */
1099  ));
1100 }
1101 
1107 Datum gserialized_gist_joinsel_2d(PG_FUNCTION_ARGS)
1108 {
1109  PG_RETURN_DATUM(DirectFunctionCall5(
1111  PG_GETARG_DATUM(0), PG_GETARG_DATUM(1),
1112  PG_GETARG_DATUM(2), PG_GETARG_DATUM(3),
1113  Int32GetDatum(2) /* 2D mode */
1114  ));
1115 }
1116 
1126 Datum gserialized_gist_joinsel(PG_FUNCTION_ARGS)
1127 {
1128  PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
1129  /* Oid operator = PG_GETARG_OID(1); */
1130  List *args = (List *) PG_GETARG_POINTER(2);
1131  JoinType jointype = (JoinType) PG_GETARG_INT16(3);
1132  int mode = PG_GETARG_INT32(4);
1133 
1134  Node *arg1, *arg2;
1135  Var *var1, *var2;
1136  Oid relid1, relid2;
1137 
1138  ND_STATS *stats1, *stats2;
1139  float8 selectivity;
1140 
1141  /* Only respond to an inner join/unknown context join */
1142  if (jointype != JOIN_INNER)
1143  {
1144  elog(NOTICE, "gserialized_gist_joinsel: jointype %d not supported", jointype);
1145  PG_RETURN_FLOAT8(DEFAULT_ND_JOINSEL);
1146  }
1147 
1148  /* Find Oids of the geometry columns we are working with */
1149  arg1 = (Node*) linitial(args);
1150  arg2 = (Node*) lsecond(args);
1151  var1 = (Var*) arg1;
1152  var2 = (Var*) arg2;
1153 
1154  /* We only do column joins right now, no functional joins */
1155  /* TODO: handle g1 && ST_Expand(g2) */
1156  if (!IsA(arg1, Var) || !IsA(arg2, Var))
1157  {
1158  elog(DEBUG1, "gserialized_gist_joinsel called with arguments that are not column references");
1159  PG_RETURN_FLOAT8(DEFAULT_ND_JOINSEL);
1160  }
1161 
1162  /* What are the Oids of our tables/relations? */
1163  relid1 = getrelid(var1->varno, root->parse->rtable);
1164  relid2 = getrelid(var2->varno, root->parse->rtable);
1165 
1166  POSTGIS_DEBUGF(3, "using relations \"%s\" Oid(%d), \"%s\" Oid(%d)",
1167  get_rel_name(relid1) ? get_rel_name(relid1) : "NULL", relid1, get_rel_name(relid2) ? get_rel_name(relid2) : "NULL", relid2);
1168 
1169  /* Pull the stats from the stats system. */
1170  stats1 = pg_get_nd_stats(relid1, var1->varattno, mode);
1171  stats2 = pg_get_nd_stats(relid2, var2->varattno, mode);
1172 
1173  /* If we can't get stats, we have to stop here! */
1174  if ( ! stats1 )
1175  {
1176  POSTGIS_DEBUGF(3, "unable to retrieve stats for \"%s\" Oid(%d)", get_rel_name(relid1) ? get_rel_name(relid1) : "NULL" , relid1);
1177  PG_RETURN_FLOAT8(DEFAULT_ND_JOINSEL);
1178  }
1179  else if ( ! stats2 )
1180  {
1181  POSTGIS_DEBUGF(3, "unable to retrieve stats for \"%s\" Oid(%d)", get_rel_name(relid2) ? get_rel_name(relid2) : "NULL", relid2);
1182  PG_RETURN_FLOAT8(DEFAULT_ND_JOINSEL);
1183  }
1184 
1185  selectivity = estimate_join_selectivity(stats1, stats2);
1186  POSTGIS_DEBUGF(2, "got selectivity %g", selectivity);
1187 
1188  pfree(stats1);
1189  pfree(stats2);
1190  PG_RETURN_FLOAT8(selectivity);
1191 }
1192 
1193 
1194 
1195 
1214 static void
1215 compute_gserialized_stats_mode(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
1216  int sample_rows, double total_rows, int mode)
1217 {
1218  MemoryContext old_context;
1219  int d, i; /* Counters */
1220  int notnull_cnt = 0; /* # not null rows in the sample */
1221  int null_cnt = 0; /* # null rows in the sample */
1222  int histogram_features = 0; /* # rows that actually got counted in the histogram */
1223 
1224  ND_STATS *nd_stats; /* Our histogram */
1225  size_t nd_stats_size; /* Size to allocate */
1226 
1227  double total_width = 0; /* # of bytes used by sample */
1228  double total_sample_volume = 0; /* Area/volume coverage of the sample */
1229  double total_cell_count = 0; /* # of cells in histogram affected by sample */
1230 
1231  ND_BOX sum; /* Sum of extents of sample boxes */
1232  ND_BOX avg; /* Avg of extents of sample boxes */
1233  ND_BOX stddev; /* StdDev of extents of sample boxes */
1234 
1235  const ND_BOX **sample_boxes; /* ND_BOXes for each of the sample features */
1236  ND_BOX sample_extent; /* Extent of the raw sample */
1237  int histo_size[ND_DIMS]; /* histogram nrows, ncols, etc */
1238  ND_BOX histo_extent; /* Spatial extent of the histogram */
1239  ND_BOX histo_extent_new; /* Temporary variable */
1240  int histo_cells_target; /* Number of cells we will shoot for, given the stats target */
1241  int histo_cells; /* Number of cells in the histogram */
1242  int histo_cells_new = 1; /* Temporary variable */
1243 
1244  int ndims = 2; /* Dimensionality of the sample */
1245  int histo_ndims = 0; /* Dimensionality of the histogram */
1246  double sample_distribution[ND_DIMS]; /* How homogeneous is distribution of sample in each axis? */
1247  double total_distribution; /* Total of sample_distribution */
1248 
1249  int stats_slot; /* What slot is this data going into? (2D vs ND) */
1250  int stats_kind; /* And this is what? (2D vs ND) */
1251 
1252  /* Initialize sum and stddev */
1253  nd_box_init(&sum);
1254  nd_box_init(&stddev);
1255 
1256  /*
1257  * This is where gserialized_analyze_nd
1258  * should put its' custom parameters.
1259  */
1260  /* void *mystats = stats->extra_data; */
1261 
1262  POSTGIS_DEBUG(2, "compute_gserialized_stats called");
1263  POSTGIS_DEBUGF(3, " # sample_rows: %d", sample_rows);
1264  POSTGIS_DEBUGF(3, " estimate of total_rows: %.6g", total_rows);
1265 
1266  /*
1267  * We might need less space, but don't think
1268  * its worth saving...
1269  */
1270  sample_boxes = palloc(sizeof(ND_BOX*) * sample_rows);
1271 
1272  /*
1273  * First scan:
1274  * o read boxes
1275  * o find dimensionality of the sample
1276  * o find extent of the sample
1277  * o count null-infinite/not-null values
1278  * o compute total_width
1279  * o compute total features's box area (for avgFeatureArea)
1280  * o sum features box coordinates (for standard deviation)
1281  */
1282  for ( i = 0; i < sample_rows; i++ )
1283  {
1284  Datum datum;
1285  GSERIALIZED *geom;
1286  GBOX gbox;
1287  ND_BOX *nd_box;
1288  bool is_null;
1289 
1290  datum = fetchfunc(stats, i, &is_null);
1291 
1292  /* Skip all NULLs. */
1293  if ( is_null )
1294  {
1295  POSTGIS_DEBUGF(4, " skipped null geometry %d", i);
1296  null_cnt++;
1297  continue;
1298  }
1299 
1300  /* Read the bounds from the gserialized. */
1301  geom = (GSERIALIZED *)PG_DETOAST_DATUM(datum);
1302  if ( LW_FAILURE == gserialized_get_gbox_p(geom, &gbox) )
1303  {
1304  /* Skip empties too. */
1305  POSTGIS_DEBUGF(3, " skipped empty geometry %d", i);
1306  continue;
1307  }
1308 
1309  /* If we're in 2D mode, zero out the higher dimensions for "safety" */
1310  if ( mode == 2 )
1311  gbox.zmin = gbox.zmax = gbox.mmin = gbox.mmax = 0.0;
1312 
1313  /* Check bounds for validity (finite and not NaN) */
1314  if ( ! gbox_is_valid(&gbox) )
1315  {
1316  POSTGIS_DEBUGF(3, " skipped infinite/nan geometry %d", i);
1317  continue;
1318  }
1319 
1320  /*
1321  * In N-D mode, set the ndims to the maximum dimensionality found
1322  * in the sample. Otherwise, leave at ndims == 2.
1323  */
1324  if ( mode != 2 )
1325  ndims = Max(gbox_ndims(&gbox), ndims);
1326 
1327  /* Convert gbox to n-d box */
1328  nd_box = palloc(sizeof(ND_BOX));
1329  nd_box_from_gbox(&gbox, nd_box);
1330 
1331  /* Cache n-d bounding box */
1332  sample_boxes[notnull_cnt] = nd_box;
1333 
1334  /* Initialize sample extent before merging first entry */
1335  if ( ! notnull_cnt )
1336  nd_box_init_bounds(&sample_extent);
1337 
1338  /* Add current sample to overall sample extent */
1339  nd_box_merge(nd_box, &sample_extent);
1340 
1341  /* How many bytes does this sample use? */
1342  total_width += VARSIZE(geom);
1343 
1344  /* Add bounds coordinates to sums for stddev calculation */
1345  for ( d = 0; d < ndims; d++ )
1346  {
1347  sum.min[d] += nd_box->min[d];
1348  sum.max[d] += nd_box->max[d];
1349  }
1350 
1351  /* Increment our "good feature" count */
1352  notnull_cnt++;
1353 
1354  /* Give backend a chance of interrupting us */
1355  vacuum_delay_point();
1356  }
1357 
1358  /*
1359  * We'll build a histogram having stats->attr->attstattarget cells
1360  * on each side, within reason... we'll use ndims*10000 as the
1361  * maximum number of cells.
1362  * Also, if we're sampling a relatively small table, we'll try to ensure that
1363  * we have an average of 5 features for each cell so the histogram isn't
1364  * so sparse.
1365  */
1366  histo_cells_target = (int)pow((double)(stats->attr->attstattarget), (double)ndims);
1367  histo_cells_target = Min(histo_cells_target, ndims * 10000);
1368  histo_cells_target = Min(histo_cells_target, (int)(total_rows/5));
1369  POSTGIS_DEBUGF(3, " stats->attr->attstattarget: %d", stats->attr->attstattarget);
1370  POSTGIS_DEBUGF(3, " target # of histogram cells: %d", histo_cells_target);
1371 
1372  /* If there's no useful features, we can't work out stats */
1373  if ( ! notnull_cnt )
1374  {
1375  elog(NOTICE, "no non-null/empty features, unable to compute statistics");
1376  stats->stats_valid = false;
1377  return;
1378  }
1379 
1380  POSTGIS_DEBUGF(3, " sample_extent: %s", nd_box_to_json(&sample_extent, ndims));
1381 
1382  /*
1383  * Second scan:
1384  * o compute standard deviation
1385  */
1386  for ( d = 0; d < ndims; d++ )
1387  {
1388  /* Calculate average bounds values */
1389  avg.min[d] = sum.min[d] / notnull_cnt;
1390  avg.max[d] = sum.max[d] / notnull_cnt;
1391 
1392  /* Calculate standard deviation for this dimension bounds */
1393  for ( i = 0; i < notnull_cnt; i++ )
1394  {
1395  const ND_BOX *ndb = sample_boxes[i];
1396  stddev.min[d] += (ndb->min[d] - avg.min[d]) * (ndb->min[d] - avg.min[d]);
1397  stddev.max[d] += (ndb->max[d] - avg.max[d]) * (ndb->max[d] - avg.max[d]);
1398  }
1399  stddev.min[d] = sqrt(stddev.min[d] / notnull_cnt);
1400  stddev.max[d] = sqrt(stddev.max[d] / notnull_cnt);
1401 
1402  /* Histogram bounds for this dimension bounds is avg +/- SDFACTOR * stdev */
1403  histo_extent.min[d] = Max(avg.min[d] - SDFACTOR * stddev.min[d], sample_extent.min[d]);
1404  histo_extent.max[d] = Min(avg.max[d] + SDFACTOR * stddev.max[d], sample_extent.max[d]);
1405  }
1406 
1407  /*
1408  * Third scan:
1409  * o skip hard deviants
1410  * o compute new histogram box
1411  */
1412  nd_box_init_bounds(&histo_extent_new);
1413  for ( i = 0; i < notnull_cnt; i++ )
1414  {
1415  const ND_BOX *ndb = sample_boxes[i];
1416  /* Skip any hard deviants (boxes entirely outside our histo_extent */
1417  if ( ! nd_box_intersects(&histo_extent, ndb, ndims) )
1418  {
1419  POSTGIS_DEBUGF(4, " feature %d is a hard deviant, skipped", i);
1420  sample_boxes[i] = NULL;
1421  continue;
1422  }
1423  /* Expand our new box to fit all the other features. */
1424  nd_box_merge(ndb, &histo_extent_new);
1425  }
1426  /*
1427  * Expand the box slightly (1%) to avoid edge effects
1428  * with objects that are on the boundary
1429  */
1430  nd_box_expand(&histo_extent_new, 0.01);
1431  histo_extent = histo_extent_new;
1432 
1433  /*
1434  * How should we allocate our histogram cells to the
1435  * different dimensions? We can't do it by raw dimensional width,
1436  * because in x/y/z space, the z can have different units
1437  * from the x/y. Similarly for x/y/t space.
1438  * So, we instead calculate how much features overlap
1439  * each other in their dimension to figure out which
1440  * dimensions have useful selectivity characteristics (more
1441  * variability in density) and therefor would find
1442  * more cells useful (to distinguish between dense places and
1443  * homogeneous places).
1444  */
1445  nd_box_array_distribution(sample_boxes, notnull_cnt, &histo_extent, ndims,
1446  sample_distribution);
1447 
1448  /*
1449  * The sample_distribution array now tells us how spread out the
1450  * data is in each dimension, so we use that data to allocate
1451  * the histogram cells we have available.
1452  * At this point, histo_cells_target is the approximate target number
1453  * of cells.
1454  */
1455 
1456  /*
1457  * Some dimensions have basically a uniform distribution, we want
1458  * to allocate no cells to those dimensions, only to dimensions
1459  * that have some interesting differences in data distribution.
1460  * Here we count up the number of interesting dimensions
1461  */
1462  for ( d = 0; d < ndims; d++ )
1463  {
1464  if ( sample_distribution[d] > 0 )
1465  histo_ndims++;
1466  }
1467 
1468  if ( histo_ndims == 0 )
1469  {
1470  /* Special case: all our dimensions had low variability! */
1471  /* We just divide the cells up evenly */
1472  POSTGIS_DEBUG(3, " special case: no axes have variability");
1473  histo_cells_new = 1;
1474  for ( d = 0; d < ndims; d++ )
1475  {
1476  histo_size[d] = 1 + (int)pow((double)histo_cells_target, 1/(double)ndims);
1477  POSTGIS_DEBUGF(3, " histo_size[d]: %d", histo_size[d]);
1478  histo_cells_new *= histo_size[d];
1479  }
1480  POSTGIS_DEBUGF(3, " histo_cells_new: %d", histo_cells_new);
1481  }
1482  else
1483  {
1484  /*
1485  * We're going to express the amount of variability in each dimension
1486  * as a proportion of the total variability and allocate cells in that
1487  * dimension relative to that proportion.
1488  */
1489  POSTGIS_DEBUG(3, " allocating histogram axes based on axis variability");
1490  total_distribution = total_double(sample_distribution, ndims); /* First get the total */
1491  POSTGIS_DEBUGF(3, " total_distribution: %.8g", total_distribution);
1492  histo_cells_new = 1; /* For the number of cells in the final histogram */
1493  for ( d = 0; d < ndims; d++ )
1494  {
1495  if ( sample_distribution[d] == 0 ) /* Uninteresting dimensions don't get any room */
1496  {
1497  histo_size[d] = 1;
1498  }
1499  else /* Interesting dimension */
1500  {
1501  /* How does this dims variability compare to the total? */
1502  float edge_ratio = (float)sample_distribution[d] / (float)total_distribution;
1503  /*
1504  * Scale the target cells number by the # of dims and ratio,
1505  * then take the appropriate root to get the estimated number of cells
1506  * on this axis (eg, pow(0.5) for 2d, pow(0.333) for 3d, pow(0.25) for 4d)
1507  */
1508  histo_size[d] = (int)pow(histo_cells_target * histo_ndims * edge_ratio, 1/(double)histo_ndims);
1509  /* If something goes awry, just give this dim one slot */
1510  if ( ! histo_size[d] )
1511  histo_size[d] = 1;
1512  }
1513  histo_cells_new *= histo_size[d];
1514  }
1515  POSTGIS_DEBUGF(3, " histo_cells_new: %d", histo_cells_new);
1516  }
1517 
1518  /* Update histo_cells to the actual number of cells we need to allocate */
1519  histo_cells = histo_cells_new;
1520  POSTGIS_DEBUGF(3, " histo_cells: %d", histo_cells);
1521 
1522  /*
1523  * Create the histogram (ND_STATS) in the stats memory context
1524  */
1525  old_context = MemoryContextSwitchTo(stats->anl_context);
1526  nd_stats_size = sizeof(ND_STATS) + ((histo_cells - 1) * sizeof(float4));
1527  nd_stats = palloc(nd_stats_size);
1528  memset(nd_stats, 0, nd_stats_size); /* Initialize all values to 0 */
1529  MemoryContextSwitchTo(old_context);
1530 
1531  /* Initialize the #ND_STATS objects */
1532  nd_stats->ndims = ndims;
1533  nd_stats->extent = histo_extent;
1534  nd_stats->sample_features = sample_rows;
1535  nd_stats->table_features = total_rows;
1536  nd_stats->not_null_features = notnull_cnt;
1537  /* Copy in the histogram dimensions */
1538  for ( d = 0; d < ndims; d++ )
1539  nd_stats->size[d] = histo_size[d];
1540 
1541  /*
1542  * Fourth scan:
1543  * o fill histogram values with the proportion of
1544  * features' bbox overlaps: a feature's bvol
1545  * can fully overlap (1) or partially overlap
1546  * (fraction of 1) an histogram cell.
1547  *
1548  * Note that we are filling each cell with the "portion of
1549  * the feature's box that overlaps the cell". So, if we sum
1550  * up the values in the histogram, we could get the
1551  * histogram feature count.
1552  *
1553  */
1554  for ( i = 0; i < notnull_cnt; i++ )
1555  {
1556  const ND_BOX *nd_box;
1557  ND_IBOX nd_ibox;
1558  int at[ND_DIMS];
1559  int d;
1560  double num_cells = 0;
1561  double tmp_volume = 1.0;
1562  double min[ND_DIMS];
1563  double max[ND_DIMS];
1564  double cellsize[ND_DIMS];
1565 
1566  nd_box = sample_boxes[i];
1567  if ( ! nd_box ) continue; /* Skip Null'ed out hard deviants */
1568 
1569  /* Give backend a chance of interrupting us */
1570  vacuum_delay_point();
1571 
1572  /* Find the cells that overlap with this box and put them into the ND_IBOX */
1573  nd_box_overlap(nd_stats, nd_box, &nd_ibox);
1574  memset(at, 0, sizeof(int)*ND_DIMS);
1575 
1576  POSTGIS_DEBUGF(3, " feature %d: ibox (%d, %d, %d, %d) (%d, %d, %d, %d)", i,
1577  nd_ibox.min[0], nd_ibox.min[1], nd_ibox.min[2], nd_ibox.min[3],
1578  nd_ibox.max[0], nd_ibox.max[1], nd_ibox.max[2], nd_ibox.max[3]);
1579 
1580  for ( d = 0; d < nd_stats->ndims; d++ )
1581  {
1582  /* Initialize the starting values */
1583  at[d] = nd_ibox.min[d];
1584  min[d] = nd_stats->extent.min[d];
1585  max[d] = nd_stats->extent.max[d];
1586  cellsize[d] = (max[d] - min[d])/(nd_stats->size[d]);
1587 
1588  /* What's the volume (area) of this feature's box? */
1589  tmp_volume *= (nd_box->max[d] - nd_box->min[d]);
1590  }
1591 
1592  /* Add feature volume (area) to our total */
1593  total_sample_volume += tmp_volume;
1594 
1595  /*
1596  * Move through all the overlaped histogram cells values and
1597  * add the box overlap proportion to them.
1598  */
1599  do
1600  {
1601  ND_BOX nd_cell;
1602  double ratio;
1603  /* Create a box for this histogram cell */
1604  for ( d = 0; d < nd_stats->ndims; d++ )
1605  {
1606  nd_cell.min[d] = min[d] + (at[d]+0) * cellsize[d];
1607  nd_cell.max[d] = min[d] + (at[d]+1) * cellsize[d];
1608  }
1609 
1610  /*
1611  * If a feature box is completely inside one cell the ratio will be
1612  * 1.0. If a feature box is 50% in two cells, each cell will get
1613  * 0.5 added on.
1614  */
1615  ratio = nd_box_ratio(&nd_cell, nd_box, nd_stats->ndims);
1616  nd_stats->value[nd_stats_value_index(nd_stats, at)] += ratio;
1617  num_cells += ratio;
1618  POSTGIS_DEBUGF(3, " ratio (%.8g) num_cells (%.8g)", ratio, num_cells);
1619  POSTGIS_DEBUGF(3, " at (%d, %d, %d, %d)", at[0], at[1], at[2], at[3]);
1620  }
1621  while ( nd_increment(&nd_ibox, nd_stats->ndims, at) );
1622 
1623  /* Keep track of overall number of overlaps counted */
1624  total_cell_count += num_cells;
1625  /* How many features have we added to this histogram? */
1626  histogram_features++;
1627  }
1628 
1629  POSTGIS_DEBUGF(3, " histogram_features: %d", histogram_features);
1630  POSTGIS_DEBUGF(3, " sample_rows: %d", sample_rows);
1631  POSTGIS_DEBUGF(3, " table_rows: %.6g", total_rows);
1632 
1633  /* Error out if we got no sample information */
1634  if ( ! histogram_features )
1635  {
1636  POSTGIS_DEBUG(3, " no stats have been gathered");
1637  elog(NOTICE, " no features lie in the stats histogram, invalid stats");
1638  stats->stats_valid = false;
1639  return;
1640  }
1641 
1642  nd_stats->histogram_features = histogram_features;
1643  nd_stats->histogram_cells = histo_cells;
1644  nd_stats->cells_covered = total_cell_count;
1645 
1646  /* Put this histogram data into the right slot/kind */
1647  if ( mode == 2 )
1648  {
1649  stats_slot = STATISTIC_SLOT_2D;
1650  stats_kind = STATISTIC_KIND_2D;
1651  }
1652  else
1653  {
1654  stats_slot = STATISTIC_SLOT_ND;
1655  stats_kind = STATISTIC_KIND_ND;
1656  }
1657 
1658  /* Write the statistics data */
1659  stats->stakind[stats_slot] = stats_kind;
1660  stats->staop[stats_slot] = InvalidOid;
1661  stats->stanumbers[stats_slot] = (float4*)nd_stats;
1662  stats->numnumbers[stats_slot] = nd_stats_size/sizeof(float4);
1663  stats->stanullfrac = (float4)null_cnt/sample_rows;
1664  stats->stawidth = total_width/notnull_cnt;
1665  stats->stadistinct = -1.0;
1666  stats->stats_valid = true;
1667 
1668  POSTGIS_DEBUGF(3, " out: slot 0: kind %d (STATISTIC_KIND_ND)", stats->stakind[0]);
1669  POSTGIS_DEBUGF(3, " out: slot 0: op %d (InvalidOid)", stats->staop[0]);
1670  POSTGIS_DEBUGF(3, " out: slot 0: numnumbers %d", stats->numnumbers[0]);
1671  POSTGIS_DEBUGF(3, " out: null fraction: %f=%d/%d", stats->stanullfrac, null_cnt, sample_rows);
1672  POSTGIS_DEBUGF(3, " out: average width: %d bytes", stats->stawidth);
1673  POSTGIS_DEBUG (3, " out: distinct values: all (no check done)");
1674  POSTGIS_DEBUGF(3, " out: %s", nd_stats_to_json(nd_stats));
1675  /*
1676  POSTGIS_DEBUGF(3, " out histogram:\n%s", nd_stats_to_grid(nd_stats));
1677  */
1678 
1679  return;
1680 }
1681 
1682 
1700 static void
1701 compute_gserialized_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
1702  int sample_rows, double total_rows)
1703 {
1704  /* 2D Mode */
1705  compute_gserialized_stats_mode(stats, fetchfunc, sample_rows, total_rows, 2);
1706  /* ND Mode */
1707  compute_gserialized_stats_mode(stats, fetchfunc, sample_rows, total_rows, 0);
1708 }
1709 
1710 
1739 Datum gserialized_analyze_nd(PG_FUNCTION_ARGS)
1740 {
1741  VacAttrStats *stats = (VacAttrStats *)PG_GETARG_POINTER(0);
1742  Form_pg_attribute attr = stats->attr;
1743 
1744  POSTGIS_DEBUG(2, "gserialized_analyze_nd called");
1745 
1746  /* If the attstattarget column is negative, use the default value */
1747  /* NB: it is okay to scribble on stats->attr since it's a copy */
1748  if (attr->attstattarget < 0)
1749  attr->attstattarget = default_statistics_target;
1750 
1751  POSTGIS_DEBUGF(3, " attribute stat target: %d", attr->attstattarget);
1752 
1753  /* Setup the minimum rows and the algorithm function */
1754  stats->minrows = 300 * stats->attr->attstattarget;
1755  stats->compute_stats = compute_gserialized_stats;
1756 
1757  POSTGIS_DEBUGF(3, " minrows: %d", stats->minrows);
1758 
1759  /* Indicate we are done successfully */
1760  PG_RETURN_BOOL(true);
1761 }
1762 
1775 static float8
1776 estimate_selectivity(const GBOX *box, const ND_STATS *nd_stats, int mode)
1777 {
1778  int d; /* counter */
1779  float8 selectivity;
1780  ND_BOX nd_box;
1781  ND_IBOX nd_ibox;
1782  int at[ND_DIMS];
1783  double cell_size[ND_DIMS];
1784  double min[ND_DIMS];
1785  double max[ND_DIMS];
1786  double total_count = 0.0;
1787  int ndims_max = Max(nd_stats->ndims, gbox_ndims(box));
1788 // int ndims_min = Min(nd_stats->ndims, gbox_ndims(box));
1789 
1790  /* Calculate the overlap of the box on the histogram */
1791  if ( ! nd_stats )
1792  {
1793  elog(NOTICE, " estimate_selectivity called with null input");
1794  return FALLBACK_ND_SEL;
1795  }
1796 
1797  /* Initialize nd_box. */
1798  nd_box_from_gbox(box, &nd_box);
1799 
1800  /*
1801  * To return 2D stats on an ND sample, we need to make the
1802  * 2D box cover the full range of the other dimensions in the
1803  * histogram.
1804  */
1805  POSTGIS_DEBUGF(3, " mode: %d", mode);
1806  if ( mode == 2 )
1807  {
1808  POSTGIS_DEBUG(3, " in 2d mode, stripping the computation down to 2d");
1809  ndims_max = 2;
1810  }
1811 
1812  POSTGIS_DEBUGF(3, " nd_stats->extent: %s", nd_box_to_json(&(nd_stats->extent), nd_stats->ndims));
1813  POSTGIS_DEBUGF(3, " nd_box: %s", nd_box_to_json(&(nd_box), gbox_ndims(box)));
1814 
1815  /*
1816  * Search box completely misses histogram extent?
1817  * We have to intersect in all N dimensions or else we have
1818  * zero interaction under the &&& operator. It's important
1819  * to short circuit in this case, as some of the tests below
1820  * will return junk results when run on non-intersecting inputs.
1821  */
1822  if ( ! nd_box_intersects(&nd_box, &(nd_stats->extent), ndims_max) )
1823  {
1824  POSTGIS_DEBUG(3, " search box does not overlap histogram, returning 0");
1825  return 0.0;
1826  }
1827 
1828  /* Search box completely contains histogram extent! */
1829  if ( nd_box_contains(&nd_box, &(nd_stats->extent), ndims_max) )
1830  {
1831  POSTGIS_DEBUG(3, " search box contains histogram, returning 1");
1832  return 1.0;
1833  }
1834 
1835  /* Calculate the overlap of the box on the histogram */
1836  if ( ! nd_box_overlap(nd_stats, &nd_box, &nd_ibox) )
1837  {
1838  POSTGIS_DEBUG(3, " search box overlap with stats histogram failed");
1839  return FALLBACK_ND_SEL;
1840  }
1841 
1842  /* Work out some measurements of the histogram */
1843  for ( d = 0; d < nd_stats->ndims; d++ )
1844  {
1845  /* Cell size in each dim */
1846  min[d] = nd_stats->extent.min[d];
1847  max[d] = nd_stats->extent.max[d];
1848  cell_size[d] = (max[d] - min[d]) / nd_stats->size[d];
1849  POSTGIS_DEBUGF(3, " cell_size[%d] : %.9g", d, cell_size[d]);
1850 
1851  /* Initialize the counter */
1852  at[d] = nd_ibox.min[d];
1853  }
1854 
1855  /* Move through all the overlap values and sum them */
1856  do
1857  {
1858  float cell_count, ratio;
1859  ND_BOX nd_cell;
1860 
1861  /* We have to pro-rate partially overlapped cells. */
1862  for ( d = 0; d < nd_stats->ndims; d++ )
1863  {
1864  nd_cell.min[d] = min[d] + (at[d]+0) * cell_size[d];
1865  nd_cell.max[d] = min[d] + (at[d]+1) * cell_size[d];
1866  }
1867 
1868  ratio = nd_box_ratio(&nd_box, &nd_cell, nd_stats->ndims);
1869  cell_count = nd_stats->value[nd_stats_value_index(nd_stats, at)];
1870 
1871  /* Add the pro-rated count for this cell to the overall total */
1872  total_count += cell_count * ratio;
1873  POSTGIS_DEBUGF(4, " cell (%d,%d), cell value %.6f, ratio %.6f", at[0], at[1], cell_count, ratio);
1874  }
1875  while ( nd_increment(&nd_ibox, nd_stats->ndims, at) );
1876 
1877  /* Scale by the number of features in our histogram to get the proportion */
1878  selectivity = total_count / nd_stats->histogram_features;
1879 
1880  POSTGIS_DEBUGF(3, " nd_stats->histogram_features = %f", nd_stats->histogram_features);
1881  POSTGIS_DEBUGF(3, " nd_stats->histogram_cells = %f", nd_stats->histogram_cells);
1882  POSTGIS_DEBUGF(3, " sum(overlapped histogram cells) = %f", total_count);
1883  POSTGIS_DEBUGF(3, " selectivity = %f", selectivity);
1884 
1885  /* Prevent rounding overflows */
1886  if (selectivity > 1.0) selectivity = 1.0;
1887  else if (selectivity < 0.0) selectivity = 0.0;
1888 
1889  return selectivity;
1890 }
1891 
1892 
1893 
1899 Datum _postgis_gserialized_stats(PG_FUNCTION_ARGS)
1900 {
1901  Oid table_oid = PG_GETARG_OID(0);
1902  text *att_text = PG_GETARG_TEXT_P(1);
1903  ND_STATS *nd_stats;
1904  char *str;
1905  text *json;
1906  int mode = 2; /* default to 2D mode */
1907 
1908  /* Check if we've been asked to not use 2d mode */
1909  if ( ! PG_ARGISNULL(2) )
1910  mode = text_p_get_mode(PG_GETARG_TEXT_P(2));
1911 
1912  /* Retrieve the stats object */
1913  nd_stats = pg_get_nd_stats_by_name(table_oid, att_text, mode);
1914  if ( ! nd_stats )
1915  elog(ERROR, "stats for \"%s.%s\" do not exist", get_rel_name(table_oid), text2cstring(att_text));
1916 
1917  /* Convert to JSON */
1918  str = nd_stats_to_json(nd_stats);
1919  json = cstring2text(str);
1920  pfree(str);
1921  pfree(nd_stats);
1922  PG_RETURN_TEXT_P(json);
1923 }
1924 
1925 
1931 Datum _postgis_gserialized_sel(PG_FUNCTION_ARGS)
1932 {
1933  Oid table_oid = PG_GETARG_OID(0);
1934  text *att_text = PG_GETARG_TEXT_P(1);
1935  Datum geom_datum = PG_GETARG_DATUM(2);
1936  GBOX gbox; /* search box read from gserialized datum */
1937  float8 selectivity = 0;
1938  ND_STATS *nd_stats;
1939  int mode = 2; /* 2D mode by default */
1940 
1941  /* Check if we've been asked to not use 2d mode */
1942  if ( ! PG_ARGISNULL(3) )
1943  mode = text_p_get_mode(PG_GETARG_TEXT_P(3));
1944 
1945  /* Retrieve the stats object */
1946  nd_stats = pg_get_nd_stats_by_name(table_oid, att_text, mode);
1947 
1948  if ( ! nd_stats )
1949  elog(ERROR, "stats for \"%s.%s\" do not exist", get_rel_name(table_oid), text2cstring(att_text));
1950 
1951  /* Calculate the gbox */
1952  if ( ! gserialized_datum_get_gbox_p(geom_datum, &gbox) )
1953  elog(ERROR, "unable to calculate bounding box from geometry");
1954 
1955  POSTGIS_DEBUGF(3, " %s", gbox_to_string(&gbox));
1956 
1957  /* Do the estimation */
1958  selectivity = estimate_selectivity(&gbox, nd_stats, mode);
1959 
1960  pfree(nd_stats);
1961  PG_RETURN_FLOAT8(selectivity);
1962 }
1963 
1964 
1970 Datum _postgis_gserialized_joinsel(PG_FUNCTION_ARGS)
1971 {
1972  Oid table_oid1 = PG_GETARG_OID(0);
1973  text *att_text1 = PG_GETARG_TEXT_P(1);
1974  Oid table_oid2 = PG_GETARG_OID(2);
1975  text *att_text2 = PG_GETARG_TEXT_P(3);
1976  ND_STATS *nd_stats1, *nd_stats2;
1977  float8 selectivity = 0;
1978  int mode = 2; /* 2D mode by default */
1979 
1980 
1981  /* Retrieve the stats object */
1982  nd_stats1 = pg_get_nd_stats_by_name(table_oid1, att_text1, mode);
1983  nd_stats2 = pg_get_nd_stats_by_name(table_oid2, att_text2, mode);
1984 
1985  if ( ! nd_stats1 )
1986  elog(ERROR, "stats for \"%s.%s\" do not exist", get_rel_name(table_oid1), text2cstring(att_text1));
1987 
1988  if ( ! nd_stats2 )
1989  elog(ERROR, "stats for \"%s.%s\" do not exist", get_rel_name(table_oid2), text2cstring(att_text2));
1990 
1991  /* Check if we've been asked to not use 2d mode */
1992  if ( ! PG_ARGISNULL(4) )
1993  {
1994  text *modetxt = PG_GETARG_TEXT_P(4);
1995  char *modestr = text2cstring(modetxt);
1996  if ( modestr[0] == 'N' )
1997  mode = 0;
1998  }
1999 
2000  /* Do the estimation */
2001  selectivity = estimate_join_selectivity(nd_stats1, nd_stats2);
2002 
2003  pfree(nd_stats1);
2004  pfree(nd_stats2);
2005  PG_RETURN_FLOAT8(selectivity);
2006 }
2007 
2013 Datum gserialized_gist_sel_2d(PG_FUNCTION_ARGS)
2014 {
2015  PG_RETURN_DATUM(DirectFunctionCall5(
2017  PG_GETARG_DATUM(0), PG_GETARG_DATUM(1),
2018  PG_GETARG_DATUM(2), PG_GETARG_DATUM(3),
2019  Int32GetDatum(2) /* 2-D mode */
2020  ));
2021 }
2022 
2028 Datum gserialized_gist_sel_nd(PG_FUNCTION_ARGS)
2029 {
2030  PG_RETURN_DATUM(DirectFunctionCall5(
2032  PG_GETARG_DATUM(0), PG_GETARG_DATUM(1),
2033  PG_GETARG_DATUM(2), PG_GETARG_DATUM(3),
2034  Int32GetDatum(0) /* N-D mode */
2035  ));
2036 }
2037 
2052 Datum gserialized_gist_sel(PG_FUNCTION_ARGS)
2053 {
2054  PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
2055  /* Oid operator_oid = PG_GETARG_OID(1); */
2056  List *args = (List *) PG_GETARG_POINTER(2);
2057  /* int varRelid = PG_GETARG_INT32(3); */
2058  int mode = PG_GETARG_INT32(4);
2059 
2060  Oid relid;
2061  ND_STATS *nd_stats;
2062 
2063  Node *other;
2064  Var *self;
2065  GBOX search_box;
2066  float8 selectivity = 0;
2067 
2068  POSTGIS_DEBUG(2, "gserialized_gist_sel called");
2069 
2070  /*
2071  * TODO: This is a big one,
2072  * All this statistics code *only* tries to generate a valid
2073  * selectivity for && and &&&. That leaves all the other
2074  * geometry operators with bad stats! The selectivity
2075  * calculation should take account of the incoming operator
2076  * type and do the right thing.
2077  */
2078 
2079  /* Fail if not a binary opclause (probably shouldn't happen) */
2080  if (list_length(args) != 2)
2081  {
2082  POSTGIS_DEBUG(3, "gserialized_gist_sel: not a binary opclause");
2083  PG_RETURN_FLOAT8(DEFAULT_ND_SEL);
2084  }
2085 
2086  /* Find the constant part */
2087  other = (Node *) linitial(args);
2088  if ( ! IsA(other, Const) )
2089  {
2090  self = (Var *)other;
2091  other = (Node *) lsecond(args);
2092  }
2093  else
2094  {
2095  self = (Var *) lsecond(args);
2096  }
2097 
2098  if ( ! IsA(other, Const) )
2099  {
2100  POSTGIS_DEBUG(3, " no constant arguments - returning a default selectivity");
2101  PG_RETURN_FLOAT8(DEFAULT_ND_SEL);
2102  }
2103 
2104  /*
2105  * We don't have a nice <const> && <var> or <var> && <const>
2106  * situation here. <const> && <const> would probably get evaluated
2107  * away by PgSQL earlier on. <func> && <const> is harder, and the
2108  * case we get often is <const> && ST_Expand(<var>), which does
2109  * actually have a subtly different selectivity than a bae
2110  * <const> && <var> call. It's calculatable though, by expanding
2111  * every cell in the histgram appropriately.
2112  *
2113  * Discussion: http://trac.osgeo.org/postgis/ticket/1828
2114  *
2115  * To do? Do variable selectivity based on the <func> node.
2116  */
2117  if ( ! IsA(self, Var) )
2118  {
2119  POSTGIS_DEBUG(3, " no bare variable argument ? - returning a moderate selectivity");
2120  PG_RETURN_FLOAT8(FALLBACK_ND_SEL);
2121  }
2122 
2123  /* Convert the constant to a BOX */
2124  if( ! gserialized_datum_get_gbox_p(((Const*)other)->constvalue, &search_box) )
2125  {
2126  POSTGIS_DEBUG(3, "search box is EMPTY");
2127  PG_RETURN_FLOAT8(0.0);
2128  }
2129  POSTGIS_DEBUGF(4, " requested search box is: %s", gbox_to_string(&search_box));
2130 
2131  /* Get pg_statistic row */
2132  relid = getrelid(self->varno, root->parse->rtable);
2133  nd_stats = pg_get_nd_stats(relid, self->varattno, mode);
2134  if ( ! nd_stats )
2135  {
2136  POSTGIS_DEBUG(3, " unable to load stats from syscache, not analyzed yet?");
2137  PG_RETURN_FLOAT8(FALLBACK_ND_SEL);
2138  }
2139  POSTGIS_DEBUGF(4, " got stats:\n%s", nd_stats_to_json(nd_stats));
2140 
2141  /* Do the estimation! */
2142  selectivity = estimate_selectivity(&search_box, nd_stats, mode);
2143  POSTGIS_DEBUGF(3, " returning computed value: %f", selectivity);
2144 
2145  pfree(nd_stats);
2146  PG_RETURN_FLOAT8(selectivity);
2147 }
2148 
2149 
2150 
2157 Datum gserialized_estimated_extent(PG_FUNCTION_ARGS)
2158 {
2159  char *nsp = NULL;
2160  char *tbl = NULL;
2161  text *col = NULL;
2162  char *nsp_tbl = NULL;
2163  Oid tbl_oid;
2164  ND_STATS *nd_stats;
2165  GBOX *gbox;
2166 
2167  if ( PG_NARGS() == 3 )
2168  {
2169  nsp = text2cstring(PG_GETARG_TEXT_P(0));
2170  tbl = text2cstring(PG_GETARG_TEXT_P(1));
2171  col = PG_GETARG_TEXT_P(2);
2172  nsp_tbl = palloc(strlen(nsp) + strlen(tbl) + 6);
2173  sprintf(nsp_tbl, "\"%s\".\"%s\"", nsp, tbl);
2174  tbl_oid = DatumGetObjectId(DirectFunctionCall1(regclassin, CStringGetDatum(nsp_tbl)));
2175  pfree(nsp_tbl);
2176  }
2177  else if ( PG_NARGS() == 2 )
2178  {
2179  tbl = text2cstring(PG_GETARG_TEXT_P(0));
2180  col = PG_GETARG_TEXT_P(1);
2181  nsp_tbl = palloc(strlen(tbl) + 3);
2182  sprintf(nsp_tbl, "\"%s\"", tbl);
2183  tbl_oid = DatumGetObjectId(DirectFunctionCall1(regclassin, CStringGetDatum(nsp_tbl)));
2184  pfree(nsp_tbl);
2185  }
2186  else
2187  {
2188  elog(ERROR, "estimated_extent() called with wrong number of arguments");
2189  PG_RETURN_NULL();
2190  }
2191 
2192  /* Estimated extent only returns 2D bounds, so use mode 2 */
2193  nd_stats = pg_get_nd_stats_by_name(tbl_oid, col, 2);
2194 
2195  /* Error out on no stats */
2196  if ( ! nd_stats )
2197  elog(ERROR, "stats for \"%s.%s\" do not exist", tbl, text2cstring(col));
2198 
2199  /* Construct the box */
2200  gbox = palloc(sizeof(GBOX));
2201  FLAGS_SET_GEODETIC(gbox->flags, 0);
2202  FLAGS_SET_Z(gbox->flags, 0);
2203  FLAGS_SET_M(gbox->flags, 0);
2204  gbox->xmin = nd_stats->extent.min[0];
2205  gbox->xmax = nd_stats->extent.max[0];
2206  gbox->ymin = nd_stats->extent.min[1];
2207  gbox->ymax = nd_stats->extent.max[1];
2208 
2209  pfree(nd_stats);
2210  PG_RETURN_POINTER(gbox);
2211 }
2212 
2220 Datum geometry_estimated_extent(PG_FUNCTION_ARGS)
2221 {
2222  if ( PG_NARGS() == 3 )
2223  {
2224  PG_RETURN_DATUM(
2225  DirectFunctionCall3(gserialized_estimated_extent,
2226  PG_GETARG_DATUM(0),
2227  PG_GETARG_DATUM(1),
2228  PG_GETARG_DATUM(2)));
2229  }
2230  else if ( PG_NARGS() == 2 )
2231  {
2232  PG_RETURN_DATUM(
2233  DirectFunctionCall2(gserialized_estimated_extent,
2234  PG_GETARG_DATUM(0),
2235  PG_GETARG_DATUM(1)));
2236  }
2237 
2238  elog(ERROR, "geometry_estimated_extent() called with wrong number of arguments");
2239  PG_RETURN_NULL();
2240 }
int gserialized_get_gbox_p(const GSERIALIZED *g, GBOX *box)
Read the bounding box off a serialization and calculate one if it is not already there.
Definition: g_serialized.c:373
static int nd_increment(ND_IBOX *ibox, int ndims, int *counter)
Given an n-d index array (counter), and a domain to increment it in (ibox) increment it by one...
static int text_p_get_mode(const text *txt)
Utility function to see if the first letter of the mode argument is 'N'.
static int nd_box_overlap(const ND_STATS *nd_stats, const ND_BOX *nd_box, ND_IBOX *nd_ibox)
What stats cells overlap with this ND_BOX? Put the lowest cell addresses in ND_IBOX->min and the high...
PG_FUNCTION_INFO_V1(gserialized_gist_joinsel_nd)
For (geometry &&& geometry) and (geography && geography) we call into the N-D mode.
#define DEFAULT_ND_JOINSEL
static void compute_gserialized_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc, int sample_rows, double total_rows)
In order to do useful selectivity calculations in both 2-D and N-D modes, we actually have to generat...
stringbuffer_t * stringbuffer_create(void)
Allocate a new stringbuffer_t.
Definition: stringbuffer.c:47
int gbox_is_valid(const GBOX *gbox)
Return false if any of the dimensions is NaN or infinite.
Definition: g_box.c:155
#define MIN_DIMENSION_WIDTH
Minimum width of a dimension that we'll bother trying to compute statistics on.
char * gbox_to_string(const GBOX *gbox)
Allocate a string representation of the GBOX, based on dimensionality of flags.
Definition: g_box.c:328
#define FLAGS_GET_GEODETIC(flags)
Definition: liblwgeom.h:109
char * stringbuffer_getstringcopy(stringbuffer_t *s)
Returns a newly allocated string large enough to contain the current state of the string...
Definition: stringbuffer.c:154
double xmax
Definition: liblwgeom.h:249
#define ND_DIMS
The maximum number of dimensions our code can handle.
static void compute_gserialized_stats_mode(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc, int sample_rows, double total_rows, int mode)
The gserialized_analyze_nd sets this function as a callback on the stats object when called by the AN...
Datum geometry_estimated_extent(PG_FUNCTION_ARGS)
static int nd_box_contains(const ND_BOX *a, const ND_BOX *b, int ndims)
Return TRUE if ND_BOX a contains b, false otherwise.
Datum _postgis_gserialized_stats(PG_FUNCTION_ARGS)
#define STATISTIC_SLOT_2D
static void nd_box_from_gbox(const GBOX *gbox, ND_BOX *nd_box)
Set the values of an ND_BOX from a GBOX.
struct ND_BOX_T ND_BOX
N-dimensional box type for calculations, to avoid doing explicit axis conversions from GBOX in all ca...
#define FALLBACK_ND_JOINSEL
Datum gserialized_estimated_extent(PG_FUNCTION_ARGS)
#define FALLBACK_ND_SEL
More modest fallback selectivity factor.
#define FLAGS_SET_GEODETIC(flags, value)
Definition: liblwgeom.h:115
Datum _postgis_gserialized_joinsel(PG_FUNCTION_ARGS)
static float8 estimate_selectivity(const GBOX *box, const ND_STATS *nd_stats, int mode)
This function returns an estimate of the selectivity of a search GBOX by looking at data in the ND_ST...
#define LW_FAILURE
Definition: liblwgeom.h:54
int stringbuffer_aprintf(stringbuffer_t *s, const char *fmt,...)
Appends a formatted string to the current string buffer, using the format and argument list provided...
Definition: stringbuffer.c:247
static int nd_box_init(ND_BOX *a)
Zero out an ND_BOX.
Datum gserialized_gist_joinsel(PG_FUNCTION_ARGS)
#define FLAGS_SET_Z(flags, value)
Definition: liblwgeom.h:112
double zmax
Definition: liblwgeom.h:253
double ymin
Definition: liblwgeom.h:250
Datum gserialized_gist_sel_2d(PG_FUNCTION_ARGS)
Datum gserialized_gist_joinsel_2d(PG_FUNCTION_ARGS)
Datum gserialized_gist_sel(PG_FUNCTION_ARGS)
static int nd_box_array_distribution(const ND_BOX **nd_boxes, int num_boxes, const ND_BOX *extent, int ndims, double *distribution)
Calculate how much a set of boxes is homogenously distributed or contentrated within one dimension...
double xmin
Definition: liblwgeom.h:248
static int nd_stats_value_index(const ND_STATS *stats, int *indexes)
Given a position in the n-d histogram (i,j,k) return the position in the 1-d values array...
float4 size[ND_DIMS]
Datum gserialized_gist_joinsel_nd(PG_FUNCTION_ARGS)
static int nd_box_intersects(const ND_BOX *a, const ND_BOX *b, int ndims)
Return TRUE if ND_BOX a overlaps b, false otherwise.
static double total_double(const double *vals, int nvals)
Given double array, return sum of values.
int min[ND_DIMS]
char * text2cstring(const text *textptr)
Datum gserialized_gist_sel_nd(PG_FUNCTION_ARGS)
static int cmp_int(const void *a, const void *b)
Integer comparison function for qsort.
static int nd_box_init_bounds(ND_BOX *a)
Prepare an ND_BOX for bounds calculation: set the maxes to the smallest thing possible and the mins t...
double ymax
Definition: liblwgeom.h:251
N-dimensional box index type.
#define FLAGS_GET_Z(flags)
Macros for manipulating the 'flags' byte.
Definition: liblwgeom.h:106
static int range_quintile(int *vals, int nvals)
The difference between the fourth and first quintile values, the "inter-quintile range".
Datum _postgis_gserialized_sel(PG_FUNCTION_ARGS)
static float8 estimate_join_selectivity(const ND_STATS *s1, const ND_STATS *s2)
Given two statistics histograms, what is the selectivity of a join driven by the && or &&& operator...
static ND_STATS * pg_get_nd_stats(const Oid table_oid, AttrNumber att_num, int mode)
Pull the stats object from the PgSQL system catalogs.
uint8_t flags
Definition: liblwgeom.h:247
static char * nd_box_to_json(const ND_BOX *nd_box, int ndims)
Convert an ND_BOX to a JSON string for printing.
float4 max[ND_DIMS]
void stringbuffer_append(stringbuffer_t *s, const char *a)
Append the specified string to the stringbuffer_t.
Definition: stringbuffer.c:128
#define STATISTIC_KIND_2D
struct ND_IBOX_T ND_IBOX
N-dimensional box index type.
void stringbuffer_destroy(stringbuffer_t *s)
Free the stringbuffer_t and all memory managed within it.
Definition: stringbuffer.c:72
int max[ND_DIMS]
struct ND_STATS_T ND_STATS
N-dimensional statistics structure.
float4 min[ND_DIMS]
#define FALSE
Definition: dbfopen.c:169
static int nd_box_merge(const ND_BOX *source, ND_BOX *target)
Create a printable view of the ND_STATS histogram.
static int nd_box_expand(ND_BOX *nd_box, double expansion_factor)
Expand an ND_BOX ever so slightly.
double mmin
Definition: liblwgeom.h:254
#define SDFACTOR
double zmin
Definition: liblwgeom.h:252
static ND_STATS * pg_get_nd_stats_by_name(const Oid table_oid, const text *att_text, int mode)
Pull the stats object from the PgSQL system catalogs.
#define FLAGS_GET_M(flags)
Definition: liblwgeom.h:107
static char * nd_stats_to_json(const ND_STATS *nd_stats)
Convert an ND_STATS to a JSON representation for external use.
Datum gserialized_analyze_nd(PG_FUNCTION_ARGS)
double mmax
Definition: liblwgeom.h:255
#define STATISTIC_SLOT_ND
static double nd_box_ratio(const ND_BOX *b1, const ND_BOX *b2, int ndims)
Returns the proportion of b2 that is covered by b1.
N-dimensional statistics structure.
#define TRUE
Definition: dbfopen.c:170
N-dimensional box type for calculations, to avoid doing explicit axis conversions from GBOX in all ca...
static int gbox_ndims(const GBOX *gbox)
Given that geodetic boxes are X/Y/Z regardless of the underlying geometry dimensionality and other bo...
This library is the generic geometry handling section of PostGIS.
#define DEFAULT_ND_SEL
Default geometry selectivity factor.
#define STATISTIC_KIND_ND
Assign a number to the n-dimensional statistics kind.
#define FLAGS_SET_M(flags, value)
Definition: liblwgeom.h:113