PostGIS  2.2.7dev-r@@SVN_REVISION@@
gserialized_estimate.c
Go to the documentation of this file.
1 /**********************************************************************
2  * PostGIS - Spatial Types for PostgreSQL
3  * http://postgis.net
4  *
5  * Copyright 2012 (C) Paul Ramsey <pramsey@cleverelephant.ca>
6  *
7  * This is free software; you can redistribute and/or modify it under
8  * the terms of the GNU General Public Licence. See the COPYING file.
9  *
10  **********************************************************************/
11 
12 
13 /**********************************************************************
14  THEORY OF OPERATION
15 
16 The ANALYZE command hooks to a callback (gserialized_analyze_nd) that
17 calculates (compute_gserialized_stats_mode) two histograms of occurances of
18 features, once for the 2D domain (and the && operator) one for the
19 ND domain (and the &&& operator).
20 
21 Queries in PostgreSQL call into the selectivity sub-system to find out
22 the relative effectiveness of different clauses in sub-setting
23 relations. Queries with constant arguments call gserialized_gist_sel,
24 queries with relations on both sides call gserialized_gist_joinsel.
25 
26 gserialized_gist_sel sums up the values in the histogram that overlap
27 the contant search box.
28 
29 gserialized_gist_joinsel sums up the product of the overlapping
30 cells in each relation's histogram.
31 
32 Depending on the operator and type, the mode of selectivity calculation
33 will be 2D or ND.
34 
35 - geometry && geometry ==> 2D
36 - geometry &&& geometry ==> ND
37 - geography && geography ==> ND
38 
39 The 2D mode is put in effect by retrieving the 2D histogram from the
40 statistics cache and then allowing the generic ND calculations to
41 go to work.
42 
43 TO DO: More testing and examination of the &&& operator and mixed
44 dimensionality cases. (2D geometry) &&& (3D column), etc.
45 
46 **********************************************************************/
47 
48 #include "postgres.h"
49 #include "executor/spi.h"
50 #include "fmgr.h"
51 #include "commands/vacuum.h"
52 #include "nodes/relation.h"
53 #include "parser/parsetree.h"
54 #include "utils/array.h"
55 #include "utils/lsyscache.h"
56 #include "utils/builtins.h"
57 #include "utils/syscache.h"
58 #include "utils/rel.h"
59 #include "utils/selfuncs.h"
60 
61 #include "../postgis_config.h"
62 
63 #if POSTGIS_PGSQL_VERSION >= 93
64  #include "access/htup_details.h"
65 #endif
66 
67 #include "stringbuffer.h"
68 #include "liblwgeom.h"
69 #include "lwgeom_pg.h" /* For debugging macros. */
70 #include "gserialized_gist.h" /* For index common functions */
71 
72 #include <math.h>
73 #if HAVE_IEEEFP_H
74 #include <ieeefp.h>
75 #endif
76 #include <float.h>
77 #include <string.h>
78 #include <stdio.h>
79 #include <errno.h>
80 #include <ctype.h>
81 
82 /* Prototypes */
83 Datum gserialized_gist_joinsel(PG_FUNCTION_ARGS);
84 Datum gserialized_gist_joinsel_2d(PG_FUNCTION_ARGS);
85 Datum gserialized_gist_joinsel_nd(PG_FUNCTION_ARGS);
86 Datum gserialized_gist_sel(PG_FUNCTION_ARGS);
87 Datum gserialized_gist_sel_2d(PG_FUNCTION_ARGS);
88 Datum gserialized_gist_sel_nd(PG_FUNCTION_ARGS);
89 Datum gserialized_analyze_nd(PG_FUNCTION_ARGS);
90 Datum gserialized_estimated_extent(PG_FUNCTION_ARGS);
91 Datum _postgis_gserialized_sel(PG_FUNCTION_ARGS);
92 Datum _postgis_gserialized_joinsel(PG_FUNCTION_ARGS);
93 Datum _postgis_gserialized_stats(PG_FUNCTION_ARGS);
94 
95 /* Old Prototype */
96 Datum geometry_estimated_extent(PG_FUNCTION_ARGS);
97 
108 #define STATISTIC_KIND_ND 102
109 #define STATISTIC_KIND_2D 103
110 #define STATISTIC_SLOT_ND 0
111 #define STATISTIC_SLOT_2D 1
112 
113 /*
114 * The SD factor restricts the side of the statistics histogram
115 * based on the standard deviation of the extent of the data.
116 * SDFACTOR is the number of standard deviations from the mean
117 * the histogram will extend.
118 */
119 #define SDFACTOR 3.25
120 
126 #define ND_DIMS 4
127 
134 #define MIN_DIMENSION_WIDTH 0.000000001
135 
139 #define DEFAULT_ND_SEL 0.0001
140 #define DEFAULT_ND_JOINSEL 0.001
141 
145 #define FALLBACK_ND_SEL 0.2
146 #define FALLBACK_ND_JOINSEL 0.3
147 
153 typedef struct ND_BOX_T
154 {
155  float4 min[ND_DIMS];
156  float4 max[ND_DIMS];
157 } ND_BOX;
158 
162 typedef struct ND_IBOX_T
163 {
164  int min[ND_DIMS];
165  int max[ND_DIMS];
166 } ND_IBOX;
167 
168 
175 typedef struct ND_STATS_T
176 {
177  /* Dimensionality of the histogram. */
178  float4 ndims;
179 
180  /* Size of n-d histogram in each dimension. */
181  float4 size[ND_DIMS];
182 
183  /* Lower-left (min) and upper-right (max) spatial bounds of histogram. */
185 
186  /* How many rows in the table itself? */
188 
189  /* How many rows were in the sample that built this histogram? */
191 
192  /* How many not-Null/Empty features were in the sample? */
194 
195  /* How many features actually got sampled in the histogram? */
197 
198  /* How many cells in histogram? (sizex*sizey*sizez*sizem) */
200 
201  /* How many cells did those histogram features cover? */
202  /* Since we are pro-rating coverage, this number should */
203  /* now always equal histogram_features */
205 
206  /* Variable length # of floats for histogram */
207  float4 value[1];
208 } ND_STATS;
209 
210 
211 
212 
219 static int
220 gbox_ndims(const GBOX* gbox)
221 {
222  int dims = 2;
223  if ( FLAGS_GET_GEODETIC(gbox->flags) )
224  return 3;
225  if ( FLAGS_GET_Z(gbox->flags) )
226  dims++;
227  if ( FLAGS_GET_M(gbox->flags) )
228  dims++;
229  return dims;
230 }
231 
237 static int
238 text_p_get_mode(const text *txt)
239 {
240  int mode = 2;
241  char *modestr = text2cstring(txt);
242  if ( modestr[0] == 'N' )
243  mode = 0;
244  pfree(modestr);
245  return mode;
246 }
247 
248 
252 static int
253 cmp_int (const void *a, const void *b)
254 {
255  int ia = *((const int*)a);
256  int ib = *((const int*)b);
257 
258  if ( ia == ib )
259  return 0;
260  else if ( ia > ib )
261  return 1;
262  else
263  return -1;
264 }
265 
270 static int
271 range_quintile(int *vals, int nvals)
272 {
273  qsort(vals, nvals, sizeof(int), cmp_int);
274  return vals[4*nvals/5] - vals[nvals/5];
275 }
276 
280 static double
281 total_double(const double *vals, int nvals)
282 {
283  int i;
284  float total = 0;
285  /* Calculate total */
286  for ( i = 0; i < nvals; i++ )
287  total += vals[i];
288 
289  return total;
290 }
291 
292 #if POSTGIS_DEBUG_LEVEL >= 3
293 
297 static int
298 total_int(const int *vals, int nvals)
299 {
300  int i;
301  int total = 0;
302  /* Calculate total */
303  for ( i = 0; i < nvals; i++ )
304  total += vals[i];
305 
306  return total;
307 }
308 
312 static double
313 avg(const int *vals, int nvals)
314 {
315  int t = total_int(vals, nvals);
316  return (double)t / (double)nvals;
317 }
318 
322 static double
323 stddev(const int *vals, int nvals)
324 {
325  int i;
326  double sigma2 = 0;
327  double mean = avg(vals, nvals);
328 
329  /* Calculate sigma2 */
330  for ( i = 0; i < nvals; i++ )
331  {
332  double v = (double)(vals[i]);
333  sigma2 += (mean - v) * (mean - v);
334  }
335  return sqrt(sigma2 / nvals);
336 }
337 #endif /* POSTGIS_DEBUG_LEVEL >= 3 */
338 
343 static int
344 nd_stats_value_index(const ND_STATS *stats, int *indexes)
345 {
346  int d;
347  int accum = 1, vdx = 0;
348 
349  /* Calculate the index into the 1-d values array that the (i,j,k,l) */
350  /* n-d histogram coordinate implies. */
351  /* index = x + y * sizex + z * sizex * sizey + m * sizex * sizey * sizez */
352  for ( d = 0; d < (int)(stats->ndims); d++ )
353  {
354  int size = (int)(stats->size[d]);
355  if ( indexes[d] < 0 || indexes[d] >= size )
356  {
357  POSTGIS_DEBUGF(3, " bad index at (%d, %d)", indexes[0], indexes[1]);
358  return -1;
359  }
360  vdx += indexes[d] * accum;
361  accum *= size;
362  }
363  return vdx;
364 }
365 
369 static char*
370 nd_box_to_json(const ND_BOX *nd_box, int ndims)
371 {
372  char *rv;
373  int i;
375 
376  stringbuffer_append(sb, "{\"min\":[");
377  for ( i = 0; i < ndims; i++ )
378  {
379  if ( i ) stringbuffer_append(sb, ",");
380  stringbuffer_aprintf(sb, "%.6g", nd_box->min[i]);
381  }
382  stringbuffer_append(sb, "],\"max\":[");
383  for ( i = 0; i < ndims; i++ )
384  {
385  if ( i ) stringbuffer_append(sb, ",");
386  stringbuffer_aprintf(sb, "%.6g", nd_box->max[i]);
387  }
388  stringbuffer_append(sb, "]}");
389 
392  return rv;
393 }
394 
395 
400 static char*
401 nd_stats_to_json(const ND_STATS *nd_stats)
402 {
403  char *json_extent, *str;
404  int d;
406  int ndims = (int)roundf(nd_stats->ndims);
407 
408  stringbuffer_append(sb, "{");
409  stringbuffer_aprintf(sb, "\"ndims\":%d,", ndims);
410 
411  /* Size */
412  stringbuffer_append(sb, "\"size\":[");
413  for ( d = 0; d < ndims; d++ )
414  {
415  if ( d ) stringbuffer_append(sb, ",");
416  stringbuffer_aprintf(sb, "%d", (int)roundf(nd_stats->size[d]));
417  }
418  stringbuffer_append(sb, "],");
419 
420  /* Extent */
421  json_extent = nd_box_to_json(&(nd_stats->extent), ndims);
422  stringbuffer_aprintf(sb, "\"extent\":%s,", json_extent);
423  pfree(json_extent);
424 
425  stringbuffer_aprintf(sb, "\"table_features\":%d,", (int)roundf(nd_stats->table_features));
426  stringbuffer_aprintf(sb, "\"sample_features\":%d,", (int)roundf(nd_stats->sample_features));
427  stringbuffer_aprintf(sb, "\"not_null_features\":%d,", (int)roundf(nd_stats->not_null_features));
428  stringbuffer_aprintf(sb, "\"histogram_features\":%d,", (int)roundf(nd_stats->histogram_features));
429  stringbuffer_aprintf(sb, "\"histogram_cells\":%d,", (int)roundf(nd_stats->histogram_cells));
430  stringbuffer_aprintf(sb, "\"cells_covered\":%d", (int)roundf(nd_stats->cells_covered));
431  stringbuffer_append(sb, "}");
432 
433  str = stringbuffer_getstringcopy(sb);
435  return str;
436 }
437 
438 
444 // static char*
445 // nd_stats_to_grid(const ND_STATS *stats)
446 // {
447 // char *rv;
448 // int j, k;
449 // int sizex = (int)roundf(stats->size[0]);
450 // int sizey = (int)roundf(stats->size[1]);
451 // stringbuffer_t *sb = stringbuffer_create();
452 //
453 // for ( k = 0; k < sizey; k++ )
454 // {
455 // for ( j = 0; j < sizex; j++ )
456 // {
457 // stringbuffer_aprintf(sb, "%3d ", (int)roundf(stats->value[j + k*sizex]));
458 // }
459 // stringbuffer_append(sb, "\n");
460 // }
461 //
462 // rv = stringbuffer_getstringcopy(sb);
463 // stringbuffer_destroy(sb);
464 // return rv;
465 // }
466 
467 
469 static int
470 nd_box_merge(const ND_BOX *source, ND_BOX *target)
471 {
472  int d;
473  for ( d = 0; d < ND_DIMS; d++ )
474  {
475  target->min[d] = Min(target->min[d], source->min[d]);
476  target->max[d] = Max(target->max[d], source->max[d]);
477  }
478  return TRUE;
479 }
480 
482 static int
484 {
485  memset(a, 0, sizeof(ND_BOX));
486  return TRUE;
487 }
488 
494 static int
496 {
497  int d;
498  for ( d = 0; d < ND_DIMS; d++ )
499  {
500  a->min[d] = FLT_MAX;
501  a->max[d] = -1 * FLT_MAX;
502  }
503  return TRUE;
504 }
505 
507 static void
508 nd_box_from_gbox(const GBOX *gbox, ND_BOX *nd_box)
509 {
510  int d = 0;
511  POSTGIS_DEBUGF(3, " %s", gbox_to_string(gbox));
512 
513  nd_box_init(nd_box);
514  nd_box->min[d] = gbox->xmin;
515  nd_box->max[d] = gbox->xmax;
516  d++;
517  nd_box->min[d] = gbox->ymin;
518  nd_box->max[d] = gbox->ymax;
519  d++;
520  if ( FLAGS_GET_GEODETIC(gbox->flags) )
521  {
522  nd_box->min[d] = gbox->zmin;
523  nd_box->max[d] = gbox->zmax;
524  return;
525  }
526  if ( FLAGS_GET_Z(gbox->flags) )
527  {
528  nd_box->min[d] = gbox->zmin;
529  nd_box->max[d] = gbox->zmax;
530  d++;
531  }
532  if ( FLAGS_GET_M(gbox->flags) )
533  {
534  nd_box->min[d] = gbox->mmin;
535  nd_box->max[d] = gbox->mmax;
536  d++;
537  }
538  return;
539 }
540 
544 static int
545 nd_box_intersects(const ND_BOX *a, const ND_BOX *b, int ndims)
546 {
547  int d;
548  for ( d = 0; d < ndims; d++ )
549  {
550  if ( (a->min[d] > b->max[d]) || (a->max[d] < b->min[d]) )
551  return FALSE;
552  }
553  return TRUE;
554 }
555 
559 static int
560 nd_box_contains(const ND_BOX *a, const ND_BOX *b, int ndims)
561 {
562  int d;
563  for ( d = 0; d < ndims; d++ )
564  {
565  if ( ! ((a->min[d] < b->min[d]) && (a->max[d] > b->max[d])) )
566  return FALSE;
567  }
568  return TRUE;
569 }
570 
575 static int
576 nd_box_expand(ND_BOX *nd_box, double expansion_factor)
577 {
578  int d;
579  double size;
580  for ( d = 0; d < ND_DIMS; d++ )
581  {
582  size = nd_box->max[d] - nd_box->min[d];
583  if ( size <= 0 ) continue;
584  nd_box->min[d] -= size * expansion_factor / 2;
585  nd_box->max[d] += size * expansion_factor / 2;
586  }
587  return TRUE;
588 }
589 
594 static inline int
595 nd_box_overlap(const ND_STATS *nd_stats, const ND_BOX *nd_box, ND_IBOX *nd_ibox)
596 {
597  int d;
598 
599  POSTGIS_DEBUGF(4, " nd_box: %s", nd_box_to_json(nd_box, nd_stats->ndims));
600 
601  /* Initialize ibox */
602  memset(nd_ibox, 0, sizeof(ND_IBOX));
603 
604  /* In each dimension... */
605  for ( d = 0; d < nd_stats->ndims; d++ )
606  {
607  double smin = nd_stats->extent.min[d];
608  double smax = nd_stats->extent.max[d];
609  double width = smax - smin;
610  int size = roundf(nd_stats->size[d]);
611 
612  /* ... find cells the box overlaps with in this dimension */
613  nd_ibox->min[d] = floor(size * (nd_box->min[d] - smin) / width);
614  nd_ibox->max[d] = floor(size * (nd_box->max[d] - smin) / width);
615 
616  POSTGIS_DEBUGF(5, " stats: dim %d: min %g: max %g: width %g", d, smin, smax, width);
617  POSTGIS_DEBUGF(5, " overlap: dim %d: (%d, %d)", d, nd_ibox->min[d], nd_ibox->max[d]);
618 
619  /* Push any out-of range values into range */
620  nd_ibox->min[d] = Max(nd_ibox->min[d], 0);
621  nd_ibox->max[d] = Min(nd_ibox->max[d], size-1);
622  }
623  return TRUE;
624 }
625 
629 static inline double
630 nd_box_ratio(const ND_BOX *b1, const ND_BOX *b2, int ndims)
631 {
632  int d;
633  bool covered = TRUE;
634  double ivol = 1.0;
635  double vol2 = 1.0;
636  double vol1 = 1.0;
637 
638  for ( d = 0 ; d < ndims; d++ )
639  {
640  if ( b1->max[d] <= b2->min[d] || b1->min[d] >= b2->max[d] )
641  return 0.0; /* Disjoint */
642 
643  if ( b1->min[d] > b2->min[d] || b1->max[d] < b2->max[d] )
644  covered = FALSE;
645  }
646 
647  if ( covered )
648  return 1.0;
649 
650  for ( d = 0; d < ndims; d++ )
651  {
652  double width1 = b1->max[d] - b1->min[d];
653  double width2 = b2->max[d] - b2->min[d];
654  double imin, imax, iwidth;
655 
656  vol1 *= width1;
657  vol2 *= width2;
658 
659  imin = Max(b1->min[d], b2->min[d]);
660  imax = Min(b1->max[d], b2->max[d]);
661  iwidth = imax - imin;
662  iwidth = Max(0.0, iwidth);
663 
664  ivol *= iwidth;
665  }
666 
667  if ( vol2 == 0.0 )
668  return vol2;
669 
670  return ivol / vol2;
671 }
672 
673 
689 static int
690 nd_box_array_distribution(const ND_BOX **nd_boxes, int num_boxes, const ND_BOX *extent, int ndims, double *distribution)
691 {
692  /* How many bins shall we use in figuring out the distribution? */
693  static int num_bins = 50;
694  int d, i, k, range;
695  int counts[num_bins];
696  double smin, smax; /* Spatial min, spatial max */
697  double swidth; /* Spatial width of dimension */
698 #if POSTGIS_DEBUG_LEVEL >= 3
699  double average, sdev, sdev_ratio;
700 #endif
701  int bmin, bmax; /* Bin min, bin max */
702  const ND_BOX *ndb;
703 
704  /* For each dimension... */
705  for ( d = 0; d < ndims; d++ )
706  {
707  /* Initialize counts for this dimension */
708  memset(counts, 0, sizeof(int)*num_bins);
709 
710  smin = extent->min[d];
711  smax = extent->max[d];
712  swidth = smax - smin;
713 
714  /* Don't try and calculate distribution of overly narrow dimensions */
715  if ( swidth < MIN_DIMENSION_WIDTH )
716  {
717  distribution[d] = 0;
718  continue;
719  }
720 
721  /* Sum up the overlaps of each feature with the dimensional bins */
722  for ( i = 0; i < num_boxes; i++ )
723  {
724  double minoffset, maxoffset;
725 
726  /* Skip null entries */
727  ndb = nd_boxes[i];
728  if ( ! ndb ) continue;
729 
730  /* Where does box fall relative to the working range */
731  minoffset = ndb->min[d] - smin;
732  maxoffset = ndb->max[d] - smin;
733 
734  /* Skip boxes that our outside our working range */
735  if ( minoffset < 0 || minoffset > swidth ||
736  maxoffset < 0 || maxoffset > swidth )
737  {
738  continue;
739  }
740 
741  /* What bins does this range correspond to? */
742  bmin = num_bins * (minoffset) / swidth;
743  bmax = num_bins * (maxoffset) / swidth;
744 
745  POSTGIS_DEBUGF(4, " dimension %d, feature %d: bin %d to bin %d", d, i, bmin, bmax);
746 
747  /* Increment the counts in all the bins this feature overlaps */
748  for ( k = bmin; k <= bmax; k++ )
749  {
750  counts[k] += 1;
751  }
752 
753  }
754 
755  /* How dispersed is the distribution of features across bins? */
756  range = range_quintile(counts, num_bins);
757 
758 #if POSTGIS_DEBUG_LEVEL >= 3
759  average = avg(counts, num_bins);
760  sdev = stddev(counts, num_bins);
761  sdev_ratio = sdev/average;
762 
763  POSTGIS_DEBUGF(3, " dimension %d: range = %d", d, range);
764  POSTGIS_DEBUGF(3, " dimension %d: average = %.6g", d, average);
765  POSTGIS_DEBUGF(3, " dimension %d: stddev = %.6g", d, sdev);
766  POSTGIS_DEBUGF(3, " dimension %d: stddev_ratio = %.6g", d, sdev_ratio);
767 #endif
768 
769  distribution[d] = range;
770  }
771 
772  return TRUE;
773 }
774 
780 static inline int
781 nd_increment(ND_IBOX *ibox, int ndims, int *counter)
782 {
783  int d = 0;
784 
785  while ( d < ndims )
786  {
787  if ( counter[d] < ibox->max[d] )
788  {
789  counter[d] += 1;
790  break;
791  }
792  counter[d] = ibox->min[d];
793  d++;
794  }
795  /* That's it, cannot increment any more! */
796  if ( d == ndims )
797  return FALSE;
798 
799  /* Increment complete! */
800  return TRUE;
801 }
802 
803 static ND_STATS*
804 pg_nd_stats_from_tuple(HeapTuple stats_tuple, int mode)
805 {
806  int stats_kind = STATISTIC_KIND_ND;
807  int rv, nvalues;
808  float4 *floatptr;
809  ND_STATS *nd_stats;
810 
811  /* If we're in 2D mode, set the kind appropriately */
812  if ( mode == 2 ) stats_kind = STATISTIC_KIND_2D;
813 
814  /* Then read the geom status histogram from that */
815  rv = get_attstatsslot(stats_tuple, 0, 0, stats_kind, InvalidOid,
816  NULL, NULL, NULL, &floatptr, &nvalues);
817  if ( ! rv ) {
818  POSTGIS_DEBUGF(2,
819  "no slot of kind %d in stats tuple", stats_kind);
820  return NULL;
821  }
822 
823  /* Clone the stats here so we can release the attstatsslot immediately */
824  nd_stats = palloc(sizeof(float) * nvalues);
825  memcpy(nd_stats, floatptr, sizeof(float) * nvalues);
826 
827  /* Clean up */
828  free_attstatsslot(0, NULL, 0, floatptr, nvalues);
829 
830  return nd_stats;
831 }
832 
837 static ND_STATS*
838 pg_get_nd_stats(const Oid table_oid, AttrNumber att_num, int mode)
839 {
840  HeapTuple stats_tuple;
841  ND_STATS *nd_stats;
842 
843  /* First pull the stats tuple */
844  stats_tuple = SearchSysCache2(STATRELATT, table_oid, att_num);
845  if ( ! stats_tuple )
846  {
847  POSTGIS_DEBUGF(2, "stats for \"%s\" do not exist", get_rel_name(table_oid)? get_rel_name(table_oid) : "NULL");
848  return NULL;
849  }
850 
851  nd_stats = pg_nd_stats_from_tuple(stats_tuple, mode);
852  ReleaseSysCache(stats_tuple);
853  if ( ! nd_stats )
854  {
855  POSTGIS_DEBUGF(2,
856  "histogram for attribute %d of table \"%s\" does not exist?",
857  att_num, get_rel_name(table_oid));
858  }
859 
860  return nd_stats;
861 }
862 
868 static ND_STATS*
869 pg_get_nd_stats_by_name(const Oid table_oid, const text *att_text, int mode)
870 {
871  const char *att_name = text2cstring(att_text);
872  AttrNumber att_num;
873 
874  /* We know the name? Look up the num */
875  if ( att_text )
876  {
877  /* Get the attribute number */
878  att_num = get_attnum(table_oid, att_name);
879  if ( ! att_num ) {
880  elog(ERROR, "attribute \"%s\" does not exist", att_name);
881  return NULL;
882  }
883  }
884  else
885  {
886  elog(ERROR, "attribute name is null");
887  return NULL;
888  }
889 
890  return pg_get_nd_stats(table_oid, att_num, mode);
891 }
892 
906 static float8
908 {
909  int ncells1, ncells2;
910  int ndims1, ndims2, ndims;
911  double ntuples_max;
912  double ntuples_not_null1, ntuples_not_null2;
913 
914  ND_BOX extent1, extent2;
915  ND_IBOX ibox1, ibox2;
916  int at1[ND_DIMS];
917  int at2[ND_DIMS];
918  double min1[ND_DIMS];
919  double width1[ND_DIMS];
920  double cellsize1[ND_DIMS];
921  int size2[ND_DIMS];
922  double min2[ND_DIMS];
923  double width2[ND_DIMS];
924  double cellsize2[ND_DIMS];
925  int size1[ND_DIMS];
926  int d;
927  double val = 0;
928  float8 selectivity;
929 
930  /* Drop out on null inputs */
931  if ( ! ( s1 && s2 ) )
932  {
933  elog(NOTICE, " estimate_join_selectivity called with null inputs");
934  return FALLBACK_ND_SEL;
935  }
936 
937  /* We need to know how many cells each side has... */
938  ncells1 = (int)roundf(s1->histogram_cells);
939  ncells2 = (int)roundf(s2->histogram_cells);
940 
941  /* ...so that we can drive the summation loop with the smaller histogram. */
942  if ( ncells1 > ncells2 )
943  {
944  const ND_STATS *stats_tmp = s1;
945  s1 = s2;
946  s2 = stats_tmp;
947  }
948 
949  POSTGIS_DEBUGF(3, "s1: %s", nd_stats_to_json(s1));
950  POSTGIS_DEBUGF(3, "s2: %s", nd_stats_to_json(s2));
951 
952  /* Re-read that info after the swap */
953  ncells1 = (int)roundf(s1->histogram_cells);
954  ncells2 = (int)roundf(s2->histogram_cells);
955 
956  /* Q: What's the largest possible join size these relations can create? */
957  /* A: The product of the # of non-null rows in each relation. */
958  ntuples_not_null1 = s1->table_features * (s1->not_null_features / s1->sample_features);
959  ntuples_not_null2 = s2->table_features * (s2->not_null_features / s2->sample_features);
960  ntuples_max = ntuples_not_null1 * ntuples_not_null2;
961 
962  /* Get the ndims as ints */
963  ndims1 = (int)roundf(s1->ndims);
964  ndims2 = (int)roundf(s2->ndims);
965  ndims = Max(ndims1, ndims2);
966 
967  /* Get the extents */
968  extent1 = s1->extent;
969  extent2 = s2->extent;
970 
971  /* If relation stats do not intersect, join is very very selective. */
972  if ( ! nd_box_intersects(&extent1, &extent2, ndims) )
973  {
974  POSTGIS_DEBUG(3, "relation stats do not intersect, returning 0");
975  PG_RETURN_FLOAT8(0.0);
976  }
977 
978  /*
979  * First find the index range of the part of the smaller
980  * histogram that overlaps the larger one.
981  */
982  if ( ! nd_box_overlap(s1, &extent2, &ibox1) )
983  {
984  POSTGIS_DEBUG(3, "could not calculate overlap of relations");
985  PG_RETURN_FLOAT8(FALLBACK_ND_JOINSEL);
986  }
987 
988  /* Initialize counters / constants on s1 */
989  for ( d = 0; d < ndims1; d++ )
990  {
991  at1[d] = ibox1.min[d];
992  min1[d] = s1->extent.min[d];
993  width1[d] = s1->extent.max[d] - s1->extent.min[d];
994  size1[d] = (int)roundf(s1->size[d]);
995  cellsize1[d] = width1[d] / size1[d];
996  }
997 
998  /* Initialize counters / constants on s2 */
999  for ( d = 0; d < ndims2; d++ )
1000  {
1001  min2[d] = s2->extent.min[d];
1002  width2[d] = s2->extent.max[d] - s2->extent.min[d];
1003  size2[d] = (int)roundf(s2->size[d]);
1004  cellsize2[d] = width2[d] / size2[d];
1005  }
1006 
1007  /* For each affected cell of s1... */
1008  do
1009  {
1010  double val1;
1011  /* Construct the bounds of this cell */
1012  ND_BOX nd_cell1;
1013  nd_box_init(&nd_cell1);
1014  for ( d = 0; d < ndims1; d++ )
1015  {
1016  nd_cell1.min[d] = min1[d] + (at1[d]+0) * cellsize1[d];
1017  nd_cell1.max[d] = min1[d] + (at1[d]+1) * cellsize1[d];
1018  }
1019 
1020  /* Find the cells of s2 that cell1 overlaps.. */
1021  nd_box_overlap(s2, &nd_cell1, &ibox2);
1022 
1023  /* Initialize counter */
1024  for ( d = 0; d < ndims2; d++ )
1025  {
1026  at2[d] = ibox2.min[d];
1027  }
1028 
1029  POSTGIS_DEBUGF(3, "at1 %d,%d %s", at1[0], at1[1], nd_box_to_json(&nd_cell1, ndims1));
1030 
1031  /* Get the value at this cell */
1032  val1 = s1->value[nd_stats_value_index(s1, at1)];
1033 
1034  /* For each overlapped cell of s2... */
1035  do
1036  {
1037  double ratio2;
1038  double val2;
1039 
1040  /* Construct the bounds of this cell */
1041  ND_BOX nd_cell2;
1042  nd_box_init(&nd_cell2);
1043  for ( d = 0; d < ndims2; d++ )
1044  {
1045  nd_cell2.min[d] = min2[d] + (at2[d]+0) * cellsize2[d];
1046  nd_cell2.max[d] = min2[d] + (at2[d]+1) * cellsize2[d];
1047  }
1048 
1049  POSTGIS_DEBUGF(3, " at2 %d,%d %s", at2[0], at2[1], nd_box_to_json(&nd_cell2, ndims2));
1050 
1051  /* Calculate overlap ratio of the cells */
1052  ratio2 = nd_box_ratio(&nd_cell1, &nd_cell2, Max(ndims1, ndims2));
1053 
1054  /* Multiply the cell counts, scaled by overlap ratio */
1055  val2 = s2->value[nd_stats_value_index(s2, at2)];
1056  POSTGIS_DEBUGF(3, " val1 %.6g val2 %.6g ratio %.6g", val1, val2, ratio2);
1057  val += val1 * (val2 * ratio2);
1058  }
1059  while ( nd_increment(&ibox2, ndims2, at2) );
1060 
1061  }
1062  while( nd_increment(&ibox1, ndims1, at1) );
1063 
1064  POSTGIS_DEBUGF(3, "val of histogram = %g", val);
1065 
1066  /*
1067  * In order to compare our total cell count "val" to the
1068  * ntuples_max, we need to scale val up to reflect a full
1069  * table estimate. So, multiply by ratio of table size to
1070  * sample size.
1071  */
1072  val *= (s1->table_features / s1->sample_features);
1073  val *= (s2->table_features / s2->sample_features);
1074 
1075  POSTGIS_DEBUGF(3, "val scaled to full table size = %g", val);
1076 
1077  /*
1078  * Because the cell counts are over-determined due to
1079  * double counting of features that overlap multiple cells
1080  * (see the compute_gserialized_stats routine)
1081  * we also have to scale our cell count "val" *down*
1082  * to adjust for the double counting.
1083  */
1084 // val /= (s1->cells_covered / s1->histogram_features);
1085 // val /= (s2->cells_covered / s2->histogram_features);
1086 
1087  /*
1088  * Finally, the selectivity is the estimated number of
1089  * rows to be returned divided by the maximum possible
1090  * number of rows that can be returned.
1091  */
1092  selectivity = val / ntuples_max;
1093 
1094  /* Guard against over-estimates and crazy numbers :) */
1095  if ( isnan(selectivity) || ! isfinite(selectivity) || selectivity < 0.0 )
1096  {
1097  selectivity = DEFAULT_ND_JOINSEL;
1098  }
1099  else if ( selectivity > 1.0 )
1100  {
1101  selectivity = 1.0;
1102  }
1103 
1104  return selectivity;
1105 }
1106 
1112 Datum gserialized_gist_joinsel_nd(PG_FUNCTION_ARGS)
1113 {
1114  PG_RETURN_DATUM(DirectFunctionCall5(
1116  PG_GETARG_DATUM(0), PG_GETARG_DATUM(1),
1117  PG_GETARG_DATUM(2), PG_GETARG_DATUM(3),
1118  Int32GetDatum(0) /* ND mode */
1119  ));
1120 }
1121 
1127 Datum gserialized_gist_joinsel_2d(PG_FUNCTION_ARGS)
1128 {
1129  PG_RETURN_DATUM(DirectFunctionCall5(
1131  PG_GETARG_DATUM(0), PG_GETARG_DATUM(1),
1132  PG_GETARG_DATUM(2), PG_GETARG_DATUM(3),
1133  Int32GetDatum(2) /* 2D mode */
1134  ));
1135 }
1136 
1146 Datum gserialized_gist_joinsel(PG_FUNCTION_ARGS)
1147 {
1148  PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
1149  /* Oid operator = PG_GETARG_OID(1); */
1150  List *args = (List *) PG_GETARG_POINTER(2);
1151  JoinType jointype = (JoinType) PG_GETARG_INT16(3);
1152  int mode = PG_GETARG_INT32(4);
1153 
1154  Node *arg1, *arg2;
1155  Var *var1, *var2;
1156  Oid relid1, relid2;
1157 
1158  ND_STATS *stats1, *stats2;
1159  float8 selectivity;
1160 
1161  /* Only respond to an inner join/unknown context join */
1162  if (jointype != JOIN_INNER)
1163  {
1164  elog(DEBUG1, "%s: jointype %d not supported", __func__, jointype);
1165  PG_RETURN_FLOAT8(DEFAULT_ND_JOINSEL);
1166  }
1167 
1168  /* Find Oids of the geometry columns we are working with */
1169  arg1 = (Node*) linitial(args);
1170  arg2 = (Node*) lsecond(args);
1171  var1 = (Var*) arg1;
1172  var2 = (Var*) arg2;
1173 
1174  /* We only do column joins right now, no functional joins */
1175  /* TODO: handle g1 && ST_Expand(g2) */
1176  if (!IsA(arg1, Var) || !IsA(arg2, Var))
1177  {
1178  elog(DEBUG1, "%s called with arguments that are not column references", __func__);
1179  PG_RETURN_FLOAT8(DEFAULT_ND_JOINSEL);
1180  }
1181 
1182  /* What are the Oids of our tables/relations? */
1183  relid1 = getrelid(var1->varno, root->parse->rtable);
1184  relid2 = getrelid(var2->varno, root->parse->rtable);
1185 
1186  POSTGIS_DEBUGF(3, "using relations \"%s\" Oid(%d), \"%s\" Oid(%d)",
1187  get_rel_name(relid1) ? get_rel_name(relid1) : "NULL", relid1, get_rel_name(relid2) ? get_rel_name(relid2) : "NULL", relid2);
1188 
1189  /* Pull the stats from the stats system. */
1190  stats1 = pg_get_nd_stats(relid1, var1->varattno, mode);
1191  stats2 = pg_get_nd_stats(relid2, var2->varattno, mode);
1192 
1193  /* If we can't get stats, we have to stop here! */
1194  if ( ! stats1 )
1195  {
1196  POSTGIS_DEBUGF(3, "unable to retrieve stats for \"%s\" Oid(%d)", get_rel_name(relid1) ? get_rel_name(relid1) : "NULL" , relid1);
1197  PG_RETURN_FLOAT8(DEFAULT_ND_JOINSEL);
1198  }
1199  else if ( ! stats2 )
1200  {
1201  POSTGIS_DEBUGF(3, "unable to retrieve stats for \"%s\" Oid(%d)", get_rel_name(relid2) ? get_rel_name(relid2) : "NULL", relid2);
1202  PG_RETURN_FLOAT8(DEFAULT_ND_JOINSEL);
1203  }
1204 
1205  selectivity = estimate_join_selectivity(stats1, stats2);
1206  POSTGIS_DEBUGF(2, "got selectivity %g", selectivity);
1207 
1208  pfree(stats1);
1209  pfree(stats2);
1210  PG_RETURN_FLOAT8(selectivity);
1211 }
1212 
1213 
1214 
1215 
1234 static void
1235 compute_gserialized_stats_mode(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
1236  int sample_rows, double total_rows, int mode)
1237 {
1238  MemoryContext old_context;
1239  int d, i; /* Counters */
1240  int notnull_cnt = 0; /* # not null rows in the sample */
1241  int null_cnt = 0; /* # null rows in the sample */
1242  int histogram_features = 0; /* # rows that actually got counted in the histogram */
1243 
1244  ND_STATS *nd_stats; /* Our histogram */
1245  size_t nd_stats_size; /* Size to allocate */
1246 
1247  double total_width = 0; /* # of bytes used by sample */
1248  double total_sample_volume = 0; /* Area/volume coverage of the sample */
1249  double total_cell_count = 0; /* # of cells in histogram affected by sample */
1250 
1251  ND_BOX sum; /* Sum of extents of sample boxes */
1252  ND_BOX avg; /* Avg of extents of sample boxes */
1253  ND_BOX stddev; /* StdDev of extents of sample boxes */
1254 
1255  const ND_BOX **sample_boxes; /* ND_BOXes for each of the sample features */
1256  ND_BOX sample_extent; /* Extent of the raw sample */
1257  int histo_size[ND_DIMS]; /* histogram nrows, ncols, etc */
1258  ND_BOX histo_extent; /* Spatial extent of the histogram */
1259  ND_BOX histo_extent_new; /* Temporary variable */
1260  int histo_cells_target; /* Number of cells we will shoot for, given the stats target */
1261  int histo_cells; /* Number of cells in the histogram */
1262  int histo_cells_new = 1; /* Temporary variable */
1263 
1264  int ndims = 2; /* Dimensionality of the sample */
1265  int histo_ndims = 0; /* Dimensionality of the histogram */
1266  double sample_distribution[ND_DIMS]; /* How homogeneous is distribution of sample in each axis? */
1267  double total_distribution; /* Total of sample_distribution */
1268 
1269  int stats_slot; /* What slot is this data going into? (2D vs ND) */
1270  int stats_kind; /* And this is what? (2D vs ND) */
1271 
1272  /* Initialize sum and stddev */
1273  nd_box_init(&sum);
1274  nd_box_init(&stddev);
1275 
1276  /*
1277  * This is where gserialized_analyze_nd
1278  * should put its' custom parameters.
1279  */
1280  /* void *mystats = stats->extra_data; */
1281 
1282  POSTGIS_DEBUG(2, "compute_gserialized_stats called");
1283  POSTGIS_DEBUGF(3, " # sample_rows: %d", sample_rows);
1284  POSTGIS_DEBUGF(3, " estimate of total_rows: %.6g", total_rows);
1285 
1286  /*
1287  * We might need less space, but don't think
1288  * its worth saving...
1289  */
1290  sample_boxes = palloc(sizeof(ND_BOX*) * sample_rows);
1291 
1292  /*
1293  * First scan:
1294  * o read boxes
1295  * o find dimensionality of the sample
1296  * o find extent of the sample
1297  * o count null-infinite/not-null values
1298  * o compute total_width
1299  * o compute total features's box area (for avgFeatureArea)
1300  * o sum features box coordinates (for standard deviation)
1301  */
1302  for ( i = 0; i < sample_rows; i++ )
1303  {
1304  Datum datum;
1305  GSERIALIZED *geom;
1306  GBOX gbox;
1307  ND_BOX *nd_box;
1308  bool is_null;
1309  bool is_copy;
1310 
1311  datum = fetchfunc(stats, i, &is_null);
1312 
1313  /* Skip all NULLs. */
1314  if ( is_null )
1315  {
1316  POSTGIS_DEBUGF(4, " skipped null geometry %d", i);
1317  null_cnt++;
1318  continue;
1319  }
1320 
1321  /* Read the bounds from the gserialized. */
1322  geom = (GSERIALIZED *)PG_DETOAST_DATUM(datum);
1323  is_copy = VARATT_IS_EXTENDED(datum);
1324  if ( LW_FAILURE == gserialized_get_gbox_p(geom, &gbox) )
1325  {
1326  /* Skip empties too. */
1327  POSTGIS_DEBUGF(3, " skipped empty geometry %d", i);
1328  continue;
1329  }
1330 
1331  /* If we're in 2D mode, zero out the higher dimensions for "safety" */
1332  if ( mode == 2 )
1333  gbox.zmin = gbox.zmax = gbox.mmin = gbox.mmax = 0.0;
1334 
1335  /* Check bounds for validity (finite and not NaN) */
1336  if ( ! gbox_is_valid(&gbox) )
1337  {
1338  POSTGIS_DEBUGF(3, " skipped infinite/nan geometry %d", i);
1339  continue;
1340  }
1341 
1342  /*
1343  * In N-D mode, set the ndims to the maximum dimensionality found
1344  * in the sample. Otherwise, leave at ndims == 2.
1345  */
1346  if ( mode != 2 )
1347  ndims = Max(gbox_ndims(&gbox), ndims);
1348 
1349  /* Convert gbox to n-d box */
1350  nd_box = palloc(sizeof(ND_BOX));
1351  nd_box_from_gbox(&gbox, nd_box);
1352 
1353  /* Cache n-d bounding box */
1354  sample_boxes[notnull_cnt] = nd_box;
1355 
1356  /* Initialize sample extent before merging first entry */
1357  if ( ! notnull_cnt )
1358  nd_box_init_bounds(&sample_extent);
1359 
1360  /* Add current sample to overall sample extent */
1361  nd_box_merge(nd_box, &sample_extent);
1362 
1363  /* How many bytes does this sample use? */
1364  total_width += VARSIZE(geom);
1365 
1366  /* Add bounds coordinates to sums for stddev calculation */
1367  for ( d = 0; d < ndims; d++ )
1368  {
1369  sum.min[d] += nd_box->min[d];
1370  sum.max[d] += nd_box->max[d];
1371  }
1372 
1373  /* Increment our "good feature" count */
1374  notnull_cnt++;
1375 
1376  /* Free up memory if our sample geometry was copied */
1377  if ( is_copy )
1378  pfree(geom);
1379 
1380  /* Give backend a chance of interrupting us */
1381  vacuum_delay_point();
1382  }
1383 
1384  /*
1385  * We'll build a histogram having stats->attr->attstattarget cells
1386  * on each side, within reason... we'll use ndims*10000 as the
1387  * maximum number of cells.
1388  * Also, if we're sampling a relatively small table, we'll try to ensure that
1389  * we have an average of 5 features for each cell so the histogram isn't
1390  * so sparse.
1391  */
1392  histo_cells_target = (int)pow((double)(stats->attr->attstattarget), (double)ndims);
1393  histo_cells_target = Min(histo_cells_target, ndims * 10000);
1394  histo_cells_target = Min(histo_cells_target, (int)(total_rows/5));
1395  POSTGIS_DEBUGF(3, " stats->attr->attstattarget: %d", stats->attr->attstattarget);
1396  POSTGIS_DEBUGF(3, " target # of histogram cells: %d", histo_cells_target);
1397 
1398  /* If there's no useful features, we can't work out stats */
1399  if ( ! notnull_cnt )
1400  {
1401  elog(NOTICE, "no non-null/empty features, unable to compute statistics");
1402  stats->stats_valid = false;
1403  return;
1404  }
1405 
1406  POSTGIS_DEBUGF(3, " sample_extent: %s", nd_box_to_json(&sample_extent, ndims));
1407 
1408  /*
1409  * Second scan:
1410  * o compute standard deviation
1411  */
1412  for ( d = 0; d < ndims; d++ )
1413  {
1414  /* Calculate average bounds values */
1415  avg.min[d] = sum.min[d] / notnull_cnt;
1416  avg.max[d] = sum.max[d] / notnull_cnt;
1417 
1418  /* Calculate standard deviation for this dimension bounds */
1419  for ( i = 0; i < notnull_cnt; i++ )
1420  {
1421  const ND_BOX *ndb = sample_boxes[i];
1422  stddev.min[d] += (ndb->min[d] - avg.min[d]) * (ndb->min[d] - avg.min[d]);
1423  stddev.max[d] += (ndb->max[d] - avg.max[d]) * (ndb->max[d] - avg.max[d]);
1424  }
1425  stddev.min[d] = sqrt(stddev.min[d] / notnull_cnt);
1426  stddev.max[d] = sqrt(stddev.max[d] / notnull_cnt);
1427 
1428  /* Histogram bounds for this dimension bounds is avg +/- SDFACTOR * stdev */
1429  histo_extent.min[d] = Max(avg.min[d] - SDFACTOR * stddev.min[d], sample_extent.min[d]);
1430  histo_extent.max[d] = Min(avg.max[d] + SDFACTOR * stddev.max[d], sample_extent.max[d]);
1431  }
1432 
1433  /*
1434  * Third scan:
1435  * o skip hard deviants
1436  * o compute new histogram box
1437  */
1438  nd_box_init_bounds(&histo_extent_new);
1439  for ( i = 0; i < notnull_cnt; i++ )
1440  {
1441  const ND_BOX *ndb = sample_boxes[i];
1442  /* Skip any hard deviants (boxes entirely outside our histo_extent */
1443  if ( ! nd_box_intersects(&histo_extent, ndb, ndims) )
1444  {
1445  POSTGIS_DEBUGF(4, " feature %d is a hard deviant, skipped", i);
1446  sample_boxes[i] = NULL;
1447  continue;
1448  }
1449  /* Expand our new box to fit all the other features. */
1450  nd_box_merge(ndb, &histo_extent_new);
1451  }
1452  /*
1453  * Expand the box slightly (1%) to avoid edge effects
1454  * with objects that are on the boundary
1455  */
1456  nd_box_expand(&histo_extent_new, 0.01);
1457  histo_extent = histo_extent_new;
1458 
1459  /*
1460  * How should we allocate our histogram cells to the
1461  * different dimensions? We can't do it by raw dimensional width,
1462  * because in x/y/z space, the z can have different units
1463  * from the x/y. Similarly for x/y/t space.
1464  * So, we instead calculate how much features overlap
1465  * each other in their dimension to figure out which
1466  * dimensions have useful selectivity characteristics (more
1467  * variability in density) and therefor would find
1468  * more cells useful (to distinguish between dense places and
1469  * homogeneous places).
1470  */
1471  nd_box_array_distribution(sample_boxes, notnull_cnt, &histo_extent, ndims,
1472  sample_distribution);
1473 
1474  /*
1475  * The sample_distribution array now tells us how spread out the
1476  * data is in each dimension, so we use that data to allocate
1477  * the histogram cells we have available.
1478  * At this point, histo_cells_target is the approximate target number
1479  * of cells.
1480  */
1481 
1482  /*
1483  * Some dimensions have basically a uniform distribution, we want
1484  * to allocate no cells to those dimensions, only to dimensions
1485  * that have some interesting differences in data distribution.
1486  * Here we count up the number of interesting dimensions
1487  */
1488  for ( d = 0; d < ndims; d++ )
1489  {
1490  if ( sample_distribution[d] > 0 )
1491  histo_ndims++;
1492  }
1493 
1494  if ( histo_ndims == 0 )
1495  {
1496  /* Special case: all our dimensions had low variability! */
1497  /* We just divide the cells up evenly */
1498  POSTGIS_DEBUG(3, " special case: no axes have variability");
1499  histo_cells_new = 1;
1500  for ( d = 0; d < ndims; d++ )
1501  {
1502  histo_size[d] = 1 + (int)pow((double)histo_cells_target, 1/(double)ndims);
1503  POSTGIS_DEBUGF(3, " histo_size[d]: %d", histo_size[d]);
1504  histo_cells_new *= histo_size[d];
1505  }
1506  POSTGIS_DEBUGF(3, " histo_cells_new: %d", histo_cells_new);
1507  }
1508  else
1509  {
1510  /*
1511  * We're going to express the amount of variability in each dimension
1512  * as a proportion of the total variability and allocate cells in that
1513  * dimension relative to that proportion.
1514  */
1515  POSTGIS_DEBUG(3, " allocating histogram axes based on axis variability");
1516  total_distribution = total_double(sample_distribution, ndims); /* First get the total */
1517  POSTGIS_DEBUGF(3, " total_distribution: %.8g", total_distribution);
1518  histo_cells_new = 1; /* For the number of cells in the final histogram */
1519  for ( d = 0; d < ndims; d++ )
1520  {
1521  if ( sample_distribution[d] == 0 ) /* Uninteresting dimensions don't get any room */
1522  {
1523  histo_size[d] = 1;
1524  }
1525  else /* Interesting dimension */
1526  {
1527  /* How does this dims variability compare to the total? */
1528  float edge_ratio = (float)sample_distribution[d] / (float)total_distribution;
1529  /*
1530  * Scale the target cells number by the # of dims and ratio,
1531  * then take the appropriate root to get the estimated number of cells
1532  * on this axis (eg, pow(0.5) for 2d, pow(0.333) for 3d, pow(0.25) for 4d)
1533  */
1534  histo_size[d] = (int)pow(histo_cells_target * histo_ndims * edge_ratio, 1/(double)histo_ndims);
1535  /* If something goes awry, just give this dim one slot */
1536  if ( ! histo_size[d] )
1537  histo_size[d] = 1;
1538  }
1539  histo_cells_new *= histo_size[d];
1540  }
1541  POSTGIS_DEBUGF(3, " histo_cells_new: %d", histo_cells_new);
1542  }
1543 
1544  /* Update histo_cells to the actual number of cells we need to allocate */
1545  histo_cells = histo_cells_new;
1546  POSTGIS_DEBUGF(3, " histo_cells: %d", histo_cells);
1547 
1548  /*
1549  * Create the histogram (ND_STATS) in the stats memory context
1550  */
1551  old_context = MemoryContextSwitchTo(stats->anl_context);
1552  nd_stats_size = sizeof(ND_STATS) + ((histo_cells - 1) * sizeof(float4));
1553  nd_stats = palloc(nd_stats_size);
1554  memset(nd_stats, 0, nd_stats_size); /* Initialize all values to 0 */
1555  MemoryContextSwitchTo(old_context);
1556 
1557  /* Initialize the #ND_STATS objects */
1558  nd_stats->ndims = ndims;
1559  nd_stats->extent = histo_extent;
1560  nd_stats->sample_features = sample_rows;
1561  nd_stats->table_features = total_rows;
1562  nd_stats->not_null_features = notnull_cnt;
1563  /* Copy in the histogram dimensions */
1564  for ( d = 0; d < ndims; d++ )
1565  nd_stats->size[d] = histo_size[d];
1566 
1567  /*
1568  * Fourth scan:
1569  * o fill histogram values with the proportion of
1570  * features' bbox overlaps: a feature's bvol
1571  * can fully overlap (1) or partially overlap
1572  * (fraction of 1) an histogram cell.
1573  *
1574  * Note that we are filling each cell with the "portion of
1575  * the feature's box that overlaps the cell". So, if we sum
1576  * up the values in the histogram, we could get the
1577  * histogram feature count.
1578  *
1579  */
1580  for ( i = 0; i < notnull_cnt; i++ )
1581  {
1582  const ND_BOX *nd_box;
1583  ND_IBOX nd_ibox;
1584  int at[ND_DIMS];
1585  int d;
1586  double num_cells = 0;
1587  double tmp_volume = 1.0;
1588  double min[ND_DIMS];
1589  double max[ND_DIMS];
1590  double cellsize[ND_DIMS];
1591 
1592  nd_box = sample_boxes[i];
1593  if ( ! nd_box ) continue; /* Skip Null'ed out hard deviants */
1594 
1595  /* Give backend a chance of interrupting us */
1596  vacuum_delay_point();
1597 
1598  /* Find the cells that overlap with this box and put them into the ND_IBOX */
1599  nd_box_overlap(nd_stats, nd_box, &nd_ibox);
1600  memset(at, 0, sizeof(int)*ND_DIMS);
1601 
1602  POSTGIS_DEBUGF(3, " feature %d: ibox (%d, %d, %d, %d) (%d, %d, %d, %d)", i,
1603  nd_ibox.min[0], nd_ibox.min[1], nd_ibox.min[2], nd_ibox.min[3],
1604  nd_ibox.max[0], nd_ibox.max[1], nd_ibox.max[2], nd_ibox.max[3]);
1605 
1606  for ( d = 0; d < nd_stats->ndims; d++ )
1607  {
1608  /* Initialize the starting values */
1609  at[d] = nd_ibox.min[d];
1610  min[d] = nd_stats->extent.min[d];
1611  max[d] = nd_stats->extent.max[d];
1612  cellsize[d] = (max[d] - min[d])/(nd_stats->size[d]);
1613 
1614  /* What's the volume (area) of this feature's box? */
1615  tmp_volume *= (nd_box->max[d] - nd_box->min[d]);
1616  }
1617 
1618  /* Add feature volume (area) to our total */
1619  total_sample_volume += tmp_volume;
1620 
1621  /*
1622  * Move through all the overlaped histogram cells values and
1623  * add the box overlap proportion to them.
1624  */
1625  do
1626  {
1627  ND_BOX nd_cell;
1628  double ratio;
1629  /* Create a box for this histogram cell */
1630  for ( d = 0; d < nd_stats->ndims; d++ )
1631  {
1632  nd_cell.min[d] = min[d] + (at[d]+0) * cellsize[d];
1633  nd_cell.max[d] = min[d] + (at[d]+1) * cellsize[d];
1634  }
1635 
1636  /*
1637  * If a feature box is completely inside one cell the ratio will be
1638  * 1.0. If a feature box is 50% in two cells, each cell will get
1639  * 0.5 added on.
1640  */
1641  ratio = nd_box_ratio(&nd_cell, nd_box, nd_stats->ndims);
1642  nd_stats->value[nd_stats_value_index(nd_stats, at)] += ratio;
1643  num_cells += ratio;
1644  POSTGIS_DEBUGF(3, " ratio (%.8g) num_cells (%.8g)", ratio, num_cells);
1645  POSTGIS_DEBUGF(3, " at (%d, %d, %d, %d)", at[0], at[1], at[2], at[3]);
1646  }
1647  while ( nd_increment(&nd_ibox, nd_stats->ndims, at) );
1648 
1649  /* Keep track of overall number of overlaps counted */
1650  total_cell_count += num_cells;
1651  /* How many features have we added to this histogram? */
1652  histogram_features++;
1653  }
1654 
1655  POSTGIS_DEBUGF(3, " histogram_features: %d", histogram_features);
1656  POSTGIS_DEBUGF(3, " sample_rows: %d", sample_rows);
1657  POSTGIS_DEBUGF(3, " table_rows: %.6g", total_rows);
1658 
1659  /* Error out if we got no sample information */
1660  if ( ! histogram_features )
1661  {
1662  POSTGIS_DEBUG(3, " no stats have been gathered");
1663  elog(NOTICE, " no features lie in the stats histogram, invalid stats");
1664  stats->stats_valid = false;
1665  return;
1666  }
1667 
1668  nd_stats->histogram_features = histogram_features;
1669  nd_stats->histogram_cells = histo_cells;
1670  nd_stats->cells_covered = total_cell_count;
1671 
1672  /* Put this histogram data into the right slot/kind */
1673  if ( mode == 2 )
1674  {
1675  stats_slot = STATISTIC_SLOT_2D;
1676  stats_kind = STATISTIC_KIND_2D;
1677  }
1678  else
1679  {
1680  stats_slot = STATISTIC_SLOT_ND;
1681  stats_kind = STATISTIC_KIND_ND;
1682  }
1683 
1684  /* Write the statistics data */
1685  stats->stakind[stats_slot] = stats_kind;
1686  stats->staop[stats_slot] = InvalidOid;
1687  stats->stanumbers[stats_slot] = (float4*)nd_stats;
1688  stats->numnumbers[stats_slot] = nd_stats_size/sizeof(float4);
1689  stats->stanullfrac = (float4)null_cnt/sample_rows;
1690  stats->stawidth = total_width/notnull_cnt;
1691  stats->stadistinct = -1.0;
1692  stats->stats_valid = true;
1693 
1694  POSTGIS_DEBUGF(3, " out: slot 0: kind %d (STATISTIC_KIND_ND)", stats->stakind[0]);
1695  POSTGIS_DEBUGF(3, " out: slot 0: op %d (InvalidOid)", stats->staop[0]);
1696  POSTGIS_DEBUGF(3, " out: slot 0: numnumbers %d", stats->numnumbers[0]);
1697  POSTGIS_DEBUGF(3, " out: null fraction: %f=%d/%d", stats->stanullfrac, null_cnt, sample_rows);
1698  POSTGIS_DEBUGF(3, " out: average width: %d bytes", stats->stawidth);
1699  POSTGIS_DEBUG (3, " out: distinct values: all (no check done)");
1700  POSTGIS_DEBUGF(3, " out: %s", nd_stats_to_json(nd_stats));
1701  /*
1702  POSTGIS_DEBUGF(3, " out histogram:\n%s", nd_stats_to_grid(nd_stats));
1703  */
1704 
1705  return;
1706 }
1707 
1708 
1726 static void
1727 compute_gserialized_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
1728  int sample_rows, double total_rows)
1729 {
1730  /* 2D Mode */
1731  compute_gserialized_stats_mode(stats, fetchfunc, sample_rows, total_rows, 2);
1732  /* ND Mode */
1733  compute_gserialized_stats_mode(stats, fetchfunc, sample_rows, total_rows, 0);
1734 }
1735 
1736 
1765 Datum gserialized_analyze_nd(PG_FUNCTION_ARGS)
1766 {
1767  VacAttrStats *stats = (VacAttrStats *)PG_GETARG_POINTER(0);
1768  Form_pg_attribute attr = stats->attr;
1769 
1770  POSTGIS_DEBUG(2, "gserialized_analyze_nd called");
1771 
1772  /* If the attstattarget column is negative, use the default value */
1773  /* NB: it is okay to scribble on stats->attr since it's a copy */
1774  if (attr->attstattarget < 0)
1775  attr->attstattarget = default_statistics_target;
1776 
1777  POSTGIS_DEBUGF(3, " attribute stat target: %d", attr->attstattarget);
1778 
1779  /* Setup the minimum rows and the algorithm function */
1780  stats->minrows = 300 * stats->attr->attstattarget;
1781  stats->compute_stats = compute_gserialized_stats;
1782 
1783  POSTGIS_DEBUGF(3, " minrows: %d", stats->minrows);
1784 
1785  /* Indicate we are done successfully */
1786  PG_RETURN_BOOL(true);
1787 }
1788 
1801 static float8
1802 estimate_selectivity(const GBOX *box, const ND_STATS *nd_stats, int mode)
1803 {
1804  int d; /* counter */
1805  float8 selectivity;
1806  ND_BOX nd_box;
1807  ND_IBOX nd_ibox;
1808  int at[ND_DIMS];
1809  double cell_size[ND_DIMS];
1810  double min[ND_DIMS];
1811  double max[ND_DIMS];
1812  double total_count = 0.0;
1813  int ndims_max = Max(nd_stats->ndims, gbox_ndims(box));
1814 // int ndims_min = Min(nd_stats->ndims, gbox_ndims(box));
1815 
1816  /* Calculate the overlap of the box on the histogram */
1817  if ( ! nd_stats )
1818  {
1819  elog(NOTICE, " estimate_selectivity called with null input");
1820  return FALLBACK_ND_SEL;
1821  }
1822 
1823  /* Initialize nd_box. */
1824  nd_box_from_gbox(box, &nd_box);
1825 
1826  /*
1827  * To return 2D stats on an ND sample, we need to make the
1828  * 2D box cover the full range of the other dimensions in the
1829  * histogram.
1830  */
1831  POSTGIS_DEBUGF(3, " mode: %d", mode);
1832  if ( mode == 2 )
1833  {
1834  POSTGIS_DEBUG(3, " in 2d mode, stripping the computation down to 2d");
1835  ndims_max = 2;
1836  }
1837 
1838  POSTGIS_DEBUGF(3, " nd_stats->extent: %s", nd_box_to_json(&(nd_stats->extent), nd_stats->ndims));
1839  POSTGIS_DEBUGF(3, " nd_box: %s", nd_box_to_json(&(nd_box), gbox_ndims(box)));
1840 
1841  /*
1842  * Search box completely misses histogram extent?
1843  * We have to intersect in all N dimensions or else we have
1844  * zero interaction under the &&& operator. It's important
1845  * to short circuit in this case, as some of the tests below
1846  * will return junk results when run on non-intersecting inputs.
1847  */
1848  if ( ! nd_box_intersects(&nd_box, &(nd_stats->extent), ndims_max) )
1849  {
1850  POSTGIS_DEBUG(3, " search box does not overlap histogram, returning 0");
1851  return 0.0;
1852  }
1853 
1854  /* Search box completely contains histogram extent! */
1855  if ( nd_box_contains(&nd_box, &(nd_stats->extent), ndims_max) )
1856  {
1857  POSTGIS_DEBUG(3, " search box contains histogram, returning 1");
1858  return 1.0;
1859  }
1860 
1861  /* Calculate the overlap of the box on the histogram */
1862  if ( ! nd_box_overlap(nd_stats, &nd_box, &nd_ibox) )
1863  {
1864  POSTGIS_DEBUG(3, " search box overlap with stats histogram failed");
1865  return FALLBACK_ND_SEL;
1866  }
1867 
1868  /* Work out some measurements of the histogram */
1869  for ( d = 0; d < nd_stats->ndims; d++ )
1870  {
1871  /* Cell size in each dim */
1872  min[d] = nd_stats->extent.min[d];
1873  max[d] = nd_stats->extent.max[d];
1874  cell_size[d] = (max[d] - min[d]) / nd_stats->size[d];
1875  POSTGIS_DEBUGF(3, " cell_size[%d] : %.9g", d, cell_size[d]);
1876 
1877  /* Initialize the counter */
1878  at[d] = nd_ibox.min[d];
1879  }
1880 
1881  /* Move through all the overlap values and sum them */
1882  do
1883  {
1884  float cell_count, ratio;
1885  ND_BOX nd_cell;
1886 
1887  /* We have to pro-rate partially overlapped cells. */
1888  for ( d = 0; d < nd_stats->ndims; d++ )
1889  {
1890  nd_cell.min[d] = min[d] + (at[d]+0) * cell_size[d];
1891  nd_cell.max[d] = min[d] + (at[d]+1) * cell_size[d];
1892  }
1893 
1894  ratio = nd_box_ratio(&nd_box, &nd_cell, nd_stats->ndims);
1895  cell_count = nd_stats->value[nd_stats_value_index(nd_stats, at)];
1896 
1897  /* Add the pro-rated count for this cell to the overall total */
1898  total_count += cell_count * ratio;
1899  POSTGIS_DEBUGF(4, " cell (%d,%d), cell value %.6f, ratio %.6f", at[0], at[1], cell_count, ratio);
1900  }
1901  while ( nd_increment(&nd_ibox, nd_stats->ndims, at) );
1902 
1903  /* Scale by the number of features in our histogram to get the proportion */
1904  selectivity = total_count / nd_stats->histogram_features;
1905 
1906  POSTGIS_DEBUGF(3, " nd_stats->histogram_features = %f", nd_stats->histogram_features);
1907  POSTGIS_DEBUGF(3, " nd_stats->histogram_cells = %f", nd_stats->histogram_cells);
1908  POSTGIS_DEBUGF(3, " sum(overlapped histogram cells) = %f", total_count);
1909  POSTGIS_DEBUGF(3, " selectivity = %f", selectivity);
1910 
1911  /* Prevent rounding overflows */
1912  if (selectivity > 1.0) selectivity = 1.0;
1913  else if (selectivity < 0.0) selectivity = 0.0;
1914 
1915  return selectivity;
1916 }
1917 
1918 
1919 
1925 Datum _postgis_gserialized_stats(PG_FUNCTION_ARGS)
1926 {
1927  Oid table_oid = PG_GETARG_OID(0);
1928  text *att_text = PG_GETARG_TEXT_P(1);
1929  ND_STATS *nd_stats;
1930  char *str;
1931  text *json;
1932  int mode = 2; /* default to 2D mode */
1933 
1934  /* Check if we've been asked to not use 2d mode */
1935  if ( ! PG_ARGISNULL(2) )
1936  mode = text_p_get_mode(PG_GETARG_TEXT_P(2));
1937 
1938  /* Retrieve the stats object */
1939  nd_stats = pg_get_nd_stats_by_name(table_oid, att_text, mode);
1940  if ( ! nd_stats )
1941  elog(ERROR, "stats for \"%s.%s\" do not exist", get_rel_name(table_oid), text2cstring(att_text));
1942 
1943  /* Convert to JSON */
1944  str = nd_stats_to_json(nd_stats);
1945  json = cstring2text(str);
1946  pfree(str);
1947  pfree(nd_stats);
1948  PG_RETURN_TEXT_P(json);
1949 }
1950 
1951 
1957 Datum _postgis_gserialized_sel(PG_FUNCTION_ARGS)
1958 {
1959  Oid table_oid = PG_GETARG_OID(0);
1960  text *att_text = PG_GETARG_TEXT_P(1);
1961  Datum geom_datum = PG_GETARG_DATUM(2);
1962  GBOX gbox; /* search box read from gserialized datum */
1963  float8 selectivity = 0;
1964  ND_STATS *nd_stats;
1965  int mode = 2; /* 2D mode by default */
1966 
1967  /* Check if we've been asked to not use 2d mode */
1968  if ( ! PG_ARGISNULL(3) )
1969  mode = text_p_get_mode(PG_GETARG_TEXT_P(3));
1970 
1971  /* Retrieve the stats object */
1972  nd_stats = pg_get_nd_stats_by_name(table_oid, att_text, mode);
1973 
1974  if ( ! nd_stats )
1975  elog(ERROR, "stats for \"%s.%s\" do not exist", get_rel_name(table_oid), text2cstring(att_text));
1976 
1977  /* Calculate the gbox */
1978  if ( ! gserialized_datum_get_gbox_p(geom_datum, &gbox) )
1979  elog(ERROR, "unable to calculate bounding box from geometry");
1980 
1981  POSTGIS_DEBUGF(3, " %s", gbox_to_string(&gbox));
1982 
1983  /* Do the estimation */
1984  selectivity = estimate_selectivity(&gbox, nd_stats, mode);
1985 
1986  pfree(nd_stats);
1987  PG_RETURN_FLOAT8(selectivity);
1988 }
1989 
1990 
1996 Datum _postgis_gserialized_joinsel(PG_FUNCTION_ARGS)
1997 {
1998  Oid table_oid1 = PG_GETARG_OID(0);
1999  text *att_text1 = PG_GETARG_TEXT_P(1);
2000  Oid table_oid2 = PG_GETARG_OID(2);
2001  text *att_text2 = PG_GETARG_TEXT_P(3);
2002  ND_STATS *nd_stats1, *nd_stats2;
2003  float8 selectivity = 0;
2004  int mode = 2; /* 2D mode by default */
2005 
2006 
2007  /* Retrieve the stats object */
2008  nd_stats1 = pg_get_nd_stats_by_name(table_oid1, att_text1, mode);
2009  nd_stats2 = pg_get_nd_stats_by_name(table_oid2, att_text2, mode);
2010 
2011  if ( ! nd_stats1 )
2012  elog(ERROR, "stats for \"%s.%s\" do not exist", get_rel_name(table_oid1), text2cstring(att_text1));
2013 
2014  if ( ! nd_stats2 )
2015  elog(ERROR, "stats for \"%s.%s\" do not exist", get_rel_name(table_oid2), text2cstring(att_text2));
2016 
2017  /* Check if we've been asked to not use 2d mode */
2018  if ( ! PG_ARGISNULL(4) )
2019  {
2020  text *modetxt = PG_GETARG_TEXT_P(4);
2021  char *modestr = text2cstring(modetxt);
2022  if ( modestr[0] == 'N' )
2023  mode = 0;
2024  }
2025 
2026  /* Do the estimation */
2027  selectivity = estimate_join_selectivity(nd_stats1, nd_stats2);
2028 
2029  pfree(nd_stats1);
2030  pfree(nd_stats2);
2031  PG_RETURN_FLOAT8(selectivity);
2032 }
2033 
2039 Datum gserialized_gist_sel_2d(PG_FUNCTION_ARGS)
2040 {
2041  PG_RETURN_DATUM(DirectFunctionCall5(
2043  PG_GETARG_DATUM(0), PG_GETARG_DATUM(1),
2044  PG_GETARG_DATUM(2), PG_GETARG_DATUM(3),
2045  Int32GetDatum(2) /* 2-D mode */
2046  ));
2047 }
2048 
2054 Datum gserialized_gist_sel_nd(PG_FUNCTION_ARGS)
2055 {
2056  PG_RETURN_DATUM(DirectFunctionCall5(
2058  PG_GETARG_DATUM(0), PG_GETARG_DATUM(1),
2059  PG_GETARG_DATUM(2), PG_GETARG_DATUM(3),
2060  Int32GetDatum(0) /* N-D mode */
2061  ));
2062 }
2063 
2078 Datum gserialized_gist_sel(PG_FUNCTION_ARGS)
2079 {
2080  PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
2081  /* Oid operator_oid = PG_GETARG_OID(1); */
2082  List *args = (List *) PG_GETARG_POINTER(2);
2083  /* int varRelid = PG_GETARG_INT32(3); */
2084  int mode = PG_GETARG_INT32(4);
2085 
2086  VariableStatData vardata;
2087  ND_STATS *nd_stats = NULL;
2088 
2089  Node *other;
2090  Var *self;
2091  GBOX search_box;
2092  float8 selectivity = 0;
2093 
2094  POSTGIS_DEBUG(2, "gserialized_gist_sel called");
2095 
2096  /*
2097  * TODO: This is a big one,
2098  * All this statistics code *only* tries to generate a valid
2099  * selectivity for && and &&&. That leaves all the other
2100  * geometry operators with bad stats! The selectivity
2101  * calculation should take account of the incoming operator
2102  * type and do the right thing.
2103  */
2104 
2105  /* Fail if not a binary opclause (probably shouldn't happen) */
2106  if (list_length(args) != 2)
2107  {
2108  POSTGIS_DEBUG(3, "gserialized_gist_sel: not a binary opclause");
2109  PG_RETURN_FLOAT8(DEFAULT_ND_SEL);
2110  }
2111 
2112  /* Find the constant part */
2113  other = (Node *) linitial(args);
2114  if ( ! IsA(other, Const) )
2115  {
2116  self = (Var *)other;
2117  other = (Node *) lsecond(args);
2118  }
2119  else
2120  {
2121  self = (Var *) lsecond(args);
2122  }
2123 
2124  if ( ! IsA(other, Const) )
2125  {
2126  POSTGIS_DEBUG(3, " no constant arguments - returning a default selectivity");
2127  PG_RETURN_FLOAT8(DEFAULT_ND_SEL);
2128  }
2129 
2130  /* Convert the constant to a BOX */
2131  if( ! gserialized_datum_get_gbox_p(((Const*)other)->constvalue, &search_box) )
2132  {
2133  POSTGIS_DEBUG(3, "search box is EMPTY");
2134  PG_RETURN_FLOAT8(0.0);
2135  }
2136  POSTGIS_DEBUGF(4, " requested search box is: %s", gbox_to_string(&search_box));
2137 
2138  /* Get pg_statistic row */
2139  examine_variable(root, (Node*)self, 0, &vardata);
2140  if ( vardata.statsTuple ) {
2141  nd_stats = pg_nd_stats_from_tuple(vardata.statsTuple, mode);
2142  }
2143  ReleaseVariableStats(vardata);
2144 
2145  if ( ! nd_stats )
2146  {
2147  POSTGIS_DEBUG(3, " unable to load stats from syscache, not analyzed yet?");
2148  PG_RETURN_FLOAT8(FALLBACK_ND_SEL);
2149  }
2150 
2151  POSTGIS_DEBUGF(4, " got stats:\n%s", nd_stats_to_json(nd_stats));
2152 
2153  /* Do the estimation! */
2154  selectivity = estimate_selectivity(&search_box, nd_stats, mode);
2155  POSTGIS_DEBUGF(3, " returning computed value: %f", selectivity);
2156 
2157  pfree(nd_stats);
2158  PG_RETURN_FLOAT8(selectivity);
2159 }
2160 
2161 
2162 
2169 Datum gserialized_estimated_extent(PG_FUNCTION_ARGS)
2170 {
2171  char *nsp = NULL;
2172  char *tbl = NULL;
2173  text *col = NULL;
2174  char *nsp_tbl = NULL;
2175  Oid tbl_oid;
2176  ND_STATS *nd_stats;
2177  GBOX *gbox;
2178 
2179  if ( PG_NARGS() == 3 )
2180  {
2181  nsp = text2cstring(PG_GETARG_TEXT_P(0));
2182  tbl = text2cstring(PG_GETARG_TEXT_P(1));
2183  col = PG_GETARG_TEXT_P(2);
2184  nsp_tbl = palloc(strlen(nsp) + strlen(tbl) + 6);
2185  sprintf(nsp_tbl, "\"%s\".\"%s\"", nsp, tbl);
2186  tbl_oid = DatumGetObjectId(DirectFunctionCall1(regclassin, CStringGetDatum(nsp_tbl)));
2187  pfree(nsp_tbl);
2188  }
2189  else if ( PG_NARGS() == 2 )
2190  {
2191  tbl = text2cstring(PG_GETARG_TEXT_P(0));
2192  col = PG_GETARG_TEXT_P(1);
2193  nsp_tbl = palloc(strlen(tbl) + 3);
2194  sprintf(nsp_tbl, "\"%s\"", tbl);
2195  tbl_oid = DatumGetObjectId(DirectFunctionCall1(regclassin, CStringGetDatum(nsp_tbl)));
2196  pfree(nsp_tbl);
2197  }
2198  else
2199  {
2200  elog(ERROR, "estimated_extent() called with wrong number of arguments");
2201  PG_RETURN_NULL();
2202  }
2203 
2204  /* Estimated extent only returns 2D bounds, so use mode 2 */
2205  nd_stats = pg_get_nd_stats_by_name(tbl_oid, col, 2);
2206 
2207  /* Error out on no stats */
2208  if ( ! nd_stats )
2209  elog(ERROR, "stats for \"%s.%s\" do not exist", tbl, text2cstring(col));
2210 
2211  /* Construct the box */
2212  gbox = palloc(sizeof(GBOX));
2213  FLAGS_SET_GEODETIC(gbox->flags, 0);
2214  FLAGS_SET_Z(gbox->flags, 0);
2215  FLAGS_SET_M(gbox->flags, 0);
2216  gbox->xmin = nd_stats->extent.min[0];
2217  gbox->xmax = nd_stats->extent.max[0];
2218  gbox->ymin = nd_stats->extent.min[1];
2219  gbox->ymax = nd_stats->extent.max[1];
2220 
2221  pfree(nd_stats);
2222  PG_RETURN_POINTER(gbox);
2223 }
2224 
2232 Datum geometry_estimated_extent(PG_FUNCTION_ARGS)
2233 {
2234  if ( PG_NARGS() == 3 )
2235  {
2236  PG_RETURN_DATUM(
2237  DirectFunctionCall3(gserialized_estimated_extent,
2238  PG_GETARG_DATUM(0),
2239  PG_GETARG_DATUM(1),
2240  PG_GETARG_DATUM(2)));
2241  }
2242  else if ( PG_NARGS() == 2 )
2243  {
2244  PG_RETURN_DATUM(
2245  DirectFunctionCall2(gserialized_estimated_extent,
2246  PG_GETARG_DATUM(0),
2247  PG_GETARG_DATUM(1)));
2248  }
2249 
2250  elog(ERROR, "geometry_estimated_extent() called with wrong number of arguments");
2251  PG_RETURN_NULL();
2252 }
int gserialized_get_gbox_p(const GSERIALIZED *g, GBOX *box)
Read the bounding box off a serialization and calculate one if it is not already there.
Definition: g_serialized.c:371
static int nd_increment(ND_IBOX *ibox, int ndims, int *counter)
Given an n-d index array (counter), and a domain to increment it in (ibox) increment it by one...
static int text_p_get_mode(const text *txt)
Utility function to see if the first letter of the mode argument is 'N'.
static int nd_box_overlap(const ND_STATS *nd_stats, const ND_BOX *nd_box, ND_IBOX *nd_ibox)
What stats cells overlap with this ND_BOX? Put the lowest cell addresses in ND_IBOX->min and the high...
PG_FUNCTION_INFO_V1(gserialized_gist_joinsel_nd)
For (geometry &&& geometry) and (geography && geography) we call into the N-D mode.
#define DEFAULT_ND_JOINSEL
static void compute_gserialized_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc, int sample_rows, double total_rows)
In order to do useful selectivity calculations in both 2-D and N-D modes, we actually have to generat...
stringbuffer_t * stringbuffer_create(void)
Allocate a new stringbuffer_t.
Definition: stringbuffer.c:46
int gbox_is_valid(const GBOX *gbox)
Return false if any of the dimensions is NaN or infinite.
Definition: g_box.c:174
#define MIN_DIMENSION_WIDTH
Minimum width of a dimension that we'll bother trying to compute statistics on.
char * gbox_to_string(const GBOX *gbox)
Allocate a string representation of the GBOX, based on dimensionality of flags.
Definition: g_box.c:369
#define FLAGS_GET_GEODETIC(flags)
Definition: liblwgeom.h:127
char * stringbuffer_getstringcopy(stringbuffer_t *s)
Returns a newly allocated string large enough to contain the current state of the string...
Definition: stringbuffer.c:153
double xmax
Definition: liblwgeom.h:277
#define ND_DIMS
The maximum number of dimensions our code can handle.
static void compute_gserialized_stats_mode(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc, int sample_rows, double total_rows, int mode)
The gserialized_analyze_nd sets this function as a callback on the stats object when called by the AN...
Datum geometry_estimated_extent(PG_FUNCTION_ARGS)
static int nd_box_contains(const ND_BOX *a, const ND_BOX *b, int ndims)
Return TRUE if ND_BOX a contains b, false otherwise.
Datum _postgis_gserialized_stats(PG_FUNCTION_ARGS)
#define STATISTIC_SLOT_2D
static void nd_box_from_gbox(const GBOX *gbox, ND_BOX *nd_box)
Set the values of an ND_BOX from a GBOX.
struct ND_BOX_T ND_BOX
N-dimensional box type for calculations, to avoid doing explicit axis conversions from GBOX in all ca...
#define FALLBACK_ND_JOINSEL
Datum gserialized_estimated_extent(PG_FUNCTION_ARGS)
#define FALLBACK_ND_SEL
More modest fallback selectivity factor.
#define FLAGS_SET_GEODETIC(flags, value)
Definition: liblwgeom.h:133
Datum _postgis_gserialized_joinsel(PG_FUNCTION_ARGS)
static float8 estimate_selectivity(const GBOX *box, const ND_STATS *nd_stats, int mode)
This function returns an estimate of the selectivity of a search GBOX by looking at data in the ND_ST...
#define LW_FAILURE
Definition: liblwgeom.h:64
int stringbuffer_aprintf(stringbuffer_t *s, const char *fmt,...)
Appends a formatted string to the current string buffer, using the format and argument list provided...
Definition: stringbuffer.c:246
static int nd_box_init(ND_BOX *a)
Zero out an ND_BOX.
Datum gserialized_gist_joinsel(PG_FUNCTION_ARGS)
#define FLAGS_SET_Z(flags, value)
Definition: liblwgeom.h:130
double zmax
Definition: liblwgeom.h:281
double ymin
Definition: liblwgeom.h:278
Datum gserialized_gist_sel_2d(PG_FUNCTION_ARGS)
Datum gserialized_gist_joinsel_2d(PG_FUNCTION_ARGS)
Datum gserialized_gist_sel(PG_FUNCTION_ARGS)
static int nd_box_array_distribution(const ND_BOX **nd_boxes, int num_boxes, const ND_BOX *extent, int ndims, double *distribution)
Calculate how much a set of boxes is homogenously distributed or contentrated within one dimension...
double xmin
Definition: liblwgeom.h:276
static int nd_stats_value_index(const ND_STATS *stats, int *indexes)
Given a position in the n-d histogram (i,j,k) return the position in the 1-d values array...
float4 size[ND_DIMS]
Datum gserialized_gist_joinsel_nd(PG_FUNCTION_ARGS)
static int nd_box_intersects(const ND_BOX *a, const ND_BOX *b, int ndims)
Return TRUE if ND_BOX a overlaps b, false otherwise.
static double total_double(const double *vals, int nvals)
Given double array, return sum of values.
int min[ND_DIMS]
static ND_STATS * pg_nd_stats_from_tuple(HeapTuple stats_tuple, int mode)
char * text2cstring(const text *textptr)
Datum gserialized_gist_sel_nd(PG_FUNCTION_ARGS)
static int cmp_int(const void *a, const void *b)
Integer comparison function for qsort.
static int nd_box_init_bounds(ND_BOX *a)
Prepare an ND_BOX for bounds calculation: set the maxes to the smallest thing possible and the mins t...
double ymax
Definition: liblwgeom.h:279
N-dimensional box index type.
#define FLAGS_GET_Z(flags)
Macros for manipulating the 'flags' byte.
Definition: liblwgeom.h:124
static int range_quintile(int *vals, int nvals)
The difference between the fourth and first quintile values, the "inter-quintile range".
Datum _postgis_gserialized_sel(PG_FUNCTION_ARGS)
static float8 estimate_join_selectivity(const ND_STATS *s1, const ND_STATS *s2)
Given two statistics histograms, what is the selectivity of a join driven by the && or &&& operator...
static ND_STATS * pg_get_nd_stats(const Oid table_oid, AttrNumber att_num, int mode)
Pull the stats object from the PgSQL system catalogs.
uint8_t flags
Definition: liblwgeom.h:275
static char * nd_box_to_json(const ND_BOX *nd_box, int ndims)
Convert an ND_BOX to a JSON string for printing.
float4 max[ND_DIMS]
void stringbuffer_append(stringbuffer_t *s, const char *a)
Append the specified string to the stringbuffer_t.
Definition: stringbuffer.c:127
#define STATISTIC_KIND_2D
struct ND_IBOX_T ND_IBOX
N-dimensional box index type.
void stringbuffer_destroy(stringbuffer_t *s)
Free the stringbuffer_t and all memory managed within it.
Definition: stringbuffer.c:71
int max[ND_DIMS]
struct ND_STATS_T ND_STATS
N-dimensional statistics structure.
float4 min[ND_DIMS]
#define FALSE
Definition: dbfopen.c:168
static int nd_box_merge(const ND_BOX *source, ND_BOX *target)
Create a printable view of the ND_STATS histogram.
static int nd_box_expand(ND_BOX *nd_box, double expansion_factor)
Expand an ND_BOX ever so slightly.
double mmin
Definition: liblwgeom.h:282
#define SDFACTOR
double zmin
Definition: liblwgeom.h:280
static ND_STATS * pg_get_nd_stats_by_name(const Oid table_oid, const text *att_text, int mode)
Pull the stats object from the PgSQL system catalogs.
#define FLAGS_GET_M(flags)
Definition: liblwgeom.h:125
static char * nd_stats_to_json(const ND_STATS *nd_stats)
Convert an ND_STATS to a JSON representation for external use.
Datum gserialized_analyze_nd(PG_FUNCTION_ARGS)
double mmax
Definition: liblwgeom.h:283
#define STATISTIC_SLOT_ND
static double nd_box_ratio(const ND_BOX *b1, const ND_BOX *b2, int ndims)
Returns the proportion of b2 that is covered by b1.
N-dimensional statistics structure.
#define TRUE
Definition: dbfopen.c:169
N-dimensional box type for calculations, to avoid doing explicit axis conversions from GBOX in all ca...
static int gbox_ndims(const GBOX *gbox)
Given that geodetic boxes are X/Y/Z regardless of the underlying geometry dimensionality and other bo...
This library is the generic geometry handling section of PostGIS.
#define DEFAULT_ND_SEL
Default geometry selectivity factor.
#define STATISTIC_KIND_ND
Assign a number to the n-dimensional statistics kind.
#define FLAGS_SET_M(flags, value)
Definition: liblwgeom.h:131