PostGIS  3.3.9dev-r@@SVN_REVISION@@
lwgeom_in_marc21.c
Go to the documentation of this file.
1 /**********************************************************************
2  *
3  * PostGIS - Spatial Types for PostgreSQL
4  * http://postgis.net
5  *
6  * PostGIS is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation, either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * PostGIS is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with PostGIS. If not, see <http://www.gnu.org/licenses/>.
18  *
19  **********************************************************************/
20 
21 #include "postgres.h"
22 #include "utils/builtins.h"
23 
24 #include <libxml/tree.h>
25 #include <libxml/parser.h>
26 #include <string.h>
27 
28 #include "../postgis_config.h"
29 #include "lwgeom_pg.h"
30 #include <math.h>
31 #include "liblwgeom.h"
32 
33 static LWGEOM* parse_marc21(xmlNodePtr xnode);
34 
35 /**********************************************************************
36  * Ability to parse geographic data contained in MARC21/XML records
37  * to return an LWGEOM or an error message. It returns NULL if the
38  * MARC21/XML record is valid but does not contain any geographic
39  * data (datafield:034).
40  *
41  * MARC21/XML version supported: 1.1
42  * MARC21/XML Cartographic Mathematical Data Definition:
43  * https://www.loc.gov/marc/bibliographic/bd034.html
44  *
45  * Copyright (C) 2021 University of Münster (WWU), Germany
46  * Written by Jim Jones <jim.jones@uni-muenster.de>
47  *
48  **********************************************************************/
49 
51 Datum ST_GeomFromMARC21(PG_FUNCTION_ARGS) {
52  GSERIALIZED *geom;
53  LWGEOM *lwgeom;
54  xmlDocPtr xmldoc;
55  text *xml_input;
56  int xml_size;
57  char *xml;
58  xmlNodePtr xmlroot = NULL;
59 
60  if (PG_ARGISNULL(0)) PG_RETURN_NULL();
61 
62  xml_input = PG_GETARG_TEXT_P(0);
63  xml = text_to_cstring(xml_input);
64  xml_size = VARSIZE_ANY_EXHDR(xml_input);
65 
66  xmlInitParser();
67  xmldoc = xmlReadMemory(xml, xml_size, NULL, NULL, 0);
68 
69  if (!xmldoc || (xmlroot = xmlDocGetRootElement(xmldoc)) == NULL) {
70  xmlFreeDoc(xmldoc);
71  xmlCleanupParser();
72  lwpgerror("invalid MARC21/XML document.");
73  }
74 
75  lwgeom = parse_marc21(xmlroot);
76 
77  xmlFreeDoc(xmldoc);
78  xmlCleanupParser();
79 
80  if (lwgeom == NULL) {
81 
82  //lwgeom_free(lwgeom);
83  PG_RETURN_NULL();
84 
85  }
86 
87  geom = geometry_serialize(lwgeom);
88 
89  lwgeom_free(lwgeom);
90 
91  PG_RETURN_POINTER(geom);
92 }
93 
94 static inline bool
95 is_xml_element(xmlNodePtr xn, const char *xml_name)
96 {
97  char *colon_pos, *node_name;
98 
99  /* Not an element node, can't do anything */
100  if (!xn || xn->type != XML_ELEMENT_NODE)
101  return false;
102 
103  /* If there's a colon in the element name, */
104  /* move past it before checking for equality with */
105  /* the element name we are looking for */
106  node_name = (char*)xn->name;
107  colon_pos = strchr(node_name, ':');
108  if (colon_pos)
109  node_name = colon_pos + 1;
110 
111  return strcmp(node_name, xml_name) == 0;
112 }
113 
114 static int is_literal_valid(const char *literal) {
115 
116  int num_dec_sep;
117  int coord_start;
118  int literal_length;
119 
120  if(literal == NULL) return LW_FALSE;
121 
122  literal_length = strlen(literal);
123 
124  POSTGIS_DEBUGF(2, "is_literal_valid called (%s)", literal);
125 
126  if (literal_length < 3) return LW_FALSE;
127 
128  coord_start = 0;
129  num_dec_sep = 0;
130 
140  if (literal[0] == 'N' || literal[0] == 'E' || literal[0] == 'S' || literal[0] == 'W' || literal[0] == '+' || literal[0] == '-') {
141 
142  if (literal_length < 4) {
143  POSTGIS_DEBUGF(3, " invalid literal length (%d): \"%s\"", literal_length, literal);
144  return LW_FALSE;
145  }
146 
147  coord_start = 1;
148  }
149 
150  for (int j = coord_start; j < literal_length; j++) {
151 
152  if (!isdigit(literal[j])) {
153 
154 
155  if (j < 3) {
156 
161  POSTGIS_DEBUGF(3," invalid character '%c' at the degrees section: \"%s\"", literal[j], literal);
162  return LW_FALSE;
163 
164  }
165 
170  if (literal[j] == '.' || literal[j] == ',') {
171 
172  num_dec_sep++;
173 
174  if (num_dec_sep > 1) return LW_FALSE;
175 
176  } else {
177  POSTGIS_DEBUGF(3, " invalid character '%c' in %d: \"%s\"", literal[j], j, literal);
178  return LW_FALSE;
179 
180  }
181 
182  }
183 
184  }
185 
186  POSTGIS_DEBUGF(2, "=> is_literal_valid returns LW_TRUE for \"%s\"", literal);
187  return LW_TRUE;
188 
189 }
190 
191 static double parse_geo_literal(char *literal) {
192 
205  char *dgr;
206  char *min;
207  char *sec;
208  size_t literal_length;
209 
210  char start_character = literal[0];
211  int start_literal = 0;
212  double result = 0.0;
213 
214  const size_t numdigits_degrees = 3;
215  const size_t numdigits_minutes = 2;
216  const size_t numdigits_seconds = 2;
217 
218  POSTGIS_DEBUGF(2, "parse_geo_literal called (%s)", literal);
219  POSTGIS_DEBUGF(2, " start character: %c", start_character);
220 
221  literal_length = strlen(literal);
222 
223  if (!isdigit(start_character)) start_literal = 1;
224 
225  POSTGIS_DEBUGF(2, " start_literal=%d", start_literal);
226 
227  dgr = palloc(sizeof(char)*numdigits_degrees+1);
228  snprintf(dgr, numdigits_degrees+1, "%s", &literal[start_literal]);
229 
230  if (strchr(literal, '.') == NULL && strchr(literal, ',') == NULL) {
231 
243  POSTGIS_DEBUG(2, " lat/lon integer coordinates detected");
244  POSTGIS_DEBUGF(2, " parsed degrees (lon/lat): %s", dgr);
245 
246  /* literal contain at least degrees.
247  * minutes and seconds are optional */
248  result = atof(dgr);
249 
250  /* checks if the literal contains minutes */
251  if (literal_length > (start_literal + numdigits_degrees)) {
252 
253  min = palloc(sizeof(char)*numdigits_minutes+1);
254  snprintf(min, numdigits_minutes+1, "%s", &literal[start_literal+numdigits_degrees]);
255  POSTGIS_DEBUGF(2, " parsed minutes (lon/lat): %s", min);
256  result = result + atof(min) / 60;
257  pfree(min);
258 
259  /* checks if the literal contains seconds */
260  if (literal_length >= (start_literal + numdigits_degrees + numdigits_minutes)) {
261 
262  sec = palloc(sizeof(char)*numdigits_seconds+1);
263  snprintf(sec, numdigits_seconds+1, "%s", &literal[start_literal+numdigits_degrees+numdigits_minutes]);
264  POSTGIS_DEBUGF(2, " parsed seconds (lon/lat): %s", sec);
265 
266  result = result + atof(sec) / 3600;
267  pfree(sec);
268 
269  }
270 
271 
272  }
273 
274 
275  } else {
276 
277  POSTGIS_DEBUG(2, " decimal coordinates detected");
278 
279  if (strchr(literal, ',')) {
280 
281  /* changes the literal decimal sign from comma to period to avoid problems with atof.
282  * from the docs "In MARC21/XML coordinates, the decimal sign may be either a period or a comma." */
283 
284  literal[literal_length-strlen(strchr(literal, ','))]='.';
285  POSTGIS_DEBUGF(2, " decimal separator changed to '.': %s",literal);
286 
287  }
288 
289  /* checks if the literal is encoded in decimal degrees */
290  if (literal[start_literal + numdigits_degrees] == '.') {
291 
301  char *dec = palloc(sizeof(char)*literal_length+1);
302  snprintf(dec, literal_length+1, "%s", &literal[start_literal]);
303  result = atof(dec);
304 
305  POSTGIS_DEBUGF(2, " parsed decimal degrees: %s", dec);
306  pfree(dec);
307 
308  /* checks if the literal is encoded in decimal minutes */
309  } else if (literal[start_literal + numdigits_degrees + numdigits_minutes] == '.') {
310 
320  size_t len_decimal_minutes = literal_length - (start_literal + numdigits_degrees);
321 
322  min = palloc(sizeof(char)*len_decimal_minutes+1);
323  snprintf(min, len_decimal_minutes+1, "%s", &literal[start_literal + numdigits_degrees]);
324 
325  POSTGIS_DEBUGF(2, " parsed degrees: %s", dgr);
326  POSTGIS_DEBUGF(2, " parsed decimal minutes: %s", min);
327 
328  result = atof(dgr) + (atof(min) / 60);
329 
330  pfree(min);
331 
332  /* checks if the literal is encoded in decimal seconds */
333  } else if (literal[start_literal + numdigits_degrees + numdigits_minutes + numdigits_seconds] == '.') {
334 
345  size_t len_decimal_seconds = literal_length - (start_literal + numdigits_degrees + numdigits_minutes);
346 
347  min = palloc(sizeof(char)*numdigits_minutes+1);
348  snprintf(min, numdigits_minutes+1, "%s", &literal[start_literal + numdigits_degrees]);
349 
350  sec = palloc(sizeof(char)*len_decimal_seconds+1);
351  snprintf(sec, len_decimal_seconds+1, "%s", &literal[start_literal + numdigits_degrees + numdigits_minutes]);
352 
353  result = atof(dgr) + (atof(min) / 60) + (atof(sec) / 3600);
354 
355  POSTGIS_DEBUGF(2, " parsed degrees: %s", dgr);
356  POSTGIS_DEBUGF(2, " parsed minutes: %s", min);
357  POSTGIS_DEBUGF(2, " parsed decimal seconds: %s", sec);
358  pfree(min);
359  pfree(sec);
360 
361  }
362 
363  }
364 
370  pfree(dgr);
371 
372  if (start_character == 'S' || start_character == 'W' || start_character == '-') {
373 
374  POSTGIS_DEBUGF(2, " switching sign due to start character: '%c'", start_character);
375  result = -result;
376 
377  }
378 
379  POSTGIS_DEBUGF(2, "=> parse_geo_literal returns: %.*f (in decimal degrees)", literal_length-(3+start_literal), result);
380  return result;
381 }
382 
383 static LWGEOM*
384 parse_marc21(xmlNodePtr xnode) {
385 
386  int ngeoms;
387  int i;
388  xmlNodePtr datafield;
389  xmlNodePtr subfield;
390  LWGEOM *result;
391  LWGEOM **lwgeoms = (LWGEOM**) lwalloc(sizeof(LWGEOM*));
392  uint8_t geometry_type;
393  uint8_t result_type;
394  char *code;
395  char *literal;
396 
397  POSTGIS_DEBUGF(2, "parse_marc21 called: root '<%s>'", xnode->name);
398 
404  if (!is_xml_element(xnode, "record"))
405  lwpgerror("invalid MARC21/XML document. Root element <record> expected but <%s> found.",xnode->name);
406 
407  result_type = 0;
408  ngeoms = 0;
409 
410  for (datafield = xnode->children; datafield != NULL; datafield = datafield->next) {
411 
412  char *lw = NULL;
413  char *le = NULL;
414  char *ln = NULL;
415  char *ls = NULL;
416 
417  if (datafield->type != XML_ELEMENT_NODE) continue;
418 
419  if (!is_xml_element(datafield, "datafield") || xmlStrcmp(xmlGetProp(datafield, (xmlChar*) "tag"),(xmlChar*) "034") != 0) continue;
420 
421  POSTGIS_DEBUG(3, " datafield found");
422 
423  for (subfield = datafield->children; subfield != NULL; subfield = subfield->next) {
424 
425  if (subfield->type != XML_ELEMENT_NODE) continue;
426  if (!is_xml_element(subfield, "subfield"))
427  continue;
428 
429  code = (char*) xmlGetProp(subfield, (xmlChar*) "code");
430 
431  if ((strcmp(code, "d") != 0 &&
432  strcmp(code, "e") != 0 &&
433  strcmp(code, "f") != 0 &&
434  strcmp(code, "g")) != 0)
435  continue;
436 
437  literal = (char*) xmlNodeGetContent(subfield);
438 
439  POSTGIS_DEBUGF(3, " subfield code '%s': %s", code, literal);
440 
441  if (is_literal_valid(literal) == LW_TRUE) {
442 
443  if (strcmp(code, "d") == 0) lw = literal;
444  else if (strcmp(code, "e") == 0) le = literal;
445  else if (strcmp(code, "f") == 0) ln = literal;
446  else if (strcmp(code, "g") == 0) ls = literal;
447 
448  } else {
449 
450  lwpgerror("parse error - invalid literal at 034$%s: \"%s\"", code, literal);
451 
452  }
453 
454  }
455 
456  xmlFreeNode(subfield);
457 
458  if (lw && le && ln && ls) {
459 
460  double w = parse_geo_literal(lw);
461  double e = parse_geo_literal(le);
462  double n = parse_geo_literal(ln);
463  double s = parse_geo_literal(ls);
464  geometry_type = 0;
465 
466  if (ngeoms > 0) lwgeoms = (LWGEOM**)
467  lwrealloc(lwgeoms, sizeof(LWGEOM*) * (ngeoms + 1));
468 
469  if (fabs(w - e) < 0.0000001f && fabs(n - s) < 0.0000001f) {
470 
477  lwgeoms[ngeoms] = (LWGEOM*) lwpoint_make2d(SRID_UNKNOWN, w, s);
478  geometry_type = MULTIPOINTTYPE;
479 
480  } else {
481 
482  lwgeoms[ngeoms] = (LWGEOM*) lwpoly_construct_envelope(SRID_UNKNOWN, w, n, e, s);
483  geometry_type = MULTIPOLYGONTYPE;
484 
485  }
486 
487  if (ngeoms && result_type != geometry_type) {
488  result_type = COLLECTIONTYPE;
489  } else {
490  result_type = geometry_type;
491  }
492 
493  ngeoms++;
494 
495  } else {
496 
497  if (lw || le || ln || ls) {
498 
499  lwpgerror("parse error - the Coded Cartographic Mathematical Data (datafield:034) in the given MARC21/XML is incomplete. Coordinates for subfields \"$d\",\"$e\",\"$f\" and \"$g\" are expected.");
500  }
501 
502  }
503 
504  }
505 
506  POSTGIS_DEBUG(5, " xmlFreeNode(datafield)");
507  xmlFreeNode(datafield);
508 
509  if (ngeoms == 1) {
510 
511  POSTGIS_DEBUGF(2, "=> parse_marc21 returns single geometry: %s",lwtype_name(lwgeom_get_type(lwgeoms[0])));
512  lwgeom_force_clockwise(lwgeoms[0]);
513  return lwgeoms[0];
514 
515  } else if (ngeoms > 1) {
516 
517  result = (LWGEOM*) lwcollection_construct_empty(result_type,SRID_UNKNOWN, 0, 0);
518 
519  for (i = 0; i < ngeoms; i++) {
520 
521  POSTGIS_DEBUGF(3, " adding geometry to result set: %s",lwtype_name(lwgeom_get_type(lwgeoms[i])));
522  lwgeom_force_clockwise(lwgeoms[i]);
524 
525  }
526 
527  POSTGIS_DEBUGF(2, "=> parse_marc21 returns a collection: %s", lwtype_name(lwgeom_get_type(result)));
528  return result;
529 
530  }
531 
535  POSTGIS_DEBUG(2, "=> parse_marc21 returns NULL");
536  return NULL;
537 
538 }
char * s
Definition: cu_in_wkt.c:23
static char * w
Definition: cu_out_twkb.c:25
char result[OUT_DOUBLE_BUFFER_SIZE]
Definition: cu_print.c:267
#define LW_FALSE
Definition: liblwgeom.h:109
#define COLLECTIONTYPE
Definition: liblwgeom.h:123
LWPOINT * lwpoint_make2d(int32_t srid, double x, double y)
Definition: lwpoint.c:163
void lwgeom_free(LWGEOM *geom)
Definition: lwgeom.c:1155
#define MULTIPOINTTYPE
Definition: liblwgeom.h:120
LWPOLY * lwpoly_construct_envelope(int32_t srid, double x1, double y1, double x2, double y2)
Definition: lwpoly.c:98
#define MULTIPOLYGONTYPE
Definition: liblwgeom.h:122
void * lwrealloc(void *mem, size_t size)
Definition: lwutil.c:235
LWCOLLECTION * lwcollection_construct_empty(uint8_t type, int32_t srid, char hasz, char hasm)
Definition: lwcollection.c:92
const char * lwtype_name(uint8_t type)
Return the type name string associated with a type number (e.g.
Definition: lwutil.c:216
void lwgeom_force_clockwise(LWGEOM *lwgeom)
Force Right-hand-rule on LWGEOM polygons.
Definition: lwgeom.c:38
LWCOLLECTION * lwcollection_add_lwgeom(LWCOLLECTION *col, const LWGEOM *geom)
Appends geom to the collection managed by col.
Definition: lwcollection.c:188
void * lwalloc(size_t size)
Definition: lwutil.c:227
#define LW_TRUE
Return types for functions with status returns.
Definition: liblwgeom.h:108
#define SRID_UNKNOWN
Unknown SRID value.
Definition: liblwgeom.h:230
This library is the generic geometry handling section of PostGIS.
static bool is_xml_element(xmlNodePtr xn, const char *xml_name)
PG_FUNCTION_INFO_V1(ST_GeomFromMARC21)
static LWGEOM * parse_marc21(xmlNodePtr xnode)
Datum ST_GeomFromMARC21(PG_FUNCTION_ARGS)
static double parse_geo_literal(char *literal)
static int is_literal_valid(const char *literal)
static uint32_t lwgeom_get_type(const LWGEOM *geom)
Return LWTYPE number.
Definition: lwinline.h:145