PostGIS  3.4.0dev-r@@SVN_REVISION@@
lwgeom_in_marc21.c
Go to the documentation of this file.
1 /**********************************************************************
2  *
3  * PostGIS - Spatial Types for PostgreSQL
4  * http://postgis.net
5  *
6  * PostGIS is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation, either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * PostGIS is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with PostGIS. If not, see <http://www.gnu.org/licenses/>.
18  *
19  **********************************************************************/
20 
21 #include "postgres.h"
22 #include "utils/builtins.h"
23 
24 #include <libxml/tree.h>
25 #include <libxml/parser.h>
26 #include <string.h>
27 
28 #include "../postgis_config.h"
29 #include "lwgeom_pg.h"
30 #include <math.h>
31 #include "liblwgeom.h"
32 
33 static LWGEOM* parse_marc21(xmlNodePtr xnode);
34 
35 /**********************************************************************
36  * Ability to parse geographic data contained in MARC21/XML records
37  * to return an LWGEOM or an error message. It returns NULL if the
38  * MARC21/XML record is valid but does not contain any geographic
39  * data (datafield:034).
40  *
41  * MARC21/XML version supported: 1.1
42  * MARC21/XML Cartographic Mathematical Data Definition:
43  * https://www.loc.gov/marc/bibliographic/bd034.html
44  *
45  * Copyright (C) 2021 University of Münster (WWU), Germany
46  * Written by Jim Jones <jim.jones@uni-muenster.de>
47  *
48  **********************************************************************/
49 
51 Datum ST_GeomFromMARC21(PG_FUNCTION_ARGS) {
52  GSERIALIZED *geom;
53  LWGEOM *lwgeom;
54  xmlDocPtr xmldoc;
55  text *xml_input;
56  int xml_size;
57  char *xml;
58  xmlNodePtr xmlroot = NULL;
59 
60  if (PG_ARGISNULL(0)) PG_RETURN_NULL();
61 
62  xml_input = PG_GETARG_TEXT_P(0);
63  xml = text_to_cstring(xml_input);
64  xml_size = VARSIZE_ANY_EXHDR(xml_input);
65 
66  xmlInitParser();
67  xmldoc = xmlReadMemory(xml, xml_size, NULL, NULL, XML_PARSE_SAX1);
68 
69  if (!xmldoc || (xmlroot = xmlDocGetRootElement(xmldoc)) == NULL) {
70  xmlFreeDoc(xmldoc);
71  xmlCleanupParser();
72  lwpgerror("invalid MARC21/XML document.");
73  }
74 
75  lwgeom = parse_marc21(xmlroot);
76 
77  xmlFreeDoc(xmldoc);
78  xmlCleanupParser();
79 
80  if (lwgeom == NULL) {
81 
82  //lwgeom_free(lwgeom);
83  PG_RETURN_NULL();
84 
85  }
86 
87  geom = geometry_serialize(lwgeom);
88 
89  lwgeom_free(lwgeom);
90 
91  PG_RETURN_POINTER(geom);
92 }
93 
94 static int is_literal_valid(const char *literal) {
95 
96  int num_dec_sep;
97  int coord_start;
98  int literal_length;
99 
100  if(literal == NULL) return LW_FALSE;
101 
102  literal_length = strlen(literal);
103 
104  POSTGIS_DEBUGF(2, "is_literal_valid called (%s)", literal);
105 
106  if (literal_length < 3) return LW_FALSE;
107 
108  coord_start = 0;
109  num_dec_sep = 0;
110 
120  if (literal[0] == 'N' || literal[0] == 'E' || literal[0] == 'S' || literal[0] == 'W' || literal[0] == '+' || literal[0] == '-') {
121 
122  if (literal_length < 4) {
123  POSTGIS_DEBUGF(3, " invalid literal length (%d): \"%s\"", literal_length, literal);
124  return LW_FALSE;
125  }
126 
127  coord_start = 1;
128  }
129 
130  for (int j = coord_start; j < literal_length; j++) {
131 
132  if (!isdigit(literal[j])) {
133 
134 
135  if (j < 3) {
136 
141  POSTGIS_DEBUGF(3," invalid character '%c' at the degrees section: \"%s\"", literal[j], literal);
142  return LW_FALSE;
143 
144  }
145 
150  if (literal[j] == '.' || literal[j] == ',') {
151 
152  num_dec_sep++;
153 
154  if (num_dec_sep > 1) return LW_FALSE;
155 
156  } else {
157  POSTGIS_DEBUGF(3, " invalid character '%c' in %d: \"%s\"", literal[j], j, literal);
158  return LW_FALSE;
159 
160  }
161 
162  }
163 
164  }
165 
166  POSTGIS_DEBUGF(2, "=> is_literal_valid returns LW_TRUE for \"%s\"", literal);
167  return LW_TRUE;
168 
169 }
170 
/**
 * Converts a MARC21/XML 034 coordinate literal into signed decimal degrees.
 *
 * Layouts handled (h = optional hemisphere letter or sign):
 *   hddd[mm[ss]]   integer degrees with optional minutes and seconds
 *   hddd.dddddd    decimal degrees
 *   hdddmm.mmmm    degrees and decimal minutes
 *   hdddmmss.sss   degrees, minutes and decimal seconds
 *
 * A leading 'S', 'W' or '-' negates the result.
 *
 * @param literal NUL-terminated literal. It is modified in place: the
 *                first ',' (if any) is replaced by '.' so that atof()
 *                can parse the fraction.
 * @return the coordinate in decimal degrees. Returns 0.0 (negated when
 *         the prefix asks for it) for a NULL or empty literal and for a
 *         decimal literal whose separator is not at one of the three
 *         positions listed above.
 */
static double parse_geo_literal(char *literal) {

	enum {
		NUMDIGITS_DEGREES = 3,
		NUMDIGITS_MINUTES = 2,
		NUMDIGITS_SECONDS = 2
	};

	char dgr[NUMDIGITS_DEGREES + 1];
	char min[NUMDIGITS_MINUTES + 1];
	char sec[NUMDIGITS_SECONDS + 1];
	char *comma;
	char start_character;
	size_t literal_length;
	size_t pos_minutes;   /* first character after the degrees */
	size_t pos_seconds;   /* first character after the minutes */
	size_t pos_fraction;  /* first character after the seconds */
	int start_literal = 0;
	double result = 0.0;

	/* nothing to parse; avoids reading past the terminator below */
	if (literal == NULL || literal[0] == '\0') return result;

	start_character = literal[0];

	POSTGIS_DEBUGF(2, "parse_geo_literal called (%s)", literal);
	POSTGIS_DEBUGF(2, "  start character: %c", start_character);

	literal_length = strlen(literal);

	/* a non-digit first character is the hemisphere letter or sign */
	if (!isdigit((unsigned char) start_character)) start_literal = 1;

	POSTGIS_DEBUGF(2, "  start_literal=%d", start_literal);

	pos_minutes = (size_t) start_literal + NUMDIGITS_DEGREES;
	pos_seconds = pos_minutes + NUMDIGITS_MINUTES;
	pos_fraction = pos_seconds + NUMDIGITS_SECONDS;

	snprintf(dgr, sizeof(dgr), "%.*s", NUMDIGITS_DEGREES, &literal[start_literal]);

	if (strchr(literal, '.') == NULL && strchr(literal, ',') == NULL) {

		POSTGIS_DEBUG(2, "  lat/lon integer coordinates detected");
		POSTGIS_DEBUGF(2, "  parsed degrees (lon/lat): %s", dgr);

		/* literal contain at least degrees.
		 * minutes and seconds are optional */
		result = atof(dgr);

		/* checks if the literal contains minutes */
		if (literal_length > pos_minutes) {

			snprintf(min, sizeof(min), "%.*s", NUMDIGITS_MINUTES, &literal[pos_minutes]);
			POSTGIS_DEBUGF(2, "  parsed minutes (lon/lat): %s", min);
			result = result + atof(min) / 60;

			/* checks if the literal contains seconds */
			if (literal_length > pos_seconds) {

				snprintf(sec, sizeof(sec), "%.*s", NUMDIGITS_SECONDS, &literal[pos_seconds]);
				POSTGIS_DEBUGF(2, "  parsed seconds (lon/lat): %s", sec);
				result = result + atof(sec) / 3600;

			}

		}

	} else {

		POSTGIS_DEBUG(2, "  decimal coordinates detected");

		/* changes the literal decimal sign from comma to period to avoid problems with atof.
		 * from the docs "In MARC21/XML coordinates, the decimal sign may be either a period or a comma." */
		comma = strchr(literal, ',');

		if (comma != NULL) {

			*comma = '.';
			POSTGIS_DEBUGF(2, "  decimal separator changed to '.': %s",literal);

		}

		/*
		 * The position of the separator tells which unit carries the
		 * fraction. Every index is checked against the literal length
		 * first, so short literals are never read past their terminator.
		 */
		if (pos_minutes < literal_length && literal[pos_minutes] == '.') {

			/* decimal degrees: hddd.dddddd */
			result = atof(&literal[start_literal]);

			POSTGIS_DEBUGF(2, "  parsed decimal degrees: %s", &literal[start_literal]);

		} else if (pos_seconds < literal_length && literal[pos_seconds] == '.') {

			/* decimal minutes: hdddmm.mmmm */
			POSTGIS_DEBUGF(2, "  parsed degrees: %s", dgr);
			POSTGIS_DEBUGF(2, "  parsed decimal minutes: %s", &literal[pos_minutes]);

			result = atof(dgr) + (atof(&literal[pos_minutes]) / 60);

		} else if (pos_fraction < literal_length && literal[pos_fraction] == '.') {

			/* decimal seconds: hdddmmss.sss */
			snprintf(min, sizeof(min), "%.*s", NUMDIGITS_MINUTES, &literal[pos_minutes]);

			result = atof(dgr) + (atof(min) / 60) + (atof(&literal[pos_seconds]) / 3600);

			POSTGIS_DEBUGF(2, "  parsed degrees: %s", dgr);
			POSTGIS_DEBUGF(2, "  parsed minutes: %s", min);
			POSTGIS_DEBUGF(2, "  parsed decimal seconds: %s", &literal[pos_seconds]);

		}

	}

	if (start_character == 'S' || start_character == 'W' || start_character == '-') {

		POSTGIS_DEBUGF(2, "  switching sign due to start character: '%c'", start_character);
		result = -result;

	}

	/* the '*' precision of a printf-style format must be an int */
	POSTGIS_DEBUGF(2, "=> parse_geo_literal returns: %.*f (in decimal degrees)", (int) literal_length - (3 + start_literal), result);
	return result;
}
362 
363 static LWGEOM*
364 parse_marc21(xmlNodePtr xnode) {
365 
366  int ngeoms;
367  int i;
368  xmlNodePtr datafield;
369  xmlNodePtr subfield;
370  LWGEOM *result;
371  LWGEOM **lwgeoms = (LWGEOM**) lwalloc(sizeof(LWGEOM*));
372  uint8_t geometry_type;
373  uint8_t result_type;
374  char *code;
375  char *literal;
376 
377  POSTGIS_DEBUGF(2, "parse_marc21 called: root '<%s>'", xnode->name);
378 
384  if (xmlStrcmp(xnode->name, (xmlChar*) "record")) lwpgerror("invalid MARC21/XML document. Root element <record> expected but <%s> found.",xnode->name);
385 
386  result_type = 0;
387  ngeoms = 0;
388 
389  for (datafield = xnode->children; datafield != NULL; datafield = datafield->next) {
390 
391  char *lw = NULL;
392  char *le = NULL;
393  char *ln = NULL;
394  char *ls = NULL;
395 
396  if (datafield->type != XML_ELEMENT_NODE) continue;
397 
398  if (xmlStrcmp(datafield->name, (xmlChar*) "datafield") != 0 || xmlStrcmp(xmlGetProp(datafield, (xmlChar*) "tag"),(xmlChar*) "034") != 0) continue;
399 
400  POSTGIS_DEBUG(3, " datafield found");
401 
402  for (subfield = datafield->children; subfield != NULL; subfield = subfield->next) {
403 
404  if (subfield->type != XML_ELEMENT_NODE) continue;
405  if (xmlStrcmp(subfield->name, (xmlChar*) "subfield") != 0) continue;
406 
407  code = (char*) xmlGetProp(subfield, (xmlChar*) "code");
408 
409  if ((strcmp(code, "d") != 0 && strcmp(code, "e") != 0 && strcmp(code, "f") != 0 && strcmp(code, "g")) != 0) continue;
410 
411  literal = (char*) xmlNodeGetContent(subfield);
412 
413  POSTGIS_DEBUGF(3, " subfield code '%s': %s", code, literal);
414 
415  if (is_literal_valid(literal) == LW_TRUE) {
416 
417  if (strcmp(code, "d") == 0) lw = literal;
418  else if (strcmp(code, "e") == 0) le = literal;
419  else if (strcmp(code, "f") == 0) ln = literal;
420  else if (strcmp(code, "g") == 0) ls = literal;
421 
422  } else {
423 
424  lwpgerror("parse error - invalid literal at 034$%s: \"%s\"", code, literal);
425 
426  }
427 
428  }
429 
430  xmlFreeNode(subfield);
431 
432  if (lw && le && ln && ls) {
433 
434  double w = parse_geo_literal(lw);
435  double e = parse_geo_literal(le);
436  double n = parse_geo_literal(ln);
437  double s = parse_geo_literal(ls);
438  geometry_type = 0;
439 
440  if (ngeoms > 0) lwgeoms = (LWGEOM**) lwrealloc(lwgeoms, sizeof(LWGEOM*) * (ngeoms + 1));
441 
442  if (fabs(w - e) < 0.0000001f && fabs(n - s) < 0.0000001f) {
443 
450  lwgeoms[ngeoms] = (LWGEOM*) lwpoint_make2d(SRID_UNKNOWN, w, s);
451  geometry_type = MULTIPOINTTYPE;
452 
453  } else {
454 
455  lwgeoms[ngeoms] = (LWGEOM*) lwpoly_construct_envelope(SRID_UNKNOWN, w, n, e, s);
456  geometry_type = MULTIPOLYGONTYPE;
457 
458  }
459 
460  if (ngeoms && result_type != geometry_type) {
461  result_type = COLLECTIONTYPE;
462  } else {
463  result_type = geometry_type;
464  }
465 
466  ngeoms++;
467 
468  } else {
469 
470  if (lw || le || ln || ls) {
471 
472  lwpgerror("parse error - the Coded Cartographic Mathematical Data (datafield:034) in the given MARC21/XML is incomplete. Coordinates for subfields \"$d\",\"$e\",\"$f\" and \"$g\" are expected.");
473  }
474 
475  }
476 
477  }
478 
479  POSTGIS_DEBUG(5, " xmlFreeNode(datafield)");
480  xmlFreeNode(datafield);
481 
482  if (ngeoms == 1) {
483 
484  POSTGIS_DEBUGF(2, "=> parse_marc21 returns single geometry: %s",lwtype_name(lwgeom_get_type(lwgeoms[0])));
485  lwgeom_force_clockwise(lwgeoms[0]);
486  return lwgeoms[0];
487 
488  } else if (ngeoms > 1) {
489 
490  result = (LWGEOM*) lwcollection_construct_empty(result_type,SRID_UNKNOWN, 0, 0);
491 
492  for (i = 0; i < ngeoms; i++) {
493 
494  POSTGIS_DEBUGF(3, " adding geometry to result set: %s",lwtype_name(lwgeom_get_type(lwgeoms[i])));
495  lwgeom_force_clockwise(lwgeoms[i]);
497 
498  }
499 
500  POSTGIS_DEBUGF(2, "=> parse_marc21 returns a collection: %s", lwtype_name(lwgeom_get_type(result)));
501  return result;
502 
503  }
504 
508  POSTGIS_DEBUG(2, "=> parse_marc21 returns NULL");
509  return NULL;
510 
511 }
char * s
Definition: cu_in_wkt.c:23
static char * w
Definition: cu_out_twkb.c:25
char result[OUT_DOUBLE_BUFFER_SIZE]
Definition: cu_print.c:262
#define LW_FALSE
Definition: liblwgeom.h:94
#define COLLECTIONTYPE
Definition: liblwgeom.h:108
LWPOINT * lwpoint_make2d(int32_t srid, double x, double y)
Definition: lwpoint.c:163
void lwgeom_free(LWGEOM *geom)
Definition: lwgeom.c:1155
#define MULTIPOINTTYPE
Definition: liblwgeom.h:105
LWPOLY * lwpoly_construct_envelope(int32_t srid, double x1, double y1, double x2, double y2)
Definition: lwpoly.c:98
#define MULTIPOLYGONTYPE
Definition: liblwgeom.h:107
void * lwrealloc(void *mem, size_t size)
Definition: lwutil.c:235
LWCOLLECTION * lwcollection_construct_empty(uint8_t type, int32_t srid, char hasz, char hasm)
Definition: lwcollection.c:92
const char * lwtype_name(uint8_t type)
Return the type name string associated with a type number (e.g.
Definition: lwutil.c:216
void lwgeom_force_clockwise(LWGEOM *lwgeom)
Force Right-hand-rule on LWGEOM polygons.
Definition: lwgeom.c:38
LWCOLLECTION * lwcollection_add_lwgeom(LWCOLLECTION *col, const LWGEOM *geom)
Appends geom to the collection managed by col.
Definition: lwcollection.c:188
void * lwalloc(size_t size)
Definition: lwutil.c:227
#define LW_TRUE
Return types for functions with status returns.
Definition: liblwgeom.h:93
#define SRID_UNKNOWN
Unknown SRID value.
Definition: liblwgeom.h:215
This library is the generic geometry handling section of PostGIS.
PG_FUNCTION_INFO_V1(ST_GeomFromMARC21)
static LWGEOM * parse_marc21(xmlNodePtr xnode)
Datum ST_GeomFromMARC21(PG_FUNCTION_ARGS)
static double parse_geo_literal(char *literal)
static int is_literal_valid(const char *literal)
static uint32_t lwgeom_get_type(const LWGEOM *geom)
Return LWTYPE number.
Definition: lwinline.h:145