blob: 195ce56c73bf16205284838204cd1f4fef111699 [file] [log] [blame]
Radek Krejcid91dbaf2018-09-21 15:51:39 +02001/**
2 * @file xml.c
3 * @author Radek Krejci <rkrejci@cesnet.cz>
4 * @brief Generic XML parser implementation for libyang
5 *
6 * Copyright (c) 2015 - 2018 CESNET, z.s.p.o.
7 *
8 * This source code is licensed under BSD 3-Clause License (the "License").
9 * You may not use this file except in compliance with the License.
10 * You may obtain a copy of the License at
11 *
12 * https://opensource.org/licenses/BSD-3-Clause
13 */
14
Radek Krejcic1c03d62018-11-27 10:52:43 +010015#include "common.h"
Radek Krejci4b74d5e2018-09-26 14:30:55 +020016
Radek Krejcib1890642018-10-03 14:05:40 +020017#include <assert.h>
Radek Krejci7a7fa902018-09-25 17:08:21 +020018#include <ctype.h>
Radek Krejcid91dbaf2018-09-21 15:51:39 +020019#include <stdbool.h>
20#include <stdint.h>
Radek Krejcie7b95092019-05-15 11:03:07 +020021#include <stdlib.h>
Radek Krejci4b74d5e2018-09-26 14:30:55 +020022#include <string.h>
Radek Krejcid91dbaf2018-09-21 15:51:39 +020023
Radek Krejcid91dbaf2018-09-21 15:51:39 +020024#include "xml.h"
Radek Krejcie7b95092019-05-15 11:03:07 +020025#include "printer_internal.h"
Radek Krejcid91dbaf2018-09-21 15:51:39 +020026
/*
 * Move input p by s characters; if the new position is the terminating NUL byte,
 * log an EOF error with lyxml_context c and return LY_EVALID from the calling function.
 * Wrapped in do/while (0) so that it behaves as a single statement.
 */
#define move_input(c,p,s) \
    do { \
        (p) += (s); \
        LY_CHECK_ERR_RET(!(p)[0], LOGVAL((c)->ctx, LY_VLOG_LINE, &(c)->line, LY_VCODE_EOF), LY_EVALID); \
    } while (0)

/*
 * Ignore whitespaces in the input string p, counting the skipped newlines
 * into the line counter of lyxml_context c.
 * Wrapped in do/while (0) so that it behaves as a single statement.
 */
#define ign_xmlws(c,p) \
    do { \
        while (is_xmlws(*(p))) { \
            if (*(p) == '\n') { \
                ++(c)->line; \
            } \
            ++(p); \
        } \
    } while (0)
32
/**
 * @brief Skip the input up to the first occurrence of a delimiter.
 *
 * @param[in] input NULL-terminated string to scan.
 * @param[in] delim Delimiter to search for.
 * @param[in] delim_len Number of bytes of @p delim that must match.
 * @param[out] newlines Number of '\n' characters passed while scanning. Characters equal
 * to the first byte of @p delim are never counted.
 * @return Pointer to the beginning of the delimiter found in @p input,
 * NULL if the end of the input was reached without finding it.
 */
static const char *
ign_todelim(register const char *input, const char *delim, size_t delim_len, size_t *newlines)
{
    size_t matched;

    *newlines = 0;
    while (*input) {
        if (*input == *delim) {
            /* possible start of the delimiter, compare it byte by byte */
            matched = 0;
            while (matched < delim_len && input[matched] == delim[matched]) {
                ++matched;
            }
            if (matched == delim_len) {
                return input;
            }
        } else if (*input == '\n') {
            ++(*newlines);
        }
        ++input;
    }

    /* end of input reached */
    return NULL;
}
67
Radek Krejci4b74d5e2018-09-26 14:30:55 +020068/**
Radek Krejci7a7fa902018-09-25 17:08:21 +020069 * Store UTF-8 character specified as 4byte integer into the dst buffer.
70 * Returns number of written bytes (4 max), expects that dst has enough space.
71 *
72 * UTF-8 mapping:
73 * 00000000 -- 0000007F: 0xxxxxxx
74 * 00000080 -- 000007FF: 110xxxxx 10xxxxxx
75 * 00000800 -- 0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
76 * 00010000 -- 001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
77 *
78 * Includes checking for valid characters (following RFC 7950, sec 9.4)
79 */
80static LY_ERR
Radek Krejci117d2082018-09-26 10:05:14 +020081lyxml_pututf8(char *dst, uint32_t value, size_t *bytes_written)
Radek Krejci7a7fa902018-09-25 17:08:21 +020082{
83 if (value < 0x80) {
84 /* one byte character */
85 if (value < 0x20 &&
86 value != 0x09 &&
87 value != 0x0a &&
88 value != 0x0d) {
89 return LY_EINVAL;
90 }
91
92 dst[0] = value;
93 (*bytes_written) = 1;
94 } else if (value < 0x800) {
95 /* two bytes character */
96 dst[0] = 0xc0 | (value >> 6);
97 dst[1] = 0x80 | (value & 0x3f);
98 (*bytes_written) = 2;
99 } else if (value < 0xfffe) {
100 /* three bytes character */
101 if (((value & 0xf800) == 0xd800) ||
102 (value >= 0xfdd0 && value <= 0xfdef)) {
103 /* exclude surrogate blocks %xD800-DFFF */
104 /* exclude noncharacters %xFDD0-FDEF */
105 return LY_EINVAL;
106 }
107
108 dst[0] = 0xe0 | (value >> 12);
109 dst[1] = 0x80 | ((value >> 6) & 0x3f);
110 dst[2] = 0x80 | (value & 0x3f);
111
112 (*bytes_written) = 3;
113 } else if (value < 0x10fffe) {
114 if ((value & 0xffe) == 0xffe) {
115 /* exclude noncharacters %xFFFE-FFFF, %x1FFFE-1FFFF, %x2FFFE-2FFFF, %x3FFFE-3FFFF, %x4FFFE-4FFFF,
116 * %x5FFFE-5FFFF, %x6FFFE-6FFFF, %x7FFFE-7FFFF, %x8FFFE-8FFFF, %x9FFFE-9FFFF, %xAFFFE-AFFFF,
117 * %xBFFFE-BFFFF, %xCFFFE-CFFFF, %xDFFFE-DFFFF, %xEFFFE-EFFFF, %xFFFFE-FFFFF, %x10FFFE-10FFFF */
118 return LY_EINVAL;
119 }
120 /* four bytes character */
121 dst[0] = 0xf0 | (value >> 18);
122 dst[1] = 0x80 | ((value >> 12) & 0x3f);
123 dst[2] = 0x80 | ((value >> 6) & 0x3f);
124 dst[3] = 0x80 | (value & 0x3f);
125
126 (*bytes_written) = 4;
127 }
128 return LY_SUCCESS;
129}
130
Radek Krejci4b74d5e2018-09-26 14:30:55 +0200131/**
132 * @brief Check/Get an XML qualified name from the input string.
133 *
134 * The identifier must have at least one valid character complying the name start character constraints.
135 * The identifier is terminated by the first character, which does not comply to the name character constraints.
136 *
137 * See https://www.w3.org/TR/xml-names/#NT-NCName
138 *
139 * @param[in] context XML context to track lines or store errors into libyang context.
140 * @param[in,out] input Input string to process, updated according to the processed/read data.
141 * Note that the term_char is also read, so input points after the term_char at the end.
142 * @param[out] term_char The first character in the input string which does not compy to the name constraints.
143 * @param[out] term_char_len Number of bytes used to encode UTF8 term_char. Serves to be able to go back in input string.
144 * @return LY_ERR value.
145 */
146static LY_ERR
Radek Krejcid91dbaf2018-09-21 15:51:39 +0200147lyxml_check_qname(struct lyxml_context *context, const char **input, unsigned int *term_char, size_t *term_char_len)
148{
149 unsigned int c;
150 const char *id = (*input);
151 LY_ERR rc;
152
153 /* check NameStartChar (minus colon) */
Radek Krejcib416be62018-10-01 14:51:45 +0200154 LY_CHECK_ERR_RET(ly_getutf8(input, &c, NULL) != LY_SUCCESS,
Radek Krejcid91dbaf2018-09-21 15:51:39 +0200155 LOGVAL(context->ctx, LY_VLOG_LINE, &context->line, LY_VCODE_INCHAR, (*input)[0]), LY_EVALID);
156 LY_CHECK_ERR_RET(!is_xmlqnamestartchar(c),
157 LOGVAL(context->ctx, LY_VLOG_LINE, &context->line, LYVE_SYNTAX,
158 "Identifier \"%s\" starts with invalid character.", id),
159 LY_EVALID);
160
161 /* check rest of the identifier */
Radek Krejcib416be62018-10-01 14:51:45 +0200162 for (rc = ly_getutf8(input, &c, term_char_len);
Radek Krejcid91dbaf2018-09-21 15:51:39 +0200163 rc == LY_SUCCESS && is_xmlqnamechar(c);
Radek Krejcib416be62018-10-01 14:51:45 +0200164 rc = ly_getutf8(input, &c, term_char_len));
Radek Krejcid91dbaf2018-09-21 15:51:39 +0200165 LY_CHECK_ERR_RET(rc != LY_SUCCESS, LOGVAL(context->ctx, LY_VLOG_LINE, &context->line, LY_VCODE_INCHAR, (*input)[0]), LY_EVALID);
166
167 (*term_char) = c;
168 return LY_SUCCESS;
169}
170
Radek Krejci7a7fa902018-09-25 17:08:21 +0200171LY_ERR
Radek Krejcid70d1072018-10-09 14:20:47 +0200172lyxml_get_string(struct lyxml_context *context, const char **input, char **buffer, size_t *buffer_size, char **output, size_t *length, int *dynamic)
Radek Krejci7a7fa902018-09-25 17:08:21 +0200173{
174#define BUFSIZE 4096
175#define BUFSIZE_STEP 4096
176#define BUFSIZE_CHECK(CTX, BUF, SIZE, CURR, NEED) \
177 if (CURR+NEED >= SIZE) { \
178 BUF = ly_realloc(BUF, SIZE + BUFSIZE_STEP); \
179 LY_CHECK_ERR_RET(!BUF, LOGMEM(CTX), LY_EMEM); \
180 SIZE += BUFSIZE_STEP; \
181 }
182
183 struct ly_ctx *ctx = context->ctx; /* shortcut */
Radek Krejcid70d1072018-10-09 14:20:47 +0200184 const char *in = (*input), *start;
185 char *buf = NULL, delim;
Radek Krejci7a7fa902018-09-25 17:08:21 +0200186 size_t offset; /* read offset in input buffer */
Radek Krejcid70d1072018-10-09 14:20:47 +0200187 size_t len; /* length of the output string (write offset in output buffer) */
Radek Krejci7a7fa902018-09-25 17:08:21 +0200188 size_t size; /* size of the output buffer */
189 void *p;
Radek Krejci117d2082018-09-26 10:05:14 +0200190 uint32_t n;
191 size_t u, newlines;
Radek Krejci7a7fa902018-09-25 17:08:21 +0200192 bool empty_content = false;
193 LY_ERR rc;
194
Radek Krejcib1890642018-10-03 14:05:40 +0200195 assert(context);
196 assert(context->status == LYXML_ELEM_CONTENT || context->status == LYXML_ATTR_CONTENT);
197
Radek Krejci7a7fa902018-09-25 17:08:21 +0200198 if (in[0] == '\'') {
199 delim = '\'';
200 ++in;
201 } else if (in[0] == '"') {
202 delim = '"';
203 ++in;
204 } else {
205 delim = '<';
206 empty_content = true;
207 }
Radek Krejcid70d1072018-10-09 14:20:47 +0200208 start = in;
Radek Krejci7a7fa902018-09-25 17:08:21 +0200209
210 if (empty_content) {
211 /* only when processing element's content - try to ignore whitespaces used to format XML data
212 * before element's child or closing tag */
Radek Krejci117d2082018-09-26 10:05:14 +0200213 for (offset = newlines = 0; in[offset] && is_xmlws(in[offset]); ++offset) {
214 if (in[offset] == '\n') {
215 ++newlines;
216 }
217 }
Radek Krejci7a7fa902018-09-25 17:08:21 +0200218 LY_CHECK_ERR_RET(!in[offset], LOGVAL(ctx, LY_VLOG_LINE, &context->line, LY_VCODE_EOF), LY_EVALID);
Radek Krejci117d2082018-09-26 10:05:14 +0200219 context->line += newlines;
Radek Krejci7a7fa902018-09-25 17:08:21 +0200220 if (in[offset] == '<') {
Radek Krejcied6c6ad2018-09-26 09:10:18 +0200221 (*input) = in + offset;
Radek Krejcie7b95092019-05-15 11:03:07 +0200222 context->status -= 1; /* LYXML_ELEMENT */;
Radek Krejci7a7fa902018-09-25 17:08:21 +0200223 return LY_EINVAL;
224 }
Radek Krejci7a7fa902018-09-25 17:08:21 +0200225 }
Radek Krejcid70d1072018-10-09 14:20:47 +0200226 /* init */
227 offset = len = 0;
Radek Krejci7a7fa902018-09-25 17:08:21 +0200228
Radek Krejcid70d1072018-10-09 14:20:47 +0200229 if (0) {
230getbuffer:
231 /* prepare output buffer */
232 if (*buffer) {
233 buf = *buffer;
234 size = *buffer_size;
235 } else {
236 buf = malloc(BUFSIZE);
237 size = BUFSIZE;
238 LY_CHECK_ERR_RET(!buf, LOGMEM(ctx), LY_EMEM);
239 }
Radek Krejci7a7fa902018-09-25 17:08:21 +0200240 }
Radek Krejci7a7fa902018-09-25 17:08:21 +0200241
242 /* parse */
243 while (in[offset]) {
244 if (in[offset] == '&') {
Radek Krejcid70d1072018-10-09 14:20:47 +0200245 if (!buf) {
246 /* it is necessary to modify the input, so we will need a dynamically allocated buffer */
247 goto getbuffer;
248 }
249
Radek Krejci7a7fa902018-09-25 17:08:21 +0200250 if (offset) {
251 /* store what we have so far */
252 BUFSIZE_CHECK(ctx, buf, size, len, offset);
253 memcpy(&buf[len], in, offset);
254 len += offset;
255 in += offset;
256 offset = 0;
257 }
258 /* process reference */
259 /* we will need 4 bytes at most since we support only the predefined
260 * (one-char) entities and character references */
261 BUFSIZE_CHECK(ctx, buf, size, len, 4);
262 ++offset;
263 if (in[offset] != '#') {
264 /* entity reference - only predefined references are supported */
265 if (!strncmp(&in[offset], "lt;", 3)) {
266 buf[len++] = '<';
267 in += 4; /* &lt; */
268 } else if (!strncmp(&in[offset], "gt;", 3)) {
269 buf[len++] = '>';
270 in += 4; /* &gt; */
271 } else if (!strncmp(&in[offset], "amp;", 4)) {
272 buf[len++] = '&';
273 in += 5; /* &amp; */
274 } else if (!strncmp(&in[offset], "apos;", 5)) {
275 buf[len++] = '\'';
276 in += 6; /* &apos; */
277 } else if (!strncmp(&in[offset], "quot;", 5)) {
278 buf[len++] = '\"';
279 in += 6; /* &quot; */
280 } else {
Radek Krejcied6c6ad2018-09-26 09:10:18 +0200281 LOGVAL(ctx, LY_VLOG_LINE, &context->line, LYVE_SYNTAX,
282 "Entity reference \"%.*s\" not supported, only predefined references allowed.", 10, &in[offset-1]);
Radek Krejci7a7fa902018-09-25 17:08:21 +0200283 goto error;
284 }
285 offset = 0;
286 } else {
287 p = (void*)&in[offset - 1];
288 /* character reference */
289 ++offset;
290 if (isdigit(in[offset])) {
291 for (n = 0; isdigit(in[offset]); offset++) {
292 n = (10 * n) + (in[offset] - '0');
293 }
294 } else if (in[offset] == 'x' && isxdigit(in[offset + 1])) {
295 for (n = 0, ++offset; isxdigit(in[offset]); offset++) {
296 if (isdigit(in[offset])) {
297 u = (in[offset] - '0');
298 } else if (in[offset] > 'F') {
299 u = 10 + (in[offset] - 'a');
300 } else {
301 u = 10 + (in[offset] - 'A');
302 }
303 n = (16 * n) + u;
304 }
305 } else {
Radek Krejcied6c6ad2018-09-26 09:10:18 +0200306 LOGVAL(ctx, LY_VLOG_LINE, &context->line, LYVE_SYNTAX, "Invalid character reference \"%.*s\".", 12, p);
Radek Krejci7a7fa902018-09-25 17:08:21 +0200307 goto error;
308
309 }
310 LY_CHECK_ERR_GOTO(in[offset] != ';',
311 LOGVAL(ctx, LY_VLOG_LINE, &context->line, LY_VCODE_INSTREXP,
312 LY_VCODE_INSTREXP_len(&in[offset]), &in[offset], ";"),
313 error);
314 ++offset;
315 rc = lyxml_pututf8(&buf[len], n, &u);
316 LY_CHECK_ERR_GOTO(rc, LOGVAL(ctx, LY_VLOG_LINE, &context->line, LYVE_SYNTAX,
Radek Krejci117d2082018-09-26 10:05:14 +0200317 "Invalid character reference \"%.*s\" (0x%08x).", 12, p, n),
Radek Krejci7a7fa902018-09-25 17:08:21 +0200318 error);
319 len += u;
320 in += offset;
321 offset = 0;
322 }
323 } else if (in[offset] == delim) {
324 /* end of string */
Radek Krejcid70d1072018-10-09 14:20:47 +0200325 if (buf) {
326 if (len + offset >= size) {
327 buf = ly_realloc(buf, len + offset + 1);
328 LY_CHECK_ERR_RET(!buf, LOGMEM(ctx), LY_EMEM);
329 size = len + offset + 1;
330 }
331 memcpy(&buf[len], in, offset);
Radek Krejci7a7fa902018-09-25 17:08:21 +0200332 }
Radek Krejci7a7fa902018-09-25 17:08:21 +0200333 len += offset;
334 /* in case of element content, keep the leading <,
Radek Krejcib1890642018-10-03 14:05:40 +0200335 * for attribute's value move after the terminating quotation mark */
336 if (context->status == LYXML_ELEM_CONTENT) {
Radek Krejci7a7fa902018-09-25 17:08:21 +0200337 in += offset;
338 } else {
339 in += offset + 1;
340 }
341 goto success;
342 } else {
343 /* log lines */
344 if (in[offset] == '\n') {
345 ++context->line;
346 }
347
348 /* continue */
349 ++offset;
350 }
351 }
352 LOGVAL(ctx, LY_VLOG_LINE, &context->line, LY_VCODE_EOF);
353error:
354 if (!(*buffer)) {
Radek Krejcibb9b1982019-04-08 14:24:59 +0200355 /* buffer not provided, buf is local */
Radek Krejci7a7fa902018-09-25 17:08:21 +0200356 free(buf);
Radek Krejcibb9b1982019-04-08 14:24:59 +0200357 } else if (buf) {
358 /* buf is shared with caller via buffer, but buf could be reallocated, so update the provided buffer */
359 (*buffer) = buf;
360 (*buffer_size) = size;
Radek Krejci7a7fa902018-09-25 17:08:21 +0200361 }
362 return LY_EVALID;
363
364success:
Radek Krejcid70d1072018-10-09 14:20:47 +0200365 if (buf) {
366 if (!(*buffer) && size != len + 1) {
367 /* not using provided buffer, so fit the allocated buffer to what we really have inside */
368 p = realloc(buf, len + 1);
369 /* ignore realloc fail because we are reducing the buffer,
370 * so just return bigger buffer than needed */
371 if (p) {
372 size = len + 1;
373 buf = p;
374 }
Radek Krejci7a7fa902018-09-25 17:08:21 +0200375 }
Radek Krejcid70d1072018-10-09 14:20:47 +0200376 /* set terminating NULL byte */
377 buf[len] = '\0';
Radek Krejci7a7fa902018-09-25 17:08:21 +0200378 }
Radek Krejci7a7fa902018-09-25 17:08:21 +0200379
Radek Krejcib1890642018-10-03 14:05:40 +0200380 context->status -= 1;
Radek Krejcid70d1072018-10-09 14:20:47 +0200381 if (buf) {
382 (*buffer) = buf;
383 (*buffer_size) = size;
384 (*output) = buf;
385 (*dynamic) = 1;
386 } else {
387 (*output) = (char*)start;
388 (*dynamic) = 0;
389 }
390 (*length) = len;
391
Radek Krejci28e8cb52019-03-08 11:31:31 +0100392 if (context->status == LYXML_ATTRIBUTE) {
393 if (in[0] == '>') {
394 /* element terminated by > - termination of the opening tag */
395 context->status = LYXML_ELEM_CONTENT;
396 ++in;
397 } else if (in[0] == '/' && in[1] == '>') {
398 /* element terminated by /> - termination of an empty element */
399 context->status = LYXML_ELEMENT;
400 in += 2;
401
402 /* remove the closed element record from the tags list */
403 free(context->elements.objs[context->elements.count - 1]);
404 --context->elements.count;
405 }
406 }
407
408 (*input) = in;
Radek Krejci7a7fa902018-09-25 17:08:21 +0200409 return LY_SUCCESS;
410
411#undef BUFSIZE
412#undef BUFSIZE_STEP
413#undef BUFSIZE_CHECK
414}
415
Radek Krejcid972c252018-09-25 13:23:39 +0200416LY_ERR
Radek Krejci7a7fa902018-09-25 17:08:21 +0200417lyxml_get_attribute(struct lyxml_context *context, const char **input,
Radek Krejcid972c252018-09-25 13:23:39 +0200418 const char **prefix, size_t *prefix_len, const char **name, size_t *name_len)
419{
420 struct ly_ctx *ctx = context->ctx; /* shortcut */
421 const char *in = (*input);
422 const char *id;
423 const char *endtag;
424 LY_ERR rc;
425 unsigned int c;
426 size_t endtag_len;
427
428 /* initialize output variables */
429 (*prefix) = (*name) = NULL;
430 (*prefix_len) = (*name_len) = 0;
431
432 /* skip initial whitespaces */
433 ign_xmlws(context, in);
434
435 if (in[0] == '\0') {
436 /* EOF - not expected at this place */
437 return LY_EINVAL;
Radek Krejcid972c252018-09-25 13:23:39 +0200438 }
439
440 /* remember the identifier start before checking its format */
441 id = in;
442 rc = lyxml_check_qname(context, &in, &c, &endtag_len);
443 LY_CHECK_RET(rc);
444 if (c == ':') {
445 /* we have prefixed identifier */
446 endtag = in - endtag_len;
447
448 rc = lyxml_check_qname(context, &in, &c, &endtag_len);
449 LY_CHECK_RET(rc);
450
451 (*prefix) = id;
452 (*prefix_len) = endtag - id;
453 id = endtag + 1;
454 }
455 if (!is_xmlws(c) && c != '=') {
456 in = in - endtag_len;
457 LOGVAL(ctx, LY_VLOG_LINE, &context->line, LY_VCODE_INSTREXP, LY_VCODE_INSTREXP_len(in), in, "whitespace or '='");
458 return LY_EVALID;
459 }
460 in = in - endtag_len;
461 (*name) = id;
462 (*name_len) = in - id;
463
464 /* eat '=' and stop at the value beginning */
465 ign_xmlws(context, in);
466 if (in[0] != '=') {
467 LOGVAL(ctx, LY_VLOG_LINE, &context->line, LY_VCODE_INSTREXP, LY_VCODE_INSTREXP_len(in), in, "'='");
468 return LY_EVALID;
469 }
470 ++in;
471 ign_xmlws(context, in);
472 if (in[0] != '\'' && in[0] != '"') {
Radek Krejcib1890642018-10-03 14:05:40 +0200473 LOGVAL(ctx, LY_VLOG_LINE, &context->line, LY_VCODE_INSTREXP,
474 LY_VCODE_INSTREXP_len(in), in, "either single or double quotation mark");
Radek Krejcid972c252018-09-25 13:23:39 +0200475 return LY_EVALID;
476 }
Radek Krejcib1890642018-10-03 14:05:40 +0200477 context->status = LYXML_ATTR_CONTENT;
Radek Krejcid972c252018-09-25 13:23:39 +0200478
Radek Krejcid972c252018-09-25 13:23:39 +0200479 /* move caller's input */
480 (*input) = in;
481 return LY_SUCCESS;
482}
483
Radek Krejcid91dbaf2018-09-21 15:51:39 +0200484LY_ERR
Radek Krejci7a7fa902018-09-25 17:08:21 +0200485lyxml_get_element(struct lyxml_context *context, const char **input,
Radek Krejcid91dbaf2018-09-21 15:51:39 +0200486 const char **prefix, size_t *prefix_len, const char **name, size_t *name_len)
487{
488 struct ly_ctx *ctx = context->ctx; /* shortcut */
489 const char *in = (*input);
490 const char *endtag;
491 const char *sectname;
492 const char *id;
493 size_t endtag_len, newlines;
Radek Krejcib1890642018-10-03 14:05:40 +0200494 bool loop = true, closing = false;
Radek Krejcid91dbaf2018-09-21 15:51:39 +0200495 unsigned int c;
496 LY_ERR rc;
Radek Krejcib1890642018-10-03 14:05:40 +0200497 struct lyxml_elem *e;
Radek Krejcid91dbaf2018-09-21 15:51:39 +0200498
499 /* initialize output variables */
500 (*prefix) = (*name) = NULL;
501 (*prefix_len) = (*name_len) = 0;
502
503 while (loop) {
504 ign_xmlws(context, in);
505
506 if (in[0] == '\0') {
507 /* EOF */
Radek Krejcib1890642018-10-03 14:05:40 +0200508 context->status = LYXML_END;
Radek Krejcid91dbaf2018-09-21 15:51:39 +0200509 goto success;
510 } else if (in[0] != '<') {
511 return LY_EINVAL;
512 }
513 move_input(context, in, 1);
514
515 if (in[0] == '!') {
516 move_input(context, in, 1);
517 /* sections to ignore */
518 if (!strncmp(in, "--", 2)) {
519 /* comment */
520 move_input(context, in, 2);
521 sectname = "Comment";
522 endtag = "-->";
523 endtag_len = 3;
524 } else if (!strncmp(in, "[CDATA[", 7)) {
525 /* CDATA section */
526 move_input(context, in, 7);
527 sectname = "CData";
528 endtag = "]]>";
529 endtag_len = 3;
530 } else if (!strncmp(in, "DOCTYPE", 7)) {
531 /* Document type declaration - not supported */
532 LOGVAL(ctx, LY_VLOG_LINE, &context->line, LY_VCODE_NSUPP, "Document Type Declaration");
533 return LY_EVALID;
Radek Krejcic5c31bb2019-04-08 14:40:52 +0200534 } else {
535 LOGVAL(ctx, LY_VLOG_LINE, &context->line, LYVE_SYNTAX, "Unknown XML section \"%.20s\".", &in[-2]);
536 return LY_EVALID;
Radek Krejcid91dbaf2018-09-21 15:51:39 +0200537 }
538 in = ign_todelim(in, endtag, endtag_len, &newlines);
539 LY_CHECK_ERR_RET(!in, LOGVAL(ctx, LY_VLOG_LINE, &context->line, LY_VCODE_NTERM, sectname), LY_EVALID);
540 context->line += newlines;
541 in += endtag_len;
542 } else if (in[0] == '?') {
543 in = ign_todelim(in, "?>", 2, &newlines);
544 LY_CHECK_ERR_RET(!in, LOGVAL(ctx, LY_VLOG_LINE, &context->line, LY_VCODE_NTERM, "Declaration"), LY_EVALID);
545 context->line += newlines;
546 in += 2;
Radek Krejcib1890642018-10-03 14:05:40 +0200547 } else if (in[0] == '/') {
548 /* closing element */
549 closing = true;
550 ++in;
551 goto element;
Radek Krejcid91dbaf2018-09-21 15:51:39 +0200552 } else {
553 /* element */
Radek Krejcib1890642018-10-03 14:05:40 +0200554element:
Radek Krejcid91dbaf2018-09-21 15:51:39 +0200555 ign_xmlws(context, in);
556 LY_CHECK_ERR_RET(!in[0], LOGVAL(ctx, LY_VLOG_LINE, &context->line, LY_VCODE_EOF), LY_EVALID);
557
558 /* remember the identifier start before checking its format */
559 id = in;
560 rc = lyxml_check_qname(context, &in, &c, &endtag_len);
561 LY_CHECK_RET(rc);
562 if (c == ':') {
563 /* we have prefixed identifier */
564 endtag = in - endtag_len;
565
566 rc = lyxml_check_qname(context, &in, &c, &endtag_len);
567 LY_CHECK_RET(rc);
568
569 (*prefix) = id;
570 (*prefix_len) = endtag - id;
571 id = endtag + 1;
572 }
573 if (!is_xmlws(c) && c != '/' && c != '>') {
574 in = in - endtag_len;
Radek Krejcid972c252018-09-25 13:23:39 +0200575 LOGVAL(ctx, LY_VLOG_LINE, &context->line, LY_VCODE_INSTREXP, LY_VCODE_INSTREXP_len(in), in,
576 "whitespace or element tag termination ('>' or '/>'");
Radek Krejcid91dbaf2018-09-21 15:51:39 +0200577 return LY_EVALID;
578 }
Radek Krejcid91dbaf2018-09-21 15:51:39 +0200579 (*name) = id;
Radek Krejcib1890642018-10-03 14:05:40 +0200580 (*name_len) = in - endtag_len - id;
Radek Krejcid91dbaf2018-09-21 15:51:39 +0200581
Radek Krejcib1890642018-10-03 14:05:40 +0200582 if (is_xmlws(c)) {
583 /* go to the next meaningful input */
584 ign_xmlws(context, in);
585 LY_CHECK_ERR_RET(!in[0], LOGVAL(ctx, LY_VLOG_LINE, &context->line, LY_VCODE_EOF), LY_EVALID);
586 c = in[0];
587 ++in;
588 endtag_len = 1;
589 }
590
591 if (closing) {
592 /* match opening and closing element tags */
593 LY_CHECK_ERR_RET(
594 !context->elements.count,
Radek Krejci3fbc9872019-04-16 16:50:01 +0200595 LOGVAL(ctx, LY_VLOG_LINE, &context->line, LYVE_SYNTAX, "Opening and closing elements tag missmatch (\"%.*s\").", *name_len, *name),
Radek Krejcib1890642018-10-03 14:05:40 +0200596 LY_EVALID);
597 e = (struct lyxml_elem*)context->elements.objs[context->elements.count - 1];
598 LY_CHECK_ERR_RET(e->prefix_len != *prefix_len || e->name_len != *name_len
599 || (*prefix_len && strncmp(*prefix, e->prefix, e->prefix_len)) || strncmp(*name, e->name, e->name_len),
Radek Krejci3fbc9872019-04-16 16:50:01 +0200600 LOGVAL(ctx, LY_VLOG_LINE, &context->line, LYVE_SYNTAX, "Opening and closing elements tag missmatch (\"%.*s\").", *name_len, *name),
Radek Krejcib1890642018-10-03 14:05:40 +0200601 LY_EVALID);
602 /* opening and closing element tags matches, remove record from the opening tags list */
603 free(e);
604 --context->elements.count;
605 /* do not return element information to announce closing element being currently processed */
606 *name = *prefix = NULL;
607 *name_len = *prefix_len = 0;
608
609 if (c == '>') {
610 /* end of closing element */
611 context->status = LYXML_ELEMENT;
612 } else {
613 in -= endtag_len;
614 LOGVAL(ctx, LY_VLOG_LINE, &context->line, LYVE_SYNTAX, "Unexpected data \"%.*s\" in closing element tag.",
615 LY_VCODE_INSTREXP_len(in), in);
616 return LY_EVALID;
617 }
618 } else {
619 if (c == '>') {
620 /* end of opening element */
621 context->status = LYXML_ELEM_CONTENT;
622 } else if (c == '/' && in[0] == '>') {
623 /* empty element closing */
624 context->status = LYXML_ELEMENT;
625 ++in;
626 } else {
627 /* attribute */
628 context->status = LYXML_ATTRIBUTE;
629 in -= endtag_len;
630 }
631
632 if (context->status != LYXML_ELEMENT) {
633 /* store element opening tag information */
634 e = malloc(sizeof *e);
635 LY_CHECK_ERR_RET(!e, LOGMEM(ctx), LY_EMEM);
636 e->name = *name;
637 e->prefix = *prefix;
638 e->name_len = *name_len;
639 e->prefix_len = *prefix_len;
640 ly_set_add(&context->elements, e, LY_SET_OPT_USEASLIST);
641 }
642 }
Radek Krejcid91dbaf2018-09-21 15:51:39 +0200643 loop = false;
644 }
645 }
646
647success:
648 /* move caller's input */
649 (*input) = in;
650 return LY_SUCCESS;
651}
652
Radek Krejci4b74d5e2018-09-26 14:30:55 +0200653LY_ERR
654lyxml_ns_add(struct lyxml_context *context, const char *element_name, const char *prefix, size_t prefix_len, char *uri)
655{
656 struct lyxml_ns *ns;
657
658 ns = malloc(sizeof *ns);
659 LY_CHECK_ERR_RET(!ns, LOGMEM(context->ctx), LY_EMEM);
660
Radek Krejcie0734d22019-04-05 15:54:28 +0200661 /* to distinguish 2 elements, we need not only the name, but also its depth in the XML tree.
662 * In case some dictionary is used to store elements' names (so name strings of 2 distinguish nodes
663 * actually points to the same memory), so the depth is necessary to distinguish parent/child nodes
664 * of the same name. Otherwise, the namespace defined in parent could be removed when leaving child node. */
665 ns->element_depth = context->elements.count;
Radek Krejci4b74d5e2018-09-26 14:30:55 +0200666 ns->element = element_name;
Radek Krejcie0734d22019-04-05 15:54:28 +0200667
Radek Krejci4b74d5e2018-09-26 14:30:55 +0200668 ns->uri = uri;
669 if (prefix) {
670 ns->prefix = strndup(prefix, prefix_len);
671 LY_CHECK_ERR_RET(!ns->prefix, LOGMEM(context->ctx); free(ns), LY_EMEM);
672 } else {
673 ns->prefix = NULL;
674 }
675
676 LY_CHECK_ERR_RET(ly_set_add(&context->ns, ns, LY_SET_OPT_USEASLIST) == -1, free(ns->prefix), LY_EMEM);
677 return LY_SUCCESS;
678}
679
680const struct lyxml_ns *
681lyxml_ns_get(struct lyxml_context *context, const char *prefix, size_t prefix_len)
682{
683 unsigned int u;
684 struct lyxml_ns *ns;
685
686 for (u = context->ns.count - 1; u + 1 > 0; --u) {
687 ns = (struct lyxml_ns *)context->ns.objs[u];
688 if (prefix) {
689 if (!strncmp(prefix, ns->prefix, prefix_len) && ns->prefix[prefix_len] == '\0') {
690 return ns;
691 }
692 } else if (!ns->prefix) {
693 /* default namespace */
694 return ns;
695 }
696 }
697
698 return NULL;
699}
700
701LY_ERR
702lyxml_ns_rm(struct lyxml_context *context, const char *element_name)
703{
704 unsigned int u;
705
706 for (u = context->ns.count - 1; u + 1 > 0; --u) {
Radek Krejcie0734d22019-04-05 15:54:28 +0200707 if (((struct lyxml_ns *)context->ns.objs[u])->element != element_name ||
708 ((struct lyxml_ns *)context->ns.objs[u])->element_depth != context->elements.count + 1) {
709 /* we are done, the namespaces from a single element are supposed to be together;
710 * the second condition is there to distinguish parent/child elements with the same name
711 * (which are for some reason stored at the same memory chunk), so we need to distinguish
712 * level of the node */
Radek Krejci4b74d5e2018-09-26 14:30:55 +0200713 break;
714 }
715 /* remove the ns structure */
716 free(((struct lyxml_ns *)context->ns.objs[u])->prefix);
717 free(((struct lyxml_ns *)context->ns.objs[u])->uri);
718 free(context->ns.objs[u]);
719 --context->ns.count;
720 }
721
722 if (!context->ns.count) {
723 /* cleanup the context's namespaces storage */
724 ly_set_erase(&context->ns, NULL);
725 }
726
727 return LY_SUCCESS;
728}
Radek Krejcib1890642018-10-03 14:05:40 +0200729
730void
731lyxml_context_clear(struct lyxml_context *context)
732{
733 unsigned int u;
734
735 ly_set_erase(&context->elements, free);
736 for (u = context->ns.count - 1; u + 1 > 0; --u) {
737 /* remove the ns structure */
738 free(((struct lyxml_ns *)context->ns.objs[u])->prefix);
739 free(((struct lyxml_ns *)context->ns.objs[u])->uri);
740 free(context->ns.objs[u]);
741 }
742 ly_set_erase(&context->ns, NULL);
743}
Radek Krejcie7b95092019-05-15 11:03:07 +0200744
745LY_ERR
746lyxml_dump_text(struct lyout *out, const char *text, int attribute)
747{
748 LY_ERR ret = LY_SUCCESS;
749 unsigned int u;
750
751 if (!text) {
752 return 0;
753 }
754
755 for (u = 0; text[u]; u++) {
756 switch (text[u]) {
757 case '&':
758 ret = ly_print(out, "&amp;");
759 break;
760 case '<':
761 ret = ly_print(out, "&lt;");
762 break;
763 case '>':
764 /* not needed, just for readability */
765 ret = ly_print(out, "&gt;");
766 break;
767 case '"':
768 if (attribute) {
769 ret = ly_print(out, "&quot;");
770 break;
771 }
772 /* falls through */
773 default:
774 ly_write(out, &text[u], 1);
775 }
776 }
777
778 return ret;
779}
780