blob: dea01a8cbefa1abcef32dcc7cab5658ceebac65b [file] [log] [blame]
/**
 * @file xml.c
 * @author Radek Krejci <rkrejci@cesnet.cz>
 * @brief Generic XML parser implementation for libyang
 *
 * Copyright (c) 2015 - 2018 CESNET, z.s.p.o.
 *
 * This source code is licensed under BSD 3-Clause License (the "License").
 * You may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     https://opensource.org/licenses/BSD-3-Clause
 */
14
Radek Krejcic1c03d62018-11-27 10:52:43 +010015#include "common.h"
Radek Krejci4b74d5e2018-09-26 14:30:55 +020016
Radek Krejcib1890642018-10-03 14:05:40 +020017#include <assert.h>
Radek Krejci7a7fa902018-09-25 17:08:21 +020018#include <ctype.h>
Radek Krejcid91dbaf2018-09-21 15:51:39 +020019#include <stdbool.h>
20#include <stdint.h>
Radek Krejcie7b95092019-05-15 11:03:07 +020021#include <stdlib.h>
Radek Krejci4b74d5e2018-09-26 14:30:55 +020022#include <string.h>
Radek Krejcid91dbaf2018-09-21 15:51:39 +020023
Radek Krejcid91dbaf2018-09-21 15:51:39 +020024#include "xml.h"
Radek Krejcie7b95092019-05-15 11:03:07 +020025#include "printer_internal.h"
Radek Krejcid91dbaf2018-09-21 15:51:39 +020026
/*
 * Move input p by s characters. If that lands on the terminating NULL byte, log an EOF
 * error using lyxml_context c and return LY_EVALID from the calling function.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to exactly one
 * statement and stays correct even when used as the body of an unbraced if/else.
 */
#define move_input(c,p,s) \
    do { \
        (p) += (s); \
        LY_CHECK_ERR_RET(!(p)[0], LOGVAL((c)->ctx, LY_VLOG_LINE, &(c)->line, LY_VCODE_EOF), LY_EVALID); \
    } while (0)

/*
 * Skip XML whitespaces in the input string p. Every skipped newline increments the line
 * counter of lyxml_context c. The pointer p is left on the first non-whitespace character.
 */
#define ign_xmlws(c,p) \
    do { \
        while (is_xmlws(*(p))) { \
            if (*(p) == '\n') { \
                ++(c)->line; \
            } \
            ++(p); \
        } \
    } while (0)
32
/**
 * @brief Skip the input up to the first occurrence of a delimiter.
 *
 * Newline characters passed on the way are counted so the caller can keep its line counter
 * up to date.
 *
 * @param[in] input NULL-terminated string to search in.
 * @param[in] delim Delimiter to search for.
 * @param[in] delim_len Number of bytes of @p delim that must match.
 * @param[out] newlines Number of '\n' characters found in front of the returned position
 * (or in the whole input when the delimiter is not found).
 * @return Pointer to the beginning of the delimiter inside @p input,
 * NULL when the input ends before the delimiter is found.
 */
static const char *
ign_todelim(register const char *input, const char *delim, size_t delim_len, size_t *newlines)
{
    const char *found = NULL;
    size_t matched;
    size_t nl_count = 0;

    while (*input) {
        if (*input == *delim) {
            /* candidate position, compare the delimiter byte by byte;
             * the comparison stops on the first difference, which includes the end of input */
            matched = 0;
            while ((matched < delim_len) && (input[matched] == delim[matched])) {
                ++matched;
            }
            if (matched == delim_len) {
                found = input;
                break;
            }
        } else if (*input == '\n') {
            ++nl_count;
        }
        ++input;
    }

    (*newlines) = nl_count;
    return found;
}
67
Radek Krejci4b74d5e2018-09-26 14:30:55 +020068/**
Radek Krejci7a7fa902018-09-25 17:08:21 +020069 * Store UTF-8 character specified as 4byte integer into the dst buffer.
70 * Returns number of written bytes (4 max), expects that dst has enough space.
71 *
72 * UTF-8 mapping:
73 * 00000000 -- 0000007F: 0xxxxxxx
74 * 00000080 -- 000007FF: 110xxxxx 10xxxxxx
75 * 00000800 -- 0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
76 * 00010000 -- 001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
77 *
78 * Includes checking for valid characters (following RFC 7950, sec 9.4)
79 */
80static LY_ERR
Radek Krejci117d2082018-09-26 10:05:14 +020081lyxml_pututf8(char *dst, uint32_t value, size_t *bytes_written)
Radek Krejci7a7fa902018-09-25 17:08:21 +020082{
83 if (value < 0x80) {
84 /* one byte character */
85 if (value < 0x20 &&
86 value != 0x09 &&
87 value != 0x0a &&
88 value != 0x0d) {
89 return LY_EINVAL;
90 }
91
92 dst[0] = value;
93 (*bytes_written) = 1;
94 } else if (value < 0x800) {
95 /* two bytes character */
96 dst[0] = 0xc0 | (value >> 6);
97 dst[1] = 0x80 | (value & 0x3f);
98 (*bytes_written) = 2;
99 } else if (value < 0xfffe) {
100 /* three bytes character */
101 if (((value & 0xf800) == 0xd800) ||
102 (value >= 0xfdd0 && value <= 0xfdef)) {
103 /* exclude surrogate blocks %xD800-DFFF */
104 /* exclude noncharacters %xFDD0-FDEF */
105 return LY_EINVAL;
106 }
107
108 dst[0] = 0xe0 | (value >> 12);
109 dst[1] = 0x80 | ((value >> 6) & 0x3f);
110 dst[2] = 0x80 | (value & 0x3f);
111
112 (*bytes_written) = 3;
113 } else if (value < 0x10fffe) {
114 if ((value & 0xffe) == 0xffe) {
115 /* exclude noncharacters %xFFFE-FFFF, %x1FFFE-1FFFF, %x2FFFE-2FFFF, %x3FFFE-3FFFF, %x4FFFE-4FFFF,
116 * %x5FFFE-5FFFF, %x6FFFE-6FFFF, %x7FFFE-7FFFF, %x8FFFE-8FFFF, %x9FFFE-9FFFF, %xAFFFE-AFFFF,
117 * %xBFFFE-BFFFF, %xCFFFE-CFFFF, %xDFFFE-DFFFF, %xEFFFE-EFFFF, %xFFFFE-FFFFF, %x10FFFE-10FFFF */
118 return LY_EINVAL;
119 }
120 /* four bytes character */
121 dst[0] = 0xf0 | (value >> 18);
122 dst[1] = 0x80 | ((value >> 12) & 0x3f);
123 dst[2] = 0x80 | ((value >> 6) & 0x3f);
124 dst[3] = 0x80 | (value & 0x3f);
125
126 (*bytes_written) = 4;
127 }
128 return LY_SUCCESS;
129}
130
Radek Krejci4b74d5e2018-09-26 14:30:55 +0200131/**
132 * @brief Check/Get an XML qualified name from the input string.
133 *
134 * The identifier must have at least one valid character complying the name start character constraints.
135 * The identifier is terminated by the first character, which does not comply to the name character constraints.
136 *
137 * See https://www.w3.org/TR/xml-names/#NT-NCName
138 *
139 * @param[in] context XML context to track lines or store errors into libyang context.
140 * @param[in,out] input Input string to process, updated according to the processed/read data.
141 * Note that the term_char is also read, so input points after the term_char at the end.
142 * @param[out] term_char The first character in the input string which does not compy to the name constraints.
143 * @param[out] term_char_len Number of bytes used to encode UTF8 term_char. Serves to be able to go back in input string.
144 * @return LY_ERR value.
145 */
146static LY_ERR
Radek Krejcid91dbaf2018-09-21 15:51:39 +0200147lyxml_check_qname(struct lyxml_context *context, const char **input, unsigned int *term_char, size_t *term_char_len)
148{
149 unsigned int c;
150 const char *id = (*input);
151 LY_ERR rc;
152
153 /* check NameStartChar (minus colon) */
Radek Krejcib416be62018-10-01 14:51:45 +0200154 LY_CHECK_ERR_RET(ly_getutf8(input, &c, NULL) != LY_SUCCESS,
Radek Krejcid91dbaf2018-09-21 15:51:39 +0200155 LOGVAL(context->ctx, LY_VLOG_LINE, &context->line, LY_VCODE_INCHAR, (*input)[0]), LY_EVALID);
156 LY_CHECK_ERR_RET(!is_xmlqnamestartchar(c),
157 LOGVAL(context->ctx, LY_VLOG_LINE, &context->line, LYVE_SYNTAX,
158 "Identifier \"%s\" starts with invalid character.", id),
159 LY_EVALID);
160
161 /* check rest of the identifier */
Radek Krejcib416be62018-10-01 14:51:45 +0200162 for (rc = ly_getutf8(input, &c, term_char_len);
Radek Krejcid91dbaf2018-09-21 15:51:39 +0200163 rc == LY_SUCCESS && is_xmlqnamechar(c);
Radek Krejcib416be62018-10-01 14:51:45 +0200164 rc = ly_getutf8(input, &c, term_char_len));
Radek Krejcid91dbaf2018-09-21 15:51:39 +0200165 LY_CHECK_ERR_RET(rc != LY_SUCCESS, LOGVAL(context->ctx, LY_VLOG_LINE, &context->line, LY_VCODE_INCHAR, (*input)[0]), LY_EVALID);
166
167 (*term_char) = c;
168 return LY_SUCCESS;
169}
170
Radek Krejci17a78d82019-05-15 15:49:55 +0200171/**
172 * @brief Add namespace definition into XML context.
173 *
174 * Namespaces from a single element are supposed to be added sequentially together (not interleaved by a namespace from other
175 * element). This mimic namespace visibility, since the namespace defined in element E is not visible from its parents or
176 * siblings. On the other hand, namespace from a parent element can be redefined in a child element. This is also reflected
177 * by lyxml_ns_get() which returns the most recent namespace definition for the given prefix.
178 *
179 * When leaving processing of a subtree of some element (after it is removed from context->elements), caller is supposed to call
180 * lyxml_ns_rm() to remove all the namespaces defined in such an element from the context.
181 *
182 * @param[in] context XML context to work with.
183 * @param[in] prefix Pointer to the namespace prefix as taken from lyxml_get_attribute(). Can be NULL for default namespace.
184 * @param[in] prefix_len Length of the prefix string (since it is not NULL-terminated when returned from lyxml_get_attribute()).
185 * @param[in] uri Namespace URI (value) to store. Value can be obtained via lyxml_get_string() and caller is not supposed to
186 * work with the pointer when the function succeeds. In case of error the value is freed.
187 * @return LY_ERR values.
188 */
Radek Krejci2d7a47b2019-05-16 13:34:10 +0200189LY_ERR
Radek Krejci17a78d82019-05-15 15:49:55 +0200190lyxml_ns_add(struct lyxml_context *context, const char *prefix, size_t prefix_len, char *uri)
191{
192 struct lyxml_ns *ns;
193
194 ns = malloc(sizeof *ns);
195 LY_CHECK_ERR_RET(!ns, LOGMEM(context->ctx), LY_EMEM);
196
197 /* we need to connect the depth of the element where the namespace is defined with the
198 * namespace record to be able to maintain (remove) the record when the parser leaves
199 * (to its sibling or back to the parent) the element where the namespace was defined */
200 ns->depth = context->elements.count;
201
202 ns->uri = uri;
203 if (prefix) {
204 ns->prefix = strndup(prefix, prefix_len);
205 LY_CHECK_ERR_RET(!ns->prefix, LOGMEM(context->ctx); free(ns->uri); free(ns), LY_EMEM);
206 } else {
207 ns->prefix = NULL;
208 }
209
210 LY_CHECK_ERR_RET(ly_set_add(&context->ns, ns, LY_SET_OPT_USEASLIST) == -1,
211 free(ns->prefix); free(ns->uri); free(ns), LY_EMEM);
212 return LY_SUCCESS;
213}
214
215/**
216 * @brief Remove all the namespaces defined in the element recently closed (removed from the context->elements).
217 *
218 * @param[in] context XML context to work with.
Radek Krejci17a78d82019-05-15 15:49:55 +0200219 */
Radek Krejci17dca992019-05-17 10:53:27 +0200220void
Radek Krejci17a78d82019-05-15 15:49:55 +0200221lyxml_ns_rm(struct lyxml_context *context)
222{
223 unsigned int u;
224
225 for (u = context->ns.count - 1; u + 1 > 0; --u) {
226 if (((struct lyxml_ns *)context->ns.objs[u])->depth != context->elements.count + 1) {
227 /* we are done, the namespaces from a single element are supposed to be together */
228 break;
229 }
230 /* remove the ns structure */
231 free(((struct lyxml_ns *)context->ns.objs[u])->prefix);
232 free(((struct lyxml_ns *)context->ns.objs[u])->uri);
233 free(context->ns.objs[u]);
234 --context->ns.count;
235 }
236
237 if (!context->ns.count) {
238 /* cleanup the context's namespaces storage */
239 ly_set_erase(&context->ns, NULL);
240 }
Radek Krejci17a78d82019-05-15 15:49:55 +0200241}
242
243const struct lyxml_ns *
244lyxml_ns_get(struct lyxml_context *context, const char *prefix, size_t prefix_len)
245{
246 unsigned int u;
247 struct lyxml_ns *ns;
248
249 for (u = context->ns.count - 1; u + 1 > 0; --u) {
250 ns = (struct lyxml_ns *)context->ns.objs[u];
251 if (prefix) {
252 if (!strncmp(prefix, ns->prefix, prefix_len) && ns->prefix[prefix_len] == '\0') {
253 return ns;
254 }
255 } else if (!ns->prefix) {
256 /* default namespace */
257 return ns;
258 }
259 }
260
261 return NULL;
262}
263
Radek Krejci7a7fa902018-09-25 17:08:21 +0200264LY_ERR
Radek Krejcid70d1072018-10-09 14:20:47 +0200265lyxml_get_string(struct lyxml_context *context, const char **input, char **buffer, size_t *buffer_size, char **output, size_t *length, int *dynamic)
Radek Krejci7a7fa902018-09-25 17:08:21 +0200266{
267#define BUFSIZE 4096
268#define BUFSIZE_STEP 4096
269#define BUFSIZE_CHECK(CTX, BUF, SIZE, CURR, NEED) \
270 if (CURR+NEED >= SIZE) { \
271 BUF = ly_realloc(BUF, SIZE + BUFSIZE_STEP); \
272 LY_CHECK_ERR_RET(!BUF, LOGMEM(CTX), LY_EMEM); \
273 SIZE += BUFSIZE_STEP; \
274 }
275
276 struct ly_ctx *ctx = context->ctx; /* shortcut */
Radek Krejcid70d1072018-10-09 14:20:47 +0200277 const char *in = (*input), *start;
278 char *buf = NULL, delim;
Radek Krejci7a7fa902018-09-25 17:08:21 +0200279 size_t offset; /* read offset in input buffer */
Radek Krejcid70d1072018-10-09 14:20:47 +0200280 size_t len; /* length of the output string (write offset in output buffer) */
Radek Krejci7a7fa902018-09-25 17:08:21 +0200281 size_t size; /* size of the output buffer */
282 void *p;
Radek Krejci117d2082018-09-26 10:05:14 +0200283 uint32_t n;
284 size_t u, newlines;
Radek Krejci7a7fa902018-09-25 17:08:21 +0200285 bool empty_content = false;
Radek Krejci17a78d82019-05-15 15:49:55 +0200286 LY_ERR rc = LY_SUCCESS;
Radek Krejci7a7fa902018-09-25 17:08:21 +0200287
Radek Krejcib1890642018-10-03 14:05:40 +0200288 assert(context);
289 assert(context->status == LYXML_ELEM_CONTENT || context->status == LYXML_ATTR_CONTENT);
290
Radek Krejci7a7fa902018-09-25 17:08:21 +0200291 if (in[0] == '\'') {
292 delim = '\'';
293 ++in;
294 } else if (in[0] == '"') {
295 delim = '"';
296 ++in;
297 } else {
298 delim = '<';
299 empty_content = true;
300 }
Radek Krejcid70d1072018-10-09 14:20:47 +0200301 start = in;
Radek Krejci7a7fa902018-09-25 17:08:21 +0200302
303 if (empty_content) {
304 /* only when processing element's content - try to ignore whitespaces used to format XML data
305 * before element's child or closing tag */
Radek Krejci117d2082018-09-26 10:05:14 +0200306 for (offset = newlines = 0; in[offset] && is_xmlws(in[offset]); ++offset) {
307 if (in[offset] == '\n') {
308 ++newlines;
309 }
310 }
Radek Krejci7a7fa902018-09-25 17:08:21 +0200311 LY_CHECK_ERR_RET(!in[offset], LOGVAL(ctx, LY_VLOG_LINE, &context->line, LY_VCODE_EOF), LY_EVALID);
Radek Krejci117d2082018-09-26 10:05:14 +0200312 context->line += newlines;
Radek Krejci7a7fa902018-09-25 17:08:21 +0200313 if (in[offset] == '<') {
Radek Krejci339e2de2019-05-17 14:28:24 +0200314 const char *name, *prefix;
315 size_t name_len, prefix_len;
316
Radek Krejcied6c6ad2018-09-26 09:10:18 +0200317 (*input) = in + offset;
Radek Krejci339e2de2019-05-17 14:28:24 +0200318
319 /* get know if it is child element (indentation) or closing element (whitespace-only content) */
320 in = *input;
321 rc = lyxml_get_element(context, &in, &prefix, &prefix_len, &name, &name_len);
322 if (name) {
323 /* the element here is not closing element, so we have the just indentation formatting before the child */
324 free(context->elements.objs[--context->elements.count]);
325 context->status -= 1; /* LYXML_ELEMENT */
326 return LY_EINVAL;
327 } else if (rc) {
328 /* some parsing error, so pass it */
329 (*input) = in;
Radek Krejci8ced2f72019-05-20 12:33:49 +0200330 goto error;
Radek Krejci339e2de2019-05-17 14:28:24 +0200331 } else {
332 /* whitespace-only content */
Radek Krejcie92210c2019-05-17 15:53:35 +0200333 len = offset;
334 context->status++;
Radek Krejci339e2de2019-05-17 14:28:24 +0200335 goto success;
336 }
Radek Krejci7a7fa902018-09-25 17:08:21 +0200337 }
Radek Krejci7a7fa902018-09-25 17:08:21 +0200338 }
Radek Krejcid70d1072018-10-09 14:20:47 +0200339 /* init */
340 offset = len = 0;
Radek Krejci7a7fa902018-09-25 17:08:21 +0200341
Radek Krejcid70d1072018-10-09 14:20:47 +0200342 if (0) {
343getbuffer:
344 /* prepare output buffer */
345 if (*buffer) {
346 buf = *buffer;
347 size = *buffer_size;
348 } else {
349 buf = malloc(BUFSIZE);
350 size = BUFSIZE;
351 LY_CHECK_ERR_RET(!buf, LOGMEM(ctx), LY_EMEM);
352 }
Radek Krejci7a7fa902018-09-25 17:08:21 +0200353 }
Radek Krejci7a7fa902018-09-25 17:08:21 +0200354
355 /* parse */
356 while (in[offset]) {
357 if (in[offset] == '&') {
Radek Krejcid70d1072018-10-09 14:20:47 +0200358 if (!buf) {
359 /* it is necessary to modify the input, so we will need a dynamically allocated buffer */
360 goto getbuffer;
361 }
362
Radek Krejci7a7fa902018-09-25 17:08:21 +0200363 if (offset) {
364 /* store what we have so far */
365 BUFSIZE_CHECK(ctx, buf, size, len, offset);
366 memcpy(&buf[len], in, offset);
367 len += offset;
368 in += offset;
369 offset = 0;
370 }
371 /* process reference */
372 /* we will need 4 bytes at most since we support only the predefined
373 * (one-char) entities and character references */
374 BUFSIZE_CHECK(ctx, buf, size, len, 4);
375 ++offset;
376 if (in[offset] != '#') {
377 /* entity reference - only predefined references are supported */
378 if (!strncmp(&in[offset], "lt;", 3)) {
379 buf[len++] = '<';
380 in += 4; /* &lt; */
381 } else if (!strncmp(&in[offset], "gt;", 3)) {
382 buf[len++] = '>';
383 in += 4; /* &gt; */
384 } else if (!strncmp(&in[offset], "amp;", 4)) {
385 buf[len++] = '&';
386 in += 5; /* &amp; */
387 } else if (!strncmp(&in[offset], "apos;", 5)) {
388 buf[len++] = '\'';
389 in += 6; /* &apos; */
390 } else if (!strncmp(&in[offset], "quot;", 5)) {
391 buf[len++] = '\"';
392 in += 6; /* &quot; */
393 } else {
Radek Krejcied6c6ad2018-09-26 09:10:18 +0200394 LOGVAL(ctx, LY_VLOG_LINE, &context->line, LYVE_SYNTAX,
395 "Entity reference \"%.*s\" not supported, only predefined references allowed.", 10, &in[offset-1]);
Radek Krejci7a7fa902018-09-25 17:08:21 +0200396 goto error;
397 }
398 offset = 0;
399 } else {
400 p = (void*)&in[offset - 1];
401 /* character reference */
402 ++offset;
403 if (isdigit(in[offset])) {
404 for (n = 0; isdigit(in[offset]); offset++) {
405 n = (10 * n) + (in[offset] - '0');
406 }
407 } else if (in[offset] == 'x' && isxdigit(in[offset + 1])) {
408 for (n = 0, ++offset; isxdigit(in[offset]); offset++) {
409 if (isdigit(in[offset])) {
410 u = (in[offset] - '0');
411 } else if (in[offset] > 'F') {
412 u = 10 + (in[offset] - 'a');
413 } else {
414 u = 10 + (in[offset] - 'A');
415 }
416 n = (16 * n) + u;
417 }
418 } else {
Radek Krejcied6c6ad2018-09-26 09:10:18 +0200419 LOGVAL(ctx, LY_VLOG_LINE, &context->line, LYVE_SYNTAX, "Invalid character reference \"%.*s\".", 12, p);
Radek Krejci7a7fa902018-09-25 17:08:21 +0200420 goto error;
421
422 }
423 LY_CHECK_ERR_GOTO(in[offset] != ';',
424 LOGVAL(ctx, LY_VLOG_LINE, &context->line, LY_VCODE_INSTREXP,
425 LY_VCODE_INSTREXP_len(&in[offset]), &in[offset], ";"),
426 error);
427 ++offset;
428 rc = lyxml_pututf8(&buf[len], n, &u);
429 LY_CHECK_ERR_GOTO(rc, LOGVAL(ctx, LY_VLOG_LINE, &context->line, LYVE_SYNTAX,
Radek Krejci117d2082018-09-26 10:05:14 +0200430 "Invalid character reference \"%.*s\" (0x%08x).", 12, p, n),
Radek Krejci7a7fa902018-09-25 17:08:21 +0200431 error);
432 len += u;
433 in += offset;
434 offset = 0;
435 }
436 } else if (in[offset] == delim) {
437 /* end of string */
Radek Krejcid70d1072018-10-09 14:20:47 +0200438 if (buf) {
439 if (len + offset >= size) {
440 buf = ly_realloc(buf, len + offset + 1);
441 LY_CHECK_ERR_RET(!buf, LOGMEM(ctx), LY_EMEM);
442 size = len + offset + 1;
443 }
444 memcpy(&buf[len], in, offset);
Radek Krejci7a7fa902018-09-25 17:08:21 +0200445 }
Radek Krejci7a7fa902018-09-25 17:08:21 +0200446 len += offset;
447 /* in case of element content, keep the leading <,
Radek Krejcib1890642018-10-03 14:05:40 +0200448 * for attribute's value move after the terminating quotation mark */
449 if (context->status == LYXML_ELEM_CONTENT) {
Radek Krejci339e2de2019-05-17 14:28:24 +0200450 const char *name, *prefix;
451 size_t name_len, prefix_len;
452
Radek Krejci7a7fa902018-09-25 17:08:21 +0200453 in += offset;
Radek Krejci339e2de2019-05-17 14:28:24 +0200454
455 /* get know if it is child element (mixed content) or closing element (regular content) */
456 (*input) = in;
457 rc = lyxml_get_element(context, &in, &prefix, &prefix_len, &name, &name_len);
458 if (name) {
459 /* the element here is not closing element, so we have not allowed mixed content */
460 struct lyxml_elem *e = (struct lyxml_elem*)context->elements.objs[--context->elements.count];
461 LOGVAL(ctx, LY_VLOG_LINE, &context->line, LYVE_SYNTAX, "Mixed XML content is not allowed (%.*s).",
462 offset + (in - (*input)), &(*input)[-offset]);
463 free(e);
Radek Krejci8ced2f72019-05-20 12:33:49 +0200464 goto error;
Radek Krejci339e2de2019-05-17 14:28:24 +0200465 } else if (rc) {
Radek Krejci8ced2f72019-05-20 12:33:49 +0200466 /* some parsing error */
467 goto error;
Radek Krejci339e2de2019-05-17 14:28:24 +0200468 } else {
469 /* closing element, so we have regular content */
470 context->status++;
471 goto success;
472 }
Radek Krejci7a7fa902018-09-25 17:08:21 +0200473 } else {
474 in += offset + 1;
475 }
476 goto success;
477 } else {
478 /* log lines */
479 if (in[offset] == '\n') {
480 ++context->line;
481 }
482
483 /* continue */
484 ++offset;
485 }
486 }
487 LOGVAL(ctx, LY_VLOG_LINE, &context->line, LY_VCODE_EOF);
488error:
489 if (!(*buffer)) {
Radek Krejcibb9b1982019-04-08 14:24:59 +0200490 /* buffer not provided, buf is local */
Radek Krejci7a7fa902018-09-25 17:08:21 +0200491 free(buf);
Radek Krejcibb9b1982019-04-08 14:24:59 +0200492 } else if (buf) {
493 /* buf is shared with caller via buffer, but buf could be reallocated, so update the provided buffer */
494 (*buffer) = buf;
495 (*buffer_size) = size;
Radek Krejci7a7fa902018-09-25 17:08:21 +0200496 }
497 return LY_EVALID;
498
499success:
Radek Krejcid70d1072018-10-09 14:20:47 +0200500 if (buf) {
501 if (!(*buffer) && size != len + 1) {
502 /* not using provided buffer, so fit the allocated buffer to what we really have inside */
503 p = realloc(buf, len + 1);
504 /* ignore realloc fail because we are reducing the buffer,
505 * so just return bigger buffer than needed */
506 if (p) {
507 size = len + 1;
508 buf = p;
509 }
Radek Krejci7a7fa902018-09-25 17:08:21 +0200510 }
Radek Krejcid70d1072018-10-09 14:20:47 +0200511 /* set terminating NULL byte */
512 buf[len] = '\0';
Radek Krejci7a7fa902018-09-25 17:08:21 +0200513 }
Radek Krejci7a7fa902018-09-25 17:08:21 +0200514
Radek Krejcib1890642018-10-03 14:05:40 +0200515 context->status -= 1;
Radek Krejcid70d1072018-10-09 14:20:47 +0200516 if (buf) {
517 (*buffer) = buf;
518 (*buffer_size) = size;
519 (*output) = buf;
520 (*dynamic) = 1;
521 } else {
522 (*output) = (char*)start;
523 (*dynamic) = 0;
524 }
525 (*length) = len;
526
Radek Krejci28e8cb52019-03-08 11:31:31 +0100527 if (context->status == LYXML_ATTRIBUTE) {
528 if (in[0] == '>') {
529 /* element terminated by > - termination of the opening tag */
530 context->status = LYXML_ELEM_CONTENT;
531 ++in;
532 } else if (in[0] == '/' && in[1] == '>') {
533 /* element terminated by /> - termination of an empty element */
534 context->status = LYXML_ELEMENT;
535 in += 2;
536
537 /* remove the closed element record from the tags list */
538 free(context->elements.objs[context->elements.count - 1]);
539 --context->elements.count;
Radek Krejci17a78d82019-05-15 15:49:55 +0200540
541 /* remove also the namespaces conneted with the element */
Radek Krejci17dca992019-05-17 10:53:27 +0200542 lyxml_ns_rm(context);
Radek Krejci28e8cb52019-03-08 11:31:31 +0100543 }
544 }
545
546 (*input) = in;
Radek Krejci17a78d82019-05-15 15:49:55 +0200547 return rc;
Radek Krejci7a7fa902018-09-25 17:08:21 +0200548
549#undef BUFSIZE
550#undef BUFSIZE_STEP
551#undef BUFSIZE_CHECK
552}
553
Radek Krejcid972c252018-09-25 13:23:39 +0200554LY_ERR
Radek Krejci7a7fa902018-09-25 17:08:21 +0200555lyxml_get_attribute(struct lyxml_context *context, const char **input,
Radek Krejcid972c252018-09-25 13:23:39 +0200556 const char **prefix, size_t *prefix_len, const char **name, size_t *name_len)
557{
558 struct ly_ctx *ctx = context->ctx; /* shortcut */
559 const char *in = (*input);
560 const char *id;
561 const char *endtag;
562 LY_ERR rc;
563 unsigned int c;
564 size_t endtag_len;
Radek Krejci17a78d82019-05-15 15:49:55 +0200565 int is_ns = 0;
566 const char *ns_prefix = NULL;
567 size_t ns_prefix_len = 0;
Radek Krejcid972c252018-09-25 13:23:39 +0200568
Radek Krejci17a78d82019-05-15 15:49:55 +0200569start:
Radek Krejcid972c252018-09-25 13:23:39 +0200570 /* initialize output variables */
571 (*prefix) = (*name) = NULL;
572 (*prefix_len) = (*name_len) = 0;
573
574 /* skip initial whitespaces */
575 ign_xmlws(context, in);
576
577 if (in[0] == '\0') {
578 /* EOF - not expected at this place */
579 return LY_EINVAL;
Radek Krejcid972c252018-09-25 13:23:39 +0200580 }
581
582 /* remember the identifier start before checking its format */
583 id = in;
584 rc = lyxml_check_qname(context, &in, &c, &endtag_len);
585 LY_CHECK_RET(rc);
586 if (c == ':') {
587 /* we have prefixed identifier */
588 endtag = in - endtag_len;
589
590 rc = lyxml_check_qname(context, &in, &c, &endtag_len);
591 LY_CHECK_RET(rc);
592
593 (*prefix) = id;
594 (*prefix_len) = endtag - id;
595 id = endtag + 1;
596 }
597 if (!is_xmlws(c) && c != '=') {
598 in = in - endtag_len;
599 LOGVAL(ctx, LY_VLOG_LINE, &context->line, LY_VCODE_INSTREXP, LY_VCODE_INSTREXP_len(in), in, "whitespace or '='");
600 return LY_EVALID;
601 }
602 in = in - endtag_len;
603 (*name) = id;
604 (*name_len) = in - id;
605
606 /* eat '=' and stop at the value beginning */
607 ign_xmlws(context, in);
608 if (in[0] != '=') {
609 LOGVAL(ctx, LY_VLOG_LINE, &context->line, LY_VCODE_INSTREXP, LY_VCODE_INSTREXP_len(in), in, "'='");
610 return LY_EVALID;
611 }
612 ++in;
613 ign_xmlws(context, in);
614 if (in[0] != '\'' && in[0] != '"') {
Radek Krejcib1890642018-10-03 14:05:40 +0200615 LOGVAL(ctx, LY_VLOG_LINE, &context->line, LY_VCODE_INSTREXP,
616 LY_VCODE_INSTREXP_len(in), in, "either single or double quotation mark");
Radek Krejcid972c252018-09-25 13:23:39 +0200617 return LY_EVALID;
618 }
Radek Krejcib1890642018-10-03 14:05:40 +0200619 context->status = LYXML_ATTR_CONTENT;
Radek Krejcid972c252018-09-25 13:23:39 +0200620
Radek Krejci17a78d82019-05-15 15:49:55 +0200621 is_ns = 0;
622 if (*prefix && *prefix_len == 5 && !strncmp(*prefix, "xmlns", 5)) {
623 is_ns = 1;
624 ns_prefix = *name;
625 ns_prefix_len = *name_len;
626 } else if (*name_len == 5 && !strncmp(*name, "xmlns", 5)) {
627 is_ns = 1;
628 }
629 if (is_ns) {
630 /* instead of attribute, we have namespace specification,
631 * so process it automatically and then move to another attribute (if any) */
632 char *value = NULL;
633 size_t value_len = 0;
634 int dynamic = 0;
635
636 LY_CHECK_RET(lyxml_get_string(context, &in, &value, &value_len, &value, &value_len, &dynamic));
637 if ((rc = lyxml_ns_add(context, ns_prefix, ns_prefix_len, dynamic ? value : strndup(value, value_len)))) {
638 if (dynamic) {
639 free(value);
640 return rc;
641 }
642 }
643 if (context->status == LYXML_ATTRIBUTE) {
644 goto start;
645 } else {
646 (*prefix) = (*name) = NULL;
647 (*prefix_len) = (*name_len) = 0;
648 }
649 }
650
Radek Krejcid972c252018-09-25 13:23:39 +0200651 /* move caller's input */
652 (*input) = in;
653 return LY_SUCCESS;
654}
655
Radek Krejcid91dbaf2018-09-21 15:51:39 +0200656LY_ERR
Radek Krejci7a7fa902018-09-25 17:08:21 +0200657lyxml_get_element(struct lyxml_context *context, const char **input,
Radek Krejcid91dbaf2018-09-21 15:51:39 +0200658 const char **prefix, size_t *prefix_len, const char **name, size_t *name_len)
659{
660 struct ly_ctx *ctx = context->ctx; /* shortcut */
661 const char *in = (*input);
662 const char *endtag;
663 const char *sectname;
664 const char *id;
665 size_t endtag_len, newlines;
Radek Krejcib1890642018-10-03 14:05:40 +0200666 bool loop = true, closing = false;
Radek Krejcid91dbaf2018-09-21 15:51:39 +0200667 unsigned int c;
668 LY_ERR rc;
Radek Krejcib1890642018-10-03 14:05:40 +0200669 struct lyxml_elem *e;
Radek Krejcid91dbaf2018-09-21 15:51:39 +0200670
671 /* initialize output variables */
672 (*prefix) = (*name) = NULL;
673 (*prefix_len) = (*name_len) = 0;
674
675 while (loop) {
676 ign_xmlws(context, in);
677
678 if (in[0] == '\0') {
679 /* EOF */
Radek Krejcib1890642018-10-03 14:05:40 +0200680 context->status = LYXML_END;
Radek Krejcid91dbaf2018-09-21 15:51:39 +0200681 goto success;
682 } else if (in[0] != '<') {
683 return LY_EINVAL;
684 }
685 move_input(context, in, 1);
686
687 if (in[0] == '!') {
688 move_input(context, in, 1);
689 /* sections to ignore */
690 if (!strncmp(in, "--", 2)) {
691 /* comment */
692 move_input(context, in, 2);
693 sectname = "Comment";
694 endtag = "-->";
695 endtag_len = 3;
696 } else if (!strncmp(in, "[CDATA[", 7)) {
697 /* CDATA section */
698 move_input(context, in, 7);
699 sectname = "CData";
700 endtag = "]]>";
701 endtag_len = 3;
702 } else if (!strncmp(in, "DOCTYPE", 7)) {
703 /* Document type declaration - not supported */
704 LOGVAL(ctx, LY_VLOG_LINE, &context->line, LY_VCODE_NSUPP, "Document Type Declaration");
705 return LY_EVALID;
Radek Krejcic5c31bb2019-04-08 14:40:52 +0200706 } else {
707 LOGVAL(ctx, LY_VLOG_LINE, &context->line, LYVE_SYNTAX, "Unknown XML section \"%.20s\".", &in[-2]);
708 return LY_EVALID;
Radek Krejcid91dbaf2018-09-21 15:51:39 +0200709 }
710 in = ign_todelim(in, endtag, endtag_len, &newlines);
711 LY_CHECK_ERR_RET(!in, LOGVAL(ctx, LY_VLOG_LINE, &context->line, LY_VCODE_NTERM, sectname), LY_EVALID);
712 context->line += newlines;
713 in += endtag_len;
714 } else if (in[0] == '?') {
715 in = ign_todelim(in, "?>", 2, &newlines);
716 LY_CHECK_ERR_RET(!in, LOGVAL(ctx, LY_VLOG_LINE, &context->line, LY_VCODE_NTERM, "Declaration"), LY_EVALID);
717 context->line += newlines;
718 in += 2;
Radek Krejcib1890642018-10-03 14:05:40 +0200719 } else if (in[0] == '/') {
720 /* closing element */
721 closing = true;
722 ++in;
723 goto element;
Radek Krejcid91dbaf2018-09-21 15:51:39 +0200724 } else {
725 /* element */
Radek Krejcib1890642018-10-03 14:05:40 +0200726element:
Radek Krejcid91dbaf2018-09-21 15:51:39 +0200727 ign_xmlws(context, in);
728 LY_CHECK_ERR_RET(!in[0], LOGVAL(ctx, LY_VLOG_LINE, &context->line, LY_VCODE_EOF), LY_EVALID);
729
730 /* remember the identifier start before checking its format */
731 id = in;
732 rc = lyxml_check_qname(context, &in, &c, &endtag_len);
733 LY_CHECK_RET(rc);
734 if (c == ':') {
735 /* we have prefixed identifier */
736 endtag = in - endtag_len;
737
738 rc = lyxml_check_qname(context, &in, &c, &endtag_len);
739 LY_CHECK_RET(rc);
740
741 (*prefix) = id;
742 (*prefix_len) = endtag - id;
743 id = endtag + 1;
744 }
745 if (!is_xmlws(c) && c != '/' && c != '>') {
746 in = in - endtag_len;
Radek Krejcid972c252018-09-25 13:23:39 +0200747 LOGVAL(ctx, LY_VLOG_LINE, &context->line, LY_VCODE_INSTREXP, LY_VCODE_INSTREXP_len(in), in,
748 "whitespace or element tag termination ('>' or '/>'");
Radek Krejcid91dbaf2018-09-21 15:51:39 +0200749 return LY_EVALID;
750 }
Radek Krejcid91dbaf2018-09-21 15:51:39 +0200751 (*name) = id;
Radek Krejcib1890642018-10-03 14:05:40 +0200752 (*name_len) = in - endtag_len - id;
Radek Krejcid91dbaf2018-09-21 15:51:39 +0200753
Radek Krejcib1890642018-10-03 14:05:40 +0200754 if (is_xmlws(c)) {
755 /* go to the next meaningful input */
756 ign_xmlws(context, in);
757 LY_CHECK_ERR_RET(!in[0], LOGVAL(ctx, LY_VLOG_LINE, &context->line, LY_VCODE_EOF), LY_EVALID);
758 c = in[0];
759 ++in;
760 endtag_len = 1;
761 }
762
763 if (closing) {
764 /* match opening and closing element tags */
765 LY_CHECK_ERR_RET(
766 !context->elements.count,
Radek Krejci3fbc9872019-04-16 16:50:01 +0200767 LOGVAL(ctx, LY_VLOG_LINE, &context->line, LYVE_SYNTAX, "Opening and closing elements tag missmatch (\"%.*s\").", *name_len, *name),
Radek Krejcib1890642018-10-03 14:05:40 +0200768 LY_EVALID);
769 e = (struct lyxml_elem*)context->elements.objs[context->elements.count - 1];
770 LY_CHECK_ERR_RET(e->prefix_len != *prefix_len || e->name_len != *name_len
771 || (*prefix_len && strncmp(*prefix, e->prefix, e->prefix_len)) || strncmp(*name, e->name, e->name_len),
Radek Krejci3fbc9872019-04-16 16:50:01 +0200772 LOGVAL(ctx, LY_VLOG_LINE, &context->line, LYVE_SYNTAX, "Opening and closing elements tag missmatch (\"%.*s\").", *name_len, *name),
Radek Krejcib1890642018-10-03 14:05:40 +0200773 LY_EVALID);
774 /* opening and closing element tags matches, remove record from the opening tags list */
775 free(e);
776 --context->elements.count;
Radek Krejci17a78d82019-05-15 15:49:55 +0200777
778 /* remove also the namespaces conneted with the element */
Radek Krejci17dca992019-05-17 10:53:27 +0200779 lyxml_ns_rm(context);
Radek Krejci17a78d82019-05-15 15:49:55 +0200780
Radek Krejcib1890642018-10-03 14:05:40 +0200781 /* do not return element information to announce closing element being currently processed */
782 *name = *prefix = NULL;
783 *name_len = *prefix_len = 0;
784
785 if (c == '>') {
786 /* end of closing element */
787 context->status = LYXML_ELEMENT;
788 } else {
789 in -= endtag_len;
790 LOGVAL(ctx, LY_VLOG_LINE, &context->line, LYVE_SYNTAX, "Unexpected data \"%.*s\" in closing element tag.",
791 LY_VCODE_INSTREXP_len(in), in);
792 return LY_EVALID;
793 }
794 } else {
795 if (c == '>') {
796 /* end of opening element */
797 context->status = LYXML_ELEM_CONTENT;
798 } else if (c == '/' && in[0] == '>') {
799 /* empty element closing */
800 context->status = LYXML_ELEMENT;
801 ++in;
802 } else {
803 /* attribute */
804 context->status = LYXML_ATTRIBUTE;
805 in -= endtag_len;
806 }
807
808 if (context->status != LYXML_ELEMENT) {
809 /* store element opening tag information */
810 e = malloc(sizeof *e);
811 LY_CHECK_ERR_RET(!e, LOGMEM(ctx), LY_EMEM);
812 e->name = *name;
813 e->prefix = *prefix;
814 e->name_len = *name_len;
815 e->prefix_len = *prefix_len;
816 ly_set_add(&context->elements, e, LY_SET_OPT_USEASLIST);
817 }
818 }
Radek Krejcid91dbaf2018-09-21 15:51:39 +0200819 loop = false;
820 }
821 }
822
823success:
824 /* move caller's input */
825 (*input) = in;
826 return LY_SUCCESS;
827}
828
Radek Krejcib1890642018-10-03 14:05:40 +0200829
830void
831lyxml_context_clear(struct lyxml_context *context)
832{
833 unsigned int u;
834
835 ly_set_erase(&context->elements, free);
836 for (u = context->ns.count - 1; u + 1 > 0; --u) {
837 /* remove the ns structure */
838 free(((struct lyxml_ns *)context->ns.objs[u])->prefix);
839 free(((struct lyxml_ns *)context->ns.objs[u])->uri);
840 free(context->ns.objs[u]);
841 }
842 ly_set_erase(&context->ns, NULL);
843}
Radek Krejcie7b95092019-05-15 11:03:07 +0200844
845LY_ERR
846lyxml_dump_text(struct lyout *out, const char *text, int attribute)
847{
848 LY_ERR ret = LY_SUCCESS;
849 unsigned int u;
850
851 if (!text) {
852 return 0;
853 }
854
855 for (u = 0; text[u]; u++) {
856 switch (text[u]) {
857 case '&':
858 ret = ly_print(out, "&amp;");
859 break;
860 case '<':
861 ret = ly_print(out, "&lt;");
862 break;
863 case '>':
864 /* not needed, just for readability */
865 ret = ly_print(out, "&gt;");
866 break;
867 case '"':
868 if (attribute) {
869 ret = ly_print(out, "&quot;");
870 break;
871 }
872 /* falls through */
873 default:
874 ly_write(out, &text[u], 1);
875 }
876 }
877
878 return ret;
879}
880