/**
 * @file xml.c
 * @author Radek Krejci <rkrejci@cesnet.cz>
 * @brief Generic XML parser implementation for libyang
 *
 * Copyright (c) 2015 - 2018 CESNET, z.s.p.o.
 *
 * This source code is licensed under BSD 3-Clause License (the "License").
 * You may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     https://opensource.org/licenses/BSD-3-Clause
 */
14
15#include <stdbool.h>
16#include <stdint.h>
17
18#include "libyang.h"
19#include "xml.h"
20#include "common.h"
21
/*
 * Macro to test if character is an XML whitespace (space, tab, line feed or carriage return).
 * The argument is evaluated several times, so it must not have side effects.
 */
#define is_xmlws(c) ((c) == 0x20 || (c) == 0x9 || (c) == 0xa || (c) == 0xd)
24
/*
 * Macro to test if character (Unicode code point) is allowed to be the first character of a qualified
 * identifier, i.e. XML NameStartChar without the colon.
 * The argument is evaluated several times, so it must not have side effects.
 */
#define is_xmlqnamestartchar(c) (((c) >= 'a' && (c) <= 'z') || (c) == '_' || \
        ((c) >= 'A' && (c) <= 'Z') || /* (c) == ':' || */ \
        ((c) >= 0x370 && (c) <= 0x1fff && (c) != 0x37e) || \
        ((c) >= 0xc0 && (c) <= 0x2ff && (c) != 0xd7 && (c) != 0xf7) || (c) == 0x200c || \
        (c) == 0x200d || ((c) >= 0x2070 && (c) <= 0x218f) || \
        ((c) >= 0x2c00 && (c) <= 0x2fef) || ((c) >= 0x3001 && (c) <= 0xd7ff) || \
        ((c) >= 0xf900 && (c) <= 0xfdcf) || ((c) >= 0xfdf0 && (c) <= 0xfffd) || \
        ((c) >= 0x10000 && (c) <= 0xeffff))
34
/*
 * Macro to test if character (Unicode code point) is allowed to be used in a qualified identifier,
 * i.e. XML NameChar without the colon.
 * The argument is evaluated several times, so it must not have side effects.
 */
#define is_xmlqnamechar(c) (((c) >= 'a' && (c) <= 'z') || (c) == '_' || (c) == '-' || \
        ((c) >= 'A' && (c) <= 'Z') || ((c) >= '0' && (c) <= '9') || /* (c) == ':' || */ \
        (c) == '.' || (c) == 0xb7 || ((c) >= 0x370 && (c) <= 0x1fff && (c) != 0x37e) || \
        ((c) >= 0xc0 && (c) <= 0x2ff && (c) != 0xd7 && (c) != 0xf7) || (c) == 0x200c || \
        (c) == 0x200d || ((c) >= 0x300 && (c) <= 0x36f) || \
        ((c) >= 0x2070 && (c) <= 0x218f) || ((c) >= 0x203f && (c) <= 0x2040) || \
        ((c) >= 0x2c00 && (c) <= 0x2fef) || ((c) >= 0x3001 && (c) <= 0xd7ff) || \
        ((c) >= 0xf900 && (c) <= 0xfdcf) || ((c) >= 0xfdf0 && (c) <= 0xfffd) || \
        ((c) >= 0x10000 && (c) <= 0xeffff))
45
/*
 * Move input p by s characters. If the new position holds the terminating NUL byte (EOF), the condition
 * and the LY_VCODE_EOF log call (using the ctx and line members of lyxml_context c) are handed to
 * LY_CHECK_ERR_RET together with LY_EVALID - presumably logging the error and returning LY_EVALID from
 * the calling function (confirm against the LY_CHECK_ERR_RET definition).
 *
 * Wrapped in do/while (0) so the macro behaves as a single statement; p is evaluated more than once.
 */
#define move_input(c,p,s) \
    do { \
        (p) += (s); \
        LY_CHECK_ERR_RET(!(p)[0], LOGVAL((c)->ctx, LY_VLOG_LINE, &(c)->line, LY_VCODE_EOF), LY_EVALID); \
    } while (0)
48
/*
 * Skip whitespaces in the input string p. The line counter of lyxml_context c is incremented for every
 * skipped newline. Nothing is logged here; the skipping simply stops at the first non-whitespace
 * character (including the terminating NUL byte).
 *
 * Wrapped in do/while (0) so the macro behaves as a single statement; p is evaluated more than once.
 */
#define ign_xmlws(c,p) \
    do { \
        while (is_xmlws(*(p))) { \
            if (*(p) == '\n') { \
                ++(c)->line; \
            } \
            ++(p); \
        } \
    } while (0)
51
/**
 * @brief Find the first occurrence of a delimiter in the input string.
 *
 * @param[in] input NUL-terminated string to search in.
 * @param[in] delim Delimiter to search for.
 * @param[in] delim_len Number of bytes of @p delim to compare.
 * @param[out] newlines Number of newline characters passed while searching. A newline is counted only
 * at positions whose character differs from the first character of @p delim.
 * @return Pointer to the beginning of the delimiter in @p input, NULL if the delimiter was not found.
 */
static const char *
ign_todelim(register const char *input, const char *delim, size_t delim_len, size_t *newlines)
{
    const char *pos = input;
    size_t matched;

    *newlines = 0;
    while (*pos != '\0') {
        if (*pos == *delim) {
            /* possible beginning of the delimiter, compare it byte by byte */
            matched = 0;
            while (matched < delim_len && pos[matched] == delim[matched]) {
                ++matched;
            }
            if (matched == delim_len) {
                return pos;
            }
        } else if (*pos == '\n') {
            ++(*newlines);
        }
        ++pos;
    }

    return NULL;
}
79
80static LY_ERR
81lyxml_getutf8(const char **input, unsigned int *utf8_char, size_t *bytes_read)
82{
83 unsigned int c, len;
84 int aux;
85 int i;
86
87 c = (*input)[0];
88 LY_CHECK_RET(!c, LY_EINVAL);
89
90 /* process character byte(s) */
91 if ((c & 0xf8) == 0xf0) {
92 /* four bytes character */
93 len = 4;
94
95 c &= 0x07;
96 for (i = 1; i <= 3; i++) {
97 aux = (*input)[i];
98 if ((aux & 0xc0) != 0x80) {
99 return LY_EINVAL;
100 }
101
102 c = (c << 6) | (aux & 0x3f);
103 }
104
105 if (c < 0x1000 || c > 0x10ffff) {
106 return LY_EINVAL;
107 }
108 } else if ((c & 0xf0) == 0xe0) {
109 /* three bytes character */
110 len = 3;
111
112 c &= 0x0f;
113 for (i = 1; i <= 2; i++) {
114 aux = (*input)[i];
115 if ((aux & 0xc0) != 0x80) {
116 return LY_EINVAL;
117 }
118
119 c = (c << 6) | (aux & 0x3f);
120 }
121
122 if (c < 0x800 || (c > 0xd7ff && c < 0xe000) || c > 0xfffd) {
123 return LY_EINVAL;
124 }
125 } else if ((c & 0xe0) == 0xc0) {
126 /* two bytes character */
127 len = 2;
128
129 aux = (*input)[1];
130 if ((aux & 0xc0) != 0x80) {
131 return LY_EINVAL;
132 }
133 c = ((c & 0x1f) << 6) | (aux & 0x3f);
134
135 if (c < 0x80) {
136 return LY_EINVAL;
137 }
138 } else if (!(c & 0x80)) {
139 /* one byte character */
140 len = 1;
141
142 if (c < 0x20 && c != 0x9 && c != 0xa && c != 0xd) {
143 return LY_EINVAL;
144 }
145 } else {
146 return LY_EINVAL;
147 }
148
149 (*utf8_char) = c;
150 (*input) += len;
151 if (bytes_read) {
152 (*bytes_read) = len;
153 }
154 return LY_SUCCESS;
155}
156
157LY_ERR
158lyxml_check_qname(struct lyxml_context *context, const char **input, unsigned int *term_char, size_t *term_char_len)
159{
160 unsigned int c;
161 const char *id = (*input);
162 LY_ERR rc;
163
164 /* check NameStartChar (minus colon) */
165 LY_CHECK_ERR_RET(lyxml_getutf8(input, &c, NULL) != LY_SUCCESS,
166 LOGVAL(context->ctx, LY_VLOG_LINE, &context->line, LY_VCODE_INCHAR, (*input)[0]), LY_EVALID);
167 LY_CHECK_ERR_RET(!is_xmlqnamestartchar(c),
168 LOGVAL(context->ctx, LY_VLOG_LINE, &context->line, LYVE_SYNTAX,
169 "Identifier \"%s\" starts with invalid character.", id),
170 LY_EVALID);
171
172 /* check rest of the identifier */
173 for (rc = lyxml_getutf8(input, &c, term_char_len);
174 rc == LY_SUCCESS && is_xmlqnamechar(c);
175 rc = lyxml_getutf8(input, &c, term_char_len));
176 LY_CHECK_ERR_RET(rc != LY_SUCCESS, LOGVAL(context->ctx, LY_VLOG_LINE, &context->line, LY_VCODE_INCHAR, (*input)[0]), LY_EVALID);
177
178 (*term_char) = c;
179 return LY_SUCCESS;
180}
181
Radek Krejcid972c252018-09-25 13:23:39 +0200182/**
183 * @brief Parse input expecting an XML attribute (including XML namespace).
184 *
185 * Input string is not being modified, so the returned values are not NULL-terminated, instead their length
186 * is returned.
187 *
188 * In case of a namespace definition, prefix just contains xmlns string. In case of the default namespace,
189 * prefix is NULL and the attribute name is xmlns.
190 *
191 * @param[in] context XML context to track lines or store errors into libyang context.
192 * @param[in,out] input Input string to process, updated according to the processed/read data so,
193 * when succeeded, it points to the opening quote of the attribute's value..
194 * @param[in] options Currently unused options to modify input processing.
195 * @param[out] prefix Pointer to prefix if present in the attribute name, NULL otherwise.
196 * @param[out] prefix_len Length of the prefix if any.
197 * @param[out] name Attribute name. LY_SUCCESS can be returned with NULL name only in case the
198 * end of the element tag was reached.
199 * @param[out] name_len Length of the element name.
200 * @return LY_ERR values.
201 */
202LY_ERR
203lyxml_get_attribute(struct lyxml_context *context, const char **input, int UNUSED(options),
204 const char **prefix, size_t *prefix_len, const char **name, size_t *name_len)
205{
206 struct ly_ctx *ctx = context->ctx; /* shortcut */
207 const char *in = (*input);
208 const char *id;
209 const char *endtag;
210 LY_ERR rc;
211 unsigned int c;
212 size_t endtag_len;
213
214 /* initialize output variables */
215 (*prefix) = (*name) = NULL;
216 (*prefix_len) = (*name_len) = 0;
217
218 /* skip initial whitespaces */
219 ign_xmlws(context, in);
220
221 if (in[0] == '\0') {
222 /* EOF - not expected at this place */
223 return LY_EINVAL;
224 } else if (in[0] == '>' || in[0] == '/') {
225 /* element terminated by > or /> */
226 goto success;
227 }
228
229 /* remember the identifier start before checking its format */
230 id = in;
231 rc = lyxml_check_qname(context, &in, &c, &endtag_len);
232 LY_CHECK_RET(rc);
233 if (c == ':') {
234 /* we have prefixed identifier */
235 endtag = in - endtag_len;
236
237 rc = lyxml_check_qname(context, &in, &c, &endtag_len);
238 LY_CHECK_RET(rc);
239
240 (*prefix) = id;
241 (*prefix_len) = endtag - id;
242 id = endtag + 1;
243 }
244 if (!is_xmlws(c) && c != '=') {
245 in = in - endtag_len;
246 LOGVAL(ctx, LY_VLOG_LINE, &context->line, LY_VCODE_INSTREXP, LY_VCODE_INSTREXP_len(in), in, "whitespace or '='");
247 return LY_EVALID;
248 }
249 in = in - endtag_len;
250 (*name) = id;
251 (*name_len) = in - id;
252
253 /* eat '=' and stop at the value beginning */
254 ign_xmlws(context, in);
255 if (in[0] != '=') {
256 LOGVAL(ctx, LY_VLOG_LINE, &context->line, LY_VCODE_INSTREXP, LY_VCODE_INSTREXP_len(in), in, "'='");
257 return LY_EVALID;
258 }
259 ++in;
260 ign_xmlws(context, in);
261 if (in[0] != '\'' && in[0] != '"') {
262 LOGVAL(ctx, LY_VLOG_LINE, &context->line, LY_VCODE_INSTREXP, LY_VCODE_INSTREXP_len(in), in, "either single or double quotation mark");
263 return LY_EVALID;
264 }
265
266success:
267 /* move caller's input */
268 (*input) = in;
269 return LY_SUCCESS;
270}
271
272/**
273 * @brief Parse input expecting an XML element.
274 *
275 * Able to silently skip comments, PIs and CData. DOCTYPE is not parsable, so it is reported as LY_EVALID error.
276 * If '<' is not found in input, LY_EINVAL is returned (but no error is logged), so it is possible to continue
277 * with parsing input as text content.
278 *
279 * Input string is not being modified, so the returned values are not NULL-terminated, instead their length
280 * is returned.
281 *
282 * @param[in] context XML context to track lines or store errors into libyang context.
283 * @param[in,out] input Input string to process, updated according to the processed/read data.
284 * @param[in] options Currently unused options to modify input processing.
285 * @param[out] prefix Pointer to prefix if present in the element name, NULL otherwise.
286 * @param[out] prefix_len Length of the prefix if any.
287 * @param[out] name Element name. LY_SUCCESS can be returned with NULL name only in case the
288 * end of the input string was reached (EOF).
289 * @param[out] name_len Length of the element name.
290 * @return LY_ERR values.
291 */
Radek Krejcid91dbaf2018-09-21 15:51:39 +0200292LY_ERR
293lyxml_get_element(struct lyxml_context *context, const char **input, int UNUSED(options),
294 const char **prefix, size_t *prefix_len, const char **name, size_t *name_len)
295{
296 struct ly_ctx *ctx = context->ctx; /* shortcut */
297 const char *in = (*input);
298 const char *endtag;
299 const char *sectname;
300 const char *id;
301 size_t endtag_len, newlines;
302 bool loop = true;
303 unsigned int c;
304 LY_ERR rc;
Radek Krejcid91dbaf2018-09-21 15:51:39 +0200305
306 /* initialize output variables */
307 (*prefix) = (*name) = NULL;
308 (*prefix_len) = (*name_len) = 0;
309
310 while (loop) {
311 ign_xmlws(context, in);
312
313 if (in[0] == '\0') {
314 /* EOF */
315 goto success;
316 } else if (in[0] != '<') {
317 return LY_EINVAL;
318 }
319 move_input(context, in, 1);
320
321 if (in[0] == '!') {
322 move_input(context, in, 1);
323 /* sections to ignore */
324 if (!strncmp(in, "--", 2)) {
325 /* comment */
326 move_input(context, in, 2);
327 sectname = "Comment";
328 endtag = "-->";
329 endtag_len = 3;
330 } else if (!strncmp(in, "[CDATA[", 7)) {
331 /* CDATA section */
332 move_input(context, in, 7);
333 sectname = "CData";
334 endtag = "]]>";
335 endtag_len = 3;
336 } else if (!strncmp(in, "DOCTYPE", 7)) {
337 /* Document type declaration - not supported */
338 LOGVAL(ctx, LY_VLOG_LINE, &context->line, LY_VCODE_NSUPP, "Document Type Declaration");
339 return LY_EVALID;
340 }
341 in = ign_todelim(in, endtag, endtag_len, &newlines);
342 LY_CHECK_ERR_RET(!in, LOGVAL(ctx, LY_VLOG_LINE, &context->line, LY_VCODE_NTERM, sectname), LY_EVALID);
343 context->line += newlines;
344 in += endtag_len;
345 } else if (in[0] == '?') {
346 in = ign_todelim(in, "?>", 2, &newlines);
347 LY_CHECK_ERR_RET(!in, LOGVAL(ctx, LY_VLOG_LINE, &context->line, LY_VCODE_NTERM, "Declaration"), LY_EVALID);
348 context->line += newlines;
349 in += 2;
350 } else {
351 /* element */
352 ign_xmlws(context, in);
353 LY_CHECK_ERR_RET(!in[0], LOGVAL(ctx, LY_VLOG_LINE, &context->line, LY_VCODE_EOF), LY_EVALID);
354
355 /* remember the identifier start before checking its format */
356 id = in;
357 rc = lyxml_check_qname(context, &in, &c, &endtag_len);
358 LY_CHECK_RET(rc);
359 if (c == ':') {
360 /* we have prefixed identifier */
361 endtag = in - endtag_len;
362
363 rc = lyxml_check_qname(context, &in, &c, &endtag_len);
364 LY_CHECK_RET(rc);
365
366 (*prefix) = id;
367 (*prefix_len) = endtag - id;
368 id = endtag + 1;
369 }
370 if (!is_xmlws(c) && c != '/' && c != '>') {
371 in = in - endtag_len;
Radek Krejcid972c252018-09-25 13:23:39 +0200372 LOGVAL(ctx, LY_VLOG_LINE, &context->line, LY_VCODE_INSTREXP, LY_VCODE_INSTREXP_len(in), in,
373 "whitespace or element tag termination ('>' or '/>'");
Radek Krejcid91dbaf2018-09-21 15:51:39 +0200374 return LY_EVALID;
375 }
376 in = in - endtag_len;
377 (*name) = id;
378 (*name_len) = in - id;
379
380 loop = false;
381 }
382 }
383
384success:
385 /* move caller's input */
386 (*input) = in;
387 return LY_SUCCESS;
388}
389