Radek Krejci | d91dbaf | 2018-09-21 15:51:39 +0200 | [diff] [blame] | 1 | /** |
| 2 | * @file xml.c |
| 3 | * @author Radek Krejci <rkrejci@cesnet.cz> |
| 4 | * @brief Generic XML parser implementation for libyang |
| 5 | * |
| 6 | * Copyright (c) 2015 - 2018 CESNET, z.s.p.o. |
| 7 | * |
| 8 | * This source code is licensed under BSD 3-Clause License (the "License"). |
| 9 | * You may not use this file except in compliance with the License. |
| 10 | * You may obtain a copy of the License at |
| 11 | * |
| 12 | * https://opensource.org/licenses/BSD-3-Clause |
| 13 | */ |
| 14 | |
| 15 | #include <stdbool.h> |
| 16 | #include <stdint.h> |
| 17 | |
| 18 | #include "libyang.h" |
| 19 | #include "xml.h" |
| 20 | #include "common.h" |
| 21 | |
/*
 * Macro to test if character is an XML whitespace (space, tab, line feed or carriage return).
 * The argument is evaluated several times, so it must not have side effects.
 */
#define is_xmlws(c) ((c) == 0x20 || (c) == 0x9 || (c) == 0xa || (c) == 0xd)
| 24 | |
/*
 * Macro to test if character is allowed to be the first character of a qualified identifier.
 * The colon is intentionally not accepted here, prefix and local name are checked separately.
 * The argument is evaluated several times, so it must not have side effects.
 */
#define is_xmlqnamestartchar(c) (((c) >= 'a' && (c) <= 'z') || (c) == '_' || \
        ((c) >= 'A' && (c) <= 'Z') || /* (c) == ':' || */ \
        ((c) >= 0x370 && (c) <= 0x1fff && (c) != 0x37e) || \
        ((c) >= 0xc0 && (c) <= 0x2ff && (c) != 0xd7 && (c) != 0xf7) || (c) == 0x200c || \
        (c) == 0x200d || ((c) >= 0x2070 && (c) <= 0x218f) || \
        ((c) >= 0x2c00 && (c) <= 0x2fef) || ((c) >= 0x3001 && (c) <= 0xd7ff) || \
        ((c) >= 0xf900 && (c) <= 0xfdcf) || ((c) >= 0xfdf0 && (c) <= 0xfffd) || \
        ((c) >= 0x10000 && (c) <= 0xeffff))
| 34 | |
/*
 * Macro to test if character is allowed to be used in a qualified identifier (on other than the first position).
 * The colon is intentionally not accepted here, prefix and local name are checked separately.
 * The argument is evaluated several times, so it must not have side effects.
 */
#define is_xmlqnamechar(c) (((c) >= 'a' && (c) <= 'z') || (c) == '_' || (c) == '-' || \
        ((c) >= 'A' && (c) <= 'Z') || ((c) >= '0' && (c) <= '9') || /* (c) == ':' || */ \
        (c) == '.' || (c) == 0xb7 || ((c) >= 0x370 && (c) <= 0x1fff && (c) != 0x37e) || \
        ((c) >= 0xc0 && (c) <= 0x2ff && (c) != 0xd7 && (c) != 0xf7) || (c) == 0x200c || \
        (c) == 0x200d || ((c) >= 0x300 && (c) <= 0x36f) || \
        ((c) >= 0x2070 && (c) <= 0x218f) || ((c) >= 0x203f && (c) <= 0x2040) || \
        ((c) >= 0x2c00 && (c) <= 0x2fef) || ((c) >= 0x3001 && (c) <= 0xd7ff) || \
        ((c) >= 0xf900 && (c) <= 0xfdcf) || ((c) >= 0xfdf0 && (c) <= 0xfffd) || \
        ((c) >= 0x10000 && (c) <= 0xeffff))
| 45 | |
/*
 * Move input p by s characters. If the new position holds the terminating NUL byte (EOF), the EOF
 * error is logged with lyxml_context c and LY_EVALID is handed to LY_CHECK_ERR_RET (which is expected
 * to return it from the calling function).
 * Wrapped in do/while so the macro behaves as a single statement even in an unbraced if/else.
 */
#define move_input(c,p,s) do { \
        (p) += (s); \
        LY_CHECK_ERR_RET(!(p)[0], LOGVAL((c)->ctx, LY_VLOG_LINE, &(c)->line, LY_VCODE_EOF), LY_EVALID); \
    } while (0)
| 48 | |
/*
 * Skip XML whitespaces in the input string p and increment the line counter of lyxml_context c
 * for every skipped '\n'. Stops at the first non-whitespace character, which may be the terminating
 * NUL byte - EOF is neither detected nor logged here, the caller has to check for it.
 * Wrapped in do/while so the macro behaves as a single statement even in an unbraced if/else.
 */
#define ign_xmlws(c,p) do { \
        while (is_xmlws(*(p))) { \
            if (*(p) == '\n') { \
                ++(c)->line; \
            } \
            ++(p); \
        } \
    } while (0)
| 51 | |
/**
 * @brief Search the input for the first occurrence of the given delimiter.
 *
 * Newlines are counted only on positions where the character differs from the first character
 * of the delimiter, i.e. in the skipped text in front of the returned position.
 *
 * @param[in] input NUL-terminated string to search in.
 * @param[in] delim Delimiter to search for.
 * @param[in] delim_len Number of bytes of @p delim to compare.
 * @param[out] newlines Number of '\n' characters counted while searching, always reset to 0 first.
 * @return Pointer to the beginning of the delimiter inside @p input, NULL if the end of input
 * was reached without finding it.
 */
static const char *
ign_todelim(register const char *input, const char *delim, size_t delim_len, size_t *newlines)
{
    const char *pos;
    size_t matched;

    *newlines = 0;

    for (pos = input; *pos != '\0'; ++pos) {
        if (*pos == *delim) {
            /* candidate position, compare the whole delimiter */
            matched = 0;
            while (matched < delim_len && pos[matched] == delim[matched]) {
                ++matched;
            }
            if (matched == delim_len) {
                return pos;
            }
        } else if (*pos == '\n') {
            ++(*newlines);
        }
    }

    /* end of input reached, delimiter not present */
    return NULL;
}
| 79 | |
| 80 | static LY_ERR |
| 81 | lyxml_getutf8(const char **input, unsigned int *utf8_char, size_t *bytes_read) |
| 82 | { |
| 83 | unsigned int c, len; |
| 84 | int aux; |
| 85 | int i; |
| 86 | |
| 87 | c = (*input)[0]; |
| 88 | LY_CHECK_RET(!c, LY_EINVAL); |
| 89 | |
| 90 | /* process character byte(s) */ |
| 91 | if ((c & 0xf8) == 0xf0) { |
| 92 | /* four bytes character */ |
| 93 | len = 4; |
| 94 | |
| 95 | c &= 0x07; |
| 96 | for (i = 1; i <= 3; i++) { |
| 97 | aux = (*input)[i]; |
| 98 | if ((aux & 0xc0) != 0x80) { |
| 99 | return LY_EINVAL; |
| 100 | } |
| 101 | |
| 102 | c = (c << 6) | (aux & 0x3f); |
| 103 | } |
| 104 | |
| 105 | if (c < 0x1000 || c > 0x10ffff) { |
| 106 | return LY_EINVAL; |
| 107 | } |
| 108 | } else if ((c & 0xf0) == 0xe0) { |
| 109 | /* three bytes character */ |
| 110 | len = 3; |
| 111 | |
| 112 | c &= 0x0f; |
| 113 | for (i = 1; i <= 2; i++) { |
| 114 | aux = (*input)[i]; |
| 115 | if ((aux & 0xc0) != 0x80) { |
| 116 | return LY_EINVAL; |
| 117 | } |
| 118 | |
| 119 | c = (c << 6) | (aux & 0x3f); |
| 120 | } |
| 121 | |
| 122 | if (c < 0x800 || (c > 0xd7ff && c < 0xe000) || c > 0xfffd) { |
| 123 | return LY_EINVAL; |
| 124 | } |
| 125 | } else if ((c & 0xe0) == 0xc0) { |
| 126 | /* two bytes character */ |
| 127 | len = 2; |
| 128 | |
| 129 | aux = (*input)[1]; |
| 130 | if ((aux & 0xc0) != 0x80) { |
| 131 | return LY_EINVAL; |
| 132 | } |
| 133 | c = ((c & 0x1f) << 6) | (aux & 0x3f); |
| 134 | |
| 135 | if (c < 0x80) { |
| 136 | return LY_EINVAL; |
| 137 | } |
| 138 | } else if (!(c & 0x80)) { |
| 139 | /* one byte character */ |
| 140 | len = 1; |
| 141 | |
| 142 | if (c < 0x20 && c != 0x9 && c != 0xa && c != 0xd) { |
| 143 | return LY_EINVAL; |
| 144 | } |
| 145 | } else { |
| 146 | return LY_EINVAL; |
| 147 | } |
| 148 | |
| 149 | (*utf8_char) = c; |
| 150 | (*input) += len; |
| 151 | if (bytes_read) { |
| 152 | (*bytes_read) = len; |
| 153 | } |
| 154 | return LY_SUCCESS; |
| 155 | } |
| 156 | |
| 157 | LY_ERR |
| 158 | lyxml_check_qname(struct lyxml_context *context, const char **input, unsigned int *term_char, size_t *term_char_len) |
| 159 | { |
| 160 | unsigned int c; |
| 161 | const char *id = (*input); |
| 162 | LY_ERR rc; |
| 163 | |
| 164 | /* check NameStartChar (minus colon) */ |
| 165 | LY_CHECK_ERR_RET(lyxml_getutf8(input, &c, NULL) != LY_SUCCESS, |
| 166 | LOGVAL(context->ctx, LY_VLOG_LINE, &context->line, LY_VCODE_INCHAR, (*input)[0]), LY_EVALID); |
| 167 | LY_CHECK_ERR_RET(!is_xmlqnamestartchar(c), |
| 168 | LOGVAL(context->ctx, LY_VLOG_LINE, &context->line, LYVE_SYNTAX, |
| 169 | "Identifier \"%s\" starts with invalid character.", id), |
| 170 | LY_EVALID); |
| 171 | |
| 172 | /* check rest of the identifier */ |
| 173 | for (rc = lyxml_getutf8(input, &c, term_char_len); |
| 174 | rc == LY_SUCCESS && is_xmlqnamechar(c); |
| 175 | rc = lyxml_getutf8(input, &c, term_char_len)); |
| 176 | LY_CHECK_ERR_RET(rc != LY_SUCCESS, LOGVAL(context->ctx, LY_VLOG_LINE, &context->line, LY_VCODE_INCHAR, (*input)[0]), LY_EVALID); |
| 177 | |
| 178 | (*term_char) = c; |
| 179 | return LY_SUCCESS; |
| 180 | } |
| 181 | |
Radek Krejci | d972c25 | 2018-09-25 13:23:39 +0200 | [diff] [blame^] | 182 | /** |
| 183 | * @brief Parse input expecting an XML attribute (including XML namespace). |
| 184 | * |
| 185 | * Input string is not being modified, so the returned values are not NULL-terminated, instead their length |
| 186 | * is returned. |
| 187 | * |
| 188 | * In case of a namespace definition, prefix just contains xmlns string. In case of the default namespace, |
| 189 | * prefix is NULL and the attribute name is xmlns. |
| 190 | * |
| 191 | * @param[in] context XML context to track lines or store errors into libyang context. |
| 192 | * @param[in,out] input Input string to process, updated according to the processed/read data so, |
| 193 | * when succeeded, it points to the opening quote of the attribute's value.. |
| 194 | * @param[in] options Currently unused options to modify input processing. |
| 195 | * @param[out] prefix Pointer to prefix if present in the attribute name, NULL otherwise. |
| 196 | * @param[out] prefix_len Length of the prefix if any. |
| 197 | * @param[out] name Attribute name. LY_SUCCESS can be returned with NULL name only in case the |
| 198 | * end of the element tag was reached. |
| 199 | * @param[out] name_len Length of the element name. |
| 200 | * @return LY_ERR values. |
| 201 | */ |
| 202 | LY_ERR |
| 203 | lyxml_get_attribute(struct lyxml_context *context, const char **input, int UNUSED(options), |
| 204 | const char **prefix, size_t *prefix_len, const char **name, size_t *name_len) |
| 205 | { |
| 206 | struct ly_ctx *ctx = context->ctx; /* shortcut */ |
| 207 | const char *in = (*input); |
| 208 | const char *id; |
| 209 | const char *endtag; |
| 210 | LY_ERR rc; |
| 211 | unsigned int c; |
| 212 | size_t endtag_len; |
| 213 | |
| 214 | /* initialize output variables */ |
| 215 | (*prefix) = (*name) = NULL; |
| 216 | (*prefix_len) = (*name_len) = 0; |
| 217 | |
| 218 | /* skip initial whitespaces */ |
| 219 | ign_xmlws(context, in); |
| 220 | |
| 221 | if (in[0] == '\0') { |
| 222 | /* EOF - not expected at this place */ |
| 223 | return LY_EINVAL; |
| 224 | } else if (in[0] == '>' || in[0] == '/') { |
| 225 | /* element terminated by > or /> */ |
| 226 | goto success; |
| 227 | } |
| 228 | |
| 229 | /* remember the identifier start before checking its format */ |
| 230 | id = in; |
| 231 | rc = lyxml_check_qname(context, &in, &c, &endtag_len); |
| 232 | LY_CHECK_RET(rc); |
| 233 | if (c == ':') { |
| 234 | /* we have prefixed identifier */ |
| 235 | endtag = in - endtag_len; |
| 236 | |
| 237 | rc = lyxml_check_qname(context, &in, &c, &endtag_len); |
| 238 | LY_CHECK_RET(rc); |
| 239 | |
| 240 | (*prefix) = id; |
| 241 | (*prefix_len) = endtag - id; |
| 242 | id = endtag + 1; |
| 243 | } |
| 244 | if (!is_xmlws(c) && c != '=') { |
| 245 | in = in - endtag_len; |
| 246 | LOGVAL(ctx, LY_VLOG_LINE, &context->line, LY_VCODE_INSTREXP, LY_VCODE_INSTREXP_len(in), in, "whitespace or '='"); |
| 247 | return LY_EVALID; |
| 248 | } |
| 249 | in = in - endtag_len; |
| 250 | (*name) = id; |
| 251 | (*name_len) = in - id; |
| 252 | |
| 253 | /* eat '=' and stop at the value beginning */ |
| 254 | ign_xmlws(context, in); |
| 255 | if (in[0] != '=') { |
| 256 | LOGVAL(ctx, LY_VLOG_LINE, &context->line, LY_VCODE_INSTREXP, LY_VCODE_INSTREXP_len(in), in, "'='"); |
| 257 | return LY_EVALID; |
| 258 | } |
| 259 | ++in; |
| 260 | ign_xmlws(context, in); |
| 261 | if (in[0] != '\'' && in[0] != '"') { |
| 262 | LOGVAL(ctx, LY_VLOG_LINE, &context->line, LY_VCODE_INSTREXP, LY_VCODE_INSTREXP_len(in), in, "either single or double quotation mark"); |
| 263 | return LY_EVALID; |
| 264 | } |
| 265 | |
| 266 | success: |
| 267 | /* move caller's input */ |
| 268 | (*input) = in; |
| 269 | return LY_SUCCESS; |
| 270 | } |
| 271 | |
| 272 | /** |
| 273 | * @brief Parse input expecting an XML element. |
| 274 | * |
| 275 | * Able to silently skip comments, PIs and CData. DOCTYPE is not parsable, so it is reported as LY_EVALID error. |
| 276 | * If '<' is not found in input, LY_EINVAL is returned (but no error is logged), so it is possible to continue |
| 277 | * with parsing input as text content. |
| 278 | * |
| 279 | * Input string is not being modified, so the returned values are not NULL-terminated, instead their length |
| 280 | * is returned. |
| 281 | * |
| 282 | * @param[in] context XML context to track lines or store errors into libyang context. |
| 283 | * @param[in,out] input Input string to process, updated according to the processed/read data. |
| 284 | * @param[in] options Currently unused options to modify input processing. |
| 285 | * @param[out] prefix Pointer to prefix if present in the element name, NULL otherwise. |
| 286 | * @param[out] prefix_len Length of the prefix if any. |
| 287 | * @param[out] name Element name. LY_SUCCESS can be returned with NULL name only in case the |
| 288 | * end of the input string was reached (EOF). |
| 289 | * @param[out] name_len Length of the element name. |
| 290 | * @return LY_ERR values. |
| 291 | */ |
Radek Krejci | d91dbaf | 2018-09-21 15:51:39 +0200 | [diff] [blame] | 292 | LY_ERR |
| 293 | lyxml_get_element(struct lyxml_context *context, const char **input, int UNUSED(options), |
| 294 | const char **prefix, size_t *prefix_len, const char **name, size_t *name_len) |
| 295 | { |
| 296 | struct ly_ctx *ctx = context->ctx; /* shortcut */ |
| 297 | const char *in = (*input); |
| 298 | const char *endtag; |
| 299 | const char *sectname; |
| 300 | const char *id; |
| 301 | size_t endtag_len, newlines; |
| 302 | bool loop = true; |
| 303 | unsigned int c; |
| 304 | LY_ERR rc; |
Radek Krejci | d91dbaf | 2018-09-21 15:51:39 +0200 | [diff] [blame] | 305 | |
| 306 | /* initialize output variables */ |
| 307 | (*prefix) = (*name) = NULL; |
| 308 | (*prefix_len) = (*name_len) = 0; |
| 309 | |
| 310 | while (loop) { |
| 311 | ign_xmlws(context, in); |
| 312 | |
| 313 | if (in[0] == '\0') { |
| 314 | /* EOF */ |
| 315 | goto success; |
| 316 | } else if (in[0] != '<') { |
| 317 | return LY_EINVAL; |
| 318 | } |
| 319 | move_input(context, in, 1); |
| 320 | |
| 321 | if (in[0] == '!') { |
| 322 | move_input(context, in, 1); |
| 323 | /* sections to ignore */ |
| 324 | if (!strncmp(in, "--", 2)) { |
| 325 | /* comment */ |
| 326 | move_input(context, in, 2); |
| 327 | sectname = "Comment"; |
| 328 | endtag = "-->"; |
| 329 | endtag_len = 3; |
| 330 | } else if (!strncmp(in, "[CDATA[", 7)) { |
| 331 | /* CDATA section */ |
| 332 | move_input(context, in, 7); |
| 333 | sectname = "CData"; |
| 334 | endtag = "]]>"; |
| 335 | endtag_len = 3; |
| 336 | } else if (!strncmp(in, "DOCTYPE", 7)) { |
| 337 | /* Document type declaration - not supported */ |
| 338 | LOGVAL(ctx, LY_VLOG_LINE, &context->line, LY_VCODE_NSUPP, "Document Type Declaration"); |
| 339 | return LY_EVALID; |
| 340 | } |
| 341 | in = ign_todelim(in, endtag, endtag_len, &newlines); |
| 342 | LY_CHECK_ERR_RET(!in, LOGVAL(ctx, LY_VLOG_LINE, &context->line, LY_VCODE_NTERM, sectname), LY_EVALID); |
| 343 | context->line += newlines; |
| 344 | in += endtag_len; |
| 345 | } else if (in[0] == '?') { |
| 346 | in = ign_todelim(in, "?>", 2, &newlines); |
| 347 | LY_CHECK_ERR_RET(!in, LOGVAL(ctx, LY_VLOG_LINE, &context->line, LY_VCODE_NTERM, "Declaration"), LY_EVALID); |
| 348 | context->line += newlines; |
| 349 | in += 2; |
| 350 | } else { |
| 351 | /* element */ |
| 352 | ign_xmlws(context, in); |
| 353 | LY_CHECK_ERR_RET(!in[0], LOGVAL(ctx, LY_VLOG_LINE, &context->line, LY_VCODE_EOF), LY_EVALID); |
| 354 | |
| 355 | /* remember the identifier start before checking its format */ |
| 356 | id = in; |
| 357 | rc = lyxml_check_qname(context, &in, &c, &endtag_len); |
| 358 | LY_CHECK_RET(rc); |
| 359 | if (c == ':') { |
| 360 | /* we have prefixed identifier */ |
| 361 | endtag = in - endtag_len; |
| 362 | |
| 363 | rc = lyxml_check_qname(context, &in, &c, &endtag_len); |
| 364 | LY_CHECK_RET(rc); |
| 365 | |
| 366 | (*prefix) = id; |
| 367 | (*prefix_len) = endtag - id; |
| 368 | id = endtag + 1; |
| 369 | } |
| 370 | if (!is_xmlws(c) && c != '/' && c != '>') { |
| 371 | in = in - endtag_len; |
Radek Krejci | d972c25 | 2018-09-25 13:23:39 +0200 | [diff] [blame^] | 372 | LOGVAL(ctx, LY_VLOG_LINE, &context->line, LY_VCODE_INSTREXP, LY_VCODE_INSTREXP_len(in), in, |
| 373 | "whitespace or element tag termination ('>' or '/>'"); |
Radek Krejci | d91dbaf | 2018-09-21 15:51:39 +0200 | [diff] [blame] | 374 | return LY_EVALID; |
| 375 | } |
| 376 | in = in - endtag_len; |
| 377 | (*name) = id; |
| 378 | (*name_len) = in - id; |
| 379 | |
| 380 | loop = false; |
| 381 | } |
| 382 | } |
| 383 | |
| 384 | success: |
| 385 | /* move caller's input */ |
| 386 | (*input) = in; |
| 387 | return LY_SUCCESS; |
| 388 | } |
| 389 | |