blob: 60273a914be25847571fbb96d7368a3b31e58a5f [file] [log] [blame]
/**
 * @file xml.c
 * @author Radek Krejci <rkrejci@cesnet.cz>
 * @brief XML parser implementation for libyang
 *
 * Copyright (c) 2015 CESNET, z.s.p.o.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of the Company nor the names of its contributors
 *    may be used to endorse or promote products derived from this
 *    software without specific prior written permission.
 */
21
Radek Krejci02117302015-04-13 16:32:44 +020022
Radek Krejci709fee62015-04-15 13:56:19 +020023#include <ctype.h>
24#include <stdint.h>
Radek Krejci02117302015-04-13 16:32:44 +020025#include <stdlib.h>
26#include <string.h>
Radek Krejci54ea8de2015-04-09 18:02:56 +020027#include <unistd.h>
28
29#include "../common.h"
30#include "xml.h"
31
/*
 * Test whether a character is XML white space (S production):
 * #x20 | #x9 | #xA | #xD.
 *
 * The argument is parenthesized so that any expression can be passed, but it
 * is still evaluated up to four times - do not pass expressions with side
 * effects.
 */
#define is_xmlws(c) ((c) == 0x20 || (c) == 0x9 || (c) == 0xa || (c) == 0xd)
Radek Krejci54ea8de2015-04-09 18:02:56 +020036
/*
 * Test whether a code point may start an XML name (NameStartChar production):
 * ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] |
 * [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] |
 * [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] |
 * [#x10000-#xEFFFF]
 *
 * The argument is parenthesized but evaluated several times - do not pass
 * expressions with side effects.
 */
#define is_xmlnamestartchar(c) ( \
        ((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z') || \
        (c) == '_' || (c) == ':' || \
        ((c) >= 0xc0 && (c) <= 0x2ff && (c) != 0xd7 && (c) != 0xf7) || \
        ((c) >= 0x370 && (c) <= 0x1fff && (c) != 0x37e) || \
        (c) == 0x200c || (c) == 0x200d || \
        ((c) >= 0x2070 && (c) <= 0x218f) || \
        ((c) >= 0x2c00 && (c) <= 0x2fef) || \
        ((c) >= 0x3001 && (c) <= 0xd7ff) || \
        ((c) >= 0xf900 && (c) <= 0xfdcf) || \
        ((c) >= 0xfdf0 && (c) <= 0xfffd) || \
        ((c) >= 0x10000 && (c) <= 0xeffff))
45
/*
 * Test whether a code point may appear inside an XML name (NameChar
 * production): NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] |
 * [#x203F-#x2040]
 *
 * The argument is parenthesized but evaluated several times - do not pass
 * expressions with side effects.
 */
#define is_xmlnamechar(c) ( \
        ((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z') || \
        ((c) >= '0' && (c) <= '9') || \
        (c) == '_' || (c) == '-' || (c) == ':' || (c) == '.' || (c) == 0xb7 || \
        ((c) >= 0xc0 && (c) <= 0x2ff && (c) != 0xd7 && (c) != 0xf7) || \
        ((c) >= 0x300 && (c) <= 0x36f) || \
        ((c) >= 0x370 && (c) <= 0x1fff && (c) != 0x37e) || \
        (c) == 0x200c || (c) == 0x200d || \
        ((c) >= 0x203f && (c) <= 0x2040) || \
        ((c) >= 0x2070 && (c) <= 0x218f) || \
        ((c) >= 0x2c00 && (c) <= 0x2fef) || \
        ((c) >= 0x3001 && (c) <= 0xd7ff) || \
        ((c) >= 0xf900 && (c) <= 0xfdcf) || \
        ((c) >= 0xfdf0 && (c) <= 0xfffd) || \
        ((c) >= 0x10000 && (c) <= 0xeffff))
55
/*
 * Advance pointer p past any XML white space it currently points to.
 * Wrapped in do { } while (0) so that "ign_xmlws(p);" is a single statement
 * and can be used safely in an unbraced if/else.
 */
#define ign_xmlws(p) do { while (is_xmlws(*(p))) { (p)++; } } while (0)
57
58API int lyxml_unlink_attr(struct lyxml_attr *attr)
Radek Krejci54ea8de2015-04-09 18:02:56 +020059{
Radek Krejci02117302015-04-13 16:32:44 +020060 struct lyxml_attr *a;
61
62 if (!attr) {
63 LY_ERR(LY_EINVAL, NULL);
64 return EXIT_FAILURE;
65 }
66
67 if (!attr->parent) {
68 return EXIT_SUCCESS;
69 }
70
71 a = attr->parent->attr;
72 if (!a) {
73 LY_ERR(LY_EINVAL, "Broken structure (%s).", __func__);
74 return EXIT_FAILURE;
75 } else if (a == attr) {
76 attr->parent->attr = attr->next;
77 } else {
78 while(a->next != attr) {
79 a = a->next;
80 }
81
82 if (!a) {
83 LY_ERR(LY_EINVAL, "Broken structure (%s).", __func__);
84 return EXIT_FAILURE;
85 }
86
87 a->next = attr->next;
88 }
89
90 attr->parent = NULL;
91 attr->next = NULL;
92
93 return EXIT_SUCCESS;
94}
95
96API int lyxml_unlink_elem(struct lyxml_elem *elem)
97{
98 struct lyxml_elem *e;
99
100 if (!elem) {
101 LY_ERR(LY_EINVAL, NULL);
102 return EXIT_FAILURE;
103 }
104
105 if (!elem->parent) {
106 return EXIT_SUCCESS;
107 }
108
109 e = elem->parent->child;
110 if (!e) {
111 LY_ERR(LY_EINVAL, "Broken structure (%s).", __func__);
112 return EXIT_FAILURE;
113 } else if (e == elem) {
114 /* child element of parent is going to be the next element after the
115 * one being unlinked
116 */
117 if (e == e->next) {
118 elem->parent->child = NULL;
119 } else {
120 elem->parent->child = e->next;
121 }
122 }
123
124 /* remove elem from ring list of sibling elements */
125 while (e != elem) {
126 e = e->next;
127 }
128 if (!e) {
129 LY_ERR(LY_EINVAL, "Broken structure (%s).", __func__);
130 return EXIT_FAILURE;
131 }
132 e->prev->next = e->next;
133 e->next->prev = e->prev;
134
135 /* clean up the unlinked element */
136 e->next = e;
137 e->prev = e;
138
139 return EXIT_SUCCESS;
140}
141
142API void lyxml_free_attr(struct lyxml_attr *attr)
143{
144 if (!attr) {
145 return;
146 }
147
148 lyxml_unlink_attr(attr);
149 free(attr->name);
150 free(attr->value);
151 free(attr);
152}
153
154API void lyxml_free_attrs(struct lyxml_elem *elem)
155{
156 struct lyxml_attr *a, *next;
157 if (!elem || !elem->attr) {
158 return;
159 }
160
161 a = elem->attr;
162 do {
163 next = a->next;
164
165 free(a->name);
166 free(a->value);
167 free(a);
168
169 a = next;
170 } while (a);
171}
172
173static void lyxml_free_elem_(struct lyxml_elem *elem)
174{
175 struct lyxml_elem *e, *next;
176
177 if (!elem) {
178 return;
179 }
180
181 lyxml_free_attrs(elem);
182 if (elem->child) {
183 e = elem->child;
184 e->prev->next = NULL;
185 do {
186 next = e->next;
187 lyxml_free_elem_(e);
188 e = next;
189 } while (e);
190 }
191 free(elem->name);
192 free(elem->content);
193 free(elem);
194}
195
196API void lyxml_free_elem(struct lyxml_elem *elem)
197{
198 if (!elem) {
199 return;
200 }
201
202 lyxml_unlink_elem(elem);
203 lyxml_free_elem_(elem);
204}
205
206API int lyxml_add_attr(struct lyxml_elem *parent, struct lyxml_attr *attr)
207{
208 struct lyxml_attr *a;
209
210 if (!parent || !attr) {
211 LY_ERR(LY_EINVAL, NULL);
212 return EXIT_FAILURE;
213 }
214
215 /* (re)link attribute to parent */
216 if (attr->parent) {
217 lyxml_unlink_attr(attr);
218 }
219 attr->parent = parent;
220
221 /* link parent to attribute */
222 if (parent->attr) {
223 for (a = parent->attr; a->next; a = a->next);
224 a->next = attr;
225 } else {
226 parent->attr = attr;
227 }
228
229 return EXIT_SUCCESS;
230}
231
232API int lyxml_add_child(struct lyxml_elem *parent, struct lyxml_elem *elem)
233{
234 struct lyxml_elem *e;
235
236 if (!parent || !elem) {
237 LY_ERR(LY_EINVAL, NULL);
238 return EXIT_FAILURE;
239 }
240
241 /* (re)link element to parent */
242 if (elem->parent) {
243 lyxml_unlink_elem(elem);
244 }
245 elem->parent = parent;
246
247 /* link parent to element */
248 if (parent->child) {
249 e = parent->child;
250 elem->prev = e->prev;
251 elem->next = e;
252 elem->prev->next = elem;
253 elem->next->prev = elem;
254 } else {
255 parent->child = elem;
256 elem->next = elem;
257 elem->prev = elem;
258 }
259
260 return EXIT_SUCCESS;
261}
262
263/**
264 * @brief Get the first UTF-8 character value (4bytes) from buffer
265 * @param[in] buf pointr to the current position in input buffer
266 * @param[out] read Number of processed bytes in buf (length of UTF-8
267 * character).
268 * @return UTF-8 value as 4 byte number. 0 means error, only UTF-8 characters
269 * valid for XML are returned, so:
270 * #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
271 * = any Unicode character, excluding the surrogate blocks, FFFE, and FFFF.
272 *
273 * UTF-8 mapping:
274 * 00000000 -- 0000007F: 0xxxxxxx
275 * 00000080 -- 000007FF: 110xxxxx 10xxxxxx
276 * 00000800 -- 0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
277 * 00010000 -- 001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
278 *
279 */
280static int getutf8(const char *buf, unsigned int *read)
281{
282 int c, aux;
283 int i;
284
285 /* check input variable */
286 if (!buf || !read) {
287 LY_ERR(LY_EINVAL, NULL);
288 return 0;
289 }
290 c = buf[0];
291 *read = 0;
292
293 /* buf is NULL terminated string, so 0 means EOF */
294 if (!c) {
295 LY_ERR(LY_EEOF, NULL);
296 return 0;
297 }
298 *read = 1;
299
300 /* process character byte(s) */
301 if ((c && 0xf8) == 0xf0) {
302 /* four bytes character */
303 *read = 4;
304
305 c &= 0x07;
306 for (i = 1; i <= 3; i++) {
307 aux = buf[i];
308 if ((aux & 0xc0) != 0x80) {
309 LY_ERR(LY_EINVAL, NULL);
310 return 0;
311 }
312
313 c = (c << 6) | (aux & 0x3f);
314 }
315
316
317 if (c < 0x1000 || c > 0x10ffff) {
318 LY_ERR(LY_EINVAL, NULL);
319 return 0;
320 }
321 } else if ((c & 0xf0) == 0xe0) {
322 /* three bytes character */
323 *read = 3;
324
325 c &= 0x0f;
326 for (i = 1; i <= 2; i++) {
327 aux = buf[i];
328 if ((aux & 0xc0) != 0x80) {
329 LY_ERR(LY_EINVAL, NULL);
330 return 0;
331 }
332
333 c = (c << 6) | (aux & 0x3f);
334 }
335
336
337 if (c < 0x800 || (c > 0xd7ff && c < 0xe000) || c > 0xfffd ) {
338 LY_ERR(LY_EINVAL, NULL);
339 return 0;
340 }
341 } else if ((c & 0xe0) == 0xc0) {
342 /* two bytes character */
343 *read = 2;
344
345 aux = buf[1];
346 if ((aux & 0xc0) != 0x80) {
347 LY_ERR(LY_EINVAL, NULL);
348 return 0;
349 }
350 c = ((c & 0x1f) << 6) | (aux & 0x3f);
351
352 if (c < 0x80) {
353 LY_ERR(LY_EINVAL, NULL);
354 return 0;
355 }
356 } else if (!(c & 0x80)) {
357 /* one byte character */
358 if (c < 0x20 && c != 0x9 && c != 0xa && c != 0xd) {
359 /* invalid character */
360 LY_ERR(LY_EINVAL, NULL);
361 return 0;
362 }
363 } else {
364 /* invalid character */
365 LY_ERR(LY_EINVAL, NULL);
366 return 0;
367 }
368
369 return c;
370}
371
Radek Krejci709fee62015-04-15 13:56:19 +0200372/**
373 * Store UTF-8 character specified as 4byte integer into the dst buffer.
374 * Returns number of written bytes (4 max), expects that dst has enough space.
375 *
376 * UTF-8 mapping:
377 * 00000000 -- 0000007F: 0xxxxxxx
378 * 00000080 -- 000007FF: 110xxxxx 10xxxxxx
379 * 00000800 -- 0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
380 * 00010000 -- 001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
381 *
382 */
383static unsigned int pututf8(char *dst, int32_t value)
384{
385 if (value < 0x80) {
386 /* one byte character */
387 dst[0] = value;
388
389 return 1;
390 } else if (value < 0x800) {
391 /* two bytes character */
392 dst[0] = 0xc0 | (value >> 6);
393 dst[1] = 0x80 | (value & 0x3f);
394
395 return 2;
396 } else if (value < 0x10000) {
397 /* three bytes character */
398 dst[0] = 0xe0 | (value >> 12);
399 dst[1] = 0x80 | ((value >> 6) & 0x3f);
400 dst[2] = 0x80 | (value & 0x3f);
401
402 return 3;
403 } else if (value < 0x200000) {
404 /* four bytes character */
405 dst[0] = 0xf0 | (value >> 18);
406 dst[1] = 0x80 | ((value >> 12) & 0x3f);
407 dst[2] = 0x80 | ((value >> 6) & 0x3f);
408 dst[3] = 0x80 | (value & 0x3f);
409
410 return 4;
411 } else {
412 /* out of range */
413 LY_ERR(LY_EINVAL, NULL);
414 return 0;
415 }
416}
417
Radek Krejci05e37a32015-04-15 14:40:34 +0200418static int parse_ignore(const char *data, const char *endstr,
419 unsigned int *len)
Radek Krejci02117302015-04-13 16:32:44 +0200420{
Radek Krejci05e37a32015-04-15 14:40:34 +0200421 unsigned int slen;
Radek Krejci02117302015-04-13 16:32:44 +0200422 const char *c = data;
423
Radek Krejci05e37a32015-04-15 14:40:34 +0200424 slen = strlen(endstr);
Radek Krejci02117302015-04-13 16:32:44 +0200425
Radek Krejci05e37a32015-04-15 14:40:34 +0200426 while (*c && memcmp(c, endstr, slen)) {
Radek Krejci02117302015-04-13 16:32:44 +0200427 c++;
428 }
429 if (!*c) {
430 LY_ERR(LY_EWELLFORM, "Missing close sequence \"%s\".", endstr);
Radek Krejci05e37a32015-04-15 14:40:34 +0200431 return EXIT_FAILURE;
Radek Krejci02117302015-04-13 16:32:44 +0200432 }
Radek Krejci05e37a32015-04-15 14:40:34 +0200433 c += slen;
Radek Krejci02117302015-04-13 16:32:44 +0200434
Radek Krejci05e37a32015-04-15 14:40:34 +0200435 *len = c - data;
436 return EXIT_SUCCESS;
Radek Krejci02117302015-04-13 16:32:44 +0200437}
438
Radek Krejci521008e2015-04-15 14:41:07 +0200439static char *parse_text(const char *data, char delim, unsigned int *len)
Radek Krejci02117302015-04-13 16:32:44 +0200440{
Radek Krejci709fee62015-04-15 13:56:19 +0200441#define BUFSIZE 1024
Radek Krejci02117302015-04-13 16:32:44 +0200442
Radek Krejci709fee62015-04-15 13:56:19 +0200443 char buf[BUFSIZE];
444 char *result = NULL, *aux;
445 unsigned int r;
446 int o, size = 0;
Radek Krejcia4a84062015-04-16 13:00:10 +0200447 int cdsect = 0;
Radek Krejci709fee62015-04-15 13:56:19 +0200448 int32_t n;
449
Radek Krejcia4a84062015-04-16 13:00:10 +0200450 for (*len = o = 0; cdsect || data[*len] != delim; o++) {
451 if (!data[*len] || (!cdsect && !memcmp(&data[*len], "]]>", 2))) {
Radek Krejci02117302015-04-13 16:32:44 +0200452 LY_ERR(LY_EWELLFORM, "Invalid element content, \"]]>\" found.");
Radek Krejci709fee62015-04-15 13:56:19 +0200453 goto error;
Radek Krejci02117302015-04-13 16:32:44 +0200454 }
Radek Krejci709fee62015-04-15 13:56:19 +0200455
Radek Krejcia4a84062015-04-16 13:00:10 +0200456loop:
457
Radek Krejci709fee62015-04-15 13:56:19 +0200458 if (o > BUFSIZE - 3) {
459 /* add buffer into the result */
460 if (result) {
461 size = size + o;
462 aux = realloc(result, size + 1);
463 result = aux;
464 } else {
465 size = o;
466 result = malloc((size + 1) * sizeof *result);
467 }
468 memcpy(&result[size - o], buf, o);
469
470 /* write again into the beginning of the buffer */
471 o = 0;
472 }
473
Radek Krejcia4a84062015-04-16 13:00:10 +0200474 if (cdsect || !memcmp(&data[*len], "<![CDATA[", 9)) {
475 /* CDSect */
476 if (!cdsect) {
477 cdsect = 1;
478 *len += 9;
479 }
480 if (data[*len] && !memcmp(&data[*len], "]]>", 3)) {
481 *len += 3;
482 cdsect = 0;
483 o--; /* we don't write any data in this iteration */
484 } else {
485 buf[o] = data[*len];
486 (*len)++;
487 }
488 } else if (data[*len] == '&') {
Radek Krejci709fee62015-04-15 13:56:19 +0200489 (*len)++;
490 if (data[*len] != '#') {
491 /* entity reference - only predefined refs are supported */
492 if (!memcmp(&data[*len], "lt;", 3)) {
493 buf[o] = '<';
494 *len += 3;
495 } else if (!memcmp(&data[*len], "gt;", 3)) {
496 buf[o] = '>';
497 *len += 3;
498 } else if (!memcmp(&data[*len], "amp;", 4)) {
499 buf[o] = '&';
500 *len += 4;
501 } else if (!memcmp(&data[*len], "apos;", 5)) {
502 buf[o] = '\'';
503 *len += 5;
504 } else if (!memcmp(&data[*len], "quot;", 5)) {
505 buf[o] = '\"';
506 *len += 5;
507 } else {
508 LY_ERR(LY_EWELLFORM,
509 "Invalid entity reference, only predefined entity references are supported.");
510 goto error;
511 }
512 } else {
513 /* character reference */
514 (*len)++;
515 if (isdigit(data[*len])) {
516 for (n = 0; isdigit(data[*len]); (*len)++) {
517 n = (10 * n) + (data[*len] - '0');
518 }
519 if (data[*len] != ';') {
520 LY_ERR(LY_EWELLFORM,
521 "Invalid character reference, missing semicolon.");
522 goto error;
523 }
524 } else if (data[(*len)++] == 'x' && isxdigit(data[*len])) {
525 for (n = 0; isxdigit(data[*len]); (*len)++) {
526 if (isdigit(data[*len])) {
527 r = (data[*len] - '0');
528 } else if (data[*len] > 'F') {
529 r = 10 + (data[*len] - 'a');
530 } else {
531 r = 10 + (data[*len] - 'A');
532 }
533 n = (16 * n) + r;
534 }
535 } else {
536 LY_ERR(LY_EWELLFORM, "Invalid character reference.");
537 goto error;
538
539 }
540 r = pututf8(&buf[o], n);
541 if (!r) {
542 LY_ERR(LY_EWELLFORM, "Invalid character reference value.");
543 goto error;
544 }
545 o += r - 1; /* o is ++ in for loop */
546 (*len)++;
547 }
548 } else {
549 buf[o] = data[*len];
550 (*len)++;
551 }
Radek Krejci02117302015-04-13 16:32:44 +0200552 }
553
Radek Krejcia4a84062015-04-16 13:00:10 +0200554 if (delim == '<' && !memcmp(&data[*len], "<![CDATA[", 9)) {
555 /* ignore loop's end condition on beginning of CDSect */
556 goto loop;
557 }
558
Radek Krejci709fee62015-04-15 13:56:19 +0200559#undef BUFSIZE
560
561 if (o) {
562 if (result) {
563 size = size + o;
564 aux = realloc(result, *len + 1);
565 result = aux;
566 } else {
567 size = o;
568 result = malloc(size * sizeof *result);
569 }
570 memcpy(&result[size - o], buf, o);
571 }
Radek Krejci02117302015-04-13 16:32:44 +0200572 result[size] = '\0';
573
Radek Krejci02117302015-04-13 16:32:44 +0200574 return result;
Radek Krejci709fee62015-04-15 13:56:19 +0200575
576error:
577 free(result);
578 return NULL;
Radek Krejci02117302015-04-13 16:32:44 +0200579}
580
Radek Krejci05e37a32015-04-15 14:40:34 +0200581static struct lyxml_attr *parse_attr(const char *data, unsigned int *len)
Radek Krejci02117302015-04-13 16:32:44 +0200582{
Radek Krejci521008e2015-04-15 14:41:07 +0200583 const char *c = data, *delim;
Radek Krejci02117302015-04-13 16:32:44 +0200584 int uc;
585 struct lyxml_attr *attr = NULL;
586 unsigned int size;
587
588 /* process name part of the attribute */
589 uc = getutf8(c, &size);
590 if (!is_xmlnamestartchar(uc)) {
591 LY_ERR(LY_EWELLFORM, "Invalid NameStartChar of the attribute");
592 return NULL;
593 }
594 c += size;
595 uc = getutf8(c, &size);
596 while (is_xmlnamechar(uc)) {
597 c += size;
598 uc = getutf8(c, &size);
599 }
600
601 attr = calloc(1, sizeof *attr);
602
603 /* store the name */
Radek Krejci521008e2015-04-15 14:41:07 +0200604 size = c - data;
Radek Krejci02117302015-04-13 16:32:44 +0200605 attr->name = malloc((size + 1) * sizeof *attr->name);
Radek Krejci521008e2015-04-15 14:41:07 +0200606 memcpy(attr->name, data, size);
Radek Krejci02117302015-04-13 16:32:44 +0200607 attr->name[size] = '\0';
608
609 /* check Eq mark that can be surrounded by whitespaces */
610 ign_xmlws(c);
611 if (*c != '=') {
612 LY_ERR(LY_EWELLFORM, "Invalid attribute definition, \"=\" expected.");
613 goto error;
614 }
615 c++;
616 ign_xmlws(c);
Radek Krejci02117302015-04-13 16:32:44 +0200617
618 /* process value part of the attribute */
619 if (!*c || (*c != '"' && *c != '\'')) {
620 LY_ERR(LY_EWELLFORM, "Invalid attribute value, \" or \' expected.");
621 goto error;
622 }
623 delim = c;
Radek Krejci521008e2015-04-15 14:41:07 +0200624 attr->value = parse_text(++c, *delim, &size);
625 if (ly_errno) {
Radek Krejci02117302015-04-13 16:32:44 +0200626 goto error;
627 }
628
Radek Krejci521008e2015-04-15 14:41:07 +0200629 *len = c + size + 1 - data; /* +1 is delimiter size */
Radek Krejci02117302015-04-13 16:32:44 +0200630 return attr;
631
632error:
633 lyxml_free_attr(attr);
Radek Krejci54ea8de2015-04-09 18:02:56 +0200634 return NULL;
635}
636
Radek Krejci05e37a32015-04-15 14:40:34 +0200637static struct lyxml_elem *parse_elem(const char *data, unsigned int *len)
Radek Krejci54ea8de2015-04-09 18:02:56 +0200638{
Radek Krejci02117302015-04-13 16:32:44 +0200639 const char *c = data, *e;
640 const char *lws; /* leading white space for handling mixed content */
641 int uc;
642 char *str;
643 struct lyxml_elem *elem, *child;
644 struct lyxml_attr *attr;
Radek Krejci05e37a32015-04-15 14:40:34 +0200645 unsigned int size;
Radek Krejci02117302015-04-13 16:32:44 +0200646
647 *len = 0;
648
649 if (*c != '<') {
650 return NULL;
651 }
652
653 /* locate element name */
654 c++;
655 e = c;
656
Radek Krejci05e37a32015-04-15 14:40:34 +0200657 uc = getutf8(e, &size);
Radek Krejci02117302015-04-13 16:32:44 +0200658 if (!is_xmlnamestartchar(uc)) {
659 LY_ERR(LY_EWELLFORM, "Invalid NameStartChar of the attribute");
660 return NULL;
661 }
Radek Krejci05e37a32015-04-15 14:40:34 +0200662 e += size;
663 uc = getutf8(e, &size);
Radek Krejci02117302015-04-13 16:32:44 +0200664 while (is_xmlnamechar(uc)) {
Radek Krejci05e37a32015-04-15 14:40:34 +0200665 e += size;
666 uc = getutf8(e, &size);
Radek Krejci02117302015-04-13 16:32:44 +0200667 }
668 if (!*e) {
669 LY_ERR(LY_EWELLFORM, "Unexpected end of input data.");
670 return NULL;
671 }
672
673 /* allocate element structure */
674 elem = calloc(1, sizeof *elem);
675 elem->next = elem;
676 elem->prev = elem;
677
678 /* store the name into the element structure */
679 elem->name = malloc((e - c + 1) * sizeof *elem->name);
680 memcpy(elem->name, c, e - c);
681 elem->name[e - c] = '\0';
682 c = e;
683
684process:
Radek Krejci709fee62015-04-15 13:56:19 +0200685 ly_errno = 0;
Radek Krejci02117302015-04-13 16:32:44 +0200686 ign_xmlws(c);
687 if (!memcmp("/>", c, 2)) {
688 /* we are done, it was EmptyElemTag */
689 c += 2;
690 } else if (*c == '>') {
691 /* process element content */
692 c++;
693 lws = NULL;
694
695 while (*c) {
696 if (!memcmp(c, "</", 2)) {
697 if (lws) {
698 /* leading white spaces were actually content */
699 goto store_content;
700 }
701
702 /* Etag */
703 c += 2;
704 /* get name and check it */
705 e = c;
Radek Krejci05e37a32015-04-15 14:40:34 +0200706 uc = getutf8(e, &size);
Radek Krejci02117302015-04-13 16:32:44 +0200707 if (!is_xmlnamestartchar(uc)) {
708 LY_ERR(LY_EWELLFORM,
709 "Invalid NameStartChar of the attribute");
710 goto error;
711 }
Radek Krejci05e37a32015-04-15 14:40:34 +0200712 e += size;
713 uc = getutf8(e, &size);
Radek Krejci02117302015-04-13 16:32:44 +0200714 while (is_xmlnamechar(uc)) {
Radek Krejci05e37a32015-04-15 14:40:34 +0200715 e += size;
716 uc = getutf8(e, &size);
Radek Krejci02117302015-04-13 16:32:44 +0200717 }
718 if (!*e) {
719 LY_ERR(LY_EWELLFORM, "Unexpected end of input data.");
720 goto error;
721 }
722
723 /* check that it corresponds to opening tag */
Radek Krejci05e37a32015-04-15 14:40:34 +0200724 size = e - c;
725 str = malloc((size + 1) * sizeof *str);
Radek Krejci02117302015-04-13 16:32:44 +0200726 memcpy(str, c, e - c);
727 str[e - c] = '\0';
Radek Krejci05e37a32015-04-15 14:40:34 +0200728 if (size != strlen(elem->name) ||
729 memcmp(str, elem->name, size)) {
Radek Krejci02117302015-04-13 16:32:44 +0200730 LY_ERR(LY_EWELLFORM,
731 "Mixed opening (%s) and closing (%s) element tag",
732 elem->name);
733 goto error;
734 }
735 free(str);
736 c = e;
737
738 ign_xmlws(c);
739 if (*c != '>') {
740 LY_ERR(LY_EWELLFORM,
741 "Close element tag \"%s\" contain additional data.",
742 elem->name);
743 goto error;
744 }
745 c++;
746 break;
747
748 } else if (!memcmp(c, "<?", 2)) {
749 if (lws) {
750 /* leading white spaces were only formatting */
751 lws = NULL;
752 }
753 /* PI - ignore it */
754 c += 2;
Radek Krejci05e37a32015-04-15 14:40:34 +0200755 if (parse_ignore(c, "?>", &size)) {
Radek Krejci02117302015-04-13 16:32:44 +0200756 goto error;
757 }
758 c += size;
759 } else if (!memcmp(c, "<!--", 4)) {
760 if (lws) {
761 /* leading white spaces were only formatting */
762 lws = NULL;
763 }
764 /* Comment - ignore it */
765 c += 4;
Radek Krejci05e37a32015-04-15 14:40:34 +0200766 if (parse_ignore(c, "-->", &size)) {
Radek Krejci02117302015-04-13 16:32:44 +0200767 goto error;
768 }
769 c += size;
770 } else if (!memcmp(c, "<![CDATA[", 9)) {
771 /* CDSect */
Radek Krejcia4a84062015-04-16 13:00:10 +0200772 goto store_content;
Radek Krejci02117302015-04-13 16:32:44 +0200773 } else if (*c == '<') {
774 if (lws) {
775 /* leading white spaces were only formatting */
776 lws = NULL;
777 }
778 if (elem->content) {
779 /* we have a mixed content */
780 child = calloc(1, sizeof *child);
781 child->content = elem->content;
782 elem->content = NULL;
783 lyxml_add_child(elem, child);
784 }
785 child = parse_elem(c, &size);
786 if (!child) {
787 LY_ERR(LY_EWELLFORM, "Unexpected end of input data.");
788 goto error;
789 }
790 lyxml_add_child(elem, child);
791 c += size; /* move after processed child element */
792 } else if (is_xmlws(*c)) {
793 lws = c;
794 ign_xmlws(c);
795 } else {
796store_content:
797 /* store text content */
798 if (lws) {
799 /* process content including the leading white spaces */
800 c = lws;
801 lws = NULL;
802 }
Radek Krejci521008e2015-04-15 14:41:07 +0200803 elem->content = parse_text(c, '<', &size);
804 if (ly_errno) {
Radek Krejci709fee62015-04-15 13:56:19 +0200805 goto error;
806 }
Radek Krejci02117302015-04-13 16:32:44 +0200807 c += size; /* move after processed text content */
808
809 if (elem->child) {
810 /* we have a mixed content */
811 child = calloc(1, sizeof *child);
812 child->content = elem->content;
813 elem->content = NULL;
814 lyxml_add_child(elem, child);
815 }
816 }
817 }
818 } else {
819 /* process attribute */
820 attr = parse_attr(c, &size);
821 if (!attr) {
822 LY_ERR(LY_EWELLFORM, "Unexpected end of input data.");
823 goto error;
824 }
825 lyxml_add_attr(elem, attr);
826 c += size; /* move after processed attribute */
827
828 /* go back to finish element processing */
829 goto process;
830 }
831
832 *len = c - data;
833
834 return elem;
835
836error:
837 lyxml_free_elem(elem);
838
Radek Krejci54ea8de2015-04-09 18:02:56 +0200839 return NULL;
840}
841
Radek Krejci02117302015-04-13 16:32:44 +0200842API struct lyxml_elem *lyxml_read(const char *data, int UNUSED(options))
Radek Krejci54ea8de2015-04-09 18:02:56 +0200843{
Radek Krejci02117302015-04-13 16:32:44 +0200844 const char *c = data;
Radek Krejci05e37a32015-04-15 14:40:34 +0200845 unsigned int len;
Radek Krejci02117302015-04-13 16:32:44 +0200846 struct lyxml_elem *root = NULL;
847
848 if (!data) {
849 ly_errno = LY_EINVAL;
850 return NULL;
851 }
852
853 /* process document */
854 while (*c) {
855 if (is_xmlws(*c)) {
856 /* skip whitespaces */
857 c++;
858 } else if (!memcmp(c, "<?", 2)) {
859 /* XMLDecl or PI - ignore it */
860 c += 2;
Radek Krejci05e37a32015-04-15 14:40:34 +0200861 if (parse_ignore(c, "?>", &len)) {
Radek Krejci02117302015-04-13 16:32:44 +0200862 LY_ERR(LY_EWELLFORM, "Missing close sequence \"?>\".");
863 return NULL;
864 }
865 c += len;
866 } else if (!memcmp(c, "<!--", 4)) {
867 /* Comment - ignore it */
868 c += 2;
Radek Krejci05e37a32015-04-15 14:40:34 +0200869 if (parse_ignore(c, "-->", &len)) {
Radek Krejci02117302015-04-13 16:32:44 +0200870 LY_ERR(LY_EWELLFORM, "Missing close sequence \"-->\".");
871 return NULL;
872 }
873 c += len;
874 } else if (!memcmp(c, "<!", 2)) {
875 /* DOCTYPE */
876 /* TODO - standalone ignore counting < and > */
877 LY_ERR(LY_EINVAL, "DOCTYPE not implemented.");
878 return NULL;
879 } else if (*c == '<') {
880 /* element - process it in next loop to strictly follow XML
881 * format
882 */
883 break;
884 }
885 }
886
887 root = parse_elem(c, &len);
888 if (!root) {
889 return NULL;
890 }
891 c += len;
892
893 /* ignore the rest of document where can be comments, PIs and whitespaces,
894 * note that we are not detecting syntax errors in these parts
895 */
896 ign_xmlws(c);
897 if (*c) {
898 LY_WRN("There are some not parsed data:\n%s", c);
899 }
900
901 return root;
902}
903
904API struct lyxml_elem *lyxml_read_fd(int fd, int UNUSED(options))
905{
906 if (fd == -1) {
907 ly_errno = LY_EINVAL;
908 return NULL;
909 }
910
Radek Krejci54ea8de2015-04-09 18:02:56 +0200911 return NULL;
912}
913
Radek Krejci02117302015-04-13 16:32:44 +0200914API struct lyxml_elem *lyxml_read_file(const char *filename,
915 int UNUSED(options))
Radek Krejci54ea8de2015-04-09 18:02:56 +0200916{
Radek Krejci02117302015-04-13 16:32:44 +0200917 if (!filename) {
918 LY_ERR(LY_EINVAL, NULL);
919 return NULL;
920 }
Radek Krejci54ea8de2015-04-09 18:02:56 +0200921
Radek Krejci02117302015-04-13 16:32:44 +0200922 return NULL;
Radek Krejci54ea8de2015-04-09 18:02:56 +0200923}
Radek Krejci02117302015-04-13 16:32:44 +0200924