xml FEATURE xml parser

just work in progress:
- element name parser (lyxml_get_element())
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6827dc0..403a6b0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -157,7 +157,8 @@
     src/hash_table.c
     src/set.c
     src/context.c
-    src/parser_yang.c)
+    src/parser_yang.c
+    src/xml.c)
 
 #set(lintsrc
 #    tools/lint/main.c
diff --git a/src/common.h b/src/common.h
index 7fecf36..4f0157b 100644
--- a/src/common.h
+++ b/src/common.h
@@ -116,6 +116,8 @@
 #define LY_VCODE_INCHAR      LYVE_SYNTAX, "Invalid character 0x%x."
 #define LY_VCODE_INSTREXP    LYVE_SYNTAX, "Invalid character sequence \"%.*s\", expected %s."
 #define LY_VCODE_EOF         LYVE_SYNTAX, "Unexpected end-of-file."
+#define LY_VCODE_NTERM       LYVE_SYNTAX, "%s not terminated."
+#define LY_VCODE_NSUPP       LYVE_SYNTAX, "%s not supported."
 #define LY_VCODE_INSTMT      LYVE_SYNTAX_YANG, "Invalid keyword \"%s\"."
 #define LY_VCODE_INCHILDSTMT LYVE_SYNTAX_YANG, "Invalid keyword \"%s\" as a child of \"%s\"."
 #define LY_VCODE_DUPSTMT     LYVE_SYNTAX_YANG, "Duplicate keyword \"%s\"."
diff --git a/src/xml.c b/src/xml.c
new file mode 100644
index 0000000..c4bdb4b
--- /dev/null
+++ b/src/xml.c
@@ -0,0 +1,281 @@
+/**
+ * @file xml.c
+ * @author Radek Krejci <rkrejci@cesnet.cz>
+ * @brief Generic XML parser implementation for libyang
+ *
+ * Copyright (c) 2015 - 2018 CESNET, z.s.p.o.
+ *
+ * This source code is licensed under BSD 3-Clause License (the "License").
+ * You may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     https://opensource.org/licenses/BSD-3-Clause
+ */
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "libyang.h"
+#include "xml.h"
+#include "common.h"
+
+/* Macro to test if character is an XML whitespace (space, tab, line feed or carriage return) */
+#define is_xmlws(c) ((c) == 0x20 || (c) == 0x9 || (c) == 0xa || (c) == 0xd)
+
+/* Macro to test if character is allowed to be a first character of a qualified identifier (NameStartChar minus colon) */
+#define is_xmlqnamestartchar(c) ((c >= 'a' && c <= 'z') || c == '_' || \
+        (c >= 'A' && c <= 'Z') || /* c == ':' || */ \
+        (c >= 0x370 && c <= 0x1fff && c != 0x37e ) || \
+        (c >= 0xc0 && c <= 0x2ff && c != 0xd7 && c != 0xf7) || c == 0x200c || \
+        c == 0x200d || (c >= 0x2070 && c <= 0x218f) || \
+        (c >= 0x2c00 && c <= 0x2fef) || (c >= 0x3001 && c <= 0xd7ff) || \
+        (c >= 0xf900 && c <= 0xfdcf) || (c >= 0xfdf0 && c <= 0xfffd) || \
+        (c >= 0x10000 && c <= 0xeffff))
+
+/* Macro to test if character is allowed to be used in a qualified identifier (XML NameChar minus colon) */
+#define is_xmlqnamechar(c) ((c >= 'a' && c <= 'z') || c == '_' || c == '-' || \
+        (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || /* c == ':' || */ \
+        c == '.' || c == 0xb7 || (c >= 0x370 && c <= 0x1fff && c != 0x37e ) ||\
+        (c >= 0xc0 && c <= 0x2ff && c != 0xd7 && c != 0xf7) || c == 0x200c || \
+        c == 0x200d || (c >= 0x300 && c <= 0x36f) || \
+        (c >= 0x2070 && c <= 0x218f) || (c >= 0x203f && c <= 0x2040) || \
+        (c >= 0x2c00 && c <= 0x2fef) || (c >= 0x3001 && c <= 0xd7ff) || \
+        (c >= 0xf900 && c <= 0xfdcf) || (c >= 0xfdf0 && c <= 0xfffd) || \
+        (c >= 0x10000 && c <= 0xeffff))
+
+/* Move input p by s characters; if EOF is reached, log it with lyxml_context c and return LY_EVALID from the caller */
+#define move_input(c,p,s) do { (p) += (s); LY_CHECK_ERR_RET(!(p)[0], LOGVAL((c)->ctx, LY_VLOG_LINE, &(c)->line, LY_VCODE_EOF), LY_EVALID); } while (0)
+
+/* Skip whitespaces in the input string p, counting the passed newlines in lyxml_context c (EOF is left to the caller) */
+#define ign_xmlws(c,p) do { while (is_xmlws(*(p))) {if (*(p) == '\n') {++(c)->line;} ++(p);} } while (0)
+/* Find the first occurrence of delim (delim_len bytes) in input; returns its start or NULL if the input ends first */
+static const char *
+ign_todelim(register const char *input, const char *delim, size_t delim_len, size_t *newlines)
+{
+    size_t i;
+    register const char *a, *b;
+
+    (*newlines) = 0; /* number of '\n' characters passed over, set even when NULL is returned */
+    for ( ; *input; ++input) {
+        if (*input != *delim) { /* cheap first-byte test before the full comparison */
+            if (*input == '\n') {
+                ++(*newlines);
+            }
+            continue;
+        }
+        a = input;
+        b = delim;
+        for (i = 0; i < delim_len; ++i) { /* a terminating NUL in input differs from delim and stops the loop */
+            if (*a++ != *b++) {
+                break;
+            }
+        }
+        if (i == delim_len) {
+            return input; /* points at the first byte of the delimiter */
+        }
+    }
+    return NULL;
+}
+/* Decode one UTF-8 character at *input into utf8_char and move *input past it; its byte length goes to the optional bytes_read */
+static LY_ERR
+lyxml_getutf8(const char **input, unsigned int *utf8_char, size_t *bytes_read)
+{
+    unsigned int c, len;
+    int aux;
+    int i;
+
+    c = (*input)[0];
+    LY_CHECK_RET(!c, LY_EINVAL);
+
+    /* process character byte(s); on any LY_EINVAL return below *input is left untouched */
+    if ((c & 0xf8) == 0xf0) {
+        /* four bytes character (0x10000 - 0x10ffff, lower values would be overlong encodings) */
+        len = 4;
+
+        c &= 0x07;
+        for (i = 1; i <= 3; i++) {
+            aux = (*input)[i];
+            if ((aux & 0xc0) != 0x80) {
+                return LY_EINVAL;
+            }
+
+            c = (c << 6) | (aux & 0x3f);
+        }
+
+        if (c < 0x10000 || c > 0x10ffff) {
+            return LY_EINVAL;
+        }
+    } else if ((c & 0xf0) == 0xe0) {
+        /* three bytes character (0x800 - 0xfffd without the surrogate range) */
+        len = 3;
+
+        c &= 0x0f;
+        for (i = 1; i <= 2; i++) {
+            aux = (*input)[i];
+            if ((aux & 0xc0) != 0x80) {
+                return LY_EINVAL;
+            }
+
+            c = (c << 6) | (aux & 0x3f);
+        }
+
+        if (c < 0x800 || (c > 0xd7ff && c < 0xe000) || c > 0xfffd) {
+            return LY_EINVAL;
+        }
+    } else if ((c & 0xe0) == 0xc0) {
+        /* two bytes character */
+        len = 2;
+
+        aux = (*input)[1];
+        if ((aux & 0xc0) != 0x80) {
+            return LY_EINVAL;
+        }
+        c = ((c & 0x1f) << 6) | (aux & 0x3f);
+
+        if (c < 0x80) {
+            return LY_EINVAL;
+        }
+    } else if (!(c & 0x80)) {
+        /* one byte character, control characters other than tab, LF and CR are refused */
+        len = 1;
+
+        if (c < 0x20 && c != 0x9 && c != 0xa && c != 0xd) {
+            return LY_EINVAL;
+        }
+    } else {
+        return LY_EINVAL;
+    }
+
+    (*utf8_char) = c;
+    (*input) += len;
+    if (bytes_read) {
+        (*bytes_read) = len;
+    }
+    return LY_SUCCESS;
+}
+/* Check the identifier at *input and move *input past its terminating character, returned in term_char/term_char_len */
+LY_ERR
+lyxml_check_qname(struct lyxml_context *context, const char **input, unsigned int *term_char, size_t *term_char_len)
+{
+    unsigned int c;
+    const char *id = (*input);
+    LY_ERR rc;
+
+    /* check NameStartChar (minus colon); the reported byte is cast so that values >= 0x80 are not sign-extended */
+    LY_CHECK_ERR_RET(lyxml_getutf8(input, &c, NULL) != LY_SUCCESS,
+                     LOGVAL(context->ctx, LY_VLOG_LINE, &context->line, LY_VCODE_INCHAR, (unsigned char)(*input)[0]), LY_EVALID);
+    LY_CHECK_ERR_RET(!is_xmlqnamestartchar(c),
+                     LOGVAL(context->ctx, LY_VLOG_LINE, &context->line, LYVE_SYNTAX,
+                            "Identifier \"%s\" starts with invalid character.", id),
+                     LY_EVALID);
+
+    /* check rest of the identifier, the loop stops at the first character not allowed in a name */
+    for (rc = lyxml_getutf8(input, &c, term_char_len);
+         rc == LY_SUCCESS && is_xmlqnamechar(c);
+         rc = lyxml_getutf8(input, &c, term_char_len));
+    LY_CHECK_ERR_RET(rc != LY_SUCCESS, LOGVAL(context->ctx, LY_VLOG_LINE, &context->line, LY_VCODE_INCHAR, (unsigned char)(*input)[0]), LY_EVALID);
+
+    (*term_char) = c;
+    return LY_SUCCESS;
+}
+/* Skip comments, CDATA sections and declarations and get the next element's [prefix:]name; *name stays NULL at EOF */
+LY_ERR
+lyxml_get_element(struct lyxml_context *context, const char **input, int UNUSED(options),
+                  const char **prefix, size_t *prefix_len, const char **name, size_t *name_len)
+{
+    struct ly_ctx *ctx = context->ctx; /* shortcut */
+    const char *in = (*input);
+    const char *endtag;
+    const char *sectname;
+    const char *id;
+    size_t endtag_len, newlines;
+    bool loop = true;
+    unsigned int c;
+    LY_ERR rc;
+    uint32_t x;
+
+    /* initialize output variables, on success they point into the input data (nothing is copied) */
+    (*prefix) = (*name) = NULL;
+    (*prefix_len) = (*name_len) = 0;
+
+    while (loop) {
+        ign_xmlws(context, in);
+
+        if (in[0] == '\0') {
+            /* EOF */
+            goto success;
+        } else if (in[0] != '<') {
+            return LY_EINVAL;
+        }
+        move_input(context, in, 1);
+
+        if (in[0] == '!') {
+            move_input(context, in, 1);
+            /* sections to ignore */
+            if (!strncmp(in, "--", 2)) {
+                /* comment */
+                move_input(context, in, 2);
+                sectname = "Comment";
+                endtag = "-->";
+                endtag_len = 3;
+            } else if (!strncmp(in, "[CDATA[", 7)) {
+                /* CDATA section */
+                move_input(context, in, 7);
+                sectname = "CData";
+                endtag = "]]>";
+                endtag_len = 3;
+            } else {
+                /* Document type declaration or an unknown section - not supported, endtag would be used uninitialized */
+                LOGVAL(ctx, LY_VLOG_LINE, &context->line, LY_VCODE_NSUPP, !strncmp(in, "DOCTYPE", 7) ? "Document Type Declaration" : "Unknown XML section");
+                return LY_EVALID;
+            }
+            in = ign_todelim(in, endtag, endtag_len, &newlines);
+            LY_CHECK_ERR_RET(!in, LOGVAL(ctx, LY_VLOG_LINE, &context->line, LY_VCODE_NTERM, sectname), LY_EVALID);
+            context->line += newlines;
+            in += endtag_len;
+        } else if (in[0] == '?') {
+            in = ign_todelim(in, "?>", 2, &newlines);
+            LY_CHECK_ERR_RET(!in, LOGVAL(ctx, LY_VLOG_LINE, &context->line, LY_VCODE_NTERM, "Declaration"), LY_EVALID);
+            context->line += newlines;
+            in += 2;
+        } else {
+            /* element */
+            ign_xmlws(context, in);
+            LY_CHECK_ERR_RET(!in[0], LOGVAL(ctx, LY_VLOG_LINE, &context->line, LY_VCODE_EOF), LY_EVALID);
+
+            /* remember the identifier start before checking its format */
+            id = in;
+            rc = lyxml_check_qname(context, &in, &c, &endtag_len);
+            LY_CHECK_RET(rc);
+            if (c == ':') {
+                /* we have prefixed identifier, endtag temporarily marks the colon */
+                endtag = in - endtag_len;
+
+                rc = lyxml_check_qname(context, &in, &c, &endtag_len);
+                LY_CHECK_RET(rc);
+
+                (*prefix) = id;
+                (*prefix_len) = endtag - id;
+                id = endtag + 1;
+            }
+            if (!is_xmlws(c) && c != '/' && c != '>') {
+                in = in - endtag_len;
+                x = 0;
+                memcpy(&x, in, endtag_len);
+                LOGVAL(ctx, LY_VLOG_LINE, &context->line, LY_VCODE_INCHAR, x);
+                return LY_EVALID;
+            }
+            in = in - endtag_len;
+            (*name) = id;
+            (*name_len) = in - id;
+
+            loop = false;
+        }
+    }
+
+success:
+    /* move caller's input */
+    (*input) = in;
+    return LY_SUCCESS;
+}
+
diff --git a/src/xml.h b/src/xml.h
new file mode 100644
index 0000000..759c422
--- /dev/null
+++ b/src/xml.h
@@ -0,0 +1,39 @@
+/**
+ * @file xml.h
+ * @author Radek Krejci <rkrejci@cesnet.cz>
+ * @brief Generic XML parser routines.
+ *
+ * Copyright (c) 2015 - 2018 CESNET, z.s.p.o.
+ *
+ * This source code is licensed under BSD 3-Clause License (the "License").
+ * You may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     https://opensource.org/licenses/BSD-3-Clause
+ */
+
+#ifndef LY_XML_H_
+#define LY_XML_H_
+
+#include <stdint.h>
+
+#include "context.h"
+#include "set.h"
+
+struct lyxml_ns {
+    char *prefix;
+    char *ns;
+};
+
+enum LYXML_PARSER_STATUS {
+    LYXML_STATUS_CDSECT,  /* CDATA section */
+    LYXML_STATUS_COMMENT, /* XML comment */
+};
+
+struct lyxml_context {
+    struct ly_ctx *ctx; /* libyang context passed to LOGVAL() when the parser reports an error */
+    uint64_t line;      /* current line number in the input, increased for every newline the parser passes */
+    struct ly_set ns;   /* NOTE(review): not used by xml.c yet, presumably the set of known namespaces - confirm */
+};
+
+#endif /* LY_XML_H_ */
diff --git a/tests/src/CMakeLists.txt b/tests/src/CMakeLists.txt
index cf68357..77eead2 100644
--- a/tests/src/CMakeLists.txt
+++ b/tests/src/CMakeLists.txt
@@ -2,11 +2,13 @@
     src_set
     src_common
     src_context
-    src_hash_table)
+    src_hash_table
+    src_xml)
 set(local_tests_wraps
     " "
     "-Wl,--wrap=realloc"
     "-Wl,--wrap=ly_set_add"
+    " "
     " ")
 set(tests ${tests} ${local_tests} PARENT_SCOPE)
 set(tests_wraps ${tests_wraps} ${local_tests_wraps} PARENT_SCOPE)
diff --git a/tests/src/xml.c b/tests/src/xml.c
new file mode 100644
index 0000000..3731fbe
--- /dev/null
+++ b/tests/src/xml.c
@@ -0,0 +1,166 @@
+/*
+ * @file xml.c
+ * @author: Radek Krejci <rkrejci@cesnet.cz>
+ * @brief unit tests for functions from xml.c
+ *
+ * Copyright (c) 2018 CESNET, z.s.p.o.
+ *
+ * This source code is licensed under BSD 3-Clause License (the "License").
+ * You may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     https://opensource.org/licenses/BSD-3-Clause
+ */
+
+#define _BSD_SOURCE
+#define _DEFAULT_SOURCE
+#include <stdarg.h>
+#include <stddef.h>
+#include <setjmp.h>
+#include <cmocka.h>
+
+#include <stdio.h>
+#include <string.h>
+
+#include "libyang.h"
+#include "../../src/xml.c"
+
+#define BUFSIZE 1024
+char logbuf[BUFSIZE] = {0};
+
+/* set to 0 to print error messages to stderr instead of checking them in code */
+#define ENABLE_LOGGER_CHECKING 1
+/* Log callback storing the latest message (followed by its path, if any) in logbuf so the tests can check it */
+static void
+logger(LY_LOG_LEVEL level, const char *msg, const char *path)
+{
+    (void) level; /* unused */
+
+    if (path) {
+        snprintf(logbuf, BUFSIZE - 1, "%s %s", msg, path);
+    } else {
+        snprintf(logbuf, sizeof logbuf, "%s", msg); /* always NUL-terminated, unlike strncpy() */
+    }
+}
+
+static int
+logger_setup(void **state)
+{
+    (void) state; /* unused */
+#if ENABLE_LOGGER_CHECKING
+    ly_set_log_clb(logger, 1);
+#endif
+    return 0;
+}
+
+void
+logbuf_clean(void)
+{
+    logbuf[0] = '\0';
+}
+
+#if ENABLE_LOGGER_CHECKING
+#   define logbuf_assert(str) assert_string_equal(logbuf, str)
+#else
+#   define logbuf_assert(str)
+#endif
+
+static void
+test_element(void **state)
+{
+    (void) state; /* unused */
+
+    size_t name_len, prefix_len;
+    const char *name, *prefix;
+    const char *str, *p;
+
+    struct lyxml_context ctx;
+    memset(&ctx, 0, sizeof ctx);
+    ctx.line = 1;
+
+    /* empty */
+    str = "";
+    assert_int_equal(LY_SUCCESS, lyxml_get_element(&ctx, &str, 0, &prefix, &prefix_len, &name, &name_len));
+    assert_null(name);
+    assert_true(str[0] == '\0');
+
+    /* no element */
+    logbuf_clean();
+    str = p = "no data present";
+    assert_int_equal(LY_EINVAL, lyxml_get_element(&ctx, &str, 0, &prefix, &prefix_len, &name, &name_len));
+    assert_null(name);
+    assert_ptr_equal(p, str); /* input data not eaten */
+    logbuf_assert("");
+
+    /* not supported DOCTYPE */
+    str = p = "<!DOCTYPE greeting SYSTEM \"hello.dtd\"><greeting/>";
+    assert_int_equal(LY_EVALID, lyxml_get_element(&ctx, &str, 0, &prefix, &prefix_len, &name, &name_len));
+    assert_null(name);
+    assert_ptr_equal(p, str); /* input data not eaten */
+    logbuf_assert("Document Type Declaration not supported. Line number 1.");
+
+    /* unqualified element */
+    str = "  <  element/>";
+    assert_int_equal(LY_SUCCESS, lyxml_get_element(&ctx, &str, 0, &prefix, &prefix_len, &name, &name_len));
+    assert_null(prefix);
+    assert_false(strncmp("element", name, name_len));
+    assert_int_equal(7, name_len);
+    assert_string_equal("/>", str);
+
+    str = "<?xml version=\"1.0\"?>  <!-- comment --> <?TEST xxx?> <element/>";
+    assert_int_equal(LY_SUCCESS, lyxml_get_element(&ctx, &str, 0, &prefix, &prefix_len, &name, &name_len));
+    assert_null(prefix);
+    assert_false(strncmp("element", name, name_len));
+    assert_int_equal(7, name_len);
+    assert_string_equal("/>", str);
+
+    str = "<element xmlns=\"urn\"></element>";
+    assert_int_equal(LY_SUCCESS, lyxml_get_element(&ctx, &str, 0, &prefix, &prefix_len, &name, &name_len));
+    assert_null(prefix);
+    assert_false(strncmp("element", name, name_len));
+    assert_int_equal(7, name_len);
+    assert_string_equal(" xmlns=\"urn\"></element>", str);
+
+    /* qualified element */
+    str = "  <  yin:element/>";
+    assert_int_equal(LY_SUCCESS, lyxml_get_element(&ctx, &str, 0, &prefix, &prefix_len, &name, &name_len));
+    assert_false(strncmp("yin", prefix, prefix_len));
+    assert_false(strncmp("element", name, name_len));
+    assert_int_equal(3, prefix_len);
+    assert_int_equal(7, name_len);
+    assert_string_equal("/>", str);
+
+    str = "<yin:element xmlns=\"urn\"></element>";
+    assert_int_equal(LY_SUCCESS, lyxml_get_element(&ctx, &str, 0, &prefix, &prefix_len, &name, &name_len));
+    assert_false(strncmp("yin", prefix, prefix_len));
+    assert_false(strncmp("element", name, name_len));
+    assert_int_equal(3, prefix_len);
+    assert_int_equal(7, name_len);
+    assert_string_equal(" xmlns=\"urn\"></element>", str);
+
+    /* UTF8 characters */
+    str = "<𠜎€𠜎Øn:𠜎€𠜎Øn/>";
+    assert_int_equal(LY_SUCCESS, lyxml_get_element(&ctx, &str, 0, &prefix, &prefix_len, &name, &name_len));
+    assert_false(strncmp("𠜎€𠜎Øn", prefix, prefix_len));
+    assert_false(strncmp("𠜎€𠜎Øn", name, name_len));
+    assert_int_equal(14, prefix_len);
+    assert_int_equal(14, name_len);
+    assert_string_equal("/>", str);
+
+    /* valid UTF-8 characters that are not allowed in XML names */
+    str = "<¢:element>";
+    assert_int_equal(LY_EVALID, lyxml_get_element(&ctx, &str, 0, &prefix, &prefix_len, &name, &name_len));
+    logbuf_assert("Identifier \"¢:element>\" starts with invalid character. Line number 1.");
+    str = "<yin:c⁐element>";
+    assert_int_equal(LY_EVALID, lyxml_get_element(&ctx, &str, 0, &prefix, &prefix_len, &name, &name_len));
+    logbuf_assert("Invalid character 0x9081e2. Line number 1.");
+}
+
+int main(void)
+{
+    const struct CMUnitTest tests[] = {
+        cmocka_unit_test_setup(test_element, logger_setup),
+    };
+
+    return cmocka_run_group_tests(tests, NULL, NULL);
+}