xml parser CHANGE handle mixed XML content

Mixed XML content is not allowed; do the detection inside the XML parser so that callers do not need to check for it.

commit: 339e2dee922960fd72b1d036f9483e15b590dba5 [log] [tgz]
author: Radek Krejci <rkrejci@cesnet.cz> Fri May 17 14:28:24 2019 +0200
committer: Radek Krejci <rkrejci@cesnet.cz> Fri May 17 14:29:54 2019 +0200
tree: b87484dec795d8d27942fee116be2480b0f38757
parent: 17dca99515a2155eebd07350c864b8bc799240b4 [diff]
diff --git a/src/parser_xml.c b/src/parser_xml.c
index 066a4d1..86ae99a 100644
--- a/src/parser_xml.c
+++ b/src/parser_xml.c

@@ -217,7 +217,12 @@
 
             if (ctx->status == LYXML_ELEM_CONTENT) {
                 /* get the value */
-                lyxml_get_string((struct lyxml_context *)ctx, data, &buffer, &buffer_size, &value, &value_len, &dynamic);
+                LY_ERR r = lyxml_get_string((struct lyxml_context *)ctx, data, &buffer, &buffer_size, &value, &value_len, &dynamic);
+                if (r == LY_EINVAL) {
+                    /* just indentation of a child element found */
+                    LOGVAL(ctx->ctx, LY_VLOG_LINE, &ctx->line, LYVE_SYNTAX, "Child element inside terminal node \"%s\" found.", cur->schema->name);
+                    goto cleanup;
+                }
                 lyd_value_validate((struct lyd_node_term*)cur, value, value_len,
                                    LY_TYPE_VALIDATE_CANONIZE | (dynamic ? LY_TYPE_VALIDATE_DYNAMIC : 0));
             }

diff --git a/src/xml.c b/src/xml.c
index 59991d6..a318420 100644
--- a/src/xml.c
+++ b/src/xml.c

@@ -311,9 +311,28 @@
         LY_CHECK_ERR_RET(!in[offset], LOGVAL(ctx, LY_VLOG_LINE, &context->line, LY_VCODE_EOF), LY_EVALID);
         context->line += newlines;
         if (in[offset] == '<') {
+            const char *name, *prefix;
+            size_t name_len, prefix_len;
+
             (*input) = in + offset;
-            context->status -= 1; /* LYXML_ELEMENT */;
-            return LY_EINVAL;
+
+            /* get know if it is child element (indentation) or closing element (whitespace-only content) */
+            in = *input;
+            rc = lyxml_get_element(context, &in, &prefix, &prefix_len, &name, &name_len);
+            if (name) {
+                /* the element here is not closing element, so we have the just indentation formatting before the child */
+                free(context->elements.objs[--context->elements.count]);
+                context->status -= 1; /* LYXML_ELEMENT */
+                return LY_EINVAL;
+            } else if (rc) {
+                /* some parsing error, so pass it */
+                (*input) = in;
+                return rc;
+            } else {
+                /* whitespace-only content */
+                len = offset - 1;
+                goto success;
+            }
         }
     }
     /* init */
@@ -427,7 +446,29 @@
             /* in case of element content, keep the leading <,
              * for attribute's value move after the terminating quotation mark */
             if (context->status == LYXML_ELEM_CONTENT) {
+                const char *name, *prefix;
+                size_t name_len, prefix_len;
+
                 in += offset;
+
+                /* get know if it is child element (mixed content) or closing element (regular content) */
+                (*input) = in;
+                rc = lyxml_get_element(context, &in, &prefix, &prefix_len, &name, &name_len);
+                if (name) {
+                    /* the element here is not closing element, so we have not allowed mixed content */
+                    struct lyxml_elem *e = (struct lyxml_elem*)context->elements.objs[--context->elements.count];
+                    LOGVAL(ctx, LY_VLOG_LINE, &context->line, LYVE_SYNTAX, "Mixed XML content is not allowed (%.*s).",
+                           offset + (in - (*input)), &(*input)[-offset]);
+                    free(e);
+                    return LY_EVALID;
+                } else if (rc) {
+                    /* some parsing error, so pass it */
+                    return rc;
+                } else {
+                    /* closing element, so we have regular content */
+                    context->status++;
+                    goto success;
+                }
             } else {
                 in += offset + 1;
             }

diff --git a/tests/src/test_xml.c b/tests/src/test_xml.c
index ac67d9b..216e6b2 100644
--- a/tests/src/test_xml.c
+++ b/tests/src/test_xml.c

@@ -207,6 +207,16 @@
     str = "<yin:c⁐element>";
     assert_int_equal(LY_EVALID, lyxml_get_element(&ctx, &str, &prefix, &prefix_len, &name, &name_len));
     logbuf_assert("Invalid character sequence \"⁐element>\", expected whitespace or element tag termination ('>' or '/>'. Line number 1.");
+
+    /* mixed content */
+    str = "<a>text <b>x</b></a>";
+    assert_int_equal(LY_SUCCESS, lyxml_get_element(&ctx, &str, &prefix, &prefix_len, &name, &name_len));
+    assert_string_equal("text <b>x</b></a>", str);
+    assert_int_equal(LYXML_ELEM_CONTENT, ctx.status);
+    assert_int_equal(LY_EVALID, lyxml_get_string(&ctx, &str, &buf, &buf_len, &out, &len, &dynamic));
+    logbuf_assert("Mixed XML content is not allowed (text <b>). Line number 1.");
+    lyxml_context_clear(&ctx);
+
 }
 
 static void
@@ -278,6 +288,8 @@
     int dynamic;
     const char *str, *p;
     char *buf = NULL, *out = NULL;
+    const char *prefix, *name;
+    size_t prefix_len, name_len;
 
     struct lyxml_context ctx;
     memset(&ctx, 0, sizeof ctx);
@@ -306,10 +318,12 @@
 
     /* empty element content - only formating before defining child */
     ctx.status = LYXML_ELEM_CONTENT;
-    str = "\n  <";
+    str = "<x>\n  <y>";
+    assert_int_equal(LY_SUCCESS, lyxml_get_element(&ctx, &str, &prefix, &prefix_len, &name, &name_len));
     assert_int_equal(LY_EINVAL, lyxml_get_string(&ctx, &str, &buf, &buf_len, &out, &len, &dynamic));
     assert_null(buf);
-    assert_string_equal("<", str);
+    assert_string_equal("<y>", str);
+    lyxml_context_clear(&ctx);
 
     /* empty element content is invalid - missing content terminating character < */
     ctx.status = LYXML_ELEM_CONTENT;
@@ -327,7 +341,8 @@
 
     /* valid strings */
     ctx.status = LYXML_ELEM_CONTENT;
-    str = "€𠜎Øn \n&lt;&amp;&quot;&apos;&gt; &#82;&#x4f;&#x4B;<";
+    str = "<a>€𠜎Øn \n&lt;&amp;&quot;&apos;&gt; &#82;&#x4f;&#x4B;</a>";
+    assert_int_equal(LY_SUCCESS, lyxml_get_element(&ctx, &str, &prefix, &prefix_len, &name, &name_len));
     assert_int_equal(LY_SUCCESS, lyxml_get_string(&ctx, &str, &buf, &buf_len, &out, &len, &dynamic));
     assert_int_not_equal(0, dynamic);
     assert_non_null(buf);
@@ -335,8 +350,9 @@
     assert_int_equal(22, buf_len);
     assert_int_equal(21, len);
     assert_string_equal("€𠜎Øn \n<&\"\'> ROK", buf);
-    assert_string_equal("<", str);
+    assert_string_equal("", str);
     assert_int_equal(LYXML_ELEMENT, ctx.status);
+    lyxml_context_clear(&ctx);
 
     /* test using n-bytes UTF8 hexadecimal code points */
     ctx.status = LYXML_ATTR_CONTENT;
commit	339e2dee922960fd72b1d036f9483e15b590dba5	[log] [tgz]
author	Radek Krejci <rkrejci@cesnet.cz>	Fri May 17 14:28:24 2019 +0200
committer	Radek Krejci <rkrejci@cesnet.cz>	Fri May 17 14:29:54 2019 +0200
tree	b87484dec795d8d27942fee116be2480b0f38757
parent	17dca99515a2155eebd07350c864b8bc799240b4 [diff]