xml CHANGE process closing element tags
diff --git a/src/xml.c b/src/xml.c
index fe1f2cd..4af078d 100644
--- a/src/xml.c
+++ b/src/xml.c
@@ -14,6 +14,7 @@
#define _POSIX_C_SOURCE 200809L /* strndup() */
+#include <assert.h>
#include <ctype.h>
#include <stdbool.h>
#include <stdint.h>
@@ -50,7 +51,7 @@
/* Move input p by s characters, if EOF log with lyxml_context c */
#define move_input(c,p,s) p += s; LY_CHECK_ERR_RET(!p[0], LOGVAL(c->ctx, LY_VLOG_LINE, &c->line, LY_VCODE_EOF), LY_EVALID)
-/* Ignore whitespaces in the input string p, if EOF log with lyxml_context c */
+/* Ignore whitespaces in the input string p */
#define ign_xmlws(c,p) while (is_xmlws(*(p))) {if (*(p) == '\n') {++c->line;} ++p;}
/**
@@ -215,6 +216,9 @@
bool empty_content = false;
LY_ERR rc;
+ assert(context);
+ assert(context->status == LYXML_ELEM_CONTENT || context->status == LYXML_ATTR_CONTENT);
+
if (in[0] == '\'') {
delim = '\'';
++in;
@@ -343,8 +347,8 @@
memcpy(&buf[len], in, offset);
len += offset;
/* in case of element content, keep the leading <,
- * for attribute's value mova after the terminating quotation mark */
- if (delim == '<') {
+ * for attribute's value move after the terminating quotation mark */
+ if (context->status == LYXML_ELEM_CONTENT) {
in += offset;
} else {
in += offset + 1;
@@ -381,6 +385,7 @@
/* set terminating NULL byte */
buf[len] = '\0';
+ context->status -= 1;
(*input) = in;
(*buffer) = buf;
(*buffer_size) = size;
@@ -413,8 +418,15 @@
if (in[0] == '\0') {
/* EOF - not expected at this place */
return LY_EINVAL;
- } else if (in[0] == '>' || in[0] == '/') {
- /* element terminated by > or /> */
+ } else if (in[0] == '>') {
+ /* element terminated by > - termination of the opening tag */
+ context->status = LYXML_ELEM_CONTENT;
+ ++in;
+ goto success;
+ } else if (in[0] == '/' && in[1] == '>') {
+ /* element terminated by /> - termination of an empty element */
+ context->status = LYXML_ELEMENT;
+ in += 2;
goto success;
}
@@ -451,9 +463,11 @@
++in;
ign_xmlws(context, in);
if (in[0] != '\'' && in[0] != '"') {
- LOGVAL(ctx, LY_VLOG_LINE, &context->line, LY_VCODE_INSTREXP, LY_VCODE_INSTREXP_len(in), in, "either single or double quotation mark");
+ LOGVAL(ctx, LY_VLOG_LINE, &context->line, LY_VCODE_INSTREXP,
+ LY_VCODE_INSTREXP_len(in), in, "either single or double quotation mark");
return LY_EVALID;
}
+ context->status = LYXML_ATTR_CONTENT;
success:
/* move caller's input */
@@ -471,9 +485,10 @@
const char *sectname;
const char *id;
size_t endtag_len, newlines;
- bool loop = true;
+ bool loop = true, closing = false;
unsigned int c;
LY_ERR rc;
+ struct lyxml_elem *e;
/* initialize output variables */
(*prefix) = (*name) = NULL;
@@ -484,6 +499,7 @@
if (in[0] == '\0') {
/* EOF */
+ context->status = LYXML_END;
goto success;
} else if (in[0] != '<') {
return LY_EINVAL;
@@ -519,8 +535,14 @@
LY_CHECK_ERR_RET(!in, LOGVAL(ctx, LY_VLOG_LINE, &context->line, LY_VCODE_NTERM, "Declaration"), LY_EVALID);
context->line += newlines;
in += 2;
+ } else if (in[0] == '/') {
+ /* closing element */
+ closing = true;
+ ++in;
+ goto element;
} else {
/* element */
+element:
ign_xmlws(context, in);
LY_CHECK_ERR_RET(!in[0], LOGVAL(ctx, LY_VLOG_LINE, &context->line, LY_VCODE_EOF), LY_EVALID);
@@ -545,10 +567,70 @@
"whitespace or element tag termination ('>' or '/>'");
return LY_EVALID;
}
- in = in - endtag_len;
(*name) = id;
- (*name_len) = in - id;
+ (*name_len) = in - endtag_len - id;
+ if (is_xmlws(c)) {
+ /* go to the next meaningful input */
+ ign_xmlws(context, in);
+ LY_CHECK_ERR_RET(!in[0], LOGVAL(ctx, LY_VLOG_LINE, &context->line, LY_VCODE_EOF), LY_EVALID);
+ c = in[0];
+ ++in;
+ endtag_len = 1;
+ }
+
+ if (closing) {
+ /* match opening and closing element tags */
+ LY_CHECK_ERR_RET(
+ !context->elements.count,
+ LOGVAL(ctx, LY_VLOG_LINE, &context->line, LYVE_SYNTAX, "Opening and closing elements tag missmatch (\"%.*s\").", name_len, *name),
+ LY_EVALID);
+ e = (struct lyxml_elem*)context->elements.objs[context->elements.count - 1];
+ LY_CHECK_ERR_RET(e->prefix_len != *prefix_len || e->name_len != *name_len
+ || (*prefix_len && strncmp(*prefix, e->prefix, e->prefix_len)) || strncmp(*name, e->name, e->name_len),
+ LOGVAL(ctx, LY_VLOG_LINE, &context->line, LYVE_SYNTAX, "Opening and closing elements tag missmatch (\"%.*s\").", name_len, *name),
+ LY_EVALID);
+ /* opening and closing element tags match, remove the record from the opening tags list */
+ free(e);
+ --context->elements.count;
+ /* do not return element information to announce closing element being currently processed */
+ *name = *prefix = NULL;
+ *name_len = *prefix_len = 0;
+
+ if (c == '>') {
+ /* end of closing element */
+ context->status = LYXML_ELEMENT;
+ } else {
+ in -= endtag_len;
+ LOGVAL(ctx, LY_VLOG_LINE, &context->line, LYVE_SYNTAX, "Unexpected data \"%.*s\" in closing element tag.",
+ LY_VCODE_INSTREXP_len(in), in);
+ return LY_EVALID;
+ }
+ } else {
+ if (c == '>') {
+ /* end of opening element */
+ context->status = LYXML_ELEM_CONTENT;
+ } else if (c == '/' && in[0] == '>') {
+ /* empty element closing */
+ context->status = LYXML_ELEMENT;
+ ++in;
+ } else {
+ /* attribute */
+ context->status = LYXML_ATTRIBUTE;
+ in -= endtag_len;
+ }
+
+ if (context->status != LYXML_ELEMENT) {
+ /* store element opening tag information */
+ e = malloc(sizeof *e);
+ LY_CHECK_ERR_RET(!e, LOGMEM(ctx), LY_EMEM);
+ e->name = *name;
+ e->prefix = *prefix;
+ e->name_len = *name_len;
+ e->prefix_len = *prefix_len;
+ ly_set_add(&context->elements, e, LY_SET_OPT_USEASLIST);
+ }
+ }
loop = false;
}
}
@@ -625,3 +707,18 @@
return LY_SUCCESS;
}
+
+void
+lyxml_context_clear(struct lyxml_context *context)
+{
+ unsigned int u;
+ /* element records are single malloc'ed structs whose name/prefix point into the input, so plain free is handed to ly_set_erase() for them */
+ ly_set_erase(&context->elements, free);
+ for (u = context->ns.count - 1; u + 1 > 0; --u) { /* walk from the last item down; u is unsigned, so it wraps after index 0 and u + 1 becomes 0 (also covers an empty set) */
+ /* free the prefix and URI strings of the ns record first, then the record itself */
+ free(((struct lyxml_ns *)context->ns.objs[u])->prefix);
+ free(((struct lyxml_ns *)context->ns.objs[u])->uri);
+ free(context->ns.objs[u]);
+ }
+ ly_set_erase(&context->ns, NULL); /* the items were already freed in the loop above, so no per-item destructor is passed */
+}
diff --git a/src/xml.h b/src/xml.h
index f0fbb4e..f41a836 100644
--- a/src/xml.h
+++ b/src/xml.h
@@ -26,14 +26,27 @@
char *uri; /* namespace URI */
};
+/* element tag identifier for matching opening and closing tags; one record is stored per opening tag that is not yet closed */
+struct lyxml_elem {
+ const char *prefix; /* prefix as reported by lyxml_get_element(); stored as-is (not copied) and compared only when prefix_len is non-zero */
+ const char *name; /* element name; points into the parsed input data (not copied), valid for name_len bytes */
+ size_t prefix_len; /* length of prefix in bytes, 0 when the element has no prefix */
+ size_t name_len; /* length of name in bytes */
+};
+
enum LYXML_PARSER_STATUS {
- LYXML_STATUS_CDSECT, /* CDATA section */
- LYXML_STATUS_COMMENT, /* XML comment */
+ LYXML_ELEMENT, /* expecting XML element, call lyxml_get_element() */
+ LYXML_ELEM_CONTENT, /* expecting content of an element, call lyxml_get_string(); must directly follow LYXML_ELEMENT because lyxml_get_string() decrements the status by one when done */
+ LYXML_ATTRIBUTE, /* expecting XML attribute, call lyxml_get_attribute() */
+ LYXML_ATTR_CONTENT, /* expecting value of an attribute, call lyxml_get_string(); must directly follow LYXML_ATTRIBUTE for the same reason */
+ LYXML_END /* end of input data */
};
struct lyxml_context {
struct ly_ctx *ctx;
uint64_t line;
+ enum LYXML_PARSER_STATUS status; /* status providing information about the next expected object in input data */
+ struct ly_set elements; /* list of not-yet-closed elements (struct lyxml_elem records); the last item is the innermost open element, which a closing tag must match */
struct ly_set ns; /* handled with LY_SET_OPT_USEASLIST */
};
@@ -52,8 +65,9 @@
* @param[in] options Currently unused options to modify input processing.
* @param[out] prefix Pointer to prefix if present in the element name, NULL otherwise.
* @param[out] prefix_len Length of the prefix if any.
- * @param[out] name Element name. LY_SUCCESS can be returned with NULL name only in case the
- * end of the input string was reached (EOF).
+ * @param[out] name Element name. When LY_SUCCESS is returned but name is NULL, check context's status field:
+ * - LYXML_END - end of input was reached
+ * - LYXML_ELEMENT - closing element found, expecting sibling element so call lyxml_get_element() again
* @param[out] name_len Length of the element name.
* @return LY_ERR values.
*/
@@ -75,7 +89,8 @@
* @param[out] prefix Pointer to prefix if present in the attribute name, NULL otherwise.
* @param[out] prefix_len Length of the prefix if any.
* @param[out] name Attribute name. LY_SUCCESS can be returned with NULL name only in case the
- * end of the element tag was reached.
+ * end of the element tag was reached. According to the context's status field, the opening tag was read
+ * (LYXML_ELEM_CONTENT) or empty element was closed (LYXML_ELEMENT).
* @param[out] name_len Length of the element name.
* @return LY_ERR values.
*/
@@ -149,4 +164,11 @@
*/
LY_ERR lyxml_ns_rm(struct lyxml_context *context, const char *element_name);
+/**
+ * @brief Free the working memory held by the context: the records of not-yet-closed elements and the stored namespaces with their prefix and URI strings.
+ *
+ * @param[in] context XML context to clear; the context structure itself is not freed.
+ */
+void lyxml_context_clear(struct lyxml_context *context);
+
#endif /* LY_XML_H_ */
diff --git a/tests/src/test_xml.c b/tests/src/test_xml.c
index 9dcc250..3288a9b 100644
--- a/tests/src/test_xml.c
+++ b/tests/src/test_xml.c
@@ -82,8 +82,15 @@
str = "";
assert_int_equal(LY_SUCCESS, lyxml_get_element(&ctx, &str, &prefix, &prefix_len, &name, &name_len));
assert_null(name);
+ assert_int_equal(LYXML_END, ctx.status);
assert_true(str[0] == '\0');
+ /* end element */
+ str = "</element>";
+ assert_int_equal(LY_EVALID, lyxml_get_element(&ctx, &str, &prefix, &prefix_len, &name, &name_len));
+ logbuf_assert("Opening and closing elements tag missmatch (\"element>\"). Line number 1.");
+
+
/* no element */
logbuf_clean();
str = p = "no data present";
@@ -105,21 +112,27 @@
assert_null(prefix);
assert_false(strncmp("element", name, name_len));
assert_int_equal(7, name_len);
- assert_string_equal("/>", str);
+ assert_int_equal(LYXML_ELEMENT, ctx.status);
+ assert_string_equal("", str);
str = "<?xml version=\"1.0\"?> <!-- comment --> <![CDATA[<greeting>Hello, world!</greeting>]]> <?TEST xxx?> <element/>";
assert_int_equal(LY_SUCCESS, lyxml_get_element(&ctx, &str, &prefix, &prefix_len, &name, &name_len));
assert_null(prefix);
assert_false(strncmp("element", name, name_len));
assert_int_equal(7, name_len);
- assert_string_equal("/>", str);
+ assert_int_equal(LYXML_ELEMENT, ctx.status);
+ assert_string_equal("", str);
str = "<element xmlns=\"urn\"></element>";
assert_int_equal(LY_SUCCESS, lyxml_get_element(&ctx, &str, &prefix, &prefix_len, &name, &name_len));
assert_null(prefix);
assert_false(strncmp("element", name, name_len));
assert_int_equal(7, name_len);
- assert_string_equal(" xmlns=\"urn\"></element>", str);
+ assert_int_equal(LYXML_ATTRIBUTE, ctx.status);
+ assert_string_equal("xmlns=\"urn\"></element>", str);
+ /* clean context by getting closing tag */
+ str += 12;
+ assert_int_equal(LY_SUCCESS, lyxml_get_element(&ctx, &str, &prefix, &prefix_len, &name, &name_len));
/* qualified element */
str = " < yin:element/>";
@@ -128,7 +141,8 @@
assert_false(strncmp("element", name, name_len));
assert_int_equal(3, prefix_len);
assert_int_equal(7, name_len);
- assert_string_equal("/>", str);
+ assert_int_equal(LYXML_ELEMENT, ctx.status);
+ assert_string_equal("", str);
str = "<yin:element xmlns=\"urn\"></element>";
assert_int_equal(LY_SUCCESS, lyxml_get_element(&ctx, &str, &prefix, &prefix_len, &name, &name_len));
@@ -136,7 +150,13 @@
assert_false(strncmp("element", name, name_len));
assert_int_equal(3, prefix_len);
assert_int_equal(7, name_len);
- assert_string_equal(" xmlns=\"urn\"></element>", str);
+ assert_int_equal(LYXML_ATTRIBUTE, ctx.status);
+ assert_string_equal("xmlns=\"urn\"></element>", str);
+ /* clean context by getting closing tag */
+ str += 12;
+ assert_int_equal(LY_EVALID, lyxml_get_element(&ctx, &str, &prefix, &prefix_len, &name, &name_len));
+ logbuf_assert("Opening and closing elements tag missmatch (\"element>\"). Line number 1.");
+ lyxml_context_clear(&ctx);
/* UTF8 characters */
str = "<𠜎€𠜎Øn:𠜎€𠜎Øn/>";
@@ -145,7 +165,8 @@
assert_false(strncmp("𠜎€𠜎Øn", name, name_len));
assert_int_equal(14, prefix_len);
assert_int_equal(14, name_len);
- assert_string_equal("/>", str);
+ assert_int_equal(LYXML_ELEMENT, ctx.status);
+ assert_string_equal("", str);
/* invalid UTF-8 character */
str = "<¢:element>";
@@ -177,11 +198,13 @@
str = " />";
assert_int_equal(LY_SUCCESS, lyxml_get_attribute(&ctx, &str, &prefix, &prefix_len, &name, &name_len));
assert_null(name);
- assert_true(str[0] == '/');
+ assert_true(str[0] == '\0');
+ assert_int_equal(LYXML_ELEMENT, ctx.status);
str = ">";
assert_int_equal(LY_SUCCESS, lyxml_get_attribute(&ctx, &str, &prefix, &prefix_len, &name, &name_len));
assert_null(name);
- assert_true(str[0] == '>');
+ assert_true(str[0] == '\0');
+ assert_int_equal(LYXML_ELEM_CONTENT, ctx.status);
/* not an attribute */
str = p = "unknown/>";
@@ -210,6 +233,7 @@
assert_int_equal(0, prefix_len);
assert_false(strncmp("xmlns", name, name_len));
assert_string_equal("\"urn\">", str);
+ assert_int_equal(LYXML_ATTR_CONTENT, ctx.status);
str = "xmlns:nc\n = \'urn\'>";
assert_int_equal(LY_SUCCESS, lyxml_get_attribute(&ctx, &str, &prefix, &prefix_len, &name, &name_len));
@@ -221,6 +245,7 @@
assert_false(strncmp("xmlns", prefix, prefix_len));
assert_false(strncmp("nc", name, name_len));
assert_string_equal("\'urn\'>", str);
+ assert_int_equal(LYXML_ATTR_CONTENT, ctx.status);
}
static void
@@ -237,25 +262,32 @@
ctx.line = 1;
/* empty attribute value */
+ ctx.status = LYXML_ATTR_CONTENT;
str = "\"\"";
assert_int_equal(LY_SUCCESS, lyxml_get_string(&ctx, &str, &out, &out_len));
assert_non_null(out);
assert_int_equal(1, out_len);
assert_true(str[0] == '\0'); /* everything eaten */
assert_true(out[0] == '\0'); /* empty string */
+ assert_int_equal(LYXML_ATTRIBUTE, ctx.status);
+
+ ctx.status = LYXML_ATTR_CONTENT;
str = "\'\'";
assert_int_equal(LY_SUCCESS, lyxml_get_string(&ctx, &str, &out, &out_len));
assert_non_null(out);
assert_int_equal(1, out_len);
assert_true(str[0] == '\0'); /* everything eaten */
assert_true(out[0] == '\0'); /* empty string */
+ assert_int_equal(LYXML_ATTRIBUTE, ctx.status);
/* empty element content - only formating before defining child */
+ ctx.status = LYXML_ELEM_CONTENT;
str = "\n <";
assert_int_equal(LY_EINVAL, lyxml_get_string(&ctx, &str, &out, &out_len));
assert_string_equal("<", str);
/* empty element content is invalid - missing content terminating character < */
+ ctx.status = LYXML_ELEM_CONTENT;
str = "";
assert_int_equal(LY_EVALID, lyxml_get_string(&ctx, &str, &out, &out_len));
logbuf_assert("Unexpected end-of-file. Line number 2.");
@@ -263,6 +295,7 @@
free(out);
out = NULL;
+ ctx.status = LYXML_ELEM_CONTENT;
str = p = "xxx";
assert_int_equal(LY_EVALID, lyxml_get_string(&ctx, &str, &out, &out_len));
logbuf_assert("Unexpected end-of-file. Line number 2.");
@@ -272,43 +305,54 @@
out = NULL;
/* valid strings */
+ ctx.status = LYXML_ELEM_CONTENT;
str = "€𠜎Øn \n<&"'> ROK<";
assert_int_equal(LY_SUCCESS, lyxml_get_string(&ctx, &str, &out, &out_len));
assert_int_equal(22, out_len);
assert_string_equal("€𠜎Øn \n<&\"\'> ROK", out);
assert_string_equal("<", str);
+ assert_int_equal(LYXML_ELEMENT, ctx.status);
/* test using n-bytes UTF8 hexadecimal code points */
+ ctx.status = LYXML_ATTR_CONTENT;
str = "\'$¢€𐍈\'";
assert_int_equal(LY_SUCCESS, lyxml_get_string(&ctx, &str, &out, &out_len));
assert_string_equal("$¢€𐍈", out);
+ assert_int_equal(LYXML_ATTRIBUTE, ctx.status);
/* invalid characters in string */
+ ctx.status = LYXML_ATTR_CONTENT;
str = p = "\'R\'";
assert_int_equal(LY_EVALID, lyxml_get_string(&ctx, &str, &out, &out_len));
logbuf_assert("Invalid character sequence \"'\", expected ;. Line number 3.");
assert_ptr_equal(p, str); /* input data not eaten */
+ ctx.status = LYXML_ATTR_CONTENT;
str = p = "\"R\"";
assert_int_equal(LY_EVALID, lyxml_get_string(&ctx, &str, &out, &out_len));
logbuf_assert("Invalid character sequence \"\"\", expected ;. Line number 3.");
assert_ptr_equal(p, str); /* input data not eaten */
+ ctx.status = LYXML_ATTR_CONTENT;
str = p = "\"&nonsence;\"";
assert_int_equal(LY_EVALID, lyxml_get_string(&ctx, &str, &out, &out_len));
logbuf_assert("Entity reference \"&nonsence;\" not supported, only predefined references allowed. Line number 3.");
assert_ptr_equal(p, str); /* input data not eaten */
+ ctx.status = LYXML_ELEM_CONTENT;
str = p = "&#o122;";
assert_int_equal(LY_EVALID, lyxml_get_string(&ctx, &str, &out, &out_len));
logbuf_assert("Invalid character reference \"&#o122;\". Line number 3.");
assert_ptr_equal(p, str); /* input data not eaten */
+ ctx.status = LYXML_ATTR_CONTENT;
str = p = "\'\'";
assert_int_equal(LY_EVALID, lyxml_get_string(&ctx, &str, &out, &out_len));
logbuf_assert("Invalid character reference \"\'\" (0x00000006). Line number 3.");
assert_ptr_equal(p, str); /* input data not eaten */
+ ctx.status = LYXML_ATTR_CONTENT;
str = p = "\'\'";
assert_int_equal(LY_EVALID, lyxml_get_string(&ctx, &str, &out, &out_len));
logbuf_assert("Invalid character reference \"\'\" (0x0000fdd0). Line number 3.");
assert_ptr_equal(p, str); /* input data not eaten */
+ ctx.status = LYXML_ATTR_CONTENT;
str = p = "\'\'";
assert_int_equal(LY_EVALID, lyxml_get_string(&ctx, &str, &out, &out_len));
logbuf_assert("Invalid character reference \"\'\" (0x0000ffff). Line number 3.");