common CHANGE make utf8 getter more generic for other parsers, not only for XML
diff --git a/src/common.c b/src/common.c
index c3a737d..912ee13 100644
--- a/src/common.c
+++ b/src/common.c
@@ -152,6 +152,82 @@
return new_mem;
}
+/**
+ * @brief Decode one UTF-8 encoded character and move the input past it.
+ *
+ * Only shortest-form encodings are accepted, and only of the code points
+ * #x9, #xA, #xD, [#x20-#xD7FF], [#xE000-#xFFFD] and [#x10000-#x10FFFF].
+ *
+ * @param[in,out] input Address of the position to read from. On success it
+ * is advanced by the length of the character, on failure it is unchanged.
+ * @param[out] utf8_char Decoded code point, written only on success.
+ * @param[out] bytes_read Optional; on success the number of bytes consumed.
+ * @return LY_SUCCESS when a character was decoded.
+ * @return LY_EINVAL on a malformed, overlong or disallowed character; a NUL
+ * byte at the current position is handed to LY_CHECK_RET() with LY_EINVAL.
+ */
+LY_ERR
+ly_getutf8(const char **input, unsigned int *utf8_char, size_t *bytes_read)
+{
+    unsigned int c, len, min, i;
+    int aux;
+
+    c = (unsigned char)(*input)[0];
+    LY_CHECK_RET(!c, LY_EINVAL);
+
+    /* the leading byte gives the length, first payload bits and lowest valid value */
+    if (!(c & 0x80)) {
+        /* one byte character */
+        len = 1;
+        min = 0;
+    } else if ((c & 0xe0) == 0xc0) {
+        /* two bytes character */
+        len = 2;
+        min = 0x80;
+        c &= 0x1f;
+    } else if ((c & 0xf0) == 0xe0) {
+        /* three bytes character */
+        len = 3;
+        min = 0x800;
+        c &= 0x0f;
+    } else if ((c & 0xf8) == 0xf0) {
+        /* four bytes character, the first one not fitting 3 bytes is 0x10000 */
+        len = 4;
+        min = 0x10000;
+        c &= 0x07;
+    } else {
+        /* continuation byte or a byte that cannot start a character */
+        return LY_EINVAL;
+    }
+
+    /* every continuation byte (10xxxxxx) adds 6 payload bits; NUL fails the test */
+    for (i = 1; i < len; i++) {
+        aux = (*input)[i];
+        if ((aux & 0xc0) != 0x80) {
+            return LY_EINVAL;
+        }
+        c = (c << 6) | (aux & 0x3f);
+    }
+
+    if (c < min) {
+        /* overlong encoding */
+        return LY_EINVAL;
+    } else if (c < 0x20 && c != 0x9 && c != 0xa && c != 0xd) {
+        /* control character other than tab, line feed and carriage return */
+        return LY_EINVAL;
+    } else if ((c > 0xd7ff && c < 0xe000) || c == 0xfffe || c == 0xffff || c > 0x10ffff) {
+        /* surrogate, U+FFFE/U+FFFF or a value beyond the last code point */
+        return LY_EINVAL;
+    }
+
+    (*utf8_char) = c;
+    (*input) += len;
+    if (bytes_read) {
+        (*bytes_read) = len;
+    }
+    return LY_SUCCESS;
+}
+
size_t
LY_VCODE_INSTREXP_len(const char *str)
{