X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=lib%2Funicode.c;h=31612a25c74604999d3ef316d30931271a14f970;hb=7d78f21c057ff50a823220d809ac38c3d907243c;hp=69ebcfc9d8e8337515b8fae6ca8296d17be387d1;hpb=58fda1dab104041fc693032475ec4662c1a52849;p=sliver-openvswitch.git

diff --git a/lib/unicode.c b/lib/unicode.c
index 69ebcfc9d..31612a25c 100644
--- a/lib/unicode.c
+++ b/lib/unicode.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2009 Nicira Networks.
+ * Copyright (c) 2009, 2010 Nicira, Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,6 +18,11 @@
 
 #include "unicode.h"
 
+#include <inttypes.h>
+
+#include "dynamic-string.h"
+#include "util.h"
+
 /* Returns the unicode code point corresponding to leading surrogate 'leading'
  * and trailing surrogate 'trailing'.  The return value will not make any
  * sense if 'leading' or 'trailing' are not in the correct ranges for leading
@@ -36,3 +41,124 @@ utf16_decode_surrogate_pair(int leading, int trailing)
     int x1 = trailing & 0x3ff;
     return (u << 16) | (x0 << 10) | x1;
 }
+
+/* Returns the number of Unicode characters in UTF-8 string 's_'. */
+size_t
+utf8_length(const char *s_)
+{
+    const uint8_t *s;           /* Walks 's_' as unsigned bytes. */
+    size_t length;
+
+    length = 0;
+    for (s = (const uint8_t *) s_; *s != '\0'; s++) {
+        /* Count every byte that is not a continuation byte (top bits 2#10);
+         * first bytes start 2#00, 2#01, or 2#11.  Nothing is validated. */
+        length += (*s & 0xc0) != 0x80;  /* Adds 1 or 0. */
+    }
+    return length;
+}
+
+static char *
+invalid_utf8_sequence(const uint8_t *s, int n, size_t *lengthp)
+{
+    struct ds msg;              /* Error message under construction. */
+    int i;
+    /* Error helper: zeroes '*lengthp' if wanted, then describes s[0..n-1]. */
+    if (lengthp) {
+        *lengthp = 0;
+    }
+    /* Text is "invalid UTF-8 sequence" plus " 0xNN" for each of 'n' bytes. */
+    ds_init(&msg);
+    ds_put_cstr(&msg, "invalid UTF-8 sequence");
+    for (i = 0; i < n; i++) {
+        ds_put_format(&msg, " 0x%02"PRIx8, s[i]);
+    }
+    return ds_steal_cstr(&msg); /* Caller frees, per utf8_validate(). */
+}
+
+struct utf8_sequence { /* Valid byte ranges for one UTF-8 sequence form. */
+    uint8_t octets[5][2]; /* [i] = { min, max } of byte i; { 0, 0 } = none. */
+};
+
+static const struct utf8_sequence *
+lookup_utf8_sequence(uint8_t c)
+{
+    static const struct utf8_sequence seqs[] = { /* { 0, 0 }: unused slot. */
+        { { { 0x01, 0x7f },             /* 1 byte: ASCII other than NUL. */
+            { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 } } },
+        /* 2 bytes.  Leads 0xc0 and 0xc1 (overlong forms) are left out. */
+        { { { 0xc2, 0xdf }, { 0x80, 0xbf },
+            { 0, 0 }, { 0, 0 }, { 0, 0 } } },
+        /* 3 bytes, lead 0xe0: 2nd byte >= 0xa0 excludes overlong forms. */
+        { { { 0xe0, 0xe0 }, { 0xa0, 0xbf }, { 0x80, 0xbf },
+            {0,0}, {0, 0 } } },
+        /* 3 bytes, leads 0xe1...0xec. */
+        { { { 0xe1, 0xec }, { 0x80, 0xbf }, { 0x80, 0xbf },
+            { 0, 0 }, { 0, 0 } } },
+        /* 3 bytes, lead 0xed: 2nd byte <= 0x9f excludes U+D800...U+DFFF. */
+        { { { 0xed, 0xed }, { 0x80, 0x9f }, { 0x80, 0xbf },
+            { 0, 0 }, { 0, 0 } } },
+        /* 3 bytes, leads 0xee...0xef. */
+        { { { 0xee, 0xef }, { 0x80, 0xbf }, { 0x80, 0xbf },
+            { 0, 0 }, { 0, 0 } } },
+        /* 4 bytes, lead 0xf0: 2nd byte >= 0x90 excludes overlong forms. */
+        { { { 0xf0, 0xf0 }, { 0x90, 0xbf }, { 0x80, 0xbf }, { 0x80, 0xbf },
+            { 0, 0 } } },
+        /* 4 bytes, leads 0xf1...0xf3. */
+        { { { 0xf1, 0xf3 }, { 0x80, 0xbf }, { 0x80, 0xbf }, { 0x80, 0xbf },
+            { 0, 0 } } },
+        /* 4 bytes, lead 0xf4: 2nd byte <= 0x8f stops at U+10FFFF. */
+        { { { 0xf4, 0xf4 }, { 0x80, 0x8f }, { 0x80, 0xbf }, { 0x80, 0xbf },
+            { 0, 0 } } },
+    };
+
+    size_t i;
+    /* Find the row whose first-byte range (octets[0]) contains 'c'. */
+    for (i = 0; i < ARRAY_SIZE(seqs); i++) {
+        const uint8_t *o = seqs[i].octets[0];
+        if (c >= o[0] && c <= o[1]) {
+            return &seqs[i];
+        }
+    }
+    return NULL;                /* 0x00, 0x80...0xc1, or 0xf5...0xff. */
+}
+
+/* Checks that 's_' is a valid, null-terminated UTF-8 string.  If so, returns
+ * a null pointer and sets '*lengthp' to the number of Unicode characters in
+ * 's_'.  If not, returns an error message that the caller must free and sets
+ * '*lengthp' to 0.  The message shows the rejected bytes in hexadecimal.
+ *
+ * 'lengthp' may be NULL if the length is not needed. */
+char *
+utf8_validate(const char *s_, size_t *lengthp)
+{
+    size_t length = 0;
+    const uint8_t *s;
+    /* Each iteration consumes one whole character, which starts at 's'. */
+    for (s = (const uint8_t *) s_; *s != '\0'; ) {
+        length++;
+        if (s[0] < 0x80) {      /* ASCII: a complete 1-byte character. */
+            s++;
+        } else {
+            const struct utf8_sequence *seq;
+            int i;
+            /* The first byte decides which ranges the later bytes must fit. */
+            seq = lookup_utf8_sequence(s[0]);
+            if (!seq) {
+                return invalid_utf8_sequence(s, 1, lengthp);
+            }
+            /* { 0, 0 } ends the row.  An early '\0' fails its range check. */
+            for (i = 1; seq->octets[i][0]; i++) {
+                const uint8_t *o = seq->octets[i];
+                if (s[i] < o[0] || s[i] > o[1]) {
+                    return invalid_utf8_sequence(s, i + 1, lengthp);
+                }
+            }
+            s += i;             /* 'i' is now the sequence length in bytes. */
+        }
+    }
+    if (lengthp) {
+        *lengthp = length;
+    }
+    return NULL;
+}