-static const char utf8_byte_len[256] = {
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
- 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,5,5,5,5,6,6,0,0
-};
-
-static inline int
-is_legal_utf8_sequence(unsigned char *source, int length)
-{
- unsigned char *ptr;
- unsigned char c;
-
- if (length==1) return 1;
-
- /* Check for overlong sequence, and check second byte */
- c = *(source + 1);
- switch (*source) {
- case 0xE0: /* 3 bytes */
- if ( c < 0xA0 ) return 0;
- break;
- case 0xF0: /* 4 bytes */
- if ( c < 0x90 ) return 0;
- break;
- case 0xF8: /* 5 bytes */
- if ( c < 0xC8 ) return 0;
- break;
- case 0xFC: /* 6 bytes */
- if ( c < 0x84 ) return 0;
- break;
- default:
- if ( (c & 0xC0) != 0x80) return 0;
- }
-
- /* Check that trailing bytes look like 10xxxxxx */
- for (ptr = source++ + length - 1; ptr>source; ptr--)
- if ( ((*ptr) & 0xC0) != 0x80 ) return 0;
- return 1;
-}
-
-/* This does some screening on disallowed unicode characters. It is NOT
- * comprehensive.
- */
-static int
-is_allowed_utf8_char(unsigned char *source, int length)
-{
- /* We assume length and source point to a valid utf8 sequence */
- unsigned char c;
-
- /* Disallow F0000 and up (in utf8, F3B08080) */
- if (*source > 0xF3 ) return 0;
- c = *(source + 1);
- switch (*source) {
- case 0xF3:
- if (c >= 0xB0) return 0;
- break;
- /* Disallow D800-F8FF (in utf8, EDA080-EFA3BF */
- case 0xED:
- if (c >= 0xA0) return 0;
- break;
- case 0xEE:
- return 0;
- break;
- case 0xEF:
- if (c <= 0xA3) return 0;
- /* Disallow FFF9-FFFF (EFBFB9-EFBFBF) */
- if (c==0xBF)
- /* Don't need to check <=0xBF, since valid utf8 */
- if ( *(source+2) >= 0xB9) return 0;
- break;
- }
- return 1;
-}
-
-/* This routine should really check to see that the proper stringprep
- * mappings have been applied. Instead, we do a simple screen of some
- * of the more obvious illegal values by calling is_allowed_utf8_char.
- * This will allow many illegal strings through, but if a client behaves,
- * it will get full functionality. The other option (apart from full
- * stringprep checking) is to limit everything to an easily handled subset,
- * such as 7-bit ascii.
- *
- * Note - currently calling routines ignore return value except as boolean.