#include "utf8.h"
#include <stdlib.h> /* NULL */

/** Own utf-8 handling */

enum {
  UTF8_LEN_1 = 1 << 7,
  /* UTF8_BITSn covers the length-marker bits of the lead byte of an
   * n-byte sequence; the remaining low bits of that byte are payload. */
  UTF8_BITS1 = 1 << 7,
  UTF8_BITS2 = UTF8_BITS1 + (1 << 6) + (1 << 5),
  UTF8_BITS3 = UTF8_BITS2 + (1 << 4),
  UTF8_BITS4 = UTF8_BITS3 + (1 << 3),
  UTF8_BITS5 = UTF8_BITS4 + (1 << 2),
  UTF8_BITS6 = UTF8_BITS5 + (1 << 1),
  /* Marker bits of a continuation byte (10xxxxxx). */
  UTF8_BITSX = (1 << 7) + (1 << 6)
};

#define E -1 /* Shorthand for "error" */

/* Sequence length indexed by lead byte. Entry 0 (the NUL terminator) is 0,
 * continuation bytes and 0xFE/0xFF are errors. The element type is explicitly
 * signed: plain char may be unsigned, which would turn E into 255. */
static const signed char utf8_length_table[256] = {
  0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  E,E,E,E,E,E,E,E,E,E,E,E,E,E,E,E,
  E,E,E,E,E,E,E,E,E,E,E,E,E,E,E,E,
  E,E,E,E,E,E,E,E,E,E,E,E,E,E,E,E,
  E,E,E,E,E,E,E,E,E,E,E,E,E,E,E,E,
  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
  4,4,4,4,4,4,4,4,5,5,5,5,6,6,E,E
};

#undef E

/* Returns nonzero if byte has the continuation-byte form 10xxxxxx. */
static int utf8_is_continuation(unsigned char byte) {
  return (byte & UTF8_BITSX) == UTF8_BITS1;
}

/* Returns the length of the utf8 character that starts at
 * str, or negative if str does not point to the beginning byte
 * of a valid utf8 character. Returns 0 if str points at a NUL byte
 * and negative if str is NULL. Only the first byte is inspected. */
int utf8_decode_length(char * str) {
  if (!str) {
    return -1;
  }
  return utf8_length_table[*((unsigned char *) str)];
}

/* Length-marker mask of the lead byte, indexed by sequence length. */
static const long utf8_start_masks[7] = {
  -1,
  UTF8_BITS1,
  UTF8_BITS2,
  UTF8_BITS3,
  UTF8_BITS4,
  UTF8_BITS5,
  UTF8_BITS6,
};

/** Decodes a single utf8 character.
 * Stores the decoded value in *result, which is set to 0 whenever decoding
 * does not succeed. Returns the number of bytes consumed, 0 if str points
 * at a NUL byte, or a negative value if result or str is NULL, if the lead
 * byte is invalid, or if any trailing byte is not a continuation byte.
 * Overlong forms and surrogates are not rejected. */
int utf8_decode_one(char * str, long * result) {
  const unsigned char * ustr = (const unsigned char *) str;
  long value;
  int len;
  int index;

  if (!result) {
    return -1;
  }
  (*result) = 0; /* Ensure result is always set. */

  len = utf8_decode_length(str);
  if (len < 1) {
    return len;
  }

  /* Get the unicode character bits from the first byte by masking off the
   * length part. */
  value = ((long) ustr[0]) & (~utf8_start_masks[len]);

  /* Now keep shifting in the remaining bits from the other bytes. Checking
   * each byte first also stops the scan at a NUL terminator, so a truncated
   * sequence is never read past the end of the string. */
  for (index = 1; index < len; index++) {
    if (!utf8_is_continuation(ustr[index])) {
      return -1;
    }
    value = (value << 6) + (ustr[index] & (~UTF8_BITSX));
  }

  (*result) = value;
  return len;
}

/** Length of the whole UTF-8 string in unicode characters, not counting
 * the NUL terminator (an empty string has length 0).
 * Returns a negative value on encoding error or if str is NULL. */
int utf8_length(char * str) {
  const unsigned char * cursor = (const unsigned char *) str;
  int count = 0;

  if (!str) {
    return -1;
  }

  while (*cursor) {
    int len = utf8_length_table[*cursor];
    int index;

    if (len < 1) {
      return -1;
    }
    /* Validate before skipping so we cannot step over the terminator. */
    for (index = 1; index < len; index++) {
      if (!utf8_is_continuation(cursor[index])) {
        return -1;
      }
    }
    cursor += len;
    count++;
  }
  return count;
}

/** Changes the pointer next to point to the next character
 * in the utf-encoded string str and decodes the character
 * in result. Returns the length in bytes of the character decoded, or less
 * than 1 if at the end of the string or on an encoding error. In that case,
 * next is also set to NULL. Either of next and result may be NULL if the
 * caller does not need that output. */
int utf8_next(char * str, char ** next, long * result) {
  long decoded = 0;
  int length = utf8_decode_one(str, &decoded);

  if (result) {
    (*result) = decoded;
  }
  if (next) {
    (*next) = (length < 1) ? NULL : (str + length);
  }
  return length;
}

/** Encodes a single utf8 character.
 * Result must have at least 6 bytes of data available for storage.
 * No \0 terminator is generated.
 * Returns the length of the encoded character (1 to 6), or a negative value
 * if result is NULL or character is outside the range 0 .. 0x7FFFFFFF.
 * The 5 and 6 byte forms are produced for values above 0x1FFFFF, matching
 * what utf8_decode_one accepts. */
int utf8_encode_one(long character, char * result) {
  /* Lead-byte marker bits, indexed by sequence length. */
  static const unsigned char lead_marks[7] = {
    0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
  };
  unsigned char * out = (unsigned char *) result;
  int len;
  int index;

  if (!result || character < 0) {
    return -1;
  }

  if (character < 0x80L) {
    len = 1;
  } else if (character < 0x800L) {
    len = 2;
  } else if (character < 0x10000L) {
    len = 3;
  } else if (character < 0x200000L) {
    len = 4;
  } else if (character < 0x4000000L) {
    len = 5;
  } else if (character <= 0x7FFFFFFFL) {
    len = 6;
  } else {
    return -1;
  }

  /* Fill the continuation bytes from the back, 6 payload bits each. */
  for (index = len - 1; index > 0; index--) {
    out[index] = (unsigned char) (UTF8_BITS1 | (character & 0x3F));
    character >>= 6;
  }
  /* Whatever is left fits in the payload bits of the lead byte. */
  out[0] = (unsigned char) (lead_marks[len] | character);
  return len;
}