utf8.c 3.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125
  1. #include "utf8.h"
  2. #include <stdlib.h>
  3. /** Own utf-8 handling */
  /* Bit patterns used when picking apart UTF-8 bytes.
   * UTF8_BITS1..UTF8_BITS6 are the high-bit masks that utf8_decode_one
   * inverts to keep only the payload bits of the lead byte of a 1..6 byte
   * sequence; UTF8_BITSX is the mask it inverts to keep the low six bits
   * of each following byte.
   */
  enum {
    UTF8_LEN_1 = 0x80, /* 1000 0000 */
    UTF8_BITS1 = 0x80, /* 1000 0000 */
    UTF8_BITS2 = 0xE0, /* 1110 0000 */
    UTF8_BITS3 = 0xF0, /* 1111 0000 */
    UTF8_BITS4 = 0xF8, /* 1111 1000 */
    UTF8_BITS5 = 0xFC, /* 1111 1100 */
    UTF8_BITS6 = 0xFE, /* 1111 1110 */
    UTF8_BITSX = 0xC0  /* 1100 0000 */
  };
  /* Length in bytes of a UTF-8 sequence, indexed by the value of its first
   * byte. 0 marks the NUL byte, E marks a byte that cannot start a sequence
   * (continuation bytes 0x80-0xBF, and 0xFE/0xFF).
   *
   * The element type is explicitly `signed char`: whether plain `char` is
   * signed is implementation-defined, and on targets where it is unsigned
   * the error marker -1 would be stored as 255 and never test as negative.
   */
  #define E (-1) /* Shorthand for "error" */
  static const signed char utf8_length_table[256] = {
    0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x00 - 0x0F */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x10 - 0x1F */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x20 - 0x2F */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x30 - 0x3F */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x40 - 0x4F */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x50 - 0x5F */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x60 - 0x6F */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x70 - 0x7F */
    E,E,E,E,E,E,E,E,E,E,E,E,E,E,E,E, /* 0x80 - 0x8F */
    E,E,E,E,E,E,E,E,E,E,E,E,E,E,E,E, /* 0x90 - 0x9F */
    E,E,E,E,E,E,E,E,E,E,E,E,E,E,E,E, /* 0xA0 - 0xAF */
    E,E,E,E,E,E,E,E,E,E,E,E,E,E,E,E, /* 0xB0 - 0xBF */
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xC0 - 0xCF */
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xD0 - 0xDF */
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, /* 0xE0 - 0xEF */
    4,4,4,4,4,4,4,4,5,5,5,5,6,6,E,E  /* 0xF0 - 0xFF */
  };
  #undef E
  34. /* Returns the length of the utf8 character that starts at
  35. * str, or negative if str does not point to the beginning byte of
  36. * of a valid utf8 character.
  37. */
  38. int utf8_decode_length(char * str) {
  39. int index = (int)(*((unsigned char *) str));
  40. return utf8_length_table[index];
  41. }
  42. static long utf8_start_masks[7] = {
  43. -1, UTF8_BITS1, UTF8_BITS2, UTF8_BITS3,
  44. UTF8_BITS4, UTF8_BITS5, UTF8_BITS6,
  45. };
  46. /** Decodes a single utf8 character. */
  47. int utf8_decode_one(char * str, long * result) {
  48. int index;
  49. unsigned char * ustr = (unsigned char *) str;
  50. if (!result) return -1;
  51. (*result) = 0; /* Ensure result is always set. */
  52. int len = utf8_decode_length(str);
  53. if (len < 1) return len;
  54. if (len >= 7) return len;
  55. /* Get the unicode character bits from the first byte by masking off
  56. the length part*/
  57. long start = ((long) *ustr) & (~utf8_start_masks[len]);
  58. /** now keep shifting in the remaining bits from the other bytes. */
  59. for (index = 1; index < len; index ++) {
  60. start = (start << 6) + (ustr[index] & (~UTF8_BITSX));
  61. }
  62. (*result) = start;
  63. return len;
  64. }
  65. /** Length of the whole UTF-8 string in unicode characters.
  66. * Returns a negative value on encoding error.
  67. */
  68. int utf8_length(char * str) {
  69. int result = 0;
  70. int index = 0;
  71. int aid;
  72. do {
  73. aid = utf8_decode_length(str + index);
  74. if (aid < 0) { return -1; }
  75. index += aid;
  76. result ++;
  77. } while (aid > 0);
  78. if (aid < 0) { return -1; }
  79. return result;
  80. }
  /** Changes the pointer next to point to the next character
   * in the utf-encoded string str and decodes the character
   * in result.
   *
   * str    points at the character to step over.
   * next   receives str advanced by the length of that character, or NULL
   *        when no character was found. May itself be NULL if the caller
   *        only wants the decoded value.
   * result receives the decoded character (0 when none was decoded). May
   *        be NULL, in which case nothing is decoded and only the length
   *        of the character is determined.
   *
   * Returns the length in bytes of the character decoded, or less
   * than 1 if at the end of the string or on an encoding error. In that
   * case, next is also set to NULL.
   */
  int utf8_next(char * str, char ** next, long * result) {
    int length;

    if (result) {
      length = utf8_decode_one(str, result);
    } else {
      length = utf8_decode_length(str);
    }

    if (next) {
      (*next) = (length < 1) ? NULL : (str + length);
    }
    return length;
  }
  /** Encodes a single utf8 character.
   *
   * character is the code point to encode, in the range 0 to 0x7FFFFFFF.
   *           The 5 and 6 byte forms are produced for values above
   *           0x1FFFFF, mirroring the lengths the decoder accepts.
   * result    must have at least 6 bytes of data available for storage.
   *
   * No \0 terminator is generated.
   * Returns the length of the encoded character in bytes (1 to 6), or -1
   * if result is NULL or character is outside the encodable range; nothing
   * is written in that case.
   */
  int utf8_encode_one(long character, char * result) {
    /* Length-marker bits of the first byte, indexed by sequence length. */
    static const unsigned char lead_prefix[7] = {
      0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
    };
    unsigned char * out = (unsigned char *) result;
    unsigned long code;
    int len;
    int index;

    if (!result || character < 0) return -1;
    code = (unsigned long) character;

    if (code < 0x80UL) {
      len = 1;
    } else if (code < 0x800UL) {
      len = 2;
    } else if (code < 0x10000UL) {
      len = 3;
    } else if (code < 0x200000UL) {
      len = 4;
    } else if (code < 0x4000000UL) {
      len = 5;
    } else if (code <= 0x7FFFFFFFUL) {
      len = 6;
    } else {
      return -1;
    }

    /* Fill the continuation bytes from the last one backwards, six payload
       bits at a time; whatever remains belongs in the first byte. */
    for (index = len - 1; index > 0; index--) {
      out[index] = (unsigned char) (0x80UL | (code & 0x3FUL));
      code >>= 6;
    }
    out[0] = (unsigned char) (lead_prefix[len] | code);
    return len;
  }