25 #define UNI_MAX_LEGAL_UTF32 0x0010FFFF
35 for (len = 0; len <
UNICHAR_LEN && utf8_str[len] != 0; ++len);
37 for (total_len = 0; total_len < len; total_len += step) {
44 for (i = 1; i < step; ++i)
45 if ((utf8_str[total_len + i] & 0xc0) != 0x80)
50 memcpy(chars, utf8_str, total_len);
54 chars[total_len++] = 0;
61 const int bytemask = 0xBF;
62 const int bytemark = 0x80;
68 chars[0] =
static_cast<char>(unicode);
69 }
else if (unicode < 0x800) {
72 chars[1] =
static_cast<char>((unicode | bytemark) & bytemask);
74 chars[0] =
static_cast<char>(unicode | 0xc0);
75 }
else if (unicode < 0x10000) {
77 chars[2] =
static_cast<char>((unicode | bytemark) & bytemask);
79 chars[1] =
static_cast<char>((unicode | bytemark) & bytemask);
81 chars[0] =
static_cast<char>(unicode | 0xe0);
84 chars[3] =
static_cast<char>((unicode | bytemark) & bytemask);
86 chars[2] =
static_cast<char>((unicode | bytemark) & bytemask);
88 chars[1] =
static_cast<char>((unicode | bytemark) & bytemask);
90 chars[0] =
static_cast<char>(unicode | 0xf0);
98 static const int utf8_offsets[5] = {
99 0, 0, 0x3080, 0xE2080, 0x3C82080
103 const char* src = chars;
109 uni +=
static_cast<unsigned char>(*src++);
112 uni +=
static_cast<unsigned char>(*src++);
115 uni +=
static_cast<unsigned char>(*src++);
118 uni +=
static_cast<unsigned char>(*src++);
120 uni -= utf8_offsets[len];
127 char* str =
new char[len + 1];
128 memcpy(str, chars, len);
135 static const char utf8_bytes[256] = {
136 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
137 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
138 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
139 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
140 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
141 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
142 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
143 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,0,0,0,0,0,0,0,0
146 return utf8_bytes[
static_cast<unsigned char>(*utf8_str)];
153 tprintf(
"ERROR: Illegal UTF8 encountered.\n");
154 for (
int i = 0; i < 5 && it_[i] !=
'\0'; ++i) {
155 tprintf(
"Index %d char = 0x%x\n", i, it_[i]);
167 tprintf(
"WARNING: Illegal UTF8 encountered\n");
178 tprintf(
"WARNING: Illegal UTF8 encountered\n");
179 utf8_output[0] =
' ';
182 strncpy(utf8_output, it_, len);
190 tprintf(
"WARNING: Illegal UTF8 encountered\n");
213 const int utf8_length = strlen(utf8_str);
static bool UTF8ToUnicode(const char *utf8_str, GenericVector< int > *unicodes)
#define UNI_MAX_LEGAL_UTF32
static const_iterator begin(const char *utf8_str, const int byte_length)
int get_utf8(char *buf) const
static int utf8_step(const char *utf8_str)
const_iterator & operator++()
static const_iterator end(const char *utf8_str, const int byte_length)