// Upper bound for a legal UTF-32 value: 0x10FFFF, the highest code point
// defined by Unicode.
24 #define UNI_MAX_LEGAL_UTF32 0x0010FFFF
38 for (total_len = 0; total_len < len; total_len += step) {
45 for (i = 1; i < step; ++i)
46 if ((
utf8_str[total_len + i] & 0xc0) != 0x80)
55 chars[total_len++] = 0;
// Bit patterns used to build the trailing bytes of a multi-byte sequence.
// Applying `(value | bytemark) & bytemask` forces bit 7 on and bit 6 off,
// i.e. produces a byte of the form 10xxxxxx holding the low six bits of value.
62 const int bytemask = 0xBF;
63 const int bytemark = 0x80;
69 chars[0] = static_cast<char>(unicode);
70 }
else if (unicode < 0x800) {
73 chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
75 chars[0] = static_cast<char>(unicode | 0xc0);
76 }
else if (unicode < 0x10000) {
78 chars[2] = static_cast<char>((unicode | bytemark) & bytemask);
80 chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
82 chars[0] = static_cast<char>(unicode | 0xe0);
85 chars[3] = static_cast<char>((unicode | bytemark) & bytemask);
87 chars[2] = static_cast<char>((unicode | bytemark) & bytemask);
89 chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
91 chars[0] = static_cast<char>(unicode | 0xf0);
// Per-length correction constants, indexed by a sequence length of 0..4.
// The non-zero entries equal the UTF-8 marker bits once the raw bytes have
// been accumulated six bits at a time:
//   2 bytes: (0xC0 << 6) + 0x80                              = 0x3080
//   3 bytes: (0xE0 << 12) + (0x80 << 6) + 0x80               = 0xE2080
//   4 bytes: (0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 = 0x3C82080
// NOTE(review): this assumes the decoder shifts the accumulator left by 6
// between bytes before subtracting the entry - confirm against the decoder.
99 static const int utf8_offsets[5] = {
100 0, 0, 0x3080, 0xE2080, 0x3C82080
104 const char* src = chars;
110 uni += static_cast<unsigned char>(*src++);
114 uni += static_cast<unsigned char>(*src++);
118 uni += static_cast<unsigned char>(*src++);
122 uni += static_cast<unsigned char>(*src++);
124 uni -= utf8_offsets[len];
131 char* str =
new char[len + 1];
132 memcpy(str, chars, len);
// 256-entry lookup table indexed by a byte value:
//   0x00-0x7F -> 1
//   0x80-0xBF -> 0
//   0xC0-0xDF -> 2
//   0xE0-0xEF -> 3
//   0xF0-0xF7 -> 4
//   0xF8-0xFF -> 0
// The non-zero values match the length of the UTF-8 sequence introduced by
// that lead byte; 0 marks bytes that cannot start a sequence (continuation
// bytes and 0xF8 and above).
139 static const char utf8_bytes[256] = {
140 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
141 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
142 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
143 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
144 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
145 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
146 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
147 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,0,0,0,0,0,0,0,0
150 return utf8_bytes[static_cast<unsigned char>(*
utf8_str)];
157 tprintf(
"ERROR: Illegal UTF8 encountered.\n");
158 for (
int i = 0; i < 5 && it_[i] !=
'\0'; ++i) {
159 tprintf(
"Index %d char = 0x%x\n", i, it_[i]);
171 tprintf(
"WARNING: Illegal UTF8 encountered\n");
182 tprintf(
"WARNING: Illegal UTF8 encountered\n");
183 utf8_output[0] =
' ';
186 strncpy(utf8_output, it_, len);
194 tprintf(
"WARNING: Illegal UTF8 encountered\n");
216 const int utf8_length = strlen(
utf8_str);
217 std::vector<char32> unicodes;
218 unicodes.reserve(utf8_length);
222 unicodes.push_back(*it);