25 #define UNI_MAX_LEGAL_UTF32 0x0010FFFF 39 for (total_len = 0; total_len < len; total_len += step) {
46 for (i = 1; i < step; ++i)
47 if ((
utf8_str[total_len + i] & 0xc0) != 0x80)
56 chars[total_len++] = 0;
63 const int bytemask = 0xBF;
64 const int bytemark = 0x80;
70 chars[0] =
static_cast<char>(unicode);
71 }
else if (unicode < 0x800) {
74 chars[1] =
static_cast<char>((unicode | bytemark) & bytemask);
76 chars[0] =
static_cast<char>(unicode | 0xc0);
77 }
else if (unicode < 0x10000) {
79 chars[2] =
static_cast<char>((unicode | bytemark) & bytemask);
81 chars[1] =
static_cast<char>((unicode | bytemark) & bytemask);
83 chars[0] =
static_cast<char>(unicode | 0xe0);
86 chars[3] =
static_cast<char>((unicode | bytemark) & bytemask);
88 chars[2] =
static_cast<char>((unicode | bytemark) & bytemask);
90 chars[1] =
static_cast<char>((unicode | bytemark) & bytemask);
92 chars[0] =
static_cast<char>(unicode | 0xf0);
100 static const int utf8_offsets[5] = {
101 0, 0, 0x3080, 0xE2080, 0x3C82080
105 const char* src = chars;
111 uni +=
static_cast<unsigned char>(*src++);
114 uni +=
static_cast<unsigned char>(*src++);
117 uni +=
static_cast<unsigned char>(*src++);
120 uni +=
static_cast<unsigned char>(*src++);
122 uni -= utf8_offsets[len];
129 char* str =
new char[len + 1];
130 memcpy(str, chars, len);
137 static const char utf8_bytes[256] = {
138 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
139 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
140 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
141 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
142 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
143 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
144 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
145 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,0,0,0,0,0,0,0,0
148 return utf8_bytes[
static_cast<unsigned char>(*utf8_str)];
155 tprintf(
"ERROR: Illegal UTF8 encountered.\n");
156 for (
int i = 0; i < 5 && it_[i] !=
'\0'; ++i) {
157 tprintf(
"Index %d char = 0x%x\n", i, it_[i]);
169 tprintf(
"WARNING: Illegal UTF8 encountered\n");
180 tprintf(
"WARNING: Illegal UTF8 encountered\n");
181 utf8_output[0] =
' ';
184 strncpy(utf8_output, it_, len);
192 tprintf(
"WARNING: Illegal UTF8 encountered\n");
214 const int utf8_length = strlen(
utf8_str);
215 std::vector<char32> unicodes;
216 unicodes.reserve(utf8_length);
220 unicodes.push_back(*it);
const char * utf8() const
int get_utf8(char *buf) const
static const_iterator begin(const char *utf8_str, const int byte_length)
static std::string UTF32ToUTF8(const std::vector< char32 > &str32)
DLLSYM void tprintf(const char *format,...)
static int utf8_step(const char *utf8_str)
static std::vector< char32 > UTF8ToUTF32(const char *utf8_str)
static const_iterator end(const char *utf8_str, const int byte_length)
const_iterator & operator++()
#define UNI_MAX_LEGAL_UTF32