tesseract  4.0.0-1-g2a2b
unichar.cpp
Go to the documentation of this file.
1 // File: unichar.cpp
3 // Description: Unicode character/ligature class.
4 // Author: Ray Smith
5 // Created: Wed Jun 28 17:05:01 PDT 2006
6 //
7 // (C) Copyright 2006, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 #include "unichar.h"
21 #include "errcode.h"
22 #include "genericvector.h"
23 #include "tprintf.h"
24 
25 #define UNI_MAX_LEGAL_UTF32 0x0010FFFF
26 
27 namespace tesseract {
28 
29 // Construct from a utf8 string. If len<0 then the string is null terminated.
30 // If the string is too long to fit in the UNICHAR then it takes only what
31 // will fit. Checks for illegal input and stops at an illegal sequence.
32 // The resulting UNICHAR may be empty.
33 UNICHAR::UNICHAR(const char* utf8_str, int len) {
34  int total_len = 0;
35  int step = 0;
36  if (len < 0) {
37  for (len = 0; len < UNICHAR_LEN && utf8_str[len] != 0; ++len);
38  }
39  for (total_len = 0; total_len < len; total_len += step) {
40  step = utf8_step(utf8_str + total_len);
41  if (total_len + step > UNICHAR_LEN)
42  break; // Too long.
43  if (step == 0)
44  break; // Illegal first byte.
45  int i;
46  for (i = 1; i < step; ++i)
47  if ((utf8_str[total_len + i] & 0xc0) != 0x80)
48  break;
49  if (i < step)
50  break; // Illegal surrogate
51  }
52  memcpy(chars, utf8_str, total_len);
53  if (total_len < UNICHAR_LEN) {
54  chars[UNICHAR_LEN - 1] = total_len;
55  while (total_len < UNICHAR_LEN - 1)
56  chars[total_len++] = 0;
57  }
58 }
59 
60 // Construct from a single UCS4 character. Illegal values are ignored,
61 // resulting in an empty UNICHAR.
62 UNICHAR::UNICHAR(int unicode) {
63  const int bytemask = 0xBF;
64  const int bytemark = 0x80;
65 
66  if (unicode < 0x80) {
67  chars[UNICHAR_LEN - 1] = 1;
68  chars[2] = 0;
69  chars[1] = 0;
70  chars[0] = static_cast<char>(unicode);
71  } else if (unicode < 0x800) {
72  chars[UNICHAR_LEN - 1] = 2;
73  chars[2] = 0;
74  chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
75  unicode >>= 6;
76  chars[0] = static_cast<char>(unicode | 0xc0);
77  } else if (unicode < 0x10000) {
78  chars[UNICHAR_LEN - 1] = 3;
79  chars[2] = static_cast<char>((unicode | bytemark) & bytemask);
80  unicode >>= 6;
81  chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
82  unicode >>= 6;
83  chars[0] = static_cast<char>(unicode | 0xe0);
84  } else if (unicode <= UNI_MAX_LEGAL_UTF32) {
85  chars[UNICHAR_LEN - 1] = 4;
86  chars[3] = static_cast<char>((unicode | bytemark) & bytemask);
87  unicode >>= 6;
88  chars[2] = static_cast<char>((unicode | bytemark) & bytemask);
89  unicode >>= 6;
90  chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
91  unicode >>= 6;
92  chars[0] = static_cast<char>(unicode | 0xf0);
93  } else {
94  memset(chars, 0, UNICHAR_LEN);
95  }
96 }
97 
98 // Get the first character as UCS-4.
99 int UNICHAR::first_uni() const {
100  static const int utf8_offsets[5] = {
101  0, 0, 0x3080, 0xE2080, 0x3C82080
102  };
103  int uni = 0;
104  int len = utf8_step(chars);
105  const char* src = chars;
106 
107  switch (len) {
108  default:
109  break;
110  case 4:
111  uni += static_cast<unsigned char>(*src++);
112  uni <<= 6;
113  case 3:
114  uni += static_cast<unsigned char>(*src++);
115  uni <<= 6;
116  case 2:
117  uni += static_cast<unsigned char>(*src++);
118  uni <<= 6;
119  case 1:
120  uni += static_cast<unsigned char>(*src++);
121  }
122  uni -= utf8_offsets[len];
123  return uni;
124 }
125 
126 // Get a terminated UTF8 string: Must delete[] it after use.
127 char* UNICHAR::utf8_str() const {
128  int len = utf8_len();
129  char* str = new char[len + 1];
130  memcpy(str, chars, len);
131  str[len] = 0;
132  return str;
133 }
134 
135 // Get the number of bytes in the first character of the given utf8 string.
136 int UNICHAR::utf8_step(const char* utf8_str) {
137  static const char utf8_bytes[256] = {
138  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
139  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
140  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
141  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
142  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
143  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
144  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
145  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,0,0,0,0,0,0,0,0
146  };
147 
148  return utf8_bytes[static_cast<unsigned char>(*utf8_str)];
149 }
150 
152  ASSERT_HOST(it_ != nullptr);
153  int step = utf8_step(it_);
154  if (step == 0) {
155  tprintf("ERROR: Illegal UTF8 encountered.\n");
156  for (int i = 0; i < 5 && it_[i] != '\0'; ++i) {
157  tprintf("Index %d char = 0x%x\n", i, it_[i]);
158  }
159  step = 1;
160  }
161  it_ += step;
162  return *this;
163 }
164 
166  ASSERT_HOST(it_ != nullptr);
167  const int len = utf8_step(it_);
168  if (len == 0) {
169  tprintf("WARNING: Illegal UTF8 encountered\n");
170  return ' ';
171  }
172  UNICHAR uch(it_, len);
173  return uch.first_uni();
174 }
175 
176 int UNICHAR::const_iterator::get_utf8(char* utf8_output) const {
177  ASSERT_HOST(it_ != nullptr);
178  const int len = utf8_step(it_);
179  if (len == 0) {
180  tprintf("WARNING: Illegal UTF8 encountered\n");
181  utf8_output[0] = ' ';
182  return 1;
183  }
184  strncpy(utf8_output, it_, len);
185  return len;
186 }
187 
189  ASSERT_HOST(it_ != nullptr);
190  const int len = utf8_step(it_);
191  if (len == 0) {
192  tprintf("WARNING: Illegal UTF8 encountered\n");
193  return 1;
194  }
195  return len;
196 }
197 
199  return utf8_step(it_) > 0;
200 }
201 
202 UNICHAR::const_iterator UNICHAR::begin(const char* utf8_str, const int len) {
204 }
205 
206 UNICHAR::const_iterator UNICHAR::end(const char* utf8_str, const int len) {
207  return UNICHAR::const_iterator(utf8_str + len);
208 }
209 
210 // Converts a utf-8 string to a vector of unicodes.
211 // Returns an empty vector if the input contains invalid UTF-8.
212 /* static */
213 std::vector<char32> UNICHAR::UTF8ToUTF32(const char* utf8_str) {
214  const int utf8_length = strlen(utf8_str);
215  std::vector<char32> unicodes;
216  unicodes.reserve(utf8_length);
217  const_iterator end_it(end(utf8_str, utf8_length));
218  for (const_iterator it(begin(utf8_str, utf8_length)); it != end_it; ++it) {
219  if (it.is_legal()) {
220  unicodes.push_back(*it);
221  } else {
222  unicodes.clear();
223  return unicodes;
224  }
225  }
226  return unicodes;
227 }
228 
229 // Returns an empty string if the input contains an invalid unicode.
230 std::string UNICHAR::UTF32ToUTF8(const std::vector<char32>& str32) {
231  std::string utf8_str;
232  for (char32 ch : str32) {
233  UNICHAR uni_ch(ch);
234  int step;
235  if (uni_ch.utf8_len() > 0 && (step = utf8_step(uni_ch.utf8())) > 0) {
236  utf8_str.append(uni_ch.utf8(), step);
237  } else {
238  return "";
239  }
240  }
241  return utf8_str;
242 }
243 
244 } // namespace tesseract
int first_uni() const
Definition: unichar.cpp:99
signed int char32
Definition: unichar.h:52
const char * utf8() const
Definition: unichar.h:84
#define UNICHAR_LEN
Definition: unichar.h:31
int get_utf8(char *buf) const
Definition: unichar.cpp:176
static const_iterator begin(const char *utf8_str, const int byte_length)
Definition: unichar.cpp:202
static std::string UTF32ToUTF8(const std::vector< char32 > &str32)
Definition: unichar.cpp:230
char * utf8_str() const
Definition: unichar.cpp:127
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
static int utf8_step(const char *utf8_str)
Definition: unichar.cpp:136
int utf8_len() const
Definition: unichar.h:78
static std::vector< char32 > UTF8ToUTF32(const char *utf8_str)
Definition: unichar.cpp:213
static const_iterator end(const char *utf8_str, const int byte_length)
Definition: unichar.cpp:206
const_iterator & operator++()
Definition: unichar.cpp:151
#define UNI_MAX_LEGAL_UTF32
Definition: unichar.cpp:25
#define ASSERT_HOST(x)
Definition: errcode.h:84