tesseract  5.0.0-alpha-619-ge9db
unichar.cpp
Go to the documentation of this file.
1 // File: unichar.cpp
3 // Description: Unicode character/ligature class.
4 // Author: Ray Smith
5 //
6 // (C) Copyright 2006, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
18 
19 #include <tesseract/unichar.h>
20 #include "errcode.h"
22 #include "tprintf.h"
23 
24 #define UNI_MAX_LEGAL_UTF32 0x0010FFFF
25 
26 namespace tesseract {
27 
28 // Construct from a utf8 string. If len<0 then the string is null terminated.
29 // If the string is too long to fit in the UNICHAR then it takes only what
30 // will fit. Checks for illegal input and stops at an illegal sequence.
31 // The resulting UNICHAR may be empty.
32 UNICHAR::UNICHAR(const char* utf8_str, int len) {
33  int total_len = 0;
34  int step = 0;
35  if (len < 0) {
36  for (len = 0; len < UNICHAR_LEN && utf8_str[len] != 0; ++len);
37  }
38  for (total_len = 0; total_len < len; total_len += step) {
39  step = utf8_step(utf8_str + total_len);
40  if (total_len + step > UNICHAR_LEN)
41  break; // Too long.
42  if (step == 0)
43  break; // Illegal first byte.
44  int i;
45  for (i = 1; i < step; ++i)
46  if ((utf8_str[total_len + i] & 0xc0) != 0x80)
47  break;
48  if (i < step)
49  break; // Illegal surrogate
50  }
51  memcpy(chars, utf8_str, total_len);
52  if (total_len < UNICHAR_LEN) {
53  chars[UNICHAR_LEN - 1] = total_len;
54  while (total_len < UNICHAR_LEN - 1)
55  chars[total_len++] = 0;
56  }
57 }
58 
59 // Construct from a single UCS4 character. Illegal values are ignored,
60 // resulting in an empty UNICHAR.
61 UNICHAR::UNICHAR(int unicode) {
62  const int bytemask = 0xBF;
63  const int bytemark = 0x80;
64 
65  if (unicode < 0x80) {
66  chars[UNICHAR_LEN - 1] = 1;
67  chars[2] = 0;
68  chars[1] = 0;
69  chars[0] = static_cast<char>(unicode);
70  } else if (unicode < 0x800) {
71  chars[UNICHAR_LEN - 1] = 2;
72  chars[2] = 0;
73  chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
74  unicode >>= 6;
75  chars[0] = static_cast<char>(unicode | 0xc0);
76  } else if (unicode < 0x10000) {
77  chars[UNICHAR_LEN - 1] = 3;
78  chars[2] = static_cast<char>((unicode | bytemark) & bytemask);
79  unicode >>= 6;
80  chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
81  unicode >>= 6;
82  chars[0] = static_cast<char>(unicode | 0xe0);
83  } else if (unicode <= UNI_MAX_LEGAL_UTF32) {
84  chars[UNICHAR_LEN - 1] = 4;
85  chars[3] = static_cast<char>((unicode | bytemark) & bytemask);
86  unicode >>= 6;
87  chars[2] = static_cast<char>((unicode | bytemark) & bytemask);
88  unicode >>= 6;
89  chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
90  unicode >>= 6;
91  chars[0] = static_cast<char>(unicode | 0xf0);
92  } else {
93  memset(chars, 0, UNICHAR_LEN);
94  }
95 }
96 
97 // Get the first character as UCS-4.
98 int UNICHAR::first_uni() const {
99  static const int utf8_offsets[5] = {
100  0, 0, 0x3080, 0xE2080, 0x3C82080
101  };
102  int uni = 0;
103  int len = utf8_step(chars);
104  const char* src = chars;
105 
106  switch (len) {
107  default:
108  break;
109  case 4:
110  uni += static_cast<unsigned char>(*src++);
111  uni <<= 6;
112  // Fall through.
113  case 3:
114  uni += static_cast<unsigned char>(*src++);
115  uni <<= 6;
116  // Fall through.
117  case 2:
118  uni += static_cast<unsigned char>(*src++);
119  uni <<= 6;
120  // Fall through.
121  case 1:
122  uni += static_cast<unsigned char>(*src++);
123  }
124  uni -= utf8_offsets[len];
125  return uni;
126 }
127 
128 // Get a terminated UTF8 string: Must delete[] it after use.
129 char* UNICHAR::utf8_str() const {
130  int len = utf8_len();
131  char* str = new char[len + 1];
132  memcpy(str, chars, len);
133  str[len] = 0;
134  return str;
135 }
136 
137 // Get the number of bytes in the first character of the given utf8 string.
138 int UNICHAR::utf8_step(const char* utf8_str) {
139  static const char utf8_bytes[256] = {
140  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
141  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
142  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
143  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
144  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
145  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
146  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
147  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,0,0,0,0,0,0,0,0
148  };
149 
150  return utf8_bytes[static_cast<unsigned char>(*utf8_str)];
151 }
152 
154  ASSERT_HOST(it_ != nullptr);
155  int step = utf8_step(it_);
156  if (step == 0) {
157  tprintf("ERROR: Illegal UTF8 encountered.\n");
158  for (int i = 0; i < 5 && it_[i] != '\0'; ++i) {
159  tprintf("Index %d char = 0x%x\n", i, it_[i]);
160  }
161  step = 1;
162  }
163  it_ += step;
164  return *this;
165 }
166 
168  ASSERT_HOST(it_ != nullptr);
169  const int len = utf8_step(it_);
170  if (len == 0) {
171  tprintf("WARNING: Illegal UTF8 encountered\n");
172  return ' ';
173  }
174  UNICHAR uch(it_, len);
175  return uch.first_uni();
176 }
177 
178 int UNICHAR::const_iterator::get_utf8(char* utf8_output) const {
179  ASSERT_HOST(it_ != nullptr);
180  const int len = utf8_step(it_);
181  if (len == 0) {
182  tprintf("WARNING: Illegal UTF8 encountered\n");
183  utf8_output[0] = ' ';
184  return 1;
185  }
186  strncpy(utf8_output, it_, len);
187  return len;
188 }
189 
191  ASSERT_HOST(it_ != nullptr);
192  const int len = utf8_step(it_);
193  if (len == 0) {
194  tprintf("WARNING: Illegal UTF8 encountered\n");
195  return 1;
196  }
197  return len;
198 }
199 
201  return utf8_step(it_) > 0;
202 }
203 
206 }
207 
209  return UNICHAR::const_iterator(utf8_str + len);
210 }
211 
212 // Converts a utf-8 string to a vector of unicodes.
213 // Returns an empty vector if the input contains invalid UTF-8.
214 /* static */
215 std::vector<char32> UNICHAR::UTF8ToUTF32(const char* utf8_str) {
216  const int utf8_length = strlen(utf8_str);
217  std::vector<char32> unicodes;
218  unicodes.reserve(utf8_length);
219  const_iterator end_it(end(utf8_str, utf8_length));
220  for (const_iterator it(begin(utf8_str, utf8_length)); it != end_it; ++it) {
221  if (it.is_legal()) {
222  unicodes.push_back(*it);
223  } else {
224  unicodes.clear();
225  return unicodes;
226  }
227  }
228  return unicodes;
229 }
230 
231 // Returns an empty string if the input contains an invalid unicode.
232 std::string UNICHAR::UTF32ToUTF8(const std::vector<char32>& str32) {
234  for (char32 ch : str32) {
235  UNICHAR uni_ch(ch);
236  int step;
237  if (uni_ch.utf8_len() > 0 && (step = utf8_step(uni_ch.utf8())) > 0) {
238  utf8_str.append(uni_ch.utf8(), step);
239  } else {
240  return "";
241  }
242  }
243  return utf8_str;
244 }
245 
246 } // namespace tesseract
tesseract::UNICHAR::utf8_len
int utf8_len() const
Definition: unichar.h:79
string
std::string string
Definition: equationdetect_test.cc:21
tesseract::UNICHAR::utf8
const char * utf8() const
Definition: unichar.h:85
tesseract::UNICHAR::begin
static const_iterator begin(const char *utf8_str, int byte_length)
Definition: unichar.cpp:204
tesseract::UNICHAR::UTF8ToUTF32
static std::vector< char32 > UTF8ToUTF32(const char *utf8_str)
Definition: unichar.cpp:215
tesseract::UNICHAR::end
static const_iterator end(const char *utf8_str, int byte_length)
Definition: unichar.cpp:208
ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:87
tesseract::UNICHAR::const_iterator
Definition: unichar.h:109
tesseract::UNICHAR::const_iterator::utf8_len
int utf8_len() const
Definition: unichar.cpp:190
tesseract::UNICHAR::const_iterator::is_legal
bool is_legal() const
Definition: unichar.cpp:200
tesseract::UNICHAR::UNICHAR
UNICHAR()
Definition: unichar.h:61
tesseract::UNICHAR::const_iterator::operator++
const_iterator & operator++()
Definition: unichar.cpp:153
tesseract::UNICHAR
Definition: unichar.h:59
genericvector.h
tesseract::UNICHAR::const_iterator::operator*
int operator*() const
Definition: unichar.cpp:167
tesseract::char32
signed int char32
Definition: unichar.h:53
tesseract::UNICHAR::first_uni
int first_uni() const
Definition: unichar.cpp:98
tesseract::UNICHAR::utf8_step
static int utf8_step(const char *utf8_str)
Definition: unichar.cpp:138
tesseract::UNICHAR::UTF32ToUTF8
static std::string UTF32ToUTF8(const std::vector< char32 > &str32)
Definition: unichar.cpp:232
tesseract::UNICHAR::const_iterator::get_utf8
int get_utf8(char *buf) const
Definition: unichar.cpp:178
tesseract::UNICHAR::utf8_str
char * utf8_str() const
Definition: unichar.cpp:129
tesseract
Definition: baseapi.h:65
tprintf.h
UNICHAR_LEN
#define UNICHAR_LEN
Definition: unichar.h:32
unichar.h
UNI_MAX_LEGAL_UTF32
#define UNI_MAX_LEGAL_UTF32
Definition: unichar.cpp:24
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
errcode.h