tesseract  5.0.0-alpha-619-ge9db
unicodetext.cc
Go to the documentation of this file.
1 
17 #include "util/utf8/unicodetext.h"
18 
19 #include <string.h> // for memcpy, NULL, memcmp, etc
20 #include <algorithm> // for max
21 
22 //#include "base/logging.h" // for operator<<, CHECK, etc
23 //#include "base/stringprintf.h" // for StringPrintf, StringAppendF
24 //#include "strings/stringpiece.h" // for StringPiece, etc
25 
26 #include "third_party/utf/utf.h" // for isvalidcharntorune, etc
27 #include "util/utf8/unilib.h" // for IsInterchangeValid, etc
28 #include "util/utf8/unilib_utf8_utils.h" // for OneCharLen
29 
// Counts the bytes in [start, end) that are not UTF-8 trail bytes
// (bit pattern 10xxxxxx). Each such byte starts a character, so the
// result is the number of code points that begin inside the range.
static int CodepointDistance(const char* start, const char* end) {
  int count = 0;
  const char* cursor = start;
  while (cursor < end) {
    const unsigned char byte = static_cast<unsigned char>(*cursor);
    // Trail bytes are exactly 0x80..0xBF; everything else is counted.
    if ((byte & 0xC0) != 0x80) {
      ++count;
    }
    ++cursor;
  }
  return count;
}
38 
39 static int CodepointCount(const char* utf8, int len) {
40  return CodepointDistance(utf8, utf8 + len);
41 }
42 
46  return CodepointDistance(first.it_, last.it_);
47 }
48 
49 // ---------- Utility ----------
50 
51 static int ConvertToInterchangeValid(char* start, int len) {
52  // This routine is called only when we've discovered that a UTF-8 buffer
53  // that was passed to CopyUTF8, TakeOwnershipOfUTF8, or PointToUTF8
54  // was not interchange valid. This indicates a bug in the caller, and
55  // a LOG(WARNING) is done in that case.
56  // This is similar to CoerceToInterchangeValid, but it replaces each
57  // structurally valid byte with a space, and each non-interchange
58  // character with a space, even when that character requires more
59  // than one byte in UTF8. E.g., "\xEF\xB7\x90" (U+FDD0) is
60  // structurally valid UTF8, but U+FDD0 is not an interchange-valid
61  // code point. The result should contain one space, not three.
62  //
63  // Since the conversion never needs to write more data than it
64  // reads, it is safe to change the buffer in place. It returns the
65  // number of bytes written.
66  char* const in = start;
67  char* out = start;
68  char* const end = start + len;
69  while (start < end) {
70  int good = UniLib::SpanInterchangeValid(start, end - start);
71  if (good > 0) {
72  if (out != start) {
73  memmove(out, start, good);
74  }
75  out += good;
76  start += good;
77  if (start == end) {
78  break;
79  }
80  }
81  // Is the current string invalid UTF8 or just non-interchange UTF8?
82  char32 rune;
83  int n;
84  if (isvalidcharntorune(start, end - start, &rune, &n)) {
85  // structurally valid UTF8, but not interchange valid
86  start += n; // Skip over the whole character.
87  } else { // bad UTF8
88  start += 1; // Skip over just one byte
89  }
90  *out++ = ' ';
91  }
92  return out - in;
93 }
94 
95 
96 // *************** Data representation **********
97 
98 // Note: the copy constructor is undefined.
99 
100 // After reserve(), resize(), or clear(), we're an owner, not an alias.
101 
102 void UnicodeText::Repr::reserve(int new_capacity) {
103  // If there's already enough capacity, and we're an owner, do nothing.
104  if (capacity_ >= new_capacity && ours_) return;
105 
106  // Otherwise, allocate a new buffer.
107  capacity_ = std::max(new_capacity, (3 * capacity_) / 2 + 20);
108  char* new_data = new char[capacity_];
109 
110  // If there is an old buffer, copy it into the new buffer.
111  if (data_) {
112  memcpy(new_data, data_, size_);
113  if (ours_) delete[] data_; // If we owned the old buffer, free it.
114  }
115  data_ = new_data;
116  ours_ = true; // We own the new buffer.
117  // size_ is unchanged.
118 }
119 
120 void UnicodeText::Repr::resize(int new_size) {
121  if (new_size == 0) {
122  clear();
123  } else {
124  if (!ours_ || new_size > capacity_) reserve(new_size);
125  // Clear the memory in the expanded part.
126  if (size_ < new_size) memset(data_ + size_, 0, new_size - size_);
127  size_ = new_size;
128  ours_ = true;
129  }
130 }
131 
132 // This implementation of clear() deallocates the buffer if we're an owner.
133 // That's not strictly necessary; we could just set size_ to 0.
134 void UnicodeText::Repr::clear() {
135  if (ours_) delete[] data_;
136  data_ = nullptr;
137  size_ = capacity_ = 0;
138  ours_ = true;
139 }
140 
141 void UnicodeText::Repr::Copy(const char* data, int size) {
142  resize(size);
143  memcpy(data_, data, size);
144 }
145 
146 void UnicodeText::Repr::TakeOwnershipOf(char* data, int size, int capacity) {
147  if (data == data_) return; // We already own this memory. (Weird case.)
148  if (ours_ && data_) delete[] data_; // If we owned the old buffer, free it.
149  data_ = data;
150  size_ = size;
151  capacity_ = capacity;
152  ours_ = true;
153 }
154 
155 void UnicodeText::Repr::PointTo(const char* data, int size) {
156  if (ours_ && data_) delete[] data_; // If we owned the old buffer, free it.
157  data_ = const_cast<char*>(data);
158  size_ = size;
159  capacity_ = size;
160  ours_ = false;
161 }
162 
163 void UnicodeText::Repr::append(const char* bytes, int byte_length) {
164  reserve(size_ + byte_length);
165  memcpy(data_ + size_, bytes, byte_length);
166  size_ += byte_length;
167 }
168 
169 string UnicodeText::Repr::DebugString() const {
170  return tensorflow::strings::Printf("{Repr %p data=%p size=%d capacity=%d %s}",
171  this,
172  data_, size_, capacity_,
173  ours_ ? "Owned" : "Alias");
174 }
175 
176 
177 
178 // *************** UnicodeText ******************
179 
180 // ----- Constructors -----
181 
182 // Default constructor
184 }
185 
186 // Copy constructor
188  Copy(src);
189 }
190 
191 // Substring constructor
194  CHECK(first <= last) << " Incompatible iterators";
195  repr_.append(first.it_, last.it_ - first.it_);
196 }
197 
199  const const_iterator& last) {
200  CHECK(first <= last) << " Incompatible iterators";
201  return string(first.it_, last.it_ - first.it_);
202 }
203 
204 
205 // ----- Copy -----
206 
208  if (this != &src) {
209  Copy(src);
210  }
211  return *this;
212 }
213 
215  repr_.Copy(src.repr_.data_, src.repr_.size_);
216  return *this;
217 }
218 
219 UnicodeText& UnicodeText::CopyUTF8(const char* buffer, int byte_length) {
220  repr_.Copy(buffer, byte_length);
221  if (!UniLib:: IsInterchangeValid(buffer, byte_length)) {
222  LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";
223  repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
224  }
225  return *this;
226 }
227 
228 UnicodeText& UnicodeText::UnsafeCopyUTF8(const char* buffer,
229  int byte_length) {
230  repr_.Copy(buffer, byte_length);
231  return *this;
232 }
233 
234 // ----- TakeOwnershipOf -----
235 
237  int byte_length,
238  int byte_capacity) {
239  repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);
240  if (!UniLib:: IsInterchangeValid(buffer, byte_length)) {
241  LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";
242  repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
243  }
244  return *this;
245 }
246 
247 UnicodeText& UnicodeText::UnsafeTakeOwnershipOfUTF8(char* buffer,
248  int byte_length,
249  int byte_capacity) {
250  repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);
251  return *this;
252 }
253 
254 // ----- PointTo -----
255 
256 UnicodeText& UnicodeText::PointToUTF8(const char* buffer, int byte_length) {
257  if (UniLib:: IsInterchangeValid(buffer, byte_length)) {
258  repr_.PointTo(buffer, byte_length);
259  } else {
260  LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";
261  repr_.Copy(buffer, byte_length);
262  repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
263  }
264  return *this;
265 }
266 
267 UnicodeText& UnicodeText::UnsafePointToUTF8(const char* buffer,
268  int byte_length) {
269  repr_.PointTo(buffer, byte_length);
270  return *this;
271 }
272 
274  repr_.PointTo(src.repr_.data_, src.repr_.size_);
275  return *this;
276 }
277 
279  const const_iterator &last) {
280  CHECK(first <= last) << " Incompatible iterators";
281  repr_.PointTo(first.utf8_data(), last.utf8_data() - first.utf8_data());
282  return *this;
283 }
284 
285 // ----- Append -----
286 
288  repr_.append(u.repr_.data_, u.repr_.size_);
289  return *this;
290 }
291 
293  const const_iterator& last) {
294  CHECK(first <= last) << " Incompatible iterators";
295  repr_.append(first.it_, last.it_ - first.it_);
296  return *this;
297 }
298 
299 UnicodeText& UnicodeText::UnsafeAppendUTF8(const char* utf8, int len) {
300  repr_.append(utf8, len);
301  return *this;
302 }
303 
304 // ----- substring searching -----
305 
307  const_iterator start_pos) const {
308  CHECK_GE(start_pos.utf8_data(), utf8_data());
309  CHECK_LE(start_pos.utf8_data(), utf8_data() + utf8_length());
310  return UnsafeFind(look, start_pos);
311 }
312 
314  return UnsafeFind(look, begin());
315 }
316 
317 UnicodeText::const_iterator UnicodeText::UnsafeFind(
318  const UnicodeText& look, const_iterator start_pos) const {
319  // Due to the magic of the UTF8 encoding, searching for a sequence of
320  // letters is equivalent to substring search.
321  StringPiece searching(utf8_data(), utf8_length());
322  StringPiece look_piece(look.utf8_data(), look.utf8_length());
323  LOG(FATAL) << "Not implemented";
324  //StringPiece::size_type found =
325  // searching.find(look_piece, start_pos.utf8_data() - utf8_data());
326  StringPiece::size_type found = StringPiece::npos;
327  if (found == StringPiece::npos) return end();
328  return const_iterator(utf8_data() + found);
329 }
330 
332  // Equivalent to:
333  // UnicodeText replacement_char;
334  // replacement_char.push_back(0xFFFD);
335  // return find(replacement_char) != end();
336  StringPiece searching(utf8_data(), utf8_length());
337  StringPiece looking_for("\xEF\xBF\xBD", 3);
338  LOG(FATAL) << "Not implemented";
339  //return searching.find(looking_for) != StringPiece::npos;
340  return false;
341 }
342 
343 // ----- other methods -----
344 
345 // Clear operator
347  repr_.clear();
348 }
349 
350 // Destructor
352 
353 
355  if (UniLib::IsValidCodepoint(c)) {
356  char buf[UTFmax];
357  int len = runetochar(buf, &c);
358  if (UniLib::IsInterchangeValid(buf, len)) {
359  repr_.append(buf, len);
360  } else {
361  LOG(WARNING) << "Unicode value 0x" << std::hex << c
362  << " is not valid for interchange";
363  repr_.append(" ", 1);
364  }
365  } else {
366  LOG(WARNING) << "Illegal Unicode value: 0x" << std::hex << c;
367  repr_.append(" ", 1);
368  }
369 }
370 
371 int UnicodeText::size() const {
372  return CodepointCount(repr_.data_, repr_.size_);
373 }
374 
375 bool operator==(const UnicodeText& lhs, const UnicodeText& rhs) {
376  if (&lhs == &rhs) return true;
377  if (lhs.repr_.size_ != rhs.repr_.size_) return false;
378  return memcmp(lhs.repr_.data_, rhs.repr_.data_, lhs.repr_.size_) == 0;
379 }
380 
381 string UnicodeText::DebugString() const {
382  return tensorflow::strings::Printf("{UnicodeText %p chars=%d repr=%s}",
383  this,
384  size(),
385  repr_.DebugString().c_str());
386 }
387 
388 
389 // ******************* UnicodeText::const_iterator *********************
390 
391 // The implementation of const_iterator would be nicer if it
392 // inherited from boost::iterator_facade
393 // (http://boost.org/libs/iterator/doc/iterator_facade.html).
394 
396 
398  : it_(other.it_) {
399 }
400 
403  if (&other != this)
404  it_ = other.it_;
405  return *this;
406 }
407 
409  return const_iterator(repr_.data_);
410 }
411 
413  return const_iterator(repr_.data_ + repr_.size_);
414 }
415 
417  const UnicodeText::const_iterator& rhs) {
418  return lhs.it_ < rhs.it_;
419 }
420 
422  // (We could call chartorune here, but that does some
423  // error-checking, and we're guaranteed that our data is valid
424  // UTF-8. Also, we expect this routine to be called very often. So
425  // for speed, we do the calculation ourselves.)
426 
427  // Convert from UTF-8
428  unsigned char byte1 = it_[0];
429  if (byte1 < 0x80)
430  return byte1;
431 
432  unsigned char byte2 = it_[1];
433  if (byte1 < 0xE0)
434  return ((byte1 & 0x1F) << 6)
435  | (byte2 & 0x3F);
436 
437  unsigned char byte3 = it_[2];
438  if (byte1 < 0xF0)
439  return ((byte1 & 0x0F) << 12)
440  | ((byte2 & 0x3F) << 6)
441  | (byte3 & 0x3F);
442 
443  unsigned char byte4 = it_[3];
444  return ((byte1 & 0x07) << 18)
445  | ((byte2 & 0x3F) << 12)
446  | ((byte3 & 0x3F) << 6)
447  | (byte4 & 0x3F);
448 }
449 
451  it_ += UniLib::OneCharLen(it_);
452  return *this;
453 }
454 
456  while (UniLib::IsTrailByte(*--it_));
457  return *this;
458 }
459 
460 int UnicodeText::const_iterator::get_utf8(char* utf8_output) const {
461  utf8_output[0] = it_[0]; if ((it_[0] & 0xff) < 0x80) return 1;
462  utf8_output[1] = it_[1]; if ((it_[0] & 0xff) < 0xE0) return 2;
463  utf8_output[2] = it_[2]; if ((it_[0] & 0xff) < 0xF0) return 3;
464  utf8_output[3] = it_[3];
465  return 4;
466 }
467 
469  return string(utf8_data(), utf8_length());
470 }
471 
473  if ((it_[0] & 0xff) < 0x80) {
474  return 1;
475  } else if ((it_[0] & 0xff) < 0xE0) {
476  return 2;
477  } else if ((it_[0] & 0xff) < 0xF0) {
478  return 3;
479  } else {
480  return 4;
481  }
482 }
483 
485  CHECK(p != nullptr);
486  const char* start = utf8_data();
487  int len = utf8_length();
488  const char* end = start + len;
489  CHECK(p >= start);
490  CHECK(p <= end);
491  CHECK(p == end || !UniLib::IsTrailByte(*p));
492  return const_iterator(p);
493 }
494 
496  return tensorflow::strings::Printf("{iter %p}", it_);
497 }
498 
499 
500 // *************************** Utilities *************************
501 
502 string CodepointString(const UnicodeText& t) {
503  string s;
504  UnicodeText::const_iterator it = t.begin(), end = t.end();
505  while (it != end) tensorflow::strings::Appendf(&s, "%X ", *it++);
506  return s;
507 }
string
std::string string
Definition: equationdetect_test.cc:21
UnicodeText::~UnicodeText
~UnicodeText()
Definition: unicodetext.cc:351
UnicodeText::const_iterator
friend class const_iterator
Definition: unicodetext.h:332
UnicodeText::clear
void clear()
Definition: unicodetext.cc:346
FATAL
Definition: log.h:29
UnicodeText::const_iterator::difference_type
ptrdiff_t difference_type
Definition: unicodetext.h:181
UnicodeText::const_iterator::operator=
const_iterator & operator=(const const_iterator &other)
Definition: unicodetext.cc:402
UnicodeText::UTF8Substring
static string UTF8Substring(const const_iterator &first, const const_iterator &last)
Definition: unicodetext.cc:198
UnicodeText::push_back
void push_back(char32 codepoint)
Definition: unicodetext.cc:354
UnicodeText::size
int size() const
Definition: unicodetext.cc:371
UnicodeText::const_iterator::const_iterator
const_iterator()
Definition: unicodetext.cc:395
operator<
bool operator<(const UnicodeText::const_iterator &lhs, const UnicodeText::const_iterator &rhs)
Definition: unicodetext.cc:416
UnicodeText::DebugString
string DebugString() const
Definition: unicodetext.cc:381
UnicodeText::const_iterator::operator--
const_iterator & operator--()
Definition: unicodetext.cc:455
unicodetext.h
UnicodeText::UnicodeText
UnicodeText()
Definition: unicodetext.cc:183
UnicodeText::const_iterator::operator*
char32 operator*() const
Definition: unicodetext.cc:421
UnicodeText::MakeIterator
const_iterator MakeIterator(const char *p) const
Definition: unicodetext.cc:484
UnicodeText::TakeOwnershipOfUTF8
UnicodeText & TakeOwnershipOfUTF8(char *utf8_buffer, int byte_length, int byte_capacity)
Definition: unicodetext.cc:236
UnicodeText::const_iterator::utf8_data
const char * utf8_data() const
Definition: unicodetext.h:233
UnicodeText::operator=
UnicodeText & operator=(const UnicodeText &src)
Definition: unicodetext.cc:207
CHECK
#define CHECK(test)
Definition: include_gunit.h:57
UTFmax
Definition: utf.h:23
last
LIST last(LIST var_list)
Definition: oldlist.cpp:151
UnicodeText::find
const_iterator find(const UnicodeText &look, const_iterator start_pos) const
Definition: unicodetext.cc:306
unilib_utf8_utils.h
UnicodeText::utf8_length
int utf8_length() const
Definition: unicodetext.h:293
UnicodeText
Definition: unicodetext.h:116
distance
UnicodeText::const_iterator::difference_type distance(const UnicodeText::const_iterator &first, const UnicodeText::const_iterator &last)
Definition: unicodetext.cc:44
UnicodeText::PointTo
UnicodeText & PointTo(const UnicodeText &src)
Definition: unicodetext.cc:273
CodepointString
string CodepointString(const UnicodeText &t)
Definition: unicodetext.cc:502
UnicodeText::const_iterator::operator++
const_iterator & operator++()
Definition: unicodetext.cc:450
UnicodeText::PointToUTF8
UnicodeText & PointToUTF8(const char *utf8_buffer, int byte_length)
Definition: unicodetext.cc:256
CHECK_LE
#define CHECK_LE(test, value)
Definition: include_gunit.h:61
UnicodeText::const_iterator::DebugString
string DebugString() const
Definition: unicodetext.cc:495
UnicodeText::end
const_iterator end() const
Definition: unicodetext.cc:412
UniLib::IsTrailByte
bool IsTrailByte(char x)
Definition: unilib_utf8_utils.h:58
UnicodeText::HasReplacementChar
bool HasReplacementChar() const
Definition: unicodetext.cc:331
UniLib::IsInterchangeValid
bool IsInterchangeValid(char32 c)
Definition: unilib.cc:33
UniLib::IsValidCodepoint
bool IsValidCodepoint(char32 c)
Definition: unilib_utf8_utils.h:31
UnicodeText::const_iterator::utf8_length
int utf8_length() const
Definition: unicodetext.cc:472
WARNING
Definition: log.h:29
UniLib::SpanInterchangeValid
int SpanInterchangeValid(const char *begin, int byte_length)
Definition: unilib.cc:40
char32
signed int char32
Definition: pango_font_info.h:33
UnicodeText::append
UnicodeText & append(ForwardIterator first, const ForwardIterator last)
Definition: unicodetext.h:160
UnicodeText::utf8_data
const char * utf8_data() const
Definition: unicodetext.h:292
UnicodeText::Copy
UnicodeText & Copy(const UnicodeText &src)
Definition: unicodetext.cc:214
UnicodeText::const_iterator::get_utf8_string
string get_utf8_string() const
Definition: unicodetext.cc:468
UnicodeText::const_iterator
Definition: unicodetext.h:176
LOG
Definition: cleanapi_test.cc:19
operator==
bool operator==(const UnicodeText &lhs, const UnicodeText &rhs)
Definition: unicodetext.cc:375
isvalidcharntorune
int isvalidcharntorune(const char *str, int length, Rune *rune, int *consumed)
Definition: rune.c:247
unilib.h
UnicodeText::CopyUTF8
UnicodeText & CopyUTF8(const char *utf8_buffer, int byte_length)
Definition: unicodetext.cc:219
utf.h
runetochar
int runetochar(char *str, const Rune *rune)
Definition: rune.c:253
UniLib::OneCharLen
int OneCharLen(const char *src)
Definition: unilib_utf8_utils.h:53
UnicodeText::const_iterator::get_utf8
int get_utf8(char *buf) const
Definition: unicodetext.cc:460
UnicodeText::begin
const_iterator begin() const
Definition: unicodetext.cc:408