tesseract  5.0.0-alpha-619-ge9db
UnicodeText Class Reference

#include <unicodetext.h>

Classes

class  const_iterator
 
class  const_reverse_iterator
 

Public Types

typedef char32 value_type
 

Public Member Functions

 UnicodeText ()
 
 UnicodeText (const UnicodeText &src)
 
 UnicodeText (const const_iterator &first, const const_iterator &last)
 
UnicodeText & operator= (const UnicodeText &src)
 
UnicodeText & Copy (const UnicodeText &src)
 
UnicodeText & assign (const UnicodeText &src)
 
UnicodeText & PointTo (const UnicodeText &src)
 
UnicodeText & PointTo (const const_iterator &first, const const_iterator &last)
 
 ~UnicodeText ()
 
void clear ()
 
bool empty () const
 
void push_back (char32 codepoint)
 
template<typename ForwardIterator >
UnicodeText & append (ForwardIterator first, const ForwardIterator last)
 
UnicodeText & append (const const_iterator &first, const const_iterator &last)
 
UnicodeText & append (const UnicodeText &source)
 
int size () const
 
const_iterator begin () const
 
const_iterator end () const
 
const_reverse_iterator rbegin () const
 
const_reverse_iterator rend () const
 
const_iterator find (const UnicodeText &look, const_iterator start_pos) const
 
const_iterator find (const UnicodeText &look) const
 
bool HasReplacementChar () const
 
const char * utf8_data () const
 
int utf8_length () const
 
int utf8_capacity () const
 
UnicodeText & CopyUTF8 (const char *utf8_buffer, int byte_length)
 
UnicodeText & TakeOwnershipOfUTF8 (char *utf8_buffer, int byte_length, int byte_capacity)
 
UnicodeText & PointToUTF8 (const char *utf8_buffer, int byte_length)
 
const_iterator MakeIterator (const char *p) const
 
string DebugString () const
 

Static Public Member Functions

static string UTF8Substring (const const_iterator &first, const const_iterator &last)
 

Friends

class const_iterator
 
class UnicodeTextUtils
 
bool operator== (const UnicodeText &lhs, const UnicodeText &rhs)
 
bool operator!= (const UnicodeText &lhs, const UnicodeText &rhs)
 

Detailed Description

Copyright 2010 Google Inc.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

 http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

Definition at line 116 of file unicodetext.h.

Member Typedef Documentation

◆ value_type

Definition at line 118 of file unicodetext.h.

Constructor & Destructor Documentation

◆ UnicodeText() [1/3]

UnicodeText::UnicodeText ( )

Definition at line 183 of file unicodetext.cc.

183  {
184 }

◆ UnicodeText() [2/3]

UnicodeText::UnicodeText ( const UnicodeText & src)

Definition at line 187 of file unicodetext.cc.

187  {
188  Copy(src);
189 }

◆ UnicodeText() [3/3]

UnicodeText::UnicodeText ( const const_iterator & first,
const const_iterator & last 
)

Definition at line 192 of file unicodetext.cc.

193  {
194  CHECK(first <= last) << " Incompatible iterators";
195  repr_.append(first.it_, last.it_ - first.it_);
196 }

◆ ~UnicodeText()

UnicodeText::~UnicodeText ( )

Definition at line 351 of file unicodetext.cc.

351 {}

Member Function Documentation

◆ append() [1/3]

UnicodeText & UnicodeText::append ( const const_iterator & first,
const const_iterator & last 
)

Definition at line 292 of file unicodetext.cc.

293  {
294  CHECK(first <= last) << " Incompatible iterators";
295  repr_.append(first.it_, last.it_ - first.it_);
296  return *this;
297 }

◆ append() [2/3]

UnicodeText & UnicodeText::append ( const UnicodeText & source)

Definition at line 287 of file unicodetext.cc.

287  {
288  repr_.append(u.repr_.data_, u.repr_.size_);
289  return *this;
290 }

◆ append() [3/3]

template<typename ForwardIterator >
UnicodeText& UnicodeText::append ( ForwardIterator  first,
const ForwardIterator  last 
)
inline

Definition at line 160 of file unicodetext.h.

160  {
161  while (first != last) { push_back(*first++); }
162  return *this;
163  }

◆ assign()

UnicodeText& UnicodeText::assign ( const UnicodeText & src)
inline

Definition at line 134 of file unicodetext.h.

134 { return Copy(src); }

◆ begin()

UnicodeText::const_iterator UnicodeText::begin ( ) const

Definition at line 408 of file unicodetext.cc.

408  {
409  return const_iterator(repr_.data_);
410 }

◆ clear()

void UnicodeText::clear ( )

Definition at line 346 of file unicodetext.cc.

346  {
347  repr_.clear();
348 }

◆ Copy()

UnicodeText & UnicodeText::Copy ( const UnicodeText & src)

Definition at line 214 of file unicodetext.cc.

214  {
215  repr_.Copy(src.repr_.data_, src.repr_.size_);
216  return *this;
217 }

◆ CopyUTF8()

UnicodeText & UnicodeText::CopyUTF8 ( const char *  utf8_buffer,
int  byte_length 
)

Definition at line 219 of file unicodetext.cc.

219  {
220  repr_.Copy(buffer, byte_length);
221  if (!UniLib:: IsInterchangeValid(buffer, byte_length)) {
222  LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";
223  repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
224  }
225  return *this;
226 }

◆ DebugString()

string UnicodeText::DebugString ( ) const

Definition at line 381 of file unicodetext.cc.

381  {
382  return tensorflow::strings::Printf("{UnicodeText %p chars=%d repr=%s}",
383  this,
384  size(),
385  repr_.DebugString().c_str());
386 }

◆ empty()

bool UnicodeText::empty ( ) const
inline

Definition at line 145 of file unicodetext.h.

145 { return repr_.size_ == 0; } // Test if text is empty.

◆ end()

UnicodeText::const_iterator UnicodeText::end ( ) const

Definition at line 412 of file unicodetext.cc.

412  {
413  return const_iterator(repr_.data_ + repr_.size_);
414 }

◆ find() [1/2]

UnicodeText::const_iterator UnicodeText::find ( const UnicodeText & look) const

Definition at line 313 of file unicodetext.cc.

313  {
314  return UnsafeFind(look, begin());
315 }

◆ find() [2/2]

UnicodeText::const_iterator UnicodeText::find ( const UnicodeText & look,
const_iterator  start_pos 
) const

Definition at line 306 of file unicodetext.cc.

307  {
308  CHECK_GE(start_pos.utf8_data(), utf8_data());
309  CHECK_LE(start_pos.utf8_data(), utf8_data() + utf8_length());
310  return UnsafeFind(look, start_pos);
311 }

◆ HasReplacementChar()

bool UnicodeText::HasReplacementChar ( ) const

Definition at line 331 of file unicodetext.cc.

331  {
332  // Equivalent to:
333  // UnicodeText replacement_char;
334  // replacement_char.push_back(0xFFFD);
335  // return find(replacement_char) != end();
336  StringPiece searching(utf8_data(), utf8_length());
337  StringPiece looking_for("\xEF\xBF\xBD", 3);
338  LOG(FATAL) << "Not implemented";
339  //return searching.find(looking_for) != StringPiece::npos;
340  return false;
341 }

◆ MakeIterator()

UnicodeText::const_iterator UnicodeText::MakeIterator ( const char *  p) const

Definition at line 484 of file unicodetext.cc.

484  {
485  CHECK(p != nullptr);
486  const char* start = utf8_data();
487  int len = utf8_length();
488  const char* end = start + len;
489  CHECK(p >= start);
490  CHECK(p <= end);
491  CHECK(p == end || !UniLib::IsTrailByte(*p));
492  return const_iterator(p);
493 }

◆ operator=()

UnicodeText & UnicodeText::operator= ( const UnicodeText & src)

Definition at line 207 of file unicodetext.cc.

207  {
208  if (this != &src) {
209  Copy(src);
210  }
211  return *this;
212 }

◆ PointTo() [1/2]

UnicodeText & UnicodeText::PointTo ( const const_iterator & first,
const const_iterator & last 
)

Definition at line 278 of file unicodetext.cc.

279  {
280  CHECK(first <= last) << " Incompatible iterators";
281  repr_.PointTo(first.utf8_data(), last.utf8_data() - first.utf8_data());
282  return *this;
283 }

◆ PointTo() [2/2]

UnicodeText & UnicodeText::PointTo ( const UnicodeText & src)

Definition at line 273 of file unicodetext.cc.

273  {
274  repr_.PointTo(src.repr_.data_, src.repr_.size_);
275  return *this;
276 }

◆ PointToUTF8()

UnicodeText & UnicodeText::PointToUTF8 ( const char *  utf8_buffer,
int  byte_length 
)

Definition at line 256 of file unicodetext.cc.

256  {
257  if (UniLib:: IsInterchangeValid(buffer, byte_length)) {
258  repr_.PointTo(buffer, byte_length);
259  } else {
260  LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";
261  repr_.Copy(buffer, byte_length);
262  repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
263  }
264  return *this;
265 }

◆ push_back()

void UnicodeText::push_back ( char32  codepoint)

Definition at line 354 of file unicodetext.cc.

354  {
355  if (UniLib::IsValidCodepoint(c)) {
356  char buf[UTFmax];
357  int len = runetochar(buf, &c);
358  if (UniLib::IsInterchangeValid(buf, len)) {
359  repr_.append(buf, len);
360  } else {
361  LOG(WARNING) << "Unicode value 0x" << std::hex << c
362  << " is not valid for interchange";
363  repr_.append(" ", 1);
364  }
365  } else {
366  LOG(WARNING) << "Illegal Unicode value: 0x" << std::hex << c;
367  repr_.append(" ", 1);
368  }
369 }

◆ rbegin()

const_reverse_iterator UnicodeText::rbegin ( ) const
inline

Definition at line 270 of file unicodetext.h.

270  {
271  return const_reverse_iterator(end());
272  }

◆ rend()

const_reverse_iterator UnicodeText::rend ( ) const
inline

Definition at line 273 of file unicodetext.h.

273  {
274  return const_reverse_iterator(begin());
275  }

◆ size()

int UnicodeText::size ( ) const

Definition at line 371 of file unicodetext.cc.

371  {
372  return CodepointCount(repr_.data_, repr_.size_);
373 }

◆ TakeOwnershipOfUTF8()

UnicodeText & UnicodeText::TakeOwnershipOfUTF8 ( char *  utf8_buffer,
int  byte_length,
int  byte_capacity 
)

Definition at line 236 of file unicodetext.cc.

238  {
239  repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);
240  if (!UniLib:: IsInterchangeValid(buffer, byte_length)) {
241  LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";
242  repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
243  }
244  return *this;
245 }

◆ utf8_capacity()

int UnicodeText::utf8_capacity ( ) const
inline

Definition at line 294 of file unicodetext.h.

294 { return repr_.capacity_; }

◆ utf8_data()

const char* UnicodeText::utf8_data ( ) const
inline

Definition at line 292 of file unicodetext.h.

292 { return repr_.data_; }

◆ utf8_length()

int UnicodeText::utf8_length ( ) const
inline

Definition at line 293 of file unicodetext.h.

293 { return repr_.size_; }

◆ UTF8Substring()

string UnicodeText::UTF8Substring ( const const_iterator & first,
const const_iterator & last 
)
static

Definition at line 198 of file unicodetext.cc.

199  {
200  CHECK(first <= last) << " Incompatible iterators";
201  return string(first.it_, last.it_ - first.it_);
202 }

Friends And Related Function Documentation

◆ const_iterator

friend class const_iterator
friend

Definition at line 332 of file unicodetext.h.

◆ operator!=

bool operator!= ( const UnicodeText & lhs,
const UnicodeText & rhs 
)
friend

Definition at line 380 of file unicodetext.h.

380  {
381  return !(lhs == rhs);
382 }

◆ operator==

bool operator== ( const UnicodeText & lhs,
const UnicodeText & rhs 
)
friend

Definition at line 375 of file unicodetext.cc.

375  {
376  if (&lhs == &rhs) return true;
377  if (lhs.repr_.size_ != rhs.repr_.size_) return false;
378  return memcmp(lhs.repr_.data_, rhs.repr_.data_, lhs.repr_.size_) == 0;
379 }

◆ UnicodeTextUtils

friend class UnicodeTextUtils
friend

Definition at line 333 of file unicodetext.h.


The documentation for this class was generated from the following files: unicodetext.h and unicodetext.cc.
string
std::string string
Definition: equationdetect_test.cc:21
UnicodeText::const_iterator
friend class const_iterator
Definition: unicodetext.h:332
FATAL
Definition: log.h:29
UnicodeText::push_back
void push_back(char32 codepoint)
Definition: unicodetext.cc:354
UnicodeText::size
int size() const
Definition: unicodetext.cc:371
CHECK
#define CHECK(test)
Definition: include_gunit.h:57
UTFmax
Definition: utf.h:23
last
LIST last(LIST var_list)
Definition: oldlist.cpp:151
UnicodeText::utf8_length
int utf8_length() const
Definition: unicodetext.h:293
CHECK_LE
#define CHECK_LE(test, value)
Definition: include_gunit.h:61
UnicodeText::end
const_iterator end() const
Definition: unicodetext.cc:412
UniLib::IsTrailByte
bool IsTrailByte(char x)
Definition: unilib_utf8_utils.h:58
UniLib::IsInterchangeValid
bool IsInterchangeValid(char32 c)
Definition: unilib.cc:33
UniLib::IsValidCodepoint
bool IsValidCodepoint(char32 c)
Definition: unilib_utf8_utils.h:31
WARNING
Definition: log.h:29
UnicodeText::utf8_data
const char * utf8_data() const
Definition: unicodetext.h:292
UnicodeText::Copy
UnicodeText & Copy(const UnicodeText &src)
Definition: unicodetext.cc:214
LOG
Definition: cleanapi_test.cc:19
runetochar
int runetochar(char *str, const Rune *rune)
Definition: rune.c:253
UnicodeText::begin
const_iterator begin() const
Definition: unicodetext.cc:408