tesseract  4.0.0-1-g2a2b
tesseract::UnicodeSpanSkipper Class Reference

Public Member Functions

 UnicodeSpanSkipper (const UNICHARSET *unicharset, const WERD_CHOICE *word)
 
int SkipPunc (int pos)
 
int SkipDigits (int pos)
 
int SkipRomans (int pos)
 
int SkipAlpha (int pos)
 

Detailed Description

Definition at line 297 of file paragraphs.cpp.

Constructor & Destructor Documentation

◆ UnicodeSpanSkipper()

tesseract::UnicodeSpanSkipper::UnicodeSpanSkipper ( const UNICHARSET *unicharset,
const WERD_CHOICE *word 
)
inline

Definition at line 299 of file paragraphs.cpp.

300  : u_(unicharset), word_(word) { wordlen_ = word->length(); }
int length() const
Definition: ratngs.h:303

Member Function Documentation

◆ SkipAlpha()

int tesseract::UnicodeSpanSkipper::SkipAlpha ( int  pos)

Definition at line 338 of file paragraphs.cpp.

338  {
339  while (pos < wordlen_ && u_->get_isalpha(word_->unichar_id(pos))) pos++;
340  return pos;
341 }
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315

◆ SkipDigits()

int tesseract::UnicodeSpanSkipper::SkipDigits ( int  pos)

Definition at line 322 of file paragraphs.cpp.

322  {
323  while (pos < wordlen_ && (u_->get_isdigit(word_->unichar_id(pos)) ||
324  IsDigitLike(UnicodeFor(u_, word_, pos)))) pos++;
325  return pos;
326 }
int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos)
Definition: paragraphs.cpp:289
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:507
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315

◆ SkipPunc()

int tesseract::UnicodeSpanSkipper::SkipPunc ( int  pos)

Definition at line 317 of file paragraphs.cpp.

317  {
318  while (pos < wordlen_ && u_->get_ispunctuation(word_->unichar_id(pos))) pos++;
319  return pos;
320 }
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315

◆ SkipRomans()

int tesseract::UnicodeSpanSkipper::SkipRomans ( int  pos)

Definition at line 328 of file paragraphs.cpp.

328  {
329  const char *kRomans = "ivxlmdIVXLMD";
330  while (pos < wordlen_) {
331  int ch = UnicodeFor(u_, word_, pos);
332  if (ch >= 0xF0 || strchr(kRomans, ch) == nullptr) break;
333  pos++;
334  }
335  return pos;
336 }
int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos)
Definition: paragraphs.cpp:289

The documentation for this class was generated from the following file: