tesseract  5.0.0-alpha-619-ge9db
tesseract::ValidateGrapheme Class Reference

#include <validate_grapheme.h>

Inheritance diagram for tesseract::ValidateGrapheme:
tesseract::Validator

Public Member Functions

 ValidateGrapheme (ViramaScript script, bool report_errors)
 
 ~ValidateGrapheme ()
 
- Public Member Functions inherited from tesseract::Validator
virtual ~Validator ()
 

Protected Member Functions

bool ConsumeGraphemeIfValid () override
 
CharClass UnicodeToCharClass (char32 ch) const override
 
- Protected Member Functions inherited from tesseract::Validator
 Validator (ViramaScript script, bool report_errors)
 
bool ValidateCleanAndSegmentInternal (GraphemeNormMode g_mode, const std::vector< char32 > &src, std::vector< std::vector< char32 >> *dest)
 
void MoveResultsToDest (GraphemeNormMode g_mode, std::vector< std::vector< char32 >> *dest)
 
bool IsSubscriptScript () const
 
bool CodeOnlyToOutput ()
 
void MultiCodePart (unsigned length)
 
bool UseMultiCode (unsigned length)
 
void ComputeClassCodes (const std::vector< char32 > &text)
 
void Clear ()
 

Additional Inherited Members

- Static Public Member Functions inherited from tesseract::Validator
static bool ValidateCleanAndSegment (GraphemeNormMode g_mode, bool report_errors, const std::vector< char32 > &src, std::vector< std::vector< char32 >> *dest)
 
static bool IsZeroWidthMark (char32 ch)
 
- Static Public Attributes inherited from tesseract::Validator
static const char32 kZeroWidthSpace = 0x200B
 
static const char32 kZeroWidthNonJoiner = 0x200C
 
static const char32 kZeroWidthJoiner = 0x200D
 
static const char32 kLeftToRightMark = 0x200E
 
static const char32 kRightToLeftMark = 0x200F
 
static const char32 kInvalid = 0xfffd
 
- Protected Types inherited from tesseract::Validator
enum  CharClass {
  CharClass::kConsonant = 'C', CharClass::kVowel = 'V', CharClass::kVirama = 'H', CharClass::kMatra = 'M',
  CharClass::kMatraPiece = 'P', CharClass::kVowelModifier = 'D', CharClass::kZeroWidthNonJoiner = 'z', CharClass::kZeroWidthJoiner = 'Z',
  CharClass::kVedicMark = 'v', CharClass::kNukta = 'N', CharClass::kRobat = 'R', CharClass::kOther = 'O',
  CharClass::kWhitespace = ' ', CharClass::kCombiner = 'c'
}
 
using IndicPair = std::pair< CharClass, char32 >
 
- Static Protected Member Functions inherited from tesseract::Validator
static std::unique_ptr< Validator > ScriptValidator (ViramaScript script, bool report_errors)
 
static ViramaScript MostFrequentViramaScript (const std::vector< char32 > &utf32)
 
static bool IsVirama (char32 unicode)
 
static bool IsVedicAccent (char32 unicode)
 
- Protected Attributes inherited from tesseract::Validator
ViramaScript script_
 
std::vector< IndicPair > codes_
 
std::vector< std::vector< char32 > > parts_
 
std::vector< char32 > output_
 
unsigned codes_used_
 
unsigned output_used_
 
bool report_errors_
 
- Static Protected Attributes inherited from tesseract::Validator
static const int kIndicCodePageSize = 128
 
static const char32 kMinIndicUnicode = 0x900
 
static const char32 kMaxSinhalaUnicode = 0xdff
 
static const char32 kMaxViramaScriptUnicode = 0x17ff
 
static const char32 kSinhalaVirama = 0xdca
 
static const char32 kMyanmarVirama = 0x1039
 
static const char32 kKhmerVirama = 0x17d2
 
static const char32 kJavaneseVirama = 0xa9c0
 
static const char32 kMaxJavaneseUnicode = 0xa9df
 

Detailed Description

Definition at line 10 of file validate_grapheme.h.

Constructor & Destructor Documentation

◆ ValidateGrapheme()

tesseract::ValidateGrapheme::ValidateGrapheme ( ViramaScript  script,
bool  report_errors 
)
inline

Definition at line 12 of file validate_grapheme.h.

13  : Validator(script, report_errors) {}

◆ ~ValidateGrapheme()

tesseract::ValidateGrapheme::~ValidateGrapheme ( )
inline

Definition at line 14 of file validate_grapheme.h.

14 {}

Member Function Documentation

◆ ConsumeGraphemeIfValid()

bool tesseract::ValidateGrapheme::ConsumeGraphemeIfValid ( )
override protected virtual

Implements tesseract::Validator.

Definition at line 7 of file validate_grapheme.cpp.

7  {
8  const unsigned num_codes = codes_.size();
9  char32 prev_prev_ch = ' ';
10  char32 prev_ch = ' ';
11  CharClass prev_cc = CharClass::kWhitespace;
12  int num_codes_in_grapheme = 0;
13  while (codes_used_ < num_codes) {
14  CharClass cc = codes_[codes_used_].first;
15  char32 ch = codes_[codes_used_].second;
16  const bool is_combiner =
17  cc == CharClass::kCombiner || cc == CharClass::kVedicMark;
18  // TODO: Make this code work well with RTL text.
19  // See https://github.com/tesseract-ocr/tesseract/pull/2266#issuecomment-467114751
20  #if 0
21  // Reject easily detected badly formed sequences.
22  if (prev_cc == CharClass::kWhitespace && is_combiner) {
23  if (report_errors_) tprintf("Word started with a combiner:0x%x\n", ch);
24  return false;
25  }
26  #endif
27  if (prev_cc == CharClass::kVirama && cc == CharClass::kVirama) {
28  if (report_errors_)
29  tprintf("Two grapheme links in a row:0x%x 0x%x\n", prev_ch, ch);
30  return false;
31  }
32  if (prev_cc != CharClass::kWhitespace && cc != CharClass::kWhitespace &&
33  IsBadlyFormed(prev_ch, ch)) {
34  return false;
35  }
36  bool prev_is_fwd_combiner =
37  prev_ch == kZeroWidthJoiner || prev_cc == CharClass::kVirama ||
38  (prev_ch == kZeroWidthNonJoiner &&
39  (cc == CharClass::kVirama || prev_prev_ch == kZeroWidthJoiner));
40  if (num_codes_in_grapheme > 0 && !is_combiner && !prev_is_fwd_combiner)
41  break;
42  CodeOnlyToOutput();
43  ++num_codes_in_grapheme;
44  prev_prev_ch = prev_ch;
45  prev_ch = ch;
46  prev_cc = cc;
47  }
48  if (num_codes_in_grapheme > 0) MultiCodePart(num_codes_in_grapheme);
49  return true;
50 }

◆ UnicodeToCharClass()

Validator::CharClass tesseract::ValidateGrapheme::UnicodeToCharClass ( char32  ch) const
override protected virtual

Implements tesseract::Validator.

Definition at line 52 of file validate_grapheme.cpp.

52  {
53  if (IsVedicAccent(ch)) return CharClass::kVedicMark;
54  // The ZeroWidth[Non]Joiner characters are mapped to kCombiner as they
55  // always combine with the previous character.
56  if (u_hasBinaryProperty(ch, UCHAR_GRAPHEME_LINK)) return CharClass::kVirama;
57  if (u_isUWhiteSpace(ch)) return CharClass::kWhitespace;
58  // Workaround for Javanese Aksara's Taling, do not label it as a combiner
59  if (ch == 0xa9ba) return CharClass::kConsonant;
60  int char_type = u_charType(ch);
61  if (char_type == U_NON_SPACING_MARK || char_type == U_ENCLOSING_MARK ||
62  char_type == U_COMBINING_SPACING_MARK || ch == kZeroWidthNonJoiner ||
63  ch == kZeroWidthJoiner)
64  return CharClass::kCombiner;
65  return CharClass::kOther;
66 }

The documentation for this class was generated from the following files:
validate_grapheme.h
validate_grapheme.cpp

tesseract::Validator::MultiCodePart
void MultiCodePart(unsigned length)
Definition: validator.h:196
tesseract::Validator::IsVedicAccent
static bool IsVedicAccent(char32 unicode)
Definition: validator.cpp:191
tesseract::Validator::CharClass::kCombiner
tesseract::Validator::codes_
std::vector< IndicPair > codes_
Definition: validator.h:246
tesseract::Validator::kZeroWidthNonJoiner
static const char32 kZeroWidthNonJoiner
Definition: validator.h:110
tesseract::Validator::CharClass
CharClass
Definition: validator.h:126
tesseract::Validator::kZeroWidthJoiner
static const char32 kZeroWidthJoiner
Definition: validator.h:111
tesseract::Validator::report_errors_
bool report_errors_
Definition: validator.h:256
tesseract::Validator::CharClass::kVedicMark
tesseract::Validator::codes_used_
unsigned codes_used_
Definition: validator.h:252
tesseract::Validator::CharClass::kVirama
tesseract::Validator::CodeOnlyToOutput
bool CodeOnlyToOutput()
Definition: validator.h:186
tesseract::Validator::CharClass::kConsonant
char32
signed int char32
Definition: pango_font_info.h:33
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
tesseract::Validator::CharClass::kOther
tesseract::Validator::CharClass::kWhitespace
tesseract::Validator::Validator
Validator(ViramaScript script, bool report_errors)
Definition: validator.h:150