tesseract  4.0.0-1-g2a2b
tesseract::ValidateKhmer Class Reference

#include <validate_khmer.h>

Inheritance diagram for tesseract::ValidateKhmer:
tesseract::Validator

Public Member Functions

 ValidateKhmer (ViramaScript script, bool report_errors)
 
 ~ValidateKhmer ()
 
- Public Member Functions inherited from tesseract::Validator
virtual ~Validator ()
 

Protected Member Functions

bool ConsumeGraphemeIfValid () override
 
CharClass UnicodeToCharClass (char32 ch) const override
 
- Protected Member Functions inherited from tesseract::Validator
 Validator (ViramaScript script, bool report_errors)
 
bool ValidateCleanAndSegmentInternal (GraphemeNormMode g_mode, const std::vector< char32 > &src, std::vector< std::vector< char32 >> *dest)
 
void MoveResultsToDest (GraphemeNormMode g_mode, std::vector< std::vector< char32 >> *dest)
 
bool IsSubscriptScript () const
 
bool CodeOnlyToOutput ()
 
void MultiCodePart (int length)
 
bool UseMultiCode (int length)
 
void ComputeClassCodes (const std::vector< char32 > &text)
 
void Clear ()
 

Additional Inherited Members

- Static Public Member Functions inherited from tesseract::Validator
static bool ValidateCleanAndSegment (GraphemeNormMode g_mode, bool report_errors, const std::vector< char32 > &src, std::vector< std::vector< char32 >> *dest)
 
static bool IsZeroWidthMark (char32 ch)
 
- Static Public Attributes inherited from tesseract::Validator
static const char32 kZeroWidthSpace = 0x200B
 
static const char32 kZeroWidthNonJoiner = 0x200C
 
static const char32 kZeroWidthJoiner = 0x200D
 
static const char32 kLeftToRightMark = 0x200E
 
static const char32 kRightToLeftMark = 0x200F
 
static const char32 kInvalid = 0xfffd
 
- Protected Types inherited from tesseract::Validator
enum  CharClass {
  CharClass::kConsonant = 'C', CharClass::kVowel = 'V', CharClass::kVirama = 'H', CharClass::kMatra = 'M',
  CharClass::kMatraPiece = 'P', CharClass::kVowelModifier = 'D', CharClass::kZeroWidthNonJoiner = 'z', CharClass::kZeroWidthJoiner = 'Z',
  CharClass::kVedicMark = 'v', CharClass::kNukta = 'N', CharClass::kRobat = 'R', CharClass::kOther = 'O',
  CharClass::kWhitespace = ' ', CharClass::kCombiner = 'c'
}
 
using IndicPair = std::pair< CharClass, char32 >
 
- Static Protected Member Functions inherited from tesseract::Validator
static std::unique_ptr< Validator > ScriptValidator (ViramaScript script, bool report_errors)
 
static ViramaScript MostFrequentViramaScript (const std::vector< char32 > &utf32)
 
static bool IsVirama (char32 unicode)
 
static bool IsVedicAccent (char32 unicode)
 
- Protected Attributes inherited from tesseract::Validator
ViramaScript script_
 
std::vector< IndicPair > codes_
 
std::vector< std::vector< char32 > > parts_
 
std::vector< char32 > output_
 
int codes_used_
 
int output_used_
 
bool report_errors_
 
- Static Protected Attributes inherited from tesseract::Validator
static const int kIndicCodePageSize = 128
 
static const char32 kMinIndicUnicode = 0x900
 
static const char32 kMaxSinhalaUnicode = 0xdff
 
static const char32 kMaxViramaScriptUnicode = 0x17ff
 
static const char32 kSinhalaVirama = 0xdca
 
static const char32 kMyanmarVirama = 0x1039
 
static const char32 kKhmerVirama = 0x17d2
 
static const char32 kJavaneseVirama = 0xa9c0
 
static const char32 kMaxJavaneseUnicode = 0xa9df
 

Detailed Description

Definition at line 9 of file validate_khmer.h.

Constructor & Destructor Documentation

◆ ValidateKhmer()

tesseract::ValidateKhmer::ValidateKhmer ( ViramaScript  script,
bool  report_errors 
)
inline

Definition at line 11 of file validate_khmer.h.

12  : Validator(script, report_errors) {}
Validator(ViramaScript script, bool report_errors)
Definition: validator.h:136

◆ ~ValidateKhmer()

tesseract::ValidateKhmer::~ValidateKhmer ( )
inline

Definition at line 13 of file validate_khmer.h.

13 {}

Member Function Documentation

◆ ConsumeGraphemeIfValid()

bool tesseract::ValidateKhmer::ConsumeGraphemeIfValid ( )
override protected virtual

Implements tesseract::Validator.

Definition at line 20 of file validate_khmer.cpp.

20  {
21  int num_codes = codes_.size();
22  if (codes_used_ == num_codes) return false;
23  if (codes_[codes_used_].first == CharClass::kOther) {
24  UseMultiCode(1);
25  return true;
26  }
28  if (report_errors_) {
29  tprintf("Invalid start of Khmer syllable:0x%x\n",
30  codes_[codes_used_].second);
31  }
32  return false;
33  }
34  if (UseMultiCode(1)) return true;
35  if (codes_[codes_used_].first == CharClass::kRobat ||
37  if (UseMultiCode(1)) return true;
38  }
39  while (codes_used_ + 1 < num_codes &&
43  if (UseMultiCode(2)) return true;
44  if (codes_[codes_used_].first == CharClass::kRobat) {
45  if (UseMultiCode(1)) return true;
46  }
47  }
48  int num_matra_parts = 0;
49  if (codes_[codes_used_].second == kZeroWidthJoiner ||
51  if (CodeOnlyToOutput()) {
52  if (report_errors_) {
53  tprintf("Unterminated joiner: 0x%x\n", output_.back());
54  }
55  return false;
56  }
57  ++num_matra_parts;
58  }
59  // Not quite as shown by the BNF, the matra piece is allowed as a matra on its
60  // own or as an addition to other matras.
61  if (codes_[codes_used_].first == CharClass::kMatra ||
63  ++num_matra_parts;
64  if (UseMultiCode(num_matra_parts)) return true;
65  } else if (num_matra_parts) {
66  if (report_errors_) {
67  tprintf("Joiner with non-dependent vowel after it!:0x%x 0x%x\n",
68  output_.back(), codes_[codes_used_].second);
69  }
70  return false;
71  }
74  if (UseMultiCode(1)) return true;
75  }
77  if (UseMultiCode(1)) return true;
78  }
79  if (codes_used_ + 1 < num_codes &&
83  if (UseMultiCode(2)) return true;
84  }
85  return true;
86 }
std::vector< IndicPair > codes_
Definition: validator.h:232
static const char32 kZeroWidthNonJoiner
Definition: validator.h:96
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
bool UseMultiCode(int length)
Definition: validator.h:196
static const char32 kZeroWidthJoiner
Definition: validator.h:97
std::vector< char32 > output_
Definition: validator.h:236
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ UnicodeToCharClass()

Validator::CharClass tesseract::ValidateKhmer::UnicodeToCharClass ( char32  ch) const
override protected virtual

Implements tesseract::Validator.

Definition at line 88 of file validate_khmer.cpp.

88  {
89  if (IsVedicAccent(ch)) return CharClass::kVedicMark;
92  // Offset from the start of the relevant unicode code block aka code page.
93  int off = ch - static_cast<char32>(script_);
94  // Anything in another code block is other.
95  if (off < 0 || off >= kIndicCodePageSize) return CharClass::kOther;
96  if (off <= 0x33) return CharClass::kConsonant;
97  if (off <= 0x45) return CharClass::kMatra;
98  if (off == 0x46) return CharClass::kMatraPiece;
99  if (off == 0x4c) return CharClass::kRobat;
100  if (off == 0x49 || off == 0x4a) return CharClass::kNukta;
101  if (off <= 0x51) return CharClass::kVowelModifier;
102  if (off == 0x52) return CharClass::kVirama;
103  return CharClass::kOther;
104 }
signed int char32
static bool IsVedicAccent(char32 unicode)
Definition: validator.cpp:191
static const char32 kZeroWidthNonJoiner
Definition: validator.h:96
static const int kIndicCodePageSize
Definition: validator.h:214
ViramaScript script_
Definition: validator.h:230
static const char32 kZeroWidthJoiner
Definition: validator.h:97

The documentation for this class was generated from the following files: