#include <validator.h>
|
enum | CharClass {
CharClass::kConsonant = 'C',
CharClass::kVowel = 'V',
CharClass::kVirama = 'H',
CharClass::kMatra = 'M',
CharClass::kMatraPiece = 'P',
CharClass::kVowelModifier = 'D',
CharClass::kZeroWidthNonJoiner = 'z',
CharClass::kZeroWidthJoiner = 'Z',
CharClass::kVedicMark = 'v',
CharClass::kNukta = 'N',
CharClass::kRobat = 'R',
CharClass::kOther = 'O',
CharClass::kWhitespace = ' ',
CharClass::kCombiner = 'c'
} |
|
using | IndicPair = std::pair< CharClass, char32 > |
|
Definition at line 86 of file validator.h.
◆ IndicPair
◆ CharClass
Enumerator |
---|
kConsonant | |
kVowel | |
kVirama | |
kMatra | |
kMatraPiece | |
kVowelModifier | |
kZeroWidthNonJoiner | |
kZeroWidthJoiner | |
kVedicMark | |
kNukta | |
kRobat | |
kOther | |
kWhitespace | |
kCombiner | |
Definition at line 126 of file validator.h.
◆ ~Validator()
tesseract::Validator::~Validator |
( |
| ) |
|
|
virtual default |
◆ Validator()
tesseract::Validator::Validator |
( |
ViramaScript |
script, |
|
|
bool |
report_errors |
|
) |
| |
|
inline protected |
◆ Clear()
void tesseract::Validator::Clear |
( |
| ) |
|
|
protected |
◆ CodeOnlyToOutput()
bool tesseract::Validator::CodeOnlyToOutput |
( |
| ) |
|
|
inline protected |
◆ ComputeClassCodes()
void tesseract::Validator::ComputeClassCodes |
( |
const std::vector< char32 > & |
text | ) |
|
|
protected |
◆ ConsumeGraphemeIfValid()
virtual bool tesseract::Validator::ConsumeGraphemeIfValid |
( |
| ) |
|
|
protected pure virtual |
◆ IsSubscriptScript()
bool tesseract::Validator::IsSubscriptScript |
( |
| ) |
const |
|
protected |
◆ IsVedicAccent()
bool tesseract::Validator::IsVedicAccent |
( |
char32 |
unicode | ) |
|
|
static protected |
Definition at line 191 of file validator.cpp.
192 return (0x1cd0 <= unicode && unicode < 0x1d00) ||
193 (0xa8e0 <= unicode && unicode <= 0xa8f7) ||
194 (0x951 <= unicode && unicode <= 0x954);
◆ IsVirama()
bool tesseract::Validator::IsVirama |
( |
char32 |
unicode | ) |
|
|
static protected |
◆ IsZeroWidthMark()
static bool tesseract::Validator::IsZeroWidthMark |
( |
char32 |
ch | ) |
|
|
inline static |
◆ MostFrequentViramaScript()
ViramaScript tesseract::Validator::MostFrequentViramaScript |
( |
const std::vector< char32 > & |
utf32 | ) |
|
|
static protected |
Definition at line 143 of file validator.cpp.
145 std::unordered_map<int, int> histogram;
152 UScriptCode script_code = uscript_getScript(ch, err);
154 script_code != USCRIPT_COMMON) ||
155 script_code == USCRIPT_MYANMAR) {
156 if (script_code == USCRIPT_MYANMAR)
161 if (!histogram.empty()) {
163 std::max_element(histogram.begin(), histogram.end(), CmpPairSecond)
172 return static_cast<ViramaScript>(codebase);
◆ MoveResultsToDest()
void tesseract::Validator::MoveResultsToDest |
( |
GraphemeNormMode |
g_mode, |
|
|
std::vector< std::vector< char32 >> * |
dest |
|
) |
| |
|
protected |
◆ MultiCodePart()
void tesseract::Validator::MultiCodePart |
( |
unsigned |
length | ) |
|
|
inline protected |
◆ ScriptValidator()
std::unique_ptr< Validator > tesseract::Validator::ScriptValidator |
( |
ViramaScript |
script, |
|
|
bool |
report_errors |
|
) |
| |
|
static protected |
Definition at line 71 of file validator.cpp.
75 return std::unique_ptr<Validator>(
76 new ValidateGrapheme(script, report_errors));
78 return std::unique_ptr<Validator>(
79 new ValidateJavanese(script, report_errors));
81 return std::unique_ptr<Validator>(
82 new ValidateMyanmar(script, report_errors));
84 return std::unique_ptr<Validator>(
85 new ValidateKhmer(script, report_errors));
87 return std::unique_ptr<Validator>(
88 new ValidateIndic(script, report_errors));
◆ UnicodeToCharClass()
virtual CharClass tesseract::Validator::UnicodeToCharClass |
( |
char32 |
ch | ) |
const |
|
protected pure virtual |
◆ UseMultiCode()
bool tesseract::Validator::UseMultiCode |
( |
unsigned |
length | ) |
|
|
inline protected |
◆ ValidateCleanAndSegment()
bool tesseract::Validator::ValidateCleanAndSegment |
( |
GraphemeNormMode |
g_mode, |
|
|
bool |
report_errors, |
|
|
const std::vector< char32 > & |
src, |
|
|
std::vector< std::vector< char32 >> * |
dest |
|
) |
| |
|
static |
Definition at line 40 of file validator.cpp.
44 std::vector<std::vector<char32>> graphemes;
55 success = g_validator.ValidateCleanAndSegmentInternal(g_mode, src, dest);
57 success = g_validator.ValidateCleanAndSegmentInternal(
59 std::unique_ptr<Validator> validator(
61 for (const auto& grapheme : graphemes) {
62 if (!validator->ValidateCleanAndSegmentInternal(g_mode, grapheme, dest)) {
◆ ValidateCleanAndSegmentInternal()
bool tesseract::Validator::ValidateCleanAndSegmentInternal |
( |
GraphemeNormMode |
g_mode, |
|
|
const std::vector< char32 > & |
src, |
|
|
std::vector< std::vector< char32 >> * |
dest |
|
) |
| |
|
protected |
◆ codes_
std::vector<IndicPair> tesseract::Validator::codes_ |
|
protected |
◆ codes_used_
unsigned tesseract::Validator::codes_used_ |
|
protected |
◆ kIndicCodePageSize
const int tesseract::Validator::kIndicCodePageSize = 128 |
|
static protected |
◆ kInvalid
const char32 tesseract::Validator::kInvalid = 0xfffd |
|
static |
◆ kJavaneseVirama
const char32 tesseract::Validator::kJavaneseVirama = 0xa9c0 |
|
static protected |
◆ kKhmerVirama
const char32 tesseract::Validator::kKhmerVirama = 0x17d2 |
|
static protected |
◆ kLeftToRightMark
const char32 tesseract::Validator::kLeftToRightMark = 0x200E |
|
static |
◆ kMaxJavaneseUnicode
const char32 tesseract::Validator::kMaxJavaneseUnicode = 0xa9df |
|
static protected |
◆ kMaxSinhalaUnicode
const char32 tesseract::Validator::kMaxSinhalaUnicode = 0xdff |
|
static protected |
◆ kMaxViramaScriptUnicode
const char32 tesseract::Validator::kMaxViramaScriptUnicode = 0x17ff |
|
static protected |
◆ kMinIndicUnicode
const char32 tesseract::Validator::kMinIndicUnicode = 0x900 |
|
static protected |
◆ kMyanmarVirama
const char32 tesseract::Validator::kMyanmarVirama = 0x1039 |
|
static protected |
◆ kRightToLeftMark
const char32 tesseract::Validator::kRightToLeftMark = 0x200F |
|
static |
◆ kSinhalaVirama
const char32 tesseract::Validator::kSinhalaVirama = 0xdca |
|
static protected |
◆ kZeroWidthJoiner
const char32 tesseract::Validator::kZeroWidthJoiner = 0x200D |
|
static |
◆ kZeroWidthNonJoiner
const char32 tesseract::Validator::kZeroWidthNonJoiner = 0x200C |
|
static |
◆ kZeroWidthSpace
const char32 tesseract::Validator::kZeroWidthSpace = 0x200B |
|
static |
◆ output_
std::vector<char32> tesseract::Validator::output_ |
|
protected |
◆ output_used_
unsigned tesseract::Validator::output_used_ |
|
protected |
◆ parts_
std::vector<std::vector<char32> > tesseract::Validator::parts_ |
|
protected |
◆ report_errors_
bool tesseract::Validator::report_errors_ |
|
protected |
◆ script_
The documentation for this class was generated from the following files: