tesseract  5.0.0-alpha-619-ge9db
tesseract::ValidateIndic Class Reference

#include <validate_indic.h>

Inheritance diagram for tesseract::ValidateIndic:
tesseract::Validator

Public Member Functions

 ValidateIndic (ViramaScript script, bool report_errors)
 
 ~ValidateIndic ()
 
- Public Member Functions inherited from tesseract::Validator
virtual ~Validator ()
 

Protected Member Functions

bool ConsumeGraphemeIfValid () override
 
Validator::CharClass UnicodeToCharClass (char32 ch) const override
 
- Protected Member Functions inherited from tesseract::Validator
 Validator (ViramaScript script, bool report_errors)
 
bool ValidateCleanAndSegmentInternal (GraphemeNormMode g_mode, const std::vector< char32 > &src, std::vector< std::vector< char32 >> *dest)
 
void MoveResultsToDest (GraphemeNormMode g_mode, std::vector< std::vector< char32 >> *dest)
 
bool IsSubscriptScript () const
 
bool CodeOnlyToOutput ()
 
void MultiCodePart (unsigned length)
 
bool UseMultiCode (unsigned length)
 
void ComputeClassCodes (const std::vector< char32 > &text)
 
void Clear ()
 

Additional Inherited Members

- Static Public Member Functions inherited from tesseract::Validator
static bool ValidateCleanAndSegment (GraphemeNormMode g_mode, bool report_errors, const std::vector< char32 > &src, std::vector< std::vector< char32 >> *dest)
 
static bool IsZeroWidthMark (char32 ch)
 
- Static Public Attributes inherited from tesseract::Validator
static const char32 kZeroWidthSpace = 0x200B
 
static const char32 kZeroWidthNonJoiner = 0x200C
 
static const char32 kZeroWidthJoiner = 0x200D
 
static const char32 kLeftToRightMark = 0x200E
 
static const char32 kRightToLeftMark = 0x200F
 
static const char32 kInvalid = 0xfffd
 
- Protected Types inherited from tesseract::Validator
enum  CharClass {
  CharClass::kConsonant = 'C', CharClass::kVowel = 'V', CharClass::kVirama = 'H', CharClass::kMatra = 'M',
  CharClass::kMatraPiece = 'P', CharClass::kVowelModifier = 'D', CharClass::kZeroWidthNonJoiner = 'z', CharClass::kZeroWidthJoiner = 'Z',
  CharClass::kVedicMark = 'v', CharClass::kNukta = 'N', CharClass::kRobat = 'R', CharClass::kOther = 'O',
  CharClass::kWhitespace = ' ', CharClass::kCombiner = 'c'
}
 
using IndicPair = std::pair< CharClass, char32 >
 
- Static Protected Member Functions inherited from tesseract::Validator
static std::unique_ptr< Validator > ScriptValidator (ViramaScript script, bool report_errors)
 
static ViramaScript MostFrequentViramaScript (const std::vector< char32 > &utf32)
 
static bool IsVirama (char32 unicode)
 
static bool IsVedicAccent (char32 unicode)
 
- Protected Attributes inherited from tesseract::Validator
ViramaScript script_
 
std::vector< IndicPair > codes_
 
std::vector< std::vector< char32 > > parts_
 
std::vector< char32 > output_
 
unsigned codes_used_
 
unsigned output_used_
 
bool report_errors_
 
- Static Protected Attributes inherited from tesseract::Validator
static const int kIndicCodePageSize = 128
 
static const char32 kMinIndicUnicode = 0x900
 
static const char32 kMaxSinhalaUnicode = 0xdff
 
static const char32 kMaxViramaScriptUnicode = 0x17ff
 
static const char32 kSinhalaVirama = 0xdca
 
static const char32 kMyanmarVirama = 0x1039
 
static const char32 kKhmerVirama = 0x17d2
 
static const char32 kJavaneseVirama = 0xa9c0
 
static const char32 kMaxJavaneseUnicode = 0xa9df
 

Detailed Description

Definition at line 10 of file validate_indic.h.

Constructor & Destructor Documentation

◆ ValidateIndic()

tesseract::ValidateIndic::ValidateIndic ( ViramaScript  script,
bool  report_errors 
)
inline

Definition at line 12 of file validate_indic.h.

13  : Validator(script, report_errors) {}

◆ ~ValidateIndic()

tesseract::ValidateIndic::~ValidateIndic ( )
inline

Definition at line 14 of file validate_indic.h.

14 {}

Member Function Documentation

◆ ConsumeGraphemeIfValid()

bool tesseract::ValidateIndic::ConsumeGraphemeIfValid ( )
override protected virtual

Implements tesseract::Validator.

Definition at line 18 of file validate_indic.cpp.

18  {
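19  switch (codes_[codes_used_].first) {
20  case CharClass::kConsonant:
21  return ConsumeConsonantHeadIfValid() && ConsumeConsonantTailIfValid();
22  case CharClass::kVowel:
23  case CharClass::kVedicMark:
24  return ConsumeVowelIfValid();
25  case CharClass::kZeroWidthJoiner:
26  case CharClass::kZeroWidthNonJoiner:
27  // Apart from within an aksara, joiners are silently dropped.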
28  if (report_errors_)
29  tprintf("Dropping isolated joiner: 0x%x\n", codes_[codes_used_].second);
30  ++codes_used_;
31  return true;
32  case CharClass::kOther:
33  UseMultiCode(1);
34  return true;
35  default:
36  if (report_errors_) {
37  tprintf("Invalid start of grapheme sequence:%c=0x%x\n",
38  codes_[codes_used_].first, codes_[codes_used_].second);
39  }
40  return false;
41  }
42 }

◆ UnicodeToCharClass()

Validator::CharClass tesseract::ValidateIndic::UnicodeToCharClass ( char32  ch) const
override protected virtual

Implements tesseract::Validator.

Definition at line 44 of file validate_indic.cpp.

44  {
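45  if (IsVedicAccent(ch)) return CharClass::kVedicMark;
46  if (ch == kZeroWidthNonJoiner) return CharClass::kZeroWidthNonJoiner;
47  if (ch == kZeroWidthJoiner) return CharClass::kZeroWidthJoiner;
48  // Offset from the start of the relevant unicode code block aka code page.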
49  int base = static_cast<char32>(script_);
50  int off = ch - base;
51  // Anything in another code block is other.
52  if (off < 0 || off >= kIndicCodePageSize) return CharClass::kOther;
53  // Exception for Tamil. The aytham character is considered a letter.
54  if (script_ == ViramaScript::kTamil && off == 0x03) return CharClass::kVowel;
55  if (off < 0x4) return CharClass::kVowelModifier;
56  if (script_ == ViramaScript::kSinhala) {
57  // Sinhala is an exception.
58  if (off <= 0x19) return CharClass::kVowel;
59  if (off <= 0x49) return CharClass::kConsonant;
60  if (off == 0x4a) return CharClass::kVirama;
61  if (off <= 0x5f) return CharClass::kMatra;
62  } else {
63  if (off <= 0x14 || off == 0x50) return CharClass::kVowel;
64  if (off <= 0x3b || (0x58 <= off && off <= 0x5f))
65  return CharClass::kConsonant;
66  // Sinhala doesn't have Nukta or Avagraha.
67  if (off == 0x3c) return CharClass::kNukta;
68  if (off == 0x3d) return CharClass::kVowel; // avagraha
69  if (off <= 0x4c || (0x51 <= off && off <= 0x54)) return CharClass::kMatra;
70  if (0x55 <= off && off <= 0x57) return CharClass::kMatraPiece;
71  if (off == 0x4d) return CharClass::kVirama;
72  }
73  if (off == 0x60 || off == 0x61) return CharClass::kVowel;
74  if (off == 0x62 || off == 0x63) return CharClass::kMatra;
75  // Danda and digits up to 6f are OK as other.
76  // 70-7f are script-specific.
77  // 0BF0-0BF2 are Tamil numbers 10, 100 and 1000; treat as other.
78  if (script_ == ViramaScript::kTamil && (0x70 <= off && off <= 0x72))
79  return CharClass::kOther;
80  // 0BF3-0BFA are other Tamil symbols.
81  if (script_ == ViramaScript::kTamil && (0x73 <= off && off <= 0x7A))
82  return CharClass::kOther;
83  if (script_ == ViramaScript::kBengali && (off == 0x70 || off == 0x71))
84  return CharClass::kConsonant;
85  if (script_ == ViramaScript::kGurmukhi && (off == 0x72 || off == 0x73))
86  return CharClass::kConsonant;
87  if (script_ == ViramaScript::kSinhala && off == 0x70)
88  return CharClass::kConsonant;
89  if (script_ == ViramaScript::kDevanagari && off == 0x70)
90  return CharClass::kOther;
91  if (0x70 <= off && off <= 0x73) return CharClass::kVowelModifier;
92  // Non Indic, Digits, Measures, danda, etc.
93  return CharClass::kOther;
94 }

The documentation for this class was generated from the following files:
validate_indic.h
validate_indic.cpp

tesseract::Validator::CharClass::kZeroWidthJoiner
tesseract::Validator::CharClass::kVowelModifier
tesseract::Validator::IsVedicAccent
static bool IsVedicAccent(char32 unicode)
Definition: validator.cpp:191
tesseract::Validator::UseMultiCode
bool UseMultiCode(unsigned length)
Definition: validator.h:210
tesseract::Validator::CharClass::kMatra
tesseract::Validator::codes_
std::vector< IndicPair > codes_
Definition: validator.h:246
tesseract::Validator::kIndicCodePageSize
static const int kIndicCodePageSize
Definition: validator.h:228
tesseract::Validator::kZeroWidthNonJoiner
static const char32 kZeroWidthNonJoiner
Definition: validator.h:110
tesseract::ViramaScript::kTamil
tesseract::ViramaScript::kBengali
tesseract::ViramaScript::kDevanagari
tesseract::Validator::kZeroWidthJoiner
static const char32 kZeroWidthJoiner
Definition: validator.h:111
tesseract::Validator::report_errors_
bool report_errors_
Definition: validator.h:256
tesseract::Validator::CharClass::kVedicMark
tesseract::Validator::codes_used_
unsigned codes_used_
Definition: validator.h:252
tesseract::Validator::CharClass::kVirama
tesseract::Validator::CharClass::kConsonant
tesseract::Validator::CharClass::kNukta
tesseract::Validator::CharClass::kVowel
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
tesseract::ViramaScript::kSinhala
tesseract::Validator::CharClass::kOther
tesseract::ViramaScript::kGurmukhi
tesseract::Validator::script_
ViramaScript script_
Definition: validator.h:244
tesseract::Validator::CharClass::kZeroWidthNonJoiner
tesseract::Validator::CharClass::kMatraPiece
tesseract::Validator::Validator
Validator(ViramaScript script, bool report_errors)
Definition: validator.h:150