tesseract  5.0.0-alpha-619-ge9db
validator.h
Go to the documentation of this file.
1 /**********************************************************************
2  * File: validator.h
3  * Description: Base class for various text validators. Intended mainly for
4  * scripts that use a virama character.
5  * Author: Ray Smith
6  *
7  * (C) Copyright 2017, Google Inc.
8  * Licensed under the Apache License, Version 2.0 (the "License");
9  * you may not use this file except in compliance with the License.
10  * You may obtain a copy of the License at
11  * http://www.apache.org/licenses/LICENSE-2.0
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  *
18  **********************************************************************/
19 
20 #ifndef TESSERACT_TRAINING_VALIDATOR_H_
21 #define TESSERACT_TRAINING_VALIDATOR_H_
22 
23 #include <memory>
24 #include <vector>
25 #include <tesseract/unichar.h>
26 
27 namespace tesseract {
28 
// Different kinds of grapheme normalization - not just for Indic!
// A grapheme is a syllable unit in Indic and can be several unicodes.
// In other scripts, a grapheme is a base character and accent/diacritic
// combination, as not all accented characters have a single composed form.
enum class GraphemeNormMode {
  // Validation result is a single string, even if input is multi-word.
  kSingleString,
  // Standard unicode graphemes are validated and output as grapheme units.
  kCombined,
  // Graphemes are validated and sub-divided. For virama-using scripts, units
  // that correspond to repeatable glyphs are generated. (Mostly single unicodes
  // but viramas and joiners are paired with the most sensible neighbor.)
  // For non-virama scripts, this means that base/accent pairs are separated,
  // ie the output is individual unicodes.
  kGlyphSplit,
  // The output is always single unicodes, regardless of the script.
  kIndividualUnicodes,
};
47 
48 // An enum representing the scripts that use a virama character. It is
49 // guaranteed that the value of any element, (except kNonVirama) can be cast
50 // to a unicode (char32) value that represents the start of the unicode range
51 // of the corresponding script.
52 enum class ViramaScript : char32 {
53  kNonVirama = 0,
54  kDevanagari = 0x900,
55  kBengali = 0x980,
56  kGurmukhi = 0xa00,
57  kGujarati = 0xa80,
58  kOriya = 0xb00,
59  kTamil = 0xb80,
60  kTelugu = 0xc00,
61  kKannada = 0xc80,
62  kMalayalam = 0xd00,
63  kSinhala = 0xd80,
64  kMyanmar = 0x1000,
65  kKhmer = 0x1780,
66  kJavanese = 0xa980,
67 };
68 
69 // Base class offers a validation API and protected methods to allow subclasses
70 // to easily build the validated/segmented output.
71 class Validator {
72  public:
73  // Validates and cleans the src vector of unicodes to the *dest, according to
74  // g_mode. In the case of kSingleString, a single vector containing the whole
75  // result is added to *dest. With kCombined, multiple vectors are added to
76  // *dest with one grapheme in each. With kGlyphSplit, multiple vectors are
77  // added to *dest with a smaller unit representing a glyph in each.
78  // In case of validation error, returns false and as much as possible of the
79  // input, without discarding invalid text.
80  static bool ValidateCleanAndSegment(GraphemeNormMode g_mode,
81  bool report_errors,
82  const std::vector<char32>& src,
83  std::vector<std::vector<char32>>* dest);
84 
85  // Returns true if the unicode ch is a non-printing zero-width mark of no
86  // significance to OCR training or evaluation.
87  static bool IsZeroWidthMark(char32 ch) {
88  return ch == kZeroWidthSpace || ch == kLeftToRightMark ||
89  ch == kRightToLeftMark || ch == kInvalid;
90  }
91  virtual ~Validator();
92 
93  // Some specific but universally useful unicodes.
94  static const char32 kZeroWidthSpace;
95  static const char32 kZeroWidthNonJoiner;
96  static const char32 kZeroWidthJoiner;
97  static const char32 kLeftToRightMark;
98  static const char32 kRightToLeftMark;
99  static const char32 kInvalid;
100 
101  protected:
102  // These are more or less the character class identifiers in the ISCII
103  // standard, section 8. They have been augmented with the Unicode meta
104  // characters Zero Width Joiner and Zero Width Non Joiner, and the
105  // Unicode Vedic Marks.
106  // The best sources of information on Unicode and Indic scripts are:
107  // http://varamozhi.sourceforge.net/iscii91.pdf
108  // http://www.unicode.org/versions/Unicode9.0.0/ch12.pdf
109  // http://unicode.org/faq/indic.html
110  // http://www.microsoft.com/typography/otfntdev/teluguot/shaping.aspx
111  enum class CharClass {
112  // NOTE: The values of the enum members are meaningless and arbitrary, ie
113  // they are not used for sorting, or any other risky application.
114  // The reason they are what they are is they are a single character
115  // abbreviation that can be used in a regexp/BNF definition of a grammar,
116  // IN A COMMENT, and still not relied upon in the code.
117  kConsonant = 'C',
118  kVowel = 'V',
119  kVirama = 'H', // (aka Halant)
120  kMatra = 'M', // (aka Dependent Vowel)
121  kMatraPiece = 'P', // unicode provides pieces of Matras.
122  kVowelModifier = 'D', // (candrabindu, anusvara, visarga, other marks)
123  kZeroWidthNonJoiner = 'z', // Unicode Zero Width Non-Joiner U+200C
124  kZeroWidthJoiner = 'Z', // Unicode Zero Width Joiner U+200D
125  kVedicMark = 'v', // Modifiers can come modify any indic syllable.
126  kNukta = 'N', // Occurs only immediately after consonants.
127  kRobat = 'R', // Khmer only.
128  kOther = 'O', // (digits, measures, non-Indic, etc)
129  // Additional classes used only by ValidateGrapheme.
130  kWhitespace = ' ',
131  kCombiner = 'c', // Combiners other than virama.
132  };
133  using IndicPair = std::pair<CharClass, char32>;
134 
135  Validator(ViramaScript script, bool report_errors)
136  : script_(script),
137  codes_used_(0),
138  output_used_(0),
139  report_errors_(report_errors) {}
140 
141  // Factory method that understands how to map script to the right subclass.
142  static std::unique_ptr<Validator> ScriptValidator(ViramaScript script,
143  bool report_errors);
144 
145  // Internal version of the public static ValidateCleanAndSegment.
146  // Validates and cleans the src vector of unicodes to the *dest, according to
147  // its type and the given g_mode.
148  // In case of validation error, returns false and returns as much as possible
149  // of the input, without discarding invalid text.
151  const std::vector<char32>& src,
152  std::vector<std::vector<char32>>* dest);
153  // Moves the results from parts_ or output_ to dest according to g_mode.
155  std::vector<std::vector<char32>>* dest);
156 
157  // Computes and returns the ViramaScript corresponding to the most frequent
158  // virama-using script in the input, or kNonVirama if none are present.
160  const std::vector<char32>& utf32);
161  // Returns true if the given UTF-32 unicode is a "virama" character.
162  static bool IsVirama(char32 unicode);
163  // Returns true if the given UTF-32 unicode is a vedic accent.
164  static bool IsVedicAccent(char32 unicode);
165  // Returns true if the script is one that uses subscripts for conjuncts.
166  bool IsSubscriptScript() const;
167 
168  // Helper function appends the next element of codes_ only to output_,
169  // without touching parts_
170  // Returns true at the end of codes_.
171  bool CodeOnlyToOutput() {
172  output_.push_back(codes_[codes_used_].second);
173  return ++codes_used_ == codes_.size();
174  }
175 
176  // Helper function adds a length-element vector to parts_ from the last length
177  // elements of output_. If there are more than length unused elements in
178  // output_, adds unicodes as single-element vectors to parts_ to catch
179  // output_used_ up to output->size() - length before adding the length-element
180  // vector.
181  void MultiCodePart(unsigned length) {
182  while (output_used_ + length < output_.size()) {
183  parts_.emplace_back(
184  std::initializer_list<char32>{output_[output_used_++]});
185  }
186  parts_.emplace_back(std::initializer_list<char32>{output_[output_used_]});
187  while (++output_used_ < output_.size()) {
188  parts_.back().push_back(output_[output_used_]);
189  }
190  }
191 
192  // Helper function appends the next element of codes_ to output_, and then
193  // calls MultiCodePart to add the appropriate components to parts_.
194  // Returns true at the end of codes_.
195  bool UseMultiCode(unsigned length) {
196  output_.push_back(codes_[codes_used_].second);
197  MultiCodePart(length);
198  return ++codes_used_ == codes_.size();
199  }
200 
201  // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to
202  // parts_ and output_. Returns true if a valid Grapheme was consumed,
203  // otherwise does not increment codes_used_.
204  virtual bool ConsumeGraphemeIfValid() = 0;
205  // Sets codes_ to the class codes for the given unicode text.
206  void ComputeClassCodes(const std::vector<char32>& text);
207  // Returns the CharClass corresponding to the given Unicode ch.
208  virtual CharClass UnicodeToCharClass(char32 ch) const = 0;
209  // Resets to the initial state.
210  void Clear();
211 
212  // Number of unicodes in each Indic codepage.
213  static const int kIndicCodePageSize = 128;
214  // Lowest unicode value of any Indic script. (Devanagari).
215  static const char32 kMinIndicUnicode = 0x900;
216  // Highest unicode value of any consistent (ISCII-based) Indic script.
217  static const char32 kMaxSinhalaUnicode = 0xdff;
218  // Highest unicode value of any virama-using script. (Khmer).
219  static const char32 kMaxViramaScriptUnicode = 0x17ff;
220  // Some special unicodes.
221  static const char32 kSinhalaVirama = 0xdca;
222  static const char32 kMyanmarVirama = 0x1039;
223  static const char32 kKhmerVirama = 0x17d2;
224  // Javanese Script - aksarajawa
225  static const char32 kJavaneseVirama = 0xa9c0;
226  static const char32 kMaxJavaneseUnicode = 0xa9df;
227 
228  // Script we are operating on.
230  // Input unicodes with assigned CharClass is the data to be validated.
231  std::vector<IndicPair> codes_;
232  // Glyph-like components of the input.
233  std::vector<std::vector<char32>> parts_;
234  // Copied validated unicodes from codes_ that are OK to output.
235  std::vector<char32> output_;
236  // The number of elements of codes_ that have been processed so far.
237  unsigned codes_used_;
238  // The number of elements of output_ that have already been added to parts_.
239  unsigned output_used_;
240  // Log error messages for reasons why text is invalid.
242 };
243 
244 } // namespace tesseract
245 
246 #endif // TESSERACT_TRAINING_VALIDATOR_H_
tesseract::Validator::MultiCodePart
void MultiCodePart(unsigned length)
Definition: validator.h:196
tesseract::Validator::CharClass::kZeroWidthJoiner
tesseract::Validator::CharClass::kVowelModifier
tesseract::Validator::MostFrequentViramaScript
static ViramaScript MostFrequentViramaScript(const std::vector< char32 > &utf32)
Definition: validator.cpp:143
tesseract::Validator::IsVedicAccent
static bool IsVedicAccent(char32 unicode)
Definition: validator.cpp:191
tesseract::Validator::kSinhalaVirama
static const char32 kSinhalaVirama
Definition: validator.h:236
tesseract::Validator::IsZeroWidthMark
static bool IsZeroWidthMark(char32 ch)
Definition: validator.h:102
tesseract::Validator::UseMultiCode
bool UseMultiCode(unsigned length)
Definition: validator.h:210
tesseract::Validator::IsSubscriptScript
bool IsSubscriptScript() const
Definition: validator.cpp:198
tesseract::Validator::CharClass::kCombiner
tesseract::Validator::MoveResultsToDest
void MoveResultsToDest(GraphemeNormMode g_mode, std::vector< std::vector< char32 >> *dest)
Definition: validator.cpp:114
tesseract::Validator::IndicPair
std::pair< CharClass, char32 > IndicPair
Definition: validator.h:148
tesseract::Validator::kMaxSinhalaUnicode
static const char32 kMaxSinhalaUnicode
Definition: validator.h:232
tesseract::Validator::CharClass::kMatra
tesseract::Validator::IsVirama
static bool IsVirama(char32 unicode)
Definition: validator.cpp:180
tesseract::ViramaScript
ViramaScript
Definition: validator.h:67
tesseract::Validator::codes_
std::vector< IndicPair > codes_
Definition: validator.h:246
tesseract::ViramaScript::kTelugu
tesseract::ViramaScript::kGujarati
tesseract::ViramaScript::kJavanese
tesseract::Validator::kIndicCodePageSize
static const int kIndicCodePageSize
Definition: validator.h:228
tesseract::Validator::kInvalid
static const char32 kInvalid
Definition: validator.h:114
tesseract::ViramaScript::kNonVirama
tesseract::Validator::kMyanmarVirama
static const char32 kMyanmarVirama
Definition: validator.h:237
tesseract::ViramaScript::kKhmer
tesseract::Validator::kRightToLeftMark
static const char32 kRightToLeftMark
Definition: validator.h:113
tesseract::GraphemeNormMode
GraphemeNormMode
Definition: validator.h:48
tesseract::ViramaScript::kMyanmar
tesseract::Validator::kZeroWidthNonJoiner
static const char32 kZeroWidthNonJoiner
Definition: validator.h:110
tesseract::ViramaScript::kTamil
tesseract::Validator::ConsumeGraphemeIfValid
virtual bool ConsumeGraphemeIfValid()=0
tesseract::Validator::ValidateCleanAndSegment
static bool ValidateCleanAndSegment(GraphemeNormMode g_mode, bool report_errors, const std::vector< char32 > &src, std::vector< std::vector< char32 >> *dest)
Definition: validator.cpp:40
tesseract::GraphemeNormMode::kSingleString
tesseract::Validator::CharClass
CharClass
Definition: validator.h:126
tesseract::ViramaScript::kMalayalam
tesseract::Validator::output_used_
unsigned output_used_
Definition: validator.h:254
tesseract::Validator::~Validator
virtual ~Validator()
tesseract::ViramaScript::kBengali
tesseract::GraphemeNormMode::kGlyphSplit
tesseract::Validator::output_
std::vector< char32 > output_
Definition: validator.h:250
tesseract::Validator::kMinIndicUnicode
static const char32 kMinIndicUnicode
Definition: validator.h:230
tesseract::Validator::kZeroWidthSpace
static const char32 kZeroWidthSpace
Definition: validator.h:109
tesseract::ViramaScript::kOriya
tesseract::char32
signed int char32
Definition: unichar.h:53
tesseract::Validator::ValidateCleanAndSegmentInternal
bool ValidateCleanAndSegmentInternal(GraphemeNormMode g_mode, const std::vector< char32 > &src, std::vector< std::vector< char32 >> *dest)
Definition: validator.cpp:97
tesseract::Validator::ScriptValidator
static std::unique_ptr< Validator > ScriptValidator(ViramaScript script, bool report_errors)
Definition: validator.cpp:71
tesseract::ViramaScript::kDevanagari
tesseract::Validator::kZeroWidthJoiner
static const char32 kZeroWidthJoiner
Definition: validator.h:111
tesseract::ViramaScript::kKannada
tesseract::Validator::report_errors_
bool report_errors_
Definition: validator.h:256
tesseract::Validator::kLeftToRightMark
static const char32 kLeftToRightMark
Definition: validator.h:112
tesseract
Definition: baseapi.h:65
tesseract::Validator::CharClass::kVedicMark
tesseract::Validator::codes_used_
unsigned codes_used_
Definition: validator.h:252
tesseract::Validator::kMaxViramaScriptUnicode
static const char32 kMaxViramaScriptUnicode
Definition: validator.h:234
tesseract::Validator::CharClass::kVirama
tesseract::Validator::CodeOnlyToOutput
bool CodeOnlyToOutput()
Definition: validator.h:186
tesseract::Validator::CharClass::kConsonant
tesseract::Validator::Clear
void Clear()
Definition: validator.cpp:214
tesseract::Validator::CharClass::kNukta
unichar.h
tesseract::Validator::UnicodeToCharClass
virtual CharClass UnicodeToCharClass(char32 ch) const =0
tesseract::Validator::ComputeClassCodes
void ComputeClassCodes(const std::vector< char32 > &text)
Definition: validator.cpp:206
tesstrain_utils.dest
dest
Definition: tesstrain_utils.py:139
tesseract::Validator::kKhmerVirama
static const char32 kKhmerVirama
Definition: validator.h:238
tesseract::Validator::kJavaneseVirama
static const char32 kJavaneseVirama
Definition: validator.h:240
tesseract::Validator::CharClass::kVowel
tesseract::GraphemeNormMode::kIndividualUnicodes
tesseract::Validator::CharClass::kRobat
tesseract::GraphemeNormMode::kCombined
tesseract::ViramaScript::kSinhala
tesseract::Validator::CharClass::kOther
tesseract::Validator::CharClass::kWhitespace
tesseract::Validator::kMaxJavaneseUnicode
static const char32 kMaxJavaneseUnicode
Definition: validator.h:241
tesseract::ViramaScript::kGurmukhi
tesseract::Validator::script_
ViramaScript script_
Definition: validator.h:244
tesseract::Validator::CharClass::kZeroWidthNonJoiner
tesseract::Validator::parts_
std::vector< std::vector< char32 > > parts_
Definition: validator.h:248
tesseract::Validator::CharClass::kMatraPiece
tesseract::Validator::Validator
Validator(ViramaScript script, bool report_errors)
Definition: validator.h:150