tesseract  5.0.0-alpha-619-ge9db
unicharcompress.h
Go to the documentation of this file.
1 // File: unicharcompress.h
3 // Description: Unicode re-encoding using a sequence of smaller numbers in
4 // place of a single large code for CJK, similarly for Indic,
5 // and dissection of ligatures for other scripts.
6 // Author: Ray Smith
7 // Created: Wed Mar 04 14:45:01 PST 2015
8 //
9 // (C) Copyright 2015, Google Inc.
10 // Licensed under the Apache License, Version 2.0 (the "License");
11 // you may not use this file except in compliance with the License.
12 // You may obtain a copy of the License at
13 // http://www.apache.org/licenses/LICENSE-2.0
14 // Unless required by applicable law or agreed to in writing, software
15 // distributed under the License is distributed on an "AS IS" BASIS,
16 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 // See the License for the specific language governing permissions and
18 // limitations under the License.
19 //
21 
22 #ifndef TESSERACT_CCUTIL_UNICHARCOMPRESS_H_
23 #define TESSERACT_CCUTIL_UNICHARCOMPRESS_H_
24 
25 #include <unordered_map>
26 
27 #include <tesseract/serialis.h>
28 #include <tesseract/strngs.h>
29 #include "unicharset.h"
30 
31 namespace tesseract {
32 
33 // Trivial class to hold the code for a recoded unichar-id.
35  public:
36  // The maximum length of a code.
37  static const int kMaxCodeLen = 9;
38 
39  RecodedCharID() : self_normalized_(1), length_(0) {
40  memset(code_, 0, sizeof(code_));
41  }
42  void Truncate(int length) { length_ = length; }
43  // Sets the code value at the given index in the code.
44  void Set(int index, int value) {
45  code_[index] = value;
46  if (length_ <= index) length_ = index + 1;
47  }
48  // Shorthand for setting codes of length 3, as all Hangul and Han codes are
49  // length 3.
50  void Set3(int code0, int code1, int code2) {
51  length_ = 3;
52  code_[0] = code0;
53  code_[1] = code1;
54  code_[2] = code2;
55  }
56  // Accessors
57  int length() const { return length_; }
58  int operator()(int index) const { return code_[index]; }
59 
60  // Writes to the given file. Returns false in case of error.
61  bool Serialize(TFile* fp) const {
62  return fp->Serialize(&self_normalized_) &&
63  fp->Serialize(&length_) &&
64  fp->Serialize(&code_[0], length_);
65  }
66  // Reads from the given file. Returns false in case of error.
67  bool DeSerialize(TFile* fp) {
68  return fp->DeSerialize(&self_normalized_) &&
69  fp->DeSerialize(&length_) &&
70  fp->DeSerialize(&code_[0], length_);
71  }
72  bool operator==(const RecodedCharID& other) const {
73  if (length_ != other.length_) return false;
74  for (int i = 0; i < length_; ++i) {
75  if (code_[i] != other.code_[i]) return false;
76  }
77  return true;
78  }
79  // Hash functor for RecodedCharID.
81  uint64_t operator()(const RecodedCharID& code) const {
82  uint64_t result = 0;
83  for (int i = 0; i < code.length_; ++i) {
84  result ^= static_cast<uint64_t>(code(i)) << (7 * i);
85  }
86  return result;
87  }
88  };
89 
90  private:
91  // True if this code is self-normalizing, ie is the master entry for indices
92  // that map to the same code. Has boolean value, but int8_t for serialization.
93  int8_t self_normalized_;
94  // The number of elements in use in code_;
95  int32_t length_;
96  // The re-encoded form of the unichar-id to which this RecodedCharID relates.
97  int32_t code_[kMaxCodeLen];
98 };
99 
100 // Class holds a "compression" of a unicharset to simplify the learning problem
101 // for a neural-network-based classifier.
102 // Objectives:
103 // 1 (CJK): Ids of a unicharset with a large number of classes are expressed as
104 // a sequence of 3 codes with much fewer values.
105 // This is achieved using the Jamo coding for Hangul and the Unicode
106 // Radical-Stroke-index for Han.
107 // 2 (Indic): Instead of thousands of codes with one for each grapheme, re-code
108 // as the unicode sequence (but coded in a more compact space).
109 // 3 (the rest): Eliminate multi-path problems with ligatures and fold confusing
110 // and not significantly distinct shapes (quotes) together, ie
111 // represent the fi ligature as the f-i pair, and fold u+2019 and
112 // friends all onto ascii single '
113 // 4 The null character and mapping to target activations:
114 // To save horizontal coding space, the compressed codes are generally mapped
115 // to target network activations without intervening null characters, BUT
116 // in the case of ligatures, such as ff, null characters have to be included
117 // so existence of repeated codes is detected at codebook-building time, and
118 // null characters are embedded directly into the codes, so the rest of the
119 // system doesn't need to worry about the problem (much). There is still an
120 // effect on the range of ways in which the target activations can be
121 // generated.
122 //
123 // The computed code values are compact (no unused values), and, for CJK,
124 // unique (each code position uses a disjoint set of values from each other code
125 // position). For non-CJK, the same code value CAN be used in multiple
126 // positions, eg the ff ligature is converted to <f> <nullchar> <f>, where <f>
127 // is the same code as is used for the single f.
129  public:
130  UnicharCompress();
131  UnicharCompress(const UnicharCompress& src);
134 
135  // The 1st Hangul unicode.
136  static const int kFirstHangul = 0xac00;
137  // The number of Hangul unicodes.
138  static const int kNumHangul = 11172;
139  // The number of Jamos for each of the 3 parts of a Hangul character, being
140  // the Leading consonant, Vowel and Trailing consonant.
141  static const int kLCount = 19;
142  static const int kVCount = 21;
143  static const int kTCount = 28;
144 
145  // Computes the encoding for the given unicharset. It is a requirement that
146  // the file training/langdata/radical-stroke.txt have been read into the
147  // input string radical_stroke_table.
148  // Returns false if the encoding cannot be constructed.
149  bool ComputeEncoding(const UNICHARSET& unicharset, int null_id,
150  STRING* radical_stroke_table);
151  // Sets up an encoder that doesn't change the unichars at all, so it just
152  // passes them through unchanged.
153  void SetupPassThrough(const UNICHARSET& unicharset);
154  // Sets up an encoder directly using the given encoding vector, which maps
155  // unichar_ids to the given codes.
156  void SetupDirect(const GenericVector<RecodedCharID>& codes);
157 
158  // Returns the number of different values that can be used in a code, ie
159  // 1 + the maximum value that will ever be used by an RecodedCharID code in
160  // any position in its array.
161  int code_range() const { return code_range_; }
162 
163  // Encodes a single unichar_id. Returns the length of the code, (or zero if
164  // invalid input), and the encoding itself in code.
165  int EncodeUnichar(int unichar_id, RecodedCharID* code) const;
166  // Decodes code, returning the original unichar-id, or
167  // INVALID_UNICHAR_ID if the input is invalid.
168  int DecodeUnichar(const RecodedCharID& code) const;
169  // Returns true if the given code is a valid start or single code.
170  bool IsValidFirstCode(int code) const { return is_valid_start_[code]; }
171  // Returns a list of valid non-final next codes for a given prefix code,
172  // which may be empty.
173  const GenericVector<int>* GetNextCodes(const RecodedCharID& code) const {
174  auto it = next_codes_.find(code);
175  return it == next_codes_.end() ? nullptr : it->second;
176  }
177  // Returns a list of valid final codes for a given prefix code, which may
178  // be empty.
179  const GenericVector<int>* GetFinalCodes(const RecodedCharID& code) const {
180  auto it = final_codes_.find(code);
181  return it == final_codes_.end() ? nullptr : it->second;
182  }
183 
184  // Writes to the given file. Returns false in case of error.
185  bool Serialize(TFile* fp) const;
186  // Reads from the given file. Returns false in case of error.
187 
188  bool DeSerialize(TFile* fp);
189 
190  // Returns a STRING containing a text file that describes the encoding thus:
191  // <index>[,<index>]*<tab><UTF8-str><newline>
192  // In words, a comma-separated list of one or more indices, followed by a tab
193  // and the UTF-8 string that the code represents per line. Most simple scripts
194  // will encode a single index to a UTF8-string, but Chinese, Japanese, Korean
195  // and the Indic scripts will contain a many-to-many mapping.
196  // See the class comment above for details.
197  STRING GetEncodingAsString(const UNICHARSET& unicharset) const;
198 
199  // Helper decomposes a Hangul unicode to 3 parts, leading, vowel, trailing.
200  // Note that the returned values are 0-based indices, NOT unicode Jamo.
201  // Returns false if the input is not in the Hangul unicode range.
202  static bool DecomposeHangul(int unicode, int* leading, int* vowel,
203  int* trailing);
204 
205  private:
206  // Renumbers codes to eliminate unused values.
207  void DefragmentCodeValues(int encoded_null);
208  // Computes the value of code_range_ from the encoder_.
209  void ComputeCodeRange();
210  // Initializes the decoding hash_map from the encoder_ array.
211  void SetupDecoder();
212  // Frees allocated memory.
213  void Cleanup();
214 
215  // The encoder that maps a unichar-id to a sequence of small codes.
216  // encoder_ is the only part that is serialized. The rest is computed on load.
218  // Decoder converts the output of encoder back to a unichar-id.
219  std::unordered_map<RecodedCharID, int, RecodedCharID::RecodedCharIDHash>
220  decoder_;
221  // True if the index is a valid single or start code.
222  GenericVector<bool> is_valid_start_;
223  // Maps a prefix code to a list of valid next codes.
224  // The map owns the vectors.
225  std::unordered_map<RecodedCharID, GenericVectorEqEq<int>*,
227  next_codes_;
228  // Maps a prefix code to a list of valid final codes.
229  // The map owns the vectors.
230  std::unordered_map<RecodedCharID, GenericVectorEqEq<int>*,
232  final_codes_;
233  // Max of any value in encoder_ + 1.
234  int code_range_;
235 };
236 
237 } // namespace tesseract.
238 
239 #endif // TESSERACT_CCUTIL_UNICHARCOMPRESS_H_
tesseract::RecodedCharID::DeSerialize
bool DeSerialize(TFile *fp)
Definition: unicharcompress.h:67
strngs.h
tesseract::RecodedCharID::RecodedCharIDHash
Definition: unicharcompress.h:80
tesseract::UnicharCompress::SetupDirect
void SetupDirect(const GenericVector< RecodedCharID > &codes)
Definition: unicharcompress.cpp:233
tesseract::UnicharCompress::DeSerialize
bool DeSerialize(TFile *fp)
Definition: unicharcompress.cpp:305
STRING
Definition: strngs.h:45
tesseract::UnicharCompress::code_range
int code_range() const
Definition: unicharcompress.h:161
tesseract::UnicharCompress::DecomposeHangul
static bool DecomposeHangul(int unicode, int *leading, int *vowel, int *trailing)
Definition: unicharcompress.cpp:348
tesseract::UnicharCompress::IsValidFirstCode
bool IsValidFirstCode(int code) const
Definition: unicharcompress.h:170
tesseract::UnicharCompress::kNumHangul
static const int kNumHangul
Definition: unicharcompress.h:138
tesseract::RecodedCharID::operator()
int operator()(int index) const
Definition: unicharcompress.h:58
tesseract::RecodedCharID::Serialize
bool Serialize(TFile *fp) const
Definition: unicharcompress.h:61
tesseract::UnicharCompress::DecodeUnichar
int DecodeUnichar(const RecodedCharID &code) const
Definition: unicharcompress.cpp:291
unicharset.h
tesseract::TFile::DeSerialize
bool DeSerialize(char *data, size_t count=1)
Definition: serialis.cpp:117
tesseract::RecodedCharID::RecodedCharIDHash::operator()
uint64_t operator()(const RecodedCharID &code) const
Definition: unicharcompress.h:81
tesseract::UnicharCompress::~UnicharCompress
~UnicharCompress()
Definition: unicharcompress.cpp:88
tesseract::TFile::Serialize
bool Serialize(const char *data, size_t count=1)
Definition: serialis.cpp:161
tesseract::RecodedCharID::Set
void Set(int index, int value)
Definition: unicharcompress.h:44
tesseract::UnicharCompress::UnicharCompress
UnicharCompress()
Definition: unicharcompress.cpp:86
tesseract::TFile
Definition: serialis.h:75
UNICHARSET
Definition: unicharset.h:145
tesseract
Definition: baseapi.h:65
tesseract::UnicharCompress::kVCount
static const int kVCount
Definition: unicharcompress.h:142
tesseract::RecodedCharID::operator==
bool operator==(const RecodedCharID &other) const
Definition: unicharcompress.h:72
tesseract::RecodedCharID::Truncate
void Truncate(int length)
Definition: unicharcompress.h:42
tesseract::RecodedCharID::length
int length() const
Definition: unicharcompress.h:57
tesseract::RecodedCharID
Definition: unicharcompress.h:34
GenericVector
Definition: baseapi.h:40
tesseract::UnicharCompress::GetFinalCodes
const GenericVector< int > * GetFinalCodes(const RecodedCharID &code) const
Definition: unicharcompress.h:179
tesseract::RecodedCharID::kMaxCodeLen
static const int kMaxCodeLen
Definition: unicharcompress.h:37
tesseract::UnicharCompress::ComputeEncoding
bool ComputeEncoding(const UNICHARSET &unicharset, int null_id, STRING *radical_stroke_table)
Definition: unicharcompress.cpp:101
tesseract::RecodedCharID::RecodedCharID
RecodedCharID()
Definition: unicharcompress.h:39
tesseract::UnicharCompress::SetupPassThrough
void SetupPassThrough(const UNICHARSET &unicharset)
Definition: unicharcompress.cpp:216
tesseract::UnicharCompress::kFirstHangul
static const int kFirstHangul
Definition: unicharcompress.h:136
tesseract::UnicharCompress::GetNextCodes
const GenericVector< int > * GetNextCodes(const RecodedCharID &code) const
Definition: unicharcompress.h:173
tesseract::UnicharCompress::GetEncodingAsString
STRING GetEncodingAsString(const UNICHARSET &unicharset) const
Definition: unicharcompress.cpp:319
serialis.h
tesseract::UnicharCompress::kTCount
static const int kTCount
Definition: unicharcompress.h:143
tesseract::UnicharCompress::EncodeUnichar
int EncodeUnichar(int unichar_id, RecodedCharID *code) const
Definition: unicharcompress.cpp:283
tesseract::UnicharCompress::operator=
UnicharCompress & operator=(const UnicharCompress &src)
Definition: unicharcompress.cpp:89
tesseract::UnicharCompress
Definition: unicharcompress.h:128
tesseract::RecodedCharID::Set3
void Set3(int code0, int code1, int code2)
Definition: unicharcompress.h:50
tesseract::UnicharCompress::kLCount
static const int kLCount
Definition: unicharcompress.h:141
tesseract::UnicharCompress::Serialize
bool Serialize(TFile *fp) const
Definition: unicharcompress.cpp:300