tesseract  5.0.0-alpha-619-ge9db
ambigs.h
Go to the documentation of this file.
1 // File: ambigs.h
3 // Description: Constants, flags, functions for dealing with
4 // ambiguities (training and recognition).
5 // Author: Daria Antonova
6 //
7 // (C) Copyright 2008, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 #ifndef TESSERACT_CCUTIL_AMBIGS_H_
21 #define TESSERACT_CCUTIL_AMBIGS_H_
22 
23 #if !defined(DISABLED_LEGACY_ENGINE)
24 
25 #include "elst.h"
26 #include "tprintf.h"
27 #include <tesseract/unichar.h>
28 #include "unicharset.h"
30 
31 #define MAX_AMBIG_SIZE 10
32 
33 namespace tesseract {
34 
36 
37 enum AmbigType {
38  NOT_AMBIG, // the ngram pair is not ambiguous
39  REPLACE_AMBIG, // the OCRed ngram should always be substituted with the correct one
40  DEFINITE_AMBIG, // add the correct ngram to the classifier results (1-1)
41  SIMILAR_AMBIG, // use the pairwise classifier for the OCRed/correct pair (1-1)
42  CASE_AMBIG, // this is a case ambiguity (1-1)
43 
44  AMBIG_TYPE_COUNT // number of enum entries; must remain the last enumerator
45 };
46 
47 // A collection of utility functions for arrays of UNICHAR_IDs that are
48 // terminated by INVALID_UNICHAR_ID.
50  public:
51  // Compares two arrays of unichar ids. Returns -1 if the length of array1 is
52  // less than length of array2, if any array1[i] is less than array2[i].
53  // Returns 0 if the arrays are equal, 1 otherwise.
54  // The function assumes that the arrays are terminated by INVALID_UNICHAR_ID.
55  static inline int compare(const UNICHAR_ID *ptr1, const UNICHAR_ID *ptr2) {
56  for (;;) {
57  const UNICHAR_ID val1 = *ptr1++;
58  const UNICHAR_ID val2 = *ptr2++;
59  if (val1 != val2) {
60  if (val1 == INVALID_UNICHAR_ID) return -1;
61  if (val2 == INVALID_UNICHAR_ID) return 1;
62  if (val1 < val2) return -1;
63  return 1;
64  }
65  if (val1 == INVALID_UNICHAR_ID) return 0;
66  }
67  }
68 
69  // Look uid in the vector of uids. If found, the index of the matched
70  // element is returned. Otherwise, it returns -1.
71  static inline int find_in(const UnicharIdVector& uid_vec,
72  const UNICHAR_ID uid) {
73  for (int i = 0; i < uid_vec.size(); ++i)
74  if (uid_vec[i] == uid) return i;
75  return -1;
76  }
77 
78  // Copies UNICHAR_IDs from dst to src. Returns the number of ids copied.
79  // The function assumes that the arrays are terminated by INVALID_UNICHAR_ID
80  // and that dst has enough space for all the elements from src.
81  static inline int copy(const UNICHAR_ID src[], UNICHAR_ID dst[]) {
82  int i = 0;
83  do {
84  dst[i] = src[i];
85  } while (dst[i++] != INVALID_UNICHAR_ID);
86  return i - 1;
87  }
88 
89  // Prints unichars corresponding to the unichar_ids in the given array.
90  // The function assumes that array is terminated by INVALID_UNICHAR_ID.
91  static inline void print(const UNICHAR_ID array[],
92  const UNICHARSET &unicharset) {
93  const UNICHAR_ID *ptr = array;
94  if (*ptr == INVALID_UNICHAR_ID) tprintf("[Empty]");
95  while (*ptr != INVALID_UNICHAR_ID) {
96  tprintf("%s ", unicharset.id_to_unichar(*ptr++));
97  }
98  tprintf("( ");
99  ptr = array;
100  while (*ptr != INVALID_UNICHAR_ID) tprintf("%d ", *ptr++);
101  tprintf(")\n");
102  }
103 };
104 
105 // AMBIG_SPEC_LIST stores a list of dangerous ambigs that
106 // start with the same unichar (e.g. r->t rn->m rr1->m).
107 class AmbigSpec : public ELIST_LINK {
108  public:
109  AmbigSpec();
110  ~AmbigSpec() = default;
111 
112  // Comparator function for sorting AmbigSpec_LISTs. The lists will
113  // be sorted by their wrong_ngram arrays. Example of wrong_ngram vectors
114  // in a sorted AmbigSpec_LIST: [9 1 3], [9 3 4], [9 8], [9 8 1].
115  static int compare_ambig_specs(const void *spec1, const void *spec2) {
116  const AmbigSpec *s1 = *static_cast<const AmbigSpec *const *>(spec1);
117  const AmbigSpec *s2 = *static_cast<const AmbigSpec *const *>(spec2);
119  if (result != 0) return result;
121  s2->correct_fragments);
122  }
123 
129 };
131 
132 // AMBIG_TABLE[i] stores a set of ambiguities whose
133 // wrong ngram starts with unichar id i.
134 using UnicharAmbigsVector = GenericVector<AmbigSpec_LIST *>;
135 
137  public:
138  UnicharAmbigs() = default;
140  replace_ambigs_.delete_data_pointers();
141  dang_ambigs_.delete_data_pointers();
142  one_to_one_definite_ambigs_.delete_data_pointers();
143  }
144 
145  const UnicharAmbigsVector &dang_ambigs() const { return dang_ambigs_; }  // read-only access to the dang_ambigs_ table
146  const UnicharAmbigsVector &replace_ambigs() const { return replace_ambigs_; }  // read-only access to the replace_ambigs_ table
147 
148  // Initializes the ambigs by adding a null pointer to each table.
149  void InitUnicharAmbigs(const UNICHARSET& unicharset,
150  bool use_ambigs_for_adaption);
151 
152  // Loads the universal ambigs that are useful for any language.
153  void LoadUniversal(const UNICHARSET& encoder_set, UNICHARSET* unicharset);
154 
155  // Fills in two ambiguity tables (replaceable and dangerous) with information
156  // read from the ambigs file. An ambiguity table is an array of lists.
157  // The array is indexed by a class id. Each entry in the table provides
158  // a list of potential ambiguities which can start with the corresponding
159  // character. For example the ambiguity "rn -> m", would be located in the
160  // table at index of unicharset.unichar_to_id('r').
161  // 1-1 ambiguities (e.g. s -> S, 1 -> I) are recorded in
162  // one_to_one_definite_ambigs_. This vector is also indexed by the class id
163  // of the wrong part of the ambiguity and each entry contains a vector of
164  // unichar ids that are ambiguous to it.
165  // encoder_set is used to encode the ambiguity strings, undisturbed by new
166  // unichar_ids that may be created by adding the ambigs.
167  void LoadUnicharAmbigs(const UNICHARSET& encoder_set,
168  TFile *ambigs_file, int debug_level,
169  bool use_ambigs_for_adaption, UNICHARSET *unicharset);
170 
171  // Returns definite 1-1 ambigs for the given unichar id.
173  UNICHAR_ID unichar_id) const {
174  if (one_to_one_definite_ambigs_.empty()) return nullptr;
175  return one_to_one_definite_ambigs_[unichar_id];
176  }
177 
178  // Returns a pointer to the vector with all unichar ids that appear in the
179  // 'correct' part of the ambiguity pair when the given unichar id appears
180  // in the 'wrong' part of the ambiguity. E.g. if DangAmbigs file consists of
181  // m->rn,rn->m,m->iii, UnicharAmbigsForAdaption() called with unichar id of
182  // m will return a pointer to a vector with unichar ids of r,n,i.
184  UNICHAR_ID unichar_id) const {
185  if (ambigs_for_adaption_.empty()) return nullptr;
186  return ambigs_for_adaption_[unichar_id];
187  }
188 
189  // Similar to the above, but returns the vector of unichar ids for which
190  // the given unichar_id is an ambiguity (appears in the 'wrong' part of
191  // some ambiguity pair).
193  UNICHAR_ID unichar_id) const {
194  if (reverse_ambigs_for_adaption_.empty()) return nullptr;
195  return reverse_ambigs_for_adaption_[unichar_id];
196  }
197 
198  private:
199  bool ParseAmbiguityLine(int line_num, int version, int debug_level,
200  const UNICHARSET &unicharset, char *buffer,
201  int *test_ambig_part_size,
202  UNICHAR_ID *test_unichar_ids,
203  int *replacement_ambig_part_size,
204  char *replacement_string, int *type);
205  bool InsertIntoTable(UnicharAmbigsVector &table,
206  int test_ambig_part_size, UNICHAR_ID *test_unichar_ids,
207  int replacement_ambig_part_size,
208  const char *replacement_string, int type,
209  AmbigSpec *ambig_spec, UNICHARSET *unicharset);
210 
211  UnicharAmbigsVector dang_ambigs_;
212  UnicharAmbigsVector replace_ambigs_;
213  GenericVector<UnicharIdVector *> one_to_one_definite_ambigs_;
214  GenericVector<UnicharIdVector *> ambigs_for_adaption_;
215  GenericVector<UnicharIdVector *> reverse_ambigs_for_adaption_;
216 };
217 
218 } // namespace tesseract
219 
220 #endif // !defined(DISABLED_LEGACY_ENGINE)
221 
222 #endif // TESSERACT_CCUTIL_AMBIGS_H_
elst.h
tesseract::AmbigSpec::wrong_ngram
UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE+1]
Definition: ambigs.h:124
tesseract::AMBIG_TYPE_COUNT
Definition: ambigs.h:44
tesseract::CASE_AMBIG
Definition: ambigs.h:42
tesseract::DEFINITE_AMBIG
Definition: ambigs.h:40
tesseract::UnicharAmbigs::ReverseAmbigsForAdaption
const UnicharIdVector * ReverseAmbigsForAdaption(UNICHAR_ID unichar_id) const
Definition: ambigs.h:192
MAX_AMBIG_SIZE
#define MAX_AMBIG_SIZE
Definition: ambigs.h:31
tesseract::UnicharIdArrayUtils
Definition: ambigs.h:49
tesseract::NOT_AMBIG
Definition: ambigs.h:38
ELISTIZEH
#define ELISTIZEH(CLASSNAME)
Definition: elst.h:907
tesseract::UnicharAmbigs::AmbigsForAdaption
const UnicharIdVector * AmbigsForAdaption(UNICHAR_ID unichar_id) const
Definition: ambigs.h:183
tesseract::AmbigSpec::correct_fragments
UNICHAR_ID correct_fragments[MAX_AMBIG_SIZE+1]
Definition: ambigs.h:125
tesseract::AmbigSpec::correct_ngram_id
UNICHAR_ID correct_ngram_id
Definition: ambigs.h:126
tesseract::AmbigSpec::AmbigSpec
AmbigSpec()
Definition: ambigs.cpp:43
genericvector.h
tesseract::AmbigType
AmbigType
Definition: ambigs.h:37
unicharset.h
tesseract::UnicharAmbigs::dang_ambigs
const UnicharAmbigsVector & dang_ambigs() const
Definition: ambigs.h:145
tesseract::TFile
Definition: serialis.h:75
tesseract::UnicharAmbigs
Definition: ambigs.h:136
UNICHARSET
Definition: unicharset.h:145
tesseract::AmbigSpec::wrong_ngram_size
int wrong_ngram_size
Definition: ambigs.h:128
tesseract
Definition: baseapi.h:65
tesseract::UnicharIdArrayUtils::find_in
static int find_in(const UnicharIdVector &uid_vec, const UNICHAR_ID uid)
Definition: ambigs.h:71
tprintf.h
UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:36
tesseract::UnicharIdArrayUtils::compare
static int compare(const UNICHAR_ID *ptr1, const UNICHAR_ID *ptr2)
Definition: ambigs.h:55
GenericVector< UNICHAR_ID >
tesseract::UnicharIdArrayUtils::print
static void print(const UNICHAR_ID array[], const UNICHARSET &unicharset)
Definition: ambigs.h:91
tesseract::AmbigSpec
Definition: ambigs.h:107
tesseract::REPLACE_AMBIG
Definition: ambigs.h:39
unichar.h
tesseract::AmbigSpec::compare_ambig_specs
static int compare_ambig_specs(const void *spec1, const void *spec2)
Definition: ambigs.h:115
tesseract::UnicharAmbigs::~UnicharAmbigs
~UnicharAmbigs()
Definition: ambigs.h:139
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
tesseract::UnicharAmbigs::replace_ambigs
const UnicharAmbigsVector & replace_ambigs() const
Definition: ambigs.h:146
tesstrain_utils.type
type
Definition: tesstrain_utils.py:141
tesseract::AmbigSpec::~AmbigSpec
~AmbigSpec()=default
ELIST_LINK
Definition: elst.h:74
tesseract::UnicharAmbigs::OneToOneDefiniteAmbigs
const UnicharIdVector * OneToOneDefiniteAmbigs(UNICHAR_ID unichar_id) const
Definition: ambigs.h:172
UNICHARSET::id_to_unichar
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:290
GenericVector::size
int size() const
Definition: genericvector.h:71
tesseract::AmbigSpec::type
AmbigType type
Definition: ambigs.h:127
tesseract::SIMILAR_AMBIG
Definition: ambigs.h:41
tesseract::UnicharIdArrayUtils::copy
static int copy(const UNICHAR_ID src[], UNICHAR_ID dst[])
Definition: ambigs.h:81