tesseract  4.0.0-1-g2a2b
ambigs.cpp
Go to the documentation of this file.
1 // File: ambigs.cpp
3 // Description: Functions for dealing with ambiguities
4 // (training and recognition).
5 // Author: Daria Antonova
6 // Created: Mon Feb 5 11:26:43 PDT 2009
7 //
8 // (C) Copyright 2008, Google Inc.
9 // Licensed under the Apache License, Version 2.0 (the "License");
10 // you may not use this file except in compliance with the License.
11 // You may obtain a copy of the License at
12 // http://www.apache.org/licenses/LICENSE-2.0
13 // Unless required by applicable law or agreed to in writing, software
14 // distributed under the License is distributed on an "AS IS" BASIS,
15 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 // See the License for the specific language governing permissions and
17 // limitations under the License.
18 //
20 
21 #include "ambigs.h"
22 
23 #include <cstdio>
24 #include "helpers.h"
25 #include "universalambigs.h"
26 
27 #if defined(_WIN32) && !defined(__GNUC__)
28 #define strtok_r(str, delim, saveptr) strtok_s(str, delim, saveptr)
29 #endif /* _WIN32 && !__GNUC__ */
30 
31 namespace tesseract {
32 
33 // Maximum line size:
34 // 10 for sizes of ambigs, tabs, ambig type and newline
35 // UNICHAR_LEN * (MAX_AMBIG_SIZE + 1) for each part of the ambig
37 
// NOTE(review): the line that opens this definition (listing line 38) is
// absent from this listing; presumably this is the body of the AmbigSpec
// constructor -- confirm against the upstream source.
// Puts the members into their "empty" state: both id arrays are terminated
// at index 0 with INVALID_UNICHAR_ID, the ngram id is invalid, the type is
// NOT_AMBIG and the wrong-ngram length is 0.
39  wrong_ngram[0] = INVALID_UNICHAR_ID;
40  correct_fragments[0] = INVALID_UNICHAR_ID;
41  correct_ngram_id = INVALID_UNICHAR_ID;
42  type = NOT_AMBIG;
43  wrong_ngram_size = 0;
44 }
45 
47 
48 // Initializes the ambigs by adding a nullptr pointer to each table.
49 void UnicharAmbigs::InitUnicharAmbigs(const UNICHARSET& unicharset,
50  bool use_ambigs_for_adaption) {
51  for (int i = 0; i < unicharset.size(); ++i) {
52  replace_ambigs_.push_back(nullptr);
53  dang_ambigs_.push_back(nullptr);
54  one_to_one_definite_ambigs_.push_back(nullptr);
55  if (use_ambigs_for_adaption) {
56  ambigs_for_adaption_.push_back(nullptr);
57  reverse_ambigs_for_adaption_.push_back(nullptr);
58  }
59  }
60 }
61 
62 // Loads the universal ambigs that are useful for any language.
// The ambigs are read through a local TFile and handed to
// LoadUnicharAmbigs() with debug_level 0 and use_ambigs_for_adaption false.
// encoder_set and unicharset are forwarded unchanged.
63 void UnicharAmbigs::LoadUniversal(const UNICHARSET& encoder_set,
64  UNICHARSET* unicharset) {
65  TFile file;
// NOTE(review): listing line 66 is absent here; presumably it opens `file`
// on the built-in universal ambigs data (kUniversalAmbigsFile /
// ksizeofUniversalAmbigsFile) -- confirm against the upstream source.
67  LoadUnicharAmbigs(encoder_set, &file, 0, false, unicharset);
68 }
69 
// NOTE(review): the first line of this function's signature (listing line
// 70, carrying the return type, the name LoadUnicharAmbigs and the
// encoder_set parameter) is absent from this listing -- confirm against the
// upstream source.
//
// Reads ambiguity definitions line by line from ambig_file. Each line that
// parses is turned into an AmbigSpec and inserted into replace_ambigs_ or
// dang_ambigs_; one_to_one_definite_ambigs_ and (when
// use_ambigs_for_adaption is set) the adaption tables are updated as well.
// encoder_set is used to parse the lines, while unicharset is the set that
// InsertIntoTable() and the adaption code work with.
// With debug_level > 1 everything that was loaded is printed via tprintf.
71  TFile *ambig_file,
72  int debug_level,
73  bool use_ambigs_for_adaption,
74  UNICHARSET *unicharset) {
75  int i, j;
76  UnicharIdVector *adaption_ambigs_entry;
77  if (debug_level) tprintf("Reading ambiguities\n");
78 
79  int test_ambig_part_size;
80  int replacement_ambig_part_size;
81  // The space for buffer is allocated on the heap to avoid
82  // GCC frame size warning.
83  const int kBufferSize = 10 + 2 * kMaxAmbigStringSize;
84  char *buffer = new char[kBufferSize];
85  char replacement_string[kMaxAmbigStringSize];
86  UNICHAR_ID test_unichar_ids[MAX_AMBIG_SIZE + 1];
87  int line_num = 0;
// type is only overwritten when ParseAmbiguityLine() reads a type field, so
// it keeps its previous value for lines that carry none.
88  int type = NOT_AMBIG;
89 
90  // Determine the version of the ambigs file.
// A first line starting with 'v' carries the version number. Otherwise the
// file has no version header and is rewound so that the first line is parsed
// as data by the loop below.
91  int version = 0;
92  ASSERT_HOST(ambig_file->FGets(buffer, kBufferSize) != nullptr &&
93  strlen(buffer) > 0);
94  if (*buffer == 'v') {
95  version = static_cast<int>(strtol(buffer+1, nullptr, 10));
96  ++line_num;
97  } else {
98  ambig_file->Rewind();
99  }
100  while (ambig_file->FGets(buffer, kBufferSize) != nullptr) {
101  chomp_string(buffer);
102  if (debug_level > 2) tprintf("read line %s\n", buffer);
103  ++line_num;
// Lines that fail to parse are skipped.
104  if (!ParseAmbiguityLine(line_num, version, debug_level, encoder_set,
105  buffer, &test_ambig_part_size, test_unichar_ids,
106  &replacement_ambig_part_size,
107  replacement_string, &type)) continue;
108  // Construct AmbigSpec and add it to the appropriate AmbigSpec_LIST.
// InsertIntoTable() deletes ambig_spec itself when it returns false, so
// nothing has to be released here before moving on to the next line.
109  AmbigSpec *ambig_spec = new AmbigSpec();
110  if (!InsertIntoTable((type == REPLACE_AMBIG) ? replace_ambigs_
111  : dang_ambigs_,
112  test_ambig_part_size, test_unichar_ids,
113  replacement_ambig_part_size, replacement_string, type,
114  ambig_spec, unicharset))
115  continue;
116 
117  // Update one_to_one_definite_ambigs_.
// Only 1->1 ambiguities of type DEFINITE_AMBIG are recorded; the vector for
// the wrong unichar is created on first use.
118  if (test_ambig_part_size == 1 &&
119  replacement_ambig_part_size == 1 && type == DEFINITE_AMBIG) {
120  if (one_to_one_definite_ambigs_[test_unichar_ids[0]] == nullptr) {
121  one_to_one_definite_ambigs_[test_unichar_ids[0]] = new UnicharIdVector();
122  }
123  one_to_one_definite_ambigs_[test_unichar_ids[0]]->push_back(
124  ambig_spec->correct_ngram_id);
125  }
126  // Update ambigs_for_adaption_.
127  if (use_ambigs_for_adaption) {
128  GenericVector<UNICHAR_ID> encoding;
129  // Silently ignore invalid strings, as before, so it is safe to use a
130  // universal ambigs file.
131  if (unicharset->encode_string(replacement_string, true, &encoding,
132  nullptr, nullptr)) {
// Every unichar of the wrong ngram gets every unichar id of the encoded
// replacement string added to its adaption entry.
133  for (i = 0; i < test_ambig_part_size; ++i) {
134  if (ambigs_for_adaption_[test_unichar_ids[i]] == nullptr) {
135  ambigs_for_adaption_[test_unichar_ids[i]] = new UnicharIdVector();
136  }
137  adaption_ambigs_entry = ambigs_for_adaption_[test_unichar_ids[i]];
138  for (int r = 0; r < encoding.size(); ++r) {
139  UNICHAR_ID id_to_insert = encoding[r];
140  ASSERT_HOST(id_to_insert != INVALID_UNICHAR_ID);
141  // Add the new unichar id to adaption_ambigs_entry (only if the
142  // vector does not already contain it) keeping it in sorted order.
// The scan skips entries greater than id_to_insert and stops at the first
// entry that is <= id_to_insert, so the vector is kept in descending order.
143  for (j = 0; j < adaption_ambigs_entry->size() &&
144  (*adaption_ambigs_entry)[j] > id_to_insert; ++j);
145  if (j < adaption_ambigs_entry->size()) {
146  if ((*adaption_ambigs_entry)[j] != id_to_insert) {
147  adaption_ambigs_entry->insert(id_to_insert, j);
148  }
149  } else {
150  adaption_ambigs_entry->push_back(id_to_insert);
151  }
152  }
153  }
154  }
155  }
156  }
157  delete[] buffer;
158 
159  // Fill in reverse_ambigs_for_adaption from ambigs_for_adaption vector.
// For every id j listed in the entry of i, i is appended to the reverse
// entry of j.
160  if (use_ambigs_for_adaption) {
161  for (i = 0; i < ambigs_for_adaption_.size(); ++i) {
162  adaption_ambigs_entry = ambigs_for_adaption_[i];
163  if (adaption_ambigs_entry == nullptr) continue;
164  for (j = 0; j < adaption_ambigs_entry->size(); ++j) {
165  UNICHAR_ID ambig_id = (*adaption_ambigs_entry)[j];
166  if (reverse_ambigs_for_adaption_[ambig_id] == nullptr) {
167  reverse_ambigs_for_adaption_[ambig_id] = new UnicharIdVector();
168  }
169  reverse_ambigs_for_adaption_[ambig_id]->push_back(i);
170  }
171  }
172  }
173 
174  // Print what was read from the input file.
175  if (debug_level > 1) {
// tbl 0 prints replace_ambigs_, tbl 1 prints dang_ambigs_.
176  for (int tbl = 0; tbl < 2; ++tbl) {
177  const UnicharAmbigsVector &print_table =
178  (tbl == 0) ? replace_ambigs_ : dang_ambigs_;
179  for (i = 0; i < print_table.size(); ++i) {
180  AmbigSpec_LIST *lst = print_table[i];
181  if (lst == nullptr) continue;
182  if (!lst->empty()) {
183  tprintf("%s Ambiguities for %s:\n",
184  (tbl == 0) ? "Replaceable" : "Dangerous",
185  unicharset->debug_str(i).string());
186  }
187  AmbigSpec_IT lst_it(lst);
188  for (lst_it.mark_cycle_pt(); !lst_it.cycled_list(); lst_it.forward()) {
189  AmbigSpec *ambig_spec = lst_it.data();
190  tprintf("wrong_ngram:");
191  UnicharIdArrayUtils::print(ambig_spec->wrong_ngram, *unicharset);
192  tprintf("correct_fragments:");
193  UnicharIdArrayUtils::print(ambig_spec->correct_fragments, *unicharset);
194  }
195  }
196  }
197  if (use_ambigs_for_adaption) {
// vec_id 0 prints ambigs_for_adaption_, vec_id 1 prints the reverse table.
198  for (int vec_id = 0; vec_id < 2; ++vec_id) {
199  const GenericVector<UnicharIdVector *> &vec = (vec_id == 0) ?
200  ambigs_for_adaption_ : reverse_ambigs_for_adaption_;
201  for (i = 0; i < vec.size(); ++i) {
202  adaption_ambigs_entry = vec[i];
203  if (adaption_ambigs_entry != nullptr) {
204  tprintf("%sAmbigs for adaption for %s:\n",
205  (vec_id == 0) ? "" : "Reverse ",
206  unicharset->debug_str(i).string());
207  for (j = 0; j < adaption_ambigs_entry->size(); ++j) {
208  tprintf("%s ", unicharset->debug_str(
209  (*adaption_ambigs_entry)[j]).string());
210  }
211  tprintf("\n");
212  }
213  }
214  }
215  }
216  }
217 }
218 
219 bool UnicharAmbigs::ParseAmbiguityLine(
220  int line_num, int version, int debug_level, const UNICHARSET &unicharset,
221  char *buffer, int *test_ambig_part_size, UNICHAR_ID *test_unichar_ids,
222  int *replacement_ambig_part_size, char *replacement_string, int *type) {
223  if (version > 1) {
224  // Simpler format is just wrong-string correct-string type\n.
225  STRING input(buffer);
226  GenericVector<STRING> fields;
227  input.split(' ', &fields);
228  if (fields.size() != 3) {
229  if (debug_level) tprintf(kIllegalMsg, line_num);
230  return false;
231  }
232  // Encode wrong-string.
233  GenericVector<UNICHAR_ID> unichars;
234  if (!unicharset.encode_string(fields[0].string(), true, &unichars, nullptr,
235  nullptr)) {
236  return false;
237  }
238  *test_ambig_part_size = unichars.size();
239  if (*test_ambig_part_size > MAX_AMBIG_SIZE) {
240  if (debug_level)
241  tprintf("Too many unichars in ambiguity on line %d\n", line_num);
242  return false;
243  }
244  // Copy encoded string to output.
245  for (int i = 0; i < unichars.size(); ++i)
246  test_unichar_ids[i] = unichars[i];
247  test_unichar_ids[unichars.size()] = INVALID_UNICHAR_ID;
248  // Encode replacement-string to check validity.
249  if (!unicharset.encode_string(fields[1].string(), true, &unichars, nullptr,
250  nullptr)) {
251  return false;
252  }
253  *replacement_ambig_part_size = unichars.size();
254  if (*replacement_ambig_part_size > MAX_AMBIG_SIZE) {
255  if (debug_level)
256  tprintf("Too many unichars in ambiguity on line %d\n", line_num);
257  return false;
258  }
259  if (sscanf(fields[2].string(), "%d", type) != 1) {
260  if (debug_level) tprintf(kIllegalMsg, line_num);
261  return false;
262  }
263  snprintf(replacement_string, kMaxAmbigStringSize, "%s", fields[1].string());
264  return true;
265  }
266  int i;
267  char *token;
268  char *next_token;
269  if (!(token = strtok_r(buffer, kAmbigDelimiters, &next_token)) ||
270  !sscanf(token, "%d", test_ambig_part_size) ||
271  *test_ambig_part_size <= 0) {
272  if (debug_level) tprintf(kIllegalMsg, line_num);
273  return false;
274  }
275  if (*test_ambig_part_size > MAX_AMBIG_SIZE) {
276  if (debug_level)
277  tprintf("Too many unichars in ambiguity on line %d\n", line_num);
278  return false;
279  }
280  for (i = 0; i < *test_ambig_part_size; ++i) {
281  if (!(token = strtok_r(nullptr, kAmbigDelimiters, &next_token))) break;
282  if (!unicharset.contains_unichar(token)) {
283  if (debug_level) tprintf(kIllegalUnicharMsg, token);
284  break;
285  }
286  test_unichar_ids[i] = unicharset.unichar_to_id(token);
287  }
288  test_unichar_ids[i] = INVALID_UNICHAR_ID;
289 
290  if (i != *test_ambig_part_size ||
291  !(token = strtok_r(nullptr, kAmbigDelimiters, &next_token)) ||
292  !sscanf(token, "%d", replacement_ambig_part_size) ||
293  *replacement_ambig_part_size <= 0) {
294  if (debug_level) tprintf(kIllegalMsg, line_num);
295  return false;
296  }
297  if (*replacement_ambig_part_size > MAX_AMBIG_SIZE) {
298  if (debug_level)
299  tprintf("Too many unichars in ambiguity on line %d\n", line_num);
300  return false;
301  }
302  replacement_string[0] = '\0';
303  for (i = 0; i < *replacement_ambig_part_size; ++i) {
304  if (!(token = strtok_r(nullptr, kAmbigDelimiters, &next_token))) break;
305  strcat(replacement_string, token);
306  if (!unicharset.contains_unichar(token)) {
307  if (debug_level) tprintf(kIllegalUnicharMsg, token);
308  break;
309  }
310  }
311  if (i != *replacement_ambig_part_size) {
312  if (debug_level) tprintf(kIllegalMsg, line_num);
313  return false;
314  }
315  if (version > 0) {
316  // The next field being true indicates that the abiguity should
317  // always be substituted (e.g. '' should always be changed to ").
318  // For such "certain" n -> m ambigs tesseract will insert character
319  // fragments for the n pieces in the unicharset. AmbigsFound()
320  // will then replace the incorrect ngram with the character
321  // fragments of the correct character (or ngram if m > 1).
322  // Note that if m > 1, an ngram will be inserted into the
323  // modified word, not the individual unigrams. Tesseract
324  // has limited support for ngram unichar (e.g. dawg permuter).
325  if (!(token = strtok_r(nullptr, kAmbigDelimiters, &next_token)) ||
326  !sscanf(token, "%d", type)) {
327  if (debug_level) tprintf(kIllegalMsg, line_num);
328  return false;
329  }
330  }
331  return true;
332 }
333 
334 bool UnicharAmbigs::InsertIntoTable(
335  UnicharAmbigsVector &table, int test_ambig_part_size,
336  UNICHAR_ID *test_unichar_ids, int replacement_ambig_part_size,
337  const char *replacement_string, int type,
338  AmbigSpec *ambig_spec, UNICHARSET *unicharset) {
339  ambig_spec->type = static_cast<AmbigType>(type);
340  if (test_ambig_part_size == 1 && replacement_ambig_part_size == 1 &&
341  unicharset->to_lower(test_unichar_ids[0]) ==
342  unicharset->to_lower(unicharset->unichar_to_id(replacement_string))) {
343  ambig_spec->type = CASE_AMBIG;
344  }
345 
346  ambig_spec->wrong_ngram_size =
347  UnicharIdArrayUtils::copy(test_unichar_ids, ambig_spec->wrong_ngram);
348 
349  // Since we need to maintain a constant number of unichar positions in
350  // order to construct ambig_blob_choices vector in NoDangerousAmbig(), for
351  // each n->m ambiguity we will have to place n character fragments of the
352  // correct ngram into the corresponding positions in the vector (e.g. given
353  // "vvvvw" and vvvv->ww we will place v and |ww|0|4 into position 0, v and
354  // |ww|1|4 into position 1 and so on. The correct ngram is reconstructed
355  // from fragments by dawg_permute_and_select().
356 
357  // Insert the corresponding correct ngram into the unicharset.
358  // Unicharset code assumes that the "base" ngram is inserted into
359  // the unicharset before fragments of this ngram are inserted.
360  unicharset->unichar_insert(replacement_string, OldUncleanUnichars::kTrue);
361  ambig_spec->correct_ngram_id =
362  unicharset->unichar_to_id(replacement_string);
363  if (replacement_ambig_part_size > 1) {
364  unicharset->set_isngram(ambig_spec->correct_ngram_id, true);
365  }
366  // Add the corresponding fragments of the wrong ngram to unicharset.
367  int i;
368  for (i = 0; i < test_ambig_part_size; ++i) {
369  UNICHAR_ID unichar_id;
370  if (test_ambig_part_size == 1) {
371  unichar_id = ambig_spec->correct_ngram_id;
372  } else {
373  STRING frag_str = CHAR_FRAGMENT::to_string(
374  replacement_string, i, test_ambig_part_size, false);
375  unicharset->unichar_insert(frag_str.string(), OldUncleanUnichars::kTrue);
376  unichar_id = unicharset->unichar_to_id(frag_str.string());
377  }
378  ambig_spec->correct_fragments[i] = unichar_id;
379  }
380  ambig_spec->correct_fragments[i] = INVALID_UNICHAR_ID;
381 
382  // Add AmbigSpec for this ambiguity to the corresponding AmbigSpec_LIST.
383  // Keep AmbigSpec_LISTs sorted by AmbigSpec.wrong_ngram.
384  if (table[test_unichar_ids[0]] == nullptr) {
385  table[test_unichar_ids[0]] = new AmbigSpec_LIST();
386  }
387  if (table[test_unichar_ids[0]]->add_sorted(
388  AmbigSpec::compare_ambig_specs, true, ambig_spec))
389  return true;
390  delete ambig_spec;
391  return false;
392 }
393 
394 } // namespace tesseract
const int kMaxAmbigStringSize
Definition: ambigs.cpp:36
int UNICHAR_ID
Definition: unichar.h:35
int size() const
Definition: genericvector.h:71
bool encode_string(const char *str, bool give_up_on_failure, GenericVector< UNICHAR_ID > *encoding, GenericVector< char > *lengths, int *encoded_length) const
Definition: unicharset.cpp:258
const char kUniversalAmbigsFile[]
#define MAX_AMBIG_SIZE
Definition: ambigs.h:30
const char * string() const
Definition: strngs.cpp:196
static int compare_ambig_specs(const void *spec1, const void *spec2)
Definition: ambigs.h:122
char * FGets(char *buffer, int buffer_size)
Definition: serialis.cpp:248
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209
void set_isngram(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:451
void LoadUniversal(const UNICHARSET &encoder_set, UNICHARSET *unicharset)
Definition: ambigs.cpp:63
#define UNICHAR_LEN
Definition: unichar.h:31
GenericVector< UNICHAR_ID > UnicharIdVector
Definition: ambigs.h:34
void unichar_insert(const char *const unichar_repr, OldUncleanUnichars old_style)
Definition: unicharset.cpp:625
void insert(const T &t, int index)
void chomp_string(char *str)
Definition: helpers.h:83
static void print(const UNICHAR_ID array[], const UNICHARSET &unicharset)
Definition: ambigs.h:98
STRING to_string() const
Definition: unicharset.h:80
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:670
STRING debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:342
static int copy(const UNICHAR_ID src[], UNICHAR_ID dst[])
Definition: ambigs.h:88
#define ELISTIZE(CLASSNAME)
Definition: elst.h:961
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
bool Open(const STRING &filename, FileReader reader)
Definition: serialis.cpp:196
int push_back(T object)
UNICHAR_ID correct_ngram_id
Definition: ambigs.h:133
AmbigType
Definition: ambigs.h:44
Definition: strngs.h:45
UNICHAR_ID correct_fragments[MAX_AMBIG_SIZE+1]
Definition: ambigs.h:132
const int ksizeofUniversalAmbigsFile
AmbigType type
Definition: ambigs.h:134
void LoadUnicharAmbigs(const UNICHARSET &encoder_set, TFile *ambigs_file, int debug_level, bool use_ambigs_for_adaption, UNICHARSET *unicharset)
Definition: ambigs.cpp:70
GenericVector< AmbigSpec_LIST * > UnicharAmbigsVector
Definition: ambigs.h:141
UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:699
UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE+1]
Definition: ambigs.h:131
#define ASSERT_HOST(x)
Definition: errcode.h:84