tesseract  4.0.0-1-g2a2b
trie.cpp
Go to the documentation of this file.
1 /* -*-C-*-
2  ********************************************************************************
3  *
4  * File: trie.cpp (Formerly trie.c)
5  * Description: Functions to build a trie data structure.
6  * Author: Mark Seaman, OCR Technology
7  * Created: Fri Oct 16 14:37:00 1987
8  * Modified: Fri Jul 26 12:18:10 1991 (Mark Seaman) marks@hpgrlt
9  * Language: C
10  * Package: N/A
11  * Status: Reusable Software Component
12  *
13  * (c) Copyright 1987, Hewlett-Packard Company.
14  ** Licensed under the Apache License, Version 2.0 (the "License");
15  ** you may not use this file except in compliance with the License.
16  ** You may obtain a copy of the License at
17  ** http://www.apache.org/licenses/LICENSE-2.0
18  ** Unless required by applicable law or agreed to in writing, software
19  ** distributed under the License is distributed on an "AS IS" BASIS,
20  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21  ** See the License for the specific language governing permissions and
22  ** limitations under the License.
23  *
24  *********************************************************************************/
25 /*----------------------------------------------------------------------
26  I n c l u d e s
27 ----------------------------------------------------------------------*/
28 
29 #include "trie.h"
30 
31 #include "callcpp.h"
32 #include "dawg.h"
33 #include "dict.h"
34 #include "genericvector.h"
35 #include "helpers.h"
36 #include "kdpair.h"
37 
38 namespace tesseract {
39 
40 const char kDoNotReverse[] = "RRP_DO_NO_REVERSE";
41 const char kReverseIfHasRTL[] = "RRP_REVERSE_IF_HAS_RTL";
42 const char kForceReverse[] = "RRP_FORCE_REVERSE";
43 
44 const char * const RTLReversePolicyNames[] = {
48 };
49 
50 const char Trie::kAlphaPatternUnicode[] = "\u2000";
51 const char Trie::kDigitPatternUnicode[] = "\u2001";
52 const char Trie::kAlphanumPatternUnicode[] = "\u2002";
53 const char Trie::kPuncPatternUnicode[] = "\u2003";
54 const char Trie::kLowerPatternUnicode[] = "\u2004";
55 const char Trie::kUpperPatternUnicode[] = "\u2005";
56 
57 const char *Trie::get_reverse_policy_name(RTLReversePolicy reverse_policy) {
58  return RTLReversePolicyNames[reverse_policy];
59 }
60 
61 // Reset the Trie to empty.
62 void Trie::clear() {
64  nodes_.clear();
66  num_edges_ = 0;
67  new_dawg_node(); // Need to allocate node 0.
68 }
69 
70 bool Trie::edge_char_of(NODE_REF node_ref, NODE_REF next_node,
71  int direction, bool word_end, UNICHAR_ID unichar_id,
72  EDGE_RECORD **edge_ptr, EDGE_INDEX *edge_index) const {
73  if (debug_level_ == 3) {
74  tprintf("edge_char_of() given node_ref " REFFORMAT " next_node " REFFORMAT
75  " direction %d word_end %d unichar_id %d, exploring node:\n",
76  node_ref, next_node, direction, word_end, unichar_id);
77  if (node_ref != NO_EDGE) {
78  print_node(node_ref, nodes_[node_ref]->forward_edges.size());
79  }
80  }
81  if (node_ref == NO_EDGE) return false;
82  assert(node_ref < nodes_.size());
83  EDGE_VECTOR &vec = (direction == FORWARD_EDGE) ?
84  nodes_[node_ref]->forward_edges : nodes_[node_ref]->backward_edges;
85  int vec_size = vec.size();
86  if (node_ref == 0 && direction == FORWARD_EDGE) { // binary search
87  EDGE_INDEX start = 0;
88  EDGE_INDEX end = vec_size - 1;
89  EDGE_INDEX k;
90  int compare;
91  while (start <= end) {
92  k = (start + end) >> 1; // (start + end) / 2
93  compare = given_greater_than_edge_rec(next_node, word_end,
94  unichar_id, vec[k]);
95  if (compare == 0) { // given == vec[k]
96  *edge_ptr = &(vec[k]);
97  *edge_index = k;
98  return true;
99  } else if (compare == 1) { // given > vec[k]
100  start = k + 1;
101  } else { // given < vec[k]
102  end = k - 1;
103  }
104  }
105  } else { // linear search
106  for (int i = 0; i < vec_size; ++i) {
107  EDGE_RECORD &edge_rec = vec[i];
108  if (edge_rec_match(next_node, word_end, unichar_id,
109  next_node_from_edge_rec(edge_rec),
110  end_of_word_from_edge_rec(edge_rec),
111  unichar_id_from_edge_rec(edge_rec))) {
112  *edge_ptr = &(edge_rec);
113  *edge_index = i;
114  return true;
115  }
116  }
117  }
118  return false; // not found
119 }
120 
121 bool Trie::add_edge_linkage(NODE_REF node1, NODE_REF node2, bool marker_flag,
122  int direction, bool word_end,
123  UNICHAR_ID unichar_id) {
124  EDGE_VECTOR *vec = (direction == FORWARD_EDGE) ?
125  &(nodes_[node1]->forward_edges) : &(nodes_[node1]->backward_edges);
126  int search_index;
127  if (node1 == 0 && direction == FORWARD_EDGE) {
128  search_index = 0; // find the index to make the add sorted
129  while (search_index < vec->size() &&
130  given_greater_than_edge_rec(node2, word_end, unichar_id,
131  (*vec)[search_index]) == 1) {
132  search_index++;
133  }
134  } else {
135  search_index = vec->size(); // add is unsorted, so index does not matter
136  }
137  EDGE_RECORD edge_rec;
138  link_edge(&edge_rec, node2, marker_flag, direction, word_end, unichar_id);
139  if (node1 == 0 && direction == BACKWARD_EDGE &&
141  EDGE_INDEX edge_index = root_back_freelist_.pop_back();
142  (*vec)[edge_index] = edge_rec;
143  } else if (search_index < vec->size()) {
144  vec->insert(edge_rec, search_index);
145  } else {
146  vec->push_back(edge_rec);
147  }
148  if (debug_level_ > 1) {
149  tprintf("new edge in nodes_[" REFFORMAT "]: ", node1);
150  print_edge_rec(edge_rec);
151  tprintf("\n");
152  }
153  num_edges_++;
154  return true;
155 }
156 
158  NODE_REF the_next_node,
159  bool marker_flag,
160  UNICHAR_ID unichar_id) {
161  EDGE_RECORD *back_edge_ptr;
162  EDGE_INDEX back_edge_index;
163  ASSERT_HOST(edge_char_of(the_next_node, NO_EDGE, BACKWARD_EDGE, false,
164  unichar_id, &back_edge_ptr, &back_edge_index));
165  if (marker_flag) {
166  *back_edge_ptr |= (MARKER_FLAG << flag_start_bit_);
167  *edge_ptr |= (MARKER_FLAG << flag_start_bit_);
168  }
169  // Mark both directions as end of word.
170  *back_edge_ptr |= (WERD_END_FLAG << flag_start_bit_);
171  *edge_ptr |= (WERD_END_FLAG << flag_start_bit_);
172 }
173 
175  const GenericVector<bool> *repetitions) {
176  if (word.length() <= 0) return false; // can't add empty words
177  if (repetitions != nullptr) ASSERT_HOST(repetitions->size() == word.length());
178  // Make sure the word does not contain invalid unchar ids.
179  for (int i = 0; i < word.length(); ++i) {
180  if (word.unichar_id(i) < 0 ||
181  word.unichar_id(i) >= unicharset_size_) return false;
182  }
183 
184  EDGE_RECORD *edge_ptr;
185  NODE_REF last_node = 0;
186  NODE_REF the_next_node;
187  bool marker_flag = false;
188  EDGE_INDEX edge_index;
189  int i;
190  int32_t still_finding_chars = true;
191  int32_t word_end = false;
192  bool add_failed = false;
193  bool found;
194 
195  if (debug_level_ > 1) word.print("\nAdding word: ");
196 
197  UNICHAR_ID unichar_id;
198  for (i = 0; i < word.length() - 1; ++i) {
199  unichar_id = word.unichar_id(i);
200  marker_flag = (repetitions != nullptr) ? (*repetitions)[i] : false;
201  if (debug_level_ > 1) tprintf("Adding letter %d\n", unichar_id);
202  if (still_finding_chars) {
203  found = edge_char_of(last_node, NO_EDGE, FORWARD_EDGE, word_end,
204  unichar_id, &edge_ptr, &edge_index);
205  if (found && debug_level_ > 1) {
206  tprintf("exploring edge " REFFORMAT " in node " REFFORMAT "\n",
207  edge_index, last_node);
208  }
209  if (!found) {
210  still_finding_chars = false;
211  } else if (next_node_from_edge_rec(*edge_ptr) == 0) {
212  // We hit the end of an existing word, but the new word is longer.
213  // In this case we have to disconnect the existing word from the
214  // backwards root node, mark the current position as end-of-word
215  // and add new nodes for the increased length. Disconnecting the
216  // existing word from the backwards root node requires a linear
217  // search, so it is much faster to add the longest words first,
218  // to avoid having to come here.
219  word_end = true;
220  still_finding_chars = false;
221  remove_edge(last_node, 0, word_end, unichar_id);
222  } else {
223  // We have to add a new branch here for the new word.
224  if (marker_flag) set_marker_flag_in_edge_rec(edge_ptr);
225  last_node = next_node_from_edge_rec(*edge_ptr);
226  }
227  }
228  if (!still_finding_chars) {
229  the_next_node = new_dawg_node();
230  if (debug_level_ > 1)
231  tprintf("adding node " REFFORMAT "\n", the_next_node);
232  if (the_next_node == 0) {
233  add_failed = true;
234  break;
235  }
236  if (!add_new_edge(last_node, the_next_node,
237  marker_flag, word_end, unichar_id)) {
238  add_failed = true;
239  break;
240  }
241  word_end = false;
242  last_node = the_next_node;
243  }
244  }
245  the_next_node = 0;
246  unichar_id = word.unichar_id(i);
247  marker_flag = (repetitions != nullptr) ? (*repetitions)[i] : false;
248  if (debug_level_ > 1) tprintf("Adding letter %d\n", unichar_id);
249  if (still_finding_chars &&
250  edge_char_of(last_node, NO_EDGE, FORWARD_EDGE, false,
251  unichar_id, &edge_ptr, &edge_index)) {
252  // An extension of this word already exists in the trie, so we
253  // only have to add the ending flags in both directions.
254  add_word_ending(edge_ptr, next_node_from_edge_rec(*edge_ptr),
255  marker_flag, unichar_id);
256  } else {
257  // Add a link to node 0. All leaves connect to node 0 so the back links can
258  // be used in reduction to a dawg. This root backward node has one edge
259  // entry for every word, (except prefixes of longer words) so it is huge.
260  if (!add_failed &&
261  !add_new_edge(last_node, the_next_node, marker_flag, true, unichar_id))
262  add_failed = true;
263  }
264  if (add_failed) {
265  tprintf("Re-initializing document dictionary...\n");
266  clear();
267  return false;
268  } else {
269  return true;
270  }
271 }
272 
274  TRIE_NODE_RECORD *node = new TRIE_NODE_RECORD();
275  nodes_.push_back(node);
276  return nodes_.length() - 1;
277 }
278 
279 // Sort function to sort words by decreasing order of length.
280 static int sort_strings_by_dec_length(const void* v1, const void* v2) {
281  const STRING *s1 = static_cast<const STRING *>(v1);
282  const STRING *s2 = static_cast<const STRING *>(v2);
283  return s2->length() - s1->length();
284 }
285 
286 bool Trie::read_and_add_word_list(const char *filename,
287  const UNICHARSET &unicharset,
288  Trie::RTLReversePolicy reverse_policy) {
289  GenericVector<STRING> word_list;
290  if (!read_word_list(filename, &word_list)) return false;
291  word_list.sort(sort_strings_by_dec_length);
292  return add_word_list(word_list, unicharset, reverse_policy);
293 }
294 
295 bool Trie::read_word_list(const char *filename,
296  GenericVector<STRING>* words) {
297  FILE *word_file;
298  char line_str[CHARS_PER_LINE];
299  int word_count = 0;
300 
301  word_file = fopen(filename, "rb");
302  if (word_file == nullptr) return false;
303 
304  while (fgets(line_str, sizeof(line_str), word_file) != nullptr) {
305  chomp_string(line_str); // remove newline
306  STRING word_str(line_str);
307  ++word_count;
308  if (debug_level_ && word_count % 10000 == 0)
309  tprintf("Read %d words so far\n", word_count);
310  words->push_back(word_str);
311  }
312  if (debug_level_)
313  tprintf("Read %d words total.\n", word_count);
314  fclose(word_file);
315  return true;
316 }
317 
319  const UNICHARSET &unicharset,
320  Trie::RTLReversePolicy reverse_policy) {
321  for (int i = 0; i < words.size(); ++i) {
322  WERD_CHOICE word(words[i].string(), unicharset);
323  if (word.length() == 0 || word.contains_unichar_id(INVALID_UNICHAR_ID))
324  continue;
325  if ((reverse_policy == RRP_REVERSE_IF_HAS_RTL &&
326  word.has_rtl_unichar_id()) ||
327  reverse_policy == RRP_FORCE_REVERSE) {
329  }
330  if (!word_in_dawg(word)) {
331  add_word_to_dawg(word);
332  if (!word_in_dawg(word)) {
333  tprintf("Error: word '%s' not in DAWG after adding it\n",
334  words[i].string());
335  return false;
336  }
337  }
338  }
339  return true;
340 }
341 
349  unicharset->unichar_insert(kPuncPatternUnicode);
355  initialized_patterns_ = true;
356  unicharset_size_ = unicharset->size();
357 }
358 
360  const UNICHARSET &unicharset,
361  GenericVector<UNICHAR_ID> *vec) const {
362  bool is_alpha = unicharset.get_isalpha(unichar_id);
363  if (is_alpha) {
366  if (unicharset.get_islower(unichar_id)) {
368  } else if (unicharset.get_isupper(unichar_id)) {
370  }
371  }
372  if (unicharset.get_isdigit(unichar_id)) {
374  if (!is_alpha) vec->push_back(alphanum_pattern_);
375  }
376  if (unicharset.get_ispunctuation(unichar_id)) {
377  vec->push_back(punc_pattern_);
378  }
379 }
380 
382  if (ch == 'c') {
383  return alpha_pattern_;
384  } else if (ch == 'd') {
385  return digit_pattern_;
386  } else if (ch == 'n') {
387  return alphanum_pattern_;
388  } else if (ch == 'p') {
389  return punc_pattern_;
390  } else if (ch == 'a') {
391  return lower_pattern_;
392  } else if (ch == 'A') {
393  return upper_pattern_;
394  } else {
395  return INVALID_UNICHAR_ID;
396  }
397 }
398 
399 bool Trie::read_pattern_list(const char *filename,
400  const UNICHARSET &unicharset) {
401  if (!initialized_patterns_) {
402  tprintf("please call initialize_patterns() before read_pattern_list()\n");
403  return false;
404  }
405 
406  FILE *pattern_file = fopen(filename, "rb");
407  if (pattern_file == nullptr) {
408  tprintf("Error opening pattern file %s\n", filename);
409  return false;
410  }
411 
412  int pattern_count = 0;
413  char string[CHARS_PER_LINE];
414  while (fgets(string, CHARS_PER_LINE, pattern_file) != nullptr) {
415  chomp_string(string); // remove newline
416  // Parse the pattern and construct a unichar id vector.
417  // Record the number of repetitions of each unichar in the parallel vector.
418  WERD_CHOICE word(&unicharset);
419  GenericVector<bool> repetitions_vec;
420  const char *str_ptr = string;
421  int step = unicharset.step(str_ptr);
422  bool failed = false;
423  while (step > 0) {
424  UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID;
425  if (step == 1 && *str_ptr == '\\') {
426  ++str_ptr;
427  if (*str_ptr == '\\') { // regular '\' unichar that was escaped
428  curr_unichar_id = unicharset.unichar_to_id(str_ptr, step);
429  } else {
430  if (word.length() < kSaneNumConcreteChars) {
431  tprintf("Please provide at least %d concrete characters at the"
432  " beginning of the pattern\n", kSaneNumConcreteChars);
433  failed = true;
434  break;
435  }
436  // Parse character class from expression.
437  curr_unichar_id = character_class_to_pattern(*str_ptr);
438  }
439  } else {
440  curr_unichar_id = unicharset.unichar_to_id(str_ptr, step);
441  }
442  if (curr_unichar_id == INVALID_UNICHAR_ID) {
443  failed = true;
444  break; // failed to parse this pattern
445  }
446  word.append_unichar_id(curr_unichar_id, 1, 0.0, 0.0);
447  repetitions_vec.push_back(false);
448  str_ptr += step;
449  step = unicharset.step(str_ptr);
450  // Check if there is a repetition pattern specified after this unichar.
451  if (step == 1 && *str_ptr == '\\' && *(str_ptr+1) == '*') {
452  repetitions_vec[repetitions_vec.size()-1] = true;
453  str_ptr += 2;
454  step = unicharset.step(str_ptr);
455  }
456  }
457  if (failed) {
458  tprintf("Invalid user pattern %s\n", string);
459  continue;
460  }
461  // Insert the pattern into the trie.
462  if (debug_level_ > 2) {
463  tprintf("Inserting expanded user pattern %s\n",
464  word.debug_string().string());
465  }
466  if (!this->word_in_dawg(word)) {
467  this->add_word_to_dawg(word, &repetitions_vec);
468  if (!this->word_in_dawg(word)) {
469  tprintf("Error: failed to insert pattern '%s'\n", string);
470  }
471  }
472  ++pattern_count;
473  }
474  if (debug_level_) {
475  tprintf("Read %d valid patterns from %s\n", pattern_count, filename);
476  }
477  fclose(pattern_file);
478  return true;
479 }
480 
482  bool word_end, UNICHAR_ID unichar_id) {
483  EDGE_RECORD *edge_ptr = nullptr;
484  EDGE_INDEX edge_index = 0;
485  ASSERT_HOST(edge_char_of(node1, node2, direction, word_end,
486  unichar_id, &edge_ptr, &edge_index));
487  if (debug_level_ > 1) {
488  tprintf("removed edge in nodes_[" REFFORMAT "]: ", node1);
489  print_edge_rec(*edge_ptr);
490  tprintf("\n");
491  }
492  if (direction == FORWARD_EDGE) {
493  nodes_[node1]->forward_edges.remove(edge_index);
494  } else if (node1 == 0) {
495  KillEdge(&nodes_[node1]->backward_edges[edge_index]);
496  root_back_freelist_.push_back(edge_index);
497  } else {
498  nodes_[node1]->backward_edges.remove(edge_index);
499  }
500  --num_edges_;
501 }
502 
503 // Some optimizations employed in add_word_to_dawg and trie_to_dawg:
504 // 1 Avoid insertion sorting or bubble sorting the tail root node
505 // (back links on node 0, a list of all the leaves.). The node is
506 // huge, and sorting it with n^2 time is terrible.
507 // 2 Avoid using GenericVector::remove on the tail root node.
508 // (a) During add of words to the trie, zero-out the unichars and
509 // keep a freelist of spaces to re-use.
510 // (b) During reduction, just zero-out the unichars of deleted back
511 // links, skipping zero entries while searching.
512 // 3 Avoid linear search of the tail root node. This has to be done when
513 // a suffix is added to an existing word. Adding words by decreasing
514 // length avoids this problem entirely. Words can still be added in
515 // any order, but it is faster to add the longest first.
517  root_back_freelist_.clear(); // Will be invalided by trie_to_dawg.
518  if (debug_level_ > 2) {
519  print_all("Before reduction:", MAX_NODE_EDGES_DISPLAY);
520  }
521  NODE_MARKER reduced_nodes = new bool[nodes_.size()];
522  for (int i = 0; i < nodes_.size(); i++) reduced_nodes[i] = 0;
523  this->reduce_node_input(0, reduced_nodes);
524  delete[] reduced_nodes;
525 
526  if (debug_level_ > 2) {
527  print_all("After reduction:", MAX_NODE_EDGES_DISPLAY);
528  }
529  // Build a translation map from node indices in nodes_ vector to
530  // their target indices in EDGE_ARRAY.
531  NODE_REF *node_ref_map = new NODE_REF[nodes_.size() + 1];
532  int i, j;
533  node_ref_map[0] = 0;
534  for (i = 0; i < nodes_.size(); ++i) {
535  node_ref_map[i+1] = node_ref_map[i] + nodes_[i]->forward_edges.size();
536  }
537  int num_forward_edges = node_ref_map[i];
538 
539  // Convert nodes_ vector into EDGE_ARRAY translating the next node references
540  // in edges using node_ref_map. Empty nodes and backward edges are dropped.
541  EDGE_ARRAY edge_array = new EDGE_RECORD[num_forward_edges];
542  EDGE_ARRAY edge_array_ptr = edge_array;
543  for (i = 0; i < nodes_.size(); ++i) {
544  TRIE_NODE_RECORD *node_ptr = nodes_[i];
545  int end = node_ptr->forward_edges.size();
546  for (j = 0; j < end; ++j) {
547  EDGE_RECORD &edge_rec = node_ptr->forward_edges[j];
548  NODE_REF node_ref = next_node_from_edge_rec(edge_rec);
549  ASSERT_HOST(node_ref < nodes_.size());
550  UNICHAR_ID unichar_id = unichar_id_from_edge_rec(edge_rec);
551  link_edge(edge_array_ptr, node_ref_map[node_ref], false, FORWARD_EDGE,
552  end_of_word_from_edge_rec(edge_rec), unichar_id);
553  if (j == end - 1) set_marker_flag_in_edge_rec(edge_array_ptr);
554  ++edge_array_ptr;
555  }
556  }
557  delete[] node_ref_map;
558 
559  return new SquishedDawg(edge_array, num_forward_edges, type_, lang_,
561 }
562 
564  const EDGE_RECORD &edge1,
565  const EDGE_RECORD &edge2) {
566  if (debug_level_ > 1) {
567  tprintf("\nCollapsing node %" PRIi64 ":\n", node);
569  tprintf("Candidate edges: ");
570  print_edge_rec(edge1);
571  tprintf(", ");
572  print_edge_rec(edge2);
573  tprintf("\n\n");
574  }
575  NODE_REF next_node1 = next_node_from_edge_rec(edge1);
576  NODE_REF next_node2 = next_node_from_edge_rec(edge2);
577  TRIE_NODE_RECORD *next_node2_ptr = nodes_[next_node2];
578  // Translate all edges going to/from next_node2 to go to/from next_node1.
579  EDGE_RECORD *edge_ptr = nullptr;
580  EDGE_INDEX edge_index;
581  int i;
582  // The backward link in node to next_node2 will be zeroed out by the caller.
583  // Copy all the backward links in next_node2 to node next_node1
584  for (i = 0; i < next_node2_ptr->backward_edges.size(); ++i) {
585  const EDGE_RECORD &bkw_edge = next_node2_ptr->backward_edges[i];
586  NODE_REF curr_next_node = next_node_from_edge_rec(bkw_edge);
587  UNICHAR_ID curr_unichar_id = unichar_id_from_edge_rec(bkw_edge);
588  int curr_word_end = end_of_word_from_edge_rec(bkw_edge);
589  bool marker_flag = marker_flag_from_edge_rec(bkw_edge);
590  add_edge_linkage(next_node1, curr_next_node, marker_flag, BACKWARD_EDGE,
591  curr_word_end, curr_unichar_id);
592  // Relocate the corresponding forward edge in curr_next_node
593  ASSERT_HOST(edge_char_of(curr_next_node, next_node2, FORWARD_EDGE,
594  curr_word_end, curr_unichar_id,
595  &edge_ptr, &edge_index));
596  set_next_node_in_edge_rec(edge_ptr, next_node1);
597  }
598  int next_node2_num_edges = (next_node2_ptr->forward_edges.size() +
599  next_node2_ptr->backward_edges.size());
600  if (debug_level_ > 1) {
601  tprintf("removed %d edges from node " REFFORMAT "\n",
602  next_node2_num_edges, next_node2);
603  }
604  next_node2_ptr->forward_edges.clear();
605  next_node2_ptr->backward_edges.clear();
606  num_edges_ -= next_node2_num_edges;
607  return true;
608 }
609 
611  UNICHAR_ID unichar_id,
612  NODE_REF node,
613  EDGE_VECTOR* backward_edges,
614  NODE_MARKER reduced_nodes) {
615  if (debug_level_ > 1)
616  tprintf("reduce_lettered_edges(edge=" REFFORMAT ")\n", edge_index);
617  // Compare each of the edge pairs with the given unichar_id.
618  bool did_something = false;
619  for (int i = edge_index; i < backward_edges->size() - 1; ++i) {
620  // Find the first edge that can be eliminated.
621  UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID;
622  while (i < backward_edges->size()) {
623  if (!DeadEdge((*backward_edges)[i])) {
624  curr_unichar_id = unichar_id_from_edge_rec((*backward_edges)[i]);
625  if (curr_unichar_id != unichar_id) return did_something;
626  if (can_be_eliminated((*backward_edges)[i])) break;
627  }
628  ++i;
629  }
630  if (i == backward_edges->size()) break;
631  const EDGE_RECORD &edge_rec = (*backward_edges)[i];
632  // Compare it to the rest of the edges with the given unichar_id.
633  for (int j = i + 1; j < backward_edges->size(); ++j) {
634  const EDGE_RECORD &next_edge_rec = (*backward_edges)[j];
635  if (DeadEdge(next_edge_rec)) continue;
636  UNICHAR_ID next_id = unichar_id_from_edge_rec(next_edge_rec);
637  if (next_id != unichar_id) break;
638  if (end_of_word_from_edge_rec(next_edge_rec) ==
639  end_of_word_from_edge_rec(edge_rec) &&
640  can_be_eliminated(next_edge_rec) &&
641  eliminate_redundant_edges(node, edge_rec, next_edge_rec)) {
642  reduced_nodes[next_node_from_edge_rec(edge_rec)] = 0;
643  did_something = true;
644  KillEdge(&(*backward_edges)[j]);
645  }
646  }
647  }
648  return did_something;
649 }
650 
652  int num_edges = edges->size();
653  if (num_edges <= 1) return;
655  sort_vec.reserve(num_edges);
656  for (int i = 0; i < num_edges; ++i) {
658  unichar_id_from_edge_rec((*edges)[i]), (*edges)[i]));
659  }
660  sort_vec.sort();
661  for (int i = 0; i < num_edges; ++i)
662  (*edges)[i] = sort_vec[i].data;
663 }
664 
666  NODE_MARKER reduced_nodes) {
667  EDGE_VECTOR &backward_edges = nodes_[node]->backward_edges;
668  sort_edges(&backward_edges);
669  if (debug_level_ > 1) {
670  tprintf("reduce_node_input(node=" REFFORMAT ")\n", node);
672  }
673 
674  EDGE_INDEX edge_index = 0;
675  while (edge_index < backward_edges.size()) {
676  if (DeadEdge(backward_edges[edge_index])) continue;
677  UNICHAR_ID unichar_id =
678  unichar_id_from_edge_rec(backward_edges[edge_index]);
679  while (reduce_lettered_edges(edge_index, unichar_id, node,
680  &backward_edges, reduced_nodes));
681  while (++edge_index < backward_edges.size()) {
682  UNICHAR_ID id = unichar_id_from_edge_rec(backward_edges[edge_index]);
683  if (!DeadEdge(backward_edges[edge_index]) && id != unichar_id) break;
684  }
685  }
686  reduced_nodes[node] = true; // mark as reduced
687 
688  if (debug_level_ > 1) {
689  tprintf("Node " REFFORMAT " after reduction:\n", node);
691  }
692 
693  for (int i = 0; i < backward_edges.size(); ++i) {
694  if (DeadEdge(backward_edges[i])) continue;
695  NODE_REF next_node = next_node_from_edge_rec(backward_edges[i]);
696  if (next_node != 0 && !reduced_nodes[next_node]) {
697  reduce_node_input(next_node, reduced_nodes);
698  }
699  }
700 }
701 
702 void Trie::print_node(NODE_REF node, int max_num_edges) const {
703  if (node == NO_EDGE) return; // nothing to print
704  TRIE_NODE_RECORD *node_ptr = nodes_[node];
705  int num_fwd = node_ptr->forward_edges.size();
706  int num_bkw = node_ptr->backward_edges.size();
707  EDGE_VECTOR *vec;
708  for (int dir = 0; dir < 2; ++dir) {
709  if (dir == 0) {
710  vec = &(node_ptr->forward_edges);
711  tprintf(REFFORMAT " (%d %d): ", node, num_fwd, num_bkw);
712  } else {
713  vec = &(node_ptr->backward_edges);
714  tprintf("\t");
715  }
716  int i;
717  for (i = 0; (dir == 0 ? i < num_fwd : i < num_bkw) &&
718  i < max_num_edges; ++i) {
719  if (DeadEdge((*vec)[i])) continue;
720  print_edge_rec((*vec)[i]);
721  tprintf(" ");
722  }
723  if (dir == 0 ? i < num_fwd : i < num_bkw) tprintf("...");
724  tprintf("\n");
725  }
726 }
727 
728 } // namespace tesseract
void initialize_patterns(UNICHARSET *unicharset)
Definition: trie.cpp:342
UNICHAR_ID alphanum_pattern_
Definition: trie.h:429
TRIE_NODES nodes_
Definition: trie.h:418
UNICHAR_ID alpha_pattern_
Definition: trie.h:427
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:493
int UNICHAR_ID
Definition: unichar.h:35
void remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.cpp:481
#define FORWARD_EDGE
Definition: dawg.h:85
int size() const
Definition: genericvector.h:71
EDGE_VECTOR forward_edges
Definition: trie.h:48
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:514
GenericVector< EDGE_INDEX > root_back_freelist_
Definition: trie.h:423
UNICHAR_ID upper_pattern_
Definition: trie.h:432
void link_edge(EDGE_RECORD *edge, NODE_REF nxt, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.h:305
void KillEdge(EDGE_RECORD *edge_rec) const
Definition: trie.h:153
const char kForceReverse[]
Definition: trie.cpp:42
void print_edge_rec(const EDGE_RECORD &edge_rec) const
Definition: trie.h:316
#define CHARS_PER_LINE
Definition: dict.h:35
const char * string() const
Definition: strngs.cpp:196
void print() const
Definition: ratngs.h:580
STRING lang_
Definition: dawg.h:302
int64_t EDGE_INDEX
Definition: trie.h:43
UNICHAR_ID lower_pattern_
Definition: trie.h:431
void remove(int index)
bool can_be_eliminated(const EDGE_RECORD &edge_rec)
Definition: trie.h:325
void reserve(int size)
int unicharset_size_
Definition: dawg.h:309
#define BACKWARD_EDGE
Definition: dawg.h:86
UNICHAR_ID unichar_id_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns UNICHAR_ID recorded in this edge.
Definition: dawg.h:230
static const char kDigitPatternUnicode[]
Definition: trie.h:75
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.cpp:468
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
Definition: dawg.cpp:71
int direction(EDGEPT *point)
Definition: vecfuncs.cpp:43
const char kDoNotReverse[]
Definition: trie.cpp:40
uint64_t num_edges_
Definition: trie.h:419
bool read_word_list(const char *filename, GenericVector< STRING > *words)
Definition: trie.cpp:295
void unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset, GenericVector< UNICHAR_ID > *vec) const
Definition: trie.cpp:359
void set_marker_flag_in_edge_rec(EDGE_RECORD *edge_rec)
Sets this edge record to be the last one in a sequence of edges.
Definition: dawg.h:241
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:486
SquishedDawg * trie_to_dawg()
Definition: trie.cpp:516
EDGE_VECTOR backward_edges
Definition: trie.h:49
void set_next_node_in_edge_rec(EDGE_RECORD *edge_rec, EDGE_REF value)
Sets the next node link for this edge in the Dawg.
Definition: dawg.h:235
#define REFFORMAT
Definition: dawg.h:93
#define WERD_END_FLAG
Definition: dawg.h:90
PermuterType perm_
Permuter code that should be used if the word is found in this Dawg.
Definition: dawg.h:304
bool add_word_to_dawg(const WERD_CHOICE &word, const GenericVector< bool > *repetitions)
Definition: trie.cpp:174
int size() const
Definition: unicharset.h:336
UNICHAR_ID digit_pattern_
Definition: trie.h:428
int given_greater_than_edge_rec(NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, const EDGE_RECORD &edge_rec) const
Definition: dawg.h:251
void unichar_insert(const char *const unichar_repr, OldUncleanUnichars old_style)
Definition: unicharset.cpp:625
bool add_edge_linkage(NODE_REF node1, NODE_REF node2, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.cpp:121
static const char * get_reverse_policy_name(RTLReversePolicy reverse_policy)
Definition: trie.cpp:57
void sort_edges(EDGE_VECTOR *edges)
Definition: trie.cpp:651
bool * NODE_MARKER
Definition: trie.h:44
bool eliminate_redundant_edges(NODE_REF node, const EDGE_RECORD &edge1, const EDGE_RECORD &edge2)
Definition: trie.cpp:563
void reverse_and_mirror_unichar_ids()
Definition: ratngs.cpp:365
int64_t NODE_REF
Definition: dawg.h:56
#define MAX_NODE_EDGES_DISPLAY
Definition: dawg.h:87
void insert(const T &t, int index)
void chomp_string(char *str)
Definition: helpers.h:83
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:507
bool read_and_add_word_list(const char *filename, const UNICHARSET &unicharset, Trie::RTLReversePolicy reverse)
Definition: trie.cpp:286
bool initialized_patterns_
Definition: trie.h:426
NODE_REF new_dawg_node()
Definition: trie.cpp:273
bool add_new_edge(NODE_REF node1, NODE_REF node2, bool repeats, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.h:355
bool has_rtl_unichar_id() const
Definition: ratngs.cpp:431
int length() const
Definition: genericvector.h:85
bool marker_flag_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the marker flag of this edge.
Definition: dawg.h:217
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: ratngs.cpp:326
const char *const RTLReversePolicyNames[]
Definition: trie.cpp:44
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
bool empty() const
Definition: genericvector.h:90
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
int length() const
Definition: ratngs.h:303
int step(const char *str) const
Definition: unicharset.cpp:232
int push_back(T object)
static const char kAlphaPatternUnicode[]
Definition: trie.h:74
bool add_word_list(const GenericVector< STRING > &words, const UNICHARSET &unicharset, Trie::RTLReversePolicy reverse_policy)
Definition: trie.cpp:318
int flag_start_bit_
Definition: dawg.h:310
Definition: strngs.h:45
static const char kLowerPatternUnicode[]
Definition: trie.h:78
uint64_t EDGE_RECORD
Definition: dawg.h:53
const STRING debug_string() const
Definition: ratngs.h:505
static const char kAlphanumPatternUnicode[]
Definition: trie.h:76
RTLReversePolicy
Definition: trie.h:63
void remove_edge(NODE_REF node1, NODE_REF node2, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.h:380
DawgType type_
Definition: dawg.h:301
void delete_data_pointers()
bool reduce_lettered_edges(EDGE_INDEX edge_index, UNICHAR_ID unichar_id, NODE_REF node, EDGE_VECTOR *backward_edges, NODE_MARKER reduced_nodes)
Definition: trie.cpp:610
bool DeadEdge(const EDGE_RECORD &edge_rec) const
Definition: trie.h:157
bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns true if this edge marks the end of a word.
Definition: dawg.h:226
void print_node(NODE_REF node, int max_num_edges) const
Definition: trie.cpp:702
const char kReverseIfHasRTL[]
Definition: trie.cpp:41
void add_word_ending(EDGE_RECORD *edge, NODE_REF the_next_node, bool repeats, UNICHAR_ID unichar_id)
Definition: trie.cpp:157
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:500
void reduce_node_input(NODE_REF node, NODE_MARKER reduced_nodes)
Definition: trie.cpp:665
static const char kUpperPatternUnicode[]
Definition: trie.h:79
NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the next node visited by following this edge.
Definition: dawg.h:213
int debug_level_
Definition: dawg.h:316
UNICHAR_ID character_class_to_pattern(char ch)
Definition: trie.cpp:381
EDGE_REF edge_char_of(NODE_REF node_ref, UNICHAR_ID unichar_id, bool word_end) const
Definition: trie.h:103
void print_all(const char *msg, int max_num_edges)
Definition: trie.h:333
int32_t length() const
Definition: strngs.cpp:191
bool read_pattern_list(const char *filename, const UNICHARSET &unicharset)
Definition: trie.cpp:399
bool edge_rec_match(NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, NODE_REF other_next_node, bool other_word_end, UNICHAR_ID other_unichar_id) const
Definition: dawg.h:272
EDGE_RECORD * EDGE_ARRAY
Definition: dawg.h:54
static const char kPuncPatternUnicode[]
Definition: trie.h:77
#define MARKER_FLAG
Definition: dawg.h:88
UNICHAR_ID punc_pattern_
Definition: trie.h:430
#define ASSERT_HOST(x)
Definition: errcode.h:84
NODE_REF next_node(EDGE_REF edge_ref) const
Definition: trie.h:132
void clear()
Definition: trie.cpp:62
static const int kSaneNumConcreteChars
Definition: trie.h:70