tesseract  5.0.0-alpha-619-ge9db
trie.h
Go to the documentation of this file.
1 /******************************************************************************
2  *
3  * File: trie.h
4  * Description: Functions to build a trie data structure.
5  * Author: Mark Seaman, SW Productivity
6  *
7  * (c) Copyright 1987, Hewlett-Packard Company.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  *****************************************************************************/
19 #ifndef TRIE_H
20 #define TRIE_H
21 
22 #include "dawg.h"
24 
25 class UNICHARSET;
26 
27 // Note: if we consider either NODE_REF or EDGE_INDEX to ever exceed
28 // max int32, we will need to change GenericVector to use int64 for size
29 // and address indices. This does not seem to be needed immediately,
30 // since currently the largest number of edges limit used by tesseract
31 // (kMaxNumEdges in wordlist2dawg.cpp) is far less than max int32.
32 // There are also int casts below to satisfy the WIN32 compiler that would
33 // need to be changed.
34 // It might be cleanest to change the types of most of the Trie/Dawg related
35 // typedefs to int and restrict the casts to extracting these values from
36 // the 64 bit EDGE_RECORD.
37 using EDGE_INDEX = int64_t ; // index of an edge in a given node
38 using NODE_MARKER = bool *;
40 
44 };
46 
47 namespace tesseract {
48 
55 class Trie : public Dawg {
56  public:
61  };
62 
63  // Minimum number of concrete characters at the beginning of user patterns.
64  static const int kSaneNumConcreteChars = 0;
65  // Various unicode whitespace characters are used to denote unichar patterns,
66  // (character classifier would never produce these whitespace characters as a
67  // valid classification).
68  static const char kAlphaPatternUnicode[];
69  static const char kDigitPatternUnicode[];
70  static const char kAlphanumPatternUnicode[];
71  static const char kPuncPatternUnicode[];
72  static const char kLowerPatternUnicode[];
73  static const char kUpperPatternUnicode[];
74 
75  static const char *get_reverse_policy_name(
76  RTLReversePolicy reverse_policy);
77 
78  // max_num_edges argument allows limiting the amount of memory this
79  // Trie can consume (if a new word insert would cause the Trie to
80  // contain more edges than max_num_edges, all the edges are cleared
81  // so that new inserts can proceed).
82  Trie(DawgType type, const STRING &lang, PermuterType perm,
83  int unicharset_size, int debug_level)
84  : Dawg(type, lang, perm, debug_level) {
85  init(unicharset_size);
86  num_edges_ = 0;
88  new_dawg_node(); // need to allocate node 0
89  initialized_patterns_ = false;
90  }
91  ~Trie() override { nodes_.delete_data_pointers(); }
92 
93  // Reset the Trie to empty.
94  void clear();
95 
97  EDGE_REF edge_char_of(NODE_REF node_ref, UNICHAR_ID unichar_id,
98  bool word_end) const override {
99  EDGE_RECORD *edge_ptr;
100  EDGE_INDEX edge_index;
101  if (!edge_char_of(node_ref, NO_EDGE, FORWARD_EDGE, word_end, unichar_id,
102  &edge_ptr, &edge_index)) return NO_EDGE;
103  return make_edge_ref(node_ref, edge_index);
104  }
105 
110  void unichar_ids_of(NODE_REF node, NodeChildVector *vec,
111  bool word_end) const override {
112  const EDGE_VECTOR &forward_edges =
113  nodes_[static_cast<int>(node)]->forward_edges;
114  for (int i = 0; i < forward_edges.size(); ++i) {
115  if (!word_end || end_of_word_from_edge_rec(forward_edges[i])) {
116  vec->push_back(NodeChild(unichar_id_from_edge_rec(forward_edges[i]),
117  make_edge_ref(node, i)));
118  }
119  }
120  }
121 
126  NODE_REF next_node(EDGE_REF edge_ref) const override {
127  if (edge_ref == NO_EDGE || num_edges_ == 0) return NO_EDGE;
128  return next_node_from_edge_rec(*deref_edge_ref(edge_ref));
129  }
130 
135  bool end_of_word(EDGE_REF edge_ref) const override {
136  if (edge_ref == NO_EDGE || num_edges_ == 0) return false;
137  return end_of_word_from_edge_rec(*deref_edge_ref(edge_ref));
138  }
139 
141  UNICHAR_ID edge_letter(EDGE_REF edge_ref) const override {
142  if (edge_ref == NO_EDGE || num_edges_ == 0) return INVALID_UNICHAR_ID;
143  return unichar_id_from_edge_rec(*deref_edge_ref(edge_ref));
144  }
145  // Sets the UNICHAR_ID in the given edge_rec to unicharset_size_, marking
146  // the edge dead.
147  void KillEdge(EDGE_RECORD* edge_rec) const {
148  *edge_rec &= ~letter_mask_;
149  *edge_rec |= (unicharset_size_ << LETTER_START_BIT);
150  }
151  bool DeadEdge(const EDGE_RECORD& edge_rec) const {
152  return unichar_id_from_edge_rec(edge_rec) == unicharset_size_;
153  }
154 
155  // Prints the contents of the node indicated by the given NODE_REF.
156  // At most max_num_edges will be printed.
157  void print_node(NODE_REF node, int max_num_edges) const override;
158 
159  // Writes edges from nodes_ to an EDGE_ARRAY and creates a SquishedDawg.
160  // Eliminates redundant edges and returns the pointer to the SquishedDawg.
161  // Note: the caller is responsible for deallocating memory associated
162  // with the returned SquishedDawg pointer.
163  SquishedDawg *trie_to_dawg();
164 
165  // Reads a list of words from the given file and adds into the Trie.
166  // Calls WERD_CHOICE::reverse_unichar_ids_if_rtl() according to the reverse
167  // policy and information in the unicharset.
168  // Returns false on error.
169  bool read_and_add_word_list(const char *filename,
170  const UNICHARSET &unicharset,
171  Trie::RTLReversePolicy reverse);
172 
173  // Reads a list of words from the given file.
174  // Returns false on error.
175  bool read_word_list(const char *filename,
176  GenericVector<STRING>* words);
177  // Adds a list of words previously read using read_word_list to the trie
178  // using the given unicharset and reverse_policy to convert to unichar-ids.
179  // Returns false on error.
180  bool add_word_list(const GenericVector<STRING> &words,
181  const UNICHARSET &unicharset,
182  Trie::RTLReversePolicy reverse_policy);
183 
184  // Inserts the list of patterns from the given file into the Trie.
185  // The pattern list file should contain one pattern per line in UTF-8 format.
186  //
187  // Each pattern can contain any non-whitespace characters, however only the
188  // patterns that contain characters from the unicharset of the corresponding
189  // language will be useful.
190  // The only meta character is '\'. To be used in a pattern as an ordinary
191  // string it should be escaped with '\' (e.g. string "C:\Documents" should
192  // be written in the patterns file as "C:\\Documents").
193  // This function supports a very limited regular expression syntax. One can
194  // express a character, a certain character class and a number of times the
195  // entity should be repeated in the pattern.
196  //
197  // To denote a character class use one of:
198  // \c - unichar for which UNICHARSET::get_isalpha() is true (character)
199  // \d - unichar for which UNICHARSET::get_isdigit() is true
200  // \n - unichar for which UNICHARSET::get_isdigit() and
201  // UNICHARSET::isalpha() are true
202  // \p - unichar for which UNICHARSET::get_ispunct() is true
203  // \a - unichar for which UNICHARSET::get_islower() is true
204  // \A - unichar for which UNICHARSET::get_isupper() is true
205  //
206  // \* could be specified after each character or pattern to indicate that
207  // the character/pattern can be repeated any number of times before the next
208  // character/pattern occurs.
209  //
210  // Examples:
211  // 1-8\d\d-GOOG-411 will be expanded to strings:
212  // 1-800-GOOG-411, 1-801-GOOG-411, ... 1-899-GOOG-411.
213  //
214  // http://www.\n\*.com will be expanded to strings like:
215  // http://www.a.com http://www.a123.com ... http://www.ABCDefgHIJKLMNop.com
216  //
217  // Note: In choosing which patterns to include please be aware of the fact
218  // providing very generic patterns will make tesseract run slower.
219  // For example \n\* at the beginning of the pattern will make Tesseract
220  // consider all the combinations of proposed character choices for each
221  // of the segmentations, which will be unacceptably slow.
222  // Because of potential problems with speed that could be difficult to
223  // identify, each user pattern has to have at least kSaneNumConcreteChars
224  // concrete characters from the unicharset at the beginning.
225  bool read_pattern_list(const char *filename, const UNICHARSET &unicharset);
226 
227  // Initializes the values of *_pattern_ unichar ids.
228  // This function should be called before calling read_pattern_list().
229  void initialize_patterns(UNICHARSET *unicharset);
230 
231  // Fills in the given unichar id vector with the unichar ids that represent
232  // the patterns of the character classes of the given unichar_id.
233  void unichar_id_to_patterns(UNICHAR_ID unichar_id,
234  const UNICHARSET &unicharset,
235  GenericVector<UNICHAR_ID> *vec) const override;
236 
237  // Returns the given EDGE_REF if the EDGE_RECORD that it points to has
238  // a self loop and the given unichar_id matches the unichar_id stored in the
239  // EDGE_RECORD, returns NO_EDGE otherwise.
241  UNICHAR_ID unichar_id,
242  bool word_end) const override {
243  if (edge_ref == NO_EDGE) return NO_EDGE;
244  EDGE_RECORD *edge_rec = deref_edge_ref(edge_ref);
245  return (marker_flag_from_edge_rec(*edge_rec) &&
246  unichar_id == unichar_id_from_edge_rec(*edge_rec) &&
247  word_end == end_of_word_from_edge_rec(*edge_rec)) ?
248  edge_ref : NO_EDGE;
249  }
250 
251  // Adds a word to the Trie (creates the necessary nodes and edges).
252  //
253  // If repetitions vector is not nullptr, each entry in the vector indicates
254  // whether the unichar id with the corresponding index in the word is allowed
255  // to repeat an unlimited number of times. For each entry that is true, MARKER
256  // flag of the corresponding edge created for this unichar id is set to true).
257  //
258  // Return true if add succeeded, false otherwise (e.g. when a word contained
259  // an invalid unichar id or the trie was getting too large and was cleared).
260  bool add_word_to_dawg(const WERD_CHOICE &word,
261  const GenericVector<bool> *repetitions);
262  bool add_word_to_dawg(const WERD_CHOICE &word) {
263  return add_word_to_dawg(word, nullptr);
264  }
265 
266  protected:
267  // The structure of an EDGE_REF for Trie edges is as follows:
268  // [LETTER_START_BIT, flag_start_bit_):
269  // edge index in *_edges in a TRIE_NODE_RECORD
270  // [flag_start_bit, 30th bit]: node index in nodes (TRIE_NODES vector)
271  //
272  // With this arrangement there are enough bits to represent edge indices
273  // (each node can have at most unicharset_size_ forward edges and
274  // the position of flag_start_bit is set to be log2(unicharset_size_)).
275  // It is also possible to accommodate a maximum number of nodes that is at
276  // least as large as that of the SquishedDawg representation (in SquishedDawg
277  // each EDGE_RECORD has 32-(flag_start_bit+NUM_FLAG_BITS) bits to represent
278  // the next node index).
279  //
280 
281  // Returns the pointer to EDGE_RECORD after decoding the location
282  // of the edge from the information in the given EDGE_REF.
283  // This function assumes that EDGE_REF holds valid node/edge indices.
284  inline EDGE_RECORD *deref_edge_ref(EDGE_REF edge_ref) const {
285  int edge_index = static_cast<int>(
286  (edge_ref & letter_mask_) >> LETTER_START_BIT);
287  int node_index = static_cast<int>(
288  (edge_ref & deref_node_index_mask_) >> flag_start_bit_);
289  TRIE_NODE_RECORD *node_rec = nodes_[node_index];
290  return &(node_rec->forward_edges[edge_index]);
291  }
293  inline EDGE_REF make_edge_ref(NODE_REF node_index,
294  EDGE_INDEX edge_index) const {
295  return ((node_index << flag_start_bit_) |
296  (edge_index << LETTER_START_BIT));
297  }
299  inline void link_edge(EDGE_RECORD *edge, NODE_REF nxt, bool repeats,
300  int direction, bool word_end, UNICHAR_ID unichar_id) {
301  EDGE_RECORD flags = 0;
302  if (repeats) flags |= MARKER_FLAG;
303  if (word_end) flags |= WERD_END_FLAG;
304  if (direction == BACKWARD_EDGE) flags |= DIRECTION_FLAG;
305  *edge = ((nxt << next_node_start_bit_) |
306  (static_cast<EDGE_RECORD>(flags) << flag_start_bit_) |
307  (static_cast<EDGE_RECORD>(unichar_id) << LETTER_START_BIT));
308  }
310  inline void print_edge_rec(const EDGE_RECORD &edge_rec) const {
311  tprintf("|" REFFORMAT "|%s%s%s|%d|", next_node_from_edge_rec(edge_rec),
312  marker_flag_from_edge_rec(edge_rec) ? "R," : "",
313  (direction_from_edge_rec(edge_rec) == FORWARD_EDGE) ? "F" : "B",
314  end_of_word_from_edge_rec(edge_rec) ? ",E" : "",
315  unichar_id_from_edge_rec(edge_rec));
316  }
317  // Returns true if the next node in recorded the given EDGE_RECORD
318  // has exactly one forward edge.
319  inline bool can_be_eliminated(const EDGE_RECORD &edge_rec) {
320  NODE_REF node_ref = next_node_from_edge_rec(edge_rec);
321  return (node_ref != NO_EDGE &&
322  nodes_[static_cast<int>(node_ref)]->forward_edges.size() == 1);
323  }
324 
325  // Prints the contents of the Trie.
326  // At most max_num_edges will be printed for each node.
327  void print_all(const char* msg, int max_num_edges) {
328  tprintf("\n__________________________\n%s\n", msg);
329  for (int i = 0; i < nodes_.size(); ++i) print_node(i, max_num_edges);
330  tprintf("__________________________\n");
331  }
332 
333  // Finds the edge with the given direction, word_end and unichar_id
334  // in the node indicated by node_ref. Fills in the pointer to the
335  // EDGE_RECORD and the index of the edge with the the values
336  // corresponding to the edge found. Returns true if an edge was found.
337  bool edge_char_of(NODE_REF node_ref, NODE_REF next_node,
338  int direction, bool word_end, UNICHAR_ID unichar_id,
339  EDGE_RECORD **edge_ptr, EDGE_INDEX *edge_index) const;
340 
341  // Adds an single edge linkage between node1 and node2 in the direction
342  // indicated by direction argument.
343  bool add_edge_linkage(NODE_REF node1, NODE_REF node2, bool repeats,
344  int direction, bool word_end,
345  UNICHAR_ID unichar_id);
346 
347  // Adds forward edge linkage from node1 to node2 and the corresponding
348  // backward edge linkage in the other direction.
349  bool add_new_edge(NODE_REF node1, NODE_REF node2,
350  bool repeats, bool word_end, UNICHAR_ID unichar_id) {
351  return (add_edge_linkage(node1, node2, repeats, FORWARD_EDGE,
352  word_end, unichar_id) &&
353  add_edge_linkage(node2, node1, repeats, BACKWARD_EDGE,
354  word_end, unichar_id));
355  }
356 
357  // Sets the word ending flags in an already existing edge pair.
358  // Returns true on success.
359  void add_word_ending(EDGE_RECORD *edge,
360  NODE_REF the_next_node,
361  bool repeats,
362  UNICHAR_ID unichar_id);
363 
364  // Allocates space for a new node in the Trie.
366 
367  // Removes a single edge linkage to between node1 and node2 in the
368  // direction indicated by direction argument.
369  void remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction,
370  bool word_end, UNICHAR_ID unichar_id);
371 
372  // Removes forward edge linkage from node1 to node2 and the corresponding
373  // backward edge linkage in the other direction.
374  void remove_edge(NODE_REF node1, NODE_REF node2,
375  bool word_end, UNICHAR_ID unichar_id) {
376  remove_edge_linkage(node1, node2, FORWARD_EDGE, word_end, unichar_id);
377  remove_edge_linkage(node2, node1, BACKWARD_EDGE, word_end, unichar_id);
378  }
379 
380  // Compares edge1 and edge2 in the given node to see if they point to two
381  // next nodes that could be collapsed. If they do, performs the reduction
382  // and returns true.
383  bool eliminate_redundant_edges(NODE_REF node, const EDGE_RECORD &edge1,
384  const EDGE_RECORD &edge2);
385 
386  // Assuming that edge_index indicates the first edge in a group of edges
387  // in this node with a particular letter value, looks through these edges
388  // to see if any of them can be collapsed. If so does it. Returns to the
389  // caller when all edges with this letter have been reduced.
390  // Returns true if further reduction is possible with this same letter.
391  bool reduce_lettered_edges(EDGE_INDEX edge_index,
392  UNICHAR_ID unichar_id,
393  NODE_REF node,
394  EDGE_VECTOR* backward_edges,
395  NODE_MARKER reduced_nodes);
396 
403  void sort_edges(EDGE_VECTOR *edges);
404 
406  void reduce_node_input(NODE_REF node, NODE_MARKER reduced_nodes);
407 
408  // Returns the pattern unichar id for the given character class code.
410 
411  // Member variables
412  TRIE_NODES nodes_; // vector of nodes in the Trie
413  // Freelist of edges in the root backwards node that were previously zeroed.
415  uint64_t num_edges_; // sum of all edges (forward and backward)
416  uint64_t deref_direction_mask_; // mask for EDGE_REF to extract direction
417  uint64_t deref_node_index_mask_; // mask for EDGE_REF to extract node index
418  // Variables for translating character class codes denoted in user patterns
419  // file to the unichar ids used to represent them in a Trie.
427 };
428 } // namespace tesseract
429 
430 #endif
GenericVector::delete_data_pointers
void delete_data_pointers()
Definition: genericvector.h:872
TRIE_NODE_RECORD::forward_edges
EDGE_VECTOR forward_edges
Definition: trie.h:41
tesseract::Trie::edge_char_of
EDGE_REF edge_char_of(NODE_REF node_ref, UNICHAR_ID unichar_id, bool word_end) const override
Definition: trie.h:96
tesseract::Trie::remove_edge
void remove_edge(NODE_REF node1, NODE_REF node2, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.h:373
tesseract::Trie::kLowerPatternUnicode
static const char kLowerPatternUnicode[]
Definition: trie.h:71
tesseract::Trie::upper_pattern_
UNICHAR_ID upper_pattern_
Definition: trie.h:424
tesseract::Trie::deref_node_index_mask_
uint64_t deref_node_index_mask_
Definition: trie.h:416
tesseract::Trie::kDigitPatternUnicode
static const char kDigitPatternUnicode[]
Definition: trie.h:68
tesseract::Trie::character_class_to_pattern
UNICHAR_ID character_class_to_pattern(char ch)
Definition: trie.cpp:390
tesseract::Trie::kAlphanumPatternUnicode
static const char kAlphanumPatternUnicode[]
Definition: trie.h:69
tesseract::Dawg::type
DawgType type() const
Definition: dawg.h:122
tesseract::Dawg::flag_start_bit_
int flag_start_bit_
Definition: dawg.h:307
WERD_CHOICE
Definition: ratngs.h:261
tesseract::Trie::initialize_patterns
void initialize_patterns(UNICHARSET *unicharset)
Definition: trie.cpp:351
EDGE_INDEX
int64_t EDGE_INDEX
Definition: trie.h:36
tesseract::Dawg::next_node_from_edge_rec
NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the next node visited by following this edge.
Definition: dawg.h:207
tesseract::Dawg::lang
const STRING & lang() const
Definition: dawg.h:123
PermuterType
PermuterType
Definition: ratngs.h:230
tesseract::Trie::read_pattern_list
bool read_pattern_list(const char *filename, const UNICHARSET &unicharset)
Definition: trie.cpp:408
tesseract::Trie::end_of_word
bool end_of_word(EDGE_REF edge_ref) const override
Definition: trie.h:134
tesseract::Dawg::end_of_word_from_edge_rec
bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns true if this edge marks the end of a word.
Definition: dawg.h:220
tesseract::NodeChildVector
GenericVector< NodeChild > NodeChildVector
Definition: dawg.h:62
STRING
Definition: strngs.h:45
tesseract::Trie::kPuncPatternUnicode
static const char kPuncPatternUnicode[]
Definition: trie.h:70
tesseract::Trie::kAlphaPatternUnicode
static const char kAlphaPatternUnicode[]
Definition: trie.h:67
tesseract::Trie::print_node
void print_node(NODE_REF node, int max_num_edges) const override
Definition: trie.cpp:711
tesseract::Trie::alphanum_pattern_
UNICHAR_ID alphanum_pattern_
Definition: trie.h:421
tesseract::Trie::next_node
NODE_REF next_node(EDGE_REF edge_ref) const override
Definition: trie.h:125
tesseract::Trie::add_word_ending
void add_word_ending(EDGE_RECORD *edge, NODE_REF the_next_node, bool repeats, UNICHAR_ID unichar_id)
Definition: trie.cpp:166
tesseract::Trie::add_new_edge
bool add_new_edge(NODE_REF node1, NODE_REF node2, bool repeats, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.h:348
tesseract::Trie::reduce_node_input
void reduce_node_input(NODE_REF node, NODE_MARKER reduced_nodes)
Definition: trie.cpp:674
FORWARD_EDGE
#define FORWARD_EDGE
Definition: dawg.h:79
tesseract::Trie::edge_letter
UNICHAR_ID edge_letter(EDGE_REF edge_ref) const override
Definition: trie.h:140
tesseract::Dawg::unichar_id_from_edge_rec
UNICHAR_ID unichar_id_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns UNICHAR_ID recorded in this edge.
Definition: dawg.h:224
tesseract::Trie::kSaneNumConcreteChars
static const int kSaneNumConcreteChars
Definition: trie.h:63
tesseract::Trie::read_word_list
bool read_word_list(const char *filename, GenericVector< STRING > *words)
Definition: trie.cpp:304
tesseract::Trie::make_edge_ref
EDGE_REF make_edge_ref(NODE_REF node_index, EDGE_INDEX edge_index) const
Definition: trie.h:292
genericvector.h
GenericVector::push_back
int push_back(T object)
Definition: genericvector.h:799
tesseract::Trie::print_all
void print_all(const char *msg, int max_num_edges)
Definition: trie.h:326
tesseract::Trie::alpha_pattern_
UNICHAR_ID alpha_pattern_
Definition: trie.h:419
tesseract::Trie::initialized_patterns_
bool initialized_patterns_
Definition: trie.h:425
tesseract::Trie::punc_pattern_
UNICHAR_ID punc_pattern_
Definition: trie.h:422
dawg.h
tesseract::Trie::clear
void clear()
Definition: trie.cpp:71
tesseract::Trie::digit_pattern_
UNICHAR_ID digit_pattern_
Definition: trie.h:420
tesseract::Trie::sort_edges
void sort_edges(EDGE_VECTOR *edges)
Definition: trie.cpp:660
tesseract::Trie::RRP_REVERSE_IF_HAS_RTL
Definition: trie.h:58
tesseract::Trie::RRP_DO_NO_REVERSE
Definition: trie.h:57
tesseract::Dawg::init
void init(int unicharset_size)
Definition: dawg.cpp:190
tesseract::Trie::link_edge
void link_edge(EDGE_RECORD *edge, NODE_REF nxt, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.h:298
tesseract::Dawg::next_node_start_bit_
int next_node_start_bit_
Definition: dawg.h:308
EDGE_RECORD
uint64_t EDGE_RECORD
Definition: dawg.h:47
tesseract::Dawg::marker_flag_from_edge_rec
bool marker_flag_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the marker flag of this edge.
Definition: dawg.h:211
DIRECTION_FLAG
#define DIRECTION_FLAG
Definition: dawg.h:83
tesseract::Trie::trie_to_dawg
SquishedDawg * trie_to_dawg()
Definition: trie.cpp:525
UNICHARSET
Definition: unicharset.h:145
tesseract::Trie::eliminate_redundant_edges
bool eliminate_redundant_edges(NODE_REF node, const EDGE_RECORD &edge1, const EDGE_RECORD &edge2)
Definition: trie.cpp:572
tesseract::Trie::Trie
Trie(DawgType type, const STRING &lang, PermuterType perm, int unicharset_size, int debug_level)
Definition: trie.h:81
tesseract::Trie::add_word_to_dawg
bool add_word_to_dawg(const WERD_CHOICE &word, const GenericVector< bool > *repetitions)
Definition: trie.cpp:183
tesseract::Trie::RTLReversePolicy
RTLReversePolicy
Definition: trie.h:56
tesseract
Definition: baseapi.h:65
REFFORMAT
#define REFFORMAT
Definition: dawg.h:87
tesseract::DawgType
DawgType
Definition: dawg.h:66
tesseract::Trie::pattern_loop_edge
EDGE_REF pattern_loop_edge(EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const override
Definition: trie.h:239
LETTER_START_BIT
#define LETTER_START_BIT
Definition: dawg.h:85
tesseract::Trie::KillEdge
void KillEdge(EDGE_RECORD *edge_rec) const
Definition: trie.h:146
UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:36
TRIE_NODE_RECORD
Definition: trie.h:40
GenericVector< EDGE_RECORD >
tesseract::Trie::nodes_
TRIE_NODES nodes_
Definition: trie.h:411
tesseract::Trie::kUpperPatternUnicode
static const char kUpperPatternUnicode[]
Definition: trie.h:72
tesseract::Dawg
Definition: dawg.h:113
tesseract::Trie::remove_edge_linkage
void remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.cpp:490
tesseract::Trie::deref_direction_mask_
uint64_t deref_direction_mask_
Definition: trie.h:415
MARKER_FLAG
#define MARKER_FLAG
Definition: dawg.h:82
tesseract::Trie::get_reverse_policy_name
static const char * get_reverse_policy_name(RTLReversePolicy reverse_policy)
Definition: trie.cpp:66
tesseract::Trie::add_edge_linkage
bool add_edge_linkage(NODE_REF node1, NODE_REF node2, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.cpp:130
EDGE_REF
int64_t EDGE_REF
Definition: dawg.h:49
tesseract::Dawg::letter_mask_
uint64_t letter_mask_
Definition: dawg.h:305
tesseract::Trie::root_back_freelist_
GenericVector< EDGE_INDEX > root_back_freelist_
Definition: trie.h:413
tesseract::Trie::add_word_list
bool add_word_list(const GenericVector< STRING > &words, const UNICHARSET &unicharset, Trie::RTLReversePolicy reverse_policy)
Definition: trie.cpp:327
tesseract::Trie::unichar_id_to_patterns
void unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset, GenericVector< UNICHAR_ID > *vec) const override
Definition: trie.cpp:368
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
tesseract::Trie::print_edge_rec
void print_edge_rec(const EDGE_RECORD &edge_rec) const
Definition: trie.h:309
NODE_MARKER
bool * NODE_MARKER
Definition: trie.h:37
WERD_END_FLAG
#define WERD_END_FLAG
Definition: dawg.h:84
tesseract::Dawg::direction_from_edge_rec
int direction_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the direction flag of this edge.
Definition: dawg.h:215
tesseract::Trie::can_be_eliminated
bool can_be_eliminated(const EDGE_RECORD &edge_rec)
Definition: trie.h:318
tesseract::Trie::reduce_lettered_edges
bool reduce_lettered_edges(EDGE_INDEX edge_index, UNICHAR_ID unichar_id, NODE_REF node, EDGE_VECTOR *backward_edges, NODE_MARKER reduced_nodes)
Definition: trie.cpp:619
tesseract::Trie::RRP_FORCE_REVERSE
Definition: trie.h:59
tesseract::Dawg::unicharset_size_
int unicharset_size_
Definition: dawg.h:306
tesseract::Trie::~Trie
~Trie() override
Definition: trie.h:90
BACKWARD_EDGE
#define BACKWARD_EDGE
Definition: dawg.h:80
TRIE_NODE_RECORD::backward_edges
EDGE_VECTOR backward_edges
Definition: trie.h:42
GenericVector::size
int size() const
Definition: genericvector.h:71
tesseract::Trie::num_edges_
uint64_t num_edges_
Definition: trie.h:414
tesseract::Trie::deref_edge_ref
EDGE_RECORD * deref_edge_ref(EDGE_REF edge_ref) const
Definition: trie.h:283
tesseract::Trie::DeadEdge
bool DeadEdge(const EDGE_RECORD &edge_rec) const
Definition: trie.h:150
tesseract::Trie::lower_pattern_
UNICHAR_ID lower_pattern_
Definition: trie.h:423
tesseract::Trie::new_dawg_node
NODE_REF new_dawg_node()
Definition: trie.cpp:282
tesseract::Trie::read_and_add_word_list
bool read_and_add_word_list(const char *filename, const UNICHARSET &unicharset, Trie::RTLReversePolicy reverse)
Definition: trie.cpp:295
tesseract::Trie::unichar_ids_of
void unichar_ids_of(NODE_REF node, NodeChildVector *vec, bool word_end) const override
Definition: trie.h:109
NODE_REF
int64_t NODE_REF
Definition: dawg.h:50