tesseract
5.0.0-alpha-619-ge9db
|
#include <trie.h>
|
| Trie (DawgType type, const STRING &lang, PermuterType perm, int unicharset_size, int debug_level) |
|
| ~Trie () override |
|
void | clear () |
|
EDGE_REF | edge_char_of (NODE_REF node_ref, UNICHAR_ID unichar_id, bool word_end) const override |
|
void | unichar_ids_of (NODE_REF node, NodeChildVector *vec, bool word_end) const override |
|
NODE_REF | next_node (EDGE_REF edge_ref) const override |
|
bool | end_of_word (EDGE_REF edge_ref) const override |
|
UNICHAR_ID | edge_letter (EDGE_REF edge_ref) const override |
|
void | KillEdge (EDGE_RECORD *edge_rec) const |
|
bool | DeadEdge (const EDGE_RECORD &edge_rec) const |
|
void | print_node (NODE_REF node, int max_num_edges) const override |
|
SquishedDawg * | trie_to_dawg () |
|
bool | read_and_add_word_list (const char *filename, const UNICHARSET &unicharset, Trie::RTLReversePolicy reverse) |
|
bool | read_word_list (const char *filename, GenericVector< STRING > *words) |
|
bool | add_word_list (const GenericVector< STRING > &words, const UNICHARSET &unicharset, Trie::RTLReversePolicy reverse_policy) |
|
bool | read_pattern_list (const char *filename, const UNICHARSET &unicharset) |
|
void | initialize_patterns (UNICHARSET *unicharset) |
|
void | unichar_id_to_patterns (UNICHAR_ID unichar_id, const UNICHARSET &unicharset, GenericVector< UNICHAR_ID > *vec) const override |
|
EDGE_REF | pattern_loop_edge (EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const override |
|
bool | add_word_to_dawg (const WERD_CHOICE &word, const GenericVector< bool > *repetitions) |
|
bool | add_word_to_dawg (const WERD_CHOICE &word) |
|
DawgType | type () const |
|
const STRING & | lang () const |
|
PermuterType | permuter () const |
|
virtual | ~Dawg () |
|
bool | word_in_dawg (const WERD_CHOICE &word) const |
| Returns true if the given word is in the Dawg. More...
|
|
bool | prefix_in_dawg (const WERD_CHOICE &prefix, bool requires_complete) const |
|
int | check_for_words (const char *filename, const UNICHARSET &unicharset, bool enable_wildcard) const |
|
void | iterate_words (const UNICHARSET &unicharset, std::function< void(const WERD_CHOICE *)> cb) const |
|
void | iterate_words (const UNICHARSET &unicharset, std::function< void(const char *)> cb) const |
|
|
EDGE_RECORD * | deref_edge_ref (EDGE_REF edge_ref) const |
|
EDGE_REF | make_edge_ref (NODE_REF node_index, EDGE_INDEX edge_index) const |
|
void | link_edge (EDGE_RECORD *edge, NODE_REF nxt, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id) |
|
void | print_edge_rec (const EDGE_RECORD &edge_rec) const |
|
bool | can_be_eliminated (const EDGE_RECORD &edge_rec) |
|
void | print_all (const char *msg, int max_num_edges) |
|
bool | edge_char_of (NODE_REF node_ref, NODE_REF next_node, int direction, bool word_end, UNICHAR_ID unichar_id, EDGE_RECORD **edge_ptr, EDGE_INDEX *edge_index) const |
|
bool | add_edge_linkage (NODE_REF node1, NODE_REF node2, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id) |
|
bool | add_new_edge (NODE_REF node1, NODE_REF node2, bool repeats, bool word_end, UNICHAR_ID unichar_id) |
|
void | add_word_ending (EDGE_RECORD *edge, NODE_REF the_next_node, bool repeats, UNICHAR_ID unichar_id) |
|
NODE_REF | new_dawg_node () |
|
void | remove_edge_linkage (NODE_REF node1, NODE_REF node2, int direction, bool word_end, UNICHAR_ID unichar_id) |
|
void | remove_edge (NODE_REF node1, NODE_REF node2, bool word_end, UNICHAR_ID unichar_id) |
|
bool | eliminate_redundant_edges (NODE_REF node, const EDGE_RECORD &edge1, const EDGE_RECORD &edge2) |
|
bool | reduce_lettered_edges (EDGE_INDEX edge_index, UNICHAR_ID unichar_id, NODE_REF node, EDGE_VECTOR *backward_edges, NODE_MARKER reduced_nodes) |
|
void | sort_edges (EDGE_VECTOR *edges) |
|
void | reduce_node_input (NODE_REF node, NODE_MARKER reduced_nodes) |
|
UNICHAR_ID | character_class_to_pattern (char ch) |
|
| Dawg (DawgType type, const STRING &lang, PermuterType perm, int debug_level) |
|
NODE_REF | next_node_from_edge_rec (const EDGE_RECORD &edge_rec) const |
| Returns the next node visited by following this edge. More...
|
|
bool | marker_flag_from_edge_rec (const EDGE_RECORD &edge_rec) const |
| Returns the marker flag of this edge. More...
|
|
int | direction_from_edge_rec (const EDGE_RECORD &edge_rec) const |
| Returns the direction flag of this edge. More...
|
|
bool | end_of_word_from_edge_rec (const EDGE_RECORD &edge_rec) const |
| Returns true if this edge marks the end of a word. More...
|
|
UNICHAR_ID | unichar_id_from_edge_rec (const EDGE_RECORD &edge_rec) const |
| Returns UNICHAR_ID recorded in this edge. More...
|
|
void | set_next_node_in_edge_rec (EDGE_RECORD *edge_rec, EDGE_REF value) |
| Sets the next node link for this edge in the Dawg. More...
|
|
void | set_marker_flag_in_edge_rec (EDGE_RECORD *edge_rec) |
| Sets this edge record to be the last one in a sequence of edges. More...
|
|
int | given_greater_than_edge_rec (NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, const EDGE_RECORD &edge_rec) const |
|
bool | edge_rec_match (NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, NODE_REF other_next_node, bool other_word_end, UNICHAR_ID other_unichar_id) const |
|
void | init (int unicharset_size) |
|
bool | match_words (WERD_CHOICE *word, int32_t index, NODE_REF node, UNICHAR_ID wildcard) const |
|
void | iterate_words_rec (const WERD_CHOICE &word_so_far, NODE_REF to_explore, std::function< void(const WERD_CHOICE *)> cb) const |
|
Concrete class for the Trie data structure (extends the Dawg base class), which stores a list of words and allows new words to be added dynamically. This class stores a vector of pointers to TRIE_NODE_RECORDs, each of which has a vector of forward edges and a vector of backward edges.
Definition at line 54 of file trie.h.
◆ RTLReversePolicy
Enumerator |
---|
RRP_DO_NO_REVERSE | |
RRP_REVERSE_IF_HAS_RTL | |
RRP_FORCE_REVERSE | |
Definition at line 56 of file trie.h.
◆ Trie()
Definition at line 81 of file trie.h.
85 init(unicharset_size);
◆ ~Trie()
tesseract::Trie::~Trie |
( |
| ) |
|
|
inline override |
◆ add_edge_linkage()
bool tesseract::Trie::add_edge_linkage |
( |
NODE_REF |
node1, |
|
|
NODE_REF |
node2, |
|
|
bool |
repeats, |
|
|
int |
direction, |
|
|
bool |
word_end, |
|
|
UNICHAR_ID |
unichar_id |
|
) |
| |
|
protected |
Definition at line 130 of file trie.cpp.
136 (*vec)[edge_index] = edge_rec;
137 }
else if (search_index < vec->size()) {
138 vec->insert(edge_rec, search_index);
140 vec->push_back(edge_rec);
158 unichar_id, &back_edge_ptr, &back_edge_index));
◆ add_new_edge()
Definition at line 348 of file trie.h.
352 word_end, unichar_id) &&
354 word_end, unichar_id));
◆ add_word_ending()
Definition at line 166 of file trie.cpp.
170 if (word.length() <= 0)
return false;
171 if (repetitions !=
nullptr)
ASSERT_HOST(repetitions->size() == word.length());
173 for (
int i = 0; i < word.length(); ++i) {
174 if (word.unichar_id(i) < 0 ||
181 bool marker_flag =
false;
◆ add_word_list()
Definition at line 327 of file trie.cpp.
327 : word
'%s' not in DAWG after adding it\n
",
336 void Trie::initialize_patterns(UNICHARSET *unicharset) {
337 unicharset->unichar_insert(kAlphaPatternUnicode);
338 alpha_pattern_ = unicharset->unichar_to_id(kAlphaPatternUnicode);
339 unicharset->unichar_insert(kDigitPatternUnicode);
340 digit_pattern_ = unicharset->unichar_to_id(kDigitPatternUnicode);
341 unicharset->unichar_insert(kAlphanumPatternUnicode);
342 alphanum_pattern_ = unicharset->unichar_to_id(kAlphanumPatternUnicode);
343 unicharset->unichar_insert(kPuncPatternUnicode);
344 punc_pattern_ = unicharset->unichar_to_id(kPuncPatternUnicode);
345 unicharset->unichar_insert(kLowerPatternUnicode);
346 lower_pattern_ = unicharset->unichar_to_id(kLowerPatternUnicode);
347 unicharset->unichar_insert(kUpperPatternUnicode);
348 upper_pattern_ = unicharset->unichar_to_id(kUpperPatternUnicode);
349 initialized_patterns_ = true;
◆ add_word_to_dawg() [1/2]
bool tesseract::Trie::add_word_to_dawg |
( |
const WERD_CHOICE & |
word | ) |
|
|
inline |
◆ add_word_to_dawg() [2/2]
Definition at line 183 of file trie.cpp.
191 UNICHAR_ID unichar_id;
192 for (i = 0; i < word.length() - 1; ++i) {
193 unichar_id = word.unichar_id(i);
194 marker_flag = (repetitions != nullptr) ? (*repetitions)[i] : false;
195 if (debug_level_ > 1) tprintf("Adding letter %d\n
", unichar_id);
196 if (still_finding_chars) {
197 found = edge_char_of(last_node, NO_EDGE, FORWARD_EDGE, word_end,
198 unichar_id, &edge_ptr, &edge_index);
199 if (found && debug_level_ > 1) {
200 tprintf("exploring edge
" REFFORMAT " in node
" REFFORMAT "\n
",
201 edge_index, last_node);
204 still_finding_chars = false;
205 } else if (next_node_from_edge_rec(*edge_ptr) == 0) {
206 // We hit the end of an existing word, but the new word is longer.
207 // In this case we have to disconnect the existing word from the
208 // backwards root node, mark the current position as end-of-word
209 // and add new nodes for the increased length. Disconnecting the
210 // existing word from the backwards root node requires a linear
211 // search, so it is much faster to add the longest words first,
212 // to avoid having to come here.
214 still_finding_chars = false;
215 remove_edge(last_node, 0, word_end, unichar_id);
217 // We have to add a new branch here for the new word.
218 if (marker_flag) set_marker_flag_in_edge_rec(edge_ptr);
219 last_node = next_node_from_edge_rec(*edge_ptr);
222 if (!still_finding_chars) {
223 the_next_node = new_dawg_node();
224 if (debug_level_ > 1)
225 tprintf("adding node
" REFFORMAT "\n
", the_next_node);
226 if (the_next_node == 0) {
230 if (!add_new_edge(last_node, the_next_node,
231 marker_flag, word_end, unichar_id)) {
236 last_node = the_next_node;
240 unichar_id = word.unichar_id(i);
241 marker_flag = (repetitions != nullptr) ? (*repetitions)[i] : false;
242 if (debug_level_ > 1) tprintf("Adding letter %d\n
", unichar_id);
243 if (still_finding_chars &&
244 edge_char_of(last_node, NO_EDGE, FORWARD_EDGE, false,
245 unichar_id, &edge_ptr, &edge_index)) {
246 // An extension of this word already exists in the trie, so we
247 // only have to add the ending flags in both directions.
248 add_word_ending(edge_ptr, next_node_from_edge_rec(*edge_ptr),
249 marker_flag, unichar_id);
251 // Add a link to node 0. All leaves connect to node 0 so the back links can
252 // be used in reduction to a dawg. This root backward node has one edge
253 // entry for every word, (except prefixes of longer words) so it is huge.
255 !add_new_edge(last_node, the_next_node, marker_flag, true, unichar_id))
259 tprintf("Re-initializing document dictionary...\n
");
267 NODE_REF Trie::new_dawg_node() {
268 auto *node = new TRIE_NODE_RECORD();
269 nodes_.push_back(node);
270 return nodes_.size() - 1;
273 // Sort function to sort words by decreasing order of length.
274 static int sort_strings_by_dec_length(const void* v1, const void* v2) {
275 const auto *s1 = static_cast<const STRING *>(v1);
276 const auto *s2 = static_cast<const STRING *>(v2);
277 return s2->length() - s1->length();
280 bool Trie::read_and_add_word_list(const char *filename,
◆ can_be_eliminated()
bool tesseract::Trie::can_be_eliminated |
( |
const EDGE_RECORD & |
edge_rec | ) |
|
|
inline protected |
Definition at line 318 of file trie.h.
321 return (node_ref != NO_EDGE &&
322 nodes_[static_cast<int>(node_ref)]->forward_edges.
size() == 1);
◆ character_class_to_pattern()
UNICHAR_ID tesseract::Trie::character_class_to_pattern |
( |
char |
ch | ) |
|
|
protected |
Definition at line 390 of file trie.cpp.
396 tprintf(
"please call initialize_patterns() before read_pattern_list()\n");
400 FILE *pattern_file = fopen(filename,
"rb");
401 if (pattern_file ==
nullptr) {
402 tprintf(
"Error opening pattern file %s\n", filename);
406 int pattern_count = 0;
◆ clear()
void tesseract::Trie::clear |
( |
| ) |
|
Definition at line 71 of file trie.cpp.
75 if (node_ref == NO_EDGE)
return false;
◆ DeadEdge()
bool tesseract::Trie::DeadEdge |
( |
const EDGE_RECORD & |
edge_rec | ) |
const |
|
inline |
◆ deref_edge_ref()
Definition at line 283 of file trie.h.
285 int edge_index = static_cast<int>(
287 int node_index = static_cast<int>(
◆ edge_char_of() [1/2]
Definition at line 79 of file trie.cpp.
85 while (start <= end) {
86 k = (start + end) >> 1;
90 *edge_ptr = &(vec[k]);
93 }
else if (compare == 1) {
100 for (
int i = 0; i < vec_size; ++i) {
106 *edge_ptr = &(edge_rec);
116 int direction,
bool word_end,
119 &(
nodes_[node1]->forward_edges) : &(
nodes_[node1]->backward_edges);
123 while (search_index < vec->size() &&
125 (*vec)[search_index]) == 1) {
◆ edge_char_of() [2/2]
Returns the edge that corresponds to the letter out of this node.
Implements tesseract::Dawg.
Definition at line 96 of file trie.h.
102 &edge_ptr, &edge_index))
return NO_EDGE;
◆ edge_letter()
Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.
Implements tesseract::Dawg.
Definition at line 140 of file trie.h.
142 if (edge_ref == NO_EDGE ||
num_edges_ == 0)
return INVALID_UNICHAR_ID;
◆ eliminate_redundant_edges()
Definition at line 572 of file trie.cpp.
579 const EDGE_RECORD &bkw_edge = next_node2_ptr->backward_edges[i];
585 curr_word_end, curr_unichar_id);
588 curr_word_end, curr_unichar_id,
589 &edge_ptr, &edge_index));
592 int next_node2_num_edges = (next_node2_ptr->forward_edges.size() +
593 next_node2_ptr->backward_edges.size());
596 next_node2_num_edges, next_node2);
598 next_node2_ptr->forward_edges.clear();
599 next_node2_ptr->backward_edges.clear();
612 bool did_something =
false;
613 for (
int i = edge_index; i < backward_edges->
size() - 1; ++i) {
615 UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID;
616 while (i < backward_edges->size()) {
617 if (!
DeadEdge((*backward_edges)[i])) {
◆ end_of_word()
bool tesseract::Trie::end_of_word |
( |
EDGE_REF |
edge_ref | ) |
const |
|
inline override virtual |
Returns true if the edge indicated by the given EDGE_REF marks the end of a word.
Implements tesseract::Dawg.
Definition at line 134 of file trie.h.
136 if (edge_ref == NO_EDGE ||
num_edges_ == 0)
return false;
◆ get_reverse_policy_name()
const char * tesseract::Trie::get_reverse_policy_name |
( |
RTLReversePolicy |
reverse_policy | ) |
|
|
static |
◆ initialize_patterns()
void tesseract::Trie::initialize_patterns |
( |
UNICHARSET * |
unicharset | ) |
|
◆ KillEdge()
void tesseract::Trie::KillEdge |
( |
EDGE_RECORD * |
edge_rec | ) |
const |
|
inline |
◆ link_edge()
Sets up this edge record to the requested values.
Definition at line 298 of file trie.h.
◆ make_edge_ref()
Constructs EDGE_REF from the given node_index and edge_index.
Definition at line 292 of file trie.h.
◆ new_dawg_node()
NODE_REF tesseract::Trie::new_dawg_node |
( |
| ) |
|
|
protected |
Definition at line 282 of file trie.cpp.
285 word_list.
sort(sort_strings_by_dec_length);
◆ next_node()
Returns the next node visited by following the edge indicated by the given EDGE_REF.
Implements tesseract::Dawg.
Definition at line 125 of file trie.h.
127 if (edge_ref == NO_EDGE ||
num_edges_ == 0)
return NO_EDGE;
◆ pattern_loop_edge()
Returns the given EDGE_REF if the EDGE_RECORD that it points to has a self loop and the given unichar_id matches the unichar_id stored in the EDGE_RECORD; returns NO_EDGE otherwise.
Reimplemented from tesseract::Dawg.
Definition at line 239 of file trie.h.
243 if (edge_ref == NO_EDGE)
return NO_EDGE;
◆ print_all()
void tesseract::Trie::print_all |
( |
const char * |
msg, |
|
|
int |
max_num_edges |
|
) |
| |
|
inline protected |
Definition at line 326 of file trie.h.
328 tprintf(
"\n__________________________\n%s\n", msg);
330 tprintf(
"__________________________\n");
◆ print_edge_rec()
void tesseract::Trie::print_edge_rec |
( |
const EDGE_RECORD & |
edge_rec | ) |
const |
|
inline protected |
Prints the given EDGE_RECORD.
Definition at line 309 of file trie.h.
◆ print_node()
void tesseract::Trie::print_node |
( |
NODE_REF |
node, |
|
|
int |
max_num_edges |
|
) |
| const |
|
override virtual |
Prints the contents of the node indicated by the given NODE_REF. At most max_num_edges will be printed.
Implements tesseract::Dawg.
Definition at line 711 of file trie.cpp.
712 i < max_num_edges; ++i) {
717 if (dir == 0 ? i < num_fwd : i < num_bkw)
tprintf(
"...");
◆ read_and_add_word_list()
◆ read_pattern_list()
bool tesseract::Trie::read_pattern_list |
( |
const char * |
filename, |
|
|
const UNICHARSET & |
unicharset |
|
) |
| |
Definition at line 408 of file trie.cpp.
414 const char *str_ptr =
string;
415 int step = unicharset.
step(str_ptr);
418 UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID;
419 if (step == 1 && *str_ptr ==
'\\') {
421 if (*str_ptr ==
'\\') {
425 tprintf(
"Please provide at least %d concrete characters at the"
436 if (curr_unichar_id == INVALID_UNICHAR_ID) {
440 word.append_unichar_id(curr_unichar_id, 1, 0.0, 0.0);
443 step = unicharset.
step(str_ptr);
445 if (step == 1 && *str_ptr ==
'\\' && *(str_ptr+1) ==
'*') {
446 repetitions_vec[repetitions_vec.
size()-1] =
true;
448 step = unicharset.
step(str_ptr);
452 tprintf(
"Invalid user pattern %s\n",
string);
457 tprintf(
"Inserting expanded user pattern %s\n",
458 word.debug_string().c_str());
463 tprintf(
"Error: failed to insert pattern '%s'\n",
string);
469 tprintf(
"Read %d valid patterns from %s\n", pattern_count, filename);
471 fclose(pattern_file);
480 unichar_id, &edge_ptr, &edge_index));
488 }
else if (node1 == 0) {
◆ read_word_list()
bool tesseract::Trie::read_word_list |
( |
const char * |
filename, |
|
|
GenericVector< STRING > * |
words |
|
) |
| |
Definition at line 304 of file trie.cpp.
315 for (
int i = 0; i < words.
size(); ++i) {
317 if (word.length() == 0 || word.contains_unichar_id(INVALID_UNICHAR_ID))
320 word.has_rtl_unichar_id()) ||
322 word.reverse_and_mirror_unichar_ids();
◆ reduce_lettered_edges()
Definition at line 619 of file trie.cpp.
628 const EDGE_RECORD &next_edge_rec = (*backward_edges)[j];
629 if (
DeadEdge(next_edge_rec))
continue;
631 if (next_id != unichar_id)
break;
637 did_something =
true;
642 return did_something;
646 int num_edges = edges->
size();
647 if (num_edges <= 1)
return;
650 for (
int i = 0; i < num_edges; ++i) {
651 sort_vec.
push_back(KDPairInc<UNICHAR_ID, EDGE_RECORD>(
655 for (
int i = 0; i < num_edges; ++i)
656 (*edges)[i] = sort_vec[i].data;
◆ reduce_node_input()
Eliminates any redundant edges from this node in the Trie.
Definition at line 674 of file trie.cpp.
677 if (!
DeadEdge(backward_edges[edge_index]) &&
id != unichar_id)
break;
680 reduced_nodes[node] =
true;
687 for (
int i = 0; i < backward_edges.size(); ++i) {
688 if (
DeadEdge(backward_edges[i]))
continue;
697 if (node == NO_EDGE)
return;
702 for (
int dir = 0; dir < 2; ++dir) {
◆ remove_edge()
◆ remove_edge_linkage()
◆ sort_edges()
void tesseract::Trie::sort_edges |
( |
EDGE_VECTOR * |
edges | ) |
|
|
protected |
Orders num_edges consecutive EDGE_RECORDs in the given EDGE_VECTOR in increasing order of unichar ids. This function is normally called for all edges in a single node, and since the number of edges in each node is usually quite small, selection sort is used.
Definition at line 660 of file trie.cpp.
669 while (edge_index < backward_edges.
size()) {
670 if (
DeadEdge(backward_edges[edge_index]))
continue;
◆ trie_to_dawg()
Definition at line 525 of file trie.cpp.
529 node_ref_map[i+1] = node_ref_map[i] +
nodes_[i]->forward_edges.
size();
531 int num_forward_edges = node_ref_map[i];
535 auto edge_array =
new EDGE_RECORD[num_forward_edges];
540 for (j = 0; j < end; ++j) {
551 delete[] node_ref_map;
553 return new SquishedDawg(edge_array, num_forward_edges,
type_,
lang_,
561 tprintf(
"\nCollapsing node %" PRIi64
":\n", node);
◆ unichar_id_to_patterns()
Fills vec with unichar ids that represent the character classes of the given unichar_id.
Reimplemented from tesseract::Dawg.
Definition at line 368 of file trie.cpp.
378 }
else if (ch ==
'd') {
380 }
else if (ch ==
'n') {
382 }
else if (ch ==
'p') {
384 }
else if (ch ==
'a') {
386 }
else if (ch ==
'A') {
◆ unichar_ids_of()
Fills the given NodeChildVector with all the unichar ids (and the corresponding EDGE_REFs) for which there is an edge out of this node.
Implements tesseract::Dawg.
Definition at line 109 of file trie.h.
113 nodes_[static_cast<int>(node)]->forward_edges;
114 for (
int i = 0; i < forward_edges.
size(); ++i) {
◆ alpha_pattern_
◆ alphanum_pattern_
◆ deref_direction_mask_
uint64_t tesseract::Trie::deref_direction_mask_ |
|
protected |
◆ deref_node_index_mask_
uint64_t tesseract::Trie::deref_node_index_mask_ |
|
protected |
◆ digit_pattern_
◆ initialized_patterns_
bool tesseract::Trie::initialized_patterns_ |
|
protected |
◆ kAlphanumPatternUnicode
const char tesseract::Trie::kAlphanumPatternUnicode[] = "\u2002" |
|
static |
◆ kAlphaPatternUnicode
const char tesseract::Trie::kAlphaPatternUnicode[] = "\u2000" |
|
static |
◆ kDigitPatternUnicode
const char tesseract::Trie::kDigitPatternUnicode[] = "\u2001" |
|
static |
◆ kLowerPatternUnicode
const char tesseract::Trie::kLowerPatternUnicode[] = "\u2004" |
|
static |
◆ kPuncPatternUnicode
const char tesseract::Trie::kPuncPatternUnicode[] = "\u2003" |
|
static |
◆ kSaneNumConcreteChars
const int tesseract::Trie::kSaneNumConcreteChars = 0 |
|
static |
◆ kUpperPatternUnicode
const char tesseract::Trie::kUpperPatternUnicode[] = "\u2005" |
|
static |
◆ lower_pattern_
◆ nodes_
◆ num_edges_
uint64_t tesseract::Trie::num_edges_ |
|
protected |
◆ punc_pattern_
◆ root_back_freelist_
◆ upper_pattern_
The documentation for this class was generated from the following files:
void delete_data_pointers()
EDGE_VECTOR forward_edges
EDGE_REF edge_char_of(NODE_REF node_ref, UNICHAR_ID unichar_id, bool word_end) const override
UNICHAR_ID upper_pattern_
uint64_t deref_node_index_mask_
bool get_islower(UNICHAR_ID unichar_id) const
UNICHAR_ID character_class_to_pattern(char ch)
bool get_isdigit(UNICHAR_ID unichar_id) const
bool get_isalpha(UNICHAR_ID unichar_id) const
NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the next node visited by following this edge.
const STRING & lang() const
bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns true if this edge marks the end of a word.
void set_next_node_in_edge_rec(EDGE_RECORD *edge_rec, EDGE_REF value)
Sets the next node link for this edge in the Dawg.
void print_node(NODE_REF node, int max_num_edges) const override
UNICHAR_ID alphanum_pattern_
NODE_REF next_node(EDGE_REF edge_ref) const override
void add_word_ending(EDGE_RECORD *edge, NODE_REF the_next_node, bool repeats, UNICHAR_ID unichar_id)
int given_greater_than_edge_rec(NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, const EDGE_RECORD &edge_rec) const
int step(const char *str) const
void reduce_node_input(NODE_REF node, NODE_MARKER reduced_nodes)
UNICHAR_ID unichar_id_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns UNICHAR_ID recorded in this edge.
static const int kSaneNumConcreteChars
PermuterType perm_
Permuter code that should be used if the word is found in this Dawg.
bool read_word_list(const char *filename, GenericVector< STRING > *words)
#define MAX_NODE_EDGES_DISPLAY
EDGE_REF make_edge_ref(NODE_REF node_index, EDGE_INDEX edge_index) const
bool edge_rec_match(NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, NODE_REF other_next_node, bool other_word_end, UNICHAR_ID other_unichar_id) const
UNICHAR_ID alpha_pattern_
bool initialized_patterns_
void chomp_string(char *str)
UNICHAR_ID digit_pattern_
void sort_edges(EDGE_VECTOR *edges)
void init(int unicharset_size)
void link_edge(EDGE_RECORD *edge, NODE_REF nxt, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id)
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
bool marker_flag_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the marker flag of this edge.
Dawg(DawgType type, const STRING &lang, PermuterType perm, int debug_level)
SquishedDawg * trie_to_dawg()
bool eliminate_redundant_edges(NODE_REF node, const EDGE_RECORD &edge1, const EDGE_RECORD &edge2)
bool add_word_to_dawg(const WERD_CHOICE &word, const GenericVector< bool > *repetitions)
void KillEdge(EDGE_RECORD *edge_rec) const
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
bool get_isupper(UNICHAR_ID unichar_id) const
void set_marker_flag_in_edge_rec(EDGE_RECORD *edge_rec)
Sets this edge record to be the last one in a sequence of edges.
void remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction, bool word_end, UNICHAR_ID unichar_id)
bool add_edge_linkage(NODE_REF node1, NODE_REF node2, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id)
GenericVector< EDGE_INDEX > root_back_freelist_
bool add_word_list(const GenericVector< STRING > &words, const UNICHARSET &unicharset, Trie::RTLReversePolicy reverse_policy)
DLLSYM void tprintf(const char *format,...)
void print_edge_rec(const EDGE_RECORD &edge_rec) const
int direction_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the direction flag of this edge.
bool can_be_eliminated(const EDGE_RECORD &edge_rec)
bool reduce_lettered_edges(EDGE_INDEX edge_index, UNICHAR_ID unichar_id, NODE_REF node, EDGE_VECTOR *backward_edges, NODE_MARKER reduced_nodes)
EDGE_VECTOR backward_edges
EDGE_RECORD * deref_edge_ref(EDGE_REF edge_ref) const
bool DeadEdge(const EDGE_RECORD &edge_rec) const
UNICHAR_ID lower_pattern_