tesseract
5.0.0-alpha-619-ge9db
|
#include <dawg.h>
|
DawgType | type () const |
|
const STRING & | lang () const |
|
PermuterType | permuter () const |
|
virtual | ~Dawg () |
|
bool | word_in_dawg (const WERD_CHOICE &word) const |
| Returns true if the given word is in the Dawg. More...
|
|
bool | prefix_in_dawg (const WERD_CHOICE &prefix, bool requires_complete) const |
|
int | check_for_words (const char *filename, const UNICHARSET &unicharset, bool enable_wildcard) const |
|
void | iterate_words (const UNICHARSET &unicharset, std::function< void(const WERD_CHOICE *)> cb) const |
|
void | iterate_words (const UNICHARSET &unicharset, std::function< void(const char *)> cb) const |
|
virtual EDGE_REF | edge_char_of (NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const =0 |
| Returns the edge that corresponds to the letter out of this node. More...
|
|
virtual void | unichar_ids_of (NODE_REF node, NodeChildVector *vec, bool word_end) const =0 |
|
virtual NODE_REF | next_node (EDGE_REF edge_ref) const =0 |
|
virtual bool | end_of_word (EDGE_REF edge_ref) const =0 |
|
virtual UNICHAR_ID | edge_letter (EDGE_REF edge_ref) const =0 |
| Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF. More...
|
|
virtual void | print_node (NODE_REF node, int max_num_edges) const =0 |
|
virtual void | unichar_id_to_patterns (UNICHAR_ID unichar_id, const UNICHARSET &unicharset, GenericVector< UNICHAR_ID > *vec) const |
|
virtual EDGE_REF | pattern_loop_edge (EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const |
|
|
| Dawg (DawgType type, const STRING &lang, PermuterType perm, int debug_level) |
|
NODE_REF | next_node_from_edge_rec (const EDGE_RECORD &edge_rec) const |
| Returns the next node visited by following this edge. More...
|
|
bool | marker_flag_from_edge_rec (const EDGE_RECORD &edge_rec) const |
| Returns the marker flag of this edge. More...
|
|
int | direction_from_edge_rec (const EDGE_RECORD &edge_rec) const |
| Returns the direction flag of this edge. More...
|
|
bool | end_of_word_from_edge_rec (const EDGE_RECORD &edge_rec) const |
| Returns true if this edge marks the end of a word. More...
|
|
UNICHAR_ID | unichar_id_from_edge_rec (const EDGE_RECORD &edge_rec) const |
| Returns UNICHAR_ID recorded in this edge. More...
|
|
void | set_next_node_in_edge_rec (EDGE_RECORD *edge_rec, EDGE_REF value) |
| Sets the next node link for this edge in the Dawg. More...
|
|
void | set_marker_flag_in_edge_rec (EDGE_RECORD *edge_rec) |
| Sets this edge record to be the last one in a sequence of edges. More...
|
|
int | given_greater_than_edge_rec (NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, const EDGE_RECORD &edge_rec) const |
|
bool | edge_rec_match (NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, NODE_REF other_next_node, bool other_word_end, UNICHAR_ID other_unichar_id) const |
|
void | init (int unicharset_size) |
|
bool | match_words (WERD_CHOICE *word, int32_t index, NODE_REF node, UNICHAR_ID wildcard) const |
|
void | iterate_words_rec (const WERD_CHOICE &word_so_far, NODE_REF to_explore, std::function< void(const WERD_CHOICE *)> cb) const |
|
Abstract class (an interface) that declares methods needed by the various tesseract classes to operate on SquishedDawg and Trie objects.
This class initializes all the edge masks (since their usage by SquishedDawg and Trie is identical) and implements simple accessors for each of the fields encoded in an EDGE_RECORD. This class also implements word_in_dawg() and check_for_words() (since they use only the public methods of SquishedDawg and Trie classes that are inherited from the Dawg base class).
Definition at line 113 of file dawg.h.
◆ ~Dawg()
tesseract::Dawg::~Dawg |
( |
| ) |
|
|
virtual default |
◆ Dawg()
◆ check_for_words()
int tesseract::Dawg::check_for_words |
( |
const char * |
filename, |
|
|
const UNICHARSET & |
unicharset, |
|
|
bool |
enable_wildcard |
|
) |
| const |
Checks the Dawg for the words that are listed in the requested file. Returns the number of words in the given file missing from the Dawg.
Definition at line 82 of file dawg.cpp.
86 if (word.length() > 0 &&
87 !word.contains_unichar_id(INVALID_UNICHAR_ID)) {
89 enable_wildcard ? wildcard : INVALID_UNICHAR_ID)) {
90 tprintf(
"Missing word: %s\n",
string);
94 tprintf(
"Failed to create a valid word from %s\n",
string);
104 std::function<
void(
const WERD_CHOICE*)> cb)
const {
109 static void CallWithUTF8(std::function<
void(
const char*)> cb,
◆ direction_from_edge_rec()
int tesseract::Dawg::direction_from_edge_rec |
( |
const EDGE_RECORD & |
edge_rec | ) |
const |
|
inline protected |
Returns the direction flag of this edge.
Definition at line 215 of file dawg.h.
◆ edge_char_of()
◆ edge_letter()
◆ edge_rec_match()
Returns true if all the values are equal (any value matches next_node if next_node == NO_EDGE, any value matches word_end if word_end is false).
Definition at line 266 of file dawg.h.
273 return ((unichar_id == other_unichar_id) &&
275 (!word_end || (word_end == other_word_end)));
◆ end_of_word()
virtual bool tesseract::Dawg::end_of_word |
( |
EDGE_REF |
edge_ref | ) |
const |
|
pure virtual |
◆ end_of_word_from_edge_rec()
bool tesseract::Dawg::end_of_word_from_edge_rec |
( |
const EDGE_RECORD & |
edge_rec | ) |
const |
|
inline protected |
Returns true if this edge marks the end of a word.
Definition at line 220 of file dawg.h.
◆ given_greater_than_edge_rec()
int tesseract::Dawg::given_greater_than_edge_rec |
( |
NODE_REF |
next_node, |
|
|
bool |
word_end, |
|
|
UNICHAR_ID |
unichar_id, |
|
|
const EDGE_RECORD & |
edge_rec |
|
) |
| const |
|
inline protected |
Sequentially compares the given values of unichar ID, next node and word end marker with the values in the given EDGE_RECORD. Returns: 1 if at any step the given input value exceeds that of edge_rec (and all the values already checked are the same); 0 if edge_rec_match() returns true; -1 otherwise.
Definition at line 245 of file dawg.h.
254 curr_word_end, curr_unichar_id))
return 0;
255 if (unichar_id > curr_unichar_id)
return 1;
256 if (unichar_id == curr_unichar_id) {
257 if (
next_node > curr_next_node)
return 1;
259 if (word_end > curr_word_end)
return 1;
◆ init()
void tesseract::Dawg::init |
( |
int |
unicharset_size | ) |
|
|
protected |
Sets unicharset_size_. Initializes the values of various masks from unicharset_size_.
Definition at line 190 of file dawg.cpp.
196 bool word_end)
const {
200 EDGE_REF end = num_forward_edges_in_node0 - 1;
◆ iterate_words() [1/2]
void tesseract::Dawg::iterate_words |
( |
const UNICHARSET & |
unicharset, |
|
|
std::function< void(const char *)> |
cb |
|
) |
| const |
◆ iterate_words() [2/2]
void tesseract::Dawg::iterate_words |
( |
const UNICHARSET & |
unicharset, |
|
|
std::function< void(const WERD_CHOICE *)> |
cb |
|
) |
| const |
◆ iterate_words_rec()
Definition at line 140 of file dawg.cpp.
148 if (wildcard != INVALID_UNICHAR_ID && word->unichar_id(index) == wildcard) {
149 bool any_matched =
false;
152 for (
int i = 0; i < vec.size(); ++i) {
153 word->set_unichar_id(vec[i].unichar_id, index);
◆ lang()
const STRING& tesseract::Dawg::lang |
( |
| ) |
const |
|
inline |
◆ marker_flag_from_edge_rec()
bool tesseract::Dawg::marker_flag_from_edge_rec |
( |
const EDGE_RECORD & |
edge_rec | ) |
const |
|
inline protected |
Returns the marker flag of this edge.
Definition at line 211 of file dawg.h.
◆ match_words()
Matches all of the words that are represented by this string. If wildcard is set to something other than INVALID_UNICHAR_ID, the '*' characters in this string are interpreted as wildcards. The WERD_CHOICE parameter is not passed as const so that wildcard searches can modify it and work without having to copy WERD_CHOICEs.
Definition at line 158 of file dawg.cpp.
160 word_end = index == word->
length() - 1;
162 if (edge != NO_EDGE) {
167 }
else if (node != 0) {
◆ next_node()
◆ next_node_from_edge_rec()
Returns the next node visited by following this edge.
Definition at line 207 of file dawg.h.
◆ pattern_loop_edge()
Returns the given EDGE_REF if the EDGE_RECORD that it points to has a self loop and the given unichar_id matches the unichar_id stored in the EDGE_RECORD; returns NO_EDGE otherwise.
Reimplemented in tesseract::Trie.
Definition at line 190 of file dawg.h.
◆ permuter()
◆ prefix_in_dawg()
bool tesseract::Dawg::prefix_in_dawg |
( |
const WERD_CHOICE & |
prefix, |
|
|
bool |
requires_complete |
|
) |
| const |
Definition at line 57 of file dawg.cpp.
69 bool enable_wildcard)
const {
70 if (filename ==
nullptr)
return 0;
◆ print_node()
virtual void tesseract::Dawg::print_node |
( |
NODE_REF |
node, |
|
|
int |
max_num_edges |
|
) |
| const |
|
pure virtual |
◆ set_marker_flag_in_edge_rec()
void tesseract::Dawg::set_marker_flag_in_edge_rec |
( |
EDGE_RECORD * |
edge_rec | ) |
|
|
inline protected |
Sets this edge record to be the last one in a sequence of edges.
Definition at line 235 of file dawg.h.
◆ set_next_node_in_edge_rec()
Sets the next node link for this edge in the Dawg.
Definition at line 229 of file dawg.h.
◆ type()
DawgType tesseract::Dawg::type |
( |
| ) |
const |
|
inline |
◆ unichar_id_from_edge_rec()
Returns UNICHAR_ID recorded in this edge.
Definition at line 224 of file dawg.h.
◆ unichar_id_to_patterns()
Fills vec with unichar ids that represent the character classes of the given unichar_id.
Reimplemented in tesseract::Trie.
Definition at line 179 of file dawg.h.
◆ unichar_ids_of()
Fills the given NodeChildVector with all the unichar ids (and the corresponding EDGE_REFs) for which there is an edge out of this node.
Implemented in tesseract::SquishedDawg, and tesseract::Trie.
◆ word_in_dawg()
bool tesseract::Dawg::word_in_dawg |
( |
const WERD_CHOICE & |
word | ) |
const |
Returns true if the given word is in the Dawg.
Definition at line 78 of file dawg.cpp.
79 tprintf(
"Error: Could not open file %s\n", filename);
◆ debug_level_
int tesseract::Dawg::debug_level_ |
|
protected |
◆ flag_start_bit_
int tesseract::Dawg::flag_start_bit_ = 0 |
|
protected |
◆ flags_mask_
uint64_t tesseract::Dawg::flags_mask_ = 0 |
|
protected |
◆ kDawgMagicNumber
const int16_t tesseract::Dawg::kDawgMagicNumber = 42 |
|
static |
Magic number to determine endianness when reading the Dawg from file.
Definition at line 116 of file dawg.h.
◆ kPatternUnicharID
const UNICHAR_ID tesseract::Dawg::kPatternUnicharID = 0 |
|
static |
A special unichar id that indicates that any appropriate pattern (e.g. dictionary word, 0-9 digit, etc.) can be inserted instead. Used for expressing patterns in punctuation and number Dawgs.
Definition at line 120 of file dawg.h.
◆ lang_
◆ letter_mask_
uint64_t tesseract::Dawg::letter_mask_ = 0 |
|
protected |
◆ next_node_mask_
uint64_t tesseract::Dawg::next_node_mask_ = 0 |
|
protected |
◆ next_node_start_bit_
int tesseract::Dawg::next_node_start_bit_ = 0 |
|
protected |
◆ perm_
Permuter code that should be used if the word is found in this Dawg.
Definition at line 298 of file dawg.h.
◆ type_
◆ unicharset_size_
int tesseract::Dawg::unicharset_size_ |
|
protected |
The documentation for this class was generated from the following files:
bool prefix_in_dawg(const WERD_CHOICE &prefix, bool requires_complete) const
bool match_words(WERD_CHOICE *word, int32_t index, NODE_REF node, UNICHAR_ID wildcard) const
EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const override
Returns the edge that corresponds to the letter out of this node.
UNICHAR_ID unichar_id(int index) const
void iterate_words_rec(const WERD_CHOICE &word_so_far, NODE_REF to_explore, std::function< void(const WERD_CHOICE *)> cb) const
NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the next node visited by following this edge.
const STRING & lang() const
bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns true if this edge marks the end of a word.
GenericVector< NodeChild > NodeChildVector
virtual void unichar_ids_of(NODE_REF node, NodeChildVector *vec, bool word_end) const =0
UNICHAR_ID unichar_id_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns UNICHAR_ID recorded in this edge.
PermuterType perm_
Permuter code that should be used if the word is found in this Dawg.
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
bool edge_rec_match(NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, NODE_REF other_next_node, bool other_word_end, UNICHAR_ID other_unichar_id) const
const char * c_str() const
void chomp_string(char *str)
void init(int unicharset_size)
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Dawg(DawgType type, const STRING &lang, PermuterType perm, int debug_level)
virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const =0
Returns the edge that corresponds to the letter out of this node.
void iterate_words(const UNICHARSET &unicharset, std::function< void(const WERD_CHOICE *)> cb) const
virtual NODE_REF next_node(EDGE_REF edge_ref) const =0
int check_for_words(const char *filename, const UNICHARSET &unicharset, bool enable_wildcard) const
DLLSYM void tprintf(const char *format,...)