51 bool requires_complete)
const {
52 if (word.
length() == 0)
return !requires_complete;
54 int end_index = word.
length() - 1;
55 for (
int i = 0; i < end_index; i++) {
57 if (edge == NO_EDGE) {
77 bool enable_wildcard)
const {
78 if (filename ==
nullptr)
return 0;
85 word_file = fopen(filename,
"r");
86 if (word_file ==
nullptr) {
87 tprintf(
"Error: Could not open file %s\n", filename);
97 enable_wildcard ? wildcard : INVALID_UNICHAR_ID)) {
98 tprintf(
"Missing word: %s\n",
string);
102 tprintf(
"Failed to create a valid word from %s\n",
string);
126 std::unique_ptr<TessCallback1<const WERD_CHOICE *>> shim(
137 for (
int i = 0; i < children.
size(); i++) {
155 if (wildcard != INVALID_UNICHAR_ID && word->
unichar_id(index) == wildcard) {
156 bool any_matched =
false;
159 for (
int i = 0; i < vec.
size(); ++i) {
167 word_end = index == word->
length() - 1;
169 if (edge != NO_EDGE) {
174 }
else if (node != 0) {
203 bool word_end)
const {
207 EDGE_REF end = num_forward_edges_in_node0 - 1;
209 while (start <= end) {
210 edge = (start + end) >> 1;
212 unichar_id, edges_[edge]);
215 }
else if (compare == 1) {
222 if (edge != NO_EDGE && edge_occupied(edge)) {
227 }
while (!last_edge(edge++));
233 int32_t SquishedDawg::num_forward_edges(
NODE_REF node)
const {
237 if (forward_edge (edge)) {
240 }
while (!last_edge(edge++));
247 if (node == NO_EDGE)
return;
250 const char *forward_string =
"FORWARD";
251 const char *backward_string =
" ";
253 const char *last_string =
"LAST";
254 const char *not_last_string =
" ";
256 const char *eow_string =
"EOW";
257 const char *not_eow_string =
" ";
265 if (edge_occupied(edge)) {
268 forward_edge(edge) ? forward_string : backward_string;
269 is_last = last_edge(edge) ? last_string : not_last_string;
270 eow =
end_of_word(edge) ? eow_string : not_eow_string;
277 if (edge - node > max_num_edges)
return;
278 }
while (!last_edge(edge++));
280 if (edge < num_edges_ &&
281 edge_occupied(edge) && backward_edge(edge)) {
284 forward_edge(edge) ? forward_string : backward_string;
285 is_last = last_edge(edge) ? last_string : not_last_string;
286 eow =
end_of_word(edge) ? eow_string : not_eow_string;
290 ", unichar_id = %d, %s %s %s\n",
295 }
while (!last_edge(edge++));
304 void SquishedDawg::print_edge(
EDGE_REF edge)
const {
305 if (edge == NO_EDGE) {
309 ", unichar_id = '%d', %s %s %s\n", edge,
311 (forward_edge(edge) ?
"FORWARD" :
" "),
312 (last_edge(edge) ?
"LAST" :
" "),
317 bool SquishedDawg::read_squished_dawg(TFile *file) {
323 if (!file->DeSerialize(&magic))
return false;
329 int32_t unicharset_size;
330 if (!file->DeSerialize(&unicharset_size))
return false;
331 if (!file->DeSerialize(&num_edges_))
return false;
336 if (!file->DeSerialize(&edges_[0], num_edges_))
return false;
338 tprintf(
"type: %d lang: %s perm: %d unicharset_size: %d num_edges: %d\n",
340 for (
EDGE_REF edge = 0; edge < num_edges_; ++edge) print_edge(edge);
345 std::unique_ptr<EDGE_REF[]> SquishedDawg::build_node_map(
346 int32_t *num_nodes)
const {
348 std::unique_ptr<EDGE_REF[]> node_map(
new EDGE_REF[num_edges_]);
349 int32_t node_counter;
352 for (edge = 0; edge < num_edges_; edge++)
355 node_counter = num_forward_edges(0);
358 for (edge = 0; edge < num_edges_; edge++) {
360 if (forward_edge(edge)) {
362 node_map[edge] = (edge ? node_counter : 0);
363 num_edges = num_forward_edges(edge);
364 if (edge != 0) node_counter += num_edges;
366 if (edge >= num_edges_)
break;
367 if (backward_edge(edge))
while (!last_edge(edge++));
377 int32_t node_count = 0;
383 std::unique_ptr<EDGE_REF[]> node_map(build_node_map(&node_count));
387 if (!file->
Serialize(&magic))
return false;
392 for (edge=0; edge < num_edges_; edge++)
393 if (forward_edge(edge))
397 if (!file->
Serialize(&num_edges))
return false;
400 tprintf(
"%d nodes in DAWG\n", node_count);
401 tprintf(
"%d edges in DAWG\n", num_edges);
404 for (edge = 0; edge < num_edges_; edge++) {
405 if (forward_edge(edge)) {
408 set_next_node(edge, node_map[old_index]);
409 temp_record = edges_[edge];
410 if (!file->
Serialize(&temp_record))
return false;
411 set_next_node(edge, old_index);
412 }
while (!last_edge(edge++));
414 if (edge >= num_edges_)
break;
415 if (backward_edge(edge))
416 while (!last_edge(edge++));
void set_unichar_id(UNICHAR_ID unichar_id, int index)
void print_node(NODE_REF node, int max_num_edges) const
const char * string() const
UNICHAR_ID unichar_id_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns UNICHAR_ID recorded in this edge.
bool prefix_in_dawg(const WERD_CHOICE &prefix, bool requires_complete) const
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
void iterate_words_rec(const WERD_CHOICE &word_so_far, NODE_REF to_explore, TessCallback1< const WERD_CHOICE *> *cb) const
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
int direction(EDGEPT *point)
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
UNICHAR_ID edge_letter(EDGE_REF edge_ref) const
Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.
PermuterType perm_
Permuter code that should be used if the word is found in this Dawg.
int given_greater_than_edge_rec(NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, const EDGE_RECORD &edge_rec) const
static const int16_t kDawgMagicNumber
Magic number to determine endianness when reading the Dawg from file.
#define MAX_NODE_EDGES_DISPLAY
bool write_squished_dawg(TFile *file)
Writes the squished/reduced Dawg to a file.
void chomp_string(char *str)
EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const
Returns the edge that corresponds to the letter out of this node.
virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const =0
Returns the edge that corresponds to the letter out of this node.
bool contains_unichar_id(UNICHAR_ID unichar_id) const
UNICHAR_ID unichar_id(int index) const
bool Serialize(const char *data, size_t count=1)
DLLSYM void tprintf(const char *format,...)
bool match_words(WERD_CHOICE *word, int32_t index, NODE_REF node, UNICHAR_ID wildcard) const
virtual void unichar_ids_of(NODE_REF node, NodeChildVector *vec, bool word_end) const =0
void iterate_words(const UNICHARSET &unicharset, TessCallback1< const WERD_CHOICE *> *cb) const
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
void init(int unicharset_size)
virtual NODE_REF next_node(EDGE_REF edge_ref) const =0
bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns true if this edge marks the end of a word.
NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the next node visited by following this edge.
int check_for_words(const char *filename, const UNICHARSET &unicharset, bool enable_wildcard) const
NODE_REF next_node(EDGE_REF edge) const
virtual bool end_of_word(EDGE_REF edge_ref) const =0
bool end_of_word(EDGE_REF edge_ref) const