29 #pragma warning(disable:4244) // Conversion warnings
30 #pragma warning(disable:4800) // int/bool warnings
62 return RTLReversePolicyNames[reverse_policy];
79 " direction %d word_end %d unichar_id %d, exploring node:\n",
80 node_ref, next_node, direction, word_end, unichar_id);
81 if (node_ref != NO_EDGE) {
85 if (node_ref == NO_EDGE)
return false;
88 nodes_[node_ref]->forward_edges :
nodes_[node_ref]->backward_edges;
89 int vec_size = vec.
size();
95 while (start <= end) {
96 k = (start + end) >> 1;
100 *edge_ptr = &(vec[k]);
103 }
else if (compare == 1) {
110 for (
int i = 0; i < vec_size; ++i) {
116 *edge_ptr = &(edge_rec);
129 &(
nodes_[node1]->forward_edges) : &(
nodes_[node1]->backward_edges);
133 while (search_index < vec->size() &&
135 (*vec)[search_index]) == 1) {
139 search_index = vec->
size();
142 link_edge(&edge_rec, node2, marker_flag, direction, word_end, unichar_id);
146 (*vec)[edge_index] = edge_rec;
147 }
else if (search_index < vec->size()) {
148 vec->
insert(edge_rec, search_index);
168 unichar_id, &back_edge_ptr, &back_edge_index));
180 if (word.
length() <= 0)
return false;
183 for (
int i = 0; i < word.
length(); ++i) {
191 bool marker_flag =
false;
194 inT32 still_finding_chars =
true;
195 inT32 word_end =
false;
196 bool add_failed =
false;
202 for (i = 0; i < word.
length() - 1; ++i) {
204 marker_flag = (repetitions !=
NULL) ? (*repetitions)[i] :
false;
206 if (still_finding_chars) {
208 unichar_id, &edge_ptr, &edge_index);
211 edge_index, last_node);
214 still_finding_chars =
false;
224 still_finding_chars =
false;
232 if (!still_finding_chars) {
236 if (the_next_node == 0) {
241 marker_flag, word_end, unichar_id)) {
246 last_node = the_next_node;
251 marker_flag = (repetitions !=
NULL) ? (*repetitions)[i] :
false;
253 if (still_finding_chars &&
255 unichar_id, &edge_ptr, &edge_index)) {
259 marker_flag, unichar_id);
265 !
add_new_edge(last_node, the_next_node, marker_flag,
true, unichar_id))
269 tprintf(
"Re-initializing document dictionary...\n");
279 if (node ==
NULL)
return 0;
285 static int sort_strings_by_dec_length(
const void* v1,
const void* v2) {
295 if (!
read_word_list(filename, unicharset, reverse_policy, &word_list))
297 word_list.
sort(sort_strings_by_dec_length);
309 word_file = fopen(filename,
"rb");
310 if (word_file ==
NULL)
return false;
322 tprintf(
"Read %d words so far\n", word_count);
326 tprintf(
"Skipping invalid word %s\n",
string);
331 tprintf(
"Read %d words total.\n", word_count);
338 for (
int i = 0; i < words.
size(); ++i) {
343 tprintf(
"Error: word '%s' not in DAWG after adding it\n",
372 bool is_alpha = unicharset.
get_isalpha(unichar_id);
394 }
else if (ch ==
'd') {
396 }
else if (ch ==
'n') {
398 }
else if (ch ==
'p') {
400 }
else if (ch ==
'a') {
402 }
else if (ch ==
'A') {
405 return INVALID_UNICHAR_ID;
412 tprintf(
"please call initialize_patterns() before read_pattern_list()\n");
416 FILE *pattern_file = fopen(filename,
"rb");
417 if (pattern_file ==
NULL) {
418 tprintf(
"Error opening pattern file %s\n", filename);
422 int pattern_count = 0;
430 const char *str_ptr = string;
431 int step = unicharset.
step(str_ptr);
434 UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID;
435 if (step == 1 && *str_ptr ==
'\\') {
437 if (*str_ptr ==
'\\') {
441 tprintf(
"Please provide at least %d concrete characters at the"
452 if (curr_unichar_id == INVALID_UNICHAR_ID) {
459 step = unicharset.
step(str_ptr);
461 if (step == 1 && *str_ptr ==
'\\' && *(str_ptr+1) ==
'*') {
462 repetitions_vec[repetitions_vec.
size()-1] =
true;
464 step = unicharset.
step(str_ptr);
468 tprintf(
"Invalid user pattern %s\n",
string);
473 tprintf(
"Inserting expanded user pattern %s\n",
479 tprintf(
"Error: failed to insert pattern '%s'\n",
string);
485 tprintf(
"Read %d valid patterns from %s\n", pattern_count, filename);
487 fclose(pattern_file);
496 unichar_id, &edge_ptr, &edge_index));
504 }
else if (node1 == 0) {
532 for (
int i = 0; i <
nodes_.
size(); i++) reduced_nodes[i] = 0;
534 delete[] reduced_nodes;
545 node_ref_map[i+1] = node_ref_map[i] +
nodes_[i]->forward_edges.
size();
547 int num_forward_edges = node_ref_map[i];
557 for (j = 0; j < end; ++j) {
568 delete[] node_ref_map;
578 tprintf(
"\nCollapsing node %d:\n", node);
602 curr_word_end, curr_unichar_id);
605 curr_word_end, curr_unichar_id,
606 &edge_ptr, &edge_index));
613 next_node2_num_edges, next_node2);
629 bool did_something =
false;
630 for (
int i = edge_index; i < backward_edges->
size() - 1; ++i) {
632 UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID;
633 while (i < backward_edges->size()) {
634 if (!
DeadEdge((*backward_edges)[i])) {
636 if (curr_unichar_id != unichar_id)
return did_something;
641 if (i == backward_edges->
size())
break;
642 const EDGE_RECORD &edge_rec = (*backward_edges)[i];
644 for (
int j = i + 1; j < backward_edges->
size(); ++j) {
645 const EDGE_RECORD &next_edge_rec = (*backward_edges)[j];
646 if (
DeadEdge(next_edge_rec))
continue;
648 if (next_id != unichar_id)
break;
654 did_something =
true;
659 return did_something;
663 int num_edges = edges->
size();
664 if (num_edges <= 1)
return;
667 for (
int i = 0; i < num_edges; ++i) {
672 for (
int i = 0; i < num_edges; ++i)
673 (*edges)[i] = sort_vec[i].data;
686 while (edge_index < backward_edges.
size()) {
687 if (
DeadEdge(backward_edges[edge_index]))
continue;
691 &backward_edges, reduced_nodes));
692 while (++edge_index < backward_edges.
size()) {
694 if (!
DeadEdge(backward_edges[edge_index]) &&
id != unichar_id)
break;
697 reduced_nodes[node] =
true;
704 for (
int i = 0; i < backward_edges.
size(); ++i) {
705 if (
DeadEdge(backward_edges[i]))
continue;
707 if (next_node != 0 && !reduced_nodes[next_node]) {
714 if (node == NO_EDGE)
return;
719 for (
int dir = 0; dir < 2; ++dir) {
728 for (i = 0; (dir == 0 ? i < num_fwd : i < num_bkw) &&
729 i < max_num_edges; ++i) {
734 if (dir == 0 ? i < num_fwd : i < num_bkw)
tprintf(
"...");
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
bool reduce_lettered_edges(EDGE_INDEX edge_index, UNICHAR_ID unichar_id, NODE_REF node, EDGE_VECTOR *backward_edges, NODE_MARKER reduced_nodes)
static const char kAlphaPatternUnicode[]
bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns true if this edge marks the end of a word.
EDGE_VECTOR backward_edges
void set_marker_flag_in_edge_rec(EDGE_RECORD *edge_rec)
Sets this edge record to be the last one in a sequence of edges.
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
const char kDoNotReverse[]
static const char kDigitPatternUnicode[]
bool eliminate_redundant_edges(NODE_REF node, const EDGE_RECORD &edge1, const EDGE_RECORD &edge2)
UNICHAR_ID alphanum_pattern_
bool DeadEdge(const EDGE_RECORD &edge_rec) const
bool contains_unichar_id(UNICHAR_ID unichar_id) const
static const char kPuncPatternUnicode[]
void print_node(NODE_REF node, int max_num_edges) const
UNICHAR_ID digit_pattern_
bool get_isupper(UNICHAR_ID unichar_id) const
int direction(EDGEPT *point)
void print_edge_rec(const EDGE_RECORD &edge_rec) const
const char *const RTLReversePolicyNames[]
static const char kUpperPatternUnicode[]
void unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset, GenericVector< UNICHAR_ID > *vec) const
void sort_edges(EDGE_VECTOR *edges)
NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the next node visited by following this edge.
bool has_rtl_unichar_id() const
bool edge_rec_match(NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, NODE_REF other_next_node, bool other_word_end, UNICHAR_ID other_unichar_id) const
bool add_new_edge(NODE_REF node1, NODE_REF node2, bool repeats, bool word_end, UNICHAR_ID unichar_id)
UNICHAR_ID lower_pattern_
EDGE_REF edge_char_of(NODE_REF node_ref, UNICHAR_ID unichar_id, bool word_end) const
SquishedDawg * trie_to_dawg()
bool initialized_patterns_
const STRING & unichar_string() const
UNICHAR_ID upper_pattern_
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
static const char kLowerPatternUnicode[]
bool add_word_to_dawg(const WERD_CHOICE &word, const GenericVector< bool > *repetitions)
bool add_edge_linkage(NODE_REF node1, NODE_REF node2, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id)
bool get_isdigit(UNICHAR_ID unichar_id) const
void print_all(const char *msg, int max_num_edges)
void reverse_and_mirror_unichar_ids()
UNICHAR_ID alpha_pattern_
void remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction, bool word_end, UNICHAR_ID unichar_id)
void reduce_node_input(NODE_REF node, NODE_MARKER reduced_nodes)
void insert(T t, int index)
void chomp_string(char *str)
void initialize_patterns(UNICHARSET *unicharset)
const UNICHAR_ID unichar_id(int index) const
void KillEdge(EDGE_RECORD *edge_rec) const
void delete_data_pointers()
UNICHAR_ID character_class_to_pattern(char ch)
bool can_be_eliminated(const EDGE_RECORD &edge_rec)
#define MAX_NODE_EDGES_DISPLAY
UNICHAR_ID unichar_id_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns UNICHAR_ID recorded in this edge.
static const char * get_reverse_policy_name(RTLReversePolicy reverse_policy)
const STRING debug_string() const
void set_next_node_in_edge_rec(EDGE_RECORD *edge_rec, EDGE_REF value)
Sets the next node link for this edge in the Dawg.
void unichar_insert(const char *const unichar_repr)
int given_greater_than_edge_rec(NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, const EDGE_RECORD &edge_rec) const
bool get_islower(UNICHAR_ID unichar_id) const
bool read_word_list(const char *filename, const UNICHARSET &unicharset, Trie::RTLReversePolicy reverse_policy, GenericVector< STRING > *words)
bool get_ispunctuation(UNICHAR_ID unichar_id) const
NODE_REF next_node(EDGE_REF edge_ref) const
bool read_and_add_word_list(const char *filename, const UNICHARSET &unicharset, Trie::RTLReversePolicy reverse)
bool get_isalpha(UNICHAR_ID unichar_id) const
EDGE_VECTOR forward_edges
bool marker_flag_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the marker flag of this edge.
bool add_word_list(const GenericVector< STRING > &words, const UNICHARSET &unicharset)
int step(const char *str) const
void add_word_ending(EDGE_RECORD *edge, NODE_REF the_next_node, bool repeats, UNICHAR_ID unichar_id)
GenericVector< EDGE_INDEX > root_back_freelist_
void link_edge(EDGE_RECORD *edge, NODE_REF nxt, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id)
static const char kAlphanumPatternUnicode[]
const char * string() const
PermuterType perm_
Permuter code that should be used if the word is found in this Dawg.
const char kReverseIfHasRTL[]
static const int kSaneNumConcreteChars
bool read_pattern_list(const char *filename, const UNICHARSET &unicharset)
void remove_edge(NODE_REF node1, NODE_REF node2, bool word_end, UNICHAR_ID unichar_id)
const char kForceReverse[]