21 #include "config_auto.h"
// Human-readable display names, one per permuter type. Each constant is a
// file-local (static) NUL-terminated string. The kPermuterTypeNames table
// defined after these constants collects pointers to them, and that table is
// indexed directly by a permuter value where a name is looked up.
// NOTE(review): the constant names suggest a one-to-one correspondence with
// the permuter enum values (e.g. "None" for NO_PERM) -- confirm against the
// enum declaration, which is not visible here.
56 static const char kPermuterTypeNoPerm[] =
"None";
57 static const char kPermuterTypePuncPerm[] =
"Punctuation";
58 static const char kPermuterTypeTopPerm[] =
"Top Choice";
59 static const char kPermuterTypeLowerPerm[] =
"Top Lower Case";
60 static const char kPermuterTypeUpperPerm[] =
"Top Upper Case";
61 static const char kPermuterTypeNgramPerm[] =
"Ngram";
62 static const char kPermuterTypeNumberPerm[] =
"Number";
63 static const char kPermuterTypeUserPatPerm[] =
"User Pattern";
64 static const char kPermuterTypeSysDawgPerm[] =
"System Dictionary";
65 static const char kPermuterTypeDocDawgPerm[] =
"Document Dictionary";
66 static const char kPermuterTypeUserDawgPerm[] =
"User Dictionary";
67 static const char kPermuterTypeFreqDawgPerm[] =
"Frequent Words Dictionary";
68 static const char kPermuterTypeCompoundPerm[] =
"Compound";
70 static const char *
const kPermuterTypeNames[] = {
72 kPermuterTypePuncPerm,
74 kPermuterTypeLowerPerm,
75 kPermuterTypeUpperPerm,
76 kPermuterTypeNgramPerm,
77 kPermuterTypeNumberPerm,
78 kPermuterTypeUserPatPerm,
79 kPermuterTypeSysDawgPerm,
80 kPermuterTypeDocDawgPerm,
81 kPermuterTypeUserDawgPerm,
82 kPermuterTypeFreqDawgPerm,
83 kPermuterTypeCompoundPerm
99 unichar_id_ = src_unichar_id;
100 rating_ = src_rating;
101 certainty_ = src_cert;
104 script_id_ = src_script_id;
123 matrix_cell_ = other.matrix_cell_;
124 min_xheight_ = other.min_xheight_;
125 max_xheight_ = other.max_xheight_;
127 classifier_ = other.classifier_;
128 #ifndef DISABLED_LEGACY_ENGINE
129 fonts_ = other.fonts_;
130 #endif // ndef DISABLED_LEGACY_ENGINE
142 matrix_cell_ = other.matrix_cell_;
143 min_xheight_ = other.min_xheight_;
144 max_xheight_ = other.max_xheight_;
146 classifier_ = other.classifier_;
147 #ifndef DISABLED_LEGACY_ENGINE
148 fonts_ = other.fonts_;
149 #endif // ndef DISABLED_LEGACY_ENGINE
160 tprintf(
"Baseline diff %g for %d v %d\n",
161 baseline_diff, unichar_id_, other.unichar_id_);
167 double denominator =
ClipToRange(std::min(this_range, other_range),
171 overlap /= denominator;
173 tprintf(
"PosAndSize for %d v %d: bl diff = %g, ranges %g, %g / %g ->%g\n",
174 unichar_id_, other.unichar_id_, baseline_diff,
175 this_range, other_range, denominator, overlap);
184 BLOB_CHOICE_LIST* bc_list) {
186 BLOB_CHOICE_IT choice_it(bc_list);
187 for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
188 choice_it.forward()) {
198 return kPermuterTypeNames[
permuter];
204 switch (script_pos) {
227 if (unicharset.
encode_string(cleaned.c_str(),
true, &encoding, &lengths,
229 lengths.push_back(
'\0');
230 STRING src_lengths = &lengths[0];
231 this->init(cleaned.c_str(), src_lengths.
c_str(), 0.0, 0.0,
NO_PERM);
249 const char *src_lengths,
252 uint8_t src_permuter) {
253 int src_string_len = strlen(src_string);
254 if (src_string_len == 0) {
257 this->
init(src_lengths ? strlen(src_lengths): src_string_len);
260 for (
int i = 0; i < length_; ++i) {
261 int unichar_length = src_lengths ? src_lengths[i] : 1;
263 unicharset_->
unichar_to_id(src_string+offset, unichar_length);
265 certainties_[i] = src_certainty;
266 offset += unichar_length;
269 adjust_factor_ = 1.0f;
270 rating_ = src_rating;
271 certainty_ = src_certainty;
272 permuter_ = src_permuter;
273 dangerous_ambig_found_ =
false;
280 delete[] unichar_ids_;
281 delete[] script_pos_;
283 delete[] certainties_;
287 return kPermuterTypeNames[permuter_];
295 BLOB_CHOICE_LIST* result = ratings->
get(coord.
col, coord.
row);
296 if (result ==
nullptr) {
297 result =
new BLOB_CHOICE_LIST;
298 ratings->
put(coord.
col, coord.
row, result);
307 for (
int i = 0; i < index; ++i)
309 int row = col + state_[index] - 1;
317 unichar_ids_[index] = blob_choice->
unichar_id();
319 state_[index] = blob_count;
320 certainties_[index] = blob_choice->
certainty();
330 for (
int i = 0; i < length_; ++i) {
348 for (
int i = 0; i < num; ++i) {
350 state_[start - 1] += state_[start + i];
351 else if (start + num < length_)
352 state_[start + num] += state_[start + i];
354 for (
int i = start; i + num < length_; ++i) {
355 unichar_ids_[i] = unichar_ids_[i + num];
356 script_pos_[i] = script_pos_[i + num];
357 state_[i] = state_[i + num];
358 certainties_[i] = certainties_[i + num];
369 for (
int i = 0; i < length_ / 2; ++i) {
371 unichar_ids_[i] = unicharset_->
get_mirror(unichar_ids_[length_-1-i]);
372 unichar_ids_[length_-1-i] = unicharset_->
get_mirror(tmp_id);
374 if (length_ % 2 != 0) {
375 unichar_ids_[length_/2] = unicharset_->
get_mirror(unichar_ids_[length_/2]);
389 while (*start <
length() &&
408 while (start < end &&
420 if (end < start) { end = start; }
422 for (
int i = start; i < end; i++) {
424 unichar_ids_[i], state_[i], 0.0f, certainties_[i]);
436 for (i = 0; i < length_; ++i) {
453 STRING *word_lengths_str)
const {
455 if (word_lengths_str !=
nullptr) *word_lengths_str =
"";
456 for (
int i = 0; i < length_; ++i) {
459 if (word_lengths_str !=
nullptr) {
460 *word_lengths_str += strlen(ch);
473 float rating,
float certainty) {
474 if (length_ == reserved_) {
490 while (reserved_ < length_ + second.
length()) {
494 for (
int i = 0; i < second.
length(); ++i) {
495 unichar_ids_[length_ + i] = other_unichar_ids[i];
496 state_[length_ + i] = second.state_[i];
497 certainties_[length_ + i] = second.certainties_[i];
500 length_ += second.
length();
501 if (second.adjust_factor_ > adjust_factor_)
502 adjust_factor_ = second.adjust_factor_;
503 rating_ += second.
rating();
506 if (second.dangerous_ambig_found_)
507 dangerous_ambig_found_ =
true;
525 while (reserved_ < source.
length()) {
529 unicharset_ = source.unicharset_;
531 for (
int i = 0; i < source.
length(); ++i) {
532 unichar_ids_[i] = other_unichar_ids[i];
533 state_[i] = source.state_[i];
534 certainties_[i] = source.certainties_[i];
537 length_ = source.
length();
538 adjust_factor_ = source.adjust_factor_;
539 rating_ = source.
rating();
544 dangerous_ambig_found_ = source.dangerous_ambig_found_;
555 for (
int i = 0; i < length_; ++i)
561 int position_counts[4] = { 0, 0, 0, 0 };
564 for (
int blob_index = 0; blob_index < length_; ++blob_index, ++chunk_index) {
568 if (state_ !=
nullptr) {
569 for (
int i = 1; i < state_[blob_index]; ++i) {
571 tblob = word->
blobs[chunk_index];
580 position_counts[script_pos_[blob_index]]++;
587 tprintf(
"Most characters of %s are subscript or superscript.\n"
588 "That seems wrong, so I'll assume we got the baseline wrong\n",
591 for (
int i = 0; i < length_; i++) {
594 position_counts[sp]--;
605 for (
int blob_index = 0; blob_index < length_; ++blob_index) {
611 chunk_index += state_ !=
nullptr ? state_[blob_index] : 1;
619 if (positions != script_pos_) {
620 delete [] script_pos_;
622 memcpy(script_pos_, positions,
sizeof(positions[0]) *
length);
627 for (
int i = 0; i < length_; ++i)
628 script_pos_[i] = position;
634 const TBOX& blob_box,
637 int top = blob_box.
top();
638 int bottom = blob_box.
bottom();
639 int min_bottom, max_bottom, min_top, max_top;
641 &min_bottom, &max_bottom,
649 }
else if (top < sub_thresh_top && bottom < sub_thresh_bot) {
651 }
else if (bottom > sup_thresh_bot) {
657 tprintf(
"%s Character %s[bot:%d top: %d] "
658 "bot_range[%d,%d] top_range[%d, %d] "
659 "sub_thresh[bot:%d top:%d] sup_thresh_bot %d\n",
662 min_bottom, max_bottom, min_top, max_top,
663 sub_thresh_bot, sub_thresh_top,
672 int *sid =
new int[max_script];
674 for (x = 0; x < max_script; x++) sid[x] = 0;
675 for (x = 0; x < length_; ++x) {
693 for (x = 1; x < max_script; x++)
694 if (sid[x] >= sid[max_sid]) max_sid = x;
695 if (sid[max_sid] < length_ / 2)
703 int total_chunks = 0;
704 for (
int i = 0; i < length_; ++i) {
705 total_chunks += state_[i];
706 if (total_chunks > blob_position) {
715 int total_chunks = 0;
716 for (
int i = 0; i < length_; ++i) {
717 total_chunks += state_[i];
729 for (
int i = 0; i < length_; ++i) {
732 tprintf(
" : R=%g, C=%g, F=%g, Perm=%d, xht=[%g,%g], ambig=%d\n",
733 rating_, certainty_, adjust_factor_, permuter_,
734 min_x_height_, max_x_height_, dangerous_ambig_found_);
736 for (
int i = 0; i < length_; ++i) {
740 for (
int i = 0; i < length_; ++i) {
744 for (
int i = 0; i < length_; ++i) {
748 for (
int i = 0; i < length_; ++i) {
749 tprintf(
"\t%.3f", certainties_[i]);
757 for (
int i = 0; i < length_; ++i)
765 #ifndef GRAPHICS_DISABLED
767 const int kNumColors = 6;
771 bool already_done = prev_drawn_state.
size() == length_;
772 if (!already_done) prev_drawn_state.
init_to_size(length_, 0);
773 for (
int i = 0; i < length_; ++i) {
774 if (prev_drawn_state[i] != state_[i]) {
775 already_done =
false;
777 prev_drawn_state[i] = state_[i];
779 if (already_done || word->
blobs.
empty())
return;
782 if (segm_window ==
nullptr) {
783 segm_window =
new ScrollView(
"Segmentation", 5, 10, 500, 256,
784 2000.0, 256.0,
true);
786 segm_window->
Clear();
791 for (
int c = 0; c < length_; ++c) {
793 static_cast<ScrollView::Color>(c % kNumColors + 3);
794 for (
int i = 0; i < state_[c]; ++i, ++blob_index) {
797 blob->
plot(segm_window, color, color);
811 if (word2.
unicharset() != uchset)
return false;
816 if (w1end - w1start != w2end - w2start)
return false;
817 for (
int i = 0; i < w1end - w1start; i++) {
837 BLOB_CHOICE_LIST *ratings,
839 if (ratings->length() == 0) {
847 c_it.set_to_list(ratings);
848 for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
// Print the choice the iterator currently points at, passing the address of
// the unicharset so the callee can use it when printing.
849 c_it.data()->print(&current_unicharset);
850 if (!c_it.at_last())
tprintf(
"\n");