tesseract  5.0.0-alpha-619-ge9db
ratngs.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: ratngs.cpp (Formerly ratings.c)
3  * Description: Code to manipulate the BLOB_CHOICE and WERD_CHOICE classes.
4  * Author: Ray Smith
5  *
6  * (C) Copyright 1992, Hewlett-Packard Ltd.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *
17  **********************************************************************/
18 
19 
20 #ifdef HAVE_CONFIG_H
21 #include "config_auto.h"
22 #endif
23 
24 #include "ratngs.h"
25 
26 #include <algorithm>
27 #include <string>
28 #include "blobs.h"
29 #include "callcpp.h"
31 #include "matrix.h"
32 #include "normalis.h" // kBlnBaselineOffset.
33 #include "unicharset.h"
34 
36 
39 
// NOTE(review): presumably the rating given to a word that has been marked
// bad (see the make_bad() call in the string constructor) - confirm in
// ratngs.h.
40 const float WERD_CHOICE::kBadRating = 100000.0;
41 // Min offset in baseline-normalized coords to make a character a subscript.
42 const int kMinSubscriptOffset = 20;
43 // Min offset in baseline-normalized coords to make a character a superscript.
44 const int kMinSuperscriptOffset = 20;
45 // Max y of bottom of a drop-cap blob.
46 const int kMaxDropCapBottom = -128;
47 // Max fraction of x-height to use as denominator in measuring x-height overlap.
48 const double kMaxOverlapDenominator = 0.125;
49 // Min fraction of x-height range that should be in agreement for matching
50 // x-heights.
51 const double kMinXHeightMatch = 0.5;
52 // Max tolerance on baseline position as a fraction of x-height for matching
53 // baselines.
54 const double kMaxBaselineDrift = 0.0625;
55 
// Human-readable names for the permuter codes. kPermuterTypeNames is indexed
// directly by the numeric permuter value in WERD_CHOICE::permuter_name(), so
// the order of the entries (see the trailing index comments) is significant.
// NOTE(review): presumably this order mirrors the permuter enum that declares
// NO_PERM and COMPOUND_PERM - confirm before adding or reordering entries.
56 static const char kPermuterTypeNoPerm[] = "None";
57 static const char kPermuterTypePuncPerm[] = "Punctuation";
58 static const char kPermuterTypeTopPerm[] = "Top Choice";
59 static const char kPermuterTypeLowerPerm[] = "Top Lower Case";
60 static const char kPermuterTypeUpperPerm[] = "Top Upper Case";
61 static const char kPermuterTypeNgramPerm[] = "Ngram";
62 static const char kPermuterTypeNumberPerm[] = "Number";
63 static const char kPermuterTypeUserPatPerm[] = "User Pattern";
64 static const char kPermuterTypeSysDawgPerm[] = "System Dictionary";
65 static const char kPermuterTypeDocDawgPerm[] = "Document Dictionary";
66 static const char kPermuterTypeUserDawgPerm[] = "User Dictionary";
67 static const char kPermuterTypeFreqDawgPerm[] = "Frequent Words Dictionary";
68 static const char kPermuterTypeCompoundPerm[] = "Compound";
69 
// Lookup table from permuter code to its display name.
70 static const char * const kPermuterTypeNames[] = {
71  kPermuterTypeNoPerm, // 0
72  kPermuterTypePuncPerm, // 1
73  kPermuterTypeTopPerm, // 2
74  kPermuterTypeLowerPerm, // 3
75  kPermuterTypeUpperPerm, // 4
76  kPermuterTypeNgramPerm, // 5
77  kPermuterTypeNumberPerm, // 6
78  kPermuterTypeUserPatPerm, // 7
79  kPermuterTypeSysDawgPerm, // 8
80  kPermuterTypeDocDawgPerm, // 9
81  kPermuterTypeUserDawgPerm, // 10
82  kPermuterTypeFreqDawgPerm, // 11
83  kPermuterTypeCompoundPerm // 12
84 };
85 
91 BLOB_CHOICE::BLOB_CHOICE(UNICHAR_ID src_unichar_id, // character id
92  float src_rating, // rating
93  float src_cert, // certainty
94  int src_script_id, // script
95  float min_xheight, // min xheight allowed
96  float max_xheight, // max xheight by this char
97  float yshift, // yshift out of position
98  BlobChoiceClassifier c) { // adapted match or other
99  unichar_id_ = src_unichar_id;
100  rating_ = src_rating;
101  certainty_ = src_cert;
102  fontinfo_id_ = -1;
103  fontinfo_id2_ = -1;
104  script_id_ = src_script_id;
105  min_xheight_ = min_xheight;
106  max_xheight_ = max_xheight;
107  yshift_ = yshift;
108  classifier_ = c;
109 }
110 
116 BLOB_CHOICE::BLOB_CHOICE(const BLOB_CHOICE &other) : ELIST_LINK(other) {
117  unichar_id_ = other.unichar_id();
118  rating_ = other.rating();
119  certainty_ = other.certainty();
120  fontinfo_id_ = other.fontinfo_id();
121  fontinfo_id2_ = other.fontinfo_id2();
122  script_id_ = other.script_id();
123  matrix_cell_ = other.matrix_cell_;
124  min_xheight_ = other.min_xheight_;
125  max_xheight_ = other.max_xheight_;
126  yshift_ = other.yshift();
127  classifier_ = other.classifier_;
128 #ifndef DISABLED_LEGACY_ENGINE
129  fonts_ = other.fonts_;
130 #endif // ndef DISABLED_LEGACY_ENGINE
131 }
132 
133 // Copy assignment operator.
134 BLOB_CHOICE& BLOB_CHOICE::operator=(const BLOB_CHOICE& other) {
135  ELIST_LINK::operator=(other);
136  unichar_id_ = other.unichar_id();
137  rating_ = other.rating();
138  certainty_ = other.certainty();
139  fontinfo_id_ = other.fontinfo_id();
140  fontinfo_id2_ = other.fontinfo_id2();
141  script_id_ = other.script_id();
142  matrix_cell_ = other.matrix_cell_;
143  min_xheight_ = other.min_xheight_;
144  max_xheight_ = other.max_xheight_;
145  yshift_ = other.yshift();
146  classifier_ = other.classifier_;
147 #ifndef DISABLED_LEGACY_ENGINE
148  fonts_ = other.fonts_;
149 #endif // ndef DISABLED_LEGACY_ENGINE
150  return *this;
151 }
152 
153 // Returns true if *this and other agree on the baseline and x-height
154 // to within some tolerance based on a given estimate of the x-height.
155 bool BLOB_CHOICE::PosAndSizeAgree(const BLOB_CHOICE& other, float x_height,
156  bool debug) const {
157  double baseline_diff = fabs(yshift() - other.yshift());
158  if (baseline_diff > kMaxBaselineDrift * x_height) {
159  if (debug) {
160  tprintf("Baseline diff %g for %d v %d\n",
161  baseline_diff, unichar_id_, other.unichar_id_);
162  }
163  return false;
164  }
165  double this_range = max_xheight() - min_xheight();
166  double other_range = other.max_xheight() - other.min_xheight();
167  double denominator = ClipToRange(std::min(this_range, other_range),
168  1.0, kMaxOverlapDenominator * x_height);
169  double overlap = std::min(max_xheight(), other.max_xheight()) -
170  std::max(min_xheight(), other.min_xheight());
171  overlap /= denominator;
172  if (debug) {
173  tprintf("PosAndSize for %d v %d: bl diff = %g, ranges %g, %g / %g ->%g\n",
174  unichar_id_, other.unichar_id_, baseline_diff,
175  this_range, other_range, denominator, overlap);
176  }
177 
178  return overlap >= kMinXHeightMatch;
179 }
180 
181 // Helper to find the BLOB_CHOICE in the bc_list that matches the given
182 // unichar_id, or nullptr if there is no match.
184  BLOB_CHOICE_LIST* bc_list) {
185  // Find the corresponding best BLOB_CHOICE.
186  BLOB_CHOICE_IT choice_it(bc_list);
187  for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
188  choice_it.forward()) {
189  BLOB_CHOICE* choice = choice_it.data();
190  if (choice->unichar_id() == char_id) {
191  return choice;
192  }
193  }
194  return nullptr;
195 }
196 
197 const char *WERD_CHOICE::permuter_name(uint8_t permuter) {
198  return kPermuterTypeNames[permuter];
199 }
200 
201 namespace tesseract {
202 
203 const char *ScriptPosToString(enum ScriptPos script_pos) {
204  switch (script_pos) {
205  case SP_NORMAL: return "NORM";
206  case SP_SUBSCRIPT: return "SUB";
207  case SP_SUPERSCRIPT: return "SUPER";
208  case SP_DROPCAP: return "DROPC";
209  }
210  return "SP_UNKNOWN";
211 }
212 
213 } // namespace tesseract.
214 
221 WERD_CHOICE::WERD_CHOICE(const char *src_string,
222  const UNICHARSET &unicharset)
223  : unicharset_(&unicharset){
224  GenericVector<UNICHAR_ID> encoding;
225  GenericVector<char> lengths;
226  std::string cleaned = unicharset.CleanupString(src_string);
227  if (unicharset.encode_string(cleaned.c_str(), true, &encoding, &lengths,
228  nullptr)) {
229  lengths.push_back('\0');
230  STRING src_lengths = &lengths[0];
231  this->init(cleaned.c_str(), src_lengths.c_str(), 0.0, 0.0, NO_PERM);
232  } else { // There must have been an invalid unichar in the string.
233  this->init(8);
234  this->make_bad();
235  }
236 }
237 
248 void WERD_CHOICE::init(const char *src_string,
249  const char *src_lengths,
250  float src_rating,
251  float src_certainty,
252  uint8_t src_permuter) {
253  int src_string_len = strlen(src_string);
254  if (src_string_len == 0) {
255  this->init(8);
256  } else {
257  this->init(src_lengths ? strlen(src_lengths): src_string_len);
258  length_ = reserved_;
259  int offset = 0;
260  for (int i = 0; i < length_; ++i) {
261  int unichar_length = src_lengths ? src_lengths[i] : 1;
262  unichar_ids_[i] =
263  unicharset_->unichar_to_id(src_string+offset, unichar_length);
264  state_[i] = 1;
265  certainties_[i] = src_certainty;
266  offset += unichar_length;
267  }
268  }
269  adjust_factor_ = 1.0f;
270  rating_ = src_rating;
271  certainty_ = src_certainty;
272  permuter_ = src_permuter;
273  dangerous_ambig_found_ = false;
274 }
275 
280  delete[] unichar_ids_;
281  delete[] script_pos_;
282  delete[] state_;
283  delete[] certainties_;
284 }
285 
286 const char *WERD_CHOICE::permuter_name() const {
287  return kPermuterTypeNames[permuter_];
288 }
289 
290 // Returns the BLOB_CHOICE_LIST corresponding to the given index in the word,
291 // taken from the appropriate cell in the ratings MATRIX.
292 // Borrowed pointer, so do not delete.
293 BLOB_CHOICE_LIST* WERD_CHOICE::blob_choices(int index, MATRIX* ratings) const {
294  MATRIX_COORD coord = MatrixCoord(index);
295  BLOB_CHOICE_LIST* result = ratings->get(coord.col, coord.row);
296  if (result == nullptr) {
297  result = new BLOB_CHOICE_LIST;
298  ratings->put(coord.col, coord.row, result);
299  }
300  return result;
301 }
302 
303 // Returns the MATRIX_COORD corresponding to the location in the ratings
304 // MATRIX for the given index into the word.
305 MATRIX_COORD WERD_CHOICE::MatrixCoord(int index) const {
306  int col = 0;
307  for (int i = 0; i < index; ++i)
308  col += state_[i];
309  int row = col + state_[index] - 1;
310  return MATRIX_COORD(col, row);
311 }
312 
313 // Sets the entries for the given index from the BLOB_CHOICE, assuming
314 // unit fragment lengths, but setting the state for this index to blob_count.
315 void WERD_CHOICE::set_blob_choice(int index, int blob_count,
316  const BLOB_CHOICE* blob_choice) {
317  unichar_ids_[index] = blob_choice->unichar_id();
318  script_pos_[index] = tesseract::SP_NORMAL;
319  state_[index] = blob_count;
320  certainties_[index] = blob_choice->certainty();
321 }
322 
323 
329 bool WERD_CHOICE::contains_unichar_id(UNICHAR_ID unichar_id) const {
330  for (int i = 0; i < length_; ++i) {
331  if (unichar_ids_[i] == unichar_id) {
332  return true;
333  }
334  }
335  return false;
336 }
337 
345 void WERD_CHOICE::remove_unichar_ids(int start, int num) {
346  ASSERT_HOST(start >= 0 && start + num <= length_);
347  // Accumulate the states to account for the merged blobs.
348  for (int i = 0; i < num; ++i) {
349  if (start > 0)
350  state_[start - 1] += state_[start + i];
351  else if (start + num < length_)
352  state_[start + num] += state_[start + i];
353  }
354  for (int i = start; i + num < length_; ++i) {
355  unichar_ids_[i] = unichar_ids_[i + num];
356  script_pos_[i] = script_pos_[i + num];
357  state_[i] = state_[i + num];
358  certainties_[i] = certainties_[i + num];
359  }
360  length_ -= num;
361 }
362 
369  for (int i = 0; i < length_ / 2; ++i) {
370  UNICHAR_ID tmp_id = unichar_ids_[i];
371  unichar_ids_[i] = unicharset_->get_mirror(unichar_ids_[length_-1-i]);
372  unichar_ids_[length_-1-i] = unicharset_->get_mirror(tmp_id);
373  }
374  if (length_ % 2 != 0) {
375  unichar_ids_[length_/2] = unicharset_->get_mirror(unichar_ids_[length_/2]);
376  }
377 }
378 
386 void WERD_CHOICE::punct_stripped(int *start, int *end) const {
387  *start = 0;
388  *end = length() - 1;
389  while (*start < length() &&
390  unicharset()->get_ispunctuation(unichar_id(*start))) {
391  (*start)++;
392  }
393  while (*end > -1 &&
394  unicharset()->get_ispunctuation(unichar_id(*end))) {
395  (*end)--;
396  }
397  (*end)++;
398 }
399 
400 void WERD_CHOICE::GetNonSuperscriptSpan(int *pstart, int *pend) const {
401  int end = length();
402  while (end > 0 &&
403  unicharset_->get_isdigit(unichar_ids_[end - 1]) &&
405  end--;
406  }
407  int start = 0;
408  while (start < end &&
409  unicharset_->get_isdigit(unichar_ids_[start]) &&
411  start++;
412  }
413  *pstart = start;
414  *pend = end;
415 }
416 
417 WERD_CHOICE WERD_CHOICE::shallow_copy(int start, int end) const {
418  ASSERT_HOST(start >= 0 && start <= length_);
419  ASSERT_HOST(end >= 0 && end <= length_);
420  if (end < start) { end = start; }
421  WERD_CHOICE retval(unicharset_, end - start);
422  for (int i = start; i < end; i++) {
424  unichar_ids_[i], state_[i], 0.0f, certainties_[i]);
425  }
426  return retval;
427 }
428 
434 bool WERD_CHOICE::has_rtl_unichar_id() const {
435  int i;
436  for (i = 0; i < length_; ++i) {
437  UNICHARSET::Direction dir = unicharset_->get_direction(unichar_ids_[i]);
438  if (dir == UNICHARSET::U_RIGHT_TO_LEFT ||
440  return true;
441  }
442  }
443  return false;
444 }
445 
453  STRING *word_lengths_str) const {
454  *word_str = "";
455  if (word_lengths_str != nullptr) *word_lengths_str = "";
456  for (int i = 0; i < length_; ++i) {
457  const char *ch = unicharset_->id_to_unichar_ext(unichar_ids_[i]);
458  *word_str += ch;
459  if (word_lengths_str != nullptr) {
460  *word_lengths_str += strlen(ch);
461  }
462  }
463 }
464 
472  UNICHAR_ID unichar_id, int blob_count,
473  float rating, float certainty) {
474  if (length_ == reserved_) {
475  this->double_the_size();
476  }
477  this->append_unichar_id_space_allocated(unichar_id, blob_count,
478  rating, certainty);
479 }
480 
489  ASSERT_HOST(unicharset_ == second.unicharset_);
490  while (reserved_ < length_ + second.length()) {
491  this->double_the_size();
492  }
493  const UNICHAR_ID *other_unichar_ids = second.unichar_ids();
494  for (int i = 0; i < second.length(); ++i) {
495  unichar_ids_[length_ + i] = other_unichar_ids[i];
496  state_[length_ + i] = second.state_[i];
497  certainties_[length_ + i] = second.certainties_[i];
498  script_pos_[length_ + i] = second.BlobPosition(i);
499  }
500  length_ += second.length();
501  if (second.adjust_factor_ > adjust_factor_)
502  adjust_factor_ = second.adjust_factor_;
503  rating_ += second.rating(); // add ratings
504  if (second.certainty() < certainty_) // take min
505  certainty_ = second.certainty();
506  if (second.dangerous_ambig_found_)
507  dangerous_ambig_found_ = true;
508  if (permuter_ == NO_PERM) {
509  permuter_ = second.permuter();
510  } else if (second.permuter() != NO_PERM &&
511  second.permuter() != permuter_) {
512  permuter_ = COMPOUND_PERM;
513  }
514  return *this;
515 }
516 
517 
525  while (reserved_ < source.length()) {
526  this->double_the_size();
527  }
528 
529  unicharset_ = source.unicharset_;
530  const UNICHAR_ID *other_unichar_ids = source.unichar_ids();
531  for (int i = 0; i < source.length(); ++i) {
532  unichar_ids_[i] = other_unichar_ids[i];
533  state_[i] = source.state_[i];
534  certainties_[i] = source.certainties_[i];
535  script_pos_[i] = source.BlobPosition(i);
536  }
537  length_ = source.length();
538  adjust_factor_ = source.adjust_factor_;
539  rating_ = source.rating();
540  certainty_ = source.certainty();
541  min_x_height_ = source.min_x_height();
542  max_x_height_ = source.max_x_height();
543  permuter_ = source.permuter();
544  dangerous_ambig_found_ = source.dangerous_ambig_found_;
545  return *this;
546 }
547 
548 // Sets up the script_pos_ member using the blobs_list to get the bln
549 // bounding boxes, *this to get the unichars, and this->unicharset
550 // to get the target positions. If small_caps is true, sub/super are not
551 // considered, but dropcaps are.
552 // NOTE: blobs_list should be the chopped_word blobs. (Fully segmented.)
553 void WERD_CHOICE::SetScriptPositions(bool small_caps, TWERD* word, int debug) {
554  // Initialize to normal.
555  for (int i = 0; i < length_; ++i)
556  script_pos_[i] = tesseract::SP_NORMAL;
557  if (word->blobs.empty() || word->NumBlobs() != TotalOfStates()) {
558  return;
559  }
560 
561  int position_counts[4] = { 0, 0, 0, 0 };
562 
563  int chunk_index = 0;
564  for (int blob_index = 0; blob_index < length_; ++blob_index, ++chunk_index) {
565  TBLOB* tblob = word->blobs[chunk_index];
566  int uni_id = unichar_id(blob_index);
567  TBOX blob_box = tblob->bounding_box();
568  if (state_ != nullptr) {
569  for (int i = 1; i < state_[blob_index]; ++i) {
570  ++chunk_index;
571  tblob = word->blobs[chunk_index];
572  blob_box += tblob->bounding_box();
573  }
574  }
575  script_pos_[blob_index] = ScriptPositionOf(false, *unicharset_, blob_box,
576  uni_id);
577  if (small_caps && script_pos_[blob_index] != tesseract::SP_DROPCAP) {
578  script_pos_[blob_index] = tesseract::SP_NORMAL;
579  }
580  position_counts[script_pos_[blob_index]]++;
581  }
582  // If almost everything looks like a superscript or subscript,
583  // we most likely just got the baseline wrong.
584  if (position_counts[tesseract::SP_SUBSCRIPT] > 0.75 * length_ ||
585  position_counts[tesseract::SP_SUPERSCRIPT] > 0.75 * length_) {
586  if (debug >= 2) {
587  tprintf("Most characters of %s are subscript or superscript.\n"
588  "That seems wrong, so I'll assume we got the baseline wrong\n",
589  unichar_string().c_str());
590  }
591  for (int i = 0; i < length_; i++) {
592  ScriptPos sp = script_pos_[i];
594  position_counts[sp]--;
595  position_counts[tesseract::SP_NORMAL]++;
596  script_pos_[i] = tesseract::SP_NORMAL;
597  }
598  }
599  }
600 
601  if ((debug >= 1 && position_counts[tesseract::SP_NORMAL] < length_) ||
602  debug >= 2) {
603  tprintf("SetScriptPosition on %s\n", unichar_string().c_str());
604  int chunk_index = 0;
605  for (int blob_index = 0; blob_index < length_; ++blob_index) {
606  if (debug >= 2 || script_pos_[blob_index] != tesseract::SP_NORMAL) {
607  TBLOB* tblob = word->blobs[chunk_index];
608  ScriptPositionOf(true, *unicharset_, tblob->bounding_box(),
609  unichar_id(blob_index));
610  }
611  chunk_index += state_ != nullptr ? state_[blob_index] : 1;
612  }
613  }
614 }
615 // Sets the script_pos_ member from some source positions with a given length.
617  int length) {
618  ASSERT_HOST(length == length_);
619  if (positions != script_pos_) {
620  delete [] script_pos_;
621  script_pos_ = new ScriptPos[length];
622  memcpy(script_pos_, positions, sizeof(positions[0]) * length);
623  }
624 }
625 // Sets all the script_pos_ positions to the given position.
627  for (int i = 0; i < length_; ++i)
628  script_pos_[i] = position;
629 }
630 
631 /* static */
633  const UNICHARSET& unicharset,
634  const TBOX& blob_box,
635  UNICHAR_ID unichar_id) {
637  int top = blob_box.top();
638  int bottom = blob_box.bottom();
639  int min_bottom, max_bottom, min_top, max_top;
641  &min_bottom, &max_bottom,
642  &min_top, &max_top);
643 
644  int sub_thresh_top = min_top - kMinSubscriptOffset;
645  int sub_thresh_bot = kBlnBaselineOffset - kMinSubscriptOffset;
646  int sup_thresh_bot = max_bottom + kMinSuperscriptOffset;
647  if (bottom <= kMaxDropCapBottom) {
648  retval = tesseract::SP_DROPCAP;
649  } else if (top < sub_thresh_top && bottom < sub_thresh_bot) {
650  retval = tesseract::SP_SUBSCRIPT;
651  } else if (bottom > sup_thresh_bot) {
652  retval = tesseract::SP_SUPERSCRIPT;
653  }
654 
655  if (print_debug) {
656  const char *pos = ScriptPosToString(retval);
657  tprintf("%s Character %s[bot:%d top: %d] "
658  "bot_range[%d,%d] top_range[%d, %d] "
659  "sub_thresh[bot:%d top:%d] sup_thresh_bot %d\n",
661  bottom, top,
662  min_bottom, max_bottom, min_top, max_top,
663  sub_thresh_bot, sub_thresh_top,
664  sup_thresh_bot);
665  }
666  return retval;
667 }
668 
669 // Returns the script-id (eg Han) of the dominant script in the word.
670 int WERD_CHOICE::GetTopScriptID() const {
671  int max_script = unicharset_->get_script_table_size();
672  int *sid = new int[max_script];
673  int x;
674  for (x = 0; x < max_script; x++) sid[x] = 0;
675  for (x = 0; x < length_; ++x) {
676  int script_id = unicharset_->get_script(unichar_id(x));
677  sid[script_id]++;
678  }
679  if (unicharset_->han_sid() != unicharset_->null_sid()) {
680  // Add the Hiragana & Katakana counts to Han and zero them out.
681  if (unicharset_->hiragana_sid() != unicharset_->null_sid()) {
682  sid[unicharset_->han_sid()] += sid[unicharset_->hiragana_sid()];
683  sid[unicharset_->hiragana_sid()] = 0;
684  }
685  if (unicharset_->katakana_sid() != unicharset_->null_sid()) {
686  sid[unicharset_->han_sid()] += sid[unicharset_->katakana_sid()];
687  sid[unicharset_->katakana_sid()] = 0;
688  }
689  }
690  // Note that high script ID overrides lower one on a tie, thus biasing
691  // towards non-Common script (if sorted that way in unicharset file).
692  int max_sid = 0;
693  for (x = 1; x < max_script; x++)
694  if (sid[x] >= sid[max_sid]) max_sid = x;
695  if (sid[max_sid] < length_ / 2)
696  max_sid = unicharset_->null_sid();
697  delete[] sid;
698  return max_sid;
699 }
700 
701 // Fixes the state_ for a chop at the given blob_posiiton.
702 void WERD_CHOICE::UpdateStateForSplit(int blob_position) {
703  int total_chunks = 0;
704  for (int i = 0; i < length_; ++i) {
705  total_chunks += state_[i];
706  if (total_chunks > blob_position) {
707  ++state_[i];
708  return;
709  }
710  }
711 }
712 
713 // Returns the sum of all the state elements, being the total number of blobs.
714 int WERD_CHOICE::TotalOfStates() const {
715  int total_chunks = 0;
716  for (int i = 0; i < length_; ++i) {
717  total_chunks += state_[i];
718  }
719  return total_chunks;
720 }
721 
727 void WERD_CHOICE::print(const char *msg) const {
728  tprintf("%s : ", msg);
729  for (int i = 0; i < length_; ++i) {
730  tprintf("%s", unicharset_->id_to_unichar(unichar_ids_[i]));
731  }
732  tprintf(" : R=%g, C=%g, F=%g, Perm=%d, xht=[%g,%g], ambig=%d\n",
733  rating_, certainty_, adjust_factor_, permuter_,
734  min_x_height_, max_x_height_, dangerous_ambig_found_);
735  tprintf("pos");
736  for (int i = 0; i < length_; ++i) {
737  tprintf("\t%s", ScriptPosToString(script_pos_[i]));
738  }
739  tprintf("\nstr");
740  for (int i = 0; i < length_; ++i) {
741  tprintf("\t%s", unicharset_->id_to_unichar(unichar_ids_[i]));
742  }
743  tprintf("\nstate:");
744  for (int i = 0; i < length_; ++i) {
745  tprintf("\t%d ", state_[i]);
746  }
747  tprintf("\nC");
748  for (int i = 0; i < length_; ++i) {
749  tprintf("\t%.3f", certainties_[i]);
750  }
751  tprintf("\n");
752 }
753 
754 // Prints the segmentation state with an introductory message.
755 void WERD_CHOICE::print_state(const char *msg) const {
756  tprintf("%s", msg);
757  for (int i = 0; i < length_; ++i)
758  tprintf(" %d", state_[i]);
759  tprintf("\n");
760 }
761 
762 // Displays the segmentation state of *this (if not the same as the last
763 // one displayed) and waits for a click in the window.
765 #ifndef GRAPHICS_DISABLED
766  // Number of different colors to draw with.
767  const int kNumColors = 6;
768  static ScrollView *segm_window = nullptr;
769  // Check the state against the static prev_drawn_state.
770  static GenericVector<int> prev_drawn_state;
771  bool already_done = prev_drawn_state.size() == length_;
772  if (!already_done) prev_drawn_state.init_to_size(length_, 0);
773  for (int i = 0; i < length_; ++i) {
774  if (prev_drawn_state[i] != state_[i]) {
775  already_done = false;
776  }
777  prev_drawn_state[i] = state_[i];
778  }
779  if (already_done || word->blobs.empty()) return;
780 
781  // Create the window if needed.
782  if (segm_window == nullptr) {
783  segm_window = new ScrollView("Segmentation", 5, 10, 500, 256,
784  2000.0, 256.0, true);
785  } else {
786  segm_window->Clear();
787  }
788 
789  TBOX bbox;
790  int blob_index = 0;
791  for (int c = 0; c < length_; ++c) {
792  auto color =
793  static_cast<ScrollView::Color>(c % kNumColors + 3);
794  for (int i = 0; i < state_[c]; ++i, ++blob_index) {
795  TBLOB* blob = word->blobs[blob_index];
796  bbox += blob->bounding_box();
797  blob->plot(segm_window, color, color);
798  }
799  }
800  segm_window->ZoomToRectangle(bbox.left(), bbox.top(),
801  bbox.right(), bbox.bottom());
802  segm_window->Update();
803  window_wait(segm_window);
804 #endif
805 }
806 
807 
809  const WERD_CHOICE &word2) {
810  const UNICHARSET *uchset = word1.unicharset();
811  if (word2.unicharset() != uchset) return false;
812  int w1start, w1end;
813  word1.punct_stripped(&w1start, &w1end);
814  int w2start, w2end;
815  word2.punct_stripped(&w2start, &w2end);
816  if (w1end - w1start != w2end - w2start) return false;
817  for (int i = 0; i < w1end - w1start; i++) {
818  if (uchset->to_lower(word1.unichar_id(w1start + i)) !=
819  uchset->to_lower(word2.unichar_id(w2start + i))) {
820  return false;
821  }
822  }
823  return true;
824 }
825 
836 void print_ratings_list(const char *msg,
837  BLOB_CHOICE_LIST *ratings,
838  const UNICHARSET &current_unicharset) {
839  if (ratings->length() == 0) {
840  tprintf("%s:<none>\n", msg);
841  return;
842  }
843  if (*msg != '\0') {
844  tprintf("%s\n", msg);
845  }
846  BLOB_CHOICE_IT c_it;
847  c_it.set_to_list(ratings);
848  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
849  c_it.data()->print(&current_unicharset);
850  if (!c_it.at_last()) tprintf("\n");
851  }
852  tprintf("\n");
853  fflush(stdout);
854 }
string
std::string string
Definition: equationdetect_test.cc:21
UNICHARSET::get_direction
Direction get_direction(UNICHAR_ID unichar_id) const
Definition: unicharset.h:680
WERD_CHOICE::unichar_string
const STRING & unichar_string() const
Definition: ratngs.h:529
ClipToRange
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
Definition: helpers.h:106
ScrollView
Definition: scrollview.h:97
normalis.h
WERD_CHOICE::shallow_copy
WERD_CHOICE shallow_copy(int start, int end) const
Definition: ratngs.cpp:416
BlobChoiceClassifier
BlobChoiceClassifier
Definition: ratngs.h:41
UNICHARSET::encode_string
bool encode_string(const char *str, bool give_up_on_failure, GenericVector< UNICHAR_ID > *encoding, GenericVector< char > *lengths, int *encoded_length) const
Definition: unicharset.cpp:258
WERD_CHOICE::unichar_id
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:303
UNICHARSET::id_to_unichar_ext
const char * id_to_unichar_ext(UNICHAR_ID id) const
Definition: unicharset.cpp:298
UNICHARSET::get_isdigit
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:502
WERD_CHOICE
Definition: ratngs.h:261
TWERD
Definition: blobs.h:416
ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:87
BLOB_CHOICE::min_xheight
float min_xheight() const
Definition: ratngs.h:118
WERD_CHOICE::GetTopScriptID
int GetTopScriptID() const
Definition: ratngs.cpp:669
BLOB_CHOICE::certainty
float certainty() const
Definition: ratngs.h:81
WERD_CHOICE::TotalOfStates
int TotalOfStates() const
Definition: ratngs.cpp:713
TBLOB::plot
void plot(ScrollView *window, ScrollView::Color color, ScrollView::Color child_color)
Definition: blobs.cpp:508
WERD_CHOICE::contains_unichar_id
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: ratngs.cpp:328
FindMatchingChoice
BLOB_CHOICE * FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list)
Definition: ratngs.cpp:182
MATRIX
Definition: matrix.h:574
WERD_CHOICE::certainty
float certainty() const
Definition: ratngs.h:318
NO_PERM
Definition: ratngs.h:231
TBOX::top
int16_t top() const
Definition: rect.h:57
STRING
Definition: strngs.h:45
ScrollView::Clear
void Clear()
Definition: scrollview.cpp:588
WERD_CHOICE::permuter
uint8_t permuter() const
Definition: ratngs.h:334
WERD_CHOICE::operator+=
WERD_CHOICE & operator+=(const WERD_CHOICE &second)
Definition: ratngs.cpp:487
BLOB_CHOICE::BLOB_CHOICE
BLOB_CHOICE()
Definition: ratngs.h:52
COMPOUND_PERM
Definition: ratngs.h:243
kMinSuperscriptOffset
const int kMinSuperscriptOffset
Definition: ratngs.cpp:43
WERD_CHOICE::append_unichar_id_space_allocated
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.h:440
WERD_CHOICE::unicharset
const UNICHARSET * unicharset() const
Definition: ratngs.h:288
WERD_CHOICE::kBadRating
static const float kBadRating
Definition: ratngs.h:263
BLOB_CHOICE::script_id
int script_id() const
Definition: ratngs.h:112
EqualIgnoringCaseAndTerminalPunct
bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1, const WERD_CHOICE &word2)
Definition: ratngs.cpp:807
UNICHARSET::katakana_sid
int katakana_sid() const
Definition: unicharset.h:881
WERD_CHOICE::BlobPosition
tesseract::ScriptPos BlobPosition(int index) const
Definition: ratngs.h:310
ScrollView::ZoomToRectangle
void ZoomToRectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:755
blobs.h
BLOB_CHOICE::unichar_id
UNICHAR_ID unichar_id() const
Definition: ratngs.h:75
UNICHARSET::get_script
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:653
ratngs.h
WERD_CHOICE::reverse_and_mirror_unichar_ids
void reverse_and_mirror_unichar_ids()
Definition: ratngs.cpp:367
tesseract::SP_SUBSCRIPT
Definition: ratngs.h:252
WERD_CHOICE::init
void init(int reserved)
Definition: ratngs.h:397
WERD_CHOICE::string_and_lengths
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:451
genericvector.h
WERD_CHOICE::SetScriptPositions
void SetScriptPositions(bool small_caps, TWERD *word, int debug=0)
Definition: ratngs.cpp:552
WERD_CHOICE::punct_stripped
void punct_stripped(int *start_core, int *end_core) const
Definition: ratngs.cpp:385
tesseract::ScriptPos
ScriptPos
Definition: ratngs.h:250
WERD_CHOICE::min_x_height
float min_x_height() const
Definition: ratngs.h:324
UNICHARSET::to_lower
UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:694
STRING::c_str
const char * c_str() const
Definition: strngs.cpp:192
unicharset.h
UNICHARSET::get_top_bottom
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:558
UNICHARSET::null_sid
int null_sid() const
Definition: unicharset.h:874
WERD_CHOICE::MatrixCoord
MATRIX_COORD MatrixCoord(int index) const
Definition: ratngs.cpp:304
WERD_CHOICE::double_the_size
void double_the_size()
Make more space in unichar_id_ and fragment_lengths_ arrays.
Definition: ratngs.h:375
UNICHARSET::han_sid
int han_sid() const
Definition: unicharset.h:879
UNICHARSET::unichar_to_id
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209
GENERIC_2D_ARRAY::get
T get(ICOORD pos) const
Definition: matrix.h:227
matrix.h
WERD_CHOICE::~WERD_CHOICE
~WERD_CHOICE()
Definition: ratngs.cpp:278
TWERD::blobs
GenericVector< TBLOB * > blobs
Definition: blobs.h:457
BLOB_CHOICE::max_xheight
float max_xheight() const
Definition: ratngs.h:121
GenericVector::empty
bool empty() const
Definition: genericvector.h:86
WERD_CHOICE::UpdateStateForSplit
void UpdateStateForSplit(int blob_position)
Definition: ratngs.cpp:701
kMinXHeightMatch
const double kMinXHeightMatch
Definition: ratngs.cpp:50
UNICHARSET
Definition: unicharset.h:145
UNICHARSET::hiragana_sid
int hiragana_sid() const
Definition: unicharset.h:880
TBOX::bottom
int16_t bottom() const
Definition: rect.h:64
UNICHARSET::get_mirror
UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const
Definition: unicharset.h:687
WERD_CHOICE::GetNonSuperscriptSpan
void GetNonSuperscriptSpan(int *start, int *end) const
Definition: ratngs.cpp:399
UNICHARSET::CleanupString
static std::string CleanupString(const char *utf8_str)
Definition: unicharset.h:246
tesseract::SP_NORMAL
Definition: ratngs.h:251
tesseract
Definition: baseapi.h:65
ELIST_LINK::operator=
void operator=(const ELIST_LINK &)
Definition: elst.h:134
kMaxOverlapDenominator
const double kMaxOverlapDenominator
Definition: ratngs.cpp:47
unicharset_
UNICHARSET unicharset_
Definition: unicharcompress_test.cc:167
WERD_CHOICE::DisplaySegmentation
void DisplaySegmentation(TWERD *word)
Definition: ratngs.cpp:763
callcpp.h
kMaxDropCapBottom
const int kMaxDropCapBottom
Definition: ratngs.cpp:45
tesseract::SP_DROPCAP
Definition: ratngs.h:254
BLOB_CHOICE::rating
float rating() const
Definition: ratngs.h:78
UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:36
TBLOB::bounding_box
TBOX bounding_box() const
Definition: blobs.cpp:466
GenericVector< UNICHAR_ID >
UNICHARSET::get_script_table_size
int get_script_table_size() const
Definition: unicharset.h:839
WERD_CHOICE::has_rtl_unichar_id
bool has_rtl_unichar_id() const
Definition: ratngs.cpp:433
BLOB_CHOICE::fontinfo_id
int16_t fontinfo_id() const
Definition: ratngs.h:84
WERD_CHOICE::print_state
void print_state(const char *msg) const
Definition: ratngs.cpp:754
BLOB_CHOICE::PosAndSizeAgree
bool PosAndSizeAgree(const BLOB_CHOICE &other, float x_height, bool debug) const
Definition: ratngs.cpp:154
WERD_CHOICE::operator=
WERD_CHOICE & operator=(const WERD_CHOICE &source)
Definition: ratngs.cpp:523
WERD_CHOICE::print
void print() const
Definition: ratngs.h:568
BLOB_CHOICE::fontinfo_id2
int16_t fontinfo_id2() const
Definition: ratngs.h:87
WERD_CHOICE::length
int length() const
Definition: ratngs.h:291
BLOB_CHOICE
Definition: ratngs.h:49
MATRIX_COORD
Definition: matrix.h:604
TBLOB
Definition: blobs.h:282
BLOB_CHOICE::yshift
float yshift() const
Definition: ratngs.h:124
WERD_CHOICE::SetAllScriptPositions
void SetAllScriptPositions(tesseract::ScriptPos position)
Definition: ratngs.cpp:625
TBOX::left
int16_t left() const
Definition: rect.h:71
UNICHARSET::Direction
Direction
Definition: unicharset.h:156
tesseract::SP_SUPERSCRIPT
Definition: ratngs.h:253
UNICHARSET::U_RIGHT_TO_LEFT_ARABIC
Definition: unicharset.h:170
MATRIX_COORD::col
int col
Definition: matrix.h:632
print_ratings_list
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:835
TBOX::right
int16_t right() const
Definition: rect.h:78
GenericVector::init_to_size
void init_to_size(int size, const T &t)
Definition: genericvector.h:706
GENERIC_2D_ARRAY::put
void put(ICOORD pos, const T &thing)
Definition: matrix.h:219
WERD_CHOICE::permuter_name
const char * permuter_name() const
Definition: ratngs.cpp:285
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
WERD_CHOICE::WERD_CHOICE
WERD_CHOICE(const UNICHARSET *unicharset)
Definition: ratngs.h:266
ScrollView::Update
static void Update()
Definition: scrollview.cpp:708
WERD_CHOICE::ScriptPositionOf
static tesseract::ScriptPos ScriptPositionOf(bool print_debug, const UNICHARSET &unicharset, const TBOX &blob_box, UNICHAR_ID unichar_id)
Definition: ratngs.cpp:631
ELIST_LINK
Definition: elst.h:74
MATRIX_COORD::row
int row
Definition: matrix.h:633
WERD_CHOICE::remove_unichar_ids
void remove_unichar_ids(int index, int num)
Definition: ratngs.cpp:344
WERD_CHOICE::rating
float rating() const
Definition: ratngs.h:315
WERD_CHOICE::unichar_ids
const UNICHAR_ID * unichar_ids() const
Definition: ratngs.h:300
UNICHARSET::id_to_unichar
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:290
GenericVector::size
int size() const
Definition: genericvector.h:71
window_wait
char window_wait(ScrollView *win)
Definition: callcpp.cpp:103
WERD_CHOICE::set_blob_choice
void set_blob_choice(int index, int blob_count, const BLOB_CHOICE *blob_choice)
Definition: ratngs.cpp:314
kBlnBaselineOffset
const int kBlnBaselineOffset
Definition: normalis.h:24
WERD_CHOICE::append_unichar_id
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.cpp:470
WERD_CHOICE::blob_choices
BLOB_CHOICE_LIST * blob_choices(int index, MATRIX *ratings) const
Definition: ratngs.cpp:292
ELISTIZE
#define ELISTIZE(CLASSNAME)
Definition: elst.h:919
TWERD::NumBlobs
int NumBlobs() const
Definition: blobs.h:446
UNICHARSET::U_RIGHT_TO_LEFT
Definition: unicharset.h:158
tesseract::ScriptPosToString
const char * ScriptPosToString(enum ScriptPos script_pos)
Definition: ratngs.cpp:202
kMaxBaselineDrift
const double kMaxBaselineDrift
Definition: ratngs.cpp:53
kMinSubscriptOffset
const int kMinSubscriptOffset
Definition: ratngs.cpp:41
TBOX
Definition: rect.h:33
WERD_CHOICE::max_x_height
float max_x_height() const
Definition: ratngs.h:327