tesseract  5.0.0-alpha-619-ge9db
superscript.cpp
Go to the documentation of this file.
1 /******************************************************************
2  * File: superscript.cpp
3  * Description: Correction pass to fix superscripts and subscripts.
4  * Author: David Eger
5  *
6  * (C) Copyright 2012, Google, Inc.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *
17  **********************************************************************/
18 
19 #include "normalis.h"
20 #include "tesseractclass.h"
21 
22 static int LeadingUnicharsToChopped(WERD_RES *word, int num_unichars) {
23  int num_chopped = 0;
24  for (int i = 0; i < num_unichars; i++)
25  num_chopped += word->best_state[i];
26  return num_chopped;
27 }
28 
29 static int TrailingUnicharsToChopped(WERD_RES *word, int num_unichars) {
30  int num_chopped = 0;
31  for (int i = 0; i < num_unichars; i++)
32  num_chopped += word->best_state[word->best_state.size() - 1 - i];
33  return num_chopped;
34 }
35 
36 
37 namespace tesseract {
38 
45 static void YOutlierPieces(WERD_RES *word, int rebuilt_blob_index,
46  int super_y_bottom, int sub_y_top,
47  ScriptPos *leading_pos, int *num_leading_outliers,
48  ScriptPos *trailing_pos,
49  int *num_trailing_outliers) {
50  ScriptPos sp_unused1, sp_unused2;
51  int unused1, unused2;
52  if (!leading_pos) leading_pos = &sp_unused1;
53  if (!num_leading_outliers) num_leading_outliers = &unused1;
54  if (!trailing_pos) trailing_pos = &sp_unused2;
55  if (!num_trailing_outliers) num_trailing_outliers = &unused2;
56 
57  *num_leading_outliers = *num_trailing_outliers = 0;
58  *leading_pos = *trailing_pos = SP_NORMAL;
59 
60  int chopped_start = LeadingUnicharsToChopped(word, rebuilt_blob_index);
61  int num_chopped_pieces = word->best_state[rebuilt_blob_index];
62  ScriptPos last_pos = SP_NORMAL;
63  int trailing_outliers = 0;
64  for (int i = 0; i < num_chopped_pieces; i++) {
65  TBOX box = word->chopped_word->blobs[chopped_start + i]->bounding_box();
66  ScriptPos pos = SP_NORMAL;
67  if (box.bottom() >= super_y_bottom) {
68  pos = SP_SUPERSCRIPT;
69  } else if (box.top() <= sub_y_top) {
70  pos = SP_SUBSCRIPT;
71  }
72  if (pos == SP_NORMAL) {
73  if (trailing_outliers == i) {
74  *num_leading_outliers = trailing_outliers;
75  *leading_pos = last_pos;
76  }
77  trailing_outliers = 0;
78  } else {
79  if (pos == last_pos) {
80  trailing_outliers++;
81  } else {
82  trailing_outliers = 1;
83  }
84  }
85  last_pos = pos;
86  }
87  *num_trailing_outliers = trailing_outliers;
88  *trailing_pos = last_pos;
89 }
90 
// NOTE(review): the header of this routine is not part of this listing; the
// statements below are its body.
// Give up immediately for words whose tess_failed flag is set, that carry
// the W_REP_CHAR flag, or that have no best_choice.
102  if (word->tess_failed || word->word->flag(W_REP_CHAR) ||
103  !word->best_choice) {
104  return false;
105  }
106  int num_leading, num_trailing;
107  ScriptPos sp_leading, sp_trailing;
108  float leading_certainty, trailing_certainty;
109  float avg_certainty, unlikely_threshold;
110 
111  // Calculate the number of whole suspicious characters at the edges.
// NOTE(review): the line naming the callee that receives these arguments is
// missing from this listing.
113  word, &num_leading, &sp_leading, &leading_certainty,
114  &num_trailing, &sp_trailing, &trailing_certainty,
115  &avg_certainty, &unlikely_threshold);
116 
// Labels used only by the debug output below; any position other than
// SP_SUBSCRIPT is labelled "super".
117  const char *leading_pos = sp_leading == SP_SUBSCRIPT ? "sub" : "super";
118  const char *trailing_pos = sp_trailing == SP_SUBSCRIPT ? "sub" : "super";
119 
120  int num_blobs = word->best_choice->length();
121 
122  // Calculate the remainder (partial characters) at the edges.
123  // This accounts for us having classified the best version of
124  // a word as [speaker?'] when it was instead [speaker.^{21}]
125  // (that is we accidentally thought the 2 was attached to the period).
126  int num_remainder_leading = 0, num_remainder_trailing = 0;
// Only look for partial characters when at least one character is not already
// a whole-character candidate and the threshold is negative.
127  if (num_leading + num_trailing < num_blobs && unlikely_threshold < 0.0) {
// NOTE(review): the initializer expressions of the next two declarations are
// missing from this listing.
128  int super_y_bottom =
130  int sub_y_top =
// Index of the last character that is not already a trailing candidate.
132  int last_word_char = num_blobs - 1 - num_trailing;
133  float last_char_certainty = word->best_choice->certainty(last_word_char);
134  if (word->best_choice->unichar_id(last_word_char) != 0 &&
135  last_char_certainty <= unlikely_threshold) {
136  ScriptPos rpos;
137  YOutlierPieces(word, last_word_char, super_y_bottom, sub_y_top,
138  nullptr, nullptr, &rpos, &num_remainder_trailing);
// Discard the partial pieces if whole trailing characters were found at a
// different script position.
139  if (num_trailing > 0 && rpos != sp_trailing) num_remainder_trailing = 0;
// Track the lowest certainty among everything being carved off the end.
140  if (num_remainder_trailing > 0 &&
141  last_char_certainty < trailing_certainty) {
142  trailing_certainty = last_char_certainty;
143  }
144  }
// The leading side may be examined if nothing partial was taken from the end,
// or if more than one character is still unclaimed by either edge.
145  bool another_blob_available = (num_remainder_trailing == 0) ||
146  num_leading + num_trailing + 1 < num_blobs;
// NOTE(review): certainty() is declared to return float, so storing it in an
// int truncates the value before the comparisons below, whereas
// last_char_certainty above is a float. Confirm whether this should be float.
147  int first_char_certainty = word->best_choice->certainty(num_leading);
148  if (another_blob_available &&
149  word->best_choice->unichar_id(num_leading) != 0 &&
150  first_char_certainty <= unlikely_threshold) {
151  ScriptPos lpos;
152  YOutlierPieces(word, num_leading, super_y_bottom, sub_y_top,
153  &lpos, &num_remainder_leading, nullptr, nullptr);
// Same consistency rule as on the trailing side.
154  if (num_leading > 0 && lpos != sp_leading) num_remainder_leading = 0;
155  if (num_remainder_leading > 0 &&
156  first_char_certainty < leading_certainty) {
157  leading_certainty = first_char_certainty;
158  }
159  }
160  }
161 
162  // If nothing to do, bail now.
163  if (num_leading + num_trailing +
164  num_remainder_leading + num_remainder_trailing == 0) {
165  return false;
166  }
167 
// Debug output: level 1 prints the candidate summary, level 2 adds the
// certainties, level 3 additionally prints the best choice.
168  if (superscript_debug >= 1) {
169  tprintf("Candidate for superscript detection: %s (",
170  word->best_choice->unichar_string().c_str());
171  if (num_leading || num_remainder_leading) {
172  tprintf("%d.%d %s-leading ", num_leading, num_remainder_leading,
173  leading_pos);
174  }
175  if (num_trailing || num_remainder_trailing) {
176  tprintf("%d.%d %s-trailing ", num_trailing, num_remainder_trailing,
177  trailing_pos);
178  }
179  tprintf(")\n");
180  }
181  if (superscript_debug >= 3) {
182  word->best_choice->print();
183  }
184  if (superscript_debug >= 2) {
185  tprintf(" Certainties -- Average: %.2f Unlikely thresh: %.2f ",
186  avg_certainty, unlikely_threshold);
187  if (num_leading)
188  tprintf("Orig. leading (min): %.2f ", leading_certainty);
189  if (num_trailing)
190  tprintf("Orig. trailing (min): %.2f ", trailing_certainty);
191  tprintf("\n");
192  }
193 
194  // We've now calculated the number of rebuilt blobs we want to carve off.
195  // However, split_word() works from TBLOBs in chopped_word, so we need to
196  // convert to those.
197  int num_chopped_leading =
198  LeadingUnicharsToChopped(word, num_leading) + num_remainder_leading;
199  int num_chopped_trailing =
200  TrailingUnicharsToChopped(word, num_trailing) + num_remainder_trailing;
201 
// First attempt. TrySuperscriptSplits reports through is_good whether the
// result is acceptable and through retry_leading / retry_trailing how many
// characters of each edge are worth a second attempt.
202  int retry_leading = 0;
203  int retry_trailing = 0;
204  bool is_good = false;
205  WERD_RES *revised = TrySuperscriptSplits(
206  num_chopped_leading, leading_certainty, sp_leading,
207  num_chopped_trailing, trailing_certainty, sp_trailing,
208  word, &is_good, &retry_leading, &retry_trailing);
209  if (is_good) {
210  word->ConsumeWordResults(revised);
211  } else if (retry_leading || retry_trailing) {
// Second attempt: split the revised word using only the retry counts,
// converted to chopped-piece counts against the revised word.
212  int retry_chopped_leading =
213  LeadingUnicharsToChopped(revised, retry_leading);
214  int retry_chopped_trailing =
215  TrailingUnicharsToChopped(revised, retry_trailing);
216  WERD_RES *revised2 = TrySuperscriptSplits(
217  retry_chopped_leading, leading_certainty, sp_leading,
218  retry_chopped_trailing, trailing_certainty, sp_trailing,
219  revised, &is_good, &retry_leading, &retry_trailing);
220  if (is_good) {
221  word->ConsumeWordResults(revised2);
222  }
223  delete revised2;
224  }
// revised may be nullptr here (TrySuperscriptSplits can return nullptr);
// deleting a null pointer is a no-op.
225  delete revised;
226  return is_good;
227 }
228 
254  int *num_rebuilt_leading,
255  ScriptPos *leading_pos,
256  float *leading_certainty,
257  int *num_rebuilt_trailing,
258  ScriptPos *trailing_pos,
259  float *trailing_certainty,
260  float *avg_certainty,
261  float *unlikely_threshold) {
// NOTE(review): the start of this routine's signature is not part of this
// listing. Every output is reset first, so the early return below leaves
// zero counts and zero certainties.
262  *avg_certainty = *unlikely_threshold = 0.0f;
263  *num_rebuilt_leading = *num_rebuilt_trailing = 0;
264  *leading_certainty = *trailing_certainty = 0.0f;
265 
// NOTE(review): the initializer expressions of the next two declarations are
// missing from this listing.
266  int super_y_bottom =
268  int sub_y_top =
270 
271  // Step one: Get an average certainty for "normally placed" characters.
272 
273  // Counts here are of blobs in the rebuild_word / unichars in best_choice.
274  *leading_pos = *trailing_pos = SP_NORMAL;
275  int leading_outliers = 0;
276  int trailing_outliers = 0;
277  int num_normal = 0;
278  float normal_certainty_total = 0.0f;
279  float worst_normal_certainty = 0.0f;
280  ScriptPos last_pos = SP_NORMAL;
281  int num_blobs = word->rebuild_word->NumBlobs();
282  for (int b = 0; b < num_blobs; ++b) {
283  TBOX box = word->rebuild_word->blobs[b]->bounding_box();
// Classify the blob by its box: bottom at or above super_y_bottom is a
// superscript, top at or below sub_y_top is a subscript.
284  ScriptPos pos = SP_NORMAL;
285  if (box.bottom() >= super_y_bottom) {
286  pos = SP_SUPERSCRIPT;
287  } else if (box.top() <= sub_y_top) {
288  pos = SP_SUBSCRIPT;
289  }
290  if (pos == SP_NORMAL) {
// Characters with unichar id 0 do not contribute to the certainty statistics.
291  if (word->best_choice->unichar_id(b) != 0) {
292  float char_certainty = word->best_choice->certainty(b);
293  if (char_certainty < worst_normal_certainty) {
294  worst_normal_certainty = char_certainty;
295  }
296  num_normal++;
297  normal_certainty_total += char_certainty;
298  }
// An outlier run as long as the number of blobs seen so far started at the
// first blob: record it as the leading run.
299  if (trailing_outliers == b) {
300  leading_outliers = trailing_outliers;
301  *leading_pos = last_pos;
302  }
303  trailing_outliers = 0;
304  } else {
// Extend the current run only while the script position stays the same.
305  if (last_pos == pos) {
306  trailing_outliers++;
307  } else {
308  trailing_outliers = 1;
309  }
310  }
311  last_pos = pos;
312  }
313  *trailing_pos = last_pos;
314  if (num_normal >= 3) { // throw out the worst as an outlier.
315  num_normal--;
316  normal_certainty_total -= worst_normal_certainty;
317  }
// The threshold is the average normal certainty scaled by
// superscript_worse_certainty; both stay 0 when there are no normal blobs.
318  if (num_normal > 0) {
319  *avg_certainty = normal_certainty_total / num_normal;
320  *unlikely_threshold = superscript_worse_certainty * (*avg_certainty);
321  }
// Nothing to report without a baseline of normal characters or without any
// outliers at either edge.
322  if (num_normal == 0 ||
323  (leading_outliers == 0 && trailing_outliers == 0)) {
324  return;
325  }
326 
327  // Step two: Try to split off bits of the word that are both outliers
328  // and have much lower certainty than average
329  // Calculate num_leading and leading_certainty.
// The scan stops at the first leading outlier whose certainty is above the
// threshold; leading_certainty ends up as the minimum over accepted ones.
330  for (*leading_certainty = 0.0f, *num_rebuilt_leading = 0;
331  *num_rebuilt_leading < leading_outliers;
332  (*num_rebuilt_leading)++) {
333  float char_certainty = word->best_choice->certainty(*num_rebuilt_leading);
334  if (char_certainty > *unlikely_threshold) {
335  break;
336  }
337  if (char_certainty < *leading_certainty) {
338  *leading_certainty = char_certainty;
339  }
340  }
341 
342  // Calculate num_trailing and trailing_certainty.
// Same scan as above, walking backwards from the last blob.
343  for (*trailing_certainty = 0.0f, *num_rebuilt_trailing = 0;
344  *num_rebuilt_trailing < trailing_outliers;
345  (*num_rebuilt_trailing)++) {
346  int blob_idx = num_blobs - 1 - *num_rebuilt_trailing;
347  float char_certainty = word->best_choice->certainty(blob_idx);
348  if (char_certainty > *unlikely_threshold) {
349  break;
350  }
351  if (char_certainty < *trailing_certainty) {
352  *trailing_certainty = char_certainty;
353  }
354  }
355 }
356 
357 
383  int num_chopped_leading, float leading_certainty, ScriptPos leading_pos,
384  int num_chopped_trailing, float trailing_certainty,
385  ScriptPos trailing_pos,
386  WERD_RES *word,
387  bool *is_good,
388  int *retry_rebuild_leading, int *retry_rebuild_trailing) {
// NOTE(review): the start of this routine's signature is not part of this
// listing.
389  int num_chopped = word->chopped_word->NumBlobs();
390 
// The retry counts are always reset, so callers see 0 unless the
// BelievableSuperscript calls below overwrite them.
391  *retry_rebuild_leading = *retry_rebuild_trailing = 0;
392 
393  // Chop apart the word into up to three pieces.
394 
395  BlamerBundle *bb0 = nullptr;
396  BlamerBundle *bb1 = nullptr;
397  WERD_RES *prefix = nullptr;
398  WERD_RES *core = nullptr;
399  WERD_RES *suffix = nullptr;
// With leading pieces, a copy of the word becomes the prefix and split_word
// hands back the rest as core; otherwise core is simply a copy of the word.
400  if (num_chopped_leading > 0) {
401  prefix = new WERD_RES(*word);
402  split_word(prefix, num_chopped_leading, &core, &bb0);
403  } else {
404  core = new WERD_RES(*word);
405  }
406 
407  if (num_chopped_trailing > 0) {
// The split point is passed relative to core, hence the leading pieces are
// subtracted as well.
408  int split_pt = num_chopped - num_chopped_trailing - num_chopped_leading;
409  split_word(core, split_pt, &suffix, &bb1);
410  }
411 
412  // Recognize the pieces in turn.
// Remember the current multipliers so they can be restored after each piece.
413  int saved_cp_multiplier = classify_class_pruner_multiplier;
414  int saved_im_multiplier = classify_integer_matcher_multiplier;
415  if (prefix) {
416  // Turn off Tesseract's y-position penalties for the leading superscript.
// NOTE(review): the statements that implement the comment above are missing
// from this listing.
419 
420  // Adjust our expectations about the baseline for this prefix.
421  if (superscript_debug >= 3) {
422  tprintf(" recognizing first %d chopped blobs\n", num_chopped_leading);
423  }
424  recog_word_recursive(prefix);
425  if (superscript_debug >= 2) {
426  tprintf(" The leading bits look like %s %s\n",
427  ScriptPosToString(leading_pos),
428  prefix->best_choice->unichar_string().c_str());
429  }
430 
431  // Restore the normal y-position penalties.
432  classify_class_pruner_multiplier.set_value(saved_cp_multiplier);
433  classify_integer_matcher_multiplier.set_value(saved_im_multiplier);
434  }
435 
// Only a debug message is emitted here; core itself is recognized further
// down, after the prefix and suffix have been evaluated.
436  if (superscript_debug >= 3) {
437  tprintf(" recognizing middle %d chopped blobs\n",
438  num_chopped - num_chopped_leading - num_chopped_trailing);
439  }
440 
441  if (suffix) {
442  // Turn off Tesseract's y-position penalties for the trailing superscript.
// NOTE(review): the statements that implement the comment above are missing
// from this listing.
445 
446  if (superscript_debug >= 3) {
447  tprintf(" recognizing last %d chopped blobs\n", num_chopped_trailing);
448  }
449  recog_word_recursive(suffix);
450  if (superscript_debug >= 2) {
451  tprintf(" The trailing bits look like %s %s\n",
452  ScriptPosToString(trailing_pos),
453  suffix->best_choice->unichar_string().c_str());
454  }
455 
456  // Restore the normal y-position penalties.
457  classify_class_pruner_multiplier.set_value(saved_cp_multiplier);
458  classify_integer_matcher_multiplier.set_value(saved_im_multiplier);
459  }
460 
461  // Evaluate whether we think the results are believably better
462  // than what we already had.
// A missing piece counts as good. The certainty threshold for each piece is
// its original certainty scaled by superscript_bettered_certainty.
463  bool good_prefix = !prefix || BelievableSuperscript(
464  superscript_debug >= 1, *prefix,
465  superscript_bettered_certainty * leading_certainty,
466  retry_rebuild_leading, nullptr);
467  bool good_suffix = !suffix || BelievableSuperscript(
468  superscript_debug >= 1, *suffix,
469  superscript_bettered_certainty * trailing_certainty,
470  nullptr, retry_rebuild_trailing);
471 
472  *is_good = good_prefix && good_suffix;
473  if (!*is_good && !*retry_rebuild_leading && !*retry_rebuild_trailing) {
474  // None of it is any good. Quit now.
// NOTE(review): bb1 is released on this path but bb0 is not. Confirm whether
// split_word can hand back a bundle through bb0 that would need freeing here.
475  delete core;
476  delete prefix;
477  delete suffix;
478  delete bb1;
479  return nullptr;
480  }
481  recog_word_recursive(core);
482 
483  // Now paste the results together into core.
484  if (suffix) {
485  suffix->SetAllScriptPositions(trailing_pos);
486  join_words(core, suffix, bb1);
487  }
488  if (prefix) {
489  prefix->SetAllScriptPositions(leading_pos);
490  join_words(prefix, core, bb0);
// The joined result now lives in prefix, so it becomes the returned word.
491  core = prefix;
492  prefix = nullptr;
493  }
494 
495  if (superscript_debug >= 1) {
496  tprintf("%s superscript fix: %s\n", *is_good ? "ACCEPT" : "REJECT",
497  core->best_choice->unichar_string().c_str());
498  }
499  return core;
500 }
501 
502 
521 bool Tesseract::BelievableSuperscript(bool debug,
522  const WERD_RES &word,
523  float certainty_threshold,
524  int *left_ok,
525  int *right_ok) const {
526  int initial_ok_run_count = 0;
527  int ok_run_count = 0;
528  float worst_certainty = 0.0f;
529  const WERD_CHOICE &wc = *word.best_choice;
530 
531  const UnicityTable<FontInfo>& fontinfo_table = get_fontinfo_table();
532  for (int i = 0; i < wc.length(); i++) {
533  TBLOB *blob = word.rebuild_word->blobs[i];
534  UNICHAR_ID unichar_id = wc.unichar_id(i);
535  float char_certainty = wc.certainty(i);
536  bool bad_certainty = char_certainty < certainty_threshold;
537  bool is_punc = wc.unicharset()->get_ispunctuation(unichar_id);
538  bool is_italic = word.fontinfo && word.fontinfo->is_italic();
539  BLOB_CHOICE *choice = word.GetBlobChoice(i);
540  if (choice && fontinfo_table.size() > 0) {
541  // Get better information from the specific choice, if available.
542  int font_id1 = choice->fontinfo_id();
543  bool font1_is_italic = font_id1 >= 0
544  ? fontinfo_table.get(font_id1).is_italic() : false;
545  int font_id2 = choice->fontinfo_id2();
546  is_italic = font1_is_italic &&
547  (font_id2 < 0 || fontinfo_table.get(font_id2).is_italic());
548  }
549 
550  float height_fraction = 1.0f;
551  float char_height = blob->bounding_box().height();
552  float normal_height = char_height;
553  if (wc.unicharset()->top_bottom_useful()) {
554  int min_bot, max_bot, min_top, max_top;
555  wc.unicharset()->get_top_bottom(unichar_id,
556  &min_bot, &max_bot,
557  &min_top, &max_top);
558  float hi_height = max_top - max_bot;
559  float lo_height = min_top - min_bot;
560  normal_height = (hi_height + lo_height) / 2;
561  if (normal_height >= kBlnXHeight) {
562  // Only ding characters that we have decent information for because
563  // they're supposed to be normal sized, not tiny specks or dashes.
564  height_fraction = char_height / normal_height;
565  }
566  }
567  bool bad_height = height_fraction < superscript_scaledown_ratio;
568 
569  if (debug) {
570  if (is_italic) {
571  tprintf(" Rejecting: superscript is italic.\n");
572  }
573  if (is_punc) {
574  tprintf(" Rejecting: punctuation present.\n");
575  }
576  const char *char_str = wc.unicharset()->id_to_unichar(unichar_id);
577  if (bad_certainty) {
578  tprintf(" Rejecting: don't believe character %s with certainty %.2f "
579  "which is less than threshold %.2f\n", char_str,
580  char_certainty, certainty_threshold);
581  }
582  if (bad_height) {
583  tprintf(" Rejecting: character %s seems too small @ %.2f versus "
584  "expected %.2f\n", char_str, char_height, normal_height);
585  }
586  }
587  if (bad_certainty || bad_height || is_punc || is_italic) {
588  if (ok_run_count == i) {
589  initial_ok_run_count = ok_run_count;
590  }
591  ok_run_count = 0;
592  } else {
593  ok_run_count++;
594  }
595  if (char_certainty < worst_certainty) {
596  worst_certainty = char_certainty;
597  }
598  }
599  bool all_ok = ok_run_count == wc.length();
600  if (all_ok && debug) {
601  tprintf(" Accept: worst revised certainty is %.2f\n", worst_certainty);
602  }
603  if (!all_ok) {
604  if (left_ok) *left_ok = initial_ok_run_count;
605  if (right_ok) *right_ok = ok_run_count;
606  }
607  return all_ok;
608 }
609 
610 
611 } // namespace tesseract
tesseract::Tesseract::superscript_bettered_certainty
double superscript_bettered_certainty
Definition: tesseractclass.h:981
WERD_CHOICE::unichar_string
const STRING & unichar_string() const
Definition: ratngs.h:529
tesseract::Tesseract::BelievableSuperscript
bool BelievableSuperscript(bool debug, const WERD_RES &word, float certainty_threshold, int *left_ok, int *right_ok) const
Definition: superscript.cpp:520
normalis.h
WERD::flag
bool flag(WERD_FLAGS mask) const
Definition: werd.h:116
kBlnXHeight
const int kBlnXHeight
Definition: normalis.h:23
tesseract::Tesseract::superscript_worse_certainty
double superscript_worse_certainty
Definition: tesseractclass.h:976
W_REP_CHAR
repeated character
Definition: werd.h:52
WERD_CHOICE::unichar_id
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:303
WERD_RES::rebuild_word
TWERD * rebuild_word
Definition: pageres.h:260
WERD_CHOICE
Definition: ratngs.h:261
tesseractclass.h
WERD_CHOICE::certainty
float certainty() const
Definition: ratngs.h:318
tesseract::Tesseract::superscript_min_y_bottom
double superscript_min_y_bottom
Definition: tesseractclass.h:993
TBOX::top
int16_t top() const
Definition: rect.h:57
tesseract::FontInfo::is_italic
bool is_italic() const
Definition: fontinfo.h:111
WERD_RES::fontinfo
const FontInfo * fontinfo
Definition: pageres.h:303
WERD_RES
Definition: pageres.h:160
WERD_RES::ConsumeWordResults
void ConsumeWordResults(WERD_RES *word)
Definition: pageres.cpp:761
tesseract::Classify::classify_class_pruner_multiplier
int classify_class_pruner_multiplier
Definition: classify.h:501
WERD_CHOICE::unicharset
const UNICHARSET * unicharset() const
Definition: ratngs.h:288
WERD_RES::best_state
GenericVector< int > best_state
Definition: pageres.h:279
WERD_RES::SetAllScriptPositions
void SetAllScriptPositions(tesseract::ScriptPos position)
Definition: pageres.cpp:861
WERD_RES::GetBlobChoice
BLOB_CHOICE * GetBlobChoice(int index) const
Definition: pageres.cpp:746
tesseract::Tesseract::superscript_scaledown_ratio
double superscript_scaledown_ratio
Definition: tesseractclass.h:985
tesseract::Tesseract::GetSubAndSuperscriptCandidates
void GetSubAndSuperscriptCandidates(const WERD_RES *word, int *num_rebuilt_leading, ScriptPos *leading_pos, float *leading_certainty, int *num_rebuilt_trailing, ScriptPos *trailing_pos, float *trailing_certainty, float *avg_certainty, float *unlikely_threshold)
Definition: superscript.cpp:252
TBOX::height
int16_t height() const
Definition: rect.h:107
UNICHARSET::get_ispunctuation
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:509
WERD_RES::tess_failed
bool tess_failed
Definition: pageres.h:289
tesseract::SP_SUBSCRIPT
Definition: ratngs.h:252
tesseract::Classify::get_fontinfo_table
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:386
tesseract::ScriptPos
ScriptPos
Definition: ratngs.h:250
tesseract::Tesseract::SubAndSuperscriptFix
bool SubAndSuperscriptFix(WERD_RES *word_res)
Definition: superscript.cpp:100
WERD_RES::best_choice
WERD_CHOICE * best_choice
Definition: pageres.h:235
STRING::c_str
const char * c_str() const
Definition: strngs.cpp:192
UNICHARSET::get_top_bottom
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:558
TWERD::blobs
GenericVector< TBLOB * > blobs
Definition: blobs.h:457
UNICHARSET::top_bottom_useful
bool top_bottom_useful() const
Definition: unicharset.h:527
TBOX::bottom
int16_t bottom() const
Definition: rect.h:64
tesseract::SP_NORMAL
Definition: ratngs.h:251
UnicityTable::get
const T & get(int id) const
Return the object from an id.
Definition: unicity_table.h:140
WERD_RES::chopped_word
TWERD * chopped_word
Definition: pageres.h:206
tesseract
Definition: baseapi.h:65
tesseract::Tesseract::split_word
void split_word(WERD_RES *word, int split_pt, WERD_RES **right_piece, BlamerBundle **orig_blamer_bundle) const
Definition: tfacepp.cpp:174
UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:36
TBLOB::bounding_box
TBOX bounding_box() const
Definition: blobs.cpp:466
UnicityTable
Definition: fontinfo.h:30
tesseract::Tesseract::recog_word_recursive
void recog_word_recursive(WERD_RES *word)
Definition: tfacepp.cpp:104
BLOB_CHOICE::fontinfo_id
int16_t fontinfo_id() const
Definition: ratngs.h:84
UnicityTable::size
int size() const
Return the size used.
Definition: unicity_table.h:127
WERD_CHOICE::print
void print() const
Definition: ratngs.h:568
BLOB_CHOICE::fontinfo_id2
int16_t fontinfo_id2() const
Definition: ratngs.h:87
WERD_CHOICE::length
int length() const
Definition: ratngs.h:291
BLOB_CHOICE
Definition: ratngs.h:49
TBLOB
Definition: blobs.h:282
tesseract::Tesseract::subscript_max_y_top
double subscript_max_y_top
Definition: tesseractclass.h:989
tesseract::SP_SUPERSCRIPT
Definition: ratngs.h:253
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
tesseract::Tesseract::superscript_debug
int superscript_debug
Definition: tesseractclass.h:972
WERD_RES::word
WERD * word
Definition: pageres.h:180
tesseract::Classify::classify_integer_matcher_multiplier
int classify_integer_matcher_multiplier
Definition: classify.h:505
UNICHARSET::id_to_unichar
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:290
tesseract::Tesseract::TrySuperscriptSplits
WERD_RES * TrySuperscriptSplits(int num_chopped_leading, float leading_certainty, ScriptPos leading_pos, int num_chopped_trailing, float trailing_certainty, ScriptPos trailing_pos, WERD_RES *word, bool *is_good, int *retry_leading, int *retry_trailing)
Definition: superscript.cpp:381
BlamerBundle
Definition: blamer.h:103
tesseract::Tesseract::join_words
void join_words(WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const
Definition: tfacepp.cpp:231
GenericVector::size
int size() const
Definition: genericvector.h:71
kBlnBaselineOffset
const int kBlnBaselineOffset
Definition: normalis.h:24
TWERD::NumBlobs
int NumBlobs() const
Definition: blobs.h:446
tesseract::ScriptPosToString
const char * ScriptPosToString(enum ScriptPos script_pos)
Definition: ratngs.cpp:202
TBOX
Definition: rect.h:33