tesseract  5.0.0-alpha-619-ge9db
ScriptDetector Class Reference

#include <osdetect.h>

Public Member Functions

 ScriptDetector (const GenericVector< int > *allowed_scripts, OSResults *osr, tesseract::Tesseract *tess)
 
void detect_blob (BLOB_CHOICE_LIST *scores)
 
bool must_stop (int orientation)
 

Detailed Description

Definition at line 96 of file osdetect.h.

Constructor & Destructor Documentation

◆ ScriptDetector()

ScriptDetector::ScriptDetector ( const GenericVector< int > *  allowed_scripts,
OSResults *  osr,
tesseract::Tesseract *  tess 
)

Definition at line 453 of file osdetect.cpp.

454  {
455  osr_ = osr;
456  tess_ = tess;
457  allowed_scripts_ = allowed_scripts;
458  katakana_id_ = tess_->unicharset.add_script(katakana_script);
459  hiragana_id_ = tess_->unicharset.add_script(hiragana_script);
460  han_id_ = tess_->unicharset.add_script(han_script);
461  hangul_id_ = tess_->unicharset.add_script(hangul_script);
462  japanese_id_ = tess_->unicharset.add_script(japanese_script_);
463  korean_id_ = tess_->unicharset.add_script(korean_script_);
464  latin_id_ = tess_->unicharset.add_script(latin_script);
465  fraktur_id_ = tess_->unicharset.add_script(fraktur_script_);
466 }

Member Function Documentation

◆ detect_blob()

void ScriptDetector::detect_blob ( BLOB_CHOICE_LIST *  scores)

Definition at line 471 of file osdetect.cpp.

471  {
472  for (int i = 0; i < 4; ++i) {
473  bool done[kMaxNumberOfScripts] = { false };
474 
475  BLOB_CHOICE_IT choice_it;
476  choice_it.set_to_list(scores + i);
477 
478  float prev_score = -1;
479  int script_count = 0;
480  int prev_id = -1;
481  int prev_fontinfo_id = -1;
482  const char* prev_unichar = "";
483  const char* unichar = "";
484 
485  for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
486  choice_it.forward()) {
487  BLOB_CHOICE* choice = choice_it.data();
488  int id = choice->script_id();
489  if (allowed_scripts_ != nullptr && !allowed_scripts_->empty()) {
490  // Check that the choice is in an allowed script.
491  int s = 0;
492  for (s = 0; s < allowed_scripts_->size(); ++s) {
493  if ((*allowed_scripts_)[s] == id) break;
494  }
495  if (s == allowed_scripts_->size()) continue; // Not found in list.
496  }
497  // Script already processed before.
498  if (done[id]) continue;
499  done[id] = true;
500 
501  unichar = tess_->unicharset.id_to_unichar(choice->unichar_id());
502  // Save data from the first match
503  if (prev_score < 0) {
504  prev_score = -choice->certainty();
505  script_count = 1;
506  prev_id = id;
507  prev_unichar = unichar;
508  prev_fontinfo_id = choice->fontinfo_id();
509  } else if (-choice->certainty() < prev_score + kNonAmbiguousMargin) {
510  ++script_count;
511  }
512 
513  if (strlen(prev_unichar) == 1)
514  if (unichar[0] >= '0' && unichar[0] <= '9')
515  break;
516 
517  // if script_count is >= 2, character is ambiguous, skip other matches
518  // since they are useless.
519  if (script_count >= 2)
520  break;
521  }
522  // Character is non ambiguous
523  if (script_count == 1) {
524  // Update the score of the winning script
525  osr_->scripts_na[i][prev_id] += 1.0;
526 
527  // Workaround for Fraktur
528  if (prev_id == latin_id_) {
529  if (prev_fontinfo_id >= 0) {
530  const tesseract::FontInfo &fi =
531  tess_->get_fontinfo_table().get(prev_fontinfo_id);
532  //printf("Font: %s i:%i b:%i f:%i s:%i k:%i (%s)\n", fi.name,
533  // fi.is_italic(), fi.is_bold(), fi.is_fixed_pitch(),
534  // fi.is_serif(), fi.is_fraktur(),
535  // prev_unichar);
536  if (fi.is_fraktur()) {
537  osr_->scripts_na[i][prev_id] -= 1.0;
538  osr_->scripts_na[i][fraktur_id_] += 1.0;
539  }
540  }
541  }
542 
543  // Update Japanese / Korean pseudo-scripts
544  if (prev_id == katakana_id_)
545  osr_->scripts_na[i][japanese_id_] += 1.0;
546  if (prev_id == hiragana_id_)
547  osr_->scripts_na[i][japanese_id_] += 1.0;
548  if (prev_id == hangul_id_)
549  osr_->scripts_na[i][korean_id_] += 1.0;
550  if (prev_id == han_id_) {
551  osr_->scripts_na[i][korean_id_] += kHanRatioInKorean;
552  osr_->scripts_na[i][japanese_id_] += kHanRatioInJapanese;
553  }
554  }
555  } // iterate over each orientation
556 }

◆ must_stop()

bool ScriptDetector::must_stop ( int  orientation)

Definition at line 558 of file osdetect.cpp.

558  {
559  osr_->update_best_script(orientation);
560  return osr_->best_result.sconfidence > 1;
561 }

The documentation for this class was generated from the following files:
osdetect.h
osdetect.cpp
OSResults::best_result
OSBestResult best_result
Definition: osdetect.h:81
kNonAmbiguousMargin
const float kNonAmbiguousMargin
Definition: osdetect.cpp:48
kHanRatioInKorean
const float kHanRatioInKorean
Definition: osdetect.cpp:45
tesseract::FontInfo::is_fraktur
bool is_fraktur() const
Definition: fontinfo.h:115
BLOB_CHOICE::certainty
float certainty() const
Definition: ratngs.h:81
kMaxNumberOfScripts
const int kMaxNumberOfScripts
Definition: osdetect.h:39
BLOB_CHOICE::script_id
int script_id() const
Definition: ratngs.h:112
BLOB_CHOICE::unichar_id
UNICHAR_ID unichar_id() const
Definition: ratngs.h:75
tesseract::CCUtil::unicharset
UNICHARSET unicharset
Definition: ccutil.h:57
tesseract::Classify::get_fontinfo_table
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:386
UNICHARSET::add_script
int add_script(const char *script)
Definition: unicharset.cpp:1020
OSResults::scripts_na
float scripts_na[4][kMaxNumberOfScripts]
Definition: osdetect.h:78
GenericVector::empty
bool empty() const
Definition: genericvector.h:86
kHanRatioInJapanese
const float kHanRatioInJapanese
Definition: osdetect.cpp:46
tesseract::FontInfo
Definition: fontinfo.h:62
BLOB_CHOICE::fontinfo_id
int16_t fontinfo_id() const
Definition: ratngs.h:84
OSResults::update_best_script
void update_best_script(int orientation_id)
Definition: osdetect.cpp:89
BLOB_CHOICE
Definition: ratngs.h:49
OSBestResult::sconfidence
float sconfidence
Definition: osdetect.h:46
UNICHARSET::id_to_unichar
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:290
GenericVector::size
int size() const
Definition: genericvector.h:71