tesseract  4.0.0-1-g2a2b
ScriptDetector Class Reference

#include <osdetect.h>

Public Member Functions

 ScriptDetector (const GenericVector< int > *allowed_scripts, OSResults *osr, tesseract::Tesseract *tess)
 
void detect_blob (BLOB_CHOICE_LIST *scores)
 
bool must_stop (int orientation)
 

Detailed Description

Definition at line 95 of file osdetect.h.

Constructor & Destructor Documentation

◆ ScriptDetector()

ScriptDetector::ScriptDetector ( const GenericVector< int > *  allowed_scripts,
OSResults *  osr,
tesseract::Tesseract *  tess 
)

Definition at line 453 of file osdetect.cpp.

454  {
455  osr_ = osr;
456  tess_ = tess;
457  allowed_scripts_ = allowed_scripts;
458  katakana_id_ = tess_->unicharset.add_script(katakana_script);
459  hiragana_id_ = tess_->unicharset.add_script(hiragana_script);
460  han_id_ = tess_->unicharset.add_script(han_script);
461  hangul_id_ = tess_->unicharset.add_script(hangul_script);
462  japanese_id_ = tess_->unicharset.add_script(japanese_script_);
463  korean_id_ = tess_->unicharset.add_script(korean_script_);
464  latin_id_ = tess_->unicharset.add_script(latin_script);
465  fraktur_id_ = tess_->unicharset.add_script(fraktur_script_);
466 }
UNICHARSET unicharset
Definition: ccutil.h:68
int add_script(const char *script)

Member Function Documentation

◆ detect_blob()

void ScriptDetector::detect_blob ( BLOB_CHOICE_LIST *  scores)

Definition at line 471 of file osdetect.cpp.

471  {
472  bool done[kMaxNumberOfScripts];
473  for (int i = 0; i < 4; ++i) {
474  for (int j = 0; j < kMaxNumberOfScripts; ++j)
475  done[j] = false;
476 
477  BLOB_CHOICE_IT choice_it;
478  choice_it.set_to_list(scores + i);
479 
480  float prev_score = -1;
481  int script_count = 0;
482  int prev_id = -1;
483  int prev_fontinfo_id = -1;
484  const char* prev_unichar = "";
485  const char* unichar = "";
486 
487  for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
488  choice_it.forward()) {
489  BLOB_CHOICE* choice = choice_it.data();
490  int id = choice->script_id();
491  if (allowed_scripts_ != nullptr && !allowed_scripts_->empty()) {
492  // Check that the choice is in an allowed script.
493  int s = 0;
494  for (s = 0; s < allowed_scripts_->size(); ++s) {
495  if ((*allowed_scripts_)[s] == id) break;
496  }
497  if (s == allowed_scripts_->size()) continue; // Not found in list.
498  }
499  // Script already processed before.
500  if (done[id]) continue;
501  done[id] = true;
502 
503  unichar = tess_->unicharset.id_to_unichar(choice->unichar_id());
504  // Save data from the first match
505  if (prev_score < 0) {
506  prev_score = -choice->certainty();
507  script_count = 1;
508  prev_id = id;
509  prev_unichar = unichar;
510  prev_fontinfo_id = choice->fontinfo_id();
511  } else if (-choice->certainty() < prev_score + kNonAmbiguousMargin) {
512  ++script_count;
513  }
514 
515  if (strlen(prev_unichar) == 1)
516  if (unichar[0] >= '0' && unichar[0] <= '9')
517  break;
518 
519  // if script_count is >= 2, character is ambiguous, skip other matches
520  // since they are useless.
521  if (script_count >= 2)
522  break;
523  }
524  // Character is non ambiguous
525  if (script_count == 1) {
526  // Update the score of the winning script
527  osr_->scripts_na[i][prev_id] += 1.0;
528 
529  // Workaround for Fraktur
530  if (prev_id == latin_id_) {
531  if (prev_fontinfo_id >= 0) {
532  const tesseract::FontInfo &fi =
533  tess_->get_fontinfo_table().get(prev_fontinfo_id);
534  //printf("Font: %s i:%i b:%i f:%i s:%i k:%i (%s)\n", fi.name,
535  // fi.is_italic(), fi.is_bold(), fi.is_fixed_pitch(),
536  // fi.is_serif(), fi.is_fraktur(),
537  // prev_unichar);
538  if (fi.is_fraktur()) {
539  osr_->scripts_na[i][prev_id] -= 1.0;
540  osr_->scripts_na[i][fraktur_id_] += 1.0;
541  }
542  }
543  }
544 
545  // Update Japanese / Korean pseudo-scripts
546  if (prev_id == katakana_id_)
547  osr_->scripts_na[i][japanese_id_] += 1.0;
548  if (prev_id == hiragana_id_)
549  osr_->scripts_na[i][japanese_id_] += 1.0;
550  if (prev_id == hangul_id_)
551  osr_->scripts_na[i][korean_id_] += 1.0;
552  if (prev_id == han_id_) {
553  osr_->scripts_na[i][korean_id_] += kHanRatioInKorean;
554  osr_->scripts_na[i][japanese_id_] += kHanRatioInJapanese;
555  }
556  }
557  } // iterate over each orientation
558 }
float certainty() const
Definition: ratngs.h:83
int size() const
Definition: genericvector.h:71
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:386
float scripts_na[4][kMaxNumberOfScripts]
Definition: osdetect.h:78
int16_t fontinfo_id() const
Definition: ratngs.h:86
UNICHARSET unicharset
Definition: ccutil.h:68
const int kMaxNumberOfScripts
Definition: osdetect.h:38
bool empty() const
Definition: genericvector.h:90
const float kHanRatioInJapanese
Definition: osdetect.cpp:46
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:290
int script_id() const
Definition: ratngs.h:112
bool is_fraktur() const
Definition: fontinfo.h:115
UNICHAR_ID unichar_id() const
Definition: ratngs.h:77
const float kNonAmbiguousMargin
Definition: osdetect.cpp:48
const float kHanRatioInKorean
Definition: osdetect.cpp:45

◆ must_stop()

bool ScriptDetector::must_stop ( int  orientation)

Definition at line 560 of file osdetect.cpp.

560  {
561  osr_->update_best_script(orientation);
562  return osr_->best_result.sconfidence > 1;
563 }
float sconfidence
Definition: osdetect.h:45
OSBestResult best_result
Definition: osdetect.h:81
void update_best_script(int orientation_id)
Definition: osdetect.cpp:89

The documentation for this class was generated from the following files: