tesseract  5.0.0-alpha-619-ge9db
WERD_CHOICE Class Reference

#include <ratngs.h>

Inheritance diagram for WERD_CHOICE:
ELIST_LINK

Public Member Functions

 WERD_CHOICE (const UNICHARSET *unicharset)
 
 WERD_CHOICE (const UNICHARSET *unicharset, int reserved)
 
 WERD_CHOICE (const char *src_string, const char *src_lengths, float src_rating, float src_certainty, uint8_t src_permuter, const UNICHARSET &unicharset)
 
 WERD_CHOICE (const char *src_string, const UNICHARSET &unicharset)
 
 WERD_CHOICE (const WERD_CHOICE &word)
 
 ~WERD_CHOICE ()
 
const UNICHARSET * unicharset () const
 
int length () const
 
float adjust_factor () const
 
void set_adjust_factor (float factor)
 
const UNICHAR_ID * unichar_ids () const
 
UNICHAR_ID unichar_id (int index) const
 
int state (int index) const
 
tesseract::ScriptPos BlobPosition (int index) const
 
float rating () const
 
float certainty () const
 
float certainty (int index) const
 
float min_x_height () const
 
float max_x_height () const
 
void set_x_heights (float min_height, float max_height)
 
uint8_t permuter () const
 
const char * permuter_name () const
 
BLOB_CHOICE_LIST * blob_choices (int index, MATRIX *ratings) const
 
MATRIX_COORD MatrixCoord (int index) const
 
void set_unichar_id (UNICHAR_ID unichar_id, int index)
 
bool dangerous_ambig_found () const
 
void set_dangerous_ambig_found_ (bool value)
 
void set_rating (float new_val)
 
void set_certainty (float new_val)
 
void set_permuter (uint8_t perm)
 
void set_length (int len)
 
void double_the_size ()
 Make more space in unichar_id_ and fragment_lengths_ arrays. More...
 
void init (int reserved)
 
void init (const char *src_string, const char *src_lengths, float src_rating, float src_certainty, uint8_t src_permuter)
 
void make_bad ()
 Set the fields in this choice to be default (bad) values. More...
 
void append_unichar_id_space_allocated (UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
 
void append_unichar_id (UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
 
void set_unichar_id (UNICHAR_ID unichar_id, int blob_count, float rating, float certainty, int index)
 
void set_blob_choice (int index, int blob_count, const BLOB_CHOICE *blob_choice)
 
bool contains_unichar_id (UNICHAR_ID unichar_id) const
 
void remove_unichar_ids (int index, int num)
 
void remove_last_unichar_id ()
 
void remove_unichar_id (int index)
 
bool has_rtl_unichar_id () const
 
void reverse_and_mirror_unichar_ids ()
 
void punct_stripped (int *start_core, int *end_core) const
 
void GetNonSuperscriptSpan (int *start, int *end) const
 
WERD_CHOICE shallow_copy (int start, int end) const
 
void string_and_lengths (STRING *word_str, STRING *word_lengths_str) const
 
const STRING debug_string () const
 
bool ContainsAnyNonSpaceDelimited () const
 
bool IsAllSpaces () const
 
bool set_unichars_in_script_order (bool in_script_order)
 
bool unichars_in_script_order () const
 
const STRING & unichar_string () const
 
const STRING & unichar_lengths () const
 
void SetScriptPositions (bool small_caps, TWERD *word, int debug=0)
 
void SetScriptPositions (const tesseract::ScriptPos *positions, int length)
 
void SetAllScriptPositions (tesseract::ScriptPos position)
 
int GetTopScriptID () const
 
void UpdateStateForSplit (int blob_position)
 
int TotalOfStates () const
 
void print () const
 
void print (const char *msg) const
 
void print_state (const char *msg) const
 
void DisplaySegmentation (TWERD *word)
 
WERD_CHOICE & operator+= (const WERD_CHOICE &second)
 
WERD_CHOICE & operator= (const WERD_CHOICE &source)
 
- Public Member Functions inherited from ELIST_LINK
 ELIST_LINK ()
 
 ELIST_LINK (const ELIST_LINK &)
 
void operator= (const ELIST_LINK &)
 

Static Public Member Functions

static const char * permuter_name (uint8_t permuter)
 
static tesseract::ScriptPos ScriptPositionOf (bool print_debug, const UNICHARSET &unicharset, const TBOX &blob_box, UNICHAR_ID unichar_id)
 

Static Public Attributes

static const float kBadRating = 100000.0
 

Detailed Description

Definition at line 261 of file ratngs.h.

Constructor & Destructor Documentation

◆ WERD_CHOICE() [1/5]

WERD_CHOICE::WERD_CHOICE ( const UNICHARSET * unicharset)
inline

Definition at line 266 of file ratngs.h.

268  : unicharset_(unicharset) { this->init(8); }

◆ WERD_CHOICE() [2/5]

WERD_CHOICE::WERD_CHOICE ( const UNICHARSET * unicharset,
int  reserved 
)
inline

Definition at line 268 of file ratngs.h.

268  : unicharset_(unicharset) { this->init(8); }
269  WERD_CHOICE(const UNICHARSET *unicharset, int reserved)

◆ WERD_CHOICE() [3/5]

WERD_CHOICE::WERD_CHOICE ( const char *  src_string,
const char *  src_lengths,
float  src_rating,
float  src_certainty,
uint8_t  src_permuter,
const UNICHARSET & unicharset 
)
inline

Definition at line 270 of file ratngs.h.

270  : unicharset_(unicharset) { this->init(reserved); }
271  WERD_CHOICE(const char *src_string,
272  const char *src_lengths,
273  float src_rating,
274  float src_certainty,
275  uint8_t src_permuter,
276  const UNICHARSET &unicharset)
277  : unicharset_(&unicharset) {
278  this->init(src_string, src_lengths, src_rating,
279  src_certainty, src_permuter);

◆ WERD_CHOICE() [4/5]

WERD_CHOICE::WERD_CHOICE ( const char *  src_string,
const UNICHARSET & unicharset 
)

WERD_CHOICE::WERD_CHOICE

Constructor to build a WERD_CHOICE from the given string. The function assumes that src_string is not nullptr.

Definition at line 220 of file ratngs.cpp.

224  GenericVector<UNICHAR_ID> encoding;
225  GenericVector<char> lengths;
226  std::string cleaned = unicharset.CleanupString(src_string);
227  if (unicharset.encode_string(cleaned.c_str(), true, &encoding, &lengths,
228  nullptr)) {
229  lengths.push_back('\0');
230  STRING src_lengths = &lengths[0];
231  this->init(cleaned.c_str(), src_lengths.c_str(), 0.0, 0.0, NO_PERM);
232  } else { // There must have been an invalid unichar in the string.
233  this->init(8);
234  this->make_bad();
235  }

◆ WERD_CHOICE() [5/5]

WERD_CHOICE::WERD_CHOICE ( const WERD_CHOICE & word)
inline

Definition at line 281 of file ratngs.h.

283  : ELIST_LINK(word), unicharset_(word.unicharset_) {
284  this->init(word.length());
285  this->operator=(word);

◆ ~WERD_CHOICE()

WERD_CHOICE::~WERD_CHOICE ( )

WERD_CHOICE::~WERD_CHOICE

Definition at line 278 of file ratngs.cpp.

279  {
280  delete[] unichar_ids_;
281  delete[] script_pos_;
282  delete[] state_;
283  delete[] certainties_;

Member Function Documentation

◆ adjust_factor()

float WERD_CHOICE::adjust_factor ( ) const
inline

Definition at line 294 of file ratngs.h.

295  {
296  return adjust_factor_;

◆ append_unichar_id()

void WERD_CHOICE::append_unichar_id ( UNICHAR_ID  unichar_id,
int  blob_count,
float  rating,
float  certainty 
)

append_unichar_id

Make sure there is enough space in the word for the new unichar id and call append_unichar_id_space_allocated().

Definition at line 470 of file ratngs.cpp.

473  {
474  if (length_ == reserved_) {
475  this->double_the_size();
476  }
478  rating, certainty);

◆ append_unichar_id_space_allocated()

void WERD_CHOICE::append_unichar_id_space_allocated ( UNICHAR_ID  unichar_id,
int  blob_count,
float  rating,
float  certainty 
)
inline

This function assumes that there is enough space reserved in the WERD_CHOICE for adding another unichar. This is an efficient alternative to append_unichar_id().

Definition at line 440 of file ratngs.h.

443  {
444  assert(reserved_ > length_);
445  length_++;
446  this->set_unichar_id(unichar_id, blob_count,
447  rating, certainty, length_-1);

◆ blob_choices()

BLOB_CHOICE_LIST * WERD_CHOICE::blob_choices ( int  index,
MATRIX * ratings 
) const

Definition at line 292 of file ratngs.cpp.

293  {
294  MATRIX_COORD coord = MatrixCoord(index);
295  BLOB_CHOICE_LIST* result = ratings->get(coord.col, coord.row);
296  if (result == nullptr) {
297  result = new BLOB_CHOICE_LIST;
298  ratings->put(coord.col, coord.row, result);
299  }
300  return result;

◆ BlobPosition()

tesseract::ScriptPos WERD_CHOICE::BlobPosition ( int  index) const
inline

Definition at line 310 of file ratngs.h.

311  {
312  if (index < 0 || index >= length_)
313  return tesseract::SP_NORMAL;
314  return script_pos_[index];

◆ certainty() [1/2]

float WERD_CHOICE::certainty ( ) const
inline

Definition at line 318 of file ratngs.h.

319  {
320  return certainty_;

◆ certainty() [2/2]

float WERD_CHOICE::certainty ( int  index) const
inline

Definition at line 321 of file ratngs.h.

322  {
323  return certainties_[index];

◆ contains_unichar_id()

bool WERD_CHOICE::contains_unichar_id ( UNICHAR_ID  unichar_id) const

contains_unichar_id

Returns true if unichar_ids_ contain the given unichar_id, false otherwise.

Definition at line 328 of file ratngs.cpp.

329  {
330  for (int i = 0; i < length_; ++i) {
331  if (unichar_ids_[i] == unichar_id) {
332  return true;
333  }
334  }
335  return false;

◆ ContainsAnyNonSpaceDelimited()

bool WERD_CHOICE::ContainsAnyNonSpaceDelimited ( ) const
inline

Definition at line 502 of file ratngs.h.

503  {
504  for (int i = 0; i < length_; ++i) {
505  if (!unicharset_->IsSpaceDelimited(unichar_ids_[i])) return true;
506  }
507  return false;

◆ dangerous_ambig_found()

bool WERD_CHOICE::dangerous_ambig_found ( ) const
inline

Definition at line 351 of file ratngs.h.

352  {
353  return dangerous_ambig_found_;

◆ debug_string()

const STRING WERD_CHOICE::debug_string ( ) const
inline

Definition at line 493 of file ratngs.h.

494  {
495  STRING word_str;
496  for (int i = 0; i < length_; ++i) {
497  word_str += unicharset_->debug_str(unichar_ids_[i]);
498  word_str += " ";
499  }
500  return word_str;

◆ DisplaySegmentation()

void WERD_CHOICE::DisplaySegmentation ( TWERD * word)

Definition at line 763 of file ratngs.cpp.

764  {
765 #ifndef GRAPHICS_DISABLED
766  // Number of different colors to draw with.
767  const int kNumColors = 6;
768  static ScrollView *segm_window = nullptr;
769  // Check the state against the static prev_drawn_state.
770  static GenericVector<int> prev_drawn_state;
771  bool already_done = prev_drawn_state.size() == length_;
772  if (!already_done) prev_drawn_state.init_to_size(length_, 0);
773  for (int i = 0; i < length_; ++i) {
774  if (prev_drawn_state[i] != state_[i]) {
775  already_done = false;
776  }
777  prev_drawn_state[i] = state_[i];
778  }
779  if (already_done || word->blobs.empty()) return;
780 
781  // Create the window if needed.
782  if (segm_window == nullptr) {
783  segm_window = new ScrollView("Segmentation", 5, 10, 500, 256,
784  2000.0, 256.0, true);
785  } else {
786  segm_window->Clear();
787  }
788 
789  TBOX bbox;
790  int blob_index = 0;
791  for (int c = 0; c < length_; ++c) {
792  auto color =
793  static_cast<ScrollView::Color>(c % kNumColors + 3);
794  for (int i = 0; i < state_[c]; ++i, ++blob_index) {
795  TBLOB* blob = word->blobs[blob_index];
796  bbox += blob->bounding_box();
797  blob->plot(segm_window, color, color);
798  }
799  }
800  segm_window->ZoomToRectangle(bbox.left(), bbox.top(),
801  bbox.right(), bbox.bottom());
802  segm_window->Update();
803  window_wait(segm_window);
804 #endif

◆ double_the_size()

void WERD_CHOICE::double_the_size ( )
inline

Make more space in unichar_id_ and fragment_lengths_ arrays.

Definition at line 375 of file ratngs.h.

376  {
377  if (reserved_ > 0) {
379  reserved_, unichar_ids_);
381  reserved_, script_pos_);
383  reserved_, state_);
385  reserved_, certainties_);
386  reserved_ *= 2;
387  } else {
388  unichar_ids_ = new UNICHAR_ID[1];
389  script_pos_ = new tesseract::ScriptPos[1];
390  state_ = new int[1];
391  certainties_ = new float[1];
392  reserved_ = 1;
393  }

◆ GetNonSuperscriptSpan()

void WERD_CHOICE::GetNonSuperscriptSpan ( int *  start,
int *  end 
) const

Definition at line 399 of file ratngs.cpp.

400  {
401  int end = length();
402  while (end > 0 &&
403  unicharset_->get_isdigit(unichar_ids_[end - 1]) &&
405  end--;
406  }
407  int start = 0;
408  while (start < end &&
409  unicharset_->get_isdigit(unichar_ids_[start]) &&
411  start++;
412  }
413  *pstart = start;
414  *pend = end;

◆ GetTopScriptID()

int WERD_CHOICE::GetTopScriptID ( ) const

Definition at line 669 of file ratngs.cpp.

670  {
671  int max_script = unicharset_->get_script_table_size();
672  int *sid = new int[max_script];
673  int x;
674  for (x = 0; x < max_script; x++) sid[x] = 0;
675  for (x = 0; x < length_; ++x) {
676  int script_id = unicharset_->get_script(unichar_id(x));
677  sid[script_id]++;
678  }
679  if (unicharset_->han_sid() != unicharset_->null_sid()) {
680  // Add the Hiragana & Katakana counts to Han and zero them out.
682  sid[unicharset_->han_sid()] += sid[unicharset_->hiragana_sid()];
683  sid[unicharset_->hiragana_sid()] = 0;
684  }
686  sid[unicharset_->han_sid()] += sid[unicharset_->katakana_sid()];
687  sid[unicharset_->katakana_sid()] = 0;
688  }
689  }
690  // Note that high script ID overrides lower one on a tie, thus biasing
691  // towards non-Common script (if sorted that way in unicharset file).
692  int max_sid = 0;
693  for (x = 1; x < max_script; x++)
694  if (sid[x] >= sid[max_sid]) max_sid = x;
695  if (sid[max_sid] < length_ / 2)
696  max_sid = unicharset_->null_sid();
697  delete[] sid;
698  return max_sid;

◆ has_rtl_unichar_id()

bool WERD_CHOICE::has_rtl_unichar_id ( ) const

has_rtl_unichar_id

Returns true if unichar_ids contain at least one "strongly" RTL unichar.

Definition at line 433 of file ratngs.cpp.

434  {
435  int i;
436  for (i = 0; i < length_; ++i) {
437  UNICHARSET::Direction dir = unicharset_->get_direction(unichar_ids_[i]);
438  if (dir == UNICHARSET::U_RIGHT_TO_LEFT ||
440  return true;
441  }
442  }
443  return false;

◆ init() [1/2]

void WERD_CHOICE::init ( const char *  src_string,
const char *  src_lengths,
float  src_rating,
float  src_certainty,
uint8_t  src_permuter 
)

Helper function to build a WERD_CHOICE from the given string, fragment lengths, rating, certainty and permuter. The function assumes that src_string is not nullptr. src_lengths argument could be nullptr, in which case the unichars in src_string are assumed to all be of length 1.

WERD_CHOICE::init

Helper function to build a WERD_CHOICE from the given string, fragment lengths, rating, certainty and permuter.

The function assumes that src_string is not nullptr. src_lengths argument could be nullptr, in which case the unichars in src_string are assumed to all be of length 1.

Definition at line 247 of file ratngs.cpp.

252  {
253  int src_string_len = strlen(src_string);
254  if (src_string_len == 0) {
255  this->init(8);
256  } else {
257  this->init(src_lengths ? strlen(src_lengths): src_string_len);
258  length_ = reserved_;
259  int offset = 0;
260  for (int i = 0; i < length_; ++i) {
261  int unichar_length = src_lengths ? src_lengths[i] : 1;
262  unichar_ids_[i] =
263  unicharset_->unichar_to_id(src_string+offset, unichar_length);
264  state_[i] = 1;
265  certainties_[i] = src_certainty;
266  offset += unichar_length;
267  }
268  }
269  adjust_factor_ = 1.0f;
270  rating_ = src_rating;
271  certainty_ = src_certainty;
272  permuter_ = src_permuter;
273  dangerous_ambig_found_ = false;

◆ init() [2/2]

void WERD_CHOICE::init ( int  reserved)
inline

Initializes WERD_CHOICE - reserves length slots in unichar_ids_ and fragment_length_ arrays. Sets other values to default (blank) values.

Definition at line 397 of file ratngs.h.

398  {
399  reserved_ = reserved;
400  if (reserved > 0) {
401  unichar_ids_ = new UNICHAR_ID[reserved];
402  script_pos_ = new tesseract::ScriptPos[reserved];
403  state_ = new int[reserved];
404  certainties_ = new float[reserved];
405  } else {
406  unichar_ids_ = nullptr;
407  script_pos_ = nullptr;
408  state_ = nullptr;
409  certainties_ = nullptr;
410  }
411  length_ = 0;
412  adjust_factor_ = 1.0f;
413  rating_ = 0.0;
414  certainty_ = FLT_MAX;
415  min_x_height_ = 0.0f;
416  max_x_height_ = FLT_MAX;
417  permuter_ = NO_PERM;
418  unichars_in_script_order_ = false; // Tesseract is strict left-to-right.
419  dangerous_ambig_found_ = false;

◆ IsAllSpaces()

bool WERD_CHOICE::IsAllSpaces ( ) const
inline

Definition at line 509 of file ratngs.h.

510  {
511  for (int i = 0; i < length_; ++i) {
512  if (unichar_ids_[i] != UNICHAR_SPACE) return false;
513  }
514  return true;

◆ length()

int WERD_CHOICE::length ( ) const
inline

Definition at line 291 of file ratngs.h.

292  {
293  return length_;

◆ make_bad()

void WERD_CHOICE::make_bad ( )
inline

Set the fields in this choice to be default (bad) values.

Definition at line 431 of file ratngs.h.

432  {
433  length_ = 0;
434  rating_ = kBadRating;
435  certainty_ = -FLT_MAX;

◆ MatrixCoord()

MATRIX_COORD WERD_CHOICE::MatrixCoord ( int  index) const

Definition at line 304 of file ratngs.cpp.

305  {
306  int col = 0;
307  for (int i = 0; i < index; ++i)
308  col += state_[i];
309  int row = col + state_[index] - 1;
310  return MATRIX_COORD(col, row);

◆ max_x_height()

float WERD_CHOICE::max_x_height ( ) const
inline

Definition at line 327 of file ratngs.h.

328  {
329  return max_x_height_;

◆ min_x_height()

float WERD_CHOICE::min_x_height ( ) const
inline

Definition at line 324 of file ratngs.h.

325  {
326  return min_x_height_;

◆ operator+=()

WERD_CHOICE & WERD_CHOICE::operator+= ( const WERD_CHOICE & second)

WERD_CHOICE::operator+=

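Concatenates a second word choice onto the end of this one. The ratings are added and the certainty is the minimum of the two. If this word's permuter is NO_PERM it takes the second word's permuter; otherwise, if the two permuters differ (and the second is not NO_PERM), the permuter is set to COMPOUND_PERM.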

Definition at line 487 of file ratngs.cpp.

488  {
489  ASSERT_HOST(unicharset_ == second.unicharset_);
490  while (reserved_ < length_ + second.length()) {
491  this->double_the_size();
492  }
493  const UNICHAR_ID *other_unichar_ids = second.unichar_ids();
494  for (int i = 0; i < second.length(); ++i) {
495  unichar_ids_[length_ + i] = other_unichar_ids[i];
496  state_[length_ + i] = second.state_[i];
497  certainties_[length_ + i] = second.certainties_[i];
498  script_pos_[length_ + i] = second.BlobPosition(i);
499  }
500  length_ += second.length();
501  if (second.adjust_factor_ > adjust_factor_)
502  adjust_factor_ = second.adjust_factor_;
503  rating_ += second.rating(); // add ratings
504  if (second.certainty() < certainty_) // take min
505  certainty_ = second.certainty();
506  if (second.dangerous_ambig_found_)
507  dangerous_ambig_found_ = true;
508  if (permuter_ == NO_PERM) {
509  permuter_ = second.permuter();
510  } else if (second.permuter() != NO_PERM &&
511  second.permuter() != permuter_) {
512  permuter_ = COMPOUND_PERM;
513  }
514  return *this;

◆ operator=()

WERD_CHOICE & WERD_CHOICE::operator= ( const WERD_CHOICE & source)

WERD_CHOICE::operator=

Allocate enough memory to hold a copy of source and copy over all the information from source to this WERD_CHOICE.

Definition at line 523 of file ratngs.cpp.

524  {
525  while (reserved_ < source.length()) {
526  this->double_the_size();
527  }
528 
529  unicharset_ = source.unicharset_;
530  const UNICHAR_ID *other_unichar_ids = source.unichar_ids();
531  for (int i = 0; i < source.length(); ++i) {
532  unichar_ids_[i] = other_unichar_ids[i];
533  state_[i] = source.state_[i];
534  certainties_[i] = source.certainties_[i];
535  script_pos_[i] = source.BlobPosition(i);
536  }
537  length_ = source.length();
538  adjust_factor_ = source.adjust_factor_;
539  rating_ = source.rating();
540  certainty_ = source.certainty();
541  min_x_height_ = source.min_x_height();
542  max_x_height_ = source.max_x_height();
543  permuter_ = source.permuter();
544  dangerous_ambig_found_ = source.dangerous_ambig_found_;
545  return *this;

◆ permuter()

uint8_t WERD_CHOICE::permuter ( ) const
inline

Definition at line 334 of file ratngs.h.

335  {
336  return permuter_;

◆ permuter_name() [1/2]

const char * WERD_CHOICE::permuter_name ( ) const

Definition at line 285 of file ratngs.cpp.

286  {
287  return kPermuterTypeNames[permuter_];

◆ permuter_name() [2/2]

const char * WERD_CHOICE::permuter_name ( uint8_t  permuter)
static

Definition at line 196 of file ratngs.cpp.

197  {
198  return kPermuterTypeNames[permuter];

◆ print() [1/2]

void WERD_CHOICE::print ( ) const
inline

Definition at line 568 of file ratngs.h.

569 { this->print(""); }

◆ print() [2/2]

void WERD_CHOICE::print ( const char *  msg) const

WERD_CHOICE::print

Print WERD_CHOICE to stdout.

Definition at line 726 of file ratngs.cpp.

727  {
728  tprintf("%s : ", msg);
729  for (int i = 0; i < length_; ++i) {
730  tprintf("%s", unicharset_->id_to_unichar(unichar_ids_[i]));
731  }
732  tprintf(" : R=%g, C=%g, F=%g, Perm=%d, xht=[%g,%g], ambig=%d\n",
733  rating_, certainty_, adjust_factor_, permuter_,
734  min_x_height_, max_x_height_, dangerous_ambig_found_);
735  tprintf("pos");
736  for (int i = 0; i < length_; ++i) {
737  tprintf("\t%s", ScriptPosToString(script_pos_[i]));
738  }
739  tprintf("\nstr");
740  for (int i = 0; i < length_; ++i) {
741  tprintf("\t%s", unicharset_->id_to_unichar(unichar_ids_[i]));
742  }
743  tprintf("\nstate:");
744  for (int i = 0; i < length_; ++i) {
745  tprintf("\t%d ", state_[i]);
746  }
747  tprintf("\nC");
748  for (int i = 0; i < length_; ++i) {
749  tprintf("\t%.3f", certainties_[i]);
750  }
751  tprintf("\n");

◆ print_state()

void WERD_CHOICE::print_state ( const char *  msg) const

Definition at line 754 of file ratngs.cpp.

755  {
756  tprintf("%s", msg);
757  for (int i = 0; i < length_; ++i)
758  tprintf(" %d", state_[i]);
759  tprintf("\n");

◆ punct_stripped()

void WERD_CHOICE::punct_stripped ( int *  start,
int *  end 
) const

punct_stripped

Returns the half-open interval of unichar_id indices [start, end) which enclose the core portion of this word – the part after stripping punctuation from the left and right.

Definition at line 385 of file ratngs.cpp.

386  {
387  *start = 0;
388  *end = length() - 1;
389  while (*start < length() &&
390  unicharset()->get_ispunctuation(unichar_id(*start))) {
391  (*start)++;
392  }
393  while (*end > -1 &&
394  unicharset()->get_ispunctuation(unichar_id(*end))) {
395  (*end)--;
396  }
397  (*end)++;

◆ rating()

float WERD_CHOICE::rating ( ) const
inline

Definition at line 315 of file ratngs.h.

316  {
317  return rating_;

◆ remove_last_unichar_id()

void WERD_CHOICE::remove_last_unichar_id ( )
inline

Definition at line 471 of file ratngs.h.

472 { --length_; }

◆ remove_unichar_id()

void WERD_CHOICE::remove_unichar_id ( int  index)
inline

Definition at line 472 of file ratngs.h.

472  { --length_; }
473  inline void remove_unichar_id(int index) {
474  this->remove_unichar_ids(index, 1);

◆ remove_unichar_ids()

void WERD_CHOICE::remove_unichar_ids ( int  start,
int  num 
)

remove_unichar_ids

Removes num unichar ids starting from index start from unichar_ids_ and updates length_ and fragment_lengths_ to reflect this change. Note: this function does not modify rating_ and certainty_.

Definition at line 344 of file ratngs.cpp.

345  {
346  ASSERT_HOST(start >= 0 && start + num <= length_);
347  // Accumulate the states to account for the merged blobs.
348  for (int i = 0; i < num; ++i) {
349  if (start > 0)
350  state_[start - 1] += state_[start + i];
351  else if (start + num < length_)
352  state_[start + num] += state_[start + i];
353  }
354  for (int i = start; i + num < length_; ++i) {
355  unichar_ids_[i] = unichar_ids_[i + num];
356  script_pos_[i] = script_pos_[i + num];
357  state_[i] = state_[i + num];
358  certainties_[i] = certainties_[i + num];
359  }
360  length_ -= num;

◆ reverse_and_mirror_unichar_ids()

void WERD_CHOICE::reverse_and_mirror_unichar_ids ( )

reverse_and_mirror_unichar_ids

Reverses and mirrors unichars in unichar_ids.

Definition at line 367 of file ratngs.cpp.

368  {
369  for (int i = 0; i < length_ / 2; ++i) {
370  UNICHAR_ID tmp_id = unichar_ids_[i];
371  unichar_ids_[i] = unicharset_->get_mirror(unichar_ids_[length_-1-i]);
372  unichar_ids_[length_-1-i] = unicharset_->get_mirror(tmp_id);
373  }
374  if (length_ % 2 != 0) {
375  unichar_ids_[length_/2] = unicharset_->get_mirror(unichar_ids_[length_/2]);
376  }

◆ ScriptPositionOf()

ScriptPos WERD_CHOICE::ScriptPositionOf ( bool  print_debug,
const UNICHARSET & unicharset,
const TBOX & blob_box,
UNICHAR_ID  unichar_id 
)
static

Definition at line 631 of file ratngs.cpp.

635  {
637  int top = blob_box.top();
638  int bottom = blob_box.bottom();
639  int min_bottom, max_bottom, min_top, max_top;
641  &min_bottom, &max_bottom,
642  &min_top, &max_top);
643 
644  int sub_thresh_top = min_top - kMinSubscriptOffset;
645  int sub_thresh_bot = kBlnBaselineOffset - kMinSubscriptOffset;
646  int sup_thresh_bot = max_bottom + kMinSuperscriptOffset;
647  if (bottom <= kMaxDropCapBottom) {
648  retval = tesseract::SP_DROPCAP;
649  } else if (top < sub_thresh_top && bottom < sub_thresh_bot) {
650  retval = tesseract::SP_SUBSCRIPT;
651  } else if (bottom > sup_thresh_bot) {
652  retval = tesseract::SP_SUPERSCRIPT;
653  }
654 
655  if (print_debug) {
656  const char *pos = ScriptPosToString(retval);
657  tprintf("%s Character %s[bot:%d top: %d] "
658  "bot_range[%d,%d] top_range[%d, %d] "
659  "sub_thresh[bot:%d top:%d] sup_thresh_bot %d\n",
661  bottom, top,
662  min_bottom, max_bottom, min_top, max_top,
663  sub_thresh_bot, sub_thresh_top,
664  sup_thresh_bot);
665  }
666  return retval;

◆ set_adjust_factor()

void WERD_CHOICE::set_adjust_factor ( float  factor)
inline

Definition at line 297 of file ratngs.h.

298  {
299  adjust_factor_ = factor;

◆ set_blob_choice()

void WERD_CHOICE::set_blob_choice ( int  index,
int  blob_count,
const BLOB_CHOICE * blob_choice 
)

Definition at line 314 of file ratngs.cpp.

316  {
317  unichar_ids_[index] = blob_choice->unichar_id();
318  script_pos_[index] = tesseract::SP_NORMAL;
319  state_[index] = blob_count;
320  certainties_[index] = blob_choice->certainty();

◆ set_certainty()

void WERD_CHOICE::set_certainty ( float  new_val)
inline

Definition at line 360 of file ratngs.h.

361  {
362  certainty_ = new_val;

◆ set_dangerous_ambig_found_()

void WERD_CHOICE::set_dangerous_ambig_found_ ( bool  value)
inline

Definition at line 354 of file ratngs.h.

355  {
356  dangerous_ambig_found_ = value;

◆ set_length()

void WERD_CHOICE::set_length ( int  len)
inline

Definition at line 369 of file ratngs.h.

370  {
371  ASSERT_HOST(reserved_ >= len);
372  length_ = len;

◆ set_permuter()

void WERD_CHOICE::set_permuter ( uint8_t  perm)
inline

Definition at line 363 of file ratngs.h.

364  {
365  permuter_ = perm;

◆ set_rating()

void WERD_CHOICE::set_rating ( float  new_val)
inline

Definition at line 357 of file ratngs.h.

358  {
359  rating_ = new_val;

◆ set_unichar_id() [1/2]

void WERD_CHOICE::set_unichar_id ( UNICHAR_ID  unichar_id,
int  blob_count,
float  rating,
float  certainty,
int  index 
)
inline

Definition at line 452 of file ratngs.h.

454  {
455  assert(index < length_);
456  unichar_ids_[index] = unichar_id;
457  state_[index] = blob_count;
458  certainties_[index] = certainty;
459  script_pos_[index] = tesseract::SP_NORMAL;
460  rating_ += rating;
461  if (certainty < certainty_) {
462  certainty_ = certainty;
463  }

◆ set_unichar_id() [2/2]

void WERD_CHOICE::set_unichar_id ( UNICHAR_ID  unichar_id,
int  index 
)
inline

Definition at line 347 of file ratngs.h.

348  {
349  assert(index < length_);
350  unichar_ids_[index] = unichar_id;

◆ set_unichars_in_script_order()

bool WERD_CHOICE::set_unichars_in_script_order ( bool  in_script_order)
inline

Definition at line 519 of file ratngs.h.

520  {
521  return unichars_in_script_order_ = in_script_order;

◆ set_x_heights()

void WERD_CHOICE::set_x_heights ( float  min_height,
float  max_height 
)
inline

Definition at line 330 of file ratngs.h.

331  {
332  min_x_height_ = min_height;
333  max_x_height_ = max_height;

◆ SetAllScriptPositions()

void WERD_CHOICE::SetAllScriptPositions ( tesseract::ScriptPos  position)

Definition at line 625 of file ratngs.cpp.

626  {
627  for (int i = 0; i < length_; ++i)
628  script_pos_[i] = position;

◆ SetScriptPositions() [1/2]

void WERD_CHOICE::SetScriptPositions ( bool  small_caps,
TWERD * word,
int  debug = 0 
)

Definition at line 552 of file ratngs.cpp.

553  {
554  // Initialize to normal.
555  for (int i = 0; i < length_; ++i)
556  script_pos_[i] = tesseract::SP_NORMAL;
557  if (word->blobs.empty() || word->NumBlobs() != TotalOfStates()) {
558  return;
559  }
560 
561  int position_counts[4] = { 0, 0, 0, 0 };
562 
563  int chunk_index = 0;
564  for (int blob_index = 0; blob_index < length_; ++blob_index, ++chunk_index) {
565  TBLOB* tblob = word->blobs[chunk_index];
566  int uni_id = unichar_id(blob_index);
567  TBOX blob_box = tblob->bounding_box();
568  if (state_ != nullptr) {
569  for (int i = 1; i < state_[blob_index]; ++i) {
570  ++chunk_index;
571  tblob = word->blobs[chunk_index];
572  blob_box += tblob->bounding_box();
573  }
574  }
575  script_pos_[blob_index] = ScriptPositionOf(false, *unicharset_, blob_box,
576  uni_id);
577  if (small_caps && script_pos_[blob_index] != tesseract::SP_DROPCAP) {
578  script_pos_[blob_index] = tesseract::SP_NORMAL;
579  }
580  position_counts[script_pos_[blob_index]]++;
581  }
582  // If almost everything looks like a superscript or subscript,
583  // we most likely just got the baseline wrong.
584  if (position_counts[tesseract::SP_SUBSCRIPT] > 0.75 * length_ ||
585  position_counts[tesseract::SP_SUPERSCRIPT] > 0.75 * length_) {
586  if (debug >= 2) {
587  tprintf("Most characters of %s are subscript or superscript.\n"
588  "That seems wrong, so I'll assume we got the baseline wrong\n",
589  unichar_string().c_str());
590  }
591  for (int i = 0; i < length_; i++) {
592  ScriptPos sp = script_pos_[i];
594  position_counts[sp]--;
595  position_counts[tesseract::SP_NORMAL]++;
596  script_pos_[i] = tesseract::SP_NORMAL;
597  }
598  }
599  }
600 
601  if ((debug >= 1 && position_counts[tesseract::SP_NORMAL] < length_) ||
602  debug >= 2) {
603  tprintf("SetScriptPosition on %s\n", unichar_string().c_str());
604  int chunk_index = 0;
605  for (int blob_index = 0; blob_index < length_; ++blob_index) {
606  if (debug >= 2 || script_pos_[blob_index] != tesseract::SP_NORMAL) {
607  TBLOB* tblob = word->blobs[chunk_index];
608  ScriptPositionOf(true, *unicharset_, tblob->bounding_box(),
609  unichar_id(blob_index));
610  }
611  chunk_index += state_ != nullptr ? state_[blob_index] : 1;
612  }
613  }

◆ SetScriptPositions() [2/2]

void WERD_CHOICE::SetScriptPositions ( const tesseract::ScriptPos * positions,
int  length 
)

Definition at line 615 of file ratngs.cpp.

617  {
618  ASSERT_HOST(length == length_);
619  if (positions != script_pos_) {
620  delete [] script_pos_;
621  script_pos_ = new ScriptPos[length];
622  memcpy(script_pos_, positions, sizeof(positions[0]) * length);
623  }

◆ shallow_copy()

WERD_CHOICE WERD_CHOICE::shallow_copy ( int  start,
int  end 
) const

Definition at line 416 of file ratngs.cpp.

417  {
418  ASSERT_HOST(start >= 0 && start <= length_);
419  ASSERT_HOST(end >= 0 && end <= length_);
420  if (end < start) { end = start; }
421  WERD_CHOICE retval(unicharset_, end - start);
422  for (int i = start; i < end; i++) {
423  retval.append_unichar_id_space_allocated(
424  unichar_ids_[i], state_[i], 0.0f, certainties_[i]);
425  }
426  return retval;

◆ state()

int WERD_CHOICE::state ( int  index) const
inline

Definition at line 307 of file ratngs.h.

308  {
309  return state_[index];

◆ string_and_lengths()

void WERD_CHOICE::string_and_lengths ( STRING * word_str,
STRING * word_lengths_str 
) const

string_and_lengths

Populates the given word_str with unichars from unichar_ids and word_lengths_str with the corresponding unichar lengths.

Definition at line 451 of file ratngs.cpp.

453  {
454  *word_str = "";
455  if (word_lengths_str != nullptr) *word_lengths_str = "";
456  for (int i = 0; i < length_; ++i) {
457  const char *ch = unicharset_->id_to_unichar_ext(unichar_ids_[i]);
458  *word_str += ch;
459  if (word_lengths_str != nullptr) {
460  *word_lengths_str += strlen(ch);
461  }
462  }

◆ TotalOfStates()

int WERD_CHOICE::TotalOfStates ( ) const

Definition at line 713 of file ratngs.cpp.

714  {
715  int total_chunks = 0;
716  for (int i = 0; i < length_; ++i) {
717  total_chunks += state_[i];
718  }
719  return total_chunks;

◆ unichar_id()

UNICHAR_ID WERD_CHOICE::unichar_id ( int  index) const
inline

Definition at line 303 of file ratngs.h.

304  {
305  assert(index < length_);
306  return unichar_ids_[index];

◆ unichar_ids()

const UNICHAR_ID* WERD_CHOICE::unichar_ids ( ) const
inline

Definition at line 300 of file ratngs.h.

301  {
302  return unichar_ids_;

◆ unichar_lengths()

const STRING& WERD_CHOICE::unichar_lengths ( ) const
inline

Definition at line 536 of file ratngs.h.

537  {
538  this->string_and_lengths(&unichar_string_, &unichar_lengths_);
539  return unichar_lengths_;

◆ unichar_string()

const STRING& WERD_CHOICE::unichar_string ( ) const
inline

Definition at line 529 of file ratngs.h.

530  {
531  this->string_and_lengths(&unichar_string_, &unichar_lengths_);
532  return unichar_string_;

◆ unichars_in_script_order()

bool WERD_CHOICE::unichars_in_script_order ( ) const
inline

Definition at line 523 of file ratngs.h.

524  {
525  return unichars_in_script_order_;

◆ unicharset()

const UNICHARSET* WERD_CHOICE::unicharset ( ) const
inline

Definition at line 288 of file ratngs.h.

289  {
290  return unicharset_;

◆ UpdateStateForSplit()

void WERD_CHOICE::UpdateStateForSplit ( int  blob_position)

Definition at line 701 of file ratngs.cpp.

702  {
703  int total_chunks = 0;
704  for (int i = 0; i < length_; ++i) {
705  total_chunks += state_[i];
706  if (total_chunks > blob_position) {
707  ++state_[i];
708  return;
709  }
710  }

Member Data Documentation

◆ kBadRating

const float WERD_CHOICE::kBadRating = 100000.0
static

Definition at line 263 of file ratngs.h.


The documentation for this class was generated from the following files:
string
std::string string
Definition: equationdetect_test.cc:21
UNICHARSET::get_direction
Direction get_direction(UNICHAR_ID unichar_id) const
Definition: unicharset.h:680
WERD_CHOICE::unichar_string
const STRING & unichar_string() const
Definition: ratngs.h:529
ScrollView
Definition: scrollview.h:97
ELIST_LINK::ELIST_LINK
ELIST_LINK()
Definition: elst.h:125
UNICHARSET::encode_string
bool encode_string(const char *str, bool give_up_on_failure, GenericVector< UNICHAR_ID > *encoding, GenericVector< char > *lengths, int *encoded_length) const
Definition: unicharset.cpp:258
WERD_CHOICE::unichar_id
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:303
UNICHARSET::id_to_unichar_ext
const char * id_to_unichar_ext(UNICHAR_ID id) const
Definition: unicharset.cpp:298
UNICHARSET::get_isdigit
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:502
WERD_CHOICE
Definition: ratngs.h:261
ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:87
BLOB_CHOICE::certainty
float certainty() const
Definition: ratngs.h:81
WERD_CHOICE::TotalOfStates
int TotalOfStates() const
Definition: ratngs.cpp:713
TBLOB::plot
void plot(ScrollView *window, ScrollView::Color color, ScrollView::Color child_color)
Definition: blobs.cpp:508
WERD_CHOICE::make_bad
void make_bad()
Set the fields in this choice to be default (bad) values.
Definition: ratngs.h:431
GenericVector::double_the_size_memcpy
static T * double_the_size_memcpy(int current_size, T *data)
Definition: genericvector.h:207
WERD_CHOICE::certainty
float certainty() const
Definition: ratngs.h:318
NO_PERM
Definition: ratngs.h:231
TBOX::top
int16_t top() const
Definition: rect.h:57
STRING
Definition: strngs.h:45
ScrollView::Clear
void Clear()
Definition: scrollview.cpp:588
WERD_CHOICE::permuter
uint8_t permuter() const
Definition: ratngs.h:334
COMPOUND_PERM
Definition: ratngs.h:243
kMinSuperscriptOffset
const int kMinSuperscriptOffset
Definition: ratngs.cpp:43
UNICHARSET::IsSpaceDelimited
bool IsSpaceDelimited(UNICHAR_ID unichar_id) const
Definition: unicharset.h:642
WERD_CHOICE::append_unichar_id_space_allocated
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.h:440
WERD_CHOICE::unicharset
const UNICHARSET * unicharset() const
Definition: ratngs.h:288
WERD_CHOICE::kBadRating
static const float kBadRating
Definition: ratngs.h:263
UNICHARSET::katakana_sid
int katakana_sid() const
Definition: unicharset.h:881
WERD_CHOICE::BlobPosition
tesseract::ScriptPos BlobPosition(int index) const
Definition: ratngs.h:310
ScrollView::ZoomToRectangle
void ZoomToRectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:755
BLOB_CHOICE::unichar_id
UNICHAR_ID unichar_id() const
Definition: ratngs.h:75
UNICHARSET::get_script
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:653
tesseract::SP_SUBSCRIPT
Definition: ratngs.h:252
WERD_CHOICE::init
void init(int reserved)
Definition: ratngs.h:397
WERD_CHOICE::string_and_lengths
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:451
tesseract::ScriptPos
ScriptPos
Definition: ratngs.h:250
WERD_CHOICE::set_unichar_id
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:347
WERD_CHOICE::min_x_height
float min_x_height() const
Definition: ratngs.h:324
UNICHARSET::debug_str
STRING debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:342
STRING::c_str
const char * c_str() const
Definition: strngs.cpp:192
UNICHARSET::get_top_bottom
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:558
UNICHARSET::null_sid
int null_sid() const
Definition: unicharset.h:874
WERD_CHOICE::MatrixCoord
MATRIX_COORD MatrixCoord(int index) const
Definition: ratngs.cpp:304
WERD_CHOICE::double_the_size
void double_the_size()
Make more space in unichar_id_ and fragment_lengths_ arrays.
Definition: ratngs.h:375
UNICHARSET::han_sid
int han_sid() const
Definition: unicharset.h:879
UNICHARSET::unichar_to_id
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209
UNICHAR_SPACE
Definition: unicharset.h:34
GENERIC_2D_ARRAY::get
T get(ICOORD pos) const
Definition: matrix.h:227
TWERD::blobs
GenericVector< TBLOB * > blobs
Definition: blobs.h:457
GenericVector::empty
bool empty() const
Definition: genericvector.h:86
UNICHARSET
Definition: unicharset.h:145
UNICHARSET::hiragana_sid
int hiragana_sid() const
Definition: unicharset.h:880
TBOX::bottom
int16_t bottom() const
Definition: rect.h:64
UNICHARSET::get_mirror
UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const
Definition: unicharset.h:687
UNICHARSET::CleanupString
static std::string CleanupString(const char *utf8_str)
Definition: unicharset.h:246
tesseract::SP_NORMAL
Definition: ratngs.h:251
unicharset_
UNICHARSET unicharset_
Definition: unicharcompress_test.cc:167
kMaxDropCapBottom
const int kMaxDropCapBottom
Definition: ratngs.cpp:45
tesseract::SP_DROPCAP
Definition: ratngs.h:254
UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:36
TBLOB::bounding_box
TBOX bounding_box() const
Definition: blobs.cpp:466
GenericVector< UNICHAR_ID >
UNICHARSET::get_script_table_size
int get_script_table_size() const
Definition: unicharset.h:839
WERD_CHOICE::operator=
WERD_CHOICE & operator=(const WERD_CHOICE &source)
Definition: ratngs.cpp:523
WERD_CHOICE::print
void print() const
Definition: ratngs.h:568
WERD_CHOICE::length
int length() const
Definition: ratngs.h:291
MATRIX_COORD
Definition: matrix.h:604
TBLOB
Definition: blobs.h:282
TBOX::left
int16_t left() const
Definition: rect.h:71
UNICHARSET::Direction
Direction
Definition: unicharset.h:156
tesseract::SP_SUPERSCRIPT
Definition: ratngs.h:253
UNICHARSET::U_RIGHT_TO_LEFT_ARABIC
Definition: unicharset.h:170
MATRIX_COORD::col
int col
Definition: matrix.h:632
TBOX::right
int16_t right() const
Definition: rect.h:78
GenericVector::init_to_size
void init_to_size(int size, const T &t)
Definition: genericvector.h:706
GENERIC_2D_ARRAY::put
void put(ICOORD pos, const T &thing)
Definition: matrix.h:219
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
WERD_CHOICE::WERD_CHOICE
WERD_CHOICE(const UNICHARSET *unicharset)
Definition: ratngs.h:266
ScrollView::Update
static void Update()
Definition: scrollview.cpp:708
WERD_CHOICE::ScriptPositionOf
static tesseract::ScriptPos ScriptPositionOf(bool print_debug, const UNICHARSET &unicharset, const TBOX &blob_box, UNICHAR_ID unichar_id)
Definition: ratngs.cpp:631
MATRIX_COORD::row
int row
Definition: matrix.h:633
WERD_CHOICE::remove_unichar_ids
void remove_unichar_ids(int index, int num)
Definition: ratngs.cpp:344
WERD_CHOICE::rating
float rating() const
Definition: ratngs.h:315
WERD_CHOICE::unichar_ids
const UNICHAR_ID * unichar_ids() const
Definition: ratngs.h:300
WERD_CHOICE::remove_unichar_id
void remove_unichar_id(int index)
Definition: ratngs.h:472
UNICHARSET::id_to_unichar
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:290
GenericVector::size
int size() const
Definition: genericvector.h:71
window_wait
char window_wait(ScrollView *win)
Definition: callcpp.cpp:103
kBlnBaselineOffset
const int kBlnBaselineOffset
Definition: normalis.h:24
TWERD::NumBlobs
int NumBlobs() const
Definition: blobs.h:446
UNICHARSET::U_RIGHT_TO_LEFT
Definition: unicharset.h:158
tesseract::ScriptPosToString
const char * ScriptPosToString(enum ScriptPos script_pos)
Definition: ratngs.cpp:202
kMinSubscriptOffset
const int kMinSubscriptOffset
Definition: ratngs.cpp:41
TBOX
Definition: rect.h:33
WERD_CHOICE::max_x_height
float max_x_height() const
Definition: ratngs.h:327