tesseract  4.0.0-1-g2a2b
WERD_CHOICE Class Reference

#include <ratngs.h>

Inheritance diagram for WERD_CHOICE:
ELIST_LINK

Public Member Functions

 WERD_CHOICE (const UNICHARSET *unicharset)
 
 WERD_CHOICE (const UNICHARSET *unicharset, int reserved)
 
 WERD_CHOICE (const char *src_string, const char *src_lengths, float src_rating, float src_certainty, uint8_t src_permuter, const UNICHARSET &unicharset)
 
 WERD_CHOICE (const char *src_string, const UNICHARSET &unicharset)
 
 WERD_CHOICE (const WERD_CHOICE &word)
 
 ~WERD_CHOICE ()
 
const UNICHARSET * unicharset () const
 
int length () const
 
float adjust_factor () const
 
void set_adjust_factor (float factor)
 
const UNICHAR_ID * unichar_ids () const
 
UNICHAR_ID unichar_id (int index) const
 
int state (int index) const
 
tesseract::ScriptPos BlobPosition (int index) const
 
float rating () const
 
float certainty () const
 
float certainty (int index) const
 
float min_x_height () const
 
float max_x_height () const
 
void set_x_heights (float min_height, float max_height)
 
uint8_t permuter () const
 
const char * permuter_name () const
 
BLOB_CHOICE_LIST * blob_choices (int index, MATRIX *ratings) const
 
MATRIX_COORD MatrixCoord (int index) const
 
void set_unichar_id (UNICHAR_ID unichar_id, int index)
 
bool dangerous_ambig_found () const
 
void set_dangerous_ambig_found_ (bool value)
 
void set_rating (float new_val)
 
void set_certainty (float new_val)
 
void set_permuter (uint8_t perm)
 
void set_length (int len)
 
void double_the_size ()
 Make more space in the unichar_ids_, script_pos_, state_ and certainties_ arrays. More...
 
void init (int reserved)
 
void init (const char *src_string, const char *src_lengths, float src_rating, float src_certainty, uint8_t src_permuter)
 
void make_bad ()
 Set the fields in this choice to be default (bad) values. More...
 
void append_unichar_id_space_allocated (UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
 
void append_unichar_id (UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
 
void set_unichar_id (UNICHAR_ID unichar_id, int blob_count, float rating, float certainty, int index)
 
void set_blob_choice (int index, int blob_count, const BLOB_CHOICE *blob_choice)
 
bool contains_unichar_id (UNICHAR_ID unichar_id) const
 
void remove_unichar_ids (int index, int num)
 
void remove_last_unichar_id ()
 
void remove_unichar_id (int index)
 
bool has_rtl_unichar_id () const
 
void reverse_and_mirror_unichar_ids ()
 
void punct_stripped (int *start_core, int *end_core) const
 
void GetNonSuperscriptSpan (int *start, int *end) const
 
WERD_CHOICE shallow_copy (int start, int end) const
 
void string_and_lengths (STRING *word_str, STRING *word_lengths_str) const
 
const STRING debug_string () const
 
bool ContainsAnyNonSpaceDelimited () const
 
bool IsAllSpaces () const
 
bool set_unichars_in_script_order (bool in_script_order)
 
bool unichars_in_script_order () const
 
const STRING & unichar_string () const
 
const STRING & unichar_lengths () const
 
void SetScriptPositions (bool small_caps, TWERD *word, int debug=0)
 
void SetScriptPositions (const tesseract::ScriptPos *positions, int length)
 
void SetAllScriptPositions (tesseract::ScriptPos position)
 
int GetTopScriptID () const
 
void UpdateStateForSplit (int blob_position)
 
int TotalOfStates () const
 
void print () const
 
void print (const char *msg) const
 
void print_state (const char *msg) const
 
void DisplaySegmentation (TWERD *word)
 
WERD_CHOICE & operator+= (const WERD_CHOICE &second)
 
WERD_CHOICE & operator= (const WERD_CHOICE &source)
 
- Public Member Functions inherited from ELIST_LINK
 ELIST_LINK ()
 
 ELIST_LINK (const ELIST_LINK &)
 
void operator= (const ELIST_LINK &)
 

Static Public Member Functions

static const char * permuter_name (uint8_t permuter)
 
static tesseract::ScriptPos ScriptPositionOf (bool print_debug, const UNICHARSET &unicharset, const TBOX &blob_box, UNICHAR_ID unichar_id)
 

Static Public Attributes

static const float kBadRating = 100000.0
 

Detailed Description

Definition at line 273 of file ratngs.h.

Constructor & Destructor Documentation

◆ WERD_CHOICE() [1/5]

WERD_CHOICE::WERD_CHOICE ( const UNICHARSET * unicharset)
inline

Definition at line 278 of file ratngs.h.

279  : unicharset_(unicharset) { this->init(8); }
const UNICHARSET * unicharset() const
Definition: ratngs.h:300
void init(int reserved)
Definition: ratngs.h:409

◆ WERD_CHOICE() [2/5]

WERD_CHOICE::WERD_CHOICE ( const UNICHARSET * unicharset,
int  reserved 
)
inline

Definition at line 280 of file ratngs.h.

281  : unicharset_(unicharset) { this->init(reserved); }
const UNICHARSET * unicharset() const
Definition: ratngs.h:300
void init(int reserved)
Definition: ratngs.h:409

◆ WERD_CHOICE() [3/5]

WERD_CHOICE::WERD_CHOICE ( const char *  src_string,
const char *  src_lengths,
float  src_rating,
float  src_certainty,
uint8_t  src_permuter,
const UNICHARSET & unicharset 
)
inline

Definition at line 282 of file ratngs.h.

288  : unicharset_(&unicharset) {
289  this->init(src_string, src_lengths, src_rating,
290  src_certainty, src_permuter);
291  }
const UNICHARSET * unicharset() const
Definition: ratngs.h:300
void init(int reserved)
Definition: ratngs.h:409

◆ WERD_CHOICE() [4/5]

WERD_CHOICE::WERD_CHOICE ( const char *  src_string,
const UNICHARSET & unicharset 
)

WERD_CHOICE::WERD_CHOICE

Constructor to build a WERD_CHOICE from the given string. The function assumes that src_string is not nullptr.

Definition at line 218 of file ratngs.cpp.

220  : unicharset_(&unicharset){
221  GenericVector<UNICHAR_ID> encoding;
222  GenericVector<char> lengths;
223  std::string cleaned = unicharset.CleanupString(src_string);
224  if (unicharset.encode_string(cleaned.c_str(), true, &encoding, &lengths,
225  nullptr)) {
226  lengths.push_back('\0');
227  STRING src_lengths = &lengths[0];
228  this->init(cleaned.c_str(), src_lengths.string(), 0.0, 0.0, NO_PERM);
229  } else { // There must have been an invalid unichar in the string.
230  this->init(8);
231  this->make_bad();
232  }
233 }
bool encode_string(const char *str, bool give_up_on_failure, GenericVector< UNICHAR_ID > *encoding, GenericVector< char > *lengths, int *encoded_length) const
Definition: unicharset.cpp:258
const char * string() const
Definition: strngs.cpp:196
const UNICHARSET * unicharset() const
Definition: ratngs.h:300
void make_bad()
Set the fields in this choice to be default (bad) values.
Definition: ratngs.h:443
void init(int reserved)
Definition: ratngs.h:409
static std::string CleanupString(const char *utf8_str)
Definition: unicharset.h:241
Definition: strngs.h:45

◆ WERD_CHOICE() [5/5]

WERD_CHOICE::WERD_CHOICE ( const WERD_CHOICE & word)
inline

Definition at line 293 of file ratngs.h.

294  : ELIST_LINK(word), unicharset_(word.unicharset_) {
295  this->init(word.length());
296  this->operator=(word);
297  }
void init(int reserved)
Definition: ratngs.h:409
int length() const
Definition: ratngs.h:303
WERD_CHOICE & operator=(const WERD_CHOICE &source)
Definition: ratngs.cpp:521
ELIST_LINK()
Definition: elst.h:92

◆ ~WERD_CHOICE()

WERD_CHOICE::~WERD_CHOICE ( )

WERD_CHOICE::~WERD_CHOICE

Definition at line 276 of file ratngs.cpp.

276  {
277  delete[] unichar_ids_;
278  delete[] script_pos_;
279  delete[] state_;
280  delete[] certainties_;
281 }

Member Function Documentation

◆ adjust_factor()

float WERD_CHOICE::adjust_factor ( ) const
inline

Definition at line 306 of file ratngs.h.

306  {
307  return adjust_factor_;
308  }

◆ append_unichar_id()

void WERD_CHOICE::append_unichar_id ( UNICHAR_ID  unichar_id,
int  blob_count,
float  rating,
float  certainty 
)

append_unichar_id

Make sure there is enough space in the word for the new unichar id and call append_unichar_id_space_allocated().

Definition at line 468 of file ratngs.cpp.

470  {
471  if (length_ == reserved_) {
472  this->double_the_size();
473  }
474  this->append_unichar_id_space_allocated(unichar_id, blob_count,
475  rating, certainty);
476 }
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.h:452
float rating() const
Definition: ratngs.h:327
float certainty() const
Definition: ratngs.h:330
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
void double_the_size()
Make more space in unichar_id_ and fragment_lengths_ arrays.
Definition: ratngs.h:387

◆ append_unichar_id_space_allocated()

void WERD_CHOICE::append_unichar_id_space_allocated ( UNICHAR_ID  unichar_id,
int  blob_count,
float  rating,
float  certainty 
)
inline

This function assumes that there is enough space reserved in the WERD_CHOICE for adding another unichar. This is an efficient alternative to append_unichar_id().

Definition at line 452 of file ratngs.h.

454  {
455  assert(reserved_ > length_);
456  length_++;
457  this->set_unichar_id(unichar_id, blob_count,
458  rating, certainty, length_-1);
459  }
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:359
float rating() const
Definition: ratngs.h:327
float certainty() const
Definition: ratngs.h:330
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315

◆ blob_choices()

BLOB_CHOICE_LIST * WERD_CHOICE::blob_choices ( int  index,
MATRIX * ratings 
) const

Definition at line 290 of file ratngs.cpp.

290  {
291  MATRIX_COORD coord = MatrixCoord(index);
292  BLOB_CHOICE_LIST* result = ratings->get(coord.col, coord.row);
293  if (result == nullptr) {
294  result = new BLOB_CHOICE_LIST;
295  ratings->put(coord.col, coord.row, result);
296  }
297  return result;
298 }
MATRIX_COORD MatrixCoord(int index) const
Definition: ratngs.cpp:302
void put(ICOORD pos, const T &thing)
Definition: matrix.h:220
T get(ICOORD pos) const
Definition: matrix.h:228

◆ BlobPosition()

tesseract::ScriptPos WERD_CHOICE::BlobPosition ( int  index) const
inline

Definition at line 322 of file ratngs.h.

322  {
323  if (index < 0 || index >= length_)
324  return tesseract::SP_NORMAL;
325  return script_pos_[index];
326  }

◆ certainty() [1/2]

float WERD_CHOICE::certainty ( ) const
inline

Definition at line 330 of file ratngs.h.

330  {
331  return certainty_;
332  }

◆ certainty() [2/2]

float WERD_CHOICE::certainty ( int  index) const
inline

Definition at line 333 of file ratngs.h.

333  {
334  return certainties_[index];
335  }

◆ contains_unichar_id()

bool WERD_CHOICE::contains_unichar_id ( UNICHAR_ID  unichar_id) const

contains_unichar_id

Returns true if unichar_ids_ contain the given unichar_id, false otherwise.

Definition at line 326 of file ratngs.cpp.

326  {
327  for (int i = 0; i < length_; ++i) {
328  if (unichar_ids_[i] == unichar_id) {
329  return true;
330  }
331  }
332  return false;
333 }
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315

◆ ContainsAnyNonSpaceDelimited()

bool WERD_CHOICE::ContainsAnyNonSpaceDelimited ( ) const
inline

Definition at line 514 of file ratngs.h.

514  {
515  for (int i = 0; i < length_; ++i) {
516  if (!unicharset_->IsSpaceDelimited(unichar_ids_[i])) return true;
517  }
518  return false;
519  }
bool IsSpaceDelimited(UNICHAR_ID unichar_id) const
Definition: unicharset.h:647

◆ dangerous_ambig_found()

bool WERD_CHOICE::dangerous_ambig_found ( ) const
inline

Definition at line 363 of file ratngs.h.

363  {
364  return dangerous_ambig_found_;
365  }

◆ debug_string()

const STRING WERD_CHOICE::debug_string ( ) const
inline

Definition at line 505 of file ratngs.h.

505  {
506  STRING word_str;
507  for (int i = 0; i < length_; ++i) {
508  word_str += unicharset_->debug_str(unichar_ids_[i]);
509  word_str += " ";
510  }
511  return word_str;
512  }
STRING debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:342
Definition: strngs.h:45

◆ DisplaySegmentation()

void WERD_CHOICE::DisplaySegmentation ( TWERD * word)

Definition at line 764 of file ratngs.cpp.

764  {
765 #ifndef GRAPHICS_DISABLED
766  // Number of different colors to draw with.
767  const int kNumColors = 6;
768  static ScrollView *segm_window = nullptr;
769  // Check the state against the static prev_drawn_state.
770  static GenericVector<int> prev_drawn_state;
771  bool already_done = prev_drawn_state.size() == length_;
772  if (!already_done) prev_drawn_state.init_to_size(length_, 0);
773  for (int i = 0; i < length_; ++i) {
774  if (prev_drawn_state[i] != state_[i]) {
775  already_done = false;
776  }
777  prev_drawn_state[i] = state_[i];
778  }
779  if (already_done || word->blobs.empty()) return;
780 
781  // Create the window if needed.
782  if (segm_window == nullptr) {
783  segm_window = new ScrollView("Segmentation", 5, 10, 500, 256,
784  2000.0, 256.0, true);
785  } else {
786  segm_window->Clear();
787  }
788 
789  TBOX bbox;
790  int blob_index = 0;
791  for (int c = 0; c < length_; ++c) {
792  ScrollView::Color color =
793  static_cast<ScrollView::Color>(c % kNumColors + 3);
794  for (int i = 0; i < state_[c]; ++i, ++blob_index) {
795  TBLOB* blob = word->blobs[blob_index];
796  bbox += blob->bounding_box();
797  blob->plot(segm_window, color, color);
798  }
799  }
800  segm_window->ZoomToRectangle(bbox.left(), bbox.top(),
801  bbox.right(), bbox.bottom());
802  segm_window->Update();
803  window_wait(segm_window);
804 #endif
805 }
int size() const
Definition: genericvector.h:71
void ZoomToRectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:760
void plot(ScrollView *window, ScrollView::Color color, ScrollView::Color child_color)
Definition: blobs.cpp:520
Definition: rect.h:34
static void Update()
Definition: scrollview.cpp:711
int16_t left() const
Definition: rect.h:72
int16_t top() const
Definition: rect.h:58
void init_to_size(int size, const T &t)
bool empty() const
Definition: genericvector.h:90
TBOX bounding_box() const
Definition: blobs.cpp:478
GenericVector< TBLOB * > blobs
Definition: blobs.h:443
void Clear()
Definition: scrollview.cpp:591
int16_t right() const
Definition: rect.h:79
char window_wait(ScrollView *win)
Definition: callcpp.cpp:104
Definition: blobs.h:268
int16_t bottom() const
Definition: rect.h:65

◆ double_the_size()

void WERD_CHOICE::double_the_size ( )
inline

Make more space in the unichar_ids_, script_pos_, state_ and certainties_ arrays.

Definition at line 387 of file ratngs.h.

387  {
388  if (reserved_ > 0) {
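389  unichar_ids_ = GenericVector<UNICHAR_ID>::double_the_size_memcpy(
390  reserved_, unichar_ids_);
391  script_pos_ = GenericVector<tesseract::ScriptPos>::double_the_size_memcpy(
392  reserved_, script_pos_);
393  state_ = GenericVector<int>::double_the_size_memcpy(
394  reserved_, state_);
395  certainties_ = GenericVector<float>::double_the_size_memcpy(
396  reserved_, certainties_);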
397  reserved_ *= 2;
398  } else {
399  unichar_ids_ = new UNICHAR_ID[1];
400  script_pos_ = new tesseract::ScriptPos[1];
401  state_ = new int[1];
402  certainties_ = new float[1];
403  reserved_ = 1;
404  }
405  }
int UNICHAR_ID
Definition: unichar.h:35
static T * double_the_size_memcpy(int current_size, T *data)

◆ GetNonSuperscriptSpan()

void WERD_CHOICE::GetNonSuperscriptSpan ( int *  start,
int *  end 
) const

Definition at line 397 of file ratngs.cpp.

397  {
398  int end = length();
399  while (end > 0 &&
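400  unicharset_->get_isdigit(unichar_ids_[end - 1]) &&
401  BlobPosition(end - 1) == tesseract::SP_SUPERSCRIPT) {
402  end--;
403  }
404  int start = 0;
405  while (start < end &&
406  unicharset_->get_isdigit(unichar_ids_[start]) &&
407  BlobPosition(start) == tesseract::SP_SUPERSCRIPT) {
408  start++;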
409  }
410  *pstart = start;
411  *pend = end;
412 }
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:507
int length() const
Definition: ratngs.h:303
tesseract::ScriptPos BlobPosition(int index) const
Definition: ratngs.h:322

◆ GetTopScriptID()

int WERD_CHOICE::GetTopScriptID ( ) const

Definition at line 670 of file ratngs.cpp.

670  {
671  int max_script = unicharset_->get_script_table_size();
672  int *sid = new int[max_script];
673  int x;
674  for (x = 0; x < max_script; x++) sid[x] = 0;
675  for (x = 0; x < length_; ++x) {
676  int script_id = unicharset_->get_script(unichar_id(x));
677  sid[script_id]++;
678  }
679  if (unicharset_->han_sid() != unicharset_->null_sid()) {
680  // Add the Hiragana & Katakana counts to Han and zero them out.
681  if (unicharset_->hiragana_sid() != unicharset_->null_sid()) {
682  sid[unicharset_->han_sid()] += sid[unicharset_->hiragana_sid()];
683  sid[unicharset_->hiragana_sid()] = 0;
684  }
685  if (unicharset_->katakana_sid() != unicharset_->null_sid()) {
686  sid[unicharset_->han_sid()] += sid[unicharset_->katakana_sid()];
687  sid[unicharset_->katakana_sid()] = 0;
688  }
689  }
690  // Note that high script ID overrides lower one on a tie, thus biasing
691  // towards non-Common script (if sorted that way in unicharset file).
692  int max_sid = 0;
693  for (x = 1; x < max_script; x++)
694  if (sid[x] >= sid[max_sid]) max_sid = x;
695  if (sid[max_sid] < length_ / 2)
696  max_sid = unicharset_->null_sid();
697  delete[] sid;
698  return max_sid;
699 }
int hiragana_sid() const
Definition: unicharset.h:884
int get_script_table_size() const
Definition: unicharset.h:844
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
int null_sid() const
Definition: unicharset.h:878
int han_sid() const
Definition: unicharset.h:883
int katakana_sid() const
Definition: unicharset.h:885
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:658

◆ has_rtl_unichar_id()

bool WERD_CHOICE::has_rtl_unichar_id ( ) const

has_rtl_unichar_id

Returns true if unichar_ids contain at least one "strongly" RTL unichar.

Definition at line 431 of file ratngs.cpp.

431  {
432  int i;
433  for (i = 0; i < length_; ++i) {
434  UNICHARSET::Direction dir = unicharset_->get_direction(unichar_ids_[i]);
435  if (dir == UNICHARSET::U_RIGHT_TO_LEFT ||
436  dir == UNICHARSET::U_RIGHT_TO_LEFT_ARABIC) {
437  return true;
438  }
439  }
440  return false;
441 }
Direction get_direction(UNICHAR_ID unichar_id) const
Definition: unicharset.h:685

◆ init() [1/2]

void WERD_CHOICE::init ( int  reserved)
inline

Initializes WERD_CHOICE - reserves `reserved` slots in the unichar_ids_, script_pos_, state_ and certainties_ arrays (the arrays are left as nullptr when reserved is not positive). Sets other values to default (blank) values.

Definition at line 409 of file ratngs.h.

409  {
410  reserved_ = reserved;
411  if (reserved > 0) {
412  unichar_ids_ = new UNICHAR_ID[reserved];
413  script_pos_ = new tesseract::ScriptPos[reserved];
414  state_ = new int[reserved];
415  certainties_ = new float[reserved];
416  } else {
417  unichar_ids_ = nullptr;
418  script_pos_ = nullptr;
419  state_ = nullptr;
420  certainties_ = nullptr;
421  }
422  length_ = 0;
423  adjust_factor_ = 1.0f;
424  rating_ = 0.0;
425  certainty_ = FLT_MAX;
426  min_x_height_ = 0.0f;
427  max_x_height_ = FLT_MAX;
428  permuter_ = NO_PERM;
429  unichars_in_script_order_ = false; // Tesseract is strict left-to-right.
430  dangerous_ambig_found_ = false;
431  }
int UNICHAR_ID
Definition: unichar.h:35

◆ init() [2/2]

void WERD_CHOICE::init ( const char *  src_string,
const char *  src_lengths,
float  src_rating,
float  src_certainty,
uint8_t  src_permuter 
)

Helper function to build a WERD_CHOICE from the given string, fragment lengths, rating, certainty and permuter. The function assumes that src_string is not nullptr. src_lengths argument could be nullptr, in which case the unichars in src_string are assumed to all be of length 1.

WERD_CHOICE::init

Helper function to build a WERD_CHOICE from the given string, fragment lengths, rating, certainty and permuter.

The function assumes that src_string is not nullptr. src_lengths argument could be nullptr, in which case the unichars in src_string are assumed to all be of length 1.

Definition at line 245 of file ratngs.cpp.

249  {
250  int src_string_len = strlen(src_string);
251  if (src_string_len == 0) {
252  this->init(8);
253  } else {
254  this->init(src_lengths ? strlen(src_lengths): src_string_len);
255  length_ = reserved_;
256  int offset = 0;
257  for (int i = 0; i < length_; ++i) {
258  int unichar_length = src_lengths ? src_lengths[i] : 1;
259  unichar_ids_[i] =
260  unicharset_->unichar_to_id(src_string+offset, unichar_length);
261  state_[i] = 1;
262  certainties_[i] = src_certainty;
263  offset += unichar_length;
264  }
265  }
266  adjust_factor_ = 1.0f;
267  rating_ = src_rating;
268  certainty_ = src_certainty;
269  permuter_ = src_permuter;
270  dangerous_ambig_found_ = false;
271 }
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209
void init(int reserved)
Definition: ratngs.h:409

◆ IsAllSpaces()

bool WERD_CHOICE::IsAllSpaces ( ) const
inline

Definition at line 521 of file ratngs.h.

521  {
522  for (int i = 0; i < length_; ++i) {
523  if (unichar_ids_[i] != UNICHAR_SPACE) return false;
524  }
525  return true;
526  }

◆ length()

int WERD_CHOICE::length ( ) const
inline

Definition at line 303 of file ratngs.h.

303  {
304  return length_;
305  }

◆ make_bad()

void WERD_CHOICE::make_bad ( )
inline

Set the fields in this choice to be default (bad) values.

Definition at line 443 of file ratngs.h.

443  {
444  length_ = 0;
445  rating_ = kBadRating;
446  certainty_ = -FLT_MAX;
447  }
static const float kBadRating
Definition: ratngs.h:275

◆ MatrixCoord()

MATRIX_COORD WERD_CHOICE::MatrixCoord ( int  index) const

Definition at line 302 of file ratngs.cpp.

302  {
303  int col = 0;
304  for (int i = 0; i < index; ++i)
305  col += state_[i];
306  int row = col + state_[index] - 1;
307  return MATRIX_COORD(col, row);
308 }

◆ max_x_height()

float WERD_CHOICE::max_x_height ( ) const
inline

Definition at line 339 of file ratngs.h.

339  {
340  return max_x_height_;
341  }

◆ min_x_height()

float WERD_CHOICE::min_x_height ( ) const
inline

Definition at line 336 of file ratngs.h.

336  {
337  return min_x_height_;
338  }

◆ operator+=()

WERD_CHOICE & WERD_CHOICE::operator+= ( const WERD_CHOICE & second)

WERD_CHOICE::operator+=

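Concatenates a second word choice onto the end of this one. The ratings are added, the certainty is the minimum of the two, the adjust factor is the maximum of the two, and the dangerous-ambiguity flag is set if it is set on either word. If this word's permuter is NO_PERM it takes the second word's permuter; otherwise, if the second word's permuter is not NO_PERM and differs from this one's, the permuter is set to COMPOUND_PERM.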

Definition at line 485 of file ratngs.cpp.

485  {
486  ASSERT_HOST(unicharset_ == second.unicharset_);
487  while (reserved_ < length_ + second.length()) {
488  this->double_the_size();
489  }
490  const UNICHAR_ID *other_unichar_ids = second.unichar_ids();
491  for (int i = 0; i < second.length(); ++i) {
492  unichar_ids_[length_ + i] = other_unichar_ids[i];
493  state_[length_ + i] = second.state_[i];
494  certainties_[length_ + i] = second.certainties_[i];
495  script_pos_[length_ + i] = second.BlobPosition(i);
496  }
497  length_ += second.length();
498  if (second.adjust_factor_ > adjust_factor_)
499  adjust_factor_ = second.adjust_factor_;
500  rating_ += second.rating(); // add ratings
501  if (second.certainty() < certainty_) // take min
502  certainty_ = second.certainty();
503  if (second.dangerous_ambig_found_)
504  dangerous_ambig_found_ = true;
505  if (permuter_ == NO_PERM) {
506  permuter_ = second.permuter();
507  } else if (second.permuter() != NO_PERM &&
508  second.permuter() != permuter_) {
509  permuter_ = COMPOUND_PERM;
510  }
511  return *this;
512 }
int UNICHAR_ID
Definition: unichar.h:35
uint8_t permuter() const
Definition: ratngs.h:346
float rating() const
Definition: ratngs.h:327
float certainty() const
Definition: ratngs.h:330
int length() const
Definition: ratngs.h:303
const UNICHAR_ID * unichar_ids() const
Definition: ratngs.h:312
void double_the_size()
Make more space in unichar_id_ and fragment_lengths_ arrays.
Definition: ratngs.h:387
tesseract::ScriptPos BlobPosition(int index) const
Definition: ratngs.h:322
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ operator=()

WERD_CHOICE & WERD_CHOICE::operator= ( const WERD_CHOICE & source)

WERD_CHOICE::operator=

Allocate enough memory to hold a copy of source and copy over all the information from source to this WERD_CHOICE.

Definition at line 521 of file ratngs.cpp.

521  {
522  while (reserved_ < source.length()) {
523  this->double_the_size();
524  }
525 
526  unicharset_ = source.unicharset_;
527  const UNICHAR_ID *other_unichar_ids = source.unichar_ids();
528  for (int i = 0; i < source.length(); ++i) {
529  unichar_ids_[i] = other_unichar_ids[i];
530  state_[i] = source.state_[i];
531  certainties_[i] = source.certainties_[i];
532  script_pos_[i] = source.BlobPosition(i);
533  }
534  length_ = source.length();
535  adjust_factor_ = source.adjust_factor_;
536  rating_ = source.rating();
537  certainty_ = source.certainty();
538  min_x_height_ = source.min_x_height();
539  max_x_height_ = source.max_x_height();
540  permuter_ = source.permuter();
541  dangerous_ambig_found_ = source.dangerous_ambig_found_;
542  return *this;
543 }
int UNICHAR_ID
Definition: unichar.h:35
uint8_t permuter() const
Definition: ratngs.h:346
float rating() const
Definition: ratngs.h:327
float certainty() const
Definition: ratngs.h:330
float max_x_height() const
Definition: ratngs.h:339
int length() const
Definition: ratngs.h:303
const UNICHAR_ID * unichar_ids() const
Definition: ratngs.h:312
float min_x_height() const
Definition: ratngs.h:336
void double_the_size()
Make more space in unichar_id_ and fragment_lengths_ arrays.
Definition: ratngs.h:387
tesseract::ScriptPos BlobPosition(int index) const
Definition: ratngs.h:322

◆ permuter()

uint8_t WERD_CHOICE::permuter ( ) const
inline

Definition at line 346 of file ratngs.h.

346  {
347  return permuter_;
348  }

◆ permuter_name() [1/2]

const char * WERD_CHOICE::permuter_name ( uint8_t  permuter)
static

Definition at line 194 of file ratngs.cpp.

194  {
195  return kPermuterTypeNames[permuter];
196 }
uint8_t permuter() const
Definition: ratngs.h:346

◆ permuter_name() [2/2]

const char * WERD_CHOICE::permuter_name ( ) const

Definition at line 283 of file ratngs.cpp.

283  {
284  return kPermuterTypeNames[permuter_];
285 }

◆ print() [1/2]

void WERD_CHOICE::print ( ) const
inline

Definition at line 580 of file ratngs.h.

580 { this->print(""); }
void print() const
Definition: ratngs.h:580

◆ print() [2/2]

void WERD_CHOICE::print ( const char *  msg) const

WERD_CHOICE::print

Print WERD_CHOICE to stdout.

Definition at line 727 of file ratngs.cpp.

727  {
728  tprintf("%s : ", msg);
729  for (int i = 0; i < length_; ++i) {
730  tprintf("%s", unicharset_->id_to_unichar(unichar_ids_[i]));
731  }
732  tprintf(" : R=%g, C=%g, F=%g, Perm=%d, xht=[%g,%g], ambig=%d\n",
733  rating_, certainty_, adjust_factor_, permuter_,
734  min_x_height_, max_x_height_, dangerous_ambig_found_);
735  tprintf("pos");
736  for (int i = 0; i < length_; ++i) {
737  tprintf("\t%s", ScriptPosToString(script_pos_[i]));
738  }
739  tprintf("\nstr");
740  for (int i = 0; i < length_; ++i) {
741  tprintf("\t%s", unicharset_->id_to_unichar(unichar_ids_[i]));
742  }
743  tprintf("\nstate:");
744  for (int i = 0; i < length_; ++i) {
745  tprintf("\t%d ", state_[i]);
746  }
747  tprintf("\nC");
748  for (int i = 0; i < length_; ++i) {
749  tprintf("\t%.3f", certainties_[i]);
750  }
751  tprintf("\n");
752 }
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:290
const char * ScriptPosToString(enum ScriptPos script_pos)
Definition: ratngs.cpp:200

◆ print_state()

void WERD_CHOICE::print_state ( const char *  msg) const

Definition at line 755 of file ratngs.cpp.

755  {
756  tprintf("%s", msg);
757  for (int i = 0; i < length_; ++i)
758  tprintf(" %d", state_[i]);
759  tprintf("\n");
760 }
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37

◆ punct_stripped()

void WERD_CHOICE::punct_stripped ( int *  start,
int *  end 
) const

punct_stripped

Returns the half-open interval of unichar_id indices [start, end) which enclose the core portion of this word – the part after stripping punctuation from the left and right.

Definition at line 383 of file ratngs.cpp.

383  {
384  *start = 0;
385  *end = length() - 1;
386  while (*start < length() &&
387  unicharset()->get_ispunctuation(unichar_id(*start))) {
388  (*start)++;
389  }
390  while (*end > -1 &&
391  unicharset()->get_ispunctuation(unichar_id(*end))) {
392  (*end)--;
393  }
394  (*end)++;
395 }
const UNICHARSET * unicharset() const
Definition: ratngs.h:300
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
int length() const
Definition: ratngs.h:303

◆ rating()

float WERD_CHOICE::rating ( ) const
inline

Definition at line 327 of file ratngs.h.

327  {
328  return rating_;
329  }

◆ remove_last_unichar_id()

void WERD_CHOICE::remove_last_unichar_id ( )
inline

Definition at line 483 of file ratngs.h.

483 { --length_; }

◆ remove_unichar_id()

void WERD_CHOICE::remove_unichar_id ( int  index)
inline

Definition at line 484 of file ratngs.h.

484  {
485  this->remove_unichar_ids(index, 1);
486  }
void remove_unichar_ids(int index, int num)
Definition: ratngs.cpp:342

◆ remove_unichar_ids()

void WERD_CHOICE::remove_unichar_ids ( int  start,
int  num 
)

remove_unichar_ids

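Removes num unichar ids starting from index start from unichar_ids_ (and from the parallel script_pos_, state_ and certainties_ arrays) and updates length_ to reflect this change. The blob counts held in state_ for the removed entries are added to the preceding entry, or to the following entry when start is 0. Note: this function does not modify rating_ and certainty_.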

Definition at line 342 of file ratngs.cpp.

342  {
343  ASSERT_HOST(start >= 0 && start + num <= length_);
344  // Accumulate the states to account for the merged blobs.
345  for (int i = 0; i < num; ++i) {
346  if (start > 0)
347  state_[start - 1] += state_[start + i];
348  else if (start + num < length_)
349  state_[start + num] += state_[start + i];
350  }
351  for (int i = start; i + num < length_; ++i) {
352  unichar_ids_[i] = unichar_ids_[i + num];
353  script_pos_[i] = script_pos_[i + num];
354  state_[i] = state_[i + num];
355  certainties_[i] = certainties_[i + num];
356  }
357  length_ -= num;
358 }
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ reverse_and_mirror_unichar_ids()

void WERD_CHOICE::reverse_and_mirror_unichar_ids ( )

reverse_and_mirror_unichar_ids

Reverses and mirrors unichars in unichar_ids.

Definition at line 365 of file ratngs.cpp.

365  {
366  for (int i = 0; i < length_ / 2; ++i) {
367  UNICHAR_ID tmp_id = unichar_ids_[i];
368  unichar_ids_[i] = unicharset_->get_mirror(unichar_ids_[length_-1-i]);
369  unichar_ids_[length_-1-i] = unicharset_->get_mirror(tmp_id);
370  }
371  if (length_ % 2 != 0) {
372  unichar_ids_[length_/2] = unicharset_->get_mirror(unichar_ids_[length_/2]);
373  }
374 }
int UNICHAR_ID
Definition: unichar.h:35
UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const
Definition: unicharset.h:692

◆ ScriptPositionOf()

ScriptPos WERD_CHOICE::ScriptPositionOf ( bool  print_debug,
const UNICHARSET & unicharset,
const TBOX & blob_box,
UNICHAR_ID  unichar_id 
)
static

Definition at line 632 of file ratngs.cpp.

635  {
636  ScriptPos retval = tesseract::SP_NORMAL;
637  int top = blob_box.top();
638  int bottom = blob_box.bottom();
639  int min_bottom, max_bottom, min_top, max_top;
640  unicharset.get_top_bottom(unichar_id,
641  &min_bottom, &max_bottom,
642  &min_top, &max_top);
643 
644  int sub_thresh_top = min_top - kMinSubscriptOffset;
645  int sub_thresh_bot = kBlnBaselineOffset - kMinSubscriptOffset;
646  int sup_thresh_bot = max_bottom + kMinSuperscriptOffset;
647  if (bottom <= kMaxDropCapBottom) {
648  retval = tesseract::SP_DROPCAP;
649  } else if (top < sub_thresh_top && bottom < sub_thresh_bot) {
650  retval = tesseract::SP_SUBSCRIPT;
651  } else if (bottom > sup_thresh_bot) {
652  retval = tesseract::SP_SUPERSCRIPT;
653  }
654 
655  if (print_debug) {
656  const char *pos = ScriptPosToString(retval);
657  tprintf("%s Character %s[bot:%d top: %d] "
658  "bot_range[%d,%d] top_range[%d, %d] "
659  "sub_thresh[bot:%d top:%d] sup_thresh_bot %d\n",
660  pos, unicharset.id_to_unichar(unichar_id),
661  bottom, top,
662  min_bottom, max_bottom, min_top, max_top,
663  sub_thresh_bot, sub_thresh_top,
664  sup_thresh_bot);
665  }
666  return retval;
667 }
const UNICHARSET * unicharset() const
Definition: ratngs.h:300
const int kBlnBaselineOffset
Definition: normalis.h:25
int16_t top() const
Definition: rect.h:58
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:563
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
const int kMinSuperscriptOffset
Definition: ratngs.cpp:45
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:290
const char * ScriptPosToString(enum ScriptPos script_pos)
Definition: ratngs.cpp:200
const int kMaxDropCapBottom
Definition: ratngs.cpp:47
const int kMinSubscriptOffset
Definition: ratngs.cpp:43
int16_t bottom() const
Definition: rect.h:65

◆ set_adjust_factor()

void WERD_CHOICE::set_adjust_factor ( float  factor)
inline

Definition at line 309 of file ratngs.h.

309  {
310  adjust_factor_ = factor;
311  }

◆ set_blob_choice()

void WERD_CHOICE::set_blob_choice ( int  index,
int  blob_count,
const BLOB_CHOICE *  blob_choice 
)

Definition at line 312 of file ratngs.cpp.

313  {
314  unichar_ids_[index] = blob_choice->unichar_id();
315  script_pos_[index] = tesseract::SP_NORMAL;
316  state_[index] = blob_count;
317  certainties_[index] = blob_choice->certainty();
318 }
float certainty() const
Definition: ratngs.h:83
UNICHAR_ID unichar_id() const
Definition: ratngs.h:77

◆ set_certainty()

void WERD_CHOICE::set_certainty ( float  new_val)
inline

Definition at line 372 of file ratngs.h.

372  {
373  certainty_ = new_val;
374  }

◆ set_dangerous_ambig_found_()

void WERD_CHOICE::set_dangerous_ambig_found_ ( bool  value)
inline

Definition at line 366 of file ratngs.h.

366  {
367  dangerous_ambig_found_ = value;
368  }

◆ set_length()

void WERD_CHOICE::set_length ( int  len)
inline

Definition at line 381 of file ratngs.h.

381  {
382  ASSERT_HOST(reserved_ >= len);
383  length_ = len;
384  }
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ set_permuter()

void WERD_CHOICE::set_permuter ( uint8_t  perm)
inline

Definition at line 375 of file ratngs.h.

375  {
376  permuter_ = perm;
377  }

◆ set_rating()

void WERD_CHOICE::set_rating ( float  new_val)
inline

Definition at line 369 of file ratngs.h.

369  {
370  rating_ = new_val;
371  }

◆ set_unichar_id() [1/2]

void WERD_CHOICE::set_unichar_id ( UNICHAR_ID  unichar_id,
int  index 
)
inline

Definition at line 359 of file ratngs.h.

359  {
360  assert(index < length_);
361  unichar_ids_[index] = unichar_id;
362  }
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315

◆ set_unichar_id() [2/2]

void WERD_CHOICE::set_unichar_id ( UNICHAR_ID  unichar_id,
int  blob_count,
float  rating,
float  certainty,
int  index 
)
inline

Definition at line 464 of file ratngs.h.

465  {
466  assert(index < length_);
467  unichar_ids_[index] = unichar_id;
468  state_[index] = blob_count;
469  certainties_[index] = certainty;
470  script_pos_[index] = tesseract::SP_NORMAL;
471  rating_ += rating;
472  if (certainty < certainty_) {
473  certainty_ = certainty;
474  }
475  }
float rating() const
Definition: ratngs.h:327
float certainty() const
Definition: ratngs.h:330
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315

◆ set_unichars_in_script_order()

bool WERD_CHOICE::set_unichars_in_script_order ( bool  in_script_order)
inline

Definition at line 531 of file ratngs.h.

531  {
532  return unichars_in_script_order_ = in_script_order;
533  }

◆ set_x_heights()

void WERD_CHOICE::set_x_heights ( float  min_height,
float  max_height 
)
inline

Definition at line 342 of file ratngs.h.

342  {
343  min_x_height_ = min_height;
344  max_x_height_ = max_height;
345  }

◆ SetAllScriptPositions()

void WERD_CHOICE::SetAllScriptPositions ( tesseract::ScriptPos  position)

Definition at line 626 of file ratngs.cpp.

626  {
627  for (int i = 0; i < length_; ++i)
628  script_pos_[i] = position;
629 }

◆ SetScriptPositions() [1/2]

void WERD_CHOICE::SetScriptPositions ( bool  small_caps,
TWERD *  word,
int  debug = 0 
)

Definition at line 550 of file ratngs.cpp.

550  {
551  // Initialize to normal.
552  for (int i = 0; i < length_; ++i)
553  script_pos_[i] = tesseract::SP_NORMAL;
554  if (word->blobs.empty() || word->NumBlobs() != TotalOfStates()) {
555  return;
556  }
557 
558  int position_counts[4];
559  for (int i = 0; i < 4; i++) {
560  position_counts[i] = 0;
561  }
562 
563  int chunk_index = 0;
564  for (int blob_index = 0; blob_index < length_; ++blob_index, ++chunk_index) {
565  TBLOB* tblob = word->blobs[chunk_index];
566  int uni_id = unichar_id(blob_index);
567  TBOX blob_box = tblob->bounding_box();
568  if (state_ != nullptr) {
569  for (int i = 1; i < state_[blob_index]; ++i) {
570  ++chunk_index;
571  tblob = word->blobs[chunk_index];
572  blob_box += tblob->bounding_box();
573  }
574  }
575  script_pos_[blob_index] = ScriptPositionOf(false, *unicharset_, blob_box,
576  uni_id);
577  if (small_caps && script_pos_[blob_index] != tesseract::SP_DROPCAP) {
578  script_pos_[blob_index] = tesseract::SP_NORMAL;
579  }
580  position_counts[script_pos_[blob_index]]++;
581  }
582  // If almost everything looks like a superscript or subscript,
583  // we most likely just got the baseline wrong.
584  if (position_counts[tesseract::SP_SUBSCRIPT] > 0.75 * length_ ||
585  position_counts[tesseract::SP_SUPERSCRIPT] > 0.75 * length_) {
586  if (debug >= 2) {
587  tprintf("Most characters of %s are subscript or superscript.\n"
588  "That seems wrong, so I'll assume we got the baseline wrong\n",
589  unichar_string().string());
590  }
591  for (int i = 0; i < length_; i++) {
592  ScriptPos sp = script_pos_[i];
593  if (sp == tesseract::SP_SUBSCRIPT || sp == tesseract::SP_SUPERSCRIPT) {
594  position_counts[sp]--;
595  position_counts[tesseract::SP_NORMAL]++;
596  script_pos_[i] = tesseract::SP_NORMAL;
597  }
598  }
599  }
600 
601  if ((debug >= 1 && position_counts[tesseract::SP_NORMAL] < length_) ||
602  debug >= 2) {
603  tprintf("SetScriptPosition on %s\n", unichar_string().string());
604  int chunk_index = 0;
605  for (int blob_index = 0; blob_index < length_; ++blob_index) {
606  if (debug >= 2 || script_pos_[blob_index] != tesseract::SP_NORMAL) {
607  TBLOB* tblob = word->blobs[chunk_index];
608  ScriptPositionOf(true, *unicharset_, tblob->bounding_box(),
609  unichar_id(blob_index));
610  }
611  chunk_index += state_ != nullptr ? state_[blob_index] : 1;
612  }
613  }
614 }
Definition: rect.h:34
int NumBlobs() const
Definition: blobs.h:432
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
bool empty() const
Definition: genericvector.h:90
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
TBOX bounding_box() const
Definition: blobs.cpp:478
GenericVector< TBLOB * > blobs
Definition: blobs.h:443
const STRING & unichar_string() const
Definition: ratngs.h:541
Definition: blobs.h:268
static tesseract::ScriptPos ScriptPositionOf(bool print_debug, const UNICHARSET &unicharset, const TBOX &blob_box, UNICHAR_ID unichar_id)
Definition: ratngs.cpp:632
int TotalOfStates() const
Definition: ratngs.cpp:714

◆ SetScriptPositions() [2/2]

void WERD_CHOICE::SetScriptPositions ( const tesseract::ScriptPos *  positions,
int  length 
)

Definition at line 616 of file ratngs.cpp.

617  {
618  ASSERT_HOST(length == length_);
619  if (positions != script_pos_) {
620  delete [] script_pos_;
621  script_pos_ = new ScriptPos[length];
622  memcpy(script_pos_, positions, sizeof(positions[0]) * length);
623  }
624 }
int length() const
Definition: ratngs.h:303
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ shallow_copy()

WERD_CHOICE WERD_CHOICE::shallow_copy ( int  start,
int  end 
) const

Definition at line 414 of file ratngs.cpp.

414  {
415  ASSERT_HOST(start >= 0 && start <= length_);
416  ASSERT_HOST(end >= 0 && end <= length_);
417  if (end < start) { end = start; }
418  WERD_CHOICE retval(unicharset_, end - start);
419  for (int i = start; i < end; i++) {
420  retval.append_unichar_id_space_allocated(
421  unichar_ids_[i], state_[i], 0.0f, certainties_[i]);
422  }
423  return retval;
424 }
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ state()

int WERD_CHOICE::state ( int  index) const
inline

Definition at line 319 of file ratngs.h.

319  {
320  return state_[index];
321  }

◆ string_and_lengths()

void WERD_CHOICE::string_and_lengths ( STRING *  word_str,
STRING *  word_lengths_str 
) const

string_and_lengths

Populates the given word_str with unichars from unichar_ids and word_lengths_str with the corresponding unichar lengths. word_lengths_str may be nullptr, in which case only word_str is populated.

Definition at line 449 of file ratngs.cpp.

450  {
451  *word_str = "";
452  if (word_lengths_str != nullptr) *word_lengths_str = "";
453  for (int i = 0; i < length_; ++i) {
454  const char *ch = unicharset_->id_to_unichar_ext(unichar_ids_[i]);
455  *word_str += ch;
456  if (word_lengths_str != nullptr) {
457  *word_lengths_str += strlen(ch);
458  }
459  }
460 }
const char * id_to_unichar_ext(UNICHAR_ID id) const
Definition: unicharset.cpp:298

◆ TotalOfStates()

int WERD_CHOICE::TotalOfStates ( ) const

Definition at line 714 of file ratngs.cpp.

714  {
715  int total_chunks = 0;
716  for (int i = 0; i < length_; ++i) {
717  total_chunks += state_[i];
718  }
719  return total_chunks;
720 }

◆ unichar_id()

UNICHAR_ID WERD_CHOICE::unichar_id ( int  index) const
inline

Definition at line 315 of file ratngs.h.

315  {
316  assert(index < length_);
317  return unichar_ids_[index];
318  }

◆ unichar_ids()

const UNICHAR_ID* WERD_CHOICE::unichar_ids ( ) const
inline

Definition at line 312 of file ratngs.h.

312  {
313  return unichar_ids_;
314  }

◆ unichar_lengths()

const STRING& WERD_CHOICE::unichar_lengths ( ) const
inline

Definition at line 548 of file ratngs.h.

548  {
549  this->string_and_lengths(&unichar_string_, &unichar_lengths_);
550  return unichar_lengths_;
551  }
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:449

◆ unichar_string()

const STRING& WERD_CHOICE::unichar_string ( ) const
inline

Definition at line 541 of file ratngs.h.

541  {
542  this->string_and_lengths(&unichar_string_, &unichar_lengths_);
543  return unichar_string_;
544  }
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:449

◆ unichars_in_script_order()

bool WERD_CHOICE::unichars_in_script_order ( ) const
inline

Definition at line 535 of file ratngs.h.

535  {
536  return unichars_in_script_order_;
537  }

◆ unicharset()

const UNICHARSET* WERD_CHOICE::unicharset ( ) const
inline

Definition at line 300 of file ratngs.h.

300  {
301  return unicharset_;
302  }

◆ UpdateStateForSplit()

void WERD_CHOICE::UpdateStateForSplit ( int  blob_position)

Definition at line 702 of file ratngs.cpp.

702  {
703  int total_chunks = 0;
704  for (int i = 0; i < length_; ++i) {
705  total_chunks += state_[i];
706  if (total_chunks > blob_position) {
707  ++state_[i];
708  return;
709  }
710  }
711 }

Member Data Documentation

◆ kBadRating

const float WERD_CHOICE::kBadRating = 100000.0
static

Definition at line 275 of file ratngs.h.


The documentation for this class was generated from the following files: