tesseract  5.0.0-alpha-619-ge9db
UNICHARSET Class Reference

#include <unicharset.h>

Public Types

enum  Direction {
  U_LEFT_TO_RIGHT = 0, U_RIGHT_TO_LEFT = 1, U_EUROPEAN_NUMBER = 2, U_EUROPEAN_NUMBER_SEPARATOR = 3,
  U_EUROPEAN_NUMBER_TERMINATOR = 4, U_ARABIC_NUMBER = 5, U_COMMON_NUMBER_SEPARATOR = 6, U_BLOCK_SEPARATOR = 7,
  U_SEGMENT_SEPARATOR = 8, U_WHITE_SPACE_NEUTRAL = 9, U_OTHER_NEUTRAL = 10, U_LEFT_TO_RIGHT_EMBEDDING = 11,
  U_LEFT_TO_RIGHT_OVERRIDE = 12, U_RIGHT_TO_LEFT_ARABIC = 13, U_RIGHT_TO_LEFT_EMBEDDING = 14, U_RIGHT_TO_LEFT_OVERRIDE = 15,
  U_POP_DIRECTIONAL_FORMAT = 16, U_DIR_NON_SPACING_MARK = 17, U_BOUNDARY_NEUTRAL = 18, U_FIRST_STRONG_ISOLATE = 19,
  U_LEFT_TO_RIGHT_ISOLATE = 20, U_RIGHT_TO_LEFT_ISOLATE = 21, U_POP_DIRECTIONAL_ISOLATE = 22, U_CHAR_DIRECTION_COUNT
}
 

Public Member Functions

 UNICHARSET ()
 
 ~UNICHARSET ()
 
UNICHAR_ID unichar_to_id (const char *const unichar_repr) const
 
UNICHAR_ID unichar_to_id (const char *const unichar_repr, int length) const
 
int step (const char *str) const
 
bool encodable_string (const char *str, int *first_bad_position) const
 
bool encode_string (const char *str, bool give_up_on_failure, GenericVector< UNICHAR_ID > *encoding, GenericVector< char > *lengths, int *encoded_length) const
 
const char * id_to_unichar (UNICHAR_ID id) const
 
const char * id_to_unichar_ext (UNICHAR_ID id) const
 
STRING debug_str (UNICHAR_ID id) const
 
STRING debug_str (const char *unichar_repr) const
 
void unichar_insert (const char *const unichar_repr, OldUncleanUnichars old_style)
 
void unichar_insert (const char *const unichar_repr)
 
void unichar_insert_backwards_compatible (const char *const unichar_repr)
 
bool contains_unichar_id (UNICHAR_ID unichar_id) const
 
bool contains_unichar (const char *const unichar_repr) const
 
bool contains_unichar (const char *const unichar_repr, int length) const
 
bool eq (UNICHAR_ID unichar_id, const char *const unichar_repr) const
 
void delete_pointers_in_unichars ()
 
void clear ()
 
int size () const
 
void reserve (int unichars_number)
 
bool save_to_file (const char *const filename) const
 
bool save_to_file (FILE *file) const
 
bool save_to_file (tesseract::TFile *file) const
 
bool save_to_string (STRING *str) const
 
bool load_from_file (const char *const filename, bool skip_fragments)
 
bool load_from_file (const char *const filename)
 
bool load_from_file (FILE *file, bool skip_fragments)
 
bool load_from_file (FILE *file)
 
bool load_from_file (tesseract::TFile *file, bool skip_fragments)
 
void post_load_setup ()
 
bool major_right_to_left () const
 
void set_black_and_whitelist (const char *blacklist, const char *whitelist, const char *unblacklist)
 
void set_isalpha (UNICHAR_ID unichar_id, bool value)
 
void set_islower (UNICHAR_ID unichar_id, bool value)
 
void set_isupper (UNICHAR_ID unichar_id, bool value)
 
void set_isdigit (UNICHAR_ID unichar_id, bool value)
 
void set_ispunctuation (UNICHAR_ID unichar_id, bool value)
 
void set_isngram (UNICHAR_ID unichar_id, bool value)
 
void set_script (UNICHAR_ID unichar_id, const char *value)
 
void set_other_case (UNICHAR_ID unichar_id, UNICHAR_ID other_case)
 
void set_direction (UNICHAR_ID unichar_id, UNICHARSET::Direction value)
 
void set_mirror (UNICHAR_ID unichar_id, UNICHAR_ID mirror)
 
void set_normed (UNICHAR_ID unichar_id, const char *normed)
 
void set_normed_ids (UNICHAR_ID unichar_id)
 
bool get_isalpha (UNICHAR_ID unichar_id) const
 
bool get_islower (UNICHAR_ID unichar_id) const
 
bool get_isupper (UNICHAR_ID unichar_id) const
 
bool get_isdigit (UNICHAR_ID unichar_id) const
 
bool get_ispunctuation (UNICHAR_ID unichar_id) const
 
bool get_isngram (UNICHAR_ID unichar_id) const
 
bool get_isprivate (UNICHAR_ID unichar_id) const
 
bool top_bottom_useful () const
 
void set_ranges_empty ()
 
void SetPropertiesFromOther (const UNICHARSET &src)
 
void PartialSetPropertiesFromOther (int start_index, const UNICHARSET &src)
 
void ExpandRangesFromOther (const UNICHARSET &src)
 
void CopyFrom (const UNICHARSET &src)
 
void AppendOtherUnicharset (const UNICHARSET &src)
 
bool SizesDistinct (UNICHAR_ID id1, UNICHAR_ID id2) const
 
void get_top_bottom (UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
 
void set_top_bottom (UNICHAR_ID unichar_id, int min_bottom, int max_bottom, int min_top, int max_top)
 
void get_width_stats (UNICHAR_ID unichar_id, float *width, float *width_sd) const
 
void set_width_stats (UNICHAR_ID unichar_id, float width, float width_sd)
 
void get_bearing_stats (UNICHAR_ID unichar_id, float *bearing, float *bearing_sd) const
 
void set_bearing_stats (UNICHAR_ID unichar_id, float bearing, float bearing_sd)
 
void get_advance_stats (UNICHAR_ID unichar_id, float *advance, float *advance_sd) const
 
void set_advance_stats (UNICHAR_ID unichar_id, float advance, float advance_sd)
 
bool PropertiesIncomplete (UNICHAR_ID unichar_id) const
 
bool IsSpaceDelimited (UNICHAR_ID unichar_id) const
 
int get_script (UNICHAR_ID unichar_id) const
 
unsigned int get_properties (UNICHAR_ID unichar_id) const
 
char get_chartype (UNICHAR_ID unichar_id) const
 
UNICHAR_ID get_other_case (UNICHAR_ID unichar_id) const
 
Direction get_direction (UNICHAR_ID unichar_id) const
 
UNICHAR_ID get_mirror (UNICHAR_ID unichar_id) const
 
UNICHAR_ID to_lower (UNICHAR_ID unichar_id) const
 
UNICHAR_ID to_upper (UNICHAR_ID unichar_id) const
 
bool has_special_codes () const
 
bool AnyRepeatedUnicodes () const
 
const CHAR_FRAGMENT * get_fragment (UNICHAR_ID unichar_id) const
 
bool get_isalpha (const char *const unichar_repr) const
 
bool get_islower (const char *const unichar_repr) const
 
bool get_isupper (const char *const unichar_repr) const
 
bool get_isdigit (const char *const unichar_repr) const
 
bool get_ispunctuation (const char *const unichar_repr) const
 
unsigned int get_properties (const char *const unichar_repr) const
 
char get_chartype (const char *const unichar_repr) const
 
int get_script (const char *const unichar_repr) const
 
const CHAR_FRAGMENT * get_fragment (const char *const unichar_repr) const
 
bool get_isalpha (const char *const unichar_repr, int length) const
 
bool get_islower (const char *const unichar_repr, int length) const
 
bool get_isupper (const char *const unichar_repr, int length) const
 
bool get_isdigit (const char *const unichar_repr, int length) const
 
bool get_ispunctuation (const char *const unichar_repr, int length) const
 
const char * get_normed_unichar (UNICHAR_ID unichar_id) const
 
const GenericVector< UNICHAR_ID > & normed_ids (UNICHAR_ID unichar_id) const
 
int get_script (const char *const unichar_repr, int length) const
 
int get_script_table_size () const
 
const char * get_script_from_script_id (int id) const
 
int get_script_id_from_name (const char *script_name) const
 
bool is_null_script (const char *script) const
 
int add_script (const char *script)
 
bool get_enabled (UNICHAR_ID unichar_id) const
 
int null_sid () const
 
int common_sid () const
 
int latin_sid () const
 
int cyrillic_sid () const
 
int greek_sid () const
 
int han_sid () const
 
int hiragana_sid () const
 
int katakana_sid () const
 
int thai_sid () const
 
int hangul_sid () const
 
int default_sid () const
 
bool script_has_upper_lower () const
 
bool script_has_xheight () const
 

Static Public Member Functions

static STRING debug_utf8_str (const char *str)
 
static std::string CleanupString (const char *utf8_str)
 
static std::string CleanupString (const char *utf8_str, size_t length)
 

Static Public Attributes

static const TESS_API char * kCustomLigatures [][2]
 
static const TESS_API char * kSpecialUnicharCodes [SPECIAL_UNICHAR_CODES_COUNT]
 

Detailed Description

Definition at line 145 of file unicharset.h.

Member Enumeration Documentation

◆ Direction

Enumerator
U_LEFT_TO_RIGHT 
U_RIGHT_TO_LEFT 
U_EUROPEAN_NUMBER 
U_EUROPEAN_NUMBER_SEPARATOR 
U_EUROPEAN_NUMBER_TERMINATOR 
U_ARABIC_NUMBER 
U_COMMON_NUMBER_SEPARATOR 
U_BLOCK_SEPARATOR 
U_SEGMENT_SEPARATOR 
U_WHITE_SPACE_NEUTRAL 
U_OTHER_NEUTRAL 
U_LEFT_TO_RIGHT_EMBEDDING 
U_LEFT_TO_RIGHT_OVERRIDE 
U_RIGHT_TO_LEFT_ARABIC 
U_RIGHT_TO_LEFT_EMBEDDING 
U_RIGHT_TO_LEFT_OVERRIDE 
U_POP_DIRECTIONAL_FORMAT 
U_DIR_NON_SPACING_MARK 
U_BOUNDARY_NEUTRAL 
U_FIRST_STRONG_ISOLATE 
U_LEFT_TO_RIGHT_ISOLATE 
U_RIGHT_TO_LEFT_ISOLATE 
U_POP_DIRECTIONAL_ISOLATE 
U_CHAR_DIRECTION_COUNT 

Definition at line 156 of file unicharset.h.

156  {
157  U_LEFT_TO_RIGHT = 0,
158  U_RIGHT_TO_LEFT = 1,
159  U_EUROPEAN_NUMBER = 2,
162  U_ARABIC_NUMBER = 5,
164  U_BLOCK_SEPARATOR = 7,
167  U_OTHER_NEUTRAL = 10,
175  U_BOUNDARY_NEUTRAL = 18,
180 #ifndef U_HIDE_DEPRECATED_API
182 #endif // U_HIDE_DEPRECATED_API
183  };

Constructor & Destructor Documentation

◆ UNICHARSET()

UNICHARSET::UNICHARSET ( )

Definition at line 175 of file unicharset.cpp.

175  :
176  unichars(nullptr),
177  ids(),
178  size_used(0),
179  size_reserved(0),
180  script_table(nullptr),
181  script_table_size_used(0) {
182  clear();
183  for (int i = 0; i < SPECIAL_UNICHAR_CODES_COUNT; ++i) {
185  if (i == UNICHAR_JOINED)
186  set_isngram(i, true);
187  }
188 }

◆ ~UNICHARSET()

UNICHARSET::~UNICHARSET ( )

Definition at line 190 of file unicharset.cpp.

190  {
191  clear();
192 }

Member Function Documentation

◆ add_script()

int UNICHARSET::add_script ( const char *  script)

Definition at line 1020 of file unicharset.cpp.

1020  {
1021  for (int i = 0; i < script_table_size_used; ++i) {
1022  if (strcmp(script, script_table[i]) == 0)
1023  return i;
1024  }
1025  if (script_table_size_reserved == 0) {
1026  script_table_size_reserved = 8;
1027  script_table = new char*[script_table_size_reserved];
1028  } else if (script_table_size_used >= script_table_size_reserved) {
1029  assert(script_table_size_used == script_table_size_reserved);
1030  script_table_size_reserved += script_table_size_reserved;
1031  char** new_script_table = new char*[script_table_size_reserved];
1032  memcpy(new_script_table, script_table,
1033  script_table_size_used * sizeof(char*));
1034  delete[] script_table;
1035  script_table = new_script_table;
1036  }
1037  script_table[script_table_size_used] = new char[strlen(script) + 1];
1038  strcpy(script_table[script_table_size_used], script);
1039  return script_table_size_used++;
1040 }

◆ AnyRepeatedUnicodes()

bool UNICHARSET::AnyRepeatedUnicodes ( ) const

Definition at line 1007 of file unicharset.cpp.

1007  {
1008  int start_id = 0;
1010  for (int id = start_id; id < size_used; ++id) {
1011  // Convert to unicodes.
1012  std::vector<char32> unicodes = UNICHAR::UTF8ToUTF32(get_normed_unichar(id));
1013  for (size_t u = 1; u < unicodes.size(); ++u) {
1014  if (unicodes[u - 1] == unicodes[u]) return true;
1015  }
1016  }
1017  return false;
1018 }

◆ AppendOtherUnicharset()

void UNICHARSET::AppendOtherUnicharset ( const UNICHARSET &  src)

Definition at line 463 of file unicharset.cpp.

463  {
464  int initial_used = size_used;
465  for (int ch = 0; ch < src.size_used; ++ch) {
466  const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties;
467  const char* utf8 = src.id_to_unichar(ch);
468  int id = size_used;
469  if (contains_unichar(utf8)) {
470  id = unichar_to_id(utf8);
471  // Just expand current ranges.
472  unichars[id].properties.ExpandRangesFrom(src_props);
473  } else {
475  unichars[id].properties.SetRangesEmpty();
476  }
477  }
478  // Set properties, including mirror and other_case, WITHOUT reordering
479  // the unicharset.
480  PartialSetPropertiesFromOther(initial_used, src);
481 }

◆ CleanupString() [1/2]

static std::string UNICHARSET::CleanupString ( const char *  utf8_str)
inline static

Definition at line 246 of file unicharset.h.

246  {
247  return CleanupString(utf8_str, strlen(utf8_str));
248  }

◆ CleanupString() [2/2]

std::string UNICHARSET::CleanupString ( const char *  utf8_str,
size_t  length 
)
static

Definition at line 1110 of file unicharset.cpp.

1110  {
1111  std::string result;
1112  result.reserve(length);
1113  char ch;
1114  while ((ch = *utf8_str) != '\0' && length-- > 0) {
1115  int key_index = 0;
1116  const char* key;
1117  while ((key = kCleanupMaps[key_index][0]) != nullptr) {
1118  int match = 0;
1119  while (key[match] != '\0' && key[match] == utf8_str[match]) ++match;
1120  if (key[match] == '\0') {
1121  utf8_str += match;
1122  break;
1123  }
1124  ++key_index;
1125  }
1126  if (key == nullptr) {
1127  result.push_back(ch);
1128  ++utf8_str;
1129  } else {
1130  result.append(kCleanupMaps[key_index][1]);
1131  }
1132  }
1133  return result;
1134 }

◆ clear()

void UNICHARSET::clear ( )
inline

Definition at line 306 of file unicharset.h.

306  {
307  if (script_table != nullptr) {
308  for (int i = 0; i < script_table_size_used; ++i)
309  delete[] script_table[i];
310  delete[] script_table;
311  script_table = nullptr;
312  script_table_size_used = 0;
313  }
314  if (unichars != nullptr) {
316  delete[] unichars;
317  unichars = nullptr;
318  }
319  script_table_size_reserved = 0;
320  size_reserved = 0;
321  size_used = 0;
322  ids.clear();
323  top_bottom_set_ = false;
324  script_has_upper_lower_ = false;
325  script_has_xheight_ = false;
326  old_style_included_ = false;
327  null_sid_ = 0;
328  common_sid_ = 0;
329  latin_sid_ = 0;
330  cyrillic_sid_ = 0;
331  greek_sid_ = 0;
332  han_sid_ = 0;
333  hiragana_sid_ = 0;
334  katakana_sid_ = 0;
335  thai_sid_ = 0;
336  hangul_sid_ = 0;
337  default_sid_ = 0;
338  }

◆ common_sid()

int UNICHARSET::common_sid ( ) const
inline

Definition at line 875 of file unicharset.h.

875 { return common_sid_; }

◆ contains_unichar() [1/2]

bool UNICHARSET::contains_unichar ( const char *const  unichar_repr) const

Definition at line 670 of file unicharset.cpp.

670  {
671  std::string cleaned =
672  old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
673  return ids.contains(cleaned.data(), cleaned.size());
674 }

◆ contains_unichar() [2/2]

bool UNICHARSET::contains_unichar ( const char *const  unichar_repr,
int  length 
) const

Definition at line 676 of file unicharset.cpp.

677  {
678  if (length == 0) {
679  return false;
680  }
681  std::string cleaned(unichar_repr, length);
682  if (!old_style_included_) cleaned = CleanupString(unichar_repr, length);
683  return ids.contains(cleaned.data(), cleaned.size());
684 }

◆ contains_unichar_id()

bool UNICHARSET::contains_unichar_id ( UNICHAR_ID  unichar_id) const
inline

Definition at line 284 of file unicharset.h.

284  {
285  return unichar_id != INVALID_UNICHAR_ID && unichar_id < size_used &&
286  unichar_id >= 0;
287  }

◆ CopyFrom()

void UNICHARSET::CopyFrom ( const UNICHARSET &  src)

Definition at line 447 of file unicharset.cpp.

447  {
448  clear();
449  for (int ch = 0; ch < src.size_used; ++ch) {
450  const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties;
451  const char* utf8 = src.id_to_unichar(ch);
453  unichars[ch].properties.ExpandRangesFrom(src_props);
454  }
455  // Set properties, including mirror and other_case, WITHOUT reordering
456  // the unicharset.
458 }

◆ cyrillic_sid()

int UNICHARSET::cyrillic_sid ( ) const
inline

Definition at line 877 of file unicharset.h.

877 { return cyrillic_sid_; }

◆ debug_str() [1/2]

STRING UNICHARSET::debug_str ( const char *  unichar_repr) const
inline

Definition at line 254 of file unicharset.h.

254  {
255  return debug_str(unichar_to_id(unichar_repr));
256  }

◆ debug_str() [2/2]

STRING UNICHARSET::debug_str ( UNICHAR_ID  id) const

Definition at line 342 of file unicharset.cpp.

342  {
343  if (id == INVALID_UNICHAR_ID) return STRING(id_to_unichar(id));
344  const CHAR_FRAGMENT *fragment = this->get_fragment(id);
345  if (fragment) {
346  return fragment->to_string();
347  }
348  const char* str = id_to_unichar(id);
349  STRING result = debug_utf8_str(str);
350  // Append a for lower alpha, A for upper alpha, and x if alpha but neither.
351  if (get_isalpha(id)) {
352  if (get_islower(id))
353  result += "a";
354  else if (get_isupper(id))
355  result += "A";
356  else
357  result += "x";
358  }
359  // Append 0 if a digit.
360  if (get_isdigit(id)) {
361  result += "0";
362  }
363  // Append p is a punctuation symbol.
364  if (get_ispunctuation(id)) {
365  result += "p";
366  }
367  return result;
368 }

◆ debug_utf8_str()

STRING UNICHARSET::debug_utf8_str ( const char *  str)
static

Definition at line 318 of file unicharset.cpp.

318  {
319  STRING result = str;
320  result += " [";
321  int step = 1;
322  // Chop into unicodes and code each as hex.
323  for (int i = 0; str[i] != '\0'; i += step) {
324  char hex[sizeof(int) * 2 + 1];
325  step = UNICHAR::utf8_step(str + i);
326  if (step == 0) {
327  step = 1;
328  sprintf(hex, "%x", str[i]);
329  } else {
330  UNICHAR ch(str + i, step);
331  sprintf(hex, "%x", ch.first_uni());
332  }
333  result += hex;
334  result += " ";
335  }
336  result += "]";
337  return result;
338 }

◆ default_sid()

int UNICHARSET::default_sid ( ) const
inline

Definition at line 884 of file unicharset.h.

884 { return default_sid_; }

◆ delete_pointers_in_unichars()

void UNICHARSET::delete_pointers_in_unichars ( )
inline

Definition at line 298 of file unicharset.h.

298  {
299  for (int i = 0; i < size_used; ++i) {
300  delete unichars[i].properties.fragment;
301  unichars[i].properties.fragment = nullptr;
302  }
303  }

◆ encodable_string()

bool UNICHARSET::encodable_string ( const char *  str,
int *  first_bad_position 
) const

Definition at line 243 of file unicharset.cpp.

244  {
245  GenericVector<UNICHAR_ID> encoding;
246  return encode_string(str, true, &encoding, nullptr, first_bad_position);
247 }

◆ encode_string()

bool UNICHARSET::encode_string ( const char *  str,
bool  give_up_on_failure,
GenericVector< UNICHAR_ID > *  encoding,
GenericVector< char > *  lengths,
int *  encoded_length 
) const

Definition at line 258 of file unicharset.cpp.

261  {
262  GenericVector<UNICHAR_ID> working_encoding;
263  GenericVector<char> working_lengths;
264  GenericVector<char> best_lengths;
265  encoding->truncate(0); // Just in case str is empty.
266  int str_length = strlen(str);
267  int str_pos = 0;
268  bool perfect = true;
269  while (str_pos < str_length) {
270  encode_string(str, str_pos, str_length, &working_encoding, &working_lengths,
271  &str_pos, encoding, &best_lengths);
272  if (str_pos < str_length) {
273  // This is a non-match. Skip one utf-8 character.
274  perfect = false;
275  if (give_up_on_failure) break;
276  int step = UNICHAR::utf8_step(str + str_pos);
277  if (step == 0) step = 1;
278  encoding->push_back(INVALID_UNICHAR_ID);
279  best_lengths.push_back(step);
280  str_pos += step;
281  working_encoding = *encoding;
282  working_lengths = best_lengths;
283  }
284  }
285  if (lengths != nullptr) *lengths = best_lengths;
286  if (encoded_length != nullptr) *encoded_length = str_pos;
287  return perfect;
288 }

◆ eq()

bool UNICHARSET::eq ( UNICHAR_ID  unichar_id,
const char *const  unichar_repr 
) const

Definition at line 686 of file unicharset.cpp.

687  {
688  return strcmp(this->id_to_unichar(unichar_id), unichar_repr) == 0;
689 }

◆ ExpandRangesFromOther()

void UNICHARSET::ExpandRangesFromOther ( const UNICHARSET &  src)

Definition at line 434 of file unicharset.cpp.

434  {
435  for (int ch = 0; ch < size_used; ++ch) {
436  const char* utf8 = id_to_unichar(ch);
437  UNICHAR_PROPERTIES properties;
438  if (src.GetStrProperties(utf8, &properties)) {
439  // Expand just the ranges from properties.
440  unichars[ch].properties.ExpandRangesFrom(properties);
441  }
442  }
443 }

◆ get_advance_stats()

void UNICHARSET::get_advance_stats ( UNICHAR_ID  unichar_id,
float *  advance,
float *  advance_sd 
) const
inline

Definition at line 620 of file unicharset.h.

621  {
622  if (INVALID_UNICHAR_ID == unichar_id) {
623  *advance = *advance_sd = 0;
624  return;
625  }
626  ASSERT_HOST(contains_unichar_id(unichar_id));
627  *advance = unichars[unichar_id].properties.advance;
628  *advance_sd = unichars[unichar_id].properties.advance_sd;
629  }

◆ get_bearing_stats()

void UNICHARSET::get_bearing_stats ( UNICHAR_ID  unichar_id,
float *  bearing,
float *  bearing_sd 
) const
inline

Definition at line 603 of file unicharset.h.

604  {
605  if (INVALID_UNICHAR_ID == unichar_id) {
606  *bearing = *bearing_sd = 0.0f;
607  return;
608  }
609  ASSERT_HOST(contains_unichar_id(unichar_id));
610  *bearing = unichars[unichar_id].properties.bearing;
611  *bearing_sd = unichars[unichar_id].properties.bearing_sd;
612  }

◆ get_chartype() [1/2]

char UNICHARSET::get_chartype ( const char *const  unichar_repr) const
inline

Definition at line 761 of file unicharset.h.

761  {
762  return get_chartype(unichar_to_id(unichar_repr));
763  }

◆ get_chartype() [2/2]

char UNICHARSET::get_chartype ( UNICHAR_ID  unichar_id) const

Definition at line 616 of file unicharset.cpp.

616  {
617  if (this->get_isupper(id)) return 'A';
618  if (this->get_islower(id)) return 'a';
619  if (this->get_isalpha(id)) return 'x';
620  if (this->get_isdigit(id)) return '0';
621  if (this->get_ispunctuation(id)) return 'p';
622  return 0;
623 }

◆ get_direction()

Direction UNICHARSET::get_direction ( UNICHAR_ID  unichar_id) const
inline

Definition at line 680 of file unicharset.h.

680  {
681  if (INVALID_UNICHAR_ID == unichar_id) return UNICHARSET::U_OTHER_NEUTRAL;
682  ASSERT_HOST(contains_unichar_id(unichar_id));
683  return unichars[unichar_id].properties.direction;
684  }

◆ get_enabled()

bool UNICHARSET::get_enabled ( UNICHAR_ID  unichar_id) const
inline

Definition at line 868 of file unicharset.h.

868  {
869  ASSERT_HOST(contains_unichar_id(unichar_id));
870  return unichars[unichar_id].properties.enabled;
871  }

◆ get_fragment() [1/2]

const CHAR_FRAGMENT* UNICHARSET::get_fragment ( const char *const  unichar_repr) const
inline

Definition at line 774 of file unicharset.h.

774  {
775  if (unichar_repr == nullptr || unichar_repr[0] == '\0' ||
776  !ids.contains(unichar_repr, false)) {
777  return nullptr;
778  }
779  return get_fragment(unichar_to_id(unichar_repr));
780  }

◆ get_fragment() [2/2]

const CHAR_FRAGMENT* UNICHARSET::get_fragment ( UNICHAR_ID  unichar_id) const
inline

Definition at line 724 of file unicharset.h.

724  {
725  if (INVALID_UNICHAR_ID == unichar_id) return nullptr;
726  ASSERT_HOST(contains_unichar_id(unichar_id));
727  return unichars[unichar_id].properties.fragment;
728  }

◆ get_isalpha() [1/3]

bool UNICHARSET::get_isalpha ( const char *const  unichar_repr) const
inline

Definition at line 731 of file unicharset.h.

731  {
732  return get_isalpha(unichar_to_id(unichar_repr));
733  }

◆ get_isalpha() [2/3]

bool UNICHARSET::get_isalpha ( const char *const  unichar_repr,
int  length 
) const
inline

Definition at line 784 of file unicharset.h.

785  {
786  return get_isalpha(unichar_to_id(unichar_repr, length));
787  }

◆ get_isalpha() [3/3]

bool UNICHARSET::get_isalpha ( UNICHAR_ID  unichar_id) const
inline

Definition at line 481 of file unicharset.h.

481  {
482  if (INVALID_UNICHAR_ID == unichar_id) return false;
483  ASSERT_HOST(contains_unichar_id(unichar_id));
484  return unichars[unichar_id].properties.isalpha;
485  }

◆ get_isdigit() [1/3]

bool UNICHARSET::get_isdigit ( const char *const  unichar_repr) const
inline

Definition at line 746 of file unicharset.h.

746  {
747  return get_isdigit(unichar_to_id(unichar_repr));
748  }

◆ get_isdigit() [2/3]

bool UNICHARSET::get_isdigit ( const char *const  unichar_repr,
int  length 
) const
inline

Definition at line 805 of file unicharset.h.

806  {
807  return get_isdigit(unichar_to_id(unichar_repr, length));
808  }

◆ get_isdigit() [3/3]

bool UNICHARSET::get_isdigit ( UNICHAR_ID  unichar_id) const
inline

Definition at line 502 of file unicharset.h.

502  {
503  if (INVALID_UNICHAR_ID == unichar_id) return false;
504  ASSERT_HOST(contains_unichar_id(unichar_id));
505  return unichars[unichar_id].properties.isdigit;
506  }

◆ get_islower() [1/3]

bool UNICHARSET::get_islower ( const char *const  unichar_repr) const
inline

Definition at line 736 of file unicharset.h.

736  {
737  return get_islower(unichar_to_id(unichar_repr));
738  }

◆ get_islower() [2/3]

bool UNICHARSET::get_islower ( const char *const  unichar_repr,
int  length 
) const
inline

Definition at line 791 of file unicharset.h.

792  {
793  return get_islower(unichar_to_id(unichar_repr, length));
794  }

◆ get_islower() [3/3]

bool UNICHARSET::get_islower ( UNICHAR_ID  unichar_id) const
inline

Definition at line 488 of file unicharset.h.

488  {
489  if (INVALID_UNICHAR_ID == unichar_id) return false;
490  ASSERT_HOST(contains_unichar_id(unichar_id));
491  return unichars[unichar_id].properties.islower;
492  }

◆ get_isngram()

bool UNICHARSET::get_isngram ( UNICHAR_ID  unichar_id) const
inline

Definition at line 516 of file unicharset.h.

516  {
517  if (INVALID_UNICHAR_ID == unichar_id) return false;
518  ASSERT_HOST(contains_unichar_id(unichar_id));
519  return unichars[unichar_id].properties.isngram;
520  }

◆ get_isprivate()

bool UNICHARSET::get_isprivate ( UNICHAR_ID  unichar_id) const

Definition at line 387 of file unicharset.cpp.

387  {
388  UNICHAR uc(id_to_unichar(unichar_id), -1);
389  int uni = uc.first_uni();
390  return (uni >= 0xE000 && uni <= 0xF8FF);
391 }

◆ get_ispunctuation() [1/3]

bool UNICHARSET::get_ispunctuation ( const char *const  unichar_repr) const
inline

Definition at line 751 of file unicharset.h.

751  {
752  return get_ispunctuation(unichar_to_id(unichar_repr));
753  }

◆ get_ispunctuation() [2/3]

bool UNICHARSET::get_ispunctuation ( const char *const  unichar_repr,
int  length 
) const
inline

Definition at line 812 of file unicharset.h.

813  {
814  return get_ispunctuation(unichar_to_id(unichar_repr, length));
815  }

◆ get_ispunctuation() [3/3]

bool UNICHARSET::get_ispunctuation ( UNICHAR_ID  unichar_id) const
inline

Definition at line 509 of file unicharset.h.

509  {
510  if (INVALID_UNICHAR_ID == unichar_id) return false;
511  ASSERT_HOST(contains_unichar_id(unichar_id));
512  return unichars[unichar_id].properties.ispunctuation;
513  }

◆ get_isupper() [1/3]

bool UNICHARSET::get_isupper ( const char *const  unichar_repr) const
inline

Definition at line 741 of file unicharset.h.

741  {
742  return get_isupper(unichar_to_id(unichar_repr));
743  }

◆ get_isupper() [2/3]

bool UNICHARSET::get_isupper ( const char *const  unichar_repr,
int  length 
) const
inline

Definition at line 798 of file unicharset.h.

799  {
800  return get_isupper(unichar_to_id(unichar_repr, length));
801  }

◆ get_isupper() [3/3]

bool UNICHARSET::get_isupper ( UNICHAR_ID  unichar_id) const
inline

Definition at line 495 of file unicharset.h.

495  {
496  if (INVALID_UNICHAR_ID == unichar_id) return false;
497  ASSERT_HOST(contains_unichar_id(unichar_id));
498  return unichars[unichar_id].properties.isupper;
499  }

◆ get_mirror()

UNICHAR_ID UNICHARSET::get_mirror ( UNICHAR_ID  unichar_id) const
inline

Definition at line 687 of file unicharset.h.

687  {
688  if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
689  ASSERT_HOST(contains_unichar_id(unichar_id));
690  return unichars[unichar_id].properties.mirror;
691  }

◆ get_normed_unichar()

const char* UNICHARSET::get_normed_unichar ( UNICHAR_ID  unichar_id) const
inline

Definition at line 818 of file unicharset.h.

818  {
819  if (unichar_id == UNICHAR_SPACE) return " ";
820  return unichars[unichar_id].properties.normed.c_str();
821  }

◆ get_other_case()

UNICHAR_ID UNICHARSET::get_other_case ( UNICHAR_ID  unichar_id) const
inline

Definition at line 673 of file unicharset.h.

673  {
674  if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
675  ASSERT_HOST(contains_unichar_id(unichar_id));
676  return unichars[unichar_id].properties.other_case;
677  }

◆ get_properties() [1/2]

unsigned int UNICHARSET::get_properties ( const char *const  unichar_repr) const
inline

Definition at line 757 of file unicharset.h.

757  {
758  return get_properties(unichar_to_id(unichar_repr));
759  }

◆ get_properties() [2/2]

unsigned int UNICHARSET::get_properties ( UNICHAR_ID  unichar_id) const

Definition at line 601 of file unicharset.cpp.

601  {
602  unsigned int properties = 0;
603  if (this->get_isalpha(id))
604  properties |= ISALPHA_MASK;
605  if (this->get_islower(id))
606  properties |= ISLOWER_MASK;
607  if (this->get_isupper(id))
608  properties |= ISUPPER_MASK;
609  if (this->get_isdigit(id))
610  properties |= ISDIGIT_MASK;
611  if (this->get_ispunctuation(id))
612  properties |= ISPUNCTUATION_MASK;
613  return properties;
614 }

◆ get_script() [1/3]

int UNICHARSET::get_script ( const char *const  unichar_repr) const
inline

Definition at line 768 of file unicharset.h.

768  {
769  return get_script(unichar_to_id(unichar_repr));
770  }

◆ get_script() [2/3]

int UNICHARSET::get_script ( const char *const  unichar_repr,
int  length 
) const
inline

Definition at line 833 of file unicharset.h.

834  {
835  return get_script(unichar_to_id(unichar_repr, length));
836  }

◆ get_script() [3/3]

int UNICHARSET::get_script ( UNICHAR_ID  unichar_id) const
inline

Definition at line 653 of file unicharset.h.

653  {
654  if (INVALID_UNICHAR_ID == unichar_id) return null_sid_;
655  ASSERT_HOST(contains_unichar_id(unichar_id));
656  return unichars[unichar_id].properties.script_id;
657  }

◆ get_script_from_script_id()

const char* UNICHARSET::get_script_from_script_id ( int  id) const
inline

Definition at line 844 of file unicharset.h.

844  {
845  if (id >= script_table_size_used || id < 0)
846  return null_script;
847  return script_table[id];
848  }

◆ get_script_id_from_name()

int UNICHARSET::get_script_id_from_name ( const char *  script_name) const

Definition at line 1099 of file unicharset.cpp.

1099  {
1100  for (int i = 0; i < script_table_size_used; ++i) {
1101  if (strcmp(script_name, script_table[i]) == 0)
1102  return i;
1103  }
1104  return 0; // 0 is always the null_script
1105 }

◆ get_script_table_size()

int UNICHARSET::get_script_table_size ( ) const
inline

Definition at line 839 of file unicharset.h.

839  {
840  return script_table_size_used;
841  }

◆ get_top_bottom()

void UNICHARSET::get_top_bottom ( UNICHAR_ID  unichar_id,
int *  min_bottom,
int *  max_bottom,
int *  min_top,
int *  max_top 
) const
inline

Definition at line 558 of file unicharset.h.

560  {
561  if (INVALID_UNICHAR_ID == unichar_id) {
562  *min_bottom = *min_top = 0;
563  *max_bottom = *max_top = 256; // kBlnCellHeight
564  return;
565  }
566  ASSERT_HOST(contains_unichar_id(unichar_id));
567  *min_bottom = unichars[unichar_id].properties.min_bottom;
568  *max_bottom = unichars[unichar_id].properties.max_bottom;
569  *min_top = unichars[unichar_id].properties.min_top;
570  *max_top = unichars[unichar_id].properties.max_top;
571  }

◆ get_width_stats()

void UNICHARSET::get_width_stats ( UNICHAR_ID  unichar_id,
float *  width,
float *  width_sd 
) const
inline

Definition at line 586 of file unicharset.h.

587  {
588  if (INVALID_UNICHAR_ID == unichar_id) {
589  *width = 0.0f;
590  *width_sd = 0.0f;;
591  return;
592  }
593  ASSERT_HOST(contains_unichar_id(unichar_id));
594  *width = unichars[unichar_id].properties.width;
595  *width_sd = unichars[unichar_id].properties.width_sd;
596  }

◆ greek_sid()

int UNICHARSET::greek_sid ( ) const
inline

Definition at line 878 of file unicharset.h.

878 { return greek_sid_; }

◆ han_sid()

int UNICHARSET::han_sid ( ) const
inline

Definition at line 879 of file unicharset.h.

879 { return han_sid_; }

◆ hangul_sid()

int UNICHARSET::hangul_sid ( ) const
inline

Definition at line 883 of file unicharset.h.

883 { return hangul_sid_; }

◆ has_special_codes()

bool UNICHARSET::has_special_codes ( ) const
inline

Definition at line 712 of file unicharset.h.

712  {
713  return get_fragment(UNICHAR_BROKEN) != nullptr &&
716  }

◆ hiragana_sid()

int UNICHARSET::hiragana_sid ( ) const
inline

Definition at line 880 of file unicharset.h.

880 { return hiragana_sid_; }

◆ id_to_unichar()

const char * UNICHARSET::id_to_unichar ( UNICHAR_ID  id) const

Definition at line 290 of file unicharset.cpp.

290  {
291  if (id == INVALID_UNICHAR_ID) {
292  return INVALID_UNICHAR;
293  }
294  ASSERT_HOST(id < this->size());
295  return unichars[id].representation;
296 }

◆ id_to_unichar_ext()

const char * UNICHARSET::id_to_unichar_ext ( UNICHAR_ID  id) const

Definition at line 298 of file unicharset.cpp.

298  {
299  if (id == INVALID_UNICHAR_ID) {
300  return INVALID_UNICHAR;
301  }
302  ASSERT_HOST(id < this->size());
303  // Resolve from the kCustomLigatures table if this is a private encoding.
304  if (get_isprivate(id)) {
305  const char* ch = id_to_unichar(id);
306  for (int i = 0; kCustomLigatures[i][0] != nullptr; ++i) {
307  if (!strcmp(ch, kCustomLigatures[i][1])) {
308  return kCustomLigatures[i][0];
309  }
310  }
311  }
312  // Otherwise return the stored representation.
313  return unichars[id].representation;
314 }

◆ is_null_script()

bool UNICHARSET::is_null_script ( const char *  script) const
inline

Definition at line 858 of file unicharset.h.

858  {
859  return script == null_script;
860  }

◆ IsSpaceDelimited()

bool UNICHARSET::IsSpaceDelimited ( UNICHAR_ID  unichar_id) const
inline

Definition at line 642 of file unicharset.h.

642  {
643  if (INVALID_UNICHAR_ID == unichar_id) return true;
644  int script_id = get_script(unichar_id);
645  return script_id != han_sid_ && script_id != thai_sid_ &&
646  script_id != hangul_sid_ && script_id != hiragana_sid_ &&
647  script_id != katakana_sid_;
648  }

◆ katakana_sid()

int UNICHARSET::katakana_sid ( ) const
inline

Definition at line 881 of file unicharset.h.

881 { return katakana_sid_; }

◆ latin_sid()

int UNICHARSET::latin_sid ( ) const
inline

Definition at line 876 of file unicharset.h.

876 { return latin_sid_; }

◆ load_from_file() [1/5]

bool UNICHARSET::load_from_file ( const char *const  filename)
inline

Definition at line 386 of file unicharset.h.

386  {
387  return load_from_file(filename, false);
388  }

◆ load_from_file() [2/5]

bool UNICHARSET::load_from_file ( const char *const  filename,
bool  skip_fragments 
)
inline

Definition at line 378 of file unicharset.h.

378  {
379  FILE* file = fopen(filename, "rb");
380  if (file == nullptr) return false;
381  bool result = load_from_file(file, skip_fragments);
382  fclose(file);
383  return result;
384  }

◆ load_from_file() [3/5]

bool UNICHARSET::load_from_file ( FILE *  file)
inline

Definition at line 393 of file unicharset.h.

393 { return load_from_file(file, false); }

◆ load_from_file() [4/5]

bool UNICHARSET::load_from_file ( FILE *  file,
bool  skip_fragments 
)

Definition at line 742 of file unicharset.cpp.

742  {
743  LocalFilePointer lfp(file);
744  using namespace std::placeholders; // for _1, _2
745  std::function<char*(char*, int)> fgets_cb =
746  std::bind(&LocalFilePointer::fgets, &lfp, _1, _2);
747  bool success = load_via_fgets(fgets_cb, skip_fragments);
748  return success;
749 }

◆ load_from_file() [5/5]

bool UNICHARSET::load_from_file ( tesseract::TFile *  file,
bool  skip_fragments 
)

Definition at line 751 of file unicharset.cpp.

751  {
752  using namespace std::placeholders; // for _1, _2
753  std::function<char*(char*, int)> fgets_cb =
754  std::bind(&tesseract::TFile::FGets, file, _1, _2);
755  bool success = load_via_fgets(fgets_cb, skip_fragments);
756  return success;
757 }

◆ major_right_to_left()

bool UNICHARSET::major_right_to_left ( ) const

Definition at line 952 of file unicharset.cpp.

952  {
953  int ltr_count = 0;
954  int rtl_count = 0;
955  for (int id = 0; id < size_used; ++id) {
956  int dir = get_direction(id);
957  if (dir == UNICHARSET::U_LEFT_TO_RIGHT) ltr_count++;
958  if (dir == UNICHARSET::U_RIGHT_TO_LEFT ||
959  dir == UNICHARSET::U_RIGHT_TO_LEFT_ARABIC ||
960  dir == UNICHARSET::U_ARABIC_NUMBER) rtl_count++;
961  }
962  return rtl_count > ltr_count;
963 }

◆ normed_ids()

const GenericVector<UNICHAR_ID>& UNICHARSET::normed_ids ( UNICHAR_ID  unichar_id) const
inline

Definition at line 825 of file unicharset.h.

825  {
826  return unichars[unichar_id].properties.normed_ids;
827  }

◆ null_sid()

int UNICHARSET::null_sid ( ) const
inline

Definition at line 874 of file unicharset.h.

874 { return null_sid_; }

◆ PartialSetPropertiesFromOther()

void UNICHARSET::PartialSetPropertiesFromOther ( int  start_index,
const UNICHARSET &  src 
)

Definition at line 404 of file unicharset.cpp.

405  {
406  for (int ch = start_index; ch < size_used; ++ch) {
407  const char* utf8 = id_to_unichar(ch);
408  UNICHAR_PROPERTIES properties;
409  if (src.GetStrProperties(utf8, &properties)) {
410  // Setup the script_id, other_case, and mirror properly.
411  const char* script = src.get_script_from_script_id(properties.script_id);
412  properties.script_id = add_script(script);
413  const char* other_case = src.id_to_unichar(properties.other_case);
414  if (contains_unichar(other_case)) {
415  properties.other_case = unichar_to_id(other_case);
416  } else {
417  properties.other_case = ch;
418  }
419  const char* mirror_str = src.id_to_unichar(properties.mirror);
420  if (contains_unichar(mirror_str)) {
421  properties.mirror = unichar_to_id(mirror_str);
422  } else {
423  properties.mirror = ch;
424  }
425  unichars[ch].properties.CopyFrom(properties);
426  set_normed_ids(ch);
427  }
428  }
429 }

◆ post_load_setup()

void UNICHARSET::post_load_setup ( )

Definition at line 886 of file unicharset.cpp.

886  {
887  // Number of alpha chars with the case property minus those without,
888  // in order to determine that half the alpha chars have case.
889  int net_case_alphas = 0;
890  int x_height_alphas = 0;
891  int cap_height_alphas = 0;
892  top_bottom_set_ = false;
893  for (UNICHAR_ID id = 0; id < size_used; ++id) {
894  int min_bottom = 0;
895  int max_bottom = UINT8_MAX;
896  int min_top = 0;
897  int max_top = UINT8_MAX;
898  get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
899  if (min_top > 0)
900  top_bottom_set_ = true;
901  if (get_isalpha(id)) {
902  if (get_islower(id) || get_isupper(id))
903  ++net_case_alphas;
904  else
905  --net_case_alphas;
906  if (min_top < kMeanlineThreshold && max_top < kMeanlineThreshold)
907  ++x_height_alphas;
908  else if (min_top > kMeanlineThreshold && max_top > kMeanlineThreshold)
909  ++cap_height_alphas;
910  }
911  set_normed_ids(id);
912  }
913 
914  script_has_upper_lower_ = net_case_alphas > 0;
915  script_has_xheight_ = script_has_upper_lower_ ||
916  (x_height_alphas > cap_height_alphas * kMinXHeightFraction &&
917  cap_height_alphas > x_height_alphas * kMinCapHeightFraction);
918 
919  null_sid_ = get_script_id_from_name(null_script);
920  ASSERT_HOST(null_sid_ == 0);
921  common_sid_ = get_script_id_from_name("Common");
922  latin_sid_ = get_script_id_from_name("Latin");
923  cyrillic_sid_ = get_script_id_from_name("Cyrillic");
924  greek_sid_ = get_script_id_from_name("Greek");
925  han_sid_ = get_script_id_from_name("Han");
926  hiragana_sid_ = get_script_id_from_name("Hiragana");
927  katakana_sid_ = get_script_id_from_name("Katakana");
928  thai_sid_ = get_script_id_from_name("Thai");
929  hangul_sid_ = get_script_id_from_name("Hangul");
930 
931  // Compute default script. Use the highest-counting alpha script, that is
932  // not the common script, as that still contains some "alphas".
933  int* script_counts = new int[script_table_size_used];
934  memset(script_counts, 0, sizeof(*script_counts) * script_table_size_used);
935  for (int id = 0; id < size_used; ++id) {
936  if (get_isalpha(id)) {
937  ++script_counts[get_script(id)];
938  }
939  }
940  default_sid_ = 0;
941  for (int s = 1; s < script_table_size_used; ++s) {
942  if (script_counts[s] > script_counts[default_sid_] && s != common_sid_)
943  default_sid_ = s;
944  }
945  delete [] script_counts;
946 }

◆ PropertiesIncomplete()

bool UNICHARSET::PropertiesIncomplete ( UNICHAR_ID  unichar_id) const
inline

Definition at line 636 of file unicharset.h.

636  {
637  return unichars[unichar_id].properties.AnyRangeEmpty();
638  }

◆ reserve()

void UNICHARSET::reserve ( int  unichars_number)

Definition at line 194 of file unicharset.cpp.

194  {
195  if (unichars_number > size_reserved) {
196  auto* unichars_new = new UNICHAR_SLOT[unichars_number];
197  for (int i = 0; i < size_used; ++i)
198  unichars_new[i] = unichars[i];
199  for (int j = size_used; j < unichars_number; ++j) {
200  unichars_new[j].properties.script_id = add_script(null_script);
201  }
202  delete[] unichars;
203  unichars = unichars_new;
204  size_reserved = unichars_number;
205  }
206 }

◆ save_to_file() [1/3]

bool UNICHARSET::save_to_file ( const char *const  filename) const
inline

Definition at line 350 of file unicharset.h.

350  {
351  FILE* file = fopen(filename, "w+b");
352  if (file == nullptr) return false;
353  bool result = save_to_file(file);
354  fclose(file);
355  return result;
356  }

◆ save_to_file() [2/3]

bool UNICHARSET::save_to_file ( FILE *  file) const
inline

Definition at line 360 of file unicharset.h.

360  {
361  STRING str;
362  return save_to_string(&str) &&
363  tesseract::Serialize(file, &str[0], str.length());
364  }

◆ save_to_file() [3/3]

bool UNICHARSET::save_to_file ( tesseract::TFile *  file) const
inline

Definition at line 366 of file unicharset.h.

366  {
367  STRING str;
368  return save_to_string(&str) && file->Serialize(&str[0], str.length());
369  }

◆ save_to_string()

bool UNICHARSET::save_to_string ( STRING *  str) const

Definition at line 691 of file unicharset.cpp.

691  {
692  const int kFileBufSize = 1024;
693  char buffer[kFileBufSize + 1];
694  snprintf(buffer, kFileBufSize, "%d\n", this->size());
695  *str = buffer;
696  for (UNICHAR_ID id = 0; id < this->size(); ++id) {
697  int min_bottom, max_bottom, min_top, max_top;
698  get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
699  float width, width_sd;
700  get_width_stats(id, &width, &width_sd);
701  float bearing, bearing_sd;
702  get_bearing_stats(id, &bearing, &bearing_sd);
703  float advance, advance_sd;
704  get_advance_stats(id, &advance, &advance_sd);
705  unsigned int properties = this->get_properties(id);
706  if (strcmp(this->id_to_unichar(id), " ") == 0) {
707  snprintf(buffer, kFileBufSize, "%s %x %s %d\n", "NULL", properties,
708  this->get_script_from_script_id(this->get_script(id)),
709  this->get_other_case(id));
710  *str += buffer;
711  } else {
712  std::ostringstream stream;
713  stream.imbue(std::locale::classic());
714  stream << this->id_to_unichar(id) << ' ' << properties << ' ' <<
715  min_bottom << ',' << max_bottom << ',' <<
716  min_top << ',' << max_top << ',' <<
717  width << ',' << width_sd << ',' <<
718  bearing << ',' << bearing_sd << ',' <<
719  advance << ',' << advance_sd << ' ' <<
720  this->get_script_from_script_id(this->get_script(id)) << ' ' <<
721  this->get_other_case(id) << ' ' <<
722  this->get_direction(id) << ' ' <<
723  this->get_mirror(id) << ' ' <<
724  this->get_normed_unichar(id) << "\t# " <<
725  this->debug_str(id).c_str() << '\n';
726  *str += stream.str().c_str();
727  }
728  }
729  return true;
730 }

◆ script_has_upper_lower()

bool UNICHARSET::script_has_upper_lower ( ) const
inline

Definition at line 887 of file unicharset.h.

887  {
888  return script_has_upper_lower_;
889  }

◆ script_has_xheight()

bool UNICHARSET::script_has_xheight ( ) const
inline

Definition at line 894 of file unicharset.h.

894  {
895  return script_has_xheight_;
896  }

◆ set_advance_stats()

void UNICHARSET::set_advance_stats ( UNICHAR_ID  unichar_id,
float  advance,
float  advance_sd 
)
inline

Definition at line 630 of file unicharset.h.

631  {
632  unichars[unichar_id].properties.advance = advance;
633  unichars[unichar_id].properties.advance_sd = advance_sd;
634  }

◆ set_bearing_stats()

void UNICHARSET::set_bearing_stats ( UNICHAR_ID  unichar_id,
float  bearing,
float  bearing_sd 
)
inline

Definition at line 613 of file unicharset.h.

614  {
615  unichars[unichar_id].properties.bearing = bearing;
616  unichars[unichar_id].properties.bearing_sd = bearing_sd;
617  }

◆ set_black_and_whitelist()

void UNICHARSET::set_black_and_whitelist ( const char *  blacklist,
const char *  whitelist,
const char *  unblacklist 
)

Definition at line 969 of file unicharset.cpp.

971  {
972  bool def_enabled = whitelist == nullptr || whitelist[0] == '\0';
973  // Set everything to default
974  for (int ch = 0; ch < size_used; ++ch)
975  unichars[ch].properties.enabled = def_enabled;
976  if (!def_enabled) {
977  // Enable the whitelist.
978  GenericVector<UNICHAR_ID> encoding;
979  encode_string(whitelist, false, &encoding, nullptr, nullptr);
980  for (int i = 0; i < encoding.size(); ++i) {
981  if (encoding[i] != INVALID_UNICHAR_ID)
982  unichars[encoding[i]].properties.enabled = true;
983  }
984  }
985  if (blacklist != nullptr && blacklist[0] != '\0') {
986  // Disable the blacklist.
987  GenericVector<UNICHAR_ID> encoding;
988  encode_string(blacklist, false, &encoding, nullptr, nullptr);
989  for (int i = 0; i < encoding.size(); ++i) {
990  if (encoding[i] != INVALID_UNICHAR_ID)
991  unichars[encoding[i]].properties.enabled = false;
992  }
993  }
994  if (unblacklist != nullptr && unblacklist[0] != '\0') {
995  // Re-enable the unblacklist.
996  GenericVector<UNICHAR_ID> encoding;
997  encode_string(unblacklist, false, &encoding, nullptr, nullptr);
998  for (int i = 0; i < encoding.size(); ++i) {
999  if (encoding[i] != INVALID_UNICHAR_ID)
1000  unichars[encoding[i]].properties.enabled = true;
1001  }
1002  }
1003 }

◆ set_direction()

void UNICHARSET::set_direction ( UNICHAR_ID  unichar_id,
UNICHARSET::Direction  value 
)
inline

Definition at line 462 of file unicharset.h.

462  {
463  unichars[unichar_id].properties.direction = value;
464  }

◆ set_isalpha()

void UNICHARSET::set_isalpha ( UNICHAR_ID  unichar_id,
bool  value 
)
inline

Definition at line 421 of file unicharset.h.

421  {
422  unichars[unichar_id].properties.isalpha = value;
423  }

◆ set_isdigit()

void UNICHARSET::set_isdigit ( UNICHAR_ID  unichar_id,
bool  value 
)
inline

Definition at line 436 of file unicharset.h.

436  {
437  unichars[unichar_id].properties.isdigit = value;
438  }

◆ set_islower()

void UNICHARSET::set_islower ( UNICHAR_ID  unichar_id,
bool  value 
)
inline

Definition at line 426 of file unicharset.h.

426  {
427  unichars[unichar_id].properties.islower = value;
428  }

◆ set_isngram()

void UNICHARSET::set_isngram ( UNICHAR_ID  unichar_id,
bool  value 
)
inline

Definition at line 446 of file unicharset.h.

446  {
447  unichars[unichar_id].properties.isngram = value;
448  }

◆ set_ispunctuation()

void UNICHARSET::set_ispunctuation ( UNICHAR_ID  unichar_id,
bool  value 
)
inline

Definition at line 441 of file unicharset.h.

441  {
442  unichars[unichar_id].properties.ispunctuation = value;
443  }

◆ set_isupper()

void UNICHARSET::set_isupper ( UNICHAR_ID  unichar_id,
bool  value 
)
inline

Definition at line 431 of file unicharset.h.

431  {
432  unichars[unichar_id].properties.isupper = value;
433  }

◆ set_mirror()

void UNICHARSET::set_mirror ( UNICHAR_ID  unichar_id,
UNICHAR_ID  mirror 
)
inline

Definition at line 467 of file unicharset.h.

467  {
468  unichars[unichar_id].properties.mirror = mirror;
469  }

◆ set_normed()

void UNICHARSET::set_normed ( UNICHAR_ID  unichar_id,
const char *  normed 
)
inline

Definition at line 472 of file unicharset.h.

472  {
473  unichars[unichar_id].properties.normed = normed;
474  unichars[unichar_id].properties.normed_ids.truncate(0);
475  }

◆ set_normed_ids()

void UNICHARSET::set_normed_ids ( UNICHAR_ID  unichar_id)

Definition at line 372 of file unicharset.cpp.

372  {
373  unichars[unichar_id].properties.normed_ids.truncate(0);
374  if (unichar_id == UNICHAR_SPACE && id_to_unichar(unichar_id)[0] == ' ') {
375  unichars[unichar_id].properties.normed_ids.push_back(UNICHAR_SPACE);
376  } else if (!encode_string(unichars[unichar_id].properties.normed.c_str(),
377  true, &unichars[unichar_id].properties.normed_ids,
378  nullptr, nullptr)) {
379  unichars[unichar_id].properties.normed_ids.truncate(0);
380  unichars[unichar_id].properties.normed_ids.push_back(unichar_id);
381  }
382 }

◆ set_other_case()

void UNICHARSET::set_other_case ( UNICHAR_ID  unichar_id,
UNICHAR_ID  other_case 
)
inline

Definition at line 457 of file unicharset.h.

457  {
458  unichars[unichar_id].properties.other_case = other_case;
459  }

◆ set_ranges_empty()

void UNICHARSET::set_ranges_empty ( )

Definition at line 395 of file unicharset.cpp.

395  {
396  for (int id = 0; id < size_used; ++id) {
397  unichars[id].properties.SetRangesEmpty();
398  }
399 }

◆ set_script()

void UNICHARSET::set_script ( UNICHAR_ID  unichar_id,
const char *  value 
)
inline

Definition at line 452 of file unicharset.h.

452  {
453  unichars[unichar_id].properties.script_id = add_script(value);
454  }

◆ set_top_bottom()

void UNICHARSET::set_top_bottom ( UNICHAR_ID  unichar_id,
int  min_bottom,
int  max_bottom,
int  min_top,
int  max_top 
)
inline

Definition at line 572 of file unicharset.h.

574  {
575  unichars[unichar_id].properties.min_bottom =
576  ClipToRange<int>(min_bottom, 0, UINT8_MAX);
577  unichars[unichar_id].properties.max_bottom =
578  ClipToRange<int>(max_bottom, 0, UINT8_MAX);
579  unichars[unichar_id].properties.min_top =
580  ClipToRange<int>(min_top, 0, UINT8_MAX);
581  unichars[unichar_id].properties.max_top =
582  ClipToRange<int>(max_top, 0, UINT8_MAX);
583  }

◆ set_width_stats()

void UNICHARSET::set_width_stats ( UNICHAR_ID  unichar_id,
float  width,
float  width_sd 
)
inline

Definition at line 597 of file unicharset.h.

597  {
598  unichars[unichar_id].properties.width = width;
599  unichars[unichar_id].properties.width_sd = width_sd;
600  }

◆ SetPropertiesFromOther()

void UNICHARSET::SetPropertiesFromOther ( const UNICHARSET &  src)
inline

Definition at line 535 of file unicharset.h.

535  {
536  PartialSetPropertiesFromOther(0, src);
537  }

◆ size()

int UNICHARSET::size ( ) const
inline

Definition at line 341 of file unicharset.h.

341  {
342  return size_used;
343  }

◆ SizesDistinct()

bool UNICHARSET::SizesDistinct ( UNICHAR_ID  id1,
UNICHAR_ID  id2 
) const

Definition at line 485 of file unicharset.cpp.

485  {
486  int overlap = std::min(unichars[id1].properties.max_top,
487  unichars[id2].properties.max_top) -
488  std::max(unichars[id1].properties.min_top,
489  unichars[id2].properties.min_top);
490  return overlap <= 0;
491 }

◆ step()

int UNICHARSET::step ( const char *  str) const

Definition at line 232 of file unicharset.cpp.

232  {
233  GenericVector<UNICHAR_ID> encoding;
234  GenericVector<char> lengths;
235  encode_string(str, true, &encoding, &lengths, nullptr);
236  if (encoding.empty() || encoding[0] == INVALID_UNICHAR_ID) return 0;
237  return lengths[0];
238 }

◆ thai_sid()

int UNICHARSET::thai_sid ( ) const
inline

Definition at line 882 of file unicharset.h.

882 { return thai_sid_; }

◆ to_lower()

UNICHAR_ID UNICHARSET::to_lower ( UNICHAR_ID  unichar_id) const
inline

Definition at line 694 of file unicharset.h.

694  {
695  if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
696  ASSERT_HOST(contains_unichar_id(unichar_id));
697  if (unichars[unichar_id].properties.islower) return unichar_id;
698  return unichars[unichar_id].properties.other_case;
699  }

◆ to_upper()

UNICHAR_ID UNICHARSET::to_upper ( UNICHAR_ID  unichar_id) const
inline

Definition at line 702 of file unicharset.h.

702  {
703  if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
704  ASSERT_HOST(contains_unichar_id(unichar_id));
705  if (unichars[unichar_id].properties.isupper) return unichar_id;
706  return unichars[unichar_id].properties.other_case;
707  }

◆ top_bottom_useful()

bool UNICHARSET::top_bottom_useful ( ) const
inline

Definition at line 527 of file unicharset.h.

527  {
528  return top_bottom_set_;
529  }

◆ unichar_insert() [1/2]

void UNICHARSET::unichar_insert ( const char *const  unichar_repr)
inline

Definition at line 264 of file unicharset.h.

264  {
265  unichar_insert(unichar_repr, OldUncleanUnichars::kFalse);
266  }

◆ unichar_insert() [2/2]

void UNICHARSET::unichar_insert ( const char *const  unichar_repr,
OldUncleanUnichars  old_style 
)

Definition at line 625 of file unicharset.cpp.

626  {
627  if (old_style == OldUncleanUnichars::kTrue) old_style_included_ = true;
628  std::string cleaned =
629  old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
630  if (!cleaned.empty() && !ids.contains(cleaned.data(), cleaned.size())) {
631  const char* str = cleaned.c_str();
632  GenericVector<int> encoding;
633  if (!old_style_included_ &&
634  encode_string(str, true, &encoding, nullptr, nullptr))
635  return;
636  if (size_used == size_reserved) {
637  if (size_used == 0)
638  reserve(8);
639  else
640  reserve(2 * size_used);
641  }
642  int index = 0;
643  do {
644  if (index >= UNICHAR_LEN) {
645  fprintf(stderr, "Utf8 buffer too big, size>%d for %s\n", UNICHAR_LEN,
646  unichar_repr);
647  return;
648  }
649  unichars[size_used].representation[index++] = *str++;
650  } while (*str != '\0');
651  unichars[size_used].representation[index] = '\0';
652  this->set_script(size_used, null_script);
653  // If the given unichar_repr represents a fragmented character, set
654  // fragment property to a pointer to CHAR_FRAGMENT class instance with
655  // information parsed from the unichar representation. Use the script
656  // of the base unichar for the fragmented character if possible.
657  CHAR_FRAGMENT* frag =
658  CHAR_FRAGMENT::parse_from_string(unichars[size_used].representation);
659  this->unichars[size_used].properties.fragment = frag;
660  if (frag != nullptr && this->contains_unichar(frag->get_unichar())) {
661  this->unichars[size_used].properties.script_id =
662  this->get_script(frag->get_unichar());
663  }
664  this->unichars[size_used].properties.enabled = true;
665  ids.insert(unichars[size_used].representation, size_used);
666  ++size_used;
667  }
668 }

◆ unichar_insert_backwards_compatible()

void UNICHARSET::unichar_insert_backwards_compatible ( const char *const  unichar_repr)
inline

Definition at line 269 of file unicharset.h.

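269  {
270  std::string cleaned = CleanupString(unichar_repr);
271  if (cleaned != unichar_repr) {
272  unichar_insert(unichar_repr, OldUncleanUnichars::kTrue);
273  } else {
274  int old_size = size();
275  unichar_insert(unichar_repr, OldUncleanUnichars::kFalse);
276  if (size() == old_size) {
277  unichar_insert(unichar_repr, OldUncleanUnichars::kTrue);
278  }
279  }
280  }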

◆ unichar_to_id() [1/2]

UNICHAR_ID UNICHARSET::unichar_to_id ( const char *const  unichar_repr) const

Definition at line 209 of file unicharset.cpp.

209  {
210  std::string cleaned =
211  old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
212  return ids.contains(cleaned.data(), cleaned.size())
213  ? ids.unichar_to_id(cleaned.data(), cleaned.size())
214  : INVALID_UNICHAR_ID;
215 }

◆ unichar_to_id() [2/2]

UNICHAR_ID UNICHARSET::unichar_to_id ( const char *const  unichar_repr,
int  length 
) const

Definition at line 217 of file unicharset.cpp.

218  {
219  assert(length > 0 && length <= UNICHAR_LEN);
220  std::string cleaned(unichar_repr, length);
221  if (!old_style_included_) cleaned = CleanupString(unichar_repr, length);
222  return ids.contains(cleaned.data(), cleaned.size())
223  ? ids.unichar_to_id(cleaned.data(), cleaned.size())
224  : INVALID_UNICHAR_ID;
225 }

Member Data Documentation

◆ kCustomLigatures

const char * UNICHARSET::kCustomLigatures[][2]
static
Initial value:
= {
{"ct", "\uE003"},
{"ſh", "\uE006"},
{"ſi", "\uE007"},
{"ſl", "\uE008"},
{"ſſ", "\uE009"},
{nullptr, nullptr}
}

Definition at line 150 of file unicharset.h.

◆ kSpecialUnicharCodes

const char * UNICHARSET::kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT]
static
Initial value:
= {
" ",
"Joined",
"|Broken|0|1"
}

Definition at line 153 of file unicharset.h.


The documentation for this class was generated from the following files:
UNICHARSET::load_from_file
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:378
string
std::string string
Definition: equationdetect_test.cc:21
UNICHARSET::get_direction
Direction get_direction(UNICHAR_ID unichar_id) const
Definition: unicharset.h:680
UNICHARSET::U_EUROPEAN_NUMBER
Definition: unicharset.h:159
UNICHARSET::get_islower
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:488
UNICHARSET::contains_unichar_id
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:284
LocalFilePointer::fgets
char * fgets(char *dst, int size)
Definition: unicharset.cpp:735
UNICHARSET::encode_string
bool encode_string(const char *str, bool give_up_on_failure, GenericVector< UNICHAR_ID > *encoding, GenericVector< char > *lengths, int *encoded_length) const
Definition: unicharset.cpp:258
UNICHARSET::get_isdigit
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:502
UNICHARSET::get_isalpha
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:481
UNICHARSET::get_script_id_from_name
int get_script_id_from_name(const char *script_name) const
Definition: unicharset.cpp:1099
CHAR_FRAGMENT::to_string
static STRING to_string(const char *unichar, int pos, int total, bool natural)
Definition: unicharset.cpp:1044
ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:87
UNICHARMAP::contains
bool contains(const char *const unichar_repr, int length) const
Definition: unicharmap.cpp:79
OldUncleanUnichars::kTrue
CHAR_FRAGMENT::get_unichar
const char * get_unichar() const
Definition: unicharset.h:70
UNICHARSET::U_FIRST_STRONG_ISOLATE
Definition: unicharset.h:176
UNICHARMAP::clear
void clear()
Definition: unicharmap.cpp:115
UNICHARSET::get_advance_stats
void get_advance_stats(UNICHAR_ID unichar_id, float *advance, float *advance_sd) const
Definition: unicharset.h:620
STRING
Definition: strngs.h:45
UNICHARSET::get_script_from_script_id
const char * get_script_from_script_id(int id) const
Definition: unicharset.h:844
UNICHARSET::U_WHITE_SPACE_NEUTRAL
Definition: unicharset.h:166
UNICHARSET::U_OTHER_NEUTRAL
Definition: unicharset.h:167
LocalFilePointer
Definition: unicharset.cpp:732
UNICHARSET::U_LEFT_TO_RIGHT
Definition: unicharset.h:157
UNICHARSET::set_normed_ids
void set_normed_ids(UNICHAR_ID unichar_id)
Definition: unicharset.cpp:372
UNICHARSET::U_RIGHT_TO_LEFT_EMBEDDING
Definition: unicharset.h:171
UNICHARSET::get_width_stats
void get_width_stats(UNICHAR_ID unichar_id, float *width, float *width_sd) const
Definition: unicharset.h:586
UNICHARSET::step
int step(const char *str) const
Definition: unicharset.cpp:232
UNICHARSET::get_normed_unichar
const char * get_normed_unichar(UNICHAR_ID unichar_id) const
Definition: unicharset.h:818
tesseract::UNICHAR
Definition: unichar.h:59
UNICHARSET::clear
void clear()
Definition: unicharset.h:306
UNICHARSET::save_to_file
bool save_to_file(const char *const filename) const
Definition: unicharset.h:350
UNICHARSET::get_script
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:653
UNICHARSET::kCustomLigatures
static const TESS_API char * kCustomLigatures[][2]
Definition: unicharset.h:150
UNICHARSET::get_ispunctuation
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:509
UNICHARSET::U_EUROPEAN_NUMBER_SEPARATOR
Definition: unicharset.h:160
CHAR_FRAGMENT::parse_from_string
static CHAR_FRAGMENT * parse_from_string(const char *str)
Definition: unicharset.cpp:1057
UNICHARMAP::unichar_to_id
UNICHAR_ID unichar_to_id(const char *const unichar_repr, int length) const
Definition: unicharmap.cpp:34
UNICHARSET::add_script
int add_script(const char *script)
Definition: unicharset.cpp:1020
UNICHAR_BROKEN
Definition: unicharset.h:36
GenericVector::push_back
int push_back(T object)
Definition: genericvector.h:799
UNICHARSET::get_properties
unsigned int get_properties(UNICHAR_ID unichar_id) const
Definition: unicharset.cpp:601
UNICHARSET::U_DIR_NON_SPACING_MARK
Definition: unicharset.h:174
UNICHARSET::debug_str
STRING debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:342
OldUncleanUnichars::kFalse
STRING::c_str
const char * c_str() const
Definition: strngs.cpp:192
UNICHARSET::get_top_bottom
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:558
file
Definition: include_gunit.h:22
UNICHARSET::U_COMMON_NUMBER_SEPARATOR
Definition: unicharset.h:163
UNICHARSET::reserve
void reserve(int unichars_number)
Definition: unicharset.cpp:194
tesstrain_utils.int
int
Definition: tesstrain_utils.py:154
UNICHARSET::U_POP_DIRECTIONAL_FORMAT
Definition: unicharset.h:173
UNICHARSET::get_isprivate
bool get_isprivate(UNICHAR_ID unichar_id) const
Definition: unicharset.cpp:387
UNICHARSET::U_LEFT_TO_RIGHT_OVERRIDE
Definition: unicharset.h:169
UNICHARSET::U_BOUNDARY_NEUTRAL
Definition: unicharset.h:175
UNICHARSET::unichar_to_id
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209
UNICHAR_SPACE
Definition: unicharset.h:34
UNICHARSET::U_RIGHT_TO_LEFT_ISOLATE
Definition: unicharset.h:178
UNICHARSET::U_RIGHT_TO_LEFT_OVERRIDE
Definition: unicharset.h:172
UNICHARSET::save_to_string
bool save_to_string(STRING *str) const
Definition: unicharset.cpp:691
GenericVector::empty
bool empty() const
Definition: genericvector.h:86
UNICHARSET::U_ARABIC_NUMBER
Definition: unicharset.h:162
UNICHARSET::get_chartype
char get_chartype(UNICHAR_ID unichar_id) const
Definition: unicharset.cpp:616
UNICHARSET::get_mirror
UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const
Definition: unicharset.h:687
UNICHARSET::set_script
void set_script(UNICHAR_ID unichar_id, const char *value)
Definition: unicharset.h:452
UNICHARSET::CleanupString
static std::string CleanupString(const char *utf8_str)
Definition: unicharset.h:246
SPECIAL_UNICHAR_CODES_COUNT
Definition: unicharset.h:38
UNICHARSET::unichar_insert_backwards_compatible
void unichar_insert_backwards_compatible(const char *const unichar_repr)
Definition: unicharset.h:269
UNICHARSET::has_special_codes
bool has_special_codes() const
Definition: unicharset.h:712
UNICHARSET::kSpecialUnicharCodes
static const TESS_API char * kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT]
Definition: unicharset.h:153
UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:36
GenericVector< UNICHAR_ID >
UNICHARSET::U_EUROPEAN_NUMBER_TERMINATOR
Definition: unicharset.h:161
UNICHARSET::get_isupper
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:495
CHAR_FRAGMENT
Definition: unicharset.h:48
UNICHARSET::U_POP_DIRECTIONAL_ISOLATE
Definition: unicharset.h:179
UNICHARSET::U_BLOCK_SEPARATOR
Definition: unicharset.h:164
UNICHAR_LEN
#define UNICHAR_LEN
Definition: unichar.h:32
STRING::length
int32_t length() const
Definition: strngs.cpp:187
UNICHARSET::PartialSetPropertiesFromOther
void PartialSetPropertiesFromOther(int start_index, const UNICHARSET &src)
Definition: unicharset.cpp:404
kMinXHeightFraction
const double kMinXHeightFraction
Definition: unicharset.cpp:58
UNICHARSET::set_isngram
void set_isngram(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:446
GenericVector::truncate
void truncate(int size)
Definition: genericvector.h:132
kMinCapHeightFraction
const double kMinCapHeightFraction
Definition: unicharset.cpp:59
UNICHARSET::contains_unichar
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:670
tesseract::TFile::FGets
char * FGets(char *buffer, int buffer_size)
Definition: serialis.cpp:262
UNICHARSET::U_RIGHT_TO_LEFT_ARABIC
Definition: unicharset.h:170
UNICHARSET::get_bearing_stats
void get_bearing_stats(UNICHAR_ID unichar_id, float *bearing, float *bearing_sd) const
Definition: unicharset.h:603
UNICHARSET::U_CHAR_DIRECTION_COUNT
Definition: unicharset.h:181
UNICHARSET::get_fragment
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:724
UNICHARSET::id_to_unichar
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:290
UNICHARSET::get_other_case
UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const
Definition: unicharset.h:673
UNICHARMAP::insert
void insert(const char *const unichar_repr, UNICHAR_ID id)
Definition: unicharmap.cpp:56
UNICHARSET::U_SEGMENT_SEPARATOR
Definition: unicharset.h:165
UNICHARSET::U_LEFT_TO_RIGHT_ISOLATE
Definition: unicharset.h:177
UNICHARSET::debug_utf8_str
static STRING debug_utf8_str(const char *str)
Definition: unicharset.cpp:318
GenericVector::size
int size() const
Definition: genericvector.h:71
tesseract::Serialize
bool Serialize(FILE *fp, const char *data, size_t n=1)
Definition: serialis.cpp:73
UNICHARSET::unichar_insert
void unichar_insert(const char *const unichar_repr, OldUncleanUnichars old_style)
Definition: unicharset.cpp:625
UNICHARSET::size
int size() const
Definition: unicharset.h:341
UNICHARSET::U_RIGHT_TO_LEFT
Definition: unicharset.h:158
UNICHAR_JOINED
Definition: unicharset.h:35
UNICHARSET::U_LEFT_TO_RIGHT_EMBEDDING
Definition: unicharset.h:168
UNICHARSET::delete_pointers_in_unichars
void delete_pointers_in_unichars()
Definition: unicharset.h:298