tesseract  5.0.0-alpha-619-ge9db
tesseract::LMConsistencyInfo Struct Reference

#include <lm_consistency.h>

Public Types

enum  ChartypeEnum { CT_NONE, CT_ALPHA, CT_DIGIT, CT_OTHER }
 

Public Member Functions

 LMConsistencyInfo (const LMConsistencyInfo *parent_info)
 
int NumInconsistentPunc () const
 
int NumInconsistentCase () const
 
int NumInconsistentChartype () const
 
bool Consistent () const
 
int NumInconsistentSpaces () const
 
int InconsistentXHeight () const
 
void ComputeXheightConsistency (const BLOB_CHOICE *b, bool is_punc)
 
float BodyMinXHeight () const
 
float BodyMaxXHeight () const
 

Public Attributes

EDGE_REF punc_ref
 
int num_alphas
 
int num_digits
 
int num_punc
 
int num_other
 
ChartypeEnum chartype
 
XHeightConsistencyEnum xht_decision
 
int num_non_first_upper
 
int num_lower
 
int script_id
 
int num_inconsistent_spaces
 
float xht_lo [kNumPos]
 
float xht_hi [kNumPos]
 
int16_t xht_count [kNumPos]
 
int16_t xht_count_punc [kNumPos]
 
int16_t xht_sp
 
int16_t xpos_entropy
 
bool invalid_punc
 
bool inconsistent_script
 
bool inconsistent_font
 

Static Public Attributes

static const int kShiftThresh = 1
 
static const int kMaxEntropy = 1
 
static const int kSUB = 0
 
static const int kNORM = 1
 
static const int kSUP = 2
 
static const int kNumPos = 3
 

Detailed Description

Definition at line 38 of file lm_consistency.h.

Member Enumeration Documentation

◆ ChartypeEnum

Enumerator
CT_NONE 
CT_ALPHA 
CT_DIGIT 
CT_OTHER 

Definition at line 39 of file lm_consistency.h.

Constructor & Destructor Documentation

◆ LMConsistencyInfo()

tesseract::LMConsistencyInfo::LMConsistencyInfo ( const LMConsistencyInfo * parent_info)
inline explicit

Definition at line 53 of file lm_consistency.h.

53  {
54  if (parent_info == nullptr) {
55  // Initialize from scratch.
56  num_alphas = 0;
57  num_digits = 0;
58  num_punc = 0;
59  num_other = 0;
60  chartype = CT_NONE;
61  punc_ref = NO_EDGE;
62  invalid_punc = false;
64  num_lower = 0;
65  script_id = 0;
66  inconsistent_script = false;
68  inconsistent_font = false;
69  // Initialize XHeight stats.
70  for (int i = 0; i < kNumPos; i++) {
71  xht_count[i] = 0;
72  xht_count_punc[i] = 0;
73  xht_lo[i] = 0;
74  xht_hi[i] = 256; // kBlnCellHeight
75  }
76  xht_sp = -1; // This invalid value indicates that there was no parent.
77  xpos_entropy = 0;
79  } else {
80  // Copy parent info
81  *this = *parent_info;
82  }
83  }

Member Function Documentation

◆ BodyMaxXHeight()

float tesseract::LMConsistencyInfo::BodyMaxXHeight ( ) const
inline

Definition at line 111 of file lm_consistency.h.

111  {
112  if (InconsistentXHeight())
113  return static_cast<float>(INT16_MAX);
114  return xht_hi[kNORM];
115  }

◆ BodyMinXHeight()

float tesseract::LMConsistencyInfo::BodyMinXHeight ( ) const
inline

Definition at line 106 of file lm_consistency.h.

106  {
107  if (InconsistentXHeight())
108  return 0.0f;
109  return xht_lo[kNORM];
110  }

◆ ComputeXheightConsistency()

void tesseract::LMConsistencyInfo::ComputeXheightConsistency ( const BLOB_CHOICE * b,
bool  is_punc 
)

Definition at line 29 of file lm_consistency.cpp.

30  {
32  return; // It isn't going to get any better.
33 
34  // Compute xheight consistency.
35  bool parent_null = xht_sp < 0;
36  int parent_sp = xht_sp;
37  // Debug strings.
40  } else if (b->yshift() < -LMConsistencyInfo::kShiftThresh) {
42  } else {
44  }
45  xht_count[xht_sp]++;
46  if (is_punc) xht_count_punc[xht_sp]++;
47  if (!parent_null) {
48  xpos_entropy += abs(parent_sp - xht_sp);
49  }
50  // TODO(eger): Figure out a better way to account for small caps.
51  // For the first character not y-shifted, we only care if it is too small.
52  // Too large is common in drop caps and small caps.
53  // int16_t small_xht = b->min_xheight();
54  // if (parent_vse == nullptr && sp == LanguageModelConsistencyInfo::kNORM) {
55  // small_xht = 0;
56  // }
58  &(xht_lo[xht_sp]), &(xht_hi[xht_sp]));
59 
60 
61  // Compute xheight inconsistency kinds.
62  if (parent_null) {
63  if (xht_count[kNORM] == 1) {
65  } else {
67  }
68  return;
69  }
70 
71  // When we intersect the ranges of xheights in pixels for all characters in
72  // each position (subscript, normal, superscript),
73  // How much range must be left? 0? [exactly one pixel height for xheight] 1?
74  // TODO(eger): Extend this code to take a prior for the rest of the line.
75  const int kMinIntersectedXHeightRange = 0;
76  for (int i = 0; i < kNumPos; i++) {
77  if (xht_lo[i] > xht_hi[i] - kMinIntersectedXHeightRange) {
79  return;
80  }
81  }
82 
83  // Reject as improbable anything where there's much punctuation in subscript
84  // or superscript regions.
85  if (xht_count_punc[kSUB] > xht_count[kSUB] * 0.4 ||
86  xht_count_punc[kSUP] > xht_count[kSUP] * 0.4) {
88  return;
89  }
90 
91  // Now check that the subscript and superscript aren't too small relative to
92  // the mainline.
93  auto mainline_xht = static_cast<double>(xht_lo[kNORM]);
94  double kMinSizeRatio = 0.4;
95  if (mainline_xht > 0.0 &&
96  (static_cast<double>(xht_hi[kSUB]) / mainline_xht < kMinSizeRatio ||
97  static_cast<double>(xht_hi[kSUP]) / mainline_xht < kMinSizeRatio)) {
99  return;
100  }
101  // TODO(eger): Check into inconsistency of super/subscript y offsets.
102  if (xpos_entropy > kMaxEntropy) {
104  return;
105  }
106  if (xht_count[kSUB] == 0 && xht_count[kSUP] == 0) {
108  return;
109  }
111 }

◆ Consistent()

bool tesseract::LMConsistencyInfo::Consistent ( ) const
inline

Definition at line 94 of file lm_consistency.h.

94  {
95  return (NumInconsistentPunc() == 0 && NumInconsistentCase() == 0 &&
98  }

◆ InconsistentXHeight()

int tesseract::LMConsistencyInfo::InconsistentXHeight ( ) const
inline

Definition at line 102 of file lm_consistency.h.

102  {
103  return xht_decision == XH_INCONSISTENT;
104  }

◆ NumInconsistentCase()

int tesseract::LMConsistencyInfo::NumInconsistentCase ( ) const
inline

Definition at line 87 of file lm_consistency.h.

87  {
89  }

◆ NumInconsistentChartype()

int tesseract::LMConsistencyInfo::NumInconsistentChartype ( ) const
inline

Definition at line 90 of file lm_consistency.h.

90  {
91  return (NumInconsistentPunc() + num_other +
93  }

◆ NumInconsistentPunc()

int tesseract::LMConsistencyInfo::NumInconsistentPunc ( ) const
inline

Definition at line 84 of file lm_consistency.h.

84  {
85  return invalid_punc ? num_punc : 0;
86  }

◆ NumInconsistentSpaces()

int tesseract::LMConsistencyInfo::NumInconsistentSpaces ( ) const
inline

Definition at line 99 of file lm_consistency.h.

99  {
101  }

Member Data Documentation

◆ chartype

ChartypeEnum tesseract::LMConsistencyInfo::chartype

Definition at line 122 of file lm_consistency.h.

◆ inconsistent_font

bool tesseract::LMConsistencyInfo::inconsistent_font

Definition at line 137 of file lm_consistency.h.

◆ inconsistent_script

bool tesseract::LMConsistencyInfo::inconsistent_script

Definition at line 136 of file lm_consistency.h.

◆ invalid_punc

bool tesseract::LMConsistencyInfo::invalid_punc

Definition at line 135 of file lm_consistency.h.

◆ kMaxEntropy

const int tesseract::LMConsistencyInfo::kMaxEntropy = 1
static

Definition at line 47 of file lm_consistency.h.

◆ kNORM

const int tesseract::LMConsistencyInfo::kNORM = 1
static

Definition at line 50 of file lm_consistency.h.

◆ kNumPos

const int tesseract::LMConsistencyInfo::kNumPos = 3
static

Definition at line 51 of file lm_consistency.h.

◆ kShiftThresh

const int tesseract::LMConsistencyInfo::kShiftThresh = 1
static

Definition at line 43 of file lm_consistency.h.

◆ kSUB

const int tesseract::LMConsistencyInfo::kSUB = 0
static

Definition at line 50 of file lm_consistency.h.

◆ kSUP

const int tesseract::LMConsistencyInfo::kSUP = 2
static

Definition at line 50 of file lm_consistency.h.

◆ num_alphas

int tesseract::LMConsistencyInfo::num_alphas

Definition at line 118 of file lm_consistency.h.

◆ num_digits

int tesseract::LMConsistencyInfo::num_digits

Definition at line 119 of file lm_consistency.h.

◆ num_inconsistent_spaces

int tesseract::LMConsistencyInfo::num_inconsistent_spaces

Definition at line 127 of file lm_consistency.h.

◆ num_lower

int tesseract::LMConsistencyInfo::num_lower

Definition at line 125 of file lm_consistency.h.

◆ num_non_first_upper

int tesseract::LMConsistencyInfo::num_non_first_upper

Definition at line 124 of file lm_consistency.h.

◆ num_other

int tesseract::LMConsistencyInfo::num_other

Definition at line 121 of file lm_consistency.h.

◆ num_punc

int tesseract::LMConsistencyInfo::num_punc

Definition at line 120 of file lm_consistency.h.

◆ punc_ref

EDGE_REF tesseract::LMConsistencyInfo::punc_ref

Definition at line 117 of file lm_consistency.h.

◆ script_id

int tesseract::LMConsistencyInfo::script_id

Definition at line 126 of file lm_consistency.h.

◆ xht_count

int16_t tesseract::LMConsistencyInfo::xht_count[kNumPos]

Definition at line 131 of file lm_consistency.h.

◆ xht_count_punc

int16_t tesseract::LMConsistencyInfo::xht_count_punc[kNumPos]

Definition at line 132 of file lm_consistency.h.

◆ xht_decision

XHeightConsistencyEnum tesseract::LMConsistencyInfo::xht_decision

Definition at line 123 of file lm_consistency.h.

◆ xht_hi

float tesseract::LMConsistencyInfo::xht_hi[kNumPos]

Definition at line 130 of file lm_consistency.h.

◆ xht_lo

float tesseract::LMConsistencyInfo::xht_lo[kNumPos]

Definition at line 129 of file lm_consistency.h.

◆ xht_sp

int16_t tesseract::LMConsistencyInfo::xht_sp

Definition at line 133 of file lm_consistency.h.

◆ xpos_entropy

int16_t tesseract::LMConsistencyInfo::xpos_entropy

Definition at line 134 of file lm_consistency.h.


The documentation for this struct was generated from the following files:
lm_consistency.h
lm_consistency.cpp
tesseract::LMConsistencyInfo::kNORM
static const int kNORM
Definition: lm_consistency.h:50
tesseract::LMConsistencyInfo::chartype
ChartypeEnum chartype
Definition: lm_consistency.h:122
tesseract::XH_SUBNORMAL
Definition: dict.h:78
BLOB_CHOICE::min_xheight
float min_xheight() const
Definition: ratngs.h:118
IntersectRange
void IntersectRange(const T &lower1, const T &upper1, T *lower2, T *upper2)
Definition: helpers.h:143
tesseract::LMConsistencyInfo::kShiftThresh
static const int kShiftThresh
Definition: lm_consistency.h:43
tesseract::LMConsistencyInfo::xht_count_punc
int16_t xht_count_punc[kNumPos]
Definition: lm_consistency.h:132
tesseract::LMConsistencyInfo::inconsistent_script
bool inconsistent_script
Definition: lm_consistency.h:136
tesseract::XH_GOOD
Definition: dict.h:78
tesseract::LMConsistencyInfo::xht_sp
int16_t xht_sp
Definition: lm_consistency.h:133
tesseract::LMConsistencyInfo::CT_ALPHA
Definition: lm_consistency.h:39
tesseract::LMConsistencyInfo::num_punc
int num_punc
Definition: lm_consistency.h:120
tesseract::LMConsistencyInfo::xht_decision
XHeightConsistencyEnum xht_decision
Definition: lm_consistency.h:123
tesseract::LMConsistencyInfo::kNumPos
static const int kNumPos
Definition: lm_consistency.h:51
tesseract::LMConsistencyInfo::kMaxEntropy
static const int kMaxEntropy
Definition: lm_consistency.h:47
tesseract::LMConsistencyInfo::xht_hi
float xht_hi[kNumPos]
Definition: lm_consistency.h:130
tesseract::LMConsistencyInfo::NumInconsistentCase
int NumInconsistentCase() const
Definition: lm_consistency.h:87
tesseract::LMConsistencyInfo::num_lower
int num_lower
Definition: lm_consistency.h:125
tesseract::LMConsistencyInfo::xht_count
int16_t xht_count[kNumPos]
Definition: lm_consistency.h:131
tesseract::LMConsistencyInfo::kSUB
static const int kSUB
Definition: lm_consistency.h:50
tesseract::LMConsistencyInfo::punc_ref
EDGE_REF punc_ref
Definition: lm_consistency.h:117
tesseract::XH_INCONSISTENT
Definition: dict.h:78
tesseract::LMConsistencyInfo::xpos_entropy
int16_t xpos_entropy
Definition: lm_consistency.h:134
BLOB_CHOICE::max_xheight
float max_xheight() const
Definition: ratngs.h:121
tesseract::LMConsistencyInfo::num_inconsistent_spaces
int num_inconsistent_spaces
Definition: lm_consistency.h:127
tesseract::LMConsistencyInfo::xht_lo
float xht_lo[kNumPos]
Definition: lm_consistency.h:129
tesseract::LMConsistencyInfo::num_digits
int num_digits
Definition: lm_consistency.h:119
tesseract::LMConsistencyInfo::script_id
int script_id
Definition: lm_consistency.h:126
BLOB_CHOICE::yshift
float yshift() const
Definition: ratngs.h:124
tesseract::LMConsistencyInfo::CT_DIGIT
Definition: lm_consistency.h:39
tesseract::LMConsistencyInfo::num_other
int num_other
Definition: lm_consistency.h:121
tesseract::LMConsistencyInfo::num_non_first_upper
int num_non_first_upper
Definition: lm_consistency.h:124
tesseract::LMConsistencyInfo::invalid_punc
bool invalid_punc
Definition: lm_consistency.h:135
tesseract::LMConsistencyInfo::CT_OTHER
Definition: lm_consistency.h:39
tesseract::LMConsistencyInfo::kSUP
static const int kSUP
Definition: lm_consistency.h:50
tesseract::LMConsistencyInfo::CT_NONE
Definition: lm_consistency.h:39
tesseract::LMConsistencyInfo::InconsistentXHeight
int InconsistentXHeight() const
Definition: lm_consistency.h:102
tesseract::LMConsistencyInfo::num_alphas
int num_alphas
Definition: lm_consistency.h:118
tesseract::LMConsistencyInfo::NumInconsistentChartype
int NumInconsistentChartype() const
Definition: lm_consistency.h:90
tesseract::LMConsistencyInfo::inconsistent_font
bool inconsistent_font
Definition: lm_consistency.h:137
tesseract::LMConsistencyInfo::NumInconsistentPunc
int NumInconsistentPunc() const
Definition: lm_consistency.h:84