tesseract  4.0.0-1-g2a2b
tesseract::ClassPruner Class Reference

Public Member Functions

 ClassPruner (int max_classes)
 
 ~ClassPruner ()
 
void ComputeScores (const INT_TEMPLATES_STRUCT *int_templates, int num_features, const INT_FEATURE_STRUCT *features)
 
void AdjustForExpectedNumFeatures (const uint16_t *expected_num_features, int cutoff_strength)
 
void DisableDisabledClasses (const UNICHARSET &unicharset)
 
void DisableFragments (const UNICHARSET &unicharset)
 
void NormalizeForXheight (int norm_multiplier, const uint8_t *normalization_factors)
 
void NoNormalization ()
 
void PruneAndSort (int pruning_factor, int keep_this, bool max_of_non_fragments, const UNICHARSET &unicharset)
 
void DebugMatch (const Classify &classify, const INT_TEMPLATES_STRUCT *int_templates, const INT_FEATURE_STRUCT *features) const
 
void SummarizeResult (const Classify &classify, const INT_TEMPLATES_STRUCT *int_templates, const uint16_t *expected_num_features, int norm_multiplier, const uint8_t *normalization_factors) const
 
int SetupResults (GenericVector< CP_RESULT_STRUCT > *results) const
 

Detailed Description

Definition at line 104 of file intmatcher.cpp.

Constructor & Destructor Documentation

◆ ClassPruner()

tesseract::ClassPruner::ClassPruner ( int  max_classes)
inline

Definition at line 106 of file intmatcher.cpp.

106  {
107  // The unrolled loop in ComputeScores means that the array sizes need to
108  // be rounded up so that the array is big enough to accommodate the extra
109  // entries accessed by the unrolling. Each pruner word is of size
110  // BITS_PER_WERD and each entry is NUM_BITS_PER_CLASS, so there are
111  // BITS_PER_WERD / NUM_BITS_PER_CLASS entries.
112  // See ComputeScores.
113  max_classes_ = max_classes;
114  rounded_classes_ = RoundUp(
115  max_classes, WERDS_PER_CP_VECTOR * BITS_PER_WERD / NUM_BITS_PER_CLASS);
116  class_count_ = new int[rounded_classes_];
117  norm_count_ = new int[rounded_classes_];
118  sort_key_ = new int[rounded_classes_ + 1];
119  sort_index_ = new int[rounded_classes_ + 1];
120  for (int i = 0; i < rounded_classes_; i++) {
121  class_count_[i] = 0;
122  }
123  pruning_threshold_ = 0;
124  num_features_ = 0;
125  num_classes_ = 0;
126  }
#define NUM_BITS_PER_CLASS
Definition: intproto.h:55
#define BITS_PER_WERD
Definition: intproto.h:45
int RoundUp(int n, int block_size)
Definition: helpers.h:105
#define WERDS_PER_CP_VECTOR
Definition: intproto.h:62

◆ ~ClassPruner()

tesseract::ClassPruner::~ClassPruner ( )
inline

Definition at line 128 of file intmatcher.cpp.

128  {
129  delete []class_count_;
130  delete []norm_count_;
131  delete []sort_key_;
132  delete []sort_index_;
133  }

Member Function Documentation

◆ AdjustForExpectedNumFeatures()

void tesseract::ClassPruner::AdjustForExpectedNumFeatures ( const uint16_t *  expected_num_features,
int  cutoff_strength 
)
inline

Adjusts the scores according to the number of expected features. Used in lieu of a constant bias, this penalizes classes that expect more features than are present. Thus an actual 'c' will score higher for class 'c' than for class 'e', even though almost all the features match 'e' as well as 'c', because 'e' expects more features to be present.

Definition at line 208 of file intmatcher.cpp.

209  {
210  for (int class_id = 0; class_id < max_classes_; ++class_id) {
211  if (num_features_ < expected_num_features[class_id]) {
212  int deficit = expected_num_features[class_id] - num_features_;
213  class_count_[class_id] -= class_count_[class_id] * deficit /
214  (num_features_ * cutoff_strength + deficit);
215  }
216  }
217  }

◆ ComputeScores()

void tesseract::ClassPruner::ComputeScores ( const INT_TEMPLATES_STRUCT *  int_templates,
int  num_features,
const INT_FEATURE_STRUCT *  features 
)
inline

Computes the scores for every class in the character set, by summing the weights for each feature and stores the sums internally in class_count_.

Definition at line 137 of file intmatcher.cpp.

138  {
139  num_features_ = num_features;
140  int num_pruners = int_templates->NumClassPruners;
141  for (int f = 0; f < num_features; ++f) {
142  const INT_FEATURE_STRUCT* feature = &features[f];
143  // Quantize the feature to NUM_CP_BUCKETS*NUM_CP_BUCKETS*NUM_CP_BUCKETS.
144  int x = feature->X * NUM_CP_BUCKETS >> 8;
145  int y = feature->Y * NUM_CP_BUCKETS >> 8;
146  int theta = feature->Theta * NUM_CP_BUCKETS >> 8;
147  int class_id = 0;
148  // Each CLASS_PRUNER_STRUCT only covers CLASSES_PER_CP(32) classes, so
149  // we need a collection of them, indexed by pruner_set.
150  for (int pruner_set = 0; pruner_set < num_pruners; ++pruner_set) {
151  // Look up quantized feature in a 3-D array, an array of weights for
152  // each class.
153  const uint32_t* pruner_word_ptr =
154  int_templates->ClassPruners[pruner_set]->p[x][y][theta];
155  for (int word = 0; word < WERDS_PER_CP_VECTOR; ++word) {
156  uint32_t pruner_word = *pruner_word_ptr++;
157  // This inner loop is unrolled to speed up the ClassPruner.
158  // Currently gcc would not unroll it unless it is set to O3
159  // level of optimization or -funroll-loops is specified.
160  /*
161  uint32_t class_mask = (1 << NUM_BITS_PER_CLASS) - 1;
162  for (int bit = 0; bit < BITS_PER_WERD/NUM_BITS_PER_CLASS; bit++) {
163  class_count_[class_id++] += pruner_word & class_mask;
164  pruner_word >>= NUM_BITS_PER_CLASS;
165  }
166  */
167  class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
168  pruner_word >>= NUM_BITS_PER_CLASS;
169  class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
170  pruner_word >>= NUM_BITS_PER_CLASS;
171  class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
172  pruner_word >>= NUM_BITS_PER_CLASS;
173  class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
174  pruner_word >>= NUM_BITS_PER_CLASS;
175  class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
176  pruner_word >>= NUM_BITS_PER_CLASS;
177  class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
178  pruner_word >>= NUM_BITS_PER_CLASS;
179  class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
180  pruner_word >>= NUM_BITS_PER_CLASS;
181  class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
182  pruner_word >>= NUM_BITS_PER_CLASS;
183  class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
184  pruner_word >>= NUM_BITS_PER_CLASS;
185  class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
186  pruner_word >>= NUM_BITS_PER_CLASS;
187  class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
188  pruner_word >>= NUM_BITS_PER_CLASS;
189  class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
190  pruner_word >>= NUM_BITS_PER_CLASS;
191  class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
192  pruner_word >>= NUM_BITS_PER_CLASS;
193  class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
194  pruner_word >>= NUM_BITS_PER_CLASS;
195  class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
196  pruner_word >>= NUM_BITS_PER_CLASS;
197  class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
198  }
199  }
200  }
201  }
CLASS_PRUNER_STRUCT * ClassPruners[MAX_NUM_CLASS_PRUNERS]
Definition: intproto.h:122
#define NUM_BITS_PER_CLASS
Definition: intproto.h:55
#define CLASS_PRUNER_CLASS_MASK
Definition: intproto.h:56
uint32_t p[NUM_CP_BUCKETS][NUM_CP_BUCKETS][NUM_CP_BUCKETS][WERDS_PER_CP_VECTOR]
Definition: intproto.h:78
#define NUM_CP_BUCKETS
Definition: intproto.h:53
#define WERDS_PER_CP_VECTOR
Definition: intproto.h:62

◆ DebugMatch()

void tesseract::ClassPruner::DebugMatch ( const Classify &  classify,
const INT_TEMPLATES_STRUCT *  int_templates,
const INT_FEATURE_STRUCT *  features 
) const
inline

Prints debug info on the class pruner matches for the pruned classes only.

Definition at line 297 of file intmatcher.cpp.

299  {
300  int num_pruners = int_templates->NumClassPruners;
301  int max_num_classes = int_templates->NumClasses;
302  for (int f = 0; f < num_features_; ++f) {
303  const INT_FEATURE_STRUCT* feature = &features[f];
304  tprintf("F=%3d(%d,%d,%d),", f, feature->X, feature->Y, feature->Theta);
305  // Quantize the feature to NUM_CP_BUCKETS*NUM_CP_BUCKETS*NUM_CP_BUCKETS.
306  int x = feature->X * NUM_CP_BUCKETS >> 8;
307  int y = feature->Y * NUM_CP_BUCKETS >> 8;
308  int theta = feature->Theta * NUM_CP_BUCKETS >> 8;
309  int class_id = 0;
310  for (int pruner_set = 0; pruner_set < num_pruners; ++pruner_set) {
311  // Look up quantized feature in a 3-D array, an array of weights for
312  // each class.
313  const uint32_t* pruner_word_ptr =
314  int_templates->ClassPruners[pruner_set]->p[x][y][theta];
315  for (int word = 0; word < WERDS_PER_CP_VECTOR; ++word) {
316  uint32_t pruner_word = *pruner_word_ptr++;
317  for (int word_class = 0; word_class < 16 &&
318  class_id < max_num_classes; ++word_class, ++class_id) {
319  if (norm_count_[class_id] >= pruning_threshold_) {
320  tprintf(" %s=%d,",
321  classify.ClassIDToDebugStr(int_templates,
322  class_id, 0).string(),
323  pruner_word & CLASS_PRUNER_CLASS_MASK);
324  }
325  pruner_word >>= NUM_BITS_PER_CLASS;
326  }
327  }
328  tprintf("\n");
329  }
330  }
331  }
CLASS_PRUNER_STRUCT * ClassPruners[MAX_NUM_CLASS_PRUNERS]
Definition: intproto.h:122
#define NUM_BITS_PER_CLASS
Definition: intproto.h:55
#define CLASS_PRUNER_CLASS_MASK
Definition: intproto.h:56
uint32_t p[NUM_CP_BUCKETS][NUM_CP_BUCKETS][NUM_CP_BUCKETS][WERDS_PER_CP_VECTOR]
Definition: intproto.h:78
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
#define NUM_CP_BUCKETS
Definition: intproto.h:53
#define WERDS_PER_CP_VECTOR
Definition: intproto.h:62

◆ DisableDisabledClasses()

void tesseract::ClassPruner::DisableDisabledClasses ( const UNICHARSET &  unicharset)
inline

Zeros the scores for classes disabled in the unicharset. Implements the black-list to recognize a subset of the character set.

Definition at line 221 of file intmatcher.cpp.

221  {
222  for (int class_id = 0; class_id < max_classes_; ++class_id) {
223  if (!unicharset.get_enabled(class_id))
224  class_count_[class_id] = 0; // This char is disabled!
225  }
226  }
bool get_enabled(UNICHAR_ID unichar_id) const
Definition: unicharset.h:873

◆ DisableFragments()

void tesseract::ClassPruner::DisableFragments ( const UNICHARSET &  unicharset)
inline

Zeros the scores of fragments.

Definition at line 229 of file intmatcher.cpp.

229  {
230  for (int class_id = 0; class_id < max_classes_; ++class_id) {
231  // Do not include character fragments in the class pruner
232  // results if disable_character_fragments is true.
233  if (unicharset.get_fragment(class_id)) {
234  class_count_[class_id] = 0;
235  }
236  }
237  }
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:729

◆ NoNormalization()

void tesseract::ClassPruner::NoNormalization ( )
inline

The no-op normalization: copies the class_count_ array to norm_count_ unchanged.

Definition at line 252 of file intmatcher.cpp.

252  {
253  for (int class_id = 0; class_id < max_classes_; class_id++) {
254  norm_count_[class_id] = class_count_[class_id];
255  }
256  }

◆ NormalizeForXheight()

void tesseract::ClassPruner::NormalizeForXheight ( int  norm_multiplier,
const uint8_t *  normalization_factors 
)
inline

Normalizes the counts for xheight, putting the normalized result in norm_count_. Applies a simple subtractive penalty for incorrect vertical position provided by the normalization_factors array, indexed by character class, and scaled by the norm_multiplier.

Definition at line 243 of file intmatcher.cpp.

244  {
245  for (int class_id = 0; class_id < max_classes_; class_id++) {
246  norm_count_[class_id] = class_count_[class_id] -
247  ((norm_multiplier * normalization_factors[class_id]) >> 8);
248  }
249  }

◆ PruneAndSort()

void tesseract::ClassPruner::PruneAndSort ( int  pruning_factor,
int  keep_this,
bool  max_of_non_fragments,
const UNICHARSET &  unicharset 
)
inline

Prunes the classes using <the maximum count> * pruning_factor/256 as a threshold for keeping classes. If max_of_non_fragments, then ignore fragments in computing the maximum count.

Definition at line 261 of file intmatcher.cpp.

262  {
263  int max_count = 0;
264  for (int c = 0; c < max_classes_; ++c) {
265  if (norm_count_[c] > max_count &&
266  // This additional check is added in order to ensure that
267  // the classifier will return at least one non-fragmented
268  // character match.
269  // TODO(daria): verify that this helps accuracy and does not
270  // hurt performance.
271  (!max_of_non_fragments || !unicharset.get_fragment(c))) {
272  max_count = norm_count_[c];
273  }
274  }
275  // Prune Classes.
276  pruning_threshold_ = (max_count * pruning_factor) >> 8;
277  // Select Classes.
278  if (pruning_threshold_ < 1)
279  pruning_threshold_ = 1;
280  num_classes_ = 0;
281  for (int class_id = 0; class_id < max_classes_; class_id++) {
282  if (norm_count_[class_id] >= pruning_threshold_ ||
283  class_id == keep_this) {
284  ++num_classes_;
285  sort_index_[num_classes_] = class_id;
286  sort_key_[num_classes_] = norm_count_[class_id];
287  }
288  }
289 
290  // Sort Classes using Heapsort Algorithm.
291  if (num_classes_ > 1)
292  HeapSort(num_classes_, sort_key_, sort_index_);
293  }
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:729
void HeapSort(int n, int ra[], int rb[])

◆ SetupResults()

int tesseract::ClassPruner::SetupResults ( GenericVector< CP_RESULT_STRUCT > *  results) const
inline

Copies the pruned, sorted classes into the output results and returns the number of classes.

Definition at line 357 of file intmatcher.cpp.

357  {
358  CP_RESULT_STRUCT empty;
359  results->init_to_size(num_classes_, empty);
360  for (int c = 0; c < num_classes_; ++c) {
361  (*results)[c].Class = sort_index_[num_classes_ - c];
362  (*results)[c].Rating = 1.0 - sort_key_[num_classes_ - c] /
363  (static_cast<float>(CLASS_PRUNER_CLASS_MASK) * num_features_);
364  }
365  return num_classes_;
366  }
#define CLASS_PRUNER_CLASS_MASK
Definition: intproto.h:56
void init_to_size(int size, const T &t)

◆ SummarizeResult()

void tesseract::ClassPruner::SummarizeResult ( const Classify &  classify,
const INT_TEMPLATES_STRUCT *  int_templates,
const uint16_t *  expected_num_features,
int  norm_multiplier,
const uint8_t *  normalization_factors 
) const
inline

Prints a summary of the pruner result.

Definition at line 334 of file intmatcher.cpp.

338  {
339  tprintf("CP:%d classes, %d features:\n", num_classes_, num_features_);
340  for (int i = 0; i < num_classes_; ++i) {
341  int class_id = sort_index_[num_classes_ - i];
342  STRING class_string = classify.ClassIDToDebugStr(int_templates,
343  class_id, 0);
344  tprintf("%s:Initial=%d, E=%d, Xht-adj=%d, N=%d, Rat=%.2f\n",
345  class_string.string(),
346  class_count_[class_id],
347  expected_num_features[class_id],
348  (norm_multiplier * normalization_factors[class_id]) >> 8,
349  sort_key_[num_classes_ - i],
350  100.0 - 100.0 * sort_key_[num_classes_ - i] /
351  (CLASS_PRUNER_CLASS_MASK * num_features_));
352  }
353  }
const char * string() const
Definition: strngs.cpp:196
#define CLASS_PRUNER_CLASS_MASK
Definition: intproto.h:56
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
Definition: strngs.h:45

The documentation for this class was generated from the following file: