tesseract  4.0.0-1-g2a2b
adaptmatch.cpp
Go to the documentation of this file.
1 /******************************************************************************
2  ** Filename: adaptmatch.cpp
3  ** Purpose: High level adaptive matcher.
4  ** Author: Dan Johnson
5  **
6  ** (c) Copyright Hewlett-Packard Company, 1988.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  ******************************************************************************/
17 
18 /*-----------------------------------------------------------------------------
19  Include Files and Type Defines
20 -----------------------------------------------------------------------------*/
21 #ifdef HAVE_CONFIG_H
22 #include "config_auto.h"
23 #endif
24 
25 #include <algorithm> // for max, min
26 #include <cassert> // for assert
27 #include <cmath> // for fabs
28 #include <cstdint> // for INT32_MAX, UINT8_MAX
29 #include <cstdio> // for fflush, fclose, fopen, stdout, FILE
30 #include <cstdlib> // for malloc
31 #include <cstring> // for strstr, memset, strcmp
32 #include "adaptive.h" // for ADAPT_CLASS, free_adapted_templates
33 #include "ambigs.h" // for UnicharIdVector, UnicharAmbigs
34 #include "bitvec.h" // for FreeBitVector, NewBitVector, BIT_VECTOR
35 #include "blobs.h" // for TBLOB, TWERD
36 #include "callcpp.h" // for cprintf, window_wait
37 #include "classify.h" // for Classify, CST_FRAGMENT, CST_WHOLE
38 #include "dict.h" // for Dict
39 #include "errcode.h" // for ASSERT_HOST
40 #include "featdefs.h" // for CharNormDesc
41 #include "float2int.h" // for BASELINE_Y_SHIFT
42 #include "fontinfo.h" // for ScoredFont, FontSet
43 #include "genericvector.h" // for GenericVector
44 #include "helpers.h" // for IntCastRounded, ClipToRange
45 #include "host.h" // for FALSE, TRUE
46 #include "intfx.h" // for BlobToTrainingSample, INT_FX_RESULT_S...
47 #include "intmatcher.h" // for CP_RESULT_STRUCT, IntegerMatcher
48 #include "intproto.h" // for INT_FEATURE_STRUCT, (anonymous), Clas...
49 #include "matchdefs.h" // for CLASS_ID, FEATURE_ID, PROTO_ID, NO_PROTO
50 #include "mfoutline.h" // for baseline, character, MF_SCALE_FACTOR
51 #include "normalis.h" // for DENORM, kBlnBaselineOffset, kBlnXHeight
52 #include "normfeat.h" // for ActualOutlineLength, CharNormLength
53 #include "ocrfeatures.h" // for FEATURE_STRUCT, FreeFeatureSet, FEATURE
54 #include "oldlist.h" // for push, delete_d
55 #include "outfeat.h" // for OutlineFeatDir, OutlineFeatLength
56 #include "pageres.h" // for WERD_RES
57 #include "params.h" // for IntParam, BoolParam, DoubleParam, Str...
58 #include "picofeat.h" // for PicoFeatDir, PicoFeatX, PicoFeatY
59 #include "protos.h" // for PROTO_STRUCT, FillABC, PROTO
60 #include "ratngs.h" // for BLOB_CHOICE_IT, BLOB_CHOICE_LIST, BLO...
61 #include "rect.h" // for TBOX
62 #include "scrollview.h" // for ScrollView, ScrollView::BROWN, Scroll...
63 #include "seam.h" // for SEAM
64 #include "serialis.h" // for TFile
65 #include "shapeclassifier.h" // for ShapeClassifier
66 #include "shapetable.h" // for UnicharRating, ShapeTable, Shape, Uni...
67 #include "strngs.h" // for STRING
68 #include "tessclassifier.h" // for TessClassifier
69 #include "tessdatamanager.h" // for TessdataManager, TESSDATA_INTTEMP
70 #include "tprintf.h" // for tprintf
71 #include "trainingsample.h" // for TrainingSample
72 #include "unichar.h" // for UNICHAR_ID, INVALID_UNICHAR_ID
73 #include "unicharset.h" // for UNICHARSET, CHAR_FRAGMENT, UNICHAR_SPACE
74 #include "unicity_table.h" // for UnicityTable
75 
76 #define ADAPT_TEMPLATE_SUFFIX ".a"
77 
78 #define MAX_MATCHES 10
79 #define UNLIKELY_NUM_FEAT 200
80 #define NO_DEBUG 0
81 #define MAX_ADAPTABLE_WERD_SIZE 40
82 
83 #define ADAPTABLE_WERD_ADJUSTMENT (0.05)
84 
85 #define Y_DIM_OFFSET (Y_SHIFT - BASELINE_Y_SHIFT)
86 
87 #define WORST_POSSIBLE_RATING (0.0f)
88 
91 
// Holds the outcome of an adaptive match: the list of candidate matches and a
// cached summary (id, index and rating) of the best one.
// NOTE(review): this listing is missing several lines, including the
// declarations of HasNonfragment, best_unichar_id, best_match_index and match
// that the methods below use. Confirm against the full source.
92 struct ADAPT_RESULTS {
93  int32_t BlobLength;
97  float best_rating;
100 
    // Resets BlobLength and HasNonfragment to their initial values and then
    // refreshes the best-match summary via ComputeBest().
103  inline void Initialize() {
104  BlobLength = INT32_MAX;
105  HasNonfragment = false;
106  ComputeBest();
107  }
108  // Computes best_unichar_id, best_match_index and best_rating.
    // Scans match in order; an entry replaces the current best only when its
    // rating is strictly greater, so the earliest of equal ratings wins.
    // NOTE(review): listing line 112 is missing here; presumably it resets
    // best_rating before the scan - confirm.
109  void ComputeBest() {
110  best_unichar_id = INVALID_UNICHAR_ID;
111  best_match_index = -1;
113  for (int i = 0; i < match.size(); ++i) {
114  if (match[i].rating > best_rating) {
115  best_rating = match[i].rating;
116  best_unichar_id = match[i].unichar_id;
117  best_match_index = i;
118  }
119  }
120  }
121 };
122 
// NOTE(review): listing lines 124-125 are missing from this extract, so only
// the ConfigId member is visible; confirm the remaining members against the
// full source.
123 struct PROTO_KEY {
126  int ConfigId;
127 };
128 
129 /*-----------------------------------------------------------------------------
130  Private Macros
131 -----------------------------------------------------------------------------*/
// Returns true when the gap between confidence and a perfect score of 1.0 is
// strictly greater than matcher_great_threshold.
inline bool MarginalMatch(float confidence, float matcher_great_threshold) {
  const float shortfall = 1.0f - confidence;
  return shortfall > matcher_great_threshold;
}
135 
136 /*-----------------------------------------------------------------------------
137  Private Function Prototypes
138 -----------------------------------------------------------------------------*/
139 // Returns the index of the given id in results, if present, or the size of the
140 // vector (index it will go at) if not present.
141 static int FindScoredUnichar(UNICHAR_ID id, const ADAPT_RESULTS& results) {
142  for (int i = 0; i < results.match.size(); i++) {
143  if (results.match[i].unichar_id == id)
144  return i;
145  }
146  return results.match.size();
147 }
148 
149 // Returns the current rating for a unichar id if we have rated it, defaulting
150 // to WORST_POSSIBLE_RATING.
151 static float ScoredUnichar(UNICHAR_ID id, const ADAPT_RESULTS& results) {
152  int index = FindScoredUnichar(id, results);
153  if (index >= results.match.size()) return WORST_POSSIBLE_RATING;
154  return results.match[index].rating;
155 }
156 
157 void InitMatcherRatings(float *Rating);
158 
159 int MakeTempProtoPerm(void *item1, void *item2);
160 
161 void SetAdaptiveThreshold(float Threshold);
162 
163 
164 /*-----------------------------------------------------------------------------
165  Public Code
166 -----------------------------------------------------------------------------*/
167 /*---------------------------------------------------------------------------*/
168 namespace tesseract {
// Classifies Blob and writes the resulting choices into Choices.
// A temporary ADAPT_RESULTS is heap-allocated, filled by DoAdaptiveMatch,
// pruned, converted to choices and deleted before returning.
// Choices must not be nullptr and AdaptedTemplates must already exist.
// NOTE(review): listing lines 202 and 218 are missing from this extract.
192 void Classify::AdaptiveClassifier(TBLOB *Blob, BLOB_CHOICE_LIST *Choices) {
193  assert(Choices != nullptr);
194  ADAPT_RESULTS *Results = new ADAPT_RESULTS;
195  Results->Initialize();
196 
197  ASSERT_HOST(AdaptedTemplates != nullptr);
198 
199  DoAdaptiveMatch(Blob, Results);
200 
     // Prune the raw matches, then refresh the cached best-match summary
     // before the matches are turned into choices.
201  RemoveBadMatches(Results);
203  RemoveExtraPuncs(Results);
204  Results->ComputeBest();
205  ConvertMatchesToChoices(Blob->denorm(), Blob->bounding_box(), Results,
206  Choices);
207 
208  // TODO(rays) Move to before ConvertMatchesToChoices!
     // An empty choice list is never returned: a speckle choice is added when
     // the blob looks like a large speckle or nothing else was produced.
209  if (LargeSpeckle(*Blob) || Choices->length() == 0)
210  AddLargeSpeckleTo(Results->BlobLength, Choices);
211 
212  if (matcher_debug_level >= 1) {
213  tprintf("AD Matches = ");
214  PrintAdaptiveMatchResults(*Results);
215  }
216 
217 #ifndef GRAPHICS_DISABLED
219  DebugAdaptiveClassifier(Blob, Results);
220 #endif
221 
222  delete Results;
223 } /* AdaptiveClassifier */
224 
225 // If *win is nullptr, sets it to a new ScrollView() object with title msg.
226 // Clears the window and draws baselines.
227 void Classify::RefreshDebugWindow(ScrollView **win, const char *msg,
228  int y_offset, const TBOX &wbox) {
229  #ifndef GRAPHICS_DISABLED
230  const int kSampleSpaceWidth = 500;
231  if (*win == nullptr) {
232  *win = new ScrollView(msg, 100, y_offset, kSampleSpaceWidth * 2, 200,
233  kSampleSpaceWidth * 2, 200, true);
234  }
235  (*win)->Clear();
236  (*win)->Pen(64, 64, 64);
237  (*win)->Line(-kSampleSpaceWidth, kBlnBaselineOffset,
238  kSampleSpaceWidth, kBlnBaselineOffset);
239  (*win)->Line(-kSampleSpaceWidth, kBlnXHeight + kBlnBaselineOffset,
240  kSampleSpaceWidth, kBlnXHeight + kBlnBaselineOffset);
241  (*win)->ZoomToRectangle(wbox.left(), wbox.top(),
242  wbox.right(), wbox.bottom());
243  #endif // GRAPHICS_DISABLED
244 }
245 
246 // Learns the given word using its chopped_word, seam_array, denorm,
247 // box_word, best_state, and correct_text to learn both correctly and
248 // incorrectly segmented blobs. If fontname is not nullptr, then LearnBlob
249 // is called and the data will be saved in an internal buffer.
250 // Otherwise AdaptToBlob is called for adaption within a document.
// NOTE(review): this extract is missing listing lines 261, 265-267, 273, 282,
// 287 and 304, so some braces below close statements that are not visible.
// Confirm details against the full source before relying on them.
251 void Classify::LearnWord(const char* fontname, WERD_RES* word) {
252  int word_len = word->correct_text.size();
     // Nothing to learn when there is no correct text for the word.
253  if (word_len == 0) return;
254 
     // thresholds stays nullptr when fontname is given; each character then
     // falls back to a threshold of 0.0f in the loop below.
255  float* thresholds = nullptr;
256  if (fontname == nullptr) {
257  // Adaption mode.
258  if (!EnableLearning || word->best_choice == nullptr)
259  return; // Can't or won't adapt.
260 
262  tprintf("\n\nAdapting to word = %s\n",
263  word->best_choice->debug_string().string());
264  thresholds = new float[word_len];
268  matcher_rating_margin, thresholds);
269  }
     // Index of the first chopped blob of the current character; advanced by
     // best_state[ch] at the end of every loop iteration.
270  int start_blob = 0;
271 
272  #ifndef GRAPHICS_DISABLED
274  if (learn_fragmented_word_debug_win_ != nullptr) {
275  window_wait(learn_fragmented_word_debug_win_);
276  }
277  RefreshDebugWindow(&learn_fragments_debug_win_, "LearnPieces", 400,
278  word->chopped_word->bounding_box());
279  RefreshDebugWindow(&learn_fragmented_word_debug_win_, "LearnWord", 200,
280  word->chopped_word->bounding_box());
281  word->chopped_word->plot(learn_fragmented_word_debug_win_);
283  }
284  #endif // GRAPHICS_DISABLED
285 
286  for (int ch = 0; ch < word_len; ++ch) {
288  tprintf("\nLearning %s\n", word->correct_text[ch].string());
289  }
     // Characters with empty correct text are skipped, but their blobs are
     // still counted in start_blob below.
290  if (word->correct_text[ch].length() > 0) {
291  float threshold = thresholds != nullptr ? thresholds[ch] : 0.0f;
292 
     // Learn the character as a whole from its best_state[ch] blobs.
293  LearnPieces(fontname, start_blob, word->best_state[ch], threshold,
294  CST_WHOLE, word->correct_text[ch].string(), word);
295 
     // Fragment learning only applies to characters built from more than one
     // blob, and only when fragments are not disabled.
296  if (word->best_state[ch] > 1 && !disable_character_fragments) {
297  // Check that the character breaks into meaningful fragments
298  // that each match a whole character with at least
299  // classify_character_fragments_garbage_certainty_threshold
300  bool garbage = false;
301  int frag;
302  for (frag = 0; frag < word->best_state[ch]; ++frag) {
303  TBLOB* frag_blob = word->chopped_word->blobs[start_blob + frag];
305  garbage |= LooksLikeGarbage(frag_blob);
306  }
307  }
308  // Learn the fragments.
309  if (!garbage) {
310  bool pieces_all_natural = word->PiecesAllNatural(start_blob,
311  word->best_state[ch]);
312  if (pieces_all_natural || !prioritize_division) {
313  for (frag = 0; frag < word->best_state[ch]; ++frag) {
314  GenericVector<STRING> tokens;
315  word->correct_text[ch].split(' ', &tokens);
316 
     // Only the first space-separated token is rewritten as a fragment
     // string; the remaining tokens are kept unchanged.
317  tokens[0] = CHAR_FRAGMENT::to_string(
318  tokens[0].string(), frag, word->best_state[ch],
319  pieces_all_natural);
320 
     // Re-join the tokens with single spaces and no trailing space.
321  STRING full_string;
322  for (int i = 0; i < tokens.size(); i++) {
323  full_string += tokens[i];
324  if (i != tokens.size() - 1)
325  full_string += ' ';
326  }
327  LearnPieces(fontname, start_blob + frag, 1, threshold,
328  CST_FRAGMENT, full_string.string(), word);
329  }
330  }
331  }
332  }
333 
334  // TODO(rays): re-enable this part of the code when we switch to the
335  // new classifier that needs to see examples of garbage.
336  /*
337  if (word->best_state[ch] > 1) {
338  // If the next blob is good, make junk with the rightmost fragment.
339  if (ch + 1 < word_len && word->correct_text[ch + 1].length() > 0) {
340  LearnPieces(fontname, start_blob + word->best_state[ch] - 1,
341  word->best_state[ch + 1] + 1,
342  threshold, CST_IMPROPER, INVALID_UNICHAR, word);
343  }
344  // If the previous blob is good, make junk with the leftmost fragment.
345  if (ch > 0 && word->correct_text[ch - 1].length() > 0) {
346  LearnPieces(fontname, start_blob - word->best_state[ch - 1],
347  word->best_state[ch - 1] + 1,
348  threshold, CST_IMPROPER, INVALID_UNICHAR, word);
349  }
350  }
351  // If the next blob is good, make a join with it.
352  if (ch + 1 < word_len && word->correct_text[ch + 1].length() > 0) {
353  STRING joined_text = word->correct_text[ch];
354  joined_text += word->correct_text[ch + 1];
355  LearnPieces(fontname, start_blob,
356  word->best_state[ch] + word->best_state[ch + 1],
357  threshold, CST_NGRAM, joined_text.string(), word);
358  }
359  */
360  }
361  start_blob += word->best_state[ch];
362  }
     // delete[] on a null pointer is a no-op, so this is safe when no
     // thresholds array was allocated.
363  delete [] thresholds;
364 } // LearnWord.
365 
366 // Builds a blob of length fragments, from the word, starting at start,
367 // and then learns it, as having the given correct_text.
368 // If fontname is not nullptr, then LearnBlob is called and the data will be
369 // saved in an internal buffer for static training.
370 // Otherwise AdaptToBlob is called for adaption within a document.
371 // threshold is a magic number required by AdaptToChar and generated by
372 // ComputeAdaptionThresholds.
373 // Although it can be partly inferred from the string, segmentation is
374 // provided to explicitly clarify the character segmentation.
// NOTE(review): this extract is missing listing lines 406, 417, 425 and 435,
// so a few statements below appear without their opening part.
375 void Classify::LearnPieces(const char* fontname, int start, int length,
376  float threshold, CharSegmentationType segmentation,
377  const char* correct_text, WERD_RES* word) {
378  // TODO(daria) Remove/modify this if/when we want
379  // to train and/or adapt to n-grams.
     // Only CST_WHOLE is always handled; CST_FRAGMENT is handled unless
     // fragments are disabled; every other segmentation type returns here.
380  if (segmentation != CST_WHOLE &&
381  (segmentation != CST_FRAGMENT || disable_character_fragments))
382  return;
383 
     // Temporarily join blobs start..start+length-1; the matching
     // SEAM::BreakPieces call is at the end of this function.
384  if (length > 1) {
385  SEAM::JoinPieces(word->seam_array, word->chopped_word->blobs, start,
386  start + length - 1);
387  }
388  TBLOB* blob = word->chopped_word->blobs[start];
389  // Rotate the blob if needed for classification.
     // A nullptr result is replaced by the original blob, so rotated_blob is
     // always usable; it is only deleted below when it differs from blob.
390  TBLOB* rotated_blob = blob->ClassifyNormalizeIfNeeded();
391  if (rotated_blob == nullptr)
392  rotated_blob = blob;
393 
394  #ifndef GRAPHICS_DISABLED
395  // Draw debug windows showing the blob that is being learned if needed.
396  if (strcmp(classify_learn_debug_str.string(), correct_text) == 0) {
397  RefreshDebugWindow(&learn_debug_win_, "LearnPieces", 600,
398  word->chopped_word->bounding_box());
399  rotated_blob->plot(learn_debug_win_, ScrollView::GREEN, ScrollView::BROWN);
400  learn_debug_win_->Update();
401  window_wait(learn_debug_win_);
402  }
403  if (classify_debug_character_fragments && segmentation == CST_FRAGMENT) {
404  ASSERT_HOST(learn_fragments_debug_win_ != nullptr); // set up in LearnWord
405  blob->plot(learn_fragments_debug_win_,
407  learn_fragments_debug_win_->Update();
408  }
409  #endif // GRAPHICS_DISABLED
410 
     // fontname != nullptr: hand the blob to LearnBlob.
     // fontname == nullptr: adapt the in-document templates via AdaptToChar.
411  if (fontname != nullptr) {
412  classify_norm_method.set_value(character); // force char norm spc 30/11/93
413  tess_bn_matching.set_value(false); // turn it off
414  tess_cn_matching.set_value(false);
415  DENORM bl_denorm, cn_denorm;
416  INT_FX_RESULT_STRUCT fx_info;
418  &bl_denorm, &cn_denorm, &fx_info);
419  LearnBlob(fontname, rotated_blob, cn_denorm, fx_info, correct_text);
420  } else if (unicharset.contains_unichar(correct_text)) {
421  UNICHAR_ID class_id = unicharset.unichar_to_id(correct_text);
     // Font id 0 is used when the word carries no font information.
422  int font_id = word->fontinfo != nullptr
423  ? fontinfo_table_.get_id(*word->fontinfo)
424  : 0;
426  tprintf("Adapting to char = %s, thr= %g font_id= %d\n",
427  unicharset.id_to_unichar(class_id), threshold, font_id);
428  // fontname is nullptr in this branch, so we are doing recognition
429  // (as opposed to training), so we must have already set word fonts.
430  AdaptToChar(rotated_blob, class_id, font_id, threshold, AdaptedTemplates);
431  if (BackupAdaptedTemplates != nullptr) {
432  // Adapt the backup templates too. They will be used if the primary gets
433  // too full.
434  AdaptToChar(rotated_blob, class_id, font_id, threshold,
436  }
437  } else if (classify_debug_level >= 1) {
438  tprintf("Can't adapt to %s not in unicharset\n", correct_text);
439  }
440  if (rotated_blob != blob) {
441  delete rotated_blob;
442  }
443 
     // NOTE(review): BreakPieces runs even when length == 1, although
     // JoinPieces above is skipped in that case - presumably harmless for a
     // single-blob range; confirm.
444  SEAM::BreakPieces(word->seam_array, word->chopped_word->blobs, start,
445  start + length - 1);
446 } // LearnPieces.
447 
448 /*---------------------------------------------------------------------------*/
461  STRING Filename;
462  FILE *File;
463 
464  if (AdaptedTemplates != nullptr &&
466  Filename = imagefile + ADAPT_TEMPLATE_SUFFIX;
467  File = fopen (Filename.string(), "wb");
468  if (File == nullptr)
469  cprintf ("Unable to save adapted templates to %s!\n", Filename.string());
470  else {
471  cprintf ("\nSaving adapted templates to %s ...", Filename.string());
472  fflush(stdout);
474  cprintf ("\n");
475  fclose(File);
476  }
477  }
478 
479  if (AdaptedTemplates != nullptr) {
481  AdaptedTemplates = nullptr;
482  }
483  if (BackupAdaptedTemplates != nullptr) {
485  BackupAdaptedTemplates = nullptr;
486  }
487 
488  if (PreTrainedTemplates != nullptr) {
490  PreTrainedTemplates = nullptr;
491  }
493  FreeNormProtos();
494  if (AllProtosOn != nullptr) {
499  AllProtosOn = nullptr;
500  AllConfigsOn = nullptr;
501  AllConfigsOff = nullptr;
502  TempProtoMask = nullptr;
503  }
504  delete shape_table_;
505  shape_table_ = nullptr;
506  delete static_classifier_;
507  static_classifier_ = nullptr;
508 } /* EndAdaptiveClassifier */
509 
510 
511 /*---------------------------------------------------------------------------*/
530  return;
531  if (AllProtosOn != nullptr)
532  EndAdaptiveClassifier(); // Don't leak with multiple inits.
533 
534  // If there is no language_data_path_prefix, the classifier will be
535  // adaptive only.
536  if (language_data_path_prefix.length() > 0 && mgr != nullptr) {
537  TFile fp;
540 
541  if (mgr->GetComponent(TESSDATA_SHAPE_TABLE, &fp)) {
543  if (!shape_table_->DeSerialize(&fp)) {
544  tprintf("Error loading shape table!\n");
545  delete shape_table_;
546  shape_table_ = nullptr;
547  }
548  }
549 
551  ReadNewCutoffs(&fp, CharNormCutoffs);
552 
554  NormProtos = ReadNormProtos(&fp);
555  static_classifier_ = new TessClassifier(false, this);
556  }
557 
558  InitIntegerFX();
559 
567 
568  for (int i = 0; i < MAX_NUM_CLASSES; i++) {
569  BaselineCutoffs[i] = 0;
570  }
571 
573  TFile fp;
574  STRING Filename;
575 
576  Filename = imagefile;
577  Filename += ADAPT_TEMPLATE_SUFFIX;
578  if (!fp.Open(Filename.string(), nullptr)) {
580  } else {
581  cprintf("\nReading pre-adapted templates from %s ...\n",
582  Filename.string());
583  fflush(stdout);
585  cprintf("\n");
587 
588  for (int i = 0; i < AdaptedTemplates->Templates->NumClasses; i++) {
589  BaselineCutoffs[i] = CharNormCutoffs[i];
590  }
591  }
592  } else {
593  if (AdaptedTemplates != nullptr)
596  }
597 } /* InitAdaptiveClassifier */
598 
601  tprintf("Resetting adaptive classifier (NumAdaptationsFailed=%d)\n",
602  NumAdaptationsFailed);
603  }
606  if (BackupAdaptedTemplates != nullptr)
608  BackupAdaptedTemplates = nullptr;
609  NumAdaptationsFailed = 0;
610 }
611 
612 // If there are backup adapted templates, switches to those, otherwise resets
613 // the main adaptive classifier (because it is full.)
615  if (BackupAdaptedTemplates == nullptr) {
617  return;
618  }
620  tprintf("Switch to backup adaptive classifier (NumAdaptationsFailed=%d)\n",
621  NumAdaptationsFailed);
622  }
625  BackupAdaptedTemplates = nullptr;
626  NumAdaptationsFailed = 0;
627 }
628 
629 // Resets the backup adaptive classifier to empty.
631  if (BackupAdaptedTemplates != nullptr)
634 }
635 
636 /*---------------------------------------------------------------------------*/
655 
657 
658 } /* SettupPass1 */
659 
660 
661 /*---------------------------------------------------------------------------*/
673 
674 } /* SettupPass2 */
675 
676 
677 /*---------------------------------------------------------------------------*/
695  CLASS_ID ClassId,
696  int FontinfoId,
697  ADAPT_CLASS Class,
698  ADAPT_TEMPLATES Templates) {
699  FEATURE_SET Features;
700  int Fid, Pid;
701  FEATURE Feature;
702  int NumFeatures;
703  TEMP_PROTO TempProto;
704  PROTO Proto;
705  INT_CLASS IClass;
707 
708  classify_norm_method.set_value(baseline);
709  Features = ExtractOutlineFeatures(Blob);
710  NumFeatures = Features->NumFeatures;
711  if (NumFeatures > UNLIKELY_NUM_FEAT || NumFeatures <= 0) {
712  FreeFeatureSet(Features);
713  return;
714  }
715 
716  Config = NewTempConfig(NumFeatures - 1, FontinfoId);
717  TempConfigFor(Class, 0) = Config;
718 
719  /* this is a kludge to construct cutoffs for adapted templates */
720  if (Templates == AdaptedTemplates)
721  BaselineCutoffs[ClassId] = CharNormCutoffs[ClassId];
722 
723  IClass = ClassForClassId (Templates->Templates, ClassId);
724 
725  for (Fid = 0; Fid < Features->NumFeatures; Fid++) {
726  Pid = AddIntProto (IClass);
727  assert (Pid != NO_PROTO);
728 
729  Feature = Features->Features[Fid];
730  TempProto = NewTempProto ();
731  Proto = &(TempProto->Proto);
732 
733  /* compute proto params - NOTE that Y_DIM_OFFSET must be used because
734  ConvertProto assumes that the Y dimension varies from -0.5 to 0.5
735  instead of the -0.25 to 0.75 used in baseline normalization */
736  Proto->Angle = Feature->Params[OutlineFeatDir];
737  Proto->X = Feature->Params[OutlineFeatX];
738  Proto->Y = Feature->Params[OutlineFeatY] - Y_DIM_OFFSET;
739  Proto->Length = Feature->Params[OutlineFeatLength];
740  FillABC(Proto);
741 
742  TempProto->ProtoId = Pid;
743  SET_BIT (Config->Protos, Pid);
744 
745  ConvertProto(Proto, Pid, IClass);
746  AddProtoToProtoPruner(Proto, Pid, IClass,
748 
749  Class->TempProtos = push (Class->TempProtos, TempProto);
750  }
751  FreeFeatureSet(Features);
752 
753  AddIntConfig(IClass);
754  ConvertConfig (AllProtosOn, 0, IClass);
755 
757  tprintf("Added new class '%s' with class id %d and %d protos.\n",
758  unicharset.id_to_unichar(ClassId), ClassId, NumFeatures);
760  DisplayAdaptedChar(Blob, IClass);
761  }
762 
763  if (IsEmptyAdaptedClass(Class))
764  (Templates->NumNonEmptyClasses)++;
765 } /* InitAdaptedClass */
766 
767 
768 /*---------------------------------------------------------------------------*/
788  INT_FEATURE_ARRAY IntFeatures,
789  FEATURE_SET *FloatFeatures) {
790  FEATURE_SET Features;
791  int NumFeatures;
792 
793  classify_norm_method.set_value(baseline);
794  Features = ExtractPicoFeatures(Blob);
795 
796  NumFeatures = Features->NumFeatures;
797  if (NumFeatures == 0 || NumFeatures > UNLIKELY_NUM_FEAT) {
798  FreeFeatureSet(Features);
799  return 0;
800  }
801 
802  ComputeIntFeatures(Features, IntFeatures);
803  *FloatFeatures = Features;
804 
805  return NumFeatures;
806 } /* GetAdaptiveFeatures */
807 
808 
809 /*-----------------------------------------------------------------------------
810  Private Code
811 -----------------------------------------------------------------------------*/
812 /*---------------------------------------------------------------------------*/
824  if (word->best_choice == nullptr) return false;
825  int BestChoiceLength = word->best_choice->length();
826  float adaptable_score =
828  return // rules that apply in general - simplest to compute first
829  BestChoiceLength > 0 &&
830  BestChoiceLength == word->rebuild_word->NumBlobs() &&
831  BestChoiceLength <= MAX_ADAPTABLE_WERD_SIZE &&
832  // This basically ensures that the word is at least a dictionary match
833  // (freq word, user word, system dawg word, etc).
834  // Since all the other adjustments will make the adjust factor higher
835  // than adaptable_score=1.1+0.05=1.15.
836  // Since there are other flags that ensure that the word is a dict word,
837  // this check could at times be redundant.
838  word->best_choice->adjust_factor() <= adaptable_score &&
839  // Make sure that alternative choices are not dictionary words.
840  word->AlternativeChoiceAdjustmentsWorseThan(adaptable_score);
841 }
842 
843 /*---------------------------------------------------------------------------*/
// Adapts adaptive_templates to Blob as an example of ClassId in font
// FontinfoId. An empty adapted class is initialised from the blob. Otherwise
// the blob is matched against the class's configs of the same font: a match
// within Threshold reinforces the matched config, while a worse match creates
// a new temporary config.
// NOTE(review): this extract is missing listing lines 896-897, 904, 916, 925,
// 928 and 941, so a few statements below appear without their opening part.
857 void Classify::AdaptToChar(TBLOB* Blob, CLASS_ID ClassId, int FontinfoId,
858  float Threshold,
859  ADAPT_TEMPLATES adaptive_templates) {
860  int NumFeatures;
861  INT_FEATURE_ARRAY IntFeatures;
862  UnicharRating int_result;
863  INT_CLASS IClass;
864  ADAPT_CLASS Class;
865  TEMP_CONFIG TempConfig;
866  FEATURE_SET FloatFeatures;
867  int NewTempConfigId;
868 
869  if (!LegalClassId (ClassId))
870  return;
871 
872  int_result.unichar_id = ClassId;
873  Class = adaptive_templates->Class[ClassId];
874  assert(Class != nullptr);
875  if (IsEmptyAdaptedClass(Class)) {
876  InitAdaptedClass(Blob, ClassId, FontinfoId, Class, adaptive_templates);
877  } else {
878  IClass = ClassForClassId(adaptive_templates->Templates, ClassId);
879 
880  NumFeatures = GetAdaptiveFeatures(Blob, IntFeatures, &FloatFeatures);
881  if (NumFeatures <= 0) {
882  return; // Features already freed by GetAdaptiveFeatures.
883  }
884 
885  // Only match configs with the matching font.
     // NOTE(review): the bit vector is sized with MAX_NUM_PROTOS but indexed
     // by config id - presumably large enough for NumConfigs; confirm.
886  BIT_VECTOR MatchingFontConfigs = NewBitVector(MAX_NUM_PROTOS);
887  for (int cfg = 0; cfg < IClass->NumConfigs; ++cfg) {
888  if (GetFontinfoId(Class, cfg) == FontinfoId) {
889  SET_BIT(MatchingFontConfigs, cfg);
890  } else {
891  reset_bit(MatchingFontConfigs, cfg);
892  }
893  }
894  im_.Match(IClass, AllProtosOn, MatchingFontConfigs,
895  NumFeatures, IntFeatures,
898  FreeBitVector(MatchingFontConfigs);
899 
900  SetAdaptiveThreshold(Threshold);
901 
     // The match is good enough when its rating falls short of 1.0 by no
     // more than Threshold.
902  if (1.0f - int_result.rating <= Threshold) {
     // A good match to a permanent config needs no further adaptation; the
     // features are freed here because of the early return.
903  if (ConfigIsPermanent(Class, int_result.config)) {
905  tprintf("Found good match to perm config %d = %4.1f%%.\n",
906  int_result.config, int_result.rating * 100.0);
907  FreeFeatureSet(FloatFeatures);
908  return;
909  }
910 
     // Good match to a temporary config: reinforce it and keep the class's
     // MaxNumTimesSeen up to date.
911  TempConfig = TempConfigFor(Class, int_result.config);
912  IncreaseConfidence(TempConfig);
913  if (TempConfig->NumTimesSeen > Class->MaxNumTimesSeen) {
914  Class->MaxNumTimesSeen = TempConfig->NumTimesSeen;
915  }
917  tprintf("Increasing reliability of temp config %d to %d.\n",
918  int_result.config, TempConfig->NumTimesSeen);
919 
920  if (TempConfigReliable(ClassId, TempConfig)) {
921  MakePermanent(adaptive_templates, ClassId, int_result.config, Blob);
922  UpdateAmbigsGroup(ClassId, Blob);
923  }
924  } else {
926  tprintf("Found poor match to temp config %d = %4.1f%%.\n",
927  int_result.config, int_result.rating * 100.0);
929  DisplayAdaptedChar(Blob, IClass);
930  }
     // Poor match: build a new temporary config from this blob's features.
     // A negative id means no config was created, so nothing is promoted.
931  NewTempConfigId =
932  MakeNewTemporaryConfig(adaptive_templates, ClassId, FontinfoId,
933  NumFeatures, IntFeatures, FloatFeatures);
934  if (NewTempConfigId >= 0 &&
935  TempConfigReliable(ClassId, TempConfigFor(Class, NewTempConfigId))) {
936  MakePermanent(adaptive_templates, ClassId, NewTempConfigId, Blob);
937  UpdateAmbigsGroup(ClassId, Blob);
938  }
939 
940 #ifndef GRAPHICS_DISABLED
942  DisplayAdaptedChar(Blob, IClass);
943  }
944 #endif
945  }
946  FreeFeatureSet(FloatFeatures);
947  }
948 } /* AdaptToChar */
949 
951 #ifndef GRAPHICS_DISABLED
952  INT_FX_RESULT_STRUCT fx_info;
956  &bl_features);
957  if (sample == nullptr) return;
958 
959  UnicharRating int_result;
960  im_.Match(int_class, AllProtosOn, AllConfigsOn,
961  bl_features.size(), &bl_features[0],
964  tprintf("Best match to temp config %d = %4.1f%%.\n",
965  int_result.config, int_result.rating * 100.0);
967  uint32_t ConfigMask;
968  ConfigMask = 1 << int_result.config;
970  im_.Match(int_class, AllProtosOn, (BIT_VECTOR)&ConfigMask,
971  bl_features.size(), &bl_features[0],
975  }
976 
977  delete sample;
978 #endif
979 }
980 
998 void Classify::AddNewResult(const UnicharRating& new_result,
999  ADAPT_RESULTS *results) {
1000  int old_match = FindScoredUnichar(new_result.unichar_id, *results);
1001 
1002  if (new_result.rating + matcher_bad_match_pad < results->best_rating ||
1003  (old_match < results->match.size() &&
1004  new_result.rating <= results->match[old_match].rating))
1005  return; // New one not good enough.
1006 
1007  if (!unicharset.get_fragment(new_result.unichar_id))
1008  results->HasNonfragment = true;
1009 
1010  if (old_match < results->match.size()) {
1011  results->match[old_match].rating = new_result.rating;
1012  } else {
1013  results->match.push_back(new_result);
1014  }
1015 
1016  if (new_result.rating > results->best_rating &&
1017  // Ensure that fragments do not affect best rating, class and config.
1018  // This is needed so that at least one non-fragmented character is
1019  // always present in the results.
1020  // TODO(daria): verify that this helps accuracy and does not
1021  // hurt performance.
1022  !unicharset.get_fragment(new_result.unichar_id)) {
1023  results->best_match_index = old_match;
1024  results->best_rating = new_result.rating;
1025  results->best_unichar_id = new_result.unichar_id;
1026  }
1027 } /* AddNewResult */
1028 
1029 
1030 /*---------------------------------------------------------------------------*/
1050  const GenericVector<INT_FEATURE_STRUCT>& int_features,
1051  const INT_FX_RESULT_STRUCT& fx_info,
1052  const TBLOB *blob,
1053  INT_TEMPLATES templates,
1054  ADAPT_CLASS *classes,
1055  UNICHAR_ID *ambiguities,
1056  ADAPT_RESULTS *results) {
1057  if (int_features.empty()) return;
1058  uint8_t* CharNormArray = new uint8_t[unicharset.size()];
1059  UnicharRating int_result;
1060 
1061  results->BlobLength = GetCharNormFeature(fx_info, templates, nullptr,
1062  CharNormArray);
1063  bool debug = matcher_debug_level >= 2 || classify_debug_level > 1;
1064  if (debug)
1065  tprintf("AM Matches = ");
1066 
1067  int top = blob->bounding_box().top();
1068  int bottom = blob->bounding_box().bottom();
1069  while (*ambiguities >= 0) {
1070  CLASS_ID class_id = *ambiguities;
1071 
1072  int_result.unichar_id = class_id;
1073  im_.Match(ClassForClassId(templates, class_id),
1075  int_features.size(), &int_features[0],
1076  &int_result,
1079 
1080  ExpandShapesAndApplyCorrections(nullptr, debug, class_id, bottom, top, 0,
1081  results->BlobLength,
1083  CharNormArray, &int_result, results);
1084  ambiguities++;
1085  }
1086  delete [] CharNormArray;
1087 } /* AmbigClassifier */
1088 
1089 /*---------------------------------------------------------------------------*/
1093  int16_t num_features,
1094  const INT_FEATURE_STRUCT* features,
1095  const uint8_t* norm_factors,
1096  ADAPT_CLASS* classes,
1097  int debug,
1098  int matcher_multiplier,
1099  const TBOX& blob_box,
1100  const GenericVector<CP_RESULT_STRUCT>& results,
1101  ADAPT_RESULTS* final_results) {
1102  int top = blob_box.top();
1103  int bottom = blob_box.bottom();
1104  UnicharRating int_result;
1105  for (int c = 0; c < results.size(); c++) {
1106  CLASS_ID class_id = results[c].Class;
1107  BIT_VECTOR protos = classes != nullptr ? classes[class_id]->PermProtos
1108  : AllProtosOn;
1109  BIT_VECTOR configs = classes != nullptr ? classes[class_id]->PermConfigs
1110  : AllConfigsOn;
1111 
1112  int_result.unichar_id = class_id;
1113  im_.Match(ClassForClassId(templates, class_id),
1114  protos, configs,
1115  num_features, features,
1116  &int_result, classify_adapt_feature_threshold, debug,
1118  bool is_debug = matcher_debug_level >= 2 || classify_debug_level > 1;
1119  ExpandShapesAndApplyCorrections(classes, is_debug, class_id, bottom, top,
1120  results[c].Rating,
1121  final_results->BlobLength,
1122  matcher_multiplier, norm_factors,
1123  &int_result, final_results);
1124  }
1125 }
1126 
1127 // Converts configs to fonts, and if the result is not adapted, and a
1128 // shape_table_ is present, the shape is expanded to include all
1129 // unichar_ids represented, before applying a set of corrections to the
1130 // distance rating in int_result, (see ComputeCorrectedRating.)
1131 // The results are added to the final_results output.
1133  ADAPT_CLASS* classes, bool debug, int class_id, int bottom, int top,
1134  float cp_rating, int blob_length, int matcher_multiplier,
1135  const uint8_t* cn_factors,
1136  UnicharRating* int_result, ADAPT_RESULTS* final_results) {
1137  if (classes != nullptr) {
1138  // Adapted result. Convert configs to fontinfo_ids.
1139  int_result->adapted = true;
1140  for (int f = 0; f < int_result->fonts.size(); ++f) {
1141  int_result->fonts[f].fontinfo_id =
1142  GetFontinfoId(classes[class_id], int_result->fonts[f].fontinfo_id);
1143  }
1144  } else {
1145  // Pre-trained result. Map fonts using font_sets_.
1146  int_result->adapted = false;
1147  for (int f = 0; f < int_result->fonts.size(); ++f) {
1148  int_result->fonts[f].fontinfo_id =
1150  int_result->fonts[f].fontinfo_id);
1151  }
1152  if (shape_table_ != nullptr) {
1153  // Two possible cases:
1154  // 1. Flat shapetable. All unichar-ids of the shapes referenced by
1155  // int_result->fonts are the same. In this case build a new vector of
1156  // mapped fonts and replace the fonts in int_result.
1157  // 2. Multi-unichar shapetable. Variable unichars in the shapes referenced
1158  // by int_result. In this case, build a vector of UnicharRating to
1159  // gather together different font-ids for each unichar. Also covers case1.
1160  GenericVector<UnicharRating> mapped_results;
1161  for (int f = 0; f < int_result->fonts.size(); ++f) {
1162  int shape_id = int_result->fonts[f].fontinfo_id;
1163  const Shape& shape = shape_table_->GetShape(shape_id);
1164  for (int c = 0; c < shape.size(); ++c) {
1165  int unichar_id = shape[c].unichar_id;
1166  if (!unicharset.get_enabled(unichar_id)) continue;
1167  // Find the mapped_result for unichar_id.
1168  int r = 0;
1169  for (r = 0; r < mapped_results.size() &&
1170  mapped_results[r].unichar_id != unichar_id; ++r) {}
1171  if (r == mapped_results.size()) {
1172  mapped_results.push_back(*int_result);
1173  mapped_results[r].unichar_id = unichar_id;
1174  mapped_results[r].fonts.truncate(0);
1175  }
1176  for (int i = 0; i < shape[c].font_ids.size(); ++i) {
1177  mapped_results[r].fonts.push_back(
1178  ScoredFont(shape[c].font_ids[i], int_result->fonts[f].score));
1179  }
1180  }
1181  }
1182  for (int m = 0; m < mapped_results.size(); ++m) {
1183  mapped_results[m].rating =
1184  ComputeCorrectedRating(debug, mapped_results[m].unichar_id,
1185  cp_rating, int_result->rating,
1186  int_result->feature_misses, bottom, top,
1187  blob_length, matcher_multiplier, cn_factors);
1188  AddNewResult(mapped_results[m], final_results);
1189  }
1190  return;
1191  }
1192  }
1193  if (unicharset.get_enabled(class_id)) {
1194  int_result->rating = ComputeCorrectedRating(debug, class_id, cp_rating,
1195  int_result->rating,
1196  int_result->feature_misses,
1197  bottom, top, blob_length,
1198  matcher_multiplier, cn_factors);
1199  AddNewResult(*int_result, final_results);
1200  }
1201 }
1202 
1203 // Applies a set of corrections to the confidence im_rating,
1204 // including the cn_correction, miss penalty and additional penalty
1205 // for non-alnums being vertical misfits. Returns the corrected confidence.
1206 double Classify::ComputeCorrectedRating(bool debug, int unichar_id,
1207  double cp_rating, double im_rating,
1208  int feature_misses,
1209  int bottom, int top,
1210  int blob_length, int matcher_multiplier,
1211  const uint8_t* cn_factors) {
1212  // Compute class feature corrections.
1213  double cn_corrected = im_.ApplyCNCorrection(1.0 - im_rating, blob_length,
1214  cn_factors[unichar_id],
1215  matcher_multiplier);
1216  double miss_penalty = tessedit_class_miss_scale * feature_misses;
1217  double vertical_penalty = 0.0;
1218  // Penalize non-alnums for being vertical misfits.
1219  if (!unicharset.get_isalpha(unichar_id) &&
1220  !unicharset.get_isdigit(unichar_id) &&
1221  cn_factors[unichar_id] != 0 && classify_misfit_junk_penalty > 0.0) {
1222  int min_bottom, max_bottom, min_top, max_top;
1223  unicharset.get_top_bottom(unichar_id, &min_bottom, &max_bottom,
1224  &min_top, &max_top);
1225  if (debug) {
1226  tprintf("top=%d, vs [%d, %d], bottom=%d, vs [%d, %d]\n",
1227  top, min_top, max_top, bottom, min_bottom, max_bottom);
1228  }
1229  if (top < min_top || top > max_top ||
1230  bottom < min_bottom || bottom > max_bottom) {
1231  vertical_penalty = classify_misfit_junk_penalty;
1232  }
1233  }
1234  double result = 1.0 - (cn_corrected + miss_penalty + vertical_penalty);
1235  if (result < WORST_POSSIBLE_RATING)
1236  result = WORST_POSSIBLE_RATING;
1237  if (debug) {
1238  tprintf("%s: %2.1f%%(CP%2.1f, IM%2.1f + CN%.2f(%d) + MP%2.1f + VP%2.1f)\n",
1239  unicharset.id_to_unichar(unichar_id),
1240  result * 100.0,
1241  cp_rating * 100.0,
1242  (1.0 - im_rating) * 100.0,
1243  (cn_corrected - (1.0 - im_rating)) * 100.0,
1244  cn_factors[unichar_id],
1245  miss_penalty * 100.0,
1246  vertical_penalty * 100.0);
1247  }
1248  return result;
1249 }
1250 
1251 /*---------------------------------------------------------------------------*/
1270  TBLOB *Blob, const GenericVector<INT_FEATURE_STRUCT>& int_features,
1271  const INT_FX_RESULT_STRUCT& fx_info,
1272  ADAPT_TEMPLATES Templates, ADAPT_RESULTS *Results) {
1273  if (int_features.empty()) return nullptr;
1274  uint8_t* CharNormArray = new uint8_t[unicharset.size()];
1275  ClearCharNormArray(CharNormArray);
1276 
1278  PruneClasses(Templates->Templates, int_features.size(), -1, &int_features[0],
1279  CharNormArray, BaselineCutoffs, &Results->CPResults);
1280 
1281  if (matcher_debug_level >= 2 || classify_debug_level > 1)
1282  tprintf("BL Matches = ");
1283 
1284  MasterMatcher(Templates->Templates, int_features.size(), &int_features[0],
1285  CharNormArray,
1286  Templates->Class, matcher_debug_flags, 0,
1287  Blob->bounding_box(), Results->CPResults, Results);
1288 
1289  delete [] CharNormArray;
1290  CLASS_ID ClassId = Results->best_unichar_id;
1291  if (ClassId == INVALID_UNICHAR_ID || Results->best_match_index < 0)
1292  return nullptr;
1293 
1294  return Templates->Class[ClassId]->
1295  Config[Results->match[Results->best_match_index].config].Perm->Ambigs;
1296 } /* BaselineClassifier */
1297 
1298 
1299 /*---------------------------------------------------------------------------*/
1316  const TrainingSample& sample,
1317  ADAPT_RESULTS *adapt_results) {
1318  // This is the length that is used for scaling ratings vs certainty.
1319  adapt_results->BlobLength =
1320  IntCastRounded(sample.outline_length() / kStandardFeatureLength);
1321  GenericVector<UnicharRating> unichar_results;
1322  static_classifier_->UnicharClassifySample(sample, blob->denorm().pix(), 0,
1323  -1, &unichar_results);
1324  // Convert results to the format used internally by AdaptiveClassifier.
1325  for (int r = 0; r < unichar_results.size(); ++r) {
1326  AddNewResult(unichar_results[r], adapt_results);
1327  }
1328  return sample.num_features();
1329 } /* CharNormClassifier */
1330 
1331 // As CharNormClassifier, but operates on a TrainingSample and outputs to
1332 // a GenericVector of UnicharRating without conversion to classes.
1334  int keep_this,
1335  const TrainingSample& sample,
1336  GenericVector<UnicharRating>* results) {
1337  results->clear();
1338  ADAPT_RESULTS* adapt_results = new ADAPT_RESULTS();
1339  adapt_results->Initialize();
1340  // Compute the bounding box of the features.
1341  uint32_t num_features = sample.num_features();
1342  // Only the top and bottom of the blob_box are used by MasterMatcher, so
1343  // fabricate right and left using top and bottom.
1344  TBOX blob_box(sample.geo_feature(GeoBottom), sample.geo_feature(GeoBottom),
1345  sample.geo_feature(GeoTop), sample.geo_feature(GeoTop));
1346  // Compute the char_norm_array from the saved cn_feature.
1347  FEATURE norm_feature = sample.GetCNFeature();
1348  uint8_t* char_norm_array = new uint8_t[unicharset.size()];
1349  int num_pruner_classes = std::max(unicharset.size(),
1351  uint8_t* pruner_norm_array = new uint8_t[num_pruner_classes];
1352  adapt_results->BlobLength =
1353  static_cast<int>(ActualOutlineLength(norm_feature) * 20 + 0.5);
1354  ComputeCharNormArrays(norm_feature, PreTrainedTemplates, char_norm_array,
1355  pruner_norm_array);
1356 
1357  PruneClasses(PreTrainedTemplates, num_features, keep_this, sample.features(),
1358  pruner_norm_array,
1359  shape_table_ != nullptr ? &shapetable_cutoffs_[0] : CharNormCutoffs,
1360  &adapt_results->CPResults);
1361  delete [] pruner_norm_array;
1362  if (keep_this >= 0) {
1363  adapt_results->CPResults[0].Class = keep_this;
1364  adapt_results->CPResults.truncate(1);
1365  }
1366  if (pruner_only) {
1367  // Convert pruner results to output format.
1368  for (int i = 0; i < adapt_results->CPResults.size(); ++i) {
1369  int class_id = adapt_results->CPResults[i].Class;
1370  results->push_back(
1371  UnicharRating(class_id, 1.0f - adapt_results->CPResults[i].Rating));
1372  }
1373  } else {
1374  MasterMatcher(PreTrainedTemplates, num_features, sample.features(),
1375  char_norm_array,
1376  nullptr, matcher_debug_flags,
1378  blob_box, adapt_results->CPResults, adapt_results);
1379  // Convert master matcher results to output format.
1380  for (int i = 0; i < adapt_results->match.size(); i++) {
1381  results->push_back(adapt_results->match[i]);
1382  }
1384  }
1385  delete [] char_norm_array;
1386  delete adapt_results;
1387  return num_features;
1388 } /* CharNormTrainingSample */
1389 
1390 
1391 /*---------------------------------------------------------------------------*/
1404  float rating = results->BlobLength / matcher_avg_noise_size;
1405  rating *= rating;
1406  rating /= 1.0 + rating;
1407 
1408  AddNewResult(UnicharRating(UNICHAR_SPACE, 1.0f - rating), results);
1409 } /* ClassifyAsNoise */
1410 
// Walks Results->match in order and appends one BLOB_CHOICE per accepted
// match to Choices.
//   denorm, box: passed to DENORM::XHeightRange to obtain the x-height range
//                and y-shift recorded with each choice.
//   Results:     supplies the matches and BlobLength; its match vector is
//                truncated to the number of emitted choices before returning.
//   Choices:     output list; must not be nullptr (asserted).
// A match is skipped when:
//   * it is a fragment that would take the last available slot while no
//     non-fragment has been emitted yet, or
//   * it is an adapted result whose certainty, divided by
//     classify_adapted_pruning_factor, is below the best certainty so far.
// At most max_matches choices are emitted: MAX_MATCHES, or twice the largest
// shape in shape_table_ if that is larger.
1417 void Classify::ConvertMatchesToChoices(const DENORM& denorm, const TBOX& box,
1418  ADAPT_RESULTS *Results,
1419  BLOB_CHOICE_LIST *Choices) {
1420  assert(Choices != nullptr);
1421  float Rating;
1422  float Certainty;
1423  BLOB_CHOICE_IT temp_it;
1424  bool contains_nonfrag = false;
1425  temp_it.set_to_list(Choices);
1426  int choices_length = 0;
1427  // With no shape_table_ maintain the previous MAX_MATCHES as the maximum
1428  // number of returned results, but with a shape_table_ we want to have room
1429  // for at least the biggest shape (which might contain hundreds of Indic
1430  // grapheme fragments) and more, so use double the size of the biggest shape
1431  // if that is more than the default.
1432  int max_matches = MAX_MATCHES;
1433  if (shape_table_ != nullptr) {
1434  max_matches = shape_table_->MaxNumUnichars() * 2;
1435  if (max_matches < MAX_MATCHES)
1436  max_matches = MAX_MATCHES;
1437  }
1438 
1439  float best_certainty = -FLT_MAX;
1440  for (int i = 0; i < Results->match.size(); i++) {
1441  const UnicharRating& result = Results->match[i];
1442  bool adapted = result.adapted;
1443  bool current_is_frag = (unicharset.get_fragment(result.unichar_id) != nullptr);
1444  if (temp_it.length()+1 == max_matches &&
1445  !contains_nonfrag && current_is_frag) {
1446  continue; // look for a non-fragmented character to fill the
1447  // last spot in Choices if only fragments are present
1448  }
1449  // BlobLength can never be legally 0, this means recognition failed.
1450  // But we must return a classification result because some invoking
1451  // functions (chopper/permuter) do not anticipate a null blob choice.
1452  // So we need to assign a poor, but not infinitely bad score.
1453  if (Results->BlobLength == 0) {
1454  Certainty = -20;
1455  Rating = 100; // should be -certainty * real_blob_length
1456  } else {
1457  Rating = Certainty = (1.0f - result.rating);
1458  Rating *= rating_scale * Results->BlobLength;
1459  Certainty *= -(getDict().certainty_scale);
1460  }
1461  // Adapted results, by their very nature, should have good certainty.
1462  // Those that don't are at best misleading, and often lead to errors,
1463  // so don't accept adapted results that are too far behind the best result,
1464  // whether adapted or static.
1465  // TODO(rays) find some way of automatically tuning these constants.
1466  if (Certainty > best_certainty) {
1467  best_certainty = std::min(Certainty, static_cast<float>(classify_adapted_pruning_threshold));
1468  } else if (adapted &&
1469  Certainty / classify_adapted_pruning_factor < best_certainty) {
1470  continue; // Don't accept bad adapted results.
1471  }
1472 
1473  float min_xheight, max_xheight, yshift;
1474  denorm.XHeightRange(result.unichar_id, unicharset, box,
1475  &min_xheight, &max_xheight, &yshift);
1476  BLOB_CHOICE* choice =
1477  new BLOB_CHOICE(result.unichar_id, Rating, Certainty,
1479  min_xheight, max_xheight, yshift,
1480  adapted ? BCC_ADAPTED_CLASSIFIER
1482  choice->set_fonts(result.fonts);
1483  temp_it.add_to_end(choice);
1484  contains_nonfrag |= !current_is_frag; // update contains_nonfrag
1485  choices_length++;
1486  if (choices_length >= max_matches) break;
1487  }
// NOTE(review): this keeps the first choices_length entries of match. When
// entries were skipped by the `continue` statements above, those need not be
// the entries that produced choices - confirm this is intended.
1488  Results->match.truncate(choices_length);
1489 } // ConvertMatchesToChoices
1490 
1491 
1492 /*---------------------------------------------------------------------------*/
1493 #ifndef GRAPHICS_DISABLED
1494 
1502  ADAPT_RESULTS *Results) {
1503  if (static_classifier_ == nullptr) return;
1504  INT_FX_RESULT_STRUCT fx_info;
1507  BlobToTrainingSample(*blob, false, &fx_info, &bl_features);
1508  if (sample == nullptr) return;
1509  static_classifier_->DebugDisplay(*sample, blob->denorm().pix(),
1510  Results->best_unichar_id);
1511 } /* DebugAdaptiveClassifier */
1512 #endif
1513 
1514 /*---------------------------------------------------------------------------*/
1535  UNICHAR_ID *Ambiguities;
1536 
1537  INT_FX_RESULT_STRUCT fx_info;
1541  &bl_features);
1542  if (sample == nullptr) return;
1543 
1544  // TODO: With LSTM, static_classifier_ is nullptr.
1545  // Return to avoid crash in CharNormClassifier.
1546  if (static_classifier_ == nullptr) {
1547  delete sample;
1548  return;
1549  }
1550 
1552  tess_cn_matching) {
1553  CharNormClassifier(Blob, *sample, Results);
1554  } else {
1555  Ambiguities = BaselineClassifier(Blob, bl_features, fx_info,
1556  AdaptedTemplates, Results);
1557  if ((!Results->match.empty() &&
1558  MarginalMatch(Results->best_rating,
1560  !tess_bn_matching) ||
1561  Results->match.empty()) {
1562  CharNormClassifier(Blob, *sample, Results);
1563  } else if (Ambiguities && *Ambiguities >= 0 && !tess_bn_matching) {
1564  AmbigClassifier(bl_features, fx_info, Blob,
1567  Ambiguities,
1568  Results);
1569  }
1570  }
1571 
1572  // Force the blob to be classified as noise
1573  // if the results contain only fragments.
1574  // TODO(daria): verify that this is better than
1575  // just adding a nullptr classification.
1576  if (!Results->HasNonfragment || Results->match.empty())
1577  ClassifyAsNoise(Results);
1578  delete sample;
1579 } /* DoAdaptiveMatch */
1580 
1581 /*---------------------------------------------------------------------------*/
1597  CLASS_ID CorrectClass) {
1598  ADAPT_RESULTS *Results = new ADAPT_RESULTS();
1599  UNICHAR_ID *Ambiguities;
1600  int i;
1601 
1602  Results->Initialize();
1603  INT_FX_RESULT_STRUCT fx_info;
1607  &bl_features);
1608  if (sample == nullptr) {
1609  delete Results;
1610  return nullptr;
1611  }
1612 
1613  CharNormClassifier(Blob, *sample, Results);
1614  delete sample;
1615  RemoveBadMatches(Results);
1617 
1618  /* copy the class ids into a string of ambiguities - don't copy if
1619  the correct class is the only class id matched */
1620  Ambiguities = new UNICHAR_ID[Results->match.size() + 1];
1621  if (Results->match.size() > 1 ||
1622  (Results->match.size() == 1 &&
1623  Results->match[0].unichar_id != CorrectClass)) {
1624  for (i = 0; i < Results->match.size(); i++)
1625  Ambiguities[i] = Results->match[i].unichar_id;
1626  Ambiguities[i] = -1;
1627  } else {
1628  Ambiguities[0] = -1;
1629  }
1630 
1631  delete Results;
1632  return Ambiguities;
1633 } /* GetAmbiguities */
1634 
1635 // Returns true if the given blob looks too dissimilar to any character
1636 // present in the classifier templates.
1638  BLOB_CHOICE_LIST *ratings = new BLOB_CHOICE_LIST();
1639  AdaptiveClassifier(blob, ratings);
1640  BLOB_CHOICE_IT ratings_it(ratings);
1643  print_ratings_list("======================\nLooksLikeGarbage() got ",
1644  ratings, unicharset);
1645  }
1646  for (ratings_it.mark_cycle_pt(); !ratings_it.cycled_list();
1647  ratings_it.forward()) {
1648  if (unicharset.get_fragment(ratings_it.data()->unichar_id()) != nullptr) {
1649  continue;
1650  }
1651  float certainty = ratings_it.data()->certainty();
1652  delete ratings;
1653  return certainty <
1655  }
1656  delete ratings;
1657  return true; // no whole characters in ratings
1658 }
1659 
1660 /*---------------------------------------------------------------------------*/
1683  INT_TEMPLATES templates,
1684  uint8_t* pruner_norm_array,
1685  uint8_t* char_norm_array) {
1686  FEATURE norm_feature = NewFeature(&CharNormDesc);
1687  float baseline = kBlnBaselineOffset;
1688  float scale = MF_SCALE_FACTOR;
1689  norm_feature->Params[CharNormY] = (fx_info.Ymean - baseline) * scale;
1690  norm_feature->Params[CharNormLength] =
1691  fx_info.Length * scale / LENGTH_COMPRESSION;
1692  norm_feature->Params[CharNormRx] = fx_info.Rx * scale;
1693  norm_feature->Params[CharNormRy] = fx_info.Ry * scale;
1694  // Deletes norm_feature.
1695  ComputeCharNormArrays(norm_feature, templates, char_norm_array,
1696  pruner_norm_array);
1697  return IntCastRounded(fx_info.Length / kStandardFeatureLength);
1698 } /* GetCharNormFeature */
1699 
1700 // Computes the char_norm_array for the unicharset and, if not nullptr, the
1701 // pruner_array as appropriate according to the existence of the shape_table.
1703  INT_TEMPLATES_STRUCT* templates,
1704  uint8_t* char_norm_array,
1705  uint8_t* pruner_array) {
1706  ComputeIntCharNormArray(*norm_feature, char_norm_array);
1707  if (pruner_array != nullptr) {
1708  if (shape_table_ == nullptr) {
1709  ComputeIntCharNormArray(*norm_feature, pruner_array);
1710  } else {
1711  memset(pruner_array, UINT8_MAX,
1712  templates->NumClasses * sizeof(pruner_array[0]));
1713  // Each entry in the pruner norm array is the MIN of all the entries of
1714  // the corresponding unichars in the CharNormArray.
1715  for (int id = 0; id < templates->NumClasses; ++id) {
1716  int font_set_id = templates->Class[id]->font_set_id;
1717  const FontSet &fs = fontset_table_.get(font_set_id);
1718  for (int config = 0; config < fs.size; ++config) {
1719  const Shape& shape = shape_table_->GetShape(fs.configs[config]);
1720  for (int c = 0; c < shape.size(); ++c) {
1721  if (char_norm_array[shape[c].unichar_id] < pruner_array[id])
1722  pruner_array[id] = char_norm_array[shape[c].unichar_id];
1723  }
1724  }
1725  }
1726  }
1727  }
1728  FreeFeature(norm_feature);
1729 }
1730 
1731 /*---------------------------------------------------------------------------*/
1745  CLASS_ID ClassId,
1746  int FontinfoId,
1747  int NumFeatures,
1748  INT_FEATURE_ARRAY Features,
1749  FEATURE_SET FloatFeatures) {
1750  INT_CLASS IClass;
1751  ADAPT_CLASS Class;
1752  PROTO_ID OldProtos[MAX_NUM_PROTOS];
1753  FEATURE_ID BadFeatures[MAX_NUM_INT_FEATURES];
1754  int NumOldProtos;
1755  int NumBadFeatures;
1756  int MaxProtoId, OldMaxProtoId;
1757  int BlobLength = 0;
1758  int MaskSize;
1759  int ConfigId;
1761  int i;
1762  int debug_level = NO_DEBUG;
1763 
1765  debug_level =
1767 
1768  IClass = ClassForClassId(Templates->Templates, ClassId);
1769  Class = Templates->Class[ClassId];
1770 
1771  if (IClass->NumConfigs >= MAX_NUM_CONFIGS) {
1772  ++NumAdaptationsFailed;
1774  cprintf("Cannot make new temporary config: maximum number exceeded.\n");
1775  return -1;
1776  }
1777 
1778  OldMaxProtoId = IClass->NumProtos - 1;
1779 
1780  NumOldProtos = im_.FindGoodProtos(IClass, AllProtosOn, AllConfigsOff,
1781  BlobLength, NumFeatures, Features,
1782  OldProtos, classify_adapt_proto_threshold,
1783  debug_level);
1784 
1785  MaskSize = WordsInVectorOfSize(MAX_NUM_PROTOS);
1786  zero_all_bits(TempProtoMask, MaskSize);
1787  for (i = 0; i < NumOldProtos; i++)
1788  SET_BIT(TempProtoMask, OldProtos[i]);
1789 
1790  NumBadFeatures = im_.FindBadFeatures(IClass, TempProtoMask, AllConfigsOn,
1791  BlobLength, NumFeatures, Features,
1792  BadFeatures,
1794  debug_level);
1795 
1796  MaxProtoId = MakeNewTempProtos(FloatFeatures, NumBadFeatures, BadFeatures,
1797  IClass, Class, TempProtoMask);
1798  if (MaxProtoId == NO_PROTO) {
1799  ++NumAdaptationsFailed;
1801  cprintf("Cannot make new temp protos: maximum number exceeded.\n");
1802  return -1;
1803  }
1804 
1805  ConfigId = AddIntConfig(IClass);
1806  ConvertConfig(TempProtoMask, ConfigId, IClass);
1807  Config = NewTempConfig(MaxProtoId, FontinfoId);
1808  TempConfigFor(Class, ConfigId) = Config;
1809  copy_all_bits(TempProtoMask, Config->Protos, Config->ProtoVectorSize);
1810 
1812  cprintf("Making new temp config %d fontinfo id %d"
1813  " using %d old and %d new protos.\n",
1814  ConfigId, Config->FontinfoId,
1815  NumOldProtos, MaxProtoId - OldMaxProtoId);
1816 
1817  return ConfigId;
1818 } /* MakeNewTemporaryConfig */
1819 
1820 /*---------------------------------------------------------------------------*/
1840  int NumBadFeat,
1841  FEATURE_ID BadFeat[],
1842  INT_CLASS IClass,
1843  ADAPT_CLASS Class,
1844  BIT_VECTOR TempProtoMask) {
1845  FEATURE_ID *ProtoStart;
1846  FEATURE_ID *ProtoEnd;
1847  FEATURE_ID *LastBad;
1848  TEMP_PROTO TempProto;
1849  PROTO Proto;
1850  FEATURE F1, F2;
1851  float X1, X2, Y1, Y2;
1852  float A1, A2, AngleDelta;
1853  float SegmentLength;
1854  PROTO_ID Pid;
1855 
1856  for (ProtoStart = BadFeat, LastBad = ProtoStart + NumBadFeat;
1857  ProtoStart < LastBad; ProtoStart = ProtoEnd) {
1858  F1 = Features->Features[*ProtoStart];
1859  X1 = F1->Params[PicoFeatX];
1860  Y1 = F1->Params[PicoFeatY];
1861  A1 = F1->Params[PicoFeatDir];
1862 
1863  for (ProtoEnd = ProtoStart + 1,
1864  SegmentLength = GetPicoFeatureLength();
1865  ProtoEnd < LastBad;
1866  ProtoEnd++, SegmentLength += GetPicoFeatureLength()) {
1867  F2 = Features->Features[*ProtoEnd];
1868  X2 = F2->Params[PicoFeatX];
1869  Y2 = F2->Params[PicoFeatY];
1870  A2 = F2->Params[PicoFeatDir];
1871 
1872  AngleDelta = fabs(A1 - A2);
1873  if (AngleDelta > 0.5)
1874  AngleDelta = 1.0 - AngleDelta;
1875 
1876  if (AngleDelta > matcher_clustering_max_angle_delta ||
1877  fabs(X1 - X2) > SegmentLength ||
1878  fabs(Y1 - Y2) > SegmentLength)
1879  break;
1880  }
1881 
1882  F2 = Features->Features[*(ProtoEnd - 1)];
1883  X2 = F2->Params[PicoFeatX];
1884  Y2 = F2->Params[PicoFeatY];
1885  A2 = F2->Params[PicoFeatDir];
1886 
1887  Pid = AddIntProto(IClass);
1888  if (Pid == NO_PROTO)
1889  return (NO_PROTO);
1890 
1891  TempProto = NewTempProto();
1892  Proto = &(TempProto->Proto);
1893 
1894  /* compute proto params - NOTE that Y_DIM_OFFSET must be used because
1895  ConvertProto assumes that the Y dimension varies from -0.5 to 0.5
1896  instead of the -0.25 to 0.75 used in baseline normalization */
1897  Proto->Length = SegmentLength;
1898  Proto->Angle = A1;
1899  Proto->X = (X1 + X2) / 2.0;
1900  Proto->Y = (Y1 + Y2) / 2.0 - Y_DIM_OFFSET;
1901  FillABC(Proto);
1902 
1903  TempProto->ProtoId = Pid;
1904  SET_BIT(TempProtoMask, Pid);
1905 
1906  ConvertProto(Proto, Pid, IClass);
1907  AddProtoToProtoPruner(Proto, Pid, IClass,
1909 
1910  Class->TempProtos = push(Class->TempProtos, TempProto);
1911  }
1912  return IClass->NumProtos - 1;
1913 } /* MakeNewTempProtos */
1914 
1915 /*---------------------------------------------------------------------------*/
1926  CLASS_ID ClassId,
1927  int ConfigId,
1928  TBLOB *Blob) {
1929  UNICHAR_ID *Ambigs;
1931  ADAPT_CLASS Class;
1932  PROTO_KEY ProtoKey;
1933 
1934  Class = Templates->Class[ClassId];
1935  Config = TempConfigFor(Class, ConfigId);
1936 
1937  MakeConfigPermanent(Class, ConfigId);
1938  if (Class->NumPermConfigs == 0)
1939  Templates->NumPermClasses++;
1940  Class->NumPermConfigs++;
1941 
1942  // Initialize permanent config.
1943  Ambigs = GetAmbiguities(Blob, ClassId);
1944  PERM_CONFIG Perm = (PERM_CONFIG)malloc(sizeof(PERM_CONFIG_STRUCT));
1945  Perm->Ambigs = Ambigs;
1946  Perm->FontinfoId = Config->FontinfoId;
1947 
1948  // Free memory associated with temporary config (since ADAPTED_CONFIG
1949  // is a union we need to clean up before we record permanent config).
1950  ProtoKey.Templates = Templates;
1951  ProtoKey.ClassId = ClassId;
1952  ProtoKey.ConfigId = ConfigId;
1953  Class->TempProtos = delete_d(Class->TempProtos, &ProtoKey, MakeTempProtoPerm);
1955 
1956  // Record permanent config.
1957  PermConfigFor(Class, ConfigId) = Perm;
1958 
1959  if (classify_learning_debug_level >= 1) {
1960  tprintf("Making config %d for %s (ClassId %d) permanent:"
1961  " fontinfo id %d, ambiguities '",
1962  ConfigId, getDict().getUnicharset().debug_str(ClassId).string(),
1963  ClassId, PermConfigFor(Class, ConfigId)->FontinfoId);
1964  for (UNICHAR_ID *AmbigsPointer = Ambigs;
1965  *AmbigsPointer >= 0; ++AmbigsPointer)
1966  tprintf("%s", unicharset.id_to_unichar(*AmbigsPointer));
1967  tprintf("'.\n");
1968  }
1969 } /* MakePermanent */
1970 } // namespace tesseract
1971 
1972 /*---------------------------------------------------------------------------*/
// Callback passed to delete_d() over a class's TempProtos list (see
// MakePermanent). Decides whether one temporary proto belongs to the config
// that is being made permanent and, if so, converts it.
//   item1: the list element, interpreted as a TEMP_PROTO.
//   item2: a PROTO_KEY* naming the adapted templates, class id and config id.
// Returns FALSE, leaving the proto untouched, when its ProtoId is beyond the
// config's MaxProtoId or its bit is not set in the config's Protos vector.
// Otherwise marks the proto permanent in the class, adds it to the class
// pruner of the underlying templates, frees the TEMP_PROTO and returns TRUE.
// NOTE(review): presumably delete_d() unlinks the entries for which this
// returns TRUE - confirm against the list implementation.
1985 int MakeTempProtoPerm(void *item1, void *item2) {
1986  ADAPT_CLASS Class;
1988  TEMP_PROTO TempProto;
1989  PROTO_KEY *ProtoKey;
1990 
1991  TempProto = (TEMP_PROTO) item1;
1992  ProtoKey = (PROTO_KEY *) item2;
1993 
1994  Class = ProtoKey->Templates->Class[ProtoKey->ClassId];
1995  Config = TempConfigFor(Class, ProtoKey->ConfigId);
1996 
// Protos that are not part of this config stay temporary.
1997  if (TempProto->ProtoId > Config->MaxProtoId ||
1998  !test_bit (Config->Protos, TempProto->ProtoId))
1999  return FALSE;
2000 
2001  MakeProtoPermanent(Class, TempProto->ProtoId);
2002  AddProtoToClassPruner(&(TempProto->Proto), ProtoKey->ClassId,
2003  ProtoKey->Templates->Templates);
2004  FreeTempProto(TempProto);
2005 
2006  return TRUE;
2007 } /* MakeTempProtoPerm */
2008 
2009 /*---------------------------------------------------------------------------*/
2010 namespace tesseract {
2019  for (int i = 0; i < results.match.size(); ++i) {
2020  tprintf("%s ", unicharset.debug_str(results.match[i].unichar_id).string());
2021  results.match[i].Print();
2022  }
2023 } /* PrintAdaptiveMatchResults */
2024 
2025 /*---------------------------------------------------------------------------*/
2039  int Next, NextGood;
2040  float BadMatchThreshold;
2041  static const char* romans = "i v x I V X";
2042  BadMatchThreshold = Results->best_rating - matcher_bad_match_pad;
2043 
2045  UNICHAR_ID unichar_id_one = unicharset.contains_unichar("1") ?
2046  unicharset.unichar_to_id("1") : -1;
2047  UNICHAR_ID unichar_id_zero = unicharset.contains_unichar("0") ?
2048  unicharset.unichar_to_id("0") : -1;
2049  float scored_one = ScoredUnichar(unichar_id_one, *Results);
2050  float scored_zero = ScoredUnichar(unichar_id_zero, *Results);
2051 
2052  for (Next = NextGood = 0; Next < Results->match.size(); Next++) {
2053  const UnicharRating& match = Results->match[Next];
2054  if (match.rating >= BadMatchThreshold) {
2055  if (!unicharset.get_isalpha(match.unichar_id) ||
2056  strstr(romans,
2057  unicharset.id_to_unichar(match.unichar_id)) != nullptr) {
2058  } else if (unicharset.eq(match.unichar_id, "l") &&
2059  scored_one < BadMatchThreshold) {
2060  Results->match[Next].unichar_id = unichar_id_one;
2061  } else if (unicharset.eq(match.unichar_id, "O") &&
2062  scored_zero < BadMatchThreshold) {
2063  Results->match[Next].unichar_id = unichar_id_zero;
2064  } else {
2065  Results->match[Next].unichar_id = INVALID_UNICHAR_ID; // Don't copy.
2066  }
2067  if (Results->match[Next].unichar_id != INVALID_UNICHAR_ID) {
2068  if (NextGood == Next) {
2069  ++NextGood;
2070  } else {
2071  Results->match[NextGood++] = Results->match[Next];
2072  }
2073  }
2074  }
2075  }
2076  } else {
2077  for (Next = NextGood = 0; Next < Results->match.size(); Next++) {
2078  if (Results->match[Next].rating >= BadMatchThreshold) {
2079  if (NextGood == Next) {
2080  ++NextGood;
2081  } else {
2082  Results->match[NextGood++] = Results->match[Next];
2083  }
2084  }
2085  }
2086  }
2087  Results->match.truncate(NextGood);
2088 } /* RemoveBadMatches */
2089 
2090 /*----------------------------------------------------------------------------*/
2099  int Next, NextGood;
2100  int punc_count; /*no of garbage characters */
2101  int digit_count;
2102  /*garbage characters */
2103  static char punc_chars[] = ". , ; : / ` ~ ' - = \\ | \" ! _ ^";
2104  static char digit_chars[] = "0 1 2 3 4 5 6 7 8 9";
2105 
2106  punc_count = 0;
2107  digit_count = 0;
2108  for (Next = NextGood = 0; Next < Results->match.size(); Next++) {
2109  const UnicharRating& match = Results->match[Next];
2110  bool keep = true;
2111  if (strstr(punc_chars,
2112  unicharset.id_to_unichar(match.unichar_id)) != nullptr) {
2113  if (punc_count >= 2)
2114  keep = false;
2115  punc_count++;
2116  } else {
2117  if (strstr(digit_chars,
2118  unicharset.id_to_unichar(match.unichar_id)) != nullptr) {
2119  if (digit_count >= 1)
2120  keep = false;
2121  digit_count++;
2122  }
2123  }
2124  if (keep) {
2125  if (NextGood == Next) {
2126  ++NextGood;
2127  } else {
2128  Results->match[NextGood++] = match;
2129  }
2130  }
2131  }
2132  Results->match.truncate(NextGood);
2133 } /* RemoveExtraPuncs */
2134 
2135 /*---------------------------------------------------------------------------*/
2146 void Classify::SetAdaptiveThreshold(float Threshold) {
2147  Threshold = (Threshold == matcher_good_threshold) ? 0.9: (1.0 - Threshold);
2149  ClipToRange<int>(255 * Threshold, 0, 255));
2151  ClipToRange<int>(255 * Threshold, 0, 255));
2152 } /* SetAdaptiveThreshold */
2153 
2154 /*---------------------------------------------------------------------------*/
2164 void Classify::ShowBestMatchFor(int shape_id,
2165  const INT_FEATURE_STRUCT* features,
2166  int num_features) {
2167 #ifndef GRAPHICS_DISABLED
2168  uint32_t config_mask;
2169  if (UnusedClassIdIn(PreTrainedTemplates, shape_id)) {
2170  tprintf("No built-in templates for class/shape %d\n", shape_id);
2171  return;
2172  }
2173  if (num_features <= 0) {
2174  tprintf("Illegal blob (char norm features)!\n");
2175  return;
2176  }
2177  UnicharRating cn_result;
2178  classify_norm_method.set_value(character);
2181  num_features, features, &cn_result,
2184  tprintf("\n");
2185  config_mask = 1 << cn_result.config;
2186 
2187  tprintf("Static Shape ID: %d\n", shape_id);
2188  ShowMatchDisplay();
2190  &config_mask, num_features, features, &cn_result,
2194 #endif // GRAPHICS_DISABLED
2195 } /* ShowBestMatchFor */
2196 
2197 // Returns a string for the classifier class_id: either the corresponding
2198 // unicharset debug_str or the shape_table_ debug str.
2200  int class_id, int config_id) const {
2201  STRING class_string;
2202  if (templates == PreTrainedTemplates && shape_table_ != nullptr) {
2203  int shape_id = ClassAndConfigIDToFontOrShapeID(class_id, config_id);
2204  class_string = shape_table_->DebugStr(shape_id);
2205  } else {
2206  class_string = unicharset.debug_str(class_id);
2207  }
2208  return class_string;
2209 }
2210 
2211 // Converts a classifier class_id index to a shape_table_ index
2213  int int_result_config) const {
2214  int font_set_id = PreTrainedTemplates->Class[class_id]->font_set_id;
2215  // Older inttemps have no font_ids.
2216  if (font_set_id < 0)
2217  return kBlankFontinfoId;
2218  const FontSet &fs = fontset_table_.get(font_set_id);
2219  ASSERT_HOST(int_result_config >= 0 && int_result_config < fs.size);
2220  return fs.configs[int_result_config];
2221 }
2222 
2223 // Converts a shape_table_ index to a classifier class_id index (not a
2224 // unichar-id!). Uses a search, so not fast.
2225 int Classify::ShapeIDToClassID(int shape_id) const {
2226  for (int id = 0; id < PreTrainedTemplates->NumClasses; ++id) {
2227  int font_set_id = PreTrainedTemplates->Class[id]->font_set_id;
2228  ASSERT_HOST(font_set_id >= 0);
2229  const FontSet &fs = fontset_table_.get(font_set_id);
2230  for (int config = 0; config < fs.size; ++config) {
2231  if (fs.configs[config] == shape_id)
2232  return id;
2233  }
2234  }
2235  tprintf("Shape %d not found\n", shape_id);
2236  return -1;
2237 }
2238 
2239 // Returns true if the given TEMP_CONFIG is good enough to make it
2240 // a permanent config.
2242  const TEMP_CONFIG &config) {
2243  if (classify_learning_debug_level >= 1) {
2244  tprintf("NumTimesSeen for config of %s is %d\n",
2245  getDict().getUnicharset().debug_str(class_id).string(),
2246  config->NumTimesSeen);
2247  }
2249  return true;
2250  } else if (config->NumTimesSeen < matcher_min_examples_for_prototyping) {
2251  return false;
2252  } else if (use_ambigs_for_adaption) {
2253  // Go through the ambigs vector and see whether we have already seen
2254  // enough times all the characters represented by the ambigs vector.
2255  const UnicharIdVector *ambigs =
2257  int ambigs_size = (ambigs == nullptr) ? 0 : ambigs->size();
2258  for (int ambig = 0; ambig < ambigs_size; ++ambig) {
2259  ADAPT_CLASS ambig_class = AdaptedTemplates->Class[(*ambigs)[ambig]];
2260  assert(ambig_class != nullptr);
2261  if (ambig_class->NumPermConfigs == 0 &&
2262  ambig_class->MaxNumTimesSeen <
2264  if (classify_learning_debug_level >= 1) {
2265  tprintf("Ambig %s has not been seen enough times,"
2266  " not making config for %s permanent\n",
2267  getDict().getUnicharset().debug_str(
2268  (*ambigs)[ambig]).string(),
2269  getDict().getUnicharset().debug_str(class_id).string());
2270  }
2271  return false;
2272  }
2273  }
2274  }
2275  return true;
2276 }
2277 
2279  const UnicharIdVector *ambigs =
2281  int ambigs_size = (ambigs == nullptr) ? 0 : ambigs->size();
2282  if (classify_learning_debug_level >= 1) {
2283  tprintf("Running UpdateAmbigsGroup for %s class_id=%d\n",
2284  getDict().getUnicharset().debug_str(class_id).string(), class_id);
2285  }
2286  for (int ambig = 0; ambig < ambigs_size; ++ambig) {
2287  CLASS_ID ambig_class_id = (*ambigs)[ambig];
2288  const ADAPT_CLASS ambigs_class = AdaptedTemplates->Class[ambig_class_id];
2289  for (int cfg = 0; cfg < MAX_NUM_CONFIGS; ++cfg) {
2290  if (ConfigIsPermanent(ambigs_class, cfg)) continue;
2291  const TEMP_CONFIG config =
2292  TempConfigFor(AdaptedTemplates->Class[ambig_class_id], cfg);
2293  if (config != nullptr && TempConfigReliable(ambig_class_id, config)) {
2294  if (classify_learning_debug_level >= 1) {
2295  tprintf("Making config %d of %s permanent\n", cfg,
2296  getDict().getUnicharset().debug_str(
2297  ambig_class_id).string());
2298  }
2299  MakePermanent(AdaptedTemplates, ambig_class_id, cfg, Blob);
2300  }
2301  }
2302  }
2303 }
2304 
2305 } // namespace tesseract
void InitMatcherRatings(float *Rating)
#define MAX_MATCHES
Definition: adaptmatch.cpp:78
bool PiecesAllNatural(int start, int count) const
Definition: pageres.cpp:1084
TrainingSample * BlobToTrainingSample(const TBLOB &blob, bool nonlinear_norm, INT_FX_RESULT_STRUCT *fx_info, GenericVector< INT_FEATURE_STRUCT > *bl_features)
Definition: intfx.cpp:79
void ClassifyAsNoise(ADAPT_RESULTS *Results)
bool LargeSpeckle(const TBLOB &blob)
Definition: classify.cpp:256
static void SetupBLCNDenorms(const TBLOB &blob, bool nonlinear_norm, DENORM *bl_denorm, DENORM *cn_denorm, INT_FX_RESULT_STRUCT *fx_info)
Definition: intfx.cpp:131
double tessedit_class_miss_scale
Definition: classify.h:480
ADAPT_TEMPLATES ReadAdaptedTemplates(TFile *File)
Definition: adaptive.cpp:333
int32_t BlobLength
Definition: adaptmatch.cpp:93
TWERD * rebuild_word
Definition: pageres.h:260
int UNICHAR_ID
Definition: unichar.h:35
CLUSTERCONFIG Config
void InitAdaptiveClassifier(TessdataManager *mgr)
Definition: adaptmatch.cpp:528
int size() const
Definition: genericvector.h:71
void ComputeIntFeatures(FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures)
Definition: float2int.cpp:90
double segment_penalty_dict_case_ok
Definition: dict.h:588
int matcher_permanent_classes_min
Definition: classify.h:467
void Initialize()
Definition: adaptmatch.cpp:103
#define TRUE
Definition: capi.h:51
void RefreshDebugWindow(ScrollView **win, const char *msg, int y_offset, const TBOX &wbox)
Definition: adaptmatch.cpp:227
int GetCharNormFeature(const INT_FX_RESULT_STRUCT &fx_info, INT_TEMPLATES templates, uint8_t *pruner_norm_array, uint8_t *char_norm_array)
bool GetComponent(TessdataType type, TFile *fp)
void UpdateMatchDisplay()
Definition: intproto.cpp:451
void cprintf(const char *format,...)
Definition: callcpp.cpp:33
void LearnPieces(const char *fontname, int start, int length, float threshold, CharSegmentationType segmentation, const char *correct_text, WERD_RES *word)
Definition: adaptmatch.cpp:375
void FreeFeature(FEATURE Feature)
Definition: ocrfeatures.cpp:56
bool classify_enable_adaptive_debugger
Definition: classify.h:455
NORM_PROTOS * NormProtos
Definition: classify.h:527
void DisplayAdaptedChar(TBLOB *blob, INT_CLASS_STRUCT *int_class)
Definition: adaptmatch.cpp:950
float X
Definition: protos.h:46
Definition: cluster.h:32
void free_int_templates(INT_TEMPLATES templates)
Definition: intproto.cpp:708
#define MF_SCALE_FACTOR
Definition: mfoutline.h:64
uint8_t FEATURE_ID
Definition: matchdefs.h:48
#define WordsInVectorOfSize(NumBits)
Definition: bitvec.h:63
void SettupStopperPass1()
Sets up stopper variables in preparation for the first pass.
Definition: stopper.cpp:360
INT_CLASS Class[MAX_NUM_CLASSES]
Definition: intproto.h:121
BIT_VECTOR AllProtosOn
Definition: classify.h:521
const UnicharAmbigs & getUnicharAmbigs() const
Definition: dict.h:104
bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const
Definition: unicharset.cpp:686
void ComputeAdaptionThresholds(float certainty_scale, float min_rating, float max_rating, float rating_margin, float *thresholds)
Definition: pageres.cpp:567
UNICHAR_ID best_unichar_id
Definition: adaptmatch.cpp:95
double certainty_scale
Definition: dict.h:611
CharSegmentationType
Definition: classify.h:96
STRING DebugStr(int shape_id) const
Definition: shapetable.cpp:281
bool HasNonfragment
Definition: adaptmatch.cpp:94
#define Y_DIM_OFFSET
Definition: adaptmatch.cpp:85
const char * string() const
Definition: strngs.cpp:196
void set_fonts(const GenericVector< tesseract::ScoredFont > &fonts)
Definition: ratngs.h:95
void DebugAdaptiveClassifier(TBLOB *Blob, ADAPT_RESULTS *Results)
int FindGoodProtos(INT_CLASS ClassTemplate, BIT_VECTOR ProtoMask, BIT_VECTOR ConfigMask, uint16_t BlobLength, int16_t NumFeatures, INT_FEATURE_ARRAY Features, PROTO_ID *ProtoArray, int AdaptProtoThreshold, int Debug)
Definition: intmatcher.cpp:549
void ConvertProto(PROTO Proto, int ProtoId, INT_CLASS Class)
Definition: intproto.cpp:496
void AdaptiveClassifier(TBLOB *Blob, BLOB_CHOICE_LIST *Choices)
Definition: adaptmatch.cpp:192
bool use_ambigs_for_adaption
Definition: ccutil.h:88
void MasterMatcher(INT_TEMPLATES templates, int16_t num_features, const INT_FEATURE_STRUCT *features, const uint8_t *norm_factors, ADAPT_CLASS *classes, int debug, int matcher_multiplier, const TBOX &blob_box, const GenericVector< CP_RESULT_STRUCT > &results, ADAPT_RESULTS *final_results)
bool classify_bln_numeric_mode
Definition: classify.h:541
void plot(ScrollView *window, ScrollView::Color color, ScrollView::Color child_color)
Definition: blobs.cpp:520
void InitIntegerFX()
Definition: intfx.cpp:53
STRING language_data_path_prefix
Definition: ccutil.h:67
TBOX bounding_box() const
Definition: blobs.cpp:871
bool TempConfigReliable(CLASS_ID class_id, const TEMP_CONFIG &config)
int GetFontinfoId(ADAPT_CLASS Class, uint8_t ConfigId)
Definition: adaptive.cpp:174
Definition: rect.h:34
virtual void DebugDisplay(const TrainingSample &sample, Pix *page_pix, UNICHAR_ID unichar_id)
int NumBlobs() const
Definition: blobs.h:432
static void JoinPieces(const GenericVector< SEAM *> &seams, const GenericVector< TBLOB *> &blobs, int first, int last)
Definition: seam.cpp:216
static void BreakPieces(const GenericVector< SEAM *> &seams, const GenericVector< TBLOB *> &blobs, int first, int last)
Definition: seam.cpp:194
UNICHAR_ID * BaselineClassifier(TBLOB *Blob, const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, ADAPT_TEMPLATES Templates, ADAPT_RESULTS *Results)
const int kBlnXHeight
Definition: normalis.h:24
bool AdaptableWord(WERD_RES *word)
Definition: adaptmatch.cpp:823
LIST push(LIST list, void *element)
Definition: oldlist.cpp:283
bool AlternativeChoiceAdjustmentsWorseThan(float threshold) const
Definition: pageres.cpp:445
bool matcher_debug_separate_windows
Definition: classify.h:499
const UnicharIdVector * AmbigsForAdaption(UNICHAR_ID unichar_id) const
Definition: ambigs.h:190
const FontInfo * fontinfo
Definition: pageres.h:304
CLASS_ID ClassId
Definition: adaptmatch.cpp:125
void SetAdaptiveThreshold(float Threshold)
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209
void ClearCharNormArray(uint8_t *char_norm_array)
Definition: float2int.cpp:44
double matcher_rating_margin
Definition: classify.h:465
#define PRINT_FEATURE_MATCHES
Definition: intproto.h:189
bool DeSerialize(TFile *fp)
Definition: shapetable.cpp:246
uint32_t * BIT_VECTOR
Definition: bitvec.h:28
void AddNewResult(const UnicharRating &new_result, ADAPT_RESULTS *results)
Definition: adaptmatch.cpp:998
int MakeNewTemporaryConfig(ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int FontinfoId, int NumFeatures, INT_FEATURE_ARRAY Features, FEATURE_SET FloatFeatures)
const int kBlnBaselineOffset
Definition: normalis.h:25
#define zero_all_bits(array, length)
Definition: bitvec.h:33
ADAPT_TEMPLATES NewAdaptedTemplates(bool InitFromUnicharset)
Definition: adaptive.cpp:152
void UpdateAmbigsGroup(CLASS_ID class_id, TBLOB *Blob)
UNICHAR_ID CLASS_ID
Definition: matchdefs.h:36
#define UnusedClassIdIn(T, c)
Definition: intproto.h:175
void ShowBestMatchFor(int shape_id, const INT_FEATURE_STRUCT *features, int num_features)
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:486
#define WORST_POSSIBLE_RATING
Definition: adaptmatch.cpp:87
PROTO_STRUCT Proto
Definition: adaptive.h:32
void EndDangerousAmbigs()
Definition: stopper.cpp:358
uint16_t ProtoId
Definition: adaptive.h:30
GenericVector< UnicharRating > match
Definition: adaptmatch.cpp:98
int PruneClasses(const INT_TEMPLATES_STRUCT *int_templates, int num_features, int keep_this, const INT_FEATURE_STRUCT *features, const uint8_t *normalization_factors, const uint16_t *expected_num_features, GenericVector< CP_RESULT_STRUCT > *results)
Definition: intmatcher.cpp:409
int AddIntProto(INT_CLASS Class)
Definition: intproto.cpp:283
bool prioritize_division
Definition: classify.h:428
static void Update()
Definition: scrollview.cpp:711
void FreeBitVector(BIT_VECTOR BitVector)
Definition: bitvec.cpp:51
int size() const
Definition: shapetable.h:200
void Match(INT_CLASS ClassTemplate, BIT_VECTOR ProtoMask, BIT_VECTOR ConfigMask, int16_t NumFeatures, const INT_FEATURE_STRUCT *Features, tesseract::UnicharRating *Result, int AdaptFeatureThreshold, int Debug, bool SeparateDebugWindows)
Definition: intmatcher.cpp:470
void AddLargeSpeckleTo(int blob_length, BLOB_CHOICE_LIST *choices)
Definition: classify.cpp:233
bool classify_nonlinear_norm
Definition: classify.h:457
#define copy_all_bits(source, dest, length)
Definition: bitvec.h:49
float Params[1]
Definition: ocrfeatures.h:62
int size() const
Definition: unicharset.h:336
int MakeTempProtoPerm(void *item1, void *item2)
int classify_adapt_proto_threshold
Definition: classify.h:486
GenericVector< STRING > correct_text
Definition: pageres.h:275
void free_adapted_templates(ADAPT_TEMPLATES templates)
Definition: adaptive.cpp:183
float Y
Definition: protos.h:47
#define GetPicoFeatureLength()
Definition: picofeat.h:57
void XHeightRange(int unichar_id, const UNICHARSET &unicharset, const TBOX &bbox, float *min_xht, float *max_xht, float *yshift) const
Definition: normalis.cpp:429
uint8_t NumPermConfigs
Definition: adaptive.h:64
float Length
Definition: protos.h:49
#define TempConfigFor(Class, ConfigId)
Definition: adaptive.h:101
FEATURE Features[1]
Definition: ocrfeatures.h:69
int best_match_index
Definition: adaptmatch.cpp:96
NORM_PROTOS * ReadNormProtos(TFile *fp)
Definition: normmatch.cpp:235
double classify_character_fragments_garbage_certainty_threshold
Definition: classify.h:494
void ComputeCharNormArrays(FEATURE_STRUCT *norm_feature, INT_TEMPLATES_STRUCT *templates, uint8_t *char_norm_array, uint8_t *pruner_array)
uint8_t NumConfigs
Definition: intproto.h:108
int16_t left() const
Definition: rect.h:72
#define LegalClassId(c)
Definition: intproto.h:174
static int SortDescendingRating(const void *t1, const void *t2)
Definition: shapetable.h:56
#define MAX_NUM_CONFIGS
Definition: intproto.h:47
#define MAX_NUM_PROTOS
Definition: intproto.h:48
void FreeTempConfig(TEMP_CONFIG Config)
Definition: adaptive.cpp:75
bool classify_debug_character_fragments
Definition: classify.h:496
double classify_adapted_pruning_threshold
Definition: classify.h:484
ADAPT_CLASS Class[MAX_NUM_CLASSES]
Definition: adaptive.h:80
int16_t top() const
Definition: rect.h:58
float best_rating
Definition: adaptmatch.cpp:97
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:507
#define PermConfigFor(Class, ConfigId)
Definition: adaptive.h:104
void AmbigClassifier(const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, const TBLOB *blob, INT_TEMPLATES templates, ADAPT_CLASS *classes, UNICHAR_ID *ambiguities, ADAPT_RESULTS *results)
BIT_VECTOR NewBitVector(int NumBits)
Definition: bitvec.cpp:82
uint8_t NumTimesSeen
Definition: adaptive.h:41
int classify_learning_debug_level
Definition: classify.h:460
STRING to_string() const
Definition: unicharset.h:80
#define MakeProtoPermanent(Class, ProtoId)
Definition: adaptive.h:98
float ApplyCNCorrection(float rating, int blob_length, int normalization_factor, int matcher_multiplier)
INT_TEMPLATES PreTrainedTemplates
Definition: classify.h:510
UNICHARSET unicharset
Definition: ccutil.h:68
float ActualOutlineLength(FEATURE Feature)
Definition: normfeat.cpp:32
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:563
uint16_t NumFeatures
Definition: ocrfeatures.h:67
BIT_VECTOR AllConfigsOff
Definition: classify.h:523
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:670
int ClassAndConfigIDToFontOrShapeID(int class_id, int int_result_config) const
#define FALSE
Definition: capi.h:52
double matcher_good_threshold
Definition: classify.h:461
#define IncreaseConfidence(TempConfig)
Definition: adaptive.h:107
TEMP_PROTO NewTempProto()
Definition: adaptive.cpp:229
STRING debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:342
#define MAX_NUM_INT_FEATURES
Definition: intproto.h:129
bool MarginalMatch(float confidence, float matcher_great_threshold)
Definition: adaptmatch.cpp:132
ADAPT_TEMPLATES Templates
Definition: adaptmatch.cpp:124
int IntCastRounded(double x)
Definition: helpers.h:168
BIT_VECTOR PermConfigs
Definition: adaptive.h:68
double ComputeCorrectedRating(bool debug, int unichar_id, double cp_rating, double im_rating, int feature_misses, int bottom, int top, int blob_length, int matcher_multiplier, const uint8_t *cn_factors)
void SetAdaptiveThreshold(float Threshold)
double matcher_reliable_adaptive_result
Definition: classify.h:462
double classify_adapted_pruning_factor
Definition: classify.h:482
int length() const
Definition: genericvector.h:85
#define LENGTH_COMPRESSION
Definition: normfeat.h:27
void plot(ScrollView *window)
Definition: blobs.cpp:907
STRING ClassIDToDebugStr(const INT_TEMPLATES_STRUCT *templates, int class_id, int config_id) const
UNICHAR_ID * Ambigs
Definition: adaptive.h:51
GenericVector< int > best_state
Definition: pageres.h:271
int MaxNumUnichars() const
Definition: shapetable.cpp:455
double matcher_clustering_max_angle_delta
Definition: classify.h:473
double classify_misfit_junk_penalty
Definition: classify.h:476
bool get_enabled(UNICHAR_ID unichar_id) const
Definition: unicharset.h:873
double certainty_scale
Definition: classify.h:478
#define set_all_bits(array, length)
Definition: bitvec.h:41
STRING imagefile
Definition: ccutil.h:70
ShapeTable * shape_table_
Definition: classify.h:553
PROTO_ID MakeNewTempProtos(FEATURE_SET Features, int NumBadFeat, FEATURE_ID BadFeat[], INT_CLASS IClass, ADAPT_CLASS Class, BIT_VECTOR TempProtoMask)
bool LooksLikeGarbage(TBLOB *blob)
ADAPT_TEMPLATES BackupAdaptedTemplates
Definition: classify.h:518
ADAPT_TEMPLATES AdaptedTemplates
Definition: classify.h:514
bool empty() const
Definition: genericvector.h:90
void MakePermanent(ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int ConfigId, TBLOB *Blob)
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
void PrintAdaptiveMatchResults(const ADAPT_RESULTS &results)
TBOX bounding_box() const
Definition: blobs.cpp:478
float adjust_factor() const
Definition: ratngs.h:306
TBLOB * ClassifyNormalizeIfNeeded() const
Definition: blobs.cpp:356
int length() const
Definition: ratngs.h:303
void PrintAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates)
Definition: adaptive.cpp:245
void ResetAdaptiveClassifierInternal()
Definition: adaptmatch.cpp:599
GenericVector< SEAM * > seam_array
Definition: pageres.h:217
uint8_t NumPermClasses
Definition: adaptive.h:78
INT_FEATURE_STRUCT INT_FEATURE_ARRAY[MAX_NUM_INT_FEATURES]
Definition: intproto.h:150
void ExpandShapesAndApplyCorrections(ADAPT_CLASS *classes, bool debug, int class_id, int bottom, int top, float cp_rating, int blob_length, int matcher_multiplier, const uint8_t *cn_factors, UnicharRating *int_result, ADAPT_RESULTS *final_results)
void ConvertMatchesToChoices(const DENORM &denorm, const TBOX &box, ADAPT_RESULTS *Results, BLOB_CHOICE_LIST *Choices)
int CharNormClassifier(TBLOB *blob, const TrainingSample &sample, ADAPT_RESULTS *adapt_results)
bool Open(const STRING &filename, FileReader reader)
Definition: serialis.cpp:196
int push_back(T object)
TEMP_CONFIG NewTempConfig(int MaxProtoId, int FontinfoId)
Definition: adaptive.cpp:204
void EndAdaptiveClassifier()
Definition: adaptmatch.cpp:460
GenericVector< TBLOB * > blobs
Definition: blobs.h:443
double matcher_perfect_threshold
Definition: classify.h:463
FEATURE NewFeature(const FEATURE_DESC_STRUCT *FeatureDesc)
Definition: ocrfeatures.cpp:81
const Shape & GetShape(int shape_id) const
Definition: shapetable.h:320
double matcher_avg_noise_size
Definition: classify.h:466
int AddIntConfig(INT_CLASS Class)
Definition: intproto.cpp:262
int matcher_sufficient_examples_for_prototyping
Definition: classify.h:471
int16_t PROTO_ID
Definition: matchdefs.h:42
#define ConfigIsPermanent(Class, ConfigId)
Definition: adaptive.h:92
FEATURE_SET ExtractOutlineFeatures(TBLOB *Blob)
Definition: outfeat.cpp:42
bool disable_character_fragments
Definition: classify.h:491
void FillABC(PROTO Proto)
Definition: protos.cpp:195
void RemoveExtraPuncs(ADAPT_RESULTS *Results)
bool classify_enable_adaptive_matcher
Definition: classify.h:450
GenericVector< CP_RESULT_STRUCT > CPResults
Definition: adaptmatch.cpp:99
PERM_CONFIG_STRUCT * PERM_CONFIG
Definition: adaptive.h:54
#define IsEmptyAdaptedClass(Class)
Definition: adaptive.h:89
int classify_adapt_feature_threshold
Definition: classify.h:488
#define reset_bit(array, bit)
Definition: bitvec.h:59
float Angle
Definition: protos.h:48
void FreeFeatureSet(FEATURE_SET FeatureSet)
Definition: ocrfeatures.cpp:65
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:729
void SwitchAdaptiveClassifier()
Definition: adaptmatch.cpp:614
int classify_integer_matcher_multiplier
Definition: classify.h:510
UnicityTable< FontInfo > fontinfo_table_
Definition: classify.h:529
Definition: strngs.h:45
BIT_VECTOR PermProtos
Definition: adaptive.h:67
BIT_VECTOR AllConfigsOn
Definition: classify.h:522
void AddProtoToProtoPruner(PROTO Proto, int ProtoId, INT_CLASS Class, bool debug)
Definition: intproto.cpp:370
const UnicharIdVector * ReverseAmbigsForAdaption(UNICHAR_ID unichar_id) const
Definition: ambigs.h:199
void WriteAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates)
Definition: adaptive.cpp:454
int32_t Length
Definition: intfx.h:36
virtual int UnicharClassifySample(const TrainingSample &sample, Pix *page_pix, int debug, UNICHAR_ID keep_this, GenericVector< UnicharRating > *results)
const STRING debug_string() const
Definition: ratngs.h:505
int matcher_min_examples_for_prototyping
Definition: classify.h:469
double matcher_bad_match_pad
Definition: classify.h:464
#define MakeConfigPermanent(Class, ConfigId)
Definition: adaptive.h:95
INT_TEMPLATES ReadIntTemplates(TFile *fp)
Definition: intproto.cpp:728
#define PRINT_MATCH_SUMMARY
Definition: intproto.h:186
#define MAX_ADAPTABLE_WERD_SIZE
Definition: adaptmatch.cpp:81
BIT_VECTOR TempProtoMask
Definition: classify.h:524
uint8_t MaxNumTimesSeen
Definition: adaptive.h:65
void ComputeBest()
Definition: adaptmatch.cpp:109
const DENORM & denorm() const
Definition: blobs.h:347
const FEATURE_DESC_STRUCT CharNormDesc
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:290
int CharNormTrainingSample(bool pruner_only, int keep_this, const TrainingSample &sample, GenericVector< UnicharRating > *results)
const UNICHARSET & getUnicharset() const
Definition: dict.h:98
void InitAdaptedClass(TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, ADAPT_CLASS Class, ADAPT_TEMPLATES Templates)
Definition: adaptmatch.cpp:694
char * classify_learn_debug_str
Definition: classify.h:500
int FindBadFeatures(INT_CLASS ClassTemplate, BIT_VECTOR ProtoMask, BIT_VECTOR ConfigMask, uint16_t BlobLength, int16_t NumFeatures, INT_FEATURE_ARRAY Features, FEATURE_ID *FeatureArray, int AdaptFeatureThreshold, int Debug)
Definition: intmatcher.cpp:618
const double kStandardFeatureLength
Definition: intfx.h:46
UNICHAR_ID * GetAmbiguities(TBLOB *Blob, CLASS_ID CorrectClass)
int16_t right() const
Definition: rect.h:79
TEMP_PROTO_STRUCT * TEMP_PROTO
Definition: adaptive.h:37
#define ADAPTABLE_WERD_ADJUSTMENT
Definition: adaptmatch.cpp:83
uint16_t NumProtos
Definition: intproto.h:106
void StartBackupAdaptiveClassifier()
Definition: adaptmatch.cpp:630
void truncate(int size)
void ComputeIntCharNormArray(const FEATURE_STRUCT &norm_feature, uint8_t *char_norm_array)
Definition: float2int.cpp:62
#define NO_PROTO
Definition: matchdefs.h:43
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:836
int16_t Ymean
Definition: intfx.h:37
void LearnBlob(const STRING &fontname, TBLOB *Blob, const DENORM &cn_denorm, const INT_FX_RESULT_STRUCT &fx_info, const char *blob_text)
Definition: blobclass.cpp:74
void DoAdaptiveMatch(TBLOB *Blob, ADAPT_RESULTS *Results)
Pix * pix() const
Definition: normalis.h:246
virtual Dict & getDict()
Definition: classify.h:107
#define ADAPT_TEMPLATE_SUFFIX
Definition: adaptmatch.cpp:76
char window_wait(ScrollView *win)
Definition: callcpp.cpp:104
Definition: blobs.h:268
int GetAdaptiveFeatures(TBLOB *Blob, INT_FEATURE_ARRAY IntFeatures, FEATURE_SET *FloatFeatures)
Definition: adaptmatch.cpp:787
void RemoveBadMatches(ADAPT_RESULTS *Results)
void AdaptToChar(TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, float Threshold, ADAPT_TEMPLATES adaptive_templates)
Definition: adaptmatch.cpp:857
#define PRINT_PROTO_MATCHES
Definition: intproto.h:190
void SettupStopperPass2()
Sets up stopper variables in preparation for the second pass.
Definition: stopper.cpp:364
void ReadNewCutoffs(TFile *fp, CLASS_CUTOFF_ARRAY Cutoffs)
Definition: cutoffs.cpp:46
TWERD * chopped_word
Definition: pageres.h:215
#define ClassForClassId(T, c)
Definition: intproto.h:176
#define UNLIKELY_NUM_FEAT
Definition: adaptmatch.cpp:79
int16_t bottom() const
Definition: rect.h:65
#define MAX_NUM_CLASSES
Definition: matchdefs.h:32
INT_TEMPLATES Templates
Definition: adaptive.h:76
void FreeTempProto(void *arg)
Definition: adaptive.cpp:82
bool classify_enable_learning
Definition: classify.h:430
void AddProtoToClassPruner(PROTO Proto, CLASS_ID ClassId, INT_TEMPLATES Templates)
Definition: intproto.cpp:330
int32_t length() const
Definition: strngs.cpp:191
WERD_CHOICE * best_choice
Definition: pageres.h:235
#define NO_DEBUG
Definition: adaptmatch.cpp:80
#define test_bit(array, bit)
Definition: bitvec.h:61
UnicityTable< FontSet > fontset_table_
Definition: classify.h:537
#define SET_BIT(array, bit)
Definition: bitvec.h:57
void LearnWord(const char *fontname, WERD_RES *word)
Definition: adaptmatch.cpp:251
void ConvertConfig(BIT_VECTOR Config, int ConfigId, INT_CLASS Class)
Definition: intproto.cpp:469
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:658
IntegerMatcher im_
Definition: classify.h:544
int ShapeIDToClassID(int shape_id) const
bool classify_use_pre_adapted_templates
Definition: classify.h:452
FEATURE_SET ExtractPicoFeatures(TBLOB *Blob)
Definition: picofeat.cpp:64
GenericVector< ScoredFont > fonts
Definition: shapetable.h:88
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool classify_save_adapted_templates
Definition: classify.h:454
LIST delete_d(LIST list, void *key, int_compare is_equal)
Definition: oldlist.cpp:114