tesseract  5.0.0-alpha-619-ge9db
normmatch.cpp
Go to the documentation of this file.
1 /******************************************************************************
2  ** Filename: normmatch.cpp
3  ** Purpose: Simple matcher based on character normalization features.
4  ** Author: Dan Johnson
5  **
6  ** (c) Copyright Hewlett-Packard Company, 1988.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  ******************************************************************************/
17 /*----------------------------------------------------------------------------
18  Include Files and Type Defines
19 ----------------------------------------------------------------------------*/
20 #include "normmatch.h"
21 
22 #include <cstdio>
23 #include <cmath>
24 #include <sstream> // for std::istringstream
25 
26 #include "classify.h"
27 #include "clusttool.h"
28 #include "emalloc.h"
29 #include <tesseract/helpers.h>
30 #include "normfeat.h"
31 #include "unicharset.h"
32 #include "params.h"
33 
// Table of character-normalization prototypes, one prototype list per class.
// NOTE(review): this listing skips embedded line 37, and the symbol index at
// the end of this page lists a `PARAM_DESC * ParamDesc` member of NORM_PROTOS
// that is not shown here -- confirm against the real source before relying on
// this layout.
34 struct NORM_PROTOS
35 {
// Passed as the `N` argument of ReadPrototype() when protos are loaded.
36  int NumParams;
// Array of prototype lists, indexed by class / unichar id.
38  LIST* Protos;
// Number of entries in Protos; class ids >= this are treated as NO_CLASS
// by ComputeNormMatch().
39  int NumProtos;
40 };
41 
42 /*----------------------------------------------------------------------------
43  Private Code
44 ----------------------------------------------------------------------------*/
45 
53 static double NormEvidenceOf(double NormAdj) {
54  NormAdj /= classify_norm_adj_midpoint;
55 
56  if (classify_norm_adj_curl == 3) {
57  NormAdj = NormAdj * NormAdj * NormAdj;
58  } else if (classify_norm_adj_curl == 2) {
59  NormAdj = NormAdj * NormAdj;
60  } else {
61  NormAdj = pow(NormAdj, classify_norm_adj_curl);
62  }
63  return (1.0 / (1.0 + NormAdj));
64 }
65 
66 /*----------------------------------------------------------------------------
67  Variables
68 ----------------------------------------------------------------------------*/
69 
// Tuning values for the normalization adjustment.
// classify_norm_adj_midpoint: NormEvidenceOf() divides its input by this
// value before applying the curve (default 32.0).
71 double_VAR(classify_norm_adj_midpoint, 32.0, "Norm adjust midpoint ...");
// classify_norm_adj_curl: exponent applied to the scaled adjustment in
// NormEvidenceOf() (default 2.0).
72 double_VAR(classify_norm_adj_curl, 2.0, "Norm adjust curl ...");
// Factor applied to the width (CharNormRy) term of the distance in
// ComputeNormMatch() before it is added to the total.
74 const double kWidthErrorWeighting = 0.125;
75 
76 /*----------------------------------------------------------------------------
77  Public Code
78 ----------------------------------------------------------------------------*/
79 /*---------------------------------------------------------------------------*/
80 namespace tesseract {
// Tail of the Classify::ComputeNormMatch signature and its body.
// NOTE(review): the first line of the signature is missing from this listing;
// the symbol index on this page gives it as
//   float ComputeNormMatch(CLASS_ID ClassId, const FEATURE_STRUCT &feature,
//                          bool DebugMatch)
//
// Compares `feature` against every prototype stored for ClassId and returns
// 1.0 - NormEvidenceOf(d), where d is the smallest weighted squared distance
// found.  For NO_CLASS (or a ClassId outside the proto table) a fixed-weight
// formula on the feature alone is used instead.  When DebugMatch is true the
// intermediate values are printed with tprintf.
95  const FEATURE_STRUCT& feature,
96  bool DebugMatch) {
97  LIST Protos;
98  float BestMatch;
99  float Match;
100  float Delta;
101  PROTOTYPE *Proto;
// NOTE(review): ProtoId is only assigned and incremented below; nothing in
// the visible code reads it.
102  int ProtoId;
103 
// Class ids beyond the end of the proto table are handled as NO_CLASS.
104  if (ClassId >= NormProtos->NumProtos) {
105  ClassId = NO_CLASS;
106  }
107 
108  /* handle requests for classification as noise */
109  if (ClassId == NO_CLASS) {
110  /* kludge - clean up constants and make into control knobs later */
// Sum of squared Length, Rx and Ry parameters with hard-coded weights.
111  Match = (feature.Params[CharNormLength] *
112  feature.Params[CharNormLength] * 500.0 +
113  feature.Params[CharNormRx] *
114  feature.Params[CharNormRx] * 8000.0 +
115  feature.Params[CharNormRy] *
116  feature.Params[CharNormRy] * 8000.0);
117  return (1.0 - NormEvidenceOf(Match));
118  }
119 
// Start from the largest float so the first prototype always becomes the
// best.  If the list is empty, BestMatch stays FLT_MAX and the result is
// 1.0 - NormEvidenceOf(FLT_MAX).
120  BestMatch = FLT_MAX;
121  Protos = NormProtos->Protos[ClassId];
122 
123  if (DebugMatch) {
124  tprintf("\nChar norm for class %s\n", unicharset.id_to_unichar(ClassId));
125  }
126 
127  ProtoId = 0;
128  iterate(Protos) {
// The cast has no parentheses of its own; presumably the first_node macro
// expands to a parenthesized expression -- confirm in oldlist.h.
129  Proto = reinterpret_cast<PROTOTYPE *>first_node (Protos);
// Term 1: CharNormY difference, squared and weighted.
130  Delta = feature.Params[CharNormY] - Proto->Mean[CharNormY];
131  Match = Delta * Delta * Proto->Weight.Elliptical[CharNormY];
132  if (DebugMatch) {
133  tprintf("YMiddle: Proto=%g, Delta=%g, Var=%g, Dist=%g\n",
134  Proto->Mean[CharNormY], Delta,
135  Proto->Weight.Elliptical[CharNormY], Match);
136  }
// Term 2: CharNormRx difference (reported as "Height" in the debug output).
137  Delta = feature.Params[CharNormRx] - Proto->Mean[CharNormRx];
138  Match += Delta * Delta * Proto->Weight.Elliptical[CharNormRx];
139  if (DebugMatch) {
140  tprintf("Height: Proto=%g, Delta=%g, Var=%g, Dist=%g\n",
141  Proto->Mean[CharNormRx], Delta,
142  Proto->Weight.Elliptical[CharNormRx], Match);
143  }
144  // Ry is width! See intfx.cpp.
145  Delta = feature.Params[CharNormRy] - Proto->Mean[CharNormRy];
146  if (DebugMatch) {
147  tprintf("Width: Proto=%g, Delta=%g, Var=%g\n",
148  Proto->Mean[CharNormRy], Delta,
149  Proto->Weight.Elliptical[CharNormRy]);
150  }
// Term 3: the width term is additionally scaled by kWidthErrorWeighting
// before being added; Delta is reused to hold the weighted term.
151  Delta = Delta * Delta * Proto->Weight.Elliptical[CharNormRy];
152  Delta *= kWidthErrorWeighting;
153  Match += Delta;
154  if (DebugMatch) {
155  tprintf("Total Dist=%g, scaled=%g, sigmoid=%g, penalty=%g\n",
156  Match, Match / classify_norm_adj_midpoint,
157  NormEvidenceOf(Match), 256 * (1 - NormEvidenceOf(Match)));
158  }
159 
// Keep the smallest distance seen so far.
160  if (Match < BestMatch)
161  BestMatch = Match;
162 
163  ProtoId++;
164  }
165  return 1.0 - NormEvidenceOf(BestMatch);
166 } /* ComputeNormMatch */
167 
// Body of Classify::FreeNormProtos (the symbol index on this page gives the
// signature as `void FreeNormProtos()`): releases NormProtos, if set, and
// resets the pointer to nullptr.
// NOTE(review): this listing omits the signature line (embedded 168) and
// embedded lines 171-173, which presumably hold the loop body and further
// frees (the index references FreeProtoList and Efree).  As printed here the
// `for` would govern Efree(NormProtos) -- do not take this fragment as the
// complete logic; confirm against the real source.
169  if (NormProtos != nullptr) {
170  for (int i = 0; i < NormProtos->NumProtos; i++)
174  Efree(NormProtos);
// Clear the pointer so a repeated call is a no-op.
175  NormProtos = nullptr;
176  }
177 }
178 } // namespace tesseract
179 
180 /*---------------------------------------------------------------------------*/
181 namespace tesseract {
// Body of Classify::ReadNormProtos (the symbol index on this page gives the
// signature as `NORM_PROTOS * ReadNormProtos(TFile *fp)`): allocates
// NormProtos, reads per-class prototype lists from fp, and returns NormProtos.
// NOTE(review): this listing omits the signature (embedded 182-191) and
// embedded lines 200, 206-207 and 228.  NumProtos and NumParams are used below
// but never assigned in the visible code; the omitted lines presumably set
// them (the index references UNICHARSET::size, ReadSampleSize and
// ReadParamDesc) -- confirm against the real source.
192  int i;
// Receives the unichar token parsed from each header line.
193  char unichar[2 * UNICHAR_LEN + 1];
194  UNICHAR_ID unichar_id;
195  LIST Protos;
196  int NumProtos;
197 
198  /* allocate and initialization data structure */
199  NormProtos = static_cast<NORM_PROTOS *>(Emalloc (sizeof (NORM_PROTOS)));
// One list slot per class; every slot starts out as the empty list.
201  NormProtos->Protos = static_cast<LIST *>(Emalloc (NormProtos->NumProtos * sizeof(LIST)));
202  for (i = 0; i < NormProtos->NumProtos; i++)
203  NormProtos->Protos[i] = NIL_LIST;
204 
205  /* read file header and save in data structure */
208 
209  /* read protos for each class into a separate list */
210  const int kMaxLineSize = 100;
211  char line[kMaxLineSize];
212  while (fp->FGets(line, kMaxLineSize) != nullptr) {
// Each class header line is expected to hold "<unichar> <NumProtos>";
// lines that do not parse that way are skipped.
// NOTE(review): `stream >> unichar` into a fixed char array is not
// width-limited before C++20; confirm 2 * UNICHAR_LEN + 1 is large enough
// for any token in a kMaxLineSize line, or set a field width.
213  std::istringstream stream(line);
214  stream >> unichar >> NumProtos;
215  if (stream.fail()) {
216  continue;
217  }
218  if (unicharset.contains_unichar(unichar)) {
// Known unichar: append NumProtos prototypes read from fp to the list
// stored for this unichar id.
219  unichar_id = unicharset.unichar_to_id(unichar);
220  Protos = NormProtos->Protos[unichar_id];
221  for (i = 0; i < NumProtos; i++)
222  Protos = push_last(Protos, ReadPrototype(fp, NormProtos->NumParams));
223  NormProtos->Protos[unichar_id] = Protos;
224  } else {
225  tprintf("Error: unichar %s in normproto file is not in unichar set.\n",
226  unichar);
// NOTE(review): the body of this loop (embedded line 228) is missing from
// the listing; presumably it reads and discards the NumProtos prototypes
// (the index references FreePrototype) -- confirm.
227  for (i = 0; i < NumProtos; i++)
229  }
230  }
231  return (NormProtos);
232 } /* ReadNormProtos */
233 } // namespace tesseract
NO_CLASS
#define NO_CLASS
Definition: matchdefs.h:34
emalloc.h
CLASS_ID
UNICHAR_ID CLASS_ID
Definition: matchdefs.h:33
NORM_PROTOS::NumParams
int NumParams
Definition: normmatch.cpp:49
tesseract::Classify::NormProtos
NORM_PROTOS * NormProtos
Definition: classify.h:527
first_node
#define first_node(l)
Definition: oldlist.h:84
Emalloc
void * Emalloc(int Size)
Definition: emalloc.cpp:31
list_rec
Definition: oldlist.h:73
params.h
FreeProtoList
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:538
tesseract::Classify::ComputeNormMatch
float ComputeNormMatch(CLASS_ID ClassId, const FEATURE_STRUCT &feature, bool DebugMatch)
Definition: normmatch.cpp:93
FEATURE_STRUCT
Definition: ocrfeatures.h:58
ReadParamDesc
PARAM_DESC * ReadParamDesc(TFile *fp, uint16_t N)
Definition: clusttool.cpp:140
NORM_PROTOS
Definition: normmatch.cpp:33
NIL_LIST
#define NIL_LIST
Definition: oldlist.h:68
tesseract::CCUtil::unicharset
UNICHARSET unicharset
Definition: ccutil.h:57
PROTOTYPE
Definition: cluster.h:62
kWidthErrorWeighting
const double kWidthErrorWeighting
Definition: normmatch.cpp:73
classify_norm_adj_curl
double classify_norm_adj_curl
Definition: normmatch.cpp:71
unicharset.h
FLOATUNION::Elliptical
float * Elliptical
Definition: cluster.h:59
CharNormLength
Definition: normfeat.h:29
NORM_PROTOS::ParamDesc
PARAM_DESC * ParamDesc
Definition: normmatch.cpp:50
UNICHARSET::unichar_to_id
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209
normfeat.h
PROTOTYPE::Weight
FLOATUNION Weight
Definition: cluster.h:78
tesseract::Classify::ReadNormProtos
NORM_PROTOS * ReadNormProtos(TFile *fp)
Definition: normmatch.cpp:189
ReadSampleSize
uint16_t ReadSampleSize(TFile *fp)
Definition: clusttool.cpp:120
helpers.h
ReadPrototype
PROTOTYPE * ReadPrototype(TFile *fp, uint16_t N)
Definition: clusttool.cpp:176
tesseract
Definition: baseapi.h:65
FEATURE_STRUCT::Params
float Params[1]
Definition: ocrfeatures.h:60
double_VAR
#define double_VAR(name, val, comment)
Definition: params.h:309
normmatch.h
UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:36
NORM_PROTOS::NumProtos
int NumProtos
Definition: normmatch.cpp:52
PARAM_DESC
Definition: ocrfeatures.h:41
UNICHAR_LEN
#define UNICHAR_LEN
Definition: unichar.h:32
classify_norm_adj_midpoint
double classify_norm_adj_midpoint
Definition: normmatch.cpp:70
tesseract::Classify::FreeNormProtos
void FreeNormProtos()
Definition: normmatch.cpp:167
PROTOTYPE::Mean
float * Mean
Definition: cluster.h:73
UNICHARSET::contains_unichar
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:670
Efree
void Efree(void *ptr)
Definition: emalloc.cpp:45
iterate
#define iterate(l)
Definition: oldlist.h:92
FreePrototype
void FreePrototype(void *arg)
Definition: cluster.cpp:549
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
classify.h
UNICHARSET::id_to_unichar
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:290
NORM_PROTOS::Protos
LIST * Protos
Definition: normmatch.cpp:51
CharNormY
Definition: normfeat.h:29
push_last
LIST push_last(LIST list, void *item)
Definition: oldlist.cpp:185
UNICHARSET::size
int size() const
Definition: unicharset.h:341
clusttool.h
CharNormRx
Definition: normfeat.h:29
CharNormRy
Definition: normfeat.h:29