tesseract  4.0.0-1-g2a2b
normmatch.cpp
Go to the documentation of this file.
1 /******************************************************************************
2  ** Filename: normmatch.cpp
3  ** Purpose: Simple matcher based on character normalization features.
4  ** Author: Dan Johnson
5  ** History: Wed Dec 19 16:18:06 1990, DSJ, Created.
6  **
7  ** (c) Copyright Hewlett-Packard Company, 1988.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  ******************************************************************************/
18 /*----------------------------------------------------------------------------
19  Include Files and Type Defines
20 ----------------------------------------------------------------------------*/
21 #include "normmatch.h"
22 
23 #include <cstdio>
24 #include <cmath>
25 
26 #include "classify.h"
27 #include "clusttool.h"
28 #include "emalloc.h"
29 #include "globals.h"
30 #include "helpers.h"
31 #include "normfeat.h"
32 #include "unicharset.h"
33 #include "params.h"
34 
36 {
37  int NumParams;
40  int NumProtos;
41 };
42 
43 /*----------------------------------------------------------------------------
44  Private Function Prototypes
45 ----------------------------------------------------------------------------*/
46 double NormEvidenceOf(double NormAdj);
47 
48 void PrintNormMatch(FILE *File,
49  int NumParams,
50  PROTOTYPE *Proto,
51  FEATURE Feature);
52 
53 NORM_PROTOS *ReadNormProtos(FILE *File);
54 
55 /*----------------------------------------------------------------------------
56  Variables
57 ----------------------------------------------------------------------------*/
58 
60 double_VAR(classify_norm_adj_midpoint, 32.0, "Norm adjust midpoint ...");
61 double_VAR(classify_norm_adj_curl, 2.0, "Norm adjust curl ...");
63 const double kWidthErrorWeighting = 0.125;
64 
65 /*----------------------------------------------------------------------------
66  Public Code
67 ----------------------------------------------------------------------------*/
68 /*---------------------------------------------------------------------------*/
69 namespace tesseract {
84  const FEATURE_STRUCT& feature,
85  bool DebugMatch) {
86  LIST Protos;
87  float BestMatch;
88  float Match;
89  float Delta;
90  PROTOTYPE *Proto;
91  int ProtoId;
92 
93  if (ClassId >= NormProtos->NumProtos) {
94  ClassId = NO_CLASS;
95  }
96 
97  /* handle requests for classification as noise */
98  if (ClassId == NO_CLASS) {
99  /* kludge - clean up constants and make into control knobs later */
100  Match = (feature.Params[CharNormLength] *
101  feature.Params[CharNormLength] * 500.0 +
102  feature.Params[CharNormRx] *
103  feature.Params[CharNormRx] * 8000.0 +
104  feature.Params[CharNormRy] *
105  feature.Params[CharNormRy] * 8000.0);
106  return (1.0 - NormEvidenceOf (Match));
107  }
108 
109  BestMatch = FLT_MAX;
110  Protos = NormProtos->Protos[ClassId];
111 
112  if (DebugMatch) {
113  tprintf("\nChar norm for class %s\n", unicharset.id_to_unichar(ClassId));
114  }
115 
116  ProtoId = 0;
117  iterate(Protos) {
118  Proto = (PROTOTYPE *) first_node (Protos);
119  Delta = feature.Params[CharNormY] - Proto->Mean[CharNormY];
120  Match = Delta * Delta * Proto->Weight.Elliptical[CharNormY];
121  if (DebugMatch) {
122  tprintf("YMiddle: Proto=%g, Delta=%g, Var=%g, Dist=%g\n",
123  Proto->Mean[CharNormY], Delta,
124  Proto->Weight.Elliptical[CharNormY], Match);
125  }
126  Delta = feature.Params[CharNormRx] - Proto->Mean[CharNormRx];
127  Match += Delta * Delta * Proto->Weight.Elliptical[CharNormRx];
128  if (DebugMatch) {
129  tprintf("Height: Proto=%g, Delta=%g, Var=%g, Dist=%g\n",
130  Proto->Mean[CharNormRx], Delta,
131  Proto->Weight.Elliptical[CharNormRx], Match);
132  }
133  // Ry is width! See intfx.cpp.
134  Delta = feature.Params[CharNormRy] - Proto->Mean[CharNormRy];
135  if (DebugMatch) {
136  tprintf("Width: Proto=%g, Delta=%g, Var=%g\n",
137  Proto->Mean[CharNormRy], Delta,
138  Proto->Weight.Elliptical[CharNormRy]);
139  }
140  Delta = Delta * Delta * Proto->Weight.Elliptical[CharNormRy];
141  Delta *= kWidthErrorWeighting;
142  Match += Delta;
143  if (DebugMatch) {
144  tprintf("Total Dist=%g, scaled=%g, sigmoid=%g, penalty=%g\n",
145  Match, Match / classify_norm_adj_midpoint,
146  NormEvidenceOf(Match), 256 * (1 - NormEvidenceOf(Match)));
147  }
148 
149  if (Match < BestMatch)
150  BestMatch = Match;
151 
152  ProtoId++;
153  }
154  return 1.0 - NormEvidenceOf(BestMatch);
155 } /* ComputeNormMatch */
156 
158  if (NormProtos != nullptr) {
159  for (int i = 0; i < NormProtos->NumProtos; i++)
163  Efree(NormProtos);
164  NormProtos = nullptr;
165  }
166 }
167 } // namespace tesseract
168 
169 /*----------------------------------------------------------------------------
170  Private Code
171 ----------------------------------------------------------------------------*/
179 double NormEvidenceOf(double NormAdj) {
180  NormAdj /= classify_norm_adj_midpoint;
181 
182  if (classify_norm_adj_curl == 3)
183  NormAdj = NormAdj * NormAdj * NormAdj;
184  else if (classify_norm_adj_curl == 2)
185  NormAdj = NormAdj * NormAdj;
186  else
187  NormAdj = pow (NormAdj, classify_norm_adj_curl);
188  return (1.0 / (1.0 + NormAdj));
189 }
190 
191 
192 /*---------------------------------------------------------------------------*/
202 void PrintNormMatch(FILE *File,
203  int NumParams,
204  PROTOTYPE *Proto,
205  FEATURE Feature) {
206  int i;
207  float ParamMatch;
208  float TotalMatch;
209 
210  for (i = 0, TotalMatch = 0.0; i < NumParams; i++) {
211  ParamMatch = (Feature->Params[i] - Mean(Proto, i)) /
212  StandardDeviation(Proto, i);
213 
214  fprintf (File, " %6.1f", ParamMatch);
215 
216  if (i == CharNormY || i == CharNormRx)
217  TotalMatch += ParamMatch * ParamMatch;
218  }
219  fprintf (File, " --> %6.1f (%4.2f)\n",
220  TotalMatch, NormEvidenceOf (TotalMatch));
221 
222 } /* PrintNormMatch */
223 
224 
225 /*---------------------------------------------------------------------------*/
226 namespace tesseract {
237  int i;
238  char unichar[2 * UNICHAR_LEN + 1];
239  UNICHAR_ID unichar_id;
240  LIST Protos;
241  int NumProtos;
242 
243  /* allocate and initialize the data structure */
244  NormProtos = (NORM_PROTOS *) Emalloc (sizeof (NORM_PROTOS));
246  NormProtos->Protos = (LIST *) Emalloc (NormProtos->NumProtos * sizeof(LIST));
247  for (i = 0; i < NormProtos->NumProtos; i++)
248  NormProtos->Protos[i] = NIL_LIST;
249 
250  /* read file header and save in data structure */
253 
254  /* read protos for each class into a separate list */
255  const int kMaxLineSize = 100;
256  char line[kMaxLineSize];
257  while (fp->FGets(line, kMaxLineSize) != nullptr) {
258  if (sscanf(line, "%s %d", unichar, &NumProtos) != 2) continue;
259  if (unicharset.contains_unichar(unichar)) {
260  unichar_id = unicharset.unichar_to_id(unichar);
261  Protos = NormProtos->Protos[unichar_id];
262  for (i = 0; i < NumProtos; i++)
263  Protos = push_last(Protos, ReadPrototype(fp, NormProtos->NumParams));
264  NormProtos->Protos[unichar_id] = Protos;
265  } else {
266  tprintf("Error: unichar %s in normproto file is not in unichar set.\n",
267  unichar);
268  for (i = 0; i < NumProtos; i++)
270  }
271  }
272  return (NormProtos);
273 } /* ReadNormProtos */
274 } // namespace tesseract
float ComputeNormMatch(CLASS_ID ClassId, const FEATURE_STRUCT &feature, bool DebugMatch)
Definition: normmatch.cpp:83
int UNICHAR_ID
Definition: unichar.h:35
float * Mean
Definition: cluster.h:78
NORM_PROTOS * ReadNormProtos(FILE *File)
double classify_norm_adj_curl
Definition: normmatch.cpp:61
NORM_PROTOS * NormProtos
Definition: classify.h:527
float StandardDeviation(PROTOTYPE *Proto, uint16_t Dimension)
Definition: cluster.cpp:639
#define double_VAR(name, val, comment)
Definition: params.h:285
void PrintNormMatch(FILE *File, int NumParams, PROTOTYPE *Proto, FEATURE Feature)
Definition: normmatch.cpp:202
uint16_t ReadSampleSize(TFile *fp)
Definition: clusttool.cpp:41
void * Emalloc(int Size)
Definition: emalloc.cpp:31
LIST * Protos
Definition: normmatch.cpp:39
void Efree(void *ptr)
Definition: emalloc.cpp:45
char * FGets(char *buffer, int buffer_size)
Definition: serialis.cpp:248
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209
UNICHAR_ID CLASS_ID
Definition: matchdefs.h:36
#define UNICHAR_LEN
Definition: unichar.h:31
float * Elliptical
Definition: cluster.h:64
FLOATUNION Weight
Definition: cluster.h:83
float Params[1]
Definition: ocrfeatures.h:62
int size() const
Definition: unicharset.h:336
NORM_PROTOS * ReadNormProtos(TFile *fp)
Definition: normmatch.cpp:235
void FreePrototype(void *arg)
Definition: cluster.cpp:575
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:563
UNICHARSET unicharset
Definition: ccutil.h:68
PROTOTYPE * ReadPrototype(TFile *fp, uint16_t N)
Definition: clusttool.cpp:99
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:670
LIST push_last(LIST list, void *item)
Definition: oldlist.cpp:297
PARAM_DESC * ParamDesc
Definition: normmatch.cpp:38
const double kWidthErrorWeighting
Definition: normmatch.cpp:63
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
float Mean(PROTOTYPE *Proto, uint16_t Dimension)
Definition: cluster.cpp:628
#define first_node(l)
Definition: oldlist.h:141
#define NIL_LIST
Definition: oldlist.h:127
#define iterate(l)
Definition: oldlist.h:161
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:290
double NormEvidenceOf(double NormAdj)
Definition: normmatch.cpp:179
#define NO_CLASS
Definition: matchdefs.h:37
PARAM_DESC * ReadParamDesc(TFile *fp, uint16_t N)
Definition: clusttool.cpp:61
double classify_norm_adj_midpoint
Definition: normmatch.cpp:60