tesseract  4.0.0-1-g2a2b
clusttool.cpp
Go to the documentation of this file.
1 /******************************************************************************
2  ** Filename: clusttool.cpp
3  ** Purpose: Misc. tools for use with the clustering routines
4  ** Author: Dan Johnson
5  **
6  ** (c) Copyright Hewlett-Packard Company, 1988.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *****************************************************************************/
17 
18 //--------------------------Include Files----------------------------------
19 #include "clusttool.h"
20 #include "emalloc.h"
21 #include <cstdio>
22 #include <cmath>
23 
24 using tesseract::TFile;
25 
26 //---------------Global Data Definitions and Declarations--------------------
27 #define TOKENSIZE 80 //< max size of tokens read from an input file
28 #define QUOTED_TOKENSIZE "79"
29 #define MAXSAMPLESIZE 65535 //< max num of dimensions in feature space
30 //#define MAXBLOCKSIZE 65535 //< max num of samples in a character (block
31 // size)
32 
41 uint16_t ReadSampleSize(TFile *fp) {
42  int SampleSize = 0;
43 
44  const int kMaxLineSize = 100;
45  char line[kMaxLineSize];
46  ASSERT_HOST(fp->FGets(line, kMaxLineSize) != nullptr);
47  ASSERT_HOST(sscanf(line, "%d", &SampleSize) == 1);
48  ASSERT_HOST(SampleSize >= 0 && SampleSize <= MAXSAMPLESIZE);
49  return SampleSize;
50 }
51 
61 PARAM_DESC *ReadParamDesc(TFile *fp, uint16_t N) {
62  PARAM_DESC *ParamDesc;
63  char linear_token[TOKENSIZE], essential_token[TOKENSIZE];
64 
65  ParamDesc = (PARAM_DESC *) Emalloc (N * sizeof (PARAM_DESC));
66  for (int i = 0; i < N; i++) {
67  const int kMaxLineSize = TOKENSIZE * 4;
68  char line[kMaxLineSize];
69  ASSERT_HOST(fp->FGets(line, kMaxLineSize) != nullptr);
70  ASSERT_HOST(sscanf(line,
71  "%" QUOTED_TOKENSIZE "s %" QUOTED_TOKENSIZE "s %f %f",
72  linear_token, essential_token, &ParamDesc[i].Min,
73  &ParamDesc[i].Max) == 4);
74  if (linear_token[0] == 'c')
75  ParamDesc[i].Circular = TRUE;
76  else
77  ParamDesc[i].Circular = FALSE;
78 
79  if (linear_token[0] == 'e')
80  ParamDesc[i].NonEssential = FALSE;
81  else
82  ParamDesc[i].NonEssential = TRUE;
83  ParamDesc[i].Range = ParamDesc[i].Max - ParamDesc[i].Min;
84  ParamDesc[i].HalfRange = ParamDesc[i].Range / 2;
85  ParamDesc[i].MidRange = (ParamDesc[i].Max + ParamDesc[i].Min) / 2;
86  }
87  return (ParamDesc);
88 }
89 
99 PROTOTYPE *ReadPrototype(TFile *fp, uint16_t N) {
100  char sig_token[TOKENSIZE], shape_token[TOKENSIZE];
101  PROTOTYPE *Proto;
102  int SampleCount;
103  int i;
104 
105  const int kMaxLineSize = TOKENSIZE * 4;
106  char line[kMaxLineSize];
107  if (fp->FGets(line, kMaxLineSize) == nullptr ||
108  sscanf(line, "%" QUOTED_TOKENSIZE "s %" QUOTED_TOKENSIZE "s %d",
109  sig_token, shape_token, &SampleCount) != 3) {
110  tprintf("Invalid prototype: %s\n", line);
111  return nullptr;
112  }
113  Proto = (PROTOTYPE *)Emalloc(sizeof(PROTOTYPE));
114  Proto->Cluster = nullptr;
115  if (sig_token[0] == 's')
116  Proto->Significant = TRUE;
117  else
118  Proto->Significant = FALSE;
119 
120  switch (shape_token[0]) {
121  case 's':
122  Proto->Style = spherical;
123  break;
124  case 'e':
125  Proto->Style = elliptical;
126  break;
127  case 'a':
128  Proto->Style = automatic;
129  break;
130  default:
131  tprintf("Invalid prototype style specification:%s\n", shape_token);
132  Proto->Style = elliptical;
133  }
134 
135  ASSERT_HOST(SampleCount >= 0);
136  Proto->NumSamples = SampleCount;
137 
138  Proto->Mean = ReadNFloats(fp, N, nullptr);
139  ASSERT_HOST(Proto->Mean != nullptr);
140 
141  switch (Proto->Style) {
142  case spherical:
143  ASSERT_HOST(ReadNFloats(fp, 1, &(Proto->Variance.Spherical)) != nullptr);
144  Proto->Magnitude.Spherical =
145  1.0 / sqrt(2.0 * M_PI * Proto->Variance.Spherical);
146  Proto->TotalMagnitude = pow(Proto->Magnitude.Spherical, (float)N);
147  Proto->LogMagnitude = log((double)Proto->TotalMagnitude);
148  Proto->Weight.Spherical = 1.0 / Proto->Variance.Spherical;
149  Proto->Distrib = nullptr;
150  break;
151  case elliptical:
152  Proto->Variance.Elliptical = ReadNFloats(fp, N, nullptr);
153  ASSERT_HOST(Proto->Variance.Elliptical != nullptr);
154  Proto->Magnitude.Elliptical = (float *)Emalloc(N * sizeof(float));
155  Proto->Weight.Elliptical = (float *)Emalloc(N * sizeof(float));
156  Proto->TotalMagnitude = 1.0;
157  for (i = 0; i < N; i++) {
158  Proto->Magnitude.Elliptical[i] =
159  1.0 / sqrt(2.0 * M_PI * Proto->Variance.Elliptical[i]);
160  Proto->Weight.Elliptical[i] = 1.0 / Proto->Variance.Elliptical[i];
161  Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
162  }
163  Proto->LogMagnitude = log((double)Proto->TotalMagnitude);
164  Proto->Distrib = nullptr;
165  break;
166  default:
167  Efree(Proto);
168  tprintf("Invalid prototype style\n");
169  return nullptr;
170  }
171  return Proto;
172 }
173 
186 float *ReadNFloats(TFile *fp, uint16_t N, float Buffer[]) {
187  const int kMaxLineSize = 1024;
188  char line[kMaxLineSize];
189  if (fp->FGets(line, kMaxLineSize) == nullptr) {
190  tprintf("Hit EOF in ReadNFloats!\n");
191  return nullptr;
192  }
193  bool needs_free = false;
194 
195  if (Buffer == nullptr) {
196  Buffer = static_cast<float *>(Emalloc(N * sizeof(float)));
197  needs_free = true;
198  }
199 
200  char *startptr = line;
201  for (int i = 0; i < N; i++) {
202  char *endptr;
203  Buffer[i] = strtof(startptr, &endptr);
204  if (endptr == startptr) {
205  tprintf("Read of %d floats failed!\n", N);
206  if (needs_free) Efree(Buffer);
207  return nullptr;
208  }
209  startptr = endptr;
210  }
211  return Buffer;
212 }
213 
223 void WriteParamDesc(FILE *File, uint16_t N, const PARAM_DESC ParamDesc[]) {
224  int i;
225 
226  for (i = 0; i < N; i++) {
227  if (ParamDesc[i].Circular)
228  fprintf (File, "circular ");
229  else
230  fprintf (File, "linear ");
231 
232  if (ParamDesc[i].NonEssential)
233  fprintf (File, "non-essential ");
234  else
235  fprintf (File, "essential ");
236 
237  fprintf (File, "%10.6f %10.6f\n", ParamDesc[i].Min, ParamDesc[i].Max);
238  }
239 }
240 
250 void WritePrototype(FILE *File, uint16_t N, PROTOTYPE *Proto) {
251  int i;
252 
253  if (Proto->Significant)
254  fprintf (File, "significant ");
255  else
256  fprintf (File, "insignificant ");
257  WriteProtoStyle (File, (PROTOSTYLE) Proto->Style);
258  fprintf (File, "%6d\n\t", Proto->NumSamples);
259  WriteNFloats (File, N, Proto->Mean);
260  fprintf (File, "\t");
261 
262  switch (Proto->Style) {
263  case spherical:
264  WriteNFloats (File, 1, &(Proto->Variance.Spherical));
265  break;
266  case elliptical:
267  WriteNFloats (File, N, Proto->Variance.Elliptical);
268  break;
269  case mixed:
270  for (i = 0; i < N; i++)
271  switch (Proto->Distrib[i]) {
272  case normal:
273  fprintf (File, " %9s", "normal");
274  break;
275  case uniform:
276  fprintf (File, " %9s", "uniform");
277  break;
278  case D_random:
279  fprintf (File, " %9s", "random");
280  break;
281  case DISTRIBUTION_COUNT:
282  ASSERT_HOST(!"Distribution count not allowed!");
283  }
284  fprintf (File, "\n\t");
285  WriteNFloats (File, N, Proto->Variance.Elliptical);
286  }
287 }
288 
/**
 * Writes N floats from Array to File on a single line.
 *
 * Each value is preceded by a space and printed with the "%9.6f" format; the
 * line is terminated with a newline even when N is 0.
 *
 * @param File open text file to write to
 * @param N number of floats to write
 * @param Array values to write
 */
void WriteNFloats(FILE * File, uint16_t N, float Array[]) {
  const float *value = Array;
  for (int remaining = N; remaining > 0; --remaining, ++value) {
    fprintf(File, " %9.6f", *value);
  }
  fprintf(File, "\n");
}
303 
313 void WriteProtoStyle(FILE *File, PROTOSTYLE ProtoStyle) {
314  switch (ProtoStyle) {
315  case spherical:
316  fprintf (File, "spherical");
317  break;
318  case elliptical:
319  fprintf (File, "elliptical");
320  break;
321  case mixed:
322  fprintf (File, "mixed");
323  break;
324  case automatic:
325  fprintf (File, "automatic");
326  break;
327  }
328 }
329 
345 void WriteProtoList(FILE* File, uint16_t N, PARAM_DESC* ParamDesc,
346  LIST ProtoList, bool WriteSigProtos,
347  bool WriteInsigProtos) {
348  PROTOTYPE *Proto;
349 
350  /* write file header */
351  fprintf(File,"%0d\n",N);
352  WriteParamDesc(File,N,ParamDesc);
353 
354  /* write prototypes */
355  iterate(ProtoList)
356  {
357  Proto = (PROTOTYPE *) first_node (ProtoList);
358  if ((Proto->Significant && WriteSigProtos) ||
359  (!Proto->Significant && WriteInsigProtos))
360  WritePrototype(File, N, Proto);
361  }
362 }
float MidRange
Definition: ocrfeatures.h:50
int8_t Circular
Definition: ocrfeatures.h:44
float * Mean
Definition: cluster.h:78
float HalfRange
Definition: ocrfeatures.h:49
#define TRUE
Definition: capi.h:51
float Min
Definition: ocrfeatures.h:46
uint16_t ReadSampleSize(TFile *fp)
Definition: clusttool.cpp:41
void * Emalloc(int Size)
Definition: emalloc.cpp:31
void Efree(void *ptr)
Definition: emalloc.cpp:45
char * FGets(char *buffer, int buffer_size)
Definition: serialis.cpp:248
float TotalMagnitude
Definition: cluster.h:79
float Spherical
Definition: cluster.h:63
float * Elliptical
Definition: cluster.h:64
FLOATUNION Weight
Definition: cluster.h:83
PROTOSTYLE
Definition: cluster.h:44
void WritePrototype(FILE *File, uint16_t N, PROTOTYPE *Proto)
Definition: clusttool.cpp:250
DISTRIBUTION * Distrib
Definition: cluster.h:77
unsigned Style
Definition: cluster.h:74
PROTOTYPE * ReadPrototype(TFile *fp, uint16_t N)
Definition: clusttool.cpp:99
#define FALSE
Definition: capi.h:52
float Range
Definition: ocrfeatures.h:48
unsigned Significant
Definition: cluster.h:68
void WriteParamDesc(FILE *File, uint16_t N, const PARAM_DESC ParamDesc[])
Definition: clusttool.cpp:223
float * ReadNFloats(TFile *fp, uint16_t N, float Buffer[])
Definition: clusttool.cpp:186
void WriteProtoList(FILE *File, uint16_t N, PARAM_DESC *ParamDesc, LIST ProtoList, bool WriteSigProtos, bool WriteInsigProtos)
Definition: clusttool.cpp:345
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
FLOATUNION Magnitude
Definition: cluster.h:82
Definition: cluster.h:59
int8_t NonEssential
Definition: ocrfeatures.h:45
#define first_node(l)
Definition: oldlist.h:141
CLUSTER * Cluster
Definition: cluster.h:76
#define iterate(l)
Definition: oldlist.h:161
unsigned NumSamples
Definition: cluster.h:75
float LogMagnitude
Definition: cluster.h:80
#define MAXSAMPLESIZE
Definition: clusttool.cpp:29
float Max
Definition: ocrfeatures.h:47
void WriteProtoStyle(FILE *File, PROTOSTYLE ProtoStyle)
Definition: clusttool.cpp:313
PARAM_DESC * ReadParamDesc(TFile *fp, uint16_t N)
Definition: clusttool.cpp:61
void WriteNFloats(FILE *File, uint16_t N, float Array[])
Definition: clusttool.cpp:298
FLOATUNION Variance
Definition: cluster.h:81
#define QUOTED_TOKENSIZE
Definition: clusttool.cpp:28
#define TOKENSIZE
Definition: clusttool.cpp:27
#define ASSERT_HOST(x)
Definition: errcode.h:84
Definition: cluster.h:45