tesseract  5.0.0-alpha-619-ge9db
clusttool.cpp
Go to the documentation of this file.
1 /******************************************************************************
2  ** Filename: clusttool.cpp
3  ** Purpose: Misc. tools for use with the clustering routines
4  ** Author: Dan Johnson
5  **
6  ** (c) Copyright Hewlett-Packard Company, 1988.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *****************************************************************************/
17 
18 //--------------------------Include Files----------------------------------
19 #define _USE_MATH_DEFINES // for M_PI
20 #include "clusttool.h"
21 #include <cmath> // for M_PI, std::isnan
22 #include <locale> // for std::locale::classic
23 #include <sstream> // for std::stringstream
24 #include "emalloc.h"
25 
26 using tesseract::TFile;
27 
28 //---------------Global Data Definitions and Declarations--------------------
// Size in bytes of the buffers that hold a single token scanned from an
// input line (see sig_token / shape_token in ReadPrototype).
29 #define TOKENSIZE 80
// Maximum field width (TOKENSIZE - 1, leaving room for the terminating NUL)
// as a string literal so it can be spliced into sscanf "%s" conversions.
// Must be kept in sync with TOKENSIZE by hand.
30 #define QUOTED_TOKENSIZE "79"
// Largest sample size accepted by ReadSampleSize.
31 #define MAXSAMPLESIZE 65535
32 
33 
45 static float *ReadNFloats(TFile *fp, uint16_t N, float Buffer[]) {
46  const int kMaxLineSize = 1024;
47  char line[kMaxLineSize];
48  if (fp->FGets(line, kMaxLineSize) == nullptr) {
49  tprintf("Hit EOF in ReadNFloats!\n");
50  return nullptr;
51  }
52  bool needs_free = false;
53 
54  if (Buffer == nullptr) {
55  Buffer = static_cast<float *>(Emalloc(N * sizeof(float)));
56  needs_free = true;
57  }
58 
59  std::stringstream stream(line);
60  // Use "C" locale (needed for float values Buffer[i]).
61  stream.imbue(std::locale::classic());
62  for (uint16_t i = 0; i < N; i++) {
63  float f = NAN;
64  stream >> f;
65  if (std::isnan(f)) {
66  tprintf("Read of %u floats failed!\n", N);
67  if (needs_free) Efree(Buffer);
68  return nullptr;
69  }
70  Buffer[i] = f;
71  }
72  return Buffer;
73 }
74 
/**
 * Writes N floats to File on a single line.
 *
 * Each value is preceded by a space and printed with the "%9.6f" format;
 * the line is terminated with a newline.
 *
 * @param File   open text file to write to
 * @param N      number of floats to write
 * @param Array  the floats to write
 */
static void WriteNFloats(FILE * File, uint16_t N, float Array[]) {
  const float *const end = Array + N;
  for (const float *value = Array; value != end; ++value) {
    fprintf(File, " %9.6f", *value);
  }
  fputc('\n', File);
}
87 
95 static void WriteProtoStyle(FILE *File, PROTOSTYLE ProtoStyle) {
96  switch (ProtoStyle) {
97  case spherical:
98  fprintf (File, "spherical");
99  break;
100  case elliptical:
101  fprintf (File, "elliptical");
102  break;
103  case mixed:
104  fprintf (File, "mixed");
105  break;
106  case automatic:
107  fprintf (File, "automatic");
108  break;
109  }
110 }
111 
120 uint16_t ReadSampleSize(TFile *fp) {
121  int SampleSize = 0;
122 
123  const int kMaxLineSize = 100;
124  char line[kMaxLineSize];
125  ASSERT_HOST(fp->FGets(line, kMaxLineSize) != nullptr);
126  ASSERT_HOST(sscanf(line, "%d", &SampleSize) == 1);
127  ASSERT_HOST(SampleSize >= 0 && SampleSize <= MAXSAMPLESIZE);
128  return SampleSize;
129 }
130 
140 PARAM_DESC *ReadParamDesc(TFile *fp, uint16_t N) {
141  PARAM_DESC *ParamDesc;
142 
143  ParamDesc = static_cast<PARAM_DESC *>(Emalloc (N * sizeof (PARAM_DESC)));
144  for (int i = 0; i < N; i++) {
145  const int kMaxLineSize = TOKENSIZE * 4;
146  char line[kMaxLineSize];
147  ASSERT_HOST(fp->FGets(line, kMaxLineSize) != nullptr);
148  std::istringstream stream(line);
149  // Use "C" locale (needed for float values Min, Max).
150  stream.imbue(std::locale::classic());
151  std::string linear_token;
152  stream >> linear_token;
153  std::string essential_token;
154  stream >> essential_token;
155  stream >> ParamDesc[i].Min;
156  stream >> ParamDesc[i].Max;
157  ASSERT_HOST(!stream.fail());
158  ParamDesc[i].Circular = (linear_token[0] == 'c');
159  ParamDesc[i].NonEssential = (essential_token[0] != 'e');
160  ParamDesc[i].Range = ParamDesc[i].Max - ParamDesc[i].Min;
161  ParamDesc[i].HalfRange = ParamDesc[i].Range / 2;
162  ParamDesc[i].MidRange = (ParamDesc[i].Max + ParamDesc[i].Min) / 2;
163  }
164  return (ParamDesc);
165 }
166 
176 PROTOTYPE *ReadPrototype(TFile *fp, uint16_t N) {
177  char sig_token[TOKENSIZE], shape_token[TOKENSIZE];
178  PROTOTYPE *Proto;
179  int SampleCount;
180  int i;
181 
182  const int kMaxLineSize = TOKENSIZE * 4;
183  char line[kMaxLineSize];
184  if (fp->FGets(line, kMaxLineSize) == nullptr ||
185  sscanf(line, "%" QUOTED_TOKENSIZE "s %" QUOTED_TOKENSIZE "s %d",
186  sig_token, shape_token, &SampleCount) != 3) {
187  tprintf("Invalid prototype: %s\n", line);
188  return nullptr;
189  }
190  Proto = static_cast<PROTOTYPE *>(Emalloc(sizeof(PROTOTYPE)));
191  Proto->Cluster = nullptr;
192  Proto->Significant = (sig_token[0] == 's');
193 
194  switch (shape_token[0]) {
195  case 's':
196  Proto->Style = spherical;
197  break;
198  case 'e':
199  Proto->Style = elliptical;
200  break;
201  case 'a':
202  Proto->Style = automatic;
203  break;
204  default:
205  tprintf("Invalid prototype style specification:%s\n", shape_token);
206  Proto->Style = elliptical;
207  }
208 
209  ASSERT_HOST(SampleCount >= 0);
210  Proto->NumSamples = SampleCount;
211 
212  Proto->Mean = ReadNFloats(fp, N, nullptr);
213  ASSERT_HOST(Proto->Mean != nullptr);
214 
215  switch (Proto->Style) {
216  case spherical:
217  ASSERT_HOST(ReadNFloats(fp, 1, &(Proto->Variance.Spherical)) != nullptr);
218  Proto->Magnitude.Spherical =
219  1.0 / sqrt(2.0 * M_PI * Proto->Variance.Spherical);
220  Proto->TotalMagnitude = pow(Proto->Magnitude.Spherical, static_cast<float>(N));
221  Proto->LogMagnitude = log(static_cast<double>(Proto->TotalMagnitude));
222  Proto->Weight.Spherical = 1.0 / Proto->Variance.Spherical;
223  Proto->Distrib = nullptr;
224  break;
225  case elliptical:
226  Proto->Variance.Elliptical = ReadNFloats(fp, N, nullptr);
227  ASSERT_HOST(Proto->Variance.Elliptical != nullptr);
228  Proto->Magnitude.Elliptical = static_cast<float *>(Emalloc(N * sizeof(float)));
229  Proto->Weight.Elliptical = static_cast<float *>(Emalloc(N * sizeof(float)));
230  Proto->TotalMagnitude = 1.0;
231  for (i = 0; i < N; i++) {
232  Proto->Magnitude.Elliptical[i] =
233  1.0 / sqrt(2.0 * M_PI * Proto->Variance.Elliptical[i]);
234  Proto->Weight.Elliptical[i] = 1.0 / Proto->Variance.Elliptical[i];
235  Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
236  }
237  Proto->LogMagnitude = log(static_cast<double>(Proto->TotalMagnitude));
238  Proto->Distrib = nullptr;
239  break;
240  default:
241  Efree(Proto);
242  tprintf("Invalid prototype style\n");
243  return nullptr;
244  }
245  return Proto;
246 }
247 
255 void WriteParamDesc(FILE *File, uint16_t N, const PARAM_DESC ParamDesc[]) {
256  int i;
257 
258  for (i = 0; i < N; i++) {
259  if (ParamDesc[i].Circular)
260  fprintf (File, "circular ");
261  else
262  fprintf (File, "linear ");
263 
264  if (ParamDesc[i].NonEssential)
265  fprintf (File, "non-essential ");
266  else
267  fprintf (File, "essential ");
268 
269  fprintf (File, "%10.6f %10.6f\n", ParamDesc[i].Min, ParamDesc[i].Max);
270  }
271 }
272 
280 void WritePrototype(FILE *File, uint16_t N, PROTOTYPE *Proto) {
281  int i;
282 
283  if (Proto->Significant)
284  fprintf (File, "significant ");
285  else
286  fprintf (File, "insignificant ");
287  WriteProtoStyle (File, static_cast<PROTOSTYLE>(Proto->Style));
288  fprintf (File, "%6d\n\t", Proto->NumSamples);
289  WriteNFloats (File, N, Proto->Mean);
290  fprintf (File, "\t");
291 
292  switch (Proto->Style) {
293  case spherical:
294  WriteNFloats (File, 1, &(Proto->Variance.Spherical));
295  break;
296  case elliptical:
297  WriteNFloats (File, N, Proto->Variance.Elliptical);
298  break;
299  case mixed:
300  for (i = 0; i < N; i++)
301  switch (Proto->Distrib[i]) {
302  case normal:
303  fprintf (File, " %9s", "normal");
304  break;
305  case uniform:
306  fprintf (File, " %9s", "uniform");
307  break;
308  case D_random:
309  fprintf (File, " %9s", "random");
310  break;
311  case DISTRIBUTION_COUNT:
312  ASSERT_HOST(!"Distribution count not allowed!");
313  }
314  fprintf (File, "\n\t");
315  WriteNFloats (File, N, Proto->Variance.Elliptical);
316  }
317 }
string
std::string string
Definition: equationdetect_test.cc:21
emalloc.h
PROTOTYPE::TotalMagnitude
float TotalMagnitude
Definition: cluster.h:74
PARAM_DESC::Circular
bool Circular
Definition: ocrfeatures.h:42
Emalloc
void * Emalloc(int Size)
Definition: emalloc.cpp:31
elliptical
Definition: cluster.h:43
PROTOTYPE::LogMagnitude
float LogMagnitude
Definition: cluster.h:75
ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:87
language_specific.log
log
Definition: language_specific.py:25
MAXSAMPLESIZE
#define MAXSAMPLESIZE
max num of dimensions in feature space
Definition: clusttool.cpp:31
PROTOTYPE::Magnitude
FLOATUNION Magnitude
Definition: cluster.h:77
normal
Definition: cluster.h:55
ReadParamDesc
PARAM_DESC * ReadParamDesc(TFile *fp, uint16_t N)
Definition: clusttool.cpp:140
PARAM_DESC::Min
float Min
Definition: ocrfeatures.h:44
PARAM_DESC::Range
float Range
Definition: ocrfeatures.h:46
PROTOTYPE
Definition: cluster.h:62
PARAM_DESC::MidRange
float MidRange
Definition: ocrfeatures.h:48
QUOTED_TOKENSIZE
#define QUOTED_TOKENSIZE
Definition: clusttool.cpp:30
uniform
Definition: cluster.h:55
WriteParamDesc
void WriteParamDesc(FILE *File, uint16_t N, const PARAM_DESC ParamDesc[])
Definition: clusttool.cpp:255
FLOATUNION::Elliptical
float * Elliptical
Definition: cluster.h:59
PROTOTYPE::Weight
FLOATUNION Weight
Definition: cluster.h:78
tesseract::TFile
Definition: serialis.h:75
ReadSampleSize
uint16_t ReadSampleSize(TFile *fp)
Definition: clusttool.cpp:120
mixed
Definition: cluster.h:43
TOKENSIZE
#define TOKENSIZE
max size of tokens read from an input file
Definition: clusttool.cpp:29
PARAM_DESC::Max
float Max
Definition: ocrfeatures.h:45
PROTOSTYLE
PROTOSTYLE
Definition: cluster.h:43
ReadPrototype
PROTOTYPE * ReadPrototype(TFile *fp, uint16_t N)
Definition: clusttool.cpp:176
PARAM_DESC
Definition: ocrfeatures.h:41
WritePrototype
void WritePrototype(FILE *File, uint16_t N, PROTOTYPE *Proto)
Definition: clusttool.cpp:280
PROTOTYPE::Significant
bool Significant
Definition: cluster.h:63
PARAM_DESC::NonEssential
bool NonEssential
Definition: ocrfeatures.h:43
PROTOTYPE::Mean
float * Mean
Definition: cluster.h:73
FLOATUNION::Spherical
float Spherical
Definition: cluster.h:58
PARAM_DESC::HalfRange
float HalfRange
Definition: ocrfeatures.h:47
Efree
void Efree(void *ptr)
Definition: emalloc.cpp:45
tesseract::TFile::FGets
char * FGets(char *buffer, int buffer_size)
Definition: serialis.cpp:262
spherical
Definition: cluster.h:43
PROTOTYPE::Style
unsigned Style
Definition: cluster.h:69
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
PROTOTYPE::Variance
FLOATUNION Variance
Definition: cluster.h:76
automatic
Definition: cluster.h:43
PROTOTYPE::NumSamples
unsigned NumSamples
Definition: cluster.h:70
DISTRIBUTION_COUNT
Definition: cluster.h:55
PROTOTYPE::Cluster
CLUSTER * Cluster
Definition: cluster.h:71
clusttool.h
D_random
Definition: cluster.h:55
PROTOTYPE::Distrib
DISTRIBUTION * Distrib
Definition: cluster.h:72