tesseract  5.0.0-alpha-619-ge9db
blobclass.cpp
Go to the documentation of this file.
1 /******************************************************************************
2  ** Filename: blobclass.cpp
3  ** Purpose: High level blob classification and training routines.
4  ** Author: Dan Johnson
5  **
6  ** (c) Copyright Hewlett-Packard Company, 1988.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  ******************************************************************************/
17 
21 #include "blobclass.h"
22 
23 #include <cstdio>
24 
25 #include "classify.h"
26 #include "featdefs.h"
27 #include "mf.h"
28 #include "normfeat.h"
29 
// Placeholder font name. It is the default value of classify_font_name, and
// ExtractFontName() only parses the filename while that value is unchanged.
30 static const char kUnknownFontName[] = "UnknownFont";
31 
// Parameter holding the font name used in training; defaults to
// kUnknownFontName. ExtractFontName() returns any other value unchanged.
32 static STRING_VAR(classify_font_name, kUnknownFontName,
33  "Default font name to be used in training");
34 
35 namespace tesseract {
40 // Finds the name of the training font and returns it in fontname, by cutting
41 // it out based on the expectation that the filename is of the form:
42 // /path/to/dir/[lang].[fontname].exp[num]
43 // The [lang], [fontname] and [num] fields should not have '.' characters.
44 // If the global parameter classify_font_name is set, its value is used instead.
45 void ExtractFontName(const STRING& filename, STRING* fontname) {
46  *fontname = classify_font_name;
47  if (*fontname == kUnknownFontName) {
48  // filename is expected to be of the form [lang].[fontname].exp[num]
49  // The [lang], [fontname] and [num] fields should not have '.' characters.
50  const char *basename = strrchr(filename.c_str(), '/');
51  const char *firstdot = strchr(basename ? basename : filename.c_str(), '.');
52  const char *lastdot = strrchr(filename.c_str(), '.');
53  if (firstdot != lastdot && firstdot != nullptr && lastdot != nullptr) {
54  ++firstdot;
55  *fontname = firstdot;
56  fontname->truncate_at(lastdot - firstdot);
57  }
58  }
59 }
60 
61 
62 /*---------------------------------------------------------------------------*/
63 
64 // Extracts features from the given blob and saves them in the tr_file_data_
65 // member variable.
66 // fontname: Name of font that this blob was printed in.
67 // cn_denorm: Character normalization transformation to apply to the blob.
68 // fx_info: Character normalization parameters computed with cn_denorm.
69 // blob_text: Ground truth text for the blob.
70 void Classify::LearnBlob(const STRING& fontname, TBLOB* blob,
71  const DENORM& cn_denorm,
72  const INT_FX_RESULT_STRUCT& fx_info,
73  const char* blob_text) {
75  CharDesc->FeatureSets[0] = ExtractMicros(blob, cn_denorm);
76  CharDesc->FeatureSets[1] = ExtractCharNormFeatures(fx_info);
77  CharDesc->FeatureSets[2] = ExtractIntCNFeatures(*blob, fx_info);
78  CharDesc->FeatureSets[3] = ExtractIntGeoFeatures(*blob, fx_info);
79 
80  if (ValidCharDescription(feature_defs_, CharDesc)) {
81  // Label the features with a class name and font name.
82  tr_file_data_ += "\n";
83  tr_file_data_ += fontname;
84  tr_file_data_ += " ";
85  tr_file_data_ += blob_text;
86  tr_file_data_ += "\n";
87 
88  // write micro-features to file and clean up
89  WriteCharDescription(feature_defs_, CharDesc, &tr_file_data_);
90  } else {
91  tprintf("Blob learned was invalid!\n");
92  }
93  FreeCharDescription(CharDesc);
94 } // LearnBlob
95 
96 // Writes stored training data to a .tr file based on the given filename.
97 // Returns false on error.
98 bool Classify::WriteTRFile(const STRING& filename) {
99  bool result = false;
100  STRING tr_filename = filename + ".tr";
101  FILE* fp = fopen(tr_filename.c_str(), "wb");
102  if (fp) {
103  result =
104  tesseract::Serialize(fp, &tr_file_data_[0], tr_file_data_.length());
105  fclose(fp);
106  }
107  tr_file_data_.truncate_at(0);
108  return result;
109 }
110 
111 } // namespace tesseract.
ExtractMicros
FEATURE_SET ExtractMicros(TBLOB *Blob, const DENORM &cn_denorm)
Definition: mf.cpp:43
INT_FX_RESULT_STRUCT
Definition: intfx.h:34
mf.h
CHAR_DESC_STRUCT::FeatureSets
FEATURE_SET FeatureSets[NUM_FEATURE_TYPES]
Definition: featdefs.h:40
STRING
Definition: strngs.h:45
STRING::truncate_at
void truncate_at(int32_t index)
Definition: strngs.cpp:258
STRING_VAR
#define STRING_VAR(name, val, comment)
Definition: params.h:306
tesseract::Classify::ExtractIntGeoFeatures
FEATURE_SET ExtractIntGeoFeatures(const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
Definition: picofeat.cpp:246
tesseract::ExtractFontName
void ExtractFontName(const STRING &filename, STRING *fontname)
Definition: blobclass.cpp:45
WriteCharDescription
void WriteCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs, CHAR_DESC CharDesc, STRING *str)
Definition: featdefs.cpp:173
STRING::c_str
const char * c_str() const
Definition: strngs.cpp:192
normfeat.h
tesseract::Classify::feature_defs_
FEATURE_DEFS_STRUCT feature_defs_
Definition: classify.h:541
tesseract
Definition: baseapi.h:65
tesseract::Classify::ExtractIntCNFeatures
FEATURE_SET ExtractIntCNFeatures(const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
Definition: picofeat.cpp:216
tesseract::Classify::WriteTRFile
bool WriteTRFile(const STRING &filename)
Definition: blobclass.cpp:98
ExtractCharNormFeatures
FEATURE_SET ExtractCharNormFeatures(const INT_FX_RESULT_STRUCT &fx_info)
Definition: normfeat.cpp:60
CHAR_DESC_STRUCT
Definition: featdefs.h:38
STRING::length
int32_t length() const
Definition: strngs.cpp:187
tesseract::Classify::LearnBlob
void LearnBlob(const STRING &fontname, TBLOB *Blob, const DENORM &cn_denorm, const INT_FX_RESULT_STRUCT &fx_info, const char *blob_text)
Definition: blobclass.cpp:70
blobclass.h
TBLOB
Definition: blobs.h:282
featdefs.h
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:34
ValidCharDescription
bool ValidCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs, CHAR_DESC CharDesc)
Definition: featdefs.cpp:194
NewCharDescription
CHAR_DESC NewCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs)
Definition: featdefs.cpp:147
classify.h
tesseract::Serialize
bool Serialize(FILE *fp, const char *data, size_t n=1)
Definition: serialis.cpp:73
FreeCharDescription
void FreeCharDescription(CHAR_DESC CharDesc)
Definition: featdefs.cpp:128
DENORM
Definition: normalis.h:49