Main Page
Related Pages
Modules
Namespaces
Classes
Files
File List
File Members
All
Classes
Namespaces
Files
Functions
Variables
Typedefs
Enumerations
Enumerator
Friends
Macros
Modules
Pages
char_bigrams.h
Go to the documentation of this file.
1
/**********************************************************************
2
* File: char_bigrams.h
3
* Description: Declaration of a Character Bigrams Class
4
* Author: Ahmad Abdulkader
5
* Created: 2007
6
*
7
* (C) Copyright 2008, Google Inc.
8
** Licensed under the Apache License, Version 2.0 (the "License");
9
** you may not use this file except in compliance with the License.
10
** You may obtain a copy of the License at
11
** http://www.apache.org/licenses/LICENSE-2.0
12
** Unless required by applicable law or agreed to in writing, software
13
** distributed under the License is distributed on an "AS IS" BASIS,
14
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
** See the License for the specific language governing permissions and
16
** limitations under the License.
17
*
18
**********************************************************************/
19
20
// The CharBigrams class represents the interface to the character bigram
21
// table used by Cube
22
// A CharBigrams object can be constructed from the Char Bigrams file
23
// Given a sequence of characters, the "Cost" method returns the Char Bigram
24
// cost of the string according to the table
25
26
#ifndef CHAR_BIGRAMS_H
27
#define CHAR_BIGRAMS_H
28
29
#include <string>
30
#include "
char_set.h
"
31
32
namespace
tesseract
{
33
34
// One entry of the bigram table: the value stored for a single character pair.
struct Bigram {
  // NOTE(review): presumably the number of times the pair was observed;
  // confirm against the table loader in char_bigrams.cpp.
  int cnt;
  // Cost associated with the pair.
  int cost;
};
39
40
// structure representing the char bigram array of characters
41
// following a specific character
42
struct
CharBigram
{
43
int
total_cnt
;
44
char_32
max_char
;
45
Bigram
*
bigram
;
46
};
47
48
// structure representing the whole bigram table
49
struct
CharBigramTable
{
50
int
total_cnt
;
51
int
worst_cost
;
52
char_32
max_char
;
53
CharBigram
*
char_bigram
;
54
};
55
56
class
CharBigrams
{
57
public
:
58
CharBigrams
();
59
~CharBigrams
();
60
// Construct the CharBigrams class from a file
61
static
CharBigrams
*
Create
(
const
string
&data_file_path,
62
const
string
&
lang
);
63
// Top-level function to return the mean character bigram cost of a
64
// sequence of characters. If char_set is not NULL, use
65
// tesseract functions to return a case-invariant cost.
66
// This avoids unnecessarily penalizing all-one-case words or
67
// capitalized words (first-letter upper-case and remaining letters
68
// lower-case).
69
int
Cost
(
const
char_32
*str,
CharSet
*char_set)
const
;
70
71
protected
:
72
// Returns the character bigram cost of two characters.
73
int
PairCost
(
char_32
ch1,
char_32
ch2)
const
;
74
// Returns the mean character bigram cost of a sequence of
75
// characters. Adds a space at the beginning and end to account for
76
// cost of starting and ending characters.
77
int
MeanCostWithSpaces
(
const
char_32
*char_32_ptr)
const
;
78
79
private
:
80
// Only words this length or greater qualify for case-invariant character
81
// bigram cost.
82
static
const
int
kMinLengthCaseInvariant = 4;
83
84
85
CharBigramTable
bigram_table_;
86
};
87
}
88
89
#endif // CHAR_BIGRAMS_H
tesseract::CharBigramTable::max_char
char_32 max_char
Definition:
char_bigrams.h:52
tesseract::Bigram::cnt
int cnt
Definition:
char_bigrams.h:36
tesseract::CharBigram::bigram
Bigram * bigram
Definition:
char_bigrams.h:45
tesseract::CharBigrams::Create
static CharBigrams * Create(const string &data_file_path, const string &lang)
Definition:
char_bigrams.cpp:49
tesseract::CharBigramTable::total_cnt
int total_cnt
Definition:
char_bigrams.h:50
tesseract::CharBigrams::Cost
int Cost(const char_32 *str, CharSet *char_set) const
Definition:
char_bigrams.cpp:171
tesseract::CharBigramTable::worst_cost
int worst_cost
Definition:
char_bigrams.h:51
tesseract::CharBigram::max_char
char_32 max_char
Definition:
char_bigrams.h:44
tesseract::CharBigram
Definition:
char_bigrams.h:42
tesseract::CharBigram::total_cnt
int total_cnt
Definition:
char_bigrams.h:43
tesseract::CharBigramTable::char_bigram
CharBigram * char_bigram
Definition:
char_bigrams.h:53
tesseract::CharSet
Definition:
char_set.h:42
tesseract::CharBigrams::PairCost
int PairCost(char_32 ch1, char_32 ch2) const
Definition:
char_bigrams.cpp:161
tesseract::Bigram::cost
int cost
Definition:
char_bigrams.h:37
char_set.h
tesseract::CharBigrams::~CharBigrams
~CharBigrams()
Definition:
char_bigrams.cpp:36
tesseract
Definition:
baseapi.cpp:83
tesseract::Bigram
Definition:
char_bigrams.h:35
tesseract-c_api-demo.lang
string lang
Definition:
tesseract-c_api-demo.py:28
tesseract::CharBigrams::MeanCostWithSpaces
int MeanCostWithSpaces(const char_32 *char_32_ptr) const
Definition:
char_bigrams.cpp:194
tesseract::CharBigramTable
Definition:
char_bigrams.h:49
tesseract::char_32
signed int char_32
Definition:
string_32.h:40
tesseract::CharBigrams::CharBigrams
CharBigrams()
Definition:
char_bigrams.cpp:32
tesseract::CharBigrams
Definition:
char_bigrams.h:56
cube
char_bigrams.h
Generated on Mon Jul 20 2015 18:37:53 by
1.8.8