tesseract  4.0.0-1-g2a2b
context.cpp
Go to the documentation of this file.
1 /* -*-C-*-
2  ********************************************************************************
3  *
4  * File: context.cpp (Formerly context.c)
5  * Description: Context checking functions
6  * Author: Mark Seaman, OCR Technology
7  * Created: Thu Feb 15 11:18:24 1990
8  * Modified: Tue Jul 9 17:38:16 1991 (Mark Seaman) marks@hpgrlt
9  * Language: C
10  * Package: N/A
11  * Status: Experimental (Do Not Distribute)
12  *
13  * (c) Copyright 1990, Hewlett-Packard Company.
14  ** Licensed under the Apache License, Version 2.0 (the "License");
15  ** you may not use this file except in compliance with the License.
16  ** You may obtain a copy of the License at
17  ** http://www.apache.org/licenses/LICENSE-2.0
18  ** Unless required by applicable law or agreed to in writing, software
19  ** distributed under the License is distributed on an "AS IS" BASIS,
20  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21  ** See the License for the specific language governing permissions and
22  ** limitations under the License.
23  *
24  *********************************************************************************/
25 
26 #include "dict.h"
27 #include "tprintf.h"
28 #include "unicharset.h"
29 
30 namespace tesseract {
31 
// Minimum word length (in unichars) for absolute_garbage() to consider a
// word at all; shorter words are never reported as garbage.
static const int kMinAbsoluteGarbageWordLength = 10;
// absolute_garbage() reports a long-enough word as garbage when the fraction
// of its unichars that are alphabetic or digits is strictly below this value.
static const float kMinAbsoluteGarbageAlphanumFrac = 0.5f;
34 
// State-transition table consulted by Dict::case_ok() to validate the
// capitalization pattern of a word.
//
// Rows are indexed by the current state. Columns are indexed by the class of
// the next unichar, as selected in Dict::case_ok():
//   column 0 (P): not upper case, not lower case and not a digit
//   column 1 (U): upper case
//   column 2 (L): lower case
//   column 3 (D): digit
// Each entry is the next state; -1 means "error on case".
const int case_state_table[6][4] = {
    /*                                  P   U   L   D */
    /* 0. Beginning of word        */ {0, 1, 5, 4},
    /* 1. After initial capital    */ {0, 3, 2, 4},
    /* 2. After lower case         */ {0, -1, 2, -1},
    /* 3. After upper case         */ {0, 3, -1, 4},
    /* 4. After a digit            */ {0, -1, -1, 4},
    /* 5. After initial lower case */ {5, -1, 2, -1},
};
51 
52 int Dict::case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset) const {
53  int state = 0;
54  int x;
55  for (x = 0; x < word.length(); ++x) {
56  UNICHAR_ID ch_id = word.unichar_id(x);
57  if (unicharset.get_isupper(ch_id))
58  state = case_state_table[state][1];
59  else if (unicharset.get_islower(ch_id))
60  state = case_state_table[state][2];
61  else if (unicharset.get_isdigit(ch_id))
62  state = case_state_table[state][3];
63  else
64  state = case_state_table[state][0];
65  if (state == -1) return false;
66  }
67  return state != 5; // single lower is bad
68 }
69 
71  const UNICHARSET &unicharset) {
72  if (word.length() < kMinAbsoluteGarbageWordLength) return false;
73  int num_alphanum = 0;
74  for (int x = 0; x < word.length(); ++x) {
75  num_alphanum += (unicharset.get_isalpha(word.unichar_id(x)) ||
76  unicharset.get_isdigit(word.unichar_id(x)));
77  }
78  return (static_cast<float>(num_alphanum) /
79  static_cast<float>(word.length()) < kMinAbsoluteGarbageAlphanumFrac);
80 }
81 
82 } // namespace tesseract
int case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset) const
Check a string to see if it matches a set of lexical rules.
Definition: context.cpp:52
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:493
int UNICHAR_ID
Definition: unichar.h:35
const int case_state_table[6][4]
Definition: context.cpp:35
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:486
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:507
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:315
bool absolute_garbage(const WERD_CHOICE &word, const UNICHARSET &unicharset)
Definition: context.cpp:70
int length() const
Definition: ratngs.h:303
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:500