tesseract  5.0.0-alpha-619-ge9db
unicharmap.cpp
Go to the documentation of this file.
1 // File: unicharmap.cpp
3 // Description: Unicode character/ligature to integer id class.
4 // Author: Thomas Kielbus
5 //
6 // (C) Copyright 2006, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
18 
19 #include <cassert>
20 #include <tesseract/unichar.h>
21 #include "unicharmap.h"
22 
24 nodes(nullptr) {
25 }
26 
28  delete[] nodes;
29 }
30 
31 // Search the given unichar representation in the tree, using length characters
32 // from it maximum. Each character in the string is interpreted as an index in
33 // an array of nodes.
34 UNICHAR_ID UNICHARMAP::unichar_to_id(const char* const unichar_repr,
35  int length) const {
36  UNICHARMAP_NODE* current_nodes = nodes;
37 
38  assert(*unichar_repr != '\0');
39  assert(length > 0 && length <= UNICHAR_LEN);
40 
41  int index = 0;
42  if (length <= 0 || unichar_repr[index] == '\0') return INVALID_UNICHAR_ID;
43  do {
44  if (index + 1 >= length || unichar_repr[index + 1] == '\0')
45  return current_nodes[static_cast<unsigned char>(unichar_repr[index])].id;
46  current_nodes =
47  current_nodes[static_cast<unsigned char>(unichar_repr[index])].children;
48  ++index;
49  } while (true);
50 }
51 
52 // Search the given unichar representation in the tree, creating the possibly
53 // missing nodes. Once the right place has been found, insert the given id and
54 // update the inserted flag to keep track of the insert. Each character in the
55 // string is interpreted as an index in an array of nodes.
56 void UNICHARMAP::insert(const char* const unichar_repr, UNICHAR_ID id) {
57  const char* current_char = unichar_repr;
58  if (*current_char == '\0') return;
59  UNICHARMAP_NODE** current_nodes_pointer = &nodes;
60  do {
61  if (*current_nodes_pointer == nullptr)
62  *current_nodes_pointer = new UNICHARMAP_NODE[256];
63  if (current_char[1] == '\0') {
64  (*current_nodes_pointer)
65  [static_cast<unsigned char>(*current_char)].id = id;
66  return;
67  }
68  current_nodes_pointer =
69  &((*current_nodes_pointer)
70  [static_cast<unsigned char>(*current_char)].children);
71  ++current_char;
72  } while (true);
73 }
74 
75 // Search the given unichar representation in the tree, using length characters
76 // from it maximum. Each character in the string is interpreted as an index in
77 // an array of nodes. Stop once the tree does not have anymore nodes or once we
78 // found the right unichar_repr.
79 bool UNICHARMAP::contains(const char* const unichar_repr,
80  int length) const {
81  if (unichar_repr == nullptr || *unichar_repr == '\0') return false;
82  if (length <= 0 || length > UNICHAR_LEN) return false;
83  int index = 0;
84  if (unichar_repr[index] == '\0') return false;
85  UNICHARMAP_NODE* current_nodes = nodes;
86 
87  while (current_nodes != nullptr && index + 1 < length &&
88  unichar_repr[index + 1] != '\0') {
89  current_nodes =
90  current_nodes[static_cast<unsigned char>(unichar_repr[index])].children;
91  ++index;
92  }
93  return current_nodes != nullptr &&
94  (index + 1 >= length || unichar_repr[index + 1] == '\0') &&
95  current_nodes[static_cast<unsigned char>(unichar_repr[index])].id >= 0;
96 }
97 
98 // Return the minimum number of characters that must be used from this string
99 // to obtain a match in the UNICHARMAP.
100 int UNICHARMAP::minmatch(const char* const unichar_repr) const {
101  const char* current_char = unichar_repr;
102  if (*current_char == '\0') return 0;
103  UNICHARMAP_NODE* current_nodes = nodes;
104 
105  while (current_nodes != nullptr && *current_char != '\0') {
106  if (current_nodes[static_cast<unsigned char>(*current_char)].id >= 0)
107  return current_char + 1 - unichar_repr;
108  current_nodes =
109  current_nodes[static_cast<unsigned char>(*current_char)].children;
110  ++current_char;
111  }
112  return 0;
113 }
114 
116  delete[] nodes;
117  nodes = nullptr;
118 }
119 
120 UNICHARMAP::UNICHARMAP_NODE::UNICHARMAP_NODE() :
121 children(nullptr),
122 id(-1) {
123 }
124 
125 // Recursively delete the children
126 UNICHARMAP::UNICHARMAP_NODE::~UNICHARMAP_NODE() {
127  delete[] children;
128 }
unicharmap.h
UNICHARMAP::contains
bool contains(const char *const unichar_repr, int length) const
Definition: unicharmap.cpp:79
UNICHARMAP::clear
void clear()
Definition: unicharmap.cpp:115
UNICHARMAP::unichar_to_id
UNICHAR_ID unichar_to_id(const char *const unichar_repr, int length) const
Definition: unicharmap.cpp:34
UNICHARMAP::minmatch
int minmatch(const char *const unichar_repr) const
Definition: unicharmap.cpp:100
UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:36
UNICHAR_LEN
#define UNICHAR_LEN
Definition: unichar.h:32
UNICHARMAP::UNICHARMAP
UNICHARMAP()
Definition: unicharmap.cpp:23
UNICHARMAP::~UNICHARMAP
~UNICHARMAP()
Definition: unicharmap.cpp:27
unichar.h
UNICHARMAP::insert
void insert(const char *const unichar_repr, UNICHAR_ID id)
Definition: unicharmap.cpp:56