tesseract  5.0.0-alpha-619-ge9db
bitvector.cpp
Go to the documentation of this file.
1 // Copyright 2011 Google Inc. All Rights Reserved.
2 // Author: rays@google.com (Ray Smith)
4 // File: bitvector.cpp
5 // Description: Class replacement for BITVECTOR.
6 // Author: Ray Smith
7 // Created: Mon Jan 10 17:45:01 PST 2011
8 //
9 // (C) Copyright 2011, Google Inc.
10 // Licensed under the Apache License, Version 2.0 (the "License");
11 // you may not use this file except in compliance with the License.
12 // You may obtain a copy of the License at
13 // http://www.apache.org/licenses/LICENSE-2.0
14 // Unless required by applicable law or agreed to in writing, software
15 // distributed under the License is distributed on an "AS IS" BASIS,
16 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 // See the License for the specific language governing permissions and
18 // limitations under the License.
19 //
21 
22 #include "bitvector.h"
23 #include <algorithm>
24 #include <cstring>
25 #include <tesseract/helpers.h>
26 #include <tesseract/serialis.h> // for tesseract::Serialize
27 
28 namespace tesseract {
29 
30 // Fast lookup table to get the first least significant set bit in a byte.
31 // For zero, the table has 255, but since it is a special case, most code
32 // that uses this table will check for zero before looking up lsb_index_.
33 const uint8_t BitVector::lsb_index_[256] = {
34  255, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
35  4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
36  5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
37  4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
38  6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
39  4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
40  5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
41  4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
42  7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
43  4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
44  5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
45  4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
46  6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
47  4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
48  5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
49  4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
50 };
51 
52 // Fast lookup table to get the residual bits after zeroing the first (lowest)
53 // set bit in a byte.
54 const uint8_t BitVector::lsb_eroded_[256] = {
55  0, 0, 0, 0x2, 0, 0x4, 0x4, 0x6,
56  0, 0x8, 0x8, 0x0a, 0x08, 0x0c, 0x0c, 0x0e,
57  0, 0x10, 0x10, 0x12, 0x10, 0x14, 0x14, 0x16,
58  0x10, 0x18, 0x18, 0x1a, 0x18, 0x1c, 0x1c, 0x1e,
59  0, 0x20, 0x20, 0x22, 0x20, 0x24, 0x24, 0x26,
60  0x20, 0x28, 0x28, 0x2a, 0x28, 0x2c, 0x2c, 0x2e,
61  0x20, 0x30, 0x30, 0x32, 0x30, 0x34, 0x34, 0x36,
62  0x30, 0x38, 0x38, 0x3a, 0x38, 0x3c, 0x3c, 0x3e,
63  0, 0x40, 0x40, 0x42, 0x40, 0x44, 0x44, 0x46,
64  0x40, 0x48, 0x48, 0x4a, 0x48, 0x4c, 0x4c, 0x4e,
65  0x40, 0x50, 0x50, 0x52, 0x50, 0x54, 0x54, 0x56,
66  0x50, 0x58, 0x58, 0x5a, 0x58, 0x5c, 0x5c, 0x5e,
67  0x40, 0x60, 0x60, 0x62, 0x60, 0x64, 0x64, 0x66,
68  0x60, 0x68, 0x68, 0x6a, 0x68, 0x6c, 0x6c, 0x6e,
69  0x60, 0x70, 0x70, 0x72, 0x70, 0x74, 0x74, 0x76,
70  0x70, 0x78, 0x78, 0x7a, 0x78, 0x7c, 0x7c, 0x7e,
71  0, 0x80, 0x80, 0x82, 0x80, 0x84, 0x84, 0x86,
72  0x80, 0x88, 0x88, 0x8a, 0x88, 0x8c, 0x8c, 0x8e,
73  0x80, 0x90, 0x90, 0x92, 0x90, 0x94, 0x94, 0x96,
74  0x90, 0x98, 0x98, 0x9a, 0x98, 0x9c, 0x9c, 0x9e,
75  0x80, 0xa0, 0xa0, 0xa2, 0xa0, 0xa4, 0xa4, 0xa6,
76  0xa0, 0xa8, 0xa8, 0xaa, 0xa8, 0xac, 0xac, 0xae,
77  0xa0, 0xb0, 0xb0, 0xb2, 0xb0, 0xb4, 0xb4, 0xb6,
78  0xb0, 0xb8, 0xb8, 0xba, 0xb8, 0xbc, 0xbc, 0xbe,
79  0x80, 0xc0, 0xc0, 0xc2, 0xc0, 0xc4, 0xc4, 0xc6,
80  0xc0, 0xc8, 0xc8, 0xca, 0xc8, 0xcc, 0xcc, 0xce,
81  0xc0, 0xd0, 0xd0, 0xd2, 0xd0, 0xd4, 0xd4, 0xd6,
82  0xd0, 0xd8, 0xd8, 0xda, 0xd8, 0xdc, 0xdc, 0xde,
83  0xc0, 0xe0, 0xe0, 0xe2, 0xe0, 0xe4, 0xe4, 0xe6,
84  0xe0, 0xe8, 0xe8, 0xea, 0xe8, 0xec, 0xec, 0xee,
85  0xe0, 0xf0, 0xf0, 0xf2, 0xf0, 0xf4, 0xf4, 0xf6,
86  0xf0, 0xf8, 0xf8, 0xfa, 0xf8, 0xfc, 0xfc, 0xfe
87 };
88 
89 // Fast lookup table to give the number of set bits in a byte.
90 const int BitVector::hamming_table_[256] = {
91  0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
92  1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
93  1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
94  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
95  1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
96  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
97  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
98  3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
99  1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
100  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
101  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
102  3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
103  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
104  3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
105  3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
106  4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
107 };
108 
109 
110 BitVector::BitVector() : bit_size_(0), array_(nullptr) {}
111 
112 BitVector::BitVector(int length) : bit_size_(length) {
113  array_ = new uint32_t[WordLength()];
114  SetAllFalse();
115 }
116 
117 BitVector::BitVector(const BitVector& src) : bit_size_(src.bit_size_) {
118  if (src.bit_size_ > 0) {
119  array_ = new uint32_t[WordLength()];
120  memcpy(array_, src.array_, ByteLength());
121  } else {
122  array_ = nullptr;
123  }
124 }
125 
127  Alloc(src.bit_size_);
128  if (src.bit_size_ > 0) {
129  memcpy(array_, src.array_, ByteLength());
130  }
131  return *this;
132 }
133 
135  delete [] array_;
136 }
137 
138 // Initializes the array to length * false.
139 void BitVector::Init(int length) {
140  Alloc(length);
141  SetAllFalse();
142 }
143 
144 // Writes to the given file. Returns false in case of error.
145 bool BitVector::Serialize(FILE* fp) const {
146  if (!tesseract::Serialize(fp, &bit_size_)) return false;
147  int wordlen = WordLength();
148  return tesseract::Serialize(fp, &array_[0], wordlen);
149 }
150 
151 // Reads from the given file. Returns false in case of error.
152 // If swap is true, assumes a big/little-endian swap is needed.
153 bool BitVector::DeSerialize(bool swap, FILE* fp) {
154  uint32_t new_bit_size;
155  if (!tesseract::DeSerialize(fp, &new_bit_size)) return false;
156  if (swap) {
157  ReverseN(&new_bit_size, sizeof(new_bit_size));
158  }
159  Alloc(new_bit_size);
160  int wordlen = WordLength();
161  if (!tesseract::DeSerialize(fp, &array_[0], wordlen)) return false;
162  if (swap) {
163  for (int i = 0; i < wordlen; ++i)
164  ReverseN(&array_[i], sizeof(array_[i]));
165  }
166  return true;
167 }
168 
170  memset(array_, 0, ByteLength());
171 }
173  memset(array_, ~0, ByteLength());
174 }
175 
176 // Returns the index of the next set bit after the given index.
177 // Useful for quickly iterating through the set bits in a sparse vector.
178 int BitVector::NextSetBit(int prev_bit) const {
179  // Move on to the next bit.
180  int next_bit = prev_bit + 1;
181  if (next_bit >= bit_size_) return -1;
182  // Check the remains of the word containing the next_bit first.
183  int next_word = WordIndex(next_bit);
184  int bit_index = next_word * kBitFactor;
185  int word_end = bit_index + kBitFactor;
186  uint32_t word = array_[next_word];
187  uint8_t byte = word & 0xff;
188  while (bit_index < word_end) {
189  if (bit_index + 8 > next_bit && byte != 0) {
190  while (bit_index + lsb_index_[byte] < next_bit && byte != 0)
191  byte = lsb_eroded_[byte];
192  if (byte != 0)
193  return bit_index + lsb_index_[byte];
194  }
195  word >>= 8;
196  bit_index += 8;
197  byte = word & 0xff;
198  }
199  // next_word didn't contain a 1, so find the next word with set bit.
200  ++next_word;
201  int wordlen = WordLength();
202  while (next_word < wordlen && (word = array_[next_word]) == 0) {
203  ++next_word;
204  bit_index += kBitFactor;
205  }
206  if (bit_index >= bit_size_) return -1;
207  // Find the first non-zero byte within the word.
208  while ((word & 0xff) == 0) {
209  word >>= 8;
210  bit_index += 8;
211  }
212  return bit_index + lsb_index_[word & 0xff];
213 }
214 
215 // Returns the number of set bits in the vector.
217  int wordlen = WordLength();
218  int total_bits = 0;
219  for (int w = 0; w < wordlen; ++w) {
220  uint32_t word = array_[w];
221  for (int i = 0; i < 4; ++i) {
222  total_bits += hamming_table_[word & 0xff];
223  word >>= 8;
224  }
225  }
226  return total_bits;
227 }
228 
229 // Logical in-place operations on whole bit vectors. Tries to do something
230 // sensible if they aren't the same size, but they should be really.
231 void BitVector::operator|=(const BitVector& other) {
232  int length = std::min(WordLength(), other.WordLength());
233  for (int w = 0; w < length; ++w)
234  array_[w] |= other.array_[w];
235 }
236 void BitVector::operator&=(const BitVector& other) {
237  int length = std::min(WordLength(), other.WordLength());
238  for (int w = 0; w < length; ++w)
239  array_[w] &= other.array_[w];
240  for (int w = WordLength() - 1; w >= length; --w)
241  array_[w] = 0;
242 }
243 void BitVector::operator^=(const BitVector& other) {
244  int length = std::min(WordLength(), other.WordLength());
245  for (int w = 0; w < length; ++w)
246  array_[w] ^= other.array_[w];
247 }
248 // Set subtraction *this = v1 - v2.
249 void BitVector::SetSubtract(const BitVector& v1, const BitVector& v2) {
250  Alloc(v1.size());
251  int length = std::min(v1.WordLength(), v2.WordLength());
252  for (int w = 0; w < length; ++w)
253  array_[w] = v1.array_[w] ^ (v1.array_[w] & v2.array_[w]);
254  for (int w = WordLength() - 1; w >= length; --w)
255  array_[w] = v1.array_[w];
256 }
257 
258 // Allocates memory for a vector of the given length.
259 // Reallocates if the array is a different size, larger or smaller.
260 void BitVector::Alloc(int length) {
261  int initial_wordlength = WordLength();
262  bit_size_ = length;
263  int new_wordlength = WordLength();
264  if (new_wordlength != initial_wordlength) {
265  delete [] array_;
266  array_ = new uint32_t[new_wordlength];
267  }
268 }
269 
270 
271 } // namespace tesseract.
tesseract::BitVector::SetAllFalse
void SetAllFalse()
Definition: bitvector.cpp:169
tesseract::BitVector::hamming_table_
static const int hamming_table_[256]
Definition: bitvector.h:40
tesseract::BitVector::NumSetBits
int NumSetBits() const
Definition: bitvector.cpp:216
tesseract::BitVector::Init
void Init(int length)
Definition: bitvector.cpp:139
tesseract::BitVector::NextSetBit
int NextSetBit(int prev_bit) const
Definition: bitvector.cpp:178
tesseract::BitVector::operator&=
void operator&=(const BitVector &other)
Definition: bitvector.cpp:236
tesseract::BitVector::Serialize
bool Serialize(FILE *fp) const
Definition: bitvector.cpp:145
tesseract::BitVector::operator^=
void operator^=(const BitVector &other)
Definition: bitvector.cpp:243
tesseract::BitVector::operator=
BitVector & operator=(const BitVector &src)
Definition: bitvector.cpp:126
tesseract::BitVector::operator|=
void operator|=(const BitVector &other)
Definition: bitvector.cpp:231
helpers.h
tesseract
Definition: baseapi.h:65
tesseract::BitVector::SetSubtract
void SetSubtract(const BitVector &v1, const BitVector &v2)
Definition: bitvector.cpp:249
bitvector.h
tesseract::BitVector
Definition: bitvector.h:30
tesseract::BitVector::lsb_eroded_
static const uint8_t lsb_eroded_[256]
Definition: bitvector.h:38
tesseract::BitVector::size
int size() const
Definition: bitvector.h:53
tesseract::BitVector::DeSerialize
bool DeSerialize(bool swap, FILE *fp)
Definition: bitvector.cpp:153
tesseract::BitVector::lsb_index_
static const uint8_t lsb_index_[256]
Definition: bitvector.h:35
serialis.h
ReverseN
void ReverseN(void *ptr, int num_bytes)
Definition: helpers.h:183
tesseract::DeSerialize
bool DeSerialize(FILE *fp, char *data, size_t n=1)
Definition: serialis.cpp:41
tesseract::BitVector::~BitVector
~BitVector()
Definition: bitvector.cpp:134
tesseract::BitVector::BitVector
BitVector()
Definition: bitvector.cpp:110
tesseract::Serialize
bool Serialize(FILE *fp, const char *data, size_t n=1)
Definition: serialis.cpp:73
tesseract::BitVector::SetAllTrue
void SetAllTrue()
Definition: bitvector.cpp:172