tesseract  4.0.0-1-g2a2b
bitvector.cpp
Go to the documentation of this file.
1 // Copyright 2011 Google Inc. All Rights Reserved.
2 // Author: rays@google.com (Ray Smith)
4 // File: bitvector.cpp
5 // Description: Class replacement for BITVECTOR.
6 // Author: Ray Smith
7 // Created: Mon Jan 10 17:45:01 PST 2011
8 //
9 // (C) Copyright 2011, Google Inc.
10 // Licensed under the Apache License, Version 2.0 (the "License");
11 // you may not use this file except in compliance with the License.
12 // You may obtain a copy of the License at
13 // http://www.apache.org/licenses/LICENSE-2.0
14 // Unless required by applicable law or agreed to in writing, software
15 // distributed under the License is distributed on an "AS IS" BASIS,
16 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 // See the License for the specific language governing permissions and
18 // limitations under the License.
19 //
21 
22 #include "bitvector.h"
23 #include <algorithm>
24 #include <cstring>
25 #include "helpers.h"
26 #include "serialis.h" // for tesseract::Serialize
27 
28 namespace tesseract {
29 
30 // Fast lookup table to get the first least significant set bit in a byte.
31 // For zero, the table has 255, but since it is a special case, most code
32 // that uses this table will check for zero before looking up lsb_index_.
33 const uint8_t BitVector::lsb_index_[256] = {
34  255, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
35  4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
36  5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
37  4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
38  6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
39  4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
40  5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
41  4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
42  7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
43  4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
44  5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
45  4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
46  6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
47  4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
48  5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
49  4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
50 };
51 
52 // Fast lookup table to get the residual bits after zeroing the first (lowest)
53 // set bit in a byte.
54 const uint8_t BitVector::lsb_eroded_[256] = {
55  0, 0, 0, 0x2, 0, 0x4, 0x4, 0x6,
56  0, 0x8, 0x8, 0x0a, 0x08, 0x0c, 0x0c, 0x0e,
57  0, 0x10, 0x10, 0x12, 0x10, 0x14, 0x14, 0x16,
58  0x10, 0x18, 0x18, 0x1a, 0x18, 0x1c, 0x1c, 0x1e,
59  0, 0x20, 0x20, 0x22, 0x20, 0x24, 0x24, 0x26,
60  0x20, 0x28, 0x28, 0x2a, 0x28, 0x2c, 0x2c, 0x2e,
61  0x20, 0x30, 0x30, 0x32, 0x30, 0x34, 0x34, 0x36,
62  0x30, 0x38, 0x38, 0x3a, 0x38, 0x3c, 0x3c, 0x3e,
63  0, 0x40, 0x40, 0x42, 0x40, 0x44, 0x44, 0x46,
64  0x40, 0x48, 0x48, 0x4a, 0x48, 0x4c, 0x4c, 0x4e,
65  0x40, 0x50, 0x50, 0x52, 0x50, 0x54, 0x54, 0x56,
66  0x50, 0x58, 0x58, 0x5a, 0x58, 0x5c, 0x5c, 0x5e,
67  0x40, 0x60, 0x60, 0x62, 0x60, 0x64, 0x64, 0x66,
68  0x60, 0x68, 0x68, 0x6a, 0x68, 0x6c, 0x6c, 0x6e,
69  0x60, 0x70, 0x70, 0x72, 0x70, 0x74, 0x74, 0x76,
70  0x70, 0x78, 0x78, 0x7a, 0x78, 0x7c, 0x7c, 0x7e,
71  0, 0x80, 0x80, 0x82, 0x80, 0x84, 0x84, 0x86,
72  0x80, 0x88, 0x88, 0x8a, 0x88, 0x8c, 0x8c, 0x8e,
73  0x80, 0x90, 0x90, 0x92, 0x90, 0x94, 0x94, 0x96,
74  0x90, 0x98, 0x98, 0x9a, 0x98, 0x9c, 0x9c, 0x9e,
75  0x80, 0xa0, 0xa0, 0xa2, 0xa0, 0xa4, 0xa4, 0xa6,
76  0xa0, 0xa8, 0xa8, 0xaa, 0xa8, 0xac, 0xac, 0xae,
77  0xa0, 0xb0, 0xb0, 0xb2, 0xb0, 0xb4, 0xb4, 0xb6,
78  0xb0, 0xb8, 0xb8, 0xba, 0xb8, 0xbc, 0xbc, 0xbe,
79  0x80, 0xc0, 0xc0, 0xc2, 0xc0, 0xc4, 0xc4, 0xc6,
80  0xc0, 0xc8, 0xc8, 0xca, 0xc8, 0xcc, 0xcc, 0xce,
81  0xc0, 0xd0, 0xd0, 0xd2, 0xd0, 0xd4, 0xd4, 0xd6,
82  0xd0, 0xd8, 0xd8, 0xda, 0xd8, 0xdc, 0xdc, 0xde,
83  0xc0, 0xe0, 0xe0, 0xe2, 0xe0, 0xe4, 0xe4, 0xe6,
84  0xe0, 0xe8, 0xe8, 0xea, 0xe8, 0xec, 0xec, 0xee,
85  0xe0, 0xf0, 0xf0, 0xf2, 0xf0, 0xf4, 0xf4, 0xf6,
86  0xf0, 0xf8, 0xf8, 0xfa, 0xf8, 0xfc, 0xfc, 0xfe
87 };
88 
89 // Fast lookup table to give the number of set bits in a byte.
90 const int BitVector::hamming_table_[256] = {
91  0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
92  1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
93  1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
94  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
95  1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
96  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
97  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
98  3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
99  1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
100  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
101  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
102  3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
103  2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
104  3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
105  3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
106  4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
107 };
108 
109 
110 BitVector::BitVector() : bit_size_(0), array_(nullptr) {}
111 
112 BitVector::BitVector(int length) : bit_size_(length) {
113  array_ = new uint32_t[WordLength()];
114  SetAllFalse();
115 }
116 
117 BitVector::BitVector(const BitVector& src) : bit_size_(src.bit_size_) {
118  array_ = new uint32_t[WordLength()];
119  memcpy(array_, src.array_, ByteLength());
120 }
121 
123  Alloc(src.bit_size_);
124  memcpy(array_, src.array_, ByteLength());
125  return *this;
126 }
127 
129  delete [] array_;
130 }
131 
132 // Initializes the array to length * false.
133 void BitVector::Init(int length) {
134  Alloc(length);
135  SetAllFalse();
136 }
137 
138 // Writes to the given file. Returns false in case of error.
139 bool BitVector::Serialize(FILE* fp) const {
140  if (!tesseract::Serialize(fp, &bit_size_)) return false;
141  int wordlen = WordLength();
142  return tesseract::Serialize(fp, &array_[0], wordlen);
143 }
144 
145 // Reads from the given file. Returns false in case of error.
146 // If swap is true, assumes a big/little-endian swap is needed.
147 bool BitVector::DeSerialize(bool swap, FILE* fp) {
148  uint32_t new_bit_size;
149  if (!tesseract::DeSerialize(fp, &new_bit_size)) return false;
150  if (swap) {
151  ReverseN(&new_bit_size, sizeof(new_bit_size));
152  }
153  Alloc(new_bit_size);
154  int wordlen = WordLength();
155  if (!tesseract::DeSerialize(fp, &array_[0], wordlen)) return false;
156  if (swap) {
157  for (int i = 0; i < wordlen; ++i)
158  ReverseN(&array_[i], sizeof(array_[i]));
159  }
160  return true;
161 }
162 
164  memset(array_, 0, ByteLength());
165 }
167  memset(array_, ~0, ByteLength());
168 }
169 
170 // Returns the index of the next set bit after the given index.
171 // Useful for quickly iterating through the set bits in a sparse vector.
172 int BitVector::NextSetBit(int prev_bit) const {
173  // Move on to the next bit.
174  int next_bit = prev_bit + 1;
175  if (next_bit >= bit_size_) return -1;
176  // Check the remains of the word containing the next_bit first.
177  int next_word = WordIndex(next_bit);
178  int bit_index = next_word * kBitFactor;
179  int word_end = bit_index + kBitFactor;
180  uint32_t word = array_[next_word];
181  uint8_t byte = word & 0xff;
182  while (bit_index < word_end) {
183  if (bit_index + 8 > next_bit && byte != 0) {
184  while (bit_index + lsb_index_[byte] < next_bit && byte != 0)
185  byte = lsb_eroded_[byte];
186  if (byte != 0)
187  return bit_index + lsb_index_[byte];
188  }
189  word >>= 8;
190  bit_index += 8;
191  byte = word & 0xff;
192  }
193  // next_word didn't contain a 1, so find the next word with set bit.
194  ++next_word;
195  int wordlen = WordLength();
196  while (next_word < wordlen && (word = array_[next_word]) == 0) {
197  ++next_word;
198  bit_index += kBitFactor;
199  }
200  if (bit_index >= bit_size_) return -1;
201  // Find the first non-zero byte within the word.
202  while ((word & 0xff) == 0) {
203  word >>= 8;
204  bit_index += 8;
205  }
206  return bit_index + lsb_index_[word & 0xff];
207 }
208 
209 // Returns the number of set bits in the vector.
211  int wordlen = WordLength();
212  int total_bits = 0;
213  for (int w = 0; w < wordlen; ++w) {
214  uint32_t word = array_[w];
215  for (int i = 0; i < 4; ++i) {
216  total_bits += hamming_table_[word & 0xff];
217  word >>= 8;
218  }
219  }
220  return total_bits;
221 }
222 
223 // Logical in-place operations on whole bit vectors. Tries to do something
224 // sensible if they aren't the same size, but they should be really.
225 void BitVector::operator|=(const BitVector& other) {
226  int length = std::min(WordLength(), other.WordLength());
227  for (int w = 0; w < length; ++w)
228  array_[w] |= other.array_[w];
229 }
230 void BitVector::operator&=(const BitVector& other) {
231  int length = std::min(WordLength(), other.WordLength());
232  for (int w = 0; w < length; ++w)
233  array_[w] &= other.array_[w];
234  for (int w = WordLength() - 1; w >= length; --w)
235  array_[w] = 0;
236 }
237 void BitVector::operator^=(const BitVector& other) {
238  int length = std::min(WordLength(), other.WordLength());
239  for (int w = 0; w < length; ++w)
240  array_[w] ^= other.array_[w];
241 }
242 // Set subtraction *this = v1 - v2.
243 void BitVector::SetSubtract(const BitVector& v1, const BitVector& v2) {
244  Alloc(v1.size());
245  int length = std::min(v1.WordLength(), v2.WordLength());
246  for (int w = 0; w < length; ++w)
247  array_[w] = v1.array_[w] ^ (v1.array_[w] & v2.array_[w]);
248  for (int w = WordLength() - 1; w >= length; --w)
249  array_[w] = v1.array_[w];
250 }
251 
252 // Allocates memory for a vector of the given length.
253 // Reallocates if the array is a different size, larger or smaller.
254 void BitVector::Alloc(int length) {
255  int initial_wordlength = WordLength();
256  bit_size_ = length;
257  int new_wordlength = WordLength();
258  if (new_wordlength != initial_wordlength) {
259  delete [] array_;
260  array_ = new uint32_t[new_wordlength];
261  }
262 }
263 
264 
265 } // namespace tesseract.
BitVector & operator=(const BitVector &src)
Definition: bitvector.cpp:122
void operator &=(const BitVector &other)
void Init(int length)
Definition: bitvector.cpp:133
void operator^=(const BitVector &other)
Definition: bitvector.cpp:237
static const uint8_t lsb_eroded_[256]
Definition: bitvector.h:41
bool Serialize(FILE *fp) const
Definition: bitvector.cpp:139
bool Serialize(FILE *fp, const char *data, size_t n)
Definition: serialis.cpp:59
static const int hamming_table_[256]
Definition: bitvector.h:43
int size() const
Definition: bitvector.h:56
void ReverseN(void *ptr, int num_bytes)
Definition: helpers.h:178
int NextSetBit(int prev_bit) const
Definition: bitvector.cpp:172
bool DeSerialize(bool swap, FILE *fp)
Definition: bitvector.cpp:147
int NumSetBits() const
Definition: bitvector.cpp:210
void SetSubtract(const BitVector &v1, const BitVector &v2)
Definition: bitvector.cpp:243
bool DeSerialize(FILE *fp, char *data, size_t n)
Definition: serialis.cpp:27
static const uint8_t lsb_index_[256]
Definition: bitvector.h:38
void operator|=(const BitVector &other)
Definition: bitvector.cpp:225