tesseract  4.0.0-1-g2a2b
intfeaturedist.cpp
Go to the documentation of this file.
1 // Copyright 2011 Google Inc. All Rights Reserved.
2 // Author: rays@google.com (Ray Smith)
4 // File: intfeaturedist.cpp
5 // Description: Fast set-difference-based feature distance calculator.
6 // Created: Thu Sep 01 13:07:30 PDT 2011
7 //
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 #include "intfeaturedist.h"
21 #include "intfeaturemap.h"
22 
23 namespace tesseract {
24 
26  : size_(0), total_feature_weight_(0.0),
27  feature_map_(nullptr), features_(nullptr),
28  features_delta_one_(nullptr), features_delta_two_(nullptr) {
29 }
30 
32  Clear();
33 }
34 
35 // Initialize the table to the given size of feature space.
36 void IntFeatureDist::Init(const IntFeatureMap* feature_map) {
37  size_ = feature_map->sparse_size();
38  Clear();
39  feature_map_ = feature_map;
40  features_ = new bool[size_];
41  features_delta_one_ = new bool[size_];
42  features_delta_two_ = new bool[size_];
43  memset(features_, false, size_ * sizeof(features_[0]));
44  memset(features_delta_one_, false, size_ * sizeof(features_delta_one_[0]));
45  memset(features_delta_two_, false, size_ * sizeof(features_delta_two_[0]));
46  total_feature_weight_ = 0.0;
47 }
48 
49 // Setup the map for the given indexed_features that have been indexed by
50 // feature_map.
51 void IntFeatureDist::Set(const GenericVector<int>& indexed_features,
52  int canonical_count, bool value) {
53  total_feature_weight_ = canonical_count;
54  for (int i = 0; i < indexed_features.size(); ++i) {
55  const int f = indexed_features[i];
56  features_[f] = value;
57  for (int dir = -kNumOffsetMaps; dir <= kNumOffsetMaps; ++dir) {
58  if (dir == 0) continue;
59  const int mapped_f = feature_map_->OffsetFeature(f, dir);
60  if (mapped_f >= 0) {
61  features_delta_one_[mapped_f] = value;
62  for (int dir2 = -kNumOffsetMaps; dir2 <= kNumOffsetMaps; ++dir2) {
63  if (dir2 == 0) continue;
64  const int mapped_f2 = feature_map_->OffsetFeature(mapped_f, dir2);
65  if (mapped_f2 >= 0)
66  features_delta_two_[mapped_f2] = value;
67  }
68  }
69  }
70  }
71 }
72 
73 // Compute the distance between the given feature vector and the last
74 // Set feature vector.
76  const GenericVector<int>& features) const {
77  const int num_test_features = features.size();
78  const double denominator = total_feature_weight_ + num_test_features;
79  double misses = denominator;
80  for (int i = 0; i < num_test_features; ++i) {
81  const int index = features[i];
82  const double weight = 1.0;
83  if (features_[index]) {
84  // A perfect match.
85  misses -= 2.0 * weight;
86  } else if (features_delta_one_[index]) {
87  misses -= 1.5 * weight;
88  } else if (features_delta_two_[index]) {
89  // A near miss.
90  misses -= 1.0 * weight;
91  }
92  }
93  return misses / denominator;
94 }
95 
96 // Compute the distance between the given feature vector and the last
97 // Set feature vector.
99  const GenericVector<int>& features) const {
100  const int num_test_features = features.size();
101  const double denominator = total_feature_weight_ + num_test_features;
102  double misses = denominator;
103  for (int i = 0; i < num_test_features; ++i) {
104  const int index = features[i];
105  const double weight = 1.0;
106  INT_FEATURE_STRUCT f = feature_map_->InverseMapFeature(features[i]);
107  tprintf("Testing feature weight %g:", weight);
108  f.print();
109  if (features_[index]) {
110  // A perfect match.
111  misses -= 2.0 * weight;
112  tprintf("Perfect hit\n");
113  } else if (features_delta_one_[index]) {
114  misses -= 1.5 * weight;
115  tprintf("-1 hit\n");
116  } else if (features_delta_two_[index]) {
117  // A near miss.
118  misses -= 1.0 * weight;
119  tprintf("-2 hit\n");
120  } else {
121  tprintf("Total miss\n");
122  }
123  }
124  tprintf("Features present:");
125  for (int i = 0; i < size_; ++i) {
126  if (features_[i]) {
127  INT_FEATURE_STRUCT f = feature_map_->InverseMapFeature(i);
128  f.print();
129  }
130  }
131  tprintf("\nMinus one features:");
132  for (int i = 0; i < size_; ++i) {
133  if (features_delta_one_[i]) {
134  INT_FEATURE_STRUCT f = feature_map_->InverseMapFeature(i);
135  f.print();
136  }
137  }
138  tprintf("\nMinus two features:");
139  for (int i = 0; i < size_; ++i) {
140  if (features_delta_two_[i]) {
141  INT_FEATURE_STRUCT f = feature_map_->InverseMapFeature(i);
142  f.print();
143  }
144  }
145  tprintf("\n");
146  return misses / denominator;
147 }
148 
149 // Clear all data.
150 void IntFeatureDist::Clear() {
151  delete [] features_;
152  features_ = nullptr;
153  delete [] features_delta_one_;
154  features_delta_one_ = nullptr;
155  delete [] features_delta_two_;
156  features_delta_two_ = nullptr;
157 }
158 
159 } // namespace tesseract
int size() const
Definition: genericvector.h:71
double DebugFeatureDistance(const GenericVector< int > &features) const
void Set(const GenericVector< int > &indexed_features, int canonical_count, bool value)
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
void print() const
Definition: intproto.h:145
double FeatureDistance(const GenericVector< int > &features) const
INT_FEATURE_STRUCT InverseMapFeature(int map_feature) const
void Init(const IntFeatureMap *feature_map)
int OffsetFeature(int index_feature, int dir) const