tesseract  5.0.0-alpha-619-ge9db
tesseract::SampleIterator Class Reference

#include <sampleiterator.h>

Public Member Functions

 SampleIterator ()
 
 ~SampleIterator ()
 
void Clear ()
 
void Init (const IndexMapBiDi *charset_map, const ShapeTable *shape_table, bool randomize, TrainingSampleSet *sample_set)
 
void Begin ()
 
bool AtEnd () const
 
const TrainingSample & GetSample () const
 
TrainingSample * MutableSample () const
 
int GlobalSampleIndex () const
 
int GetCompactClassID () const
 
int GetSparseClassID () const
 
void Next ()
 
int CompactCharsetSize () const
 
int SparseCharsetSize () const
 
const IndexMapBiDi & charset_map () const
 
const ShapeTable * shape_table () const
 
const TrainingSampleSet * sample_set () const
 
void MapSampleFeatures (const IntFeatureMap &feature_map)
 
int UniformSamples ()
 
double NormalizeSamples ()
 

Detailed Description

Definition at line 92 of file sampleiterator.h.

Constructor & Destructor Documentation

◆ SampleIterator()

tesseract::SampleIterator::SampleIterator ( )

Definition at line 27 of file sampleiterator.cpp.

28  : charset_map_(nullptr),
29  shape_table_(nullptr),
30  sample_set_(nullptr),
31  randomize_(false),
32  owned_shape_table_(nullptr) {
33  num_shapes_ = 0;
34  Begin();
35 }

◆ ~SampleIterator()

tesseract::SampleIterator::~SampleIterator ( )

Definition at line 37 of file sampleiterator.cpp.

37  {
38  Clear();
39 }

Member Function Documentation

◆ AtEnd()

bool tesseract::SampleIterator::AtEnd ( ) const

Definition at line 99 of file sampleiterator.cpp.

99  {
100  return shape_index_ >= num_shapes_;
101 }

◆ Begin()

void tesseract::SampleIterator::Begin ( )

Definition at line 87 of file sampleiterator.cpp.

87  {
88  shape_index_ = -1;
89  shape_char_index_ = 0;
90  num_shape_chars_ = 0;
91  shape_font_index_ = 0;
92  num_shape_fonts_ = 0;
93  sample_index_ = 0;
94  num_samples_ = 0;
95  // Find the first indexable sample.
96  Next();
97 }

◆ charset_map()

const IndexMapBiDi& tesseract::SampleIterator::charset_map ( ) const
inline

Definition at line 137 of file sampleiterator.h.

137  {
138  return *charset_map_;
139  }

◆ Clear()

void tesseract::SampleIterator::Clear ( )

Definition at line 41 of file sampleiterator.cpp.

41  {
42  delete owned_shape_table_;
43  owned_shape_table_ = nullptr;
44 }

◆ CompactCharsetSize()

int tesseract::SampleIterator::CompactCharsetSize ( ) const

Definition at line 196 of file sampleiterator.cpp.

196  {
197  return charset_map_ != nullptr ? charset_map_->CompactSize()
198  : SparseCharsetSize();
199 }

◆ GetCompactClassID()

int tesseract::SampleIterator::GetCompactClassID ( ) const

Definition at line 142 of file sampleiterator.cpp.

142  {
143  return charset_map_ != nullptr ? charset_map_->SparseToCompact(shape_index_)
144  : GetSparseClassID();
145 }

◆ GetSample()

const TrainingSample & tesseract::SampleIterator::GetSample ( ) const

Definition at line 103 of file sampleiterator.cpp.

103  {
104  if (shape_table_ != nullptr) {
105  const UnicharAndFonts* shape_entry = GetShapeEntry();
106  int char_id = shape_entry->unichar_id;
107  int font_id = shape_entry->font_ids[shape_font_index_];
108  return *sample_set_->GetSample(font_id, char_id, sample_index_);
109  } else {
110  return *sample_set_->GetSample(shape_index_);
111  }
112 }

◆ GetSparseClassID()

int tesseract::SampleIterator::GetSparseClassID ( ) const

Definition at line 150 of file sampleiterator.cpp.

150  {
151  return shape_table_ != nullptr ? shape_index_ : GetSample().class_id();
152 }

◆ GlobalSampleIndex()

int tesseract::SampleIterator::GlobalSampleIndex ( ) const

Definition at line 127 of file sampleiterator.cpp.

127  {
128  if (shape_table_ != nullptr) {
129  const UnicharAndFonts* shape_entry = GetShapeEntry();
130  int char_id = shape_entry->unichar_id;
131  int font_id = shape_entry->font_ids[shape_font_index_];
132  return sample_set_->GlobalSampleIndex(font_id, char_id, sample_index_);
133  } else {
134  return shape_index_;
135  }
136 }

◆ Init()

void tesseract::SampleIterator::Init ( const IndexMapBiDi * charset_map,
const ShapeTable * shape_table,
bool  randomize,
TrainingSampleSet * sample_set 
)

Definition at line 47 of file sampleiterator.cpp.

50  {
51  Clear();
52  charset_map_ = charset_map;
53  shape_table_ = shape_table;
54  sample_set_ = sample_set;
55  randomize_ = randomize;
56  if (shape_table_ == nullptr && charset_map_ != nullptr) {
57  // The caller wishes to iterate by class. The easiest way to do this
58  // is to create a dummy shape_table_ that we will own.
59  int num_fonts = sample_set_->NumFonts();
60  owned_shape_table_ = new ShapeTable(sample_set_->unicharset());
61  int charsetsize = sample_set_->unicharset().size();
62  for (int c = 0; c < charsetsize; ++c) {
63  // We always add a shape for each character to keep the index in sync
64  // with the unichar_id.
65  int shape_id = owned_shape_table_->AddShape(c, 0);
66  for (int f = 1; f < num_fonts; ++f) {
67  if (sample_set_->NumClassSamples(f, c, true) > 0) {
68  owned_shape_table_->AddToShape(shape_id, c, f);
69  }
70  }
71  }
72  shape_table_ = owned_shape_table_;
73  }
74  if (shape_table_ != nullptr) {
75  num_shapes_ = shape_table_->NumShapes();
76  } else {
77  num_shapes_ = randomize ? sample_set_->num_samples()
78  : sample_set_->num_raw_samples();
79  }
80  Begin();
81 }

◆ MapSampleFeatures()

void tesseract::SampleIterator::MapSampleFeatures ( const IntFeatureMap & feature_map)

Definition at line 211 of file sampleiterator.cpp.

211  {
212  for (Begin(); !AtEnd(); Next()) {
213  TrainingSample* sample = MutableSample();
214  sample->MapFeatures(feature_map);
215  }
216 }

◆ MutableSample()

TrainingSample * tesseract::SampleIterator::MutableSample ( ) const

Definition at line 114 of file sampleiterator.cpp.

114  {
115  if (shape_table_ != nullptr) {
116  const UnicharAndFonts* shape_entry = GetShapeEntry();
117  int char_id = shape_entry->unichar_id;
118  int font_id = shape_entry->font_ids[shape_font_index_];
119  return sample_set_->MutableSample(font_id, char_id, sample_index_);
120  } else {
121  return sample_set_->mutable_sample(shape_index_);
122  }
123 }

◆ Next()

void tesseract::SampleIterator::Next ( )

Definition at line 156 of file sampleiterator.cpp.

156  {
157  if (shape_table_ != nullptr) {
158  // Next sample in this class/font combination.
159  ++sample_index_;
160  if (sample_index_ < num_samples_)
161  return;
162  // Next font in this class in this shape.
163  sample_index_ = 0;
164  do {
165  ++shape_font_index_;
166  if (shape_font_index_ >= num_shape_fonts_) {
167  // Next unichar in this shape.
168  shape_font_index_ = 0;
169  ++shape_char_index_;
170  if (shape_char_index_ >= num_shape_chars_) {
171  // Find the next shape that is mapped in the charset_map_.
172  shape_char_index_ = 0;
173  do {
174  ++shape_index_;
175  } while (shape_index_ < num_shapes_ &&
176  charset_map_ != nullptr &&
177  charset_map_->SparseToCompact(shape_index_) < 0);
178  if (shape_index_ >= num_shapes_)
179  return; // The end.
180  num_shape_chars_ = shape_table_->GetShape(shape_index_).size();
181  }
182  }
183  const UnicharAndFonts* shape_entry = GetShapeEntry();
184  num_shape_fonts_ = shape_entry->font_ids.size();
185  int char_id = shape_entry->unichar_id;
186  int font_id = shape_entry->font_ids[shape_font_index_];
187  num_samples_ = sample_set_->NumClassSamples(font_id, char_id, randomize_);
188  } while (num_samples_ == 0);
189  } else {
190  // We are just iterating over the samples.
191  ++shape_index_;
192  }
193 }

◆ NormalizeSamples()

double tesseract::SampleIterator::NormalizeSamples ( )

Definition at line 233 of file sampleiterator.cpp.

233  {
234  double total_weight = 0.0;
235  int sample_count = 0;
236  for (Begin(); !AtEnd(); Next()) {
237  const TrainingSample& sample = GetSample();
238  total_weight += sample.weight();
239  ++sample_count;
240  }
241  // Normalize samples.
242  double min_assigned_sample_weight = 1.0;
243  if (total_weight > 0.0) {
244  for (Begin(); !AtEnd(); Next()) {
245  TrainingSample* sample = MutableSample();
246  double weight = sample->weight() / total_weight;
247  if (weight < min_assigned_sample_weight)
248  min_assigned_sample_weight = weight;
249  sample->set_weight(weight);
250  }
251  }
252  return min_assigned_sample_weight;
253 }

◆ sample_set()

const TrainingSampleSet* tesseract::SampleIterator::sample_set ( ) const
inline

Definition at line 144 of file sampleiterator.h.

144  {
145  return sample_set_;
146  }

◆ shape_table()

const ShapeTable* tesseract::SampleIterator::shape_table ( ) const
inline

Definition at line 140 of file sampleiterator.h.

140  {
141  return shape_table_;
142  }

◆ SparseCharsetSize()

int tesseract::SampleIterator::SparseCharsetSize ( ) const

Definition at line 202 of file sampleiterator.cpp.

202  {
203  return charset_map_ != nullptr
204  ? charset_map_->SparseSize()
205  : (shape_table_ != nullptr ? shape_table_->NumShapes()
206  : sample_set_->charsetsize());
207 }

◆ UniformSamples()

int tesseract::SampleIterator::UniformSamples ( )

Definition at line 220 of file sampleiterator.cpp.

220  {
221  int num_good_samples = 0;
222  for (Begin(); !AtEnd(); Next()) {
223  TrainingSample* sample = MutableSample();
224  sample->set_weight(1.0);
225  ++num_good_samples;
226  }
228  return num_good_samples;
229 }

The documentation for this class was generated from the following files:
sampleiterator.h
sampleiterator.cpp
tesseract::TrainingSample::class_id
UNICHAR_ID class_id() const
Definition: trainingsample.h:116
tesseract::SampleIterator::Clear
void Clear()
Definition: sampleiterator.cpp:41
tesseract::TrainingSampleSet::MutableSample
TrainingSample * MutableSample(int font_id, int class_id, int index)
Definition: trainingsampleset.cpp:191
tesseract::TrainingSampleSet::mutable_sample
TrainingSample * mutable_sample(int index)
Definition: trainingsampleset.h:161
tesseract::ShapeTable::NumShapes
int NumShapes() const
Definition: shapetable.h:274
tesseract::TrainingSampleSet::NumClassSamples
int NumClassSamples(int font_id, int class_id, bool randomize) const
Definition: trainingsampleset.cpp:156
tesseract::SampleIterator::Next
void Next()
Definition: sampleiterator.cpp:156
tesseract::ShapeTable::AddToShape
void AddToShape(int shape_id, int unichar_id, int font_id)
Definition: shapetable.cpp:369
tesseract::IndexMapBiDi::SparseToCompact
int SparseToCompact(int sparse_index) const override
Definition: indexmapbidi.h:138
tesseract::SampleIterator::NormalizeSamples
double NormalizeSamples()
Definition: sampleiterator.cpp:233
tesseract::TrainingSampleSet::GlobalSampleIndex
int GlobalSampleIndex(int font_id, int class_id, int index) const
Definition: trainingsampleset.cpp:452
tesseract::SampleIterator::shape_table
const ShapeTable * shape_table() const
Definition: sampleiterator.h:140
tesseract::TrainingSampleSet::num_samples
int num_samples() const
Definition: trainingsampleset.h:55
tesseract::SampleIterator::sample_set
const TrainingSampleSet * sample_set() const
Definition: sampleiterator.h:144
tesseract::TrainingSampleSet::GetSample
const TrainingSample * GetSample(int index) const
Definition: trainingsampleset.cpp:174
tesseract::SampleIterator::GetSample
const TrainingSample & GetSample() const
Definition: sampleiterator.cpp:103
tesseract::TrainingSampleSet::num_raw_samples
int num_raw_samples() const
Definition: trainingsampleset.h:58
tesseract::ShapeTable::GetShape
const Shape & GetShape(int shape_id) const
Definition: shapetable.h:319
sample
Definition: cluster.h:31
tesseract::IndexMapBiDi::SparseSize
int SparseSize() const override
Definition: indexmapbidi.h:142
tesseract::Shape::size
int size() const
Definition: shapetable.h:199
tesseract::SampleIterator::GetSparseClassID
int GetSparseClassID() const
Definition: sampleiterator.cpp:150
tesseract::TrainingSampleSet::NumFonts
int NumFonts() const
Definition: trainingsampleset.h:61
tesseract::TrainingSampleSet::charsetsize
int charsetsize() const
Definition: trainingsampleset.h:67
tesseract::ShapeTable::AddShape
int AddShape(int unichar_id, int font_id)
Definition: shapetable.cpp:336
tesseract::IndexMap::CompactSize
int CompactSize() const
Definition: indexmapbidi.h:61
tesseract::SampleIterator::MutableSample
TrainingSample * MutableSample() const
Definition: sampleiterator.cpp:114
tesseract::SampleIterator::Begin
void Begin()
Definition: sampleiterator.cpp:87
tesseract::SampleIterator::SparseCharsetSize
int SparseCharsetSize() const
Definition: sampleiterator.cpp:202
tesseract::TrainingSampleSet::unicharset
const UNICHARSET & unicharset() const
Definition: trainingsampleset.h:64
tesseract::SampleIterator::AtEnd
bool AtEnd() const
Definition: sampleiterator.cpp:99
UNICHARSET::size
int size() const
Definition: unicharset.h:341
tesseract::SampleIterator::charset_map
const IndexMapBiDi & charset_map() const
Definition: sampleiterator.h:137