43 double* unichar_error,
double* scaled_error,
STRING* fonts_report) {
48 clock_t start = clock();
49 unsigned total_samples = 0;
50 double unscaled_error = 0.0;
52 int error_samples = report_level > 3 ? report_level * report_level : 0;
56 int page_index = mutable_sample->
page_num();
57 Pix* page_pix = 0 <= page_index && page_index < page_images.
size()
58 ? page_images[page_index] :
nullptr;
61 INVALID_UNICHAR_ID, &results);
62 bool debug_it =
false;
63 int correct_id = mutable_sample->
class_id();
64 if (counter.unicharset_.has_special_codes() &&
68 debug_it = counter.AccumulateJunk(report_level > 3,
72 debug_it = counter.AccumulateErrors(report_level > 3, boosting_mode,
74 results, mutable_sample);
76 if (debug_it && error_samples > 0) {
78 tprintf(
"Error on sample %d: %s Classifier debug output:\n",
81 classifier->
DebugDisplay(*mutable_sample, page_pix, correct_id);
86 const double total_time = 1.0 * (clock() - start) / CLOCKS_PER_SEC;
88 unscaled_error = counter.ReportErrors(report_level, boosting_mode,
90 *it, unichar_error, fonts_report);
91 if (scaled_error !=
nullptr) *scaled_error = counter.scaled_error_;
92 if (report_level > 1 && total_samples > 0) {
94 tprintf(
"Errors computed in %.2fs at %.1f μs/char\n",
95 total_time, 1000000.0 * total_time / total_samples);
97 return unscaled_error;
116 int total_samples = 0;
117 int error_samples = 25;
118 int total_new_errors = 0;
122 int page_index = mutable_sample->
page_num();
123 Pix* page_pix = 0 <= page_index && page_index < page_images.
size()
124 ? page_images[page_index] :
nullptr;
127 INVALID_UNICHAR_ID, &results);
128 int correct_id = mutable_sample->
class_id();
129 if (correct_id != 0 &&
130 !old_counter.AccumulateErrors(
true, boosting_mode, fontinfo_table,
131 results, mutable_sample)) {
134 INVALID_UNICHAR_ID, &results);
135 if (correct_id != 0 &&
136 new_counter.AccumulateErrors(
true, boosting_mode, fontinfo_table,
137 results, mutable_sample)) {
138 tprintf(
"New Error on sample %d: Classifier debug output:\n",
142 correct_id, &results);
143 if (results.size() > 0 && error_samples > 0) {
144 new_classifier->
DebugDisplay(*mutable_sample, page_pix, correct_id);
151 tprintf(
"Total new errors = %d\n", total_new_errors);
156 ErrorCounter::ErrorCounter(
const UNICHARSET& unicharset,
int fontsize)
158 unichar_counts_(unicharset.size(), unicharset.size(), 0),
159 ok_score_hist_(0, 101), bad_score_hist_(0, 101),
160 unicharset_(unicharset) {
173 bool ErrorCounter::AccumulateErrors(
bool debug,
CountTypes boosting_mode,
174 const FontInfoTable& font_table,
177 int num_results = results.
size();
178 int answer_actual_rank = -1;
179 int font_id =
sample->font_id();
180 int unichar_id =
sample->class_id();
181 sample->set_is_error(
false);
182 if (num_results == 0) {
186 sample->set_is_error(
true);
191 int epsilon_rank = 0;
192 int answer_epsilon_rank = -1;
193 int num_top_answers = 0;
194 double prev_rating = results[0].rating;
198 while (res_index < num_results) {
199 if (results[res_index].rating < prev_rating - rating_epsilon_) {
201 prev_rating = results[res_index].rating;
203 if (results[res_index].unichar_id == unichar_id &&
204 answer_epsilon_rank < 0) {
205 answer_epsilon_rank = epsilon_rank;
206 answer_actual_rank = res_index;
214 else if (epsilon_rank == 0)
218 if (answer_actual_rank != 0) {
223 if (answer_epsilon_rank == 0) {
226 if (num_top_answers > 1) {
228 ++multi_unichar_counts_[unichar_id];
233 if (font_table.SetContainsFontProperties(
234 font_id, results[answer_actual_rank].fonts)) {
237 if (font_table.SetContainsMultipleFontProperties(
238 results[answer_actual_rank].fonts))
249 ++unichar_counts_(unichar_id, results[0].unichar_id);
250 if (answer_epsilon_rank < 0 || answer_epsilon_rank >= 2) {
255 if (answer_epsilon_rank < 0) {
259 answer_epsilon_rank = epsilon_rank;
264 font_counts_[font_id].n[
CT_RANK] += answer_epsilon_rank;
272 scaled_error_ +=
sample->weight();
274 tprintf(
"%d results for char %s font %d :",
277 for (
int i = 0; i < num_results; ++i) {
287 bad_score_hist_.
add(percent, 1);
290 if (answer_actual_rank >= 0)
291 percent =
IntCastRounded(results[answer_actual_rank].rating * 100);
292 ok_score_hist_.
add(percent, 1);
299 bool ErrorCounter::AccumulateJunk(
bool debug,
304 const int num_results = results.
size();
305 const int font_id =
sample->font_id();
306 const int unichar_id =
sample->class_id();
310 if (num_results > 0 && results[0].unichar_id != unichar_id) {
313 sample->set_is_error(
true);
315 scaled_error_ +=
sample->weight();
316 bad_score_hist_.
add(percent, 1);
321 sample->set_is_error(
false);
322 ok_score_hist_.
add(percent, 1);
339 double ErrorCounter::ReportErrors(
int report_level,
CountTypes boosting_mode,
340 const FontInfoTable& fontinfo_table,
341 const SampleIterator& it,
342 double* unichar_error,
347 int fontsize = font_counts_.
size();
348 for (
int f = 0; f < fontsize; ++f) {
350 totals += font_counts_[f];
352 if (ReportString(
false, font_counts_[f], &font_report)) {
353 if (fonts_report !=
nullptr) {
354 *fonts_report += fontinfo_table.get(f).name;
355 *fonts_report +=
": ";
356 *fonts_report += font_report;
357 *fonts_report +=
"\n";
359 if (report_level > 2) {
361 tprintf(
"%s: %s\n", fontinfo_table.get(f).name, font_report.
string());
367 bool any_results = ReportString(
true, totals, &total_report);
368 if (fonts_report !=
nullptr && fonts_report->
length() == 0) {
370 *fonts_report =
"NoSamplesFound: ";
371 *fonts_report += total_report;
372 *fonts_report +=
"\n";
374 if (report_level > 0) {
378 tprintf(
"TOTAL Scaled Err=%.4g%%, %s\n",
379 scaled_error_ * 100.0, total_report.
string());
383 int charsetsize = unicharset_.
size();
384 int worst_uni_id = 0;
385 int worst_result_id = 0;
387 for (
int u = 0; u < charsetsize; ++u) {
388 for (
int v = 0; v < charsetsize; ++v) {
389 if (unichar_counts_(u, v) > worst_err) {
390 worst_err = unichar_counts_(u, v);
397 tprintf(
"Worst error = %d:%s -> %s with %d/%d=%.2f%% errors\n",
404 tprintf(
"Multi-unichar shape use:\n");
405 for (
int u = 0; u < multi_unichar_counts_.
size(); ++u) {
406 if (multi_unichar_counts_[u] > 0) {
407 tprintf(
"%d multiple answers for unichar: %s\n",
408 multi_unichar_counts_[u],
412 tprintf(
"OK Score histogram:\n");
413 ok_score_hist_.
print();
414 tprintf(
"ERROR Score histogram:\n");
415 bad_score_hist_.
print();
419 if (!ComputeRates(totals, rates))
422 if (unichar_error !=
nullptr)
424 return rates[boosting_mode];
431 bool ErrorCounter::ReportString(
bool even_if_empty,
const Counts& counts,
435 if (!ComputeRates(counts, rates) && !even_if_empty)
440 const int kMaxExtraLength = 5;
442 const char* format_str =
"Unichar=%.4g%%[1], %.4g%%[2], %.4g%%[n], %.4g%%[T] " 443 "Mult=%.4g%%, Jn=%.4g%%, Brk=%.4g%%, Rej=%.4g%%, " 444 "FontAttr=%.4g%%, Multi=%.4g%%, " 445 "Answers=%.3g, Rank=%.3g, " 446 "OKjunk=%.4g%%, Badjunk=%.4g%%";
447 const size_t max_str_len = strlen(format_str) + kMaxExtraLength * (
CT_SIZE - 1) + 1;
448 char* formatted_str =
new char[max_str_len];
449 snprintf(formatted_str, max_str_len, format_str,
464 *report = formatted_str;
465 delete [] formatted_str;
468 for (
int ct = 0; ct <
CT_SIZE; ++ct)
475 bool ErrorCounter::ComputeRates(
const Counts& counts,
double rates[
CT_SIZE]) {
480 double denominator =
static_cast<double>(std::max(ok_samples, 1));
481 for (
int ct = 0; ct <=
CT_RANK; ++ct)
482 rates[ct] = counts.n[ct] / denominator;
484 denominator = static_cast<double>(std::max(junk_samples, 1));
486 rates[ct] = counts.n[ct] / denominator;
487 return ok_samples != 0 || junk_samples != 0;
490 ErrorCounter::Counts::Counts() {
491 memset(n, 0,
sizeof(n[0]) *
CT_SIZE);
495 for (
int ct = 0; ct <
CT_SIZE; ++ct)
496 n[ct] += other.n[ct];
ICOORD & operator+=(ICOORD &op1, const ICOORD &op2)
int GlobalSampleIndex() const
const char * string() const
virtual const UNICHARSET & GetUnicharset() const
virtual void DebugDisplay(const TrainingSample &sample, Pix *page_pix, UNICHAR_ID unichar_id)
TrainingSample * MutableSample() const
const TrainingSampleSet * sample_set() const
static void DebugNewErrors(ShapeClassifier *new_classifier, ShapeClassifier *old_classifier, CountTypes boosting_mode, const FontInfoTable &fontinfo_table, const GenericVector< Pix *> &page_images, SampleIterator *it)
void init_to_size(int size, const T &t)
const double kRatingEpsilon
int IntCastRounded(double x)
DLLSYM void tprintf(const char *format,...)
void add(int32_t value, int32_t count)
void add_str_int(const char *str, int number)
static double ComputeErrorRate(ShapeClassifier *classifier, int report_level, CountTypes boosting_mode, const FontInfoTable &fontinfo_table, const GenericVector< Pix *> &page_images, SampleIterator *it, double *unichar_error, double *scaled_error, STRING *fonts_report)
virtual int UnicharClassifySample(const TrainingSample &sample, Pix *page_pix, int debug, UNICHAR_ID keep_this, GenericVector< UnicharRating > *results)
const char * id_to_unichar(UNICHAR_ID id) const
STRING SampleToString(const TrainingSample &sample) const
bool has_special_codes() const
UNICHAR_ID class_id() const