43 double* unichar_error,
double* scaled_error,
STRING* fonts_report) {
48 clock_t start = clock();
49 int total_samples = 0;
50 double unscaled_error = 0.0;
52 int error_samples = report_level > 3 ? report_level * report_level : 0;
56 int page_index = mutable_sample->
page_num();
57 Pix* page_pix = 0 <= page_index && page_index < page_images.
size()
58 ? page_images[page_index] :
NULL;
61 INVALID_UNICHAR_ID, &results);
62 bool debug_it =
false;
63 int correct_id = mutable_sample->
class_id();
64 if (counter.unicharset_.has_special_codes() &&
68 debug_it = counter.AccumulateJunk(report_level > 3,
72 debug_it = counter.AccumulateErrors(report_level > 3, boosting_mode,
74 results, mutable_sample);
76 if (debug_it && error_samples > 0) {
78 tprintf(
"Error on sample %d: %s Classifier debug output:\n",
81 classifier->
DebugDisplay(*mutable_sample, page_pix, correct_id);
86 double total_time = 1.0 * (clock() - start) / CLOCKS_PER_SEC;
88 unscaled_error = counter.ReportErrors(report_level, boosting_mode,
90 *it, unichar_error, fonts_report);
91 if (scaled_error !=
NULL) *scaled_error = counter.scaled_error_;
92 if (report_level > 1) {
94 tprintf(
"Errors computed in %.2fs at %.1f μs/char\n",
95 total_time, 1000000.0 * total_time / total_samples);
97 return unscaled_error;
// NOTE(review): this is a body fragment; its signature line is not present in
// this listing. The leading integers (116, 117, ...) appear to be line numbers
// from the original file fused into the text, and gaps in that numbering mean
// lines were dropped. Presumably this is the body of DebugNewErrors — confirm
// against the original source before relying on these notes.
116 int total_samples = 0;
// error_samples gates the DebugDisplay call further down (error_samples > 0).
// Presumably it is decremented in a dropped line — TODO confirm.
117 int error_samples = 25;
118 int total_new_errors = 0;
// Pick the page image for this sample, or NULL when page_num() falls outside
// the range [0, page_images.size()).
122 int page_index = mutable_sample->
page_num();
123 Pix* page_pix = 0 <= page_index && page_index < page_images.
size()
124 ? page_images[page_index] :
NULL;
127 INVALID_UNICHAR_ID, &results);
128 int correct_id = mutable_sample->
class_id();
// Samples with class id 0 are not examined. The nested check below is entered
// only when old_counter.AccumulateErrors(...) returns false for this sample.
129 if (correct_id != 0 &&
130 !old_counter.AccumulateErrors(
true, boosting_mode, fontinfo_table,
131 results, mutable_sample)) {
134 INVALID_UNICHAR_ID, &results);
// A "new error" is reported when new_counter.AccumulateErrors(...) returns
// true for a sample on which the old counter returned false.
135 if (correct_id != 0 &&
136 new_counter.AccumulateErrors(
true, boosting_mode, fontinfo_table,
137 results, mutable_sample)) {
138 tprintf(
"New Error on sample %d: Classifier debug output:\n",
142 correct_id, &results);
// Only show the debug display when there are results and error_samples > 0.
143 if (results.size() > 0 && error_samples > 0) {
144 new_classifier->
DebugDisplay(*mutable_sample, page_pix, correct_id);
151 tprintf(
"Total new errors = %d\n", total_new_errors);
// Constructor. The initializer list:
//   - zeroes scaled_error_,
//   - sets rating_epsilon_ from kRatingEpsilon,
//   - sizes unichar_counts_ as unicharset.size() x unicharset.size() with
//     initial value 0,
//   - constructs both score histograms with arguments (0, 101),
//   - binds unicharset_ to the supplied unicharset.
// NOTE(review): the constructor body (and therefore how fontsize is used) is
// not present in this listing.
156 ErrorCounter::ErrorCounter(
const UNICHARSET& unicharset,
int fontsize)
157 : scaled_error_(0.0), rating_epsilon_(kRatingEpsilon),
158 unichar_counts_(unicharset.size(), unicharset.size(), 0),
159 ok_score_hist_(0, 101), bad_score_hist_(0, 101),
160 unicharset_(unicharset) {
// Destructor. NOTE(review): only the opening line survives in this listing;
// the body is not visible here.
165 ErrorCounter::~ErrorCounter() {
// Folds the classifier results for one sample into this counter: locates the
// sample's class id among the results, updates per-unichar and per-font
// tallies, adds to scaled_error_ for samples flagged as errors, and feeds the
// ok/bad score histograms.
// NOTE(review): many interior lines (including the remaining parameters, the
// closing braces and several counter updates) are missing from this listing,
// so the notes below describe only what the surviving lines show.
175 bool ErrorCounter::AccumulateErrors(
bool debug,
CountTypes boosting_mode,
176 const FontInfoTable& font_table,
179 int num_results = results.
size();
// -1 means the sample's class id has not been found among the results.
180 int answer_actual_rank = -1;
181 int font_id = sample->font_id();
182 int unichar_id = sample->class_id();
// Start from "not an error"; later lines may flip this flag to true.
183 sample->set_is_error(
false);
184 if (num_results == 0) {
188 sample->set_is_error(
true);
193 int epsilon_rank = 0;
194 int answer_epsilon_rank = -1;
195 int num_top_answers = 0;
196 double prev_rating = results[0].rating;
// Walk the results. A rating more than rating_epsilon_ below prev_rating
// resets prev_rating; presumably epsilon_rank is advanced in a dropped line
// at the same point — TODO confirm.
200 while (res_index < num_results) {
201 if (results[res_index].rating < prev_rating - rating_epsilon_) {
203 prev_rating = results[res_index].rating;
// Record the first result whose unichar_id equals the sample's class id
// (answer_epsilon_rank < 0 guards against overwriting an earlier match).
205 if (results[res_index].unichar_id == unichar_id &&
206 answer_epsilon_rank < 0) {
207 answer_epsilon_rank = epsilon_rank;
208 answer_actual_rank = res_index;
216 else if (epsilon_rank == 0)
220 if (answer_actual_rank != 0) {
225 if (answer_epsilon_rank == 0) {
228 if (num_top_answers > 1) {
230 ++multi_unichar_counts_[unichar_id];
// Font checks use the font set attached to the matching result.
235 if (font_table.SetContainsFontProperties(
236 font_id, results[answer_actual_rank].fonts)) {
239 if (font_table.SetContainsMultipleFontProperties(
240 results[answer_actual_rank].fonts))
// Tally indexed by (sample's class id, unichar_id of the first result).
251 ++unichar_counts_(unichar_id, results[0].unichar_id);
252 if (answer_epsilon_rank < 0 || answer_epsilon_rank >= 2) {
// When the answer was never found, the current epsilon_rank is used as its
// rank before it is added to the CT_RANK tally.
257 if (answer_epsilon_rank < 0) {
261 answer_epsilon_rank = epsilon_rank;
266 font_counts_[font_id].n[
CT_RANK] += answer_epsilon_rank;
// Samples flagged as errors contribute their weight to scaled_error_.
273 if (sample->is_error()) {
274 scaled_error_ += sample->weight();
276 tprintf(
"%d results for char %s font %d :",
279 for (
int i = 0; i < num_results; ++i) {
289 bad_score_hist_.
add(percent, 1);
// When the answer was found, the value added to the ok histogram is its
// rating scaled by 100 and rounded to an integer.
292 if (answer_actual_rank >= 0)
293 percent =
IntCastRounded(results[answer_actual_rank].rating * 100);
294 ok_score_hist_.
add(percent, 1);
// Variant of the accumulation step that compares only the first result with
// the sample's class id.
// NOTE(review): the middle parameter(s), the computation of `percent`, the
// else-branch structure and the return statement are missing from this
// listing; presumably this path handles junk samples — confirm against the
// original source.
301 bool ErrorCounter::AccumulateJunk(
bool debug,
303 TrainingSample* sample) {
306 int num_results = results.
size();
307 int font_id = sample->font_id();
308 int unichar_id = sample->class_id();
// A sample is flagged as an error when there is at least one result and the
// first result's unichar_id differs from the sample's class id.
312 if (num_results > 0 && results[0].unichar_id != unichar_id) {
315 sample->set_is_error(
true);
// Flagged samples add their weight to scaled_error_ and go to the bad
// histogram.
317 scaled_error_ += sample->weight();
318 bad_score_hist_.
add(percent, 1);
// Otherwise the flag is cleared and the score goes to the ok histogram.
323 sample->set_is_error(
false);
324 ok_score_hist_.
add(percent, 1);
// Builds the error report from the accumulated counters: sums the per-font
// counts, optionally appends one line per font to *fonts_report, prints
// totals and diagnostics through tprintf, and returns rates[boosting_mode].
// NOTE(review): several lines are missing from this listing (the last
// parameter, local declarations such as totals/rates/worst_err, closing
// braces and some report_level guards), so the nesting of the diagnostic
// sections cannot be confirmed from here.
341 double ErrorCounter::ReportErrors(
int report_level,
CountTypes boosting_mode,
342 const FontInfoTable& fontinfo_table,
343 const SampleIterator& it,
344 double* unichar_error,
349 int fontsize = font_counts_.
size();
// Accumulate every font's counts into totals, producing a per-font report
// string along the way.
350 for (
int f = 0; f < fontsize; ++f) {
352 totals += font_counts_[f];
354 if (ReportString(
false, font_counts_[f], &font_report)) {
// fonts_report is optional; when supplied it receives "<name>: <report>\n".
355 if (fonts_report !=
NULL) {
356 *fonts_report += fontinfo_table.get(f).name;
357 *fonts_report +=
": ";
358 *fonts_report += font_report;
359 *fonts_report +=
"\n";
361 if (report_level > 2) {
363 tprintf(
"%s: %s\n", fontinfo_table.get(f).name, font_report.
string());
369 bool any_results = ReportString(
true, totals, &total_report);
// If no per-font line was written, fall back to a single line holding the
// totals.
370 if (fonts_report !=
NULL && fonts_report->
length() == 0) {
372 *fonts_report =
"NoSamplesFound: ";
373 *fonts_report += total_report;
374 *fonts_report +=
"\n";
376 if (report_level > 0) {
380 tprintf(
"TOTAL Scaled Err=%.4g%%, %s\n",
381 scaled_error_ * 100.0, total_report.
string());
// Scan the whole unichar_counts_ table for its largest entry.
385 int charsetsize = unicharset_.
size();
386 int worst_uni_id = 0;
387 int worst_result_id = 0;
389 for (
int u = 0; u < charsetsize; ++u) {
390 for (
int v = 0; v < charsetsize; ++v) {
391 if (unichar_counts_(u, v) > worst_err) {
392 worst_err = unichar_counts_(u, v);
399 tprintf(
"Worst error = %d:%s -> %s with %d/%d=%.2f%% errors\n",
// List every unichar with a non-zero multi_unichar_counts_ entry.
406 tprintf(
"Multi-unichar shape use:\n");
407 for (
int u = 0; u < multi_unichar_counts_.
size(); ++u) {
408 if (multi_unichar_counts_[u] > 0) {
409 tprintf(
"%d multiple answers for unichar: %s\n",
410 multi_unichar_counts_[u],
414 tprintf(
"OK Score histogram:\n");
415 ok_score_hist_.
print();
416 tprintf(
"ERROR Score histogram:\n");
417 bad_score_hist_.
print();
// The statement taken when ComputeRates returns false is not visible in this
// listing.
421 if (!ComputeRates(totals, rates))
// unichar_error is optional; the assignment guarded by this check is missing
// from this listing.
424 if (unichar_error !=
NULL)
426 return rates[boosting_mode];
// Formats the rates derived from `counts` into a single report string.
// NOTE(review): the final parameter (presumably the output `report`), the
// declaration of `rates`, the snprintf argument list, the loop body and the
// return statements are missing from this listing.
433 bool ErrorCounter::ReportString(
bool even_if_empty,
const Counts& counts,
// The statement guarded by this check is not visible here; it is reached only
// when ComputeRates returns false and even_if_empty is false.
437 if (!ComputeRates(counts, rates) && !even_if_empty)
// Extra characters budgeted per formatted value on top of the format
// string's own length (see max_str_len below).
442 const int kMaxExtraLength = 5;
444 const char* format_str =
"Unichar=%.4g%%[1], %.4g%%[2], %.4g%%[n], %.4g%%[T] "
445 "Mult=%.4g%%, Jn=%.4g%%, Brk=%.4g%%, Rej=%.4g%%, "
446 "FontAttr=%.4g%%, Multi=%.4g%%, "
447 "Answers=%.3g, Rank=%.3g, "
448 "OKjunk=%.4g%%, Badjunk=%.4g%%";
// Buffer size = format length + kMaxExtraLength per value + 1 for the
// terminator.
449 int max_str_len = strlen(format_str) + kMaxExtraLength * (
CT_SIZE - 1) + 1;
450 char* formatted_str =
new char[max_str_len];
// snprintf is given max_str_len as its size limit, so the write stays inside
// the buffer.
451 snprintf(formatted_str, max_str_len, format_str,
// The formatted text is assigned into *report before the temporary buffer is
// released.
466 *report = formatted_str;
467 delete [] formatted_str;
470 for (
int ct = 0; ct <
CT_SIZE; ++ct)
// Converts raw counts into rates by dividing each counts.n[ct] by a sample
// count. Returns true when ok_samples or junk_samples is non-zero.
// NOTE(review): the lines that compute ok_samples and junk_samples, the
// header of the second loop and the closing brace are missing from this
// listing.
477 bool ErrorCounter::ComputeRates(
const Counts& counts,
double rates[
CT_SIZE]) {
// MAX(..., 1) keeps the denominator at least 1, so an empty sample set does
// not divide by zero.
482 double denominator =
static_cast<double>(
MAX(ok_samples, 1));
// Entries 0..CT_RANK inclusive are divided by the ok-sample denominator.
483 for (
int ct = 0; ct <=
CT_RANK; ++ct)
484 rates[ct] = counts.n[ct] / denominator;
// The following division uses the junk-sample denominator; presumably it
// covers the entries after CT_RANK — TODO confirm, the loop header is not
// visible.
486 denominator = static_cast<double>(
MAX(junk_samples, 1));
488 rates[ct] = counts.n[ct] / denominator;
489 return ok_samples != 0 || junk_samples != 0;
// Default constructor: zero-fills the n array (CT_SIZE elements of
// sizeof(n[0]) bytes each) with memset.
// NOTE(review): the closing brace is missing from this listing.
492 ErrorCounter::Counts::Counts() {
493 memset(n, 0,
sizeof(n[0]) * CT_SIZE);
// NOTE(review): body fragment; the enclosing signature is not present in this
// listing. Presumably this is Counts::operator+= (it is applied as
// `totals += font_counts_[f]` elsewhere) — confirm against the original.
// Adds each of other's CT_SIZE counters into the matching counter of this
// object.
497 for (
int ct = 0; ct <
CT_SIZE; ++ct)
498 n[ct] += other.n[ct];
virtual int UnicharClassifySample(const TrainingSample &sample, Pix *page_pix, int debug, UNICHAR_ID keep_this, GenericVector< UnicharRating > *results)
virtual void DebugDisplay(const TrainingSample &sample, Pix *page_pix, UNICHAR_ID unichar_id)
TrainingSample * MutableSample() const
void add(inT32 value, inT32 count)
int GlobalSampleIndex() const
static double ComputeErrorRate(ShapeClassifier *classifier, int report_level, CountTypes boosting_mode, const FontInfoTable &fontinfo_table, const GenericVector< Pix * > &page_images, SampleIterator *it, double *unichar_error, double *scaled_error, STRING *fonts_report)
const double kRatingEpsilon
const char *const id_to_unichar(UNICHAR_ID id) const
void init_to_size(int size, T t)
bool has_special_codes() const
void add_str_int(const char *str, int number)
UNICHAR_ID class_id() const
int IntCastRounded(double x)
static void DebugNewErrors(ShapeClassifier *new_classifier, ShapeClassifier *old_classifier, CountTypes boosting_mode, const FontInfoTable &fontinfo_table, const GenericVector< Pix * > &page_images, SampleIterator *it)
const TrainingSampleSet * sample_set() const
STRING SampleToString(const TrainingSample &sample) const
const char * string() const
ICOORD & operator+=(ICOORD &op1, const ICOORD &op2)
virtual const UNICHARSET & GetUnicharset() const