tesseract  4.0.0-1-g2a2b
tesseract::Wordrec Class Reference

#include <wordrec.h>

Inheritance diagram for tesseract::Wordrec:
tesseract::Classify tesseract::CCStruct tesseract::CUtil tesseract::CCUtil tesseract::Tesseract

Public Member Functions

 Wordrec ()
 
virtual ~Wordrec ()=default
 
void SaveAltChoices (const LIST &best_choices, WERD_RES *word)
 
void FillLattice (const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
 
void CallFillLattice (const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
 
void SegSearch (WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
void InitialSegSearch (WERD_RES *word_res, LMPainPoints *pain_points, GenericVector< SegSearchPending > *pending, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
void DoSegSearch (WERD_RES *word_res)
 
SEAM * attempt_blob_chop (TWERD *word, TBLOB *blob, int32_t blob_number, bool italic_blob, const GenericVector< SEAM *> &seams)
 
SEAM * chop_numbered_blob (TWERD *word, int32_t blob_number, bool italic_blob, const GenericVector< SEAM *> &seams)
 
SEAM * chop_overlapping_blob (const GenericVector< TBOX > &boxes, bool italic_blob, WERD_RES *word_res, int *blob_number)
 
void add_seam_to_queue (float new_priority, SEAM *new_seam, SeamQueue *seams)
 
void choose_best_seam (SeamQueue *seam_queue, const SPLIT *split, PRIORITY priority, SEAM **seam_result, TBLOB *blob, SeamPile *seam_pile)
 
void combine_seam (const SeamPile &seam_pile, const SEAM *seam, SeamQueue *seam_queue)
 
SEAM * pick_good_seam (TBLOB *blob)
 
void try_point_pairs (EDGEPT *points[MAX_NUM_POINTS], int16_t num_points, SeamQueue *seam_queue, SeamPile *seam_pile, SEAM **seam, TBLOB *blob)
 
void try_vertical_splits (EDGEPT *points[MAX_NUM_POINTS], int16_t num_points, EDGEPT_CLIST *new_points, SeamQueue *seam_queue, SeamPile *seam_pile, SEAM **seam, TBLOB *blob)
 
PRIORITY grade_split_length (SPLIT *split)
 
PRIORITY grade_sharpness (SPLIT *split)
 
bool near_point (EDGEPT *point, EDGEPT *line_pt_0, EDGEPT *line_pt_1, EDGEPT **near_pt)
 
virtual BLOB_CHOICE_LIST * classify_piece (const GenericVector< SEAM *> &seams, int16_t start, int16_t end, const char *description, TWERD *word, BlamerBundle *blamer_bundle)
 
void merge_fragments (MATRIX *ratings, int16_t num_blobs)
 
void get_fragment_lists (int16_t current_frag, int16_t current_row, int16_t start, int16_t num_frag_parts, int16_t num_blobs, MATRIX *ratings, BLOB_CHOICE_LIST *choice_lists)
 
void merge_and_put_fragment_lists (int16_t row, int16_t column, int16_t num_frag_parts, BLOB_CHOICE_LIST *choice_lists, MATRIX *ratings)
 
void fill_filtered_fragment_list (BLOB_CHOICE_LIST *choices, int fragment_pos, int num_frag_parts, BLOB_CHOICE_LIST *filtered_choices)
 
program_editup

Initialize all the things in the program that need to be initialized. init_permute determines whether to initialize the permute functions and Dawg models.

void program_editup (const char *textbase, TessdataManager *init_classifier, TessdataManager *init_dict)
 
cc_recog

Recognize a word.

void cc_recog (WERD_RES *word)
 
program_editdown

This function holds any necessary post processing for the Wise Owl program.

void program_editdown (int32_t elasped_time)
 
set_pass1

Get ready to do some pass 1 stuff.

void set_pass1 ()
 
set_pass2

Get ready to do some pass 2 stuff.

void set_pass2 ()
 
end_recog

Cleanup and exit the recog program.

int end_recog ()
 
call_matcher

Called from Tess with a blob in tess form. The blob may need rotating to the correct orientation for classification.

BLOB_CHOICE_LIST * call_matcher (TBLOB *blob)
 
dict_word()

Test the dictionaries, returning NO_PERM (0) if not found, or one of the PermuterType values if found, according to the dictionary.

int dict_word (const WERD_CHOICE &word)
 
classify_blob

Classify this blob if it is not already recorded in the match table. Attempt to recognize this blob as a character. The recognition rating for this blob will be stored as a part of the blob. This value will also be returned to the caller.

Parameters
blob: Current blob
string: The string to display in ScrollView
color: The colour to use when displayed with ScrollView
BLOB_CHOICE_LIST * classify_blob (TBLOB *blob, const char *string, C_COL color, BlamerBundle *blamer_bundle)
 
point_priority

Assign a priority to an edge point that might be used as part of a split. The argument should be of type EDGEPT.

PRIORITY point_priority (EDGEPT *point)
 
add_point_to_list

Add an edge point to a POINT_GROUP containing a list of other points.

void add_point_to_list (PointHeap *point_heap, EDGEPT *point)
 
bool is_inside_angle (EDGEPT *pt)
 
angle_change

Return the change in angle (degrees) of the line segments between points one and two, and two and three.

int angle_change (EDGEPT *point1, EDGEPT *point2, EDGEPT *point3)
 
pick_close_point

Choose the edge point that is closest to the critical point. This point may not be exactly vertical from the critical point.

EDGEPT * pick_close_point (EDGEPT *critical_point, EDGEPT *vertical_point, int *best_dist)
 
prioritize_points

Find a list of edge points from the outer outline of this blob. For each of these points assign a priority. Sort these points using a heap structure so that they can be visited in order.

void prioritize_points (TESSLINE *outline, PointHeap *points)
 
new_min_point

Found a new minimum point; try to decide whether to save it or not. Return the new value for the local minimum. If a point is saved then the local minimum is reset to nullptr.

void new_min_point (EDGEPT *local_min, PointHeap *points)
 
new_max_point

Found a new maximum point; try to decide whether to save it or not. Return the new value for the local maximum. If a point is saved then the local maximum is reset to nullptr.

void new_max_point (EDGEPT *local_max, PointHeap *points)
 
vertical_projection_point

For one point on the outline, find the corresponding point on the other side of the outline that is a likely projection for a split point. This is done by iterating through the edge points until the X value of the point being looked at is greater than the X value of the split point. Ensure that the point being returned is not right next to the split point. Return the edge point in *best_point as a result, and any points that were newly created are also saved on the new_points list.

void vertical_projection_point (EDGEPT *split_point, EDGEPT *target_point, EDGEPT **best_point, EDGEPT_CLIST *new_points)
 
improve_one_blob

Finds the best place to chop, based on the worst blob, fixpt, or next to a fragment, according to the input. Returns the SEAM corresponding to the chop point, if any is found, and the index in the ratings_matrix of the chopped blob. Note that blob_choices is just a copy of the pointers in the leading diagonal of the ratings MATRIX. Although the blob is chopped, the returned SEAM is yet to be inserted into word->seam_array and the resulting blobs are unclassified, so this function can be used by ApplyBox as well as during recognition.

SEAM * improve_one_blob (const GenericVector< BLOB_CHOICE *> &blob_choices, DANGERR *fixpt, bool split_next_to_fragment, bool italic_blob, WERD_RES *word, int *blob_number)
 
chop_one_blob

Start with the current one-blob word and its classification. Find the worst blob and try to divide it up to improve the ratings. Used for testing chopper.

SEAM * chop_one_blob (const GenericVector< TBOX > &boxes, const GenericVector< BLOB_CHOICE *> &blob_choices, WERD_RES *word_res, int *blob_number)
 
chop_word_main

Classify the blobs in this word and permute the results. Find the worst blob in the word and chop it up. Continue this process until a good answer has been found or all the blobs have been chopped up enough. The results are returned in the WERD_RES.

void chop_word_main (WERD_RES *word)
 
improve_by_chopping

Repeatedly chops the worst blob, classifying the new blobs, fixing up all the data, and incrementally running the segmentation search until a good word is found, or no more chops can be found.

void improve_by_chopping (float rating_cert_scale, WERD_RES *word, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle, LMPainPoints *pain_points, GenericVector< SegSearchPending > *pending)
 
int select_blob_to_split (const GenericVector< BLOB_CHOICE *> &blob_choices, float rating_ceiling, bool split_next_to_fragment)
 
int select_blob_to_split_from_fixpt (DANGERR *fixpt)
 
- Public Member Functions inherited from tesseract::Classify
 Classify ()
 
virtual ~Classify ()
 
virtual Dict & getDict ()
 
const ShapeTable * shape_table () const
 
void SetStaticClassifier (ShapeClassifier *static_classifier)
 
void AddLargeSpeckleTo (int blob_length, BLOB_CHOICE_LIST *choices)
 
bool LargeSpeckle (const TBLOB &blob)
 
ADAPT_TEMPLATES NewAdaptedTemplates (bool InitFromUnicharset)
 
int GetFontinfoId (ADAPT_CLASS Class, uint8_t ConfigId)
 
int PruneClasses (const INT_TEMPLATES_STRUCT *int_templates, int num_features, int keep_this, const INT_FEATURE_STRUCT *features, const uint8_t *normalization_factors, const uint16_t *expected_num_features, GenericVector< CP_RESULT_STRUCT > *results)
 
void ReadNewCutoffs (TFile *fp, CLASS_CUTOFF_ARRAY Cutoffs)
 
void PrintAdaptedTemplates (FILE *File, ADAPT_TEMPLATES Templates)
 
void WriteAdaptedTemplates (FILE *File, ADAPT_TEMPLATES Templates)
 
ADAPT_TEMPLATES ReadAdaptedTemplates (TFile *File)
 
float ComputeNormMatch (CLASS_ID ClassId, const FEATURE_STRUCT &feature, bool DebugMatch)
 
void FreeNormProtos ()
 
NORM_PROTOS * ReadNormProtos (TFile *fp)
 
void ConvertProto (PROTO Proto, int ProtoId, INT_CLASS Class)
 
INT_TEMPLATES CreateIntTemplates (CLASSES FloatProtos, const UNICHARSET &target_unicharset)
 
void LearnWord (const char *fontname, WERD_RES *word)
 
void LearnPieces (const char *fontname, int start, int length, float threshold, CharSegmentationType segmentation, const char *correct_text, WERD_RES *word)
 
void InitAdaptiveClassifier (TessdataManager *mgr)
 
void InitAdaptedClass (TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, ADAPT_CLASS Class, ADAPT_TEMPLATES Templates)
 
void AmbigClassifier (const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, const TBLOB *blob, INT_TEMPLATES templates, ADAPT_CLASS *classes, UNICHAR_ID *ambiguities, ADAPT_RESULTS *results)
 
void MasterMatcher (INT_TEMPLATES templates, int16_t num_features, const INT_FEATURE_STRUCT *features, const uint8_t *norm_factors, ADAPT_CLASS *classes, int debug, int matcher_multiplier, const TBOX &blob_box, const GenericVector< CP_RESULT_STRUCT > &results, ADAPT_RESULTS *final_results)
 
void ExpandShapesAndApplyCorrections (ADAPT_CLASS *classes, bool debug, int class_id, int bottom, int top, float cp_rating, int blob_length, int matcher_multiplier, const uint8_t *cn_factors, UnicharRating *int_result, ADAPT_RESULTS *final_results)
 
double ComputeCorrectedRating (bool debug, int unichar_id, double cp_rating, double im_rating, int feature_misses, int bottom, int top, int blob_length, int matcher_multiplier, const uint8_t *cn_factors)
 
void ConvertMatchesToChoices (const DENORM &denorm, const TBOX &box, ADAPT_RESULTS *Results, BLOB_CHOICE_LIST *Choices)
 
void AddNewResult (const UnicharRating &new_result, ADAPT_RESULTS *results)
 
int GetAdaptiveFeatures (TBLOB *Blob, INT_FEATURE_ARRAY IntFeatures, FEATURE_SET *FloatFeatures)
 
void DebugAdaptiveClassifier (TBLOB *Blob, ADAPT_RESULTS *Results)
 
PROTO_ID MakeNewTempProtos (FEATURE_SET Features, int NumBadFeat, FEATURE_ID BadFeat[], INT_CLASS IClass, ADAPT_CLASS Class, BIT_VECTOR TempProtoMask)
 
int MakeNewTemporaryConfig (ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int FontinfoId, int NumFeatures, INT_FEATURE_ARRAY Features, FEATURE_SET FloatFeatures)
 
void MakePermanent (ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int ConfigId, TBLOB *Blob)
 
void PrintAdaptiveMatchResults (const ADAPT_RESULTS &results)
 
void RemoveExtraPuncs (ADAPT_RESULTS *Results)
 
void RemoveBadMatches (ADAPT_RESULTS *Results)
 
void SetAdaptiveThreshold (float Threshold)
 
void ShowBestMatchFor (int shape_id, const INT_FEATURE_STRUCT *features, int num_features)
 
STRING ClassIDToDebugStr (const INT_TEMPLATES_STRUCT *templates, int class_id, int config_id) const
 
int ClassAndConfigIDToFontOrShapeID (int class_id, int int_result_config) const
 
int ShapeIDToClassID (int shape_id) const
 
UNICHAR_ID * BaselineClassifier (TBLOB *Blob, const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, ADAPT_TEMPLATES Templates, ADAPT_RESULTS *Results)
 
int CharNormClassifier (TBLOB *blob, const TrainingSample &sample, ADAPT_RESULTS *adapt_results)
 
int CharNormTrainingSample (bool pruner_only, int keep_this, const TrainingSample &sample, GenericVector< UnicharRating > *results)
 
UNICHAR_ID * GetAmbiguities (TBLOB *Blob, CLASS_ID CorrectClass)
 
void DoAdaptiveMatch (TBLOB *Blob, ADAPT_RESULTS *Results)
 
void AdaptToChar (TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, float Threshold, ADAPT_TEMPLATES adaptive_templates)
 
void DisplayAdaptedChar (TBLOB *blob, INT_CLASS_STRUCT *int_class)
 
bool AdaptableWord (WERD_RES *word)
 
void EndAdaptiveClassifier ()
 
void SettupPass1 ()
 
void SettupPass2 ()
 
void AdaptiveClassifier (TBLOB *Blob, BLOB_CHOICE_LIST *Choices)
 
void ClassifyAsNoise (ADAPT_RESULTS *Results)
 
void ResetAdaptiveClassifierInternal ()
 
void SwitchAdaptiveClassifier ()
 
void StartBackupAdaptiveClassifier ()
 
int GetCharNormFeature (const INT_FX_RESULT_STRUCT &fx_info, INT_TEMPLATES templates, uint8_t *pruner_norm_array, uint8_t *char_norm_array)
 
void ComputeCharNormArrays (FEATURE_STRUCT *norm_feature, INT_TEMPLATES_STRUCT *templates, uint8_t *char_norm_array, uint8_t *pruner_array)
 
bool TempConfigReliable (CLASS_ID class_id, const TEMP_CONFIG &config)
 
void UpdateAmbigsGroup (CLASS_ID class_id, TBLOB *Blob)
 
bool AdaptiveClassifierIsFull () const
 
bool AdaptiveClassifierIsEmpty () const
 
bool LooksLikeGarbage (TBLOB *blob)
 
void RefreshDebugWindow (ScrollView **win, const char *msg, int y_offset, const TBOX &wbox)
 
void ClearCharNormArray (uint8_t *char_norm_array)
 
void ComputeIntCharNormArray (const FEATURE_STRUCT &norm_feature, uint8_t *char_norm_array)
 
void ComputeIntFeatures (FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures)
 
INT_TEMPLATES ReadIntTemplates (TFile *fp)
 
void WriteIntTemplates (FILE *File, INT_TEMPLATES Templates, const UNICHARSET &target_unicharset)
 
CLASS_ID GetClassToDebug (const char *Prompt, bool *adaptive_on, bool *pretrained_on, int *shape_id)
 
void ShowMatchDisplay ()
 
UnicityTable< FontInfo > & get_fontinfo_table ()
 
const UnicityTable< FontInfo > & get_fontinfo_table () const
 
UnicityTable< FontSet > & get_fontset_table ()
 
void NormalizeOutlines (LIST Outlines, float *XScale, float *YScale)
 
FEATURE_SET ExtractOutlineFeatures (TBLOB *Blob)
 
FEATURE_SET ExtractPicoFeatures (TBLOB *Blob)
 
FEATURE_SET ExtractIntCNFeatures (const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
 
FEATURE_SET ExtractIntGeoFeatures (const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
 
void LearnBlob (const STRING &fontname, TBLOB *Blob, const DENORM &cn_denorm, const INT_FX_RESULT_STRUCT &fx_info, const char *blob_text)
 
bool WriteTRFile (const STRING &filename)
 
- Public Member Functions inherited from tesseract::CCStruct
 CCStruct ()=default
 
virtual ~CCStruct ()
 
- Public Member Functions inherited from tesseract::CUtil
 CUtil ()=default
 
virtual ~CUtil ()
 
void read_variables (const char *filename, bool global_only)
 
- Public Member Functions inherited from tesseract::CCUtil
 CCUtil ()
 
virtual ~CCUtil ()
 
void main_setup (const char *argv0, const char *basename)
 CCUtil::main_setup - set location of tessdata and name of image. More...
 
ParamsVectors * params ()
 

Public Attributes

bool merge_fragments_in_matrix = TRUE
 
bool wordrec_no_block = FALSE
 
bool wordrec_enable_assoc = TRUE
 
bool force_word_assoc = FALSE
 
double wordrec_worst_state = 1
 
bool fragments_guide_chopper = FALSE
 
int repair_unchopped_blobs = 1
 
double tessedit_certainty_threshold = -2.25
 
int chop_debug = 0
 
bool chop_enable = 1
 
bool chop_vertical_creep = 0
 
int chop_split_length = 10000
 
int chop_same_distance = 2
 
int chop_min_outline_points = 6
 
int chop_seam_pile_size = 150
 
bool chop_new_seam_pile = 1
 
int chop_inside_angle = -50
 
int chop_min_outline_area = 2000
 
double chop_split_dist_knob = 0.5
 
double chop_overlap_knob = 0.9
 
double chop_center_knob = 0.15
 
int chop_centered_maxwidth = 90
 
double chop_sharpness_knob = 0.06
 
double chop_width_change_knob = 5.0
 
double chop_ok_split = 100.0
 
double chop_good_split = 50.0
 
int chop_x_y_weight = 3
 
int segment_adjust_debug = 0
 
bool assume_fixed_pitch_char_segment = FALSE
 
int wordrec_debug_level = 0
 
int wordrec_max_join_chunks = 4
 
bool wordrec_skip_no_truth_words = false
 
bool wordrec_debug_blamer = false
 
bool wordrec_run_blamer = false
 
int segsearch_debug_level = 0
 
int segsearch_max_pain_points = 2000
 
int segsearch_max_futile_classifications = 10
 
double segsearch_max_char_wh_ratio = 2.0
 
bool save_alt_choices = true
 
std::unique_ptr< LanguageModel > language_model_
 
PRIORITY pass2_ok_split
 
WERD_CHOICE * prev_word_best_choice_
 
GenericVector< int > blame_reasons_
 
void(Wordrec::* fill_lattice_ )(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
 
- Public Attributes inherited from tesseract::Classify
bool allow_blob_division = true
 
bool prioritize_division = FALSE
 
int tessedit_single_match = FALSE
 
bool classify_enable_learning = true
 
int classify_debug_level = 0
 
int classify_norm_method = character
 
double classify_char_norm_range = 0.2
 
double classify_min_norm_scale_x = 0.0
 
double classify_max_norm_scale_x = 0.325
 
double classify_min_norm_scale_y = 0.0
 
double classify_max_norm_scale_y = 0.325
 
double classify_max_rating_ratio = 1.5
 
double classify_max_certainty_margin = 5.5
 
bool tess_cn_matching = 0
 
bool tess_bn_matching = 0
 
bool classify_enable_adaptive_matcher = 1
 
bool classify_use_pre_adapted_templates = 0
 
bool classify_save_adapted_templates = 0
 
bool classify_enable_adaptive_debugger = 0
 
bool classify_nonlinear_norm = 0
 
int matcher_debug_level = 0
 
int matcher_debug_flags = 0
 
int classify_learning_debug_level = 0
 
double matcher_good_threshold = 0.125
 
double matcher_reliable_adaptive_result = 0.0
 
double matcher_perfect_threshold = 0.02
 
double matcher_bad_match_pad = 0.15
 
double matcher_rating_margin = 0.1
 
double matcher_avg_noise_size = 12.0
 
int matcher_permanent_classes_min = 1
 
int matcher_min_examples_for_prototyping = 3
 
int matcher_sufficient_examples_for_prototyping = 5
 
double matcher_clustering_max_angle_delta = 0.015
 
double classify_misfit_junk_penalty = 0.0
 
double rating_scale = 1.5
 
double certainty_scale = 20.0
 
double tessedit_class_miss_scale = 0.00390625
 
double classify_adapted_pruning_factor = 2.5
 
double classify_adapted_pruning_threshold = -1.0
 
int classify_adapt_proto_threshold = 230
 
int classify_adapt_feature_threshold = 230
 
bool disable_character_fragments = TRUE
 
double classify_character_fragments_garbage_certainty_threshold = -3.0
 
bool classify_debug_character_fragments = FALSE
 
bool matcher_debug_separate_windows = FALSE
 
char * classify_learn_debug_str = ""
 
int classify_class_pruner_threshold = 229
 
int classify_class_pruner_multiplier = 15
 
int classify_cp_cutoff_strength = 7
 
int classify_integer_matcher_multiplier = 10
 
INT_TEMPLATES PreTrainedTemplates
 
ADAPT_TEMPLATES AdaptedTemplates
 
ADAPT_TEMPLATES BackupAdaptedTemplates
 
BIT_VECTOR AllProtosOn
 
BIT_VECTOR AllConfigsOn
 
BIT_VECTOR AllConfigsOff
 
BIT_VECTOR TempProtoMask
 
bool EnableLearning
 
NORM_PROTOS * NormProtos
 
UnicityTable< FontInfo > fontinfo_table_
 
UnicityTable< FontSet > fontset_table_
 
int il1_adaption_test = 0
 
bool classify_bln_numeric_mode = 0
 
double speckle_large_max_size = 0.30
 
double speckle_rating_penalty = 10.0
 
- Public Attributes inherited from tesseract::CCUtil
STRING datadir
 
STRING imagebasename
 
STRING lang
 
STRING language_data_path_prefix
 
UNICHARSET unicharset
 
UnicharAmbigs unichar_ambigs
 
STRING imagefile
 
STRING directory
 
int ambigs_debug_level = 0
 
bool use_definite_ambigs_for_classifier = 0
 
bool use_ambigs_for_adaption = 0
 

Protected Member Functions

bool SegSearchDone (int num_futile_classifications)
 
void UpdateSegSearchNodes (float rating_cert_scale, int starting_col, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
void ProcessSegSearchPainPoint (float pain_point_priority, const MATRIX_COORD &pain_point, const char *pain_point_type, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle)
 
void ResetNGramSearch (WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, GenericVector< SegSearchPending > *pending)
 
void InitBlamerForSegSearch (WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle, STRING *blamer_debug)
 

Additional Inherited Members

- Static Public Member Functions inherited from tesseract::Classify
static void SetupBLCNDenorms (const TBLOB &blob, bool nonlinear_norm, DENORM *bl_denorm, DENORM *cn_denorm, INT_FX_RESULT_STRUCT *fx_info)
 
static void ExtractFeatures (const TBLOB &blob, bool nonlinear_norm, GenericVector< INT_FEATURE_STRUCT > *bl_features, GenericVector< INT_FEATURE_STRUCT > *cn_features, INT_FX_RESULT_STRUCT *results, GenericVector< int > *outline_cn_counts)
 
- Static Public Attributes inherited from tesseract::CCStruct
static const double kDescenderFraction = 0.25
 
static const double kXHeightFraction = 0.5
 
static const double kAscenderFraction = 0.25
 
static const double kXHeightCapRatio
 
- Protected Attributes inherited from tesseract::Classify
IntegerMatcher im_
 
FEATURE_DEFS_STRUCT feature_defs_
 
ShapeTable * shape_table_
 

Detailed Description

Definition at line 192 of file wordrec.h.

Constructor & Destructor Documentation

◆ Wordrec()

tesseract::Wordrec::Wordrec ( )

Definition at line 47 of file wordrec.cpp.

47  :
48  // control parameters
50  "Merge the fragments in the ratings matrix and delete them"
51  " after merging", params()),
52  BOOL_MEMBER(wordrec_no_block, FALSE, "Don't output block information",
53  params()),
54  BOOL_MEMBER(wordrec_enable_assoc, TRUE, "Associator Enable",
55  params()),
57  "force associator to run regardless of what enable_assoc is."
58  " This is used for CJK where component grouping is necessary.",
59  CCUtil::params()),
60  double_MEMBER(wordrec_worst_state, 1.0, "Worst segmentation state",
61  params()),
63  "Use information from fragments to guide chopping process",
64  params()),
65  INT_MEMBER(repair_unchopped_blobs, 1, "Fix blobs that aren't chopped",
66  params()),
67  double_MEMBER(tessedit_certainty_threshold, -2.25, "Good blob limit",
68  params()),
69  INT_MEMBER(chop_debug, 0, "Chop debug",
70  params()),
71  BOOL_MEMBER(chop_enable, 1, "Chop enable",
72  params()),
73  BOOL_MEMBER(chop_vertical_creep, 0, "Vertical creep",
74  params()),
75  INT_MEMBER(chop_split_length, 10000, "Split Length",
76  params()),
77  INT_MEMBER(chop_same_distance, 2, "Same distance",
78  params()),
79  INT_MEMBER(chop_min_outline_points, 6, "Min Number of Points on Outline",
80  params()),
81  INT_MEMBER(chop_seam_pile_size, 150, "Max number of seams in seam_pile",
82  params()),
83  BOOL_MEMBER(chop_new_seam_pile, 1, "Use new seam_pile", params()),
84  INT_MEMBER(chop_inside_angle, -50, "Min Inside Angle Bend",
85  params()),
86  INT_MEMBER(chop_min_outline_area, 2000, "Min Outline Area",
87  params()),
88  double_MEMBER(chop_split_dist_knob, 0.5, "Split length adjustment",
89  params()),
90  double_MEMBER(chop_overlap_knob, 0.9, "Split overlap adjustment",
91  params()),
92  double_MEMBER(chop_center_knob, 0.15, "Split center adjustment",
93  params()),
94  INT_MEMBER(chop_centered_maxwidth, 90, "Width of (smaller) chopped blobs "
95  "above which we don't care that a chop is not near the center.",
96  params()),
97  double_MEMBER(chop_sharpness_knob, 0.06, "Split sharpness adjustment",
98  params()),
99  double_MEMBER(chop_width_change_knob, 5.0, "Width change adjustment",
100  params()),
101  double_MEMBER(chop_ok_split, 100.0, "OK split limit",
102  params()),
103  double_MEMBER(chop_good_split, 50.0, "Good split limit",
104  params()),
105  INT_MEMBER(chop_x_y_weight, 3, "X / Y length weight",
106  params()),
107  INT_MEMBER(segment_adjust_debug, 0, "Segmentation adjustment debug",
108  params()),
110  "include fixed-pitch heuristics in char segmentation",
111  params()),
113  "Debug level for wordrec", params()),
115  "Max number of broken pieces to associate", params()),
117  "Only run OCR for words that had truth recorded in BlamerBundle",
118  params()),
120  "Print blamer debug messages", params()),
122  "Try to set the blame for errors", params()),
124  "SegSearch debug level", params()),
126  "Maximum number of pain points stored in the queue",
127  params()),
129  "Maximum number of pain point classifications per chunk that"
130  " did not result in finding a better word choice.",
131  params()),
133  "Maximum character width-to-height ratio", params()),
135  "Save alternative paths found during chopping"
136  " and segmentation search",
137  params()),
138  pass2_ok_split(0.0f) {
139  prev_word_best_choice_ = nullptr;
140  language_model_.reset(new LanguageModel(&get_fontinfo_table(),
141  &(getDict())));
142  fill_lattice_ = nullptr;
143 }
int repair_unchopped_blobs
Definition: wordrec.h:206
double segsearch_max_char_wh_ratio
Definition: wordrec.h:244
int chop_same_distance
Definition: wordrec.h:212
#define TRUE
Definition: capi.h:51
double chop_center_knob
Definition: wordrec.h:220
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:481
int wordrec_max_join_chunks
Definition: wordrec.h:233
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:386
bool chop_new_seam_pile
Definition: wordrec.h:215
bool wordrec_skip_no_truth_words
Definition: wordrec.h:235
int segsearch_max_pain_points
Definition: wordrec.h:240
int wordrec_debug_level
Definition: wordrec.h:231
double chop_split_dist_knob
Definition: wordrec.h:218
#define INT_MEMBER(name, val, comment, vec)
Definition: params.h:288
int chop_centered_maxwidth
Definition: wordrec.h:222
double chop_width_change_knob
Definition: wordrec.h:224
bool fragments_guide_chopper
Definition: wordrec.h:205
#define BOOL_MEMBER(name, val, comment, vec)
Definition: params.h:291
double chop_overlap_knob
Definition: wordrec.h:219
void(Wordrec::* fill_lattice_)(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
Definition: wordrec.h:485
double chop_sharpness_knob
Definition: wordrec.h:223
bool merge_fragments_in_matrix
Definition: wordrec.h:197
#define double_MEMBER(name, val, comment, vec)
Definition: params.h:297
int chop_min_outline_area
Definition: wordrec.h:217
bool save_alt_choices
Definition: wordrec.h:247
std::unique_ptr< LanguageModel > language_model_
Definition: wordrec.h:476
int chop_min_outline_points
Definition: wordrec.h:213
bool chop_vertical_creep
Definition: wordrec.h:210
#define FALSE
Definition: capi.h:52
double wordrec_worst_state
Definition: wordrec.h:203
ParamsVectors * params()
Definition: ccutil.h:62
double chop_good_split
Definition: wordrec.h:226
bool wordrec_no_block
Definition: wordrec.h:198
int segsearch_max_futile_classifications
Definition: wordrec.h:242
double tessedit_certainty_threshold
Definition: wordrec.h:207
double chop_ok_split
Definition: wordrec.h:225
int segment_adjust_debug
Definition: wordrec.h:228
bool force_word_assoc
Definition: wordrec.h:202
bool assume_fixed_pitch_char_segment
Definition: wordrec.h:230
int segsearch_debug_level
Definition: wordrec.h:238
bool wordrec_enable_assoc
Definition: wordrec.h:199
bool wordrec_run_blamer
Definition: wordrec.h:237
virtual Dict & getDict()
Definition: classify.h:107
bool wordrec_debug_blamer
Definition: wordrec.h:236
int chop_seam_pile_size
Definition: wordrec.h:214
PRIORITY pass2_ok_split
Definition: wordrec.h:477

◆ ~Wordrec()

virtual tesseract::Wordrec::~Wordrec ( )
virtual default

Member Function Documentation

◆ add_point_to_list()

void tesseract::Wordrec::add_point_to_list ( PointHeap * point_heap,
EDGEPT * point 
)

Definition at line 63 of file chop.cpp.

63  {
64  if (point_heap->size() < MAX_NUM_POINTS - 2) {
65  PointPair pair(point_priority(point), point);
66  point_heap->Push(&pair);
67  }
68 
69 #ifndef GRAPHICS_DISABLED
70  if (chop_debug > 2)
71  mark_outline(point);
72 #endif
73 }
#define MAX_NUM_POINTS
Definition: chop.h:39
PRIORITY point_priority(EDGEPT *point)
Definition: chop.cpp:53
void Push(Pair *entry)
Definition: genericheap.h:95
void mark_outline(EDGEPT *edgept)
Definition: plotedges.cpp:92

◆ add_seam_to_queue()

void tesseract::Wordrec::add_seam_to_queue ( float  new_priority,
SEAM * new_seam,
SeamQueue * seams 
)

Definition at line 73 of file findseam.cpp.

74  {
75  if (new_seam == nullptr) return;
76  if (chop_debug) {
77  tprintf("Pushing new seam with priority %g :", new_priority);
78  new_seam->Print("seam: ");
79  }
80  if (seams->size() >= MAX_NUM_SEAMS) {
81  SeamPair old_pair(0, nullptr);
82  if (seams->PopWorst(&old_pair) && old_pair.key() <= new_priority) {
83  if (chop_debug) {
84  tprintf("Old seam staying with priority %g\n", old_pair.key());
85  }
86  delete new_seam;
87  seams->Push(&old_pair);
88  return;
89  } else if (chop_debug) {
90  tprintf("New seam with priority %g beats old worst seam with %g\n",
91  new_priority, old_pair.key());
92  }
93  }
94  SeamPair new_pair(new_priority, new_seam);
95  seams->Push(&new_pair);
96 }
bool PopWorst(Pair *entry)
Definition: genericheap.h:140
void Print(const char *label) const
Definition: seam.cpp:160
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
void Push(Pair *entry)
Definition: genericheap.h:95
#define MAX_NUM_SEAMS
Definition: findseam.cpp:55

◆ angle_change()

int tesseract::Wordrec::angle_change ( EDGEPT * point1,
EDGEPT * point2,
EDGEPT * point3 
)

Definition at line 87 of file chop.cpp.

87  {
88  VECTOR vector1;
89  VECTOR vector2;
90 
91  int angle;
92 
93  /* Compute angle */
94  vector1.x = point2->pos.x - point1->pos.x;
95  vector1.y = point2->pos.y - point1->pos.y;
96  vector2.x = point3->pos.x - point2->pos.x;
97  vector2.y = point3->pos.y - point2->pos.y;
98  /* Use cross product */
99  float length = std::sqrt(static_cast<float>(LENGTH(vector1)) * LENGTH(vector2));
100  if ((int) length == 0)
101  return (0);
102  angle = static_cast<int>(floor(asin(CROSS (vector1, vector2) /
103  length) / M_PI * 180.0 + 0.5));
104 
105  /* Use dot product */
106  if (SCALAR (vector1, vector2) < 0)
107  angle = 180 - angle;
108  /* Adjust angle */
109  if (angle > 180)
110  angle -= 360;
111  if (angle <= -180)
112  angle += 360;
113  return (angle);
114 }
TPOINT pos
Definition: blobs.h:170
#define CROSS(a, b)
Definition: vecfuncs.h:52
#define SCALAR(a, b)
Definition: vecfuncs.h:61
#define LENGTH(a)
Definition: vecfuncs.h:70
int16_t x
Definition: blobs.h:78
Definition: blobs.h:57
int16_t y
Definition: blobs.h:79

◆ attempt_blob_chop()

SEAM * tesseract::Wordrec::attempt_blob_chop ( TWERD * word,
TBLOB * blob,
int32_t  blob_number,
bool  italic_blob,
const GenericVector< SEAM *> &  seams 
)

Definition at line 175 of file chopper.cpp.

177  {
178  if (repair_unchopped_blobs)
179  preserve_outline_tree (blob->outlines);
180  TBLOB *other_blob = TBLOB::ShallowCopy(*blob); /* Make new blob */
181  // Insert it into the word.
182  word->blobs.insert(other_blob, blob_number + 1);
183 
184  SEAM *seam = nullptr;
185  if (prioritize_division) {
186  TPOINT location;
187  if (divisible_blob(blob, italic_blob, &location)) {
188  seam = new SEAM(0.0f, location);
189  }
190  }
191  if (seam == nullptr)
192  seam = pick_good_seam(blob);
193  if (chop_debug) {
194  if (seam != nullptr)
195  seam->Print("Good seam picked=");
196  else
197  tprintf("\n** no seam picked *** \n");
198  }
199  if (seam) {
200  seam->ApplySeam(italic_blob, blob, other_blob);
201  }
202 
203  seam = CheckSeam(chop_debug, blob_number, word, blob, other_blob,
204  seams, seam);
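205  if (seam == nullptr) {
206  if (repair_unchopped_blobs)
207  restore_outline_tree(blob->outlines);
208  if (allow_blob_division && !prioritize_division) {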
209  // If the blob can simply be divided into outlines, then do that.
210  TPOINT location;
211  if (divisible_blob(blob, italic_blob, &location)) {
212  other_blob = TBLOB::ShallowCopy(*blob); /* Make new blob */
213  word->blobs.insert(other_blob, blob_number + 1);
214  seam = new SEAM(0.0f, location);
215  seam->ApplySeam(italic_blob, blob, other_blob);
216  seam = CheckSeam(chop_debug, blob_number, word, blob, other_blob,
217  seams, seam);
218  }
219  }
220  }
221  if (seam != nullptr) {
222  // Make sure this seam doesn't get chopped again.
223  seam->Finalize();
224  }
225  return seam;
226 }
int repair_unchopped_blobs
Definition: wordrec.h:206
bool allow_blob_division
Definition: classify.h:423
Definition: seam.h:44
void ApplySeam(bool italic_blob, TBLOB *blob, TBLOB *other_blob) const
Definition: seam.cpp:124
SEAM * pick_good_seam(TBLOB *blob)
Definition: findseam.cpp:224
bool divisible_blob(TBLOB *blob, bool italic_blob, TPOINT *location)
Definition: blobs.cpp:923
bool prioritize_division
Definition: classify.h:428
void Print(const char *label) const
Definition: seam.cpp:160
void insert(const T &t, int index)
void Finalize()
Definition: seam.h:116
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
void preserve_outline_tree(TESSLINE *srcline)
Definition: chopper.cpp:88
GenericVector< TBLOB * > blobs
Definition: blobs.h:443
static TBLOB * ShallowCopy(const TBLOB &src)
Definition: blobs.cpp:345
void restore_outline_tree(TESSLINE *srcline)
Definition: chopper.cpp:128
Definition: blobs.h:268
Definition: blobs.h:57
TESSLINE * outlines
Definition: blobs.h:384

◆ call_matcher()

BLOB_CHOICE_LIST * tesseract::Wordrec::call_matcher ( TBLOB *  blob)

Definition at line 141 of file tface.cpp.

141  {
142  // Rotate the blob for classification if necessary.
143  TBLOB* rotated_blob = tessblob->ClassifyNormalizeIfNeeded();
144  if (rotated_blob == nullptr) {
145  rotated_blob = tessblob;
146  }
147  BLOB_CHOICE_LIST *ratings = new BLOB_CHOICE_LIST(); // matcher result
148  AdaptiveClassifier(rotated_blob, ratings);
149  if (rotated_blob != tessblob) {
150  delete rotated_blob;
151  }
152  return ratings;
153 }
void AdaptiveClassifier(TBLOB *Blob, BLOB_CHOICE_LIST *Choices)
Definition: adaptmatch.cpp:192
TBLOB * ClassifyNormalizeIfNeeded() const
Definition: blobs.cpp:356
Definition: blobs.h:268

◆ CallFillLattice()

void tesseract::Wordrec::CallFillLattice ( const MATRIX &  ratings,
const WERD_CHOICE_LIST &  best_choices,
const UNICHARSET &  unicharset,
BlamerBundle *  blamer_bundle 
)
inline

Definition at line 264 of file wordrec.h.

267  {
268  (this->*fill_lattice_)(ratings, best_choices, unicharset, blamer_bundle);
269  }
void(Wordrec::* fill_lattice_)(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
Definition: wordrec.h:485
UNICHARSET unicharset
Definition: ccutil.h:68

◆ cc_recog()

void tesseract::Wordrec::cc_recog ( WERD_RES *  word)

Definition at line 113 of file tface.cpp.

113  {
114  getDict().reset_hyphen_vars(word->word->flag(W_EOL));
115  chop_word_main(word);
116  word->DebugWordChoices(getDict().stopper_debug_level >= 1,
117  getDict().word_to_debug.string());
118  ASSERT_HOST(word->StatesAllValid());
119 }
void reset_hyphen_vars(bool last_word_on_line)
Definition: hyphen.cpp:28
Definition: werd.h:35
bool flag(WERD_FLAGS mask) const
Definition: werd.h:126
void DebugWordChoices(bool debug, const char *word_to_debug)
Definition: pageres.cpp:486
void chop_word_main(WERD_RES *word)
Definition: chopper.cpp:399
virtual Dict & getDict()
Definition: classify.h:107
bool StatesAllValid()
Definition: pageres.cpp:464
#define ASSERT_HOST(x)
Definition: errcode.h:84
WERD * word
Definition: pageres.h:189

◆ choose_best_seam()

void tesseract::Wordrec::choose_best_seam ( SeamQueue *  seam_queue,
const SPLIT *  split,
PRIORITY  priority,
SEAM **  seam_result,
TBLOB *  blob,
SeamPile *  seam_pile 
)

Definition at line 112 of file findseam.cpp.

114  {
115  SEAM *seam;
116  char str[80];
117  float my_priority;
118  /* Add seam of split */
119  my_priority = priority;
120  if (split != nullptr) {
121  TPOINT split_point = split->point1->pos;
122  split_point += split->point2->pos;
123  split_point /= 2;
124  seam = new SEAM(my_priority, split_point, *split);
125  if (chop_debug > 1) seam->Print("Partial priority ");
126  add_seam_to_queue(my_priority, seam, seam_queue);
127 
128  if (my_priority > chop_good_split)
129  return;
130  }
131 
132  TBOX bbox = blob->bounding_box();
133  /* Queue loop */
134  while (!seam_queue->empty()) {
135  SeamPair seam_pair;
136  seam_queue->Pop(&seam_pair);
137  seam = seam_pair.extract_data();
138  /* Set full priority */
139  my_priority = seam->FullPriority(bbox.left(), bbox.right(),
140  chop_overlap_knob, chop_centered_maxwidth,
141  chop_center_knob, chop_width_change_knob);
142  if (chop_debug) {
143  sprintf (str, "Full my_priority %0.0f, ", my_priority);
144  seam->Print(str);
145  }
146 
147  if ((*seam_result == nullptr || (*seam_result)->priority() > my_priority) &&
148  my_priority < chop_ok_split) {
149  /* No crossing */
150  if (seam->IsHealthy(*blob, chop_min_outline_points,
151  chop_min_outline_area)) {
152  delete *seam_result;
153  *seam_result = new SEAM(*seam);
154  (*seam_result)->set_priority(my_priority);
155  } else {
156  delete seam;
157  seam = nullptr;
158  my_priority = BAD_PRIORITY;
159  }
160  }
161 
162  if (my_priority < chop_good_split) {
163  delete seam;
164  return; /* Made good answer */
165  }
166 
167  if (seam) {
168  /* Combine with others */
169  if (seam_pile->size() < chop_seam_pile_size) {
170  combine_seam(*seam_pile, seam, seam_queue);
171  SeamDecPair pair(seam_pair.key(), seam);
172  seam_pile->Push(&pair);
173  } else if (chop_new_seam_pile &&
174  seam_pile->size() == chop_seam_pile_size &&
175  seam_pile->PeekTop().key() > seam_pair.key()) {
176  combine_seam(*seam_pile, seam, seam_queue);
177  SeamDecPair pair;
178  seam_pile->Pop(&pair); // pop the worst.
179  // Replace the seam in pair (deleting the old one) with
180  // the new seam and score, then push back into the heap.
181  pair.set_key(seam_pair.key());
182  pair.set_data(seam);
183  seam_pile->Push(&pair);
184  } else {
185  delete seam;
186  }
187  }
188 
189  my_priority = seam_queue->empty() ? NO_FULL_PRIORITY
190  : seam_queue->PeekTop().key();
191  if ((my_priority > chop_ok_split) ||
192  (my_priority > chop_good_split && split))
193  return;
194  }
195 }
bool IsHealthy(const TBLOB &blob, int min_points, int min_area) const
Definition: seam.cpp:72
double chop_center_knob
Definition: wordrec.h:220
TPOINT pos
Definition: blobs.h:170
bool chop_new_seam_pile
Definition: wordrec.h:215
Definition: seam.h:44
EDGEPT * point2
Definition: split.h:104
Definition: rect.h:34
void combine_seam(const SeamPile &seam_pile, const SEAM *seam, SeamQueue *seam_queue)
Definition: findseam.cpp:205
int chop_centered_maxwidth
Definition: wordrec.h:222
double chop_width_change_knob
Definition: wordrec.h:224
void set_data(Data *new_data)
Definition: kdpair.h:126
double chop_overlap_knob
Definition: wordrec.h:219
const Key & key() const
Definition: kdpair.h:116
void Print(const char *label) const
Definition: seam.cpp:160
int chop_min_outline_area
Definition: wordrec.h:217
#define BAD_PRIORITY
Definition: findseam.cpp:60
int16_t left() const
Definition: rect.h:72
int chop_min_outline_points
Definition: wordrec.h:213
bool empty() const
Definition: genericheap.h:68
EDGEPT * point1
Definition: split.h:103
void set_key(const Key &new_key)
Definition: kdpair.h:119
#define NO_FULL_PRIORITY
Definition: findseam.cpp:58
const Pair & PeekTop() const
Definition: genericheap.h:108
TBOX bounding_box() const
Definition: blobs.cpp:478
void Push(Pair *entry)
Definition: genericheap.h:95
double chop_good_split
Definition: wordrec.h:226
Data * extract_data()
Definition: kdpair.h:131
double chop_ok_split
Definition: wordrec.h:225
int16_t right() const
Definition: rect.h:79
void add_seam_to_queue(float new_priority, SEAM *new_seam, SeamQueue *seams)
Definition: findseam.cpp:73
Definition: blobs.h:57
int chop_seam_pile_size
Definition: wordrec.h:214
bool Pop(Pair *entry)
Definition: genericheap.h:118
float FullPriority(int xmin, int xmax, double overlap_knob, int centered_maxwidth, double center_knob, double width_change_knob) const
Definition: seam.cpp:245

◆ chop_numbered_blob()

SEAM * tesseract::Wordrec::chop_numbered_blob ( TWERD *  word,
int32_t  blob_number,
bool  italic_blob,
const GenericVector< SEAM *> &  seams 
)

Definition at line 229 of file chopper.cpp.

231  {
232  return attempt_blob_chop(word, word->blobs[blob_number], blob_number,
233  italic_blob, seams);
234 }
GenericVector< TBLOB * > blobs
Definition: blobs.h:443
SEAM * attempt_blob_chop(TWERD *word, TBLOB *blob, int32_t blob_number, bool italic_blob, const GenericVector< SEAM *> &seams)
Definition: chopper.cpp:175

◆ chop_one_blob()

SEAM * tesseract::Wordrec::chop_one_blob ( const GenericVector< TBOX > &  boxes,
const GenericVector< BLOB_CHOICE *> &  blob_choices,
WERD_RES *  word_res,
int *  blob_number 
)

Definition at line 379 of file chopper.cpp.

382  {
383  if (prioritize_division) {
384  return chop_overlapping_blob(boxes, true, word_res, blob_number);
385  } else {
386  return improve_one_blob(blob_choices, nullptr, false, true, word_res,
387  blob_number);
388  }
389 }
SEAM * improve_one_blob(const GenericVector< BLOB_CHOICE *> &blob_choices, DANGERR *fixpt, bool split_next_to_fragment, bool italic_blob, WERD_RES *word, int *blob_number)
Definition: chopper.cpp:335
bool prioritize_division
Definition: classify.h:428
SEAM * chop_overlapping_blob(const GenericVector< TBOX > &boxes, bool italic_blob, WERD_RES *word_res, int *blob_number)
Definition: chopper.cpp:237

◆ chop_overlapping_blob()

SEAM * tesseract::Wordrec::chop_overlapping_blob ( const GenericVector< TBOX > &  boxes,
bool  italic_blob,
WERD_RES *  word_res,
int *  blob_number 
)

Definition at line 237 of file chopper.cpp.

239  {
240  TWERD *word = word_res->chopped_word;
241  for (*blob_number = 0; *blob_number < word->NumBlobs(); ++*blob_number) {
242  TBLOB *blob = word->blobs[*blob_number];
243  TPOINT topleft, botright;
244  topleft.x = blob->bounding_box().left();
245  topleft.y = blob->bounding_box().top();
246  botright.x = blob->bounding_box().right();
247  botright.y = blob->bounding_box().bottom();
248 
249  TPOINT original_topleft, original_botright;
250  word_res->denorm.DenormTransform(nullptr, topleft, &original_topleft);
251  word_res->denorm.DenormTransform(nullptr, botright, &original_botright);
252 
253  TBOX original_box = TBOX(original_topleft.x, original_botright.y,
254  original_botright.x, original_topleft.y);
255 
256  bool almost_equal_box = false;
257  int num_overlap = 0;
258  for (int i = 0; i < boxes.size(); i++) {
259  if (original_box.overlap_fraction(boxes[i]) > 0.125)
260  num_overlap++;
261  if (original_box.almost_equal(boxes[i], 3))
262  almost_equal_box = true;
263  }
264 
265  TPOINT location;
266  if (divisible_blob(blob, italic_blob, &location) ||
267  (!almost_equal_box && num_overlap > 1)) {
268  SEAM *seam = attempt_blob_chop(word, blob, *blob_number,
269  italic_blob, word_res->seam_array);
270  if (seam != nullptr)
271  return seam;
272  }
273  }
274 
275  *blob_number = -1;
276  return nullptr;
277 }
int size() const
Definition: genericvector.h:71
Definition: blobs.h:402
bool almost_equal(const TBOX &box, int tolerance) const
Definition: rect.cpp:258
Definition: seam.h:44
Definition: rect.h:34
int NumBlobs() const
Definition: blobs.h:432
bool divisible_blob(TBLOB *blob, bool italic_blob, TPOINT *location)
Definition: blobs.cpp:923
double overlap_fraction(const TBOX &box) const
Definition: rect.h:388
int16_t left() const
Definition: rect.h:72
void DenormTransform(const DENORM *last_denorm, const TPOINT &pt, TPOINT *original) const
Definition: normalis.cpp:390
int16_t top() const
Definition: rect.h:58
DENORM denorm
Definition: pageres.h:204
TBOX bounding_box() const
Definition: blobs.cpp:478
GenericVector< SEAM * > seam_array
Definition: pageres.h:217
GenericVector< TBLOB * > blobs
Definition: blobs.h:443
int16_t x
Definition: blobs.h:78
int16_t right() const
Definition: rect.h:79
Definition: blobs.h:268
SEAM * attempt_blob_chop(TWERD *word, TBLOB *blob, int32_t blob_number, bool italic_blob, const GenericVector< SEAM *> &seams)
Definition: chopper.cpp:175
TWERD * chopped_word
Definition: pageres.h:215
Definition: blobs.h:57
int16_t y
Definition: blobs.h:79
int16_t bottom() const
Definition: rect.h:65

◆ chop_word_main()

void tesseract::Wordrec::chop_word_main ( WERD_RES *  word)

Definition at line 399 of file chopper.cpp.

399  {
400  int num_blobs = word->chopped_word->NumBlobs();
401  if (word->ratings == nullptr) {
402  word->ratings = new MATRIX(num_blobs, wordrec_max_join_chunks);
403  }
404  if (word->ratings->get(0, 0) == nullptr) {
405  // Run initial classification.
406  for (int b = 0; b < num_blobs; ++b) {
407  BLOB_CHOICE_LIST* choices = classify_piece(word->seam_array, b, b,
408  "Initial:", word->chopped_word,
409  word->blamer_bundle);
410  word->ratings->put(b, b, choices);
411  }
412  } else {
413  // Blobs have been pre-classified. Set matrix cell for all blob choices
414  for (int col = 0; col < word->ratings->dimension(); ++col) {
415  for (int row = col; row < word->ratings->dimension() &&
416  row < col + word->ratings->bandwidth(); ++row) {
417  BLOB_CHOICE_LIST* choices = word->ratings->get(col, row);
418  if (choices != nullptr) {
419  BLOB_CHOICE_IT bc_it(choices);
420  for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
421  bc_it.data()->set_matrix_cell(col, row);
422  }
423  }
424  }
425  }
426  }
427 
428  // Run Segmentation Search.
429  BestChoiceBundle best_choice_bundle(word->ratings->dimension());
430  SegSearch(word, &best_choice_bundle, word->blamer_bundle);
431 
432  if (word->best_choice == nullptr) {
433  // SegSearch found no valid paths, so just use the leading diagonal.
434  word->FakeWordFromRatings(TOP_CHOICE_PERM);
435  }
436  word->RebuildBestState();
437  // If we finished without a hyphen at the end of the word, let the next word
438  // be found in the dictionary.
439  if (word->word->flag(W_EOL) &&
440  !getDict().has_hyphen_end(*word->best_choice)) {
441  getDict().reset_hyphen_vars(true);
442  }
443 
444  if (word->blamer_bundle != nullptr && this->fill_lattice_ != nullptr) {
445  CallFillLattice(*word->ratings, word->best_choices,
446  *word->uch_set, word->blamer_bundle);
447  }
448  if (wordrec_debug_level > 0) {
449  tprintf("Final Ratings Matrix:\n");
450  word->ratings->print(getDict().getUnicharset());
451  }
452  word->FilterWordChoices(getDict().stopper_debug_level);
453 }
void print(const UNICHARSET &unicharset) const
Definition: matrix.cpp:117
WERD_CHOICE_LIST best_choices
Definition: pageres.h:243
void FakeWordFromRatings(PermuterType permuter)
Definition: pageres.cpp:904
int wordrec_max_join_chunks
Definition: wordrec.h:233
void reset_hyphen_vars(bool last_word_on_line)
Definition: hyphen.cpp:28
int wordrec_debug_level
Definition: wordrec.h:231
int NumBlobs() const
Definition: blobs.h:432
Definition: werd.h:35
void CallFillLattice(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
Definition: wordrec.h:264
int bandwidth() const
Definition: matrix.h:535
void FilterWordChoices(int debug_level)
Definition: pageres.cpp:519
bool flag(WERD_FLAGS mask) const
Definition: werd.h:126
int dimension() const
Definition: matrix.h:533
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
void put(ICOORD pos, const T &thing)
Definition: matrix.h:220
GenericVector< SEAM * > seam_array
Definition: pageres.h:217
MATRIX * ratings
Definition: pageres.h:231
const UNICHARSET * uch_set
Definition: pageres.h:206
BlamerBundle * blamer_bundle
Definition: pageres.h:246
bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) const
Check whether the word has a hyphen at the end.
Definition: dict.h:144
void RebuildBestState()
Definition: pageres.cpp:814
virtual Dict & getDict()
Definition: classify.h:107
Definition: matrix.h:575
virtual BLOB_CHOICE_LIST * classify_piece(const GenericVector< SEAM *> &seams, int16_t start, int16_t end, const char *description, TWERD *word, BlamerBundle *blamer_bundle)
Definition: pieces.cpp:55
TWERD * chopped_word
Definition: pageres.h:215
void SegSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:43
WERD_CHOICE * best_choice
Definition: pageres.h:235
T get(ICOORD pos) const
Definition: matrix.h:228
WERD * word
Definition: pageres.h:189

◆ classify_blob()

BLOB_CHOICE_LIST * tesseract::Wordrec::classify_blob ( TBLOB *  blob,
const char *  string,
C_COL  color,
BlamerBundle *  blamer_bundle 
)

Definition at line 54 of file wordclass.cpp.

56  {
57 #ifndef GRAPHICS_DISABLED
58  if (wordrec_display_all_blobs)
59  display_blob(blob, color);
60 #endif
61  // TODO(rays) collapse with call_matcher and move all to wordrec.cpp.
62  BLOB_CHOICE_LIST* choices = call_matcher(blob);
63  // If a blob with the same bounding box as one of the truth character
64  // bounding boxes is not classified as the corresponding truth character
65  // blame character classifier for incorrect answer.
66  if (blamer_bundle != nullptr) {
67  blamer_bundle->BlameClassifier(getDict().getUnicharset(),
68  blob->bounding_box(),
69  *choices,
70  wordrec_debug_blamer);
71  }
72  #ifndef GRAPHICS_DISABLED
73  if (classify_debug_level && string)
74  print_ratings_list(string, choices, getDict().getUnicharset());
75 
76  if (wordrec_blob_pause)
77  window_wait(blob_window);
78 #endif
79 
80  return choices;
81 }
ScrollView * blob_window
Definition: render.cpp:40
void BlameClassifier(const UNICHARSET &unicharset, const TBOX &blob_box, const BLOB_CHOICE_LIST &choices, bool debug)
Definition: blamer.cpp:262
bool wordrec_display_all_blobs
Definition: render.cpp:46
void display_blob(TBLOB *blob, C_COL color)
Definition: render.cpp:61
TBOX bounding_box() const
Definition: blobs.cpp:478
bool wordrec_blob_pause
Definition: render.cpp:50
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:836
virtual Dict & getDict()
Definition: classify.h:107
char window_wait(ScrollView *win)
Definition: callcpp.cpp:104
bool wordrec_debug_blamer
Definition: wordrec.h:236
BLOB_CHOICE_LIST * call_matcher(TBLOB *blob)
Definition: tface.cpp:141

◆ classify_piece()

BLOB_CHOICE_LIST * tesseract::Wordrec::classify_piece ( const GenericVector< SEAM *> &  seams,
int16_t  start,
int16_t  end,
const char *  description,
TWERD *  word,
BlamerBundle *  blamer_bundle 
)
virtual

Definition at line 55 of file pieces.cpp.

60  {
61  if (end > start) SEAM::JoinPieces(seams, word->blobs, start, end);
62  BLOB_CHOICE_LIST *choices = classify_blob(word->blobs[start], description,
63  White, blamer_bundle);
64  // Set the matrix_cell_ entries in all the BLOB_CHOICES.
65  BLOB_CHOICE_IT bc_it(choices);
66  for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
67  bc_it.data()->set_matrix_cell(start, end);
68  }
69 
70  if (end > start) SEAM::BreakPieces(seams, word->blobs, start, end);
71 
72  return (choices);
73 }
Definition: callcpp.h:31
static void JoinPieces(const GenericVector< SEAM *> &seams, const GenericVector< TBLOB *> &blobs, int first, int last)
Definition: seam.cpp:216
static void BreakPieces(const GenericVector< SEAM *> &seams, const GenericVector< TBLOB *> &blobs, int first, int last)
Definition: seam.cpp:194
BLOB_CHOICE_LIST * classify_blob(TBLOB *blob, const char *string, C_COL color, BlamerBundle *blamer_bundle)
Definition: wordclass.cpp:54
GenericVector< TBLOB * > blobs
Definition: blobs.h:443

◆ combine_seam()

void tesseract::Wordrec::combine_seam ( const SeamPile &  seam_pile,
const SEAM *  seam,
SeamQueue *  seam_queue 
)

Definition at line 205 of file findseam.cpp.

206  {
207  for (int x = 0; x < seam_pile.size(); ++x) {
208  const SEAM *this_one = seam_pile.get(x).data();
209  if (seam->CombineableWith(*this_one, SPLIT_CLOSENESS, chop_ok_split)) {
210  SEAM *new_one = new SEAM(*seam);
211  new_one->CombineWith(*this_one);
212  if (chop_debug > 1) new_one->Print("Combo priority ");
213  add_seam_to_queue(new_one->priority(), new_one, seam_queue);
214  }
215  }
216 }
const Pair & get(int index) const
Definition: genericheap.h:87
Definition: seam.h:44
#define SPLIT_CLOSENESS
Definition: findseam.cpp:53
void Print(const char *label) const
Definition: seam.cpp:160
float priority() const
Definition: seam.h:65
bool CombineableWith(const SEAM &other, int max_x_dist, float max_total_priority) const
Definition: seam.cpp:46
double chop_ok_split
Definition: wordrec.h:225
void add_seam_to_queue(float new_priority, SEAM *new_seam, SeamQueue *seams)
Definition: findseam.cpp:73
void CombineWith(const SEAM &other)
Definition: seam.cpp:60

◆ dict_word()

int tesseract::Wordrec::dict_word ( const WERD_CHOICE &  word)

Definition at line 129 of file tface.cpp.

129  {
130  return getDict().valid_word(word);
131 }
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
Definition: dict.cpp:753
virtual Dict & getDict()
Definition: classify.h:107

◆ DoSegSearch()

void tesseract::Wordrec::DoSegSearch ( WERD_RES *  word_res)

Definition at line 37 of file segsearch.cpp.

37  {
38  BestChoiceBundle best_choice_bundle(word_res->ratings->dimension());
39  // Run Segmentation Search.
40  SegSearch(word_res, &best_choice_bundle, nullptr);
41 }
int dimension() const
Definition: matrix.h:533
MATRIX * ratings
Definition: pageres.h:231
void SegSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:43

◆ end_recog()

int tesseract::Wordrec::end_recog ( )

Definition at line 62 of file tface.cpp.

62  {
63  program_editdown (0);
64 
65  return (0);
66 }
void program_editdown(int32_t elasped_time)
Definition: tface.cpp:75

◆ fill_filtered_fragment_list()

void tesseract::Wordrec::fill_filtered_fragment_list ( BLOB_CHOICE_LIST *  choices,
int  fragment_pos,
int  num_frag_parts,
BLOB_CHOICE_LIST *  filtered_choices 
)

Definition at line 104 of file pieces.cpp.

107  {
108  BLOB_CHOICE_IT filtered_choices_it(filtered_choices);
109  BLOB_CHOICE_IT choices_it(choices);
110 
111  for (choices_it.mark_cycle_pt(); !choices_it.cycled_list();
112  choices_it.forward()) {
113  UNICHAR_ID choice_unichar_id = choices_it.data()->unichar_id();
114  const CHAR_FRAGMENT *frag = unicharset.get_fragment(choice_unichar_id);
115 
116  if (frag != nullptr && frag->get_pos() == fragment_pos &&
117  frag->get_total() == num_frag_parts) {
118  // Recover the unichar_id of the unichar that this fragment is
119  // a part of
120  BLOB_CHOICE *b = new BLOB_CHOICE(*choices_it.data());
121  int original_unichar = unicharset.unichar_to_id(frag->get_unichar());
122  b->set_unichar_id(original_unichar);
123  filtered_choices_it.add_to_end(b);
124  }
125  }
126 
127  filtered_choices->sort(SortByUnicharID<BLOB_CHOICE>);
128 }
int UNICHAR_ID
Definition: unichar.h:35
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:209
int get_total() const
Definition: unicharset.h:73
const char * get_unichar() const
Definition: unicharset.h:71
UNICHARSET unicharset
Definition: ccutil.h:68
void set_unichar_id(UNICHAR_ID newunichar_id)
Definition: ratngs.h:145
int get_pos() const
Definition: unicharset.h:72
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:729

◆ FillLattice()

void tesseract::Wordrec::FillLattice ( const MATRIX &  ratings,
const WERD_CHOICE_LIST &  best_choices,
const UNICHARSET &  unicharset,
BlamerBundle *  blamer_bundle 
)

◆ get_fragment_lists()

void tesseract::Wordrec::get_fragment_lists ( int16_t  current_frag,
int16_t  current_row,
int16_t  start,
int16_t  num_frag_parts,
int16_t  num_blobs,
MATRIX *  ratings,
BLOB_CHOICE_LIST *  choice_lists 
)

Definition at line 280 of file pieces.cpp.

283  {
284  if (current_frag == num_frag_parts) {
285  merge_and_put_fragment_lists(start, current_row - 1, num_frag_parts,
286  choice_lists, ratings);
287  return;
288  }
289 
290  for (int16_t x = current_row; x < num_blobs; x++) {
291  BLOB_CHOICE_LIST *choices = ratings->get(current_row, x);
292  if (choices == nullptr)
293  continue;
294 
295  fill_filtered_fragment_list(choices, current_frag, num_frag_parts,
296  &choice_lists[current_frag]);
297  if (!choice_lists[current_frag].empty()) {
298  get_fragment_lists(current_frag + 1, x + 1, start, num_frag_parts,
299  num_blobs, ratings, choice_lists);
300  choice_lists[current_frag].clear();
301  }
302  }
303 }
void merge_and_put_fragment_lists(int16_t row, int16_t column, int16_t num_frag_parts, BLOB_CHOICE_LIST *choice_lists, MATRIX *ratings)
Definition: pieces.cpp:137
void get_fragment_lists(int16_t current_frag, int16_t current_row, int16_t start, int16_t num_frag_parts, int16_t num_blobs, MATRIX *ratings, BLOB_CHOICE_LIST *choice_lists)
Definition: pieces.cpp:280
void fill_filtered_fragment_list(BLOB_CHOICE_LIST *choices, int fragment_pos, int num_frag_parts, BLOB_CHOICE_LIST *filtered_choices)
Definition: pieces.cpp:104
T get(ICOORD pos) const
Definition: matrix.h:228

◆ grade_sharpness()

PRIORITY tesseract::Wordrec::grade_sharpness ( SPLIT *  split)

Definition at line 74 of file gradechop.cpp.

74  {
75  PRIORITY grade;
76 
77  grade = point_priority (split->point1) + point_priority (split->point2);
78 
79  if (grade < -360.0)
80  grade = 0;
81  else
82  grade += 360.0;
83 
84  grade *= chop_sharpness_knob; /* Values 0 to -360 */
85 
86  return (grade);
87 }
EDGEPT * point2
Definition: split.h:104
double chop_sharpness_knob
Definition: wordrec.h:223
PRIORITY point_priority(EDGEPT *point)
Definition: chop.cpp:53
EDGEPT * point1
Definition: split.h:103
float PRIORITY
Definition: seam.h:42

◆ grade_split_length()

PRIORITY tesseract::Wordrec::grade_split_length ( SPLIT *  split)

Definition at line 51 of file gradechop.cpp.

51  {
52  PRIORITY grade;
53  float split_length;
54 
55  split_length =
56  split->point1->WeightedDistance(*split->point2, chop_x_y_weight);
57 
58  if (split_length <= 0)
59  grade = 0;
60  else
61  grade = sqrt (split_length) * chop_split_dist_knob;
62 
63  return (std::max(0.0f, grade));
64 }
EDGEPT * point2
Definition: split.h:104
double chop_split_dist_knob
Definition: wordrec.h:218
EDGEPT * point1
Definition: split.h:103
float PRIORITY
Definition: seam.h:42
int WeightedDistance(const EDGEPT &other, int x_factor) const
Definition: blobs.h:106

◆ improve_by_chopping()

void tesseract::Wordrec::improve_by_chopping ( float  rating_cert_scale,
WERD_RES *  word,
BestChoiceBundle *  best_choice_bundle,
BlamerBundle *  blamer_bundle,
LMPainPoints *  pain_points,
GenericVector< SegSearchPending > *  pending 
)

Definition at line 462 of file chopper.cpp.

467  {
468  int blob_number;
469  do { // improvement loop.
470  // Make a simple vector of BLOB_CHOICEs to make it easy to pick which
471  // one to chop.
472  GenericVector<BLOB_CHOICE*> blob_choices;
473  int num_blobs = word->ratings->dimension();
474  for (int i = 0; i < num_blobs; ++i) {
475  BLOB_CHOICE_LIST* choices = word->ratings->get(i, i);
476  if (choices == nullptr || choices->empty()) {
477  blob_choices.push_back(nullptr);
478  } else {
479  BLOB_CHOICE_IT bc_it(choices);
480  blob_choices.push_back(bc_it.data());
481  }
482  }
483  SEAM* seam = improve_one_blob(blob_choices, &best_choice_bundle->fixpt,
484  false, false, word, &blob_number);
485  if (seam == nullptr) break;
486  // A chop has been made. We have to correct all the data structures to
487  // take into account the extra bottom-level blob.
488  // Put the seam into the seam_array and correct everything else on the
489  // word: ratings matrix (including matrix location in the BLOB_CHOICES),
490  // states in WERD_CHOICEs, and blob widths.
491  word->InsertSeam(blob_number, seam);
492  // Insert a new entry in the beam array.
493  best_choice_bundle->beam.insert(new LanguageModelState, blob_number);
494  // Fixpts are outdated, but will get recalculated.
495  best_choice_bundle->fixpt.clear();
496  // Remap existing pain points.
497  pain_points->RemapForSplit(blob_number);
498  // Insert a new pending at the chop point.
499  pending->insert(SegSearchPending(), blob_number);
500 
501  // Classify the two newly created blobs using ProcessSegSearchPainPoint,
502  // as that updates the pending correctly and adds new pain points.
503  MATRIX_COORD pain_point(blob_number, blob_number);
504  ProcessSegSearchPainPoint(0.0f, pain_point, "Chop1", pending, word,
505  pain_points, blamer_bundle);
506  pain_point.col = blob_number + 1;
507  pain_point.row = blob_number + 1;
508  ProcessSegSearchPainPoint(0.0f, pain_point, "Chop2", pending, word,
509  pain_points, blamer_bundle);
510  if (language_model_->language_model_ngram_on) {
511  // N-gram evaluation depends on the number of blobs in a chunk, so we
512  // have to re-evaluate everything in the word.
513  ResetNGramSearch(word, best_choice_bundle, pending);
514  blob_number = 0;
515  }
516  // Run language model incrementally. (Except with the n-gram model on.)
517  UpdateSegSearchNodes(rating_cert_scale, blob_number, pending,
518  word, pain_points, best_choice_bundle, blamer_bundle);
519  } while (!language_model_->AcceptableChoiceFound() &&
520  word->ratings->dimension() < kMaxNumChunks);
521 
522  // If after running only the chopper best_choice is incorrect and no blame
523  // has been yet set, blame the classifier if best_choice is classifier's
524  // top choice and is a dictionary word (i.e. language model could not have
525  // helped). Otherwise blame the tradeoff between the classifier and
526  // the old language model (permuters).
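527  if (word->blamer_bundle != nullptr &&
528  word->blamer_bundle->incorrect_result_reason() == IRR_CORRECT &&
529  !word->blamer_bundle->ChoiceIsCorrect(word->best_choice)) {
530  bool valid_permuter = word->best_choice != nullptr &&
531  Dict::valid_word_permuter(word->best_choice->permuter(), false);
532  word->blamer_bundle->BlameClassifierOrLangModel(word,
533  getDict().getUnicharset(),
534  valid_permuter,
535  wordrec_debug_blamer);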
536  }
537 }
void ProcessSegSearchPainPoint(float pain_point_priority, const MATRIX_COORD &pain_point, const char *pain_point_type, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:249
uint8_t permuter() const
Definition: ratngs.h:346
Definition: seam.h:44
void InsertSeam(int blob_number, SEAM *seam)
Definition: pageres.cpp:424
SEAM * improve_one_blob(const GenericVector< BLOB_CHOICE *> &blob_choices, DANGERR *fixpt, bool split_next_to_fragment, bool italic_blob, WERD_RES *word, int *blob_number)
Definition: chopper.cpp:335
void ResetNGramSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, GenericVector< SegSearchPending > *pending)
Definition: segsearch.cpp:312
bool ChoiceIsCorrect(const WERD_CHOICE *word_choice) const
Definition: blamer.cpp:116
std::unique_ptr< LanguageModel > language_model_
Definition: wordrec.h:476
void BlameClassifierOrLangModel(const WERD_RES *word, const UNICHARSET &unicharset, bool valid_permuter, bool debug)
Definition: blamer.cpp:374
void insert(const T &t, int index)
IncorrectResultReason incorrect_result_reason() const
Definition: blamer.h:118
int dimension() const
Definition: matrix.h:533
void UpdateSegSearchNodes(float rating_cert_scale, int starting_col, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:181
int push_back(T object)
MATRIX * ratings
Definition: pageres.h:231
BlamerBundle * blamer_bundle
Definition: pageres.h:246
virtual Dict & getDict()
Definition: classify.h:107
bool wordrec_debug_blamer
Definition: wordrec.h:236
static bool valid_word_permuter(uint8_t perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
Definition: dict.h:459
WERD_CHOICE * best_choice
Definition: pageres.h:235
T get(ICOORD pos) const
Definition: matrix.h:228

◆ improve_one_blob()

SEAM * tesseract::Wordrec::improve_one_blob ( const GenericVector< BLOB_CHOICE *> &  blob_choices,
DANGERR fixpt,
bool  split_next_to_fragment,
bool  italic_blob,
WERD_RES word,
int *  blob_number 
)

Definition at line 335 of file chopper.cpp.

340  {
341  float rating_ceiling = FLT_MAX;
342  SEAM *seam = nullptr;
343  do {
344  *blob_number = select_blob_to_split_from_fixpt(fixpt);
345  if (chop_debug) tprintf("blob_number from fixpt = %d\n", *blob_number);
346  bool split_point_from_dict = (*blob_number != -1);
347  if (split_point_from_dict) {
348  fixpt->clear();
349  } else {
350  *blob_number = select_blob_to_split(blob_choices, rating_ceiling,
351  split_next_to_fragment);
352  }
353  if (chop_debug) tprintf("blob_number = %d\n", *blob_number);
354  if (*blob_number == -1)
355  return nullptr;
356 
357  // TODO(rays) it may eventually help to allow italic_blob to be true.
358  seam = chop_numbered_blob(word->chopped_word, *blob_number, italic_blob,
359  word->seam_array);
360  if (seam != nullptr)
361  return seam; // Success!
362  if (blob_choices[*blob_number] == nullptr)
363  return nullptr;
364  if (!split_point_from_dict) {
365  // We chopped the worst rated blob, try something else next time.
366  rating_ceiling = blob_choices[*blob_number]->rating();
367  }
368  } while (true);
369  return seam;
370 }
Definition: seam.h:44
int select_blob_to_split_from_fixpt(DANGERR *fixpt)
Definition: chopper.cpp:634
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
GenericVector< SEAM * > seam_array
Definition: pageres.h:217
int select_blob_to_split(const GenericVector< BLOB_CHOICE *> &blob_choices, float rating_ceiling, bool split_next_to_fragment)
Definition: chopper.cpp:546
TWERD * chopped_word
Definition: pageres.h:215
SEAM * chop_numbered_blob(TWERD *word, int32_t blob_number, bool italic_blob, const GenericVector< SEAM *> &seams)
Definition: chopper.cpp:229

◆ InitBlamerForSegSearch()

void tesseract::Wordrec::InitBlamerForSegSearch ( WERD_RES word_res,
LMPainPoints pain_points,
BlamerBundle blamer_bundle,
STRING blamer_debug 
)
protected

Definition at line 329 of file segsearch.cpp.

332  {
333  pain_points->Clear(); // Clear pain points heap.
335  pain_points, &LMPainPoints::GenerateForBlamer,
336  static_cast<double>(segsearch_max_char_wh_ratio), word_res);
337  blamer_bundle->InitForSegSearch(word_res->best_choice, word_res->ratings,
338  getDict().WildcardID(), wordrec_debug_blamer,
339  blamer_debug, pp_cb);
340  delete pp_cb;
341 }
double segsearch_max_char_wh_ratio
Definition: wordrec.h:244
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
void InitForSegSearch(const WERD_CHOICE *best_choice, MATRIX *ratings, UNICHAR_ID wildcard_id, bool debug, STRING *debug_str, TessResultCallback2< bool, int, int > *pp_cb)
Definition: blamer.cpp:478
MATRIX * ratings
Definition: pageres.h:231
virtual Dict & getDict()
Definition: classify.h:107
bool wordrec_debug_blamer
Definition: wordrec.h:236
WERD_CHOICE * best_choice
Definition: pageres.h:235
bool GenerateForBlamer(double max_char_wh_ratio, WERD_RES *word_res, int col, int row)

◆ InitialSegSearch()

void tesseract::Wordrec::InitialSegSearch ( WERD_RES word_res,
LMPainPoints pain_points,
GenericVector< SegSearchPending > *  pending,
BestChoiceBundle best_choice_bundle,
BlamerBundle blamer_bundle 
)

Definition at line 137 of file segsearch.cpp.

140  {
141  if (segsearch_debug_level > 0) {
142  tprintf("Starting SegSearch on ratings matrix%s:\n",
143  wordrec_enable_assoc ? " (with assoc)" : "");
144  word_res->ratings->print(getDict().getUnicharset());
145  }
146 
147  pain_points->GenerateInitial(word_res);
148 
149  // Compute scaling factor that will help us recover blob outline length
150  // from classifier rating and certainty for the blob.
151  float rating_cert_scale = -1.0 * getDict().certainty_scale / rating_scale;
152 
155  segsearch_max_char_wh_ratio, rating_cert_scale);
156 
157  // Initialize blamer-related information: map character boxes recorded in
158  // blamer_bundle->norm_truth_word to the corresponding i,j indices in the
159  // ratings matrix. We expect this step to succeed, since when running the
160  // chopper we checked that the correct chops are present.
161  if (blamer_bundle != nullptr) {
162  blamer_bundle->SetupCorrectSegmentation(word_res->chopped_word,
164  }
165 
166  // pending[col] tells whether there is update work to do to combine
167  // best_choice_bundle->beam[col - 1] with some BLOB_CHOICEs in matrix[col, *].
168  // As the language model state is updated, pending entries are modified to
169  // minimize duplication of work. It is important that during the update the
170  // children are considered in the non-decreasing order of their column, since
171  // this guarantees that all the parents would be up to date before an update
172  // of a child is done.
173  pending->init_to_size(word_res->ratings->dimension(), SegSearchPending());
174 
175  // Search the ratings matrix for the initial best path.
176  (*pending)[0].SetColumnClassified();
177  UpdateSegSearchNodes(rating_cert_scale, 0, pending, word_res,
178  pain_points, best_choice_bundle, blamer_bundle);
179 }
void print(const UNICHARSET &unicharset) const
Definition: matrix.cpp:117
double segsearch_max_char_wh_ratio
Definition: wordrec.h:244
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:481
double certainty_scale
Definition: dict.h:611
std::unique_ptr< LanguageModel > language_model_
Definition: wordrec.h:476
void init_to_size(int size, const T &t)
int dimension() const
Definition: matrix.h:533
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
void UpdateSegSearchNodes(float rating_cert_scale, int starting_col, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:181
void SetupCorrectSegmentation(const TWERD *word, bool debug)
Definition: blamer.cpp:412
MATRIX * ratings
Definition: pageres.h:231
bool assume_fixed_pitch_char_segment
Definition: wordrec.h:230
int segsearch_debug_level
Definition: wordrec.h:238
bool wordrec_enable_assoc
Definition: wordrec.h:199
virtual Dict & getDict()
Definition: classify.h:107
bool wordrec_debug_blamer
Definition: wordrec.h:236
TWERD * chopped_word
Definition: pageres.h:215

◆ is_inside_angle()

bool tesseract::Wordrec::is_inside_angle ( EDGEPT pt)

Definition at line 77 of file chop.cpp.

77  {
78  return angle_change(pt->prev, pt, pt->next) < chop_inside_angle;
79 }
EDGEPT * prev
Definition: blobs.h:177
EDGEPT * next
Definition: blobs.h:176
int angle_change(EDGEPT *point1, EDGEPT *point2, EDGEPT *point3)
Definition: chop.cpp:87

◆ merge_and_put_fragment_lists()

void tesseract::Wordrec::merge_and_put_fragment_lists ( int16_t  row,
int16_t  column,
int16_t  num_frag_parts,
BLOB_CHOICE_LIST *  choice_lists,
MATRIX ratings 
)

Definition at line 137 of file pieces.cpp.

140  {
141  BLOB_CHOICE_IT *choice_lists_it = new BLOB_CHOICE_IT[num_frag_parts];
142 
143  for (int i = 0; i < num_frag_parts; i++) {
144  choice_lists_it[i].set_to_list(&choice_lists[i]);
145  choice_lists_it[i].mark_cycle_pt();
146  }
147 
148  BLOB_CHOICE_LIST *merged_choice = ratings->get(row, column);
149  if (merged_choice == nullptr)
150  merged_choice = new BLOB_CHOICE_LIST;
151 
152  bool end_of_list = false;
153  BLOB_CHOICE_IT merged_choice_it(merged_choice);
154  while (!end_of_list) {
155  // Find the maximum unichar_id among the entries that the iterators
156  // are currently pointing at.
157  UNICHAR_ID max_unichar_id = choice_lists_it[0].data()->unichar_id();
158  for (int i = 0; i < num_frag_parts; i++) {
159  UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id();
160  if (max_unichar_id < unichar_id) {
161  max_unichar_id = unichar_id;
162  }
163  }
164 
165  // Move each iterator forward until it reaches an entry whose
166  // unichar_id is greater than or equal to max_unichar_id.
167  for (int i = 0; i < num_frag_parts; i++) {
168  UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id();
169  while (!choice_lists_it[i].cycled_list() &&
170  unichar_id < max_unichar_id) {
171  choice_lists_it[i].forward();
172  unichar_id = choice_lists_it[i].data()->unichar_id();
173  }
174  if (choice_lists_it[i].cycled_list()) {
175  end_of_list = true;
176  break;
177  }
178  }
179 
180  if (end_of_list)
181  break;
182 
183  // Checks if the fragments are parts of the same character
184  UNICHAR_ID first_unichar_id = choice_lists_it[0].data()->unichar_id();
185  bool same_unichar = true;
186  for (int i = 1; i < num_frag_parts; i++) {
187  UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id();
188  if (unichar_id != first_unichar_id) {
189  same_unichar = false;
190  break;
191  }
192  }
193 
194  if (same_unichar) {
195  // Add the merged character to the result
196  UNICHAR_ID merged_unichar_id = first_unichar_id;
197  GenericVector<ScoredFont> merged_fonts =
198  choice_lists_it[0].data()->fonts();
199  float merged_min_xheight = choice_lists_it[0].data()->min_xheight();
200  float merged_max_xheight = choice_lists_it[0].data()->max_xheight();
201  float positive_yshift = 0, negative_yshift = 0;
202  int merged_script_id = choice_lists_it[0].data()->script_id();
203  BlobChoiceClassifier classifier = choice_lists_it[0].data()->classifier();
204 
205  float merged_rating = 0, merged_certainty = 0;
206  for (int i = 0; i < num_frag_parts; i++) {
207  float rating = choice_lists_it[i].data()->rating();
208  float certainty = choice_lists_it[i].data()->certainty();
209 
210  if (i == 0 || certainty < merged_certainty)
211  merged_certainty = certainty;
212  merged_rating += rating;
213 
214  choice_lists_it[i].forward();
215  if (choice_lists_it[i].cycled_list())
216  end_of_list = true;
217  IntersectRange(choice_lists_it[i].data()->min_xheight(),
218  choice_lists_it[i].data()->max_xheight(),
219  &merged_min_xheight, &merged_max_xheight);
220  float yshift = choice_lists_it[i].data()->yshift();
221  if (yshift > positive_yshift) positive_yshift = yshift;
222  if (yshift < negative_yshift) negative_yshift = yshift;
223  // Use the min font rating over the parts.
224  // TODO(rays) font lists are unsorted. Need to be faster?
225  const GenericVector<ScoredFont>& frag_fonts =
226  choice_lists_it[i].data()->fonts();
227  for (int f = 0; f < frag_fonts.size(); ++f) {
228  int merged_f = 0;
229  for (merged_f = 0; merged_f < merged_fonts.size() &&
230  merged_fonts[merged_f].fontinfo_id != frag_fonts[f].fontinfo_id;
231  ++merged_f) {}
232  if (merged_f == merged_fonts.size()) {
233  merged_fonts.push_back(frag_fonts[f]);
234  } else if (merged_fonts[merged_f].score > frag_fonts[f].score) {
235  merged_fonts[merged_f].score = frag_fonts[f].score;
236  }
237  }
238  }
239 
240  float merged_yshift = positive_yshift != 0
241  ? (negative_yshift != 0 ? 0 : positive_yshift)
242  : negative_yshift;
243  BLOB_CHOICE* choice = new BLOB_CHOICE(merged_unichar_id,
244  merged_rating,
245  merged_certainty,
246  merged_script_id,
247  merged_min_xheight,
248  merged_max_xheight,
249  merged_yshift,
250  classifier);
251  choice->set_fonts(merged_fonts);
252  merged_choice_it.add_to_end(choice);
253  }
254  }
255 
257  print_ratings_list("Merged Fragments", merged_choice,
258  unicharset);
259 
260  if (merged_choice->empty())
261  delete merged_choice;
262  else
263  ratings->put(row, column, merged_choice);
264 
265  delete [] choice_lists_it;
266 }
int UNICHAR_ID
Definition: unichar.h:35
int size() const
Definition: genericvector.h:71
void set_fonts(const GenericVector< tesseract::ScoredFont > &fonts)
Definition: ratngs.h:95
UNICHARSET unicharset
Definition: ccutil.h:68
void put(ICOORD pos, const T &thing)
Definition: matrix.h:220
int push_back(T object)
void IntersectRange(const T &lower1, const T &upper1, T *lower2, T *upper2)
Definition: helpers.h:142
BlobChoiceClassifier
Definition: ratngs.h:41
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:836
T get(ICOORD pos) const
Definition: matrix.h:228

◆ merge_fragments()

void tesseract::Wordrec::merge_fragments ( MATRIX ratings,
int16_t  num_blobs 
)

Definition at line 312 of file pieces.cpp.

312  {
313  BLOB_CHOICE_LIST choice_lists[CHAR_FRAGMENT::kMaxChunks];
314  for (int16_t start = 0; start < num_blobs; start++) {
315  for (int frag_parts = 2; frag_parts <= CHAR_FRAGMENT::kMaxChunks;
316  frag_parts++) {
317  get_fragment_lists(0, start, start, frag_parts, num_blobs,
318  ratings, choice_lists);
319  }
320  }
321 
322  // Delete fragments from the rating matrix
323  for (int16_t x = 0; x < num_blobs; x++) {
324  for (int16_t y = x; y < num_blobs; y++) {
325  BLOB_CHOICE_LIST *choices = ratings->get(x, y);
326  if (choices != nullptr) {
327  BLOB_CHOICE_IT choices_it(choices);
328  for (choices_it.mark_cycle_pt(); !choices_it.cycled_list();
329  choices_it.forward()) {
330  UNICHAR_ID choice_unichar_id = choices_it.data()->unichar_id();
331  const CHAR_FRAGMENT *frag =
332  unicharset.get_fragment(choice_unichar_id);
333  if (frag != nullptr)
334  delete choices_it.extract();
335  }
336  }
337  }
338  }
339 }
int UNICHAR_ID
Definition: unichar.h:35
UNICHARSET unicharset
Definition: ccutil.h:68
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:729
void get_fragment_lists(int16_t current_frag, int16_t current_row, int16_t start, int16_t num_frag_parts, int16_t num_blobs, MATRIX *ratings, BLOB_CHOICE_LIST *choice_lists)
Definition: pieces.cpp:280
static const int kMaxChunks
Definition: unicharset.h:56
T get(ICOORD pos) const
Definition: matrix.h:228

◆ near_point()

bool tesseract::Wordrec::near_point ( EDGEPT point,
EDGEPT line_pt_0,
EDGEPT line_pt_1,
EDGEPT **  near_pt 
)

Definition at line 45 of file outlines.cpp.

47  {
48  TPOINT p;
49 
50  float slope;
51  float intercept;
52 
53  float x0 = line_pt_0->pos.x;
54  float x1 = line_pt_1->pos.x;
55  float y0 = line_pt_0->pos.y;
56  float y1 = line_pt_1->pos.y;
57 
58  if (x0 == x1) {
59  /* Handle vertical line */
60  p.x = (int16_t) x0;
61  p.y = point->pos.y;
62  }
63  else {
64  /* Slope and intercept */
65  slope = (y0 - y1) / (x0 - x1);
66  intercept = y1 - x1 * slope;
67 
68  /* Find perpendicular */
69  p.x = (int16_t) ((point->pos.x + (point->pos.y - intercept) * slope) /
70  (slope * slope + 1));
71  p.y = (int16_t) (slope * p.x + intercept);
72  }
73 
74  if (is_on_line (p, line_pt_0->pos, line_pt_1->pos) &&
75  (!same_point (p, line_pt_0->pos)) && (!same_point (p, line_pt_1->pos))) {
76  /* Intersection on line */
77  *near_pt = make_edgept(p.x, p.y, line_pt_1, line_pt_0);
78  return true;
79  } else { /* Intersection not on line */
80  *near_pt = closest(point, line_pt_0, line_pt_1);
81  return false;
82  }
83 }
EDGEPT * make_edgept(int x, int y, EDGEPT *next, EDGEPT *prev)
Definition: split.cpp:145
TPOINT pos
Definition: blobs.h:170
#define is_on_line(p, p0, p1)
Definition: outlines.h:121
#define closest(test_p, p1, p2)
Definition: outlines.h:72
int16_t x
Definition: blobs.h:78
Definition: blobs.h:57
#define same_point(p1, p2)
Definition: outlines.h:50
int16_t y
Definition: blobs.h:79

◆ new_max_point()

void tesseract::Wordrec::new_max_point ( EDGEPT local_max,
PointHeap points 
)

Definition at line 243 of file chop.cpp.

243  {
244  int16_t dir;
245 
246  dir = direction (local_max);
247 
248  if (dir > 0) {
249  add_point_to_list(points, local_max);
250  return;
251  }
252 
253  if (dir == 0 && point_priority (local_max) < 0) {
254  add_point_to_list(points, local_max);
255  return;
256  }
257 }
void add_point_to_list(PointHeap *point_heap, EDGEPT *point)
Definition: chop.cpp:63
int direction(EDGEPT *point)
Definition: vecfuncs.cpp:43
PRIORITY point_priority(EDGEPT *point)
Definition: chop.cpp:53

◆ new_min_point()

void tesseract::Wordrec::new_min_point ( EDGEPT local_min,
PointHeap points 
)

Definition at line 219 of file chop.cpp.

219  {
220  int16_t dir;
221 
222  dir = direction (local_min);
223 
224  if (dir < 0) {
225  add_point_to_list(points, local_min);
226  return;
227  }
228 
229  if (dir == 0 && point_priority (local_min) < 0) {
230  add_point_to_list(points, local_min);
231  return;
232  }
233 }
void add_point_to_list(PointHeap *point_heap, EDGEPT *point)
Definition: chop.cpp:63
int direction(EDGEPT *point)
Definition: vecfuncs.cpp:43
PRIORITY point_priority(EDGEPT *point)
Definition: chop.cpp:53

◆ pick_close_point()

EDGEPT * tesseract::Wordrec::pick_close_point ( EDGEPT critical_point,
EDGEPT vertical_point,
int *  best_dist 
)

Definition at line 122 of file chop.cpp.

124  {
125  EDGEPT *best_point = nullptr;
126  int this_distance;
127  int found_better;
128 
129  do {
130  found_better = FALSE;
131 
132  this_distance = edgept_dist (critical_point, vertical_point);
133  if (this_distance <= *best_dist) {
134 
135  if (!(same_point (critical_point->pos, vertical_point->pos) ||
136  same_point (critical_point->pos, vertical_point->next->pos) ||
137  (best_point && same_point (best_point->pos, vertical_point->pos)) ||
138  is_exterior_point (critical_point, vertical_point))) {
139  *best_dist = this_distance;
140  best_point = vertical_point;
142  found_better = TRUE;
143  }
144  }
145  vertical_point = vertical_point->next;
146  }
147  while (found_better == TRUE);
148 
149  return (best_point);
150 }
#define TRUE
Definition: capi.h:51
TPOINT pos
Definition: blobs.h:170
#define edgept_dist(p1, p2)
Definition: outlines.h:88
bool chop_vertical_creep
Definition: wordrec.h:210
#define FALSE
Definition: capi.h:52
Definition: blobs.h:83
#define same_point(p1, p2)
Definition: outlines.h:50
EDGEPT * next
Definition: blobs.h:176
#define is_exterior_point(edge, point)
Definition: outlines.h:98

◆ pick_good_seam()

SEAM * tesseract::Wordrec::pick_good_seam ( TBLOB blob)

Definition at line 224 of file findseam.cpp.

224  {
225  SeamPile seam_pile(chop_seam_pile_size);
226  EDGEPT *points[MAX_NUM_POINTS];
227  EDGEPT_CLIST new_points;
228  SEAM *seam = nullptr;
229  TESSLINE *outline;
230  int16_t num_points = 0;
231 
232 #ifndef GRAPHICS_DISABLED
233  if (chop_debug > 2)
234  wordrec_display_splits.set_value(true);
235 
236  draw_blob_edges(blob);
237 #endif
238 
239  PointHeap point_heap(MAX_NUM_POINTS);
240  for (outline = blob->outlines; outline; outline = outline->next)
241  prioritize_points(outline, &point_heap);
242 
243  while (!point_heap.empty() && num_points < MAX_NUM_POINTS) {
244  points[num_points++] = point_heap.PeekTop().data;
245  point_heap.Pop(nullptr);
246  }
247 
248  /* Initialize queue */
249  SeamQueue seam_queue(MAX_NUM_SEAMS);
250 
251  try_point_pairs(points, num_points, &seam_queue, &seam_pile, &seam, blob);
252  try_vertical_splits(points, num_points, &new_points,
253  &seam_queue, &seam_pile, &seam, blob);
254 
255  if (seam == nullptr) {
256  choose_best_seam(&seam_queue, nullptr, BAD_PRIORITY, &seam, blob, &seam_pile);
257  } else if (seam->priority() > chop_good_split) {
258  choose_best_seam(&seam_queue, nullptr, seam->priority(), &seam, blob,
259  &seam_pile);
260  }
261 
262  EDGEPT_C_IT it(&new_points);
263  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
264  EDGEPT *inserted_point = it.data();
265  if (seam == nullptr || !seam->UsesPoint(inserted_point)) {
266  for (outline = blob->outlines; outline; outline = outline->next) {
267  if (outline->loop == inserted_point) {
268  outline->loop = outline->loop->next;
269  }
270  }
271  remove_edgept(inserted_point);
272  }
273  }
274 
275  if (seam) {
276  if (seam->priority() > chop_ok_split) {
277  delete seam;
278  seam = nullptr;
279  }
280 #ifndef GRAPHICS_DISABLED
281  else if (wordrec_display_splits) {
282  seam->Mark(edge_window);
283  if (chop_debug > 2) {
286  }
287  }
288 #endif
289  }
290 
291  if (chop_debug)
292  wordrec_display_splits.set_value(false);
293 
294  return (seam);
295 }
TESSLINE * next
Definition: blobs.h:265
void Mark(ScrollView *window) const
Definition: seam.cpp:186
bool UsesPoint(const EDGEPT *point) const
Definition: seam.h:88
bool wordrec_display_splits
Definition: split.cpp:47
Definition: seam.h:44
#define MAX_NUM_POINTS
Definition: chop.h:39
void remove_edgept(EDGEPT *point)
Definition: split.cpp:206
#define edge_window_wait()
Definition: plotedges.h:61
#define BAD_PRIORITY
Definition: findseam.cpp:60
float priority() const
Definition: seam.h:65
EDGEPT * loop
Definition: blobs.h:264
Definition: blobs.h:83
void try_vertical_splits(EDGEPT *points[MAX_NUM_POINTS], int16_t num_points, EDGEPT_CLIST *new_points, SeamQueue *seam_queue, SeamPile *seam_pile, SEAM **seam, TBLOB *blob)
Definition: findseam.cpp:343
double chop_good_split
Definition: wordrec.h:226
#define update_edge_window()
Definition: plotedges.h:49
void prioritize_points(TESSLINE *outline, PointHeap *points)
Definition: chop.cpp:160
void choose_best_seam(SeamQueue *seam_queue, const SPLIT *split, PRIORITY priority, SEAM **seam_result, TBLOB *blob, SeamPile *seam_pile)
Definition: findseam.cpp:112
double chop_ok_split
Definition: wordrec.h:225
#define MAX_NUM_SEAMS
Definition: findseam.cpp:55
ScrollView * edge_window
Definition: plotedges.cpp:40
TESSLINE * outlines
Definition: blobs.h:384
EDGEPT * next
Definition: blobs.h:176
void try_point_pairs(EDGEPT *points[MAX_NUM_POINTS], int16_t num_points, SeamQueue *seam_queue, SeamPile *seam_pile, SEAM **seam, TBLOB *blob)
Definition: findseam.cpp:305
int chop_seam_pile_size
Definition: wordrec.h:214
void draw_blob_edges(TBLOB *blob)
Definition: plotedges.cpp:74

◆ point_priority()

PRIORITY tesseract::Wordrec::point_priority ( EDGEPT point)

Definition at line 53 of file chop.cpp.

53  {
54  return (PRIORITY)angle_change(point->prev, point, point->next);
55 }
EDGEPT * prev
Definition: blobs.h:177
float PRIORITY
Definition: seam.h:42
EDGEPT * next
Definition: blobs.h:176
int angle_change(EDGEPT *point1, EDGEPT *point2, EDGEPT *point3)
Definition: chop.cpp:87

◆ prioritize_points()

void tesseract::Wordrec::prioritize_points ( TESSLINE outline,
PointHeap points 
)

Definition at line 160 of file chop.cpp.

160  {
161  EDGEPT *this_point;
162  EDGEPT *local_min = nullptr;
163  EDGEPT *local_max = nullptr;
164 
165  this_point = outline->loop;
166  local_min = this_point;
167  local_max = this_point;
168  do {
169  if (this_point->vec.y < 0) {
170  /* Look for minima */
171  if (local_max != nullptr)
172  new_max_point(local_max, points);
173  else if (is_inside_angle (this_point))
174  add_point_to_list(points, this_point);
175  local_max = nullptr;
176  local_min = this_point->next;
177  }
178  else if (this_point->vec.y > 0) {
179  /* Look for maxima */
180  if (local_min != nullptr)
181  new_min_point(local_min, points);
182  else if (is_inside_angle (this_point))
183  add_point_to_list(points, this_point);
184  local_min = nullptr;
185  local_max = this_point->next;
186  }
187  else {
188  /* Flat area */
189  if (local_max != nullptr) {
190  if (local_max->prev->vec.y != 0) {
191  new_max_point(local_max, points);
192  }
193  local_max = this_point->next;
194  local_min = nullptr;
195  }
196  else {
197  if (local_min->prev->vec.y != 0) {
198  new_min_point(local_min, points);
199  }
200  local_min = this_point->next;
201  local_max = nullptr;
202  }
203  }
204 
205  /* Next point */
206  this_point = this_point->next;
207  }
208  while (this_point != outline->loop);
209 }
void add_point_to_list(PointHeap *point_heap, EDGEPT *point)
Definition: chop.cpp:63
EDGEPT * loop
Definition: blobs.h:264
VECTOR vec
Definition: blobs.h:171
Definition: blobs.h:83
EDGEPT * prev
Definition: blobs.h:177
void new_min_point(EDGEPT *local_min, PointHeap *points)
Definition: chop.cpp:219
int16_t y
Definition: blobs.h:79
EDGEPT * next
Definition: blobs.h:176
void new_max_point(EDGEPT *local_max, PointHeap *points)
Definition: chop.cpp:243
bool is_inside_angle(EDGEPT *pt)
Definition: chop.cpp:77

◆ ProcessSegSearchPainPoint()

void tesseract::Wordrec::ProcessSegSearchPainPoint ( float  pain_point_priority,
const MATRIX_COORD pain_point,
const char *  pain_point_type,
GenericVector< SegSearchPending > *  pending,
WERD_RES word_res,
LMPainPoints pain_points,
BlamerBundle blamer_bundle 
)
protected

Definition at line 249 of file segsearch.cpp.

253  {
254  if (segsearch_debug_level > 0) {
255  tprintf("Classifying pain point %s priority=%.4f, col=%d, row=%d\n",
256  pain_point_type, pain_point_priority,
257  pain_point.col, pain_point.row);
258  }
259  ASSERT_HOST(pain_points != nullptr);
260  MATRIX *ratings = word_res->ratings;
261  // Classify blob [pain_point.col pain_point.row]
262  if (!pain_point.Valid(*ratings)) {
263  ratings->IncreaseBandSize(pain_point.row + 1 - pain_point.col);
264  }
265  ASSERT_HOST(pain_point.Valid(*ratings));
266  BLOB_CHOICE_LIST *classified = classify_piece(word_res->seam_array,
267  pain_point.col, pain_point.row,
268  pain_point_type,
269  word_res->chopped_word,
270  blamer_bundle);
271  BLOB_CHOICE_LIST *lst = ratings->get(pain_point.col, pain_point.row);
272  if (lst == nullptr) {
273  ratings->put(pain_point.col, pain_point.row, classified);
274  } else {
275  // We cannot delete old BLOB_CHOICEs, since they might contain
276  // ViterbiStateEntries that are parents of other "active" entries.
277  // Thus if the matrix cell already contains classifications we add
278  // the new ones to the beginning of the list.
279  BLOB_CHOICE_IT it(lst);
280  it.add_list_before(classified);
281  delete classified; // safe to delete, since empty after add_list_before()
282  classified = nullptr;
283  }
284 
285  if (segsearch_debug_level > 0) {
286  print_ratings_list("Updated ratings matrix with a new entry:",
287  ratings->get(pain_point.col, pain_point.row),
288  getDict().getUnicharset());
289  ratings->print(getDict().getUnicharset());
290  }
291 
292  // Insert initial "pain points" to join the newly classified blob
293  // with its left and right neighbors.
294  if (classified != nullptr && !classified->empty()) {
295  if (pain_point.col > 0) {
296  pain_points->GeneratePainPoint(
297  pain_point.col - 1, pain_point.row, LM_PPTYPE_SHAPE, 0.0,
298  true, segsearch_max_char_wh_ratio, word_res);
299  }
300  if (pain_point.row + 1 < ratings->dimension()) {
301  pain_points->GeneratePainPoint(
302  pain_point.col, pain_point.row + 1, LM_PPTYPE_SHAPE, 0.0,
303  true, segsearch_max_char_wh_ratio, word_res);
304  }
305  }
306  (*pending)[pain_point.col].SetBlobClassified(pain_point.row);
307 }
void print(const UNICHARSET &unicharset) const
Definition: matrix.cpp:117
double segsearch_max_char_wh_ratio
Definition: wordrec.h:244
bool Valid(const MATRIX &m) const
Definition: matrix.h:615
void IncreaseBandSize(int bandwidth)
Definition: matrix.cpp:54
int dimension() const
Definition: matrix.h:533
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
void put(ICOORD pos, const T &thing)
Definition: matrix.h:220
GenericVector< SEAM * > seam_array
Definition: pageres.h:217
MATRIX * ratings
Definition: pageres.h:231
const UNICHARSET & getUnicharset() const
Definition: dict.h:98
int segsearch_debug_level
Definition: wordrec.h:238
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:836
virtual Dict & getDict()
Definition: classify.h:107
Definition: matrix.h:575
virtual BLOB_CHOICE_LIST * classify_piece(const GenericVector< SEAM *> &seams, int16_t start, int16_t end, const char *description, TWERD *word, BlamerBundle *blamer_bundle)
Definition: pieces.cpp:55
TWERD * chopped_word
Definition: pageres.h:215
T get(ICOORD pos) const
Definition: matrix.h:228
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ program_editdown()

void tesseract::Wordrec::program_editdown ( int32_t  elasped_time)

Definition at line 75 of file tface.cpp.

75  {
76 #ifndef DISABLED_LEGACY_ENGINE
78 #endif // ndef DISABLED_LEGACY_ENGINE
79  getDict().End();
80 }
void End()
Definition: dict.cpp:343
void EndAdaptiveClassifier()
Definition: adaptmatch.cpp:460
virtual Dict & getDict()
Definition: classify.h:107

◆ program_editup()

void tesseract::Wordrec::program_editup ( const char *  textbase,
TessdataManager init_classifier,
TessdataManager init_dict 
)

Definition at line 40 of file tface.cpp.

42  {
43  if (textbase != nullptr) imagefile = textbase;
44 #ifndef DISABLED_LEGACY_ENGINE
46  InitAdaptiveClassifier(init_classifier);
47  if (init_dict) {
49  getDict().Load(lang, init_dict);
50  getDict().FinishLoad();
51  }
53 #endif // ndef DISABLED_LEGACY_ENGINE
54 }
static DawgCache * GlobalDawgCache()
Definition: dict.cpp:193
void InitAdaptiveClassifier(TessdataManager *mgr)
Definition: adaptmatch.cpp:528
STRING lang
Definition: ccutil.h:66
void InitFeatureDefs(FEATURE_DEFS_STRUCT *featuredefs)
Definition: featdefs.cpp:112
void Load(const STRING &lang, TessdataManager *data_file)
Definition: dict.cpp:219
bool FinishLoad()
Definition: dict.cpp:323
STRING imagefile
Definition: ccutil.h:70
void SetupForLoad(DawgCache *dawg_cache)
Definition: dict.cpp:201
double chop_ok_split
Definition: wordrec.h:225
virtual Dict & getDict()
Definition: classify.h:107
FEATURE_DEFS_STRUCT feature_defs_
Definition: classify.h:548
PRIORITY pass2_ok_split
Definition: wordrec.h:477

◆ ResetNGramSearch()

void tesseract::Wordrec::ResetNGramSearch ( WERD_RES word_res,
BestChoiceBundle best_choice_bundle,
GenericVector< SegSearchPending > *  pending 
)
protected

Definition at line 312 of file segsearch.cpp.

314  {
315  // TODO(rays) More refactoring required here.
316  // Delete existing viterbi states.
317  for (int col = 0; col < best_choice_bundle->beam.size(); ++col) {
318  best_choice_bundle->beam[col]->Clear();
319  }
320  // Reset best_choice_bundle.
321  word_res->ClearWordChoices();
322  best_choice_bundle->best_vse = nullptr;
323  // Clear out all existing pendings and add a new one for the first column.
324  (*pending)[0].SetColumnClassified();
325  for (int i = 1; i < pending->size(); ++i)
326  (*pending)[i].Clear();
327 }
int size() const
Definition: genericvector.h:71
void ClearWordChoices()
Definition: pageres.cpp:1178

◆ SaveAltChoices()

void tesseract::Wordrec::SaveAltChoices ( const LIST best_choices,
WERD_RES word 
)

◆ SegSearch()

void tesseract::Wordrec::SegSearch ( WERD_RES * word_res,
BestChoiceBundle * best_choice_bundle,
BlamerBundle * blamer_bundle 
)

Definition at line 43 of file segsearch.cpp.

45  {
46  LMPainPoints pain_points(segsearch_max_pain_points,
50  // Compute scaling factor that will help us recover blob outline length
51  // from classifier rating and certainty for the blob.
52  float rating_cert_scale = -1.0 * getDict().certainty_scale / rating_scale;
54  InitialSegSearch(word_res, &pain_points, &pending, best_choice_bundle,
55  blamer_bundle);
56 
57  if (!SegSearchDone(0)) { // find a better choice
58  if (chop_enable && word_res->chopped_word != nullptr) {
59  improve_by_chopping(rating_cert_scale, word_res, best_choice_bundle,
60  blamer_bundle, &pain_points, &pending);
61  }
62  if (chop_debug) SEAM::PrintSeams("Final seam list:", word_res->seam_array);
63 
64  if (blamer_bundle != nullptr &&
65  !blamer_bundle->ChoiceIsCorrect(word_res->best_choice)) {
66  blamer_bundle->SetChopperBlame(word_res, wordrec_debug_blamer);
67  }
68  }
69  // Keep trying to find a better path by fixing the "pain points".
70 
71  MATRIX_COORD pain_point;
72  float pain_point_priority;
73  int num_futile_classifications = 0;
74  STRING blamer_debug;
75  while (wordrec_enable_assoc &&
76  (!SegSearchDone(num_futile_classifications) ||
77  (blamer_bundle != nullptr &&
78  blamer_bundle->GuidedSegsearchStillGoing()))) {
79  // Get the next valid "pain point".
80  bool found_nothing = true;
81  LMPainPointsType pp_type;
82  while ((pp_type = pain_points.Deque(&pain_point, &pain_point_priority)) !=
83  LM_PPTYPE_NUM) {
84  if (!pain_point.Valid(*word_res->ratings)) {
85  word_res->ratings->IncreaseBandSize(
86  pain_point.row - pain_point.col + 1);
87  }
88  if (pain_point.Valid(*word_res->ratings) &&
89  !word_res->ratings->Classified(pain_point.col, pain_point.row,
90  getDict().WildcardID())) {
91  found_nothing = false;
92  break;
93  }
94  }
95  if (found_nothing) {
96  if (segsearch_debug_level > 0) tprintf("Pain points queue is empty\n");
97  break;
98  }
99  ProcessSegSearchPainPoint(pain_point_priority, pain_point,
101  &pending, word_res, &pain_points, blamer_bundle);
102 
103  UpdateSegSearchNodes(rating_cert_scale, pain_point.col, &pending,
104  word_res, &pain_points, best_choice_bundle,
105  blamer_bundle);
106  if (!best_choice_bundle->updated) ++num_futile_classifications;
107 
108  if (segsearch_debug_level > 0) {
109  tprintf("num_futile_classifications %d\n", num_futile_classifications);
110  }
111 
112  best_choice_bundle->updated = false; // reset updated
113 
114  // See if it's time to terminate SegSearch or time for starting a guided
115  // search for the true path to find the blame for the incorrect best_choice.
116  if (SegSearchDone(num_futile_classifications) &&
117  blamer_bundle != nullptr &&
118  blamer_bundle->GuidedSegsearchNeeded(word_res->best_choice)) {
119  InitBlamerForSegSearch(word_res, &pain_points, blamer_bundle,
120  &blamer_debug);
121  }
122  } // end while loop exploring alternative paths
123  if (blamer_bundle != nullptr) {
124  blamer_bundle->FinishSegSearch(word_res->best_choice,
125  wordrec_debug_blamer, &blamer_debug);
126  }
127 
128  if (segsearch_debug_level > 0) {
129  tprintf("Done with SegSearch (AcceptableChoiceFound: %d)\n",
130  language_model_->AcceptableChoiceFound());
131  }
132 }
bool GuidedSegsearchStillGoing() const
Definition: blamer.cpp:506
double segsearch_max_char_wh_ratio
Definition: wordrec.h:244
void ProcessSegSearchPainPoint(float pain_point_priority, const MATRIX_COORD &pain_point, const char *pain_point_type, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:249
double certainty_scale
Definition: dict.h:611
int segsearch_max_pain_points
Definition: wordrec.h:240
void SetChopperBlame(const WERD_RES *word, bool debug)
Definition: blamer.cpp:315
bool Valid(const MATRIX &m) const
Definition: matrix.h:615
bool ChoiceIsCorrect(const WERD_CHOICE *word_choice) const
Definition: blamer.cpp:116
static const char * PainPointDescription(LMPainPointsType type)
bool GuidedSegsearchNeeded(const WERD_CHOICE *best_choice) const
Definition: blamer.cpp:466
void IncreaseBandSize(int bandwidth)
Definition: matrix.cpp:54
std::unique_ptr< LanguageModel > language_model_
Definition: wordrec.h:476
void InitBlamerForSegSearch(WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle, STRING *blamer_debug)
Definition: segsearch.cpp:329
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
void UpdateSegSearchNodes(float rating_cert_scale, int starting_col, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:181
GenericVector< SEAM * > seam_array
Definition: pageres.h:217
Definition: strngs.h:45
bool Classified(int col, int row, int wildcard_id) const
Definition: matrix.cpp:41
void InitialSegSearch(WERD_RES *word_res, LMPainPoints *pain_points, GenericVector< SegSearchPending > *pending, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:137
MATRIX * ratings
Definition: pageres.h:231
bool assume_fixed_pitch_char_segment
Definition: wordrec.h:230
bool SegSearchDone(int num_futile_classifications)
Definition: wordrec.h:491
int segsearch_debug_level
Definition: wordrec.h:238
bool wordrec_enable_assoc
Definition: wordrec.h:199
void improve_by_chopping(float rating_cert_scale, WERD_RES *word, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle, LMPainPoints *pain_points, GenericVector< SegSearchPending > *pending)
Definition: chopper.cpp:462
virtual Dict & getDict()
Definition: classify.h:107
static void PrintSeams(const char *label, const GenericVector< SEAM *> &seams)
Definition: seam.cpp:173
void FinishSegSearch(const WERD_CHOICE *best_choice, bool debug, STRING *debug_str)
Definition: blamer.cpp:511
bool wordrec_debug_blamer
Definition: wordrec.h:236
TWERD * chopped_word
Definition: pageres.h:215
WERD_CHOICE * best_choice
Definition: pageres.h:235

◆ SegSearchDone()

bool tesseract::Wordrec::SegSearchDone ( int  num_futile_classifications)
inline protected

Definition at line 491 of file wordrec.h.

491  {
492  return (language_model_->AcceptableChoiceFound() ||
493  num_futile_classifications >=
495  }
std::unique_ptr< LanguageModel > language_model_
Definition: wordrec.h:476
int segsearch_max_futile_classifications
Definition: wordrec.h:242

◆ select_blob_to_split()

int tesseract::Wordrec::select_blob_to_split ( const GenericVector< BLOB_CHOICE *> &  blob_choices,
float  rating_ceiling,
bool  split_next_to_fragment 
)

Definition at line 546 of file chopper.cpp.

548  {
549  BLOB_CHOICE *blob_choice;
550  int x;
551  float worst = -FLT_MAX;
552  int worst_index = -1;
553  float worst_near_fragment = -FLT_MAX;
554  int worst_index_near_fragment = -1;
555  const CHAR_FRAGMENT **fragments = nullptr;
556 
557  if (chop_debug) {
558  if (rating_ceiling < FLT_MAX)
559  tprintf("rating_ceiling = %8.4f\n", rating_ceiling);
560  else
561  tprintf("rating_ceiling = No Limit\n");
562  }
563 
564  if (split_next_to_fragment && blob_choices.size() > 0) {
565  fragments = new const CHAR_FRAGMENT *[blob_choices.length()];
566  if (blob_choices[0] != nullptr) {
567  fragments[0] = getDict().getUnicharset().get_fragment(
568  blob_choices[0]->unichar_id());
569  } else {
570  fragments[0] = nullptr;
571  }
572  }
573 
574  for (x = 0; x < blob_choices.size(); ++x) {
575  if (blob_choices[x] == nullptr) {
576  delete[] fragments;
577  return x;
578  } else {
579  blob_choice = blob_choices[x];
580  // Populate fragments for the following position.
581  if (split_next_to_fragment && x+1 < blob_choices.size()) {
582  if (blob_choices[x + 1] != nullptr) {
583  fragments[x + 1] = getDict().getUnicharset().get_fragment(
584  blob_choices[x + 1]->unichar_id());
585  } else {
586  fragments[x + 1] = nullptr;
587  }
588  }
589  if (blob_choice->rating() < rating_ceiling &&
590  blob_choice->certainty() < tessedit_certainty_threshold) {
591  // Update worst and worst_index.
592  if (blob_choice->rating() > worst) {
593  worst_index = x;
594  worst = blob_choice->rating();
595  }
596  if (split_next_to_fragment) {
597  // Update worst_near_fragment and worst_index_near_fragment.
598  bool expand_following_fragment =
599  (x + 1 < blob_choices.size() &&
600  fragments[x+1] != nullptr && !fragments[x+1]->is_beginning());
601  bool expand_preceding_fragment =
602  (x > 0 && fragments[x-1] != nullptr && !fragments[x-1]->is_ending());
603  if ((expand_following_fragment || expand_preceding_fragment) &&
604  blob_choice->rating() > worst_near_fragment) {
605  worst_index_near_fragment = x;
606  worst_near_fragment = blob_choice->rating();
607  if (chop_debug) {
608  tprintf("worst_index_near_fragment=%d"
609  " expand_following_fragment=%d"
610  " expand_preceding_fragment=%d\n",
611  worst_index_near_fragment,
612  expand_following_fragment,
613  expand_preceding_fragment);
614  }
615  }
616  }
617  }
618  }
619  }
620  delete[] fragments;
621  // TODO(daria): maybe a threshold of badness for
622  // worst_near_fragment would be useful.
623  return worst_index_near_fragment != -1 ?
624  worst_index_near_fragment : worst_index;
625 }
bool is_beginning() const
Definition: unicharset.h:106
float certainty() const
Definition: ratngs.h:83
int size() const
Definition: genericvector.h:71
int length() const
Definition: genericvector.h:85
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
double tessedit_certainty_threshold
Definition: wordrec.h:207
bool is_ending() const
Definition: unicharset.h:109
float rating() const
Definition: ratngs.h:80
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:729
const UNICHARSET & getUnicharset() const
Definition: dict.h:98
virtual Dict & getDict()
Definition: classify.h:107

◆ select_blob_to_split_from_fixpt()

int tesseract::Wordrec::select_blob_to_split_from_fixpt ( DANGERR * fixpt)

Definition at line 634 of file chopper.cpp.

634  {
635  if (!fixpt)
636  return -1;
637  for (int i = 0; i < fixpt->size(); i++) {
638  if ((*fixpt)[i].begin + 1 == (*fixpt)[i].end &&
639  (*fixpt)[i].dangerous &&
640  (*fixpt)[i].correct_is_ngram) {
641  return (*fixpt)[i].begin;
642  }
643  }
644  return -1;
645 }
int size() const
Definition: genericvector.h:71

◆ set_pass1()

void tesseract::Wordrec::set_pass1 ( )

Definition at line 89 of file tface.cpp.

89  {
90  chop_ok_split.set_value(70.0);
91  language_model_->getParamsModel().SetPass(ParamsModel::PTRAIN_PASS1);
92  SettupPass1();
93 }
std::unique_ptr< LanguageModel > language_model_
Definition: wordrec.h:476
double chop_ok_split
Definition: wordrec.h:225

◆ set_pass2()

void tesseract::Wordrec::set_pass2 ( )

Definition at line 101 of file tface.cpp.

101  {
102  chop_ok_split.set_value(pass2_ok_split);
103  language_model_->getParamsModel().SetPass(ParamsModel::PTRAIN_PASS2);
104  SettupPass2();
105 }
std::unique_ptr< LanguageModel > language_model_
Definition: wordrec.h:476
double chop_ok_split
Definition: wordrec.h:225
PRIORITY pass2_ok_split
Definition: wordrec.h:477

◆ try_point_pairs()

void tesseract::Wordrec::try_point_pairs ( EDGEPT * points[MAX_NUM_POINTS],
int16_t  num_points,
SeamQueue * seam_queue,
SeamPile * seam_pile,
SEAM **  seam,
TBLOB * blob 
)

Definition at line 305 of file findseam.cpp.

310  {
311  int16_t x;
312  int16_t y;
313  PRIORITY priority;
314 
315  for (x = 0; x < num_points; x++) {
316  for (y = x + 1; y < num_points; y++) {
317  if (points[y] &&
318  points[x]->WeightedDistance(*points[y], chop_x_y_weight) <
320  points[x] != points[y]->next && points[y] != points[x]->next &&
321  !is_exterior_point(points[x], points[y]) &&
322  !is_exterior_point(points[y], points[x])) {
323  SPLIT split(points[x], points[y]);
324  priority = partial_split_priority(&split);
325 
326  choose_best_seam(seam_queue, &split, priority, seam, blob, seam_pile);
327  }
328  }
329  }
330 }
Definition: split.h:37
#define partial_split_priority(split)
Definition: findseam.cpp:47
void choose_best_seam(SeamQueue *seam_queue, const SPLIT *split, PRIORITY priority, SEAM **seam_result, TBLOB *blob, SeamPile *seam_pile)
Definition: findseam.cpp:112
float PRIORITY
Definition: seam.h:42
#define is_exterior_point(edge, point)
Definition: outlines.h:98

◆ try_vertical_splits()

void tesseract::Wordrec::try_vertical_splits ( EDGEPT * points[MAX_NUM_POINTS],
int16_t  num_points,
EDGEPT_CLIST *  new_points,
SeamQueue * seam_queue,
SeamPile * seam_pile,
SEAM **  seam,
TBLOB * blob 
)

Definition at line 343 of file findseam.cpp.

349  {
350  EDGEPT *vertical_point = nullptr;
351  int16_t x;
352  PRIORITY priority;
353  TESSLINE *outline;
354 
355  for (x = 0; x < num_points; x++) {
356  vertical_point = nullptr;
357  for (outline = blob->outlines; outline; outline = outline->next) {
358  vertical_projection_point(points[x], outline->loop,
359  &vertical_point, new_points);
360  }
361 
362  if (vertical_point && points[x] != vertical_point->next &&
363  vertical_point != points[x]->next &&
364  points[x]->WeightedDistance(*vertical_point, chop_x_y_weight) <
366  SPLIT split(points[x], vertical_point);
367  priority = partial_split_priority(&split);
368  choose_best_seam(seam_queue, &split, priority, seam, blob, seam_pile);
369  }
370  }
371 }
TESSLINE * next
Definition: blobs.h:265
EDGEPT * loop
Definition: blobs.h:264
Definition: blobs.h:83
Definition: split.h:37
#define partial_split_priority(split)
Definition: findseam.cpp:47
void choose_best_seam(SeamQueue *seam_queue, const SPLIT *split, PRIORITY priority, SEAM **seam_result, TBLOB *blob, SeamPile *seam_pile)
Definition: findseam.cpp:112
float PRIORITY
Definition: seam.h:42
void vertical_projection_point(EDGEPT *split_point, EDGEPT *target_point, EDGEPT **best_point, EDGEPT_CLIST *new_points)
Definition: chop.cpp:272
int WeightedDistance(const EDGEPT &other, int x_factor) const
Definition: blobs.h:106
TESSLINE * outlines
Definition: blobs.h:384
EDGEPT * next
Definition: blobs.h:176

◆ UpdateSegSearchNodes()

void tesseract::Wordrec::UpdateSegSearchNodes ( float  rating_cert_scale,
int  starting_col,
GenericVector< SegSearchPending > *  pending,
WERD_RES * word_res,
LMPainPoints * pain_points,
BestChoiceBundle * best_choice_bundle,
BlamerBundle * blamer_bundle 
)
protected

Definition at line 181 of file segsearch.cpp.

188  {
189  MATRIX *ratings = word_res->ratings;
190  ASSERT_HOST(ratings->dimension() == pending->size());
191  ASSERT_HOST(ratings->dimension() == best_choice_bundle->beam.size());
192  for (int col = starting_col; col < ratings->dimension(); ++col) {
193  if (!(*pending)[col].WorkToDo()) continue;
194  int first_row = col;
195  int last_row = std::min(ratings->dimension() - 1,
196  col + ratings->bandwidth() - 1);
197  if ((*pending)[col].SingleRow() >= 0) {
198  first_row = last_row = (*pending)[col].SingleRow();
199  }
200  if (segsearch_debug_level > 0) {
201  tprintf("\n\nUpdateSegSearchNodes: col=%d, rows=[%d,%d], alljust=%d\n",
202  col, first_row, last_row,
203  (*pending)[col].IsRowJustClassified(INT32_MAX));
204  }
205  // Iterate over the pending list for this column.
206  for (int row = first_row; row <= last_row; ++row) {
207  // Update language model state of this child+parent pair.
208  BLOB_CHOICE_LIST *current_node = ratings->get(col, row);
209  LanguageModelState *parent_node =
210  col == 0 ? nullptr : best_choice_bundle->beam[col - 1];
211  if (current_node != nullptr &&
212  language_model_->UpdateState((*pending)[col].IsRowJustClassified(row),
213  col, row, current_node, parent_node,
214  pain_points, word_res,
215  best_choice_bundle, blamer_bundle) &&
216  row + 1 < ratings->dimension()) {
217  // Since the language model state of this entry changed, process all
218  // the child column.
219  (*pending)[row + 1].RevisitWholeColumn();
220  if (segsearch_debug_level > 0) {
221  tprintf("Added child col=%d to pending\n", row + 1);
222  }
223  } // end if UpdateState.
224  } // end for row.
225  } // end for col.
226  if (best_choice_bundle->best_vse != nullptr) {
227  ASSERT_HOST(word_res->StatesAllValid());
228  if (best_choice_bundle->best_vse->updated) {
229  pain_points->GenerateFromPath(rating_cert_scale,
230  best_choice_bundle->best_vse, word_res);
231  if (!best_choice_bundle->fixpt.empty()) {
232  pain_points->GenerateFromAmbigs(best_choice_bundle->fixpt,
233  best_choice_bundle->best_vse, word_res);
234  }
235  }
236  }
237  // The segsearch is completed. Reset all updated flags on all VSEs and reset
238  // all pendings.
239  for (int col = 0; col < pending->size(); ++col) {
240  (*pending)[col].Clear();
241  ViterbiStateEntry_IT
242  vse_it(&best_choice_bundle->beam[col]->viterbi_state_entries);
243  for (vse_it.mark_cycle_pt(); !vse_it.cycled_list(); vse_it.forward()) {
244  vse_it.data()->updated = false;
245  }
246  }
247 }
int size() const
Definition: genericvector.h:71
int bandwidth() const
Definition: matrix.h:535
std::unique_ptr< LanguageModel > language_model_
Definition: wordrec.h:476
int dimension() const
Definition: matrix.h:533
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:37
MATRIX * ratings
Definition: pageres.h:231
int segsearch_debug_level
Definition: wordrec.h:238
Definition: matrix.h:575
T get(ICOORD pos) const
Definition: matrix.h:228
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ vertical_projection_point()

void tesseract::Wordrec::vertical_projection_point ( EDGEPT * split_point,
EDGEPT * target_point,
EDGEPT **  best_point,
EDGEPT_CLIST *  new_points 
)

Definition at line 272 of file chop.cpp.

274  {
275  EDGEPT *p; /* Iterator */
276  EDGEPT *this_edgept; /* Iterator */
277  EDGEPT_C_IT new_point_it(new_points);
278  int x = split_point->pos.x; /* X value of vertical */
279  int best_dist = LARGE_DISTANCE;/* Best point found */
280 
281  if (*best_point != nullptr)
282  best_dist = edgept_dist(split_point, *best_point);
283 
284  p = target_point;
285  /* Look at each edge point */
286  do {
287  if (((p->pos.x <= x && x <= p->next->pos.x) ||
288  (p->next->pos.x <= x && x <= p->pos.x)) &&
289  !same_point(split_point->pos, p->pos) &&
290  !same_point(split_point->pos, p->next->pos) &&
291  !p->IsChopPt() &&
292  (*best_point == nullptr || !same_point((*best_point)->pos, p->pos))) {
293 
294  if (near_point(split_point, p, p->next, &this_edgept)) {
295  new_point_it.add_before_then_move(this_edgept);
296  }
297 
298  if (*best_point == nullptr)
299  best_dist = edgept_dist (split_point, this_edgept);
300 
301  this_edgept =
302  pick_close_point(split_point, this_edgept, &best_dist);
303  if (this_edgept)
304  *best_point = this_edgept;
305  }
306 
307  p = p->next;
308  }
309  while (p != target_point);
310 }
TPOINT pos
Definition: blobs.h:170
#define edgept_dist(p1, p2)
Definition: outlines.h:88
bool near_point(EDGEPT *point, EDGEPT *line_pt_0, EDGEPT *line_pt_1, EDGEPT **near_pt)
Definition: outlines.cpp:45
Definition: blobs.h:83
EDGEPT * pick_close_point(EDGEPT *critical_point, EDGEPT *vertical_point, int *best_dist)
Definition: chop.cpp:122
int16_t x
Definition: blobs.h:78
#define LARGE_DISTANCE
Definition: outlines.h:37
bool IsChopPt() const
Definition: blobs.h:166
#define same_point(p1, p2)
Definition: outlines.h:50
EDGEPT * next
Definition: blobs.h:176

Member Data Documentation

◆ assume_fixed_pitch_char_segment

bool tesseract::Wordrec::assume_fixed_pitch_char_segment = FALSE

"include fixed-pitch heuristics in char segmentation"

Definition at line 230 of file wordrec.h.

◆ blame_reasons_

GenericVector<int> tesseract::Wordrec::blame_reasons_

Definition at line 483 of file wordrec.h.

◆ chop_center_knob

double tesseract::Wordrec::chop_center_knob = 0.15

"Split center adjustment"

Definition at line 220 of file wordrec.h.

◆ chop_centered_maxwidth

int tesseract::Wordrec::chop_centered_maxwidth = 90

"Width of (smaller) chopped blobs above which we don't care that a chop is not near the center."

Definition at line 222 of file wordrec.h.

◆ chop_debug

int tesseract::Wordrec::chop_debug = 0

"Chop debug"

Definition at line 208 of file wordrec.h.

◆ chop_enable

bool tesseract::Wordrec::chop_enable = 1

"Chop enable"

Definition at line 209 of file wordrec.h.

◆ chop_good_split

double tesseract::Wordrec::chop_good_split = 50.0

"Good split limit"

Definition at line 226 of file wordrec.h.

◆ chop_inside_angle

int tesseract::Wordrec::chop_inside_angle = -50

"Min Inside Angle Bend"

Definition at line 216 of file wordrec.h.

◆ chop_min_outline_area

int tesseract::Wordrec::chop_min_outline_area = 2000

"Min Outline Area"

Definition at line 217 of file wordrec.h.

◆ chop_min_outline_points

int tesseract::Wordrec::chop_min_outline_points = 6

"Min Number of Points on Outline"

Definition at line 213 of file wordrec.h.

◆ chop_new_seam_pile

bool tesseract::Wordrec::chop_new_seam_pile = 1

"Use new seam_pile"

Definition at line 215 of file wordrec.h.

◆ chop_ok_split

double tesseract::Wordrec::chop_ok_split = 100.0

"OK split limit"

Definition at line 225 of file wordrec.h.

◆ chop_overlap_knob

double tesseract::Wordrec::chop_overlap_knob = 0.9

"Split overlap adjustment"

Definition at line 219 of file wordrec.h.

◆ chop_same_distance

int tesseract::Wordrec::chop_same_distance = 2

"Same distance"

Definition at line 212 of file wordrec.h.

◆ chop_seam_pile_size

int tesseract::Wordrec::chop_seam_pile_size = 150

"Max number of seams in seam_pile"

Definition at line 214 of file wordrec.h.

◆ chop_sharpness_knob

double tesseract::Wordrec::chop_sharpness_knob = 0.06

"Split sharpness adjustment"

Definition at line 223 of file wordrec.h.

◆ chop_split_dist_knob

double tesseract::Wordrec::chop_split_dist_knob = 0.5

"Split length adjustment"

Definition at line 218 of file wordrec.h.

◆ chop_split_length

int tesseract::Wordrec::chop_split_length = 10000

"Split Length"

Definition at line 211 of file wordrec.h.

◆ chop_vertical_creep

bool tesseract::Wordrec::chop_vertical_creep = 0

"Vertical creep"

Definition at line 210 of file wordrec.h.

◆ chop_width_change_knob

double tesseract::Wordrec::chop_width_change_knob = 5.0

"Width change adjustment"

Definition at line 224 of file wordrec.h.

◆ chop_x_y_weight

int tesseract::Wordrec::chop_x_y_weight = 3

"X / Y length weight"

Definition at line 227 of file wordrec.h.

◆ fill_lattice_

void(Wordrec::* tesseract::Wordrec::fill_lattice_) (const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)

Definition at line 485 of file wordrec.h.

◆ force_word_assoc

bool tesseract::Wordrec::force_word_assoc = FALSE

"force associator to run regardless of what enable_assoc is. This is used for CJK where component grouping is necessary."

Definition at line 202 of file wordrec.h.

◆ fragments_guide_chopper

bool tesseract::Wordrec::fragments_guide_chopper = FALSE

"Use information from fragments to guide chopping process"

Definition at line 205 of file wordrec.h.

◆ language_model_

std::unique_ptr<LanguageModel> tesseract::Wordrec::language_model_

Definition at line 476 of file wordrec.h.

◆ merge_fragments_in_matrix

bool tesseract::Wordrec::merge_fragments_in_matrix = TRUE

"Merge the fragments in the ratings matrix and delete them after merging"

Definition at line 197 of file wordrec.h.

◆ pass2_ok_split

PRIORITY tesseract::Wordrec::pass2_ok_split

Definition at line 477 of file wordrec.h.

◆ prev_word_best_choice_

WERD_CHOICE* tesseract::Wordrec::prev_word_best_choice_

Definition at line 481 of file wordrec.h.

◆ repair_unchopped_blobs

int tesseract::Wordrec::repair_unchopped_blobs = 1

"Fix blobs that aren't chopped"

Definition at line 206 of file wordrec.h.

◆ save_alt_choices

bool tesseract::Wordrec::save_alt_choices = true

"Save alternative paths found during chopping and segmentation search"

Definition at line 247 of file wordrec.h.

◆ segment_adjust_debug

int tesseract::Wordrec::segment_adjust_debug = 0

"Segmentation adjustment debug"

Definition at line 228 of file wordrec.h.

◆ segsearch_debug_level

int tesseract::Wordrec::segsearch_debug_level = 0

"SegSearch debug level"

Definition at line 238 of file wordrec.h.

◆ segsearch_max_char_wh_ratio

double tesseract::Wordrec::segsearch_max_char_wh_ratio = 2.0

"Maximum character width-to-height ratio"

Definition at line 244 of file wordrec.h.

◆ segsearch_max_futile_classifications

int tesseract::Wordrec::segsearch_max_futile_classifications = 10

"Maximum number of pain point classifications per word."

Definition at line 242 of file wordrec.h.

◆ segsearch_max_pain_points

int tesseract::Wordrec::segsearch_max_pain_points = 2000

"Maximum number of pain points stored in the queue"

Definition at line 240 of file wordrec.h.

◆ tessedit_certainty_threshold

double tesseract::Wordrec::tessedit_certainty_threshold = -2.25

"Good blob limit"

Definition at line 207 of file wordrec.h.

◆ wordrec_debug_blamer

bool tesseract::Wordrec::wordrec_debug_blamer = false

"Print blamer debug messages"

Definition at line 236 of file wordrec.h.

◆ wordrec_debug_level

int tesseract::Wordrec::wordrec_debug_level = 0

"Debug level for wordrec"

Definition at line 231 of file wordrec.h.

◆ wordrec_enable_assoc

bool tesseract::Wordrec::wordrec_enable_assoc = TRUE

"Associator Enable"

Definition at line 199 of file wordrec.h.

◆ wordrec_max_join_chunks

int tesseract::Wordrec::wordrec_max_join_chunks = 4

"Max number of broken pieces to associate"

Definition at line 233 of file wordrec.h.

◆ wordrec_no_block

bool tesseract::Wordrec::wordrec_no_block = FALSE

"Don't output block information"

Definition at line 198 of file wordrec.h.

◆ wordrec_run_blamer

bool tesseract::Wordrec::wordrec_run_blamer = false

"Try to set the blame for errors"

Definition at line 237 of file wordrec.h.

◆ wordrec_skip_no_truth_words

bool tesseract::Wordrec::wordrec_skip_no_truth_words = false

"Only run OCR for words that had truth recorded in BlamerBundle"

Definition at line 235 of file wordrec.h.

◆ wordrec_worst_state

double tesseract::Wordrec::wordrec_worst_state = 1

"Worst segmentation state"

Definition at line 203 of file wordrec.h.


The documentation for this class was generated from the following files: