Typedefs | |
typedef int(Dict::* | DictFunc )(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const |
typedef double(Dict::* | ProbabilityInContextFunc )(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes) |
typedef float(Dict::* | ParamsModelClassifyFunc )(const char *lang, void *path) |
typedef void(Wordrec::* | FillLatticeFunc )(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle) |
typedef TessCallback4< const UNICHARSET &, int, PageIterator *, Pix * > | TruthCallback |
typedef GenericVectorEqEq < const ParagraphModel * > | SetOfModels |
typedef void(Tesseract::* | WordRecognizer )(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words) |
typedef GenericVector < ParamsTrainingHypothesis > | ParamsTrainingHypothesisList |
typedef GenericVector< UNICHAR_ID > | UnicharIdVector |
typedef GenericVector < AmbigSpec_LIST * > | UnicharAmbigsVector |
typedef bool(* | FileReader )(const STRING &filename, GenericVector< char > *data) |
typedef bool(* | FileWriter )(const GenericVector< char > &data, const STRING &filename) |
typedef KDPairInc< int, int > | IntKDPair |
typedef GenericHeap < ShapeQueueEntry > | ShapeQueue |
typedef signed int | char_32 |
typedef basic_string< char_32 > | string_32 |
typedef GenericVector< NodeChild > | NodeChildVector |
typedef GenericVector< int > | SuccessorList |
typedef GenericVector < SuccessorList * > | SuccessorListsVector |
typedef GenericVector< Dawg * > | DawgVector |
typedef GridSearch< BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT > | BlobGridSearch |
typedef GridSearch < ColPartition, ColPartition_CLIST, ColPartition_C_IT > | ColPartitionGridSearch |
typedef GenericVector < ColPartitionSet * > | PartSetVector |
typedef TessResultCallback1 < bool, int > | WidthCallback |
typedef BBGrid< ColSegment, ColSegment_CLIST, ColSegment_C_IT > | ColSegmentGrid |
typedef GridSearch< ColSegment, ColSegment_CLIST, ColSegment_C_IT > | ColSegmentGridSearch |
typedef BBGrid< WordWithBox, WordWithBox_CLIST, WordWithBox_C_IT > | WordGrid |
typedef GridSearch < WordWithBox, WordWithBox_CLIST, WordWithBox_C_IT > | WordSearch |
typedef hash_map< string, string, StringHash > | LigHash |
typedef GenericHeap < MatrixCoordPair > | PainPointHeap |
typedef unsigned char | LanguageModelFlagsType |
Used for expressing various language model flags. More... | |
Functions | |
int | CubeAPITest (Boxa *boxa_blocks, Pixa *pixa_blocks, Boxa *boxa_words, Pixa *pixa_words, const FCOORD &reskew, Pix *page_pix, PAGE_RES *page_res) |
TBLOB * | make_tesseract_blob (float baseline, float xheight, float descender, float ascender, bool numeric_mode, Pix *pix) |
STRING | HOcrEscape (const char *text) |
double | prec (double x) |
long | dist2 (int x1, int y1, int x2, int y2) |
void | GetWordBaseline (int writing_direction, int ppi, int height, int word_x1, int word_y1, int word_x2, int word_y2, int line_x1, int line_y1, int line_x2, int line_y2, double *x0, double *y0, double *length) |
void | AffineMatrix (int writing_direction, int line_x1, int line_y1, int line_x2, int line_y2, double *a, double *b, double *c, double *d) |
void | ClipBaseline (int ppi, int x1, int y1, int x2, int y2, int *line_x1, int *line_y1, int *line_x2, int *line_y2) |
bool | IsTextOrEquationType (PolyBlockType type) |
bool | IsLeftIndented (const EquationDetect::IndentType type) |
bool | IsRightIndented (const EquationDetect::IndentType type) |
STRING | RtlEmbed (const STRING &word, bool rtlify) |
bool | IsLatinLetter (int ch) |
bool | IsDigitLike (int ch) |
bool | IsOpeningPunct (int ch) |
bool | IsTerminalPunct (int ch) |
const char * | SkipChars (const char *str, const char *toskip) |
const char * | SkipChars (const char *str, bool(*skip)(int)) |
const char * | SkipOne (const char *str, const char *toskip) |
bool | LikelyListNumeral (const STRING &word) |
bool | LikelyListMark (const STRING &word) |
bool | AsciiLikelyListItem (const STRING &word) |
int | UnicodeFor (const UNICHARSET *u, const WERD_CHOICE *werd, int pos) |
bool | LikelyListMarkUnicode (int ch) |
bool | UniLikelyListItem (const UNICHARSET *u, const WERD_CHOICE *werd) |
void | LeftWordAttributes (const UNICHARSET *unicharset, const WERD_CHOICE *werd, const STRING &utf8, bool *is_list, bool *starts_idea, bool *ends_idea) |
void | RightWordAttributes (const UNICHARSET *unicharset, const WERD_CHOICE *werd, const STRING &utf8, bool *is_list, bool *starts_idea, bool *ends_idea) |
int | ClosestCluster (const GenericVector< Cluster > &clusters, int value) |
void | CalculateTabStops (GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, int tolerance, GenericVector< Cluster > *left_tabs, GenericVector< Cluster > *right_tabs) |
void | MarkRowsWithModel (GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, const ParagraphModel *model, bool ltr, int eop_threshold) |
void | GeometricClassifyThreeTabStopTextBlock (int debug_level, GeometricClassifierState &s, ParagraphTheory *theory) |
void | GeometricClassify (int debug_level, GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, ParagraphTheory *theory) |
bool | ValidFirstLine (const GenericVector< RowScratchRegisters > *rows, int row, const ParagraphModel *model) |
bool | ValidBodyLine (const GenericVector< RowScratchRegisters > *rows, int row, const ParagraphModel *model) |
bool | CrownCompatible (const GenericVector< RowScratchRegisters > *rows, int a, int b, const ParagraphModel *model) |
void | DiscardUnusedModels (const GenericVector< RowScratchRegisters > &rows, ParagraphTheory *theory) |
void | DowngradeWeakestToCrowns (int debug_level, ParagraphTheory *theory, GenericVector< RowScratchRegisters > *rows) |
void | RecomputeMarginsAndClearHypotheses (GenericVector< RowScratchRegisters > *rows, int start, int end, int percentile) |
int | InterwordSpace (const GenericVector< RowScratchRegisters > &rows, int row_start, int row_end) |
bool | FirstWordWouldHaveFit (const RowScratchRegisters &before, const RowScratchRegisters &after, tesseract::ParagraphJustification justification) |
bool | FirstWordWouldHaveFit (const RowScratchRegisters &before, const RowScratchRegisters &after) |
bool | TextSupportsBreak (const RowScratchRegisters &before, const RowScratchRegisters &after) |
bool | LikelyParagraphStart (const RowScratchRegisters &before, const RowScratchRegisters &after) |
bool | LikelyParagraphStart (const RowScratchRegisters &before, const RowScratchRegisters &after, tesseract::ParagraphJustification j) |
ParagraphModel | InternalParagraphModelByOutline (const GenericVector< RowScratchRegisters > *rows, int start, int end, int tolerance, bool *consistent) |
ParagraphModel | ParagraphModelByOutline (int debug_level, const GenericVector< RowScratchRegisters > *rows, int start, int end, int tolerance) |
bool | RowsFitModel (const GenericVector< RowScratchRegisters > *rows, int start, int end, const ParagraphModel *model) |
void | MarkStrongEvidence (GenericVector< RowScratchRegisters > *rows, int row_start, int row_end) |
void | ModelStrongEvidence (int debug_level, GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, bool allow_flush_models, ParagraphTheory *theory) |
void | StrongEvidenceClassify (int debug_level, GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, ParagraphTheory *theory) |
void | SeparateSimpleLeaderLines (GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, ParagraphTheory *theory) |
void | ConvertHypothesizedModelRunsToParagraphs (int debug_level, const GenericVector< RowScratchRegisters > &rows, GenericVector< PARA * > *row_owners, ParagraphTheory *theory) |
bool | RowIsStranded (const GenericVector< RowScratchRegisters > &rows, int row) |
void | LeftoverSegments (const GenericVector< RowScratchRegisters > &rows, GenericVector< Interval > *to_fix, int row_start, int row_end) |
void | CanonicalizeDetectionResults (GenericVector< PARA * > *row_owners, PARA_LIST *paragraphs) |
void | DetectParagraphs (int debug_level, GenericVector< RowInfo > *row_infos, GenericVector< PARA * > *row_owners, PARA_LIST *paragraphs, GenericVector< ParagraphModel * > *models) |
void | InitializeTextAndBoxesPreRecognition (const MutableIterator &it, RowInfo *info) |
void | InitializeRowInfo (bool after_recognition, const MutableIterator &it, RowInfo *info) |
void | DetectParagraphs (int debug_level, bool after_text_recognition, const MutableIterator *block_start, GenericVector< ParagraphModel * > *models) |
bool | StrongModel (const ParagraphModel *model) |
bool | read_t (PAGE_RES_IT *page_res_it, TBOX *tbox) |
void | YOutlierPieces (WERD_RES *word, int rebuilt_blob_index, int super_y_bottom, int sub_y_top, ScriptPos *leading_pos, int *num_leading_outliers, ScriptPos *trailing_pos, int *num_trailing_outliers) |
bool | CompareFontInfo (const FontInfo &fi1, const FontInfo &fi2) |
bool | CompareFontSet (const FontSet &fs1, const FontSet &fs2) |
void | FontInfoDeleteCallback (FontInfo f) |
void | FontSetDeleteCallback (FontSet fs) |
bool | read_info (FILE *f, FontInfo *fi, bool swap) |
bool | write_info (FILE *f, const FontInfo &fi) |
bool | read_spacing_info (FILE *f, FontInfo *fi, bool swap) |
bool | write_spacing_info (FILE *f, const FontInfo &fi) |
bool | read_set (FILE *f, FontSet *fs, bool swap) |
bool | write_set (FILE *f, const FontSet &fs) |
int | OtsuThreshold (Pix *src_pix, int left, int top, int width, int height, int **thresholds, int **hi_values) |
void | HistogramRect (Pix *src_pix, int channel, int left, int top, int width, int height, int *histogram) |
int | OtsuStats (const int *histogram, int *H_out, int *omega0_out) |
int | ParamsTrainingFeatureByName (const char *name) |
bool | PSM_OSD_ENABLED (int pageseg_mode) |
bool | PSM_ORIENTATION_ENABLED (int pageseg_mode) |
bool | PSM_COL_FIND_ENABLED (int pageseg_mode) |
bool | PSM_SPARSE (int pageseg_mode) |
bool | PSM_BLOCK_FIND_ENABLED (int pageseg_mode) |
bool | PSM_LINE_FIND_ENABLED (int pageseg_mode) |
bool | PSM_WORD_FIND_ENABLED (int pageseg_mode) |
const char * | ScriptPosToString (enum ScriptPos script_pos) |
ELISTIZE (AmbigSpec) | |
ELISTIZEH (AmbigSpec) | |
bool | LoadDataFromFile (const STRING &filename, GenericVector< char > *data) |
bool | SaveDataToFile (const GenericVector< char > &data, const STRING &filename) |
template<typename T > | |
bool | cmp_eq (T const &t1, T const &t2) |
template<typename T > | |
int | sort_cmp (const void *t1, const void *t2) |
template<typename T > | |
int | sort_ptr_cmp (const void *t1, const void *t2) |
void | ExtractFontName (const STRING &filename, STRING *fontname) |
TrainingSample * | BlobToTrainingSample (const TBLOB &blob, bool nonlinear_norm, INT_FX_RESULT_STRUCT *fx_info, GenericVector< INT_FEATURE_STRUCT > *bl_features) |
uinT8 | NormalizeDirection (uinT8 dir, const FCOORD &unnormed_pos, const DENORM &denorm, const DENORM *root_denorm) |
void | ClearFeatureSpaceWindow (NORM_METHOD norm_method, ScrollView *window) |
void | CallWithUTF8 (TessCallback1< const char * > *cb, const WERD_CHOICE *wc) |
Pix * | GridReducedPix (const TBOX &box, int gridsize, ICOORD bleft, int *left, int *bottom) |
Pix * | TraceOutlineOnReducedPix (C_OUTLINE *outline, int gridsize, ICOORD bleft, int *left, int *bottom) |
Pix * | TraceBlockOnReducedPix (BLOCK *block, int gridsize, ICOORD bleft, int *left, int *bottom) |
template<class BBC > | |
int | SortByBoxLeft (const void *void1, const void *void2) |
template<class BBC > | |
int | SortRightToLeft (const void *void1, const void *void2) |
template<class BBC > | |
int | SortByBoxBottom (const void *void1, const void *void2) |
template<typename T > | |
void | DeleteObject (T *object) |
void | SetBlobStrokeWidth (Pix *pix, BLOBNBOX *blob) |
void | assign_blobs_to_blocks2 (Pix *pix, BLOCK_LIST *blocks, TO_BLOCK_LIST *port_blocks) |
void | ParseCommandLineFlags (const char *usage, int *argc, char ***argv, const bool remove_flags) |
ShapeTable * | LoadShapeTable (const STRING &file_prefix) |
void | WriteShapeTable (const STRING &file_prefix, const ShapeTable &shape_table) |
MasterTrainer * | LoadTrainingData (int argc, const char *const *argv, bool replication, ShapeTable **shape_table, STRING *file_prefix) |
Pix * | DegradeImage (Pix *input, int exposure, TRand *randomizer, float *rotation) |
void | UTF8ToUTF32 (const char *utf8_str, GenericVector< char32 > *str32) |
void | UTF32ToUTF8 (const GenericVector< char32 > &str32, STRING *utf8_str) |
bool | is_hyphen_punc (const char32 ch) |
bool | is_single_quote (const char32 ch) |
bool | is_double_quote (const char32 ch) |
STRING | NormalizeUTF8String (const char *str8) |
void | NormalizeChar32 (char32 ch, GenericVector< char32 > *str) |
char32 | OCRNormalize (char32 ch) |
bool | IsOCREquivalent (char32 ch1, char32 ch2) |
bool | IsValidCodepoint (const char32 ch) |
bool | IsWhitespace (const char32 ch) |
bool | IsUTF8Whitespace (const char *text) |
int | SpanUTF8Whitespace (const char *text) |
int | SpanUTF8NotWhitespace (const char *text) |
bool | IsInterchangeValid (const char32 ch) |
bool | IsInterchangeValid7BitAscii (const char32 ch) |
char32 | FullwidthToHalfwidth (const char32 ch) |
Pix * | CairoARGB32ToPixFormat (cairo_surface_t *surface) |
void | ExtractFontProperties (const string &utf8_text, StringRenderer *render, const string &output_base) |
bool | MakeIndividualGlyphs (Pix *pix, const vector< BoxChar * > &vbox, const int input_tiff_page) |
void | SetupBasicProperties (bool report_errors, UNICHARSET *unicharset) |
void | SetPropertiesForInputFile (const string &script_dir, const string &input_unicharset_file, const string &output_unicharset_file, const string &output_xheights_file) |
ELISTIZE (ViterbiStateEntry) | |
ELISTIZEH (ViterbiStateEntry) | |
template<class BLOB_CHOICE > | |
int | SortByUnicharID (const void *void1, const void *void2) |
template<class BLOB_CHOICE > | |
int | SortByRating (const void *void1, const void *void2) |
convert_prob_to_tess_certainty | |
Normalize a probability in the range [0.0, 1.0] to a tesseract certainty in the range [-20.0, 0.0] | |
char_box_to_tbox | |
Create a TBOX from a character bounding box. If nonzero, the x_offset accounts for any additional padding of the word box that should be taken into account. | |
TBOX | char_box_to_tbox (Box *char_box, TBOX word_box, int x_offset) |
The box file is assumed to contain box definitions, one per line, of the following format for blob-level boxes:
* <UTF8 str> <left> <bottom> <right> <top> <page id> *
and for word/line-level boxes:
* WordStr <left> <bottom> <right> <top> <page id> #<space-delimited word str> *
NOTES: The boxes use tesseract coordinates, i.e. 0,0 is at BOTTOM-LEFT.
<page id> is 0-based, and the page number is used for multipage input (tiff).
In the blob-level form, each line represents a recognizable unit, which may be several UTF-8 bytes, but there is a bounding box around each recognizable unit, and no classifier is needed to train in this mode (bootstrapping.)
In the word/line-level form, the line begins with the literal "WordStr", and the bounding box bounds either a whole line or a whole word. The recognizable units in the word/line are listed after the # at the end of the line and are space delimited, ignoring any original spaces on the line. Eg.
* word -> #w o r d *
* multi word line -> #m u l t i w o r d l i n e *
The recognizable units must be space-delimited in order to allow multiple unicodes to be used for a single recognizable unit, eg Hindi.
In this mode, the classifier must have been pre-trained with the desired character set, or it will not be able to find the character segmentations.
Make a word from the selected blobs and run Tess on them.
page_res | recognise blobs |
selection_box | within this box |
fp_eval_word_spacing() Evaluation function for fixed pitch word lists.
Basically, count the number of "nice" characters - those which are in tess acceptable words or in dict words and are not rejected. Penalise any potential noise chars
build_menu()
Construct the menu tree used by the command window
process_cmd_win_event()
Process a command returned from the command window (Just call the appropriate command handler)
word_blank_and_set_display() Word processor
Blank display of word then redisplay word according to current display mode settings
typedef GridSearch<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT> tesseract::BlobGridSearch |
Definition at line 31 of file blobgrid.h.
typedef signed int tesseract::char_32 |
Definition at line 40 of file string_32.h.
typedef GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> tesseract::ColPartitionGridSearch |
Definition at line 913 of file colpartition.h.
typedef BBGrid<ColSegment, ColSegment_CLIST, ColSegment_C_IT> tesseract::ColSegmentGrid |
Definition at line 118 of file tablefind.h.
typedef GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> tesseract::ColSegmentGridSearch |
Definition at line 121 of file tablefind.h.
typedef GenericVector<Dawg *> tesseract::DawgVector |
typedef int(Dict::* tesseract::DictFunc)(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const |
typedef bool(* tesseract::FileReader)(const STRING &filename, GenericVector< char > *data) |
Definition at line 349 of file genericvector.h.
typedef bool(* tesseract::FileWriter)(const GenericVector< char > &data, const STRING &filename) |
Definition at line 352 of file genericvector.h.
typedef void(Wordrec::* tesseract::FillLatticeFunc)(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle) |
typedef KDPairInc<int, int> tesseract::IntKDPair |
typedef unsigned char tesseract::LanguageModelFlagsType |
Used for expressing various language model flags.
Definition at line 37 of file lm_state.h.
typedef hash_map<string, string, StringHash> tesseract::LigHash |
Definition at line 32 of file ligature_table.h.
Definition at line 34 of file lm_pain_points.h.
typedef float(Dict::* tesseract::ParamsModelClassifyFunc)(const char *lang, void *path) |
Definition at line 122 of file params_training_featdef.h.
Definition at line 33 of file colpartitionset.h.
typedef double(Dict::* tesseract::ProbabilityInContextFunc)(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes) |
typedef GenericVectorEqEq<const ParagraphModel *> tesseract::SetOfModels |
Definition at line 94 of file paragraphs_internal.h.
Definition at line 156 of file shapetable.h.
typedef basic_string<char_32> tesseract::string_32 |
Definition at line 41 of file string_32.h.
typedef GenericVector<int> tesseract::SuccessorList |
typedef TessCallback4<const UNICHARSET &, int, PageIterator *, Pix *> tesseract::TruthCallback |
typedef GenericVector<AmbigSpec_LIST *> tesseract::UnicharAmbigsVector |
typedef TessResultCallback1<bool, int> tesseract::WidthCallback |
typedef BBGrid<WordWithBox, WordWithBox_CLIST, WordWithBox_C_IT> tesseract::WordGrid |
typedef void(Tesseract::* tesseract::WordRecognizer)(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words) |
Definition at line 166 of file tesseractclass.h.
typedef GridSearch<WordWithBox, WordWithBox_CLIST, WordWithBox_C_IT> tesseract::WordSearch |
enum tesseract::AmbigType |
Enumerator | |
---|---|
NOT_AMBIG | |
REPLACE_AMBIG | |
DEFINITE_AMBIG | |
SIMILAR_AMBIG | |
CASE_AMBIG | |
AMBIG_TYPE_COUNT |
Definition at line 44 of file ambigs.h.
Enumerator | |
---|---|
CST_FRAGMENT | |
CST_WHOLE | |
CST_IMPROPER | |
CST_NGRAM |
Definition at line 54 of file classify.h.
Enumerator | |
---|---|
ACTION_1_CMD_EVENT | |
RECOG_WERDS | |
RECOG_PSEUDO | |
ACTION_2_CMD_EVENT |
Definition at line 477 of file tessedit.cpp.
Enumerator | |
---|---|
COL_UNKNOWN | |
COL_TEXT | |
COL_TABLE | |
COL_MIXED | |
COL_COUNT |
Definition at line 30 of file tablefind.h.
Enumerator | |
---|---|
CST_NOISE | |
CST_FLOWING | |
CST_HEADING | |
CST_PULLOUT | |
CST_COUNT |
Definition at line 47 of file colpartition.h.
Definition at line 69 of file errorcounter.h.
enum tesseract::DawgType |
Definition at line 39 of file params_training_featdef.h.
Enumerator | |
---|---|
LR_LEFT | |
LR_RIGHT |
Definition at line 39 of file strokewidth.h.
enum tesseract::LineType |
Enumerator | |
---|---|
LT_START | |
LT_BODY | |
LT_UNKNOWN | |
LT_MULTIPLE |
Definition at line 54 of file paragraphs_internal.h.
Enumerator | |
---|---|
LM_PPTYPE_BLAMER | |
LM_PPTYPE_AMBIG | |
LM_PPTYPE_PATH | |
LM_PPTYPE_SHAPE | |
LM_PPTYPE_NUM |
Definition at line 37 of file lm_pain_points.h.
Enumerator | |
---|---|
NPT_HTEXT | |
NPT_VTEXT | |
NPT_WEAK_HTEXT | |
NPT_WEAK_VTEXT | |
NPT_IMAGE | |
NPT_COUNT |
Definition at line 1558 of file colpartitiongrid.cpp.
Enumerator | |
---|---|
NM_BASELINE | |
NM_CHAR_ISOTROPIC | |
NM_CHAR_ANISOTROPIC |
Definition at line 44 of file normalis.h.
When Tesseract/Cube is initialized we can choose to instantiate/load/run only the Tesseract part, only the Cube part or both along with the combiner. The preference of which engine to use is stored in tessedit_ocr_engine_mode.
ATTENTION: When modifying this enum, please make sure to make the appropriate changes to all the enums mirroring it (e.g. OCREngine in cityblock/workflow/detection/detection_storage.proto). Such enums will mention the connection to OcrEngineMode in the comments.
Enumerator | |
---|---|
OEM_TESSERACT_ONLY | |
OEM_CUBE_ONLY | |
OEM_TESSERACT_CUBE_COMBINED | |
OEM_DEFAULT |
Definition at line 256 of file publictypes.h.
+------------------+  Orientation Example:
| 1 Aaaa Aaaa Aaaa |  ====================
| Aaa aa aaa aa    |  To left is a diagram of some (1) English and
| aaaaaa A aa aaa. |  (2) Chinese text and a (3) photo credit.
|                2 |
|   #######  c c C |  Upright Latin characters are represented as A and a.
|   #######  c c c |  '<' represents a latin character rotated
| < #######  c c c |      anti-clockwise 90 degrees.
| < #######  c   c |
| < #######  .   c |  Upright Chinese characters are represented C and c.
| 3 #######      c |
+------------------+  NOTA BENE: enum values here should match goodoc.proto
If you orient your head so that "up" aligns with Orientation, then the characters will appear "right side up" and readable.
In the example above, both the English and Chinese paragraphs are oriented so their "up" is the top of the page (page up). The photo credit is read with one's head turned leftward ("up" is to page left).
The values of this enum match the convention of Tesseract's osdetect.h
Enumerator | |
---|---|
ORIENTATION_PAGE_UP | |
ORIENTATION_PAGE_RIGHT | |
ORIENTATION_PAGE_DOWN | |
ORIENTATION_PAGE_LEFT |
Definition at line 108 of file publictypes.h.
enum of the elements of the page hierarchy, used in ResultIterator to provide functions that operate on each level without having to have 5x as many functions.
Enumerator | |
---|---|
RIL_BLOCK | |
RIL_PARA | |
RIL_TEXTLINE | |
RIL_WORD | |
RIL_SYMBOL |
Definition at line 207 of file publictypes.h.
Possible modes for page layout analysis. These must be kept in order of decreasing amount of layout analysis to be done, except for OSD_ONLY, so that the inequality test macros below work.
Definition at line 151 of file publictypes.h.
JUSTIFICATION_UNKNOWN The alignment is not clearly one of the other options. This could happen for example if there are only one or two lines of text or the text looks like source code or poetry.
NOTA BENE: Fully justified paragraphs (text aligned to both left and right margins) are marked by Tesseract with JUSTIFICATION_LEFT if their text is written with a left-to-right script and with JUSTIFICATION_RIGHT if their text is written in a right-to-left script.
Interpretation for text read in vertical lines: "Left" is wherever the starting reading position is.
JUSTIFICATION_LEFT Each line, except possibly the first, is flush to the same left tab stop.
JUSTIFICATION_CENTER The text lines of the paragraph are centered about a line going down through the middle of the text lines.
JUSTIFICATION_RIGHT Each line, except possibly the first, is flush to the same right tab stop.
Enumerator | |
---|---|
JUSTIFICATION_UNKNOWN | |
JUSTIFICATION_LEFT | |
JUSTIFICATION_CENTER | |
JUSTIFICATION_RIGHT |
Definition at line 239 of file publictypes.h.
Enumerator | |
---|---|
PFR_OK | |
PFR_SKEW | |
PFR_NOISE |
Definition at line 46 of file strokewidth.h.
enum tesseract::ScriptPos |
Enumerator | |
---|---|
SP_NORMAL | |
SP_SUBSCRIPT | |
SP_SUPERSCRIPT | |
SP_DROPCAP |
Enumerator | |
---|---|
SET_PARAM_CONSTRAINT_NONE | |
SET_PARAM_CONSTRAINT_DEBUG_ONLY | |
SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY | |
SET_PARAM_CONSTRAINT_NON_INIT_ONLY |
Definition at line 36 of file params.h.
Enumerator | |
---|---|
TA_LEFT_ALIGNED | |
TA_LEFT_RAGGED | |
TA_CENTER_JUSTIFIED | |
TA_RIGHT_ALIGNED | |
TA_RIGHT_RAGGED | |
TA_SEPARATOR | |
TA_COUNT |
Definition at line 43 of file tabvector.h.
Definition at line 53 of file tessdatamanager.h.
The text lines are read in the given sequence.
In English, the order is top-to-bottom. In Chinese, vertical text lines are read right-to-left. Mongolian is written in vertical columns top to bottom like Chinese, but the lines are ordered left-to-right.
Note that only some combinations make sense. For example, WRITING_DIRECTION_LEFT_TO_RIGHT implies TEXTLINE_ORDER_TOP_TO_BOTTOM
Enumerator | |
---|---|
TEXTLINE_ORDER_LEFT_TO_RIGHT | |
TEXTLINE_ORDER_RIGHT_TO_LEFT | |
TEXTLINE_ORDER_TOP_TO_BOTTOM |
Definition at line 140 of file publictypes.h.
The grapheme clusters within a line of text are laid out logically in this direction, judged when looking at the text line rotated so that its Orientation is "page up".
For English text, the writing direction is left-to-right. For the Chinese text in the above example, the writing direction is top-to-bottom.
Enumerator | |
---|---|
WRITING_DIRECTION_LEFT_TO_RIGHT | |
WRITING_DIRECTION_RIGHT_TO_LEFT | |
WRITING_DIRECTION_TOP_TO_BOTTOM |
Definition at line 123 of file publictypes.h.
void tesseract::AffineMatrix | ( | int | writing_direction, |
int | line_x1, | ||
int | line_y1, | ||
int | line_x2, | ||
int | line_y2, | ||
double * | a, | ||
double * | b, | ||
double * | c, | ||
double * | d | ||
) |
Definition at line 246 of file pdfrenderer.cpp.
bool tesseract::AsciiLikelyListItem | ( | const STRING & | word | ) |
Definition at line 267 of file paragraphs.cpp.
void tesseract::assign_blobs_to_blocks2 | ( | Pix * | pix, |
BLOCK_LIST * | blocks, | ||
TO_BLOCK_LIST * | port_blocks | ||
) |
Definition at line 157 of file tordmain.cpp.
TrainingSample * tesseract::BlobToTrainingSample | ( | const TBLOB & | blob, |
bool | nonlinear_norm, | ||
INT_FX_RESULT_STRUCT * | fx_info, | ||
GenericVector< INT_FEATURE_STRUCT > * | bl_features | ||
) |
Definition at line 81 of file intfx.cpp.
Pix* tesseract::CairoARGB32ToPixFormat | ( | cairo_surface_t * | surface | ) |
Definition at line 78 of file stringrenderer.cpp.
void tesseract::CalculateTabStops | ( | GenericVector< RowScratchRegisters > * | rows, |
int | row_start, | ||
int | row_end, | ||
int | tolerance, | ||
GenericVector< Cluster > * | left_tabs, | ||
GenericVector< Cluster > * | right_tabs | ||
) |
Definition at line 691 of file paragraphs.cpp.
void tesseract::CallWithUTF8 | ( | TessCallback1< const char * > * | cb, |
const WERD_CHOICE * | wc | ||
) |
Definition at line 112 of file dawg.cpp.
void tesseract::CanonicalizeDetectionResults | ( | GenericVector< PARA * > * | row_owners, |
PARA_LIST * | paragraphs | ||
) |
Definition at line 2232 of file paragraphs.cpp.
Definition at line 42 of file cube_control.cpp.
void tesseract::ClearFeatureSpaceWindow | ( | NORM_METHOD | norm_method, |
ScrollView * | window | ||
) |
Clears the given window and draws the featurespace guides for the appropriate normalization method.
Definition at line 1104 of file intproto.cpp.
void tesseract::ClipBaseline | ( | int | ppi, |
int | x1, | ||
int | y1, | ||
int | x2, | ||
int | y2, | ||
int * | line_x1, | ||
int * | line_y1, | ||
int * | line_x2, | ||
int * | line_y2 | ||
) |
Definition at line 275 of file pdfrenderer.cpp.
int tesseract::ClosestCluster | ( | const GenericVector< Cluster > & | clusters, |
int | value | ||
) |
Definition at line 665 of file paragraphs.cpp.
bool tesseract::cmp_eq | ( | T const & | t1, |
T const & | t2 | ||
) |
Definition at line 382 of file genericvector.h.
bool tesseract::CompareFontInfo | ( | const FontInfo & | fi1, |
const FontInfo & | fi2 | ||
) |
Definition at line 120 of file fontinfo.cpp.
bool tesseract::CompareFontSet | ( | const FontSet & | fs1, |
const FontSet & | fs2 | ||
) |
Definition at line 128 of file fontinfo.cpp.
void tesseract::ConvertHypothesizedModelRunsToParagraphs | ( | int | debug_level, |
const GenericVector< RowScratchRegisters > & | rows, | ||
GenericVector< PARA * > * | row_owners, | ||
ParagraphTheory * | theory | ||
) |
Definition at line 2041 of file paragraphs.cpp.
bool tesseract::CrownCompatible | ( | const GenericVector< RowScratchRegisters > * | rows, |
int | a, | ||
int | b, | ||
const ParagraphModel * | model | ||
) |
Definition at line 1288 of file paragraphs.cpp.
int tesseract::CubeAPITest | ( | Boxa * | boxa_blocks, |
Pixa * | pixa_blocks, | ||
Boxa * | boxa_words, | ||
Pixa * | pixa_words, | ||
const FCOORD & | reskew, | ||
Pix * | page_pix, | ||
PAGE_RES * | page_res | ||
) |
Placeholder for call to Cube and test that the input data is correct. reskew is the direction of baselines in the skewed image in normalized (cos theta, sin theta) form, so (0.866, 0.5) would represent a 30 degree anticlockwise skew.
Definition at line 757 of file baseapi.cpp.
struct Pix * tesseract::DegradeImage | ( | Pix * | input, |
int | exposure, | ||
TRand * | randomizer, | ||
float * | rotation | ||
) |
Definition at line 65 of file degradeimage.cpp.
void tesseract::DeleteObject | ( | T * | object | ) |
Definition at line 165 of file tablefind.cpp.
void tesseract::DetectParagraphs | ( | int | debug_level, |
GenericVector< RowInfo > * | row_infos, | ||
GenericVector< PARA * > * | row_owners, | ||
PARA_LIST * | paragraphs, | ||
GenericVector< ParagraphModel * > * | models | ||
) |
Definition at line 2264 of file paragraphs.cpp.
void tesseract::DetectParagraphs | ( | int | debug_level, |
bool | after_text_recognition, | ||
const MutableIterator * | block_start, | ||
GenericVector< ParagraphModel * > * | models | ||
) |
Definition at line 2509 of file paragraphs.cpp.
void tesseract::DiscardUnusedModels | ( | const GenericVector< RowScratchRegisters > & | rows, |
ParagraphTheory * | theory | ||
) |
Definition at line 1455 of file paragraphs.cpp.
long tesseract::dist2 | ( | int | x1, |
int | y1, | ||
int | x2, | ||
int | y2 | ||
) |
Definition at line 192 of file pdfrenderer.cpp.
void tesseract::DowngradeWeakestToCrowns | ( | int | debug_level, |
ParagraphTheory * | theory, | ||
GenericVector< RowScratchRegisters > * | rows | ||
) |
Definition at line 1488 of file paragraphs.cpp.
tesseract::ELISTIZE | ( | ViterbiStateEntry | ) |
tesseract::ELISTIZE | ( | AmbigSpec | ) |
tesseract::ELISTIZEH | ( | AmbigSpec | ) |
tesseract::ELISTIZEH | ( | ViterbiStateEntry | ) |
void tesseract::ExtractFontProperties | ( | const string & | utf8_text, |
StringRenderer * | render, | ||
const string & | output_base | ||
) |
Definition at line 212 of file text2image.cpp.
bool tesseract::FirstWordWouldHaveFit | ( | const RowScratchRegisters & | before, |
const RowScratchRegisters & | after, | ||
tesseract::ParagraphJustification | justification | ||
) |
Definition at line 1621 of file paragraphs.cpp.
bool tesseract::FirstWordWouldHaveFit | ( | const RowScratchRegisters & | before, |
const RowScratchRegisters & | after | ||
) |
Definition at line 1646 of file paragraphs.cpp.
void tesseract::FontInfoDeleteCallback | ( | FontInfo | f | ) |
Definition at line 139 of file fontinfo.cpp.
void tesseract::FontSetDeleteCallback | ( | FontSet | fs | ) |
Definition at line 146 of file fontinfo.cpp.
Definition at line 239 of file normstrngs.cpp.
void tesseract::GeometricClassify | ( | int | debug_level, |
GenericVector< RowScratchRegisters > * | rows, | ||
int | row_start, | ||
int | row_end, | ||
ParagraphTheory * | theory | ||
) |
Definition at line 1077 of file paragraphs.cpp.
void tesseract::GeometricClassifyThreeTabStopTextBlock | ( | int | debug_level, |
GeometricClassifierState & | s, | ||
ParagraphTheory * | theory | ||
) |
Definition at line 985 of file paragraphs.cpp.
void tesseract::GetWordBaseline | ( | int | writing_direction, |
int | ppi, | ||
int | height, | ||
int | word_x1, | ||
int | word_y1, | ||
int | word_x2, | ||
int | word_y2, | ||
int | line_x1, | ||
int | line_y1, | ||
int | line_x2, | ||
int | line_y2, | ||
double * | x0, | ||
double * | y0, | ||
double * | length | ||
) |
Definition at line 204 of file pdfrenderer.cpp.
Pix* tesseract::GridReducedPix | ( | const TBOX & | box, |
int | gridsize, | ||
ICOORD | bleft, | ||
int * | left, | ||
int * | bottom | ||
) |
Definition at line 212 of file bbgrid.cpp.
void tesseract::HistogramRect | ( | Pix * | src_pix, |
int | channel, | ||
int | left, | ||
int | top, | ||
int | width, | ||
int | height, | ||
int * | histogram | ||
) |
Definition at line 157 of file otsuthr.cpp.
STRING tesseract::HOcrEscape | ( | const char * | text | ) |
Escape a char string - replace <>&"' with HTML codes.
Escape a char string - replace &<>"' with HTML codes.
Definition at line 2644 of file baseapi.cpp.
void tesseract::InitializeRowInfo | ( | bool | after_recognition, |
const MutableIterator & | it, | ||
RowInfo * | info | ||
) |
Definition at line 2411 of file paragraphs.cpp.
void tesseract::InitializeTextAndBoxesPreRecognition | ( | const MutableIterator & | it, |
RowInfo * | info | ||
) |
Definition at line 2359 of file paragraphs.cpp.
ParagraphModel tesseract::InternalParagraphModelByOutline | ( | const GenericVector< RowScratchRegisters > * | rows, |
int | start, | ||
int | end, | ||
int | tolerance, | ||
bool * | consistent | ||
) |
Definition at line 1692 of file paragraphs.cpp.
int tesseract::InterwordSpace | ( | const GenericVector< RowScratchRegisters > & | rows, |
int | row_start, | ||
int | row_end | ||
) |
Definition at line 1598 of file paragraphs.cpp.
bool tesseract::is_double_quote | ( | const char32 | ch | ) |
Definition at line 97 of file normstrngs.cpp.
bool tesseract::is_hyphen_punc | ( | const char32 | ch | ) |
Definition at line 58 of file normstrngs.cpp.
bool tesseract::is_single_quote | ( | const char32 | ch | ) |
Definition at line 77 of file normstrngs.cpp.
bool tesseract::IsDigitLike | ( | int | ch | ) |
Definition at line 197 of file paragraphs.cpp.
bool tesseract::IsInterchangeValid | ( | const char32 | ch | ) |
Definition at line 208 of file normstrngs.cpp.
bool tesseract::IsInterchangeValid7BitAscii | ( | const char32 | ch | ) |
Definition at line 232 of file normstrngs.cpp.
bool tesseract::IsLatinLetter | ( | int | ch | ) |
Definition at line 193 of file paragraphs.cpp.
|
inline |
Definition at line 95 of file equationdetect.cpp.
Definition at line 166 of file normstrngs.cpp.
bool tesseract::IsOpeningPunct | ( | int | ch | ) |
Definition at line 201 of file paragraphs.cpp.
|
inline |
Definition at line 100 of file equationdetect.cpp.
bool tesseract::IsTerminalPunct | ( | int | ch | ) |
Definition at line 205 of file paragraphs.cpp.
|
inline |
Definition at line 91 of file equationdetect.cpp.
bool tesseract::IsUTF8Whitespace | ( | const char * | text | ) |
Definition at line 182 of file normstrngs.cpp.
bool tesseract::IsValidCodepoint | ( | const char32 | ch | ) |
Definition at line 170 of file normstrngs.cpp.
bool tesseract::IsWhitespace | ( | const char32 | ch | ) |
Definition at line 176 of file normstrngs.cpp.
void tesseract::LeftoverSegments | ( | const GenericVector< RowScratchRegisters > & | rows, |
GenericVector< Interval > * | to_fix, | ||
int | row_start, | ||
int | row_end | ||
) |
Definition at line 2181 of file paragraphs.cpp.
void tesseract::LeftWordAttributes | ( | const UNICHARSET * | unicharset, |
const WERD_CHOICE * | werd, | ||
const STRING & | utf8, | ||
bool * | is_list, | ||
bool * | starts_idea, | ||
bool * | ends_idea | ||
) |
Definition at line 394 of file paragraphs.cpp.
bool tesseract::LikelyListMark | ( | const STRING & | word | ) |
Definition at line 262 of file paragraphs.cpp.
bool tesseract::LikelyListMarkUnicode | ( | int | ch | ) |
Definition at line 328 of file paragraphs.cpp.
bool tesseract::LikelyListNumeral | ( | const STRING & | word | ) |
Definition at line 228 of file paragraphs.cpp.
bool tesseract::LikelyParagraphStart | ( | const RowScratchRegisters & | before, |
const RowScratchRegisters & | after | ||
) |
Definition at line 1672 of file paragraphs.cpp.
bool tesseract::LikelyParagraphStart | ( | const RowScratchRegisters & | before, |
const RowScratchRegisters & | after, | ||
tesseract::ParagraphJustification | j | ||
) |
Definition at line 1679 of file paragraphs.cpp.
|
inline |
Definition at line 356 of file genericvector.h.
ShapeTable * tesseract::LoadShapeTable | ( | const STRING & | file_prefix | ) |
Definition at line 118 of file commontraining.cpp.
MasterTrainer * tesseract::LoadTrainingData | ( | int | argc, |
const char *const * | argv, | ||
bool | replication, | ||
ShapeTable ** | shape_table, | ||
STRING * | file_prefix | ||
) |
Creates a MasterTrainer and loads the training data into it: Initializes feature_defs and IntegerFX. Loads the shape_table if shape_table != NULL. Loads initial unicharset from -U command-line option. If FLAGS_T is set, loads the majority of data from there, else:
Definition at line 175 of file commontraining.cpp.
TBLOB* tesseract::make_tesseract_blob | ( | float | baseline, |
float | xheight, | ||
float | descender, | ||
float | ascender, | ||
bool | numeric_mode, | ||
Pix * | pix | ||
) |
Returns a TBLOB * built from the whole pix. The returned blob is to be freed later with delete.
Definition at line 2338 of file baseapi.cpp.
bool tesseract::MakeIndividualGlyphs | ( | Pix * | pix, |
const vector< BoxChar * > & | vbox, | ||
const int | input_tiff_page | ||
) |
Definition at line 309 of file text2image.cpp.
void tesseract::MarkRowsWithModel | ( | GenericVector< RowScratchRegisters > * | rows, |
int | row_start, | ||
int | row_end, | ||
const ParagraphModel * | model, | ||
bool | ltr, | ||
int | eop_threshold | ||
) |
Definition at line 807 of file paragraphs.cpp.
void tesseract::MarkStrongEvidence | ( | GenericVector< RowScratchRegisters > * | rows, |
int | row_start, | ||
int | row_end | ||
) |
Definition at line 1830 of file paragraphs.cpp.
void tesseract::ModelStrongEvidence | ( | int | debug_level, |
GenericVector< RowScratchRegisters > * | rows, | ||
int | row_start, | ||
int | row_end, | ||
bool | allow_flush_models, | ||
ParagraphTheory * | theory | ||
) |
Definition at line 1900 of file paragraphs.cpp.
void tesseract::NormalizeChar32 | ( | char32 | ch, |
GenericVector< char32 > * | str | ||
) |
Definition at line 131 of file normstrngs.cpp.
uinT8 tesseract::NormalizeDirection | ( | uinT8 | dir, |
const FCOORD & | unnormed_pos, | ||
const DENORM & | denorm, | ||
const DENORM * | root_denorm | ||
) |
Definition at line 171 of file intfx.cpp.
STRING tesseract::NormalizeUTF8String | ( | const char * | str8 | ) |
Definition at line 116 of file normstrngs.cpp.
Definition at line 156 of file normstrngs.cpp.
int tesseract::OtsuStats | ( | const int * | histogram, |
int * | H_out, | ||
int * | omega0_out | ||
) |
Definition at line 182 of file otsuthr.cpp.
int tesseract::OtsuThreshold | ( | Pix * | src_pix, |
int | left, | ||
int | top, | ||
int | width, | ||
int | height, | ||
int ** | thresholds, | ||
int ** | hi_values | ||
) |
Definition at line 39 of file otsuthr.cpp.