21 #ifndef TESSERACT_TEXTORD_TEXTORD_H_ 22 #define TESSERACT_TEXTORD_TEXTORD_H_ 46 int height = bounding_box_.
height();
47 bounding_box_.
pad(height, height);
83 int height, Pix *binary_pix, Pix *thresholds_pix,
84 Pix *grey_pix,
bool use_box_bottoms,
85 BLOBNBOX_LIST *diacritic_blobs, BLOCK_LIST *blocks,
86 TO_BLOCK_LIST *to_blocks);
93 return use_cjk_fp_model_;
96 use_cjk_fp_model_ = flag;
102 TO_BLOCK_LIST *blocks
111 void find_components(Pix* pix, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks);
112 void filter_blobs(
ICOORD page_tr, TO_BLOCK_LIST* blocks,
bool testing_on);
121 bool use_cjk_fp_model_;
126 int width,
int height, TO_BLOCK_LIST* to_blocks);
128 void MakeBlockRows(
int min_spacing,
int max_spacing,
133 void compute_block_xheight(
TO_BLOCK *block,
float gradient);
134 void compute_row_xheight(
TO_ROW *row,
137 int block_line_size);
138 void make_spline_rows(
TO_BLOCK* block,
143 void make_old_baselines(
TO_BLOCK* block,
146 void correlate_lines(
TO_BLOCK *block,
float gradient);
147 void correlate_neighbours(
TO_BLOCK *block,
150 int correlate_with_stats(
TO_ROW **rows,
153 void find_textlines(
TO_BLOCK *block,
159 void block_spacing_stats(
TO_BLOCK* block,
161 bool& old_text_ord_proportional,
163 int16_t& block_space_gap_width,
165 int16_t& block_non_space_gap_width
167 void row_spacing_stats(
TO_ROW *row,
172 int16_t block_space_gap_width,
174 int16_t block_non_space_gap_width
176 void old_to_method(
TO_ROW *row,
177 STATS *all_gap_stats,
178 STATS *space_gap_stats,
179 STATS *small_gap_stats,
180 int16_t block_space_gap_width,
182 int16_t block_non_space_gap_width
184 bool isolated_row_stats(
TO_ROW* row,
186 STATS* all_gap_stats,
187 bool suspected_table,
190 int16_t stats_count_under(
STATS *stats, int16_t threshold);
191 void improve_row_threshold(
TO_ROW *row,
STATS *all_gap_stats);
192 bool make_a_word_break(
TO_ROW* row,
196 int16_t real_current_gap,
197 int16_t within_xht_current_gap,
203 bool& prev_gap_was_a_space,
204 bool& break_at_next_gap);
207 bool suspected_punct_blob(
TO_ROW* row,
TBOX box);
208 void peek_at_next_gap(
TO_ROW *row,
212 int16_t &next_within_xht_gap);
213 void mark_gap(
TBOX blob,
216 int16_t prev_blob_width,
218 int16_t next_blob_width,
220 float find_mean_blob_spacing(
WERD *word);
221 bool ignore_big_gap(
TO_ROW* row,
232 float filter_noise_blobs(BLOBNBOX_LIST *src_list,
233 BLOBNBOX_LIST *noise_list,
234 BLOBNBOX_LIST *small_list,
235 BLOBNBOX_LIST *large_list);
240 void cleanup_nontext_block(
BLOCK* block);
241 void cleanup_blocks(
bool clean_noise, BLOCK_LIST *blocks);
242 bool clean_noise_from_row(
ROW* row);
243 void clean_noise_from_words(
ROW *row);
246 void clean_small_noise_from_words(
ROW *row);
250 void TransferDiacriticsToBlockGroups(BLOBNBOX_LIST* diacritic_blobs,
255 void TransferDiacriticsToWords(BLOBNBOX_LIST *diacritic_blobs,
261 "Script has no xheight, so use a single mode for horizontal text");
263 BOOL_VAR_H(tosp_old_to_method,
false,
"Space stats use prechopping?");
264 BOOL_VAR_H(tosp_old_to_constrain_sp_kn,
false,
265 "Constrain relative values of inter and intra-word gaps for " 268 "Block stats to use fixed pitch rows?");
269 BOOL_VAR_H(tosp_force_wordbreak_on_punct,
false,
270 "Force word breaks on punct to break long lines in non-space " 273 "Space stats use prechopping?");
275 "Fix suspected bug in old code");
277 "Only stat OBVIOUS spaces");
279 "Only stat OBVIOUS spaces");
281 "Only stat OBVIOUS spaces");
283 "Only stat OBVIOUS spaces");
284 BOOL_VAR_H(tosp_recovery_isolated_row_stats,
true,
285 "Use row alone when inadequate cert spaces");
286 BOOL_VAR_H(tosp_only_small_gaps_for_kern,
false,
"Better guess");
287 BOOL_VAR_H(tosp_all_flips_fuzzy,
false,
"Pass ANY flip to context?");
289 "Don't restrict kn->sp fuzzy limit to tables");
291 "Use within xht gap for wd breaks");
293 "Use within xht gap for wd breaks");
295 "Only use within xht gap for wd breaks");
297 "Don't chng kn to space next to punct");
301 "Enable improvement heuristic");
303 INT_VAR_H(tosp_enough_space_samples_for_median, 3,
304 "or should we use mean");
306 "No.samples reqd to reestimate for row");
308 "No.gaps reqd with 1 large gap to treat as a table");
310 "No.gaps reqd with few cert spaces to use certs");
311 INT_VAR_H(tosp_sanity_method, 1,
"How to avoid being silly");
313 "Factor for defining space threshold in terms of space and " 316 "how far between kern and space?");
318 "how far between kern and space?");
320 "Fract of xheight for narrow");
322 "narrow if w/h less than this");
325 "wide if w/h less than this");
327 "Fract of xheight for fuzz sp");
329 "Fract of xheight for fuzz sp");
331 "Fract of xheight for fuzz sp");
334 "gap ratio to flip kern->sp");
336 "gap ratio to flip kern->sp");
338 "gap ratio to flip kern->sp");
343 "Fract of kerns reqd for isolated row stats");
345 "Min difference of kn & sp in table");
347 "Expect spaces bigger than this");
349 "Fuzzy if less than this");
353 "Don't trust spaces less than this time kn");
355 "Thresh guess - mult kn by this");
357 "Thresh guess - mult xht by this");
359 "Multiplier on kn to limit thresh");
361 "Don't autoflip kn to sp when large separation");
363 "Limit use of xht gap with large kns");
365 "Limit use of xht gap with odd small kns");
367 "Don't reduce box if the top left is non blank");
369 "Don't let sp minus kn get too small");
371 "How wide fuzzies need context");
373 BOOL_VAR_H(textord_no_rejects,
false,
"Don't remove noise blobs");
374 BOOL_VAR_H(textord_show_blobs,
false,
"Display unsorted blobs");
376 INT_VAR_H(textord_max_noise_size, 7,
"Pixel size of noise");
377 INT_VAR_H(textord_baseline_debug, 0,
"Baseline debug level");
378 double_VAR_H(textord_blob_size_bigile, 95,
"Percentile for large blobs");
380 "Fraction of bounding box for noise");
381 double_VAR_H(textord_blob_size_smallile, 20,
"Percentile for small blobs");
382 double_VAR_H(textord_initialx_ile, 0.75,
"Ile of sizes for xheight guess");
383 double_VAR_H(textord_initialasc_ile, 0.90,
"Ile of sizes for xheight guess");
384 INT_VAR_H(textord_noise_sizefraction, 10,
"Fraction of size for maxima");
385 double_VAR_H(textord_noise_sizelimit, 0.5,
"Fraction of x for big t count");
386 INT_VAR_H(textord_noise_translimit, 16,
"Transitions for normal blob");
387 double_VAR_H(textord_noise_normratio, 2.0,
"Dot to norm ratio for deletion");
388 BOOL_VAR_H(textord_noise_rejwords,
true,
"Reject noise-like words");
389 BOOL_VAR_H(textord_noise_rejrows,
true,
"Reject noise-like rows");
390 double_VAR_H(textord_noise_syfract, 0.2,
"xh fract error for norm blobs");
392 "xh fract width error for norm blobs");
394 "Height fraction to discard outlines as speckle noise");
395 INT_VAR_H(textord_noise_sncount, 1,
"super norm blobs to save row");
396 double_VAR_H(textord_noise_rowratio, 6.0,
"Dot to norm ratio for deletion");
399 double_VAR_H(textord_blshift_xfraction, 9.99,
"Min size of baseline shift");
403 #endif // TESSERACT_TEXTORD_TEXTORD_H_
#define INT_VAR_H(name, val, comment)
bool use_cjk_fp_model() const
#define BOOL_VAR_H(name, val, comment)
#define double_VAR_H(name, val, comment)
const WERD * word() const
C_BLOB_LIST * rej_cblob_list()
CLISTIZEH(STRING) CLISTIZE(STRING) namespace tesseract
TBOX true_bounding_box() const
C_BLOB_LIST * RejBlobs() const
const TBOX & bounding_box() const
void set_use_cjk_fp_model(bool flag)
void pad(int xpad, int ypad)
TBOX true_bounding_box() const