tesseract  5.0.0-alpha-619-ge9db
tesstrain_utils Namespace Reference

Classes

class  TrainingArgs
 

Functions

def err_exit (msg)
 
def run_command (cmd, *args, env=None)
 
def check_file_readable (*filenames)
 
def parse_flags (argv=None)
 
def cleanup (ctx)
 
def initialize_fontconfig (ctx)
 
def make_fontname (font)
 
def make_outbase (ctx, fontname, exposure)
 
def generate_font_image (ctx, font, exposure, char_spacing)
 
def phase_I_generate_image (ctx, par_factor=None)
 
def phase_UP_generate_unicharset (ctx)
 
def phase_E_extract_features (ctx, box_config, ext)
 
def make_lstmdata (ctx)
 

Variables

 log = logging.getLogger(__name__)
 
 parser
 
 dest
 
 nargs
 
 type
 
 help
 
 metavar
 
 int
 
 action
 
 inputdata_group
 
 tessdata_group
 

Function Documentation

◆ check_file_readable()

def tesstrain_utils.check_file_readable (   *filenames)

Definition at line 113 of file tesstrain_utils.py.

def check_file_readable(*filenames):
    """Verify that every given file can be opened for reading.

    Each path is opened and immediately closed again. The first failure is
    reported through ``err_exit``.

    Args:
        *filenames: Paths (``str`` or ``pathlib.Path``) of the files to check.

    Returns:
        True when every file could be opened (also when no names are given).
    """
    # ``filenames`` is always a tuple because it is collected from *args, so a
    # lone str/Path argument needs no special-casing before iterating.
    for filename in filenames:
        try:
            with pathlib.Path(filename).open():
                pass
        except FileNotFoundError:
            err_exit(f"Required/expected file '{filename}' does not exist")
        except PermissionError:
            err_exit(f"{filename} is not readable")
        except OSError as e:
            err_exit(f"{filename} IO Error: {str(e)}")
    return True
127 
128 

◆ cleanup()

def tesstrain_utils.cleanup (   ctx)

Definition at line 288 of file tesstrain_utils.py.

def cleanup(ctx):
    """Save the run log to the output directory, then delete the training directory.

    Args:
        ctx: Training context; ``log_file``, ``output_dir`` and
            ``training_dir`` are read.
    """
    log_source, destination = ctx.log_file, ctx.output_dir
    # Copy the log before the training directory tree is removed.
    shutil.copy(log_source, destination)
    shutil.rmtree(ctx.training_dir)
    return None
292 
293 
294 # Function initializes font config with a unique font cache dir.

◆ err_exit()

def tesstrain_utils.err_exit (   msg)

Definition at line 70 of file tesstrain_utils.py.

def err_exit(msg):
    """Log ``msg`` at CRITICAL level and terminate with exit status 1.

    Args:
        msg: Message to log before exiting.

    Raises:
        SystemExit: Always, with code 1.
    """
    log.critical(msg)
    raise SystemExit(1)
73 
74 
75 # Helper function to run a command and append its output to a log. Aborts early
76 # if the program file is not found.
77 # Usage: run_command CMD ARG1 ARG2...

◆ generate_font_image()

def tesstrain_utils.generate_font_image (   ctx,
  font,
  exposure,
  char_spacing 
)

Definition at line 319 of file tesstrain_utils.py.

def generate_font_image(ctx, font, exposure, char_spacing):
    """Render the training text with one font at one exposure via ``text2image``.

    After rendering, the ``.box`` and ``.tif`` outputs are checked for
    readability. When font-property extraction is enabled and the train-ngrams
    file exists, ``text2image`` is run a second time to extract font
    properties and the ``.fontinfo`` output is checked as well.

    Args:
        ctx: Training context supplying directories, rendering options and
            extra ``text2image`` arguments.
        font: Font name to render with.
        exposure: Exposure level, passed to ``--exposure`` and used in the
            output base name.
        char_spacing: Value passed to ``--char_spacing``.

    Returns:
        The string ``"{font}-{exposure}"``.
    """
    log.info(f"Rendering using {font}")
    outbase = make_outbase(ctx, make_fontname(font), exposure)

    # Flags shared by the rendering run and the font-properties run.
    shared_flags = [
        f"--fontconfig_tmpdir={ctx.font_config_cache}",
        f"--fonts_dir={ctx.fonts_dir}",
        "--strip_unrenderable_words",
        f"--leading={ctx.leading}",
        f"--char_spacing={char_spacing}",
        f"--exposure={exposure}",
        f"--outputbase={outbase}",
        f"--max_pages={ctx.max_pages}",
    ]
    if ctx.distort_image:
        shared_flags.append("--distort_image")
    # Fonts listed in VERTICAL_FONTS are rendered vertically.
    if font in VERTICAL_FONTS:
        shared_flags.append("--writing_mode=vertical-upright")

    font_flag = f"--font={font}"
    run_command(
        "text2image",
        *shared_flags,
        font_flag,
        f"--text={ctx.training_text}",
        *ctx.text2image_extra_args,
    )

    box_path = f"{outbase}.box"
    tif_path = f"{outbase}.tif"
    check_file_readable(box_path, tif_path)

    wants_font_properties = (
        ctx.extract_font_properties and pathlib.Path(ctx.train_ngrams_file).exists()
    )
    if wants_font_properties:
        log.info(f"Extracting font properties of {font}")
        run_command(
            "text2image",
            *shared_flags,
            font_flag,
            "--ligatures=false",
            f"--text={ctx.train_ngrams_file}",
            "--only_extract_font_properties",
            "--ptsize=32",
        )
        check_file_readable(f"{outbase}.fontinfo")

    return f"{font}-{exposure}"
366 
367 
368 # Phase I : Generate (I)mages from training text for each font.

◆ initialize_fontconfig()

def tesstrain_utils.initialize_fontconfig (   ctx)

Definition at line 295 of file tesstrain_utils.py.

def initialize_fontconfig(ctx):
    """Initialize font config by rendering a tiny sample with the first font.

    A one-line sample text file is written into ``ctx.font_config_cache`` and
    ``text2image`` is run on it with that directory as the fontconfig
    temporary directory.

    Args:
        ctx: Training context; ``font_config_cache``, ``fonts`` and
            ``fonts_dir`` are read.
    """
    cache_dir = ctx.font_config_cache
    sample_path = pathlib.Path(cache_dir, "sample_text.txt")
    sample_path.write_text("Text\n")

    first_font = ctx.fonts[0]
    log.info(f"Testing font: {first_font}")
    # The sample file doubles as both the input text and the output base.
    run_command(
        "text2image",
        f"--fonts_dir={ctx.fonts_dir}",
        f"--font={first_font}",
        f"--outputbase={sample_path}",
        f"--text={sample_path}",
        f"--fontconfig_tmpdir={cache_dir}",
    )
307 
308 

◆ make_fontname()

def tesstrain_utils.make_fontname (   font)

Definition at line 309 of file tesstrain_utils.py.

def make_fontname(font):
    """Return ``font`` with spaces turned into underscores and commas removed.

    Args:
        font: Font name as a string.

    Returns:
        The converted font name.
    """
    # Single pass: map " " -> "_" and drop ",".
    substitutions = str.maketrans({" ": "_", ",": None})
    return font.translate(substitutions)
311 
312 

◆ make_lstmdata()

def tesstrain_utils.make_lstmdata (   ctx)

Definition at line 653 of file tesstrain_utils.py.

def make_lstmdata(ctx):
    """Assemble the LSTM training data in ``ctx.output_dir``.

    Steps:
      1. Create the output directory if it does not exist.
      2. Run ``combine_lang_model`` to build the starter traineddata.
      3. Move the ``.lstmf`` files (and, when ``ctx.save_box_tiff`` is set,
         the box/tif files) from the training directory to the output
         directory.
      4. Write ``<lang>.training_files.txt`` listing the ``.lstmf`` files now
         present in the output directory, one path per line.

    Args:
        ctx: Training context.
    """
    log.info("=== Constructing LSTM training data ===")
    lang_prefix = f"{ctx.langdata_dir}/{ctx.lang_code}/{ctx.lang_code}"
    path_output = pathlib.Path(ctx.output_dir)
    if not path_output.is_dir():
        log.info(f"Creating new directory {ctx.output_dir}")
        path_output.mkdir(exist_ok=True, parents=True)

    optional_flags = [
        flag
        for enabled, flag in (
            (ctx.lang_is_rtl, "--lang_is_rtl"),
            (ctx.norm_mode >= 2, "--pass_through_recoder"),
        )
        if enabled
    ]

    # Build the starter traineddata from the inputs.
    run_command(
        "combine_lang_model",
        "--input_unicharset",
        f"{ctx.training_dir}/{ctx.lang_code}.unicharset",
        "--script_dir",
        f"{ctx.langdata_dir}",
        "--words",
        f"{lang_prefix}.wordlist",
        "--numbers",
        f"{lang_prefix}.numbers",
        "--puncs",
        f"{lang_prefix}.punc",
        "--output_dir",
        f"{ctx.output_dir}",
        "--lang",
        f"{ctx.lang_code}",
        *optional_flags,
    )

    training_path = pathlib.Path(ctx.training_dir)

    def relocate(pattern):
        # Move every file in the training dir matching ``pattern`` to the output dir.
        for src in training_path.glob(pattern):
            dst = path_output / src.name
            log.debug(f"Moving {src} to {dst}")
            shutil.move(str(src), dst)

    if ctx.save_box_tiff:
        log.info("=== Saving box/tiff pairs for training data ===")
        relocate(f"{ctx.lang_code}*.box")
        relocate(f"{ctx.lang_code}*.tif")
    log.info("=== Moving lstmf files for training data ===")
    relocate(f"{ctx.lang_code}.*.lstmf")

    listing_file = pathlib.Path(f"{ctx.output_dir}/{ctx.lang_code}.training_files.txt")
    lstmf_paths = map(str, path_output.glob(f"{ctx.lang_code}.*.lstmf"))
    listing_file.write_text("\n".join(lstmf_paths))
703 
704 # make__traineddata() {
705 # tlog "\n=== Making final traineddata file ==="
706 # local lang_prefix={ctx.langdata_dir}/${LANG_CODE}/${LANG_CODE}
707 
708 # # Combine available files for this language from the langdata dir.
709 # if [[ -r ${lang_prefix}.config ]]; then
710 # tlog "Copying ${lang_prefix}.config to ${TRAINING_DIR}"
711 # cp ${lang_prefix}.config ${TRAINING_DIR}
712 # chmod u+w ${TRAINING_DIR}/${LANG_CODE}.config
713 # fi
714 # if [[ -r ${lang_prefix}.params-model ]]; then
715 # tlog "Copying ${lang_prefix}.params-model to ${TRAINING_DIR}"
716 # cp ${lang_prefix}.params-model ${TRAINING_DIR}
717 # chmod u+w ${TRAINING_DIR}/${LANG_CODE}.params-model
718 # fi
719 
720 # # Compose the traineddata file.
721 # run_command combine_tessdata ${TRAINING_DIR}/${LANG_CODE}.
722 
723 # # Copy it to the output dir, overwriting only if allowed by the cmdline flag.
724 # if [[ ! -d ${OUTPUT_DIR} ]]; then
725 # tlog "Creating new directory ${OUTPUT_DIR}"
726 # mkdir -p ${OUTPUT_DIR}
727 # fi
728 # local destfile=${OUTPUT_DIR}/${LANG_CODE}.traineddata;
729 # if [[ -f ${destfile} ]] && ((! OVERWRITE)); then
730 # err_exit "File ${destfile} exists and no --overwrite specified";
731 # fi
732 # tlog "Moving ${TRAINING_DIR}/${LANG_CODE}.traineddata to ${OUTPUT_DIR}"
733 # cp -f ${TRAINING_DIR}/${LANG_CODE}.traineddata ${destfile}
734 # }

◆ make_outbase()

def tesstrain_utils.make_outbase (   ctx,
  fontname,
  exposure 
)

Definition at line 313 of file tesstrain_utils.py.

def make_outbase(ctx, fontname, exposure):
    """Build the output base path for one language/font/exposure combination.

    Args:
        ctx: Training context; ``training_dir`` and ``lang_code`` are read.
        fontname: Font name component of the file name.
        exposure: Exposure level, appended as ``exp<exposure>``.

    Returns:
        ``pathlib.Path`` of the form
        ``<training_dir>/<lang_code>.<fontname>.exp<exposure>``.
    """
    basename = f"{ctx.lang_code}.{fontname}.exp{exposure}"
    return pathlib.Path(ctx.training_dir, basename)
315 
316 
317 # Helper function for phaseI_generate_image. Generates the image for a single
318 # language/font combination in a way that can be run in parallel.

◆ parse_flags()

def tesstrain_utils.parse_flags (   argv = None)

Definition at line 216 of file tesstrain_utils.py.

def parse_flags(argv=None):
    """Parse command-line flags into a ``TrainingArgs`` context and fill in defaults.

    Exits through ``err_exit`` when the language, the langdata directory, or
    both the tessdata directory and the ``TESSDATA_PREFIX`` environment
    variable are missing. Creates temporary directories for the output (when
    not given) and for intermediate training files, registers an ``atexit``
    hook that reports the training directory if it still exists at exit, and
    derives the language data file paths.

    Args:
        argv: Argument list handed to the module-level parser; ``None`` lets
            argparse use its default.

    Returns:
        The populated ``TrainingArgs`` instance.
    """
    ctx = TrainingArgs()
    log.debug(ctx)
    parser.parse_args(args=argv, namespace=ctx)
    log.debug(ctx)

    if not ctx.lang_code:
        err_exit("Need to specify a language --lang")
    if not ctx.langdata_dir:
        err_exit("Need to specify path to language files --langdata_dir")
    if not ctx.tessdata_dir:
        tessdata_prefix = os.environ.get("TESSDATA_PREFIX", "")
        if tessdata_prefix:
            ctx.tessdata_dir = tessdata_prefix
        else:
            err_exit(
                "Need to specify a --tessdata_dir or have a "
                "TESSDATA_PREFIX variable defined in your environment"
            )
    if not ctx.output_dir:
        ctx.output_dir = mkdtemp(prefix=f"trained-{ctx.lang_code}-{ctx.timestamp}")
        log.info(f"Output directory set to: {ctx.output_dir}")

    # Location where intermediate files will be created.
    mkdtemp_options = {"prefix": f"{ctx.lang_code}-{ctx.timestamp}"}
    if ctx.tmp_dir:
        mkdtemp_options["dir"] = ctx.tmp_dir
    ctx.training_dir = mkdtemp(**mkdtemp_options)
    # Location of log file for the whole run.
    ctx.log_file = pathlib.Path(ctx.training_dir) / "tesstrain.log"
    log.info(f"Log file location: {ctx.log_file}")

    def show_tmpdir_location(training_dir):
        # On successful exit we will delete this first; on failure we want to
        # let the user know where the log is.
        if pathlib.Path(training_dir).exists():
            print(f"Temporary files retained at: {training_dir}")

    atexit.register(show_tmpdir_location, ctx.training_dir)

    lang_dir = pathlib.Path(ctx.langdata_dir) / ctx.lang_code

    # Take training text and wordlist from the langdata directory if not
    # specified in the command-line.
    if not ctx.training_text:
        ctx.training_text = lang_dir / f"{ctx.lang_code}.training_text"
    if not ctx.wordlist_file:
        ctx.wordlist_file = lang_dir / f"{ctx.lang_code}.wordlist"

    ctx.word_bigrams_file = lang_dir / f"{ctx.lang_code}.word.bigrams"
    ctx.numbers_file = lang_dir / f"{ctx.lang_code}.numbers"
    ctx.punc_file = lang_dir / f"{ctx.lang_code}.punc"

    # Frequency / n-gram files sit next to the training text and differ from it
    # only in their suffix.
    training_text = pathlib.Path(ctx.training_text)
    for attribute, suffix in (
        ("bigram_freqs_file", ".training_text.bigram_freqs"),
        ("unigram_freqs_file", ".training_text.unigram_freqs"),
        ("train_ngrams_file", ".training_text.train_ngrams"),
    ):
        setattr(ctx, attribute, training_text.with_suffix(suffix))

    ctx.generate_dawgs = 1
    log.debug(ctx)
    return ctx
286 
287 

◆ phase_E_extract_features()

def tesstrain_utils.phase_E_extract_features (   ctx,
  box_config,
  ext 
)

Definition at line 525 of file tesstrain_utils.py.

def phase_E_extract_features(ctx, box_config, ext):
    """Phase E: run ``tesseract`` on every rendered image to produce ``ext`` files.

    All ``*.exp*.tif`` images in ``ctx.training_dir`` are processed by a
    two-worker thread pool. Any failure aborts through ``err_exit``.
    Afterwards each expected ``<image stem>.<ext>`` output is checked for
    readability.

    Args:
        ctx: Training context; ``training_dir``, ``langdata_dir``,
            ``lang_code`` and ``tessdata_dir`` are read.
        box_config: Iterable of extra arguments passed to ``tesseract`` after
            the output base.
        ext: Extension (without the leading dot) of the files tesseract is
            expected to produce.
    """
    log.info(f"=== Phase E: Generating {ext} files ===")

    img_files = list(pathlib.Path(ctx.training_dir).glob("*.exp*.tif"))
    log.debug(img_files)

    # Use any available language-specific configs. Kept as a list so that
    # nothing at all is passed when there is no config; previously an empty
    # string ended up on the tesseract command line.
    extra_configs = []
    testconfig = pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.config"
    if testconfig.exists():
        extra_configs.append(testconfig)
        log.info(f"Using {ctx.lang_code}.config")

    tessdata_environ = os.environ.copy()
    tessdata_environ["TESSDATA_PREFIX"] = str(ctx.tessdata_dir)

    log.info(f"Using TESSDATA_PREFIX={tessdata_environ['TESSDATA_PREFIX']}")

    with tqdm(total=len(img_files)) as pbar, concurrent.futures.ThreadPoolExecutor(
        max_workers=2
    ) as executor:
        futures = [
            executor.submit(
                run_command,
                "tesseract",
                img_file,
                img_file.with_suffix(""),
                *box_config,
                *extra_configs,
                env=tessdata_environ,
            )
            for img_file in img_files
        ]

        for future in concurrent.futures.as_completed(futures):
            try:
                future.result()
            except Exception as exc:
                err_exit("Failed while extracting features: " + str(exc))
            else:
                pbar.update(1)

    # Check that all the output files were produced.
    for img_file in img_files:
        check_file_readable(img_file.with_suffix("." + ext))

    return
571 
572 
573 # # Phase C : (C)luster feature prototypes in .tr into normproto file (cnTraining)
574 # # phaseC_cluster_prototypes ${TRAINING_DIR}/${LANG_CODE}.normproto
575 # phase_C_cluster_prototypes() {
576 # tlog "\n=== Phase C: Clustering feature prototypes (cnTraining) ==="
577 # local out_normproto=$1
578 
579 # run_command cntraining -D "${TRAINING_DIR}/" \
580 # $(ls ${TRAINING_DIR}/*.tr)
581 
582 # check_file_readable ${TRAINING_DIR}/normproto
583 # mv ${TRAINING_DIR}/normproto ${out_normproto}
584 # }
585 
586 # # Phase S : (S)hape clustering
587 # phase_S_cluster_shapes() {
588 # if ((! RUN_SHAPE_CLUSTERING)); then
589 # tlog "\n=== Shape Clustering disabled ==="
590 # return
591 # fi
592 # check_file_readable {ctx.langdata_dir}/font_properties
593 # local font_props="-F {ctx.langdata_dir}/font_properties"
594 # if [[ -r ${TRAINING_DIR}/${LANG_CODE}.xheights ]] &&\
595 # [[ -s ${TRAINING_DIR}/${LANG_CODE}.xheights ]]; then
596 # font_props=${font_props}" -X ${TRAINING_DIR}/${LANG_CODE}.xheights"
597 # fi
598 
599 # run_command shapeclustering \
600 # -D "${TRAINING_DIR}/" \
601 # -U ${TRAINING_DIR}/${LANG_CODE}.unicharset \
602 # -O ${TRAINING_DIR}/${LANG_CODE}.mfunicharset \
603 # ${font_props} \
604 # $(ls ${TRAINING_DIR}/*.tr)
605 # check_file_readable ${TRAINING_DIR}/shapetable \
606 # ${TRAINING_DIR}/${LANG_CODE}.mfunicharset
607 # }
608 
609 # # Phase M : Clustering microfeatures (mfTraining)
610 # phase_M_cluster_microfeatures() {
611 # tlog "\n=== Phase M : Clustering microfeatures (mfTraining) ==="
612 
613 # check_file_readable {ctx.langdata_dir}/font_properties
614 # font_props="-F {ctx.langdata_dir}/font_properties"
615 # if [[ -r ${TRAINING_DIR}/${LANG_CODE}.xheights ]] && \
616 # [[ -s ${TRAINING_DIR}/${LANG_CODE}.xheights ]]; then
617 # font_props=${font_props}" -X ${TRAINING_DIR}/${LANG_CODE}.xheights"
618 # fi
619 
620 # run_command mftraining \
621 # -D "${TRAINING_DIR}/" \
622 # -U ${TRAINING_DIR}/${LANG_CODE}.unicharset \
623 # -O ${TRAINING_DIR}/${LANG_CODE}.mfunicharset \
624 # ${font_props} \
625 # $(ls ${TRAINING_DIR}/*.tr)
626 # check_file_readable ${TRAINING_DIR}/inttemp ${TRAINING_DIR}/shapetable \
627 # ${TRAINING_DIR}/pffmtable ${TRAINING_DIR}/${LANG_CODE}.mfunicharset
628 # mv ${TRAINING_DIR}/inttemp ${TRAINING_DIR}/${LANG_CODE}.inttemp
629 # mv ${TRAINING_DIR}/shapetable ${TRAINING_DIR}/${LANG_CODE}.shapetable
630 # mv ${TRAINING_DIR}/pffmtable ${TRAINING_DIR}/${LANG_CODE}.pffmtable
631 # mv ${TRAINING_DIR}/${LANG_CODE}.mfunicharset ${TRAINING_DIR}/${LANG_CODE}.unicharset
632 # }
633 
634 # phase_B_generate_ambiguities() {
635 # tlog "\n=== Phase B : ambiguities training ==="
636 
637 # # Check for manually created ambiguities data.
638 # if [[ -r {ctx.langdata_dir}/${LANG_CODE}/${LANG_CODE}.unicharambigs ]]; then
639 # tlog "Found file {ctx.langdata_dir}/${LANG_CODE}/${LANG_CODE}.unicharambigs"
640 # cp {ctx.langdata_dir}/${LANG_CODE}/${LANG_CODE}.unicharambigs \
641 # ${TRAINING_DIR}/${LANG_CODE}.unicharambigs
642 # # Make it writable, as it may be read-only in the client.
643 # chmod u+w ${TRAINING_DIR}/${LANG_CODE}.unicharambigs
644 # return
645 # else
646 # tlog "No unicharambigs file found!"
647 # fi
648 
649 # # TODO: Add support for generating ambiguities automatically.
650 # }
651 
652 

◆ phase_I_generate_image()

def tesstrain_utils.phase_I_generate_image (   ctx,
  par_factor = None 
)

Definition at line 369 of file tesstrain_utils.py.

def _select_frequent_ngrams(lines, fraction):
    """Pick the most frequent n-grams covering ``fraction`` of the total count.

    Args:
        lines: Iterable of text lines of the form ``"<ngram> <count> ..."``;
            lines with fewer than two whitespace-separated fields are ignored.
        fraction: Share (0..1) of the summed counts to cover.

    Returns:
        List of n-grams in descending count order. Selection stops once the
        running total of the already selected counts exceeds
        ``fraction * total``.

    Raises:
        ValueError: If a count field is not an integer.
    """
    # Materialize the records with numeric counts: they are traversed twice
    # (sum, then sort), and the sort must be numeric rather than lexicographic.
    records = []
    for line in lines:
        fields = line.split()
        if len(fields) >= 2:
            records.append((fields[0], int(fields[1])))

    threshold = fraction * sum(count for _, count in records)

    selected = []
    cumsum = 0
    for ngram, count in sorted(records, key=itemgetter(1), reverse=True):
        if cumsum > threshold:
            break
        selected.append(ngram)
        cumsum += count
    return selected


def phase_I_generate_image(ctx, par_factor=None):
    """Phase I: generate training images from the training text for each font.

    For every exposure in ``ctx.exposures`` all fonts in ``ctx.fonts`` are
    rendered through ``generate_font_image`` on a thread pool, and the
    resulting ``.box``/``.tif`` pairs are checked for readability. When font
    property extraction is enabled and a bigram frequency file exists, the
    train-ngrams file is (re)written first.

    Args:
        ctx: Training context.
        par_factor: Number of worker threads; ``None`` or a non-positive
            value means 1.
    """
    if not par_factor or par_factor <= 0:
        par_factor = 1

    log.info("=== Phase I: Generating training images ===")
    check_file_readable(ctx.training_text)
    char_spacing = 0.0

    for exposure in ctx.exposures:
        if ctx.extract_font_properties and pathlib.Path(ctx.bigram_freqs_file).exists():
            # Parse .bigram_freqs file and compose a .train_ngrams file with text
            # for tesseract to recognize during training. Take only the ngrams whose
            # combined weight accounts for 99% of all the bigrams in the language.
            lines = pathlib.Path(ctx.bigram_freqs_file).read_text(encoding="utf-8").split("\n")
            ngrams = _select_frequent_ngrams(lines, 0.99)

            with pathlib.Path(ctx.train_ngrams_file).open("w", encoding="utf-8") as f:
                f.writelines(f"{ngram} " for ngram in ngrams)

            check_file_readable(ctx.train_ngrams_file)

        with tqdm(
            total=len(ctx.fonts)
        ) as pbar, concurrent.futures.ThreadPoolExecutor(max_workers=par_factor) as executor:
            futures = [
                executor.submit(generate_font_image, ctx, font, exposure, char_spacing)
                for font in ctx.fonts
            ]
            for future in concurrent.futures.as_completed(futures):
                try:
                    future.result()
                except Exception as exc:
                    err_exit("Failed while generating images " + str(exc))
                else:
                    pbar.update(1)

        # Check that each process was successful.
        for font in ctx.fonts:
            fontname = make_fontname(font)
            outbase = make_outbase(ctx, fontname, exposure)
            check_file_readable(str(outbase) + ".box", str(outbase) + ".tif")
    return
418 
419 
420 # Phase UP : Generate (U)nicharset and (P)roperties file.

◆ phase_UP_generate_unicharset()

def tesstrain_utils.phase_UP_generate_unicharset (   ctx)

Definition at line 421 of file tesstrain_utils.py.

def phase_UP_generate_unicharset(ctx):
    """Phase UP: generate the unicharset and unichar properties files.

    Runs ``unicharset_extractor`` over every ``*.box`` file in the training
    directory, then ``set_unicharset_properties`` on the result. The paths of
    the produced files are stored on ``ctx`` as ``unicharset_file`` and
    ``xheights_file``, and each is checked for readability.

    Args:
        ctx: Training context; ``training_dir``, ``lang_code``, ``norm_mode``
            and ``langdata_dir`` are read.
    """
    log.info("=== Phase UP: Generating unicharset and unichar properties files ===")

    # Sorted into a list so the extractor receives the box files in a
    # reproducible order instead of raw directory order.
    box_files = sorted(pathlib.Path(ctx.training_dir).glob("*.box"))

    ctx.unicharset_file = pathlib.Path(ctx.training_dir) / f"{ctx.lang_code}.unicharset"

    run_command(
        "unicharset_extractor",
        "--output_unicharset",
        f"{ctx.unicharset_file}",
        "--norm_mode",
        f"{ctx.norm_mode}",
        *box_files,
    )
    check_file_readable(ctx.unicharset_file)

    ctx.xheights_file = pathlib.Path(ctx.training_dir) / f"{ctx.lang_code}.xheights"
    # The unicharset is rewritten in place (-U and -O name the same file).
    run_command(
        "set_unicharset_properties",
        "-U",
        f"{ctx.unicharset_file}",
        "-O",
        f"{ctx.unicharset_file}",
        "-X",
        f"{ctx.xheights_file}",
        f"--script_dir={ctx.langdata_dir}",
    )
    check_file_readable(ctx.xheights_file)
450 
451 
452 # # Phase D : Generate (D)awg files from unicharset file and wordlist files
453 # phase_D_generate_dawg() {
454 # tlog "\n=== Phase D: Generating Dawg files ==="
455 
456 # # Skip if requested
457 # if [[ ${GENERATE_DAWGS} -eq 0 ]]; then
458 # tlog "Skipping ${phase_name}"
459 # return
460 # fi
461 
462 # # Output files
463 # WORD_DAWG=${TRAINING_DIR}/${LANG_CODE}.word-dawg
464 # FREQ_DAWG=${TRAINING_DIR}/${LANG_CODE}.freq-dawg
465 # PUNC_DAWG=${TRAINING_DIR}/${LANG_CODE}.punc-dawg
466 # NUMBER_DAWG=${TRAINING_DIR}/${LANG_CODE}.number-dawg
467 # BIGRAM_DAWG=${TRAINING_DIR}/${LANG_CODE}.bigram-dawg
468 
469 # # Word DAWG
470 # local freq_wordlist_file=${TRAINING_DIR}/${LANG_CODE}.wordlist.clean.freq
471 # if [[ -s ${WORDLIST_FILE} ]]; then
472 # tlog "Generating word Dawg"
473 # check_file_readable ${unicharset_file}
474 # run_command wordlist2dawg -r 1 ${WORDLIST_FILE} ${WORD_DAWG} \
475 # ${UNICHARSET_FILE}
476 # check_file_readable ${WORD_DAWG}
477 
478 # FREQ_DAWG_SIZE=100
479 # head -n ${FREQ_DAWG_SIZE} ${WORDLIST_FILE} > ${freq_wordlist_file}
480 # fi
481 
482 # # Freq-word DAWG
483 # if [[ -s ${freq_wordlist_file} ]]; then
484 # check_file_readable ${UNICHARSET_FILE}
485 # tlog "Generating frequent-word Dawg"
486 # run_command wordlist2dawg -r 1 ${freq_wordlist_file} \
487 # ${FREQ_DAWG} ${UNICHARSET_FILE}
488 # check_file_readable ${FREQ_DAWG}
489 # fi
490 
491 # # Punctuation DAWG
492 # # -r arguments to wordlist2dawg denote RTL reverse policy
493 # # (see Trie::RTLReversePolicy enum in tesseract/src/dict/trie.h).
494 # # We specify 0/RRP_DO_NO_REVERSE when generating number DAWG,
495 # # 1/RRP_REVERSE_IF_HAS_RTL for freq and word DAWGS,
496 # # 2/RRP_FORCE_REVERSE for the punctuation DAWG.
497 # local punc_reverse_policy=0;
498 # if [[ "${LANG_IS_RTL}" == "1" ]]; then
499 # punc_reverse_policy=2
500 # fi
501 # if [[ ! -s ${PUNC_FILE} ]]; then
502 # PUNC_FILE="{ctx.langdata_dir}/common.punc"
503 # fi
504 # check_file_readable ${PUNC_FILE}
505 # run_command wordlist2dawg -r ${punc_reverse_policy} \
506 # ${PUNC_FILE} ${PUNC_DAWG} ${UNICHARSET_FILE}
507 # check_file_readable ${PUNC_DAWG}
508 
509 # # Numbers DAWG
510 # if [[ -s ${NUMBERS_FILE} ]]; then
511 # run_command wordlist2dawg -r 0 \
512 # ${NUMBERS_FILE} ${NUMBER_DAWG} ${UNICHARSET_FILE}
513 # check_file_readable ${NUMBER_DAWG}
514 # fi
515 
516 # # Bigram dawg
517 # if [[ -s ${WORD_BIGRAMS_FILE} ]]; then
518 # run_command wordlist2dawg -r 1 \
519 # ${WORD_BIGRAMS_FILE} ${BIGRAM_DAWG} ${UNICHARSET_FILE}
520 # check_file_readable ${BIGRAM_DAWG}
521 # fi
522 # }
523 
524 # Phase E : (E)xtract .tr feature files from .tif/.box files

◆ run_command()

def tesstrain_utils.run_command (   cmd,
  *args,
  env = None 
)

Definition at line 78 of file tesstrain_utils.py.

def run_command(cmd, *args, env=None):
    """Run an external program and log its combined stdout/stderr.

    The executable is looked up with ``shutil.which`` under its plain name and
    then under the ``api/`` and ``training/`` prefixes. If none resolves, or
    if the program returns a non-zero exit code, the run is aborted through
    ``err_exit``.

    Args:
        cmd: Program name or path.
        *args: Command-line arguments; ``pathlib`` paths are converted to
            strings.
        env: Optional environment mapping handed to ``subprocess.run``.
    """
    for d in ("", "api/", "training/"):
        testcmd = shutil.which(f"{d}{cmd}")
        # shutil.which returns None on a miss; test that result directly
        # rather than passing None back into shutil.which, which raises
        # TypeError.
        if testcmd:
            cmd = testcmd
            break
    else:
        err_exit(f"{cmd} not found")

    log.debug(f"Running {cmd}")
    args = list(args)
    for idx, arg in enumerate(args):
        log.debug(arg)
        # Workaround for https://bugs.python.org/issue33617
        # TypeError: argument of type 'WindowsPath' is not iterable
        # Converting every pathlib path keeps the argument list uniform.
        if isinstance(arg, pathlib.PurePath):
            args[idx] = str(arg)

    proc = subprocess.run(
        [cmd, *args], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, env=env
    )
    proclog = logging.getLogger(cmd)
    if proc.returncode == 0:
        proclog.debug(proc.stdout.decode("utf-8", errors="replace"))
    else:
        try:
            proclog.error(proc.stdout.decode("utf-8", errors="replace"))
        except Exception as e:
            # Best effort: never let a logging problem mask the failure below.
            proclog.error(e)
        err_exit(f"Program {cmd} failed with return code {proc.returncode}. Abort.")
108 
109 
110 # Check if all the given files exist, or exit otherwise.
111 # Used to check required input files and produced output files in each phase.
112 # Usage: check_file_readable FILE1 FILE2...

Variable Documentation

◆ action

tesstrain_utils.action

Definition at line 159 of file tesstrain_utils.py.

◆ dest

tesstrain_utils.dest

Definition at line 139 of file tesstrain_utils.py.

◆ help

tesstrain_utils.help

Definition at line 142 of file tesstrain_utils.py.

◆ inputdata_group

tesstrain_utils.inputdata_group
Initial value:
1 = parser.add_argument_group(
2  "inputdata",
3  "OPTIONAL flags for input data. If unspecified we will look for them in the langdata_dir directory.",
4 )

Definition at line 173 of file tesstrain_utils.py.

◆ int

tesstrain_utils.int

Definition at line 154 of file tesstrain_utils.py.

◆ log

tesstrain_utils.log = logging.getLogger(__name__)

Definition at line 35 of file tesstrain_utils.py.

◆ metavar

tesstrain_utils.metavar

Definition at line 147 of file tesstrain_utils.py.

◆ nargs

tesstrain_utils.nargs

Definition at line 140 of file tesstrain_utils.py.

◆ parser

tesstrain_utils.parser
Initial value:
1 = argparse.ArgumentParser(
2  epilog=
3 )

Definition at line 129 of file tesstrain_utils.py.

◆ tessdata_group

tesstrain_utils.tessdata_group
Initial value:
1 = parser.add_argument_group(
2  "tessdata",
3  "OPTIONAL flag to specify location of existing traineddata files, required during feature extraction. If unspecified will use TESSDATA_PREFIX defined in the current environment.",
4 )

Definition at line 196 of file tesstrain_utils.py.

◆ type

tesstrain_utils.type

Definition at line 141 of file tesstrain_utils.py.

tesstrain_utils.parse_flags
def parse_flags(argv=None)
Definition: tesstrain_utils.py:216
tesstrain_utils.make_lstmdata
def make_lstmdata(ctx)
Definition: tesstrain_utils.py:653
tesstrain_utils.cleanup
def cleanup(ctx)
Definition: tesstrain_utils.py:288
tesstrain_utils.phase_I_generate_image
def phase_I_generate_image(ctx, par_factor=None)
Definition: tesstrain_utils.py:369
tesstrain_utils.check_file_readable
def check_file_readable(*filenames)
Definition: tesstrain_utils.py:113
tesstrain_utils.int
int
Definition: tesstrain_utils.py:154
tesstrain_utils.initialize_fontconfig
def initialize_fontconfig(ctx)
Definition: tesstrain_utils.py:295
tesstrain_utils.make_outbase
def make_outbase(ctx, fontname, exposure)
Definition: tesstrain_utils.py:313
tesstrain_utils.err_exit
def err_exit(msg)
Definition: tesstrain_utils.py:70
tesstrain_utils.generate_font_image
def generate_font_image(ctx, font, exposure, char_spacing)
Definition: tesstrain_utils.py:319
tesstrain_utils.phase_E_extract_features
def phase_E_extract_features(ctx, box_config, ext)
Definition: tesstrain_utils.py:525
tesstrain_utils.make_fontname
def make_fontname(font)
Definition: tesstrain_utils.py:309
tesstrain_utils.run_command
def run_command(cmd, *args, env=None)
Definition: tesstrain_utils.py:78
tesstrain_utils.phase_UP_generate_unicharset
def phase_UP_generate_unicharset(ctx)
Definition: tesstrain_utils.py:421