tesseract  5.0.0-alpha-619-ge9db
tesstrain_utils.py
Go to the documentation of this file.
1 # (C) Copyright 2014, Google Inc.
2 # (C) Copyright 2018, James R Barlow
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
6 # http://www.apache.org/licenses/LICENSE-2.0
7 # Unless required by applicable law or agreed to in writing, software
8 # distributed under the License is distributed on an "AS IS" BASIS,
9 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 # See the License for the specific language governing permissions and
11 # limitations under the License.
12 #
13 # For a detailed description of the phases, see
14 # https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract
15 #
16 
17 import argparse
18 import atexit
19 import concurrent.futures
20 import logging
21 import os
22 import pathlib
23 import platform
24 import shutil
25 import subprocess
26 import sys
27 from datetime import date
28 from operator import itemgetter
29 from tempfile import TemporaryDirectory, mkdtemp
30 
31 from tqdm import tqdm
32 
33 from language_specific import VERTICAL_FONTS
34 
35 log = logging.getLogger(__name__)
36 
37 
class TrainingArgs(argparse.Namespace):
    """Default training settings, used as the argparse parse target.

    Instances are passed as ``namespace=`` to ``parser.parse_args`` in
    ``parse_flags``, so command-line flags land on top of these defaults.

    NOTE(review): two attribute assignments (``font_config_cache`` and
    ``extract_font_properties``) were missing from the extracted source but
    are referenced by ``__eq__`` and listed in the module's cross-reference
    index (original lines 46 and 56); they are restored here.
    """

    def __init__(self):
        super(TrainingArgs, self).__init__()
        # Lowercased OS name ("linux", "darwin", "windows"), used to pick
        # the platform default fonts directory below.
        self.uname = platform.uname().system.lower()
        self.lang_code = "eng"
        # Date stamp used to build unique output/training directory names.
        self.timestamp = str(date.today())

        # Unique per-run fontconfig cache dir; the TemporaryDirectory object
        # is kept on the instance so the directory lives as long as we do.
        self._font_config_cache = TemporaryDirectory(prefix="font_tmp")
        self.font_config_cache = self._font_config_cache.name
        self.fonts_dir = (
            "/Library/Fonts/" if "darwin" in self.uname else "/usr/share/fonts/"
        )

        self.max_pages = 0
        self.save_box_tiff = False
        self.overwrite = False
        self.linedata = False
        self.run_shape_clustering = False
        # Default True upstream; --noextract_font_properties turns it off.
        self.extract_font_properties = True
        self.distort_image = False

    def __eq__(self, other):
        # Compare argparse-visible attributes plus every default set above.
        return (argparse.Namespace.__eq__(self, other) and
                self.uname == other.uname and self.lang_code == other.lang_code and
                self.timestamp == other.timestamp and self.font_config_cache == other.font_config_cache and
                self.fonts_dir == other.fonts_dir and self.max_pages == other.max_pages and
                self.save_box_tiff == other.save_box_tiff and self.overwrite == other.overwrite and
                self.linedata == other.linedata and self.run_shape_clustering == other.run_shape_clustering and
                self.extract_font_properties == other.extract_font_properties and
                self.distort_image == other.distort_image)
68 
69 
def err_exit(msg):
    """Log *msg* at CRITICAL level, then terminate with exit status 1."""
    log.critical(msg)
    raise SystemExit(1)
73 
74 
75 # Helper function to run a command and append its output to a log. Aborts early
76 # if the program file is not found.
77 # Usage: run_command CMD ARG1 ARG2...
# Helper function to run a command and append its output to a log. Aborts early
# if the program file is not found.
# Usage: run_command CMD ARG1 ARG2...
def run_command(cmd, *args, env=None):
    """Run an external tool, logging its combined stdout/stderr.

    Looks for *cmd* on PATH directly and under the "api/" and "training/"
    build subdirectories. Exits the whole program via err_exit() when the
    executable cannot be found or returns a non-zero exit code.
    """
    for d in ("", "api/", "training/"):
        testcmd = shutil.which(f"{d}{cmd}")
        # BUG FIX: shutil.which(None) raises TypeError; which() already
        # returns None when not found, so test the result directly.
        if testcmd:
            cmd = testcmd
            break
    if not shutil.which(cmd):
        err_exit(f"{cmd} not found")

    log.debug(f"Running {cmd}")
    args = list(args)
    for idx, arg in enumerate(args):
        log.debug(arg)
        # Workaround for https://bugs.python.org/issue33617
        # TypeError: argument of type 'WindowsPath' is not iterable
        if isinstance(arg, pathlib.WindowsPath):
            args[idx] = str(arg)

    # shell=False with an argument list: args are passed verbatim, no quoting
    # issues. stderr is folded into stdout so the log captures everything.
    proc = subprocess.run(
        [cmd, *args], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, env=env
    )
    proclog = logging.getLogger(cmd)
    if proc.returncode == 0:
        proclog.debug(proc.stdout.decode("utf-8", errors="replace"))
    else:
        try:
            proclog.error(proc.stdout.decode("utf-8", errors="replace"))
        except Exception as e:
            proclog.error(e)
        err_exit(f"Program {cmd} failed with return code {proc.returncode}. Abort.")
108 
109 
110 # Check if all the given files exist, or exit otherwise.
111 # Used to check required input files and produced output files in each phase.
112 # Usage: check_file_readable FILE1 FILE2...
# Check if all the given files exist, or exit otherwise.
# Used to check required input files and produced output files in each phase.
# Usage: check_file_readable FILE1 FILE2...
def check_file_readable(*filenames):
    """Verify every given file can be opened for reading; exit otherwise.

    Returns True when all files are readable. On any failure the program is
    terminated via err_exit() with a message naming the offending file.
    """
    # Defensive: *filenames is always a tuple, so this branch is only hit if
    # someone rebinds/calls this unusually; kept for parity with the original.
    if isinstance(filenames, (str, pathlib.Path)):
        filenames = [filenames]
    for filename in filenames:
        try:
            with pathlib.Path(filename).open():
                pass
        # BUG FIX: the extracted source had lost the {filename} placeholders
        # in these messages, making them useless; restored.
        except FileNotFoundError:
            err_exit(f"Required/expected file '{filename}' does not exist")
        except PermissionError:
            err_exit(f"{filename} is not readable")
        except IOError as e:
            err_exit(f"{filename} IO Error: {str(e)}")
    return True
127 
128 
# Command-line interface for the training pipeline; parse_flags() below parses
# these flags into a TrainingArgs namespace.  Note: defaults that are already
# preset on the TrainingArgs instance are presumably kept by argparse when no
# flag is given (argparse only fills attributes missing from the namespace) --
# TODO confirm against argparse docs for the targeted Python version.
parser = argparse.ArgumentParser(
    epilog="""
    The font names specified in --fontlist need to be recognizable by Pango using
    fontconfig. An easy way to list the canonical names of all fonts available on
    your system is to run text2image with --list_available_fonts and the
    appropriate --fonts_dir path.
    """
)
parser.add_argument(
    "--fontlist",
    dest="fonts",
    nargs="+",
    type=str,
    help="A list of fontnames to train on.",
)
parser.add_argument("--fonts_dir", help="Path to font files.")
parser.add_argument("--tmp_dir", help="Path to temporary training directory.")
parser.add_argument(
    "--lang", metavar="LANG_CODE", dest="lang_code", help="ISO 639 code."
)
parser.add_argument(
    "--langdata_dir",
    metavar="DATADIR",
    help="Path to tesseract/training/langdata directory.",
)
# Maximum number of pages text2image renders per font (0 preset = no limit).
parser.add_argument("--maxpages", type=int, dest="max_pages")
parser.add_argument(
    "--output_dir", metavar="OUTPUTDIR", help="Location of output traineddata file."
)
parser.add_argument(
    "--overwrite", action="store_true", help="Safe to overwrite files in output_dir."
)
parser.add_argument(
    "--save_box_tiff",
    action="store_true",
    help="Save box/tiff pairs along with lstmf files.",
)
parser.add_argument(
    "--linedata_only",
    dest="linedata",
    action="store_true",
    help="Only generate training data for lstmtraining.",
)

# Optional overrides for input data; when absent, parse_flags() derives the
# paths from langdata_dir and the language code.
inputdata_group = parser.add_argument_group(
    "inputdata",
    "OPTIONAL flags for input data. If unspecified we will look for them in the langdata_dir directory.",
)
inputdata_group.add_argument(
    "--training_text", metavar="TEXTFILE", help="Text to render and use for training."
)
inputdata_group.add_argument(
    "--wordlist",
    dest="wordlist_file",
    metavar="WORDFILE",
    help="Word list for the language ordered by decreasing frequency.",
)

# Paired on/off switches writing to the same destination attribute.
parser.add_argument("--extract_font_properties", action="store_true")
parser.add_argument(
    "--noextract_font_properties", dest="extract_font_properties", action="store_false"
)

# Forwarded to text2image to add image distortions during rendering.
parser.add_argument(
    "--distort_image", dest="distort_image", action="store_true"
)

tessdata_group = parser.add_argument_group(
    "tessdata",
    "OPTIONAL flag to specify location of existing traineddata files, required during feature extraction. If unspecified will use TESSDATA_PREFIX defined in the current environment.",
)
tessdata_group.add_argument(
    "--tessdata_dir",
    metavar="TESSDATADIR",
    help="Path to tesseract/tessdata directory.",
)

# action="append" with nargs="+" yields a list of lists of exposure strings.
parser.add_argument(
    "--exposures",
    metavar="EXPOSURES",
    action="append",
    nargs="+",
    help="A list of exposure levels to use (e.g. -1,0,1).",
)
213 
214 
215 # Does simple command-line parsing and initialization.
# Does simple command-line parsing and initialization.
def parse_flags(argv=None):
    """Parse command-line flags and initialize the training context.

    Parses *argv* (or sys.argv when None) onto a fresh TrainingArgs
    namespace, validates the required flags, creates the output and
    temporary training directories, and derives all per-language file
    paths. Exits the program via err_exit() on missing required flags.

    Returns the fully initialized TrainingArgs context.
    """
    ctx = TrainingArgs()
    log.debug(ctx)
    # Parse on top of the defaults preset by TrainingArgs.__init__.
    parser.parse_args(args=argv, namespace=ctx)
    log.debug(ctx)

    if not ctx.lang_code:
        err_exit("Need to specify a language --lang")
    if not ctx.langdata_dir:
        err_exit("Need to specify path to language files --langdata_dir")
    if not ctx.tessdata_dir:
        # Fall back to the TESSDATA_PREFIX environment variable.
        tessdata_prefix = os.environ.get("TESSDATA_PREFIX", "")
        if not tessdata_prefix:
            err_exit(
                "Need to specify a --tessdata_dir or have a "
                "TESSDATA_PREFIX variable defined in your environment"
            )
        else:
            ctx.tessdata_dir = tessdata_prefix
    if not ctx.output_dir:
        # mkdtemp creates the directory and returns its path.
        ctx.output_dir = mkdtemp(prefix=f"trained-{ctx.lang_code}-{ctx.timestamp}")
        log.info(f"Output directory set to: {ctx.output_dir}")

    # Location where intermediate files will be created.
    if not ctx.tmp_dir:
        ctx.training_dir = mkdtemp(prefix=f"{ctx.lang_code}-{ctx.timestamp}")
    else:
        ctx.training_dir = mkdtemp(prefix=f"{ctx.lang_code}-{ctx.timestamp}", dir=ctx.tmp_dir)
    # Location of log file for the whole run.
    ctx.log_file = pathlib.Path(ctx.training_dir) / "tesstrain.log"
    log.info(f"Log file location: {ctx.log_file}")

    def show_tmpdir_location(training_dir):
        # On successful exit we will delete this first; on failure we want to let the user
        # know where the log is
        if pathlib.Path(training_dir).exists():
            print(f"Temporary files retained at: {training_dir}")

    # Registered now so the message fires however the process exits.
    atexit.register(show_tmpdir_location, ctx.training_dir)

    # Take training text and wordlist from the langdata directory if not
    # specified in the command-line.
    if not ctx.training_text:
        ctx.training_text = (
            pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.training_text"
        )
    if not ctx.wordlist_file:
        ctx.wordlist_file = (
            pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.wordlist"
        )

    # Derived per-language input files under langdata_dir/<lang>/.
    ctx.word_bigrams_file = (
        pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.word.bigrams"
    )
    ctx.numbers_file = (
        pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.numbers"
    )
    ctx.punc_file = pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.punc"
    # Frequency/ngram files live next to the training text and share its stem.
    ctx.bigram_freqs_file = pathlib.Path(ctx.training_text).with_suffix(
        ".training_text.bigram_freqs"
    )
    ctx.unigram_freqs_file = pathlib.Path(ctx.training_text).with_suffix(
        ".training_text.unigram_freqs"
    )
    ctx.train_ngrams_file = pathlib.Path(ctx.training_text).with_suffix(
        ".training_text.train_ngrams"
    )
    ctx.generate_dawgs = 1
    log.debug(ctx)
    return ctx
286 
287 
def cleanup(ctx):
    """Preserve the run log in the output dir, then delete the training dir."""
    shutil.copy(ctx.log_file, ctx.output_dir)
    shutil.rmtree(ctx.training_dir)
292 
293 
# Function initializes font config with a unique font cache dir.
def initialize_fontconfig(ctx):
    """Warm up fontconfig's cache in the per-run cache directory.

    Renders a tiny sample text with text2image so fontconfig populates the
    cache under ctx.font_config_cache before parallel rendering begins.

    NOTE(review): the ``def`` line was missing from the extracted source;
    restored with the signature recorded in the module's cross-reference
    index (initialize_fontconfig(ctx), original line 295).
    """
    sample_path = pathlib.Path(ctx.font_config_cache) / "sample_text.txt"
    pathlib.Path(sample_path).write_text("Text\n")
    log.info(f"Testing font: {ctx.fonts[0]}")
    run_command(
        "text2image",
        f"--fonts_dir={ctx.fonts_dir}",
        f"--font={ctx.fonts[0]}",
        f"--outputbase={sample_path}",
        f"--text={sample_path}",
        f"--fontconfig_tmpdir={ctx.font_config_cache}",
    )
307 
308 
def make_fontname(font):
    """Return *font* with spaces turned into underscores and commas removed.

    Produces a filesystem-safe name used to build output file stems.
    """
    sanitize = str.maketrans({" ": "_", ",": None})
    return font.translate(sanitize)
311 
312 
def make_outbase(ctx, fontname, exposure):
    """Build the output path stem for one font/exposure in the training dir."""
    stem = f"{ctx.lang_code}.{fontname}.exp{exposure}"
    return pathlib.Path(ctx.training_dir) / stem
315 
316 
317 # Helper function for phaseI_generate_image. Generates the image for a single
318 # language/font combination in a way that can be run in parallel.
319 def generate_font_image(ctx, font, exposure, char_spacing):
320  log.info(f"Rendering using {font}")
321  fontname = make_fontname(font)
322  outbase = make_outbase(ctx, fontname, exposure)
323 
324  common_args = [
325  f"--fontconfig_tmpdir={ctx.font_config_cache}",
326  f"--fonts_dir={ctx.fonts_dir}",
327  f"--strip_unrenderable_words",
328  f"--leading={ctx.leading}",
329  f"--char_spacing={char_spacing}",
330  f"--exposure={exposure}",
331  f"--outputbase={outbase}",
332  f"--max_pages={ctx.max_pages}",
333  ]
334 
335  if ctx.distort_image:
336  common_args.append("--distort_image")
337 
338  # add --writing_mode=vertical-upright to common_args if the font is
339  # specified to be rendered vertically.
340  if font in VERTICAL_FONTS:
341  common_args.append("--writing_mode=vertical-upright")
342 
343  run_command(
344  "text2image",
345  *common_args,
346  f"--font={font}",
347  f"--text={ctx.training_text}",
348  *ctx.text2image_extra_args,
349  )
350 
351  check_file_readable(str(outbase) + ".box", str(outbase) + ".tif")
352 
353  if ctx.extract_font_properties and pathlib.Path(ctx.train_ngrams_file).exists():
354  log.info(f"Extracting font properties of {font}")
355  run_command(
356  "text2image",
357  *common_args,
358  f"--font={font}",
359  f"--ligatures=false",
360  f"--text={ctx.train_ngrams_file}",
361  f"--only_extract_font_properties",
362  f"--ptsize=32",
363  )
364  check_file_readable(str(outbase) + ".fontinfo")
365  return f"{font}-{exposure}"
366 
367 
368 # Phase I : Generate (I)mages from training text for each font.
369 def phase_I_generate_image(ctx, par_factor=None):
370  if not par_factor or par_factor <= 0:
371  par_factor = 1
372 
373  log.info("=== Phase I: Generating training images ===")
374  check_file_readable(ctx.training_text)
375  char_spacing = 0.0
376 
377  for exposure in ctx.exposures:
378  if ctx.extract_font_properties and pathlib.Path(ctx.bigram_freqs_file).exists():
379  # Parse .bigram_freqs file and compose a .train_ngrams file with text
380  # for tesseract to recognize during training. Take only the ngrams whose
381  # combined weight accounts for 95% of all the bigrams in the language.
382  lines = pathlib.Path(ctx.bigram_freqs_file).read_text(encoding="utf-8").split("\n")
383  records = (line.split() for line in lines)
384  p = 0.99
385  ngram_frac = p * sum(int(rec[1]) for rec in records if len(rec) >= 2)
386 
387  with pathlib.Path(ctx.train_ngrams_file).open("w", encoding="utf-8") as f:
388  cumsum = 0
389  for bigram, count in sorted(records, key=itemgetter(1), reverse=True):
390  if cumsum > ngram_frac:
391  break
392  f.write(bigram + " ")
393  cumsum += count
394 
395  check_file_readable(ctx.train_ngrams_file)
396 
397  with tqdm(
398  total=len(ctx.fonts)
399  ) as pbar, concurrent.futures.ThreadPoolExecutor(max_workers=par_factor) as executor:
400  futures = [
401  executor.submit(generate_font_image, ctx, font, exposure, char_spacing)
402  for font in ctx.fonts
403  ]
404  for future in concurrent.futures.as_completed(futures):
405  try:
406  future.result()
407  except Exception as exc:
408  err_exit("Failed while generating images " + str(exc))
409  else:
410  pbar.update(1)
411 
412  # Check that each process was successful.
413  for font in ctx.fonts:
414  fontname = make_fontname(font)
415  outbase = make_outbase(ctx, fontname, exposure)
416  check_file_readable(str(outbase) + ".box", str(outbase) + ".tif")
417  return
418 
419 
420 # Phase UP : Generate (U)nicharset and (P)roperties file.
def phase_UP_generate_unicharset(ctx):
    """Phase UP: generate the (U)nicharset and unichar (P)roperties files.

    Runs unicharset_extractor over every .box file produced in Phase I, then
    set_unicharset_properties to add script metadata and x-heights. Sets
    ctx.unicharset_file and ctx.xheights_file; aborts via check_file_readable
    if an expected output is missing.

    NOTE(review): the ``def`` line was missing from the extracted source;
    restored with the signature recorded in the module's cross-reference
    index (phase_UP_generate_unicharset(ctx), original line 421).
    """
    log.info("=== Phase UP: Generating unicharset and unichar properties files ===")

    box_files = pathlib.Path(ctx.training_dir).glob("*.box")

    ctx.unicharset_file = pathlib.Path(ctx.training_dir) / f"{ctx.lang_code}.unicharset"

    run_command(
        "unicharset_extractor",
        "--output_unicharset",
        f"{ctx.unicharset_file}",
        "--norm_mode",
        f"{ctx.norm_mode}",
        *box_files,
    )
    check_file_readable(ctx.unicharset_file)

    ctx.xheights_file = pathlib.Path(ctx.training_dir) / f"{ctx.lang_code}.xheights"
    run_command(
        "set_unicharset_properties",
        "-U",
        f"{ctx.unicharset_file}",
        "-O",
        f"{ctx.unicharset_file}",
        "-X",
        f"{ctx.xheights_file}",
        f"--script_dir={ctx.langdata_dir}",
    )
    check_file_readable(ctx.xheights_file)
450 
451 
452 # # Phase D : Generate (D)awg files from unicharset file and wordlist files
453 # phase_D_generate_dawg() {
454 # tlog "\n=== Phase D: Generating Dawg files ==="
455 
456 # # Skip if requested
457 # if [[ ${GENERATE_DAWGS} -eq 0 ]]; then
458 # tlog "Skipping ${phase_name}"
459 # return
460 # fi
461 
462 # # Output files
463 # WORD_DAWG=${TRAINING_DIR}/${LANG_CODE}.word-dawg
464 # FREQ_DAWG=${TRAINING_DIR}/${LANG_CODE}.freq-dawg
465 # PUNC_DAWG=${TRAINING_DIR}/${LANG_CODE}.punc-dawg
466 # NUMBER_DAWG=${TRAINING_DIR}/${LANG_CODE}.number-dawg
467 # BIGRAM_DAWG=${TRAINING_DIR}/${LANG_CODE}.bigram-dawg
468 
469 # # Word DAWG
470 # local freq_wordlist_file=${TRAINING_DIR}/${LANG_CODE}.wordlist.clean.freq
471 # if [[ -s ${WORDLIST_FILE} ]]; then
472 # tlog "Generating word Dawg"
473 # check_file_readable ${unicharset_file}
474 # run_command wordlist2dawg -r 1 ${WORDLIST_FILE} ${WORD_DAWG} \
475 # ${UNICHARSET_FILE}
476 # check_file_readable ${WORD_DAWG}
477 
478 # FREQ_DAWG_SIZE=100
479 # head -n ${FREQ_DAWG_SIZE} ${WORDLIST_FILE} > ${freq_wordlist_file}
480 # fi
481 
482 # # Freq-word DAWG
483 # if [[ -s ${freq_wordlist_file} ]]; then
484 # check_file_readable ${UNICHARSET_FILE}
485 # tlog "Generating frequent-word Dawg"
486 # run_command wordlist2dawg -r 1 ${freq_wordlist_file} \
487 # ${FREQ_DAWG} ${UNICHARSET_FILE}
488 # check_file_readable ${FREQ_DAWG}
489 # fi
490 
491 # # Punctuation DAWG
492 # # -r arguments to wordlist2dawg denote RTL reverse policy
493 # # (see Trie::RTLReversePolicy enum in tesseract/src/dict/trie.h).
494 # # We specify 0/RRP_DO_NO_REVERSE when generating number DAWG,
495 # # 1/RRP_REVERSE_IF_HAS_RTL for freq and word DAWGS,
496 # # 2/RRP_FORCE_REVERSE for the punctuation DAWG.
497 # local punc_reverse_policy=0;
498 # if [[ "${LANG_IS_RTL}" == "1" ]]; then
499 # punc_reverse_policy=2
500 # fi
501 # if [[ ! -s ${PUNC_FILE} ]]; then
502 # PUNC_FILE="{ctx.langdata_dir}/common.punc"
503 # fi
504 # check_file_readable ${PUNC_FILE}
505 # run_command wordlist2dawg -r ${punc_reverse_policy} \
506 # ${PUNC_FILE} ${PUNC_DAWG} ${UNICHARSET_FILE}
507 # check_file_readable ${PUNC_DAWG}
508 
509 # # Numbers DAWG
510 # if [[ -s ${NUMBERS_FILE} ]]; then
511 # run_command wordlist2dawg -r 0 \
512 # ${NUMBERS_FILE} ${NUMBER_DAWG} ${UNICHARSET_FILE}
513 # check_file_readable ${NUMBER_DAWG}
514 # fi
515 
516 # # Bigram dawg
517 # if [[ -s ${WORD_BIGRAMS_FILE} ]]; then
518 # run_command wordlist2dawg -r 1 \
519 # ${WORD_BIGRAMS_FILE} ${BIGRAM_DAWG} ${UNICHARSET_FILE}
520 # check_file_readable ${BIGRAM_DAWG}
521 # fi
522 # }
523 
524 # Phase E : (E)xtract .tr feature files from .tif/.box files
525 def phase_E_extract_features(ctx, box_config, ext):
526  log.info(f"=== Phase E: Generating {ext} files ===")
527 
528  img_files = list(pathlib.Path(ctx.training_dir).glob("*.exp*.tif"))
529  log.debug(img_files)
530 
531  # Use any available language-specific configs.
532  config = ""
533  testconfig = pathlib.Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.config"
534  if testconfig.exists():
535  config = testconfig
536  log.info(f"Using {ctx.lang_code}.config")
537 
538  tessdata_environ = os.environ.copy()
539  tessdata_environ["TESSDATA_PREFIX"] = str(ctx.tessdata_dir)
540 
541  log.info(f"Using TESSDATA_PREFIX={tessdata_environ['TESSDATA_PREFIX']}")
542 
543  with tqdm(total=len(img_files)) as pbar, concurrent.futures.ThreadPoolExecutor(
544  max_workers=2
545  ) as executor:
546  futures = []
547  for img_file in img_files:
548  future = executor.submit(
549  run_command,
550  "tesseract",
551  img_file,
552  pathlib.Path(img_file).with_suffix(""),
553  *box_config,
554  config,
555  env=tessdata_environ,
556  )
557  futures.append(future)
558 
559  for future in concurrent.futures.as_completed(futures):
560  try:
561  future.result()
562  except Exception as exc:
563  err_exit("Failed while extracting features: " + str(exc))
564  else:
565  pbar.update(1)
566  # Check that all the output files were produced.
567  for img_file in img_files:
568  check_file_readable(pathlib.Path(img_file.with_suffix("." + ext)))
569 
570  return
571 
572 
573 # # Phase C : (C)luster feature prototypes in .tr into normproto file (cnTraining)
574 # # phaseC_cluster_prototypes ${TRAINING_DIR}/${LANG_CODE}.normproto
575 # phase_C_cluster_prototypes() {
576 # tlog "\n=== Phase C: Clustering feature prototypes (cnTraining) ==="
577 # local out_normproto=$1
578 
579 # run_command cntraining -D "${TRAINING_DIR}/" \
580 # $(ls ${TRAINING_DIR}/*.tr)
581 
582 # check_file_readable ${TRAINING_DIR}/normproto
583 # mv ${TRAINING_DIR}/normproto ${out_normproto}
584 # }
585 
586 # # Phase S : (S)hape clustering
587 # phase_S_cluster_shapes() {
588 # if ((! RUN_SHAPE_CLUSTERING)); then
589 # tlog "\n=== Shape Clustering disabled ==="
590 # return
591 # fi
592 # check_file_readable {ctx.langdata_dir}/font_properties
593 # local font_props="-F {ctx.langdata_dir}/font_properties"
594 # if [[ -r ${TRAINING_DIR}/${LANG_CODE}.xheights ]] &&\
595 # [[ -s ${TRAINING_DIR}/${LANG_CODE}.xheights ]]; then
596 # font_props=${font_props}" -X ${TRAINING_DIR}/${LANG_CODE}.xheights"
597 # fi
598 
599 # run_command shapeclustering \
600 # -D "${TRAINING_DIR}/" \
601 # -U ${TRAINING_DIR}/${LANG_CODE}.unicharset \
602 # -O ${TRAINING_DIR}/${LANG_CODE}.mfunicharset \
603 # ${font_props} \
604 # $(ls ${TRAINING_DIR}/*.tr)
605 # check_file_readable ${TRAINING_DIR}/shapetable \
606 # ${TRAINING_DIR}/${LANG_CODE}.mfunicharset
607 # }
608 
609 # # Phase M : Clustering microfeatures (mfTraining)
610 # phase_M_cluster_microfeatures() {
611 # tlog "\n=== Phase M : Clustering microfeatures (mfTraining) ==="
612 
613 # check_file_readable {ctx.langdata_dir}/font_properties
614 # font_props="-F {ctx.langdata_dir}/font_properties"
615 # if [[ -r ${TRAINING_DIR}/${LANG_CODE}.xheights ]] && \
616 # [[ -s ${TRAINING_DIR}/${LANG_CODE}.xheights ]]; then
617 # font_props=${font_props}" -X ${TRAINING_DIR}/${LANG_CODE}.xheights"
618 # fi
619 
620 # run_command mftraining \
621 # -D "${TRAINING_DIR}/" \
622 # -U ${TRAINING_DIR}/${LANG_CODE}.unicharset \
623 # -O ${TRAINING_DIR}/${LANG_CODE}.mfunicharset \
624 # ${font_props} \
625 # $(ls ${TRAINING_DIR}/*.tr)
626 # check_file_readable ${TRAINING_DIR}/inttemp ${TRAINING_DIR}/shapetable \
627 # ${TRAINING_DIR}/pffmtable ${TRAINING_DIR}/${LANG_CODE}.mfunicharset
628 # mv ${TRAINING_DIR}/inttemp ${TRAINING_DIR}/${LANG_CODE}.inttemp
629 # mv ${TRAINING_DIR}/shapetable ${TRAINING_DIR}/${LANG_CODE}.shapetable
630 # mv ${TRAINING_DIR}/pffmtable ${TRAINING_DIR}/${LANG_CODE}.pffmtable
631 # mv ${TRAINING_DIR}/${LANG_CODE}.mfunicharset ${TRAINING_DIR}/${LANG_CODE}.unicharset
632 # }
633 
634 # phase_B_generate_ambiguities() {
635 # tlog "\n=== Phase B : ambiguities training ==="
636 
637 # # Check for manually created ambiguities data.
638 # if [[ -r {ctx.langdata_dir}/${LANG_CODE}/${LANG_CODE}.unicharambigs ]]; then
639 # tlog "Found file {ctx.langdata_dir}/${LANG_CODE}/${LANG_CODE}.unicharambigs"
640 # cp {ctx.langdata_dir}/${LANG_CODE}/${LANG_CODE}.unicharambigs \
641 # ${TRAINING_DIR}/${LANG_CODE}.unicharambigs
642 # # Make it writable, as it may be read-only in the client.
643 # chmod u+w ${TRAINING_DIR}/${LANG_CODE}.unicharambigs
644 # return
645 # else
646 # tlog "No unicharambigs file found!"
647 # fi
648 
649 # # TODO: Add support for generating ambiguities automatically.
650 # }
651 
652 
def make_lstmdata(ctx):
    """Assemble LSTM training data in the output directory.

    Builds the starter traineddata via combine_lang_model from the
    unicharset and the language's wordlist/numbers/punc files, moves the
    .lstmf files (plus .box/.tif pairs when ctx.save_box_tiff is set) from
    the training dir into ctx.output_dir, and writes a
    <lang>.training_files.txt listing of the moved .lstmf files.
    """
    log.info("=== Constructing LSTM training data ===")
    lang_prefix = f"{ctx.langdata_dir}/{ctx.lang_code}/{ctx.lang_code}"
    path_output = pathlib.Path(ctx.output_dir)
    if not path_output.is_dir():
        log.info(f"Creating new directory {ctx.output_dir}")
        path_output.mkdir(exist_ok=True, parents=True)

    args = []
    if ctx.lang_is_rtl:
        args.append("--lang_is_rtl")
    # norm_mode >= 2 presumably selects normalization modes that need the
    # pass-through recoder -- TODO confirm against combine_lang_model docs.
    if ctx.norm_mode >= 2:
        args.append("--pass_through_recoder")

    # Build the starter traineddata from the inputs.
    run_command(
        "combine_lang_model",
        "--input_unicharset",
        f"{ctx.training_dir}/{ctx.lang_code}.unicharset",
        "--script_dir",
        f"{ctx.langdata_dir}",
        "--words",
        f"{lang_prefix}.wordlist",
        "--numbers",
        f"{lang_prefix}.numbers",
        "--puncs",
        f"{lang_prefix}.punc",
        "--output_dir",
        f"{ctx.output_dir}",
        "--lang",
        f"{ctx.lang_code}",
        *args,
    )

    # Yields every training-dir file that should move to the output dir.
    def get_file_list():
        training_path = pathlib.Path(ctx.training_dir)
        if ctx.save_box_tiff:
            log.info("=== Saving box/tiff pairs for training data ===")
            yield from training_path.glob(f"{ctx.lang_code}*.box")
            yield from training_path.glob(f"{ctx.lang_code}*.tif")
        log.info("=== Moving lstmf files for training data ===")
        yield from training_path.glob(f"{ctx.lang_code}.*.lstmf")

    for f in get_file_list():
        log.debug(f"Moving {f} to {path_output / f.name}")
        shutil.move(str(f), path_output / f.name)

    # One .lstmf path per line; consumed later by lstmtraining.
    lstm_list = f"{ctx.output_dir}/{ctx.lang_code}.training_files.txt"
    dir_listing = (str(p) for p in path_output.glob(f"{ctx.lang_code}.*.lstmf"))
    pathlib.Path(lstm_list).write_text("\n".join(dir_listing))
703 
704 # make__traineddata() {
705 # tlog "\n=== Making final traineddata file ==="
706 # local lang_prefix={ctx.langdata_dir}/${LANG_CODE}/${LANG_CODE}
707 
708 # # Combine available files for this language from the langdata dir.
709 # if [[ -r ${lang_prefix}.config ]]; then
710 # tlog "Copying ${lang_prefix}.config to ${TRAINING_DIR}"
711 # cp ${lang_prefix}.config ${TRAINING_DIR}
712 # chmod u+w ${TRAINING_DIR}/${LANG_CODE}.config
713 # fi
714 # if [[ -r ${lang_prefix}.params-model ]]; then
715 # tlog "Copying ${lang_prefix}.params-model to ${TRAINING_DIR}"
716 # cp ${lang_prefix}.params-model ${TRAINING_DIR}
717 # chmod u+w ${TRAINING_DIR}/${LANG_CODE}.params-model
718 # fi
719 
720 # # Compose the traineddata file.
721 # run_command combine_tessdata ${TRAINING_DIR}/${LANG_CODE}.
722 
723 # # Copy it to the output dir, overwriting only if allowed by the cmdline flag.
724 # if [[ ! -d ${OUTPUT_DIR} ]]; then
725 # tlog "Creating new directory ${OUTPUT_DIR}"
726 # mkdir -p ${OUTPUT_DIR}
727 # fi
728 # local destfile=${OUTPUT_DIR}/${LANG_CODE}.traineddata;
729 # if [[ -f ${destfile} ]] && ((! OVERWRITE)); then
730 # err_exit "File ${destfile} exists and no --overwrite specified";
731 # fi
732 # tlog "Moving ${TRAINING_DIR}/${LANG_CODE}.traineddata to ${OUTPUT_DIR}"
733 # cp -f ${TRAINING_DIR}/${LANG_CODE}.traineddata ${destfile}
734 # }
tesstrain_utils.parse_flags
def parse_flags(argv=None)
Definition: tesstrain_utils.py:216
tesstrain_utils.make_lstmdata
def make_lstmdata(ctx)
Definition: tesstrain_utils.py:653
tesstrain_utils.TrainingArgs.font_config_cache
font_config_cache
Definition: tesstrain_utils.py:46
tesstrain_utils.TrainingArgs.uname
uname
Definition: tesstrain_utils.py:41
tesstrain_utils.TrainingArgs.distort_image
distort_image
Definition: tesstrain_utils.py:57
tesstrain_utils.cleanup
def cleanup(ctx)
Definition: tesstrain_utils.py:288
tesstrain_utils.phase_I_generate_image
def phase_I_generate_image(ctx, par_factor=None)
Definition: tesstrain_utils.py:369
tesstrain_utils.check_file_readable
def check_file_readable(*filenames)
Definition: tesstrain_utils.py:113
tesstrain_utils.TrainingArgs.extract_font_properties
extract_font_properties
Definition: tesstrain_utils.py:56
tesstrain_utils.int
int
Definition: tesstrain_utils.py:154
tesstrain_utils.TrainingArgs.save_box_tiff
save_box_tiff
Definition: tesstrain_utils.py:52
tesstrain_utils.TrainingArgs
Definition: tesstrain_utils.py:38
tesstrain_utils.TrainingArgs.fonts_dir
fonts_dir
Definition: tesstrain_utils.py:47
tesstrain_utils.initialize_fontconfig
def initialize_fontconfig(ctx)
Definition: tesstrain_utils.py:295
tesstrain_utils.TrainingArgs.lang_code
lang_code
Definition: tesstrain_utils.py:42
tesstrain_utils.TrainingArgs.run_shape_clustering
run_shape_clustering
Definition: tesstrain_utils.py:55
tesstrain_utils.TrainingArgs.__init__
def __init__(self)
Definition: tesstrain_utils.py:39
tesstrain_utils.make_outbase
def make_outbase(ctx, fontname, exposure)
Definition: tesstrain_utils.py:313
tesstrain_utils.err_exit
def err_exit(msg)
Definition: tesstrain_utils.py:70
tesstrain_utils.generate_font_image
def generate_font_image(ctx, font, exposure, char_spacing)
Definition: tesstrain_utils.py:319
tesstrain_utils.TrainingArgs._font_config_cache
_font_config_cache
Definition: tesstrain_utils.py:45
tesstrain_utils.TrainingArgs.overwrite
overwrite
Definition: tesstrain_utils.py:53
tesstrain_utils.TrainingArgs.timestamp
timestamp
Definition: tesstrain_utils.py:43
tesstrain_utils.phase_E_extract_features
def phase_E_extract_features(ctx, box_config, ext)
Definition: tesstrain_utils.py:525
tesstrain_utils.make_fontname
def make_fontname(font)
Definition: tesstrain_utils.py:309
tesstrain_utils.run_command
def run_command(cmd, *args, env=None)
Definition: tesstrain_utils.py:78
tesstrain_utils.TrainingArgs.__eq__
def __eq__(self, other)
Definition: tesstrain_utils.py:59
tesstrain_utils.phase_UP_generate_unicharset
def phase_UP_generate_unicharset(ctx)
Definition: tesstrain_utils.py:421
tesstrain_utils.TrainingArgs.linedata
linedata
Definition: tesstrain_utils.py:54
tesstrain_utils.TrainingArgs.max_pages
max_pages
Definition: tesstrain_utils.py:51