#!/bin/bash
## 2009-06:  tesseract-training steps as per http://code.google.com/p/tesseract-ocr/wiki/TrainingTesseract;
##	is run in directory containing your training images as TIFFs;
##
## PREREQ:  cvt-utf8-to-latin1 cvt-latin1-to-utf8 -- from http://ereimer.net/programs/general-purpose-scripts.htm
##
## Copyright © 2009 Eugene Reimer;  can be used, modified, copied, and distributed or sold under the terms of either the LGPL or the GPL (your choice);
## see http://www.gnu.org/licenses for the details of these terms.

## eDo CMD [ARG...]
##   Echo-then-do:  prints the command line on stdout prefixed with "==",
##   then runs it;  the exit status is that of the command that was run.
##   (args are shown joined by single spaces, which is what "$*" gives with the default IFS)
eDo () {
  printf '%s\n' "==$*"
  "$@"
}
## Backup (while debugging):  snapshot the current directory to /tmp/<dirname>-<YYYYMMDD>,
## replacing any snapshot taken earlier the same day.
## The path is quoted everywhere:  unquoted, a directory name containing whitespace or glob
## characters would be split, and rm -fR would then act on unintended (even relative) paths.
X=$(pwd)
X="/tmp/${X##*/}-$(date +%Y%m%d)"
rm -fR -- "$X"
cp -pR . "$X"
LNG=deu								##language-code to use in makebox step	<--CHANGE AS NEEDED

## Pass 1:  make sure every TIFF has a utf8 boxfile X.box, taken from (in order of preference):
##   1. X.box-ORIG-FROM-???   -- boxfile shipped with another language's training pkg; copied as-is
##   2. X.box-EDITED.latin1   -- manually revised boxfile kept from an earlier run; converted to utf8
##   3. tesseract ... makebox -- freshly generated; also leaves X.box.latin1 for manual editing
for F in *tif;do  X=${F%.*}
  [[ -e $F ]] || continue					##no TIFFs at all: the glob stays literal, so skip it
  ##expand the ORIG-FROM glob into an array:  a glob inside [ -e ... ] breaks the test when more than
  ##one file matches (and the box would be regenerated);  with several matches the first one is used
  ORIG=( "$X".box-ORIG-FROM-??? )
  if [[ -e "${ORIG[0]}" ]];then					##avoid redoing a tif+box pair from another language's training pkg
    ##copy, skipping the latin1 version (twould cause confusion since all box.latin1 files are to be revised)
    cp -v -- "${ORIG[0]}" "$X.box"

  elif [[ -e "$X.box-EDITED.latin1" ]];then			##have box-EDITED -- a manually revised copy that's a keeper...
    ##also SOURCEFILE-EDITED and it's newer:  use box.SOURCEFILE-EDITED to revise box-EDITED
    if [[ -e "$X.box-SOURCEFILE-EDITED.latin1" && "$X.box-SOURCEFILE-EDITED.latin1" -nt "$X.box-EDITED.latin1" ]];then
      eDo cvt-tesseract-source-to-box "$X.box-EDITED.latin1"
    fi
    ##convert box-EDITED to utf8  (removing X.box.latin1 not needed, see renames at end)
    eDo cvt-latin1-to-utf8  "$X.box-EDITED.latin1"  "$X.box"
  else
    eDo tesseract "$F" "$X" -l "$LNG" batch.nochop makebox	##reads X.tif,  writes X.txt
    mv  "$X.txt"  "$X.box"					##rename to X.box
    ##convert to latin1  (X.box must be in utf8, but I want to edit it in latin1-form)
    cvt-utf8-to-latin1  "$X.box"  "$X.box.latin1"
    cvt-tesseract-box-fixups  "$X.box.latin1"			##fixups for faulty yHI etc
  fi
done
## Convert each freshly generated latin1 boxfile to its SOURCEFILE form, then pause so the
## user can edit ONE file of each (X.box-SOURCEFILE.latin1, X.box.latin1) pair.
for F in *tif;do  X=${F%.*}
  [[ -e "$X.box.latin1" ]] && eDo cvt-tesseract-box-to-source "$X.box.latin1"	##convert boxfile to SOURCEFILE
done
##====----EDITING DONE HERE----====
echo -n "==now edit ONE from each (X.box-SOURCEFILE.latin1, X.box.latin1) pair -- press any key when done"
##-n 1 makes the prompt truthful (any single key continues; Enter still works);  -r keeps backslashes literal;
##-s suppresses the echo of the key, so the newline ending the prompt line is printed explicitly
read -r -n 1 -s;  echo

## Pass 2 (after manual editing):  fold the edits back into the utf8 boxfile X.box, then run
## tesseract in box.train mode on every TIFF.  Filenames are quoted so names containing
## whitespace or glob characters reach the tools as single arguments.
for F in *tif;do  X=${F%.*}
  [[ -e $F ]] || continue					##no TIFFs at all: the glob stays literal, so skip it
  ##use SOURCEFILE if newer
  if [[ -e "$X.box.latin1" && "$X.box-SOURCEFILE.latin1" -nt "$X.box.latin1" ]];then
    eDo cvt-tesseract-source-to-box "$X.box.latin1"
  fi
  ##convert edited boxfile back to utf8
  if [[ -e "$X.box.latin1" ]];then
    eDo cvt-latin1-to-utf8 "$X.box.latin1" "$X.box"
  fi
  eDo tesseract "$F" junk nobatch box.train		##reads X.tif and X.box,  writes X.tr
done
##==Prefix for the renaming of output files:  everything before the FIRST dot of the last
##  TIFF name the loop left in F  (e.g. pde.img0005.tif --> pde)
P=${F%%.*}
## Run the three training tools over all .tr / .box files in the current directory,
## announcing each step with a "==" line.
##   mftraining           writes inttemp (the shape prototypes) and pffmtable (the features for each character) and Microfeat (not used)
##   cntraining           writes normproto (the character normalization sensitivity prototypes)
##   unicharset_extractor writes unicharset (the isdigit,isupper,islower,isalpha Properties for each char, encoded in a bitfield)
printf '%s\n' "==mftraining"
mftraining *.tr
printf '%s\n' "==cntraining"
cntraining *.tr
printf '%s\n' "==unicharset"
unicharset_extractor *.box
##==make a frequently-used-words list, of the English words used in the headings;  extract-tesseract... remove punctuation + uniquify + cvt-to-utf8;
## Build the two (tiny) word lists and their DAWGs, create the remaining data files, rename
## the training outputs to tesseract's <prefix>.<name> convention, and remove scratch files.
## Every use of $P / $F is quoted:  unquoted, a prefix containing whitespace makes the
## redirect fail ("ambiguous redirect") and splits the cp/mv operands.
printf '%s\n' ENGLISH GERMAN >tmpf			##doc says one-word-per-line  (although space-separated also works)
printf '%s\n' harrow eggen   >tmpw
echo "==freq-dawg";  wordlist2dawg tmpf "$P.freq-dawg"	##freq-dawg  mayNOT be empty;  DAWG == Directed Acyclic Word Graph
echo "==word-dawg";  wordlist2dawg tmpw "$P.word-dawg"	##word-dawg  mayNOT be empty
: >"$P.user-words"					##user-words may be empty
##DangAmbigs file specifying that r+n resembles m, etc;  copy from /usr/share/tessdata/$LNG.DangAmbigs
cp -u "/usr/share/tessdata/$LNG.DangAmbigs" "$P.DangAmbigs"
##rename to tesseract naming-convention
for F in inttemp normproto pffmtable unicharset;do
  mv -f "$F" "$P.$F"
done
rm -f tmp* *~ Microfeat junk*				##cleanup the junk
##==renames to avoid losing edits:  {box,box-SOURCEFILE}.latin1-->{box-EDITED,box-SOURCEFILE-EDITED}.latin1
##  A SOURCEFILE older than its boxfile is dropped first, so only a SOURCEFILE that is not
##  older than the boxfile is kept as SOURCEFILE-EDITED.
##  Operands are quoted (and preceded by --) so unusual filenames cannot be split or read as options.
for F in *tif;do  X=${F%.*}
  if [[ -e "$X.box.latin1" && "$X.box-SOURCEFILE.latin1" -ot "$X.box.latin1" ]];then
    rm -f -- "$X.box-SOURCEFILE.latin1"
  fi
  if [[ -e "$X.box-SOURCEFILE.latin1" ]];then
    mv -v -- "$X.box-SOURCEFILE.latin1" "$X.box-SOURCEFILE-EDITED.latin1"
  fi
  if [[ -e "$X.box.latin1" ]];then
    mv -v -- "$X.box.latin1" "$X.box-EDITED.latin1"
  fi
done							##could simplify the naming-conventions, so these renames not needed??
##====----Install----====
## Copy the eight trained data files into the system tessdata directory (needs root).
## "$P" is quoted;  brace expansion still applies to a quoted prefix, yielding one word per file.
sudo cp -vf -- "$P".{DangAmbigs,freq-dawg,inttemp,normproto,pffmtable,unicharset,user-words,word-dawg}  /usr/share/tessdata


## The exit below is required:  everything after it is free-form notes, not shell code.
exit
HINT:  Converting PBM to TIFF:  pamtotiff -g4  X.pbm  >X.tif
HINT:  Making multi-page-TIFF:  tiffcp -c g4  X*tif  X-CAT.tif  (my img2pdf also uses -t for tiles vs strips as recommended by tiff2pdf for large images)
HINT:  when msgs voluminous use:  tesseract-training-from-images 2>&1 |less
===========
CHANGE-LOG:
===========
2009-06:
Substantial differences on the DEU Arial image, although running in DEU mode, are surprising and disappointing--!!--
the redone version lacks PAGENBRS, has many COORD differences;  when comparing only on CHAR in 1st column, then:
	many differences involving various kinds of quote chars:  » --> >>  and  '' --> "  and  Microsoft-quotes to ANSI-quotes;
	also misdiagnosed 1 l I !;  Z-->z;  Ä-->A;
	most surprising differences:  ... --> ~;  « --> er;
	note: stripping off PAGENBRS made no difference;  although it is needed for tesseractTrainer.py to work--!!--
--FIXED, by not redoing such cases:  if box-ORIG-FROM-??? exists then copy it rather than running  tesseract...makebox;

errormsgs from cmdline  tesseract pde.img0005.tif junk nobatch box.train:
	APPLY_BOXES: boxfile 10/2/. ((2676,5577),(2694,5593)): FAILURE! box overlaps no blobs or blobs in multiple rows
	APPLY_BOXES: Unlabelled word blk:1 row:2 allrows:2  ...  APPLY_BOXES: Unlabelled word blk:1 row:37 allrows:37
	APPLY_BOXES: FATALITY - 0 labelled samples of "." - target is 5:
--NOTE: for manually revised box-file, now treat box-EDITED much like box-ORIG-FROM-???  (am using for img0005 boxfile);
--only troublesome cp1252-char was one hex 92 -- have changed it to stroph;
--it's complaining about the line:  . 2676 5577 2694 5593  <--UNCHANGED from the way the earlier tesseract step wrote it!!  And ALL other "." chars!!
--Asked for help, from tesseract-ocr@googlegroups.com 2009-06-22 23:18;  Response by Ray Smith via updates to TrainingTesseract page;
--FOUND workaround for "box overlaps no blobs or blobs in multiple rows" msg:  revise yHI on each "." to be like its same-row neighbours!!
  that also gets rid of FATAL msg, altho not the "Unlabelled word" msgs -- APPEARS TO WORK==!!==

CONSIDER: revising yHI on all chars to the maximum occurring in that row;  might that also solve Z<->z etc errors--??--
	--2009-07-18:  DONE (sort of);  see cvt-tesseract-box-fixups;
Note: wordlist2dawg with empty input never ends;  whereas with non-empty list, it works, and isn't nearly as slow as I'd thought;
Note: Michael Reimer reports (in google-group) better results with an empty DangAmbigs file!!  SEE ALSO HIS http://www.cs.toronto.edu/~mreimer/tesseract.html

2009-07-20: ran tesseract-training-from-images on all my PDE training pages;  (DETAILED NOTES are in /pix/pkg/tesseract-Box-TrainingPages-pde/00-README-ER)
	installed a BOX-VIEWER tesseractTrainer.py (see view-tesseract-boxes);  discovered bug in tesseract-makebox;
--ISSUE 223 (http://code.google.com/p/tesseract-ocr/issues/detail?id=223);  illustrated using pde.img0015.tif;
	2009-08-02: no sign of Ray Smith having looked at it, nor any issue since approx 2009-06-19??
	considered adding 0005 as 2nd example, one having the problem only in some parts??