#!/bin/bash
## 2009-08-03:  tesseract-training steps as per http://code.google.com/p/tesseract-ocr/wiki/TrainingTesseract;
##	receives your sourcefile and a list of fontnames;
##	uses ghostscript to create rasterized images, then the rest is much like my tesseract-training-from-images script, except for fully-automated corrections;
##
## PREREQ:  arr cvt-utf8-to-latin1 cvt-latin1-to-utf8 -- from http://ereimer.net/programs/general-purpose-scripts.htm
##
## USAGE EXAMPLES:
##	tesseract-training-from-source  deu.SOURCEFILE 600 10  Arial Times Geneva	--make training pages at 600dpi in 10-point Arial, Times, and Geneva;
##	tesseract-training-from-source  deu.SOURCEFILE 600 10  Arial Times Geneva.0.9	--same as preceding, except the Geneva font is Condensed by a 0.9 ratio;
##	tesseract-training-from-source  deu.SOURCEFILE 101 10  Arial Times Geneva	--make training pages in 14-pixel high "screen fonts";
##
##	--the examples above will create the images:  deu.Arial.tif  deu.Times.tif  deu.Geneva.tif (or deu.Geneva.0.9.tif);
##	--the sourcefile is plain-text, normally in utf8 (but in latin1 if filename ends in ".latin1");
##	--pagesize doesn't have cmdline parameters, but is easily changed in the source (below);
##	--this script can be modified to use an encoding other than cp1252, fairly easily provided it's another one-byte-per-character encoding;
##	--to use a mixture of "real" and "synthetic" training-images, you can run this script after running my tesseract-training-from-images script;
##
## Copyright © 2009,2010 Eugene Reimer;  can be used, modified, copied, and distributed or sold under the terms of either the LGPL or the GPL (your choice);
## see http://www.gnu.org/licenses for the details of these terms.

## Force single-byte character handling in the tools this script runs.
## The variable must be exported: a plain assignment only reaches child processes
## when LC_CTYPE already happens to be in the caller's environment.
## NOTE(review): an LC_ALL set in the caller's environment still overrides this -- confirm whether that matters.
export LC_CTYPE=C								##sed,grep,wc need single-byte chars because gs does; tesseract ignores LC_CTYPE
## eDo CMD [ARG...] -- echo then execute:
##	writes "==CMD ARG..." (arguments joined by single spaces, given the default IFS) to stderr,
##	then runs CMD with the arguments exactly as received;
##	returns the exit status of CMD.
eDo () {
  printf '==%s\n' "$*" >&2		##"$*" yields one string; avoids mixing a string with "$@"
  "$@"
}
## Debug aid: BKX names a dated backup directory under /tmp; the backup commands themselves stay commented out.
X=$(pwd);BKX=/tmp/${X##*/}-$(date +%Y%m%d);  ##rm -fR $BKX;cp -pR . $BKX	##make backup copy of curdir (while debugging)

## ---- Command-line parameters:  SOURCEFILE DPI FONTSIZE [FONT[.XRATIO]...] ----
## After this section "$@" holds only the font names.
if (( $# < 3 )); then
  echo "usage: ${0##*/} SOURCEFILE DPI FONTSIZE FONT[.XRATIO]..." >&2
  exit 2
fi
SU=$1; DPI=$2; FSZ=$3; shift 3							##get cmdline params: filename, dpi, fontsize
if ! [[ $DPI =~ ^[1-9][0-9]*$ ]]; then						##DPI is used in shell integer arithmetic below
  echo "${0##*/}: DPI must be a positive integer, got '$DPI'" >&2
  exit 2
fi
SU=${SU%.latin1};  SL=$SU.latin1						##SU is the utf8 sourcefile; SL is the latin1 sourcefile
## Convert in ONE direction only.  With two independent tests, the latin1 file freshly
## written by the first conversion is newer than the utf8 file, so the second test would
## fire as well and rewrite the utf8 source from its latin1 copy.
if   [[ $SU -nt $SL ]]; then cvt-utf8-to-latin1 "$SU" "$SL"			##convert input to latin1 if needed
elif [[ $SL -nt $SU ]]; then cvt-latin1-to-utf8 "$SL" "$SU"			##convert input to  utf8  if needed
fi
if ! [[ -r $SL ]]; then								##everything below reads the latin1 file
  echo "${0##*/}: cannot read $SL (sourcefile missing, or conversion failed?)" >&2
  exit 1
fi
LNG=deu										##language-code to use in makebox step	<--CHANGE AS NEEDED
PW=612; PH=792									##image width & height in points	<--CHANGE AS NEEDED
((IW=PW*DPI/72,IH=PH*DPI/72))							##image width & height in pixels, derived from preceding
P=${SU%%.*}									##Prefix for the naming of output files

## ---- For each font on the cmdline: build a PostScript page, rasterize it, and run tesseract box.train ----
##	FONT  is NAME or NAME.RATIO;  FONTB is the part before the first dot;  XR is the part after it (default 1);
##	X     is the per-font output prefix ($P.$FONT);  files written:  X.ps  X.tif  X.box.latin1  X.box;
##	the PostScript prints one box line per non-space character to stdout, which is captured as X.box.latin1;
##	reads the globals SL, P, PW, PH, IW, IH, DPI, FSZ set earlier in the script;
##	all filename expansions are quoted so names containing spaces or glob characters are passed intact.
for FONT in "$@";do FONTB=${FONT%%.*}; XR=1;[[ $FONT == *.* ]] && XR=${FONT#*.}	##for each font on cmdline...
  X=$P.$FONT
  echo '%!PS-Adobe-2.0'				>"$X.ps"			##postscript identifier
  echo "%%BoundingBox: 0 0 $PW $PH
/cp1252-er-encoding [
/.notdef/dotaccent/fi/fl/fraction/hungarumlaut/Lslash/lslash/ogonek/ring/.notdef/breve/minus/.notdef/radical/lozenge
/caron/dotlessi/dotlessj/ff/ffi/ffl/notequal/infinity/lessequal/greaterequal/partialdiff/summation/product/pi/.notdef/.notdef
/space/exclam/quotedbl/numbersign/dollar/percent/ampersand/quotesingle/parenleft/parenright/asterisk/plus/comma/hyphen/period/slash
/zero/one/two/three/four/five/six/seven/eight/nine/colon/semicolon/less/equal/greater/question
/at/A/B/C/D/E/F/G/H/I/J/K/L/M/N/O
/P/Q/R/S/T/U/V/W/X/Y/Z/bracketleft/backslash/bracketright/asciicircum/underscore
/grave/a/b/c/d/e/f/g/h/i/j/k/l/m/n/o
/p/q/r/s/t/u/v/w/x/y/z/braceleft/bar/braceright/asciitilde/.notdef
/Euro/integral/quotesinglbase/florin/quotedblbase/ellipsis/dagger/daggerdbl/circumflex/perthousand/Scaron/guilsinglleft/OE/Omega/Zcaron/approxequal
/Germandbls/quoteleft/quoteright/quotedblleft/quotedblright/bullet/endash/emdash/tilde/trademark/scaron/guilsinglright/oe/Delta/zcaron/Ydieresis
/space/exclamdown/cent/sterling/currency/yen/brokenbar/section/dieresis/copyright/ordfeminine/guillemotleft/logicalnot/hyphen/registered/macron
/degree/plusminus/twosuperior/threesuperior/acute/mu/paragraph/periodcentered/cedilla/onesuperior/ordmasculine/guillemotright/onequarter/onehalf/threequarters/questiondown
/Agrave/Aacute/Acircumflex/Atilde/Adieresis/Aring/AE/Ccedilla/Egrave/Eacute/Ecircumflex/Edieresis/Igrave/Iacute/Icircumflex/Idieresis
/Eth/Ntilde/Ograve/Oacute/Ocircumflex/Otilde/Odieresis/multiply/Oslash/Ugrave/Uacute/Ucircumflex/Udieresis/Yacute/Thorn/germandbls
/agrave/aacute/acircumflex/atilde/adieresis/aring/ae/ccedilla/egrave/eacute/ecircumflex/edieresis/igrave/iacute/icircumflex/idieresis
/eth/ntilde/ograve/oacute/ocircumflex/otilde/odieresis/divide/oslash/ugrave/uacute/ucircumflex/udieresis/yacute/thorn/ydieresis
] def					%%an extended cp1252-encoding; see notes below under CHANGE-LOG
/$FONTB-latin1
  /$FONTB findfont dup length dict begin
    {1 index /FID ne {def} {pop pop} ifelse} forall
    /Encoding cp1252-er-encoding def	%%was ISOLatin1Encoding
  currentdict end  definefont pop
/$FONTB-latin1 $FSZ selectfont		%%font and font-size, with re-encoded font
/yy	$FSZ 1.25 mul def		%%line-spacing 1.25 times fontheight
/x0	9 def				%%left-margin
/curx {currentpoint pop} bind def
/cury {currentpoint exch pop} bind def
/n {x0 cury yy sub moveto} bind def	%%newline function
/p {print} bind def			%%alias for print
/P {20 string cvs p ( )p} bind def	%%print (any) with trailing space
/strbbox {				%%str => llx lly urx ury  (where ll==lower-left; ur==upper-right)
  gsave
    newpath
    0 0 moveto
    false charpath
    flattenpath
    pathbbox
  grestore
} def
/boxp {					%%produce bbox line;  str => -
  /s1 exch def
  s1 strbbox /ury exch def /urx exch def /lly exch def /llx exch def	%%get bbox of the one-char str
  /llx llx curx add $DPI mul $XR mul 72 div round cvi def		%%convert to pixels
  /urx urx curx add $DPI mul $XR mul 72 div round cvi def		%%convert to pixels
  /lly lly cury add $DPI mul         72 div round cvi def		%%convert to pixels
  /ury ury cury add $DPI mul         72 div round cvi def		%%convert to pixels
  s1 P llx P lly P urx P ury P (\n)p					%%print bbox line;  eg:e 76 6495 114 6540
} def
/boxshow {				%%show-variant that loops thru chars...
  /str exch def
  str {
    /char exch def
    /charstr ( ) dup 0 char put def
    char 32 ne {charstr boxp} if	%%produce bbox line, except if char is space
    charstr show			%%display char normally
    1 0 rmoveto				%%extra space between chars  <--CHANGE AS NEEDED
    char 32 eq {1 0 rmoveto} if		%%extra space between words  <--CHANGE AS NEEDED
  } forall
} def
/s {boxshow} bind def			%%alias for show or boxshow
$XR 1 scale				%%to expand or condense font
0 $PH moveto
"						>>"$X.ps"			##postscript preamble
  ##each source line becomes:  n(TEXT)s   with ( ) \ backslash-escaped for the PostScript string syntax
  sed 's|[()\]|\\&|g; s|.*|n(&)s|' <"$SL"	>>"$X.ps"			##postscript text-lines, in latin1
  echo $'showpage \x0C\x04'			>>"$X.ps"			##postscript ending
  eDo gs -sDEVICE=tiffg4 "-g${IW}x$IH" "-r$DPI" -q -o "$X.tif" "$X.ps"  >"$X.box.latin1"	##==use gs to rasterize (for msgs, remove -q and stdout redirection)
  cvt-latin1-to-utf8  "$X.box.latin1"	"$X.box"				##boxfile from latin1 to utf8
  ##eDo tesseract $X.tif $X -l $LNG batch.nochop makebox; mv $X.txt $X.tessbox	##reads X.tif, writes X.tessbox;  DEBUG: for X.box to X.tessbox comparison
  eDo tesseract "$X.tif" junk nobatch box.train					##reads X.tif and X.box, writes X.tr
done
## ---- Train from all *.tr / *.box files in the current directory, then package and install ----
##	reads the globals P (output prefix) and LNG (language code) set earlier in the script;
##	the install steps use sudo to copy into /usr/share/tessdata;
##	filename expansions are quoted; printf replaces "echo -e" for the two word lists (same bytes written).
echo "==mftraining";   mftraining		*.tr	##writes inttemp (the shape prototypes) and pffmtable (the features for each character) and Microfeat (not used)
echo "==cntraining";   cntraining		*.tr	##writes normproto (the character normalization sensitivity prototypes)
echo "==unicharset";   unicharset_extractor	*.box	##writes unicharset (the isdigit,isupper,islower,isalpha Properties for each char, encoded in a bitfield)
printf 'ENGLISH\nGERMAN\n' >tmpf			##doc says one-word-per-line  (space-separated also works)
printf 'harrow\neggen\n'   >tmpw
echo "==freq-dawg";  wordlist2dawg tmpf "$P.freq-dawg"	##freq-dawg  mayNOT be empty;  DAWG == Directed Acyclic Word Graph
echo "==word-dawg";  wordlist2dawg tmpw "$P.word-dawg"	##word-dawg  mayNOT be empty
: >"$P.user-words"					##user-words may be empty
cp -u "/usr/share/tessdata/$LNG.DangAmbigs" "$P.DangAmbigs"	##DangAmbigs file specifying that r+n resembles m, etc;  copy from /usr/share/tessdata/$LNG.DangAmbigs
for F in inttemp normproto pffmtable unicharset;do mv -f "$F" "$P.$F"; done	##rename to tesseract naming-convention
rm -f -- tmp* *~ Microfeat junk*						##cleanup the junk ("--" guards against names starting with "-")
Z=tesseract-2.04.$P.zip; rm -f "$Z"; eDo zip -j "$Z" "$P".{DangAmbigs,freq-dawg,inttemp,normproto,pffmtable,unicharset,user-words,word-dawg}	##make the operational zipfile
Z=boxtiff-2.04.$P.zip;   rm -f "$Z"; eDo zip -j "$Z" "$P".{SOURCEFILE,*{ps,tif,box}}								##make the source zipfile
D=/usr/share/tessdata; eDo sudo cp -vf "$P".{inttemp,normproto,pffmtable,unicharset}  "$D"  			##====INSTALL unconditionally====
for F in "$P".{DangAmbigs,freq-dawg,user-words,word-dawg};do [ -e "$D/$F" ] || eDo sudo cp -v "$F" "$D"; done   	##====INSTALL conditionally====

## Reference word list for the tests below: the utf8 sourcefile with every space turned into a newline.
## WC is taken from the output of wc via the external "arr" helper (see PREREQ); the test report labels it as a word count.
## Output redirection comes first so the tmpWbW file is (re)created even when the sourcefile cannot be opened.
tr ' ' '\n' >"tmpWbW-$SU" <"$SU";  WC=$(wc "tmpWbW-$SU" |arr 1)
## worddiff FILE -- word-by-word comparison of FILE against the sourcefile:
##	writes FILE with every space turned into a newline to tmpWbW-FILE (in the current directory),
##	then prints a zero-context unified diff of tmpWbW-$SU (must already exist) against tmpWbW-FILE;
##	diff options: -b ignore whitespace-amount changes, -B ignore blank-line changes, -s report identical files;
##	reads the global SU;  returns the exit status of diff.
worddiff () {
  tr ' ' '\n' >"tmpWbW-$1" <"$1"	##output first: the tmpWbW file is recreated even if FILE is unreadable
  diff -bBs -U0 "tmpWbW-$SU" "tmpWbW-$1"
}

## ---- Testing: OCR each training image with the language $LNG and with the newly trained $P, and count word differences ----
##	reads the globals P, LNG, WC and the tmpWbW-$SU reference list prepared earlier in the script;
##	writes tmp.txt (OCR output) and tmp.FONT.LANG.diff (word diff with its first two lines removed) in the current directory;
##	filename and language expansions are quoted so unusual names are passed intact.
for FONT in "$@";do FONTB=${FONT%%.*}; X=$P.$FONT	##==Testing, for each font on cmdline...
  for L in "$LNG" "$P";do				##for language $LNG and $P...
    tesseract "$X.tif" tmp -l "$L"			##OCR X.tif in language:$L to tmp.txt
    worddiff tmp.txt |sed '1,2d'  >"tmp.$FONT.$L.diff"	##form word-by-word-diff file
    WM=$(grep '^-' "tmp.$FONT.$L.diff" |wc |arr 1)	##count deletions
    WP=$(grep '^+' "tmp.$FONT.$L.diff" |wc |arr 1)	##count additions
    echo "==testing on $X.tif in language:$L: $WC words, $WM($WP) changes"
  done
  ##break  						##YANKED for full testing (sometimes one testcase is enough)
done
##for FONT in "$@";do X=$P.$FONT; diff -s -U0 $BKX/$X.box $X.box; done |less	##DEBUG-Comparing boxfiles to previous

exit							##stop here: the text that follows is documentation, not shell code
===========
CHANGE-LOG:
===========
2009-08-03:
Pagesize:
   PW=160; PH=105	##at 600dpi this is roughly the biggest that can be viewed in its entirety at 100% on a 1400-pixel-wide monitor (for what that's worth);
Producing boxfile from postscript:
   note: running tesseract-makebox then my cvt-tesseract-box-to-source would be close, but potentially futile...
   BETTER to ignore makebox-output, instead use postscript code to output the boxes;
   SEE print and "=" builtins, and /usr/share/ghostscript/8.60/lib/ps2ascii.ps;  and cmshow function in my BizCard.eps;
Cmdline Parameters:
   separate DPI and FONTSIZE-in-points sounded good, but ends up being troublesome:
   when SOURCEFILE is made with long lines (that just barely fit onto letter-sized page at 10-points) then cannot use that sourcefile at larger pointsize;
   eg: would be natural to use DPI:72 FONTSIZE:14 for 14-pixel screen-fonts  <--easily done by increasing PW PH vars;
   the nice solution: script figures out what pagesize (PW, PH) is required?
File Sizes:
   .tr files ~4000KB each,  box/box.latin1 ~100KB,  tif files ~45KB, ps files ~8KB,  all 8 "operational" files together ~700KB (~350KB zip-compressed);
   source-zipfile of everything 16MB;
	omit .tr files, box.latin1 (don't need both kinds);  source-zipfile now only 2MB;
   operational-zipfile 0.35MB;
Makefile enhancement:
   CONSIDER: source-zipfile having only PRF.SOURCEFILE and a Makefile, that invokes this script??
   --move zipfile-making into Makefile - a dir using mixture of from-images and from-source, like my PDE dir, may have "export" needs neither script can know...
   --may separate the "install" as "make install"
Naming:
   encoding-related names, currently a mixture of "latin1" and "cp1252", could be improved by simplifying?


2010-01-03:
Morten Langlo from Denmark <ml4711@gmail.com> suggests the following changes:

(1) re-encode Postscript font to "ISOLatin1 encoding vector" to get all characters printed correctly, by
replacing:
	/$FONTB	findfont $FSZ scalefont setfont		%%font and font-size
with:
	/${FONTB}Latin
	  << /$FONTB findfont {} forall >>
	  begin
	    /Encoding ISOLatin1Encoding 256 array copy def currentdict
	  end
	  definefont pop
	/${FONTB}Latin $FSZ selectfont			%%font and font-size

(2) to avoid characters having overlapping boxes, in the Postscript boxshow routine, after "charstr show", add the following:
	currentpoint exch 2 add exch moveto		%%increase distance between characters; amount arbitrary but 2 is a good value
	--ER: did that although I decided on: 1 0 rmoveto;

(3) that it not copy any of freq-dawg and word-dawg, if they are found in the tessdata directory;
	--ER: now doing conditional-copy of DangAmbigs, user-words, freq-dawg, word-dawg;  the things produced reluctantly only because required by tesseract:-)
	==ER: tesseract-v3 training doc says the dawg-files can be omitted;  possibly true for v2 also??


2010-01-26:  ER:
Morten's (1) re-encoding in the sort of postscript I know:
	/$FONTB-latin1
	  /$FONTB findfont dup length dict begin
	    {1 index /FID ne {def} {pop pop} ifelse} forall
	    /Encoding ISOLatin1Encoding def
	  currentdict end  definefont pop
	/$FONTB-latin1 findfont $FSZ scalefont setfont	%%font and font-size
--also tried to simplify to the following, but it gets "invalid font" msg from ghostscript;  seems the FID entry added by definefont is needed??
	/$FONTB findfont  dup length dict begin
	    {1 index /FID ne {def} {pop pop} ifelse} forall
	    /Encoding ISOLatin1Encoding def
	    currentdict end  $FSZ scalefont setfont	%%font and font-size

for a cp1252-encoding instead of isolatin1,  see cp1252.ps from http://www.gnuplot.info/scripts/index.html;  or http://www.tug.org/fontname/8r.enc;
note that such re-encoding isn't needed if you're using a (TrueType) font that comes with the Windows-encoding, but will be harmless even in such cases;
BTW, fonts in Postscript are a mess: most come with the misleadingly named StandardEncoding which bears no resemblance to any widely used standard;
Adobe also offers its misleadingly named ISOLatin1Encoding which differs from the one defined by the ISO for several characters -- however it is at least close;
the correct iso8859-1 encoding is easily implemented, but going to cp1252 is considerably better also giving you the Euro, etc;
note that most modern fonts have all the named chars (or rather glyphs) needed for a cp1252-encoding;

Without the re-encoding one could use fonts in the cp1252 (aka Windows-Latin1 aka Windows-ANSI) encoding;  TrueType fonts typically come that way;
	others can be converted to such using fontforge??  see also fontinst for TeX;

To investigate what encoding a font is in, and what named glyphs it has, see:
	CharStrings in the font;  the ghostscript operator:  <name> .namestring <string>  (returns string for a name) may also be useful;
	--fontforge makes it easy;

Another recipe for re-encoding a font, from http://www.home.no/mlinux/comment.htm is:
	/Helvetica findfont dup length dict copy begin /Encoding ISOLatin1Encoding def /Helvetica-Lat1 currentdict definefont pop end

Getting UTF8 to work in Postscript: apparently no easy way, but see "Unicode" and/or "UTF-16" in:  http://www.ghostscript.com/doc/current/Language.htm
	see also glyphshow;  worth a look: utf2ps, Cedilla, texttops (filter in CUPS), Pango;

Also of interest is: http://www.anastigmatix.net/postscript/Hyphenate.html
	worth a look is the installation of "Resources" into ghostscript's resource directory: /usr/share/ghostscript/8.60/Resource/{CMap,ColorSpace,Decoding,Encoding}/
	apparently that page also provides some useful encodings albeit in an unreadable form; and alas much that I don't understand;

2010-01-26: added LC_CTYPE=C, so in utf8 or other multi-byte locale sed, grep, wc, etc cmds will still work on one-byte chars;
	considered LC_CTYPE=en_US.CP1252 rather than LC_CTYPE=C, in case it matters to gs font+encoding selection, but found that it doesn't;
	also simplified the testing step, to do the comparison in utf8;  am now catering to people using utf8;
	added extra space between chars, as in Morten's (2);
	tested using TrueType fonts that don't need re-encoding;

2010-01-31:
	fixed X-coords being wrong on condensed/expanded fonts, by using DPI*XR for X coords;  (nicer fix possible using makefont?)
	--now getting reasonable results:
	--the mistakes:  Infinitive-->lnfinitive;  additional 100 in the-->additional100inthe;
	extra space between words solves run-together-words problem, although it won't do anything to improve the training wrt space-detection on other images;
	added re-encoding as in Morten's (1) but with an extended cp1252encoding vector;
	using vector from www.gnuplot.info/scripts/index.html, except:  quoteright->quotesingle;  minus->hyphen;  quoteleft->grave;  0x81->minus;

2010-02-02:  ENCODING-VECTOR:
	took a closer look at the "8r" encoding from the TeX-site (www.tug.org/fontname/8r.enc), thinking it would be a useful encoding to be compatible with;
	my new encoding, as close to the TeX one as possible while remaining true to ASCII, iso8859-1, and cp1252, covers exactly the same glyphset as the gnuplot one;
	Corrections it needed (the TeX people appear to have used a cp1252 definition that was erroneous for Zcaron, zcaron):
	   revised quoteright->quotesingle; quoteleft->grave;  also freeing up 0x1E, 0x1F;
	   revised 0x91->quoteleft; 0x92->quoteright; 0xA0->space (were .notdef);
	   swapped radical<->Zcaron (0x8E is Zcaron in cp1252);
	   swapped lozenge<->zcaron (0x9E is zcaron in cp1252);
	NOTE: 0x00..0x1f are considerably rearranged wrt my previous cp1252-from-gnuplot-corrected-by-ER, but all told it covers exactly the same glyphs;
	NOTE differences wrt cp1252 from KODERS.COM:  verticalbar<->bar;  euro<->Euro;  overscore<->macron;
	   in http://www.adobe.com/devnet/opentype/archives/glyphlist.txt,  bar, verticalbar both map to 007C;  Euro, euro to 20AC;  macron, overscore to 00AF;
	   there & elsewhere they appear to be synonyms, ghostscript regards bar as the preferred name, found nothing on the others;
	ADDED Germandbls;  it ought to be 0xDF, moving germandbls to 0xFF, but that would violate the standards;  Scedilla has been suggested as a substitute;
	NOTE: now using 250 of the 256 slots, providing 248 distinct glyphs, duplicates for space & hyphen unavoidable due to definitions of iso8859-1 and cp1252;
	NOTE: avoiding 0x00, 0x0A, 0x0D, still allows another 3 glyphs to be added in positions 0x1E, 0x1F, 0x7F;  avoiding 0x00 most essential, 0x0D least?
	SEE ALSO: http://ereimer.net/programs/cp1252-er.enc