#!/bin/bash
## 2009-08-03: tesseract-training steps as per http://code.google.com/p/tesseract-ocr/wiki/TrainingTesseract;
## receives your sourcefile and a list of fontnames;
## uses ghostscript to create rasterized images, then the rest is much like my tesseract-training-from-images script, except for fully-automated corrections;
##
## PREREQ:  arr  cvt-utf8-to-latin1  cvt-latin1-to-utf8  -- from http://ereimer.net/programs/general-purpose-scripts.htm
##
## USAGE EXAMPLES:
##   tesseract-training-from-source deu.SOURCEFILE 600 10 Arial Times Geneva      --make training pages at 600dpi in 10-point Arial, Times, and Geneva;
##   tesseract-training-from-source deu.SOURCEFILE 600 10 Arial Times Geneva.0.9  --same as preceding, except the Geneva font is Condensed by a 0.9 ratio;
##   tesseract-training-from-source deu.SOURCEFILE 101 10 Arial Times Geneva      --make training pages in 14-pixel high "screen fonts";
##
## --the examples above will create the images:  deu.Arial.tif  deu.Times.tif  deu.Geneva.tif (or deu.Geneva.0.9.tif);
## --the sourcefile is plain-text, normally in utf8 (but in latin1 if filename ends in ".latin1");
## --pagesize doesn't have cmdline parameters, but is easily changed in the source (below);
## --this script can be modified to use an encoding other than cp1252, fairly easily provided it's another one-byte-per-character encoding;
## --to use a mixture of "real" and "synthetic" training-images, you can run this script after running my tesseract-training-from-images script;
##
## OVERVIEW OF STEPS (all files are written to the current directory):
##   1. for each font: write PREFIX.FONT.ps, rasterize it with gs into PREFIX.FONT.tif; the postscript itself prints one
##      bounding-box line per non-space character to stdout, which is captured as PREFIX.FONT.box.latin1 and converted to PREFIX.FONT.box;
##   2. run tesseract in box.train mode on each image, then mftraining, cntraining, unicharset_extractor, wordlist2dawg;
##   3. zip the results, and install them into /usr/share/tessdata using sudo;
##   4. OCR each generated image with language $LNG and with the new language PREFIX, reporting word-by-word differences from the sourcefile;
##
## Copyright © 2009,2010 Eugene Reimer; can be used, modified, copied, and distributed or sold under the terms of either the LGPL or the GPL (your choice);
## see http://www.gnu.org/licenses for the details of these terms.

export LC_CTYPE=C		##sed,grep,wc need single-byte chars because gs does; tesseract ignores LC_CTYPE
				##exported, since a plain assignment reaches child processes only if the variable was already in the environment;
				##NOTE: an LC_ALL setting in the caller's environment would still take precedence over this

eDo () {			##function to echo (on stderr) then execute args
	printf '==%s\n' "$*" >&2
	"$@"
}
die () {			##print message on stderr and give up
	printf '%s: %s\n' "${0##*/}" "$*" >&2
	exit 1
}
usage () {
	printf 'Usage: %s SOURCEFILE DPI FONTSIZE FONT[.XRATIO]...\n' "${0##*/}" >&2
	exit 2
}

X=$(pwd);BKX=/tmp/${X##*/}-$(date +%Y%m%d);	##rm -fR $BKX;cp -pR . $BKX	##make backup copy of curdir (while debugging)

(( $# >= 4 )) || usage				##need filename, dpi, fontsize, and at least one font
SU=$1; DPI=$2; FSZ=$3; shift 3			##get cmdline params: filename, dpi, fontsize; remaining params are the fonts
[[ $DPI =~ ^[1-9][0-9]*$ ]] || die "DPI must be a positive integer, got: $DPI"	##DPI is used in shell integer arithmetic below
SU=${SU%.latin1}; SL=$SU.latin1			##SU is the utf8 sourcefile; SL is the latin1 sourcefile
[[ -e $SU || -e $SL ]] || die "sourcefile not found: $SU (or $SL)"
[[ $SU -nt $SL ]] && cvt-utf8-to-latin1 "$SU" "$SL"	##convert input to latin1 if needed
[[ $SL -nt $SU ]] && cvt-latin1-to-utf8 "$SL" "$SU"	##convert input to utf8   if needed

LNG=deu						##language-code to use in makebox step		<--CHANGE AS NEEDED
PW=612; PH=792					##image width & height in points		<--CHANGE AS NEEDED
((IW=PW*DPI/72,IH=PH*DPI/72))			##image width & height in pixels, derived from preceding
P=${SU%%.*}					##Prefix for the naming of output files

for FONT in "$@";do				##for each font on cmdline...
	FONTB=${FONT%%.*}			##font name proper (anything after the first dot is the x-ratio)
	XR=1;[[ $FONT == *.* ]] && XR=${FONT#*.}	##horizontal scale ratio; 1 unless given as FONT.RATIO
	X=$P.$FONT
	echo '%!PS-Adobe-2.0' >"$X.ps"		##postscript identifier
	##postscript preamble; note that every %%-comment inside it must be ended by a newline
	echo "%%BoundingBox: 0 0 $PW $PH
/cp1252-er-encoding [
/.notdef/dotaccent/fi/fl/fraction/hungarumlaut/Lslash/lslash/ogonek/ring/.notdef/breve/minus/.notdef/radical/lozenge
/caron/dotlessi/dotlessj/ff/ffi/ffl/notequal/infinity/lessequal/greaterequal/partialdiff/summation/product/pi/.notdef/.notdef
/space/exclam/quotedbl/numbersign/dollar/percent/ampersand/quotesingle/parenleft/parenright/asterisk/plus/comma/hyphen/period/slash
/zero/one/two/three/four/five/six/seven/eight/nine/colon/semicolon/less/equal/greater/question
/at/A/B/C/D/E/F/G/H/I/J/K/L/M/N/O
/P/Q/R/S/T/U/V/W/X/Y/Z/bracketleft/backslash/bracketright/asciicircum/underscore
/grave/a/b/c/d/e/f/g/h/i/j/k/l/m/n/o
/p/q/r/s/t/u/v/w/x/y/z/braceleft/bar/braceright/asciitilde/.notdef
/Euro/integral/quotesinglbase/florin/quotedblbase/ellipsis/dagger/daggerdbl/circumflex/perthousand/Scaron/guilsinglleft/OE/Omega/Zcaron/approxequal
/Germandbls/quoteleft/quoteright/quotedblleft/quotedblright/bullet/endash/emdash/tilde/trademark/scaron/guilsinglright/oe/Delta/zcaron/Ydieresis
/space/exclamdown/cent/sterling/currency/yen/brokenbar/section/dieresis/copyright/ordfeminine/guillemotleft/logicalnot/hyphen/registered/macron
/degree/plusminus/twosuperior/threesuperior/acute/mu/paragraph/periodcentered/cedilla/onesuperior/ordmasculine/guillemotright/onequarter/onehalf/threequarters/questiondown
/Agrave/Aacute/Acircumflex/Atilde/Adieresis/Aring/AE/Ccedilla/Egrave/Eacute/Ecircumflex/Edieresis/Igrave/Iacute/Icircumflex/Idieresis
/Eth/Ntilde/Ograve/Oacute/Ocircumflex/Otilde/Odieresis/multiply/Oslash/Ugrave/Uacute/Ucircumflex/Udieresis/Yacute/Thorn/germandbls
/agrave/aacute/acircumflex/atilde/adieresis/aring/ae/ccedilla/egrave/eacute/ecircumflex/edieresis/igrave/iacute/icircumflex/idieresis
/eth/ntilde/ograve/oacute/ocircumflex/otilde/odieresis/divide/oslash/ugrave/uacute/ucircumflex/udieresis/yacute/thorn/ydieresis
] def  %%an extended cp1252-encoding; see notes below under CHANGE-LOG
/$FONTB-latin1 /$FONTB findfont dup length dict begin
  {1 index /FID ne {def} {pop pop} ifelse} forall
  /Encoding cp1252-er-encoding def  %%was ISOLatin1Encoding
  currentdict end definefont pop
/$FONTB-latin1 $FSZ selectfont  %%font and font-size, with re-encoded font
/yy $FSZ 1.25 mul def  %%line-spacing 1.25 times fontheight
/x0 9 def  %%left-margin
/curx {currentpoint pop} bind def
/cury {currentpoint exch pop} bind def
/n {x0 cury yy sub moveto} bind def  %%newline function
/p {print} bind def  %%alias for print
/P {20 string cvs p ( )p} bind def  %%print (any) with trailing space
/strbbox {  %%str => llx lly urx ury   (where ll==lower-left; ur==upper-right)
  gsave newpath 0 0 moveto false charpath flattenpath pathbbox grestore
} def
/boxp {  %%produce bbox line;  str => -
  /s1 exch def
  s1 strbbox /ury exch def /urx exch def /lly exch def /llx exch def  %%get bbox of the one-char str
  /llx llx curx add $DPI mul $XR mul 72 div round cvi def  %%convert to pixels
  /urx urx curx add $DPI mul $XR mul 72 div round cvi def  %%convert to pixels
  /lly lly cury add $DPI mul 72 div round cvi def  %%convert to pixels
  /ury ury cury add $DPI mul 72 div round cvi def  %%convert to pixels
  s1 P llx P lly P urx P ury P (\n)p  %%print bbox line; eg:e 76 6495 114 6540
} def
/boxshow {  %%show-variant that loops thru chars...
  /str exch def
  str {
    /char exch def
    /charstr ( ) dup 0 char put def
    char 32 ne {charstr boxp} if  %%produce bbox line, except if char is space
    charstr show  %%display char normally
    1 0 rmoveto  %%extra space between chars <--CHANGE AS NEEDED
    char 32 eq {1 0 rmoveto} if  %%extra space between words <--CHANGE AS NEEDED
  } forall
} def
/s {boxshow} bind def  %%alias for show or boxshow
$XR 1 scale  %%to expand or condense font
0 $PH moveto
" >>"$X.ps"					##postscript preamble
	sed 's|[()\]|\\&|g; s|.*|n(&)s|' <"$SL" >>"$X.ps"	##postscript text-lines, in latin1: escape ( ) \ then wrap each line as: n(...)s
	echo $'showpage \x0C\x04' >>"$X.ps"	##postscript ending
	##==use gs to rasterize (for msgs, remove -q and stdout redirection); its stdout carries the bbox lines printed by the postscript code
	eDo gs -sDEVICE=tiffg4 -g"${IW}x$IH" -r"$DPI" -q -o "$X.tif" "$X.ps" >"$X.box.latin1"
	cvt-latin1-to-utf8 "$X.box.latin1" "$X.box"
	##eDo tesseract $X.tif $X -l $LNG batch.nochop makebox; mv $X.txt $X.tessbox	##reads X.tif, writes X.tessbox; DEBUG: for X.box to X.tessbox comparison
	eDo tesseract "$X.tif" junk nobatch box.train	##reads X.tif and X.box, writes X.tr
done

echo "==mftraining"; mftraining *.tr		##writes inttemp (the shape prototypes) and pffmtable (the features for each character) and Microfeat (not used)
echo "==cntraining"; cntraining *.tr		##writes normproto (the character normalization sensitivity prototypes)
echo "==unicharset"; unicharset_extractor *.box	##writes unicharset (the isdigit,isupper,islower,isalpha Properties for each char, encoded in a bitfield)
printf '%s\n' ENGLISH GERMAN >tmpf		##doc says one-word-per-line (space-separated also works)
printf '%s\n' harrow eggen   >tmpw
echo "==freq-dawg"; wordlist2dawg tmpf "$P.freq-dawg"	##freq-dawg mayNOT be empty;  DAWG == Directed Acyclic Word Graph
echo "==word-dawg"; wordlist2dawg tmpw "$P.word-dawg"	##word-dawg mayNOT be empty
: >"$P.user-words"				##user-words may be empty
cp -u "/usr/share/tessdata/$LNG.DangAmbigs" "$P.DangAmbigs"	##DangAmbigs file specifying that r+n resembles m, etc; copy from /usr/share/tessdata/$LNG.DangAmbigs
for F in inttemp normproto pffmtable unicharset;do mv -f "$F" "$P.$F"; done	##rename to tesseract naming-convention
rm -f tmp* *~ Microfeat junk*			##cleanup the junk

Z=tesseract-2.04.$P.zip; rm -f "$Z"		##make the operational zipfile
eDo zip -j "$Z" "$P".{DangAmbigs,freq-dawg,inttemp,normproto,pffmtable,unicharset,user-words,word-dawg}
Z=boxtiff-2.04.$P.zip;   rm -f "$Z"		##make the source zipfile
eDo zip -j "$Z" "$P".{SOURCEFILE,*{ps,tif,box}}

D=/usr/share/tessdata
eDo sudo cp -vf "$P".{inttemp,normproto,pffmtable,unicharset} "$D"	##====INSTALL unconditionally====
for F in "$P".{DangAmbigs,freq-dawg,user-words,word-dawg};do		##====INSTALL conditionally==== (only if not already present in $D)
	[[ -e $D/$F ]] || eDo sudo cp -v "$F" "$D"
done

tr ' ' '\n' <"$SU" >"tmpWbW-$SU"; WC=$(wc "tmpWbW-$SU" |arr 1)		##word-per-line copy of the sourcefile, and its count
worddiff () {					##split file $1 into one word per line, then diff that against the word-per-line sourcefile
	tr ' ' '\n' <"$1" >"tmpWbW-$1"
	diff -bBs -U0 "tmpWbW-$SU" "tmpWbW-$1"
}
for FONT in "$@";do X=$P.$FONT			##==Testing, for each font on cmdline...
	for L in "$LNG" "$P";do			##for language $LNG and $P...
		tesseract "$X.tif" tmp -l "$L"	##OCR X.tif in language:$L to tmp.txt
		worddiff tmp.txt |sed '1,2d' >"tmp.$FONT.$L.diff"	##form word-by-word-diff file (minus the 2 diff header lines)
		WM=$(grep '^-' "tmp.$FONT.$L.diff" |wc |arr 1)		##count deletions
		WP=$(grep '^+' "tmp.$FONT.$L.diff" |wc |arr 1)		##count additions
		echo "==testing on $X.tif in language:$L: $WC words, $WM($WP) changes"
	done
	##break					##YANKED for full testing (sometimes one testcase is enough)
done
##for FONT in "$@";do X=$P.$FONT; diff -s -U0 $BKX/$X.box $X.box; done |less	##DEBUG-Comparing boxfiles to previous
exit

===========
CHANGE-LOG:
===========
2009-08-03:
Pagesize:
	PW=160; PH=105	##at 600dpi this is roughly the biggest that can be viewed in its entirety at 100% on a 1400-pixel-wide monitor (for what that's worth);
Producing boxfile from postscript:
	note: running tesseract-makebox then my cvt-tesseract-box-to-source would be close, but potentially futile...
	BETTER to ignore makebox-output, instead use postscript code to output the boxes;
	SEE print and "=" builtins, and /usr/share/ghostscript/8.60/lib/ps2ascii.ps; and cmshow function in my BizCard.eps;
Cmdline Parameters:
	separate DPI and FONTSIZE-in-points sounded good, but ends up being troublesome:
	when SOURCEFILE is made with long lines (that just barely fit onto letter-sized page at 10-points) then cannot use that sourcefile at larger pointsize;
	eg: would be natural to use DPI:72 FONTSIZE:14 for 14-pixel screen-fonts <--easily done by increasing PW PH vars;
	the nice solution: script figures out what pagesize (PW, PH) is required?
File Sizes:
	.tr files ~4000KB each, box/box.latin1 ~100KB, tif files ~45KB, ps files ~8KB, all 8 "operational" files together ~700KB (~350KB zip-compressed);
	source-zipfile of everything 16MB; omit .tr files, box.latin1 (don't need both kinds); source-zipfile now only 2MB; operational-zipfile 0.35MB;
Makefile enhancement:
	CONSIDER: source-zipfile having only PRF.SOURCEFILE and a Makefile, that invokes this script??
--move zipfile-making into Makefile - a dir using mixture of from-images and from-source, like my PDE dir, may have "export" needs neither script can know... --may separate the "install" as "make install" Naming: encoding-related names, currently a mixture of "latin1" and "cp1252", could be improved by simplifying? 2010-01-03: Morten Langlo from Denmark suggests the following changes: (1) re-encode Postscript font to "ISOLatin1 encoding vector" to get all characters printed correctly, by replacing: /$FONTB findfont $FSZ scalefont setfont %%font and font-size with: /${FONTB}Latin << /$FONTB findfont {} forall >> begin /Encoding ISOLatin1Encoding 256 array copy def currentdict end definefont pop /${FONTB}Latin $FSZ selectfont %%font and font-size (2) to avoid characters having overlapping boxes, in the Postscript boxshow routine, after "charstr show", add the following: currentpoint exch 2 add exch moveto %%increase distance between characters; amount arbitrary but 2 is a good value --ER: did that although I decided on: 1 0 rmoveto; (3) that it not copy any of freq-dawg and word-dawg, if they are found in the tessdata directory; --ER: now doing conditional-copy of DangAmbigs, user-words, freq-dawg, word-dawg; the things produced reluctantly only because required by tesseract:-) ==ER: tesseract-v3 training doc says the dawg-files can be omitted; possibly true for v2 also?? 2010-01-26: ER: Morten's (1) re-encoding in the sort of postscript I know: /$FONTB-latin1 /$FONTB findfont dup length dict begin {1 index /FID ne {def} {pop pop} ifelse} forall /Encoding ISOLatin1Encoding def currentdict end definefont pop /$FONTB-latin1 findfont $FSZ scalefont setfont %%font and font-size --also tried to simplify to the following, but it gets "invalid font" msg from ghostscript; seems the FID entry added by definefont is needed?? 
/$FONTB findfont dup length dict begin {1 index /FID ne {def} {pop pop} ifelse} forall /Encoding ISOLatin1Encoding def currentdict end $FSZ scalefont setfont %%font and font-size for a cp1252-encoding instead of isolatin1, see cp1252.ps from http://www.gnuplot.info/scripts/index.html; or http://www.tug.org/fontname/8r.enc; note that such re-encoding isn't needed if you're using a (TrueType) font that comes with the Windows-encoding, but will be harmless even in such cases; BTW, fonts in Postscript are a mess: most come with the misleadingly named StandardEncoding which bears no resemblance to any widely used standard; Adobe also offers its misleadingly named ISOLatin1Encoding which differs from the one defined by the ISO for several characters -- however it is at least close; the correct iso8859-1 encoding is easily implemented, but going to cp1252 is considerably better also giving you the Euro, etc; note that most modern fonts have all the named chars (or rather glyphs) needed for a cp1252-encoding; Without the re-encoding one could use fonts in the cp1252 (aka Windows-Latin1 aka Windows-ANSI) encoding; TrueType fonts typically come that way; others can be converted to such using fontforge?? 
see also fontinst for TeX; To investigate what encoding a font is in, and what named glyphs it has, see: CharStrings in the font; the ghostscript operator: .namestring (returns string for a name) may also be useful; --fontforge makes it easy; Another recipe for re-encoding a font, from http://www.home.no/mlinux/comment.htm is: /Helvetica findfont dup length dict copy begin /Encoding ISOLatin1Encoding def /Helvetica-Lat1 currentdict definefont pop end Getting UTF8 to work in Postscript: apparently no easy way, but see "Unicode" and/or "UTF-16" in: http://www.ghostscript.com/doc/current/Language.htm see also glyphshow; worth a look: utf2ps, Cedilla, texttops (filter in CUPS), Pango; Also of interest is: http://www.anastigmatix.net/postscript/Hyphenate.html worth a look is the installation of "Resources" into ghostscript's resource directory: /usr/share/ghostscript/8.60/Resource/{CMap,ColorSpace,Decoding,Encoding}/ apparently that page also provides some useful encodings albeit in an unreadable form; and alas much that I don't understand; 2010-01-26: added LC_CTYPE=C, so in utf8 or other multi-byte locale sed, grep, wc, etc cmds will still work on one-byte chars; considered LC_CTYPE=en_US.CP1252 rather than LC_CTYPE=C, in case it matters to gs font+encoding selection, but found that it doesn't; also simplified the testing step, to do the comparison in utf8; am now catering to people using utf8; added extra space between chars, as in Morten's (2); tested using TrueType fonts that don't need re-encoding; 2010-01-31: fixed X-coords being wrong on condensed/expanded fonts, by using DPI*XR for X coords; (nicer fix possible using makefont?) 
--now getting reasonable results: --the mistakes: Infinitive-->lnfinitive; additional 100 in the-->additional100inthe; extra space between words solves run-together-words problem, although it won't do anything to improve the training wrt space-detection on other images; added re-encoding as in Morten's (1) but with an extended cp1252encoding vector; using vector from www.gnuplot.info/scripts/index.html, except: quoteright->quotesingle; minus->hyphen; quoteleft->grave; 0x81->minus; 2010-02-02: ENCODING-VECTOR: took a closer look at the "8r" encoding from the TeX-site (www.tug.org/fontname/8r.enc), thinking it would be a useful encoding to be compatible with; my new encoding, as close to the TeX one as possible while remaining true to ASCII, iso8859-1, and cp1252, covers exactly the same glyphset as the gnuplot one; Corrections it needed (the TeX people appear to have used a cp1252 definition that was erroneous for Zcaron, zcaron): revised quoteright->quotesingle; quoteleft->grave; also freeing up 0x1E, 0x1F; revised 0x91->quoteleft; 0x92->quoteright; 0xA0->space (were .notdef); swapped radical<->Zcaron (0x8E is Zcaron in cp1252); swapped lozenge<->zcaron (0x9E is zcaron in cp1252); NOTE: 0x00..0x1f are considerably rearranged wrt my previous cp1252-from-gnuplot-corrected-by-ER, but all told it covers exactly the same glyphs; NOTE differences wrt cp1252 from KODERS.COM: verticalbar<->bar; euro<->Euro; overscore<->macron; in http://www.adobe.com/devnet/opentype/archives/glyphlist.txt, bar, verticalbar both map to 007C; Euro, euro to 20AC; macron, overscore to 00AF; there & elsewhere they appear to be synonyms, ghostscript regards bar as the preferred name, found nothing on the others; ADDED Germandbls; it ought to be 0xDF, moving germandbls to 0xFF, but that would violate the standards; Scedilla has been suggested as a substitute; NOTE: now using 250 of the 256 slots, providing 248 distinct glyphs, duplicates for space & hyphen unavoidable due to definitions of iso8859-1 
and cp1252; NOTE: avoiding 0x00, 0x0A, 0x0D, still allows another 3 glyphs to be added in positions 0x1E, 0x1F, 0x7F; avoiding 0x00 most essential, 0x0D least? SEE ALSO: http://ereimer.net/programs/cp1252-er.enc