#!/bin/bash
## 2009-06: tesseract-training steps as per http://code.google.com/p/tesseract-ocr/wiki/TrainingTesseract;
## is run in directory containing your training images as TIFFs;
##
## PREREQ: cvt-utf8-to-latin1 cvt-latin1-to-utf8 -- from http://ereimer.net/programs/general-purpose-scripts.htm
##   also invoked below: cvt-tesseract-box-to-source, cvt-tesseract-source-to-box, cvt-tesseract-box-fixups
##   (presumably from the same collection -- confirm), plus the tesseract training tools
##   tesseract, mftraining, cntraining, unicharset_extractor, wordlist2dawg.
##
## OVERVIEW (each step is visible in the code below):
##   1. copy the current directory to /tmp/<dirname>-<YYYYMMDD> as a backup;
##   2. per TIFF: obtain X.box (from box-ORIG-FROM-???, from box-EDITED.latin1, or from tesseract makebox);
##   3. pause so the box files (or their SOURCEFILE form) can be edited by hand;
##   4. per TIFF: run tesseract box.train, then mftraining / cntraining / unicharset_extractor / wordlist2dawg;
##   5. rename the outputs to <prefix>.<name>, preserve the edited files, and install with sudo into /usr/share/tessdata.
##
## Copyright © 2009 Eugene Reimer; can be used, modified, copied, and distributed or sold under the terms of either the LGPL or the GPL (your choice);
## see http://www.gnu.org/licenses for the details of these terms.

## eDo CMD [ARG...] -- echo the command line prefixed with "==", then execute it;
## the exit status is that of the executed command.
eDo () { echo "==$*"; "$@"; }

## The training images; collected once so every pass below sees the same list.
## Without nullglob an unmatched pattern stays literal, hence the -e test on the first element.
tifs=(*tif)
if [[ ! -e ${tifs[0]} ]]; then
	echo "${0##*/}: no TIFF images (*tif) in $(pwd) -- nothing to train on" >&2
	exit 1
fi

## make backup copy of curdir (while debugging)
here=$(pwd)
backup=/tmp/${here##*/}-$(date +%Y%m%d)
rm -fR -- "${backup:?}"			##the :? guard aborts rather than running rm on an empty path
cp -pR . "$backup"

LNG=deu					##language-code to use in makebox step  <--CHANGE AS NEEDED

##==pass 1: obtain a box file for each image
for F in "${tifs[@]}"; do
	X=${F%.*}
	orig=("$X".box-ORIG-FROM-???)	##tif+box pair taken from another language's training pkg, if any
	if [[ -e ${orig[0]} ]]; then
		##avoid redoing a tif+box pair from another language's training pkg;
		##copy, skipping the latin1 version (twould cause confusion since all box.latin1 files are to be revised)
		if (( ${#orig[@]} > 1 )); then
			echo "==WARNING: several $X.box-ORIG-FROM-??? files found; using ${orig[0]}" >&2
		fi
		cp -v -- "${orig[0]}" "$X.box"
	elif [[ -e $X.box-EDITED.latin1 ]]; then
		##have box-EDITED -- a manually revised copy that's a keeper...
		if [[ -e $X.box-SOURCEFILE-EDITED.latin1 && $X.box-SOURCEFILE-EDITED.latin1 -nt $X.box-EDITED.latin1 ]]; then
			##also SOURCEFILE-EDITED and it's newer: use box.SOURCEFILE-EDITED to revise box-EDITED
			eDo cvt-tesseract-source-to-box "$X.box-EDITED.latin1"
		fi
		##convert box-EDITED to utf8 (removing X.box.latin1 not needed, see renames at end)
		eDo cvt-latin1-to-utf8 "$X.box-EDITED.latin1" "$X.box"
	else
		eDo tesseract "$F" "$X" -l "$LNG" batch.nochop makebox	##reads X.tif, writes X.txt
		mv -- "$X.txt" "$X.box"					##rename to X.box
		cvt-utf8-to-latin1 "$X.box" "$X.box.latin1"		##convert to latin1 (X.box must be in utf8, but I want to edit it in latin1-form)
		cvt-tesseract-box-fixups "$X.box.latin1"		##fixups for faulty yHI etc
	fi
done

##==pass 2: convert boxfile to SOURCEFILE
for F in "${tifs[@]}"; do
	X=${F%.*}
	if [[ -e $X.box.latin1 ]]; then
		eDo cvt-tesseract-box-to-source "$X.box.latin1"
	fi
done

##====----EDITING DONE HERE----====
echo -n "==now edit ONE from each (X.box-SOURCEFILE.latin1, X.box.latin1) pair -- press any key when done"
read -r

##==pass 3: fold the edits back into X.box and produce the training data
for F in "${tifs[@]}"; do
	X=${F%.*}
	if [[ -e $X.box.latin1 && $X.box-SOURCEFILE.latin1 -nt $X.box.latin1 ]]; then
		eDo cvt-tesseract-source-to-box "$X.box.latin1"		##use SOURCEFILE if newer
	fi
	if [[ -e $X.box.latin1 ]]; then
		eDo cvt-latin1-to-utf8 "$X.box.latin1" "$X.box"		##convert edited boxfile back to utf8
	fi
	eDo tesseract "$F" junk nobatch box.train			##reads X.tif and X.box, writes X.tr
done

##==Prefix for the renaming of output files: the last TIFF's name up to its first dot
##  (F still holds the last element of the list here)
P=${F%%.*}

echo "==mftraining"; mftraining *.tr		##writes inttemp (the shape prototypes) and pffmtable (the features for each character) and Microfeat (not used)
echo "==cntraining"; cntraining *.tr		##writes normproto (the character normalization sensitivity prototypes)
echo "==unicharset"; unicharset_extractor *.box	##writes unicharset (the isdigit,isupper,islower,isalpha Properties for each char, encoded in a bitfield)

##==make a frequently-used-words list, of the English words used in the headings;  extract-tesseract...
##  remove punctuation + uniquify + cvt-to-utf8;
printf '%s\n' ENGLISH GERMAN >tmpf		##doc says one-word-per-line (although space-separated also works)
printf '%s\n' harrow eggen >tmpw
echo "==freq-dawg"; wordlist2dawg tmpf "$P.freq-dawg"	##freq-dawg mayNOT be empty;  DAWG == Directed Acyclic Word Graph
echo "==word-dawg"; wordlist2dawg tmpw "$P.word-dawg"	##word-dawg mayNOT be empty
: >"$P.user-words"					##user-words may be empty
##DangAmbigs file specifying that r+n resembles m, etc; copy from /usr/share/tessdata/$LNG.DangAmbigs
cp -u -- "/usr/share/tessdata/$LNG.DangAmbigs" "$P.DangAmbigs"

##rename to tesseract naming-convention
for part in inttemp normproto pffmtable unicharset; do
	mv -f -- "$part" "$P.$part"
done
rm -f -- tmp* *~ Microfeat junk*			##cleanup the junk

##==renames to avoid losing edits: {box,box-SOURCEFILE}.latin1-->{box-EDITED,box-SOURCEFILE-EDITED}.latin1
for F in "${tifs[@]}"; do
	X=${F%.*}
	##drop the SOURCEFILE form when the box form is the newer of the pair
	if [[ -e $X.box.latin1 && $X.box-SOURCEFILE.latin1 -ot $X.box.latin1 ]]; then
		rm -f -- "$X.box-SOURCEFILE.latin1"
	fi
	if [[ -e $X.box-SOURCEFILE.latin1 ]]; then
		mv -v -- "$X.box-SOURCEFILE.latin1" "$X.box-SOURCEFILE-EDITED.latin1"
	fi
	if [[ -e $X.box.latin1 ]]; then
		mv -v -- "$X.box.latin1" "$X.box-EDITED.latin1"
	fi
done	##could simplify the naming-conventions, so these renames not needed??

##====----Install----====
sudo cp -vf -- "$P".{DangAmbigs,freq-dawg,inttemp,normproto,pffmtable,unicharset,user-words,word-dawg} /usr/share/tessdata
exit

## HINT: Converting PBM to TIFF:	pamtotiff -g4 X.pbm >X.tif
## HINT: Making multi-page-TIFF:	tiffcp -c g4 X*tif X-CAT.tif	(my img2pdf also uses -t for tiles vs strips as recommended by tiff2pdf for large images)
## HINT: when msgs voluminous use:	tesseract-training-from-images 2>&1 |less
##
## ===========
## CHANGE-LOG:
## ===========
## 2009-06: Substantial differences on the DEU Arial image, although running in DEU mode, is surprising and disappointing--!!--
##	the redone version lacks PAGENBRS, has many COORD differences;  when comparing only on CHAR in 1st column, then:
##	many differences involving various kinds of quote chars:  » --> >>  and  '' --> "  and Microsoft-quotes to ANSI-quotes;
##	also misdiagnosed  1 l I !;   Z-->z;   Ä-->A;
##	most surprising differences:  ... --> ~;   « --> er;
##	note: stripping off PAGENBRS made no difference;  although it is needed for tesseractTrainer.py to work--!!--
##	--FIXED, by not redoing such cases:  if box-ORIG-FROM-??? exists then copy it rather than running tesseract...makebox;
##
## errormsgs from cmdline  tesseract pde.img0005.tif junk nobatch box.train:
##	APPLY_BOXES: boxfile 10/2/. ((2676,5577),(2694,5593)): FAILURE! box overlaps no blobs or blobs in multiple rows
##	APPLY_BOXES: Unlabelled word blk:1 row:2 allrows:2
##	...
##	APPLY_BOXES: Unlabelled word blk:1 row:37 allrows:37
##	APPLY_BOXES: FATALITY - 0 labelled samples of "." - target is 5:
##	--NOTE: for manually revised box-file, now treat box-EDITED much like box-ORIG-FROM-??? (am using for img0005 boxfile);
##	--only troublesome cp1252-char was one hex 92 -- have changed it to stroph;
##	--it's complaining about the line:  . 2676 5577 2694 5593  <--UNCHANGED from the way the earlier tesseract step wrote it!!  And ALL other "." chars!!
##	--Asked for help, from tesseract-ocr@googlegroups.com 2009-06-22 23:18;  Response by Ray Smith via updates to TrainingTesseract page;
##	--FOUND workaround for "box overlaps no blobs or blobs in multiple rows" msg:  revise yHI on each "." to be like its same-row neighbours!!
##	  that also gets rid of FATAL msg, altho not the "Unlabelled word" msgs -- APPEARS TO WORK==!!==
##	CONSIDER: revising yHI on all chars to the maximum occurring in that row;  might that also solve Z<->z etc errors--??--
##	--2009-07-18: DONE (sort of); see cvt-tesseract-box-fixups;
##
## Note: wordlist2dawg with empty input never ends;  whereas with non-empty list, it works, and isn't nearly as slow as I'd thought;
## Note: Michael Reimer reports (in google-group) better results with an empty DangAmbigs file!!  SEE ALSO HIS http://www.cs.toronto.edu/~mreimer/tesseract.html
##
## 2009-07-20: ran tesseract-training-from-images on all my PDE training pages;  (DETAILED NOTES are in /pix/pkg/tesseract-Box-TrainingPages-pde/00-README-ER)
##	installed a BOX-VIEWER tesseractTrainer.py (see view-tesseract-boxes);
##	discovered bug in tesseract-makebox;  --ISSUE 223 (http://code.google.com/p/tesseract-ocr/issues/detail?id=223);  illustrated using pde.img0015.tif;
##	2009-08-02: no sign of Ray Smith having looked at it, nor any issue since approx 2009-06-19??
##	considered adding 0005 as 2nd example, one having the problem only in some parts??