#!/bin/bash
##
## 2009-07-15: to allow my manual fixups (during tesseract-training) to be done on the SOURCEFILE form;
##   this is the inverse of cvt-tesseract-box-to-source;
##
## boxfile has one char per line, with coord info, eg:
##   a 830 3513 849 3536 0   <--char "a" in a box with coords x:830..849 y:3513..3536 on page:0  <-- from multi-page tiff
##   a 830 3513 849 3536     <--char "a" in a box with coords x:830..849 y:3513..3536            <-- from simple tiff
##
## SOURCEFILE notation, as implemented by getSCHR below:
##   - spaces and line-breaks only separate items; they never use up a boxfile-line;
##   - "(AB)" is a parenthesized string that goes into ONE box (for pairs etc);
##   - "()"   is a deletion: the corresponding boxfile-line is written out as an empty-line;
##   - any other character stands for itself and pairs with the next boxfile-line.
##
## USAGE EXAMPLE:  cvt-tesseract-source-to-box pde.img0005.box.latin1
##   --uses pde.img0005.box-SOURCEFILE.latin1 to revise pde.img0005.box.latin1
##   (the previous boxfile is kept as a backup by "mv -b")
##
## EXIT STATUS:  0 on success;  1 if a file cannot be read or written;  2 on a usage error.
##
## Copyright © 2009 Eugene Reimer; can be used, modified, copied, and distributed or sold under the terms of either the LGPL or the GPL (your choice);
## see http://www.gnu.org/licenses for the details of these terms.

## No "set -e" on purpose: a (( )) command returns non-zero whenever its value is 0,
## and main() reports failures through its return status instead.
set -u

## Print a diagnostic, prefixed with the script name, on stderr.
err() {
  printf '%s: %s\n' "${0##*/}" "$*" >&2
}

## Fetch the next SOURCEFILE item.
## Globals:  SSTR (read)  - the inhaled SOURCEFILE text
##           SPOS (r/w)   - in: index at which to start looking;
##                          out: index of the item's last char (of the ")" for a parenthesized item),
##                          so the caller advances with ((++SPOS)) before the next call
##           SCHR (write) - the item: one char, or the text between parens ("" for "()");
##                          also "" once SPOS has run past the end of SSTR
getSCHR() {
  local b e len=${#SSTR}

  ## skip space(s)
  while [[ ${SSTR:SPOS:1} == " " ]]; do
    ((++SPOS))
  done
  SCHR=${SSTR:SPOS:1}

  ## handle string in parens
  if [[ $SCHR == "(" ]]; then
    ((b = SPOS + 1, e = b))
    ## the length bound keeps an unterminated "(" from looping forever;
    ## in that case the remainder of SSTR is taken as the item
    while ((e < len)) && [[ ${SSTR:e:1} != ")" ]]; do
      ((++e))
    done
    SCHR=${SSTR:b:e-b}
    SPOS=$e
  fi
}

## Revise the boxfile named by $1 from its companion SOURCEFILE.
## Arguments: $1 - boxfile path; the SOURCEFILE name is derived by turning "box" into "box-SOURCEFILE"
## Outputs:   progress messages on stderr; the "mv -v" report on stdout
## Returns:   0 on success, 1 on I/O problems, 2 on usage error
main() {
  if (($# < 1)) || [[ -z ${1-} ]]; then
    printf 'Usage: %s BOXFILE\n' "${0##*/}" >&2
    printf '  revises BOXFILE using the matching box-SOURCEFILE file\n' >&2
    return 2
  fi

  local boxfile=$1 dir="" base srcfile tmpfile
  local BCHR XLO YLO XHI YHI PAGENBR Y

  case $boxfile in
    */*) dir=${boxfile%/*}/ base=${boxfile##*/} ;;
    *) base=$boxfile ;;
  esac

  ## Derive the SOURCEFILE name from the file-name part, so that a directory such as "boxes/"
  ## is left alone; fall back to the historical whole-path substitution otherwise.
  if [[ $base == *box* ]]; then
    srcfile=$dir${base/box/box-SOURCEFILE}
  else
    srcfile=${boxfile/box/box-SOURCEFILE}
  fi
  if [[ $srcfile == "$boxfile" ]]; then
    err "cannot derive SOURCEFILE name: no \"box\" in $boxfile"
    return 1
  fi

  ## Refuse to run without both inputs: with an unreadable SOURCEFILE every boxfile-line
  ## would otherwise be treated as a deletion and the boxfile replaced by empty-lines.
  if [[ ! -f $boxfile || ! -r $boxfile ]]; then
    err "cannot read boxfile: $boxfile"
    return 1
  fi
  if [[ ! -f $srcfile || ! -r $srcfile ]]; then
    err "cannot read SOURCEFILE: $srcfile"
    return 1
  fi

  ## inhale the SOURCEFILE; each line is joined on with a leading space;
  ## the "|| -n" part keeps a final line that lacks its newline
  SSTR=
  while read -r || [[ -n $REPLY ]]; do
    SSTR+=" $REPLY"
  done <"$srcfile"
  SSTR+="()()()()"                      ##add some trailing junk, just in case...

  SPOS=0
  getSCHR                               ##init SPOS & SCHR to first item of SSTR
  Y=999999                              ##YLO of the latest boxfile-line that was followed by a non-deletion; for the message only

  ## the temp file lives beside the boxfile so the final mv is a rename within one directory
  tmpfile="${dir}tmp$$-${base}"

  ## read line of boxfile; vars get empty-string if omitted (all if empty-line; PAGENBR if simple tiff)
  while read -r BCHR XLO YLO XHI YHI PAGENBR || [[ -n $BCHR ]]; do
    if [[ $BCHR != "$SCHR" ]]; then
      echo "==revising $BCHR-->$SCHR" >&2         ##msg about fixup
    fi
    if [[ -z $SCHR ]]; then
      echo "==empty-line at $Y" >&2               ##msg about empty-line
      echo ""                                     ##write empty-line for a deletion
    else
      printf '%s\n' "$SCHR $XLO $YLO $XHI $YHI $PAGENBR"   ##write revised boxfile-line
    fi
    ((++SPOS))
    getSCHR                                       ##advance to next item of SSTR
    if [[ -n $SCHR ]]; then
      Y=${YLO:-0}
    fi
  done <"$boxfile" >"$tmpfile" || {
    err "cannot write $tmpfile"
    rm -f -- "$tmpfile"
    return 1
  }

  ## preserve timestamp; a failure here is reported but does not stop the replacement
  touch -r "$boxfile" -- "$tmpfile" || err "warning: could not copy timestamp to $tmpfile"

  ## the temp output replaces the boxfile (mv -b, rather than 2 mv-cmds, for one-line msg)
  if ! mv -bfv -- "$tmpfile" "$boxfile"; then
    err "could not replace $boxfile; revised output left in $tmpfile"
    return 1
  fi
}

main "$@"

## CHANGE-LOG:
## ===========
## TESTING: converted pde.img0005.box-EDITED.latin1 to pde.img0005.box-SOURCEFILE-EDITED.latin1; after correcting it, will use this script on that pair...
##   have corrected it, using the new "(AB)" notation for pairs;
##   note: cvt-tesseract-box-to-source lost some space chars at CONS:23; but works properly at CONS:20;
##   not as easy as I'd hoped; on page 0005, I'm getting box-errors, two chars in one box, eg: ft-->ü  wo-->m
##   Consider using SPACE to "resync" at end of each word??
## Support "(AB)" character-pairs -- just pairs was sufficient for my pde cases;
## Added discarding of tilde-lines, after further testing on all 10 test-images;  --NOTE: for general use, have switched to using () rather than tilde--!!--
## 2009-07-22: support arbitrary-length parenthesized-strings in SOURCEFILE;
##   support "()" in SOURCEFILE for a deletion;
##   support & produce empty-line in boxfile for a deletion (so reapplying this source-to-box will work);
##   briefly kept tilde as synonym for "()" deletion; have since scrapped;
##   ==note: may get sick of tesseract-warnings due to empty-lines??
## 2009-08-02: preserve timestamps, using touch -r;