#!/bin/bash
## 2009-07-18: applying the yHI fixups to a tesseract boxfile, as described in tesseract-training-from-images;
##
## boxfile has one char per line, with coord info, eg:
##     a 830 3513 849 3536 0   <--char "a" in a box with coords x:830..849 y:3513..3536 on page:0   <-- from multi-page tiff
##     a 830 3513 849 3536     <--char "a" in a box with coords x:830..849 y:3513..3536             <-- from simple tiff
##
## USAGE EXAMPLE:  cvt-tesseract-box-fixups pde.img0005.box.latin1   --revises pde.img0005.box.latin1
##
## Copyright © 2009 Eugene Reimer; can be used, modified, copied, and distributed or sold under the terms of either the LGPL or the GPL (your choice);
## see http://www.gnu.org/licenses for the details of these terms.

## revise_boxfile_stream: filter a boxfile from stdin to stdout, applying the yHI heuristic.
##   stdin:   boxfile lines "CHR XLO YLO XHI YHI [PAGENBR]"; empty lines are allowed
##   stdout:  the same lines with yHI possibly replaced by the previous char's yHI
##   stderr:  one "==revising ..." message per revised line
revise_boxfile_stream() {
  local CHR XLO YLO XHI YHI PAGENBR
  local CHRprv="" CHRmax=""
  local XHIprv=999999 YHIprv=0 YHImax=0 YLOmin=999999

  ## read line of boxfile; vars get empty-string if omitted (all if empty-line; PAGENBR if simple tiff);
  ## the "|| [[ -n $CHR ]]" keeps a final line that lacks a trailing newline
  while read -r CHR XLO YLO XHI YHI PAGENBR || [[ -n $CHR ]]; do
    if [[ -z $CHR ]]; then
      echo ""                                   ##write empty-line for boxfile
      ## NOTE(review): whether an empty line should also reset the per-row state is not
      ## recoverable from the damaged original; the state is left untouched here.
      continue
    fi

    ## NOTE(review): the original statement here began "((XLO" and the rest was lost.
    ## Reconstructed as: a char starting left of the previous char's right edge begins a new
    ## row, so the per-row state returns to its initial values -- confirm against a pristine copy.
    if ((XLO < XHIprv)); then
      ((YHIprv = 0, YHImax = 0, YLOmin = 999999))
    fi

    ## NOTE(review): the original also emitted a debug message for "." at this point; its
    ## condition and leading text were lost, so it is not reproduced (it only wrote to stderr).

    ## revise yHI using crude heuristic: a short box (height < 40) whose top is more than
    ## 4 below the previous char's top gets the previous char's yHI
    if ((YHIprv > YHI + 4 && YHI - YLO < 40)); then
      echo "==revising yHI:$YHI-->$YHIprv($CHRprv) on chr:$CHR at x:$XLO..$XHI" >&2
      ((YHI = YHIprv))
    fi

    ## write revised-line for boxfile (trailing blank when PAGENBR is empty, as before)
    printf '%s\n' "$CHR $XLO $YLO $XHI $YHI $PAGENBR"

    ## row extremes: not used by the heuristic above; kept for the alternative
    ## heuristics discussed in the CHANGE-LOG
    if ((YHI > YHImax)); then
      ((YHImax = YHI))
      CHRmax=$CHR
    fi
    ## NOTE(review): the original statement here began "((YLO" and the rest was lost;
    ## reconstructed as the YLOmin update plus remembering this char for the next iteration.
    if ((YLO < YLOmin)); then
      ((YLOmin = YLO))
    fi
    ((XHIprv = XHI, YHIprv = YHI))
    CHRprv=$CHR
  done
}

## revise_boxfile_in_place: run revise_boxfile_stream over file $1; the old contents are
## kept as "$1~".  Returns non-zero, leaving $1 untouched, if the file cannot be read or
## the revised copy cannot be written.
revise_boxfile_in_place() {
  local boxfile=$1 dir base tmp
  if [[ ! -f $boxfile || ! -r $boxfile ]]; then
    printf '%s: cannot read boxfile: %s\n' "${0##*/}" "$boxfile" >&2
    return 1
  fi
  ## build the temp name next to the target, so a path with a directory part works
  base=${boxfile##*/}
  if [[ $boxfile == */* ]]; then dir=${boxfile%/*}; else dir=.; fi
  [[ -n $dir ]] || dir=/
  tmp="$dir/tmp$$-$base"

  if ! revise_boxfile_stream <"$boxfile" >"$tmp"; then
    rm -f -- "$tmp"
    return 1
  fi
  ## rename; the tmp output replaces the boxfile
  mv -fv -- "$boxfile" "$boxfile~" && mv -fv -- "$tmp" "$boxfile"
}

if (($# == 0)); then
  printf 'USAGE: %s BOXFILE   --revises BOXFILE in place, keeping the old version as BOXFILE~\n' "${0##*/}" >&2
  false                                         ##no boxfile given: finish with a failing status
else
  revise_boxfile_in_place "$1"
fi

## CHANGE-LOG:
## ===========
## Consider an even simpler heuristic: set yHI for all chars in same row to max-in-row -- would mean a 2-pass alg,
##   and might well be worse for cases where tesseract has included a spurious speck in a char-blob;
##   ==NOTE: will devise way to cope with such spurious specks, if they become a problem--??--
##
## HEURISTIC: may yet try using a much fussier test to see whether that helps with lowercase<->uppercase confusion problems;
##   but for revisions similar to my manually-made ones on img0005, need yHI:=yHIprv revisions, and the comparison needs to be
##   roughly YHI[... remainder of this note was lost ...] give up on YHI:YHIprv comparison;
##   TRY: fixup-needed if yHI-yLO < 40                   <--close and might be ok, but produces un-needed fixups on stroph chars;
##   TRY: fixup-needed if yHI-yLO < 40 && yHIprv>yHI+4   <--identical to the ones I made manually  <-- GOING WITH THIS VERSION
##   TRY: fixup-needed if yHI-yLOmin < 40 && yHIprv>0    <--got many more un-needed fixups, but why?? due to current yLO being a new min??
##   ==BETTER: detect fixup-needed based on HEIGHT==yHI-yLO as a fraction of yHImax-yLOmin--??--
##
## 2009-07-22: support empty-lines in boxfile;
##
## NOTE: the problem this script works around may in fact be just another symptom of the "bounding-box at wrong Y-coord" problem==??==