#!/bin/bash
## 2009-06-21: because the Tesseract Training-Page (Box) packages come without "Source" files,
## this is my attempt to extract the source, from the "box" file;
##
## boxfile has one char per line, with coord info, eg:
##   a 830 3513 849 3536 0   <--char "a" in a box with coords x:830..849 y:3513..3536 on page:0  <-- from multi-page tiff
##   a 830 3513 849 3536     <--char "a" in a box with coords x:830..849 y:3513..3536            <-- from simple tiff
##
## PREREQ: cvt-utf8-to-latin1 cvt-latin1-to-utf8 -- from http://ereimer.net/programs/general-purpose-scripts.htm
##
## USAGE EXAMPLE: cvt-tesseract-box-to-source deu.arial.box
##   --produces: deu.arial.box-SOURCEFILE (utf8) and deu.arial.box-SOURCEFILE.latin1 (cp1252)
## USAGE EXAMPLE: cvt-tesseract-box-to-source deu.verdanab.box 23
##   --same, but with the optional SPACE-WIDTH-CONSTANT given explicitly (default: 20)
##
## Copyright © 2009 Eugene Reimer; can be used, modified, copied, and distributed or sold under the terms of either the LGPL or the GPL (your choice);
## see http://www.gnu.org/licenses for the details of these terms.

#######################################
# Filter: turn box-file lines (stdin) into source text (stdout).
# Emits no trailing newline; the caller appends one.
# Arguments:
#   $1 - optional space-width constant (non-negative integer, default 20);
#        needs tuning for font; 20 works for deu.arial; 23 for deu.verdanab
# Returns: 0 on success, 2 if the space-width constant is not a number
#######################################
box_to_source() {
  local gap=${1:-20}
  if [[ ! $gap =~ ^[0-9]+$ ]]; then
    printf 'box_to_source: space-width must be a non-negative integer: %s\n' "$gap" >&2
    return 2
  fi
  gap=$((10#$gap))                        ## force base 10, so "08" is not read as octal

  local int_re='^(0|-?[1-9][0-9]*)$'      ## canonical decimal integer
  local chr xlo ylo xhi yhi pagenbr
  local prev_xhi=999999                   ## huge initial value: the very first char is preceded by a space

  ## read line of boxfile; vars get empty-string if omitted (all if empty-line; PAGENBR if simple tiff);
  ## the "|| [[ -n $chr ]]" keeps a final line that lacks its terminating newline
  while read -r chr xlo ylo xhi yhi pagenbr || [[ -n $chr ]]; do
    if [[ -n $chr ]]; then
      ## only plain integers reach the arithmetic below; anything else counts as 0
      [[ $xlo =~ $int_re ]] || xlo=0
      [[ $xhi =~ $int_re ]] || xhi=0
      ## space when the box starts left of the previous box's right edge (presumably a wrap
      ## to a new text line), or when the horizontal gap exceeds the space-width constant
      if ((xlo < prev_xhi || xlo > prev_xhi + gap)); then
        printf ' '
      fi
      ## remember the right edge; empty lines never get here, so a deletion "()" does not
      ## disturb the spacing of the char that follows it
      prev_xhi=$xhi
    fi
    ##if [[ $chr == "~" ]];then chr=; fi  ##==TEMP==
    if ((${#chr} != 1)); then
      chr="($chr)"                        ## multi-char/empty-string needs parentheses - convention shared with cvt-source-to-box
    fi
    printf '%s' "$chr"
  done
}

#######################################
# Convert a box file into its SOURCEFILE(s).
# Arguments:
#   $1 - box file, or box.latin1 file (then no utf8 output is created)
#   $2 - optional space-width constant, passed on to box_to_source
# Returns: 0 on success, 2 on usage error, else the status of the failing step
#######################################
main() {
  if (($# < 1 || $# > 2)); then
    printf 'usage: %s BOXFILE[.latin1] [SPACE-WIDTH]\n' "${0##*/}" >&2
    return 2
  fi
  local arg=$1 gap=${2:-20}
  if [[ ! $gap =~ ^[0-9]+$ ]]; then       ## checked here too, before any file is touched
    printf '%s: SPACE-WIDTH must be a non-negative integer: %s\n' "${0##*/}" "$gap" >&2
    return 2
  fi

  local inp=$arg
  if [[ $arg == *latin1 ]]; then          ## also support $1 being a box.latin1 file
    inp=${arg%.latin1}
  fi

  ## output name: first "box" in the file name (not in a directory name) becomes
  ## "box-SOURCEFILE"; with no "box" in the name the suffix is appended, so the
  ## output can never be the input file itself
  local dir='' base=$inp out
  if [[ $inp == */* ]]; then
    dir=${inp%/*}/
    base=${inp##*/}
  fi
  if [[ $base == *box* ]]; then
    out=$dir${base/box/box-SOURCEFILE}
  else
    out=$inp-SOURCEFILE
  fi

  if [[ ! -e $inp.latin1 ]]; then         ## convert to latin1, if needed
    cvt-utf8-to-latin1 "$inp" "$inp.latin1" || return
  fi

  {                                       ## reads $inp.latin1, writes $out.latin1
    box_to_source "$gap" <"$inp.latin1" || return
    echo
  } >"$out.latin1"

  touch -r "$arg" -- "$out.latin1" || return   ## preserve timestamp

  if [[ $arg != *latin1 ]]; then          ## for latin1 input, don't create utf8 version
    cvt-latin1-to-utf8 "$out.latin1" "$out" || return
  fi
  return 0
}

main "$@"

## CHANGE-LOG:
## ===========
## 2009-06-21: comparing CONS:20 & CONS:23 on deu.arialbi, deu.verdana: shows no single SPACE-WIDTH-CONSTANT (CONS) will work for all DEU files;
##   see notes in: /pix/pkg/tesseract-Box-TrainingPages-deu/00-FILENAMES-ETC-INFO-ER
##   CONSIDER: iterative adjusting of SPACE-WIDTH-CONSTANT until wc (word-count) gets the desired nbr-of-words??
##   --Some manual fixing is inevitable; eg: comparing deu.verdanab & deu.verdanaz (with CONS:20) shows verdanaz has both added & removed spaces:-(
## 2009-07-15: support $1 being a box.latin1 file; and do not create utf8 output in that case;
## 2009-07-22: produce parenthesized-string for multi-character boxes, and for empty-string, as needed for the inverse script;
##   support empty-line in boxfile - corresponds to a deletion "()" in SOURCEFILE, as needed for the inverse to be re-applyable;
##   renamed: extract-tesseract-trainingpage-source --> cvt-tesseract-box-to-source (to better fit with the inverse: cvt-tesseract-source-to-box);
## 2009-08-02: preserve timestamps, using touch -r; also done in the inverse script, and in the utf8<-->latin1 conversions;