#!/bin/bash
## 2009-06-21:  because the Tesseract Training-Page (Box) packages come without "Source" files, 
## this is my attempt to extract the source, from the "box" file;
##
## boxfile has one char per line, with coord info, eg:
##	a 830 3513 849 3536 0	<--char "a" in a box with coords x:830..849 y:3513..3536 on page:0  <-- from multi-page tiff
##	a 830 3513 849 3536	<--char "a" in a box with coords x:830..849 y:3513..3536            <-- from simple tiff
##
## PREREQ:  cvt-utf8-to-latin1 cvt-latin1-to-utf8 -- from http://ereimer.net/programs/general-purpose-scripts.htm
##
## USAGE EXAMPLE:  cvt-tesseract-box-to-source  deu.arial.box   --produces: deu.arial.box-SOURCEFILE (utf8) and deu.arial.box-SOURCEFILE.latin1 (cp1252)
##
## Copyright © 2009 Eugene Reimer;  can be used, modified, copied, and distributed or sold under the terms of either the LGPL or the GPL (your choice);
## see http://www.gnu.org/licenses for the details of these terms.

## Map a boxfile path to its SOURCEFILE path by inserting "-SOURCEFILE" after the
## last "box" of the file name, eg:  dir/deu.arial.box --> dir/deu.arial.box-SOURCEFILE
## Only the file-name part is rewritten, so a "box" in a directory name is left alone;
## a name containing no "box" gets "-SOURCEFILE" appended, so the output can never be the input file.
## Arguments: $1 - boxfile path (without any .latin1 suffix)
## Outputs:   the SOURCEFILE path, on stdout
source_name_for() {
  local path=$1 dir='' base
  base=${path##*/}
  if [[ $path == */* ]]; then dir=${path%/*}/; fi
  if [[ $base == *box* ]]; then
    base=${base%box*}box-SOURCEFILE${base##*box}		##split around the LAST "box"
  else
    base+=-SOURCEFILE
  fi
  printf '%s\n' "$dir$base"
}

## Rebuild text from boxfile lines ("CHR XLO YLO XHI YHI [PAGENBR]", one char per line).
## Arguments: $1 - space gap:  emit a space when XLO exceeds the previous XHI by more than this (default 20;
##                 needs tuning for font;  20 works for deu.arial;  23 for deu.verdanab)
##            $2 - newline overlap:  emit a newline when XLO is more than this far left of the previous XHI
##                 (default 15;  was 10, but encountered more overlap than that)
## Inputs:    boxfile lines on stdin (single-byte encoding expected, so that one char has length 1)
## Outputs:   reconstructed text on stdout, always ending with a newline
boxes_to_text() {
  local -i space_gap=${1:-20} newline_overlap=${2:-15}
  local -i prev_xhi=999999					##huge, so the very first char starts a new line
  local chr xlo ylo xhi yhi pagenbr				##ylo yhi pagenbr are read only to consume their fields
  ##the "|| [[ $chr ]]" keeps a final line that lacks a trailing newline
  while read -r chr xlo ylo xhi yhi pagenbr || [[ $chr ]]; do
    if [[ -z $chr ]]; then					##empty line = deletion:  emit "()" and leave prev_xhi untouched
      printf '()'
      continue
    fi
    if   (( xlo < prev_xhi - newline_overlap )); then printf '\n'
    elif (( xlo > prev_xhi + space_gap ));       then printf ' '
    fi
    ##multi-char needs parentheses - convention shared with cvt-source-to-box
    if (( ${#chr} == 1 )); then printf '%s' "$chr"; else printf '(%s)' "$chr"; fi
    (( prev_xhi = xhi ))
  done
  printf '\n'
}

## Entry point:  convert boxfile $1 (utf8 box file, or a box.latin1 file) to its SOURCEFILE.
## Produces <name>.box-SOURCEFILE.latin1, plus the utf8 <name>.box-SOURCEFILE unless $1 is a .latin1 file.
## Globals:   BOX_SPACE_GAP, BOX_NEWLINE_OVERLAP (read; optional overrides of the 20 / 15 defaults)
## Returns:   2 on missing argument, 1 if the latin1 input cannot be read, else status of the last step.
## Uses "return" rather than "exit" so the functions above stay usable after a failed call.
main() {
  if [[ -z ${1-} ]]; then
    printf 'Usage: %s BOXFILE[.latin1]\n' "${0##*/}" >&2
    return 2
  fi
  local arg=$1 inp=$1 out latin1_input=0
  if [[ $arg == *.latin1 ]]; then				##also support $1 being a box.latin1 file
    latin1_input=1
    inp=${arg%.latin1}
  fi
  out=$(source_name_for "$inp")

  if [[ ! -e $inp.latin1 ]]; then				##convert to latin1, if needed
    cvt-utf8-to-latin1 "$inp" "$inp.latin1"
  fi
  if [[ ! -r $inp.latin1 ]]; then				##don't leave a junk output file behind
    printf '%s: cannot read %s\n' "${0##*/}" "$inp.latin1" >&2
    return 1
  fi

  boxes_to_text "${BOX_SPACE_GAP:-20}" "${BOX_NEWLINE_OVERLAP:-15}" <"$inp.latin1" >"$out.latin1" || return
  touch -r "$arg" -- "$out.latin1"				##preserve timestamp
  if (( ! latin1_input )); then					##for latin1 input, don't create utf8 version
    cvt-latin1-to-utf8 "$out.latin1" "$out"
  fi
}

main "$@"


exit	##stop here, with the status of the last command run;  the free-form notes below are never read by bash
CHANGE-LOG:
===========
2009-06-21: comparing CONS:20 & CONS:23 on deu.arialbi, deu.verdana:  shows no single SPACE-WIDTH-CONSTANT (CONS) will work for all DEU files;
	see notes in:  /pix/pkg/tesseract-Box-TrainingPages-deu/00-FILENAMES-ETC-INFO-ER
CONSIDER: iterative adjusting of SPACE-WIDTH-CONSTANT until wc (word-count) gets the desired nbr-of-words??
--Some manual fixing is inevitable;  eg: comparing deu.verdanab & deu.verdanaz (with CONS:20) shows verdanaz has both added & removed spaces:-(

2009-07-15: support $1 being a box.latin1 file;  and do not create utf8 output in that case;

2009-07-22: produce parenthesized-string for multi-character boxes, and for empty-string, as needed for the inverse script;
	support empty-line in boxfile - corresponds to a deletion "()" in SOURCEFILE, as needed for the inverse to be re-applyable;
	renamed:  extract-tesseract-trainingpage-source --> cvt-tesseract-box-to-source  (to better fit with the inverse: cvt-tesseract-source-to-box);

2009-08-02: preserve timestamps, using touch -r;  also done in the inverse script, and in the utf8<-->latin1 conversions;