#!/bin/bash
## Convert Peggy's "xls" MBRS database to HTML - by Eugene Reimer 2003-Feb
## PREREQ:  xlhtml -- from http://chicago.sourceforge.net/xlhtml/
##
## USAGE:
##	fixPeggyMBRS  MBRS200602.xls	-- produces MBRS200602.htm  (old versions were filters, that read from stdin, wrote to stdout)
##					-- this new version also invokes countMBRS (the old version didn't)
## NOTE: is now invoked by sendtoElist (via generateElistFromHISTORY+MBRS), if needed -- still wise to run separately beforehand to catch surprises??
## NOTE: resulting HTML is read by both generateElistFromHISTORY+MBRS  and by books-program (for DonorsReport only);

if [ ${#} -ne 1 ];then  echo "Usage: $0 <xls-file>";  exit 1;  fi
B=${1##*/}; B=${B%.*}
[ -e "$B.htm" ] && mv -fv $B.htm $B.htm~		##keep bkup copy of previous version, if any
{							##--first part (nearly) same as xls2htm--
xlhtml -a "$1" |					##convert to html
iconv -c -f utf8 -t cp1252//TRANSLIT |  		##convert utf8 to cp1252;    new 2007feb24;  2009-09:added -c and //TRANSLIT	-- Yank to leave in utf8??
LC_CTYPE=C  tr  $'\x91-\x98'  "''\"\"*\-\-~"  |  	##then cp1252  to iso8859-1; new 2007feb24;  result usually pure-ASCII;		-- Yank to leave in utf8??
iconv -f cp1252 -t utf8 |				##then back    to utf8;	     new 2009-09;    result usually pure-ASCII;		-- Yank to leave in utf8??
tr '\n' $'\x01' |					##NL-to-hex01    (revised 2007feb24, 2009-10-08)
sed $'
s|\r\x01|\x01|g;					##discard CR before NL
s|\x01|<br>|g;						##embedded NL --> <br>
s|<TR|\x01<TR|g; s|</table>.*$|\x01</TABLE>|;		##introduce newlines;  also simplify & standardize
s|</*FONT[^>]*>||g;					##simplify by discarding FONT-tags
s|\x01||;						##remove first linebreak (TABLE, heading)
'  |
tr $'\x01' '\n'  |					##last part of xls2htm
sed '
  s|<TR[^>]*>|<tr>|g; s|<TD[^>]*>|<td>|g;		##lowercase TR and TD + remove ALIGN="right" etc (needed by sed-cmds below & other scripts)
  s|</*B>||g;						##remove Bold-tags -- 20061122 introduced these in headings (they interfere with next s-cmd)
  s|<td><|<td>\&nbsp;<|g; s|<td>$|<td>\&nbsp;|;		##NOT NEEDED since xlhtml does this??
  s|   *| |g; s|( ) ||g;				##destutter spaces + remove empty-parens for omitted areacode (new in 20061108)
  s|<td> |<td>|g;					##remove leading spaces within any field (new 20090706);  is fine although did not solve Donor-Report grumbling
  /TABLE/{
    s| BORD|!BORD|g; s| CELL|!CELL|g; s| ||g;		##remove spaces from Column-Headings -- 20061108 reintroduced many spaces
    s|DatePaid/|DonDate|g;				##revise Column-Heading
    s|Donation/|DonAmt|g;				##revise Column-Heading
    s|Pledge/|DonAmt|g;					##revise Column-Heading
    s|MailingListID|ID|g;				##revise Column-Heading -- spelled this way prior to 20061108
    s|MemberID|ID|g;					##revise Column-Heading -- 20061108 introduced this spelling
    s|MembershipStatus|Active|g;			##revise Column-Heading -- spelled this way prior to 200804, then Active;  used to fix to Status
    s|Dues|DuesDate|g;					##revise Column-Heading
    s|State|Prov|g;					##revise Column-Heading -- 20061122 introduced this spelling
    s|EmailAddress|Email|g;				##revise Column-Heading -- 20061122 introduced this spelling
  }
  s|mb.sympatico.ca|mts.net|g;				##NOTE THE SYMPATICO FIXUP IS DONE HERE--!!--
  s|&quot;|"|g;						##fixup spurious use of html-entity (new 2007feb26)
  /TABLE/s|LastName|00LastName|				##temp kludges for sort
  s| |!|g; s|<td| <td|g; s|<tr> |<tr>|g;		##temp kludges for sort
'  |
grep -v '</TABLE'   |					##exclude for sort
sort -k 3,3 -k 2,2  |					##sort on Surname, Firstname -- Peggy usually sends unordered
sed '
  s| <td|<td|g; s|!| |g; s|00LastName|LastName|;	##undo the sort-kludges -- BEWARE: SOME OLDER SCRIPTS EXPECT TAB/SPACE BEFORE <td> ??
'
echo "</TABLE>"						##undo sort-exclusion
}  >${B}.htm						##converted result to $B.htm
echo "The converted-to-HTML form written to $B.htm"

##-- Check whether in BIG or SMALL format;
##== 2008-04-28:  revised SMALL-vs-BIG test, to handle Peggy sending a BIG with subset of columns;
grep '<TABLE' $B.htm  |sed 's|.*<tr><td>||; s|<td>|~|g'  |tr '~' '\n'  >tmpColHeadings				##write ColHeadings temp-file
NROW=$(cat $B.htm         |wc |arr 1);  ((NROW-=2))								##count nbr-rows; new 2008-04-28
NCOL=$(cat tmpColHeadings |wc |arr 1)
FILESZ=$(d -l $B.htm |arr 4)
echo -n "It has $NROW rows, $NCOL columns, and $FILESZ bytes -- IMPLIES ";
## ((NCOL>=34)) && ((FILESZ>=242000));then echo "BIG"; BIG=1; else echo "SMALL"; fi				##old test (based on col34 being 2006DuesDate)
if ((NROW>=500))                     ;then echo "BIG"; BIG=1; else echo "SMALL"; fi				##2008-04-28: new BIG-test based only on nbr-rows
if ((NROW>=500)) && [[ $B != *BIG* ]];then echo "-----$B ought to contain BIG--??--";     fi			##2008-04-28: new msg about renaming
if ((NROW< 500)) && [[ $B == *BIG* ]];then echo "-----$B ought NOT to contain BIG--??--"; fi			##2008-04-28: new msg about renaming

##-- Check the ColumnHeadings for surprises:
##-- for a SMALL only need the usual  columns1..11;
##-- for a BIG also need  2005Dues in col32,  2006Dues in col34,  2007Dues in col36,  2008Dues in col38,  etc  <==no longer true, nor needed, as of 2008-04-28
YEAR=$(date +%Y)
((NCOLNEEDED=22+ (YEAR-2000)*2+1))
while ((NCOL<NCOLNEEDED));do echo "--missing--" >>tmpColHeadings; ((NCOL+=1)); done				##2008-04-28: new way to handle missing columns
HDG=$(head -n11 tmpColHeadings | tr '\n' '~')
echo -n "First 11 columns expected to be ID~FirstName...Email;  "
if [[ $HDG == ID~FirstName~LastName~Address~City~Prov~PostalCode~Country~HomePhone~WorkPhone~Email~ ]];then  echo "and they are";   fi
if [[ $HDG != ID~FirstName~LastName~Address~City~Prov~PostalCode~Country~HomePhone~WorkPhone~Email~ ]];then  echo "but are:  $HDG"; fi
##col38=$(head -n38 tmpColHeadings |tail -n1)											##col38-test no longer needed
##echo -n "col38-heading:$col38 -- ";  if [[ $col38 == 2008Dues* ]];then echo "OK"; else echo "EXPECTED 2008DuesDate"; fi	##col38:2008DuesDate;  no longer needed

##-- REARRANGE COLUMNS, to handle Peggy sending with subset of columns--!!--  (new 2008-04-28)
##-- am rearranging to a sensible order:  21 fixed columns then a Dues+DonAmt pair for each year  (BIGs prior to 200804 aren't that way, unless reconverted!!)
echo "--rearranging columns..."
CH=("" ID FirstName LastName Address City Prov PostalCode Country HomePhone WorkPhone Email Active DateJoined Hobbies Occupation Volunteer HowfindNOCI DonDate00 DonDate01 DonDate02 DonDate03)
for((Y=2000; Y<=YEAR; ++Y));do ((J=22+ (Y-2000)*2)); CH[J]="${Y}DuesDate"; CH[J+1]="DonAmt${Y#20}"; done	##array CH shows desired Columns, and order (nbred 1...)
##for((J=1; J<=NCOLNEEDED; ++J));do echo "J:$J CH[J]:${CH[J]}"; done						##DEBUG
for((J=1; J<=NCOLNEEDED; ++J));do CN[J]=0; done									##zero in CN-array means not-found
((I=0)); while read;do ((++I))											##go thru tmpColHeadings line-by-line...
  for((J=1; J<=NCOLNEEDED; ++J));do if [[ $REPLY == ${CH[J]} ]];then CN[J]=$I; fi; done				##if the I'th heading == CH[J] then CN[J] := I
done  <tmpColHeadings
##r((J=1; J<=NCOLNEEDED; ++J));do echo "J:$J CH[J]:${CH[J]}	CN[J]:${CN[J]}"; done				##DEBUG
for((J=1; J<=NCOLNEEDED; ++J));do if ((CN[J]==0));then ARR="$ARR <td>&nbsp;"; else ARR="$ARR ${CN[J]}"; fi;done	##produce rearrangement pattern $ARR
echo "ARR:$ARR;"												##DEBUG
mv -fv $B.htm tmp$B.PeggyColumnOrder.htm									##rename, will be replacing $B.htm;  2010-01:tmpname
cat tmp$B.PeggyColumnOrder.htm |sed $'s|<td>|\t<td>|g; s|<tr>\t|<tr>|; s| |!|g' |arr $ARR  |			##rearrange columns according to $ARR
  sed $'s|!| |g; s|\t||g'  |											##undo TAB & SPACE kludges...
  cat  >$B.htm													##produce new $B.htm
for((Y=YEAR-1; Y<=YEAR; ++Y));do ((J=22+ (Y-2000)*2)); if ((CN[J]==0));then echo "-----FATAL: column ${CH[J]} omitted"; fi; done	##sanity testing


##-- count number of MBRS, adjusting for twosomes etc in the mailing-list form, and checking for troublesome cases:
echo "";  countMBRS $B.htm


exit	##skip remarks
===========================================================================================
=====================<<this is the former MBRS--README>>===================================
===========================================================================================
	
Resulting-Columns:
1  2         3        4       5    6  7   8   9         10        11    12  13         14     15     16     17     18    19    20     21     22    23     24    25     26      27         28        29     30    31   32     33    34     35   
A  B         C        D       E    F  G   H   I         J         K     L   M          N      O      P      Q      R     S     T      U      V     W      X     Y      Z       AA         AB        AC     AD    AE   AF     AG    AH     AI   
ID FirstName LastName Address City St Zip Nat HomePhone WorkPhone Email Sts DateJoined Dues00 Dues01 Dues02 Dues03 Don00 Don01 Date00 Date01 Don02 Date02 Don03 Date03 Hobbies Occupation Volunteer Dues04 Don04 How? Dues05 Don05 Dues06 Don06

--consider: exchging columns 19 & 20  -- to get sensible ordering of Donation info?
--consider: combining columns 18&20, 19&21, 22&23 24&25  -- donation info into Amt,Date pairs
--or: just discard the DonDate columns, since newer years being done that way--??

When Peggy sends XL database, i used:

(1) 2002feb: the first time, i used StarOffice "convert to HTML", plus a lot of manual editing;

(2) 2003feb: for MBRS200302, used Staroffice "convert to HTML", and was thinking about a SED script to automate the cleaning-up processing needed;
	then discovered StarOffice "convert to TEXT",  which lets me supply a Field-Delimiter,  and is a simpler way to produce simple HTML
	-- i used "{" as Delimiter, then changed it to "<td>{tab}" ... and was pretty much done!

-- 2003feb:  she also sent MBRS2001, MBRS2003, each in slightly different format

(3) 2003feb: i got her to send MBRSBIG which has everything -- so only need to reformat it, 
	-- converted to HTML, as above, using StarOffice "to text" ...
	-- changed all blanks to underscores -- for sorting and column-arranging
	   (also Lastname --> 00Lastname)
	-- sorted:  sort -f -k 3,3 -k 2,2 -k 1,1 MBRSBIG.htm-orig >MBRSBIG.htm

-- <<fixPeggyMBRS>> script (in /etc/sbin) automates most of the conversions needed;
	fixPeggyMBRS2003 -- as used in 2003feb
	fixPeggyMBRS2004 -- as used in 2004feb

-- I extracted the 2002-renewals dated 2001-Nov,Dec, 2002-Jan,Feb,Mar -- to find the "missing" members

2004-Feb:  Peggy resent the XL database;  converted using OpenOffice this time;  then using fixPeggyMBRS
	--Columns are Different (yet again) -- see fixPeggyMBRS2004 script
	--thought this was a "BIG" (complete database), but was only the 2003 memberships...

2005-Feb:  Peggy resent the XL database; converted to text w StarOffice, then used fixPeggyMBRS
2005-Feb:  after my mentioning the missing "late-in-year joiners" problem, Peggy sent a "BIG"
	(the incomplete MBRS-db for 2004 is called MBRS200502-V1)
	(MBRS200502 is really a BIG-db - has all members who have ever belonged + has valid 2005-Dues column)
		(still needs fixing if want end-of-2004 members, e.g. R Lemieux joined 2004-dec27 & is only shown in 2005-Dues)
	-- Columns changed yet again; but are now in a sensible order!  (well, mostly)  -- see fixPeggyMBRS2005 script
	==Note: the address-lookup code in books.icn (donSection) wants the "current" MBRS-db to be called MBRSBIG.htm -- high time to change name!
	==REVISED: now tries to open MBRSyyyy02-BIG.htm;  2nd choice is MBRSyyyy02.htm
	-- could construct a true BIG by merging previous years??
	-- also want to redo Extracting of Email-ids - for adding to Elist
	-- redo: Extracting Email-ids - use test: if 2004-Dues or 2005-Dues non-empty...
2005-Feb:  new script  extractMBRS-BIGtoSMALL  to create MBRS200502-SMALL from MBRS200502-BIG

2006-Feb: Peggy just added 2 columns to the end;
	-- yanked the 'arr' line => no changes to this script needed for such adding of columns.
	-- NOTE: need to edit Textfile to remove Newline char within address field for John & Chris Neufeld--!!--
2006-May: applying the mb.sympatico.ca --> mts.net fixup for email-ids here -- not sure where it's best done==??==
2006-May: downloaded <<catdoc>> pkg which includes <<xls2csv>> convertor, as a C program;
	also a perl script of same name <<xls2csv>> -- one of these may be the answer to automate everything--??--

2006-Sep: ==REWRITE==  installed <<xlhtml>> program, and wrote <<xls2htm>> script -- it looks like the simplest way to automate everything;
	-- rewrote this script using <<xlhtml>>  (fixPeggyMBRS-csv is the former from-CSV version of this script)
	-- USAGE now different!
	-- no longer need to use OpenOffice to convert to csv;
	-- script now handles embedded-NEWLINEs--!!--
2006-Nov08 version had many minor changes from previous SMALLs - the xlhtml-based conversion was first tested on it;
2006-Nov22 version undid many Nov08-changes;  lacked ALIGN on the heading-line-TR which messed-up my trying to keep it together with <TABLE...
	fixed that part here,  ALSO in xls2htm script.

2007-Feb24:  revisions to make output nicer for books-DonorReport:
(1) remove CR before <br>
(2) avoid non-ASCII chars in Addresses;  eg Ron Lemieux's address contains an N-dash:
    it became &#8211;       in the 20061108 copy -- was left that way (may need to revise if re-occurs??)
    it became \xe2\x80\x93  in the 20061122 copy -- that became \x96 in cp1252, then ASCII-dash

2008-Apr:  to handle Peggy sending only a subset and/or rearranged columns,  now do reordering based on column-headings;
	==NOTE: if she continues in this vein and I ever need to test for membership-dues in years before 2004 will need to get those old columns from the 200802 db;

2009-07-06:  revised the CR-->tilde changes, to become CR-->times-sign,  to handle Peggy using tilde on the remarks fields;

2009-10-07:  the screwup on John & Christiane Neufeld entry with NEWLINE within address is BACK,  also lots of blank lines in output, possibly related?  ==YUP;
	--Elist-sendto-20091007-ToBeRemoved.eml-TBR is ready to go, but unsent as there's no-one to send it to (only hans007@shaw.ca);  FIXED & sent later...
	--BUG IN tr:  altho all settings are for utf8, tr-to-tilde-n produces hex-97, a Latin1 tilde-n;  tr LACKS UTF8 SUPPORT!!  whereas sed has utf8 support!!

2010-01-23:  Peg sent an XLS (small) lacking a 2010-Dues column (leads to grumbling);
	==it has new columns for Debwendon-donations, presumably meaning those targeted for Boardwalk-Project;  ==may want to keep those too?
