#!/bin/bash
## cvt-textfiles-to-utf8-charset -- mass-conversion of files to utf8, with backups -- by Eugene Reimer 2009-09-01.
## You can run my find-anomalous-textfiles first to help identify files needing manual intervention, although doing so isn't essential as this script will
## also grumble about, and leave unconverted, most kinds of anomalous files; and since it makes backups you'll be able to undo what it did wrong;
##
## USAGE: cvt-textfiles-to-utf8-charset DIRECTORY...   --will convert non-UTF-8 non-ASCII textfiles below those Directories, recursively, with backups;
##
## PREREQ: charsetdetective -- from http://ereimer.net/programs/charsetdetective.htm;
## PREREQ: chgsed chg fullnameNOSLASH -- from http://ereimer.net/programs/general-purpose-scripts.htm;
##
## Copyright © 2009 Eugene Reimer; can be used, modified, copied, and distributed or sold under the terms of either the LGPL or the GPL (your choice);
## see http://www.gnu.org/licenses for the details of these terms.

shopt -s extglob                                        ##enable extglob for the @(...) etc extensions
LC_CTYPE=C                                              ##NEED an 8-bit charset for grep to work on bytes, as opposed to utf8 characters!!
DEF=windows-1252                                        ##default encoding  <--REVIEW NEEDED
BKUPDIR=/tmp/cvt-to-utf8-$(date +%Y%m%d-%H%M)-BKUP      ##backup directory
LOGFILE=/tmp/cvt-to-utf8-LOG                            ##log-file

msg ()     { echo "$@" >>$LOGFILE; }                    ##msg to logfile
filterF () { egrep -vi '(jpg|png|gif|tif|p[abgp]m|avi|flv|wmv|mov|ogg|mpg|mp3|VRO|pdf|doc|xls|zip|gz|tgz|~)$'; }   ##filter for filenames
filterD () { egrep -v '/\.| |nsmail'; }                 ##filter for dir-names  <--REVIEW NEEDED
filtMET () { sed "s/.*\(charset\|CHARSET\)=\([^\"']*\).*/\2/" |tr '\n' ' ' |tr A-Z a-z |sed 's/ $//'; }            ##extract charset from META-Content-Type
prereqs () {
  for P;do if [[ $(type -p $P) == "" ]];then echo "$P from http://ereimer.net/programs is required"; PRE_ERR=1; fi; done
  if [ $PRE_ERR ];then exit 9; fi
}

doFile () {
  [ -L "$1" ] && return; [ -f "$1" ] || return          ##skip dir or other non-file
  ASC=; ISO=; WIN=; UTF=; MET=; DET=
  MET=$(grep -i 'charset' $1 |filtMET)                  ##extract the META-charset if any  <--REVIEW NEEDED: grep pattern is a reconstructed guess
  if ! egrep -q $'[\x80-\xff]' $1;then ASC=ASCII        ##no hi-bit bytes => pure ASCII    <--REVIEW NEEDED: test is a reconstructed guess
  else                                                  ##remaining tests only on non-ASCII...
    if   ! egrep -vq $'^[\x09-\x0d\x20-\x7e\xa0-\xff]*$' $1;then ISO=ISO         ##pattern matches 8859 line; egrep -v succeeds on a non-ISO file; negated=>ISO-8859
    elif ! egrep -vq $'^[\x09-\x0d\x20-\x7e\x80\x82-\xff]*$' $1;then WIN=WIN;fi  ##pattern matches WINcp line; egrep -v succeeds on a non-WIN file; negated=>WIN-cp
    if   ! egrep -vq $'^([\x09-\x0d\x20-\x7e]|[\xC2-\xDF][\x80-\xBF]|\xE0[\xA0-\xBF][\x80-\xBF]|[\xE1-\xEC\xEE\xEF][\x80-\xBF][\x80-\xBF]|\xED[\x80-\x9F][\x80-\xBF]|\xF0[\x90-\xBF][\x80-\xBF][\x80-\xBF]|[\xF1-\xF3][\x80-\xBF][\x80-\xBF][\x80-\xBF]|\xF4[\x80-\x8F][\x80-\xBF][\x80-\xBF])*$' $1
    then UTF=UTF;fi                                     ##pattern matches UTF-8 line; egrep -v succeeds on a non-UTF file; negated=>UTF-8
  fi
  if [[ $ISO$WIN$UTF ]];then true; else return; fi      ##skip quietly if pure-ASCII or Non-Text file (by my tests)
  DET=$(charsetdetective "$1" |sed 's|.*: ||' |tr A-Z a-z)       ##get the answer from charsetdetective
  WAS="$ISO$WIN$UTF+$MET+$DET"                                   ##save for msg, before stdizing
  [[ $UTF && $DET == *utf* && $MET == @(*utf*|'') ]] && { return; }                              ##skip quietly if unambiguously utf8
  [[ $UTF || $DET == *utf* || $MET == *utf* ]] && { msg "$1 ?? $ISO$WIN$UTF+$MET+$DET";return;}  ##skip after grumbling about ambiguous case assumed to be utf8
  [[ $MET == *8859-1 ]]         && { MET=windows-1252; }      ##standardize to the broader variant (latin1)
  [[ $MET == *8859-8 ]]         && { MET=windows-1255; }      ##standardize to the broader variant (Hebrew)
  [[ $MET == iso8859* ]]        && { MET=${MET/iso/iso-}; }   ##standardize to the hyphenated spelling
  [[ $DET == unknown && $MET ]] && { msg "$1 using met:$MET"; DET=$MET; }   ##grumble about "unknown" and assume META-info correct
  [[ $DET == unknown ]]         && { msg "$1 using def:$DEF"; DET=$DEF; }   ##grumble about "unknown" and assume $DEF
  [[ $MET && $MET != $DET ]]    && { msg "$1 met:$MET; using det:$DET"; }   ##grumble about METa-DETective mismatch (stdized to reduce these)
  [[ $DET == $DEF ]]            || { msg "$1 ==unusual encoding $DET"; }    ##inform about unusual charset  <--REVIEW NEEDED
  FN=$(fullnameNOSLASH $1); BK=$BKUPDIR/$FN               ##form fullname of where to place the backup copy
  msg "$1 converting from $DET to utf8; was:$WAS"         ##msg
  mv -f $1 $BK; iconv -f $DET -t utf8 $BK >$1 2>>$LOGFILE; chmod --reference=$BK $1   ##==backup-rename + iconv-to-utf8 replacing original + keep perms
  if [[ $MET && $1 == *@(html|htm) ]];then                ##if an html file (by filename)...
    msg "$1 chgseding met:$MET->utf-8"                    ##msg
    chgsed -k -n "/charset=[^\"'>]*/charset=utf-8/" $1    ##==revise META-CHARSET in html file  <--REVIEW NEEDED: FROM-pattern is a reconstructed guess
  fi
}
doDir () {
  for F in $(ls -1 $1 |filterF);do doFile "$1/$F"; done   ##excluding with egrep simpler and likely faster than with bash tests??
}

prereqs charsetdetective chgsed chg fullnameNOSLASH       ##ensure required scripts are available
[ $# -eq 0 ] && set .                                     ##default to curdir if no dirs specified
[ -d "$BKUPDIR" ] || mkdir "$BKUPDIR"                     ##ensure backup-directory exists
msg "$(date +%Y%m%d-%H%M): cvt-textfiles-to-utf8-charset $@"        ##logmsg with timestamp and cmdline
for DIR in "$@";do                                        ##for each DIR on cmdline...
  for D in $(find $DIR -type d |filterD |sort);do doDir $D; done    ##do DIR and its subdirs, recursively, in alphabetic order
done
exit

==================
==    NOTES     ==
==================
--checking MET inconsistencies:
  MET examples: ISO8859-1 ISO-8859-1 iso-8859-1 iso-8859-15 windows-1252 UTF-8 utf-8
  can use globs: *8859-1 *8859-15 *1252 utf*  <--am lowercasing so the last glob can be: utf*
  for my files, the META-charset is best ignored when it disagrees with DET or with my tests; in all but one such disagreement MET is wrong, and for that one
  using DET's UTF is also fine (it's a mixed-encoding mess best left unconverted);
--handling inconsistencies (condensed as a sketch below):
  if any of the tests indicates UTF, then treat as UTF (ie: leave unconverted); with a diagnostic-msg if the tests disagree;
  if none of the tests indicates UTF (but my tests indicate ISO or WIN) then the file will be converted:
    if DET is unknown and there is a META-charset, then the META-charset is used as the FROM-charset;
    if DET is unknown and there is no META-charset, then DEF (cp1252) is used as the FROM-charset;
    otherwise DET (as determined by charsetdetective) is used as the FROM-charset;
  with a diagnostic-msg if using a FROM-charset that differs from the META-charset, and/or from "the usual";
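--the above FROM-charset decision, condensed into a sketch (illustration only; the function name is hypothetical, the script itself does this inline in doFile,
  using the same variables UTF, DET, MET, DEF):
     pick_from_charset () {                                          ##echo the charset to convert FROM; echo nothing to leave the file unconverted
       [[ $UTF || $DET == *utf* || $MET == *utf* ]] && return        ##any utf indication => leave unconverted
       [[ $DET == unknown && $MET ]] && { echo $MET; return; }       ##charsetdetective stumped => trust the META-charset
       [[ $DET == unknown ]]         && { echo $DEF; return; }       ##no META-charset either => assume $DEF
       echo $DET                                                     ##otherwise trust charsetdetective
     }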

==================
==  CHANGE-LOG  ==
==================
Revising any meta-Content-Type-charset within the converted file; but only on *htm files -- to avoid spurious revision to scripts (eg: this file);
iconv errmsgs: redirected stderr (to LOG) so the user can tell which file was the cause;
omitting call to file-cmd; wasn't using the result; ie: decided to simplify, despite the advantages of keeping the info-gathering part the same as in find-anomalous...
revised WIN test; eg hex 81, although permitted in some of the DOS/IBM/Windows codepages (eg ibm850), is NOT permitted in cp1252:
  cp437(DOS-default):  all hi-bit-chars 80..ff are valid; and it has graphic-chars for 01..1f and 7f, although these are not normally used in files;
  cp850(DOS-european): all hi-bit-chars 80..ff are valid; and it has graphic-chars for 01..1f and 7f, although these are not normally used in files;
  cp858(with d5=euro): all hi-bit-chars 80..ff are valid; and it has graphic-chars for 01..1f and 7f, although these are not normally used in files;
  cp1252(W-european):  hex 81, 8d, 8f, 90, 9d -- are unused;
  cp1250(C-european):  hex 81, 83, 88, 90, 98 -- are unused;
  cp1251(Cyrillic):    hex 98 -- unused;  <--my new tests will treat this as "DOS"
  cp1255(Hebrew):      hex 81, 8a, 8c..90, 9a, 9c..9f, ETC -- unused;
  --the new WIN test outlawing hex 81 will catch at least some cases of a file in an old "DOS" encoding, however it results in cp1251 being lumped in with DOS/IBM;
    (a stricter variant, outlawing all five bytes that cp1252 leaves unassigned, is sketched at the end of these notes;)
  --note that hex 81, u-umlaut, should be reasonably common in cp850/858/437 files; hex 90, E-acute, is another candidate that might be better for some people;
added LC_CTYPE=C to work for a utf8-user;
preserve permissions using chmod --reference; considered also preserving the timestamp using touch -r (sketched at the end of these notes);
2011-01-06: BEWARE: bash-v4 has changed the meaning of =~ within double-square-brackets, specifically what quoting in the RHS does; best to avoid it, using == and
  the extglob extensions since those work everywhere;
  add:     shopt -s extglob
  replace: [[ $UTF && $DET =~ utf && $MET =~ 'utf|^$' ]]  -->  [[ $UTF && $DET == *utf* && $MET == @(*utf*|'') ]]
           [[ $UTF || $DET =~ utf || $MET =~ 'utf' ]]     -->  [[ $UTF || $DET == *utf* || $MET == *utf* ]]
           [[ $MET && $1 =~ 'html?$' ]]                   -->  [[ $MET && $1 == *@(html|htm) ]]
  --made the above revisions, with very little testing;
2011-01-06: "continue" within the doFile routine ought to be "return" (smacks of relying on an undocumented "feature" that might stop working in a new version of bash);
  have so revised; all present uses of "return" in doFile except the 2 in the 1st line resulted from this revision;
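--a stricter WIN test, sketched here only (untested; $F stands for the file being tested): since cp1252 leaves hex 81, 8d, 8f, 90, 9d unassigned, a file containing
  any of them is more likely DOS/IBM (or cp1251) than cp1252:
     egrep -q $'[\x81\x8d\x8f\x90\x9d]' $F && echo "$F contains bytes unassigned in cp1252"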
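--timestamp preservation, if wanted, would be one more call right after the chmod --reference call in doFile; an untested sketch:
     touch --reference=$BK $1                             ##copy the backup's timestamps onto the converted file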