#!/bin/bash
## find-anomalous-textfiles -- identify textfiles whose charset/encoding is in doubt, as an aid during conversion to utf8; by Eugene Reimer 2009-08;
##
## USAGE: find-anomalous-textfiles DIRECTORY...  --will analyze textfiles below those Directories, recursively, reporting on troublesome cases;
##
## Some utf8 files are also valid 8859-x (the reverse is also possible though less likely).  Files valid as both cp125x and utf8 are somewhat more likely,
## so if you have a mixture of those you may want additional heuristics to distinguish them.  To convert (iconv) your 8859-x and/or cp125x files to utf8,
## you need to know "x".  Most people will, for most files.  If you don't, you'll need heuristics, such as looking for common English words, common Czech
## words, etc; see: http://ereimer.net/programs/charsetdetective.htm or the Mozilla source it's based on.
##
## This script uses very simple tests; it exists mainly to get around deficiencies in the file command, although its filtering was the original reason.
## However if you've installed charsetdetective, then this script will use it for better answers on files where the simple tests are not enough.
##
## Copyright © 2009 Eugene Reimer; can be used, modified, copied, and distributed or sold under the terms of either the LGPL or the GPL (your choice);
## see http://www.gnu.org/licenses for the details of these terms.

shopt -s extglob     ##enable extglob for @(...) etc
LC_CTYPE=C           ##NEED an 8-bit charset for grep to work on bytes, as opposed to utf8 characters!!

filterF () { egrep -vi '(jpg|png|gif|tif|p[abgp]m|avi|flv|wmv|mov|ogg|mpg|mp3|VRO|pdf|doc|xls|zip|gz|tgz|~)$'; }      ##filter for filenames
filterD () { egrep -v '/\.| |nsmail|/pix/[0-9]|/pix/bkup|/pix/pkg'; }                                                 ##filter for dir-names <--REVIEW NEEDED
filtMET () { sed "s/.*\(charset\|CHARSET\)=\([^\"']*\).*/\2/" |tr '\n' ' ' |tr A-Z a-z |sed 's/ $//'; }               ##extract charset from META-Content-Type
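## Example (illustrative only): given a META line such as
##     <meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">
## filtMET yields "iso-8859-1" -- the charset value, lower-cased, with any newlines collapsed to spaces.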

det () {             ##invoke charsetdetective, if installed
	if [[ $(type -p charsetdetective) != "" ]];then charsetdetective "$1";
	elif [ $DETMSG ];then echo "";
	else echo "find-anomalous-textfiles: charsetdetective needed -- get it from http://ereimer.net/programs/charsetdetective.htm" >&2; DETMSG=1; echo "";
	fi
}

doFile () {
	[ -L "$1" ] && return; [ -f "$1" ] || return          ##skip dir or other non-file
	NTX=; HTM=; ASC=; ISO=; WIN=; UTF=; MET=; DET=; enc=;
	X=$(file -b "$1")                                     ##get output from the Linux file command; consider -i for mime-style w charset=
	[[ $X != *text* && $X != *empty* ]] && NTX=NONTEXT    ##make note of file-command proclaiming it to be non-text; not altogether trustworthy?
	[[ $X == *HTML* ]] && HTM=HTM                         ##make note of file-command proclaiming it to be HTML
	[[ $X == *@(ASCII|ISO|UTF)* ]] && enc=Y               ##make note of file-command having provided Encoding-info (rare on HTML)
	MET=$(grep -i 'charset=' "$1" |filtMET)               ##get charset from META-Content-Type tag, if any
	if   ! egrep -vq $'^[\x09-\x0d\x20-\x7e]*$' $1;then ASC=ASCII            ##pattern matches ASCII line; egrep -v succeeds on a non-ASCII file; negated=>ASCII
	else                                                  ##remaining tests only on non-ASCII...
	  if   ! egrep -vq $'^[\x09-\x0d\x20-\x7e\xa0-\xff]*$' $1;then ISO=ISO           ##pattern matches 8859 line;  egrep -v succeeds on a non-ISO file; negated=>ISO-8859
	  elif ! egrep -vq $'^[\x09-\x0d\x20-\x7e\x80\x81-\xff]*$' $1;then WIN=WIN;fi    ##pattern matches WINcp line; egrep -v succeeds on a non-WIN file; negated=>WIN-cp
	  if   ! egrep -vq $'^([\x09-\x0d\x20-\x7e]|[\xC2-\xDF][\x80-\xBF]|\xE0[\xA0-\xBF][\x80-\xBF]|[\xE1-\xEC\xEE\xEF][\x80-\xBF][\x80-\xBF]|\xED[\x80-\x9F][\x80-\xBF]|\xF0[\x90-\xBF][\x80-\xBF][\x80-\xBF]|[\xF1-\xF3][\x80-\xBF][\x80-\xBF][\x80-\xBF]|\xF4[\x80-\x8F][\x80-\xBF][\x80-\xBF])*$' $1
	  then UTF=UTF;fi                                     ##pattern matches UTF-8 line;  egrep -v succeeds on a non-UTF file; negated=>UTF-8
	fi
	if [[ $ISO$WIN$UTF ]];then DET=$(det "$1" |sed 's|.*: |det:|' |tr A-Z a-z);fi    ##get the answer from charsetdetective
	[[ $MET != "" ]] && MET="met:$MET"
	if false;then true                                                               ##sounds silly but needed so elif's are yankable
	##elif [[ $NTX && $ASC ]];then echo "$1: $X ==$NTX+$ASC$ISO$WIN$UTF+$MET"        ##produce msg for: nontext according to file-cmd, but ASC according to my tests
	##elif [[ $HTM ]];then echo "$1: $X ==$NTX+$ASC$ISO$WIN$UTF+$enc"                ##produce msg for: HTML according to file-cmd, showing whether Encoding-info provided
	elif [[ $ISO$WIN$UTF ]];then echo "$1: $X ==$NTX+$ASC$ISO$WIN$UTF+$MET+$DET"     ##produce msg for: 8859-1/cp1252/utf8 with non-ascii chars, according to my tests
	fi
}

doDir () {
	for F in $(ls -1 $1 |filterF);do doFile "$1/$F"; done     ##excluding with egrep simpler and likely faster than with bash tests??
}

[ $# -eq 0 ] && set .        ##default to curdir if no dirs specified
for DIR in "$@";do           ##for each DIR on cmdline...
	for D in $(find $DIR -type d |filterD |sort);do doDir $D; done    ##do DIR and its subdirs, recursively, in alphabetic order
done
exit

==================
==    NOTES     ==
==================

ISO-CHARSETS and corresponding Windows-CODEPAGES (most do not correspond in the nice way that 8859-1 and cp1252 do):
	iso-8859-1    windows-1252    (western european)
	iso-8859-2    windows-1250    (central european)
	iso-8859-3    windows-1254    (turkish)
	iso-8859-4    windows-1257    (baltic: latvian, lithuanian)
	iso-8859-5    windows-1251    (cyrillic)
	iso-8859-6    windows-1256    (arabic)
	iso-8859-7    windows-1253    (greek)
	iso-8859-8    windows-1255    (hebrew; cp1255 is superset of 8859-8)
	iso-8859-9    windows-1254    (turkish amended)
	iso-8859-10   windows-1252    (western european amended for nordic languages)
	iso-8859-11   windows-?       (thai)
	iso-8859-13   windows-1257    (baltic plus polish)
	iso-8859-14   windows-?       (celtic)
	iso-8859-15   windows-1252    (western european amended)
	iso-8859-16   windows-1250?   (central european)
	big5          windows-950     (chinese traditional: taiwan, hong-kong)
	iso10646-1    --the Unicode character-set, which is separate from the encoding; ie: fonts may be iso10646-1, webpages may be utf8 or utf16 etc;
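
Once you know "x", the conversion mentioned in the header is a one-liner; a minimal sketch (the charset names and filenames
here are only placeholders -- "iconv -l" lists the names your iconv accepts):
	iconv -f ISO-8859-2   -t UTF-8 infile.txt > outfile.txt    ##a central-european 8859-2 file
	iconv -f WINDOWS-1250 -t UTF-8 infile.txt > outfile.txt    ##the same text, if it was really cp1250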

TEST for valid utf8-sequence (in Perl) from http://www.w3.org/International/questions/qa-forms-utf-8.en.php is:
	$field =~ m/\A(
	   [\x09\x0A\x0D\x20-\x7E]             # ASCII
	 | [\xC2-\xDF][\x80-\xBF]              # non-overlong 2-byte
	 |  \xE0[\xA0-\xBF][\x80-\xBF]         # excluding overlongs
	 | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}   # straight 3-byte
	 |  \xED[\x80-\x9F][\x80-\xBF]         # excluding surrogates
	 |  \xF0[\x90-\xBF][\x80-\xBF]{2}      # planes 1-3
	 | [\xF1-\xF3][\x80-\xBF]{3}           # planes 4-15
	 |  \xF4[\x80-\x8F][\x80-\xBF]{2}      # plane 16
	)*\z/x;
NOTE: the w3.org defn of "pure-ASCII" allows only TAB, NL, CR control-chars (\x09 \x0A \x0D);
NOTE: my definition also allows BS, VT, FF (\x08 \x0b \x0c); ergo I'm replacing first test with: [\x09-\x0d\x20-\x7e];
NOTE: file-command also allows BS, FF (but not VT)

FILE-COMMAND is close to what's needed, but has some shortcomings:
	for utf8        "UTF-8 Unicode text"            (is fine)
	for iso8859-1   "ISO-8859 text"                 (is fine; file could be any 8859-x)
	for cp1252      "Non-ISO extended-ASCII text"   (is fine; file could be any cp125x)
	for ascii+VT    "data"                          <==NOTE: file-cmd outlaws (mistakenly?) VT control-chars==!!==
	for ascii+FF    "ASCII text"                    (is fine wrt FF)
	for ascii+BS    "ASCII text, with overstriking" (is fine wrt BS)
	for ascii+CR    "ASCII text, with CRLF line terminators"; when CR occurs elsewhere (than at line-end) it says "...with CR, LF line terminators"
	for cp1251      "ISO-8859"                      <==eg: noper-lang-ru detected as 8859, by my tests too; nonsense in 8859-5, sense in cp1251==!!==
	for big5        "ISO-8859 text"                 <==NOTE: a better test is easily written==!!==
	HTML files get "HTML document text"             <==NOTE: no info about charset/encoding (more info below)==!!==
	Postscript fragments are misdiagnosed as C-programs       <--not serious
	Javascript programs are misdiagnosed as C++ programs      <--not serious
	old from-DOS files are misdiagnosed as data                <==NOTE: dubious wrt trailing Ctrl+Z, as are my tests; FIX the files??
	RTF files are declared to be "data"                        <==RTF files are text just as well as HTML or shell-scripts are==??==
	SVG images are declared to be "image"                      <==SVG files are arguably "text" as well??
	Fortran programs get "FORTRAN program"                     <==NOTE: these lack "text", whereas C/C++/Pascal/shell-scripts/etc are "text"==??==
	with -i (mime-style) then get:
		text/plain; charset=unknown    (for cp1252)
		text/plain; charset=utf-8      (for utf8)
		text/html                      (for HTML file)    <==NOTE: still no charset-info for HTML==!!==

HTML-files: file-cmd does sometimes provide meaningful charset info, though I fail to understand under what circumstances;
	see: /home/ereimer/doc/*ReimerPeterR* <--two HTML files (now 3 including tmpReimerPeterR.htm):
	eg: one file having such a charset meta-tag is reported as being ISO-8859, and that's what it really is;
	however, on another file with the same meta-tag, also without any windows-only code-points, it says nothing about charset -- very strange??
	briefly thought such a meta-tag triggered file-cmd's reporting charset; briefly thought CRLF endings were involved; neither was the answer;
	AHA, presence of a DOCTYPE appears to be the answer==!!== (it's not quite that simple wrt the XHTML variants...)
	note: get the same info with or without -i in these cases (only the format changes);
	file-cmd is also erratic wrt reporting "with very long lines" -- looks as though files not getting charset-info also don't get long-lines-info??
	some HTML files get "exported SGML document" instead??
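
A quick way to see those answers side by side for one HTML file; an illustrative sketch ("page.html" is a placeholder, and the
outputs in the comments are typical rather than guaranteed):
	file -b  page.html        ##eg: "HTML document text" -- often no charset info
	file -bi page.html        ##eg: "text/html" -- mime-style, still no charset= for many HTML files
	grep -i 'charset=' page.html |sed "s/.*\(charset\|CHARSET\)=\([^\"']*\).*/\2/"    ##fall back on the META-Content-Type tag itself, as filtMET does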

UPGRADED file-cmd (from 4.13) to 5.03; no change wrt quirky lack of info on HTML files; no change on VT control-chars, nor Ctrl+Z;
	noticed several English-text files by Kip, me, Peggy are diagnosed as being FORTRAN-programs -- may not be a new quirk(?);
	==send suggestions to the file mailing-list: http://mx.gw.com/mailman/listinfo/file

SUMMARY: IMHO, the following ought to be "text": FORTRAN-program, RTF-data, vCalendar (others such as SVG-image, font-metrics, libtool-object may be arguable);
	all "text" cases ought to get encoding info (whether ASCII, ISO-8859, Non-ISO extended-ASCII, UTF-8) --
	note: some "text" files, eg HTML/XML/GEDCOM, often get no encoding info
	(absence of DOCTYPE seemingly results in absence of encoding-info; for the XHTML alternative to DOCTYPE, it seems to matter whether it is on one line);
	some HTML files are described as "exported SGML document" instead of "HTML document", and I'm unable to appreciate how that's useful;

MORE FILE-CMD QUIRKS: when running on all my files, encountered a few more rather surprising misdiagnoses, where perfectly ordinary
	Plautdietsch prose in iso8859-1/cp1252 is pronounced to be MPEG-4 or a BOA archive:
	/pix/er-JackThiessen-writings-v1/jt143-Vaeaspruch-eml.txt: MPEG-4 LOAS ==NONTEXT+ISO+det:windows-1252
	/pix/er-JackThiessen-writings-v2/m20081130-000104--janda-ToMANY-QP-BOARESAULW-mbox.txt: BOA archive data ==NONTEXT+ISO+det:windows-1252

CHANGES
2009-08: filtMET: output was messy for a file with more than one META-Content-Type-tag; but don't know which one a browser will use, so added tr NL-->SPC;
	also fixed it to handle content= preceding http-equiv= (common in emails, tho now skipping nsmail dirs)
2009-08: NONTEXT+ASC msgs became tiresome on RTF, FORTRAN, SVG, font-metrics, libtool-object-file, etc; yanked it (know enough about flaws in file-cmd)
2009-08: added to filterD -- things sensible for my file-tree although likely not for anyone else's;
2009-08: added further tests to help distinguish utf8 from cp1252, and the different kinds of cp125x from each other, by using charsetdetective;
2009-09: added LC_CTYPE=C to work for utf8-user;
2009-09: note: this script uses a very slightly looser test for the "WIN" codepage than does my cvt-textfiles-to-utf8-charset script;
	the troublesome char being hex 81, allowed by cp1250 and the old cp437/850/858, but not by cp1252, cp1251, cp1255, etc;
2011-01-06: BEWARE: bash-v4 has changed the meaning of =~ within double-square-brackets, specifically what quoting in the RHS does;
	best to avoid it, using == and the extglob extensions since those work everywhere; added: shopt -s extglob
	revised: [[ $X =~ "ASCII|ISO|UTF" ]]  -->  [[ $X == *@(ASCII|ISO|UTF)* ]]
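
A minimal runnable sketch of that last revision (the sample string is made up; in the script it comes from "file -b"):
	shopt -s extglob                                                  ##without extglob, @(...) is not a pattern
	X="UTF-8 Unicode text"
	[[ $X == *@(ASCII|ISO|UTF)* ]] && echo "encoding info present"    ##glob form: behaves the same in bash 3 and bash 4
	##the old form, [[ $X =~ "ASCII|ISO|UTF" ]], is best avoided: newer bash treats a quoted RHS as a literal string rather than a regex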