#!/bin/bash
## find-anomalous-textfiles -- identify textfiles whose charset/encoding is in doubt, as an aid during conversion to utf8; by Eugene Reimer 2009-08;
##
## USAGE: find-anomalous-textfiles DIRECTORY...  --will analyze textfiles below those Directories, recursively, reporting on troublesome cases;
##
## Some utf8 files are also valid 8859-x (the reverse is also possible though less likely).  Files valid as both cp125x and utf8 are somewhat more likely,
## so if you have a mixture of those you may want additional heuristics to distinguish them.  To convert (iconv) your 8859-x and/or cp125x files to utf8,
## you need to know "x".  Most people will, for most files.  If you don't, you'll need heuristics, such as looking for common English words, common Czech
## words, etc; see: http://ereimer.net/programs/charsetdetective.htm or the Mozilla source it's based on.
##
## This script uses very simple tests; it exists mainly to get around deficiencies in the file command, although its filtering was the original reason.
## However if you've installed charsetdetective, then this script will use it for better answers on files where the simple tests are not enough.
##
## Copyright © 2009 Eugene Reimer; can be used, modified, copied, and distributed or sold under the terms of either the LGPL or the GPL (your choice);
## see http://www.gnu.org/licenses for the details of these terms.

shopt -s extglob     ##enable extglob for @(...) etc
LC_CTYPE=C           ##NEED an 8-bit charset for grep to work on bytes, as opposed to utf8 characters!!

filterF () { egrep -vi '(jpg|png|gif|tif|p[abgp]m|avi|flv|wmv|mov|ogg|mpg|mp3|VRO|pdf|doc|xls|zip|gz|tgz|~)$'; }      ##filter for filenames
filterD () { egrep -v '/\.| |nsmail|/pix/[0-9]|/pix/bkup|/pix/pkg'; }                                                 ##filter for dir-names <--REVIEW NEEDED
filtMET () { sed "s/.*\(charset\|CHARSET\)=\([^\"']*\).*/\2/" |tr '\n' ' ' |tr A-Z a-z |sed 's/ $//'; }               ##extract charset from META-Content-Type
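## Example (illustrative only): given a META line such as
##     <meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">
## filtMET yields "iso-8859-1" -- the charset value, lower-cased, with any newlines collapsed to spaces.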

det () {             ##invoke charsetdetective, if installed
	if [[ $(type -p charsetdetective) != "" ]];then charsetdetective "$1";
	elif [ $DETMSG ];then echo "";
	else echo "find-anomalous-textfiles: charsetdetective needed -- get it from http://ereimer.net/programs/charsetdetective.htm" >&2; DETMSG=1; echo "";
	fi
}

doFile () {
	[ -L "$1" ] && return; [ -f "$1" ] || return          ##skip dir or other non-file
	NTX=; HTM=; ASC=; ISO=; WIN=; UTF=; MET=; DET=; enc=;
	X=$(file -b "$1")                                     ##get output from the Linux file command; consider -i for mime-style w charset=
	[[ $X != *text* && $X != *empty* ]] && NTX=NONTEXT    ##make note of file-command proclaiming it to be non-text; not altogether trustworthy?
	[[ $X == *HTML* ]] && HTM=HTM                         ##make note of file-command proclaiming it to be HTML
	[[ $X == *@(ASCII|ISO|UTF)* ]] && enc=Y               ##make note of file-command having provided Encoding-info (rare on HTML)
	MET=$(grep -i 'charset=' "$1" |filtMET)               ##get charset from META-Content-Type tag, if any
	if   ! egrep -vq $'^[\x09-\x0d\x20-\x7e]*$' $1;then ASC=ASCII            ##pattern matches ASCII line; egrep -v succeeds on a non-ASCII file; negated=>ASCII
	else                                                  ##remaining tests only on non-ASCII...
	  if   ! egrep -vq $'^[\x09-\x0d\x20-\x7e\xa0-\xff]*$' $1;then ISO=ISO           ##pattern matches 8859 line;  egrep -v succeeds on a non-ISO file; negated=>ISO-8859
	  elif ! egrep -vq $'^[\x09-\x0d\x20-\x7e\x80\x81-\xff]*$' $1;then WIN=WIN;fi    ##pattern matches WINcp line; egrep -v succeeds on a non-WIN file; negated=>WIN-cp
	  if   ! egrep -vq $'^([\x09-\x0d\x20-\x7e]|[\xC2-\xDF][\x80-\xBF]|\xE0[\xA0-\xBF][\x80-\xBF]|[\xE1-\xEC\xEE\xEF][\x80-\xBF][\x80-\xBF]|\xED[\x80-\x9F][\x80-\xBF]|\xF0[\x90-\xBF][\x80-\xBF][\x80-\xBF]|[\xF1-\xF3][\x80-\xBF][\x80-\xBF][\x80-\xBF]|\xF4[\x80-\x8F][\x80-\xBF][\x80-\xBF])*$' $1
	  then UTF=UTF;fi                                     ##pattern matches UTF-8 line;  egrep -v succeeds on a non-UTF file; negated=>UTF-8
	fi
	if [[ $ISO$WIN$UTF ]];then DET=$(det "$1" |sed 's|.*: |det:|' |tr A-Z a-z);fi    ##get the answer from charsetdetective
	[[ $MET != "" ]] && MET="met:$MET"
	if false;then true                                                               ##sounds silly but needed so elif's are yankable
	##elif [[ $NTX && $ASC ]];then echo "$1: $X ==$NTX+$ASC$ISO$WIN$UTF+$MET"        ##produce msg for: nontext according to file-cmd, but ASC according to my tests
	##elif [[ $HTM ]];then echo "$1: $X ==$NTX+$ASC$ISO$WIN$UTF+$enc"                ##produce msg for: HTML according to file-cmd, showing whether Encoding-info provided
	elif [[ $ISO$WIN$UTF ]];then echo "$1: $X ==$NTX+$ASC$ISO$WIN$UTF+$MET+$DET"     ##produce msg for: 8859-1/cp1252/utf8 with non-ascii chars, according to my tests
	fi
}

doDir () {
	for F in $(ls -1 $1 |filterF);do doFile "$1/$F"; done     ##excluding with egrep simpler and likely faster than with bash tests??
}

[ $# -eq 0 ] && set .        ##default to curdir if no dirs specified
for DIR in "$@";do           ##for each DIR on cmdline...
	for D in $(find $DIR -type d |filterD |sort);do doDir $D; done    ##do DIR and its subdirs, recursively, in alphabetic order
done
exit

==================
==    NOTES     ==
==================

ISO-CHARSETS and corresponding Windows-CODEPAGES (most do not correspond in the nice way that 8859-1 and cp1252 do):
	iso-8859-1    windows-1252    (western european)
	iso-8859-2    windows-1250    (central european)
	iso-8859-3    windows-1254    (turkish)
	iso-8859-4    windows-1257    (baltic: latvian, lithuanian)
	iso-8859-5    windows-1251    (cyrillic)
	iso-8859-6    windows-1256    (arabic)
	iso-8859-7    windows-1253    (greek)
	iso-8859-8    windows-1255    (hebrew; cp1255 is superset of 8859-8)
	iso-8859-9    windows-1254    (turkish amended)
	iso-8859-10   windows-1252    (western european amended for nordic languages)
	iso-8859-11   windows-?       (thai)
	iso-8859-13   windows-1257    (baltic plus polish)
	iso-8859-14   windows-?       (celtic)
	iso-8859-15   windows-1252    (western european amended)
	iso-8859-16   windows-1250?   (central european)
	big5          windows-950     (chinese traditional: taiwan, hong-kong)
	iso10646-1    --the Unicode character-set, which is separate from the encoding; ie: fonts may be iso10646-1, webpages may be utf8 or utf16 etc;
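
Once you know "x", the conversion mentioned in the header is a one-liner; a minimal sketch (the charset names and filenames
here are only placeholders -- "iconv -l" lists the names your iconv accepts):
	iconv -f ISO-8859-2   -t UTF-8 infile.txt > outfile.txt    ##a central-european 8859-2 file
	iconv -f WINDOWS-1250 -t UTF-8 infile.txt > outfile.txt    ##the same text, if it was really cp1250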

TEST for valid utf8-sequence (in Perl) from http://www.w3.org/International/questions/qa-forms-utf-8.en.php is:
	$field =~ m/\A(
	   [\x09\x0A\x0D\x20-\x7E]             # ASCII
	 | [\xC2-\xDF][\x80-\xBF]              # non-overlong 2-byte
	 |  \xE0[\xA0-\xBF][\x80-\xBF]         # excluding overlongs
	 | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}   # straight 3-byte
	 |  \xED[\x80-\x9F][\x80-\xBF]         # excluding surrogates
	 |  \xF0[\x90-\xBF][\x80-\xBF]{2}      # planes 1-3
	 | [\xF1-\xF3][\x80-\xBF]{3}           # planes 4-15
	 |  \xF4[\x80-\x8F][\x80-\xBF]{2}      # plane 16
	)*\z/x;
NOTE: the w3.org defn of "pure-ASCII" allows only TAB, NL, CR control-chars (\x09 \x0A \x0D);
NOTE: my definition also allows BS, VT, FF (\x08 \x0b \x0c); ergo I'm replacing first test with: [\x09-\x0d\x20-\x7e];
NOTE: file-command also allows BS, FF (but not VT)

FILE-COMMAND is close to what's needed, but has some shortcomings:
	for utf8        "UTF-8 Unicode text"            (is fine)
	for iso8859-1   "ISO-8859 text"                 (is fine; file could be any 8859-x)
	for cp1252      "Non-ISO extended-ASCII text"   (is fine; file could be any cp125x)
	for ascii+VT    "data"                          <==NOTE: file-cmd outlaws (mistakenly?) VT control-chars==!!==
	for ascii+FF    "ASCII text"                    (is fine wrt FF)
	for ascii+BS    "ASCII text, with overstriking" (is fine wrt BS)
	for ascii+CR    "ASCII text, with CRLF line terminators"; when CR occurs elsewhere (than at line-end) it says "...with CR, LF line terminators"
	for cp1251      "ISO-8859"                      <==eg: noper-lang-ru detected as 8859, by my tests too; nonsense in 8859-5, sense in cp1251==!!==
	for big5        "ISO-8859 text"                 <==NOTE: a better test is easily written==!!==
	HTML files get "HTML document text"             <==NOTE: no info about charset/encoding (more info below)==!!==
	Postscript fragments are misdiagnosed as C-programs       <--not serious
	Javascript programs are misdiagnosed as C++ programs      <--not serious
	old from-DOS files are misdiagnosed as data                <==NOTE: dubious wrt trailing Ctrl+Z, as are my tests; FIX the files??
	RTF files are declared to be "data"                        <==RTF files are text just as well as HTML or shell-scripts are==??==
	SVG images are declared to be "image"                      <==SVG files are arguably "text" as well??
	Fortran programs get "FORTRAN program"                     <==NOTE: these lack "text", whereas C/C++/Pascal/shell-scripts/etc are "text"==??==
	with -i (mime-style) then get:
		text/plain; charset=unknown    (for cp1252)
		text/plain; charset=utf-8      (for utf8)
		text/html                      (for HTML file)    <==NOTE: still no charset-info for HTML==!!==

HTML-files: file-cmd does sometimes provide meaningful charset info, though I fail to understand under what circumstances;
	see: /home/ereimer/doc/*ReimerPeterR* <--two HTML files (now 3 including tmpReimerPeterR.htm):
	eg: one file having such a charset meta-tag is reported as being ISO-8859, and that's what it really is;
	however, on another file with the same meta-tag, also without any windows-only code-points, it says nothing about charset -- very strange??
	briefly thought such a meta-tag triggered file-cmd's reporting charset; briefly thought CRLF endings were involved; neither was the answer;
	AHA, presence of a DOCTYPE appears to be the answer==!!== (it's not quite that simple wrt the XHTML variants...)
	note: get the same info with or without -i in these cases (only the format changes);
	file-cmd is also erratic wrt reporting "with very long lines" -- looks as though files not getting charset-info also don't get long-lines-info??
	some HTML files get "exported SGML document" instead??
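
A quick way to see those answers side by side for one HTML file; an illustrative sketch ("page.html" is a placeholder, and the
outputs in the comments are typical rather than guaranteed):
	file -b  page.html        ##eg: "HTML document text" -- often no charset info
	file -bi page.html        ##eg: "text/html" -- mime-style, still no charset= for many HTML files
	grep -i 'charset=' page.html |sed "s/.*\(charset\|CHARSET\)=\([^\"']*\).*/\2/"    ##fall back on the META-Content-Type tag itself, as filtMET does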

UPGRADED file-cmd (from 4.13) to 5.03; no change wrt quirky lack of info on HTML files; no change on VT control-chars, nor Ctrl+Z;
	noticed several English-text files by Kip, me, Peggy are diagnosed as being FORTRAN-programs -- may not be a new quirk(?);
	==send suggestions to the file mailing-list: http://mx.gw.com/mailman/listinfo/file

SUMMARY: IMHO, the following ought to be "text": FORTRAN-program, RTF-data, vCalendar (others such as SVG-image, font-metrics, libtool-object may be arguable);
	all "text" cases ought to get encoding info (whether ASCII, ISO-8859, Non-ISO extended-ASCII, UTF-8) --
	note: some "text" files, eg HTML/XML/GEDCOM, often get no encoding info
	(absence of DOCTYPE seemingly results in absence of encoding-info; for the XHTML alternative to DOCTYPE, it seems to matter whether it is on one line);
	some HTML files are described as "exported SGML document" instead of "HTML document", and I'm unable to appreciate how that's useful;

MORE FILE-CMD QUIRKS: when running on all my files, encountered a few more rather surprising misdiagnoses, where perfectly ordinary
	Plautdietsch prose in iso8859-1/cp1252 is pronounced to be MPEG-4 or a BOA archive:
	/pix/er-JackThiessen-writings-v1/jt143-Vaeaspruch-eml.txt: MPEG-4 LOAS ==NONTEXT+ISO+det:windows-1252
	/pix/er-JackThiessen-writings-v2/m20081130-000104--janda-ToMANY-QP-BOARESAULW-mbox.txt: BOA archive data ==NONTEXT+ISO+det:windows-1252

CHANGES
2009-08: filtMET: output was messy for a file with more than one META-Content-Type-tag; but don't know which one a browser will use, so added tr NL-->SPC;
	also fixed it to handle content= preceding http-equiv= (common in emails, tho now skipping nsmail dirs)
2009-08: NONTEXT+ASC msgs became tiresome on RTF, FORTRAN, SVG, font-metrics, libtool-object-file, etc; yanked it (know enough about flaws in file-cmd)
2009-08: added to filterD -- things sensible for my file-tree although likely not for anyone else's;
2009-08: added further tests to help distinguish utf8 from cp1252, and the different kinds of cp125x from each other, by using charsetdetective;
2009-09: added LC_CTYPE=C to work for utf8-user;
2009-09: note: this script uses a very slightly looser test for the "WIN" codepage than does my cvt-textfiles-to-utf8-charset script;
	the troublesome char being hex 81, allowed by cp1250 and the old cp437/850/858, but not by cp1252, cp1251, cp1255, etc;
2011-01-06: BEWARE: bash-v4 has changed the meaning of =~ within double-square-brackets, specifically what quoting in the RHS does;
	best to avoid it, using == and the extglob extensions since those work everywhere; added: shopt -s extglob
	revised: [[ $X =~ "ASCII|ISO|UTF" ]]  -->  [[ $X == *@(ASCII|ISO|UTF)* ]]
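
A minimal runnable sketch of that last revision (the sample string is made up; in the script it comes from "file -b"):
	shopt -s extglob                                                  ##without extglob, @(...) is not a pattern
	X="UTF-8 Unicode text"
	[[ $X == *@(ASCII|ISO|UTF)* ]] && echo "encoding info present"    ##glob form: behaves the same in bash 3 and bash 4
	##the old form, [[ $X =~ "ASCII|ISO|UTF" ]], is best avoided: newer bash treats a quoted RHS as a literal string rather than a regex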