#!/bin/bash
## charsetdetect -- show simple (algorithmic) charset-test results, along with charsetdetective (heuristic) results for file(s); by Eugene Reimer 2009-08;
## the tests were devised for find-anomalous-textfiles which also has filtering within directories; this script applies those tests to individual files;
##
## USAGE: charsetdetect FILE... --will analyze textfiles reporting on charset;
##        with no FILE arguments, every file in the current directory is analyzed;
##
## Interpreting the Output:
## the "er" part has provable facts; the "det" part is from charsetdetective; a "met" part shows info from HTML META-Content-Type-tag, if present;
## a file containing nothing but ASCII-characters is said to be "ASC" (the file is also valid as "ISO", "WIN", and UTF-8 but the output doesn't say so);
## similarly a file said to be "ISO" is also valid as "WIN" (but the output doesn't say so);
## however, an "ISO" or "WIN" file may, but need not, also be valid as UTF-8 and the output gives both (as "ISOUTF" or "WINUTF") if and only if it is;
## note: "UTF" here means UTF-8 and nothing but; "ISO" here means any of the 8859-x encodings; "WIN" here means any of the cp125x encodings;
##
## Background:
## You sometimes receive a textfile in an encoding/charset other than what you use; having files in a mixture of encodings makes life hell; a wrongly
## converted file is the worst kind of hell, ergo when switching charsets (eg: from ISO-8859-1 to UTF-8) you have a stronger than usual need for consistency.
## To convert (with iconv) an 8859-x or cp125x file to utf-8, you need to know "x"; if you don't, you'll need heuristics, such as looking for common English
## words, common Czech words, etc, and that's what charsetdetective (http://ereimer.net/programs/charsetdetective.htm) does.
##
## Copyright © 2009,2011 Eugene Reimer; can be modified and/or distributed under the terms of the GPL; see http://www.gnu.org/licenses/gpl.html.

shopt -s extglob        ##enable extglob for @(...) etc
export LC_ALL=C         ##NEED an 8-bit charset for grep/sed to work on bytes, as opposed to utf8 characters!!
                        ##exported, and LC_ALL rather than LC_CTYPE, so that child processes really see it: an inherited LC_ALL
                        ##would override LC_CTYPE, and bracket ranges also depend on LC_COLLATE

##byte-level patterns; each one matches a whole LINE that is valid in the named charset family
PAT_ASC=$'^[\x08-\x0d\x20-\x7e]*$'                     ##BS|TAB|NL|VT|FF|CR plus printable ASCII
PAT_ISO=$'^[\x08-\x0d\x20-\x7e\xa0-\xff]*$'            ##ASCII plus 0xA0-0xFF (8859-x)
PAT_WIN=$'^[\x08-\x0d\x20-\x7e\x80\x81-\xff]*$'        ##ASCII plus 0x80-0xFF (cp125x)
PAT_UTF=$'^([\x08-\x0d\x20-\x7e]|[\xC2-\xDF][\x80-\xBF]|\xE0[\xA0-\xBF][\x80-\xBF]|[\xE1-\xEC\xEE\xEF][\x80-\xBF][\x80-\xBF]|\xED[\x80-\x9F][\x80-\xBF]|\xF0[\x90-\xBF][\x80-\xBF][\x80-\xBF]|[\xF1-\xF3][\x80-\xBF][\x80-\xBF][\x80-\xBF]|\xF4[\x80-\x8F][\x80-\xBF][\x80-\xBF])*$'

##filtMET: stdin = lines containing a META-Content-Type tag; stdout = the charset value(s), lowercased, space-separated, no trailing space
##  "charset" is matched case-insensitively, and lines without a charset= are dropped instead of being echoed whole
filtMET () {
    sed -n "s/.*[Cc][Hh][Aa][Rr][Ss][Ee][Tt]=\([^\"']*\).*/\1/p" |tr '\n' ' ' |tr A-Z a-z |sed 's/ $//'
}

##allLinesMatch PATTERN FILE: succeeds iff every line of FILE matches PATTERN
##  grep -v exits 1 when NO line fails the pattern; exit 2 (eg unreadable file) must not count as a match
allLinesMatch () {
    grep -Evq -- "$1" "$2"
    [[ $? -eq 1 ]]
}

##noteDetMissing: say (once, on stderr) that charsetdetective is not installed
##  must be called from the main shell for DETMSG to stick; a subshell's assignment is lost
noteDetMissing () {
    [[ -n $DETMSG ]] && return 0
    echo "charsetdetect: charsetdetective needed -- get it from http://ereimer.net/programs/charsetdetective.htm" >&2
    DETMSG=1
}

##det FILE: invoke charsetdetective on FILE, if installed; otherwise print an empty line
det () {
    if [[ -n $(type -p charsetdetective) ]]; then
        charsetdetective "$1"
    else
        noteDetMissing
        echo ""
    fi
}

##doFile FILE: print one line "FILE: er:<facts>[ met:<meta-charset>] det:<charsetdetective-answer>"
##  symlinks, directories and other non-regular files are skipped silently
doFile () {
    local ASC= ISO= WIN= UTF= MET= DET=
    [ -L "$1" ] && return
    [ -f "$1" ] || return                               ##skip dir or other non-file

    ##NOTE(review): the next statement and the ASCII test below were missing from the damaged source; they are
    ##reconstructed from the header comment and the NOTES section -- confirm the META pattern against the original
    MET=$(grep -i -- '<meta[^>]*content-type' "$1" |filtMET)

    if allLinesMatch "$PAT_ASC" "$1"; then
        ASC=ASC                                         ##every line is plain ASCII
    else                                                ##remaining tests only on non-ASCII...
        if   allLinesMatch "$PAT_ISO" "$1"; then ISO=ISO    ##every line is valid ISO-8859
        elif allLinesMatch "$PAT_WIN" "$1"; then WIN=WIN    ##every line is valid WIN-cp
        fi
        if   allLinesMatch "$PAT_UTF" "$1"; then UTF=UTF    ##every line is valid UTF-8
        fi
    fi

    ##warn here, in the main shell: det runs inside $(...) below, where setting DETMSG would be forgotten
    [[ -n $(type -p charsetdetective) ]] || noteDetMissing
    DET=$(det "$1" |sed 's|.*: ||' |tr A-Z a-z)         ##get the answer from charsetdetective

    [[ $MET != "" ]] && MET=" met:$MET"
    printf '%s\n' "$1: er:$ASC$ISO$WIN$UTF$MET det:$DET"    ##produce msg
}

[ $# -eq 0 ] && set -- *                                ##default to all files in curdir if no files specified
for F in "$@"; do doFile "$F"; done                     ##for each FILE on cmdline...
exit

==================
==    NOTES     ==
==================
ISO-CHARSETS and corresponding Windows-CODEPAGES (most do not correspond in the nice way that 8859-1 and cp1252 do):
    iso-8859-1      windows-1252    (western european; cp1252 is a superset of 8859-1)
    iso-8859-2      windows-1250    (central european)
    iso-8859-3      windows-1254    (turkish)
    iso-8859-4      windows-1257    (baltic: latvian, lithuanian)
    iso-8859-5      windows-1251    (cyrillic)
    iso-8859-6      windows-1256    (arabic)
    iso-8859-7      windows-1253    (greek)
    iso-8859-8      windows-1255    (hebrew; cp1255 is a superset of 8859-8)
    iso-8859-9      windows-1254    (turkish amended)
    iso-8859-10     windows-1252    (western european amended for nordic languages)
    iso-8859-11     windows-?       (thai)
    iso-8859-13     windows-1257    (baltic plus polish)
    iso-8859-14     windows-?       (celtic)
    iso-8859-15     windows-1252    (western european amended)
    iso-8859-16     windows-1250?   (central european)
    big5            windows-950     (chinese traditional: taiwan, hong-kong)
    iso10646-1      --the Unicode character-set, which is separate from the encoding; ie: fonts may be iso10646-1, webpages may be utf8 or utf16 etc;

TEST for valid utf8-sequence (in Perl) from http://www.w3.org/International/questions/qa-forms-utf-8.en.php is:
    $field =~
      m/\A(
         [\x09\x0A\x0D\x20-\x7E]            # ASCII
       | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
       |  \xE0[\xA0-\xBF][\x80-\xBF]        # excluding overlongs
       | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
       |  \xED[\x80-\x9F][\x80-\xBF]        # excluding surrogates
       |  \xF0[\x90-\xBF][\x80-\xBF]{2}     # planes 1-3
       | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
       |  \xF4[\x80-\x8F][\x80-\xBF]{2}     # plane 16
      )*\z/x;
NOTE: the w3.org defn of "ASCII" only allows 3 control-chars: TAB, NL, CR (\x09 \x0a \x0d);
NOTE: my definition allows those 3 plus: BS, VT, FF (\x08 \x0b \x0c); ie: I'm replacing first test with: [\x08-\x0d\x20-\x7e];
NOTE: file-command allows those 3 plus: BS, FF;

The charsetdetective sometimes reports "unknown" and occasionally gives an incorrect answer, making the "er" facts an important addition; similarly, the "facts" often
tell less than the whole story, so by themselves they're not enough; and that's why this program gives both;
for example, a file shown as "er:WINUTF" is valid as Windows-Codepage-encoded or UTF-8-encoded text;

===========
CHANGE-LOG:
===========
2011-03-20:
    forked from find-anomalous-textfiles, to get something that works on files, not directories;
    consider: adding a "quiet" option so find-anomalous can invoke this script (it must say nothing about straightforward cases - inappropriate for this script);
    simplified by scrapping file-cmd output: too often wrong; wordiness gets in the way of more valuable info; simpler is better wrt comprehension...
        (find-anomalous-textfiles has more info on flaws in file-command)
    Obsoleted code:
        X=$(file -b "$1")                   ##get output from file-command; consider -i for mime-style w charset=
        [[ $X == *HTML* ]] && HTM=HTM       ##make note of file-command proclaiming it to be HTML
    fixed: BS|TAB|NL|VT|FF|CR had been written as \x09-\x0d, ought to be \x08-\x0d; also in: find-anomalous-textfiles cvt-textfiles-to-utf8-charset;