#!/bin/bash
## cvt-textfiles-to-utf8-charset -- mass-conversion of files to utf8, with backups -- by Eugene Reimer 2009-09-01.
## You can run my find-anomalous-textfiles first to help identify files needing manual intervention, although doing so isn't essential as this script will
## also grumble about, and leave unconverted, most kinds of anomalous files; and since it makes backups you'll be able to undo what it did wrong;
##
## USAGE: cvt-textfiles-to-utf8-charset DIRECTORY...   --will convert non-UTF-8 non-ASCII textfiles below those Directories, recursively, with backups;
##
## PREREQ: charsetdetective -- from http://ereimer.net/programs/charsetdetective.htm;
## PREREQ: chgsed chg fullnameNOSLASH -- from http://ereimer.net/programs/general-purpose-scripts.htm;
##
## Copyright © 2009 Eugene Reimer; can be used, modified, copied, and distributed or sold under the terms of either the LGPL or the GPL (your choice);
## see http://www.gnu.org/licenses for the details of these terms.

shopt -s extglob                                        ##enable extglob for the @(...) etc extensions
LC_CTYPE=C                                              ##NEED an 8-bit charset for grep to work on bytes, as opposed to utf8 characters!!
DEF=windows-1252                                        ##default encoding  <--REVIEW NEEDED
BKUPDIR=/tmp/cvt-to-utf8-$(date +%Y%m%d-%H%M)-BKUP      ##backup directory
LOGFILE=/tmp/cvt-to-utf8-LOG                            ##log-file

msg ()     { echo "$@" >>$LOGFILE; }                    ##msg to logfile
filterF () { egrep -vi '(jpg|png|gif|tif|p[abgp]m|avi|flv|wmv|mov|ogg|mpg|mp3|VRO|pdf|doc|xls|zip|gz|tgz|~)$'; }   ##filter for filenames
filterD () { egrep -v '/\.| |nsmail'; }                 ##filter for dir-names  <--REVIEW NEEDED
filtMET () { sed "s/.*\(charset\|CHARSET\)=\([^\"']*\).*/\2/" |tr '\n' ' ' |tr A-Z a-z |sed 's/ $//'; }            ##extract charset from META-Content-Type
prereqs () {
  for P;do if [[ $(type -p $P) == "" ]];then echo "$P from http://ereimer.net/programs is required"; PRE_ERR=1; fi; done
  if [ $PRE_ERR ];then exit 9; fi
}

doFile () {
  [ -L "$1" ] && return; [ -f "$1" ] || return          ##skip dir or other non-file
  ASC=; ISO=; WIN=; UTF=; MET=; DET=
  MET=$(grep -i 'charset' $1 |filtMET)                  ##extract the META-charset if any  <--REVIEW NEEDED: grep pattern is a reconstructed guess
  if ! egrep -q $'[\x80-\xff]' $1;then ASC=ASCII        ##no hi-bit bytes => pure ASCII    <--REVIEW NEEDED: test is a reconstructed guess
  else                                                  ##remaining tests only on non-ASCII...
    if   ! egrep -vq $'^[\x09-\x0d\x20-\x7e\xa0-\xff]*$' $1;then ISO=ISO         ##pattern matches 8859 line; egrep -v succeeds on a non-ISO file; negated=>ISO-8859
    elif ! egrep -vq $'^[\x09-\x0d\x20-\x7e\x80\x82-\xff]*$' $1;then WIN=WIN;fi  ##pattern matches WINcp line; egrep -v succeeds on a non-WIN file; negated=>WIN-cp
    if   ! egrep -vq $'^([\x09-\x0d\x20-\x7e]|[\xC2-\xDF][\x80-\xBF]|\xE0[\xA0-\xBF][\x80-\xBF]|[\xE1-\xEC\xEE\xEF][\x80-\xBF][\x80-\xBF]|\xED[\x80-\x9F][\x80-\xBF]|\xF0[\x90-\xBF][\x80-\xBF][\x80-\xBF]|[\xF1-\xF3][\x80-\xBF][\x80-\xBF][\x80-\xBF]|\xF4[\x80-\x8F][\x80-\xBF][\x80-\xBF])*$' $1
    then UTF=UTF;fi                                     ##pattern matches UTF-8 line; egrep -v succeeds on a non-UTF file; negated=>UTF-8
  fi
  if [[ $ISO$WIN$UTF ]];then true; else return; fi      ##skip quietly if pure-ASCII or Non-Text file (by my tests)
  DET=$(charsetdetective "$1" |sed 's|.*: ||' |tr A-Z a-z)       ##get the answer from charsetdetective
  WAS="$ISO$WIN$UTF+$MET+$DET"                                   ##save for msg, before stdizing
  [[ $UTF && $DET == *utf* && $MET == @(*utf*|'') ]] && { return; }                              ##skip quietly if unambiguously utf8
  [[ $UTF || $DET == *utf* || $MET == *utf* ]] && { msg "$1 ?? $ISO$WIN$UTF+$MET+$DET";return;}  ##skip after grumbling about ambiguous case assumed to be utf8
  [[ $MET == *8859-1 ]]         && { MET=windows-1252; }      ##standardize to the broader variant (latin1)
  [[ $MET == *8859-8 ]]         && { MET=windows-1255; }      ##standardize to the broader variant (Hebrew)
  [[ $MET == iso8859* ]]        && { MET=${MET/iso/iso-}; }   ##standardize to the hyphenated spelling
  [[ $DET == unknown && $MET ]] && { msg "$1 using met:$MET"; DET=$MET; }   ##grumble about "unknown" and assume META-info correct
  [[ $DET == unknown ]]         && { msg "$1 using def:$DEF"; DET=$DEF; }   ##grumble about "unknown" and assume $DEF
  [[ $MET && $MET != $DET ]]    && { msg "$1 met:$MET; using det:$DET"; }   ##grumble about METa-DETective mismatch (stdized to reduce these)
  [[ $DET == $DEF ]]            || { msg "$1 ==unusual encoding $DET"; }    ##inform about unusual charset  <--REVIEW NEEDED
  FN=$(fullnameNOSLASH $1); BK=$BKUPDIR/$FN               ##form fullname of where to place the backup copy
  msg "$1 converting from $DET to utf8; was:$WAS"         ##msg
  mv -f $1 $BK; iconv -f $DET -t utf8 $BK >$1 2>>$LOGFILE; chmod --reference=$BK $1   ##==backup-rename + iconv-to-utf8 replacing original + keep perms
  if [[ $MET && $1 == *@(html|htm) ]];then                ##if an html file (by filename)...
    msg "$1 chgseding met:$MET->utf-8"                    ##msg
    chgsed -k -n "/charset=[^\"'>]*/charset=utf-8/" $1    ##==revise META-CHARSET in html file  <--REVIEW NEEDED: FROM-pattern is a reconstructed guess
  fi
}
doDir () {
  for F in $(ls -1 $1 |filterF);do doFile "$1/$F"; done   ##excluding with egrep simpler and likely faster than with bash tests??
}

prereqs charsetdetective chgsed chg fullnameNOSLASH       ##ensure required scripts are available
[ $# -eq 0 ] && set .                                     ##default to curdir if no dirs specified
[ -d "$BKUPDIR" ] || mkdir "$BKUPDIR"                     ##ensure backup-directory exists
msg "$(date +%Y%m%d-%H%M): cvt-textfiles-to-utf8-charset $@"        ##logmsg with timestamp and cmdline
for DIR in "$@";do                                        ##for each DIR on cmdline...
  for D in $(find $DIR -type d |filterD |sort);do doDir $D; done    ##do DIR and its subdirs, recursively, in alphabetic order
done
exit

==================
==    NOTES     ==
==================
--checking MET inconsistencies:
  MET examples: ISO8859-1 ISO-8859-1 iso-8859-1 iso-8859-15 windows-1252 UTF-8 utf-8
  can use globs: *8859-1 *8859-15 *1252 utf*  <--am lowercasing so the last glob can be: utf*
  for my files, the META-charset is best ignored when it disagrees with DET or with my tests; in all but one such disagreement MET is wrong, and for that one
  using DET's UTF is also fine (it's a mixed-encoding mess best left unconverted);
--handling inconsistencies (condensed as a sketch below):
  if any of the tests indicates UTF, then treat as UTF (ie: leave unconverted); with a diagnostic-msg if the tests disagree;
  if none of the tests indicates UTF (but my tests indicate ISO or WIN) then the file will be converted:
    if DET is unknown and there is a META-charset, then the META-charset is used as the FROM-charset;
    if DET is unknown and there is no META-charset, then DEF (cp1252) is used as the FROM-charset;
    otherwise DET (as determined by charsetdetective) is used as the FROM-charset;
  with a diagnostic-msg if using a FROM-charset that differs from the META-charset, and/or from "the usual";
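--the above FROM-charset decision, condensed into a sketch (illustration only; the function name is hypothetical, the script itself does this inline in doFile,
  using the same variables UTF, DET, MET, DEF):
     pick_from_charset () {                                          ##echo the charset to convert FROM; echo nothing to leave the file unconverted
       [[ $UTF || $DET == *utf* || $MET == *utf* ]] && return        ##any utf indication => leave unconverted
       [[ $DET == unknown && $MET ]] && { echo $MET; return; }       ##charsetdetective stumped => trust the META-charset
       [[ $DET == unknown ]]         && { echo $DEF; return; }       ##no META-charset either => assume $DEF
       echo $DET                                                     ##otherwise trust charsetdetective
     }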

==================
==  CHANGE-LOG  ==
==================
Revising any meta-Content-Type-charset within the converted file; but only on *htm files -- to avoid spurious revision to scripts (eg: this file);
iconv errmsgs: redirected stderr (to LOG) so the user can tell which file was the cause;
omitting call to file-cmd; wasn't using the result; ie: decided to simplify, despite the advantages of keeping the info-gathering part the same as in find-anomalous...
revised WIN test; eg hex 81, although permitted in some of the DOS/IBM/Windows codepages (eg ibm850), is NOT permitted in cp1252:
  cp437(DOS-default):  all hi-bit-chars 80..ff are valid; and it has graphic-chars for 01..1f and 7f, although these are not normally used in files;
  cp850(DOS-european): all hi-bit-chars 80..ff are valid; and it has graphic-chars for 01..1f and 7f, although these are not normally used in files;
  cp858(with d5=euro): all hi-bit-chars 80..ff are valid; and it has graphic-chars for 01..1f and 7f, although these are not normally used in files;
  cp1252(W-european):  hex 81, 8d, 8f, 90, 9d -- are unused;
  cp1250(C-european):  hex 81, 83, 88, 90, 98 -- are unused;
  cp1251(Cyrillic):    hex 98 -- unused;  <--my new tests will treat this as "DOS"
  cp1255(Hebrew):      hex 81, 8a, 8c..90, 9a, 9c..9f, ETC -- unused;
  --the new WIN test outlawing hex 81 will catch at least some cases of a file in an old "DOS" encoding, however it results in cp1251 being lumped in with DOS/IBM;
    (a stricter variant, outlawing all five bytes that cp1252 leaves unassigned, is sketched at the end of these notes;)
  --note that hex 81, u-umlaut, should be reasonably common in cp850/858/437 files; hex 90, E-acute, is another candidate that might be better for some people;
added LC_CTYPE=C to work for a utf8-user;
preserve permissions using chmod --reference; considered also preserving the timestamp using touch -r (sketched at the end of these notes);
2011-01-06: BEWARE: bash-v4 has changed the meaning of =~ within double-square-brackets, specifically what quoting in the RHS does; best to avoid it, using == and
  the extglob extensions since those work everywhere;
  add:     shopt -s extglob
  replace: [[ $UTF && $DET =~ utf && $MET =~ 'utf|^$' ]]  -->  [[ $UTF && $DET == *utf* && $MET == @(*utf*|'') ]]
           [[ $UTF || $DET =~ utf || $MET =~ 'utf' ]]     -->  [[ $UTF || $DET == *utf* || $MET == *utf* ]]
           [[ $MET && $1 =~ 'html?$' ]]                   -->  [[ $MET && $1 == *@(html|htm) ]]
  --made the above revisions, with very little testing;
2011-01-06: "continue" within the doFile routine ought to be "return" (smacks of relying on an undocumented "feature" that might stop working in a new version of bash);
  have so revised; all present uses of "return" in doFile except the 2 in the 1st line resulted from this revision;
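--a stricter WIN test, sketched here only (untested; $F stands for the file being tested): since cp1252 leaves hex 81, 8d, 8f, 90, 9d unassigned, a file containing
  any of them is more likely DOS/IBM (or cp1251) than cp1252:
     egrep -q $'[\x81\x8d\x8f\x90\x9d]' $F && echo "$F contains bytes unassigned in cp1252"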
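--timestamp preservation, if wanted, would be one more call right after the chmod --reference call in doFile; an untested sketch:
     touch --reference=$BK $1                             ##copy the backup's timestamps onto the converted file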