#!/bin/bash
##
## 2006-06: find and display in hex any non-ascii characters, as an aid in
## identifying a file's Encoding, eg: UTF-8 vs ISO-8859-1;
## also treating ctrl-chars other than NL(x0A), TAB(x09) as unprintable;
##
## Usage: find-unprintable-meaning-nonASCII [GREP-OPTION]... [FILE]...
##   All arguments are handed to grep unchanged.
##   The report is written to ./tmp-unprintable.htm and then shown with less.
##
## Copyright © 2006,2009 Eugene Reimer; can be used, modified, copied, and
## distributed or sold under the terms of either the LGPL or the GPL (your
## choice); see http://www.gnu.org/licenses for the details of these terms.

## NEED an 8-bit charset for grep and sed to work on bytes, as opposed to utf8
## characters!!  The variable must be exported to reach the child processes,
## and LC_ALL is used because it takes precedence over LC_CTYPE and LANG.
export LC_ALL=C

## Bracket-expression body matching the bytes regarded as printable:
## TAB (x09) and SPACE (x20) through TILDE (x7e).
readonly printable=$'\x09\x20-\x7e'

readonly report=tmp-unprintable.htm

## Construct list of unprintable chars:
##   - header lines naming the invocation and the report file;
##   - one record per distinct run of unprintable bytes, in the form
##     "RUN: LINE-AS-PRINTED-BY-GREP", with pointy-brackets HTML-escaped;
##   - sort -us -k 1,1 keeps the first record seen for each distinct RUN.
## FIXME - if one line has multiple strings of such chars, will miss all but
## the first;
## NOTE(review): the header text and the record format appear to have lost
## HTML markup (probably a line-break tag before each record) - confirm
## against a pristine copy of this script before relying on browser layout.
{
  printf '\n\nproduced by: find-unprintable-meaning-nonASCII %s\n\n\n' "$*"
  printf 'this file %s/tmp-unprintable.htm is best viewed in a browser to try different Charset/Encodings\n\n\n' "$PWD"
  grep "[^${printable}]" "$@" \
    | sed \
        -e 's|<|\&lt;|g' \
        -e 's|>|\&gt;|g' \
        -e "s|^[${printable}]*\\([^${printable}][^${printable}]*\\).*\$|\\1: &|" \
    | sort -us -k 1,1
} >"$report"

## The following might be useful, but I was happier without it:
##
#for CHARSET in utf-8 iso-8859-1;do
#  echo "" >tmp-unprintable-$CHARSET.htm
#  cat tmp-unprintable.htm |sed "s|to try different.*|which should detect the $CHARSET charset/encoding|" >>tmp-unprintable-$CHARSET.htm
#done

## Display the results using less with special CHARSETDEF to show non-ascii
## chars in hex:
unset LESSCHARSET                 ## unset to avoid having LESSCHARDEF be overridden by it
export LESSCHARDEF="32b95.b"      ## for ASCII (less appears to ignore "b" vs "c" distinction)
export LESSBINFMT="*s<%x>"        ## to show others in hex
## -d avoid "dumb terminal"; -f avoid "may be binary"; -Q avoid bell-ringing;
## -U treat BS, TAB, CR as binary
less -dfQU "$report"
exit

## History:
## 2006-09-04: encountered mysterious characters in bot2com-NATS plantlist -
##             some that should be apostrophes are Ctrl-Y--??--
## 2009-08:    was missing non-ascii chars within pointy-brackets; instead of
##             removing HTML-tags with  s|<[^>]*>| |g  now escaping them with
##             s|<|\&lt;|g; s|>|\&gt;|g
## 2009-09:    added LC_CTYPE=C so will work for a utf8-user;