#!/bin/bash
## Check websites for illegal HTML-comments containing "--" by Eugene Reimer 2010-04-25;
##
## PREREQ: find-dirs fullname -- from http://ereimer.net/programs/general-purpose-scripts.htm
##
## Copyright © 2010 Eugene Reimer; can be used, modified, copied, and distributed or sold under the terms of either the LGPL or the GPL (your choice);
## see http://www.gnu.org/licenses for the details of these terms.
##Websites to check: a space-separated list of website HOME-dirs  --REVISE AS NEEDED--
WEBSITES='/debwendon/website /er/website /noci/website'
##With nullglob set, a pattern that matches nothing expands to nothing, so a
##for-loop over *htm simply does not run within a DIR lacking such files (eg cgi-bin)
shopt -s nullglob
##WEBCOMMENTCHECK_LL: check the HTML-comments of one file for an illegal double-dash
##  $1      = the HTML file to check
##  stdout  = for a faulty file only: a blank line, a "===NAME contains ...===" heading,
##            then every line of a comment that contains a double-dash
##  tmpfile = /tmp/WEBCOMMENT-<output of fullnameNOSLASH for $1> gets a comments-only copy of
##            the input; it is kept for a faulty file and removed for a correct one
##  returns = 1 if $1 is not a readable regular file; otherwise the status of the final sed or rm
##  needs   = GNU sed (relies on \n in a replacement), plus the helpers fullname and fullnameNOSLASH
##            (not defined in this file; presumably they print the full pathname of $1 -- confirm)
WEBCOMMENTCHECK_LL () {
  local file=$1 tmp hits
  local x01=$'\x01' x02=$'\x02'			##placeholder bytes standing in for the two comment-delimiters
  ##The delimiters are spelled with hex-escapes so that neither literal appears in this script;
  ##that way the script survives being published inside, or filtered as, HTML.
  local opener=$'\x3c!--' closer=$'--\x3e'

  if [[ ! -f $file || ! -r $file ]]; then
    printf 'WEBCOMMENTCHECK_LL: cannot read %s\n' "$file" >&2
    return 1
  fi
  tmp=/tmp/WEBCOMMENT-$(fullnameNOSLASH "$file")

  ##Stage 1: replace comment-starter with x01, comment-ender with x02; once the delimiters
  ##         are gone, any "--" still found inside a comment is an illegal one.
  ##Stage 2: treating the file as one line, discard everything but the comments, emitting
  ##         each comment on a fresh line (a comment spanning lines keeps its own newlines).
  sed "s|$opener|$x01|g; s|$closer|$x02|g" <"$file" |
    sed ":a \$!{N;ba}; s|^[^$x01]*||; s|\($x01[^$x02]*$x02\)[^$x01]*|\n\1|g" >"$tmp"

  hits=$(grep -- '--' "$tmp")
  if [[ -n $hits ]]; then
    printf '\n===%s contains double-dash within HTML-comments===\n' "$(fullname "$file")"
    ##echo the faulty lines, turning x01,x02 back into the comment-delimiters (undoing stage 1)
    printf '%s\n' "$hits" | sed "s|$x01|$opener|g; s|$x02|$closer|g"
  else
    rm -f -- "$tmp"				##discard tmpfile for correct HTML-file (keep for faulty ones)
  fi
}
##Main driver: for each website in WEBSITES, visit its HOME-dir and every subdir
##reported by find-dirs, and check the comments of each *htm file found there.
##A failing cd ends the whole script with cd's status.
for WEBSITE in $WEBSITES; do				##deliberately unquoted: WEBSITES is a space-separated list
  cd "$WEBSITE" || exit					##find-dirs is run from the website's HOME-dir
  ##Collect the subdirs one per line, so that a name containing blanks or glob-characters
  ##stays intact (assumes find-dirs prints one relative dir per line -- confirm against that script)
  DIRS=()
  while IFS= read -r DIR; do
    DIRS+=("$DIR")
  done < <(find-dirs)
  for DIR in . "${DIRS[@]}"; do				##for this dir and each subdir...
    cd "$WEBSITE/$DIR" || exit				##==work in DIR
    for H in *htm; do					##for each HTML file, check its comments
      WEBCOMMENTCHECK_LL "$H"
    done
  done
done
exit
I needed this because my habit of using double-dash for em-dash gets me into trouble, and I sometimes mess up when yanking a chunk of HTML;
when "yanking" a chunk of HTML by enclosing it in comment-delimiters, one needs to modify any comments within that chunk:
(1) to ensure a single ending-delimiter, and (2) to modify the starting-delimiters so as to avoid any use of "--" within the comment;
incidentally, most browsers ignore the rule about "--" being illegal within a comment, but not all; for example, Firefox enforces it;
the actual rule about "--" is complex, but I see nothing gained by stating it as other than simply disallowed.
NOTE: can do this checking with tidy, as in my verify-html-comments:
WARN=$(tidy -q -o /dev/null "$F" 2>&1 |egrep "comment|--" |tr '\n' ' ') ##msgs such as: "adjacent hyphens within comment", "XML comments can't contain --"