#!/bin/bash
## Check websites for illegal HTML-comments containing "--"  by Eugene Reimer 2010-04-25;
##
## PREREQ:  find-dirs fullname  -- from http://ereimer.net/programs/general-purpose-scripts.htm
##
## Copyright © 2010 Eugene Reimer;  can be used, modified, copied, and distributed or sold under the terms of either the LGPL or the GPL (your choice);
## see http://www.gnu.org/licenses for the details of these terms.
##
## USAGE: run with no arguments; every directory under each entry of WEBSITES is visited and each *htm file in it is checked.
## OUTPUT: for each faulty file, a blank line, a "===<file> contains double-dash within HTML-comments===" banner, and the offending comment lines.

WEBSITES="/debwendon/website /er/website /noci/website"   ##--REVISE AS NEEDED--  whitespace-separated; must be absolute paths (the loop below does cd $WEBSITE/$DIR)

shopt -s nullglob   ##set nullglob, so for-loop works within DIR lacking *htm files (eg cgi-bin)

## WEBCOMMENTCHECK_LL: check one HTML file for "--" inside its comments.
##   $1       name of the HTML file, relative to the current directory
##   stdout   nothing for a clean file; banner plus the faulty comment lines otherwise
##   files    /tmp/WEBCOMMENT-<name> holds a comments-only copy of $1; it is removed for a clean file and kept for a faulty one
##   status   that of the final sed (faulty file) or of rm (clean file)
WEBCOMMENTCHECK_LL () {
    local TMP V
    local x01=$'\x01' x02=$'\x02'   ##one-byte stand-ins for the comment delimiters, so that [^...] bracket-expressions can be used on them
    ##The delimiters are assembled from pieces so their literal text never appears in this script;
    ##an HTML-aware tool handling the script would otherwise swallow whatever lies between them.
    local opener='<!-''-' closer='-''->'

    ##NOTE(review): fullnameNOSLASH is not in the PREREQ list above and is not defined in this script;
    ##presumably it prints the full name of $1 in a form usable as a single filename -- confirm it is installed.
    TMP=/tmp/WEBCOMMENT-$(fullnameNOSLASH "$1")

    ##Pass 1: replace comment-starter with x01, comment-ender with x02.
    ##Pass 2: slurp the whole file into the pattern-space, drop everything before the first comment,
    ##        then put each comment at the start of a line while dropping the text that follows it.
    ##        (\n in the replacement is a GNU-sed feature.)
    sed "s|$opener|$x01|g; s|$closer|$x02|g" <"$1" |
    sed -e ':a' -e '$!{N;ba}' \
        -e "s|^[^$x01]*||" \
        -e "s|\($x01[^$x02]*$x02\)[^$x01]*|\n\1|g" >"$TMP"   ##comments-only copy of input-file to tmpfile, in case needed later

    ##The delimiters themselves are now x01/x02, so any remaining "--" lies inside a comment body.
    V=$(grep -- '--' "$TMP")
    if [[ -n $V ]]; then
        echo
        echo "===$(fullname "$1") contains double-dash within HTML-comments==="
        printf '%s\n' "$V" | sed "s|$x01|$opener|g; s|$x02|$closer|g"   ##echo the faulty lines, replacing x01,x02 with comment-delimiters (undoing)
    else
        rm -f -- "$TMP"   ##discard tmpfile for correct HTML-file (keep for faulty ones)
    fi
}

for WEBSITE in $WEBSITES; do              ##==for each website...  (word-splitting of WEBSITES is intended)
    cd "$WEBSITE" || exit                 ##in its HOME-dir
    ##The output of find-dirs is word-split on purpose, as one directory per word is expected;
    ##NOTE(review): directory names containing whitespace would be broken up here -- confirm find-dirs' output format.
    for DIR in . $(find-dirs); do         ##for this dir and each subdir...
        cd "$WEBSITE/$DIR" || exit        ##==work in DIR
        for H in *htm; do                 ##for each HTML file, check its comments
            [[ -f $H ]] || continue       ##skip anything matching *htm that is not a regular file (eg a directory)
            WEBCOMMENTCHECK_LL "$H"
        done
    done
done
exit

## NOTES
## I needed this because my habit of using double-dash for em-dash gets me into trouble, and I sometimes mess-up when yanking a chunk of HTML;
## when "yanking" a chunk of HTML by enclosing in comment-delimiters, one needs to modify any comments within that chunk:
##   (1) to ensure a single ending-delimiter, and
##   (2) starting-delimiters are modified to avoid any use of "--" within the comment;
## incidentally most browsers ignore the rule about "--" being illegal within a comment, but not all, for example Firefox enforces it;
## the actual rule about "--" is complex, but I see nothing gained by stating it as other than simply disallowed.
##
## NOTE: can do this checking with tidy, as in my verify-html-comments:
##   WARN=$(tidy -q -o /dev/null "$F" 2>&1 |egrep "comment|--" |tr '\n' ' ')   ##msgs such as: "adjacent hyphens within comment", "XML comments can't contain --"