#!/bin/bash
## check websites for broken links, and automatically fix MOVED/BROKEN ones (can be run as a cron-job) -- by Eugene Reimer 2010-04-25;
## PREREQ: WEBLINKCHECK-LL chgsed find-dirs SetIntersection -- from http://ereimer.net/programs/ertools.zip
## Copyright © 2010 Eugene Reimer; can be modified and/or distributed under the terms of the GPL; see http://www.gnu.org/licenses/gpl.html.
##
## Outline of what the surviving code does:
##   1. rotate $MSGS -> $MSGS-1 -> $MSGS-2 when $MSGS is at least 20 hours old;
##   2. run WEBLINKCHECK-LL on every *htm file of every directory of every site, collecting all output in $MSGS;
##   3. intersect today's msgs with the previous runs to get the 2-time and 3-time ==BROKEN lists;
##   4. append ==NOTFOUND and 2-time-BROKEN msgs to $LOG;
##   5. rewrite each 3-time-BROKEN href into a javascript:alert(...) stub via chgsed.
## NOTE(review): EMAIL is assigned but not used anywhere in the code that survives in this copy.

WEBSITES="/debwendon/website /er/website /noci/website"         ##--REVISE AS NEEDED-- {in published copy}
EMAIL=ereimer@shaw.ca                                           ##--REVISE AS NEEDED--
LOG=/pix/WEBLINKCHECK/WEBLINKCHECK-msgs-$(date +%Y%m%d-%H%M)    ##--REVISE AS NEEDED--
MSGS=/tmp/WEBLINKCHECK-msgs

shopt -s nullglob    ##set nullglob, so for-loop works within DIR lacking *htm files (eg cgi-bin)

##==rotate for 2 previous copies of $MSGS (with -v msgs to stdout), but only if $MSGS is at least 20-hours old;
##  the -e guard and the && chain keep a missing $MSGS (date -r fails, timestamp empty) from counting as "old"
if [[ -e $MSGS ]] && tsMSGS=$(date -r "$MSGS" +%s) && tsNOW=$(date +%s) && ((tsNOW - tsMSGS >= 20*60*60)); then
  mv -fv "$MSGS-1" "$MSGS-2"
  mv -fv "$MSGS" "$MSGS-1"
fi

##==for each website: check the links of every HTML file; all stdout+stderr of the loop goes to $MSGS
for WEBSITE in $WEBSITES; do                  ##unquoted on purpose: $WEBSITES is a blank-separated list
  cd "$WEBSITE" || exit                       ##in its HOME-dir
  for DIR in . $(find-dirs); do               ##for HOME-dir and each subdir (word-split output of find-dirs)
    if [[ $DIR == wa-stats || $DIR == webalizer ]]; then
      continue                                ##skip wa-stats, webalizer dirs
    fi
    cd "$WEBSITE/$DIR" || exit                ##==work in DIR
    for H in *htm; do                         ##for each HTML file, check its links
      WEBLINKCHECK-LL "$H" "$WEBSITE"
    done
  done
done &>"$MSGS"

SetIntersection "$MSGS" "$MSGS-1" | grep '==BROKEN' >"$MSGS-2-timers"    ##form list of 2-time-BROKENs
SetIntersection "$MSGS-2-timers" "$MSGS-2" >"$MSGS-3-timers"             ##form list of 3-time-BROKENs
{ grep '==NOTFOUND' "$MSGS"; cat "$MSGS-2-timers"; } >>"$LOG"            ##==NOTFOUND and 2-time-BROKEN msgs to $LOG (2010-06-11)

##==automated elimination of 3-time-BROKENs: each input line is "PAGE URL rest...";
##  the href to URL in PAGE is replaced by a javascript:alert stub carrying the URL and today's date;
##  ${URL//&/\\&} protects "&" in the sed replacement text; read -r keeps backslashes in URLs intact
while read -r PAGE URL JUNK; do
  chgsed -v --tmp "s|[Hh][Rr][Ee][Ff]=[\"']*$URL[\"']*|href=\"javascript:alert('${URL//&/\\&} link inoperative $(date +%Y-%m-%d)')\"|g" "$PAGE"
done <"$MSGS-3-timers" >>"$LOG" 2>&1          ##appending msgs onto $LOG

##---- DAMAGED SECTION: automated 301-redirect fixups ------------------------------------------------------------
## NOTE(review): this copy appears to have lost a span of text here: the CHG= value below is cut short and the
## "sed -n" command runs straight into change-log prose, so the loop is incomplete and cannot be executed.
## It is kept verbatim, for reference only, until it can be restored from a pristine copy of the script:
##
##   grep '==MOVEDTO:' $MSGS |sort -r |while read PAGE OLD NEW;do   ##==automated 301-redirect-fixups (is reverse order useful?); was '==MOVEDTO:http://'
##     NEW=${NEW#*:}                                                ##remove "==MOVEDTO:" prefix from $NEW
##     [[ $NEW != *://* ]] && NEW=$(echo $OLD|sed 's|\(.*://[^/]*\).*|\1|')$NEW   ##handle site-relative $NEW (assuming it has leading-slash)
##     NEW=${NEW//\/\//\/}; NEW=${NEW/:\//:\/\/}                    ##stdize double-slashes in $NEW (all slash+slash->slash, colon+slash->colon+slash+slash)
##     [[ $OLD == $NEW ]] && continue                               ##do nothing if identical after double-slash-stdizing (hope this never happens)
##     CHG="/^[ \t]*"                                               ##add html-comment at end-of-file about OLD-to-NEW 301-revision
##     chgsed -v --tmp "$CHG" $PAGE
##     if V=$(sed -n "/^[ \t]*
##----------------------------------------------------------------------------------------------------------------

exit    ##stop here, so that bash never tries to parse the free-form change-log notes that follow
        ##NOTE(review): assumes nothing but notes follows; confirm against a pristine copy

##---- change-log notes (the beginning of the first entry is missing in this copy) --------------------------------
## ... grep '==MOVEDTO:'; and added line to copy leading part of $OLD to $NEW, assume leading-slash;
##   was there some subtle reason for matching ==MOVEDTO only when immediately followed by "http://"; hope not==??==
##   Note: had those 3 days in a row, in /tmp/WEBLINKCHECK-msgs, /tmp/WEBLINKCHECK-msgs-1, /tmp/WEBLINKCHECK-msgs-2 from 2011-01-0[234];
##   then after making this change at 06:36 on 2011-01-05, weblinkcheck.cron ran at 06:40 and behaved as expected
##   (except it was slower than expected leading to this aborted report:)
## 2011-01-19: an erroneous fixup:
##   s|http://www.manitobanature.ca/publications/current_newsletter.pdf|http://www.manitobanature.cahttp:/www.naturemanitoba.ca/publications/current_newsletter.pdf|
##   obviously I've erred in the Jan05 fixup for non-fully-qualified redirects;
##   Note: the only thing that's changing is manitobanature-->naturemanitoba;
##   fixed: [[ $NEW != http:// ]] --> [[ $NEW != http://* ]] <--had forgotten the star;
##   undid the revision on that webpage, in order to see it be redone by corrected version of this script;
##   OLD->NEW OK, but old->new not done??
2011-04-14: fix: 301-redirect fixups were done correctly, however the link-text went unrevised -- same problem as in previous entry ($OLD->$NEW OK, $old->$new not done); the cause is so obvious I'm ashamed of myself: sed s-cmd using OR (\|) cannot use VERTICALBAR (|) as the delimiter; bug was introduced by 2010-11-28 change; Question: when Verticalbar is the s-cmd-delimiter, is the regex-OR (\|) then impossible, or will it work with an extra backslash?? (NOPE, see below) what char to use as delimiter? obviously not Slash as these are URLs; Bang (!) also sometimes used in a URL (see 2010-11-28 above); considered using non-ASCII-char but sed does NOT allow that; causes it to grumble "unknown option to 's'"; (tried ñ and ¥) considered using Stroph (') but it is a poor choice since entirely legal in a URL; (wrote urlchars.htm at this point in my research) candidates for sed-s-cmd-delimiter: char not used unencoded in a URL: Caret Braces Brackets Backslash Backtick Verticalbar <--SEE MY urlchars.htm; however: Braces are used in bash-variable-substitutions eg ${V/A/B}; Brackets are regex metachars; Backtick used by bash as synonym for $(cmd); Went with Caret; (bad choice - see below) 2011-05-23: (sed errmsg showed up on 2011-05-19); Caret was a bad choice as it's also being used as regex metachar; even when being careful I still screw up; investigate Verticalbar and/or Caret: when used as the s-cmd-delimiter, then it needs extra level of backslashing on other use==?? with Verticalbar: there is no multiple-backslashes-preceding-the-regex-OR solution; with Caret: one backslash before the regex-Leftanchor-metachar works==!!== Cmdline that demonstrates: echo "now is now" |sed 's^\(\^\|[ \t>]\)now^\1NOW^g' ==> NOW is NOW Another solution: ASCII-control-char as s-cmd-delimiter, demonstrated in my chg script: DLM=$'\x01';... "sed 's$DLM...$DLM...$DLM'" (A: Caret) CHG="$CHG /^[ \t]*