#!/bin/bash
## check websites for broken links, and automatically fix MOVED/BROKEN ones (can be run as a cron-job) -- by Eugene Reimer 2010-04-25;
##
## PREREQ:  WEBLINKCHECK-LL                    -- from http://ereimer.net/programs/webmaster-tools.htm
## PREREQ:  chgsed find-dirs SetIntersection   -- from http://ereimer.net/programs/general-purpose-scripts.htm
##
## Copyright © 2010 Eugene Reimer; can be used, modified, copied, and distributed or sold under the terms of either the LGPL or the GPL (your choice);
## see http://www.gnu.org/licenses for the details of these terms.
##
## Overview of what the statements below do:
##   1. rotate the previous message files ($MSGS -> $MSGS-1 -> $MSGS-2) when $MSGS is at least 20 hours old;
##   2. run WEBLINKCHECK-LL on every *htm file of every website directory, collecting all output in $MSGS;
##   3. intersect today's messages with the previous ones to find links reported ==BROKEN 2 and 3 times running;
##   4. append ==NOTFOUND and 2-time-BROKEN messages to $LOG;
##   5. rewrite each 3-time-BROKEN href into a javascript:alert(...) placeholder via chgsed.

WEBSITES="/debwendon/website /er/website /noci/website"          ##--REVISE AS NEEDED-- {in published copy}
EMAIL=ereimer@shaw.ca                                            ##--REVISE AS NEEDED-- (not referenced by the statements that survive in this copy)
LOG=/pix/WEBLINKCHECK/WEBLINKCHECK-msgs-$(date +%Y%m%d-%H%M)     ##--REVISE AS NEEDED--
MSGS=/tmp/WEBLINKCHECK-msgs

shopt -s nullglob                       ##set nullglob, so for-loop works within DIR lacking *htm files (eg cgi-bin)

##==rotate for 2 previous copies of $MSGS (with -v msgs to stdout)
## Only attempted when $MSGS exists: with a missing file `date -r` fails, the empty
## timestamp counts as 0 in the arithmetic test, and a spurious rotation would follow.
if [[ -e "$MSGS" ]]; then
  tsMSGS=$(date -r "$MSGS" +%s)
  tsNOW=$(date +%s)
  if ((tsNOW - tsMSGS >= 20*60*60)); then       ##if $MSGS at least 20-hours old...
    mv -fv "$MSGS-1" "$MSGS-2"
    mv -fv "$MSGS" "$MSGS-1"
  fi
fi

##==for each website, check the links of every HTML file; all output goes to the $MSGS file
for WEBSITE in $WEBSITES; do                    ##$WEBSITES is a space-separated list: word-splitting is intended
  cd "$WEBSITE" || exit                         ##in its HOME-dir
  for DIR in . $(find-dirs); do                 ##for HOME-dir and each subdir (find-dirs output is word-split on purpose)
    if [[ $DIR == wa-stats || $DIR == webalizer ]]; then
      continue                                  ##skip wa-stats, webalizer dirs
    fi
    cd "$WEBSITE/$DIR" || exit                  ##==work in DIR
    for H in *htm; do                           ##for each HTML file, check its links
      WEBLINKCHECK-LL "$H" "$WEBSITE"
    done
  done
done &>"$MSGS"

SetIntersection "$MSGS" "$MSGS-1" | grep '==BROKEN' >"$MSGS-2-timers"     ##form list of 2-time-BROKENs
SetIntersection "$MSGS-2-timers" "$MSGS-2" >"$MSGS-3-timers"              ##form list of 3-time-BROKENs
{ grep '==NOTFOUND' "$MSGS"; cat "$MSGS-2-timers"; } >>"$LOG"             ##==NOTFOUND and 2-time-BROKEN msgs to $LOG (2010-06-11)

##==automated elimination of 3-time-BROKENs...
## Each input line is read as: PAGE URL and a remainder that is ignored (JUNK).
## -r keeps backslashes in the logged URL intact.
## NOTE(review): $URL is placed into the sed pattern unescaped, so regex metacharacters
## or a '|' inside a URL will change or break the match; only '&' is escaped, and only
## on the replacement side. Left as is because chgsed's regex dialect is not visible here.
while read -r PAGE URL JUNK; do
  chgsed -v --tmp "s|[Hh][Rr][Ee][Ff]=[\"']*$URL[\"']*|href=\"javascript:alert('${URL//&/\\&} link inoperative $(date +%Y-%m-%d)')\"|g" "$PAGE"
done <"$MSGS-3-timers" >>"$LOG" 2>&1            ##appending msgs onto $LOG

##==automated 301-redirect-fixups -- DISABLED: INCOMPLETE IN THIS COPY==
## NOTE(review): the remainder of the script is damaged in this copy. The sed scripts
## below are cut off right after `/^[ \t]*`, and the rest of the loop body, its closing
## `done`, and whatever followed it are missing. The surviving text is kept verbatim
## below for reference; restore this section from a pristine copy before enabling it.
##
##   grep '==MOVEDTO:' $MSGS |sort -r |while read PAGE OLD NEW;do    ##==automated 301-redirect-fixups (is reverse order useful?); was '==MOVEDTO:http://'
##     NEW=${NEW#*:}    ##remove "==MOVEDTO:" prefix from $NEW
##     [[ $NEW != http:* ]] && NEW=$(echo $OLD|sed 's|\(http://[^/]*\).*|\1|')$NEW    ##handle site-relative $NEW which has leading-slash
##     NEW=${NEW//\/\//\/}; NEW=${NEW/http:\//http:\/\/}    ##stdize double-slashes in $NEW
##     [[ $OLD == $NEW ]] && continue    ##do nothing if identical after double-slash-stdizing (hope this never happens)
##     CHG="/^[ \t]*"    ##add html-comment at end-of-file about OLD-to-NEW 301-revision
##     chgsed -v --tmp "$CHG" $PAGE
##     if V=$(sed -n "/^[ \t]*
##
##==author's change notes (the beginning of the first note is missing in this copy)==
##   ... grep '==MOVEDTO:'; and added line to copy leading part of $OLD to $NEW, assume leading-slash;
##   was there some subtle reason for matching ==MOVEDTO only when immediately followed by "http://"; hope not==??==
##   Note: had those 3 days in a row, in /tmp/WEBLINKCHECK-msgs, /tmp/WEBLINKCHECK-msgs-1, /tmp/WEBLINKCHECK-msgs-2 from 2011-01-0[234];
##   then after making this change at 06:36 on 2011-01-05, weblinkcheck.cron ran at 06:40 and behaved as expected
##   (except it was slower than expected leading to this aborted report:)
##
##   2011-01-19: an erroneous fixup:
##     s|http://www.manitobanature.ca/publications/current_newsletter.pdf|http://www.manitobanature.cahttp:/www.naturemanitoba.ca/publications/current_newsletter.pdf|
##   obviously I've erred in the Jan05 fixup for non-fully-qualified redirects;
##   Note: the only thing that's changing is manitobanature-->naturemanitoba;
##   fixed: [[ $NEW != http:// ]] --> [[ $NEW != http://* ]] <--had forgotten the star;
##   undid the revision on that webpage, in order to see it be redone by corrected version of this script;
##   OLD->NEW OK, but old->new not done??