#!/bin/bash
## Given an HTML file, check for broken and permanently-moved links -- by Eugene Reimer 2010-04-25;
## curdir must be where the input file resides, if it contains unqualified links (HREFs);
## leading-slashed link is resolved wrt optional 2nd param (HOME), default: first two components of curdir name;
##
## PREREQ: fullname fullnameNOSLASH -- from http://ereimer.net/programs/general-purpose-scripts.htm
##
## USAGE EXAMPLE:
## for H in *{htm,html};do WEBLINKCHECK-LL $H $WEBHOME; done ##for each HTML file, check its links (see weblinkcheck.cron for a complete example)
##
## Copyright © 2010 Eugene Reimer; can be used, modified, copied, and distributed or sold under the terms of either the LGPL or the GPL (your choice);
## see http://www.gnu.org/licenses for the details of these terms.
##HOME = root against which leading-slashed links get resolved:
##  the 2nd param when one is supplied, otherwise the first two components of the curdir name;
##NOTE: this reassigns the shell's standard HOME variable
if (( $# >= 2 )); then
  HOME=$2
else
  HOME=$(pwd)
  until [[ $HOME != /*/*/* ]]; do   ##chop one trailing component at a time while more than two remain
    HOME=${HOME%/*}
  done
fi
##UserAgent-string sent with every request: Opera on Linux
USERAGENT="Opera/9.63 (X11; Linux i686; U; en) Presto/2.1.1"

##WGET ARGS...: run wget with our UserAgent, moderate tries & timeout settings, and certificate checking off;
##  all ARGS are handed through unchanged; the exit status is wget's
WGET () {
  local -a fixed=(-U"$USERAGENT" --tries=5 --timeout=120 --no-check-certificate)
  wget "${fixed[@]}" "$@"
}
##strip_html_comments FILE: write FILE to stdout with every <!-- ... --> comment removed;
##  a comment may span several lines; a "-->" that does not close a comment is left as it was
strip_html_comments () {
  sed $'s|-->|\x02|g' -- "$1" |                 ##replace comment-ender with x02 (a single char, so a bracket-expression can exclude it)
  sed -e ':a' -e '$!{' -e 'N' -e 'ba' -e '}' \
      -e $'s|<!--[^\x02]*\x02||g' |             ##slurp the whole input, then delete from each comment-opener through the nearest x02
  sed $'s|\x02|-->|g'                           ##x02 back to comment-ender for unmatched uses of "-->"
}
strip_html_comments "$1" >/tmp/WEBLINK-tmp      ##comment-stripped copy of input-file to tmpfile
##extract_base_href: read HTML on stdin; print the href value of the first <base ...> tag, nothing when there is none
##NOTE(review): the patterns on this line were unreadable in the copy under review (markup-like text had been stripped
##  and the quoting was left unbalanced), so they are reconstructed from the line's stated purpose -- confirm against a pristine copy
extract_base_href () {
  grep -i '<base[[:space:]]' |                                      ##lines holding a BASE tag, in any letter-case
  sed -n 's|.*<base[[:space:]][^>]*href=\([^ >]*\).*|\1|Ip' |       ##isolate the value following href= (GNU sed "I" = ignore case)
  sed "s|['\"]||g" |                                                ##remove quotes and strophs
  head -n1                                                          ##at most one value, so BASE can never become multi-line
}
BASE=$(extract_base_href </tmp/WEBLINK-tmp)                         ##BASE gets base-href if present, emptystring if not
##extract_hrefs: read HTML on stdin; print the raw VALUE of every href=VALUE, one per line (quotes still attached)
##NOTE(review): the pattern of this stage was unreadable in the copy under review (markup-like text had been stripped),
##  so it is reconstructed from the stage's stated purpose -- confirm against a pristine copy
extract_hrefs () {
  grep -io 'href=[^[:space:]>]*' |   ##every href=VALUE occurrence, each on a line of its own
  sed 's|^[^=]*=||'                  ##isolate just the URL part of each href=URL
}

INFILE=$(fullname "$1")              ##loop-invariant prefix of every message (fullname is the PREREQ helper; presumably yields the full pathname)
extract_hrefs </tmp/WEBLINK-tmp |
sed "s|['\"]||g; s|#.*||" |                   ##remove quotes and strophs; remove any within-page link-component (can result in emptystring)
sort -u |                                     ##uniquify
grep -Eiv 'mailto:|javascript:|^$' |          ##ignore mailto, javascript, emptystring links
while IFS= read -r F; do                      ##-r: a backslash in a link is taken literally
  [[ $F != *://* && $F != /* ]] && F=$BASE$F  ##unqualified ref: prefix it with BASE (which may be the emptystring)
  if [[ $F == *://* ]]; then                  ##==remote (http: etc): use wget to test for existence
    MSG="/tmp/WEBLINK-msg-$(fullnameNOSLASH "$1==$F")"   ##name of wget-msgs tmpfile, to be kept for anomalous case
    WGET -O/tmp/WEBLINK-tmp2 "$F" &>"$MSG"; RC=$?       ##quoted, so ? * or blanks in the URL reach wget untouched; msgs to tmpfile for subsequent scanning
    GETRC=$(sed -n '/HTTP request sent/s|.* \([0-9]\+\).*|\1|p' "$MSG" | tail -n1)   ##the HTTP-RC of the last response, from wget-msgs
    if [[ $RC -ne 0 ]]; then                  ##wget failed: report, keep $MSG, and skip the 301-redirect check
      printf '%s\n' "$INFILE $F ==BROKEN-$GETRC"   ##the HTTP-RC can be of interest
      continue
    fi
    [[ $GETRC -ne 200 ]] && printf '%s\n' "$INFILE $F ==WGET-ANOMALY"   ##wget succeeded without a final 200 (an empty GETRC counts as 0 here)
    NEW=$(sed -n '/301 Moved Permanently/{N;N;s|.*\nLocation: ||;s| .*||;p}' "$MSG" | tail -n1)   ##get 301-redirect-Location from wget-msgs, if any
    [[ $NEW != "" ]] && printf '%s\n' "$INFILE $F ==MOVEDTO:$NEW"       ##msg for 301-redirect
    [[ $GETRC -eq 200 ]] && rm -f "$MSG"      ##discard $MSG file except for anomalous case
  else                                        ##==local file:
    [[ $F == /* ]] && F=$HOME$F               ##leading-slashed link is resolved wrt HOME
    [[ $HOME == /er* && $F == *[0-9][0-9][0-9].htm && ! -e $F ]] && F=${F/.htm/.jpg}   ##==simulate Apache-rewriterule from .htaccess (KLUGE)
    [[ -e $F ]] || printf '%s\n' "$INFILE $F ==NOTFOUND"   ##msg for broken-link
  fi
done
exit ##stop here, with the status of the pipeline above; the lines that follow are free-form notes, not shell code
NOTE: doesn't properly parse HTML; the comment-skipping and HREF-matching will only work if "<!--", "-->", "href=" are avoided in quoted-strings and javascript;
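handles "<...>" [the remainder of this remark was lost when markup-like text was stripped from this listing];
was using the wget defaults: --tries=20 --timeout=900  ==> worst case was 20 * 15min = 5-hours!!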
switch to: --tries=5 --timeout=120 ==> worst case now 5 * 2min = 10-min;
--worst case is actually worse than that, eg: wget times-out while connecting, retries & succeeds, then times-out in read, for total of 4+ minutes on one "try";
was getting 403 (Forbidden) on some URLs that work in a browser (eg http://celebrity.myheritage.com/celebrity-morph);
(A) USERAGENT=Mozilla -- solves 4 failures, but brings 5 new ones:
(B) USERAGENT="Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)" -- solves same 4, brings 3 new ones, adobe-getflashplayer gets worse, is erratic:
(C) USERAGENT="Opera/9.63 (X11; Linux i686; U; en) Presto/2.1.1" -- solves same 4, and solves adobe-getflashplayer; ==SEEMS THE BEST ANSWER:
--several of the "solutions" are of dubious merit, producing pages of the domain-name-for-sale sort which I'd rather have seen as failures;
--turns out the adobe-getflashplayer redirects remain weirdly erratic with every USERAGENT I've tried, and may have to ignore them to avoid "thrashing";
==NOTE: could treat