#!/bin/bash
## Remove unreferenced files in a directory of website files -- by Eugene Reimer 2001-Oct;
## PREREQ:  fullnameNOSLASH -- from http://ereimer.net/programs/general-purpose-scripts.htm
##          (only invoked when --tmp is given)
##
## USAGE:
##   webclean [DIR] [option]...         ##removes unreferenced "cleanable" files (rm -v / mv -v report each one);
##                                      ##unreferenced "keepers" are left alone silently
##   webclean [DIR] [option]... --test  ##just reports w/o removing; reports all unreferenced files regardless of
##                                      ##whether they be keepers or not
##
## the --keeper=GLOB option specifies a pattern that identifies filenames immune to cleaning; the default
##   --keeper='!(*jpg|*gif|*png)' matches anything but a name ending in jpg|gif|png; where !(...) is the negation of ...;
##
## the --page=GLOB option specifies a pattern that identifies filenames to be searched for links; the default
##   --page='@(*htm|*html)' matches any name ending in htm|html; depending on your requirements you may need to
##   include names like asp|php;
##
## the --casematters option specifies that keeper- and page-matching be case-sensitive; note that links are always
##   case-sensitive;
## the --tmp option specifies that any "clean"ed file be moved rather than removed; it'll be moved to
##   /tmp/webclean-FILENAME;
##
## EXIT STATUS: 2 on a usage error; 9 if DIR cannot be entered or the scratch file cannot be created/searched;
##   99 if the keeper-pattern safety check fails; 0 otherwise (including the "no pages" case).
##
## NOTE: after running this script, if anything was removed, then you need to run the same cmdline again;
##   because some links no longer exist...

## Print the one-line synopsis on stdout (callers redirect it to stderr for error paths).
Usage () { echo "USAGE: webclean [DIR] [--keeper=GLOB] [--page=GLOB] [--casematters] [--tmp] [--test]"; }

PAGE='@(*htm|*html)'                                  ##which files to be checked for links
KEEP='!(*[Jj][Pp][Gg]|*[Gg][Ii][Ff]|*[Pp][Nn][Gg])'   ##which files are immune to cleaning -- for correct or broken bash
##KEEP='!(*jpg|*gif|*png)'                            ##which files are immune to cleaning -- for correct bash only
GLOBIGNORE="."          ##enable dotglob but ignore dot and dotdot, so star works properly
shopt -s nullglob       ##enable nullglob so nonexistent becomes emptystring
shopt -s extglob        ##enable extglob for extended-patterns (negation etc)
shopt -s nocaseglob     ##enable nocaseglob for caseblind globbing
DIR=.                   ##default for $DIR (for the optional DIR non-option cmdline-arg)
TEST=                   ##set explicitly so a stray TEST/TMP in the environment cannot switch modes
TMP=
K=0                     ##number of non-option args seen so far

for ARG in "$@"; do                                   ##for each cmdline-arg...
  case $ARG in
    --test)     TEST=1 ;;
    --tmp)      TMP=1 ;;
    --keep*=*)  KEEP=${ARG#*=} ;;
    --page=*)   PAGE=${ARG#*=} ;;
    --casem*)   shopt -u nocaseglob ;;                ##disable nocaseglob for --casematters option
    -*)         echo "unrecognized option: $ARG" >&2; Usage >&2; exit 2 ;;
    *)          ##a non-option arg is DIR; at most one is allowed, and it must not be empty
                ##(an empty, unquoted DIR used to make "cd" go to $HOME)
                DIR=$ARG
                if ((++K > 1)) || [[ -z $DIR ]]; then Usage >&2; exit 2; fi ;;
  esac
done

##DEBUG: show the effective settings as a "##" comment line, so --test output stays a valid script
echo -n "##TEST:$TEST; KEEP:$KEEP; PAGE:$PAGE; "; shopt dotglob extglob nocaseglob nullglob |sed 's|[ \t]\+| |' |tr '\n' ' '; echo

cd -- "$DIR" || exit 9                                ##work in $DIR
if [[ -n $TEST ]]; then echo "cd $(pwd) || exit"; fi  ##2009-12: echo "cd" if TEST
## NOTE(review): "clean" is an external command that is not defined in this file; it is handed $DIR exactly as
## given on the cmdline even though we have already cd'ed into it -- confirm that is what it expects.
[[ -n $TEST ]] || clean "$DIR"

## Safeguard against losing files when equal-equal is broken: the set of keepers obtained by globbing must be
## identical to the set obtained by [[ name == $KEEP ]], because the clean loop below relies on the latter.
keepers_by_glob=($KEEP)                               ##intentionally unquoted: $KEEP is a glob to be expanded
keepers_by_match=()
for name in *; do
  if [[ $name == $KEEP ]]; then keepers_by_match+=("$name"); fi
done
if [ "${keepers_by_glob[*]}" != "${keepers_by_match[*]}" ]; then
  echo "your bash has a broken implementation of doubly-square-bracketed-equal-equal operator wrt nocaseglob; that problem together with your filenames, means you need to modify your keeper pattern to handle both lower- and uppercase letters." >&2
  exit 99
fi

## errmsg if nothing matches $PAGE; note nullglob in effect.  As before, a literal (non-glob) name that does not
## exist survives expansion, so the last word is also checked for existence.
pages=($PAGE)                                         ##intentionally unquoted: $PAGE is a glob to be expanded
if ((${#pages[@]} == 0)) || ! [[ -e ${pages[${#pages[@]}-1]} ]]; then echo "no pages"; exit; fi

## Scratch file holding every line of the pages that contains a link; private name, removed on any exit path.
XREF=$(mktemp "${TMPDIR:-/tmp}/webclean-xref.XXXXXX") || exit 9
trap 'rm -f -- "$XREF"' EXIT

## Build the cross-reference:
##  1. mark HTML-comment open/close with the single bytes \x01 / \x02;
##  2. slurp the whole input and delete each \x01...\x02 span, so multi-line comments vanish too (2011-01-15);
##  3. isolate lines containing a link; 2006nov:added "url(";
##  4. drop double-quotes and the first "./" on each line (2009-01-08).
cat -- "${pages[@]}" |
  sed $'s|<!--|\x01|g; s|-->|\x02|g' |
  sed -e ':a' -e '$!{N;ba' -e '}' -e $'s|\x01[^\x02]*\x02||g' |
  grep -Ei 'href=|src=|background=|url[(]' |
  sed 's|"||g; s|\./||' >"$XREF"

for F in *; do                                        ##for each file in dir...
  [[ -f $F ]] || continue                             ##skip non-file

  ## File F is linked-to if some xref line contains "=F" or "(F" (2006nov:added "("), optionally with a
  ## single-quote in between (double-quotes were already stripped).  Fixed-string matching (-F) is essential:
  ## a filename is not a regex, and treating it as one could make a file fail to match its own link.
  grep -qF -e "=$F" -e "($F" -e "='$F" -e "('$F" -- "$XREF"
  case $? in
    0) continue ;;                                    ##referenced: nothing to do
    1) ;;                                             ##unreferenced: fall through
    *) echo "webclean: cannot search $XREF" >&2; exit 9 ;;   ##never delete on a grep failure
  esac

  if [[ $F == $KEEP ]]; then                          ##file is a keeper
    if [[ -n $TEST ]]; then echo "## $F unreferenced"; fi    ##for keeper, msg if --test, nothing otherwise
  else                                                ##file is cleanable
    ## construct CMD as either rm or mv (controlled by --tmp option); an array keeps odd filenames intact,
    ## and "--" protects names that begin with a dash
    if [[ -n $TMP ]]; then
      CMD=(mv -fv -- "$F" "/tmp/webclean-$(fullnameNOSLASH "$F")")
    else
      CMD=(rm -fv -- "$F")
    fi
    if [[ -n $TEST ]]; then
      printf '%q ' "${CMD[@]}"; echo "##UNREFERENCED" ##for cleanable file, msg if --test (shell-quoted)...
    else
      "${CMD[@]}"                                     ##...execute CMD otherwise
    fi
  fi
done
exit 0

## ---- NOTES (never executed; the script has exited above) ----
## my WEBXREF used a similar algorithm but only reports, is obsolete?
## 2006-11: also handle "url(FILENAME" references
## 2008-10-26: rewrote the ARG handling in a new operand-in-same-word style that may become my new "norm" for cmdline-arg-parsing==??==
## 2008-10-26: consider making this script traverse subdirs; presently caller needs to invoke for each subdir;
## 2008-11-01: scrapped touching $(pwd)-clean-needed on doing a removal;
## 2008-12: briefly used egrep "[=(]$f|[=(]./$f" instead of grep "[=(]$f" -- wanted for DEMOCULM; 'twas done badly...
## 2009-01-08: the botched egrep-matching got WRONG result for filename containing "+" or "("; fixed by going back to grep rather than egrep, and removing "./" on lines of tmpXREF; avoid changing "../" ??
2009-12: echo "cd" if TEST 2010-06: added --tmp option, to move rather than remove; 2010-11-01: want to optionally clean other than image filetypes; eg: sometimes want cleaning-up of css, js files, sometimes even of htm files; new --keeper=GLOB option to indicate files to be kept even if unreferenced; it replaces the previously builtin matching for "image filetype" (negated); Note: this script makes 2 different distinctions by filetype: (1) which files to scan for links, (2) which ones are removed if unreferenced; PAGE: determines which files are to be scanned for links; KEEP: determines which are immune to cleaning; its negation (set-complement) has replaced REGEX_IMG; negating KEEP pattern: enabling shell option extglob adds !(A|B|C) for negation and other regex-like globs; are msgs altered for the "keeper" view (vs the is-image view) to reassure user about which will be kept?? NOPE, first running with --test will give the info; yet another "matching", identifying links, is done in 2 parts: (1) isolating lines containing 'href=|src=|background=|url(' -- is done while constructing /tmp/tmpXREF, as is removing quote characters; (2) to determine if F is-linked-to, we look for a line in /tmp/tmpXREF containing either "=F" or "(F" (with grep-pattern: [=(]$F having removed quotes); NOTE: to make specifying --keep on cmdline convenient, the case-blind aspect needs to be handled by this script (subject to an option); since caseblindness will be the default, the option has to be --casematters (the opposite of the customary --ignorecase); handle caseblindness via shopt nocaseglob, which obsoletes the unreadable method: KEEP='!(*[Jj][Pp][Gg]|*[Gg][Ii][Ff]|*[Pp][Nn][Gg])' <--for equal-equal matching KEEP='!([Jj][Pp][Gg]$|[Gg][Ii][Ff]$|[Pp][Nn][Gg]$)' <--for equal-tilde (extended-regex) matching, also needs leading dot-star??
sadly feel forced to use the unreadable version as builtin default in order to work with a broken bash; ==only diagnosed as such if such names exist; want --casematters to apply to both keeper- & page-matching, but not to link-matching; link-matching via grep controlled separately, so can use global setting of the shell-option; ie: don't need my own global var and enabling/disabling of the shell-option; NOTE: page-matching uses Pathname-Expansion aka globbing, with @(A|B|C) and !(A|B|C) allowed since extglob enabled; at-sign notation is ugly but best there is; now using equal-equal for keeper-matching, so that the 2 "patterns" can employ same notation; (was using equal-tilde for regex-matching); the matching code can be foiled by many FAULTY VERSIONS OF BASH disobeying the documentation, however the test and warning for buggy bash make it safe; NOTE: the link-detection can get fooled, eg: when one filename a substring of another, but we only err on the safe side; NOTE: after running this script, if anything was removed, then you need to run the same cmdline again; because some links no longer exist; (this also up front) that will usually work, although it can be foiled by a group of files that refer to each other although none is referenced by any keeper; a proper solution needs different & tricky XREF part: start with links from keeper+page files, for each new keeper add its links, deferring removals... a workaround is to manually remove the "root" page of such a file-tree then rerun webclean; CONSIDER: Javascript PAGEs and adding "js" to default for PAGE; note: straightforward javascript code that generates HTML with link is handled correctly, although a link made with string-catenation isn't; however, having js PAGEs makes me question the assumption about PAGEs being in the same directory as where we're doing the cleaning; does my --page=GLOB need to become --page=LIST_OF_PATHNAMES?? 
actually a "GLOB" is a generalization of PATHNAME, but can it serve to match LIST_OF?? yup, with extglob enabled it can; having bash-v4 and having its globstar enabled further improves the path-specifying power; eg: --page='@(../js/*|*htm)' eg: --page='@(../js|.)/@(*htm|*html|*js)' ==NOTE: inter-directory links aren't handled by the present code; need to handle fully-qualified and ../UncleTom kinds of links==!!== CONSIDER: Would a PAGE-pattern that includes all HTML-pages (and javascript files) on the entire website do away with needing --keeper option?? Answer: very nearly, but possibly not altogether? consider some link not handled by webclean's link-detection; --keeper can be workaround for such flaw, like the present flaw wrt fully-qualified; BROKEN vs NON-BROKEN BASH: has there ever been a non-broken bash WRT doubly-bracketed equal-equal operator respecting the nocaseglob shell-option?? I have no evidence that there's ever been a version of bash that's correct in that regard, since I've never had such a version; furthermore I'm having doubts about whether the bash man-page promises that that operator will respect nocaseglob; on my first reading I thought it did, and yet under shopt, it states that nocaseglob only affects pathname-expansion so one can certainly argue that no such promise is made; 2011-01-15: skipping HTML-comments in the link-detection code: cat $PAGE |grep -v $'[ \t]*<!--.*-->' | ##original method cat $PAGE |sed $'s|<!--|\x01|g; s|-->|\x02|g' |sed $':a $!{N;ba}; s|\x01[^\x02]*\x02||g' | ##improved method, as used in WEBLINKCHECK-LL 2011-01-15: for the comparison of $listG to $listM: what's required is a string-not-equal operator, but there is no string-not-equal within double-square-brackets; recall that "==" and "!=" are pattern-matching operators within double brackets; fixed: [[ $listG != $listM ]] --> [ "$listG" != "$listM" ] (need quotes to ensure exactly one "word" on each side of the operator when singly bracketed??)