#!/bin/bash
## Google-Sitemap-generating script -- by Eugene Reimer 2005july04;
## used for each site I look after;
## I first installed the Google-provided sitemap_gen.py (under /pkg/sitemap_gen-1.1); it failed needing newer python, so I wrote my own;
## SIZE-LIMITS:
##	see: https://www.google.com/webmasters/sitemaps/docs/en/protocol.html
##	MAX: 50,000 url's or 10MB in one file -- use a sitemap-index and multiple files if exceeding those limits; can also gzip-compress;
##	note: on 2008-10-19 after including SUBDIR/* files, the sitemap for ER-site is 2,693,442 bytes, has 16,223 <url>-entries; <--2008-11-18: per-img-htm made 1.5x
##	note: on 2009-12-16 sitemap-er BEFORE DynamicPerPhoto + EXCLUDE images: 4,781,330 bytes, has 28,026 <url>-entries;
##	note: on 2009-12-19 sitemap-er AFTER  DynamicPerPhoto + EXCLUDE images: 1,684,857 bytes, has  9,819 <url>-entries; <--YUP, roughly one-third!!
##	ergo, expect to run into the url-limit first ==> gzipping is only useful to get faster uploads...
## PRIORITY:
##	using 0.9 for frontpage & mborchids.htm, 0.8 for other HTML/TXT files, and 0.2 for everything else --??--
##	== may want higher for 2nd-level menu pages ??
##	== may want lower for largely obsolete things like Newpix01nov or fieldtrips00 (noci) ??
## CHANGEFREQ:
##	weekly for frontpage & sitemap, monthly for other HTML files, yearly for images??
## AUTOMATING:
##	1. execute this script as part of webput, so the LASTMOD fields are always correct;
##	2. notify Google/Yahoo/Ask.com of the content change, by issuing via wget/curl:
##		wget www.google.com/webmasters/tools/ping?sitemap=sitemap_url
##		wget (NOTE(review): the second ping URL is missing from this copy of the script -- recover it from the pages cited below)
##	   see:  http://www.google.com/support/webmasters/bin/answer.py?answer=34609&query=ping&topic=&type=  --was https://www.google.com/webmasters/sitemaps/docs/en/submit.html#ping
##	   also: http://www.seroundtable.com/archives/013113.html  <--covers all 3 major search-engines, however since it's wrong for Google...
##	   ==do pinging in WEBPUT-LL, after upload complete==!!==
##
## TYPICAL USAGE:
##	sitemap-gen /noci/website
##
## INPUT:   $1 = the site's local root directory; the sourced WEBURL helper is expected to set $URL from it.
## OUTPUT:  $1/sitemap.xml (rewritten); the previous sitemap is moved to /tmp/sitemap.xml-$BKID for diffing.
## EXIT:    9 if $1 is missing or cannot be entered; otherwise the status of the last command run.

shopt -s extglob				##enable extglob for @(...) etc

## utc_stamp [FILE] -- print FILE's modification time (or the current time when FILE is omitted)
## as ISO-8601 to the second, in GMT, with the zone always spelled "+00:00" as Google wants;
## depending on its version, date has been seen to end the stamp with "Z", "+0000" or "+00:00".
## Prints an empty line if date fails (date's own message goes to stderr).
utc_stamp() {
  local ts
  if (( $# )); then
    ts=$(date --reference="$1" -u -Iseconds)
  else
    ts=$(date -u -Iseconds)
  fi
  ts=${ts/%Z/+00:00}				##only a trailing Z is a zone marker
  ts=${ts/%+0000/+00:00}
  printf '%s\n' "$ts"
}

## emit_url PATH LASTMOD CHANGEFREQ PRIORITY -- append one <url> entry for $URL/PATH to $MAP.
## Reads the globals URL and MAP.
emit_url() {
  local loc=$URL/$1
  loc=${loc//&/&amp;}				##XML-escape ampersands; gives "&amp;" whether or not bash expands & in the replacement
  printf '<url> <loc>%s</loc> <lastmod>%s</lastmod> <changefreq>%s</changefreq> <priority>%s</priority> </url>\n' \
    "$loc" "$2" "$3" "$4" >>"$MAP"
}

if [[ -z ${1-} ]]; then echo "usage: sitemap-gen WEBSITE-DIR"; exit 9; fi	##without this a bare cd would silently use $HOME
cd "$1"  ||  exit 9				##2008-11-01: cd to the website|website dir (was website|dotnet)
. WEBURL "$1"					##2008-11-05: use WEBURL to map $1 into $URL (shared with other scripts)
if [[ -z ${URL-} ]]; then echo "sitemap-gen: website ($1) not supported"; exit; fi	##2008-11-01: new; exit status stays that of the echo, as before

MAP=sitemap.xml;				##can become ...xml.gz for gzip-compressed
DATE=$(utc_stamp)				##ISO-8601 format with time to the second, in GMT; used for the frontpage entry
BKID=${PWD#/}; BKID=${BKID%%/*}			##get BKID = first component of the current path; formerly NOCI/ER/Debwendon but now lowercased as the dirnames are
echo "sitemap-gen: $1  URL=$URL  BKID=$BKID"	##info, possibly DEBUG

##clean						##yanked, now the caller's responsibility to clean...
mv -f "$MAP" "/tmp/sitemap.xml-$BKID"		##move old sitemap; will use for diffs later

## Sitemap header (Google sitemap schema 0.84); the quoted delimiter keeps the text literal.
cat >"$MAP" <<'EOF'
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.google.com/schemas/sitemap/0.84"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.google.com/schemas/sitemap/0.84
http://www.google.com/schemas/sitemap/0.84/sitemap.xsd">
EOF
emit_url "" "$DATE" weekly 0.9			##the sitemap-entry for the frontpage

for f in * */*; do
  [[ -e $f ]] || continue			##skip a glob that matched nothing and was left as the literal pattern
  R=$f; D=.; [[ $f == */* ]] && D=${f%/*}	##R = file the timestamp is taken from; D = directory part of $f (or .)
  [[ $f == UNINDEXED* ]]		&& continue	##skip "hidden" files
  [[ $f == ANON* ]]			&& continue	##skip "hidden" files
  [[ $f == BUDSPHOTOS* ]]		&& continue	##skip "hidden" files (for NOCI)
  [[ $f == *_be* ]]			&& continue	##skip "hidden" files (for NOCI)
  [[ $f == OrchidPinCata* ]]		&& continue	##skip "hidden" files (for NOCI)
  [[ $f == wa-* ]]			&& continue	##skip "hidden" wa-stats dir & subfiles thereof
  [[ $f == webalizer* ]]		&& continue	##skip the stats directory & subfiles thereof, at HostExcellence
  [[ $f == stats* ]]			&& continue	##skip the stats directory & subfiles thereof (OBSOLETE??)
  [[ $f == cgi-bin* ]]			&& continue	##skip cgi-bin directory & subfiles thereof
  [[ $f == photo-upload* ]]		&& continue	##skip photo-upload directory & subfiles thereof
  [[ $f == nativeorchid.htm && $URL == *erei* ]] && continue	##skip the skill-testing page (ER-ONLY)
  [[ $f == FP[MO]-* ]]			&& continue	##2010-03: skip FPM-* FPO-* frontpage image (for ER)
  [[ $f == *[2-9][0-9][0-9]\.jpg      && $URL == *erei* ]] && f=${f/.jpg/.htm} && R=$D/index.htm	##2009-12-19: (ER-ONLY) for 3-digits.jpg non-thumb, mk htm w TS from $D
  [[ $f == *[0-9][0-9][0-9][0-9]\.jpg && $URL == *erei* ]] && f=${f/.jpg/.htm} && R=$D/index.htm	##2009-12-19: (ER-ONLY) for 4-digits.jpg, mk htm w TS from $D
  [[ $f == *\.@(jpg|gif|png) ]]		&& continue	##2009-12-19: skip image (LONG NEEDED...)
  [[ $f == index.htm* ]]		&& continue	##skip the frontpage; it's done specially (as just slash) above
  #[[ -d $f ]]				&& continue	##was skip SUBDIR when using links to SUBDIR/index.htm
  [[ -d $f ]] && ! [[ -e $f/index.htm ]] && continue	##skip SUBDIR only if it lacks index.htm (nearly OBSOLETE, except for NOPERCART-DEMOx)
  [[ $f == */index.htm ]]		&& continue	##skip SUBDIR/index.htm file, since will use just SUBDIR/ for it

  DATE=$(utc_stamp "$R")			##get TS from $R, same as $f except for ER-only invented htm page
  PRIO=0.2; FREQ=yearly				##for pdf/xml/(image)/other, use low priority, yearly update-frequency
  [[ -d $f ]]  &&  { PRIO=0.8; FREQ=monthly; }	##give SUBDIR the same priority and freq as HTML (is alias for its index.htm)
  [[ -d $f ]]  &&  { f=$f/; }			##give SUBDIR an ending-slash!!
  [[ $f == *htm ]]			&& PRIO=0.8	##higher priority for HTML pages
  [[ $f == *txt ]]			&& PRIO=0.8	##higher priority for TXT pages, eg ER-genealogy; icon/js/bash have HTML pages?
  [[ $f == mborchids.htm ]]		&& PRIO=0.9	##ensure mborchids.htm more important than mborchids-KJ (for NOCI)
  [[ $f == shopping.htm && $URL == *nativeorchid* ]] && PRIO=0.99	##ensure shopping.htm the most important page (NOCI-ONLY, avoid ER)
  [[ $f == IT-InuvikTrip.htm ]]		&& PRIO=0.9	##ensure InuvikTrip page more important than most (for NOCI)
  [[ $f == CP-Sex+Murder-Slideshow.htm ]] && PRIO=0.9	##ensure Sex+Murder page more important than most (for NOCI, added 2010-04)
  [[ $f == nopercart.htm ]]		&& PRIO=0.9	##ensure nopercart.htm more important than most (for ER)
  [[ $f == Thiessen/contact.htm ]]	&& PRIO=0.9	##ensure Thiessen contact page more important than most (for ER)
  #[[ $f == Thiessen/Review-of-Dictionary-by-DrJohnCon* ]] && PRIO=0.9	##ensure that review more important than most?? (for ER)
  [[ $f == *htm ]]			&& FREQ=monthly	##monthly for HTML files
  [[ $f == pixDate.htm ]]		&& FREQ=weekly	##weekly for pixDate.htm HTML file (for ER; may add other special-cases)
  [[ $f == nopercart.htm ]]		&& FREQ=weekly	##weekly for nopercart.htm HTML file (for ER; may add other special-cases)
  [[ $f == robots.txt ]]		&& FREQ=monthly	##monthly for robots.txt; yearly ok for other *txt (eg: ER-genealogy)
  [[ $f == sitemap.xml* ]]		&& FREQ=weekly	##weekly for sitemap files - this CHANGEFREQ is likely ignored??
  emit_url "$f" "$DATE" "$FREQ" "$PRIO"		##==sitemap-entry
done
echo "</urlset>" >>"$MAP"

## The Unix-timezone to Google-timezone fixup now happens per timestamp in utc_stamp.  It used to be a
## whole-file pass, chgsed -n -q 's|Z|+00:00|g; s|+0000|+00:00|g' $MAP (presumably an in-place sed),
## whose patterns also match any "Z" or "+0000" occurring inside a URL.

echo "sitemap-gen: all done - to see changes:  dif /tmp/sitemap.xml-$BKID $1/$MAP"	##info
##dif /tmp/sitemap.xml-$BKID $MAP |less		##DEBUG, normally yanked;
exit

## ---- change notes follow as plain text; bash never reads past the exit above ----
2005july04: Google says "Invalid date" need colons in the -05:00 and -06:00 in parts??
    (their examples differ from my Linux-date re ISO-8601)
    switched to -u (GMT), and replacing "Z" with +00:00 as wanted by Google, then it passed 05july05
2005nov26: Google once again says "Invalid Date" for all 4 sitemaps (don't know exactly when it started, as I hadn't checked for about a month);
    my sed-fixup stopped working, due to changes in the Fedora FC4 version of date;
    it produces +0000 rather than the "Z" I was expecting -- now revising that to +00:00 also.
2008-10-10: combined sitemap-gen-ER sitemap-gen-Debwendon sitemap-gen-NOCI into this script (briefly had each of those calling this one);
    revised for files under SUBDIRs: (A) include dir having index.htm page but then omit that page; or (B) after 2008-11-13: omit dir, include index.htm;
2008-11-01: new $1 param eliminates the need for -ER etc versions; USAGE was: cd /noci/website; URL=http://www.nativeorchid.org sitemap-gen;
2008-11-05: now using WEBURL to map $ROOTDIR into $URL -- WEBURL is shared with other scripts such as webmv;
2008-11-13: about SUBDIR-refs: went with index.htm (rather than just slash), so testing locally works; (top-level remains as just slash);
==COMPRESS using gzip (making sitemap.xml.gz) for faster uploads??
==MAKING FASTER: rewrite using ls|sed... to make it a lot faster, and immune to "arglist too long" (although for-loop isn't subject to that limit)
==NOTE: ls --time-style=full-iso provides nearly the desired time-format; revise 2002-07-15 20:06:01.000000000 -0500 --> 2002-07-15T20:06:01-05:00
2009-12-19: ER-only, for each 3-digits.jpg file, add samename.htm -- get TS from the per-DAY-page for that img??
    --TEMP: tried using TS from NEWEST $D (dir jpg is in), but that was just too slow;
    --TEMP-2: just use TS of $D;
    --2009-12-21: now that one-dir-per-day reorg is done, am using TS from $D/index.htm, so it will work as wanted--!!--
2009-12-19: HAVE DECIDED to omit images (jpg|gif|png); ==including sitemap.xml (in itself) may well be pointless too==??==
    sitemap-er will be roughly one-third the size it was -- YUP; see notes in cvt-to-dynamic-per-img-pages-ER;
==NEEDS: GLOBIGNORE="." in order to see names like .Xmodmap; would then add rule to skip .htaccess;
2010-01-26: noticed that google has crawled http://ereimer.net/nativeorchid.htm -- a tricky experiment best left uncrawled--!!--
    exclude from sitemap; exclude via robots.txt; AND use nofollow in the link from /rants/are-you-phishing-proof.htm
2010-03: after local-APACHE, rethink use of SUBDIR/index.htm: went back to using SUBDIR/ and suppressing index.htm;
    /er/website/.htaccess: revised accordingly, replacing "/index.htm" with just "/" <--all occurrences of "/index.htm" WERE IN RHS
    /noci/website/.htaccess: revised accordingly, replacing "/index.htm" with just "/" <--only one occurrence, on wa-stats/
    /debwendon/website/.htaccess: no revision needed as it contains no "/index.htm";
    webmv: in mk_redirect (making .htaccess entry), replace SUBDIR/index.htm with just SUBDIR/ in either LHS or RHS;
    revised all links within ER-site that were to .../index.htm, affecting: about.htm, aboutMore.htm, links.htm, pixDate.htm(LOTS), sitemap.htm,
        20080704/index.htm, Thiessen/custom404page.htm, programs/related.htm, programs/webalizer-ER.htm, rants/ChristmasCards.htm
    [] do I still have any SUBDIR lacking index.htm, after eliminating pixDATE.htm files??
       YUP my NOPERCART-DEMOx dirs still lack index.htm==!!==
2010-03: skip FP[MO]-* -- the frontpage images (for ER);
    other unused former frontpage-images (Calypso-05297*jpg, Me*jpg) would go away after a webclean;
2011-01-06: BEWARE: bash-v4 has changed the meaning of =~ within double-square-brackets: specifically what quoting in RHS does;
    best to avoid it, using == and the extglob extensions since those work everywhere;
    added:   shopt -s extglob
    revised: [[ $f =~ "\.(jpg|gif|png)\$" ]] --> [[ $f == *\.@(jpg|gif|png) ]]