#!/bin/bash
## invoked from WEBGETLOGS (cron-job) to do the daily webalizing -- Eugene Reimer 2009-12-08;
## receives DOM (domain-name), invokes webalize after determining which logfiles to pass;
## PREREQ: dateplusdays -- from http://ereimer.net/programs/general-purpose-scripts.htm
## USAGE EXAMPLES:
##   webalize-auto nativeorchid.org
##   webalize-auto ereimer.net
## EXIT STATUS:
##   8  unsupported domain-name;
##   9  could not work out what to webalize (missing prereq, missing/unparsable report);
##   otherwise the status of the last command run (webalize upload step), 0 when nothing to do;

##--map the argument(s) onto DOM (domain-name) and PROJ (project short-name);
##  done BEFORE the tee-pipeline: an exit inside a pipeline stage only leaves that subshell,
##  and PROJ is needed for the name of the msgs-file;
case "$*" in
  *debwendon*)    DOM=debwendon.org;    PROJ=debwendon ;;
  *ereimer*)      DOM=ereimer.net;      PROJ=er ;;
  *nativeorchid*) DOM=nativeorchid.org; PROJ=noci ;;
  *) echo "webalize-auto lacks support for names like $*" >&2; exit 8 ;;   ##quit if args invalid
esac

##--print the day-of-month found in the last data row of the "Daily Statistics" table of report-file $1;
##  prints nothing when no such table or row is found;
##  NOTE(review): the sed program that originally did this was damaged beyond recovery, so this is a
##  re-implementation; it assumes the table heading contains the text "Daily Statistics", that the
##  table has one <TR> per day with the day-number as its first cell, and that no totals-row with a
##  leading number follows -- confirm against a real wa-Monthly_YYYYMM.htm file;
last_report_day() {
  tr '\n' ' ' <"$1" |
    awk '{
      if (!sub(/.*Daily Statistics/, "")) exit        # drop everything up to the (last) table heading
      sub(/<\/[Tt][Aa][Bb][Ll][Ee]>.*/, "")            # drop everything after the end of that table
      n = split($0, row, /<[Tt][Rr][^>]*>/)            # one element per table row
      for (i = 1; i <= n; i++) {
        gsub(/<[^>]*>/, " ", row[i])                   # strip the remaining tags
        if (split(row[i], cell, " ") > 0 && cell[1] ~ /^[0-9]+$/) day = cell[1]
      }
      print day                                        # last numeric first-column value
    }'
}

##--work out which logfiles still need webalizing, webalize them, upload the wa-stats subdir;
##  reads globals DOM and PROJ; all msgs go to stdout/stderr (caller redirects);
main() {
  local today report wday yyyymm first_undone cutoff dt f h
  local -a args=()                                     ##ARGS-list for webalize-call

  command -v dateplusdays >/dev/null || { echo "webalize-auto: dateplusdays not found (see PREREQ)"; return 9; }

  today=$(date +%Y%m%d)                                ##current date in YYYYMMDD style

  ##newest webalizer-Monthly-report filename: the glob expands in sorted order, keep the last one
  for report in /pix/WEBSTATS-"$PROJ"/wa-Monthly_*htm; do :; done
  [[ -e $report ]] || { echo "webalize-auto: no wa-Monthly report found in /pix/WEBSTATS-$PROJ"; return 9; }

  wday=$(last_report_day "$report")
  [[ $wday =~ ^[0-9]+$ ]] || { echo "webalize-auto: cannot find last day in $report"; return 9; }

  yyyymm=${report##*_}; yyyymm=${yyyymm%.htm}          ##year+month from the report filename
  ##last date in webalizer-report (only its first 2 hours have been seen) = first date still to do;
  ##10# forces base ten so that a day written as 08 or 09 is not taken for bad octal
  printf -v first_undone '%s%02d' "$yyyymm" "$((10#$wday))"

  cutoff=$(dateplusdays "$today" -6)                   ##a missing logfile on/after this date stops us
  [[ $cutoff =~ ^[0-9]{8}$ ]] || { echo "webalize-auto: dateplusdays gave bad result '$cutoff'"; return 9; }

  ##NOTE(review): the header of this loop and the missing-logfile test were reconstructed from the
  ##surviving fragments plus the METHOD notes at the end of this file -- confirm against a pristine copy;
  dt=$first_undone
  while (( dt < today )); do
    f=/pix/WEBLOGS/access_log-$DOM.$dt                 ##normal Daily-logfile for $dt
    if [[ ! -e $f ]] && (( dt >= cutoff )); then
      echo "webalize-auto: quit at $dt"                ##only a few days ago (can fix), so stop here;
      break                                            ##for older days we do the best we can
    fi
    for h in "$f"-hourly*; do                          ##add all HOURLYs for $dt, if any
      [[ -e $h ]] && args+=("$h")
    done
    [[ -e $f ]] && args+=("$f")                        ##add the Daily itself
    dt=$(dateplusdays "$dt" 1)
    [[ $dt =~ ^[0-9]{8}$ ]] || { echo "webalize-auto: dateplusdays gave bad result '$dt'"; return 9; }
  done

  if (( ${#args[@]} )); then                           ##do nothing if ARGS empty
    echo "webalize-auto: calling webalize ${args[*]}"  ##msg
    webalize "${args[@]}"                              ##==webalize the ARGS-list
    WEBPUT-LL "/$PROJ/website" wa-stats                ##==upload wa-stats subdir, and nothing but--!!--
  fi
}

##==msgs to stdout, and into /tmp-file for cron-use (fixed name on purpose: only ONE file per PROJ)
main 2>&1 | tee "/tmp/webalize-auto-msgs-$PROJ"
exit "${PIPESTATUS[0]}"                                ##status of main, not of tee

##--everything below is notes, never executed; kept in a quoted here-document so the file parses cleanly;
##  two spots lost text before this cleanup and are reproduced as found:
##  the for-loop header under OBSOLETE PARTS, and the "switched to" entry dated 2009-12-10;
: <<'__NOTES__'
==========
== METHOD:
==========
normally we webalize one day's logfile, for preceding day;
however for robustness, we determine the last date that has been webalized, then:
  we do the best we can for all unwebalized days up to curdate-7;
  but stop at any bad day after that, leaving that day and any later days undone;
  eg: on 2009-12-08, newest-logfile:20091207, the 20091201 logfile has just had its last chance, ergo: we stop at bad day among 200912{02,03,04,06,07};
--WEBGETLOGS-changes on 2010-03-09: scrapped the "dubious" renaming to "-altfromGDATE", added need to handle "-hourly";

Notes on logfile names:
  /pix/WEBLOGS/access_log-$DOM.$DATE                     <--normal Daily-logfile for $DATE
  /pix/WEBLOGS/access_log-$DOM.$DATE-altfrom$GDATE       <--dubious Daily-logfile for $DATE obtained on $GDATE; when multiple such we use most recent??
  /pix/WEBLOGS/access_log-$DOM.$DATE-hourly$GTIMESTAMP   <--Hourly-logfile for $DATE obtained at $GTIMESTAMP; we use catenation of ALL such PLUS normal-Daily!!

To get last date webalized:
  we look at newest /pix/WEBSTATS-$PROJ/wa-Monthly_YYYYMM.htm file, last row in "Daily Statistics" table;
  note that logfile for $DATE contains records for first 2 hours of following day;

OBSOLETE PARTS:
===============
##==building ARGS, the version that handled dubious-DAILYs, but not HOURLYs:
ARGS=;                                                 ##init ARGS-list for webalize-call
for ((DT=WDATEN; DT=$(dateplusdays $DATE -6) ));then echo "webalize-auto: quit at $DT"; break   ##quit at nonexistent/dubious day within last 6
  elif for A in $F-altfrom*;do true;done; [ -e $A ];then ARGS="$ARGS $A"   ##add $A to ARGS, for A the newest of the dubious logfiles
  fi
done

CHANGE-LOG:
===========
NOTE: one thing that can go wrong: when last webalized log has no logrecords from early-AM hours of next day, then it will be fed in twice;
  could happen on ultra low-activity website, or if webhoster-computer down during those hours -- expect it to be very rare, and likely harmless (webalizer kluge);

2009-12-08: the simplest approach would be to do the webalizing + uploading from webput, so it only happens when I'm ready to upload other changes -- but decided on:
  WEBPUT-LL now supports optional 2nd-param (largely untested); using that option here to restrict upload to wa-stats dir;
  because automated daily-upload would otherwise be too bloody dangerous wrt unfinished changes;

2009-12-10: improved MSGS, webalize just writes to stdout/stderr; then this script redirects everything from both to tmpfile;
  this solves seeing msgs out-of-order in manual use, losing chg,etc msgs in cron-use, also means only ONE /tmp/webalize-XX file per PROJ--!!--
  switched to ; &>$MSG;cat $MSG went to 2>&1|tee $MSG <--nicer for interactive use, unchanged for cron-use (altho a gotcha wrt vars);

==see webalize; has notes on running weblog-search-strings-report...
==integrating with weblog-search-strings-report:
==  catenate onto monthly-catted-logfile, and run weblog-search-strings-report here, pass param to webalize telling it to do the replacement??
==  OR, that param tells webalize to do such catenating, running, replacement??

2010-03: WEBGETLOGS-changes: scrapped the "-altfrom" naming of "Dubious" logfiles -- since was doing no good AND HostExcellence has screwed up the naming/numbering...
  however, WEBGETLOGS is now leaving one or more HOURLYs, named .DATE-hourlyTIMESTAMP, whenever there has been "Restart Problems" at HE;
  when HOURLY(s) exist, they + DAILY are catenated;
  Need to see examples, but my guess is that lost-data happens when they reboot/Apache-restart that empties the log, ergo don't expect them to overlap==??==
__NOTES__