#!/bin/bash
## script to invoke webalizer on Webhoster-provided access_log files; E Reimer 2005-Mar;
##   the logfiles for all 3 sites are now under /pix/WEBLOGS (daily from 20081213 on); no longer catenating under /pix/WEBSTATS-$PROJ;
## PREREQ: webalizer-ER -- from http://ereimer.net/programs/webalizer-ER.htm;
## PREREQ: chg chgsed clean dateplusdays -- from http://ereimer.net/programs/general-purpose-scripts.htm;
## USAGE EXAMPLES:
##   webalize /pix/WEBLOGS/access_log-nativeorchid.org.20091001
##   webalize /pix/WEBLOGS/access_log-nativeorchid.org.200910*
## ARGS: one or more logfile paths; the site is recognized from a substring of the args;
##   the date is taken from the LAST arg: the text after its final dot, minus anything from a hyphen on;
## EXIT STATUS: 8 if none of the supported site-names occurs in the args;

shopt -s extglob                                      ##enable extglob for @(...) etc

##==identify the site; sets URL and PROJ, both used by the rest of the script
AT="$*"                                               ##all args as ONE space-joined string ("$*" says so explicitly, "$@" is a list)
case $AT in
  *debwendon*)    URL=debwendon.org;    PROJ=debwendon ;;
  *ereimer*)      URL=ereimer.net;      PROJ=er ;;
  *nativeorchid*) URL=nativeorchid.org; PROJ=noci ;;
  *) echo "webalize lacks support for names like $AT"; exit 8 ;;   ##quit if args invalid
esac

##==get date from last logfile arg; taken from the arg itself, so a path containing spaces is not mis-split
last_arg="${@: -1}"                                   ##last positional parameter
LASTARGDATE=${last_arg##*.}                           ##strip everything through the final dot
LASTARGDATE=${LASTARGDATE%%-*}                        ##strip from the first hyphen on
LASTARGDATEPLUSONE=$(dateplusdays "$LASTARGDATE" +1)  ##add one day, needed for MG as logfile has a wee bit of next day

##==get ending year and month for MG (Months-in-Graph) calculation; both come out empty if date rejects the value
Y2=$(date -d"$LASTARGDATEPLUSONE" +%Y); M2=$(date -d"$LASTARGDATEPLUSONE" +%-m)
Y1=2005; M1=2; if [[ $PROJ != noci ]];then Y1=2008; M1=12; fi   ##noci starts 200502; er|deb* start 200812; could use FRSTM??
((MG=(Y2-Y1)*12+M2-M1+1)); ((MG<12)) && ((MG=12)); ((MG>72)) && ((MG=72)) ##MG=months-in-graph; eg: MG=59 noci 200912, MG=13 er 200912 OPT="-f -Q -n$URL -r$URL -r${URL/.*/.com} -C100 -K120 -k$MG -Dwebalizer.dns -N10 -j -p" ##org-->com for noci; DNS(-N10) GeoDB(-j) Incr(-p) ok for reruns if [[ $AT == *.200[56]* ]];then OPT="$OPT -R0"; fi ##200502..200610 logs lack referrer => need -R0 cd /pix/WEBSTATS-$PROJ || exit ##==work in webstats dir rm -f /tmp/webalize-tmp-$PROJ ##cleanup catenation-tmpfile from previous run (not crucial) if [ $# -gt 1 ];then cat "$@" >/tmp/webalize-tmp-$PROJ; set /tmp/webalize-tmp-$PROJ; fi ##for multiple input-files, make catenated file, set as arg echo -e "webalize: AT:$AT;\nwebalizer $OPT $@" ##msg webalizer $OPT "$@" ##==WEBALIZE WEBSITEDIR=/$PROJ/website/wa-stats COPY=Y; if [ $COPY ];then ##==COPY files to website + modify its index.htm GENDATELINE=$(grep '^Generated' wa-StatsIndex.htm) ##get generated-on-date line WEBLASTM=$(cat $WEBSITEDIR/index.htm |sed -n 's|.*HREF="wa-Monthly_\(.*\).htm.*|\1|p' |tail -n1) ##get newest month (YYYYMM) in WEBSITE/index.htm NEW=$(cat wa-StatsIndex.htm |tr '\n' ' ' |sed 's|Generated.*|$GENDATELINE|; ##revise generated-on-date; ==USE -p (prompt) for DEBUG-ONLY /wa-Monthly_$WEBLASTM.htm/,/[0-9] Total/d; /<\/TABLE><\/CENTER>/i$NEW" $WEBSITEDIR/index.htm ##and replace WEBLASTM-and-YEARLY-lines with $NEW clean; for F in wa-[A-Z]*;do [[ $F != wa-StatsIndex.htm ]] && cp -pufv $F $WEBSITEDIR; done ##copy new wa-[A-Z]* files, except wa-StatsIndex fi ##ENDIF $COPY BDIR=/pix/bkup-webstats if [[ $LASTARGDATEPLUSONE == *01 ]];then ##==on last-of-month, make Monthend backup of state zzH=zz-webalizer.hist-thru-$LASTARGDATE; cp -pfv webalizer.hist $zzH ##keep webalizer.hist-copy named-by-date zzC=zz-webalizer.current-thru-$LASTARGDATE; cp -pfv webalizer.current $zzC ##keep webalizer.current-copy named-by-date echo "Monthend backup of webalizer-state in $zzH and $zzC" ##msg fi if [[ $LASTARGDATE == *1231 ]];then ##==on 
Dec31, make Yearend backups FRSTM=$(cat wa-StatsIndex.htm |sed -n 's|.*HREF="wa-Monthly_\(.*\).htm.*|\1|p' |head -n1) ##get oldest month (YYYYMM) in StatsIndex zzSG=zz-StatsGraph-$FRSTM-thru-$LASTARGDATE.png; cp -pfv wa-StatsGraph.png $zzSG ##keep StatsGraph-copy named-by-date zzSI=zz-StatsIndex-$FRSTM-thru-$LASTARGDATE.htm; cp -pfv wa-StatsIndex.htm $zzSI ##keep StatsIndex-copy named-by-date chgsed -n "s|wa-StatsGraph.png|$zzSG|g" $zzSI ##fixup StatsIndex-copy wrt StatsGraph-name BD=$BDIR/webstats-$PROJ-$FRSTM-thru-$LASTARGDATE; rm -f -R $BD; mkdir $BD; cp {wa-,weba,zz}* $BD ##bkup-copy named-by-date; needed?? 2010-10:remove-->rm -f -R echo "Yearend backup copy under $BD" ##msg fi exit ============== LOGFILE NOTES: ============== 2005-Feb: for NOCI, started getting logs at MTS-hosting, they lack Referrer and User-Agent but are DNSed; part of 2005-Sep has different logs (from NewWinnipeg) needing DNS-lookups; have nothing for 2nd half of 2006-Oct (MTS had writing disabled during outsourcing-migration); 2006-Nov: MTS went to an outsourced hoster, and NOCI began to get a modern log complete with Referrer and User-Agent; 2008-Mar: NOCI+ER+Debwendon went to HostExcellence where logs are modern but non-DNSed, but I was slow to revise log-downloading; for NOCI, lost logs+stats for 2008-Mar13 through 2008-Jun08, but have stats for 2008-Jun thru 2008-Dec (webalized by HE with v2.01); for ALL-3, have daily logs from 2008-12-13 on; OPTIONS used: 2005|2006: MTS-logs lack referrer => need -R0; 200509 has been rerun with DNSlookup(-N10); will use DNSlookup(-N10) + GeoDB(-j) on reruns; 2007|2008: most are DNSed but used DNSlookup (-N10); will also use GeoDB (-j) on reruns; 200812: started using incremental (-p) and GeoDB (-j); using -j will be desirable if rewebalizing older logs; and -p will be harmless, just means slightly different state-restoring?? 
=============== == MAKE RECIPE: =============== after source mods, under /pix/pkg/webalizer-VERS, run: make; sudo make install ===================== == WEBALIZER OPTIONS: ===================== ## -f to prevent ignoring out-of-sequence records -- they'll be treated as having same time as last in-sequence one ## -Q to suppress all error & warning messages -- otherwise get thousands of warnings about referrer (in MTS logs); only needed for 2005+2006==??== ## -x htm (or HTMLExtension in conf) to get output files called xx.htm -- otherwise .html --done in /etc/webalizer.conf ## -n nativeorchid.org otherwise get s010600e029965b45.wp.shawcable.net (my machine's hostname) as the website-name; was www.nativeorchid.org ## -r nativeorchid.org hide referrer 'nativeorchid.org' ## -U 100 to get bigger URL's table --GO BACK TO DEFAULT OF 30==(done)== ## -C 100 to get bigger Country table ## -R 0 to suppress the Referrer table --referrer is meaningless junk in MTS logs --OMIT WITH NEWER LOGS; but beware of Referer-spam==??== ## -D webalizer.dns (or DNSCache in conf) DNS-Cache-name --ignored unless -N also specified --in /etc/webalizer.conf 2009-09; was weba-dns-cache ## -N 10 Number of DNS processes (0=disable) --for NewWpg|HostExcellence, 5 to 20 suggested --ENABLE for v2.20; => AVOID STDIN, needs 2-passes... ## TopUsers 0 Suppress Usernames table -- only an issue for 2005-09 (NewWinnipeg quirk) --in /etc/webalizer.conf, since no cmdline option!! ## -P txt -P pdf (or PageType in conf) --default is htm|html|cgi --in /etc/webalizer.conf, added txt|pdf|pl|sh ## -M 1 simplify user-agent-names; levels are: 0:as-is to 5:Browsername+majorversion ==consider some simplification?? 
## -p preserve state (incremental) ## -b ignore state (incremental) ## -i ignore history file ## -K 120 months in Index (12 to 120 allowed) --new in 2.20 ## -k 72 months in Graph (12 to 72 allowed) --new in 2.20 ## -j turns on GeoDB geolocation services --new in 2.20 (yup) ==START USING 2008-12, together with -N 10/0 ?? ## NOTE: multiple months on one run is handled; also partial months via "incremental"-mode with state-info in webalizer.current; ## ## MORE INFO ON WEBALIZER OPTIONS: ## webalizer -h ## /pix/pkg/webalizer/README + DNS.README for reverse-DNS-lookups ## /etc/webalizer.conf + webalizer.conf.sample ## http://www.webalizer.org/faq.html <-- online FAQ by the author of webalizer (was http://www.mrunix.net/webalizer/faq.html) ## ## CONSIDER: pre-processing the logfile, applying recent webpage renames to the access-records, before the summarizing?? =================== == OBSOLETE STUFF: =================== ## OBSOLETE; RENAMES taking care to avoid re-revising references--!!-- ##rm -f wa-StatsIndex.htm wa-StatsGraph.png ##webmv index.htm wa-StatsIndex.htm ##webmv usage.png wa-StatsGraph.png ##webrename usage_ wa-Monthly_ usage*htm ##==was MonthlyReport_ in my version 2.01 mods... ##webrename ctry_usage_ wa-Cntry_ *png ##webrename daily_usage_ wa-Daily_ *png ##webrename hourly_usage_ wa-Hourly_ *png ##chgsed "s|usage_|wa-Monthly_|" wa-StatsIndex.htm ##-- NOTE: the only other webalizer file, besides those being renamed above, is called webalizer.hist; ## webalizer uses webalizer.hist to produce a correct "index.htm", without needing the previously produced one?? ## yes that works!! 
## but do need to re-revise links in index.htm (wa-StatsIndex.htm); usage_ --> wa-Monthly_ ## NOTE: these renames & revising no longer needed, due to source mods; ## NOTE: source mods (2.01): revised webalizer.hist --> wa-webalizer.hist; ==DROPPED THIS in 2.20; ##--2008-12: OBSOLETED by source mods; Add "Webalizer-ER" to Index + Monthly reports; ##chgsed -n 's|Webalizer Version 2.01<|Webalizer-ER Version 2.01-10-ER<|g; s|http://www.mrunix.net/webalizer/|http://ereimer.net/programs/webalizer-ER.htm|g' wa-*htm ##chgsed -n 's|Webalizer Version 2.20<|Webalizer-ER Version 2.20-03-ER<|g; s|http://www.webalizer.org/|http://ereimer.net/programs/webalizer-ER.htm|g' wa-*htm ##--OBSOLETE; ADD REMARKS to wa-StatsIndex.htm, that are specific to nativeorchid use of webalizer; ((reworded 06nov)) ##==2008-12: OBSOLETED by leaving unchanged that part of $WEBSITEDIR/index.htm==!!== chgsed " s|Webalizer Version.*|Webalizer-ER Version 2.01-10-ER|; s|http://www.mrunix.net/webalizer/|http://ereimer.net/programs/webalizer-ER.htm|g; s||\


\

Note: click on any month, in the table above, to get detailed stats for that month.\

Note: these stats are combined for our two domain-names: nativeorchid.com and nativeorchid.org.\

Until 2006-Oct, our webhoster ManitobaTelephoneSystem provided a NON-chronological log \ (requires sorting before webalizing) that lacked referrer and browser info; \ they discarded the log for the 2nd half of 2006-Oct while in the slow process of farming-out their webhosting business; \ then in 2006-Nov, we began to get a modern log complete with Referrer and User-Agent.\

In 2008-Mar, we changed webhosters to HostExcellence; they do the webalizing for us although not with the desired options nor version of webalizer; \ due to an accident we lost the stats for 2008-Mar13 through 2008-Jun08 (our webhoster has some backups but not of logs nor of webalized logs).\ |" wa-StatsIndex.htm ##--OBSOLETE; Remove the "Last 12 months" from TITLE and BODY of wa-StatsIndex.htm: chgsed '/Summary Period: Last 12 Months/d; //s| - Last 12 Months||' wa-StatsIndex.htm ##2008-12: added TITLE-fixup; then made obsolete... ##--OBSOLETED BY DNS_LOOKUPS with 2.20; For 200509 ONLY (NewWinnipeg), add Remarks about part of month being in different format ##-- this was to explain the higher than usual number of UnResolved/Unknown Country-code -- solved by -N ... ##chgsed "s|.*<HR>|Note: for one-third of this month we used a different webhoster, whose access-log lacked reverse-dns-lookup.<p><HR>|" wa-Monthly_200509.htm ## ##cat "$@" |webstatslogsort |webalizer $OPT ##==old SORT+WEBALIZE, pipe voided DNS-lookups (needs 2-passes) ## CONSIDER: standardizing the "GET" log-records, to combine (relatively rare) cases like these -- 301-redirects will have OBSOLETED this: ## GET http://www.nativeorchid.com/guestbookview-051005.htm ## GET /guestbookview-051005.htm ## CONSIDER: option for GeoDB-lookup to overide Reverse-DNS-lookup?? (see -j -D -N options) ##--2009-11: OLD method (from 2008-12) of copying to website, for month-at-a-time, that only added months: cat wa-StatsIndex.htm |tr '\n' ' ' |sed 's|</*TABLE|\n&|g; s|<TR|\n&|g' | ##convert to one-line-per-TR format grep 'wa-Monthly' | ##exclude all but monthly lines while read;do M=${REPLY#*Monthly_}; M=${M%.htm*}; ((M>WEBLASTM)) && echo "$REPLY"; done | ##only monthly lines > WEBLASTM cat >tmpNewLines ##to tmpfile chgsed "/<\/TABLE><\/CENTER>/i<===INSERT-HERE===>" $WEBSITEDIR/index.htm ##insert tmpfile into StatsIndex (painful with sed??) 
chgsed "/<===INSERT-HERE===>/r tmpNewLines" $WEBSITEDIR/index.htm ##insert tmpfile into StatsIndex (painful with sed??) chgsed "/<===INSERT-HERE===>/d" $WEBSITEDIR/index.htm ##insert tmpfile into StatsIndex (painful with sed??) ## ##--2009-11: first try at NEW method -- rewritten to use single chgsed-cmd (so can use -p): chgsed "/wa-Monthly_$WEBLASTM.htm/,/[0-9] Total/c<==INSERT-HERE==>" $WEBSITEDIR/index.htm ##replace WEBLASTM-and-YEARLY-lines with "INSERT-HERE" chgsed "/<==INSERT-HERE==>/r tmpNewLines" $WEBSITEDIR/index.htm ##insert tmpfile into StatsIndex, after old WEBLASTM-line chgsed "/<==INSERT-HERE==>/d" $WEBSITEDIR/index.htm ##then delete the old WEBLASTM-line ##--2009-11: OLD test to detect Yearend: if [[ $LASTM == *12 ]];then ##==in Dec -- for month-at-a-time ##--2009-11: OLD way to calculate MG (months-in-graph): DT=$(dateplusdays $(date +%Y%m%d) -1) ##use yesterday's date Y=$(date -d$DT +%Y); M=$(date -d$DT +%-m); ##get ending year and month for MG-calculation (months-in-graph) if [[ $PROJ == noci ]];then ((MG=(Y-2005)*12+M-1)) ##months-in-graph for From-2005-Feb graph; eg: MG=47 for 2008-12 elif [[ $PROJ == er ]];then ((MG=(Y-2008)*12+M-11)) ##months-in-graph for From-2008-Dec graph; eg: MG=1 for 2008-12 elif [[ $PROJ == deb* ]];then ((MG=(Y-2008)*12+M-11)) ##months-in-graph for From-2008-Dec graph; eg: MG=1 for 2008-12 fi ##--2009-11: OLD way to name Monthend + Yearend copies -- now using LASTARGDATE instead LASTM=$(cat wa-StatsIndex.htm |sed -n 's|.*HREF="wa-Monthly_\(.*\).htm.*|\1|p' |tail -n1) ##get newest month (YYYYMM) in StatsIndex LASTM=${dateplusdays ${LASTM}01 -1 +%Y%m} ##assume newest-month has one-hour's data from 1st--??-- ##--2009-11: first attempt at last-of-month test did NOT handle Feb, nor leapyears -- later realized the easy way is to test if date-plus-1 ends in 01; if [[ $LASTARGDATE =~ '^20??((09|04|06|11)30|..31)$' ]];then ##also flawed wrt dot vs query-mark?? 
##rm -f webalizer.{current,hist} ##==normally YANKED; BETTER: restore state from backup ============== == CHANGE-LOG: ============== 2005mar04: installed the binary (static) linux version of webalizer, and used it with this script; 2005mar05: installed the src version, and compiled it (also the gd graphics pkg); made source mods: (1) date-format, (2) filenames, (3) index in chronological order, was reverse-chronological, (4) Link-Colors + blue-->black ink on graphs; output.c webalizer.c graphs.c preserve.c -- mod (2) removes the need for those renames below -- see remarks in output.c; ==other mods: computing Daily-Avg stats, handle partial month based on days completely absent; ==other mods: handle reading previously produced StatsIndex - as alternative to webalizer.hist file--??-- ==other mods: revise "SITES" to VISITORS; ==OTHER MODS: label StatsGraph with YYYY-MM..YYYY-MM; OR, with just YEAR if ending-month is December==??== also params to specify Scale of graph, so can make 2005, 2006, 2007, 2008, all to same scale==??== ==CONSIDER: Index has Graph-2005, links for 200502..200512, Graph-2006, links 200601..200612, Graph-2007, links 200701..200712... 2005mar: NOTES on OUT-OF-SEQUENCE log-records: for 2005feb, without -f got "referrers" as follows: 30158 www.nativeorchid.com 204 nativeorchid.com--??-- 5 nativeorchid.com:80 1 www.nativeorchid.com:80 ----- 30,368 == total "hits" reported. file ftp151jjaccess_log-200502 contains 31,309 logrecords => 941 records went uncounted--??-- it has 30,197 "www.nativeorchid" records; it has 1,112 "nativeorchid" records -- most of these out-of-sequence, and thus ignored by webalizer!! NUMBERS ARE FINE with -f, except the hourly info has too many in the 23rd hour. 2005oct02: SORTING: added the <webstatslogsort> step, to get around non-chronological "log" files. rerun on stats prior to 05aug, since those weren't sorted (only flaw is too many in the 23rd hour of day); ==DONE on 2008-12-27, dated them 2005-Oct-03... 
2005oct02: REVERSE-DNS-LOOKUPS: doc says: configure with --enable-dns, then recompile (or: recompile with -DUSE_DNS). ./configure --enable-dns --with-db=/usr/include/db3 --with-dblib=/usr/lib <--didn't work wrt db_185.h; so i edited Makefile manually, as per the online-webalizer-FAQ, then recompiled -- seems OK; ==BUT does nothing - tried the 2005sep stats both without and with reverse-dns lookups, and got identical results--??-- ==MTS had already attempted reverse-dns lookups, but NewWinnipeg Had Not; ==> TRY AGAIN on HostExcellence non-reverse-lookedup logs; ==problem may have been due to piping into stdin, and webalizer needing 2-passes for DNS-lookups?? Not clear when I learned that, but got DNS to work in v2.20 2006apr: NEEDED: keep more than 12 months of history - considering 24 ? --am making StatsIndex semi-manually for now, keeping all months, but using a single 12-month StatsGraph; 2006nov: our webhoster (MTS) changed filenames from ftp151jjaccess_log, to access_log; 2006nov: DAILY-STATS FOR SITES(VISITORS) are roughly the same as Visits, about twice what monthly-totals show; also each day's percentage of monthly total seems high, sums to about 200% ?? AHA, All is Well, he uses DISTINCT-VISITORS... 2008-12: ==CATCHING-UP== website/wa-StatsIndex has: 2005-Feb thru 2006-Sep; website/webalizer has: 2008-Jun..2008-Dec; <--2008-Mar..mid-Jun missing; ==COPY THESE /pix/WEBSTATS-noci has: parts of 2006-Oct; all of 2006-Nov..2008-Feb; (raw logs) <==HAVE WEBALIZED, kept StatsGraph-200701-thru-200712 these logs are also not altogether chronological, altho out-of-order cases mostly out by only a few seconds... access_log_20070131 had malformed record: livebot-65-55-208-208.search.li100 livebot-65-55-208-208.search.live.com - - [13/Jan/2007:20:18:34 -0500]...==FIXED /pix/WEBLOGS has: daily raw logs starting on 2008-12-13 (for all 3 sites); --can now quickly rerun entire YEARs: catted-accesslog-YYYY... 
==Have verified that results identical to previous 2008-12: tried to recompile (first time since upgrading to FC4); much grief; unresolved __ctype_b -- solution is to recompile everything with new gcc?? in dns_resolv.c, related to Berkeley-DB; yup, FC4 brought db4... -- note: /usr/bin/webalizer-ER-2.01-10 is the old modified version compiled on 2005-10-02; 2008-12: ==Installed Version 2.20 of Webalizer== (v2.20 2008-July offers MORE-THAN-12-MONTHS reporting) needs: GD Graphics Library Version 1.7.3 - mine is 2.0.33; needs Berkeley DB Library v4.1 - mine is 4.3; ran: ./configure --prefix=/usr --enable-dns --sysconfdir=/etc; make; sudo make install ==NO PROBLEMS==!!== refitting mods: /pix/pkg/webalizer-2.20-03/00-README-ER /etc/sbin/mk-webalizer-mods 2009-01-04: made /etc/webalizer.conf; PageType defns mean (10-15%) higher Pages+Visits-counts; HideURL defns greatly improve the Top-URLs-table!! add -r on cmdline (or HideReferrer) for when NOT using -R0 (www not needed) 2009-01-04: started using -N10 for Reverse-DNS-lookup; solves the anomalous 2005-09; subtle improvement elsewhere, eg 2005-10 unresolved 16%-->14%; DECIDING on DNS-lookups versus GeoDB-lookups: --From webalizer-FAQ: While geolocation support will give you accurate country information, other aspects of the analysis may suffer, such as search string analysis (which depends on resolved hostnames to identify the various search engines). --From README: this [geolocation lookups] produces more accurate Country information than DNS lookups, since the DNS address space has TLDs that do not map to a specific country (such as '.net' and '.com'). It is possible to use both DNS lookups and geolocation lookups at the same time, which will cause any addresses that could not be resolved using DNS lookups to then be looked up in the database, greatly reducing the number of 'Unknown/Unresolved' entries in the generated reports. GeoDB is updated regularly. 
The most current version of the database can be obtained from our ftp site. ==USING BOTH (starting in 2009-01); ==Works very well, now getting less than 1% Unresolved; 2009-01-04: Fixed the HostExcellence pages 200806..200812, removing links in Referrers Table with: chgsed -p '/Top.*Total Referrers/,/<\/CENTER>/s|<A HREF=[^>]*>\([^<]*\)</A>|\1|g' wa-Monthly_2008*htm ==SEE: do-webalizer-copying+renaming 2009-01-04: made /noci/website/wa-stats/index.htm to summarize all months 2005-Feb thru 2008-Dec, and with 47-month graph, but where monthly reports 2008-Jun..Dec are as webalized by HostExcellence; ==HOW did I do that conversion of History (v2.01 to v2.2) in order to get a 47-month graph?? ==Sadly, I made no notes on my reverse-engineering:-( ==SEE /pix/WEBSTATS-noci/webalizer*hist* files for clues: webalizer-hist-200806--200812 <--8 lines, 2009-01, 2008-06..2008-12 <--v2.01 format, presumably as copied from HostExcellence-dir webalizer-hist-200806--200812-REV <--8 lines, 2008-12..2008-06, 2009-01 <--line-by-line REVersal of preceding, presumably an aid in making next one webalizer.hist <--120 lines, 2008-12..1999-01 <--v2.20 format with -K=120, presumably made semi-manually by me ==my guess as to how I did things: see the "presumably" remarks above; this re-documenting done 2009-09-21 when embarking on another catching-up... ==still unclear on how I got webalizer to construct the 47-month graph; purely from hist-file, without new log data?? Run on an old log, its graph uses hist?? ==flaw with my guesses: mtime on webalizer.hist identical to mtime on wa-*200802*, wa-StatsIndex.htm & seconds after wa*200801 ==> all produced on one run!! 
==note that /pix/WEBSTATS-noci/wa-StatsIndex.htm is complete thru 2008-Dec, just like the page up on noci website (except for the added prose); ==BEWARE: monthly-reports 2008-06..2008-12 CANNOT be recreated from logfiles -- since I don't have them; 2009-09-21: webalizing 2009-Jan thru present; --backup /pix/WEBSTATS-noci/{wa-,weba}* to $BDIR/webstats...; --backup /noci/website/wa-stats/* to $BDIR/webstats...from-website; --2009-09-21 23:57:52: ran on catenated /pix/WEBLOGS/access_log-nativeorchid.org.2009* files, updating website/wa-stats; cat /pix/WEBLOGS/access_log-nativeorchid.org.20090[1-9]* >/tmp/webalize-tmp; set /tmp/webalize-tmp ##making catenated logfile for 2009-Jan..Sep --puzzle: Top-URLs-Table has no images (other than favicon.ico); surely images used on frontpage have same hits as it?? Am I using some "hiding" rules?? ==YUP; see HideURL defns 2009-01-04 -- compare to HE-produced reports where images are amongst Top-URLs; ==consider: HIDING robots.txt, favicon.ico, sitemap.xml, from Top-URLs-Table; ==add HideURL rules for: .ico .js .xml robots.txt(??) ==consider: webalizer offering Top-Pages as well as Top-URLs; would make it simpler to get what one typically wants?? 2009-09-21: many Search-Strings look mangled, see eg: nativeorchid.org 200901; same with v2.01 or v2.20-ER; ==Investigate==!!== wrote weblog-parse-search-strings to investigate; AND weblog-search-strings-report to produce proper Search-Strings-Report; crude fix: replace the Search-Strings section of webalizer-report with output produced by my script==??== 2009-09-21: REORG + RENAMES to prepare for Stats-on-All-3-Sites (done on 2009-11-23); NOTE: logfiles are now kept separate from stats-files; revised+renamed weba-dns-cache-->webalizer.dns; simplifies naming conventions ==>eg: could use webalizer* in place of weba* when making backups; IS a separate stats dir needed, versus working directly in website/wa-stats?? 
==ANSWER: (1) "added prose"; (2) avoid webalizer.{current,dns,hist} on website; old logfiles /noci/website-stats/catted-accesslog-YYYYMM-thru-YYYYMM --are now /pix/WEBLOGS/access_log-nativeorchid.org.YYYY <--CAN BE USED FOR RERUNNING!! old logfiles /noci/website-stats/access_log* (partials etc) --are now /pix/WEBLOGS/access_log-nativeorchid.org__* <--hopefully never needed stats-files /noci/website-stats/{wa-,weba,zz-}* --are now /pix/WEBSTATS-noci{wa-,weba,zz-}* ==ALL logfiles under /pix/WEBLOGS <--renamed from /pix/er-WEBLOGS ==NOCI stats-files made & kept under /pix/WEBSTATS-noci <--renamed from /noci/website-stats (also /pix/WEBSTATS-er /pix/WEBSTATS-debwendon) 2009-09-21: Modifying to automate, using daily "incremental" webalizing?? running webalizer-ER with my options on HostExcellence site has some appeal?? however I want to have "raw" logfiles, so can re-webalize with different options -- and that requires continued daily downloading of access_log; also means I'm better off continuing to webalize on my computer!! (consider: daily-webalize and a daily-WEBPUT-LL on wa-stats dir) ==NOTE: hostexcellence-produced reports made by v2.01 still have spammer-exposure in Referrer-Table ==>turn off webalizer-option?? ==OR just ignore?? in 2009-09, http://theblogmoney.com looks to be such a spammer; in 2009-09, http://www.jkquilting.com/z_code/Upload_Mini_Cart/shopping.htm links to my shoppingcart-page, tho uses Paypal-Cart; see email ek@jkquilting.com 2009-10-14: /etc/webalizer.conf: added lots of SearchEngine lines; ==reran nativeorchid stats for 200901..200909; manually replaced 200909 line in index.htm; --compared rerun reports to the previous backup copies; --am fuzzy on how .hist is used/updated, on needing to remove when rerunning; never mind .current, tho in some respects it's actually easier?? ==note: the -b (ignore incremental-state) and -i (ignore hist) options -- to handle rerunning; but when graph needs some of hist?? 
IS OK without -i==!!== --was dubious about WEBLASTM-var being used in a nested sub-shell, but to my surprise it works; 2009-10-14: MODS TO webalizer-ER -- see details in /pix/pkg/webalizer-2.20-03-ER/00-README-ER; --in sample.conf+DNS.README: default DNS-Cache is now webalizer.dns; added many SearchEngine strings, as in my /etc/webalizer.conf; --made new zipfile on /er/website/programs; (didn't install it myself as there were no code, only doc, changes) ==need to handle new-style google-images searches -- see email to brad@mrunix.net -- see also: weblog-parse-search-strings; ==NOT DONE; ==FUTURE UPDATES to webalizer.conf: look for logrecs with referrer matching KW patterns, but not matching SE pattern <--see weblog-parse-search-strings; 2009-11: revised COPYing-to-website code to replace last-monthly line AND the following yearly-totals line -- with a single chgsed-cmd (so can use -p to debug); (before incremental-webalizing, was only adding months-after-WEBLASTM): /noci/website/wa-stats/index.htm: manually added the tiny-partial 200910 monthly-line, and the 2009-yearly-totals line -- to get into the new "normal" state; now ready to run incremental-mode on a daily basis, I think; 2009-11: Did October both ways to COMPARE: (1) incremental one day at a time; (2) entire month (catenated) at once; First, made current backup $BDIR/webstats-nativeorchid.org-200502-thru-20091001; then did: cd /pix/WEBSTATS-noci for F in /pix/WEBLOGS/access_log-nativeorchid.org.200910??;do webalize $F; done ##run each day separately, in incremental-mode BD=$BDIR/webstats-nativeorchid.org-200502-thru-200910-INCR; remove $BD; mkdir $BD; cp {wa-,weba}* $BD ##keep complete copy incremental-mode copy $BDIR/webstats-nativeorchid.org-200502-thru-20091001/webalizer.{current,hist} . ##restore state from end-of-Sep backup webalize /pix/WEBLOGS/access_log-nativeorchid.org.200910?? 
##rerun all of Oct at once AD=$BDIR/webstats-nativeorchid.org-200502-thru-200910-AAOM; remove $AD; mkdir $AD; cp {wa-,weba}* $AD ##keep complete copy all-at-once dif $BD/wa-StatsIndex.htm $AD/wa-StatsIndex.htm ##Compare ==DIFFS: -2009-Oct 3905 3628 1125 505 5669 4087099 15658 34903 112475 121062 +2009-Oct 4031 3746 1175 517 5669 4211607 16033 36440 116140 124974 <--NOTE: all stats except Monthly Total Visitors are 3-5% low with incremental --AHA, turns out the daily-stats disagree on only one day: 20091024; INCR lost nearly 4,000 hits for that day--??-- --and AAOM has a huge spike for Hour-02 => likely involves out-of-order logrecords; YUP; --actually: logfiles for 20091023 and 20091024 are identical; Suggests a flaw in my handling of day-minus-1(??); Got errmsgs for 20091024--!!-- --both start at 23/Oct/2009:02:06, end at 24/Oct/2009:02:07; ie: are perfectly sensible for 20091023, not for 20091024; ==that bum log for 20091024 remains a MYSTERY; my retrying in WEBGETLOGS has worked for other retryings, including one prior to that one; ==webalizer has a flaw wrt out-of-order records: -f not honoured for incremental-invocation ==!!==SEE /er/website/programs/webalizer-ER.htm; WEBGETLOGS: added a new check to ensure that contents of access_log for DATE are in fact for that date, and that TIMEs are reasonable; 2009-11: generalized to work for any of the sites I look after; setting URL=nativeorchid.org; PROJ=noci; etc; then revised noci-->$PROJ nativeorchid.org-->$URL; improved calculation of MG (months-in-graph) by using date from last arg (logfile-name) on cmdline; revised -n option, removing "www."; wanted for ereimer.net, and seems harmless for nativeorchid.org & debwendon.org; added monthend backup of webalizer-state, zz-dated copies of hist+current in stats-dir; may no longer need a $BDIR yearend backup copy?? 
==note: webalizer supports gzipped logfiles => could save space by gzipping old (prior to current month) logfiles==!!== did month-end (200910) state-saving for noci; (as if newly revised script had been used to do 200910*) brought all 3 up-to-date by webalizing for noci, debwendon, er; now ready to continue webalizing by a cron-job... 2009-11: revisions to run as cron-job: (1) avoid prompting; (2) msgs to a /tmp file (YANKED); NOTE: PATH= is OK, since the export PATH= in WEBGETLOGS suffices; 2009-12-10: YANKED (2) as tis better to let webalize-auto redirect (both ours and its) msgs to a single $MSG tmpfile; 2009-12: ==AWFFULL, is a webalizer-fork I just learned of, tho fairly old; many of its improvements have made it into webalizer, tho not the 404-reporting; 2009-12-08: WEBGETLOGS, webalize-auto: WEBGETLOGS run as cron-job, invokes webalize-auto, which invokes this script webalize; == ==NEEDED: replace Search-Strings part (of each Monthly-report) with output of weblog-search-strings-report==??== ==BETTER done in webalize-auto, because this script can be used on old catted logs... one tricky aspect: avoid multiple logs for a date having "dubious" -altfrom logfiles==!!== note: webalize-auto already has knowledge of the -altfrom conventions; webalize already has code to revise webalizer-files with chgsed; METHOD: take the args, isolate YYYYMM, then uniqify those; (weblog-search-strings-report needs all logfiles for the month) 2009-12-14: added -n to the first chgsed, to prevent uploading wa-stats/index.htm~ 2010-10-27: remove-->rm -f -R -- to avoid publishing remove; 2011-01-06: BEWARE: bash-v4 has changed the meaning of =~ within double-square-brackets: specifically what quoting in RHS does; best to avoid it, using == and the extglob extensions since those work everywhere; added: shopt -s extglob revision, not needed, usage is in remarks: [[ $LASTARGDATE =~ '^20??((09|04|06|11)30|..31)$' ]] --> [[ $LASTARGDATE == 20??@(@(09|04|06|11)30|??31) ]]