#!/bin/bash ## 2009-09-23: a webalizer-alternative for Search-String reporting on weblog records -- Eugene Reimer 2009-09; ## wrote this because the webalizer Top-Search-Strings report has bugs, fails to handle some important search-engines most notably Google-Images; ## PREREQS: url-decode arr -- from http://ereimer.net/programs/general-purpose-scripts.htm ## USAGE: ## weblog-search-strings-report LOGFILE... >tmp.htm --produce Top-Search-Strings report tmp.htm ## ## Copyright © 2009 Eugene Reimer; can be used, modified, copied, and distributed or sold under the terms of either the LGPL or the GPL (your choice); ## see http://www.gnu.org/licenses for the details of these terms. weblogParseSearchStrings () { ##filter to parse search-strings in logrecords SEPAT="[a-z0-9.]*(info.com|/search\?q=cache|/search/cache\ |(alexa|alltheweb|alot|altavista|aol|aolsearch|ask|askpeter|badiu|baidu|bing|blingo|business|chiff|clusty|cuil|dearcomputer|devilfinder|dmoz|dogpile|eureka|euroseek\ |exalead|excite|ezilon|freecause|gigablast|goodsearch|google|hakia|hotbot|icq|infoseek|infospace|joeant|live|lycos|mamma|metacrawler|msn|myembarq|mytelus|mywebsearch\ |netscape|netzero|northernlight|pch|picsearch|rambler|scour|search.bt|search.comcast|search.conduit|search.earthlink|search.juno|search.magentic|search.mywestnet\ |searching.uk|similar-images.googlelabs|snap|startlap|suche.t-online|verizon|webcrawler|webfetch|yahoo|yandex|yodao\ )\.)" KW1="cisearch|text|q|query|Keywords|keywords|qrh|searchfor|string|words" ##KW1: these override the ones in KW2 KW2="p|search|wd" ##KW2: these can be overridden by the ones in KW1, eg text>>p q>>search q>>wd KW1sed=${KW1//|/\\|}; KW2sed=${KW2//|/\\|} ##using sed -r would be a better way?? 
LC_CTYPE=C ##need 8-bit charset (because logrecords can be in mixture of charsets) egrep "\"http://$SEPAT" | ##match search-engine-referral lines; using Quote+http obviates need to count fields sed 's|.*\("http:[^ "]*\).*|\1|' | ##isolate referrer field url-decode | ##1st URL-decode to undo url-encoding found in log-records url-decode | ##2nd URL-decode to handle the doubly-url-encoded cases from google-images egrep "[?&]($KW2|$KW1|as_[a-z]*q)=|/Images/|/Web/" | ##exclude NON-Searches, those lacking an understood query-param (after url-decoding) sed ' ##isolate the search-string part of referrer-field, and add img: or web: up front /[?&]as_[a-z]*q=/{s!^[^?]*!web:!; s![?&][a-z_A-Z]*[^q]=[^?&]*!!g; s!?!\&!g; s!= *!=!g;}; ##multiple as_q etc from google-advanced-search s!?q=tbn:[^&]*!!g; ##remove unwanted sort of q= used by google /\/imgres\|\/images\|picsearch\|dearcomputer/s!.*[?&]\('$KW1sed'\)=\([^&]*\).*!img:\2!; ##traditional img-searches for keyword-set-1 (the overriding ones) /\/imgres\|\/images\|picsearch\|dearcomputer/s!.*[?&]\('$KW2sed'\)=\([^&]*\).*!img:\2!; ##traditional img-searches for keyword-set-2 (the overridden ones) /^img:\|^web:/!s!.*[?&]\('$KW1sed'\)=\([^&]*\).*!web:\2!; ##traditional web-searches for keyword-set-1 (the overriding ones) /^img:\|^web:/!s!.*[?&]\('$KW2sed'\)=\([^&]*\).*!web:\2!; ##traditional web-searches for keyword-set-2 (the overridden ones) /\/Images\//s!.*/Images/\([^/]*\).*!img:\1!; ##webcrawler-style img-searches /\/Web\//s!.*/Web/\([^/]*\).*!web:\1!; ##webcrawler-style web-searches s!cache:[^ ]*!!; ##fixup for google-from-cache; note: creates some web:emptystring results s![,\\]\|[-+/.]$\|[-+/.] \| [/.]! !g; s! *! !g; s!: !:!; s! $!!g; ##standardize punctuation and spaces /^web:$/d; ##discard web:emptystring results ' } linecnt () { LC_CTYPE=C wc "$@" |arr 1; } ##linecnt; note: wc needs LC_CTYPE=C to work on mixed-charset data TOPN=50 ##N for the Top-N... 
##=== main: read LOGFILE args (or stdin), write webalizer-style Top-N Search-Strings HTML report on stdout.
## Fixes vs original: mktemp -d replaces the predictable /tmp/weblogss$$ names (symlink/clobber risk),
## a trap guarantees temp cleanup even on interrupt, and expansions are quoted.
tmpd=$(mktemp -d /tmp/weblogss.XXXXXX) || exit 1	##private temp directory with unpredictable name
trap 'rm -rf "$tmpd"' EXIT				##cleanup temporaries on any exit path

cat "$@" |weblogParseSearchStrings >"$tmpd/H"		##get total search-strings (cat handles both file-args and stdin)
cntH=$(linecnt "$tmpd/H")				##...and count them

##lowercase (\L is GNU sed), then merge duplicates with counts, most-frequent first;
##uniq -c needs LC_CTYPE=C to combine identical lines in mixed-charset data (see CHANGE-LOG below)
sed 's!.*!\L&!' "$tmpd/H" |sort |LC_CTYPE=C uniq -c |sort -k1nr >"$tmpd/U"
cntU=$(linecnt "$tmpd/U")				##count unique search-strings

##produce webalizer-compatible preamble HTML (string kept byte-identical to original)
echo "

Top $TOPN of $cntU Distinct, $cntH Total Search Strings
#HitsSearch String "

##number the lines, keep the top-N, and convert each "count search-string" pair to HTML
cat -n "$tmpd/U" |head -n"$TOPN" |sed 's|\t| |g;
s| *\([^ ]*\) *\([^ ]*\) \(.*\)|
\1\2\3|'

##end HTML table
echo "
"
exit						##temporaries removed by the EXIT trap
===============
== CHANGE-LOG:
===============
developed as weblog-parse-search-strings which, with counted-uniquify-ordered (using sort+uniq), did essentially what this script does; an example:
	cat LOGFILES |weblog-parse-search-strings |tolower |counted-uniquify-ordered	##--produce Top-Search-Strings report
repackaged to be more webalizer-like; [to myself: see notes in weblog-parse-search-strings for more info on logrecords etc]
now also converts output to webalizer-compatible HTML, and adds a count (or two);
difficulties with handling mixed-charset logfiles:
- incorporating the lowercasing into the "parse" part won't work: (1) parsing code needs LC_CTYPE=C to avoid missing ampersands, (2) lowercasing works better in utf8;
- uniq-c fails to combine identical koi8 lines when LC-CTYPE=.UTF-8 is in effect; but it does combine them with LC_CTYPE=C -- seems weird??
- were a string in a foreign encoding (eg koi8) to make the Top-N list, would be nice to see it converted to utf8, but that's beyond the scope of this script;
2009-10-13: minor fixes to google-advanced-search (as_) parsing; now using "stopper" of [?&], though [&] might suffice;
	web:emptystring cases are now ignored, though somewhat reluctantly; examples were from: ?q=cache:http://www.nativeorchid.com/