#!/bin/bash ## 2009-09-23: a webalizer-alternative for Search-String reporting on weblog records -- Eugene Reimer 2009-09; ## wrote this because the webalizer Top-Search-Strings report has bugs, fails to handle some important search-engines most notably Google-Images; ## PREREQS: url-decode arr -- from http://ereimer.net/programs/general-purpose-scripts.htm ## USAGE: ## weblog-search-strings-report LOGFILE... >tmp.htm --produce Top-Search-Strings report tmp.htm ## ## Copyright © 2009 Eugene Reimer; can be used, modified, copied, and distributed or sold under the terms of either the LGPL or the GPL (your choice); ## see http://www.gnu.org/licenses for the details of these terms. weblogParseSearchStrings () { ##filter to parse search-strings in logrecords SEPAT="[a-z0-9.]*(info.com|/search\?q=cache|/search/cache\ |(alexa|alltheweb|alot|altavista|aol|aolsearch|ask|askpeter|badiu|baidu|bing|blingo|business|chiff|clusty|cuil|dearcomputer|devilfinder|dmoz|dogpile|eureka|euroseek\ |exalead|excite|ezilon|freecause|gigablast|goodsearch|google|hakia|hotbot|icq|infoseek|infospace|joeant|live|lycos|mamma|metacrawler|msn|myembarq|mytelus|mywebsearch\ |netscape|netzero|northernlight|pch|picsearch|rambler|scour|search.bt|search.comcast|search.conduit|search.earthlink|search.juno|search.magentic|search.mywestnet\ |searching.uk|similar-images.googlelabs|snap|startlap|suche.t-online|verizon|webcrawler|webfetch|yahoo|yandex|yodao\ )\.)" KW1="cisearch|text|q|query|Keywords|keywords|qrh|searchfor|string|words" ##KW1: these override the ones in KW2 KW2="p|search|wd" ##KW2: these can be overridden by the ones in KW1, eg text>>p q>>search q>>wd KW1sed=${KW1//|/\\|}; KW2sed=${KW2//|/\\|} ##using sed -r would be a better way?? 
LC_CTYPE=C ##need 8-bit charset (because logrecords can be in mixture of charsets) egrep "\"http://$SEPAT" | ##match search-engine-referral lines; using Quote+http obviates need to count fields sed 's|.*\("http:[^ "]*\).*|\1|' | ##isolate referrer field url-decode | ##1st URL-decode to undo url-encoding found in log-records url-decode | ##2nd URL-decode to handle the doubly-url-encoded cases from google-images egrep "[?&]($KW2|$KW1|as_[a-z]*q)=|/Images/|/Web/" | ##exclude NON-Searches, those lacking an understood query-param (after url-decoding) sed ' ##isolate the search-string part of referrer-field, and add img: or web: up front /[?&]as_[a-z]*q=/{s!^[^?]*!web:!; s![?&][a-z_A-Z]*[^q]=[^?&]*!!g; s!?!\&!g; s!= *!=!g;}; ##multiple as_q etc from google-advanced-search s!?q=tbn:[^&]*!!g; ##remove unwanted sort of q= used by google /\/imgres\|\/images\|picsearch\|dearcomputer/s!.*[?&]\('$KW1sed'\)=\([^&]*\).*!img:\2!; ##traditional img-searches for keyword-set-1 (the overriding ones) /\/imgres\|\/images\|picsearch\|dearcomputer/s!.*[?&]\('$KW2sed'\)=\([^&]*\).*!img:\2!; ##traditional img-searches for keyword-set-2 (the overridden ones) /^img:\|^web:/!s!.*[?&]\('$KW1sed'\)=\([^&]*\).*!web:\2!; ##traditional web-searches for keyword-set-1 (the overriding ones) /^img:\|^web:/!s!.*[?&]\('$KW2sed'\)=\([^&]*\).*!web:\2!; ##traditional web-searches for keyword-set-2 (the overridden ones) /\/Images\//s!.*/Images/\([^/]*\).*!img:\1!; ##webcrawler-style img-searches /\/Web\//s!.*/Web/\([^/]*\).*!web:\1!; ##webcrawler-style web-searches s!cache:[^ ]*!!; ##fixup for google-from-cache; note: creates some web:emptystring results s![,\\]\|[-+/.]$\|[-+/.] \| [/.]! !g; s! *! !g; s!: !:!; s! $!!g; ##standardize punctuation and spaces /^web:$/d; ##discard web:emptystring results ' } linecnt () { LC_CTYPE=C wc "$@" |arr 1; } ##linecnt; note: wc needs LC_CTYPE=C to work on mixed-charset data TOPN=50 ##N for the Top-N... 
##=== main: read LOGFILE args (or stdin), write webalizer-style Top-N Search-Strings HTML report on stdout.
## Fixes vs original: mktemp -d replaces the predictable /tmp/weblogss$$ names (symlink/clobber risk),
## a trap guarantees temp cleanup even on interrupt, and expansions are quoted.
tmpd=$(mktemp -d /tmp/weblogss.XXXXXX) || exit 1	##private temp directory with unpredictable name
trap 'rm -rf "$tmpd"' EXIT				##cleanup temporaries on any exit path

cat "$@" |weblogParseSearchStrings >"$tmpd/H"		##get total search-strings (cat handles both file-args and stdin)
cntH=$(linecnt "$tmpd/H")				##...and count them

##lowercase (\L is GNU sed), then merge duplicates with counts, most-frequent first;
##uniq -c needs LC_CTYPE=C to combine identical lines in mixed-charset data (see CHANGE-LOG below)
sed 's!.*!\L&!' "$tmpd/H" |sort |LC_CTYPE=C uniq -c |sort -k1nr >"$tmpd/U"
cntU=$(linecnt "$tmpd/U")				##count unique search-strings

##produce webalizer-compatible preamble HTML (string kept byte-identical to original)
echo "

Top $TOPN of $cntU Distinct, $cntH Total Search Strings
#HitsSearch String "

##number the lines, keep the top-N, and convert each "count search-string" pair to HTML
cat -n "$tmpd/U" |head -n"$TOPN" |sed 's|\t| |g;
s| *\([^ ]*\) *\([^ ]*\) \(.*\)|
\1\2\3|'

##end HTML table
echo "
"
exit						##temporaries removed by the EXIT trap
===============
== CHANGE-LOG:
===============
developed as weblog-parse-search-strings which, with counted-uniquify-ordered (using sort+uniq), did essentially what this script does; an example:
	cat LOGFILES |weblog-parse-search-strings |tolower |counted-uniquify-ordered	##--produce Top-Search-Strings report
repackaged to be more webalizer-like; [to myself: see notes in weblog-parse-search-strings for more info on logrecords etc]
now also converts output to webalizer-compatible HTML, and adds a count (or two);
difficulties with handling mixed-charset logfiles:
- incorporating the lowercasing into the "parse" part won't work: (1) parsing code needs LC_CTYPE=C to avoid missing ampersands, (2) lowercasing works better in utf8;
- uniq-c fails to combine identical koi8 lines when LC-CTYPE=.UTF-8 is in effect; but it does combine them with LC_CTYPE=C -- seems weird??
- were a string in a foreign encoding (eg koi8) to make the Top-N list, would be nice to see it converted to utf8, but that's beyond the scope of this script;
2009-10-13: minor fixes to google-advanced-search (as_) parsing; now using "stopper" of [?&], though [&] might suffice;
	web:emptystring cases are now ignored, though somewhat reluctantly; examples were from: ?q=cache:http://www.nativeorchid.com/