#!/bin/bash ## harvest email-ids for all organizations listed in orchidmall.com / orchidwire.com (or other such harvesting); by Eugene Reimer 2007-10-29; ## the starting URL(s) will typically be for links-page(s), containing mostly fully-qualified URLs; ## each of those typically takes one to the frontpage of some website; ## on each of those sites: ## will walk that site to a specified max-depth (typically one, assuming site has a contact-us page linked-to from the frontpage); ## will ignore any links that go off-site (may want to make this an option?); ## note: can use wget to grab a page AND sub-pages it links-to (recursive downloading); ## -r recursive downloading (of subpages) ## -l limit max-depth of recursive downloading ## -nd no directories to be created, all downloaded files will reside in curdir; .NN suffix for uniqueness ## -E force HTML files w/o htm/html/HTM/HTML ending to get .html ending (behaviour together with -nd isn't specified??) ## -k fix links: link to downloaded subfile becomes relative; link to non-downloaded file becomes fully-qualified (http:...) ## -T set a timeout -- the default of 900 seconds (15 minutes) is way too long for my taste ## FLAWED: it gets images,stylesheets,etc even when --page-requisites is omitted ==> will do the recursive part myself (using only links to HTML pages) ## USAGE: harvest-emailids URL... ## EXAMPLE: harvest-emailids http://www.orchidmall.com/society.htm ## EXAMPLE: harvest-emailids http://www.orchidmall.com/{society,plants,supplies,fotosite,special}.htm ## ## Used on 2007oct31 to harvest from orchidmall; did the emailing on 2007nov02; ## Used on 2007nov05 to harvest from orchidwire; did the emailing on 2007nov07; ## ## HANDLING 302 redirections; eg: http://www.orchidweb.org redirects to http://www.orchidweb.org/aos/ --non-slashed filenames must be relative to that!! 
## clues in log:
##   HTTP/1.1 302 Found
##   Location: /aos/
##   Location: /aos/ [following]
## -- then see another connecting msg with new nm, which usually gets rc:200...
##
## HANDLING various obfuscations:
##   mailbox_@_domain   <--used many times in orchidmall + sites found from it (now handled by removing those underscores);
##   mailbox&#64;domain  <--used in orchidmall; possibly in other sites; (now handled, but need to rerun on orchidmall to see additions)
##   ...AT...           <--consider producing diagnostics to aid in finding these sorts==??==

## Default starting URLs when none given on the cmdline (brace expansion happens before "set" runs, giving one arg per page):
##[ $# -eq 0 ] && set http://www.orchidmall.com/{society,plants,supplies,fotosite,special}.htm   ##--07oct31 starting-pages: all links-pages from orchidmall.com
[ $# -eq 0 ] && set http://www.orchidwire.com/Earth/1/{Argentina,Australia,Austria,Belgium,Belize,Bermuda,Bolivia,Brazil,Canada,Chile,China,Colombia,Costa_Rica,Czech_Republic,Denmark,Dominican_Republic,Ecuador,Estonia,Finland,France,Germany,Greece,Hungary,India,Indonesia,Ireland,Italy,Jamaica,Japan,Korea,Lebanon,Malaysia,Mexico,Netherlands,New_Zealand,Norway,Panama,Papua_New_Guinea,Paraguay,Peru,Philippines,Poland,Portugal,Russian_Federation,Singapore,Slovenia,South_Africa,Spain,Sweden,Switzerland,Taiwan,Thailand,Trinidad_And_Tobago,Ukraine,United_Kingdom,United_States,Venezuela,Vietnam}.html   ##--07nov05 starting-pages: all Country-pages from orchidwire.com

## Work files (all in /tmp):
NM=/tmp/harvestROOT.htm                        ##catenated starting page(s)
NM2=/tmp/harvestROOT.htm-2                     ##extracted links, one URL per line
TMPDIR=/tmp/harvest                            ##per-site scratch dir for recursively-fetched pages
TMP=/tmp/harvestTMP                            ##per-wget log, scanned for redirects
LOG=/tmp/HARVEST-LOG
OUT=/tmp/HARVEST-OUTPUT                        ##raw harvested email-ids, one per line with "##from URL file" comment
OUT2=/tmp/HARVEST-ELIST-UNIQUIFIED+CLEANED     ##final cleaned/uniquified list
MAXDEPTH=1
T=${1#http://}; PRUNE=http://${T%%/*}          ##assumes multiple urls are always from one site; could make it an array==??==

elog() { echo "$@"; echo "$@" >>$LOG; }                     ##echo to stdout, and to log
eout() { echo "$@"; echo "$@" >>$LOG; echo "$@" >>$OUT; }   ##echo to stdout, and to log, and to output-file

wget-recursive() {   ##----function, my replacement for wget -r; receives one fully-qualified URL----
	elog "====wget-recursive $1"                                ##DEBUG
	wget -T60 -o$TMP -S -nd "$1"; cat $TMP >>$LOG               ##need the log from this wget in order to detect 301/302/303/307-redirects
	T=${1#http://}                                              ##T: url without the leading http://
	L=$(grep Location: $TMP |head -n1 |sed 's|.*Location: *||') ##==handle 301/302/303/307-redirects==
	if [[ $L != "" ]];then                                      ##==handle 301/302/303/307-redirects==
		elog "====redirect:$L"                              ##msg showing redirect
		if   [[ $L == /* ]];then     T=${T%%/*}$L           ##handle slashed-redirect
		elif [[ $L == *:* ]];then    T=${L#http://}         ##handle fully-qualified-redirect
		elif [[ $T == */*.* ]];then  T=${T%/*}/$L           ##handle relative-redirect case-1 (may not be needed??) (fixed slash after 07oct31 use)
		else                         T=${T%/}/$L            ##handle relative-redirect case-2 (may not be needed??) (fixed slash after 07oct31 use)
		fi
	fi
	if [[ $T == */*.* ]];then dir=${T%/*}; else dir=${T%/}; fi  ##dir: url without filename if any -- kludgey: assumes dirname lacks dot, filename has dot
	site=${T%%/*}                                               ##site: url without directory parts etc
	export dir=http://$dir site=http://$site                    ##export so vars can be used in a subshell; also put back the leading http://
	export -f elog eout                                         ##export so functions can be used in a subshell
	## NOTE(review): the next two commands were garbled in the archived copy (angle-bracket
	## text eaten by an HTML conversion); reconstructed from the surrounding comments --
	## confirm against a known-good copy before relying on them.
	cat * |tr '\n' ' ' |sed "s|<[Aa] |\n<A |g" |                                                    ##ensure each link on line by itself
	grep -i '^<A .*[Hh][Rr][Ee][Ff]' |sed "s|.*[Hh][Rr][Ee][Ff] *= *[\"']*||;s|[\"'>].*||" |        ##keep only those lines, and only the URL
	while read -r;do
		## resolve REPLY into a fully-qualified url nm: site-absolute, relative, or already-qualified
		if [[ $REPLY == /* ]];then nm="$site$REPLY"; elif [[ $REPLY != *:* ]];then nm="$dir/$REPLY"; else nm="$REPLY"; fi
		if [[ $nm != $site* ]];then           elog "----skipping:$nm"; continue;fi  ##ignore external links to other sites
		if [[ $nm == *[Pp][Dd][Ff] ]];then    elog "----skipping:$nm"; continue;fi  ##ignore *PDF filenames (could also skip *JPG|*JPEG|*GIF|*MP3|*WAV|*DOC|...)
		if [[ $nm == *[Jj][Pp][Gg] ]];then    elog "----skipping:$nm"; continue;fi  ##ignore *JPG filenames
		if [[ $nm == *[Gg][Ii][Ff] ]];then    elog "----skipping:$nm"; continue;fi  ##ignore *GIF filenames
		elog "====site:$site dir:$dir REPLY:$REPLY nm:$nm"                          ##DEBUG
		sleep 0.5; wget -T60 -a$LOG -S -nd "$nm"    ##sleep between wgets to avoid hitting any site too hard; 1.5 makes it SLOW==!!==
	done
	elog -e "====d:\n$(ls -l|grep -v '^total')"   ##DEBUG (original used "d -l" -- a personal alias, unavailable to a script; replaced by ls -l)
}

##--1-- grab the starting page(s)
>$LOG                                  ##may not want $LOG emptied??
>$OUT                                  ##may not want $OUT emptied??
eout "##--StartingPages:$@"            ##echo informative line
wget -T60 -a$LOG -S -O$NM "$@"         ##download all URLs specified on cmdline, catenated into file $NM
## NOTE(review): same reconstructed link-extraction as in wget-recursive above.
cat $NM |tr '\n' ' ' |sed "s|<[Aa] |\n<A |g" |                                                  ##ensure each link on line by itself
grep -i '^<A .*[Hh][Rr][Ee][Ff]' |sed "s|.*[Hh][Rr][Ee][Ff] *= *[\"']*||;s|[\"'>].*||" >$NM2    ##keep only those lines, and only the URL; result to $NM2
##less $NM2   ##DEBUG

##--2-- follow each link in starting-page(s)
while read -r -u4;do   ##read $NM2 line-by-line (using fd#4 so can still use fd#0 as stdin)
	if [[ $REPLY != http* ]];then    elog "----Skipping:$REPLY"; continue; fi   ##skip non-fully-qualified URL
	if [[ $REPLY == $PRUNE* ]];then  elog "----Skipping:$REPLY"; continue; fi   ##skip URL that while fully-qualified is nevertheless from the starting-site==??==
	elog "----Harvesting:$REPLY"
	rm -fR $TMPDIR; mkdir $TMPDIR; cd $TMPDIR   ##make tmp-dir, and cd to it; 2011-01:remove-->rm -fR
	##wget -T60 -a$LOG -kS -r -l$MAXDEPTH -nd -w3 --random-wait "$REPLY"   ##grab specified page and subpages to depth=1, into tmp-dir -- first try
	wget-recursive "$REPLY"                     ##grab specified page and subpages to depth=1, into tmp-dir -- my own version
	for f in *;do                               ##for each file retrieved
		if file "$f"|grep -q 'text';then    ##for each HTML file (just test for text, since HTML sometimes misdiagnosed)
			elog "----scanning:$REPLY $f"   ##DEBUG - chged eout->elog, since now adding source-comment to each email-id
			cat "$f" |
			sed 's|%20| |g' |       ##fix %20 (sometimes see mailto:%20EMAIL) -- may need to handle others??
			sed 's|&#64;|@|g' |     ##fix &#64; (sometimes see mailto:ID&#64;DOMAIN) -- (added after 07oct31 use) (NOTE(review): entity was rendered to "@" in the archived copy; reconstructed from the comment)
			sed 's|\([-_.A-Za-z0-9]*@[-_.A-Za-z0-9]*\)|\n\L\1\n|g' |    ##ensure each email-address on line by itself, and lowercased (prob with ñ, to ~, to \n)
			grep '.@.*\.' |sed "s|\$| ##from $REPLY $f|" >>$OUT         ##append each email-address to $OUT, with comment showing source
		else
			elog "----Non-Textfile:$f"
		fi
	done
	##less $OUT   ##DEBUG
	##echo -n "----Continue?"; read   ##DEBUG - for initial testing, want the ability to Abort after one site...
done 4<$NM2   ##(using fd#4 so can still use fd#0 as stdin; don't know any better way...)

##--3-- Uniquify the email-ids; also some Cleanup & Pruning;
## (Note: only wrote this after doing the above (on orchidmall.com) which took 15-hours!! ran it on 2007-10-31, from 03:09 to 17:48.)
## remove leading & trailing dots dashes underscores -- then re-prune those lacking dot after at-sign;
## -- better solved by improving the pattern used above, but since I'm unwilling to repeat the 15-hour run...
## prune (discard) email-ids containing "subscribe";
## prune based on implausible domain-name (after the at-sign) -- first check that TLD contains letters only, at least 2, at most 6;
## -- for orchidmall harvest 2007oct31, that left only two implausible TLDs: comadd netcom ==FIXED MANUALLY: comadd->com; netcom->net==
##
cat $OUT |
sort -k1,1 -us |                                                                            ##uniquify on email-id (to reduce size; will be redoing)
sed 's|^[-_.]*\([A-Za-z0-9][-_.A-Za-z0-9]*@[-_.A-Za-z0-9]*[A-Za-z0-9]\)[-_.]*|\1|' |        ##fixup for having used too-weak pattern for email-id
grep '^[A-Za-z0-9][-_.A-Za-z0-9]*@[-_.A-Za-z0-9]*[A-Za-z0-9] ' |                            ##fixup for having used too-weak pattern for email-id - part-2
grep '.@[^#]*\.' |                                                                          ##re-prune those lacking dot after at-sign
grep -vi 'subscribe.*@' |                                                                   ##prune subscribe|unsubscribe ids -- other spellings??
grep -vi 'yahoogroups\.com' |                                                               ##==prune Yahoo-Groups (added 2007-nov02 after emailing...)==
sed 's|_@_|@|g' |                                                                           ##(added after 07nov05 use; based on bounces seen on 07oct31 list)
grep '^[^#]*\.[a-z]\{2,6\} ' |                                                              ##prune invalid top-level-domain-names, by 2-6-letters rule
sort -k1,1 -us |                                                                            ##uniquify on email-id
sort -k3,3 -k1,1 -f >$OUT2                                                                  ##finally sort on source-website||email-id - for people-readability
wc $OUT $OUT2
cat $OUT2 |sed 's|[^#]*\.\([-_.A-Za-z0-9]*\) .*|\1|' |sort -u |more                         ##--show all TLDs-- (original piped to personal alias "m"; replaced by more)
##cat $OUT2 |sed 's|[^#]*\.\([-_.A-Za-z0-9]*\) .*|\1|' |sort -u |grep -v '^[a-z]\{2,6\}$' |more   ##--show obviously invalid TLDs--
exit
## 2011-01-10: remove-->rm -fR; to avoid needing prereq;