#!/bin/bash
## PREREQ: find-dirs dif -- from http://ereimer.net/programs/general-purpose-scripts.htm
##
## ====================
## ===== Part-1 =====
## ====================
## READS HTML FILES WITH LINES LIKE:
## ...
## ...er DATE... <--title line:
## ...Photos from LOCATION on DATE:... <--location+date line:
## <--img line type1; 2010-03: HREF NOW jpg-->htm
## <--img line type2;
##
## OUTPUT FORMATS BY EXAMPLE:
## SmallRoundleafOrchid 06jun09-plant <--er-caption1.htm (used by Part-2 below)
## 06jun09 Woodridge SmallRoundleafOrchid-plant <--er-caption2.htm (used by genOrchidsBySpecies-ER)
## 06jun09 <--er-caption3.htm (used by Part-3 below) 2010-03:
## Rotate the caption files left by the previous run to "~" backups, so that this
## run starts with fresh output files:
##   er-caption1.htm  used by Part-2 below
##   er-caption2.htm  used by genOrchidsBySpecies-ER
##   er-caption3.htm  used by Part-3 below; 2008-10-02: output-file for cgi-bin/pixsearch
## (er-caption3a.htm was rotated here too; 2008-10-02: OBSOLETE output-file)
for CAPNUM in 1 2 3; do
  PREVCAP=/tmp/er-caption$CAPNUM.htm
  [ -e "$PREVCAP" ] && mv -f "$PREVCAP" "${PREVCAP}~"
done

## Work under /er/website (output files were under /er, now under /tmp);
## give up with status 9 when that directory cannot be entered.
CWD=/er/website
cd "$CWD" || exit 9
##chgsed --tmp 's|\(title="[^:"]*\)"|\1:"|' {,*/}/[a-z]*htm ##==New Rule: auto-add colon at eol, to colon-less captions
## Part-1 main loop: visit every directory printed by find-dirs (see PREREQ at the top
## of this file) plus the current directory, and within each one walk the pages whose
## names start with a lowercase letter, skipping any page that matches an EXCLUDE rule.
## NOTE(review): in this copy the loop body breaks off at the grep test near the end,
## and the closing fi/done lines are not present -- restore them from the full
## original before running or editing this section.
for DIR in $(find-dirs) .;do ##2008-11-11 was:for DIR in 2* KPLR Sid .; pre-07may:cd /er; for DIR in reimer reimer? irhymer rhj00? ereimer;
##[[ $DIR == *-* ]] && continue ##2008-11-11: yanked as now undesirable; long obsolete, was designed to skip reimer*-redirects etc;
##[[ -d "$CWD/$DIR" ]] || continue ##2008-11-11: obsoleted by use of find-dirs in the for-loop
## Directories that are never scanned:
[[ $DIR == cgi-bin ]] && continue ##2008-11-16: skip cgi-bin
[[ $DIR == webalizer ]] && continue ##2008-11-16: skip webalizer
[[ $DIR == NOPER* ]] && continue ##2008-11-16: skip NOPERCART-DEMO dirs; ought AuntHelen|Sid|etc to be excluded==??==
## Enter the directory so the page glob below is relative to it; status 9 on failure.
cd $CWD/$DIR || exit 9
for HTM in [a-z]*.htm*;do ##07may: allow html as well as htm (not needed); NOTE: [a-z] skips most caption-pages (but not all?)
## When the glob matches nothing, bash leaves the pattern itself in HTM (unless nullglob
## has been enabled elsewhere); the existence test skips that case.
[[ -e $HTM ]] || continue ##2008-11-16: only needed for cgi-bin|webalizer?? (are excluded above)
## EXCLUDE rules: tests on HTM apply in every directory, tests on DIR/HTM are tied to
## one particular directory.
[[ $HTM == tmp* || $HTM == *~ ]] && continue ##EXCLUDE tmp-files ((was in both))
[[ $DIR/$HTM == ./index* ]] && continue ##EXCLUDE these are also in other webpages ((was ONLY in genByCaption2, was done by not matching html))
[[ $HTM == pixDate* ]] && continue ##EXCLUDE these are also in other webpages ((was in neither; non-essential since has no img lines))
[[ $HTM == pixCaption* ]] && continue ##EXCLUDE these are also in other webpages ((was in neither; non-essential since has no img lines))
[[ $HTM == pixOrchid* ]] && continue ##EXCLUDE these are also in other webpages ((was in genByCaption; non-essential since has no img lines))
[[ $HTM == *-ALT.htm ]] && continue ##EXCLUDE these are also in another page without the -ALT; used in 200608 (was -B), and KPLReimer (was Slide)
##[[ $HTM == shop*.htm ]] && continue ##EXCLUDE these are from nativeorchid site, don't want them indexed ==obsoleted by DIR-skipping above
[[ $DIR/$HTM == *InuvikTrip* ]] && continue ##EXCLUDE these are Non-Photos, has images from Google-Maps
[[ $HTM == making-favicons* ]] && continue ##EXCLUDE these are Non-Photos ((was ONLY in genByCaption2))
[[ $DIR/$HTM == James/index.htm ]] && continue ##EXCLUDE ==MOST== are also in other webpages 2010-06 (??) some were recaptioned in James/index.htm
## NOTE(review): the next four lines are what remains of a longer passage (the grep test,
## the caption extraction into /tmp/er-cap1..3, and the end of both loops); they are kept
## exactly as found and are not valid shell on their own.
if ! grep -q '/tmp/er-cap1
>/tmp/er-cap2
>/tmp/er-cap3
grep -v '^/er/website
## Part-2: build the photographs-by-caption page in the current directory.
##   1. start the page with a fixed header (overwrites any existing OUT);
##   2. sort the lines of /tmp/er-caption1.htm by caption, then by the date that starts
##      each subcaption, and fold all lines sharing a caption into one output line;
##   3. append a footer carrying the generation date.
## NOTE(review): the header, the awk printf strings and some sed patterns look as if
## their HTML markup was stripped from this copy; check them against the full original.
OUT=pixCaption.htm
##[ -e /tmp/$OUT ] || mv -f $OUT /tmp/$OUT ##harmless but not needed, since backup copy now made after...
echo "ER By CaptionPhotographs by Caption
" >$OUT
##--now, sort /tmp/er-caption1.htm, then revise by combining lines with same TITLE--
## the sorting was a struggle, due to quirks in ; MORE INFO IN: /etc/sbin/SORTBUG
##
## Sort keys: field 1 is the caption; field 5 is SC=SUBCAP, so characters 4-7 of it are
## the year, 8-10 the month name (M = month-name ordering) and 11-12 the day.
## Options: -b ignore leading blanks, -f fold case, -s stable sort.
{ cat /tmp/er-caption1.htm; echo "~~zzzzzENDFILE"; } | ##note: ending-line serves to simplify the awk-program below
sed 's|>| SC=|; s|<| <|g' | ##5th field becomes SC=SUBCAP and SUBCAP usually begins with YYmmmDD-style date
sed 's|SC=\([0-9][0-9][A-Za-z]\)|SC=20\1|' | ##2008-09-28: convert a YY-date to YYYY-form by adding leading "20" (KPL subcaps NN- are ok)
##rt -bfs -k 1,1 -k 5.4,5.5 -k 5.6bfM,5.8 -k 5.9,5.10 | ##sort-cmd for YYmmmDD dates (pre-2008-09-28)
sort -bfs -k 1,1 -k 5.4,5.7 -k 5.8bfM,5.10 -k 5.11,5.12 | ##sort-cmd for YYYYmmmDD dates (post2008-09-28)
sed 's|SC=20\([0-9][0-9][A-Za-z]\)|SC=\1|' | ##2008-09-28: undo addition of leading "20" (keep even if going to 4-digit-year everywhere??)
## awk stage: for every input line, scan fields 2..NF; a field containing SC= supplies
## the subcaption (text from its 4th character on) and a field containing href supplies
## the link (text from its 6th character on). Neither variable is reset per line, so a
## line lacking one of them inherits the value from the previous line.
## Whenever field 1 differs from the caption being collected, the collected group is
## printed (caption, a colon, then each stored subcaption) and a new group is started;
## the ~~zzzzzENDFILE sentinel appended above is what flushes the final group.
## NOTE(review): the caption is passed to printf as the format string, so a percent sign
## in a caption would be misread; pix[] is filled but not used by the printf calls as
## they appear here (presumably it was used inside the stripped markup).
awk '{
for(i=2;i<=NF;++i) {
if($i ~ "SC=") subcaption=substr($i,4)
if($i ~ "href") href = substr($i,6)
}
if($1!=majcaption){
if(k!=0) {printf majcaption ":"; for(i=1;i<=k;++i)printf " " st[i] ""; printf "
\n";}
k=0; majcaption=$1
}
++k; st[k]=subcaption; pix[k]=href
}' |sed 's|ñ| |g; s| |\ |g' >>$OUT ##2008-09-22: sed-filter to undo space-to-ntilde and to revise spacing
## Footer: the date is formatted as year, abbreviated month name, day (date +%Y%b%d).
echo "
This page was generated mechanically on $(date +%Y%b%d)" >>$OUT
## Decide whether the freshly generated page replaces the previous one.
## dif SHOWS the changes between the backup in /tmp and the new page, with the
## generated-DATE line blanked out on both sides; +s suppresses output if same.
if dif +s --sed='s|This page was generated.*||' /tmp/$OUT $OUT; then
  ## identical, except for generated-DATE => keep previous version,
  ## preserving its generated-DATE, timestamp, etc.
  cp -fp /tmp/$OUT $OUT
else
  ## significantly changed => keep new version
  wc /tmp/$OUT $OUT              ##counts are nice when a great many lines change
  mv -f /tmp/$OUT /tmp/$OUT~     ##the older backup becomes the "~" copy
  cp -fp $OUT /tmp/$OUT          ##keep a backup copy, where immune to operations
fi
##dif /tmp/pixCaption-20080922 $OUT|m ##DEBUG for testing against the 2008-09-22 output
## ====================
## ===== Part-3 =====
## ====================
## produce output-file for cgi-bin/pixsearch;
## am now sorting lines by DATE; earlier method rearranged the DIR-order (dot last) to get close to chronological...
## NOTE: caption3a was made by first crude method; caption3 is sortable since has date up front;
## 2009-11: Added DATE & LOCATION to these captions, by revising caption3-output in Part-1 above
## Part-3 output: the file needed by cgi-bin/pixsearch (is really HTML...).
OUT=ANON-pixsearch.txt
## Pipeline over /tmp/er-caption3.htm, one caption per line:
##   sed  #1  replace the first colon+space with a space and the first comma with a colon
##            (2009-12-19: revise COMMA + COLON punctuation), then put "20" in front of a
##            leading YY-date so that every line starts with a YYYYmmmDD date;
##   sort     order by year (chars 1-4), month name (chars 5-7), day (chars 8-9);
##   sed  #2  discard the leading DATE again (was just for sorting).
sed -e 's|: | |; s|,|:|' \
    -e 's|^\([0-9][0-9][A-Za-z]\)|20\1|' /tmp/er-caption3.htm |
  sort -bfs -k 1.1,1.4 -k 1.5bfM,1.7 -k 1.8,1.9 |
  sed 's|^[^ ]* ||' >/tmp/$OUT
## 2008-11-03: install the new file only when it differs from the current one; when
## unchanged, leave $OUT and its mod-time alone to prevent an upload.
## (+s makes dif print nothing if same.)
if ! dif +s $OUT /tmp/$OUT; then
  mv -f /tmp/$OUT $OUT     ##changed, replace $OUT
else
  rm -f /tmp/$OUT          ##unchanged, drop the scratch copy
fi
##difsed $'s|: *"|"|; s|[\t ]*1536 which seems about right);
2008-09-28: PART-2: wanted AuntHelen photos ordered by YYYY subcaption; added support for dates with 4-digit-year==!!==
2008-10-02: added PART-3 to produce extra output-file as needed by cgi-bin/pixsearch script; 2008-10-14: renamed it: cgi-bin/pixsearch.txt-->ANON-pixsearch.txt;
2008-11-11: for ER-reorg: revised skipping rules for: index.htm -ALT InuvikTrip; pixTitle-->pixCaption;
2009-10: ==Check that ñ works properly in awk; it does in sed, but not in tr;
2009-11-22: use Date+Location+Caption (was just caption) in caption3, for pixsearch;
DATE standardizing:
note: TITLE-line has YY; Photos-from-line has YYYY usually, but only YY in some 2002-pages;
in photos-by-caption page, want YY-dates for ER-photos (from 20nn years), but also using YYYY for some (eg: AuntHelen photos);
in pixsearch-results, want YYYY-dates?? OR as used by cvt-to-htm-per-img-ER (ie: in per-img pages);
cvt-to-htm-per-img-ER uses only "Photos from" lines, and leaves date in that form (either YYYY or YY);
AuntHelen photos have YYYY at start of SubCaption -- these were NOT HANDLED wrt caption3 output -- now are;
also considering switching to Numeric-Month--??--
LOCATION: subtle differences in how it gets simplified, in caption3 versus in per-img pages, may seem confusing--??--
cvt-to-htm-per-img-ER: has notes on eliminating those differences, by having it do same simplifying as done here; <==2009-12-19: now OBSOLETE
Added "Example" to the pixsearch page to illustrate Location + Date searching;
2009-12-19: on going to pixpage for Dynamic-PerPhoto-Pages, no longer any need to update cvt-to-htm-per-img-ER, since it is now OBSOLETE;
ANON-pixsearch.txt to contain htm links - despite per-DAY pages containing jpg ones; also use COLON after date+location + remove colon within caption;
achieved by adding a sed-line in Part-3;
2009-12-29: revised matching for TITLE-line, from expecting HTML+TITLE to TITLE+/TITLE on same line; fixes Taiwan and Richard+Betty images lacking DATE-prefix;
2010-03-16: add leading slashes to HREF & SRC in cap3 for ANON-pixsearch.txt lines (for local-Apache, so pixsearch can avoid BASE, also affects cgi-bin/pixpage);
fixed htm|jpg for pixname var, although that var is very rarely used, and must've been misbehaving during the one-htm-page-per-img era;
scrapped the jpg-->htm kluge in part-3: s|\(href=[^>]*[0-9][0-9][0-9]\).jpg|\1.htm|;
2010-07: flaw in the parsing for LOCATION,
eg "Photos from Hadashville, Braintree and Wye on 2004may27" -- becomes, in ANON-pixsearch.txt: 2004may27 at Hadashville: Braintree and Wye, Coltsfoot...
fixed by removing commas from location -- in cap3 only;
LOCATIONS needing fixups (have fixed):
comma-->comma+space OR "and": GarvenRdPineRidgeRd HadashvilleBraintree DevilsLakeMB Hadashville Ste.Rita
comma-->semicolon: PR464 N of TCH (approx 20mi E 2mi N of Brandon)
2010-09-03:
==FIXME== excluding files/dirs based on these being duplicates needs rethink; eg: James/index.htm has MOSTLY duplicate photos, yet James-as-youngster is UNIQUE!!
wrt producing ANON-pixsearch.txt (Part-3), the solution may be easy: don't exclude, use sort-unique instead; could Part-1/Part-2 output be so fixed??
note: many of the Excludes can be yanked without changing anything, since the "EXCLUDE if not thumbnails-page" test handles those cases;
--tried yanking the line to exclude James/index.htm; adding sort -u to Part-3; ==at best only a partial solution to above "FIXME"??
--UNDID: (1) the James-as-youngster photo is not (yet) being captioned; (2) am getting URLs like /James//20090601... (3) doubt pixpage handles duplicates;
solution for James-as-youngster photo: made a "std" YYYYMMDD/index.htm page for as-scanned & contrast-enhanced versions; so it gets "indexed" + captioned;
==FIXME== too much output from Part-3 dif can result in losing sight of important info -- needs better strategy;
also need undo-capability for ANON-pixsearch.txt -- have it for /tmp/er-caption3.htm but that's less useful==??==