#!/bin/csh -f
#
# Get a show's episode list from tvrage.com
# and generate an editor script for numbering the episodes.
# (Based on my get.episodes script.)
#
# Gets one show at a time.  The argument can be the URL for
# the show's episode list, or it can be the program name;
# multiple words are allowed, capitalization is NOT significant.
# Given a title, the script will guess at the TVrage URL.
# It is often the program name with the spaces changed
# to underscores, and /episode_list/all added.
#
# Brent Chivers                                             2010 Jan 5
#
# Solaris-10 seems to need "nawk" for that 2nd awk.  8-(    2005/Jun/27
#
# Show title is no longer on the "Change Layout" line.      2010/Jun/09
#   Find it 2 lines later; force it to sort first into final awk.
# Pick up "Unaired" episodes
#
# lynx and sed path changes from Darwin to Linux.           2011/Jan/08
# No preceding number on the episode-date lines
# Episode number is not on the line with the date;
#   it's on the line before the date.
#
# TVRage.com page layout has changed;                       2011/Apr/17
#   actually simpler to use now.
# Not handling "unaired" episodes 'cause I don't know how they look now.
# Get rid of "View Trailer" crap                            2011/Apr/29
# TVRage.com page layout changed again; show title          2011/Jul/18
#   is now the line after the 2nd empty line.
# TVRage.com page layout changed again; show title          2012/Oct/23
#   is now the line after the 5th empty line.
# TVRage.com page layout changed again; show title is       2013/Apr/06
#   now the line preceding the line "^Classification: ".
# Adding "tee" commands to catch the scraped web page was clever.
#   (It's normally a waste of CPU and disk, but it really speeds
#   up diagnosis when the format changes, and I can't count on
#   being as clever next time.)
# Pick up specials (and sort them back into sequence).      2013/Apr/12
#
# Exit status: 1 when called without arguments (usage message),
# otherwise the status of the processing pipeline.

# set verbose   # see commands before variable substitution
# set echo      # see commands after variable substitution

unset histchars         # get other users' history characters out of the way
unalias *               # get other users' .cshrc stuff out of the way

# $0:t construction doesn't work!
set cmd_name = $0
set cmd_name = $cmd_name:t              # the name of this command
# set webpage = /tmp/$cmd_name.$$       # temp file for data from TVRage

if ($#argv == 0) then                   # no arguments provided
    echo "usage: $cmd_name show name [or tvrage.com URL]"
    exit 1                              # a usage error is not a success
endif

# Decide which URL to fetch.  A single argument that mentions tvrage.com
# is used as-is; anything else is treated as a show title.
set url = ""
if ($#argv == 1) then
    if ( "$1" =~ *tvrage.com* ) set url = "$1"  # looks like a TVRage URL
endif

if ("$url" == "") then                  # guess a probable TVRage URL
    # Blanks become underscores whether the title arrived as several
    # arguments or as one quoted argument.  noglob keeps characters such
    # as "?" in a title from being treated as a filename pattern.
    set noglob
    set show = `echo "$*" | tr ' ' '_'`
    unset noglob
    set url = "http://www.tvrage.com/$show/episode_list/all"
endif                                   # this works surprisingly often....
                                        # but not always.  8-(

# Pipeline stages (no comments can go inside the continued command):
#   lynx   get data from TVRage (raw dump is kept in /tmp/tvrage1)
#   sed    strip leading blanks
#          strip off "Buy" and "Rating" columns and "View Trailer"
#          convert any html "&amp;" to the "&" character
#          strip out icons
#          rewrite episode lines (NxNN) and specials (S.. - N) as
#            "number yyyy/Mon/dd title" (reformatted for sorting)
#          convert month to digits for sorting
#          (result is kept in /tmp/tvrage2)
#   awk    keep show title: the line preceding "Classification: " is
#            emitted as "++++<TAB>0000/00/00<TAB>title" so that it sorts
#            ahead of every episode; the line following "Classification: "
#            is consumed by getline and dropped
#          keep episode lines (2nd field is a date) and emit them as
#            number<TAB>date<TAB>title, so that titles containing blanks
#            stay in one field
#   sort   sort episode lines by air date (kept in /tmp/tvrage3)
#   awk    format program title, episode number, and episode title into
#            editor commands of the form
#            g_^SHOW<TAB>TITLE<TAB>_s_<TAB>TITLE<TAB>_<TAB>NUMBER TITLE<TAB>_p
#   sed    remove the first two " (unaired)" strings on each line
#
# NOTE(review): the three tee files use fixed names in /tmp, so they are
# predictable and are overwritten by every run; they are kept because they
# are the diagnostic copies described in the header.

## echo "Using URL $url"
/usr/bin/lynx -dump -nolist -pseudo_inlines -width=300 "$url" | \
    tee /tmp/tvrage1 | \
    /bin/sed -e 's/^ *//' \
        -e 's= *[NY] *[0-9][0-9]* *Amazon *[0-9\.]*$==' \
        -e 's= *[NY] *[0-9][0-9]* *Amazon *N/A$==' \
        -e 's= *[0-9][0-9]* *Amazon *[0-9\.]*$==' \
        -e 's= *[0-9][0-9]* *Amazon *N/A$==' \
        -e 's= *[0-9][0-9]* *N/A$==' \
        -e 's= *[NY] *Amazon *[0-9\.]*$==' \
        -e 's= *[NY] *Amazon *N/A$==' \
        -e 's= *Amazon *[0-9\.]*$==' \
        -e 's= *Amazon *N/A$==' \
        -e 's= *N/A$==' \
        -e 's= *[NY] *[0-9]* *[0-9\.]*$==' \
        -e 's= *[0-9]* *[0-9\.]*$==' \
        -e 's= *[NY] *[0-9\.]*$==' \
        -e 's= *View Trailer==' \
        -e 's/&amp;/\&/g' \
        -e 's/.favicon.ico. //' \
        -e 's/.film.gif. //' \
        -e 's=^[1-9][0-9]* *\([1-9][0-9]*\)x\([0-9][0-9]*\) *\([0-3][0-9]\)/\([JFMASOND][aepuco][nbrylgptvc]\)/\([12][0-9][0-9][0-9]\) *\(.*\)$=\1.\2 \5/\4/\3 \6=' \
        -e 's=^\(S\)[0-9]* - \(#*[1-9][0-9]*\) *\([0-3][0-9]\)/\([JFMASOND][aepuco][nbrylgptvc]\)/\([12][0-9][0-9][0-9]\) *\(.*\)$=\1p\2 \5/\4/\3 \6=' \
        -e 's=\( [12][0-9][0-9][0-9]\)/Jan/\([0-3][0-9] \)=\1/01/\2=' \
        -e 's=\( [12][0-9][0-9][0-9]\)/Feb/\([0-3][0-9] \)=\1/02/\2=' \
        -e 's=\( [12][0-9][0-9][0-9]\)/Mar/\([0-3][0-9] \)=\1/03/\2=' \
        -e 's=\( [12][0-9][0-9][0-9]\)/Apr/\([0-3][0-9] \)=\1/04/\2=' \
        -e 's=\( [12][0-9][0-9][0-9]\)/May/\([0-3][0-9] \)=\1/05/\2=' \
        -e 's=\( [12][0-9][0-9][0-9]\)/Jun/\([0-3][0-9] \)=\1/06/\2=' \
        -e 's=\( [12][0-9][0-9][0-9]\)/Jul/\([0-3][0-9] \)=\1/07/\2=' \
        -e 's=\( [12][0-9][0-9][0-9]\)/Aug/\([0-3][0-9] \)=\1/08/\2=' \
        -e 's=\( [12][0-9][0-9][0-9]\)/Sep/\([0-3][0-9] \)=\1/09/\2=' \
        -e 's=\( [12][0-9][0-9][0-9]\)/Oct/\([0-3][0-9] \)=\1/10/\2=' \
        -e 's=\( [12][0-9][0-9][0-9]\)/Nov/\([0-3][0-9] \)=\1/11/\2=' \
        -e 's=\( [12][0-9][0-9][0-9]\)/Dec/\([0-3][0-9] \)=\1/12/\2=' | \
    /usr/bin/tee /tmp/tvrage2 | \
    /usr/bin/awk '\
/^Classification: / { getline ; printf ("++++\t0000/00/00\t%s\n", prev) ; next } \
$2 ~ /[12][0-9][0-9][0-9]\/[01][0-9]\/[0-3][0-9]/ { \
    title = $0 ; \
    sub (/^[ \t]*[^ \t]+[ \t]+[^ \t]+[ \t]*/, "", title) ; \
    printf ("%s\t%s\t%s\n", $1, $2, title) } \
{ prev = $0 }' | \
    /usr/bin/sort -k2 | \
    /usr/bin/tee /tmp/tvrage3 | \
    /usr/bin/awk '\
BEGIN { FS = "\t" } \
$1 == "++++" { show = $3 ; next } \
NF == 3 { printf ("g_^%s\t%s\t_s_\t%s\t_\t%s %s\t_p\n", show, $3, $3, $1, $3) }' | \
    /bin/sed -e 's/ (unaired)//' -e 's/ (unaired)//'

# /bin/rm $webpage