#!/bin/bash
#--------------------------------------------
# GetDilbert was created by ir. Niels Basjes
# E-Mail  : DilbertScript@Basjes.nl
# Homepage: http://oss.basjes.nl
#--------------------------------------------
# This script is intended to automatically retrieve and store the cartoons
# on the dilbert homepage so they can be read offline.
# An HTML or ASCII email is generated automatically as well.
#
# External tools used: wget, grep, awk, sed, sort, file, identify,
# /bin/mail (ASCII mail) and metasend (HTML mail).
#--------------------------------------------
# Version 1.0 10-May-1998 Initial version
# Version 1.1 23-Jun-1998 Added chmod and URL mailing
# Version 1.2 15-Feb-1999 Changed grep expression
# Version 1.3 30-Mar-1999 Added HTML mail generation
#                         Added check to see if a file is really an image
#                         Added rename if file is a JPEG
#                         Changed grep expression from HREF to OPTION
# Version 1.4 29-Jul-1999 United Media drastically changed the layout of
#                         their HTML code in an attempt to stop people like me
#                         from downloading the comic without reading the
#                         commercials
# Version 1.5 10-Jan-2000 Fixed the next year change (not a millennium problem
#                         but a year problem)
# Version 1.6 02-Nov-2000 And they changed the code of their page again.
# Version 1.7 04-May-2004 Fixed JPG issue
# Version 1.8 15-Nov-2004 Switched to wget
#                         Added more useful storage directories
# Version 1.9 10-Jan-2005 United Media changed something against leeching :(
#                         So now we tell them we are Firefox instead of wget :D
#
MAILING_LIST=niels@example.com

#--------------------------------------------
# Select HOW the mail should be sent
# Use "MAIL_TYPE=ASCII" to get a plaintext ascii file with URLS
# Use "MAIL_TYPE=HTML" to get an HTML file that directly shows the URLS
MAIL_TYPE=HTML

#--------------------------------------------
# This is the prefix for the image file name when accessing through the WWW server
BASE_WWW_SERVER_DIR="http://cartoons.example.nl/Dilbert/Archive"

#--------------------------------------------
# File locations
# BASE_IMAGE_DIR is the directory where the downloaded images will be stored
BASE_IMAGE_DIR=/opt/cartoons/Archive/Dilbert/Archive
# TEMP_FILES is the directory where some intermediate files will be stored
TEMP_FILES=/tmp

#--------------------------------------------
# Tools
# The command is kept in an array: a command stored in a plain string and
# expanded unquoted is split on whitespace without honouring the embedded
# quotes, which truncated the user agent and turned the remaining fragments
# into bogus URLs.
WGET=(wget -q --user-agent
  "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.5) Gecko/20041107 Firefox/1.0")
PATH=$PATH:/usr/local/bin
#--------------------------------------------

#--------------------------------------------
# United media server information
SRV_NAME=www.unitedmedia.com
SRV_DIR=comics/dilbert/archive

#--------------------------------------------
# Temp files
# NOTE(review): these are fixed, predictable names inside ${TEMP_FILES};
# consider mktemp if that directory is shared with untrusted users.
ALL_HTML_URLS=${TEMP_FILES}/DILBERT_todaysHTML
INDEX_HTML=${TEMP_FILES}/DILBERT_CurrentIndex.html
URL_TEMP_FILE=${TEMP_FILES}/DILBERT_NewURLS
HTML_MAIL_TEMP_FILE=${TEMP_FILES}/DILBERT_HTMLMail
#--------------------------------------------

#######################################
# Print the archive sub-directory ("YYYY/MM") for a comic base name.
# Arguments: $1 - base name without extension, e.g. dilbert-20050110
# Outputs:   the sub-directory on stdout, e.g. 2005/01
#######################################
image_date_dir() {
  printf '%s\n' "$1" \
    | sed 's/dilbert-//g;s/\([0-9][0-9][0-9][0-9]\)\([0-9][0-9]\)\([0-9][0-9]\)/\1\/\2/g'
}

#######################################
# Escape the characters that are special in HTML text and attribute values.
# Arguments: $1 - text to escape
# Outputs:   escaped text on stdout
#######################################
html_escape() {
  printf '%s' "$1" \
    | sed -e 's/&/\&amp;/g' -e 's/</\&lt;/g' -e 's/>/\&gt;/g' -e 's/"/\&quot;/g'
}

#######################################
# Download the archive index and reduce it to one "<path>#<page name>" entry
# per available comic page.
# Globals:   WGET, SRV_NAME, SRV_DIR, INDEX_HTML (written),
#            ALL_HTML_URLS (written)
# Returns:   non-zero when the index could not be downloaded
#######################################
fetch_index() {
  local index_url="http://${SRV_NAME}/${SRV_DIR}/"

  echo "Getting ${index_url} for available dilberts"
  if ! "${WGET[@]}" "${index_url}" -O "${INDEX_HTML}"; then
    echo "Unable to download ${index_url}" >&2
    return 1
  fi
  echo "Index received."

  # Field 2 (between the first pair of double quotes) of every matching OPTION
  # line is the page path; the second awk puts a '#' between directory and
  # file name.
  grep 'OPTION.*dilbert-[0-9][0-9]*.html' "${INDEX_HTML}" \
    | awk -F'"' '{print $2}' \
    | sort -u \
    | awk -F/ '{print $1"/"$2"/"$3"/"$4"/#"$5}' \
    > "${ALL_HTML_URLS}"
  #cat ${INDEX_HTML} | grep OPTION.*archive/dilbert.*html | awk -F\" '{print$2$3}' | sort | uniq | awk -F\/ '{print $1"/"$2"/"$3"/"$4"/#"$5}' | awk -F\> '{print $1"#"$2}' | sed s/\.\ /-/g > ${ALL_HTML_URLS}
  echo "Index filtered."
}

#######################################
# Make sure the image of one comic page is present in the archive.
# Downloads the page to find the real image name, fetches the image, checks
# that it really is a GIF or JPEG and records the public URL of every newly
# stored image in URL_TEMP_FILE.
# Globals:   WGET, SRV_NAME, BASE_IMAGE_DIR, BASE_WWW_SERVER_DIR, TEMP_FILES,
#            URL_TEMP_FILE (appended)
# Arguments: $1 - index entry of the form "<path>#<page name>"
#######################################
process_comic() {
  local entry=$1
  local html_path html_name base_name gif_name jpg_name
  local date_dir image_dir www_dir page_file image_path image_url file_type

  html_path=${entry%%#*}
  html_name=${entry#*#}
  html_name=${html_name%%#*}
  base_name=${html_name%.*}
  gif_name=${base_name}.gif
  jpg_name=${base_name}.jpg
  #GIF_NAME=`echo ${I} | awk -F\# '{print "dilbert2000"$3}'`.gif

  date_dir=$(image_date_dir "${base_name}")
  image_dir=${BASE_IMAGE_DIR}/${date_dir}
  www_dir=${BASE_WWW_SERVER_DIR}/${date_dir}
  mkdir -p "${image_dir}"
  #echo "${HTML_NAME} --\> IMAGE_DATE_DIR=${IMAGE_DATE_DIR}"

  if [[ -f "${image_dir}/${gif_name}" ]]; then
    echo "Skipping ${gif_name}"
    return 0
  fi
  if [[ -f "${image_dir}/${jpg_name}" ]]; then
    echo "Skipping ${jpg_name}"
    return 0
  fi

  echo "Missing ${gif_name} --> Getting ${html_name} for exact image name"
  page_file=${TEMP_FILES}/${html_name}
  rm -f "${page_file}"
  echo "Getting http://${SRV_NAME}${html_path}${html_name}"
  "${WGET[@]}" "http://${SRV_NAME}${html_path}${html_name}" -O "${page_file}"

  # The image path is the 14th double-quote separated field of the link line;
  # only the first non-empty hit is used so that a page with several matching
  # lines cannot produce a multi-URL download.
  image_path=$(grep 'href.*dilbert[0-9][0-9]*\.' "${page_file}" \
    | grep -v SendAStrip \
    | awk -F'"' '$14 != "" {print $14; exit}')
  if [[ -z "${image_path}" ]]; then
    echo "No image link found in ${html_name}" >&2
    return 0
  fi

  image_url=http://${SRV_NAME}${image_path}
  echo "Getting image file ${image_url}"
  "${WGET[@]}" "${image_url}" -O "${image_dir}/${gif_name}"
  chmod a+r "${image_dir}/${gif_name}"

  # We now have a file which we think is the GIF file.
  # Because it happens too often that we get an HTML file instead
  # we check for the actual format of the file.
  file_type=$(file "${image_dir}/${gif_name}")
  if grep -qi "GIF image" <<<"${file_type}"; then
    if identify "${image_dir}/${gif_name}" > /dev/null; then
      echo "The file ${image_dir}/${gif_name} is a good GIF image"
      echo "${www_dir}/${gif_name}" >> "${URL_TEMP_FILE}"
    else
      echo "The file ${image_dir}/${gif_name} is a GIF image with errors in it."
      echo "Deleting ${image_dir}/${gif_name}"
      rm -f "${image_dir}/${gif_name}"
    fi
  elif grep -qi "JPEG image" <<<"${file_type}"; then
    echo "The file ${image_dir}/${gif_name} is a JPEG image"
    mv "${image_dir}/${gif_name}" "${image_dir}/${jpg_name}"
    echo "${www_dir}/${jpg_name}" >> "${URL_TEMP_FILE}"
  else
    echo "The file ${image_dir}/${gif_name} is NOT an image"
    printf '%s\n' "${file_type}"
    echo "Deleting ${image_dir}/${gif_name}"
    rm -f "${image_dir}/${gif_name}"
  fi
  # rm -f ${TEMP_FILES}/${HTML_NAME}
}

#######################################
# Write the HTML mail body: one link plus inline image per new comic.
# Globals:   URL_TEMP_FILE (read)
# Outputs:   the HTML document on stdout
#######################################
build_html_mail() {
  local image_url safe_url

  printf '%s\n' '<html>' '<head>' '<title>Dilberts</title>' '</head>' '<body>'
  while IFS= read -r image_url; do
    [[ -n "${image_url}" ]] || continue
    safe_url=$(html_escape "${image_url}")
    printf '<p><a href="%s">%s</a><br>\n' "${safe_url}" "${safe_url}"
    printf '<img src="%s" alt="%s"></p>\n<hr>\n' "${safe_url}" "${safe_url}"
  done < "${URL_TEMP_FILE}"
  printf '%s\n' '</body>' '</html>'
}

#######################################
# Mail the list of newly stored comics, if there are any.
# Globals:   URL_TEMP_FILE, MAIL_TYPE, MAILING_LIST,
#            HTML_MAIL_TEMP_FILE (written)
#######################################
send_mail() {
  # The URL file only exists when at least one new image was stored.
  [[ -f "${URL_TEMP_FILE}" ]] || return 0

  case "${MAIL_TYPE}" in
    "ASCII")
      # MAILING_LIST is left unquoted on purpose so that a space separated
      # list of recipients is still passed as separate arguments.
      # shellcheck disable=SC2086
      /bin/mail -s "Todays new Dilberts" ${MAILING_LIST} < "${URL_TEMP_FILE}"
      ;;
    "HTML")
      build_html_mail > "${HTML_MAIL_TEMP_FILE}"
      metasend -b -e 7bit -m "text/html" -F "Dilbert@example.nl" \
        -s "Todays new Dilberts" -t "${MAILING_LIST}" -f "${HTML_MAIL_TEMP_FILE}"
      ;;
  esac
}

#######################################
# Entry point: refresh the index, fetch every missing comic, send the mail.
#######################################
main() {
  local entry

  rm -f "${ALL_HTML_URLS}" "${URL_TEMP_FILE}" "${INDEX_HTML}" "${HTML_MAIL_TEMP_FILE}"

  fetch_index || exit 1

  # The list is read on descriptor 3 so that no command inside the loop can
  # swallow it through stdin.
  while IFS= read -r entry <&3; do
    [[ -n "${entry}" ]] || continue
    process_comic "${entry}"
  done 3< "${ALL_HTML_URLS}"

  send_mail
}

main "$@"

#rm -f ${ALL_HTML_URLS}
#rm -f ${URL_TEMP_FILE}
#rm -f ${INDEX_HTML}
#rm -f ${HTML_MAIL_TEMP_FILE}