#!/bin/bash
#--------------------------------------------
# GetUserFriendly was created by ir. Niels Basjes
# E-Mail  : UserFriendlyScript@Basjes.nl
# Homepage: http://oss.basjes.nl
#--------------------------------------------
# This script is intended to automatically retrieve and store the cartoons
# on the UserFriendly homepage so they can be read offline.
# An HTML or ASCII Email is generated automatically as well.
#--------------------------------------------
# Version 1.0 10-May-1998 Initial version
# Version 1.1 23-Jun-1998 Added chmod and URL mailing
# Version 1.2 15-Feb-1999 Changed grep expression
# Version 1.3 30-Mar-1999 Added HTML mail generation
#                         Added Check to see if a file is really an image
#                         Added rename if file is a JPEG
#                         Changed grep expression from HREF to OPTION
# Version 1.4 12-Apr-1999 Created version for UserFriendly
# Version 1.5 19-Apr-1999 Did some optimisation for UserFriendly
# Version 1.6 11-Aug-1999 Small patch because the HTML changed
# Version 1.7 23-May-2000 They really changed their website this time :-(
# Version 1.8 09-Feb-2001 The script needed another huge update.
# Version 1.9 16-May-2001 Sneaky change from SRC= to src=

# Fail on the use of an unset variable. "set -e" is deliberately NOT used:
# a single failing download must not abort the whole run, so the failures
# that matter are checked explicitly.
set -u

#--------------------------------------------
# Recipient(s) of the mail that lists the newly downloaded cartoons
MAILING_LIST=Niels@Basjes.nl

#--------------------------------------------
# Select HOW the mail should be sent
# Use "MAIL_TYPE=ASCII" to get a plaintext ascii file with URLS
# Use "MAIL_TYPE=HTML" to get an HTML file that directly shows the URLS
MAIL_TYPE=HTML

#--------------------------------------------
# This is the prefix for the image file name when accessing through the WWW server
LOCAL_WWW_SERVER_BASE="http://cartoons.example.com/UserFriendly/"

#--------------------------------------------
# File locations
# IMAGE_DIR is the directory where the downloaded images will be stored
# This should effectively be the same directory as $LOCAL_WWW_SERVER_BASE
IMAGE_DIR=/opt/cartoons/Archive/UserFriendly

# TEMP_FILES is the directory in which the private working directory
# (holding all intermediate files) will be created
TEMP_FILES=/tmp

#--------------------------------------------
# Tools
# URLGET is not referenced anywhere in this script; kept so existing local
# customisations that mention it keep working.
URLGET=/usr/local/bin/urlget
# WGET is a command plus its options; it is word-split on purpose in FetchUrl.
WGET="wget -q"

#--------------------------------------------
# Userfriendly server information
#START_URL=http://www.userfriendly.org/cartoons/archives/2000.html
START_URL=http://www.userfriendly.org/cartoons/archives/

#--------------------------------------------
TEMP_PREFIX=USERFRIENDLY

# Temp files. They all live inside WORK_DIR, a private directory created with
# mktemp by CreateWorkDir, so that the names cannot be predicted (and
# pre-created as symlinks) by other users of ${TEMP_FILES}.
# They are empty until CreateWorkDir has run.
WORK_DIR=
ALL_HTML_URLS=
ALL_HTML_URLS_CMP=
NEW_URLS=
NEW_HTML_URLS=
INDEX_HTML=
URL_TEMP_FILE=
HTML_MAIL_TEMP_FILE=
EXISTING_FILES=

PATH=$PATH:/usr/local/bin

#--------------------------------------------
# Create the private working directory and derive all temp file names.
# Globals: writes WORK_DIR and every temp file variable.
# Returns: non-zero if the directory could not be created.
CreateWorkDir() {
  WORK_DIR=$(mktemp -d "${TEMP_FILES}/${TEMP_PREFIX}.XXXXXX") || return 1
  ALL_HTML_URLS=${WORK_DIR}/ALL_HTML_URLS
  ALL_HTML_URLS_CMP=${WORK_DIR}/ALL_HTML_URLS_CMP
  NEW_URLS=${WORK_DIR}/NEW_URLS
  NEW_HTML_URLS=${WORK_DIR}/NEW_HTML_URLS
  INDEX_HTML=${WORK_DIR}/INDEX_HTML
  URL_TEMP_FILE=${WORK_DIR}/URL_TEMP_FILE
  HTML_MAIL_TEMP_FILE=${WORK_DIR}/HTML_MAIL_TEMP_FILE
  EXISTING_FILES=${WORK_DIR}/EXISTING_FILES
}

#--------------------------------------------
# Remove all intermediate files (the whole private working directory).
# Safe to call when the working directory was never created.
CleanTemp() {
  if [[ -n "${WORK_DIR}" && -d "${WORK_DIR}" ]]; then
    rm -rf -- "${WORK_DIR}"
  fi
}

#--------------------------------------------
# SIGINT handler: let the running downloads finish, mail what was retrieved
# so far, clean up and exit.
HandleTrap() {
  echo "You pressed Ctrl-C"
  echo "Waiting for all outstanding downloads"
  wait
  printf '%s' "all completed .... "
  SendEMail
  CleanTemp
  echo ".... exiting."
  # 128 + SIGINT(2): conventional exit status for "terminated by Ctrl-C".
  exit 130
}

trap HandleTrap INT

#--------------------------------------------
# Download one URL into a file.
# Arguments: $1 - URL to retrieve, $2 - file to store the response in
# Returns:   the exit status of the download tool
FetchUrl() {
  local url=$1
  local target=$2
  # shellcheck disable=SC2086 # WGET holds "command options", split on purpose
  ${WGET} "${url}" -O "${target}"
}

#--------------------------------------------
# Print the number of lines in the file $1 (0 for an empty file).
CountLines() {
  awk 'END { print NR }' "$1"
}

#--------------------------------------------
# Block until no more than $1 background jobs of this shell are running.
# Only the jobs started by this shell are counted, so unrelated processes
# (or the script's own subshells) can no longer distort the count.
# Arguments: $1 - maximum number of running background jobs
LimitConcurrentThreads() {
  local max_jobs=$1
  local running_pids
  local count

  running_pids=$(jobs -rp)
  count=$(( $(wc -w <<<"${running_pids}") ))
  while (( count > max_jobs )); do
    echo "We have ${count} threads running. Waiting for at least one to finish."
    sleep 1
    running_pids=$(jobs -rp)
    count=$(( $(wc -w <<<"${running_pids}") ))
  done
}

#--------------------------------------------
# Download the archive index page into ${INDEX_HTML}.
# Returns: non-zero if the download failed.
GetIndexFile() {
  echo "Getting ${START_URL} for available cartoons"
  if ! FetchUrl "${START_URL}" "${INDEX_HTML}"; then
    echo "Failed to download ${START_URL}" >&2
    return 1
  fi
  echo "Index received."
}

#--------------------------------------------
# Work out which cartoons in the index have not been downloaded yet.
# Reads:  ${INDEX_HTML}, the files below ${IMAGE_DIR}
# Writes: ${ALL_HTML_URLS}      every cartoon page URL found in the index
#         ${ALL_HTML_URLS_CMP}  the id (date) of each of those URLs
#         ${EXISTING_FILES}     the ids of the images already on disk
#         ${NEW_URLS}           ids present in the index but not on disk
#         ${NEW_HTML_URLS}      the page URLs belonging to ${NEW_URLS}
DetermineNewImagesInIndexFile() {
  # The page URL is the first double-quoted string on every matching line.
  grep -F 'ars.userfriendly.org/cartoons' "${INDEX_HTML}" \
    | awk -F'"' '{ print $2 }' \
    | sort -u > "${ALL_HTML_URLS}"
  echo "Index filtered and contains $(CountLines "${ALL_HTML_URLS}") cartoons."

  # The id is what sits between the first '=' and the following '&'.
  awk -F'=' '{ split($2, parts, "&"); print parts[1] }' "${ALL_HTML_URLS}" \
    | sort -u > "${ALL_HTML_URLS_CMP}"

  # Existing images: file name without directory and without extension(s).
  # On a first run IMAGE_DIR does not exist yet, which simply means "none".
  if [[ -d "${IMAGE_DIR}" ]]; then
    find "${IMAGE_DIR}" -type f ! -name '*html' \
      | sed -e 's%^.*/%%' -e 's%\..*%%' \
      | sort -u > "${EXISTING_FILES}"
  else
    : > "${EXISTING_FILES}"
  fi
  echo "Previously downloaded $(CountLines "${EXISTING_FILES}") cartoons."

  # Lines only present in the second (sorted) file. Empty lines are dropped
  # because an empty grep pattern would match every single URL below.
  comm -13 "${EXISTING_FILES}" "${ALL_HTML_URLS_CMP}" \
    | sed '/^$/d' > "${NEW_URLS}"

  if [[ -s "${NEW_URLS}" ]]; then
    # -F: the ids are literal strings, not regular expressions.
    grep -F -f "${NEW_URLS}" "${ALL_HTML_URLS}" > "${NEW_HTML_URLS}"
  else
    : > "${NEW_HTML_URLS}"
  fi
  echo "There are $(CountLines "${NEW_HTML_URLS}") cartoons to be downloaded."
}

#--------------------------------------------
# Download a single cartoon: fetch its HTML page, find the image URL in it,
# download the image and verify that it really is an image.
# On success the public URL of the stored image is appended to
# ${URL_TEMP_FILE}. Intended to be run as a background job.
# Arguments: $1 - URL of the HTML page of the cartoon
#            $2 - directory below ${IMAGE_DIR} to store it in (YYYY/MM);
#                 a trailing '/' is accepted
#            $3 - file name for the image (YYYYMMDD.gif)
DownloadCartoon() {
  local page_url=$1
  local store_dir=${2%/}
  local gif_name=$3

  local image_dir=${IMAGE_DIR}/${store_dir}
  local image_file=${image_dir}/${gif_name}
  # Strip the trailing '/' of the base to avoid '//' in the mailed URLs.
  local image_url=${LOCAL_WWW_SERVER_BASE%/}/${store_dir}/${gif_name}
  local page_file=${WORK_DIR:-${TEMP_FILES}}/${TEMP_PREFIX}_${gif_name}_HTML
  local gif_url
  local file_type

  echo ==============================================================================
  echo "Missing ${gif_name} --> Getting ${page_url} for exact image name"
  rm -f -- "${page_file}"
  echo "Getting ${page_url}"
  FetchUrl "${page_url}" "${page_file}"

  printf '%s' "Ensuring the directory ${image_dir} exists....."
  if [[ ! -d "${image_dir}" ]]; then
    mkdir -p -- "${image_dir}"
    echo "Created."
  else
    echo "It already does."
  fi

  # Take the first src="..." attribute whose value contains
  # "cartoons/archives" and ends in "gif"; then drop the leading src=".
  gif_url=$(grep -o 'src="[^"]*cartoons/archives[^"]*gif' "${page_file}" \
    | head -n 1)
  gif_url=${gif_url#src=\"}

  # Now check if the HTML file actually contains a link to a GIF file.
  if [[ -z "${gif_url}" ]]; then
    echo "The page ${page_url} does NOT contain any images."
  else
    echo "Getting image file ${gif_url}"
    FetchUrl "${gif_url}" "${image_file}"
    chmod a+r -- "${image_file}"

    # We now have a file which we think is the GIF file.
    # Because it happens too often that we get an HTML file instead
    # we check for the actual format of the file.
    file_type=$(file -b -- "${image_file}")
    if grep -qi 'GIF image' <<<"${file_type}"; then
      # identify (ImageMagick) does a deeper check; without it installed we
      # accept what "file" reported instead of deleting every image.
      if ! command -v identify > /dev/null \
        || identify "${image_file}" > /dev/null; then
        echo "The file ${image_file} is a good GIF image."
        echo "${image_url}"
        echo "${image_url}" >> "${URL_TEMP_FILE}"
      else
        echo "The file ${image_file} is a GIF image with errors in it."
        echo "Deleting ${image_file}"
        rm -f -- "${image_file}"
      fi
    elif grep -qi 'JPEG image' <<<"${file_type}"; then
      echo "The file ${image_file} is a JPEG image."
      mv -- "${image_file}" "${image_file%.gif}.jpg"
      echo "${image_url%.gif}.jpg" >> "${URL_TEMP_FILE}"
    else
      echo "The file ${image_file} is NOT an image."
      file -- "${image_file}"
      echo "Deleting ${image_file}"
      rm -f -- "${image_file}"
    fi
  fi

  rm -f -- "${page_file}"
}

#--------------------------------------------
# Start the (background) download of the cartoon behind one page URL,
# unless its image is already present.
# Arguments: $1 - page URL, for example
#                 http://ars.userfriendly.org/cartoons/?id=20010131&mode=classic
DownloadCartoonURL() {
  local url=$1
  local eight_digits='^[0-9]{8}$'
  local datestr year month gif_name

  # The id is what sits between the first '=' and the following '&'.
  datestr=${url#*=}       # 20010131&mode=classic
  datestr=${datestr%%&*}  # 20010131

  # The year/month split below only makes sense for a YYYYMMDD id.
  if [[ ! "${datestr}" =~ ${eight_digits} ]]; then
    echo "Skipping ${url}: no YYYYMMDD id found in it" >&2
    return 0
  fi

  year=${datestr:0:4}     # 2001
  month=${datestr:4:2}    # 01
  gif_name=${datestr}.gif # 20010131.gif

  echo ==============================================================================
  if [[ -f "${IMAGE_DIR}/${year}/${month}/${gif_name}" ]]; then
    echo "Skipping ${gif_name}"
  else
    LimitConcurrentThreads 12
    echo "Downloading ${gif_name}"
    DownloadCartoon "${url}" "${year}/${month}" "${gif_name}" &
  fi
}

#--------------------------------------------
# Download every cartoon listed in ${NEW_HTML_URLS} and wait for completion.
DownloadAllCartoons() {
  local url

  # The list is read through descriptor 3 so that nothing started inside the
  # loop can accidentally consume it via stdin.
  while IFS= read -r url <&3; do
    [[ -n "${url}" ]] || continue
    DownloadCartoonURL "${url}"
  done 3< "${NEW_HTML_URLS}"

  # Wait till all have completed
  wait
}

#--------------------------------------------
# Mail the URLs of the newly downloaded cartoons to ${MAILING_LIST}, as plain
# text or as an HTML page depending on ${MAIL_TYPE}.
# Does nothing when no cartoon was downloaded (no ${URL_TEMP_FILE}).
SendEMail() {
  local image_url

  if [[ ! -f "${URL_TEMP_FILE}" ]]; then
    return 0
  fi

  case "${MAIL_TYPE}" in
    ASCII)
      sort "${URL_TEMP_FILE}" \
        | /bin/mail -s "Todays new UserFriendly" ${MAILING_LIST}
      ;;
    HTML)
      # NOTE(review): the markup was missing from the echo statements (only
      # empty strings and line breaks were left) while the mail was still
      # sent as text/html. It has been reconstructed as: a link to every
      # image followed by the image itself. Confirm against the intended mail.
      {
        printf '%s\n' \
          '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">' \
          '<html>' \
          '<head>' \
          '<title>User Friendly</title>' \
          '</head>' \
          '<body>'
        sort "${URL_TEMP_FILE}" | while IFS= read -r image_url; do
          printf '<a href="%s">%s</a><br>\n' "${image_url}" "${image_url}"
          printf '<img src="%s" alt="%s"><br><br>\n' \
            "${image_url}" "${image_url}"
        done
        printf '%s\n' '</body>' '</html>'
      } > "${HTML_MAIL_TEMP_FILE}"
      metasend -b -e 7bit -m "text/html" -F "UserFriendly@example.com" \
        -s "Todays new UserFriendly" -t ${MAILING_LIST} \
        -f "${HTML_MAIL_TEMP_FILE}"
      ;;
    *)
      echo "Unknown MAIL_TYPE '${MAIL_TYPE}': no mail has been sent." >&2
      ;;
  esac
}

#--------------------------------------------
# Entry point: fetch the index, download what is new, mail the result.
main() {
  if ! CreateWorkDir; then
    echo "Cannot create a working directory in ${TEMP_FILES}" >&2
    exit 1
  fi

  if ! GetIndexFile; then
    CleanTemp
    exit 1
  fi

  DetermineNewImagesInIndexFile
  DownloadAllCartoons
  SendEMail
  CleanTemp
}

main "$@"