#! /bin/sh
:
#ident	"@(#)smail/util:RELEASE-3_2_0_121:checkerr.sh,v 1.53 2005/11/17 20:37:44 woods Exp"
#
#	Copyright (C) 1987, 1988 Ronald S. Karr and Landon Curt Noll
#	Copyright (C) 1992  Ronald S. Karr
#
# See the file COPYING, distributed with smail, for restriction
# and warranty information.

# Perform various cleanups and check for errors which have been frozen
# in the Smail error directory.  Notes about cleanup actions or errors
# common to all spool_dirs are saved in the $TMPDIR/.chkerr.msg file.
#
# For each spool directory the frozen error queue is checked for new
# messages.  If any are found then information related to those errors
# is saved in the file .checkerror which is then mailed to the
# postmaster along with the contents of the $TMPDIR/.chkerr.msg file.
#
# If the mail cannot be sent to the postmaster at the moment, perhaps
# because of a local configuration error, then the .checkerror and
# $TMPDIR/.chkerr.msg files will be left behind so they can hopefully
# be sent by some future invocation of this script.
#
# Usage: checkerr [-v]
#	-v	also include matching logfile entries for every new
#		frozen message (expensive on a busy server)
#
# Exit status: 0 on a completed run, 1 on a fatal setup/locking
# problem, 2 on a usage error.

umask 022

PATH="/usr/local/libexec/smail:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin"; export PATH
SMAIL_PROGRAM="/usr/local/sbin/sendmail"
UTIL_BIN_DIR=/usr/local/libexec/smail

argv0=`basename $0`

GETOPT=${UTIL_BIN_DIR}/getopt

# XXX consider adding an option to turn off statistics reporting

USAGE="Usage: $argv0 [-v]"

very_verbose=false

# Capture getopt's output in a variable first: the exit status seen
# after "set -- `getopt ...`" is that of "set" in many shells, so a
# getopt failure would otherwise go unnoticed.
#
args=`${GETOPT} -n $argv0 -q v ${1+"$@"}`
if [ "$?" -ne 0 ]; then
    echo ${USAGE} 1>&2
    exit 2
fi
set -- ${args}
for i in ${1+"$@"} ; do
    case "$i" in
    -v)
        very_verbose=true
        shift
        ;;
    --)
        shift
        break
        ;;
    -?)
        echo ${USAGE} 1>&2
        exit 2
        ;;
    esac
done
if [ $# -ne 0 ] ; then
    echo ${USAGE} 1>&2
    exit 2
fi

# It seems some implementations of expr don't like the explicit
# anchoring of the RE, but unfortunately some need it, so we'll just
# have to try throwing away stderr....
#
if expr "`id`" : '^uid=0(root)' >/dev/null 2>&1 ; then
    :
else
    echo "$argv0: ERROR: you must be root to do this!" 1>&2
    exit 1
fi

SPOOL_DIRS=`${SMAIL_PROGRAM} -bP spool_dirs`

TMPDIR="/var/spool/smail/tmp"
export TMPDIR

if [ ! -d ${TMPDIR} ] ; then
    if mkdir ${TMPDIR} ; then
        chmod 700 ${TMPDIR}
    else
        echo "$argv0: ${TMPDIR} may exist as a file!" 1>&2
        exit 1
    fi
fi

# Everything below assumes we are working inside ${TMPDIR}; do not
# carry on from some arbitrary directory if we cannot get there.
#
cd ${TMPDIR} || exit 1

# WARNING: this script could fail to interlock properly if it can be
# invoked simultaneously with more than one name!  Don't do that!
#
LOCKDIR=${TMPDIR}/${argv0}.lock
PIDFILE=${LOCKDIR}/${argv0}.pid

# A lock whose recorded process no longer exists is stale: remove it so
# that the mkdir below can succeed.
#
if [ -s ${PIDFILE} ] ; then
    if kill -0 `cat ${PIDFILE}` > /dev/null 2>&1 ; then
        : # still running... error reported below...
    else
        # system probably rebooted while checkerr was running...
        rm ${PIDFILE}
        rmdir ${LOCKDIR}
    fi
fi
if [ -e ${LOCKDIR} ] ; then
    echo "It appears there's already an instance of ${argv0} running..." 1>&2
    exit 1
fi
# mkdir is the atomic step that actually takes the lock
if mkdir ${LOCKDIR} ; then
    : #got it!
else
    echo "Oops, just missed grabbing ${LOCKDIR}!" 1>&2
    exit 1
fi
echo $$ > ${PIDFILE}

HOSTNAME=`${SMAIL_PROGRAM} -bP primary_name`
SMAIL_LIB_DIR=`${SMAIL_PROGRAM} -bP smail_lib_dir`

DEAD_MAIL_ERE_FILE="${SMAIL_LIB_DIR}/dead-mail.egrep"

LOGFILE=`${SMAIL_PROGRAM} -bP logfile`
# Old fashioned smaillog form
OLD_LOGFILE=`echo ${LOGFILE} | sed -e 's,^\(.*\)/\([^/][^/]*\)$,\1/OLD/\2,'`

PANICLOG=`${SMAIL_PROGRAM} -bP paniclog`
# Old fashioned smaillog form
OLD_PANICLOG=`echo ${PANICLOG} | sed -e 's,^\(.*\)/\([^/][^/]*\)$,\1/OLD/\2,'`

DOT_Z=".gz"
ZCAT="gunzip -c"

# XXX The opening paragraphs should only be included if there are new
# messages in the error queue, and in theory the subject line could be
# customized to indicate this as well.....
#
# NOTE(review): in the copy of this file being repaired the
# here-document operator and whatever header line(s) preceded
# "Subject:" had been lost.  The "To:" header below is a minimal
# reconstruction -- confirm it against the upstream checkerr.sh.  The
# paragraph breaks in the body are likewise reconstructed.
#
rm -f ${TMPDIR}/.chkerr.top
cat > ${TMPDIR}/.chkerr.top <<END_OF_FILE
To: Postmaster
Subject: Recent Mail errors and/or statistics on the host ${HOSTNAME}

Note that any messages which failed and are now "frozen" in the error
queue, which may be listed with 'mailq -E'.  Delivery of these messages
should retried with 'unfreezemail' when the situation which caused the
error has been taken care of.

Often these failed messages are un-returnable bounces -- i.e. messages
which could not be delivered and which also now cannot be returned to
their sender.

Messages which can neither be delivered nor bounced because of invalid
addresses should be examined directly to see if valid addresses can be
determined from their content, and if so then the queue file may be
edited to correct their delivery address(es) prior to running
'unfreezemail'.  If no valid address can be determined then such
messages can be disposed of as appropriate (eg. deleted with
'unfreezemail -D').
END_OF_FILE

# first check to see if the paniclog has new output since the last time
#
# NOTE: The ${TMPDIR}/.chkerr.msg file is not cleared in case it
# contains output from some earlier run that was not successfully sent
# to the Postmaster.
#
# Too bad a real "stat" command was not invented long ago.  We can't
# even narrow this down to just the date fields without knowing what
# version of "ls" we're using....  So the whole "ls -l" line is saved
# and compared with the one saved by the previous run.
#
ls -l ${PANICLOG} > ${TMPDIR}/.newpanic_chk 2>&1
if [ -f ${TMPDIR}/.lastpanic_chk ]; then
    NEWTIME=`cat ${TMPDIR}/.newpanic_chk`
    OLDTIME=`cat ${TMPDIR}/.lastpanic_chk`
    #
    # Instead of just ignoring an empty paniclog we should try
    # looking for a recently archived one, and if it's uncompressed
    # then check its timestamp and maybe tail it instead....
    #
    if [ "${NEWTIME}" != "${OLDTIME}" -a -s ${PANICLOG} ] ; then
        echo ""
        echo "NOTICE: There appear to be new entries in the paniclog! Here's its tail:"
        echo ""
        tail ${PANICLOG}
        echo ""
    fi >> ${TMPDIR}/.chkerr.msg
else
    # no record from a previous run: any non-empty paniclog is news
    if [ -s ${PANICLOG} ] ; then
        echo ""
        echo "NOTICE: The panic log may have new entries! Here's its tail:"
        echo ""
        tail ${PANICLOG}
        echo ""
    fi >> ${TMPDIR}/.chkerr.msg
fi
mv -f ${TMPDIR}/.newpanic_chk ${TMPDIR}/.lastpanic_chk

# start with empty work files; only .chkerr.stats is filled in below
: > ${TMPDIR}/.chkerr.relays
: > ${TMPDIR}/.chkerr.mxerrs
: > ${TMPDIR}/.chkerr.stats

${UTIL_BIN_DIR}/logsumm -Ev >> ${TMPDIR}/.chkerr.stats

if [ -s ${TMPDIR}/.chkerr.relays ]; then
    echo ""
    echo "The following hosts attempted to relay mail through this server:"
    echo ""
    sort -u ${TMPDIR}/.chkerr.relays | pr -4 -t
fi >> ${TMPDIR}/.chkerr.msg

if [ -s ${TMPDIR}/.chkerr.mxerrs ]; then
    #
    # Note: Currently this only details broken domains which we have
    # successfully delivered to because allow_one_mx_target_cname_hack
    # was set.  When that flag is not set then invalid MX RRs are rejected
    # at SMTP command (MAIL FROM: or RCPT TO:) time, or when message
    # delivery is attempted, so we assume the sender sees the problem
    # immediately.
    #
    # The column titles are padded to line up with the dashes below.
    #
    echo ""
    echo "The following DNS MXs with bad values were mentioned in yesterday's logfile:"
    echo ""
    echo "Target MX Domain           MX's Target Hostname       CNAME's target host"
    echo "-------------------------- -------------------------- -------------------"
    echo ""
    tr A-Z a-z < ${TMPDIR}/.chkerr.mxerrs | sort -u
    echo ""
    echo "If possible you should attempt to contact those responsible for each"
    echo "of the target domains and the invalid CNAME domains and have them fix"
    echo "their error(s)."
fi >> ${TMPDIR}/.chkerr.msg

if [ -s ${TMPDIR}/.chkerr.stats ]; then
    echo ""
    echo "Here are the overall statistics for the past reporting period:"
    echo ""
    cat ${TMPDIR}/.chkerr.stats
fi >> ${TMPDIR}/.chkerr.msg

echo "" >> ${TMPDIR}/.chkerr.msg
echo "Current mail queue summary status:" >> ${TMPDIR}/.chkerr.msg
echo "" >> ${TMPDIR}/.chkerr.msg
mailq -s >> ${TMPDIR}/.chkerr.msg 2>&1

# Now we go into each spooling directory and look for new messages that have
# been frozen into the error queue.
#
# NOTE: only the main SPOOLDIR will have the PANICLOG and LOGFILE summary
# information from above
#
# For every directory in ${SPOOL_DIRS} this loop: creates any missing
# queue sub-directories, retries an undelivered report from a previous
# run, removes stale files, retries EX_TEMPFAIL bounces, deletes frozen
# double-bounces matching ${DEAD_MAIL_ERE_FILE}, describes each newly
# frozen message in .checkerror, and finally mails the report to the
# Postmaster.
#
(
    # silly trick to split SPOOL_DIRS on ':' without setting IFS in
    # the main script body
    IFS=:
    for i in ${SPOOL_DIRS}; do
        echo $i
    done
) | while read SPOOLDIR; do
    export SPOOLDIR

    if [ ! -d ${SPOOLDIR} ] ; then
        # spool directory does not exist -- ignore it...
        continue
    fi
    cd ${SPOOLDIR}
    if [ $? != 0 ]; then
        # spool directory problems
        echo ""
        echo "WARNING: problem with changing to spool directory: $SPOOLDIR"
        echo ""
        continue
    fi >> ${TMPDIR}/.chkerr.msg

    if [ ! -d tmp ]; then
        mkdir tmp
        chmod 755 tmp
    fi
    if [ ! -d msglog ]; then
        mkdir msglog
        chmod 755 msglog
    fi
    if [ ! -d input ]; then
        mkdir input
        chmod 700 input
    fi
    if [ ! -d error ]; then
        mkdir error
        chmod 700 error
    fi

    # if the last run found some errors, but couldn't deliver, try again now
    #
    if [ -s .checkerror ]; then
        # don't send to the Postmaster if configuration errors still exist
        if ${SMAIL_PROGRAM} -bv Postmaster > /dev/null 2>&1 ; then
            (
                sed -e "s|_SPOOLDIR_|${SPOOLDIR}|g" ${TMPDIR}/.chkerr.top
                echo ""
                if [ -s ${TMPDIR}/.chkerr.msg ] ; then
                    cat ${TMPDIR}/.chkerr.msg
                    echo ""
                fi
                cat .checkerror
            ) | ${SMAIL_PROGRAM} -f"<+>" -eq -m Postmaster
            if [ $? -ne 0 ]; then
                continue
            fi
            rm -f .checkerror ${TMPDIR}/.chkerr.msg
        else
            # if we cannot reach the postmaster, don't go to the next step
            # but do allow the caller (cron?) to collect the error output
            #
            ${SMAIL_PROGRAM} -bv Postmaster
            continue
        fi
    fi

    # make sure we start again with a fresh empty message file
    #
    rm -f .checkerror
    : > .checkerror

    # remove old msg.* files in the input directory
    # (the "" operand keeps rm from complaining when xargs has no input)
    #
    find input -name 'msg.*' -mtime +2 -print | xargs rm -f ""

    # remove msglog files with no corresponding input or error file:
    #
    find msglog -name '[0-9]*' -print | sed 's|^msglog/||' | while read i; do
        if [ -f msglog/$i -a ! -f input/$i -a ! -f error/$i ]; then
            # XXX WARNING: if msglog file has been created first, as
            # it must be, then there's a potential race condition
            # here.  Test twice to try to avoid it.  Putting in a
            # sleep would be more certain, but could also cause
            # adverse delays.
            #
            if [ -f msglog/$i -a ! -f input/$i -a ! -f error/$i ]; then
                rm -f msglog/$i
            fi
        fi
    done

    # cleanup retry files older than any maximum retry time:
    #
    if [ -d retry ]; then
        x=`${SMAIL_PROGRAM} -bP retry_duration`
        # turn it back into #-of-days for find...
        def_retry_dur=`${SMAIL_PROGRAM} -xD $x`
        max_retry_dur=${def_retry_dur}
        RETRY_FILE=`${SMAIL_PROGRAM} -bP retry_file`
        if [ -f "${RETRY_FILE}" ]; then
            # A while loop reading from a pipe runs in a subshell, so
            # an assignment made inside it is lost.  The loop is
            # therefore wrapped in an explicit subshell which echoes
            # the final maximum, and that output is captured here --
            # no temporary data file needed.
            #
            # The awk script skips comment and blank lines and prints
            # what follows the last '/' of the second field.
            #
            max_retry_dur=`awk '$1 ~ /^#/ { next; }
                $1 ~ /^$/ { next; }
                $2 ~ /\/./ { sub(/^.*\//, "", $2); printf("%s\n", $2); }
                ' $RETRY_FILE | (
                    while read x ; do
                        y=\`${SMAIL_PROGRAM} -xD $x\`
                        if [ $y -gt ${max_retry_dur} ] ; then
                            max_retry_dur=$y
                        fi
                    done
                    echo ${max_retry_dur}
                )`
        fi
        if [ ${max_retry_dur} -le 0 ] ; then
            max_retry_dur=1
        fi
        find retry -type f -mtime +${max_retry_dur} -print | xargs rm -f ""
    fi

    # find bounces being returned to local mailboxes but which could
    # not be delivered (presumably locally) because of temporary
    # failures (eg. quota limits with Cyrus) and try to deliver them
    # again....
    #
    # The list is sorted because it is later used as an operand of
    # comm(1), which requires sorted input.
    #
    find msglog -name '[0-9]*' -print | \
        xargs grep -l '^Xfail: <.*> reason: (ERR144) transport [^:]*: child returned status EX_TEMPFAIL (75)$' | \
        sed 's|^msglog/||' | \
        sort > .tempfail_bounces
    if [ -s .tempfail_bounces ]; then
        echo "Attempting re-delivery of" `wc -l < .tempfail_bounces` "EX_TEMPFAIL bounces...."
        xargs unfreezemail < .tempfail_bounces
        # keep only those which are still sitting in the error queue
        : > .remaining_tempfail_bounces
        while read f; do
            if [ -f error/$f ]; then
                echo $f >> .remaining_tempfail_bounces
            fi
        done < .tempfail_bounces
        mv .remaining_tempfail_bounces .tempfail_bounces
        if [ -s .tempfail_bounces ]; then
            echo "There are still" `wc -l < .tempfail_bounces` "EX_TEMPFAIL bounces remaining."
            echo "(see ${SPOOLDIR}/.tempfail_bounces)"
        fi
        echo ""
    fi >> .checkerror

    # remove any frozen messages containing patterns matching known
    # virii & worms, spam, or whatever
    #
    if [ -s ${DEAD_MAIL_ERE_FILE} ]; then
        # NOTE: this won't clean out older messages when new REs are added...
        #
        if [ -f .lasttimedone ]; then
            newer="-newer .lasttimedone"
        else
            newer=""
        fi
        # The inner sed turns every pattern line of the ERE file into
        # a quoted "-e 'pattern'" argument (a line of just "-i" is
        # passed through as-is); eval is needed so that those quotes
        # are honoured.  The ERE file is trusted administrator input.
        #
        find error ${newer} -name '[0-9]*' -print | sed 's|^\./||' | \
            eval xargs egrep -l `sed -e '/^$/d' -e '/^#/d' -e '/^-i$/!s/^\(.*\)$/-e '"'"'\1'"'"'/' ${DEAD_MAIL_ERE_FILE}` | \
            sed 's|^error/||' | \
            sort | \
            comm -23 - .tempfail_bounces > .dead_bounces
        if [ ! -s .dead_bounces ]; then
            if ${very_verbose}; then
                echo "No frozen double-bounced message matched the patterns" >> .checkerror
                echo "in ${DEAD_MAIL_ERE_FILE}" >> .checkerror
                echo "" >> .checkerror
            fi
        fi
    else
        : > .dead_bounces
    fi

    rm -f .dead_bounce_senders
    : > .dead_bounce_senders
    rm -f .dead_double_bounces
    : > .dead_double_bounces

    # now again for the ERE-only matches
    #
    # NOTE: keep in sync with below -- XXX should be a function....
    #
    if [ -s .dead_bounces ]; then
        while read bounce ; do
            # Don't delete this message if it's not a
            # double-bounce, i.e. if it doesn't have a new sender
            # address of '<>' or '<+>'
            #
            # The sender address is the line right after the last
            # '!-f', but we'll assume there's only ever one such
            # line and so search only for the first one.
            #
            # ('[+]' is used instead of '\+' because the latter means
            # "one or more" in some sed implementations.)
            #
            double_bounce="`sed -n -e '/^$/q' -e '1,/^!-f$/d;/<>/p;/<[+]>/p;q' error/${bounce}`"

            # If this message is a double-bounce then find the
            # destination address for it as it is about to deleted.
            #
            # The destination address is actually the sender address
            # for the original un-deliverable message which cannot now
            # be returned and is thus a double-bounce.  This list of
            # sender addresses can be used to reject future e-mails
            # from these obviously bogus or broken sender addresses.
            #
            # Note that we don't care here if the sender is "local" or
            # not since we couldn't deliver this bounce to it and it
            # contained known unwanted content.  There is though some
            # minor chance that if this host is a gateway to an
            # internal server that rejected the bounce because of
            # content then we might end up adding a valid internal
            # user's address.  The correct solution here is to make
            # sure this gateway implements the same policies as the
            # internal server so that it doesn't get itself into such
            # a situation in the first place.
            #
            # We assume there's only ever one recipient for a bounce.
            #
            # NOTE(review): the whitespace around "Rcpts:" in the
            # mailq output is matched loosely because the exact
            # spacing of the original pattern could not be recovered.
            #
            if [ -n "${double_bounce}" ]; then
                echo ${bounce} >> .dead_double_bounces
                mailq -E ${bounce} | \
                    sed -n '/^[[:space:]]*Rcpts:[[:space:]]*/s///p' >> .dead_bounce_senders
            fi
        done < .dead_bounces
    fi

    # add a nice comment if it'll be a new dead-mail.senders file
    #
    # NOTE(review): the awk example echoed below may have lost
    # trailing redirection text in this copy -- confirm upstream.
    #
    if [ ! -s ${SMAIL_LIB_DIR}/dead-mail.senders ]; then
        (
            echo "#"
            echo "# NOTICE: entries are automatically appended to this file by '$argv0'"
            echo "#"
            echo "# you can point smtp_sender_reject_db at this file"
            echo "#"
            echo "# old entries, e.g. ones prior to 2003/01/21, could be"
            echo "# removed from this file using a filter like this:"
            echo "#"
            echo "# awk '\$1 ~ /^#/ || \$4 > 20030121 { print \$0;}' "
            echo "#"
        ) > ${SMAIL_LIB_DIR}/dead-mail.senders
    fi

    if [ -s .dead_double_bounces ]; then
        # The .dead_bounces messages which are not listed in
        # .dead_double_bounces are probably mis-matches.  We'll
        # ignore them for now, but perhaps they should be noted as
        # such so that the dead-mail.egrep patterns can be made
        # more accurate if possible.
        #
        # On the other hand these messages could be bounces
        # being relayed through this host as a smart-host, and
        # now we're stuck with them.  In this case the best
        # thing to do is simply remove them, but of course
        # that can only be done after careful manual
        # examination.
        #
        rm -f "" `sed 's|^|error/|' .dead_double_bounces`
        rm -f "" `sed 's|^|msglog/|' .dead_double_bounces`
        echo "Removed" `wc -l < .dead_double_bounces` "frozen and double-bounced messages matching"
        echo "the patterns in ${DEAD_MAIL_ERE_FILE}!"
        echo ""
        echo "The list of corresponding message-IDs for these removed bounces remains in"
        echo `pwd`/.dead_double_bounces "until the next run of '$argv0'."
        echo ""
    fi >> .checkerror

    # find any new errors and report any related information....
    #
    # .lasttimedone is the timestamp of the previous run; it is moved
    # aside and re-created first so that anything frozen while this
    # run is in progress is picked up by the next run.
    #
    if [ -f .lasttimedone ]; then
        mv -f .lasttimedone .thistime
        : > .lasttimedone
        find error -newer .thistime -name '[0-9]*' -print
    else
        : > .lasttimedone
        find error -name '[0-9]*' -print
    fi | sed 's|^error/||' | while read f; do
        echo ""
        echo "------------------ Message $f ------------------"

        # Make a special note about messages which are likely frozen
        # because of configuration or system errors.  These are ones
        # with non-empty sender addresses (not '<>' nor '<+>') --
        # i.e. they are not bounces.
        #
        # The sender address is the line right after the last '!-f',
        # but we'll assume there's only ever one such line and so
        # search only for the first one.
        #
        # (Note the nojump label is there because that's the only way
        # to reset the flag used by the 't' command so that the 't quit'
        # commands, which branch around the 'p', work properly.)
        #
        frozen_sender="`sed -n -e '/^$/q' -e '1,/^!-f$/d;s/^!//;t nojump
            :nojump
            s/<>//;t quit
            s/<[+]>//;t quit
            p;
            :quit
            q' error/$f`"
        if [ -n "$frozen_sender" ] ; then
            echo ""
            echo "WARNING: This message was likely frozen because of some local system error"
            echo "or Smail configuration error! (it is not a bounce message)"
            echo ""
        fi

        # detect anything that is likely a dead double-bounce
        # and add any non-local recipient to the .dead_bounce_senders list
        #
        # note: for now we will do this explicitly instead of just
        # using [ -z "$frozen_sender" ]
        #
        if [ -n "`sed -n -e '/^$/q' -e '1,/^!-f$/d;/<>/p;/<[+]>/p;q' error/$f`" ]; then
            # record undeliverable addresses from double-bounces for
            # smtp_reject_sender_db
            #
            # Note these are not deleted -- they still need manual attention.
            #
            sender="`mailq -E $f | sed -n '/^[[:space:]]*Rcpts:[[:space:]]*/s///p'`"

            # This is a really nasty hack but it's the best I can think of!
            # The verbose address verification output is searched for
            # signs that a director or a rewrite handled the address.
            #
            addrinfo="`$SMAIL_PROGRAM -v -bv ${sender} 2>&1`"
            dir_count=`expr "${addrinfo}" : '.*director .*'`
            # note: don't depend on the rewrite router being called "rewrite"...
            rew_count=`expr "${addrinfo}" : '.*reparse address .*'`

            echo ""
            if [ $dir_count -gt 0 ]; then
                echo "WARNING: This message is a double bounce from a local sender."
                echo "The address <${sender}> was not deliverable."
                echo "This message should be unfrozen when the problem has been resolved."
            elif [ $rew_count -gt 0 ]; then
                echo "WARNING: This message is a double bounce from a virtual-host sender."
                echo "The address <${sender}> was not deliverable."
                echo "This message should be unfrozen when the problem has been resolved."
            else
                echo "Note: This message is a double bounce from a non-local sender."
                echo "Logging sender address <${sender}> as invalid."
                echo $sender >> .dead_bounce_senders
            fi
            echo ""
        fi

        # next we include a full status of the message in .checkerror
        #
        mailq -E -v $f

        # This next part is extremely inefficient on any busy mail
        # server, which is why it's turned off by default.
        #
        # It would be better to simply collect the list of message IDs
        # and then use 'fgrep -f msgid-list' to grab all the related
        # entries, however that can be done at any time later too (so
        # long as the relevant log files still exist, of course).
        #
        # The sed script prints each entry mentioning "[m<id>]"
        # together with the '|' continuation lines following it and
        # deletes everything else.
        #
        # NOTE(review): runs of blanks in this copy of the script had
        # been collapsed, so the indent added by 's/^/ /' (and by
        # 's/^| /| /') may originally have been wider -- confirm.
        #
        if ${very_verbose}; then
            (
                if [ -f ${LOGFILE} ]; then
                    cat ${LOGFILE};
                fi
                if [ -f ${LOGFILE}.0 ]; then
                    cat ${LOGFILE}.0;                   # aka newsyslog
                elif [ -f ${LOGFILE}.0${DOT_Z} ]; then
                    ${ZCAT} ${LOGFILE}.0${DOT_Z};       # aka newsyslog
                fi
                if [ -f ${OLD_LOGFILE}.0 ]; then
                    cat ${OLD_LOGFILE}.0;               # aka smaillog
                elif [ -f ${OLD_LOGFILE}.0${DOT_Z} ]; then
                    ${ZCAT} ${OLD_LOGFILE}.0${DOT_Z};   # aka smaillog
                fi
            ) | sed -e ':l1
                /^[^|].*\[m'$f'\]/{
                :l2
                s/^/ /
                n
                /^|/!b l1
                s/^| /| /
                b l2
                }
                d' > .tmp_logfile
            if [ -s .tmp_logfile ]; then
                echo ""
                echo "Recent logfile entries related to this message are:"
                cat .tmp_logfile
            fi
            rm .tmp_logfile
        fi

        # normally the paniclog should be quite small so this isn't
        # too inefficient to always run....
        #
        (
            if [ -f ${PANICLOG} ]; then
                cat ${PANICLOG};
            fi
            if [ -f ${PANICLOG}.0 ]; then
                cat ${PANICLOG}.0;                      # aka newsyslog
            elif [ -f ${PANICLOG}.0${DOT_Z} ]; then
                ${ZCAT} ${PANICLOG}.0${DOT_Z};          # aka newsyslog
            fi
            if [ -f ${OLD_PANICLOG}.0 ]; then
                cat ${OLD_PANICLOG}.0;                  # aka smaillog
            elif [ -f ${OLD_PANICLOG}.0${DOT_Z} ]; then
                ${ZCAT} ${OLD_PANICLOG}.0${DOT_Z};      # aka smaillog
            fi
        ) | grep "\[m$f\]" | sed 's/^/ /' > .tmp_paniclog
        if [ -s .tmp_paniclog ]; then
            echo ""
            echo "Recent paniclog entries related to this messsage are:"
            cat .tmp_paniclog
        fi
        rm .tmp_paniclog
    done >> .checkerror

    if [ -s .dead_bounce_senders ]; then
        # take the unique sender addresses from the combined list
        # created above and add them to the dead-mail.senders file in
        # a format suitable for use with lsearch_database()
        # which will be pointed to by default by smtp_reject_senders_db
        #
        # NOTE(review): the two expressions stripping the enclosing
        # '<' and '>' were reconstructed from a damaged 's/^$//'
        # (a no-op) in the copy being repaired -- confirm upstream.
        #
        DATE_ADDED=`date +%Y%m%d`
        sort -u .dead_bounce_senders | \
            sed -e 's/^<//' \
                -e 's/>$//' \
                -e 's/$/: blocked since '${DATE_ADDED}' due to undeliverable error messages/'
    fi >> ${SMAIL_LIB_DIR}/dead-mail.senders

    if [ -s .checkerror -o -s ${TMPDIR}/.chkerr.msg ]; then
        # don't try to send to the Postmaster if configuration errors still exist
        if ${SMAIL_PROGRAM} -bv Postmaster > /dev/null 2>&1 ; then
            (
                sed -e "s|_SPOOLDIR_|${SPOOLDIR}|g" ${TMPDIR}/.chkerr.top
                echo ""
                if [ -s ${TMPDIR}/.chkerr.msg ] ; then
                    cat ${TMPDIR}/.chkerr.msg
                fi
                if [ -s .checkerror ] ; then
                    echo ""
                    echo "New error queue entry details for ${SPOOLDIR}:"
                    echo ""
                    cat .checkerror
                fi
            ) | ${SMAIL_PROGRAM} -f"<+>" -eq -m Postmaster
            # the report files are kept for the next run if sending failed
            if [ $? -eq 0 ]; then
                rm -f .checkerror ${TMPDIR}/.chkerr.msg
            fi
        else
            # in this case we're about to move onto the next
            # spool_dir, if there is one, so we'll just allow the
            # caller (cron?) to collect the error output
            #
            ${SMAIL_PROGRAM} -bv Postmaster
        fi
    fi
done

# release the lock taken at startup
if [ ! -d ${LOCKDIR} ] ; then
    echo "$argv0: OOPS!!!! ${LOCKDIR} went missing!!!!" 1>&2
    exit 1
fi
rm -f ${PIDFILE}
rmdir ${LOCKDIR}

exit 0