#! /bin/awk -f -

#ident	"@(#)smail/util:RELEASE-3_2_0_121:logsumm.awk,v 1.46 2005/07/20 20:40:37 woods Exp"
#
#	logsumm.awk - guts of logsumm, in AWK
#
# This script expects the following variables to be set on the command-line:
#
#	DEBUG		- whether to print debug for unmatched errors (0 or 1)
#	ERRORS		- whether to collect error info for chkerr (0 or 1)
#	TMPDIR		- where to store the error info
#
# Normally this script is invoked by the logsumm shell wrapper script.
#
# WARNING: the following code assumes SMAIL_LOG_STYLE=2.
#
# Note:  We essentially ignore continued log lines (eg. those which have SMTP
# error responses appended to them).
#
# Note that some people might want top-10 destination summaries, etc.  That's
# not what this particular script is about.  Such reports are probably best
# implemented in a separate, though perhaps similar, script.
#
BEGIN {
	if (TMPDIR != "") {
		mxerr_file = TMPDIR "/.chkerr.mxerrs";
		relayerr_file = TMPDIR "/.chkerr.relays";
		sizes_file = TMPDIR "/.chkerr.msgsizes";
	} else {
		mxerr_file = "chkerr.mxerrs";
		relayerr_file = "chkerr.relays";
		sizes_file = "chkerr.msgsizes";
	}
	sizes_pipe = "sort -n > " sizes_file;
	connections = 0;
	too_many_connections = 0;
	many_connections = 0;
	max_connections = 0;
	ignored_hello_reject = 0;
	ignored_hello_and_mail_reject = 0;
	ignored_hello_reject_other = 0;
	early_mail = 0;
	early_rcpt = 0;
	early_data = 0;
	other_before_hello = 0;
	resets = 0;
	hello_warnings = 0;
	data_error_ignored = 0;
	data_rejected = 0;
	slammers = 0;
	command_too_long_errors = 0;
	invalid_char_errors = 0;
	hello_refused = 0;
	received = 0;
	delivered = 0;
	deferred = 0;
	completed = 0;
	returned = 0;
	frozen_double_bounces = 0;
	error_notifications_sent = 0;
	errors_sent_to_postmaster = 0;
	errors_sent_without_error = 0;
	succ_outbound_smtp = 0;
	succ_outbound_other = 0;
	failed = 0;
	size = 0;
	max_size = 0;
	min_size = 0;
	rcvd_bytes = 0;
	bad_client_names = 0;
	sender_deferrals = 0;
	invalid_senders = 0;
	invalid_recipients = 0;
	recipient_deferrals = 0;
	not_enough_spool_space = 0;
	clients_with_bad_local_sender = 0;	
	ignored_esmtp_size = 0;
	sent_too_much_data = 0;
	unexpected_error_during_data = 0;

	reason = "";			# temporary $0 copy
	rcvd_ssq = 0.0;			# float: sum of (values^2), a big number

	# initialise arrays
	# -- these "null address" elements must be deleted afterwards
	delivered_type[""] = 0;
	dnsbl[""] = 0;
	rcvd_bytes_type[""] = 0;
	received_type[""] = 0;
	hello_rejected_type[""] = 0;
	hello_warning_type[""] = 0;
	invalid_sender_type[""] = 0;
	invalid_recip_type[""] = 0;
	client_with_bad_local_sender[""] = 0;
	delay_sum[""] = 0;
	queued_delay_sum[""] = 0;
	queued_recv_sum[""] = 0;
	max_delay[""] = 0;
	max_queued_delay[""] = 0;
	max_recv_delay[""] = 0;
}
NR == 1 {
	sub(/:$/, "", $2);
	start_time = $1 " " $2;
}
($1 ~ /[0-9][0-9][0-9][0-9]\/[0-9][0-9]\/[0-9][0-9]/ || $1 ~ /[0-9][0-9]\/[0-9][0-9]\/[0-9][0-9][0-9][0-9]/) && $2 ~ /[0-9][0-9]:[0-9][0-9]:[0-9][0-9]:/ {
	# any timestamp could be the last time!
	# (note: do NOT call sub() on every record
	# just to clobber the silly trailing colon)
	last_time = $1 " " $2;
}
$5 == "Received" {
	received++;
	program = "";
	protocol = "";
	for (fn = 6; fn <= NF; fn++) {
		if (substr($fn, 1, 8) == "PROGRAM:") {
			program = substr($fn, 9);
		}
		if (substr($fn, 1, 9) == "PROTOCOL:") {
			protocol = substr($fn, 10);
		}
		if (substr($fn, 1, 5) == "SIZE:") {
			size = substr($fn, 6) + 0;
		}
		if (substr($fn, 1, 6) == "DELAY:") {
			delay = substr($fn, 7) + 0;
			if (delay > 14 * 24 * 3600) {
				delay = 1;
			}
		}
	}
	if (protocol == "") {
		protocol = "STDIN";
	}
	type = program "/" protocol;
	rcvd_bytes += size;
	rcvd_ssq += size * size;
	if (size > max_size) {
		max_size = size;
	}
	if (min_size == 0 || (min_size > 0 && size < min_size)) {
		min_size = size;
	}
	rcvd_bytes_type[type] += size;
	print size | sizes_pipe;
	received_type[type]++;
	if (delay > max_recv_delay[type]) {
		max_recv_delay[type] = delay;
	}
	recv_delay_sum[type] += delay;
	recv_delay_ssq[type] += delay * delay;
	next;
}
$5 == "Delivered" {
	delivered++;
	type = "";
	delay = 0;
	queued = 0;
	remote = 0;
	for (fn = 6; fn <= NF; fn++) {
		if (substr($fn, 1, 10) == "TRANSPORT:") {
			type = substr($fn, 11);
		}
		if (substr($fn, 1, 6) == "DELAY:") {
			queued = index($fn, "Q:");
			delay = substr($fn, (queued ? 9 : 7)) + 0;
			if (delay > 14 * 24 * 3600) {
				delay = 1;
			}
		}
	}
	delivered_type[type]++;
	if (queued) {
		if (delay > max_queued_delay[type]) {
			max_queued_delay[type] = delay;
		}
		queued_delay_sum[type] += delay;
		queued_delay_ssq[type] += delay * delay;
	} else {
		if (delay > max_delay[type]) {
			max_delay[type] = delay;
		}
		delay_sum[type] += delay;
		delay_ssq[type] += delay * delay;
	}
	next;
}
$5 == "Failed" {
	failed++;
	next;
}
$5 == "Returned" {
	returned++;
	next;
}
$5 == "Deferred" {
	deferred++;
	next;
}
$5 == "Completed." {
	completed++;
	next;
}
$5 == "Error" && $6 == "sent" {
	error_notifications_sent++;
	next;
}
$5 == "Sent" && $6 == "error" && $7 != "without" {
	errors_sent_to_postmaster++;
	next;
}
$5 == "Sent" && $6 == "error" && $7 == "without" {
	errors_sent_without_error++;
	next;
}
$5 == "notify:" && $6 == "freezing" && $8 == "double-bounce" {
	frozen_double_bounces++;
	next;
}
$5 == "opened" && $6 == "smtp" {
	succ_outbound_smtp++;
	next;
}
$5 == "opened" && $6 != "smtp" {
	succ_outbound_other++;
	next;
}
$4 == "accepted" && $5 == "connection" {
	accepted++;
	if ($6 > max_connections) {
		max_connections = $6;
	}
	next;
}
$4 == "remote" && $5 == "connection" {
	connections++;
	next;
}
$4 == "connection" && $8 == "deferred," {
	too_many_connections++;
	next;
}
$4 == "using" && $5 == "queue_only" {
	many_connections++;
	next;
}
$4 == "remote" && $5 == "DEFERRED:" && $0 ~ /not enough spool space/ {
	not_enough_spool_space++;
	next;
}
$4 == "remote" && $5 == "RSET:" {
	resets++;
	next;
}
$4 == "remote" && $5 != "MAIL" && $5 != "RCPT" && $(NF-4) == "refusing" {
	ignored_hello_reject_other++;
	next;
}
#
# WARNING: anything after field #10 can be off-by-one in older log files
# because of spaces appearing in the ident string or even hostname...
#
# Also note in older log files, or even in some bizzare cases current the log
# files, for any given refused connection there may be multiple nearly
# identical records matching the following rule, one for EHLO and then one or
# more for HELO.  Although these will logcially follow one another they may be
# interspersed by many other records and so we cannot easily keep track of
# them and collapse them into one count.
#
$0 ~ /client is using the wrong hostname!/ {
	sub(/\(client is using the wrong hostname\!\) /, "", $0);
	if ($4 == "remote" && ($5 == "HELO:" || $5 == "EHLO:")) {
		bad_client_names++;
	}
}
$4 == "remote" && ($5 == "HELO:" || $5 == "EHLO:") && $6 == "sent" {
	if ($0 ~ /SMTP greeting requires a hostname/) {
		hello_rejected_type["hostname parameter missing"]++;
	} else {
		if (DEBUG) {
			print $0;
		}
		hello_rejected_type["some other protocol error with HELO/EHLO"]++;
	}
}
$4 == "remote" && ($5 == "HELO:" || $5 == "EHLO:") && ($6 == "rejected:" || $6 == "refusing") {
	hello_refused++;
	if ($(NF-2) == "DNSBL:") {
		hello_rejected_type["DNSBL match"]++;
#		if (VERBOSE) {
			# ideally we should include the A RR value so that we
			# can see what DNSBL classifications work out to, but
			# then we'd also want to include sub-totals too.
#		}
		sub(/^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+\./, "", $13);
		dnsbl[$13]++;
	} else if ($0 ~ /matched in smtp_hello_reject_hostnames/) {
		hello_rejected_type["smtp_hello_reject_hostnames"]++;
	} else {
		match_other_greeting_errors(hello_rejected_type);
	}
	next;
}
$4 == "remote" && ($5 == "HELO:" || $5 == "EHLO:") && $6 == "warning:" {
	hello_warnings++;
	match_other_greeting_errors(hello_warning_type);
	next;
}
function match_other_greeting_errors(counters) {
	if (match_rfc1035_errors(counters)) {
		;	# increment done
	} else if ($(NF-2) == "matched" && $(NF-1) == "in") {
		counters[$(NF)]++;
	} else if ($0 ~ /a hostname must be fully qualified /) {
		counters["unqualified hostname"]++;
	} else if ($0 ~ /poss[i]*ble host address (IP )?literal given with invalid syntax/) {
		counters["likely invalid address literal syntax"]++;
	} else if ($0 ~ /possible hex or octal host address literal given with invalid syntax/) {
		counters["likely invalid address literal syntax"]++;
	} else if ($0 ~ /host address literal does not match remote client address/) {
		counters["invalid address literal"]++;
	} else if ($0 ~ /is a CNAME not a hostname/) {
		counters["hostname is a CNAME"]++;
	} else if ($0 ~ /there are no DNS A records for the (hostname|domain name)/) {
		counters["no DNS A records for hostname"]++;
	} else if ($0 ~ /no DNS A records for the hostname '.*' match the address of/) {
# that one is deprecated
		counters["no DNS A records match PTR address"]++;
	} else if ($0 ~ /no DNS A records for the hostname .* match the address/) {
# that one is an old variant of next one
		counters["no DNS A records match client address"]++;
	} else if ($0 ~ /the DNS A records? for the hostname .* match the address/) {
		counters["no DNS A records match client address"]++;
	} else if ($0 ~ /No matching address found at the hostname given by address literal.s reverse DNS PTR/) {
		counters["invalid PTR target hostname for literal"]++;
	} else if ($0 ~ /Fatal rDNS hostname verification error for/) {
		counters["invalid PTR target hostname"]++;
	} else if ($0 ~ /Hostname .* does not have any address matching/) {
		counters["spoofed or mis-configured reverse DNS"]++;
	} else if ($0 ~ /Unknown host \(No reverse DNS PTR for the remote address .* has a hostname matching/) {
		counters["no such host and no valid PTR"]++;
	} else if ($0 ~ /address literals are denied if there is no valid reverse DNS PTR/) {
		counters["no valid PTR for host address literal"]++;
	} else if ($0 ~ /None of the existing reverse DNS PTRs for (your remote|the) address/) {
# that must come after other more specific errors as it can be a secondary error
		counters["broken PTR or spoofed DNS"]++;
	} else if ($0 ~ /Remote address PTR lookup failed/) {
# that must come after other more specific PTR related errors
		counters["no PTR"]++;
	} else if ($0 ~ /No reverse DNS PTR for the remote address .* has a hostname matching/) {
		counters["no matching PTR"]++;
	} else if ($0 ~ /Temporary DNS problem encountered while trying to verify host/) {
		counters["temporary DNS problems"]++;
	} else if ($0 ~ /Temporary rDNS hostname lookup error/) {
		counters["temporary DNS problems"]++;
	} else if ($0 ~ /Temporary DNS problem encountered/) {
		counters["temporary DNS problems"]++;
	} else if ($0 ~ /DNS failure trying to verify host/) {
		counters["DNS failure"]++;
	} else if ($0 ~ /Unknown host/) {
# that must come after other more specific DNS errors
		counters["no such hostname"]++;
	} else if ($0 ~ /DNS error:/) {
# Note that one matches all other possible verify_host() failures caused by DNS
		if (DEBUG > 1) {
			print $0;
		}
		counters["DNS error while validating operand"]++;
	} else if ($0 ~ /operand(syntax is)? not strictly valid/) {
		counters["operand syntax not strictly valid (warning only)"];
	} else if ($7 == "invalid" && $8 == "operand:") {
# Note that matches _all_ other possible verify_host() failures not caught above
		if (DEBUG) {
			print $0;
		}
		counters["some other operand error"]++;
	} else {
		if (DEBUG) {
			print $0;
		}
		counters["some other unknown error"]++;
	}
}
$4 == "remote" && $5 == "MAIL" && $(NF-4) == "refusing" {
	ignored_hello_reject++;
	next;
}
$4 == "remote" && $5 == "MAIL" && $0 ~ /temporary error:/ {
	sender_deferrals++;
	next;
}
$4 == "remote" && $5 == "MAIL" && $(NF-4) != "refusing" {
	invalid_senders++;
	if ($0 ~ /[cC]ommand must be preceded by/) {
		early_mail++;
		invalid_senders--;
	} else if ($0 ~ /Sender already specified/) {
		sender_repeated++;
		invalid_senders--;
	} else if ($8 == "too" && $9 == "big") {
		invalid_sender_type["ignored ESMTP size option and tried too big"]++;
	} else if ($(NF-2) == "matched" && $(NF-1) == "in") {
		invalid_sender_type["matched in " $(NF)]++;
	} else if ($(NF-3) == "matched" && $(NF-2) == "RHSBL:") {
		invalid_sender_type["RH-DNSBL match"]++;
#		if (VERBOSE) {
			# ideally we should include the A RR value so that we
			# can see what DNSBL classifications work out to, but
			# then we'd also want to include sub-totals too.
#		}
		if ($7 ~ /^\x27</) {
			if ($(NF-4) !~ /:$/) {
				bl_offset = length($(NF-4)) + 2;
			} else {
				# some snapshots didn't include the target domain at $(NF-4)
				gsub(/\\s/, "", $7);
				split($7, addr, /@|<|>/);
				bl_offset = length(addr[3]) + 2;
			}
		} else {
			# old logs won't have all senders wrapped in angles,
			# but rather just single quotes, however single quote is
			# a valid local-part char so if it appears and we also
			# always split on it then our index is messed up so we
			# only do this if the angles are not there
			split($7, addr, /@|<|>|\x27/);
			bl_offset = length(addr[3]) + 2;
		}
		rhdnsbl[substr($(NF-1), bl_offset)]++;
	} else if ($0 ~ /via message submission/) {
		invalid_sender_type["null path via message submission rejected"]++;
	} else if ($0 ~ /operand(syntax is)? not strictly valid/) {
		# Note: we don't track this warning
		invalid_senders--;
	} else if ($0 ~ /rejecting bounce from/) {
		if ($0 ~ /via message submission/) {
			invalid_sender_type["SUBMISSION client bounce attempt"]++;
		} else {
			invalid_sender_type["MUA-only client bounce attempt"]++;
		}
	} else if ($0 ~ "not deliverable:") {
		set_non_deliverable_reason(invalid_sender_type);
		if (VERBOSE) {
			clients_with_bad_local_sender++;
			sub(/.*@/, "", $7);
			sub(/:$/, "", $7);
			client_with_bad_local_sender[$7]++;
		}
	} else {
		# this call must come last
		check_addr_operand_error(invalid_sender_type);
	}

	next;
}
$4 == "remote" && $5 == "RCPT" && $(NF-4) == "refusing" {
	ignored_hello_and_mail_reject++;
	next;
}
$4 == "remote" && $5 == "RCPT" && $0 ~ /temporary failure returned to/ {
	recipient_deferrals++;
	next;
}
$4 == "remote" && $5 == "RCPT" && $(NF-4) != "refusing" {
	invalid_recipients++;

	if ($0 ~ /[cC]ommand must be preceded by/) {
		early_rcpt++;
		invalid_recipients--;
	} else if ($0 ~ /matched in smtp_reject_sender_db:/) {
		# yes, recipients that cannot be senders are rejected as well
		# because we know already that they cannot be delivered to
		invalid_recip_type["matched in smtp_reject_sender_db"]++;
	} else if ($0 ~ /matched in smtp_sender_reject_hostnames:/) {
		# yes, recipients that cannot be senders are rejected as well
		# because we know already that they cannot be delivered to
		invalid_recip_type["matched in smtp_sender_reject_hostnames:"]++;
	} else if ($0 ~ /matched in smtp_sender_reject/) {
		# yes, recipients that cannot be senders are rejected as well
		# because we know already that they cannot be delivered to
		invalid_recip_type["matched in smtp_sender_reject:"]++;
	} else if ($0 ~ /security violation:/) {
		invalid_recip_type["unauthorised remote relay attempt"]++;
	} else if ($0 ~ /operand(syntax is)? not strictly valid/) {
		# Note: we don't track this warning
		invalid_recipients--;
	} else if ($0 ~ /not deliverable:/) {
		set_non_deliverable_reason(invalid_recip_type);
	} else {
		# this call must come last
		check_addr_operand_error(invalid_recip_type);
	}
	next;
}
function set_non_deliverable_reason(counters) {
	reason = $0;
	if (reason ~ /^.* \(ERR193\) address .* failed: /) {
		reason = "(ERR193) locally failed by error director";
	} else if (reason ~ /^.* \(ERR193\) address .* deferred: /) {
		reason = "(ERR193) locally deferred by error director";
	} else if (reason ~ /^.* \(ERR164\) /) {
		reason = "(ERR164) DNS server failure resolving domain";
	} else if (reason ~ /^.* \(ERR165\) /) {
		reason = "(ERR165) DNS server returned bad packet";
	} else if (reason ~ /^.* \(ERR169\) /) {
		reason = "(ERR169) domain points to this host";
	} else if (reason ~ /^.* \(ERR177\) /) {
		reason = "(ERR177) CNAME loop";
	} else {
		# XXX this can still be a big, often unique
		# per transaction, string
		sub(/^.* not deliverable: /, "", reason);
	}
	counters[reason]++;
}
function check_addr_operand_error(counters) {
	if (match_rfc1035_errors(counters)) {
		;	# increment done
	} else if ($0 ~ /syntax error: too many parameters/) {
		counters["syntax error: too many parameters"]++;
	} else if ($0 ~ /syntax error in parameters: ESMTP options may have been given without using EHLO/) {
		counters["syntax error: ESMTP options without EHLO"]++;
	} else if ($0 ~ /syntax error in parameter: closing angle bracket missing/) {
		counters["closing angle bracket missing"]++;
	} else if ($0 ~ /syntax error in parameters: unknown ESMTP option/) {
		counters["unknown ESMTP option given"]++;
	} else if ($0 ~ /is a local mailbox, missing domain part/) {
		counters["address missing domain part"]++;
	} else if ($0 ~ /invalid quoted mailbox local part/) {
		counters["invalid quoted mailbox"]++;
	} else if ($0 ~ /invalid character in unquoted mailbox local part/) {
		counters["invalid character in mailbox"]++;
	} else if ($0 ~ /mailbox is not a valid RFC-(821|2822) local-part/) {
		counters["invalid mailbox syntax"]++;
	} else if ($0 ~ /DNS failed:.*has an unauthorised address of/) {
		counters["target of an MX host matched in smtp_bad_mx_targets"]++;
	} else if ($0 ~ /is not a valid domain \(no MX record\)/) {
		counters["domain does not have any MX record"]++;
	} else if ($0 ~ /is not a valid domain/) {
		counters["domain does not exist"]++;
	} else if ($0 ~ /DNS failed:/) {
		counters["DNS verification of domain failed"]++;
	} else if ($0 ~ /DNS error:/) {
		counters["DNS error validating domain"]++;
	} else if ($0 ~ /is a [^ ]+\([0-9]+\) address/) {
		counters["invalid mailbox form"]++;
	} else if ($0 ~ /null address/) {
		counters["invalid null address"]++;
	} else if ($0 ~ /address parse error/) {
		if (DEBUG > 1) {
			print $0;
		}
		counters["address parse error"]++;
	} else if ($0 ~ /address syntax error/) {
		if (DEBUG > 1) {
			print $0;
		}
		counters["address syntax error"]++;
	} else if ($0 ~ /syntax error:/) {
		if (DEBUG > 1) {
			print $0;
		}
		counters["other syntax error"]++;
	} else {
		if (DEBUG) {
			print $0;
		}
		counters["error or warning parsing address"]++;
	}
}
function match_rfc1035_errors(counters) {
	if ($0 ~ /domain name cannot begin with /) {
# old form of next one
		counters["domain begins with invalid character"]++;
		return 1;
	} else if ($0 ~ /(a domain |host)name must NOT (begin|start) with a /) {
		counters["domain with invalid leading character"]++;
		return 1;
	} else if ($0 ~ /domain name cannot end with /) {
# old form of next one
		counters["domain ends with invalid character"]++;
		return 1;
	} else if ($0 ~ /(a domain |host)name must NOT end with a /) {
		counters["domain with invalid trailing character"]++;
		return 1;
	} else if ($0 ~ /invalid character (found )?in (hostname|domain name)/) {
		counters["invalid character in domain"]++;
		return 1;
	} else if ($0 ~ /underscore found in domain name/) {
		counters["invalid underscore character in domain"]++;
		return 1;
	} else if ($0 ~ /domain name cannot be a bare IP address/) {
# old form of next one
		counters["likely invalid address literal syntax"]++;
		return 1;
	} else if ($0 ~ /poss[i]*ble host address (IP )?literal given with invalid syntax/) {
		counters["likely invalid address literal syntax"]++;
		return 1;
	} else if ($0 ~ /possible hex or octal host address literal given with invalid syntax/) {
		counters["likely invalid address literal syntax"]++;
		return 1;
	}
	return 0;
}
$5 == "remote" && $6 == "DATA:" && $7 == "message" && $8 == "too" && $9 == "big" {
	sent_too_much_data++;
	next;
}
$5 == "remote" && $6 == "DATA:" && $7 == "failed" && $8 == "to" && $9 == "queue" {
	unexpected_error_during_data++;
	next;
}
# note will never have message-ID in log -- connection was refused.
$4 == "remote" && $5 == "DATA:" && $6 == "error" && $7 == "ignored," {
	data_error_ignored++;
	next;
}
$5 == "remote" && $6 == "DATA:" && $7 == "refused" && $NF == "rejected." {
	# XXX should we classify and keep track of each reason?
	data_rejected++;
	next;
}
$4 == "remote" && $5 == "DATA:" && $6 == "sent" {
	if ($0 ~ /[cC]ommand must be preceded by/) {
		early_data++;
	} else {
		if (DEBUG) {
			print $0;
		}
		error_data++;
	}
}
$4 == "remote" && $5 == "ERROR:" && $6 == "input" && $7 == "received" && $8 == "before" && $9 == "greeting" {
	slammers++;
	next;
}
$4 == "remote" && $5 == "ERROR:" && $6 == "command" && $7 == "too" && $8 == "long" {
	command_too_long_errors++;
	next;
}
$5 == "remote" && $6 == "ERROR:" && $7 == "command" && $8 == "too" && $9 == "long" {
	command_too_long_errors++;
	next;
}
$4 == "remote" && $5 == "ERROR:" && $6 == "invalid" && $7 == "characters" {
	invalid_char_errors++;
	next;
}
$5 == "remote" && $6 == "ERROR:" && $7 == "invalid" && $8 == "characters" {
	invalid_char_errors++;
	next;
}
#
# NOTE: everything after here is intended only to match those records which
# don't get matched above.
#
# without Message-ID
ERRORS && $4 == "DNS" && $5 == "MX" && $6 == "error:" {
	gsub(/'|'/, "", $9); # note dup quote is for emacs syntax closure
	gsub(/'|'/, "", $14);
	sub(/\).*$/, "", $21);
	gsub(/'|'/, "", $21);
	printf("%-26s %-26s %s\n", $9, $14, $21) >> mxerr_file;
}
# with Message-ID
ERRORS && $5 == "DNS" && $6 == "MX" && $7 == "error:" {
	gsub(/'|'/, "", $10); # note dup quote is for emacs syntax closure
	gsub(/'|'/, "", $15);
	sub(/\).*$/, "", $22);
	gsub(/'|'/, "", $22);
	printf("%-26s %-26s %s\n", $10, $15, $22) >> mxerr_file;
}
ERRORS && $13 == "DNS" && $14 == "error:" && $15 == "MX" {
	gsub(/'|'|:/, "", $9); # note dup quote is for emacs syntax closure
	gsub(/'|'/, "", $18);
	printf("%-26s %-26s %s\n", $9, $18, "(bogusmx)") >> mxerr_file;
}
ERRORS && $5 == "Failed" && $10 == "DNS" && $11 == "error:" && $12 == "MX" && $(NF) == "CNAME." {
	gsub(/'|'/, "", $14); # note dup quote is for emacs syntax closure
	gsub(/'|'/, "", $19);
	printf("%-26s %-26s %s\n", $14, $19, "(unknown)") >> mxerr_file;
}
ERRORS && $5 == "Failed" && $10 == "DNS" && $11 == "error:" && $12 == "No" {
	sub(/^\(/, "", $19);
	sub(/\)$/, "", $19);
	gsub(/'|'/, "", $(NF)); # note dup quote is for emacs syntax closure
	printf("%-26s %-26s %s\n", $19, $(NF), "(unknown)") >> mxerr_file;
}
ERRORS && NF >= 10 && $(NF-10) == "security" && $(NF-9) == "violation:" {
	gsub(/'|'/, "", $(NF-6)); # note dup quote is for emacs syntax closure
	printf("%s\n", $(NF-6)) >> relayerr_file;
}
END {
	close(sizes_pipe);

	# delete array initialisation elements
	delete delivered_type[""];
	delete dnsbl[""];
	delete rcvd_bytes_type[""];
	delete received_type[""];
	delete hello_rejected_type[""];
	delete hello_warning_type[""];
	delete invalid_recip_type[""];
	delete invalid_sender_type[""];
	delete client_with_bad_local_sender[""];
	delete delay_sum[""];
	delete queued_delay_sum[""];
	delete recv_delay_sum[""];
	delete max_delay[""];
	delete max_queued_delay[""];
	delete max_recv_delay[""];

	printf("First record timestamp: %s\n", start_time);
	# clobber the trailing colon now instead of on every assignment above
	sub(/:$/, "", last_time);
	printf("Last record timestamp:  %s\n", last_time);
	print "";
	printf("%14d  Total log entries processed.\n", NR);
	print "";
	printf("%14d  Total incoming SMTP connections.\n", connections);
	print "";
	if (max_connections) {
		printf("%14d  Maximum simultaneous incoming SMTP connections.\n", max_connections);
		print "";
	}
	if (too_many_connections) {
		printf("%14d  Total SMTP connections deferred due to smtp_accept_max.\n", too_many_connections);
		print "";
	}
	if (many_connections) {
		printf("%14d  Total SMTP connections with queue_only (smtp_accept_queue).\n", many_connections);
		print "";
	}
	if (not_enough_spool_space) {
		printf("%14d  Total SMTP connections deferred due to lack of spool space.\n", not_enough_spool_space);
		print "";
	}
	if (hello_refused > 0) {
		for (type in hello_rejected_type) {
			printf("%14d  connections rejected by %s\n", hello_rejected_type[type], type);
		}
		printf("%14s\n", substr("================", 1, length(sprintf("%d", hello_refused))));
		printf("%14d  Total SMTP connections rejected (perhaps after EHLO and HELO).\n", hello_refused);
		print "";
	}
	if (hello_warnings > 0) {
		for (type in hello_warning_type) {
			printf("%14d  possible connection problem: %s\n", hello_warning_type[type], type);
		}
		printf("%14s\n", substr("================", 1, length(sprintf("%d", hello_warnings))));
		printf("%14d  Total connection warnings.\n", hello_warnings);
		print "";
	}
	if (bad_client_names > 0) {
		printf("%14d  Connections allowed with bad client names.\n", bad_client_names);
		print "";
	}
	if (resets > 0) {
		printf("%14d  Total RSET commands.\n", resets);
	}
	if (ignored_hello_reject > 0) {
		printf("%14d  Total MAIL commands after connection refused.\n", ignored_hello_reject);
	}
	if (ignored_hello_and_mail_reject > 0) {
		printf("%14d  Total RCPT commands after connection refused.\n", ignored_hello_and_mail_reject);
	}
	if (ignored_hello_reject_other > 0) {
		printf("%14d  Total other commands after connection refused.\n", ignored_hello_reject_other);
	}
	if (sender_before_hello > 0) {
		printf("%14d  Total MAIL commands before successful greeting.\n", early_mail);
	}
	if (early_rcpt > 0) {
		printf("%14d  Total RCPT commands before successful greeting or MAIL.\n", early_rcpt);
	}
	if (early_data > 0) {
		printf("%14d  Total DATA commands before successful RCPT command.\n", early_data);
	}
	if (error_data > 0) {
		printf("%14d  Total DATA commands with other error.\n", error_data);
	}
	if (data_rejected > 0) {
		printf("%14d  Total messages rejected after DATA by content filtering.\n", data_rejected);
	}
	if (sent_too_much_data > 0) {
		printf("%14d  Total messages rejected due to being too big.\n", sent_too_much_data);
	}
	if (unexpected_error_during_data > 0) {
		printf("%14d  Total messages failed due to unexpected error during DATA.\n", unexpected_error_during_data);
	}
	if (data_error_ignored > 0) {
		printf("%14d  Total refused connections dropped forcefully after DATA.\n", data_error_ignored);
	}
	if (command_too_long_errors > 0) {
		printf("%14d  Total connections dropped forcefully after over-sized command.\n", command_too_long_errors);
	}
	if (invalid_char_errors > 0) {
		printf("%14d  Total connections dropped forcefully after invalid characters.\n", invalid_char_errors);
	}
	if (slammers > 0) {
		printf("%14d  Total connections dropped forcefully from slammers.\n", slammers);
	}
	if (other_before_hello > 0) {
		printf("%14d  Total other commands out of order.\n", other_before_hello);
	}
	if (sender_deferrals > 0) {
		printf("%14d  Total sender deferrals.\n", sender_deferrals);
	}
	if (recipient_deferrals > 0) {
		printf("%14d  Total recipient deferrals.\n", recipient_deferrals);
	}
	if (invalid_senders > 0) {
		print "";
		for (type in invalid_sender_type) {
			printf("%14d  invalid sender: %s\n", invalid_sender_type[type], type);
		}
		printf("%14s\n", substr("================", 1, length(sprintf("%d", invalid_senders))));
		printf("%14d  Total invalid sender addresses.\n", invalid_senders);
		if (VERBOSE && clients_with_bad_local_sender > 0) {
			print "";
			for (client in client_with_bad_local_sender) {
				printf("%14d  bad local sender attempts by client: %s\n", client_with_bad_local_sender[client], client);
			}
		}
	}
	if (invalid_recipients > 0) {
		print "";
		for (type in invalid_recip_type) {
			printf("%14d  invalid recipient: %s\n", invalid_recip_type[type], type);
		}
		printf("%14s\n", substr("================", 1, length(sprintf("%d", invalid_recipients))));
		printf("%14d  Total invalid recipient attempts.\n", invalid_recipients);
	}
	if (received > 0) {
		print "";
		for (type in received_type) {
			printf("%14d  messages received by %s\n", received_type[type], type);
			if (received_type[type] > 0 && max_recv_delay[type] > 0) {
				max_recv_delay_hrs = max_recv_delay[type] / 3600;
				max_recv_delay_sec = max_recv_delay[type] % 3600;
				printf("%14s  maximum arrival delay = %d:%02d:%02d\n", "",
				       max_recv_delay_hrs, max_recv_delay_sec / 60, max_recv_delay_sec % 60);
			}
			if (received_type[type] > 1 && recv_delay_sum[type] > 0) {
				recv_delay_mean = 0.0;
				recv_delay_stddev = 0.0;
				recv_delay_mean = recv_delay_sum[type] / received_type[type];
				recv_delay_variance = ((recv_delay_ssq[type] / received_type[type]) - (recv_delay_mean * recv_delay_mean)) * received_type[type] / (received_type[type] - 1);
				recv_delay_stddev = sqrt(recv_delay_variance);
				recv_delay_mean_hrs = recv_delay_mean / 3600;
				recv_delay_mean_sec = recv_delay_mean % 3600;
				recv_delay_stddev_hrs = recv_delay_stddev / 3600;
				recv_delay_stddev_sec = recv_delay_stddev % 3600;
				printf("%14s  arrival delay: mean = %d:%02d:%02d, stddev = %d:%02d:%02d\n", "",
				       recv_delay_mean_hrs, recv_delay_mean_sec / 60, recv_delay_mean_sec % 60,
				       recv_delay_stddev_hrs, recv_delay_stddev_sec / 60, recv_delay_stddev_sec %60);
				recv_delay_nfp = recv_delay_mean + (2 * recv_delay_stddev);
				if (recv_delay_nfp <= max_recv_delay[type]) {
					recv_delay_nfp_hrs = recv_delay_nfp / 3600;
					recv_delay_nfp_sec = recv_delay_nfp % 3600;
					printf("%14s  approximated 95th Percentile = %d:%02d:%02d\n", "",
					       recv_delay_nfp_hrs, recv_delay_nfp_sec / 60, recv_delay_nfp_sec % 60);
				}
			}
		}
		printf("%14s\n", substr("================", 1, length(sprintf("%d", received))));
	}
	printf("%14d  Total messages received.\n", received);
	print "";
	if (rcvd_bytes > 0) {
		for (type in rcvd_bytes_type) {
			printf("%14d  bytes (%d msgs) received by %s\n", rcvd_bytes_type[type], received_type[type], type);
		}
		printf("%14s\n", substr("================", 1, length(sprintf("%d", rcvd_bytes))));
	}
	# Too bad we don't have a humanize_number() or even a standard printf()
	# for formatting using the localeconv() rules....
	printf("%14d  Total bytes received.\n", rcvd_bytes);
	if (received >= 1 && rcvd_bytes > 0) {
		printf("%14s   min_size = %d, max_size = %d\n", "", min_size, max_size);
	}
	if (received > 1 && rcvd_bytes > 0) {
		rcvd_mean = 0.0;
		rcvd_stddev = 0.0;
		rcvd_mean = rcvd_bytes / received;
		#
		# one of these forms is less likely to overflow -- probably the
		# second one
		#
###		rcvd_variance = ((received * rcvd_ssq) - (rcvd_bytes * rcvd_bytes)) / (received * (received - 1));
		rcvd_variance = ((rcvd_ssq / received) - (rcvd_mean * rcvd_mean)) * received / (received - 1);
		rcvd_stddev = sqrt(rcvd_variance);
		printf("%14s   mean = %d, stddev = %d\n", "", rcvd_mean, rcvd_stddev);
		#
		# Note that we cannot reliably calculate the true Nth Percentile
		# without sorting the entire data set.
		#
		# The standard deviation (SD) quantifies variability.  IFF the
		# data follow a bell-shaped Gaussian distribution, then 68.3% of
		# the values lie within one SD of the mean (on either side) and
		# 95.4% of the values lie within (2 * SD) of the mean.
		#
		# So, we do a simple sanity check to make sure the calculated
		# value fits within the range of the actual values.  Maybe this
		# can never happen, but I'm too tired to prove it
		# mathematically and it's trivial to implement the test.  :-)
		#
		rcvd_nfp = rcvd_mean + (2 * rcvd_stddev);
		if (rcvd_nfp <= max_size) {
			printf("%14s   approximated 95th Percentile = %d\n", "", rcvd_nfp);
		}
		printf("%14s   actual 95th Percentile = ", "");
		ninetyfifth = int(received * 0.95);
		system("sed -n " ninetyfifth "p " sizes_file);
		printf("%14s   actual 99th Percentile = ", "");
		ninetynineth = int(received * 0.99);
		system("sed -n " ninetynineth "p " sizes_file);
	}
	print "";
	if (succ_outbound_smtp > 0) {
		printf("%14d  Successful outbound SMTP connections!\n", succ_outbound_smtp);
	}
	if (succ_outbound_other > 0) {
		printf("%14d  Successful outbound 'other' connections!\n", succ_outbound_other);
	}
	print "";
	if (delivered > 0) {
		for (type in delivered_type) {
			printf("%14d  messages delivered by transport: %s\n", delivered_type[type], type);
			if (delivered_type[type] >= 1 && max_delay[type] > 0) {
				# we may need to count days here
				max_delay_hrs = max_delay[type] / 3600;
				max_delay_sec = max_delay[type] % 3600;
				printf("%14s  maximum normal delay = %d:%02d:%02d\n", "",
				       max_delay_hrs, max_delay_sec / 60, max_delay_sec %60);
			}
			if (delivered_type[type] > 1 && delay_sum[type] > 0) {
				delay_mean = 0.0;
				delay_stddev = 0.0;
				delay_mean = delay_sum[type] / delivered_type[type];
				delay_variance = ((delay_ssq[type] / delivered_type[type]) - (delay_mean * delay_mean)) * delivered_type[type] / (delivered_type[type] - 1);
				delay_stddev = sqrt(delay_variance);
				delay_mean_hrs = delay_mean / 3600;
				delay_mean_sec = delay_mean % 3600;
				delay_stddev_hrs = delay_stddev / 3600;
				delay_stddev_sec = delay_stddev % 3600;
				printf("%14s  normal delay: mean = %d:%02d:%02d, stddev = %d:%02d:%02d\n", "",
				       delay_mean_hrs, delay_mean_sec / 60, delay_mean_sec % 60,
				       delay_stddev_hrs, delay_stddev_sec / 60, delay_stddev_sec %60);
				delay_nfp = delay_mean + (2 * delay_stddev);
				if (delay_nfp <= max_delay[type]) {
					delay_nfp_hrs = delay_nfp / 3600;
					delay_nfp_sec = delay_nfp % 3600;
					printf("%14s  normal approximated 95th Percentile = %d:%02d:%02d\n", "",
					       delay_nfp_hrs, delay_nfp_sec / 60, delay_nfp_sec % 60);
				}
			}
			if (delivered_type[type] >= 1 && max_queued_delay[type] > 0) {
				# we will likely need to count days here....
				max_queued_delay_hrs = max_queued_delay[type] / 3600;
				max_queued_delay_sec = max_queued_delay[type] % 3600;
				printf("%14s  maximum queued delay = %d:%02d:%02d\n", "",
				       max_queued_delay_hrs, max_queued_delay_sec / 60, max_queued_delay_sec %60);
			}
			if (delivered_type[type] > 1 && queued_delay_sum[type] > 0) {
				queued_delay_mean = 0.0;
				queued_delay_stddev = 0.0;
				queued_delay_mean = queued_delay_sum[type] / delivered_type[type];
				queued_delay_variance = ((queued_delay_ssq[type] / delivered_type[type]) - (queued_delay_mean * queued_delay_mean)) * delivered_type[type] / (delivered_type[type] - 1);
				queued_delay_stddev = sqrt(queued_delay_variance);
				queued_delay_mean_hrs = queued_delay_mean / 3600;
				queued_delay_mean_sec = queued_delay_mean % 3600;
				queued_delay_stddev_hrs = queued_delay_stddev / 3600;
				queued_delay_stddev_sec = queued_delay_stddev % 3600;
				printf("%14s  queued delay: mean = %d:%02d:%02d, stddev = %d:%02d:%02d\n", "",
				       queued_delay_mean_hrs, queued_delay_mean_sec / 60, queued_delay_mean_sec % 60,
				       queued_delay_stddev_hrs, queued_delay_stddev_sec / 60, queued_delay_stddev_sec %60);
				queued_delay_nfp = queued_delay_mean + (2 * queued_delay_stddev);
				if (queued_delay_nfp <= max_queued_delay[type]) {
					queued_delay_nfp_hrs = queued_delay_nfp / 3600;
					queued_delay_nfp_sec = queued_delay_nfp % 3600;
					printf("%14s  queued approximated 95th Percentile = %d:%02d:%02d\n", "",
					       queued_delay_nfp_hrs, queued_delay_nfp_sec / 60, queued_delay_nfp_sec % 60);
				}
			}
		}
		printf("%14s\n", substr("================", 1, length(sprintf("%d", delivered))));
	}
	printf("%14d  Total messages delivered.\n", delivered);
	print "";
	printf("%14d  Total delivery attempts deferred for retry.\n", deferred);
	print "";
	printf("%14d  Total message deliveries failed.\n", failed);
	print "";
	printf("%14d  Total messages returned.\n", returned);
	printf("%14d  Total bounces frozen to the error queue.\n", frozen_double_bounces);
	if (errors_notifications_sent > 0) {
		printf("%14d  Total error notifications sent to owners.\n", error_notifications_sent);
	}
	if (errors_sent_to_postmaster > 0) {
		printf("%14d  Total error notifications CCed to postmaster.\n", errors_sent_to_postmaster);
	}
	if (errors_sent_without_error > 0) {
		printf("%14d  Total errors sent without error info!\n", errors_sent_without_error);
	}
	print "";
	printf("%14d  Total messages `completed'.\n", completed);
	print "";
	saw_dnsbl = 0;
	for (domain in dnsbl) {
		printf("%14d  matches in DNSBL %s\n", dnsbl[domain], domain) | "sort -rn";
		saw_dnsbl += 1;
	}
	close("sort -rn");
	print "";
	saw_rhdnsbl = 0;
	for (domain in rhdnsbl) {
		printf("%14d  matches in RH-DNSBL %s\n", rhdnsbl[domain], domain) | "sort -rn";
		saw_rhdnsbl += 1;
	}
	close("sort -rn");
	if (VERBOSE && (saw_dnsbl > 1 || saw_rhdnsbl > 1)) {
		print "";
		print "NOTE: the order of DNSBL statistics reflects only their order of use and";
		print "should not be construed to have anything to do with their effectiveness or";
		print "usefulness.  DNSBL lookups stop on first match.  You will have to re-query";
		print "all addresses from your logs in all DNSBLs in order to determine their";
		print "relative effectiveness and to determine optimal ordering for efficiency";
	}
}

# Local Variables:
# c-file-style: "personal-awk"
# End: