#!/usr/bin/env python
# regm version 1.0 15/07/01
# (C) Jerome Dumonteil <jerome.dumonteil@linuxfr.org> 2001
# You're welcome to redistribute this software under the
# terms of the GNU General Public Licence version 2.0
# or, at your option, any higher version.
#
# You can read the complete GNU GPL in the file COPYING
# which should come along with this software, or visit
# the Free Software Foundation's WEB site http://www.fsf.org

# need python 2.0
# need code cleaning, classes
# new version (if any) may be there : http://perso.linuxfr.org/jdumont/regm

import os
import sys
import getopt
import rfc822
import mailbox
import glob
import re
import string
import tempfile
import time

version="regm version 1.0 15/07/01"
usage="""
Filter and extract messages from mailboxes with regular expressions.

installation:
        - change defaults at the begining of the script to fit your 
          needs, it's GPL code.
        - put it with your python scripts (need python 2.0) and do a 
          ln -s regm.py ~/bin/regm
bugs:
- parse encoded attachment
- only tested on Linux
- not usable as a library

usage:
regm [options] [search string] [input files]

exemples of use:
regm hello ~/mail/* > ~/tmp/mbox
regm -v -f ~/tmp/mbox -b hello -o -b bye ~/mail/*
regm -vx -f ~/tmp/mbox 'hello||bye'  ~/mail/*
regm -v -f ~/tmp/mbox -n -b hello ~/mail/*
regm -v -f ~/tmp/mbox -h '^to:.*@free.fr' ~/mail/*
regm -vx 't:@free.fr' > ~/tmp/mbox
regm -vN -f ~/tmp/mbox -h '^to:.*@free.fr' ~/mail/*
regm -xm 'f:joe&&hello&&!t:@free.fr'
regm -p '%40d  sub: %s\\n %5B\\n\\n' "" ~/mail/inbox
regm -v -p '%a\\n\\n' '' ~/news/fr.misc.bavardages.dinosaures
regm -p '%D %T\\n' "" ~/mail/sent
regm -U '' duplicate.mbox > clean.mbox

Default file path for mbox is ~/mail/*

All research string are regex, in expert mode (-x) you can use "||" 
"&&" and "!" operator in the same string as separator between different
regex.

Each output message is absolutely left unmodified by the filter.

options:
-h string       search string in message header
-b string       search string in message body
-n              negation of the following -h or -b 
-N              global negation (invert the filter output)
-o              "or" (between -h or -b). Default is a "and"
-f file         output file. there is a warning if output file exists.
                Default output is stdout.
-x              xpert mode
                no -h or -b option, use of && || !, the search string
                must be after all options (and before input path)
                's:' is '^Subject: .*'
                'd:' is '^Date: .*'
                'f:' is '^From: .*'
                'e:' is '^Sender: .*'
                't:' is '^To: .*'
                'c:' is '^Cc: .*'
                'r:' is '^Reply-To: .*'
                'i:' is '^Message-ID: .*'
                'g:' is '^References: .*'
                'a:' is '^Approved: .*'
                'x:' is '^X-Loop: .*'
                'n:' is '^newsgroups: .*'
                'h:' is equivalent to the -h option, default is 
                        searching in body
                
-u              case sensitive. Default is case insensitive
-p string       output format, syntax : \\n \\t and 
                %[number][sdfetcrixnagBDFETCR] for subject, date, from,... 
                B is body, other uppercase are for stripped mail 
                addresses and D for date with "%D" format
-m              direct launch of mutt on the result (using temp output
                file)
-U              discard duplicate messages (with same message-id)
-D string       change output date format of -p "%D" option
-q              quiet
-v              verbosity
--help          this help
--version       version
"""

# default mailbox(es) to search when no input file is given on the
# command line (glob pattern, "~" is expanded)
def_mbox="~/mail/*"
# directory tried first for the temporary output file created by -m
def_tmpdir="~/tmp"
date_format="%D" # strftime format of the "%D" output field (man date); -D overrides it
na="(na)" # not available: placeholder used when a header is missing

# this will be used with the -m option: the output file name is appended
# to this command line
mua="mutt -R -f "

# output format letter (-p option) -> name of the header it prints
whath={'s':"Subject",'d':"Date",'D':"Date",'f':"From",'F':"From",'e':"Sender",
'E':"Sender",'t':"To",'T':"To",'c':"Cc",'C':"Cc",'r':"Reply-To",'R':"Reply-To",
'i':"Message-ID",'x':"X-Loop",'n':"newsgroups",'a':"Approved",'g':"References"}

# expert mode (-x) prefix letter -> regex prefix put in front of the
# user's pattern; 'h' adds nothing, the pattern is then searched as-is
# in the header lines
xpaccel={'s':'^Subject: .*','d':'^Date: .*','f':'^From: .*','e':'^Sender: .*',
't':'^To: .*','c':'^Cc: .*','r':'^Reply-To: .*','i':'^Message-ID: .*',
'g':'^References: .*','a':'^Approved: .*','x':'^X-Loop: .*',
'n':'^newsgroups: .*','h':''}

# finds a %[width]letter field in a -p format string:
# group 1 = text before the field, group 2 = optional width,
# group 3 = field letter, group 4 = everything after the field
rfil=re.compile(r'(^.*?)%([0-9]*)([sdfetcrixnBagDFETCR])(.*)$',re.L|re.M|re.S)
# a backslash followed by '%' (escaped percent sign in a -p format)
rpif=re.compile(r'\\%',re.M)
# backslash, '%', '_': the marker outformat() substitutes for an escaped
# percent sign so that rfil does not take it for a field
rpaf=re.compile(r'\\%_',re.M)

def err(a):
	"""Write one line on stderr.

	A plain string is written as it is, any other object is written
	as its repr().
	"""
	text=a
	if type(a)!=type(""):
		text=repr(a)
	sys.stderr.write("%s\n"%text)

def outformat(mess,body,sf):
	"""Render one message according to the -p format string.

	mess -- the message; its getheader(), getaddrlist(), getdate_tz(),
	        rewindbody() methods and its fp attribute are used
	body -- unused, kept so that existing callers keep working (the body
	        is read again from mess.fp)
	sf   -- the format string: backslash-n and backslash-t are expanded,
	        backslash-percent gives a literal percent sign, and each
	        %[width]letter field is replaced:
	          B           body (width = maximum number of lines)
	          F E T C R   bare mail addresses of the header
	          D           date formatted with date_format
	          other       raw value of the header given by whath
	        For every field but B, width truncates and left-justifies
	        the value.

	Returns the formatted string; na stands for any missing value.
	"""
	# str.replace expands every occurrence (re.sub was given re.M as its
	# "count" argument, which limited the expansion to 8 occurrences)
	p=sf.replace('\\n','\n')
	p=p.replace('\\t','\t')
	# hide escaped percent signs from rfil until the fields are expanded
	p=rpif.sub(r'\\%_',p)
	pdeb=""
	while 1:
		ch=rfil.search(p)
		if not ch:
			break
		code=ch.group(3)
		if ch.group(2):
			# int(), not eval(): "010" means ten, not octal eight
			limit=int(ch.group(2))
		else:
			limit=None
		if code=="B":
			mess.rewindbody()
			if limit is None:
				repstr=mess.fp.read()
			else:
				lines=[]
				while len(lines)<limit:
					line=mess.fp.readline()
					if not line: # end of the message
						break
					lines.append(line)
				repstr="".join(lines)
		else:
			if code in 'FETCR':
				repstr=" ".join([x[1] for x in mess.getaddrlist(whath[code])])
				if repstr=='':
					repstr=na
			elif code=='D': # date with custom format
				repstr=na
				date=mess.getdate_tz('Date')
				if date is not None:
					try:
						date=time.localtime(rfc822.mktime_tz(date))
						repstr=time.strftime(date_format,date)
					except (TypeError,ValueError,OverflowError):
						# date the platform cannot convert or format
						repstr=na
			else:
				repstr=mess.getheader(whath[code],na)
			if limit is not None:
				repstr=("%%-%ds"%limit)%repstr[:limit]
		# only the literal text of the format is un-escaped, the text
		# taken from the message is copied untouched
		pdeb=pdeb+rpaf.sub('%',ch.group(1))+repstr
		p=ch.group(4)
	return pdeb+rpaf.sub('%',p)

def isunic(m,verb):
	"""Tell whether message m has not been seen yet (-U option).

	m    -- the message, only its Message-ID header is read
	verb -- verbosity level used for the diagnostics written with err()

	Returns 1 when the message must be kept: it has no Message-ID, or
	its Message-ID is met for the first time (it is then recorded in
	the global list unic). Returns 0 for a duplicate.
	"""
	mid=m.getheader("Message-ID",na)
	if mid==na:
		# no id to compare: never considered a duplicate
		if verb>1: # was testing the global "verbose" instead of verb
			err("Message-ID %s"%mid)
		return 1
	if mid in unic:
		if verb>0:
			err("drop %s"%mid)
		return 0
	unic.append(mid)
	return 1

# split the command line into (option, value) pairs and plain arguments
try:
	optlist, args = getopt.getopt(
		sys.argv[1:],
		'h:b:nNof:xup:mUD:qv',
		['help','version'])
except getopt.GetoptError:
	err('bad option')
	err(sys.exc_info()[1])
	sys.exit(1)


# option flags, all off by default (should be a dictionary)
f_case=0
f_xpert=0
f_date=0
f_outfile=0
verbose=0
quiet=0
f_help=0
f_version=0
f_glob_not=0
f_not=0
f_print=0
f_mua=0
erase_after=0
f_unic=0
warning=1 # used in option parsing

# reglist item = ( [(0|1,"body regex"),...], [(0|1,"header regex"),...] )
# each item is one group of "and"ed expressions, items are "or"ed;
# the 0|1 is the negation flag of the expression
reglist=[]
regbodycurrent=[]
regheadercurrent=[]
regflag=0
lastone=('','')

# walk through the options in command-line order: a -n only applies to
# the -b or -h that immediately follows it, and -o closes the current
# group of "and"ed expressions
for t in optlist:
	opt,val=t
	if opt=='-b' or opt=='-h':
		if opt=='-b':
			regbodycurrent.append((f_not,val))
		else:
			regheadercurrent.append((f_not,val))
		f_not=0 # the pending negation, if any, is consumed
	elif opt=='-n':
		f_not=1
	elif opt=='-N':
		f_glob_not=1
	elif opt=='-o':
		if regbodycurrent or regheadercurrent:
			reglist.append((regbodycurrent,regheadercurrent))
			regbodycurrent=[]
			regheadercurrent=[]
		elif warning:
			err("warning: nothing before -o ?")
	elif opt=='--help':
		f_help=1
	elif opt=='--version':
		f_version=1
	elif opt=='-u':
		f_case=1
	elif opt=='-U':
		f_unic=1
		unic=[]
	elif opt=='-p':
		f_print=1
		print_format=val
	elif opt=='-m':
		f_mua=1
	elif opt=='-x':
		f_xpert=1
	elif opt=="-D":
		date_format=val
	elif opt=="-f":
		f_outfile=1
		outfile=os.path.expanduser(val)
	elif opt=="-v":
		verbose += 1
	elif opt=="-q":
		quiet=1

	# the previous option was a -n and this one did not consume it
	if f_not and lastone[0]=='-n':
		f_not=0
		if warning:
			err("warning: -n not used ?")
	lastone=t

# -v wins over -q
if quiet and verbose:
	quiet=0
	if verbose>1:
		err("switch quiet off ;-)")
		err("verbosity: %d"%verbose)

# a trailing -n or -o has nothing left to apply to
if warning and lastone[0] in ('-n','-o'):
	err("warning: %s not used ?"%lastone[0])

# close the last group of -b / -h expressions
if regbodycurrent or regheadercurrent:
	reglist.append((regbodycurrent,regheadercurrent))
	regbodycurrent=[]
	regheadercurrent=[]

# --version prints the version only (even if --help is given too),
# --help prints the version followed by the usage text
if f_version or f_help:
	sys.stdout.write(version+"\n")
	if not f_version:
		sys.stdout.write(usage+"\n")
	sys.exit(0)

# -m without -f: the result goes to a temporary file, removed at the end
if f_mua and not f_outfile:
	f_outfile=1
	erase_after=1
	# first existing directory among def_tmpdir and the home directory,
	# otherwise the tempfile default is kept
	for tmpcandidate in (def_tmpdir,"~"):
		if os.path.isdir(os.path.expanduser(tmpcandidate)):
			tempfile.tempdir=os.path.expanduser(tmpcandidate)
			break
	# NOTE(review): mktemp() only returns a name, the file is created
	# later when the output is opened, so another process could create
	# it in between. mkstemp() cannot be dropped in here because the
	# "file exist" prompt below would then trigger on the new file.
	outfile=tempfile.mktemp("regm")

# no -b or -h option used: the first plain argument is the search
# string (expert syntax with -x, a single body regex otherwise) and the
# input files start at the second argument
if reglist:
	begin_input_file=0
else:
	begin_input_file=1
	try:
		if f_xpert:
			if verbose>1:
				err("expert mode")
			if f_not and verbose:
				err("warning: -n before xp string, multiple -n assumed")
			# "||" separates the groups, "&&" the terms of a group
			for orl in args[0].split("||"):
				bodyterms=[]
				headterms=[]
				for r in orl.split("&&"):
					neg=0
					if len(r)>1 and r[0]=="!":
						neg=1
						r=r[1:]
					if f_not: # a pending -n inverts every term
						neg=1-neg
					if len(r)>2 and r[1]==':' and r[0] in xpaccel.keys():
						# "x:" prefix: header search
						headterms.append((neg,xpaccel[r[0]]+r[2:]))
					else:
						bodyterms.append((neg,r))
				reglist.append((bodyterms,headterms))
		else:
			pattern=args[0]
			if f_not and verbose:
				err("warning: -n before string, should be -b or -h, -n -b assumed")
			reglist.append(([(f_not,pattern)],[]))
	except IndexError:
		# args is empty
		err("need a request field")
		sys.exit(1)

# now we have all the search strings, let's compile them
# reqlist has the shape of reglist, with each regex string replaced by
# its compiled pattern: [ [(neg,body_re),...], [(neg,header_re),...] ]
body_flags=re.L|re.M|re.S
header_flags=re.L
if not f_case: # -u not given: case insensitive search
	body_flags=body_flags|re.I
	header_flags=header_flags|re.I

reqlist=[]
for t in reglist:
	tq=[]
	# t[0] holds the body expressions, t[1] the header expressions
	for patterns,flags in ((t[0],body_flags),(t[1],header_flags)):
		lq=[]
		for neg,pat in patterns:
			try:
				lq.append((neg,re.compile(pat,flags)))
			except re.error:
				# user-supplied regex: report it instead of a traceback
				err("bad regular expression %s: %s"%(repr(pat),sys.exc_info()[1]))
				sys.exit(1)
		tq.append(lq)
	reqlist.append(tq)

# if we are lucky, we don't need to read the body (or the headers)
parse_header=0
parse_body=0
for rl in reqlist:
	if rl[0]:
		parse_body=1
	if rl[1]:
		parse_header=1
if f_unic:
	parse_header=1 # need id field

if verbose>2:
	# debug output goes to stderr: stdout may be the output mailbox
	err("b:%d  h:%d"%(parse_body,parse_header))
	err(reqlist)

# input mailboxes: the remaining arguments, or the default pattern
limbox=args[begin_input_file:] or glob.glob(os.path.expanduser(def_mbox))

# keep each regular file once, in the given order
lmbox=[]
for candidate in limbox:
	path=os.path.expanduser(candidate)
	if path in lmbox or not os.path.isfile(path):
		continue
	lmbox.append(path)

# open the output: the -f (or temporary) file, or stdout
if f_outfile:
	open_mode="w"
	if os.path.exists(outfile):
		if not os.path.isfile(outfile):
			err("can't write on %s"%outfile)
			sys.exit(1)
		# existing file: ask what to do with it
		err("%s : file exist"%outfile)
		rep=""
		while rep not in ("e","c","a"):
			err("(e)rase, (c)oncat, or (a)nnul ?")
			try:
				# [:1] gives "" on an empty line, which asks again
				rep=raw_input()[:1].lower()
			except EOFError:
				# no more input: cancel rather than ask forever
				rep="a"
		if rep=="c":
			open_mode="a"
		elif rep=="a":
			err("Annulation.")
			sys.exit(0)

	try:
		out=open(outfile,open_mode)
	except IOError:
		if not quiet:
			err(sys.exc_info()[1])
		sys.exit(1)
else:
	out=sys.stdout

stats=0  # messages selected, all mailboxes together
parsed=0 # messages examined, all mailboxes together
cont=0

for fname in lmbox:
	try:
		if verbose >1:
			err("opening %s"%fname)
		mbox=open(fname,"r")
	except IOError:
		if not quiet :
			err("can't open %s for read"%(fname))
		if verbose:
			err(sys.exc_info()[1])
		continue

	# read the whole mailbox first
	mb=mailbox.UnixMailbox(mbox)
	msgs=[]
	msg=mb.next()
	while msg is not None:
		msgs.append(msg)
		msg=mb.next()
	if verbose:
		err('%5d messages in %s'%(len(msgs),fname))

	count=0 # messages selected in this mailbox
	for msg in msgs:
		parsed+=1
		if parse_body or f_print: # should find a better solution
			msg.rewindbody()
			bodytxt=msg.fp.read()

		# the message matches when one of the "or"ed groups matches; a
		# group matches when every plain regex is found and no negated
		# regex is found
		selected=0
		for rl in reqlist:
			group_ok=1
			for negated,rx in rl[0]: # body expressions
				# fails when: plain and not found, or negated and found
				if (not rx.search(bodytxt))==(not negated):
					group_ok=0
					break
			if group_ok:
				for negated,rx in rl[1]: # header expressions
					# found when any header line matches
					hit=0
					for he in msg.headers:
						if rx.search(he):
							hit=1
							break
					if (not hit)==(not negated):
						group_ok=0
						break
			if group_ok: # no need to try the other groups
				selected=1
				break

		if f_glob_not: # -N inverts the result of the filter
			selected=not selected

		if selected:
			stats+=1
			count+=1
			if verbose>2:
				err("%d %d"%(stats,parsed))
			if not f_unic or isunic(msg,verbose):
				if f_print:
					out.write(outformat(msg,bodytxt,print_format))
				else:
					# copy the message unmodified
					msg.fp.seek(0)
					out.write(msg.fp.read())

	if verbose:
		if count:
			err("=> %5d matched"%count)
		if verbose>1:
			err("closing %s"%fname)
	mbox.close()


# final report on stderr
if verbose>1:
	err(reglist)
if verbose:
	err("%d on %d messages"%(stats,parsed))

out.close()

# -m: open the result in the mail reader
mua_failed=0
if f_mua:
	# the file name goes through the shell: single-quote it so that
	# spaces and shell metacharacters in the name are harmless
	order=mua+"'"+outfile.replace("'","'\\''")+"'"
	try:
		os.system(order)
	except (OSError,KeyboardInterrupt):
		err(order)
		mua_failed=1

# the temporary file is removed even when the mail reader failed
if erase_after:
	try:
		os.unlink(outfile)
	except OSError:
		# best effort: the file may already be gone
		pass

if mua_failed:
	sys.exit(1)
	
