#
# This file is part of Zaplet
# Copyright 1999 - 2001 Adam Feuer <adamf@pobox.com>
#
# Zaplet is free software; you can redistribute it and/or modify
# it under the terms of the Python License as published by the
# Python Software Foundation, or GNU General Public License as published
# by the Free Software Foundation (either version 2 of the License, or
# (at your option) any later version).
#
# Zaplet is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Zaplet; see the file COPYING-Zaplet. If not, write to the
# Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
#
# You should have received a copy of the Python License along
# with Zaplet; see the file COPYING.
#

# zaplet classes
# for use with adzapper filtering proxy

# RCS/CVS "$Id$" keyword string: file name, revision, date and author
# of the checked-in revision of this module.
RCS_ID =  '$Id: zaplet_engine.py,v 1.13 2001/09/02 04:47:17 adamf Exp $'

# standard modules
import os
import re
import string
import sys
import urlparse
import socket

# XML modules
from xml.sax import saxexts, saxlib, saxutils

# adzapper modules
from debug import debug
import zaplet
from zaplet import zaplet_object,zaplet_sax_handler


# Matches a dotted quad ("a.b.c.d") at the start of a string.  There is
# no end anchor, so text may follow the fourth number.
numeric_host = re.compile(r'[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+')

# Matches a leading "number." followed by up to two more optional
# "number." groups.
# NOTE(review): the last optional group parses as the character class
# "[[0-9]" repeated, followed by a literal "*]" -- this looks like a typo
# for something like "([0-9]+|\*)"; confirm before relying on it.
numeric_network = re.compile(r'[0-9]+\.([0-9]+\.)?([0-9]+\.)?([[0-9]+\*])?')


##################################################################
#
# zaplet_engine
#
class zaplet_engine:
    """Zaplet engine class: this does all the work--
    reads in zaplets, and determines whether URLs should be blocked.

    if called with no arguments, adds self.site_zaplet_dir to self.zaplet_dirs
    and reads zaplets contained in this list. if the list has no elements, no
    zaplets are read.

    if called with one argument:
    the first argument can be a string that has a single zaplet directory,
    or a list that contains multiple zaplet directory paths, each
    as a single string

    if called with two arguments:
    the second argument can be a string that holds a single zaplet directory.
    this is where zaplets that are modified locally are stored.
    
    """

    def __init__(self, arg=None,site_zaplet_dir=None):
        """Initialize the zaplet_engine object and read in the zaplets.

        arg -- None, a string naming a single zaplet directory, or a
               list (or tuple) of zaplet directory strings.  When arg is
               None, an already existing self.zaplet_dirs attribute
               (for example one defined on a subclass) is kept;
               otherwise the directory list starts out empty.  An
               argument of any other type is logged and treated like
               None.
        site_zaplet_dir -- optional string naming one more zaplet
               directory; it is appended after the other directories,
               so it is read last.

        Sets self.zaplet_dirs, self.site_zaplet_dir and
        self.all_zaplet_dirs, then fills self.zaplet_list (the
        {site : zaplet object} dictionary) by calling readZaplets()."""

        debug.debug(1,"Initializing zaplet engine...")

        # block popups
        self.set_block_popups_regex(zaplet.BLOCK_POPUPS_REGEX)

        # dictionary of (site, zaplet object instance) pairs
        self.zaplet_list = {}

        # set the list of zaplet repositories
        self.actual_zaplet_dirs = []  # this is filled out in readZapletDir
        if isinstance(arg, (type(""), type(u""))):
            self.zaplet_dirs = [arg]
        elif isinstance(arg, type([])):
            self.zaplet_dirs = arg
        elif isinstance(arg, type(())):
            self.zaplet_dirs = list(arg)
        else:
            if arg is not None:
                # previously this left self.zaplet_dirs unset and the
                # constructor died with an AttributeError further down
                debug.debug(1,"zaplet eng: ignoring zaplet dir argument of unsupported type: %s" % repr(arg))
            if not hasattr(self,'zaplet_dirs'):
                self.zaplet_dirs = []

        # site_zaplet_dir
        self.site_zaplet_dir = site_zaplet_dir

        # work on a copy so that self.zaplet_dirs itself is left alone
        self.all_zaplet_dirs = self.zaplet_dirs[:]
        if self.site_zaplet_dir is not None:
            self.all_zaplet_dirs.append(self.site_zaplet_dir)

        debug.debug(3,"zaplet eng: site_zaplet_dir: %s" % self.site_zaplet_dir)
        debug.debug(3,"zaplet eng: site_zaplet_dirs: %s" % self.all_zaplet_dirs)

        self.readZaplets(self.all_zaplet_dirs)




    def readZaplets(self, zaplet_dirs):
        """Load the zaplets found in each directory of zaplet_dirs.

        zaplet_dirs -- sequence of directory path strings.

        The directories are handed to readZapletDir() one at a time, in
        the order given.  readZapletDir() stores zaplets keyed by host,
        so a zaplet read later replaces one read earlier for the same
        host.
        """
        for zaplet_dir in zaplet_dirs:
            self.readZapletDir(zaplet_dir)




    def readZapletDir(self,zDir):
        """Read every zaplet file in one directory into self.zaplet_list.

        zDir -- path of the directory to scan (string).  A path that
                does not start with '/' is taken relative to the
                current working directory.

        Only regular files whose names end in '.zap' and do not start
        with '.' are read.  The text of each file is handed to
        zaplet_object.zaplet_obj(); the resulting object gets its 'dir'
        and 'filename' attributes set and is stored in self.zaplet_list
        under its host, replacing any zaplet stored there before.  A
        zaplet whose host is empty is stored under '__NO_NAME__'.

        An empty directory string, a directory that cannot be listed
        and a file that cannot be read are logged and skipped.
        Directories that could be listed are recorded in
        self.actual_zaplet_dirs.  Returns None.
        """
        debug.debug(1,"zaplet_engine- reading zaplets from '" + zDir + "'...")

        if len(zDir) < 1:
            debug.debug(3,"warning: empty zaplet directory string")
            return

        # zDir must end with '/'
        if zDir[-1] != '/':
            zDir = zDir + '/'

        if zDir[0] != '/':
            # not absolute path to zaplet directory-- add cwd to it
            zDir = os.path.join(os.getcwd(),zDir)

        try:
            zdirlist = os.listdir(zDir)
        except OSError:
            debug.debug(1,"zaplet_engine- can't read zaplet directory '" + zDir + "'")
            return
        # only remember the directories we could actually read from
        self.actual_zaplet_dirs.append(zDir)
        debug.debug(7, "zaplet_engine- zdirlist:")
        debug.debug(7, zdirlist)

        # make the list of zaplets: plain files named *.zap, no dotfiles
        zapfile = re.compile(r'^.*\.zap$')
        dotfile = re.compile(r'^\.')
        zlist = []
        for name in zdirlist:
            if not os.path.isfile(os.path.join(zDir,name)):
                continue
            if zapfile.search(name) and not dotfile.search(name):
                zlist.append(name)
        debug.debug(3,"zaplet_engine- processed source directory: ")
        debug.debug(3,zlist)

        # now go thru list of zaplets, read them in and compile them.
        for zfile in zlist:
            zfullname = os.path.join(zDir,zfile)
            debug.debug(7,"zaplet_engine- zaplet file: " + zfullname)

            try:
                zhandle = open(zfullname)
                try:
                    # read the text verbatim; string.join(lines) would
                    # put a space in front of every line but the first
                    zstring = zhandle.read()
                finally:
                    # close the file even when the read fails
                    zhandle.close()
            except IOError:
                debug.debug(3, "IOerror reading %s" % zfullname)
                continue

            zlitem = zaplet_object.zaplet_obj(zstring)

            if zlitem.host == '':
                debug.debug(3,"zaplet hostname is empty: %s" % zfullname)
                zlitem.host = '__NO_NAME__'

            zlitem.dir = zDir
            zlitem.filename = zfile
            self.zaplet_list[zlitem.host] = zlitem

        debug.debug(7,self.zaplet_list.keys())




    def get_zaplet(self,zapletfilename):
        """Parse a zaplet file into a new, compiled zaplet object.

        zapletfilename -- passed unchanged to self.parser.parse().

        A fresh zaplet_obj is attached to a ZapletDocHandler1, the
        handler is installed as the document handler of self.parser, the
        parser is run, and the object's compile() method is called
        before the object is returned.  Errors raised along the way are
        not caught here.

        NOTE(review): self.parser is not assigned anywhere in the part
        of this class visible here -- presumably it is set up elsewhere;
        confirm before calling.
        """
        new_zaplet = zaplet_object.zaplet_obj()
        handler = zaplet_sax_handler.ZapletDocHandler1(new_zaplet)
        sax_parser = self.parser
        sax_parser.setDocumentHandler(handler)
        sax_parser.parse(zapletfilename)
        new_zaplet.compile()
        return new_zaplet




    def set_block_popups_regex(self,regex):
        """set the regex for removing popup windows"""
        self.block_popups_regex = regex
        self.block_popups_re = re.compile(self.block_popups_regex)
        return


######################################
######################################
#
# zaplet engine URL blocker method
#
    def urlblocker(self, url):
        """determine whether a URL should be zapped or not

        url -- the complete URL as a string.

        The host is taken from the network-location part of the URL and
        the matching zaplet is looked up with _getZaplet(); the URL is
        then tested with _checkBlocks().

        returns 1 if it should be blocked, 0 otherwise (including when
        no zaplet applies to the host)"""

        url_tuple = urlparse.urlparse(url)
        host_port = url_tuple[1]
        remote_host = self._getRemoteHost(host_port)
        z = self._getZaplet(remote_host)
        if z is None:
            debug.debug(3,"z.urlblocker- zaplet == None.")
            return 0

        debug.debug(3,"z.urlblocker- about to checkBlocks")
        if self._checkBlocks(z,url) == 1:
            # yes
            return 1

        # no blocks -- this return used to be swallowed by the comment,
        # which made the method fall off the end and return None
        return 0

######################################
#
# zaplet engine check for blocks and filters at once
#
    def urlCheckBlockAndFilter(self, url):
        """determine in one pass whether a URL should be zapped, whether
        popups should be blocked for it, and which filters apply to it.

        url -- the complete URL as a string.

        returns [block, block_popups, filter_list]:

        block == 1 if the URL should be blocked, 0 otherwise
        block_popups == 1 if popups should be blocked, 0 otherwise
        filter_list == whatever _checkFilters() returns: a list of the
        filter objects matching the URL, or None when there are none"""

        block = 0
        block_popups = 0

        url_tuple = urlparse.urlparse(url)
        host_port = url_tuple[1]
        remote_host = self._getRemoteHost(host_port)
        # z may be None; each _check* helper below copes with that
        z = self._getZaplet(remote_host)

        if self._checkBlocks(z,url) == 1:
            # yes
            debug.debug(3,"z.checkBlocksAndFilters: block = 1")
            block = 1

        if self._checkBlockPopups(z,url) == 1:
            # yes
            debug.debug(3,"z.checkBlocksAndFilters: block_popups = 1")
            block_popups = 1

        filter_list = self._checkFilters(z,url)
        debug.debug(3,"z.checkBlocksAndFilters: filter: %s" % filter_list)

        return [block, block_popups, filter_list]


######################################
#
# zaplet engine popup blocker
#

    def blockPopups(self,buffer):
        """Run the popup-blocking pattern over buffer.

        buffer -- the text to process.

        The text is passed through _filter_regex() together with
        self.block_popups_re and the replacement text '0'.  The result
        is logged at debug level 7 and returned; buffer itself is not
        modified.
        """
        cleaned = self._filter_regex(buffer,self.block_popups_re,'0')
        for item in ("blockPopups- new buffer: ",
                     cleaned,
                     "---------------------------------"):
            debug.debug(7,item)
        return cleaned

######################################
#
# zaplet engine filter
#

    def doFilter(self,filter_list,buffer):
        """apply a list of filters to buffer

        filter_list -- the filter objects that matched the URL of this
                       page, applied in order.  May be empty or None
                       (the value _checkFilters() returns when nothing
                       matches); buffer is then returned as it is.
        buffer -- the input text (string).

        Each filter's filter_match_text_re / filter_replace_text pair is
        applied through _filter_regex() to the output of the previous
        filter.

        returns a copy of the buffer with the filters applied"""

        result = buffer

        # nothing to apply: the old code fell through to an unbound
        # local for [] and could not iterate over None at all
        if not filter_list:
            return result

        for page_filter in filter_list:

            page_filter.printself()

            result = self._filter_regex(result,page_filter.filter_match_text_re,page_filter.filter_replace_text)
            debug.debug(7,"doFilter- new buffer: ")
            debug.debug(7,result)
            debug.debug(7,"---------------------------------")

        return result



#######################################
#######################################
#
# internal URL blocker methods
#
#######################################
#######################################

    def _getRemoteHost(self,host_port):
        """get the host out of the host and port part of the url"""
        # deal with urls of the form "http://user:password@host.domain.com/"
        #   as per RFC1738
        # thanks to "Constantinos A. Kotsokalis" <C.Kotsokalis@ece.ntua.gr>
        remote_host = host_port
        remote_port = 80
        if (string.find(host_port, ':') != -1):
            occ = string.count(host_port, ':')
            atocc = string.count(host_port, '@')
            if occ == 1 and atocc == 0:
                remote_host, remote_port = string.split(host_port, ':')
                remote_port = string.atoi(remote_port)
            elif occ == 1 and atocc == 1:
                remote_host, remote_port = string.split(host_port, '@')[1], 80
            elif occ == 2:
                remote_host, remote_port = string.split(string.split(host_port, '@')[1], ':')
                remote_port = string.atoi(remote_port)
            else:
                remote_host, remote_port = host_port, 80

        remote_host=string.lower(remote_host)
        return remote_host


######################################
#
#
#
    def _checkBlocks(self,z,url):
        """Decide whether url is blocked by zaplet z.

        z -- the zaplet object for the host, or None.
        url -- the complete URL as a string.

        Allow rules are consulted before block rules: the URL passes if
        z allows everything or z.allow_re finds a match in it; failing
        that, it is blocked if z blocks everything or z.block_re finds
        a match in it.

        returns 1 if the URL should be blocked, 0 otherwise (also when
        z is None)"""

        debug.debug(1,"checking for blocks, URL= %s" % url)

        if z == None:
            debug.debug(1,"zaplet == None.")
            return 0

        # allows win over blocks
        if z.allow_everything == 1:
            debug.debug(3,"_checkBlocks: allow matches everything from " + z.host)
            return 0
        if z.num_allows > 0 and z.allow_re.search(url):
            # it's specifically allowed, so let it thru
            debug.debug(3,"_checkBlocks: host: " + z.host)
            debug.debug(3,"zap: allow- matched url: " + url)
            return 0

        # not specifically allowed, now look at the blocks
        if z.block_everything == 1:
            debug.debug(3,"_checkBlocks: block matches everything from " + z.host)
            return 1
        if z.num_blocks > 0 and z.block_re.search(url):
            debug.debug(3,"_checkBlocks: host: " + z.host)
            debug.debug(3,"_checkBlocks: block matched url " + url)
            return 1

        debug.debug(3,"zap: no allows and no blocks. (%s)" % url)
        return 0

######################################
#
#
#
    def _checkFilters(self,z,url):
        """Collect the filters of zaplet z that apply to url.

        z -- the zaplet object for the host, or None.
        url -- the complete URL as a string.

        A filter applies when its filter_match_url_re matches at the
        start of the URL.

        returns the list of matching filter objects, or None when z is
        None, z has no filters, or none of them match"""

        debug.debug(1,"checking for filters...")

        if z == None:
            debug.debug(1,"zaplet == None.")
            return None

        candidates = z.filters
        if candidates == None or len(candidates) < 1:
            debug.debug(1,"no filters.")
            return None

        matching = [candidate for candidate in candidates
                    if candidate.filter_match_url_re.match(url)]

        # an empty result is reported as None, not as []
        if matching:
            return matching
        return None


######################################
#
#
#
    def _checkBlockPopups(self,zaplet,url):
        """Decide whether popups are to be blocked for url.

        zaplet -- the zaplet object for the host, or None.
        url -- the complete URL as a string.

        returns 1 if the zaplet blocks popups everywhere, or has
        block-popup rules and its block_popups_re matches at the start
        of the URL; 0 otherwise (also when zaplet is None)"""

        debug.debug(1,"checking for block_popup filters...")

        if zaplet == None:
            debug.debug(1,"zaplet == None.")
            return 0

        if zaplet.block_popups_everything == 1:
            return 1

        if zaplet.num_block_popups > 0 and zaplet.block_popups_re.match(url):
            return 1

        return 0


######################################
#
#
#

    def _getZaplet(self,host):
        """Find the zaplet that applies to host.

        host -- lower-case host name or dotted-quad IP address.

        A host that consists of a dotted quad (optionally followed by a
        single trailing '.') is looked up with
        _getZapletForNumericHost(), falling back to
        _getDefaultNumericZaplet(); every other host is looked up with
        _getZapletForNamedHost(), falling back to
        _getDefaultNamedZaplet().

        returns the zaplet object, or None if nothing applies"""
        # does the remote host match a zaplet in the database?
        debug.debug(3,"zaplet._getZaplet")

        # numeric_host has no end anchor, so make sure the match covers
        # the whole host; otherwise a name such as "1.2.3.4.example.com"
        # would be treated as an IP address
        quad = numeric_host.match(host)
        is_numeric = quad is not None and host[quad.end():] in ('', '.')

        if not is_numeric:
            # has DNS name
            debug.debug(3, "named host.")
            z = self._getZapletForNamedHost(host)
            if z is not None:
                return z
            return self._getDefaultNamedZaplet()

        # is a numeric host
        debug.debug(3, "numeric host.")
        z = self._getZapletForNumericHost(host)
        if z is not None:
            return z
        return self._getDefaultNumericZaplet()


######################################
#
#
#
    def _getDefaultZaplet(self):
        # default zaplet
        tmps = 'default'
        if (self.zaplet_list.has_key(tmps)):
            return self.zaplet_list[tmps]
        else:
            return None


######################################
#
#
#
    def _getDefaultNamedZaplet(self):
        # default zaplet
        tmps = 'default-named'
        if (self.zaplet_list.has_key(tmps)):
            return self.zaplet_list[tmps]
        else:
            return self._getDefaultZaplet()

######################################
#
#
#
    def _getDefaultNumericZaplet(self):
        # default zaplet
        tmps = 'default-numeric'
        if (self.zaplet_list.has_key(tmps)):
            return self.zaplet_list[tmps]
        else:
            return self._getDefaultZaplet()


######################################
#
#
#
    def _getZapletForNamedHost(self,host):
        """return zaplet for named host:
        uses zaplet database, searchs from most specific to least specific
        (whole dns name www.foo.bar.com, then foo.bar.com, then bar.com, then .com)
        returns zaplet, or None if no match found."""

        srv_fields = string.split(host,'.')
        while (srv_fields != []):
            tmps = string.join(srv_fields,'.');
            if (self.zaplet_list.has_key(tmps)):
                return self.zaplet_list[tmps]
            del srv_fields[0]

        return None


######################################
#
#
#
    def _getZapletForNumericHost(self,host):
        """Look up the zaplet for a numeric (IP address) host.

        host -- the address as a dotted string, e.g. "a.b.c.d".

        self.zaplet_list is searched from the most specific key to the
        least specific one: the whole address first, then the address
        with its last field removed, and so on down to the first field
        alone (a.b.c.d, a.b.c, a.b, a).

        returns the first zaplet found, or None if nothing matches."""

        fields = host.split('.')
        debug.debug(1,"numeric host ip: %s" % fields)
        field_count = len(fields)
        debug.debug(1,"numeric host ip fields: %d" % field_count)

        # keep = number of leading fields used for this attempt
        for keep in range(field_count,0,-1):
            prefix = '.'.join(fields[:keep])
            debug.debug(1,"numeric host- iteration ip: %s" % prefix)
            try:
                return self.zaplet_list[prefix]
            except KeyError:
                pass

        return None



######################################
#
#
#

    def _filter_regex(self,buffer,match_re,replacement):
        """replace all occurances of match_re with replacement string"""
        return match_re.sub(replacement,buffer,re.DOTALL)



