#! /usr/bin/perl -w

# vim:syntax=perl

use strict;
use lib '/usr/local/share/perl5';

package Lire::Proxy::MS_ISALog;

use vars qw/ @ISA /;

use Lire::DlfSchema;
use Lire::W3CExtendedLog;
use Lire::WWW::URL;
use Lire::Program qw( :msg );

BEGIN {
    @ISA = qw/Lire::W3CExtendedLog/;
}

my $debug = 0;
sub debug {
    $debug and lr_debug($_[0]);
}


my $schema = Lire::DlfSchema::load_schema( "proxy" );

my %ms_type2regex = (
    # MS enhanced string, using tab as field separator
    ms_string => '([^\t]+)',
);

my %ms_identifier2type = (
    # MS workaround
    'username'       => 'ms_string',
    'agent'          => 'ms_string',
    'referred'       => 'name',
    'host'           => 'name',
    'port'           => 'integer',
    'computername'   => 'uri',
    'object-source'  => 'uri',
    'operation'      => 'uri',
    'protocol'       => 'uri',
    'mime-type'	     => 'uri',
    'rule#1'	     => 'uri',
    'rule#2'	     => 'uri',
);

# see http://www.w3.org/TR/WD-logfile.html
my %isa_field2proxy_dlf =
  (
    'c-ip'           => 'client_ip',
    'c-host'	     => 'client_host',
    'cs-username'    => 'user',

    'c-agent'        => 'useragent',
    # e.g. Mozilla/4.0 (compatible; MSIE 5.0; Win32)
    # or Outlook Express/5.0 (MSIE 5.0; Windows 98; DigExt)

    # 's-computername' => 'computername',
    # Proxy name (s-computername) The name of the computer running ISA
    # Server. This is the computer name that is assigned in Windows
    # 2000. e.g. GRO1SYX01 
    # Currently unmapped

    # 'cs-referred'    => 'result_src_host',
    # Referring server name (cs-referred): If ISA Server is used upstream in
    # a chained configuration, this indicates the server name of the
    # downstream server that sent the request.
    # Unmapped for now.

    'r-host'         => 'result_src_host',
    'r-ip'           => 'result_src_ip',
    'r-port'         => 'result_src_port',
    # For the Web Proxy service, a hyphen (-) in this field may
    # indicate that an object was sourced from the Web Proxy server
    # cache and not from the destination. One exception is negative
    # caching. In that case, this field indicates a destination IP
    # address for which a negative-cached object was returned.

    # 'sc-bytes'       => 'bytes',
    # We'll add up cs-bytes and sc-bytes later on

    'sc-status'	     => 'req_result',

    'cs-protocol'    => 'protocol',

    's-operation'    => 'operation',

    #'cs-uri'         => ,
    # This will be parsed for dst_host and requested_url

    # 's-object-source' => 'result_src_code',
    # Object source (s-object-source) Indicates the source that was used to 
    # retrieve the current object. This field applies only to the Web Proxy
    # service log.
    # This overlaps the cache_result and result_src_result

    'time-taken'     => 'duration',
    # This is in millisecond, we need to postprocess it

    'rule#1'	     => 'rule',
    'rule#2'	     => 'rule',
    # Rule#1 is about protocol rule
    # Rule#2 is about content filtering
    # Only one of the two should be defined in each record

    'cs-mime-type'   => 'type',
  );

# values and their meanings:
# 0           No source information is available.
# Cache       Source is the cache. Object returned from cache.
# Inet        Source is the Internet. Object added to cache.
# Member      Returned from another array member.
# NotModified Source is the cache. Client performed an If-Modified-Since
#              request and object had not been modified.
# NVCache     Source is the cache. Object could not be verified to source.
# Upstream    Object returned from an upstream proxy cache.
# Vcache      Source is the cache. Object was verified to source and had
#              not been modified. 
# VFInet      Source is the Internet. Cached object was verified to source
#              and had been modified.
my %ms_isa2cache_result =
  (
   0	    => 'NONE',
   Cache    => 'TCP_HIT',
   Inet	    => 'TCP_MISS',
   Member   => 'TCP_MISS',
   NotModified => 'TCP_IMS_HIT',
   NVCache  => 'TCP_REF_FAIL_HIT',
   Upstream => 'TCP_MISS',
   VCache   => 'TCP_REFRESH_HIT',
   VFInet   => 'TCP_REFRESH_MISS',
  );

my %ms_isa2result_src_code =
  (
   0	    => 'NONE',
   Cache    => 'NONE',
   Inet	    => 'DIRECT',
   Member   => 'SIBLING_HIT',
   NotModified => 'NONE',
   NVCache  => 'NONE',
   Upstream => 'PARENT_HIT',
   VCache   => 'NONE',
   VFInet   => 'DIRECT',
  );

sub build_parser {
    my ( $self ) = shift;

    $self->{field2re} = %ms_type2regex;

    for my $k (keys %ms_identifier2type) {
        if (defined $ms_identifier2type{$k}) {
            # possibly overwrite Lire::W3CExtendedLog values
            $self->{identifier2type}->{$k} = $ms_identifier2type{$k};
        } else {
            if (defined $self->{identifier2type}->{$k}) {
                lr_warn( "ms_identifier2type undefined in '$k', keeping W3C " .
                  "default '" . $self->{identifier2type}->{$k} .
                  "' for identifier2type" );
            } else {
                lr_warn( "ms_identifier2type undefined in '$k', " .
                  "identifier2type stays undefined" );
            }
        }
    }

    $debug and do {
        debug( "just filled identifier2type hash, dumping it" );
        while ((my $k, my $v) = each %{ $self->{identifier2type} }) {
            if (defined $v) {
                debug( "identifier2type{'$k'} = '$v'" );
            } else {
                debug( "identifier2type{'$k'} undefined" );
            }
        }
        debug( "... done" );
    };

    # Override some types' lexer
    while ( my ( $type, $rx ) = each %ms_type2regex ) {
	$self->{type2regex}{$type} = $rx;
    }

    $self->SUPER::build_parser( @_ );

    my @fields = split /\s+/, $self->{fields};
    my %fields = map { $_ => 1 } @fields;

    my @mapped   = ();
    my @dlf_fields;
    foreach my $f ( @fields ) {
        if ( exists $isa_field2proxy_dlf{$f} ) {
            push @mapped, $f;
            push @dlf_fields, $isa_field2proxy_dlf{$f};
        }
    }

    # Create the DLF maker function
    push @dlf_fields, "time" if $fields{time};
    push @dlf_fields, "client_host" if $fields{'c-ip'};
    push @dlf_fields, "bytes" if $fields{'cs-bytes'} || $fields{'sc-bytes'};
    push @dlf_fields, "dst_host", "requested_url", "dst_port", "protocol"
      if $fields{'cs-uri'};
    push @dlf_fields, "cache_result", "result_src_code"
      if $fields{'s-object-source'};

    # Keep only one of each
    my %dlf_fields = map { $_ => 1 } @dlf_fields;
    @dlf_fields = sort keys %dlf_fields;

    lr_info( "mapped DLF fields: ", join( ", ", @dlf_fields ) );

    my $dlf_maker = $schema->make_hashref2asciidlf_func( @dlf_fields );

    $self->{proxy_dlf_converter} = sub {
        my $w3c = $self->{w3c_parser}->( $_[0] );

        # Those fields that are mapped directly
        my %dlf = ( time => $w3c->{lire_time} );
	$dlf{bytes} = 0;
	$dlf{bytes} += $w3c->{'cs-bytes'}
	  if $w3c->{'cs-bytes'} && $w3c->{'cs-bytes'} ne '-';
	$dlf{bytes} += $w3c->{'sc-bytes'}
	  if $w3c->{'sc-bytes'} && $w3c->{'sc-bytes'} ne '-';

	if ( exists $w3c->{'cs-uri'}) {
	    my $url = Lire::WWW::URL->new( $w3c->{'cs-uri'} );
	    $dlf{'dst_host'}	    = $url->{'host'};
	    $dlf{'requested_url'}   = $url->{'path'};
	    $dlf{'dst_port'}	    = $url->{'port'};
	    $dlf{'protocol'}	    = $url->{'protocol'};
	    # That last one may be overriden by cs-protocol
	}

	if ( exists $w3c->{'s-object-source'} ) {
	    my $code = $w3c->{'s-object-source'};
	    if ( $code eq '-' ) {
		if ( exists $w3c->{'sc-status'} &&
		     $w3c->{'sc-status'} == 403 )
		{
		    # Special case for denied requests
		    $dlf{cache_result}	    = 'TCP_DENIED';
		    $dlf{result_src_code}   = 'NONE';
		} else {
		    $dlf{cache_result}	    = 'NONE';
		    $dlf{result_src_code}   = 'NONE';
		}
	    } else {
		$dlf{cache_result}    = $ms_isa2cache_result{$code};
		$dlf{result_src_code} = $ms_isa2result_src_code{$code};

		$dlf{cache_result} = 'TCP_NEGATIVE_HIT'
		  if  exists $dlf{req_result} && $dlf{req_result} == 404 &&
		   $dlf{cache_result} eq 'TCP_HIT';
	    }
	}

        foreach my $name ( @mapped ) {
	    # Map field only if it has the non-default value '-'
            $dlf{$isa_field2proxy_dlf{$name}} = $w3c->{$name}
	      if $w3c->{$name} ne '-';
        }

	$dlf{duration} = sprintf '%.3f', $dlf{duration} / 1_000
	  if exists $dlf{duration};

	# Put client_ip into client_host when it is missing.
	$dlf{client_host} ||= $dlf{client_ip};

        return $dlf_maker->( \%dlf );
    }
}




package main;

use Lire::Program qw( :msg :dlf );

my $lines       = 0;
my $dlflines    = 0;
my $errorlines  = 0;

init_dlf_converter( "proxy" );

my $parser = new Lire::Proxy::MS_ISALog;

# Parse the header
my $line;
while (defined( $line = <> )) {
    last unless $line =~ /^#/;
    $parser->parse_directive( $line );
}

lr_err( "invalid W3C extended log file: must start by Version and Fields " .
    "directives" ) unless defined $parser->{fields} &&
    defined $parser->{version};

my $todlf = $parser->{proxy_dlf_converter};

# Transform into DLF
do {
    $lines++;

    if ( $line =~ /^#/ ) {
        eval {
            $parser->parse_directive( $line );
        };
        if ( $@ ) {
            lr_err( $@ );
            $errorlines++;
            last;
        }
    } else {
        eval {
            my $dlf = $todlf->( $line );
            print join( " ", @$dlf), "\n";
            $dlflines++;
        };
        if ($@) {
            lr_warn( $@ );
            lr_notice( qq{cannot convert line $. "$line" to proxy dlf, skipping} );
        }
    }
    $line = <>;
} while (defined $line);

end_dlf_converter( $lines, $dlflines, $errorlines );

exit 0;


__END__

=pod

=head1 NAME

ms_isa2dlf - convert Microsoft ISA server logs to DLF

=head1 SYNOPSIS

B<ms_isa2dlf> B<[>I<file>B<]>

=head1 DESCRIPTION

B<ms_isa2dlf> converts Microsoft Internet Security and Acceleration Server log
files in the W3C Extended Log Format to the proxy DLF.  The ISA log files are
documented on
http://www.microsoft.com/technet/prodtechnol/isa/proddocs/isadocs/M_S_C_LoggingFields.asp .

=head1 DEBUGGING

As any Lire 2dlf program, this program needs adjusted LR_DBDIR, LR_DBFILE,
LR_ID and PATH variables.  These are set in .../etc/lire/defaults and
.../etc/lire/profile_lean.  After manually source-ing these files, one can run
this program as a standalone application, by invoking it as e.g.

 zcat ms_isa.log.gz | LR_ID=`date +%Y%m%d.%H%M%S` ./ms_isa2dlf > /tmp/dlf

.

=head1 THANKS

Chainsaw on OPN irc, for supplying log files.

=head1 SEE ALSO

w3c_extended2dlf(1)

=head1 VERSION

$Id: ms_isa2dlf.in,v 1.9 2002/02/07 20:43:46 flacoste Exp $

=head1 COPYRIGHT

Copyright (C) 2001 Stichting LogReport Foundation LogReport@LogReport.org
 
This program is part of Lire.

Lire is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
 
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
 
You should have received a copy of the GNU General Public License
along with this program (see COPYING); if not, check with
http://www.gnu.org/copyleft/gpl.html or write to the Free Software 
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.

=head1 AUTHOR

Joost van Baal <joostvb@logreport.org>, heavily inspired by Francis J.
Lacoste's w3c_extended2dlf(1)

=cut

# Local Variables:
# mode: cperl
# End:



