#!/usr/local/bin/perl5.00502
# ---------------------------------------------------------------------------
$Version = 'oldlog2new-2.0';   # Program name and release; shown by usage()
#
# Copyright (c) 1994, 1996 Regents of the University of California.
#
# This software has been developed by Roy Fielding <fielding@ics.uci.edu> as
# part of the WebSoft project at the University of California, Irvine.
#         <http://www.ics.uci.edu/pub/websoft/wwwstat/>
# See the file LICENSE for licensing and redistribution information.
#
# This program is based on an early version of the wwwstat log analyzer.
# It exists only for the purpose of converting old NCSA httpd 1.0 and 1.1
# log files to the common logfile format (CLF) used by wwwstat-1.0 and later.
# ALMOST ALL SITES WILL HAVE NO USE FOR THIS PROGRAM.
#
# It reads the old log, figures out what each entry points to, finds the
# current file size for that entity, and outputs the new format including
# a reasonable approximation of the server response code.  NOTE that this
# won't work if the logfile entries do not correspond to real files that
# are still in the place they were when the entry was logged.
#
# usage -- abort the program with the command-line synopsis.
#
# Takes no arguments and never returns: the help text (which embeds the
# global $Version) is handed to die, so it is written to STDERR and the
# process ends with a failure status.  The text ends in a newline, so
# Perl does not append an "at FILE line N" suffix.
sub usage {
    my $help_text = <<"EndUsage";
usage: oldlog2new [-hez] [-f logfile] [-s srmfile]

$Version
Convert an NCSA httpd 1.1 access_log file to CLF access_log
Display Options:
     -h  Help -- just display this message and quit.
     -e  Display all invalid log entries on STDERR. (default is to ignore them)
Input Options:
     -f  Read from the following access_log file instead of the default.
     -z  Use zcat to uncompress the log file while reading [requires -f].
     -s  Get the server directives from the following srm.conf file.
EndUsage

    die $help_text;
}
# ---------------------------------------------------------------------------
# Set the default configuration options.  These are the compiled-in values;
# the -e, -f, -s and -z command-line switches override some of them later.

# Edit the next line to specify the (+/-)HHMM offset from GMT.  The string is
# appended as-is to the timestamp of every converted entry.

$GMToffset = '-0700';

# Edit the next line to identify the server's default home page.  Requests
# for "/" or "/<DirectoryIndex>" are mapped to this path below DocumentRoot.

$ServerHome = "/";

# Edit the next two lines to specify the location of your server access log
# and your server configuration (srm.conf) file.

$access_log = '/usr/local/etc/httpd/logs/access_log';
$srm_conf   = '/usr/local/etc/httpd/conf/srm.conf';

# Edit the next line to specify the command for displaying compressed files.
# The log file name is appended to it and the result is run as a pipe.

$zcat = 'gunzip -c';       # specify as null string if none are available

# Edit the next line to say whether the server answers a directory request
# that lacks its trailing slash with a redirect.  When set, such requests are
# reported with status 302 instead of the size of the directory's index file.

$DirectoryRedirect = 1;    # Does server do automatic redirect for slashless
                           # index reqs? (1 for httpd_1.1,  0 for httpd_1.0)

# Is the server running with rfc931 support (IdentityCheck on)?
# When set, a leading "ident@" is stripped from each entry and the ident is
# written to the second field of the converted line.

$IdentityCheck = 0;        # Must = 1 if server uses rfc931 remote ident.

# Edit the next few lines to specify whether (1) or not (0) you want:

$PrintInvalids     = 0;    # Display invalid log entries on STDERR?
$CompressedLog     = 0;    # Access log has been compressed (or gzipped)?

# ==========================================================================
# Get the command-line options

# Parse the command-line switches and fold them into the configuration.
#
#   -h        show the usage message and quit
#   -e        echo invalid log entries to STDERR
#   -f FILE   read FILE instead of the default access log
#   -z        read the log through the $zcat command (only valid with -f)
#   -s FILE   read FILE instead of the default srm.conf
#
# Getopt::Std is the core replacement for the obsolete getopts.pl library;
# like that library it stores each switch in the global $opt_<letter>.

use Getopt::Std;

# getopts() reports an unknown switch or a missing switch argument through
# a false return value (it does not set $@), so test the return value.
if (!getopts('hezf:s:') || $opt_h) { &usage; }

if ($opt_e) { $PrintInvalids     = 1; }
if ($opt_z) {
   if ($opt_f) { $CompressedLog  = 1; }       # Require logfile name if
   else        { &usage; }                    # uncompression is desired
}
if ($opt_f) { $access_log    = $opt_f; }
if ($opt_s) { $srm_conf      = $opt_s; }

# A compressed log cannot be read without a decompression command.
if ($CompressedLog && !$zcat) {
    die "No zcat decompression command has been defined, stopped";
}

# ==========================================================================
# Get the other needed configuration items from the srm.conf file

# Read the server resource map (srm.conf) and collect the directives needed
# to translate a requested URL into a file name:
#
#   $DocumentRoot, $UserDir, $DirectoryIndex   last value seen in the file
#   %AllRedirects   Redirect    prefix => target URL
#   %AllAliases     Alias       prefix => real directory or file
#   %AllScripts     ScriptAlias / OldScriptAlias prefix => real directory
#
# The hash keys are stored regex-escaped (quotemeta) because the conversion
# loop interpolates them into anchored patterns.  Dies if the file cannot be
# opened.

open (SRM,$srm_conf) || die "Error opening config file: $srm_conf: $!\n";

$UserDir        = "public_html";              # Start with NCSA defaults
$DirectoryIndex = "index.html";
$DocumentRoot   = "/usr/local/etc/httpd/htdocs";

while (<SRM>)
{
    next if ( /^\s*$/ || /^\#/ );      # Ignore blank and comment lines

    # Any run of blanks or tabs may separate a directive from its value.
    # "(.*\S)" keeps the value up to its last non-blank character, so
    # trailing whitespace is dropped and a final line that lacks a newline
    # is still recognized.
    if (/^DocumentRoot\s+(.*\S)/)
    {
        $DocumentRoot = $1;
    }
    elsif (/^UserDir\s+(.*\S)/)
    {
        $UserDir = $1;
    }
    elsif (/^DirectoryIndex\s+(.*\S)/)
    {
        $DirectoryIndex = $1;
    }
    elsif (/^Redirect\s+(\S+)\s+(\S+)/)
    {
        $AllRedirects{quotemeta($1)} = $2;    # Key is used as a pattern later
    }
    elsif (/^Alias\s+(\S+)\s+(\S+)/)
    {
        $AllAliases{quotemeta($1)} = $2;      # Key is used as a pattern later
    }
    elsif (/^(?:Old)?ScriptAlias\s+(\S+)\s+(\S+)/)
    {
        $AllScripts{quotemeta($1)} = $2;      # Key is used as a pattern later
    }
}
close SRM;

# ==========================================================================
# Now read log, figure out the response code and bytes, and output new format
# 

# Open the access log (through the decompression command when the log is
# compressed) and write one CLF line to STDOUT for every valid entry.
#
# For each entry the loop
#   1. strips a leading "ident@" when $IdentityCheck is set,
#   2. splits the line into host, date, method, URL and trailing protocol text,
#   3. maps the URL to a file using the srm.conf data, settling on a status of
#      200, 302 (redirect), 404 (not found) or 400 (unsupported method),
#   4. looks up the file's current size (cached per URL in %Sizes), and
#   5. prints the entry with the date rearranged into CLF order.
# Entries that cannot be parsed are skipped, and echoed to STDERR with their
# line number when $PrintInvalids is set.

if ($CompressedLog) { $access_log = "$zcat $access_log |"; }

open (LOG,$access_log) || die "Error opening access log file: $access_log: $!\n";

LINE: while (<LOG>)
{
    $saveline = $_;

    $ident = "-";

    if ($IdentityCheck)           # Does log include IdentityCheck info?
    {
        # Use $1 only when the match succeeded: after a failed match it still
        # holds whatever an earlier pattern captured.  The non-greedy prefix
        # stops at the first '@', so an '@' inside the requested URL is not
        # mistaken for the ident separator.
        if (/^(.*?)@\S+\s/)
        {
            $ident = $1 if ($1 ne '');     # Save ident for later use
            $saveline =~ s/^.*?@//;        # Remove the remote ident from log
        }
        $_ = $saveline;
    }

    $htv = '';

    ($afield, $date, $method, $oname, $htv) =
         /^(\S+) \[(.+)\] (\S+)\s+(\S+)\s(.*)$/;

    # The old date stamp ("Www Mmm dd hh:mm:ss yyyy") is always 24 characters.
    if (!($afield && $date && $method && $oname && (length($date) == 24)))
    {
        print(STDERR "$.:$saveline") if $PrintInvalids;
        next LINE;
    }

    #
    # First, we have to figure out what file or script was accessed
    #

    $fname = $oname;

    $fname =~ s/\?.*$//;              # Remove any trailing query information
    $fname =~ s/\#.*$//;              # Remove any trailing anchor information
    $fname =~ s#//+#/#g;              # Collapse each run of slashes to one

    if (($fname eq "") || ($fname eq "HTTP/1.0"))
    {
        print(STDERR "$.:$saveline") if $PrintInvalids;
        next LINE;
    }

    FNAME:                        # Get the document's real name
    {
        $rname = "";                         # and start with it unknown
        $rcode = 200;
        $fsize = 0;
        study $fname;

        if (($fname eq "/") || ($fname eq "/$DirectoryIndex"))
        {
            $fname = "$ServerHome";          # Handle top file with extra care
            $rname = "$DocumentRoot$fname";
            if (-d $rname)
            {
                # The home page names a directory (the default "/"): report
                # the size of its index file rather than of the directory.
                my $index = $rname;
                $index =~ s{/+$}{};
                $index .= "/$DirectoryIndex";
                $rname = $index if (-e $index);
            }
            last FNAME;
        }

        # NOTE(review): keys() returns the prefixes in no particular order,
        # so when two Redirect/Alias/ScriptAlias prefixes both match a URL
        # the one chosen is arbitrary.

        foreach $redir (keys %AllRedirects)  # Is it a redirected file?
        {
            if ( $fname =~ /^$redir/ )
            {
                $rcode = 302;
                last FNAME;
            }
        }

        foreach $alias (keys %AllAliases)    # Is it a file name alias?
        {
            if ( $fname =~ /^$alias/ )
            {
                $rname = $fname;
                $rname =~ s#^$alias#$AllAliases{$alias}#;
                last FNAME;
            }
        }

        if ($fname =~ /^\/~(\w+)\// )        # Is it a /~username/...?
        {
            my $user = $1;                   # Keep it; $1 is easily clobbered
            $dir = (getpwnam($user))[7];     # Home directory, undef if no user
            if ($dir)
            {
                $rname = $fname;
                $rname =~ s#^/~$user#$dir/$UserDir#;
            }
            else
            {
                $rcode = 404;
            }
            last FNAME;
        }

        if ($fname =~ /^\/~(\w+)$/ )         # Is it a /~username ?
        {
            my $user = $1;
            $dir = (getpwnam($user))[7];
            if ($dir)
            {
                $rname = $fname;
                $rname =~ s#^/~$user#$dir/$UserDir#;
                if (-e "$rname/$DirectoryIndex")
                {
                    if ($DirectoryRedirect)  # Server redirects to "/~user/"
                    {
                        $rcode = 302;
                        last FNAME;
                    }
                    $rname .= "/$DirectoryIndex";
                }
                $fname .= '/';
            }
            else
            {
                $rcode = 404;
            }
            last FNAME;
        }

        foreach $alias ( keys %AllScripts )  # Is it a script directory alias?
        {
            if ( $fname =~ /^$alias/ )
            {
                $fsize = '-';                # Script output size is unknown
                last FNAME;
            }
        }

        if (-d "$DocumentRoot$fname")        # Is it a directory?
        {
            $hasSlash = ($fname =~ s/\/$//); # Remove any trailing slash
            if (-e "$DocumentRoot$fname/$DirectoryIndex")
            {
                if (!$hasSlash && $DirectoryRedirect)
                {
                    $rcode = 302;
                    last FNAME;
                }
                $rname = "$DocumentRoot$fname/$DirectoryIndex";
            }
            else
            {
                $rname = "$DocumentRoot$fname";
            }
            $fname .= '/';
            last FNAME;
        }

        $rname = "$DocumentRoot$fname";      # It must be a normal file

    } # end FNAME

    if (!$fsize && ($rcode == 200) && $rname) # Get the file size
    {                                         # through use of a cache of Sizes
        if (!exists($Sizes{$fname}))
        {
            # -s is false for an empty file, so test existence on its own
            # and record 0 for empty files; undef marks a missing file.
            $Sizes{$fname} = (-e $rname) ? ((-s _) || 0) : undef;
        }

        if (defined($Sizes{$fname})) { $fsize = $Sizes{$fname}; }
        else                         { $rcode = 404; }
    }

    if (!(($method eq 'GET')||($method eq 'HEAD')||($method eq 'POST')))
    {
        $rcode = 400;
    }

    if    ($rcode  != 200)    { $fsize = '-'; }   # No body size for errors
    elsif ($method eq 'HEAD') { $fsize = '0'; }   # HEAD responses have no body

    if ($htv) { $oname .= ' '. $htv; }       # Put back the protocol version

    #
    # Phew!  Now we have to swap the date format around:
    # "Www Mmm dd hh:mm:ss yyyy"  becomes  "dd/Mmm/yyyy:hh:mm:ss +-HHMM"
    #

    $newdate = substr($date, 8, 2) .'/'.
               substr($date, 4, 3) .'/'.
               substr($date,20, 4) .':'.
               substr($date,11, 9) . $GMToffset; 

    $newdate =~ s/^ /0/;                     # Zero-pad a single-digit day

    #
    # Now that we have categorized it, print it in the new format
    #

    print($afield,' ',$ident,' - [',$newdate,'] "',$method,' ',$oname,
          '" ',$rcode,' ',$fsize," \n");

}
close LOG;

exit(0);
