package Lire::AsciiDlf::DlfInfo;

use strict;

use vars qw( $VERSION );

use Carp;

# Derive $VERSION at compile time from the RCS revision keyword, which the
# version control system expands in place.
BEGIN {
    my $rcs_revision = '$Revision: 1.3 $';
    if ( $rcs_revision =~ m!Revision: ([.\d]+)! ) {
        $VERSION = $1;
    }
    else {
        $VERSION = undef;
    }
}

=pod

=head1 NAME

Lire::AsciiDlf::DlfInfo - Object that holds information about a DLF file.

=head1 SYNOPSIS

    my $record_count = $dlf_info->record_count;

    my $start_time  = $dlf_info->start_time;
    my $end_time    = $dlf_info->end_time;

    if ( $dlf_info->is_field_available( "time" ) ) { ... };

    my $key_count = $dlf_info->field_keys( "action" );

=head1 DESCRIPTION

Objects of this class contain information about the DLF content. They
can be used to obtain the number of DLF records present in the DLF
file, the start and end time, the available and unavailable fields as
well as the number of different keys for each indexable field.

=head1 CONSTRUCTOR

=head2 new( $schema, $dlf_fh )

Creates a new DlfInfo object for a particular schema. The $dlf_fh
parameter holds a file handle opened on a DLF file in the appropriate
schema.

=cut

#
# Constructor: takes ( $schema, $fh ).
# read_dlf() then fills in record_count, start_time, end_time,
# unavail_fields and field_keys.
# Builds a DlfInfo bound to $schema and the DLF file handle $fh, then scans
# the file through read_dlf() before handing the object back.
#
# Can be invoked on a class name or on an existing instance.
# Croaks unless $schema is a Lire::DlfSchema.
sub new {
    my ( $proto, $schema, $fh ) = @_;

    unless ( UNIVERSAL::isa( $schema, "Lire::DlfSchema" ) ) {
        croak "invalid schema parameter ($schema): must be of type Lire::DlfSchema";
    }

    my %state = (
        schema         => $schema,
        fh             => $fh,
        unavail_fields => {},
        field_keys     => {},
    );

    # An instance used as invocant lends its class to the new object.
    my $self = bless \%state, ref( $proto ) || $proto;
    $self->read_dlf;

    return $self;
}

# Scans the whole DLF file once and stores the result on the object:
#
#   record_count   - number of DLF records read
#   start_time     - smallest value seen in the schema's timestamp field
#   end_time       - largest value seen in the schema's timestamp field
#   unavail_fields - { name => 1 } for every field holding LIRE_NOTAVAIL
#                    in the first record
#   field_keys     - { name => count } number of distinct values for the
#                    available fields whose type matches the key pattern
#
# start_time and end_time are both set to 0 when the DLF is empty or when
# the timestamp field is LIRE_NOTAVAIL.
#
# The file handle is rewound before and after the scan. Dies when a seek
# fails, when a record doesn't have the number of fields announced by the
# schema, or when only one of start_time/end_time ends up LIRE_NOTAVAIL.
#
# Returns the object.
sub read_dlf {
    my ( $self ) = @_;

    # Count the number of DLF records in that file. Also determine the
    # starting and ending time. This adds insignificant overhead
    # compared to the time it takes to compute the report.
    my $fh             = $self->{fh};
    my $schema         = $self->{schema};
    my $dlf_count      = 0;
    my @dlf_unavail    = ();
    my $field_count    = $schema->field_count;
    my $time_field_idx = $schema->timestamp_field->pos();

    # One tracker per field we collect key statistics on:
    #   [ name, position, distinct count (or ratio), seen-values hash ]
    # The seen-values hash is replaced by undef once sampling stops; from
    # then on slot 2 holds a distinct/records ratio instead of a count.
    my @field_keys;
    foreach my $f ( $schema->fields ) {
        if ( $f->type =~ /string|ip|port|hostname|url|email|filename/ ) {
            push @field_keys, [ $f->name, $f->pos, 0, {} ];
        }
    }

    # Make sure we are at the beginning of the DLF file
    seek $fh, 0, 0
      or die "seek: $!\n";

    my ( $start_time, $end_time );

    # Read into a lexical so that the caller's $_ isn't clobbered.
    while ( defined( my $line = <$fh> ) ) {
        chomp $line;
        my $dlf = [ split /\s+/, $line ];

        if ( @$dlf != $field_count ) {
            # $dlf_count only counts the records accepted so far, so the
            # offending line is the next one.
            die "DLF record has ", scalar @$dlf,
              " fields when it should have $field_count",
              " at line ", $dlf_count + 1, "\n";
        }

        my $time = $dlf->[$time_field_idx];

        if ( !defined $start_time ) {
            $start_time = $time;
        }
        elsif ( $start_time ne "LIRE_NOTAVAIL" && $time < $start_time ) {
            $start_time = $time;
        }

        if ( !defined $end_time ) {
            $end_time = $time;
        }
        elsif ( $end_time ne "LIRE_NOTAVAIL" && $time > $end_time ) {
            $end_time = $time;
        }

        if ( $dlf_count == 0 ) {
            # First record, determine the unavailable fields
            foreach my $i ( 0 .. $#{ $schema->fields } ) {
                push @dlf_unavail, $schema->field_by_pos( $i )->name
                  if $dlf->[$i] eq 'LIRE_NOTAVAIL';
            }

            # Remove unavailable fields from the key statistics
            my %is_unavail = map { $_ => 1 } @dlf_unavail;
            @field_keys = grep { !$is_unavail{ $_->[0] } } @field_keys;
        }

        $dlf_count++;

        # Collect key statistics
        foreach my $f ( @field_keys ) {
            my $seen = $f->[3];
            next unless defined $seen;    # sampling already stopped

            my $value = $dlf->[ $f->[1] ];
            next if exists $seen->{$value};

            $seen->{$value} = 1;
            if ( ++$f->[2] >= 5_000 ) {
                # Keep precise number until 5_000 keys, after that
                # use interpolation
                $f->[2] = $f->[2] / $dlf_count;
                $f->[3] = undef;
            }
        }
    }

    # Rewind the DLF file
    seek $fh, 0, 0
      or die "seek: $!\n";

    # No record was read: the DLF was empty. Only an undefined value means
    # that; a timestamp of 0 is a legitimate value and must not be
    # mistaken for a missing one.
    $start_time = 'LIRE_NOTAVAIL' unless defined $start_time;
    $end_time   = 'LIRE_NOTAVAIL' unless defined $end_time;

    if ( $start_time eq 'LIRE_NOTAVAIL' || $end_time eq 'LIRE_NOTAVAIL' ) {
        # Invariant check
        die "start_time and end_time should be LIRE_NOTAVAIL. Something is seriously broken\n"
          if $start_time ne $end_time;
        $start_time = 0;
        $end_time   = 0;
    }

    # Save values
    $self->{record_count}   = $dlf_count;
    $self->{start_time}     = $start_time;
    $self->{end_time}       = $end_time;
    $self->{unavail_fields} = { map { $_ => 1 } @dlf_unavail };

    foreach my $f ( @field_keys ) {
        my $name  = $f->[0];
        my $count = $f->[2];

        # A dropped seen-values hash marks the count as a ratio. Testing
        # "$count < 1" instead would miss a ratio of exactly 1 (every
        # sampled record had a distinct key).
        $count = int( $count * $dlf_count )
          unless defined $f->[3];

        $self->{field_keys}{$name} = $count;
    }

    return $self;
}

=pod

=head2 new_extended( $extended_schema, $extended_dlf_fh )

Creates a new DlfInfo object for an extended schema object based on
the same schema of the current DlfInfo. This will compute the
statistics concerning the extra fields.

Note: you call this constructor as a method on the base DlfInfo object.

=cut

# Derives a DlfInfo for an extended schema from this (base) DlfInfo.
#
# $schema must be a Lire::ExtendedSchema (croaks otherwise) and $fh a file
# handle on the extended DLF. The new object starts with a copy of the
# base object's values and is then completed by read_extended_dlf().
#
# Returns the new object; the base object is left untouched.
sub new_extended {
    my ( $self, $schema, $fh ) = @_;

    croak "invalid schema parameter ($schema): must be of type Lire::ExtendedSchema"
      unless UNIVERSAL::isa( $schema, "Lire::ExtendedSchema" );

    my $new = bless { %$self }, ref $self;

    # The copy above is shallow. Give the new object its own bookkeeping
    # hashes; otherwise read_extended_dlf() would record the extended
    # fields in the hashes still referenced by the base object.
    foreach my $attr ( qw( unavail_fields field_keys ) ) {
        $new->{$attr} = { %{ $self->{$attr} || {} } };
    }

    $new->{schema} = $schema;
    $new->{fh}     = $fh;

    $new->read_extended_dlf;

    return $new;
}

# Scans an extended DLF file and merges the statistics of the fields added
# by the extended schema into the object:
#
#   unavail_fields - gains { name => 1 } for every extended field holding
#                    LIRE_NOTAVAIL in the first record
#   field_keys     - gains { name => count } number of distinct values for
#                    the available extended fields whose type matches the
#                    key pattern
#
# Fields that the base schema already has are skipped. Positions from the
# base schema's field count onward are the ones checked for availability.
#
# The file handle is rewound before and after the scan. Dies when a seek
# fails or when a record doesn't have the number of fields announced by
# the extended schema.
#
# Returns the object.
sub read_extended_dlf {
    my ( $self ) = @_;

    my $fh          = $self->{fh};
    my $schema      = $self->{schema};
    my $field_count = $schema->field_count;
    my $base        = $schema->base;
    my $dlf_count   = 0;
    my @dlf_unavail = ();

    # One tracker per field we collect key statistics on:
    #   [ name, position, distinct count (or ratio), seen-values hash ]
    # The seen-values hash is replaced by undef once sampling stops; from
    # then on slot 2 holds a distinct/records ratio instead of a count.
    my @field_keys;
    foreach my $f ( $schema->fields ) {
        # Skip parent fields
        next if $base->has_field( $f->name );

        if ( $f->type =~ /string|ip|port|hostname|url|email|filename/ ) {
            push @field_keys, [ $f->name, $f->pos, 0, {} ];
        }
    }

    # Make sure we are at the beginning of the DLF file
    seek $fh, 0, 0
      or die "seek: $!\n";

    # Read into a lexical so that the caller's $_ isn't clobbered.
    while ( defined( my $line = <$fh> ) ) {
        chomp $line;
        my $dlf = [ split /\s+/, $line ];

        if ( @$dlf != $field_count ) {
            # $dlf_count only counts the records accepted so far, so the
            # offending line is the next one.
            die "DLF record has ", scalar @$dlf,
              " fields when it should have $field_count",
              " at line ", $dlf_count + 1, "\n";
        }

        if ( $dlf_count == 0 ) {
            # First record, determine the unavailable extended fields
            foreach my $i ( $base->field_count .. $#{ $schema->fields } ) {
                push @dlf_unavail, $schema->field_by_pos( $i )->name
                  if $dlf->[$i] eq 'LIRE_NOTAVAIL';
            }

            # Remove unavailable fields from the key statistics
            my %is_unavail = map { $_ => 1 } @dlf_unavail;
            @field_keys = grep { !$is_unavail{ $_->[0] } } @field_keys;
        }

        $dlf_count++;

        # Collect key statistics
        foreach my $f ( @field_keys ) {
            my $seen = $f->[3];
            next unless defined $seen;    # sampling already stopped

            my $value = $dlf->[ $f->[1] ];
            next if exists $seen->{$value};

            $seen->{$value} = 1;
            if ( ++$f->[2] >= 5_000 ) {
                # Keep precise number until 5_000 keys, after that
                # use interpolation
                $f->[2] = $f->[2] / $dlf_count;
                $f->[3] = undef;
            }
        }
    }

    # Rewind the DLF file
    seek $fh, 0, 0
      or die "seek: $!\n";

    # Merge extended values
    foreach my $name ( @dlf_unavail ) {
        $self->{unavail_fields}{$name} = 1;
    }

    foreach my $f ( @field_keys ) {
        my $name  = $f->[0];
        my $count = $f->[2];

        # A dropped seen-values hash marks the count as a ratio. Testing
        # "$count < 1" instead would miss a ratio of exactly 1 (every
        # sampled record had a distinct key).
        $count = int( $count * $dlf_count )
          unless defined $f->[3];

        $self->{field_keys}{$name} = $count;
    }

    return $self;
}

=pod

=head1 METHODS

=head2 start_time()

Returns the start timestamp of the DLF file in epoch time.

=cut

sub start_time {
    my ( $self ) = @_;

    # Hand back the stored value whenever there is one.
    return $self->{start_time} if defined $self->{start_time};

    # Otherwise complain, naming the schema's timestamp field.
    die $self->{schema}->timestamp_field->name, " is not available\n";
}

=pod

=head2 end_time()

Returns the end timestamp of the DLF file in epoch time.

=cut

sub end_time {
    my ( $self ) = @_;

    # Hand back the stored value whenever there is one.
    return $self->{end_time} if defined $self->{end_time};

    # Otherwise complain, naming the schema's timestamp field.
    die $self->{schema}->timestamp_field->name, " is not available\n";
}


=pod

=head2 record_count()

Returns the number of DLF records in the DLF.

=cut

sub record_count {
    my ( $self ) = @_;

    # Plain accessor for the stored record count.
    return $self->{record_count};
}

=pod

=head2 is_field_available( $field )

Returns true if the field is available in the DLF, false otherwise.

This method will die() in the case that the requested field isn't
defined in the schema.

=cut

sub is_field_available {
    my ( $self, $field ) = @_;

    # Unknown fields are an error, not merely "unavailable".
    $self->{schema}->has_field( $field )
      or die "no such field: $field\n";

    # Available means: not recorded among the unavailable fields.
    my $unavailable = $self->{unavail_fields};
    return !exists $unavailable->{$field};
}

=pod

=head2 unavail_fields()

Returns the list of fields that are unavailable in the DLF.

=cut

sub unavail_fields {
    my ( $self ) = @_;

    # The names are the keys of the bookkeeping hash; order is unspecified.
    return keys %{ $self->{unavail_fields} };
}

=pod

=head2 field_keys( $field )

Returns the number of different values there are in the field $field.
This will be equal to the number of records in the case of numeric or
time field. For large values, it is not a precise number but based on
an interpolation from a representative sample.

This method will die() in the case that the requested field isn't
defined in the schema or that it isn't available.

=cut

sub field_keys {
    my ( $self, $field ) = @_;

    # is_field_available() itself dies for fields unknown to the schema.
    $self->is_field_available( $field )
      or die "$field is not available\n";

    # Fields without key statistics fall back to the record count.
    my $counts = $self->{field_keys};
    return exists $counts->{$field}
      ? $counts->{$field}
      : $self->{record_count};
}

# keep perl happy
1;

__END__

=pod

=head1 SEE ALSO

Lire::Program(3pm), Lire::Field(3pm)

=head1 VERSION

$Id: DlfInfo.pm,v 1.3 2002/07/30 13:27:09 vanbaal Exp $

=head1 COPYRIGHT

Copyright (C) 2001 Stichting LogReport Foundation LogReport@LogReport.org

This file is part of Lire.

Lire is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program (see COPYING); if not, check with
http://www.gnu.org/copyleft/gpl.html or write to the Free Software 
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.

=head1 AUTHOR

Francis J. Lacoste <flacoste@logreport.org>

=cut
