package Lire::Extensions::WWW::SearchSchema;

# vim:syntax=perl

use strict;

use vars qw( $VERSION @ISA );

use Lire::AsciiDlf::ExtendedFieldsCreator;
use Lire::WWW::URL;

use Carp;

BEGIN {
    # Derive from the extended fields creator base class.
    @ISA = qw( Lire::AsciiDlf::ExtendedFieldsCreator );

    # Pull the bare revision number out of the CVS keyword string;
    # $VERSION stays undefined if the keyword was not expanded.
    my ( $revision ) = '$Revision: 1.4 $' =~ m!Revision: ([.\d]+)!;
    $VERSION = $revision;
}

# Search engine lookup table.  Each row is
#   [ host substring, query parameter holding the keywords, engine label ]
#
# Order is important: create_extended_fields() walks the rows from top to
# bottom and uses the first row whose host substring occurs in the referer
# host and whose parameter is present in the query string.  "yahoo.com" is
# listed before "google" because google.yahoo.com isn't the same as
# www.google.com.
#
# A row whose host substring and parameter are both already covered by an
# earlier row can never be selected, so do not add such rows (for example
# "www.google"/'q' after "google"/'q', or a repeated "about.com"/'terms').
#
# NOTE(review): goto.com carries the "Look Smart" label -- confirm that
# this is intended and not a copy of the looksmart.com row.
my @Engine2Keywords =
  (
   [ "yahoo.com",         'p',          "Yahoo!" ],
   [ "altavista.com",     'q',          "AltaVista" ],
   [ "google",            'q',          "Google" ],
   [ "google",            'query',      "Google" ],
   [ "aol.com",           'query',      "AOL NetFind" ],
   [ "eureka.com",        'q',          "Eureka" ],
   [ "lycos.com",         'query',      "Lycos" ],
   [ "hotbot.com",        'MT',         "HotBot" ],
   [ "msn.com",           'MT',         "Microsoft Network" ],
   [ "infoseek.com",      'qt',         "InfoSeek" ],
   [ "webcrawler",        'searchText', "WebCrawler" ],
   [ "excite",            'search',     "Excite" ],
   [ "netscape.com",      'search',     "Netscape" ],
   [ "mamma.com",         'query',      "Mamma" ],
   [ "alltheweb.com",     'query',      "All The Web" ],
   [ "northernlight.com", 'qr',         "Northern Light" ],
   [ "askjeeves.com",     'ask',        "Ask Jeeves" ],
   [ "looksmart.com",     'key',        "Look Smart" ],
   [ "goto.com",          'key',        "Look Smart" ],
   [ "overture.com",      'Keywords',   "Overture" ],
   [ "about.com",         'terms',      "About.COM" ],
   [ "metacrawler.com",   'general',    "Meta Crawler" ],
   [ "iwon.com",          'searchfor',  "iWon" ],
  );

# Set up the state that create_extended_fields() reads from $self.
#
# Parameters:
#   $self     - the extended fields creator object
#   $dlf_info - accepted from the caller; not used by this method
#
# Stores in $self:
#   referer_idx - position of the "referer" field, as reported by the schema
#   url_parser  - a Lire::WWW::URL object
#
# There is no explicit return statement; the value of the last assignment
# (the parser object) is what the sub evaluates to.
sub init_computation {
    my ( $self, $dlf_info ) = @_;

    # Cache the index so it is not looked up again for each record.
    $self->{referer_idx} = $self->schema->field( "referer" )->pos;

    # Plain class method call rather than indirect object syntax
    # ("new Lire::WWW::URL"), whose parsing depends on context.
    $self->{url_parser}  = Lire::WWW::URL->new;
}

# Compute the extended fields for one DLF record from its referer.
#
# Parameters:
#   $self - creator object; reads {referer_idx} and {url_parser}, which
#           init_computation() sets
#   $dlf  - array reference holding the record's field values
#
# Returns an array reference with exactly three elements, in this order:
# host, search engine label, search keywords.
#   - three "LIRE_NOTAVAIL" when the referer is "LIRE_NOTAVAIL"
#   - [ "Bookmarks", "-", "-" ] when the referer contains "bookmarks"
#   - three "-" when the URL parser dies on the referer
#   - [ host, "-", "-" ] when host or query is missing, or when no row of
#     @Engine2Keywords matches
#   - [ host, engine, keywords ] for the first matching row
sub create_extended_fields {
    my ( $self, $dlf ) = @_;

    my $referer = $dlf->[ $self->{referer_idx} ];

    # The parentheses matter: "x" repeats a list only when its left operand
    # is parenthesised; a bare string would be concatenated into a single
    # element instead of producing three fields.
    return [ ("LIRE_NOTAVAIL") x 3 ]
      if $referer eq "LIRE_NOTAVAIL";

    return [ "Bookmarks", "-", "-" ]
      if $referer =~ /bookmarks/i;

    # The parse results (host, query) are read back from the parser object.
    my $url       = $self->{url_parser};
    my $parsed_ok = eval { $url->parse( $referer ); 1 };
    return [ ("-") x 3 ] unless $parsed_ok;

    my $host = $url->{host} || "-";
    return [ $host, "-", "-" ]
      unless defined $url->{host} && defined $url->{query};

    my $lc_host = lc $host;
    my $query   = $url->{query};

    foreach my $spec ( @Engine2Keywords ) {
        my ( $host_match, $param, $engine ) = @$spec;
        next if index( $lc_host, $host_match ) == -1;

        # NOTE(review): the parameter name is not anchored, so 'q' also
        # matches inside names such as 'as_q' or 'oq' -- confirm whether
        # that is wanted before tightening it.
        next unless $query =~ /\Q$param\E=(.*?)(?:[;&]|$)/;
        my $keywords = $1;

        # Turn '+' into spaces (squeezing runs) before decoding %XX escapes,
        # so that an encoded %2B stays a literal '+'.
        $keywords =~ tr/+/ /s;
        $keywords =~ s/%([0-9a-fA-F]{2})/chr(hex $1)/eg;

        # we've seen
        # http://www.google.com/search?as_q=&num=10&btnG=Google+Search&\
        #  as_epq=Trippin+Smurfs&as_oq=&as_eq=&lr=&as_ft=i&as_filetype=&\
        #  as_qdr=all&as_occt=any&as_dt=i&as_sitesearch=&safe=off
        # so: deal with empty search requests.
        $keywords = '-' if $keywords eq '';

        return [ $host, $engine, $keywords ];
    }

    return [ $host, "-", "-" ];
}

# keep perl happy
1;

__END__

=pod

=head1 NAME

Lire::Extensions::WWW::SearchSchema - extended fields creator deriving search engine information from the 'referer' field

=head1 SYNOPSIS

=head1 DESCRIPTION

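Create extended fields based on the 'referer' field. For each record,
three values are produced: the host of the referer, the name of the
search engine that host was recognized as, and the search keywords found
in the referer's query string. A value that cannot be determined is set
to '-'.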

=head1 VERSION

$Id: SearchSchema.pm,v 1.4 2002/03/16 05:18:24 flacoste Exp $

=head1 COPYRIGHT

Copyright (C) 2001 Stichting LogReport Foundation LogReport@LogReport.org

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program (see COPYING); if not, check with
http://www.gnu.org/copyleft/gpl.html or write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.

=head1 AUTHOR

Francis J. Lacoste <flacoste@logreport.org>

=cut
