#!/usr/local/bin/perl

# Perl 5.002 or later.  w3mir is mostly tested with perl 5.004
#
use lib '/Library/Perl';
#
# Can perform the following fixes:
# - Rewrite redirected to URLs
# - change .../ into .../index.html (or .../Welcome.html)
# - change external links to point to some helpfull .html file ---- NOT IMPLEMENTED
# - change links to documents not retrived to point to some helpfull
#   .html file --- NOT IMPLEMENTED
# - After adding a 'Also' directive we can edit the urls pointing to within
#   the new space of the retrival scope, making the pointers consistent.

# Method: 
# 1 Gather list of all URLs to be rewritten:
#   - Redirects: Just read the .redirs file
#   - .../ into .../index.html: All can be found in the .referers file
#     Remember to rewrite missing / redirects too...
#   - external links: in the .referers file too
#   - non-retrived documents: What files should be here (according to
#	.referers file) but are not?
# 2 Gather list of all documents needing editing
# 3 Edit them
#
# 09/05/98 janl		- Only rewrite .../ to .../$indexname if the attribute
#			  does not refer to a directory. -> 0.6.2
# 12/05/98 janl		- Use ->local_path to determine filename -> 0.6.3
# 22/05/98 janl		- .../#foo was not being rewritten to 
#			  .../$indexname#foo -> 0.6.4
#
require 5.002;

use vars qw($win32);

# To figure out what kind of system this is
BEGIN {
  use Config;
  $win32 = ( $Config{'osname'} eq 'MSWin32' ); 
}

use Carp;
use htmlop;
use URI::URL;
use URI::Escape;

eval '
# This is for URI 1.0 only
use URI;
# Make URI behave in a (more) sensible manner
$URI::ABS_ALLOW_RELATIVE_SCHEME=1;
$URI::ABS_REMOTE_LEADING_DOTS=1;
';

use strict;

my $VERSION;
$VERSION='0.6.6';

my $debug=0;			# Not debugging
my $verbose=0;

my $indexname='index.html';
my $chdirto='';			# Place to chdir to after reading
                                # config file

my $infoloss=0;			# 1 if any URL translations (which
                                # cause information loss) are in
                                # effect.  If this is true we use the
                                # SAVEURL operation.  What to get, and
                                # not.  Text of user supplied
                                # fetch/ignore rules

my $doindex=1;			# append $indexname to /$ ?

my $editthis='';		# Edit references matching this expression.

my $files=0;			# How many files have I edited?
my $rc='';

my $rule_text="# User defined fetch/ignore rules\n";
# Code ref to the rule procedure
my $rule_code;

# Code to prefix and postfix the generated code.  Prefix should make
# $_ contain the url to match.  Postfix should return 1, the default
# is to get the url/file.
my $rule_prefix='$rule_code = sub { local($_) = shift;'."\n";
my $rule_postfix=' return 1; } ';

# Scope tests generated by URL/Also directives in cfg. The scope code
# is just like the rule code, but used for program generated
# fetch/ignore rules related to multiscope retrival.
my $scope_fetch="# Automatic fetch rules for multiscope retrival\n";
my $scope_ignore="# Automatic ignore rules for multiscope retrival\n";
my $scope_code;

my $scope_prefix='$scope_code = sub { local($_) = shift;'."\n";
my $scope_postfix=' return 0; } ';

# Function to apply to urls, se rule comments.
my $user_apply_code;	# User specified apply code
my $apply_code;		# w3mirs apply code
my $apply_prefix='$apply_code = sub { local($_) = @_;'."\n";
my $apply_lc=' $_ = lc $_; ';
my $apply_postfix=' return $_; } ';
my @user_apply;		# List of users apply rules.
my @internal_apply;	# List of w3mirs apply rules.


my $iinline='';			# inline RE code to make RE caseinsensitive 
my $ipost='';			# RE postfix to make it caseinsensitive
my $lc=0;			# Convert urls/filenames to lowercase?
my $abs=0;			# Absolutify URLs?
my $fixrc='';			# Name of w3mfix config file
my $fixup=0;			# Do things needed to run fixup
my $r=0;			# Recurse? no recursion = absolutify links
my %rum_referers=();		# Array of referers, key: rum_url
my %rum_redirected=();		# Array of redirected url: key: original url
my %lf_edited=();		# Edited this file yet?
my $list;			# List url on STDOUT?

my %stat=();			# stat($lf_url): 'd' for dir, 'f' for others

# ######################### Libwww-perl addons:

# URI as part of libwww:

sub URI::URL::_generic::unix_path {
  my $self = (shift)->clone;

  $self->frag(undef);
  $self->scheme("file");
  $self->unix_path;
}


sub URI::URL::_generic::basename {
  my $self = shift;
  my @p = $self->path_components;
  my $old = $p[-1];
  if (@_) {
    splice(@p, -1, 1, shift);
    $self->path_components(@p)
  }
  $old;
}


# URI 1.0
sub URI::URL::unix_path {
  my $self = (shift)->clone;

  return uri_unescape($self->path);
}


sub URI::_generic::basename {
  my $self = shift;
  my @p = $self->path_segments;
  my $old = $p[-1];
  if (@_) {
    splice(@p, -1, 1, shift);
    $self->path_segments(@p)
  }
  $old;
}


# ######################### Configuration/argument parsing

sub parse_args {
  my $f;
  my $i;

  $i=0;

  while ($f=shift) {
    $i++;
    # This is a demonstration against Getopts::Long.
    if ($f =~ s/^-+//) {
      $verbose=-1,next if $f eq 'q'; 		# Quiet
      $verbose=1,next if $f eq 'c';		# Chatty
      die "w3mfix version $VERSION\n" if $f eq 'v';	# Version
      die "rtfm\n" if ($f eq 'help' || $f eq 'h' || $f eq '?');

      if ($f eq 'editref') {
	die "Sorry, can only have one -editref pr. run\n"
	  if $editthis;
	$editthis=quotemeta(shift);
	next;
      }

      if ($f eq 'd') {	# Debugging level
	$f=shift;
	unless (($debug = $f) > 0) {
	  die "w3mfix: debug level must be a number greater than zero.\n";
	}
	next;
      }

      # Those were all the options...
      warn "w3mfix: Unknown option: -$f.  Use -h for usage info.\n";
      exit(1);
    } else {
      # If we get this far then ... it's a configuration file name:
      $rc = $f;
      die "w3mfix: Got a non-option argument that wasn't the name of a\n".
	"(configuration) file either\n"
	  unless -f $f;
    }
  }
}


sub parse_cfg_file {
  # Read the configuration file.  Aborts on errors.
  # Ignores w3mir options w3mfix does not need itself.

  my ( $file ) = @_ ;
  my ($key, $value, $authserver,$authrealm,$authuser,$authpasswd);
  my $i;

  die "w3mfix: config file $file is not a file.\n" unless -f $file;
  open(CFGF, $file) || die "Could not open config file $file: $!\n";

  # print STDERR "Reading $file\n";

  $i=0;

  while (<CFGF>) {
    # Trim off various junk
    chomp;
    s/^#.*//;
    s/^\s+|\s$//g;
    # Anything left?
    next if $_ eq '';
    # Examine remains
    $i++;
    ($key, $value) = split(/\s*:\s*/,$_,2);
    $key = lc $key;

    # These are no-ops in w3mfix
    next if ( $key eq 'initial-referer' );
    next if ( $key eq 'header' );
    next if ( $key eq 'pause' );
    next if ( $key eq 'retry-pause' );
    next if ( $key eq 'retries' );
    next if ( $key eq 'robot-rules' );
    next if ( $key eq 'remove-nomirror' );
    next if ( $key eq 'file-disposition' );
    next if ( $key eq 'http-proxy' );
    next if ( $key eq 'proxy-options' );
    next if ( $key eq 'auth-domain' );
    next if ( $key eq 'auth-user' );
    next if ( $key eq 'auth-passwd' );
    next if ( $key eq 'disable-headers' );
    next if ( $key eq 'agent' );

    $debug=numeric($value),next if ( $key eq 'debug' );
    umask(numeric($value)),next if ( $key eq 'umask' );
    $indexname=$value,next if ($key eq 'index-name');
    $verbose=nway($value,'quiet','brief','chatty')-1,next
      if ( $key eq 'verbosity' );

    if ( $key eq 'cd' ) {
      $chdirto=$value;
      next;
    }

    if ($key eq 'url') {
      my ($rum_url_o,$lf_dir,$rum_reurl,$rum_rebase);

      # A two argument URL: line?
      if ($value =~ m/^(.+)\s+(.+)/i) {
	# Two arguments.
	$rum_url_o=url $1;
	# The last is a directory, it must end in /
	$lf_dir=$2;
	$lf_dir.='/' unless $lf_dir =~ m~/$~;

	# The first is a URL, make it more canonical, find the base.
	# The namespace confusion in this section is correct.(??)
	$rum_rebase = quotemeta( ($rum_url_o->as_string =~ m~^(.*/)~)[0] );

	# print "URL: ",$rum_url_o->as_string,"\n";
	# print "Base: $rum_rebase\n";

	# Translate from rum space to lf space:
	push(@internal_apply,"s/^".$rum_rebase."/".quotemeta($lf_dir)."/");

	# That translation could lead to information loss.
	$infoloss=1;

	# Fetch rules tests the rum_url_o->as_string.  Fetch whatever
	# matches the base.
	$scope_fetch.="return 1 if m/^".$rum_rebase."/".$ipost.";\n";

	# Ignore whatever did not match the base.
	$scope_ignore.="return 0 if m/^".
	  quotemeta("http://".$rum_url_o->netloc."/")."/".$ipost.";\n";

      } else {
	# $rum_url_o=root_quene($value);

	$rum_url_o=url $value;

	$rum_rebase = quotemeta( ($rum_url_o->as_string =~ m~^(.*/)~)[0] );

	# Translate from rum space to lf space:
	push(@internal_apply,"s/^".$rum_rebase."//");

	$scope_fetch.="return 1 if m/^".$rum_rebase."/".$ipost.";\n";
	$scope_ignore.="return 0 if m/^".
	  quotemeta("http://".$rum_url_o->netloc."/")."/".$ipost.";\n";
      }
      next;
    }

    if ($key eq 'also' || $key eq 'also-quene') {
      if ($value =~ m/^(.+)\s+(.+)/i) {
	my ($rum_url_o,$lf_dir,$rum_reurl,$rum_rebase);
	# Two arguments.
	$rum_url_o=url $1;
	$rum_url_o->host(lc $rum_url_o->host);
	# The last is a directory, it must end in /
	$lf_dir=$2;
	$lf_dir.='/' unless $lf_dir =~ m~/$~;

	# The first is a URL, find the base
	$rum_rebase = quotemeta( ($rum_url_o->as_string =~ m~^(.*/)~)[0] );

	# Ok, now we can transform and select stuff the right way
	push(@internal_apply,"s/^".$rum_rebase."/".quotemeta($lf_dir)."/");
	$infoloss=1;

	# Fetch rules tests the rum_url_o->as_string.  Fetch whatever
	# matches the base.
	$scope_fetch.="return 1 if m/^".$rum_rebase."/".$ipost.";\n";

	# Ignore whatever did not match the base.  This cures problem
	# with '..' from base in in rum space pointing within the the
	# scope in ra space.  We introduced a extra level (or more) of
	# directories with the apply above.  Must do same with 'Also:'
	# directives.
	$scope_ignore.="return 0 if m/^".
	  quotemeta("http://".$rum_url_o->netloc."/")."/".$ipost.";\n";
      } else {
	die "Also: requires 2 arguments\n";
      }
      next;
    }

    if ($key eq 'quene') {
      # root_quene($value);
      next;
    }

    if ($key eq 'ignore-re' || $key eq 'fetch-re') {
      # Check that it's a re, better that I am strict than for perl to
      # make compilation errors.
      unless ($value =~ /^m(.).*\1[gimosx]*$/) {
	print STDERR "w3mfix: $value is not a recognized regular expression\n";
	exit 1;
      }
    }

    if ($key eq 'fetch' || $key eq 'fetch-re') {
      my $expr=$value;
      $expr = wild_re($expr).$ipost if ($key eq 'fetch');
      $rule_text.=' return 1 if '.$expr.";\n";
      next;
    }

    if ($key eq 'ignore' || $key eq 'ignore-re') {
      my $expr=$value;
      $expr = wild_re($expr).$ipost if ($key eq 'ignore');
      $rule_text.=' return 0 if '.$expr.";\n";
      next;
    }


    if ($key eq 'apply') {
      unless ($value =~ /^s(.).*\1.*\1[gimosxe]*$/) {
	print STDERR
	  "w3mfix: '$value' is not a recognized regular expression\n";
	exit 1;
      }
      push(@user_apply,$value) ;
      $infoloss=1;
      next;
    }

    if ($key eq 'options') {
      
      my($val,$nval);
      foreach $val (split(/\s*,\s*/,lc $value)) {
	if ($i==1) {
	  $nval=nway($val,'recurse','no-date-check','only-nonexistent',
		     'list-urls','lowercase','remove','batch','read-urls',
		     'abs');
	  $r=1,next if $nval==0;
	  next if $nval==1;
	  next if $nval==2;
	  $list=1,next if $nval==3;
	  if ($nval==4) {
	    $lc=1;
	    $iinline=($lc?"(?i)":"");
	    $ipost=($lc?"i":"");
	    next ;
	  }
	  next if $nval==5;
	  next if $nval==6;
	  next if $nval==7;
	  $abs=1,next if $nval==8;
        } else {
	  die "w3mfix: options must be the first directive in the config file.\n";
	}
      }
    }

    if ($key eq 'fixup') {

#      chomp($fixrc=`pwd` || '.');
#      $fixrc.="/$file";
#      warn "Fixrc: $fixrc\n";
#      $fixup=1;

      my($val,$nval);
      foreach $val (split(/\s*,\s*/,lc $value)) {
	$nval=nway($val,'on','run','noindex');
	next if $nval==0;
	next if $nval==1;
	$doindex=0 if $nval==2;
	# Ignore everyting else
      }
    }

  }
  close(CFGF);
}

sub wild_re {
  # Here we translate unix wildcard subset to to perlre
  local($_) = shift;
  s#\*#\.\*#;
  s#\?#\.#;
  s#([\/\(\)\\\|\{\}\+)\$\^])#\\$1#g;
  return $_ = '/'.$_.'/';
}


sub numeric {
  # Check if argument is numeric?
  my ( $number ) = @_ ;
  return oct($number) if ($number =~ /\d+/ || $number =~ /\d+.\d+/);
  die "Expected a number, got \"$number\"\n";
}


sub boolean {
  my ( $boolean ) = @_ ;

  $boolean = lc $boolean;

  return 0 if ($boolean eq 'false' || $boolean eq 'off' || $boolean eq '0');
  return 1 if ($boolean eq 'true' || $boolean eq 'on' || $boolean eq '1');
  die "Expected a boolean, got \"$boolean\"\n";
}


sub nway {
  my ( $value ) = shift;
  my ( @values ) = @_;
  my ( $val ) = 0;

  $value = lc $value;
  while (@_) {
    return $val if $value eq shift;
    $val++;
  }
  die "Expected one of ".join(", ",@values).", got \"$value\"\n";
}


sub stat {

  my $file = shift;

  if (exists($stat{$file})) {
    print STDERR "++Cache hit: $file\n" if $debug;
  } else {
    stat($file);
    if (-e _) {
      $stat{$file}=(-d _)?'d':'f';
    } else {
      $stat{$file}='n';
    }
    print STDERR "--Cache miss: $file\n" if $debug;
  }
  return $stat{$file};
}

# ######################## Read 'state' files ##############################

sub read_state {

  my $reffile='.referers';
  my $refered;
  my @referers;

  $reffile="referers" if $win32;

  print STDERR "reading $reffile\n" if $verbose>0;
  open(REFERERS,"< $reffile") ||
    die "Could not open $reffile for reading: $!\n";

  while (<REFERERS>) {
    chomp;
    ($refered,undef,@referers) = split(/\s+/);
    $rum_referers{$refered}= [ @referers ];
    # print STDERR $refered," <- ",join(' and ',@referers),"\n";
  }

  close(REFERERS);


  # Read redirection report

  my $redirfile='.redirs';
  my $wrong;
  my $right;
  my $tmp;

  $redirfile="redirs" if $win32;

  print STDERR "reading $redirfile\n" if $verbose>0;
  open(REDIRS,"< $redirfile") ||
    die "Could not open $redirfile for reading: $!\n";

  while (<REDIRS>) {
    chomp;
    ($wrong,undef,$right) = split(/\s+/);

    $rum_redirected{$wrong}=$right;
  }

  close(REDIRS);
}

# ######################### Process every single tag ########################

sub process_tag {
  # Process a tag in html file
  my $lf_referer = shift;
  my $base_url = shift;
  my $tag_name = shift;
  my $url_attrs = shift;

  # Retrun quickly if no URL attributes
  return unless defined($url_attrs);

  my $attrs = shift;

  # Information loss through apply or processing in this procedure?
  my $il = $infoloss; 
  my $redirs;
  my $stat;

  my $rum_url;	# The absolute URL
  my $lf_url;	# The local filesystem url
  my $lf_url_o; # ... and it's object
  my $key;
  my $orig_rum_url;
#  my $debug = 1;

#  print STDERR "\nProcess Tag: $tag_name, URL attributes: ", join(', ',@{$url_attrs}),"\nOrigin:",$base_url,"\n"; # if $debug>2;

  substr($lf_referer,0,0)='./' unless substr($lf_referer,0,1) eq '/';

  foreach $key (@{$url_attrs}) {
    if (defined($$attrs{$key})) {
      $orig_rum_url=$rum_url=$$attrs{$key};

      print STDERR "\n$key = $rum_url\n" if $debug;

      # Apply redirects:
      $redirs=0;
      if (exists($rum_redirected{$rum_url})) {
	$il=1;
	while (exists($rum_redirected{$rum_url})) {
	  die "Too many redirects in a row\n" if $redirs++>32;
	  print STDERR "$rum_url -> ".$rum_redirected{$rum_url}."\n"
	    if $debug;
	  $rum_url=$rum_redirected{$rum_url};
	}
      }

      # Apply program/user apply rules
      $lf_url=apply($rum_url);

      if (defined($lf_url)) {
	# Apply directory/file check here
	$stat=&stat($lf_url);

	if ($stat eq 'f' && $lf_url =~ m~/$~) {
	  # It's a file, remove trailing /
	  print STDERR "****** File / fixup of $lf_url\n" if $debug;
	  $il=1;
	  substr($lf_url,length($lf_url)) = '';
	} elsif ($stat eq 'd' && !($lf_url =~ m~/$~) ) {
	  # It's a directory, add a trailing /
	  print STDERR "****** Directory / fixup of $lf_url\n" if $debug;
	  $il=1;
	  $lf_url .= '/';
	}

	substr($lf_url,0,0)='./' unless substr($lf_url,0,1) eq '/';

	$lf_url_o=url $lf_url;
	my $tmp=$lf_url_o->clone;
	$tmp->basename($indexname);

	if ( $doindex && $lf_url_o->basename eq "" && 
	     &stat($tmp->unix_path) eq 'f' &&
	     !$htmlop::isdir{$key}) {
	  $lf_url_o=$tmp;
	  $il=1;
	  print STDERR "indexname adjusted to ",$lf_url_o->as_string,"\n"
	    if $debug;
	}
		 
	# Save new value in the hash, make it a file url to get 'rel' working
	$lf_url_o->scheme('file');
	$$attrs{$key} = ($lf_url_o->rel("file:".$lf_referer))->as_string;

	printf STDERR "Saved ".$$attrs{$key}."\n" if $debug;

	# If there is potential information loss save the old value too
	$$attrs{"W3MIR".$key}=$orig_rum_url if $il;

      } elsif ($redirs>0) {
	$$attrs{$key}=$rum_url;
	print STDERR "Saved ".$$attrs{$key}."\n" if $debug;
	$$attrs{"W3MIR".$key}=$orig_rum_url;
      }
    }
  }
}


# ###################### Edit the URLs in one file... ########################

sub edit_html_file {
  # Check if it's a html file.  I know this tag is in all html
  # files, because w3mir put it there.

  my($lf_url)=shift;
  my($rum_url)=shift;

  # Figure out the filename for our local filesystem.
  $lf_url.=$indexname if $lf_url =~ m~/$~ || $lf_url eq '';

  # Stuff in need of unquoting?
  $lf_url = (url "file:$lf_url")->local_path if $lf_url =~ /\%\d\d/;

  if (exists($lf_edited{$lf_url})) {
    print STDERR "Already edited $lf_url\n" if $debug;
    return ;
  }

  $lf_edited{$lf_url}=1;

  my $page;
  my $newpage;
  my $read;
  my $atime;
  my $mtime;

  # dev  uno   mode  nlink uid   gid   rdev  size  atime mtime
  (undef,undef,undef,undef,undef,undef,undef,undef,$atime,$mtime)
    = stat($lf_url);

  if (!open(TMPF,"< $lf_url\n")) {
    warn "Cannot read $lf_url: $!\n" if $verbose>=0;
    return;
  }

  $read=sysread(TMPF,$page,10240,0);
  close(TMPF);

  if (! $page =~ /<HTML/i) {
    print STDERR "$lf_url is not html\n" if $verbose>0;
    return ;
  }

  $files++;

  print STDERR "w3mfix: $lf_url" if $verbose>=0;
  
  print STDERR "$lf_url is a html file\n" if $debug;

  print STDERR " reading" if $verbose>0;
  
  open(TMPF,$lf_url) || 
    die "Could not open $lf_url for reading: $!\n";
  # read the whole file.
  {
    local($/)=undef;
    $page = <TMPF>;
  }
  close(TMPF);

  print STDERR " ",length($page)," bytes" if $verbose>0;

  # It's a html document

  print STDERR ", editing" if $verbose>0;

  ($newpage,undef) = 
    &htmlop::process($page, # $htmlop::NODOC,
		     $htmlop::ABS,$rum_url,
		     $htmlop::USESAVED,'W3MIR',
		     $htmlop::TAGCALLBACK,\&process_tag,$lf_url);

  open(TMPF,">$lf_url") ||
    die "\nCould not open $lf_url for writing: $!\n";
  
  print STDERR ", saving" if $verbose>0;

  if (length($newpage)) {
    # This is ODD: close does not seem to flush the buffers.  So we
    # force the issue.
    local($|)=1;
    print TMPF $newpage || 
      die "\nCould not write to $lf_url (disk full?): $!\n";
  }

  close(TMPF) || 
    die "\nCould not close $lf_url after writing: $!\n";

  # Set times back to what they were.
  utime $atime,$mtime,$lf_url;

  print STDERR ".\n" if $verbose>=0;
}

# ############################### Scope test

sub want_this {
  # Find out if we want the url passed.  Just pass it on to the
  # generated functions.
  my($rum_url)=shift;

  # Does scope rule want this?
  return &$scope_code($rum_url) &&
    # Does user rule want this too?
    &$rule_code($rum_url) 
  
}

# ############################### Apply the apply rules

sub user_apply {
  # Apply the user apply rules
  return &$user_apply_code(shift);
}


sub internal_apply {
  # Apply the w3mir generated apply rules
  return &$apply_code(shift);
}


sub apply {
  # Apply the user apply rules.  Then if URL is wanted return result of
  # w3mir apply rules.  Return the undefined value otherwise.

  my $url = user_apply(shift);

  return undef unless want_this($url);

  internal_apply($url);
}

# ############################### Decide what URLs to edit

sub edit_as_needed {
  my $rum_redirected;
  my $rum_url;
  my $o_rum_url;
  my $rum_referer;
  my $lf_url;
  my $foo;
  my $redirs;

  if ($editthis) {

    # Find the URLs that match $edithis
    foreach $o_rum_url (keys %rum_referers) {
      # Work on them if they (now) fall within the scope of retrival
      $redirs=0;
      $rum_url=$o_rum_url;
      if (exists($rum_redirected{$o_rum_url})) {
	while (exists($rum_redirected{$rum_url})) {
	  die "Too many redirects in a row\n" if $redirs++>32;
	  print STDERR "$rum_url -> ".$rum_redirected{$rum_url}."\n"
	    if $debug;
	  $rum_url=$rum_redirected{$rum_url};
	}
      }
      next unless $rum_url =~ /$editthis/io;
      next unless want_this($rum_url);
      # Find and edit the documents containing references to $o_rum_url
      foreach $rum_referer (@{$rum_referers{$o_rum_url}}) {
	next if $rum_referer eq '(commandline)';
	$lf_url=apply($rum_referer);
	next unless defined($lf_url);
	edit_html_file($lf_url,$rum_referer);
      }
    }
    
    # Don't do anything else when invoked thus
    return;
  }

  if ($doindex) {
    # Edit everything that refers anything with trailing /
    foreach $rum_url (grep(/\/$/,keys %rum_referers)) {
      foreach $rum_referer (@{$rum_referers{$rum_url}}) {
	next if $rum_referer eq '(commandline)';
	$lf_url=apply($rum_referer);
	next unless defined($lf_url);
	edit_html_file($lf_url,$rum_referer);
      }
    }
  }

  # Edit only redirected stuff
  foreach $rum_redirected (keys %rum_redirected) {
    # print "Redirected $rum_redirected\n";
    foreach $rum_url (@{$rum_referers{$rum_redirected}}) {
      # print "- Found in $rum_url\n";
      $lf_url=apply($rum_url);
      next unless defined($lf_url);
      edit_html_file($lf_url,$rum_url);
    }
  }
}

# ############################### 'main'

&parse_args(@ARGV);

if (!$rc) {
  $rc='.w3mirc';
  $rc='w3mir.ini' if $win32;
}

print STDERR "w3mfix: rc file: $rc\n" if $verbose>0;

&parse_cfg_file($rc);

# $chdirto is ignored, w3mir already did it

# Compile second order code

# - The rum scope tests
my $full_rules=$scope_prefix.$scope_fetch.$scope_ignore.$scope_postfix;
eval $full_rules;

# warn "Scope rules:\n-------------\n$full_rules\n---------------\n";

die "Program generated rules did not compile.  The code is:\n----\n".
  $full_rules."\n----\n"
  if !defined($scope_code);

$full_rules=$rule_prefix.$rule_text.$rule_postfix;
eval $full_rules;

# warn "User rules:\n-------------\n$full_rules\n---------------\n";

# - The user specified rum tests
die "Ignore/Fetch rules did not compile.  The code is:\n----\n".
  $full_rules."\n----\n"
  if !defined($rule_code);

# - The user specified apply rules

my $full_apply=$apply_prefix.($lc?$apply_lc:'').
  join($ipost.";\n",@user_apply).(($#user_apply>=0)?$ipost:"").";\n".
  $apply_postfix;
eval $full_apply;

die "User apply rules did not compile. The code is:
----
".$full_apply."
----\n" if !defined($apply_code);

$user_apply_code=$apply_code;

# - The w3mir generated apply rules

$full_apply=$apply_prefix.($lc?$apply_lc:'').
  join($ipost.";\n",@internal_apply).(($#internal_apply>=0)?$ipost:"").";\n".
  $apply_postfix;
eval $full_apply;

die "Internal apply rules did not compile.  The code is:
----
".$full_apply."
----\n" if !defined($apply_code);

&read_state;

&edit_as_needed;

exit 0;
__END__
# -*- perl -*-  There must be a blank line here:

=head1 NAME

w3mfix - fixup program for w3mir

=head1 SYNOPSIS

B<w3mfix> [B<options>] [B<configuration-file>]

=head1 DESCRIPTION

B<w3mfix> is the companion program to L<w3mir>.  It can be used for
several URL editing operations usefull in different situations.  

When starting B<w3mfix> will read it's configuration file.  It's name is 
either .w3mirc (w3mir.ini on win32) or specified on the commandline.

B<w3mfix> is controlled by the 'Fixup' directive of the configuration
file (described in the L<w3mir> documentation).  B<w3mfix> is also
affected by 'Index-name' and the one special commandline option it
knows, as well as the directives/options controlling verbosity and
debugging information.

=head1 DESCRIPTION

B<w3mfix> can rewrite URLs in these ways:

=over 4

=item * Rewrite URLs that resutled in redirects to point to the place
redirected to.  This is needed in all cases and will always be fixed
by B<w3mfix>.

=item * Change URLs ending in .../ into .../index.html (or
.../Welcome.html).  This is, probably, not needed when the mirror is
meant do be used with a web-server.  It is usefull for browsing
directly from disk or CDROM, but in this case it's, most often,
required.  To disable this specify the I<noindex> option with the
'Fixup' directive.

The default is to transform URLs ending in .../ into .../index.html.

To controll the name of the index file use the I<Index-name> directive
as documented in L<w3mir>

=item * Change URL links to documents outside the mirror to point to
some local document.  Could be usefull if the mirror is destined for a
CDROM to be used on a unconnected machine.

THIS IS NOT YET IMPLEMENTED

=item * Change URL links to documents that L<w3mir> was
unable/forbidden to retrive to point to some local document.  Pointing
these to a nice informative document is probably better than random
error messages from the browser.

THIS IS NOT YET IMPLEMENTED

=item * And, least, but far from last, B<w3mfix> can be used to
prepare an established mirror for enlargement.

This feature is used thus: Add the new site or subsite to be mirrored
on the configuration file (by adding B<Also:> and B<Also-quene:>
directives).  Then run B<w3mfix> with the B<-editref> option.  When
the B<-editref> option is specified B<w3mfix> will not perform any
other editing tasks.

E.g.; To add I<http://www.yahoo.com/Science/Artificial_Life/> to your
mirror add something like

	  Also: http://www.yahoo.com/Science/Artificial_Life/ yahoo

to the configuration file, then run w3mfix:

	  w3mfix -editref www.yahoo.com/Science/Artificial_Life

This will cause all references to
I<www.yahoo.com/Science/Artificial_Life> (and under) to be edited so they
point to within the mirror.  After B<w3mfix> has finished you can run
L<w3mir> in the normal manner.

=back

=head1 BUGS

Naah.

=head1 SEE ALSO

L<w3mir>

=head1 AUTHORS

B<w3mir>s authors can be reached at I<w3mir-core@usit.uio.no>.
B<w3mir>s home page is at http://www.math.uio.no/~janl/w3mir/

__END__
# -*- perl -*-  There must be a blank line here:

=head1 NAME

w3mfix - fixup program for w3mir

=head1 SYNOPSIS

B<w3mfix> [B<options>] [B<configuration-file>]

=head1 DESCRIPTION

B<w3mfix> is the companion program to L<w3mir>.  It can be used for
several URL editing operations usefull in different situations.  

When starting B<w3mfix> will read it's configuration file.  It's name is 
either .w3mirc (w3mir.ini on win32) or specified on the commandline.

B<w3mfix> is controlled by the 'Fixup' directive of the configuration
file (described in the L<w3mir> documentation).  B<w3mfix> is also
affected by 'Index-name' and the one special commandline option it
knows, as well as the directives/options controlling verbosity and
debugging information.

=head1 DESCRIPTION

B<w3mfix> can rewrite URLs in these ways:

=over 4

=item * Rewrite URLs that resutled in redirects to point to the place
redirected to.  This is needed in all cases and will always be fixed
by B<w3mfix>.

=item * Change URLs ending in .../ into .../index.html (or
.../Welcome.html).  This is, probably, not needed when the mirror is
meant do be used with a web-server.  It is usefull for browsing
directly from disk or CDROM, but in this case it's, most often,
required.  To disable this specify the I<noindex> option with the
'Fixup' directive.

The default is to transform URLs ending in .../ into .../index.html.

To controll the name of the index file use the I<Index-name> directive
as documented in L<w3mir>

=item * Change URL links to documents outside the mirror to point to
some local document.  Could be usefull if the mirror is destined for a
CDROM to be used on a unconnected machine.

THIS IS NOT YET IMPLEMENTED

=item * Change URL links to documents that L<w3mir> was
unable/forbidden to retrive to point to some local document.  Pointing
these to a nice informative document is probably better than random
error messages from the browser.

THIS IS NOT YET IMPLEMENTED

=item * And, least, but far from last, B<w3mfix> can be used to
prepare an established mirror for enlargement.

This feature is used thus: Add the new site or subsite to be mirrored
on the configuration file (by adding B<Also:> and B<Also-quene:>
directives).  Then run B<w3mfix> with the B<-editref> option.  When
the B<-editref> option is specified B<w3mfix> will not perform any
other editing tasks.

E.g.; To add I<http://www.yahoo.com/Science/Artificial_Life/> to your
mirror add something like

	  Also: http://www.yahoo.com/Science/Artificial_Life/ yahoo

to the configuration file, then run w3mfix:

	  w3mfix -editref www.yahoo.com/Science/Artificial_Life

This will cause all references to
I<www.yahoo.com/Science/Artificial_Life> (and under) to be edited so they
point to within the mirror.  After B<w3mfix> has finished you can run
L<w3mir> in the normal manner.

=back

=head1 BUGS

Naah.

=head1 SEE ALSO

L<w3mir>

=head1 AUTHORS

B<w3mir>s authors can be reached at I<w3mir-core@usit.uio.no>.
B<w3mir>s home page is at http://www.math.uio.no/~janl/w3mir/

