#!/usr/bin/perl
#
#   $0 - Walk and clean WWW-pool (for perl 5.002)
#       by k-chinen@is.aist-nara.ac.jp, 1994, 1995, 1997, 1998
#
#
# $Id: pool_walk,v 1.1 1997/03/28 11:58:07 k-chinen Exp k-chinen $
#
#
# Note:
#   This program walk and clean WWW-pool (which made by wcol).
#       - show attributes
#       - remove strange files and directories
#

#
#                       *** Handle With Care ***
#
#   This program include remove action. So, If you mistake specification
#   target directory, you lost other important files also.
#

#
# usage: $0 [options] dir
#
# dir:  target direcotory
#
# option:   -h  help
#           -t  trace
#           -r  recursive
#           -k  kill ignore file
#           -l  long
#
# example:
#
#   Show attribute (short)
#
#           $0 -r /tmp/www-pool/http/info.cern.ch
#
#   Show attribute (long)
#
#           $0 -r -l /tmp/www-pool/http/info.cern.ch
#
#   Show attribute of data and kill ignore file.
#
#           $0 -r -k /tmp/www-pool/http/info.cern.ch
#
#


require "getopts.pl";


sub warnmsg {
#   print "#\t\x1b[7m Warning: @_ \x1b[m\n";
    print "#     * Warning: @_\n";
}

sub errmsg {
#    print "#\t\x1b[41m Error: @_ \x1b[40m\x1b[30m\x1b[m\n";
    print "# * * * Error: @_\n";
}

#
# show_info - read ",info"-file and show attributes of info.
#
sub show_info {
    local($fatal) = 0;

    local($ipath) = @_;
    local($path) = $ipath;
    $path =~ s/,info$//;
    local($hpath) = "$path,head";

    local(@parts) = split(/\//, $ipath);
    local($iname) = $parts[$#parts];
    local(@parts) = split(/,/, $iname);
    local($name) = $parts[0];


#
# Read files
#
    #
    # INFO-file
    #
    local($dev, $ino, $mode, $nlink, $uid, $gid, $rdev, $isize,
        $atime, $mtime, $ctim, $blksize, $blocks) = stat($ipath);

    open(F, $ipath);
    read(F, $buf, 2048, 0);
    close(F);

    ($id) = unpack("a16",$buf);

    if($id =~ /^0.02a/ || $id =~ /^0.02b/) {
        ($id, $ver, $url, $type, $statu, $count, $first, $last, $mstakes)
            = unpack("a16 a16 a256 a64 i l l l l",$buf);
        $msconn = -1;
        $mstrans = -1;
    }
    elsif($id =~ /^0.02c/) {
        ($id, $ver, $url, $type, $statu, $count, $first, $last,
            $mstakes, $msconn, $mstrans)
            = unpack("a16 a16 a256 a64 i l l l l l l",$buf);
    }
    elsif($id =~ /^0.02\@a/) {
        ($id, $ver, $url, $type, $statu, $count, $first, $last,
            $mstakes, $msconn, $mstrans)
            = unpack("a16 a16 a256 a64 i l l l l l l",$buf);
    }
    elsif($id =~ /^0.03a0/) {
        ($id, $ver, $url, $type, $statu, $count, $first, $last,
            $mstakes, $msconn, $mstrans)
            = unpack("a16 a16 a256 a64 i l l l l l l",$buf);
    }
    elsif($id =~ /^0.03b0/) {
        ($id, $ver, $url, $type, $statu, $count, $first, $last,
            $mstakes, $msconn, $mstrans)
            = unpack("a16 a16 a512 a64 i l l l l l l",$buf);
    }
    elsif($id =~ /^0.042/) {
        ($id, $url, $statu, $ver, $type, $len, $lmd,
            $mstakes, $msconn, $mstrans,
			$istate, $hsize, $bsize, $count, $first, $last, $expire)
            = unpack("a16 a512 i a16 a64 l l l l l l l l l l l l",$buf);
    }
    elsif($id =~ /^WCOL-INFO-0.045/) {
        ($id, $url, $statu, $ver, $type, $len, $lmd,
            $mstakes, $msconn, $mstrans,
			$istate, $hsize, $bsize, $count, $first, $last, $expire)
            = unpack("a16 a1024 i a16 a64 l l l l l l l l l l l l",$buf);
    }
    elsif($id =~ /^WCOL-INFO-0.050/) {
        ($id, $url, $statu, $ver_major, $ver_minor, $type, $len, $lmd,
			$clength,
            $mstakes, $msconn, $mstrans,
			$istate, $hsize, $bsize, $count, $first, $last, $expire)
            = unpack("a16 a1024 i c c a66 l l l l l l l l l l l l l",$buf);

		$ver = sprintf("%d.%d", $ver_major, $ver_minor);
    }
    else {
        &errmsg("Strange INFO (id=$id) $ipath");
        return
    }

    $id   =~ s/\0//g;
    $ver  =~ s/\0//g;
    $url  =~ s/\0//g;
    $type =~ s/\0//g;

    if($url eq "") {
        $url = "NoURL";
    }
    if($ver eq "") {
        $ver = "NoVersion";
    }
    if($id eq "") {
        $id = "NoID";
    }
    if($type eq "") {
        $type = "NoType";
    }


    if($istate eq "1") {
		$c_use++;
		$b_use += $len;
	}
    if($istate eq "2") {
		$c_unuse++;
		$b_unuse += $len;
	}



    #
    # BODY-file
    #
    local($dev, $ino, $mode, $nlink, $uid, $gid, $rdev, $size,
        $atime, $mtime, $ctim, $blksize, $blocks) = stat($path);


    #
    # HEAD-file
    #
    undef @lparts;
    undef %fields;
    $fields{"content-length"} = -1;

    open(F, $hpath);
    while(<F>) {
        chop;
#       print "\t> $_\n";

        @lparts = split(/: /,$_);
        $lparts[0] =~ tr/A-Z/a-z/;          # normalize to small
        $fields{$lparts[0]} = $lparts[1];
        if(/^HTTP\//) {
            ($protocol, $response_state) = split(/ /, $_);
        }
        if($_ eq "" || $_ eq "\r") {
            last;
        }
    }
    close(F);


#
# Check
#
    if($response_state >= 400) {
        &errmsg("Error Message $path");
        $fatal = 1;
    }
    if(-z $path) {
        &errmsg("Empty Body $path");
        $fatal = 1;
    }

    if($fatal==1) {
        if($opt_k) {
            &errmsg("Kill Files $path $ipath $hpath");
            unlink $path, $ipath ,$hpath;
        }
        return
    }

#
# Report
#
    if($opt_l) {

print <<EOM;
=====
ID       $id
URL      $url
Type     $type
Size     $size
Statu    $statu
Count    $count
First    $first
Last     $last
-----
Conn     $msconn
Trans    $mstrans
Takes    $mstakes
-----
Protocol $protocol
Response $response_state
Recieve  $fields{"content-length"}
EOM

    }
    else {
#        print "$id $istate $url $type $size $statu $count $first $last $msconn $mstrans $mstakes\n";
        print "$istate $url $type $size $statu $count $first $last $msconn $mstrans $mstakes\n";
    }

    if($size != $fields{"content-length"}) {
        if($fields{"content-length"} != -1) {
            &warnmsg("Size Missmatch");
        }
    }
    if($type != $fields{"content-type"}) {
        &warnmsg("Type Missmatch");
    }

}


#
# break_down - break down directory.
#
#   0. check current directory
#   1. check files and remove error files and strange files
#   2. apply this routine to sub directories
#   3. apply 'show_info'-procedure to files
#   4. check current directory
#
#
sub break_down {
    local($name) = @_;

    if($opt_t) {
        print "#\n# Enter $name\n#\n";
    }

#
# Phase 0: check current directory
#
    opendir(CDIR,$name);
    local(@list) = grep(!/^\.\.?/, sort readdir(CDIR));
    closedir(CDIR);
    if($#list==-1) {
        if($opt_k) {
            &errmsg("Empty directory $name, remove");
            rmdir($name) || print "# *** RMDIR 1 ERROR *** \n";
        }
        return;
    }



    #
    # list      all files
    #
    # dirs      diretory
    # a_files   all files (without directory)
    #
    # b_files   body
    #

    local(@dirs) = grep(-d "$name/$_", @list);
    local(@a_files) = grep(!(-d "$name/$_"), @list);
    local(@b_files) = grep(!/,head$/ && !/,info$/, @a_files);

    # mark use or not
    local(%use);
    for (@a_files) {
        $use{$_} = -1;
    }

    if($opt_d) {
        print "dirs: $#dirs ";
        for (@dirs) {
            print "$_ ";
        }
        print "\n";

        print "a_files: $#a_files ";
        for (@a_files) {
            print "$_ ";
        }
        print "\n";

        print "b_files: $#b_files ";
        for (@b_files) {
            print "$_ ";
        }
        print "\n";

        print "target: $#a_files\n";
        for (@a_files) {
            print "\t".$_." ".$use{$_}."\n";
        }
        print "\n";
    }




#
# Phase 1: check files and remove error files and strange files
#
    for (@b_files) {
        if($use{"$_,head"}==-1 && $use{"$_,info"}==-1) {
            $use{$_} = 1;
            $use{"$_,head"} = 1;
            $use{"$_,info"} = 1;
        }
    }

    if($opt_d) {
        print "target: $#a_files\n";
        for (@a_files) {
            print "\t".$_." ".$use{$_}."\n";
        }
        print "\n";
    }

    for (@a_files) {
        if($use{$_}==-1) {
            if($opt_k) {
                &errmsg("Ignore file $name/$_, remove");
                unlink "$name/$_";
            }
        }
    }



#
# Phase 2: apply this routine to sub directories
#
    if($opt_r) {
        for (@dirs) {
            &break_down("$name/$_");
        }
    }



#
# Phase 3: apply 'show_info'-procedure to files
#
    for (@b_files) {
        if($use{"$_"}==1) {
            &show_info("$name/$_,info");
        }
    }



#
# Phase 4: check current directory
#
    opendir(CDIR,$name);
    local(@list) = grep(!/^\.\.?/, sort readdir(CDIR));
    closedir(CDIR);
    if($#list==-1) {
        if($opt_k) {
            &errmsg("Then, Empty directory $name, remove");
            rmdir($name) || print "# *** RMDIR 2 ERROR *** \n";
        }
        return;
    }
}





#
# Main Body
#

$now = time;

&Getopts('hrkp:tld');

if($opt_p) {
    $timelimit = $opt_p;
}
else {
    $timelimit = $now - 7*24*60*60;     # 1 week
}

$target = $ARGV[0];
if($target ne "/") {
    $target =~ s/\/$//;
}


if($opt_h) {
print <<EOM;
pool_walk - Walk WWW-pool. show info and clean strange data.
            by k-chinen\@is.aist-nara.ac.jp, 1994, 1995

usage: $0 [options] dir

option: -h      help
        -t      trace
        -r      recursive
        -k      kill ignore file
        -p sec  period (current limit=$timelimit, now $now)
        -l      long
        -d      debug

example:
    % $0 -r -/CACHE/wcol
                --- show info in last 1 week on pool.

    % $0 -r -p 0 /CACHE/wcol
                --- show all info on pool.

EOM

    exit 0;
}


if(!(-d $target)) {
    print "Error $target is not directory. please check target.\n";
}

print "#\n";
print "# Target Directory: $target\n";
print "# Options:";
    if($opt_r) {
        print " Recursive";
    }
    if($opt_k) {
        print " Kill";
    }
    if($opt_t) {
        print " Trace";
    }
    print "\n";
print "#\n";
if(!$opt_l) {
    print "# Format:\n";
    print "#       I URL TYPE SIZE STATU COUNT FIRST LAST CONN TRANS TOTAL\n";
    print "#       1  2   3    4    5     6     7     8    9    10    11\n";
    print "#\n";
}

$c_use = 0; 	$b_use = 0;
$c_unuse = 0;	$b_unuse = 0;


&break_down($target);

# ($b_all,$b_use,$b_unuse,$c_all,$c_use,$c_unuse) = 

print  "#\n";
printf "# used:   count %10d size %10d\n",$c_use , $b_use;
printf "# unused: count %10d size %10d\n",$c_unuse , $b_unuse;
print  "#\n";


