#!/usr/bin/perl
# Ija - Access deja.com from command line.
# Copyright 1999 Asher Blum - licensed under the GPL.
# based on Randal Schwarz's alta script
#
# Overview: an interactive reader for Google Groups search results.  An
# optional first command-line argument is used as the initial query; after
# that, one-letter commands are read from <> (type 'h' for the list).
#
# NOTE(review): this copy was rebuilt from a whitespace-collapsed,
# HTML-stripped paste.  Every place where text had to be restored rather
# than copied is marked NOTE(review) below - confirm those against a
# pristine copy of the script before relying on them.
#
# 'use warnings' is intentionally not enabled: the script relies on
# undef-as-empty in several tests (e.g. $ENV{LINES}, $article_no[$ref]).

my $VERSION='0.4.15';
# 13-Aug-2003 - adapt to a google change in results
my $fail = "\nTry the 'v' command to see if the version is current.\n";

use strict;
use LWP::UserAgent;
use HTTP::Request;       # used by fetch(); also loaded by LWP::UserAgent
use URI::Escape;         # Escape a string into a URL query

my (
    $date, $subject, $group, $author,
    $ref,            # ref no. (0-n) of current article
    $offset,         # ref no. of first article in current list
    $chunk_size,     # how many articles per screen (must match DN)
    $verbose,        # print diagnostic messages?
    $start_url,
    $next_msg_url,   # url to get next list of articles
    $prev_msg_url,   # url to get previous list of articles
    $page,           # the current html document
    $page_thresh,    # msg size which triggers pager
    $pager,          # program, e.g. 'less'
    $total_hits,     # no. hits, including (exactly|about)
    @result,         # current results list, \0 separated
    @article_no,     # AN number (big) keyed to ref no. (small)
    $QBASE,          # base url for query
    $query_start,    # first part of query
    $query_end,      # third part of query (after meat)
    $aquery_end,
);

$| = 1;              # view output incrementally
$chunk_size  = 25;
$page_thresh = 24;

# Size the result list and paging threshold from the terminal height when
# LINES is exported and looks sane.
my $lines = $ENV{ LINES };
if ($lines > 5 && ($lines < 120 || $ENV{ TERM } =~ /xterm/)) {
    $chunk_size  = $lines - 4;
    $page_thresh = $lines;
    # print"CS = $chunk_size pt = $page_thresh\n";
}
$pager = $ENV{PAGER} || 'less';
$= = $chunk_size+3;  #make FORMAT work correctly
$verbose = 0;

$QBASE       = 'http://groups.google.com/';
$query_start = 'groups?q=';
# Older form of the query tail; the original assigned it and then
# immediately overwrote it with the value below, so it is kept only for
# reference:
# $query_end = "&num=$chunk_size&btnG=Google+Search&as_oq=&as_epq=&as_eq=&as_ugroup=&as_usubject=&as_uauthors=+&as_umsgid=&lr=";
$query_end = "&btnG=Search&meta=site%3Dgroups&num=$chunk_size";
# author search:
$aquery_end = "&num=$chunk_size&btnG=Google+Search&as_oq=&as_epq=&as_eq=&as_ugroup=&as_usubject=&as_uauthors=+abigail&as_umsgid=&lr=&as_uauthors=+";

# NOTE(review): column spacing of the two heading lines was lost; the
# headings are aligned here with the field starts of format STDOUT.
format STDOUT_TOP =
Search Results
Ref Date         From           Newsgroup             Subject
--------------------------------------------------------------------------------
.

format STDOUT =
@## @>>>>>>>>>>> @<<<<<<<<<<<<< @<<<<<<<<<<<<<<<<<<<< @<<<<<<<<<<<<<<<<<<<<<<<<<
$ref,$date,      $author,       $group,               $subject
.

my $first_target = shift;
my $ua = LWP::UserAgent->new;
$ua->agent("ija/$VERSION");
$ua->env_proxy() if $ENV{ HTTP_PROXY };

# Run the query given on the command line, if any, before prompting.
if ($first_target) {
    my $query = uri_escape($first_target);
    $verbose && print"query=$query\n";
    if ($query =~ /./) {
        $start_url = "$QBASE$query_start$query$query_end";
        get_results($start_url);
    }
    $offset = 1;
    show_results($offset);
}

# Main Loop:
# Read commands from user.
while (<>) {
    chomp;
    if (/^n/ && $next_msg_url =~ /./) {
        get_results($next_msg_url);
        $offset += $chunk_size;
        show_results($offset);
    }
    elsif (/^p/ && $prev_msg_url =~ /./) {
        get_results($prev_msg_url);
        $offset -= $chunk_size;
        show_results($offset);
    }
    elsif (/^l/) {
        show_results($offset);
    }
    elsif (/^m\s*(\d*)/) {
        if (!($1)) {
            print"Max hits = $chunk_size\n";
        }
        elsif ($1 > 100) {
            print"Cannot increase max hits above 100.\n";
        }
        else {
            $chunk_size = $1;
            # $query_end =~ s/num=\d+//;
            # $query_end .= "num=$chunk_size";
            $query_end  =~ s/num=\d+/num=$chunk_size/;
            $aquery_end =~ s/num=\d+/num=$chunk_size/;
            $= = $chunk_size+3;  #make FORMAT work correctly
        }
    }
    elsif (/^s\s*(.+)/) {
        # Three-arg open so the typed name is always treated as a plain
        # path.  Two-arg open ignored trailing blanks; trim them to match.
        my $file = $1;
        $file =~ s/\s+\z//;
        open my $out, '>>', $file or die "Can't open $file for output: $!";
        print {$out} $page;
        close $out;
        print"Wrote ", length($page), " bytes to $file.\n";
    }
    elsif (/^t\s*(\d*)/) {
        if (!length($1)) {
            print"Page threshold = $page_thresh. ".
                ($page_thresh ? "Set it to 0 to disable paging.\n" : "Paging disabled.\n");
        }
        else {
            $page_thresh = $1;
        }
    }
    elsif (/^r/) {
        open(my $lpr, '|-', 'lpr') || die "Can't open pipe to lpr: $!";
        print {$lpr} $page;
        close $lpr;
    }
    elsif (/^f\s*(.+)/) {
        $start_url = "$QBASE$query_start".uri_escape($1)."$query_end";
        get_results($start_url);
        $offset = 1;
        show_results($offset);
    }
    elsif (/^X/) {
        # xterm window-resize escape: ESC [ 8 ; rows ; cols t
        my $new_height = $chunk_size + 4;
        print $ENV{ TERM } eq 'xterm' ? chr(27) . "[8;$new_height;80t" : "Can't: TERM != xterm.";
        show_results($offset);
    }
    elsif (/^[H\?h]/) {
        # NOTE(review): the heredoc opener, any heading lines, and the
        # angle-bracketed argument placeholders (other than <1-100>) were
        # stripped from this text; only the opener and "[<author>]" could
        # be restored with confidence.
        print <<EOH;
a [<author>]  Author search.
f             Search for a new query string.
h             Print help message.
l             List current group of messages.
m <1-100>     Set max hits returned. (25 normally)
n             List next group of messages.
p             List previous group of messages.
q             Quit.
r             Print the current message.
s             Save the current message to a file.
t             Set the paging threshold - longer articles will be paged.
v             Version - check if there's a newer version.
X             Size your xterm for the current max hits

Enter a reference number to view a message.
EOH
    }
    elsif (/^a\s*(.*)/) {
        $start_url = "$QBASE$query_start$aquery_end" . uri_escape($1);
        get_results($start_url);
        $offset = 1;
        show_results($offset);
    }
    elsif (/^(\d{1,3})/) {
        $ref = $1;
        if ($article_no[$ref] !~ /./) {
            print"Invalid number\n";
        }
        else {
            load_article($article_no[$ref]);
        }
    }
    elsif (/^q/) {
        exit(0);
    }
    elsif (/^v/) {
        print "Ija version $VERSION - ";
        my $c_ver = fetch('http://wildspark.com/asher/ija/version');
        chomp $c_ver;
        chomp $c_ver;
        print $VERSION eq $c_ver
            ? "you have the latest.\n"
            : "$c_ver is the latest.\nhttp://wildspark.com/asher/ija/ija-current.gz\n";
    }
    else {
        print"Invalid command. Type 'h' for help.\n";
    }
} # end of main loop

# author_profile([$author_name])
# Fetches a deja.com author profile page and prints per-row counts followed
# by a total line.  Falls back to the file-level $author (set by the last
# article viewed) when no name is passed.  Nothing in this file calls it.
sub author_profile {
    my $x_author   = shift;
    my $r_author   = 'UNKNOWN';  # author returned by server
    my $total_msgs = 0;          # num. returned by server
    my $s_author   = $x_author ? $x_author : $author;
    my $url  = "http://www.deja.com/profile.xp?author=$s_author&ST=PS";
    my $page = fetch($url);
    my @row  = split(/\n\n+/, $page);
    for (@row) {
        s/\n//g;
        /^<.{78,88}>\s*(\d+)<.*>(.+)<.{15,23}$/ && printf("%6d %s\n",$1,$2);
        /There are (\d+) unique messages by/ && ($total_msgs=$1);
        # NOTE(review): one more statement stood here - it began "/^" and
        # ended "/ig" - and presumably captured the server-reported name
        # into $r_author.  Its pattern is unrecoverable, so $r_author
        # stays 'UNKNOWN'.
    }
    # NOTE(review): the loop's closing brace was lost with that statement;
    # the summary line is assumed to belong after the loop.
    printf("%6d by %s\n",$total_msgs,$r_author);
}

# load_article($an)
# Fetches the article whose message id is $an, reduces the HTML page to
# header + body text, stores the text in the file-level $page (so the 's'
# and 'r' commands act on it) and prints it, through $pager when the raw
# page has more lines than $page_thresh.  Sets $author from the From: line.
# Dies if the page cannot be fetched or no header element is found.
sub load_article {
    my $an = shift;  #Dejanews article number
    $verbose && print"******************** Article $an ********************\n";

    # Earlier URL forms, superseded (the original assigned and discarded them):
    #   http://www.deja.com/getdoc.xp?AN=$an&fmt=text
    #   http://groups.google.com/groups?q=xxxxxxx&start=10&hl=en&lr=&safe=off&rnum=11&seld=$an&ic=1
    my $url = "http://groups.google.com/groups?q=xxxxxx&num=25&lr=&safe=off&rnum=1&ic=1&selm=$an";
    $page = fetch("$url");

    # save for debugging if necessary (only when the file already exists):
    if (-f 'ija-article.html' && open my $dbg, '>', 'ija-article.html') {
        print {$dbg} $page;
        close $dbg;
    }

    # The paging decision is made on the raw page's line count, as before.
    my $pager_fh;
    my @raw_lines = split("\n", $page);
    if ($page_thresh && (@raw_lines > $page_thresh)) {
        open($pager_fh, '|-', $pager) || die"Can't open $pager :$!";
        select($pager_fh);
    }

    my $orig_len = length($page);

    # NOTE(review): the tag name in this split pattern was stripped (only
    # "]*>" survived).  "table" is a guess, based on the header fields being
    # expected together in a single element - confirm.
    my @el = map unhtml($_), (split /<table[^>]*>/i, $page);
    # print"*** $_ : $el[ $_ ]\n" for 0..$#el;

    # Take the first element holding a From: line.  Test definedness, not
    # truth, so a header found at index 0 is not reported as missing.
    my ($header_el) = grep { $el[ $_ ] =~ /\nFrom:/ } 0..$#el;
    defined $header_el
        or die "Header not found in message.$fail";
    # print"Header el = $header_el\n";

    $el[ $header_el + 1 ] =~ s/ This is the only article in this thread //;
    $el[ $header_el + 1 ] =~ s/\s*View complete thread\s*//;
    $page = $el[ $header_el ] . $el[ $header_el + 1 ]; # the meat is in this element - don't need the rest

    $page =~ s/$_/\n$_/ for qw( Subject: Date: );
    $page =~ s/(Date:.\S+ \S+ \S+)/$1\n\n/;
    $page =~ s/View: Original Format//;
    $page =~ s/^(From:.*)\((\S+\@\S+)\)/$1<$2>/m;
    $page =~ s/^Search result \d+\n//i;
    $page =~ s/\[view thread\]//i;
    $page =~ s/\s+$//;
    $page =~ s/\n\n\n/\n\n/;
    $page =~ s/View:.*Original Format.*//;
    # $page =~ s/^(From:.*)/XXXXXXXXX/m;

    print "-" x 70, "\n";
    # print"\n--- $_ ---\n$el[$_]\n" for (0..$#el);

    # Calculate the 'compression' - ratio of message bytes to web page bytes:
    my $new_len = length($page);
    my $percent_compress = int(100 * $new_len / $orig_len);
    print $page;
    #print "\n[Ref:$ref AN:$an % txt = $percent_compress% = $new_len / $orig_len]\n";
    print "\n[Ref:$ref AN:$an $percent_compress% txt]\n";

    if ($page =~ /^From:\s*([^\n]+)\n/) {
        $author = $1;
    }

    if ($pager_fh) {
        select(STDOUT);
        close $pager_fh;
    }
}

# show_results($start)
# Writes $chunk_size rows of @result, beginning at index $start, through the
# STDOUT format, then a summary line.  Prints "No match" and returns 0 when
# $total_hits is false.  Missing rows are still written (as blank rows) so
# that each listing fills exactly one format page ($= is $chunk_size+3).
sub show_results {
    my $start = shift;
    unless ($total_hits) {
        print"No match\n";
        return 0;
    }
    for my $r ($start..$start+$chunk_size-1) {
        ($date,$author,$group,$subject) = split(/\0/,$result[$r]);
        $ref = $r;  #Can't use "my" variable as for index
        # NOTE(review): "write;" had been swallowed by the comment above.
        write;
    }
    print"Results $offset through ",$offset+$chunk_size-1," of $total_hits\n";
    $verbose && print"Next=$next_msg_url\nPrevious=$prev_msg_url\n";
}

# get_results($url)
# Fetches a results page and refills @result / @article_no (indexed by the
# absolute result number reported by the page), $total_hits, $next_msg_url
# and $prev_msg_url.  Returns 0 when the page reports no matches; dies when
# the page cannot be fetched or no "Results" line is recognised.
sub get_results {
    my $url = shift;
    my $page;
    my $an;
    $next_msg_url = '';
    $prev_msg_url = '';
    # Empty the lists.  The original assigned {} here, which stored a hash
    # reference in element 0, so reference number 0 always looked valid.
    @result     = ();
    @article_no = ();

    $page = fetch($url);

    # Save a copy of the page for debugging (only when the file already exists):
    if (-f 'deja.html' && open my $dbg, '>', 'deja.html') {
        print {$dbg} $page;
        close $dbg;
    }

    $page =~ s/\n//g;
    # NOTE(review): two tag-removing substitutions followed here whose
    # patterns were stripped completely; only their shells remain:
    #     $page =~ s|...||gi;
    #     $page =~ s/...//gis;
    # They are NOT restored.  If the page wrapped the "Results" numbers in
    # tags, the match below depends on the first of them.

    if ($page =~ /Results (\d+) - (\d+) of.*?([\d\,]+)/) {
        $ref = $1;
        $total_hits = $3;
    }
    elsif ($page =~ /did not match any documents/) {
        $total_hits = 0;
        return 0;
    }
    else {
        die "Results tag not found.$fail";
    }

    # NOTE(review): the "<a " openers and the "</a>" in this pattern were
    # stripped; they are restored from the surviving remnants ("]+selm=",
    # "]+group=") - confirm.
    while ($page =~ m|
            <a [^>]+selm=
            ([^>]+)                                 # Article Number
            [^>]*>
            ([^<]+)                                 # subject
            .*?
            <a [^>]+group=[^>]+>
            ([^<]+)                                 # group
            </a>
            \s+\-\s+
            (\w\w\w\.?\s+\d{1,2}\,\s+\d\d\d\d)      # date
            \s+by\s+
            ([^<]+)                                 # author
        |igxs) {
        ($an, $subject, $group, $date, $author) = ($1, $2, $3, $4, $5);
        for ($date, $an, $ref, $subject, $group, $author) {
            s/^\s+//;
            s/\s+$//;
        }
        # $date =~ s|\d\d(\d\d)$|$1|; # Year yyyy -> yy
        # $date =~ s/ /-/g;
        $date =~ s/(\w\w\w)\. (\d{1,2})\, \d\d(\d\d)/$2.$1.$3/;
        $article_no[$ref] = $an;
        $result[$ref] = join("\0",$date,$author,$group,$subject);
        # print "** $result[$ref]\n";
        $ref ++;
    }

    # NOTE(review): the start ("<a href=([^>") and end ("<") of both
    # patterns were stripped; restored from the remnants - confirm.
    $next_msg_url = abs_url($1)
        if $page =~ /<a href=([^>]+)>(<[^>]+>\s*)*Next</i;
    $prev_msg_url = abs_url($1)
        if $page =~ /<a href=([^>]+)>(<[^>]+>\s*)*Previous</i;
}

# abs_url($href)
# Turns an href captured from a results page into an absolute URL.
# NOTE(review): the original body of this sub was lost entirely.  This
# version is inferred from the two call sites only: it drops surrounding
# quotes, decodes &amp;, and prefixes $QBASE to relative links - confirm.
sub abs_url {
    my $href = shift;
    $href =~ s/^["']//;
    $href =~ s/["']$//;
    $href =~ s/&amp;/&/g;
    return $href if $href =~ m{^\w+://};
    $href =~ s{^/}{};
    return $QBASE . $href;
}

# unhtml($text)
# Returns $text with HTML tags removed and &gt; / &lt; decoded.
# NOTE(review): the head of this sub and everything after the &lt; line
# were stripped; further entity substitutions may have existed.
sub unhtml {
    my $p = shift;
    $p =~ s/<[^>]+>//g;
    $p =~ s/&gt;/>/gi;
    $p =~ s/&lt;/</gi;
    return $p;
}

# fetch($url)
# GETs $url with the shared user agent and returns the response body.
# Dies, with the server's error page and the version hint, on any
# unsuccessful response.
# NOTE(review): the lines up to "$ua->request" were stripped and are
# restored in the usual LWP form.
sub fetch {
    my $url      = shift;
    my $request  = HTTP::Request->new(GET => $url);
    my $response = $ua->request($request);
    die "$url failed: ",$response->error_as_HTML,$fail
        unless $response->is_success;
    $response->content;
}