#!/usr/bin/perl -w use strict; # print STDERR "spider $$ [@ARGV]\n"; # # SWISH-E http method Spider # $Id: swishspider,v 1.13 2003/08/30 05:02:55 whmoseley Exp $ # # Should SWISH::Filter be use for filtering? This can be left 1 all the time, but # will add a little time to processing since. # Note: Just because USE_FILTERS is true doesn't mean SWISH::Filter is in @INC. # To get the path to use (for "use lib") run swish-filter-test -path. Or another way: # # PERL5LIB=`swish-filter-test -path` swish-e -S http -i http://localhost/index.html # # The @INC path is not set by default in swishspider because loading the SWISH::Filter # modules for every URL might be slow. use constant USE_FILTERS => 1; # 1 = yes use SWISH::Filter for filtering, 0 = no. (faster processing if not set) use constant FILTER_TEXT => 0; # set to one to filter text/* content, 0 will save processing time use constant DEBUG_FILTER => 0; # set to one to report errors on loading SWISH::Filter module. use LWP::UserAgent; use HTTP::Status; use HTML::Parser 3.00; use HTML::LinkExtor; if (scalar(@ARGV) != 2) { print STDERR "Usage: $0 localpath url\n"; exit(1); } my $ua = new LWP::UserAgent; $ua->agent( "SwishSpider http://swish-e.org" ); my $localpath = shift; my $url = shift; my $request = new HTTP::Request( "GET", $url ); my $response = $ua->simple_request( $request ); # Save the HTTP code, the content/type (or a redirection header), and a last modified date, if one. open( RESP, ">$localpath.response" ) || die( "Could not open response file $localpath.response: $!" ); print RESP $response->code() . "\n"; # If failed to fetch doc then write out the code and location and exit if( $response->code != RC_OK ) { print RESP ($response->header( "location" ) ||'') . "\n"; exit; } # Filter the document, if possible. my ( $content_ref, $content_type ) = filter_doc( $response ); # Write out the (perhaps new) content type and the last modified date. print RESP "$content_type\n", ($response->last_modified || 0), "\n"; close RESP; # Now write the content -- really only need to do this on text/* types since that's all swish processes # No, that's not true. Can use FileFilter inside of swish-e on binary data. open( CONTENTS, ">$localpath.contents" ) || die( "Could not open contents file $localpath.contents: $!\n" ); # Enable binmode if the contents is not text/* binmode CONTENTS unless $content_type =~ m[^text/]i; print CONTENTS $$content_ref; close( CONTENTS ); # Finally, extract out links exit unless $content_type =~ m!text/html!; open( LINKS, ">$localpath.links" ) || die( "Could not open links file $localpath.links: $!\n" ); my $p = HTML::LinkExtor->new( \&linkcb, $url ); # Don't allow links above the base $URI::ABS_REMOTE_LEADING_DOTS = 1; $p->parse( $$content_ref ); close( LINKS ); exit; sub linkcb { my($tag, %links) = @_; return unless $tag eq 'a' && $links{href}; my $link = $links{href}; # Remove fragments $link =~ s/(.*)#.*/$1/; print LINKS "$link\n"; } # This will optionally attempt to filter the document sub filter_doc { my $response = shift; my ( $content, $content_type ) = ( $response->content, $response->header( "content-type" ) ); my $content_ref = \$content; unless ( $content_type ) { warn 'URL: ', $response->base, " did not return a content-type\n"; return ( $content_ref, 'text/plain' ); } return ( $content_ref, $content_type ) unless USE_FILTERS; # filters enabled? # This can avoid loading the filter module if it is known that type text/* will never be filtered. return ( $content_ref, $content_type ) if $content_type =~ m!^text/! && !FILTER_TEXT; eval { require SWISH::Filter }; if ( $@ ) { warn $@ if DEBUG_FILTER; return ( $content_ref, $content_type ); } my $filter = SWISH::Filter->new; my $doc = $filter->convert( document => $content_ref, name => $response->base, content_type => $content_type, ); return $doc && $doc->was_filtered ? ( $doc->fetch_doc, $doc->content_type ) : ( $content_ref, $content_type ); }