#!/usr/bin/perl -w
use strict;

# print STDERR "spider $$ [@ARGV]\n";

#
# SWISH-E http method Spider
# $Id: swishspider,v 1.13 2003/08/30 05:02:55 whmoseley Exp $ 
#

# Should SWISH::Filter be use for filtering?  This can be left 1 all the time, but
# will add a little time to processing since.

# Note: Just because USE_FILTERS is true doesn't mean SWISH::Filter is in @INC.
# To get the path to use (for "use lib") run swish-filter-test -path.  Or another way:
#
#  PERL5LIB=`swish-filter-test -path` swish-e -S http -i http://localhost/index.html
#
# The @INC path is not set by default in swishspider because loading the SWISH::Filter
# modules for every URL might be slow.

use constant USE_FILTERS  => 1;  # 1 = yes use SWISH::Filter for filtering, 0 = no. (faster processing if not set)
use constant FILTER_TEXT  => 0;  # set to one to filter text/* content, 0 will save processing time
use constant DEBUG_FILTER => 0;  # set to one to report errors on loading SWISH::Filter module.

use LWP::UserAgent;
use HTTP::Status;
use HTML::Parser 3.00;
use HTML::LinkExtor;

    if (scalar(@ARGV) != 2) {
        print STDERR "Usage: $0 localpath url\n";
        exit(1);
    }

    my $ua = new LWP::UserAgent;
    $ua->agent( "SwishSpider http://swish-e.org" );


    my $localpath = shift;
    my $url = shift;

    my $request = new HTTP::Request( "GET", $url );
    my $response = $ua->simple_request( $request );

    # Save the HTTP code, the content/type (or a redirection header), and a last modified date, if one.

    open( RESP, ">$localpath.response" ) || die( "Could not open response file $localpath.response: $!" );
    print RESP $response->code() . "\n";


    # If failed to fetch doc then write out the code and location and exit
    
    if( $response->code != RC_OK ) {
        print RESP ($response->header( "location" ) ||'') . "\n";
        exit;
    }


    # Filter the document, if possible.

    my ( $content_ref, $content_type ) = filter_doc( $response );


    # Write out the (perhaps new) content type and the last modified date.

    print RESP "$content_type\n",
               ($response->last_modified || 0), "\n";

    close RESP;


    # Now write the content -- really only need to do this on text/* types since that's all swish processes
    # No, that's not true.  Can use FileFilter inside of swish-e on binary data.

    open( CONTENTS, ">$localpath.contents" ) || die( "Could not open contents file $localpath.contents: $!\n" );

    # Enable binmode if the contents is not text/*
    binmode CONTENTS unless $content_type =~ m[^text/]i;

    print CONTENTS $$content_ref;
    close( CONTENTS );


    # Finally, extract out links

    exit unless $content_type =~ m!text/html!;

    open( LINKS, ">$localpath.links" ) || die( "Could not open links file $localpath.links: $!\n" );
    my $p = HTML::LinkExtor->new( \&linkcb, $url );

    # Don't allow links above the base
    $URI::ABS_REMOTE_LEADING_DOTS = 1;

    $p->parse( $$content_ref );
    close( LINKS );

    exit;


sub linkcb {
    my($tag, %links) = @_;

    return unless $tag eq 'a' && $links{href};

    my $link = $links{href};

    # Remove fragments
    $link =~ s/(.*)#.*/$1/;

    print LINKS "$link\n";
}


# This will optionally attempt to filter the document

sub filter_doc {
    my $response = shift;

    my ( $content, $content_type ) = ( $response->content, $response->header( "content-type" ) );

    my $content_ref = \$content;

    unless ( $content_type ) {
        warn 'URL: ', $response->base, " did not return a content-type\n";
        return ( $content_ref, 'text/plain' );
    }


    return ( $content_ref, $content_type ) unless USE_FILTERS;  # filters enabled?


    # This can avoid loading the filter module if it is known that type text/* will never be filtered.
    
    return ( $content_ref, $content_type )
        if $content_type =~ m!^text/! && !FILTER_TEXT;
    

    eval { require SWISH::Filter };
    if ( $@ ) {
        warn $@ if DEBUG_FILTER;
        return ( $content_ref, $content_type );
    }

    my $filter = SWISH::Filter->new;

    my $doc = $filter->convert(
        document => $content_ref,
        name     => $response->base,
        content_type => $content_type,
    );

    return $doc && $doc->was_filtered
        ? ( $doc->fetch_doc, $doc->content_type )
        : ( $content_ref, $content_type );
}