#!/usr/local/bin/perl
# $Id$
#
# Copyright (c) 2007 Daisuke Maki <[email protected]>
# All rights reserved.

package Gungho::Example::SiteCrawler::Rule::Seen;
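# A GunghoX::FollowLinks rule that refuses to follow a URL more than once.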
use strict;
use warnings;
use GunghoX::FollowLinks::Rule qw(FOLLOW_DENY FOLLOW_ALLOW);
use base qw(GunghoX::FollowLinks::Rule);
our %SEEN; # package variable: the main script reads it by its fully qualified name

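# Deny URLs we have already recorded; otherwise record the URL and allow it.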
sub apply
{
    my ($self, $c, $response, $url, $attrs) = @_;
    if ($SEEN{ $url }++) {
        return FOLLOW_DENY;
    }
    return FOLLOW_ALLOW;
}

package main;
use strict;
use warnings;
use Gungho;
use Gungho::Request;
use GunghoX::FollowLinks '0.00004';
use URI;

main();

sub main
{
    my $site   = $ARGV[0] or die "Usage: $0 <url>\n";

    $site = URI->new($site); # $ARGV[0] is always a plain string

    Gungho->run({
        provider => sub {
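            # The provider feeds requests to the engine: it seeds the crawl
            # with the start URL, then drains requests queued by follow_links,
            # and shuts down once nothing is pending.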
            my ($p, $c) = @_;

            my $request;
            if (! $p->{started}) {
                $p->{started} = 1;
                $p->{pending} = 1;
                $request = Gungho::Request->new( GET => $site );
            } else {
                $request = shift @{ $p->requests };
            }

            print STDERR "Pending is $p->{pending}.\n";
            if ( ! $request) {
                if ($p->{pending} <= 0 && scalar keys %Gungho::Example::SiteCrawler::Rule::Seen::SEEN > 1) {
                    print STDERR "Pending is $p->{pending}. Shutting down\n";
                    $c->shutdown();
                }
                return 1;
            }

            $request->uri->fragment(undef); # strip in-page anchors before fetching
            if ($Gungho::Example::SiteCrawler::Rule::Seen::SEEN{ $request->original_uri->as_string }) {
                $p->{pending}--;
            } else {
                print STDERR "Sending ", $request->uri, "\n";
                if ($c->send_request( $request )) {
                    $request->notes( auto_robot_rules => 1 );
                    $p->{pending}++;
                }
            }
            return 1;
        },
        handler  => sub {
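            # The handler runs once per fetched response: it updates the
            # pending count, queues any followable links, and records the
            # URL as seen.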
            my ($h, $c, $req, $res) = @_;
            my $provider = $c->provider;
            print STDERR "Fetched ", $res->request->uri, ". Pending = $provider->{pending}\n";
            $provider->{pending}--;
            $provider->{pending} += $c->follow_links($res);

            # Record the fetch under the request's original URI, so redirects
            # don't cause the same page to be counted twice
            my $original_uri = $req->original_uri;
            $Gungho::Example::SiteCrawler::Rule::Seen::SEEN{ $original_uri->as_string }++;
        },
        components => [
            '+GunghoX::FollowLinks',
            'RobotRules',
            'Throttle::Simple',
        ],
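        # Throttle::Simple: issue at most max_items requests per interval seconds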
        throttle => {
            simple => {
                max_items => 100,
                interval => 60,
            }
        },
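        # With merge_rule "ALL", a link is followed only if every rule below
        # allows it: extracted from an <a> or <link> tag, an http URL on the
        # same host and path prefix, served as text/html (or an unknown type),
        # and not seen before.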
        follow_links => {
            parsers => [
                { module => "HTML",
                  config => {
                      merge_rule => "ALL",
                      rules  => [
                        { module => "HTML::SelectedTags",
                          config => {
                            tags => [ qw(a link) ]
                          }
                        },
                        { module => "URI",
                          config => {
                            match => [ {
                                scheme => qr/^http$/i,
                                host => $site->host,
                                path => "^" . ($site->path || "/"),
                                action_nomatch => "FOLLOW_DENY"
                            } ]
                          }
                        },
                        { module => "MIME",
                          config => {
                            types => [ qw(text/html) ],
                            unknown => "FOLLOW_ALLOW",
                          }
                        },
                        { module => "+Gungho::Example::SiteCrawler::Rule::Seen"
                        }
                      ]
                  }
                }
            ]
        }
    });
}

1;

__END__

=head1 NAME

site-crawler.pl - Crawl Within A Specific Site

=head1 SYNOPSIS

  site-crawler.pl <url>

=head1 DESCRIPTION

This example crawls within the given site, following any links found in
its pages.

It only follows HTML pages that reside under the URL given on the
command line.
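
For example, to crawl only pages under C<http://example.com/docs/> (a
hypothetical URL), run:

  site-crawler.pl http://example.com/docs/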

Please note that this crawler will NOT terminate by itself at this point
(it's an example toy!); you need to hit CTRL-C yourself.

=cut