#!/usr/local/bin/perl
# $Id$
#
# Copyright (c) 2007 Daisuke Maki <[email protected]>
# All rights reserved.
package Gungho::Example::SiteCrawler::Rule::Seen;
use strict;
use warnings;
use GunghoX::FollowLinks::Rule qw(FOLLOW_DENY FOLLOW_ALLOW);
use base qw(GunghoX::FollowLinks::Rule);
my %SEEN;
sub apply
{
my ($self, $c, $response, $url, $attrs) = @_;
if ($SEEN{ $url }++) {
return FOLLOW_DENY;
}
return FOLLOW_ALLOW;
}
package main;
use strict;
use warnings;
use Gungho;
use Gungho::Request;
use GunghoX::FollowLinks '0.00004';
use URI;
main();
sub main
{
my $site = $ARGV[0] or die "No site specified";
$site = URI->new($site) unless eval { $site->isa('URI') } && !$@;
Gungho->run({
provider => sub {
my ($p, $c) = @_;
my $request;
if (! $p->{started}) {
$p->{started} = 1;
$p->{pending} = 1;
$request = Gungho::Request->new( GET => $site );
} else {
$request = shift @{ $p->requests };
}
print STDERR "Pending is $p->{pending}.\n";
if ( ! $request) {
if ($p->{pending} <= 0 && scalar keys %Gungho::Example::SiteCrawler::Seen::SEEN > 1) {
print STDERR "Pending is $p->{pending}. Shutting down\n";
$c->shutdown();
}
return 1;
}
$request->uri->fragment(undef);
if ($Gungho::Example::SiteCrawler::Seen::SEEN{$request->original_uri->as_string}) {
$p->{pending}--;
} else {
print STDERR "Sending ", $request->uri, "\n";
if ($c->send_request( $request ) && $request->notes('auto_robot_rules' => 1) ) {
$p->{pending}++;
}
}
return 1;
},
handler => sub {
my ($h, $c, $req, $res) = @_;
my $provider = $c->provider;
print STDERR "Fetched ", $res->request->uri, ". Pending = $provider->{pending}\n";
$provider->{pending}--;
$provider->{pending} += $c->follow_links($res);
# Make sure to use the original hostname
my $original_uri = $req->original_uri;
$Gungho::Example::SiteCrawler::Seen::SEEN{$req->original_uri->as_string}++;
},
components => [
'+GunghoX::FollowLinks',
'RobotRules',
'Throttle::Simple',
],
throttle => {
simple => {
max_items => 100,
interval => 60,
}
},
follow_links => {
parsers => [
{ module => "HTML",
config => {
merge_rule => "ALL",
rules => [
{ module => "HTML::SelectedTags",
config => {
tags => [ qw(a link) ]
}
},
{ module => "URI",
config => {
match => [ {
scheme => qr/^http$/i,
host => $site->host,
path => "^" . ($site->path || "/"),
action_nomatch => "FOLLOW_DENY"
} ]
}
},
{ module => "MIME",
config => {
types => [ qw(text/html) ],
unknown => "FOLLOW_ALLOW",
}
},
{ module => "+Gungho::Example::SiteCrawler::Rule::Seen"
}
]
}
}
]
}
});
}
1;
__END__
=head1 NAME
site-crawler.pl - Crawl Within A Specific Site
=head1 SYNOPSIS
site-crawler.pl [path]
=head1 DESCRIPTION
This example crawls within the given site, looking for any links that might
be found within the pages.
It will only look at HTML pages that reside under the url given in the
command line
Please note that this crawler will NOT terminate by itself at this point
(it's an example toy!). You need to CTRL-C yourself
=cut