#!/usr/bin/perl
# Web Site Mapper
# Copyright (C) 2004  Timm Murray
#
# Somewhat modified by Dobrica Palinusic, see 
# http://svn.rot13.org/index.cgi/perl/view/trunk/web_site_mapper.pl
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#

use strict;
use warnings;
use WWW::Mechanize;
use URI;
use YAML qw/Dump LoadFile/;
use Time::HiRes qw/time/;
use Data::Dump qw/dump/;

my $template_yaml = <<'__EOF__';
---
#'http://www.example.com/':
#  form_name: 'loginForm'
#  fields:
#    'user': 'foobar@example.com'
#    'password': secret
'http://blog.rot13.org':
__EOF__

my $rc = $ENV{'HOME'} . '/.sites.yaml';
my $credentials;

# maximum re-visit of each page
my $max_seen = 10;
my %seen;

my ( @sites, @site_hosts );

sub DISALLOWED_SCHEMES ()  { qw( mailto javascript ) }

my $debug = 1;
my $verbose = 0;

if ( -e $rc ) {
	$credentials = LoadFile( $rc ) or die "can't open $rc: $!";
	@sites = keys %$credentials;
	@site_hosts = map { URI->new($_)->host } @sites;
	warn "# loaded $rc with credentilas for: ", join(", ", @sites), "\n";
	warn dump( $credentials );
} else {
	open(my $yaml, '>', $rc ) || die "can't open $rc: $!";
	print $yaml $template_yaml;
	close($yaml);
	warn "# create template $rc edit it to your needs\n";
	exit 1;
}

my $mech = WWW::Mechanize->new();

sub get_page {
	my ($data, $uri) = @_;

	my $page_to_load = $uri->canonical;

	# Don't process pages that have already been loaded
	if(exists $data->{$page_to_load}) {
#		warn "\t$page_to_load already indexed\n" if $debug;
		return;
	}

	# Don't process pages that aren't listed in the sites above
	unless ( grep { lc($uri->host) eq lc ($_) } @site_hosts ) {
		warn "\t$page_to_load not in allowed sites\n" if $debug;
		return;
	}

	print "$page_to_load";

	my $t = time();

	my $response = $mech->get( $page_to_load );

	$t = time() - $t;

	$data->{$page_to_load}{status} = $mech->status;

	if($mech->success) {
		$data->{$page_to_load}{content_type} = $mech->ct;
		$data->{$page_to_load}{title} = $mech->title;

		my @links = map { $_->url_abs } $mech->links;

		if ($debug) {
			warn "\tResponse successful\n";
			warn "\tContent-type: ", $data->{$page_to_load}{content_type}, "\n";
			warn "\tTitle: ", $data->{$page_to_load}{title}, "\n";
			warn "\tLinks: " . join("\n", map "\t\t$_", @links) . "\n";
		} else {
			my $size = length( $mech->content );
			print " ", $mech->status, " ", $mech->ct, sprintf(" %d in %.2fs (%.2f b/s)", $size, $t, $size / $t), "\n";
		}

		$data->{$page_to_load}{links} = [];

		foreach my $link (@links) {
			my $uri = URI->new($link);
			next if grep { $uri->scheme eq $_ } DISALLOWED_SCHEMES;
			my $url = $uri->canonical->as_string;
						$url =~ s/#.*$//;
#						warn "\tFollowing $url\n" if $debug;
						my $url_no_params = $url;
						$url_no_params =~ s/\?.*$//;
						$seen{$url_no_params}++;
						if ($seen{$url_no_params} > $max_seen) {
							print "skipped $url_no_params, seen $seen{$url_no_params}\n" if ($verbose);
							next;
						}
			push @{ $data->{$page_to_load}{links} }, $url;

			get_page( $data, $uri );
		}
	}
	else {
		warn "\tResponse unsuccessful\n" if $debug;
	}
}


{
	my $data = { };
	foreach my $site ( @sites ) {
		warn "## indexing $site\n";
		my $uri = URI->new($site);
		$mech->get( $uri );
		if ( my $form = $credentials->{$site} ) {
			warn "## login using form ", dump($form);
			$mech->submit_form( %$form );
		}
		get_page( $data, $uri );
	}

	print Dump($data) if ($debug);
}

