#!/usr/bin/perl -w
use strict;
$| = 1;

use Class::DBI::Loader;

my $DATABASE = 'cpan';


__END__

### internally-defined classes

{
  package My::DBI;
  use base 'Class::DBI';

  __PACKAGE__->set_db('Main', "dbi:Pg:database=$DATABASE", undef, undef,
                        {AutoCommit => 1});

  sub CONSTRUCT {
    my $class = shift;
    for (qw(My::Page My::Link)) {
      eval { $_->sql_CONSTRUCT->execute };
      die $@ if $@ and $@ !~ /already exists/;
    }
  }

  sub atomically {
    my $class = shift;
    my $action = shift;         # coderef
    local $class->db_Main->{AutoCommit}; # turn off AutoCommit for this block

    my @result;
    eval {
      @result = wantarray ? $action->() : scalar($action->());
      $class->dbi_commit;
    };
    if ($@) {
      warn "atomically got error: $@";
      my $commit_error = $@;
      eval { $class->dbi_rollback };
      die $commit_error;
    }
    die $@ if $@;
    wantarray ? @result : $result[0];
  }
}

{
  package My::Link;
  our @ISA = qw(My::DBI);

  __PACKAGE__->table('link');
  __PACKAGE__->set_sql(CONSTRUCT => <<'SQL');
CREATE TABLE __TABLE__ (
  src TEXT,
  dst TEXT,
  PRIMARY KEY (src, dst)
)  
SQL
  __PACKAGE__->columns(Primary => qw(src dst));
  __PACKAGE__->has_a(src => 'My::Page');
  __PACKAGE__->has_a(dst => 'My::Page');
}

{
  package My::Page;
  our @ISA = qw(My::DBI);
  use enum qw(:State_ unseen todo working done);

  __PACKAGE__->table('page');
  __PACKAGE__->set_sql(CONSTRUCT => <<"SQL");
CREATE TABLE __TABLE__ (
  location TEXT PRIMARY KEY,
  state INT DEFAULT @{[State_unseen]},
  last_status TEXT,
  last_checked INT,
  last_good INT,
  last_modified INT
)  
SQL
  __PACKAGE__->columns(All => qw(location state last_status
                                 last_checked last_good last_modified));

  __PACKAGE__->has_many(inbound => 'My::Link', 'dst', { sort => 'src' });
  __PACKAGE__->has_many(outbound => 'My::Link', 'src', { sort => 'dst' });

  sub make_working_atomically {
    my $self = shift;

    $self->atomically(sub {
                        $self->state == State_todo or return undef;
                        $self->state(State_working);
                        $self->update;
                        return 1;
                      });
  }

  sub create_or_make_todo {
    my $class = shift;
    my $location = shift;

    $class->atomically(sub {
                         my $item = $class->find_or_create({location => $location});
                         if ((not defined($item->state)
                              or $item->state == State_unseen)) {
                           $item->state(State_todo);
                           $item->update;
                         }
                         $item;
                       });
  }
}

{
  use LWP::UserAgent;
  my $AGENT = LWP::UserAgent->new;
  $AGENT->agent("linkchecker/0.42 " . $AGENT->agent);
  $AGENT->env_proxy;
  $AGENT->timeout($TIMEOUT);

  sub fetch { $AGENT->simple_request(@_) }
}

### main code begins here

## initialize database if needed
My::DBI->CONSTRUCT;

## reset all working to todo
for my $page (My::Page->search(state => My::Page::State_working)) {
  $page->state(My::Page::State_todo);
  $page->update;
}

## unless any are todo or finished, prime the pump
unless (() = My::Page->search(state => My::Page::State_todo)
        or () = My::Page->search(state => My::Page::State_done)) {
  print "Starting a new run...\n";
  My::Page->create_or_make_todo(HACK_URL(URI->new($_))->as_string)
      for @CHECK;
}

## main loop, done by kids:
kids_do(sub {                   # the task
          srand;                # spin random number generator uniquely
          while (my @todo = My::Page->search(state => My::Page::State_todo)) {
            my $page = $todo[rand @todo]; # pick one at random
            unless($page->make_working_atomically) {
              # someone else got it
              print "$$ wanted ", $page->location, "\n" if $VERBOSE;
              next;
            }
            ;
            print "$$ doing ", $page->location, "\n" if $VERBOSE > 1;
            do_one_page($page);
          }
        },
        sub {                   # max kids needed
          scalar(() = My::Page->search(state => My::Page::State_todo));
        });

## clean out any unseen at this point (no longer needed)
$_->delete for My::Page->search(state => My::Page::State_unseen);

## display report
print "*** BEGIN REPORT ***\n";
for my $page (My::Page->search(state => My::Page::State_done,
                               {order_by => 'location'})) {
  next if $page->last_checked <= $page->last_good + $REPORT;
  my $url = URI->new($page->location);
  print "$url:\n";
  print "  Status: ", $page->last_status, "\n";
  for (qw(checked good modified)) {
    my $method = "last_$_";
    my $value = $page->$method() or next;
    print "  \u\L$_\E: ".localtime($value)."\n";
  }

  for my $inbound ($page->inbound) {
    my $inbound_page = $inbound->src;
    my $inbound_url = URI->new($inbound_page->location);
    my $rel = $inbound_url->rel($url);
    $rel = $inbound_url->path_query if $rel =~ /^\.\.\/\.\./;
    print "  from $rel\n";
  }
  for my $outbound ($page->outbound) {
    my $outbound_page = $outbound->dst;
    my $outbound_url = URI->new($outbound_page->location);
    my $rel = $outbound_url->rel($url);
    $rel = $outbound_url->path_query if $rel =~ /^\.\.\/\.\./;
    my $outbound_status = $outbound_page->last_status;
    print "  to $rel: $outbound_status\n";
  }
}
print "*** END REPORT ***\n";

## reset for next pass
for my $page (My::Page->search(state => My::Page::State_done)) {
  $page->state(My::Page::State_unseen);
  $page->update;
}

exit 0;

### subroutines

sub do_one_page {
  my $page = shift;             # My::Page

  my $url = URI->new($page->location);
  my $parse = PARSE($url);
  if ($parse >= 2) {
    print "Parsing $url\n" if $VERBOSE;
    if (time < ($page->last_checked || 0) + $RECHECK or
        time < ($page->last_good || 0) + $RECHECK_GOOD) {
      print "$url: too early to reparse\n" if $VERBOSE;
      ## reuse existing links
      My::Page->create_or_make_todo($_->dst->location) for $page->outbound;
    } else {
      parse_or_ping($page, $url, "PARSE");
    }
  } elsif ($parse >= 1) {
    print "Pinging $url\n" if $VERBOSE;
    if (time < ($page->last_checked || 0) + $RECHECK or
        time < ($page->last_good || 0) + $RECHECK_GOOD) {
      print "$url: too early to reping\n" if $VERBOSE;
      $_->delete for $page->outbound; # delete any existing stale links
    } else {
      parse_or_ping($page, $url, "PING");
    }
  } else {
    print "Skipping $url\n" if $VERBOSE;
    $page->last_status("Skipped");
    $page->last_checked(0);
  }
  $page->state(My::Page::State_done);
  $page->update;
}

sub parse_or_ping {
  my $page = shift;             # My::Page
  my $url = shift;              # URI
  my $kind = shift;             # "PARSE" or "PING"

  use HTML::LinkExtor;

  ## create the request
  my $request = HTTP::Request->new(GET => "$url");
  $request->if_modified_since($page->last_modified) if $page->last_modified;

  ## fetch the response
  my $content;
  my $content_type;
  my $res = fetch
    ($request,
     sub {
       my ($data, $response, $protocol) = @_;
       unless ($content_type) {
         if ($content_type = $response->content_type) {
           if ($kind eq "PING") {
             print "aborting $url for ping\n" if $VERBOSE > 1;
             die "ping only";
           }
           if ($content_type ne "text/html") {
             print "aborting $url for $content_type\n" if $VERBOSE > 1;
             die "content type is $content_type";
           }
         }
       }
       $content .= $data;
       if ($MAXSIZE and length $content > $MAXSIZE) {
         print "aborting $url for content length\n" if $VERBOSE > 1;
         die "content length is ", length $content;
       }
     }, 8192);
  $res->content($content);      # stuff what we got

  ## analyze the results
  if ($res->is_success) {
    my $now = time;
    $page->last_checked($now);
    $page->last_good($now);
    $page->last_modified($res->last_modified || $res->date);
    $_->delete for $page->outbound; # delete any existing stale links

    if ($content_type eq "text/html") {
      if ($kind eq "PARSE") {
        print "$url: parsed\n" if $VERBOSE;
        $page->last_status("Verified and parsed");
        my %seen;
        HTML::LinkExtor->new
            (sub {
               my ($tag, %attr) = @_;
               $seen{$_}++ or add_link($page, $_) for values %attr;
             }, $res->base)->parse($res->content);
      } else {                  # presume $kind = PING
        print "$url: good ping\n" if $VERBOSE;
        $page->last_status("Verified (contents not examined)");
      }
    } else {
      print "$url: content = $content_type\n" if $VERBOSE;
      $page->last_status("Verified (content = $content_type)");
    }
  } elsif ($res->code == 304) { # not modified
    print "$url: not modified\n" if $VERBOSE;
    my $now = time;
    $page->last_checked($now);
    $page->last_good($now);
    ## reuse existing links
    My::Page->create_or_make_todo($_->dst->location) for $page->outbound;
  } elsif ($res->is_redirect) {
    my $location = $res->header("Location");
    print "$url: redirect to $location\n" if $VERBOSE;
    $_->delete for $page->outbound; # delete any existing stale links
    add_link($page, $location, $res->base) if $FOLLOW_REDIRECT;
    $page->last_status("Redirect (status = ".$res->code.") to $location");
    $page->last_checked(time);
  } else {
    print "$url: not verified: ", $res->code, "\n" if $VERBOSE;
    $_->delete for $page->outbound; # delete any existing stale links
    $page->last_status("NOT Verified (status = ".($res->code).")");
    $page->last_checked(time);
  }
  $page->update;
}

sub add_link {
  my $page = shift;             # My::Page
  my $url_string = shift;       # string
  my $base = shift;             # maybe undef

  my $url = $base
    ? URI->new_abs($url_string, URI->new($base))
      : URI->new($url_string);
  $url->fragment(undef);        # blow away any fragment
  $url = HACK_URL($url);
  return if PARSE($url) < 0;    # skip any links to non-xref pages
  print "saw $url\n" if $VERBOSE > 1;

  my $newpage = My::Page->create_or_make_todo("$url");
  ## the following might die if there's already one link there
  eval { My::Link->create({src => $page, dst => $newpage}) };
  die $@ if $@ and not $@ =~ /uniqueness constraint/;
}

sub kids_do {
  my $code_task = shift;
  my $code_count = shift;

  use POSIX qw(WNOHANG);

  my %kids;

  while (keys %kids or $code_count->()) {
    ## reap kids
    while ((my $kid = waitpid(-1, WNOHANG)) > 0) {
      ## warn "$kid reaped";    # trace
      delete $kids{$kid};
    }
    ## verify live kids
    for my $kid (keys %kids) {
      next if kill 0, $kid;
      warn "*** $kid found missing ***"; # shouldn't happen
      delete $kids{$kid};
    }
    ## launch kids
    if (keys %kids < $KIDMAX
        and keys %kids < $code_count->()) {
      ## warn "forking a kid";  # trace
      my $kid = fork;
      if (defined $kid) {       # good parent or child
        if ($kid) {             # parent
          $kids{$kid} = 1;
        } else {
          $code_task->();       # the real task
          exit 0;
        }
      } else {
        warn "cannot fork: $!"; # hopefully temporary
        sleep 1;
        next;                   # outer loop
      }
    }
    print "[", scalar keys %kids, " kids]\n" if $VERBOSE;
    sleep 1;
  }
}

sub URI::mailto::host { ""; }   # workaround bug in LWP
sub URI::mailto::authority { ""; } # workaround bug in LWP
