#!/usr/bin/perl -w
use strict;
$|++;

# Mirror CPAN latest archive. Based on article "Mirroring your own mini-CPAN"
# by Randal L. Schwartz for Linux Magazine Column 42 (Nov 2002) available on 
# http://www.stonehenge.com/merlyn/LinuxMag/col42.html
# http://www.stonehenge.com/merlyn/LinuxMag/col42.listing.txt

# TODO:
# - support for ZIP archives (and fix .tar.gz cludges)
# - add version requirement for Archive::Tar (0.22 has a bug which
#   prevents it to extract some tars)

### CONFIG

#my $REMOTE = "http://ftp.linux.hr/CPAN/";
my $REMOTE = "http://www.cpan.org/";
#my $REMOTE = "http://cpan.pliva.hr/";
# my $REMOTE = "http://fi.cpan.org/";
# my $REMOTE = "http://au.cpan.org/";
# my $REMOTE = "file://Users/merlyn/MIRROR/CPAN/";

## warning: unknown files below this dir are deleted!
#my $LOCAL = "/mirrors/cpan/CPAN/";
my $LOCAL = "/rest/cpan/CPAN/";

my $TRACE = 0;

# This may or may not save you some disks space (depending on filesystem
# that you use to store CPAN mirror)
#
# If you want to create gziped readme files, change $readme_ext to
# my $readme_ext = '.readme.gz';
#
# I found out that gziping readme files doesn't save any
# space at one server and saves space on my laptop. YMMV
my $readme_ext = '.readme.gz';

### END CONFIG

## core -
use File::Path qw(mkpath);
use File::Basename qw(dirname);
use File::Spec::Functions qw(catfile devnull);
use File::Find qw(find);
use Getopt::Long;
use IO::Zlib;

## LWP -
use URI ();
use LWP::Simple qw(mirror RC_OK RC_NOT_MODIFIED);

## Compress::Zlib -
use Compress::Zlib qw(gzopen $gzerrno);

## Archive::Tar -
use Archive::Tar qw();

## process command-line arguments
my $result = GetOptions(
	"local=s" => \$LOCAL,
	"remote=s" => \$REMOTE,
	"verbose!" => \$TRACE,
	"debug!" => \$TRACE
	);

print "local path: $LOCAL\nremote URI: $REMOTE\n" if ($TRACE);

## first, get index files
my_mirror($_) for qw(
                     authors/01mailrc.txt.gz
                     modules/02packages.details.txt.gz
                     modules/03modlist.data.gz
		     MIRRORED.BY
                    );

## now walk the packages list
my $details = catfile($LOCAL, qw(modules 02packages.details.txt.gz));
my $gz = gzopen($details, "rb") or die "Cannot open details: $gzerrno";
my $inheader = 1;
while ($gz->gzreadline($_) > 0) {
  if ($inheader) {
    $inheader = 0 unless /\S/;
    next;
  }

  my ($module, $version, $path) = split;
  next if $path =~ m{/perl-5};  # skip Perl distributions
  my_mirror("authors/id/$path", 1);
}

## finally, clean the files we didn't stick there
clean_unmirrored();

print "creating 'indices/ls-lR.gz'\n";
system "cd $LOCAL && ls -lR | gzip > indices/ls-lR.gz" || die "$!";

exit 0;

BEGIN {
  ## %mirrored tracks the already done, keyed by filename
  ## 1 = local-checked, 2 = remote-mirrored
  my %mirrored;

  sub my_mirror {
    my $path = shift;           # partial URL
    my $skip_if_present = shift; # true/false

    my $remote_uri = URI->new_abs($path, $REMOTE)->as_string; # full URL
    my $local_file = catfile($LOCAL, split "/", $path); # native absolute file
    my $checksum_might_be_up_to_date = 1;

    if ($skip_if_present and -f $local_file) {
      ## upgrade to checked if not already
      $mirrored{$local_file} = 1 unless $mirrored{$local_file};
    } elsif (($mirrored{$local_file} || 0) < 2) {
      ## upgrade to full mirror
      $mirrored{$local_file} = 2;

      mkpath(dirname($local_file), $TRACE, 0711);
      print $path if $TRACE;
      my $status = mirror($remote_uri, $local_file);

      if ($status == RC_OK) {
        $checksum_might_be_up_to_date = 0;
        print " ... updated\n" if $TRACE;
      } elsif ($status != RC_NOT_MODIFIED) {
        warn "\n$remote_uri: $status\n";
        return;
      } else {
        print " ... up to date\n" if $TRACE;
      }
    }

    if ($path =~ m{^authors/id}) { # maybe fetch CHECKSUMS
      my $checksum_path =
        URI->new_abs("CHECKSUMS", $remote_uri)->rel($REMOTE);
      if ($path ne $checksum_path) {
        my_mirror($checksum_path, $checksum_might_be_up_to_date);
      }
    }
  }

  sub clean_unmirrored {
    find sub {
      return if /${readme_ext}$/; # don't erase readme files
      check_readme($File::Find::name) if ($mirrored{$File::Find::name} && $mirrored{$File::Find::name} == 2);
      return unless -f and not $mirrored{$File::Find::name};
      print "$File::Find::name ... removed\n" if $TRACE;
      unlink $_ or warn "Cannot remove $File::Find::name: $!";
      my $path = $File::Find::name;
      if ($path =~ s/(\.tar\.gz|\.tgz)/${readme_ext}/g && -f $path) {
        # only if we erase archive also!
        unlink $path or warn "Cannot remove $path: $!";
      }
    }, $LOCAL;
  }

  sub check_readme {

    my $path = shift;
    # fixup some things
    my $readme_path = $path;
    $readme_path =~ s/\.(tar\.gz|\.tgz)/${readme_ext}/g || return;	# just .tar.gz is supported!

    my $at = Archive::Tar->new($path) or die "Archive::Tar failed on $path\n";

    if (! -f $readme_path) {
      # create readme file
      my @readmes = sort grep m{^[^/]+/README\z}i, $at->list_files();
      my $readme;

      if ($readme = shift @readmes) {
      	my $fh;
	if ($readme_ext =~ m/\.gz/) {
        	$fh = IO::Zlib->new($readme_path, "wb");
	} else {
		$fh = IO::File->new($readme_path, "w");
	}
	if (defined $fh) {
		print $fh $at->get_content($readme);
		$fh->close;
	} else {
		die "Cannot create $readme_path: $!";
	}

	print "$readme_path ... created\n" if $TRACE;

      } else {

	$readme_path =~ s/^.+\/(.+)$/$1/;
	print "can't find readme for $readme_path ...\n" if $TRACE;

      }

    }
  }


}
