package SWISH::Split;

use 5.008;
use strict;
use warnings;

our $VERSION = '0.03';

use SWISH::API;
use Text::Iconv;
use File::Temp qw/ :mktemp /;
use Carp;
use Digest::MD5 qw(md5_hex);
use Memoize;
use File::Which;

use Data::Dumper;

use constant {
	ADDED => 1,
	DELETED => 2,
};

=head1 NAME

SWISH::Split - Perl interface to split index variant of Swish-e

=head1 SYNOPSIS

  use SWISH::Split;


=head1 DESCRIPTION

This is alternative interface for indexing data with swish-e. It's designed
to split indexes over multiple files (slices) to allow updates of records in index
by reindexing just changed parts (slice).

Data is stored in index using intrface which is somewhat similar to
L<Plucene::Simple>. This could make your migration (or supporting two index
engines) easier.

In the background, it will fork swish-e binaries (one for each index slice)
and produce UTF-8 encoded XML files for it. So, if your input charset isn't
C<ISO-8859-1> you will have to specify it.

=head1 Methods used for indexing

=head2 open_index

Create new object for index.

  my $i = SWISH::Split->open_index({
  	index => '/path/to/index',
	slice_name => \&slice_on_path,
	slices => 30,
	merge => 0,
	codepage => 'ISO-8859-2',
	swish_config => qq{
		PropertyNames from date
		PropertyNamesDate date
        },
	memoize_to_xml => 0,
  );

  # split index on first component of path
  sub slice_on_path {
	return shift split(/\//,$_[0]);
  }

Options to C<open_index> are following:

=over 5

=item C<index>

path to (existing) directory in which index slices will be created.

=item C<slice_name>

coderef to function which provide slicing from path.

=item C<slices>

maximum number of index slices. See L<"in_slice"> for
more explanation.

=item C<merge>

(planned) option to merge indexes into one at end.

=item C<codepage>

data codepage (needed for conversion to UTF-8).
By default, it's C<ISO-8859-1>.

=item C<swish_config>

additional parametars which will be inserted into
C<swish-e> configuration file. See C<swish-config>.

=item C<memoize_to_xml>

speed up repeatable data, see L<"to_xml">.

=back

=cut

my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8');

sub open_index {
        my $class = shift;
        my $self = {@_};
	bless($self, $class);

	croak "need slice_name coderef" unless ref $self->{'slice_name'};
	croak "need slices" unless $self->{'slices'};

	croak "need index" unless $self->{'index'};
	croak "index '",$self->{'index'},"' doesn't exist" unless -e $self->{'index'};
	croak "index '",$self->{'index'},"' is not directory" unless -d $self->{'index'};

	$iso2utf = Text::Iconv->new($self->{'codepage'},'UTF-8') if ($self->{'codepage'});

	# speedup
	memoize('in_slice');
	memoize('to_xml') if ($self->{'memoize_to_xml'});

	$self ? return $self : return undef;

}

=head2 add

Add document to index.

  $i->add($swishpath, {
	headline => 'foobar result',
	property => 'data',
  })

=cut

sub add {
	my $self = shift;

	my $swishpath = shift || return;
	my $data = shift || return;

	my $slice = $self->put_slice($swishpath, $self->to_xml($data));

	return $slice;
}

=head2 delete

Delete documents from index.

  $i->delete(@swishpath);

B<This function is not implemented.>

=cut

sub delete {
	my $self = shift;

	my @paths = @_ || return;

	foreach my $path (@paths) {
		$self->{'paths'}->{$path} = DELETED; 
	}

	die "delete is not yet implemented";

	return 42;
}


=head2 done

Finish indexing and close index file(s).

  $i->done;

This is most time-consuming operation. When it's called, it will re-index
all entries which haven't changed in all slices.

Returns number of slices updated.

This method should really be called close or finish, but both of those are
allready used.

=cut

sub done {
	my $self = shift;

	my $ret = 0;

	foreach my $s (keys %{$self->{'slice'}}) {
		$self->_debug("closing slice $s");
		$ret += $self->close_slice($s);
	}

	return $ret;
}



=head1 Reporting methods

This methods return statistics about your index.

=head2 swishpaths

Return array of C<swishpath>s in index.

  my @p = $i->swishpaths;

=cut

sub swishpaths {
	my $self = shift;

	my $s = shift || return;
	return if (! exists($self->{'slice'}->{'s'}));

	return keys %{$self->{'slice'}->{'s'}};
}

=head2 swishpaths_updated

Return array with updated C<swishpath>s.

  my @d = $i->swishpaths_updated;

=cut

sub swishpaths_updated {
	my $self = shift;
}


=head2 swishpaths_deleted

Return array with deleted C<swishpath>s.

  my $n = $i->swishpaths_deleted;

=cut

sub swishpaths_deleted {
	my $self = shift;
}


=head2 slices

Return array with all slice names.

  my @s = $i->slices;

=cut

sub slices {
	my $self = shift;
}

=head1 Helper methods

This methods are used internally, but they might be useful.

=head2 in_slice

Takes path and return slice in which this path belongs.

  my $s = $i->in_slice('path/to/document/in/index');

If there are C<slices> parametar to L<"open_index"> it will use
MD5 hash to spread documents across slices. That will produce random
distribution of your documents in slices, which might or might not be best
for your data. If you have to re-index large number of slices on each
run, think about creating your own C<slice> function and distributing
documents manually across slices.

Slice number must always be true value or various sanity checks will fail.

This function is C<Memoize>ed for performance reasons.

=cut

sub in_slice {
	my $self = shift;

	my $path = shift || confess "need path";

	confess "need slice_name function" unless ref ($self->{'slice_name'});

	if ($self->{'slices'}) {
		# first, pass path through slice_name function
		my $slice = &{$self->{'slice_name'}}($path);
		# then calculate MD5 hash
		my $hash = md5_hex($slice);
		# take first 8 chars to produce number
		# FIXME how random is this?
		$hash = hex(substr($hash,0,8));
		
		$slice = ($hash % $self->{'slices'}) + 1;
		$self->_debug("hash: $hash / ",$self->{'slices'}," => $slice");
		return $slice;
	} else {
		return &{$self->{'split'}}($path);
	}
}

=head2 find_paths

Return array of C<swishpath>s for given C<swish-e> query.

  my @p = $i->find_paths("headline=test*");

Useful for combining with L<"delete_documents"> to delete documents
which hasn't changed a while (so, expired).

=cut

sub find_paths {
	my $self = shift;

}


=head2 make_config

Create C<swish-e> configuration file for given slice.

  my $config_filename = $i->make_config('slice name');

It returns configuration filename. If no C<swish_config> was defined in
L<"open_index">, default swish-e configuration will be used. It will index all data for
searching, but none for properties.

If you want to see what is allready defined for swish-e in configuration
take a look at source code for C<DEFAULT_SWISH_CONF>.

It uses C<stdin> as C<IndexDir> to comunicate with C<swish-e>.

=cut

sub make_config {
	my $self = shift;


	my $index_file = $self->{'index'}."/";
	$index_file .= shift || confess "need slice name";

	my ($tmp_fh, $swish_config_filename) = mkstemp("/tmp/swishXXXXX");

	# find cat on filesystem
	my $cat = which('cat');

	print $tmp_fh <<"DEFAULT_SWISH_CONF";
# swish-e config file

IndexDir stdin

# input file definition
DefaultContents XML*

# indexed metatags
MetaNames xml swishdocpath


#XMLClassAttributes type
UndefinedMetaTags auto
UndefinedXMLAttributes auto

IndexFile $index_file

# Croatian ISO-8859-2 characters to unaccented equivalents
TranslateCharacters ¹©ðÐèÈæÆ¾® ssddcccczz


# disable output
ParserWarnLevel 0
IndexReport 1

DEFAULT_SWISH_CONF

	# add user parametars (like stored properties)
	print $tmp_fh $self->{'swish_config'} if ($self->{'swish_config'});

	close($tmp_fh);

	return $swish_config_filename;
}

=head2 create_slice

On first run, starts C<swish-e>. On subsequent calls just return
it's handles using C<Memoize>.

  my $s = create_slice('/path/to/document');

You shouldn't need to call C<create_slice> directly because it will be called
from L<"put_slice"> when needed.

=cut

sub create_slice {
	my $self = shift;

	my $path = shift || confess "create_slice need path!";

	my $s = $self->in_slice($path) || confess "in_slice returned null";

	return $s if (exists($self->{'slice'}->{$s}));

	my $swish_config = $self->make_config($s);

	my $swish = qq{| swish-e };
	if (-f $self->{'index'}.'/'.$s) {
		$swish .= qq{ -u };
		$self->{'slice'}->{$s}->{'update_mode'}++;
	}
	$swish .= qq{ -S prog -c } . $swish_config;

	$self->_debug("creating slice $s using $swish");

	## Build the harness, open all pipes, and launch the subprocesses
	open(my $fh, $swish) || croak "can't open $swish: $!";

	$self->{'slice'}->{$s}->{'h'} = $fh;

	$self->slice_output($s);

	return $s;
}

=head2 put_slice

Pass XML data to swish.

  my $slice = $i->put_slice('/swish/path', '<xml>data</xml>');

Returns slice in which XML ended up.

=cut

sub put_slice {
	my $self = shift;

	my $path = shift || confess "need path";
	my $xml = shift || confess "need xml";

	$xml = $iso2utf->convert($xml) || carp "XML conversion error in $xml";

	my $s = $self->create_slice($path) || confess "create_slice returned null";

	confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
	confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));

	$self->slice_output($s);

	use bytes;      # as opposed to chars
	my $fh = $self->{'slice'}->{$s}->{'h'} || confess "handle for slice $s undefined";

	my $update_header = "Update-Mode: Index\n";
	$update_header = '' unless ($self->{'slice'}->{$s}->{'update_mode'});

	print { $fh } "Path-Name: $path\n".
		"Content-Length: ".(length($xml)+1)."\n" . $update_header . 
		"Document-Type: XML\n\n$xml\n";

	$self->slice_output($s);

	$self->_debug("dumping in slice $s: $path");

	$self->{'paths'}->{$path} = ADDED; 

	return $s;
}

=head2 slice_output

Prints to STDERR output and errors from C<swish-e>.

  my $slice = $i->slice_output($s);

Normally, you don't need to call it.

B<This is dummy placeholder function for very old code that assumes this
module is using C<IPC::Run> which it isn't any more.>

=cut

sub slice_output {
	my $self = shift;

	my $s = shift || confess "slice_output needs slice";

	confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));

	# FIXME

	return $s;
}

=head2 close_slice

Close slice (terminates swish-e process for that slice).

  my $i->close_slice($s);

Returns true if slice is closed, false otherwise.

=cut

sub close_slice {
	my $self = shift;

	my $s = shift || confess "close_slice needs slice";

	confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
	confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));

	# pump rest of content (if any)
	close $self->{'slice'}->{$s}->{'h'} || carp "can't close slice $s: $!";

	$self->slice_output($s);

	undef $self->{'slice'}->{$s}->{'h'};
	
	delete($self->{'slice'}->{$s}) && return 1;
	return 0;
}

=head2 to_xml

Convert (binary safe, I hope) your data into XML for C<swish-e>.
Data will not yet be recoded to UTF-8. L<"put_slice"> will do that.

  my $xml = $i->to_xml({ foo => 'bar' });

This function is extracted from L<"add"> method so that you can C<Memoize> it.
If your data set has a lot of repeatable data, and memory is not a problem, you
can add C<memoize_to_xml> option to L<"open_index">.

=cut

my %escape = ('<'=>'&lt;', '>'=>'&gt;', '&'=>'&amp;', '"'=>'&quot;');
my $escape_re  = join '|' => keys %escape;

sub to_xml {
	my $self = shift;

	my $data = shift || return;

	my $xml = qq{<xml>};
	foreach my $tag (keys %$data) {
		my $content = $data->{$tag};
		next if (! $content || $content eq '');
		# save [cr/]lf before conversion to XML
#		$content =~ s/\n\r/##lf##/gs;
#		$content =~ s/\n/##lf##/gs;
		$content =~ s/($escape_re)/$escape{$1}/gs;
		$xml .= "<$tag><![CDATA[".$content."]]></$tag>";
	}
	$xml .= qq{</xml>};
}

sub _debug {
	my $self = shift;
	print STDERR "## ",@_,"\n" if ($self->{'debug'});
	return;
}

1;
__END__


=head1 Searching

Searching is still conducted using L<SWISH::API>, but you have to glob
index names.

    use SWISH::API;

    my $swish = SWISH::API->new( glob('index.swish-e/*') );

You can also alternativly create merged index (using C<merge> option) and
not change your source code at all.

That would also benefit performance, but it increases indexing time
because merged indexes must be re-created on each indexing run.

=head1 EXPORT

Nothing by default.

=head1 EXAMPLES

Test script for this module uses all parts of API. It's also nice example
how to use C<SWISH::Split>.

=head1 SEE ALSO

L<SWISH::API>,
L<http://www.swish-e.org/>

=head1 AUTHOR

Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2004 by Dobrica Pavlinusic

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.4 or,
at your option, any later version of Perl 5 you may have available.


=cut
