#!/usr/bin/perl

=head1 NAME

vec_search.pl - simple command-line vector space search engine

=head1 SYNOPSIS

    vec_search.pl QUERY

=head1 DESCRIPTION

This code is based on a great article (the reference is missing from this
file).

It's also based on code from the blosxom search plug-in by John Todd Larason
L<http://molelog.molehill.org/plugins/52search> and on
Search::VectorSpace::MT by Kellan Elliott-McCrea
L<http://laughingmeme.org/archives/2003_03_21.html>

=cut

use strict;
use warnings;

package Search::VectorSpace::Simple;
use base qw(Search::VectorSpace);
use Lingua::Stem;
use HTML::TokeParser;

use File::Find;
use File::stat;
use PDL::IO::Storable;
use Search::VectorSpace;
use Storable qw/lock_store lock_retrieve/;
use Data::Dumper;

# Root of the document tree that build_engine() scans for files to index.
my $datadir = '/home/dpavlin/private/home_html/papers/';

# Disabled settings, kept for reference only.
#$search_on 	= 1 unless defined $search_on;
#$auto_search_on = 1 unless defined $auto_search_on;
#die "problem getting config" unless defined $datadir;
#$file_extension ||= "story";

#$similar_on     = 0;
my $similar_thresh = 0.3;

# Directory that holds the persistent state files.
my $state_dir = '/tmp/';
# Serialized index: written by build_engine(), loaded by search().
my $store_file_name = $state_dir . '/vecsearch.stor';

# In-memory engine object; search() fills it in on first use.
my $engine;

# On-disk cache and its in-memory copy, managed by read_cache()/save_cache().
my $cachefile = $state_dir . '/vecsearch.cache';
my $cache;
my $save_cache = 1;    # write the cache back to disk at the end of a run

# Load the cache from $cachefile into $cache and report whether it is usable.
#
# The cache counts as valid only when it is a hash with a 'starttime' entry
# that is newer than the modification time of the index file
# ($store_file_name).  In every other case -- no cache file, a cache that
# cannot be deserialized, a missing index file, or a cache older than the
# index -- $cache is reset to a fresh hash stamped with this process's start
# time ($^T).
#
# Returns 1 when the stored cache was kept, 0 when it was reset.
sub read_cache {
    $cache = undef;
    if (-r $cachefile) {
        # A damaged cache must not abort the run; treat it as a cache miss.
        $cache = eval { Storable::lock_retrieve($cachefile) };
        warn "ignoring unreadable cache $cachefile: $@" if $@;
    }

    # File::stat's stat() yields undef when the index file does not exist,
    # so the result has to be checked before calling ->mtime on it.
    my $index_stat = stat($store_file_name);

    if (   ref($cache) eq 'HASH'
        && defined $cache->{starttime}
        && $index_stat
        && $cache->{starttime} > $index_stat->mtime)
    {
        return 1;
    }

    $cache = { starttime => $^T };
    return 0;
}

# Write $cache to $cachefile, provided cache saving is switched on
# ($save_cache).  Returns whatever Storable::lock_store() returns, or nothing
# when saving is disabled.
sub save_cache {
    if ($save_cache) {
        return Storable::lock_store($cache, $cachefile);
    }
    return;
}

# Run a query against the stored index.
#
# Arguments: one or more query words.  All defined arguments are joined with
# single spaces into one query string, so search('foo bar') and
# search('foo', 'bar') are equivalent.
#
# The engine is loaded from $store_file_name on first use and kept in
# $engine for later calls.
#
# Returns a list of array references sorted by score, best first.  Each
# element is [score, field1, field2, field3], where the fields are the first
# three newline-separated parts of the matched document (labelled filename,
# title and body by the original author).
#
# NOTE(review): when no engine can be loaded the sub returns the plain value
# 1 rather than an empty list; kept as is for backward compatibility.
sub search {
    my $query = join ' ', grep { defined } @_;

    $engine ||= lock_retrieve($store_file_name);
    return 1 if (!defined($engine));

    # Keys are the full document texts, values are their scores.
    my %results = $engine->search($query);

    return sort { $b->[0] <=> $a->[0] }
           map  { [ $results{$_}, split(/\n/, $_, 3) ] }
           keys %results;
}

# Build the search index from the HTML files below $datadir and store it in
# $store_file_name.
#
# Every readable regular file whose name ends in .htm or .html (case
# insensitive) is read, prefixed with its own path on the first line, and
# stripped of markup tags before being handed to Search::VectorSpace.
# Symbolic links are followed while scanning.  Files that cannot be opened
# are skipped with a warning instead of being indexed as empty documents.
#
# Prints one "Feeding <file>" progress line per file.  Dies if the finished
# index cannot be written.  Returns 1 on success.
sub build_engine {
    my @files;
    find(
        {
            wanted => sub {
                my $path = $File::Find::name;
                push @files, $path
                    if $path =~ m{\.html?\z}i && -f $path && -r $path;
            },
            follow => 1,
        },
        $datadir
    );

    my @docs;
    for my $file (@files) {
        print "Feeding $file\n";

        my $fh;
        unless (open $fh, '<', $file) {
            warn "skipping $file: $!\n";
            next;
        }
        # The path goes first so it can be recovered from a search result.
        my $data = join '', "$file\n", <$fh>;
        close $fh;

        # Crude tag removal: drop anything that looks like <...> or </...>.
        $data =~ s/<\/*[^>]+>//gs;
        push @docs, $data if $data;
    }

    # Kept in a local variable; search() loads the index back from disk.
    my $index = Search::VectorSpace->new(docs => \@docs, threshold => .04);
    $index->build_index();
    lock_store($index, $store_file_name)
        or die "cannot write index to $store_file_name: $!\n";
    return 1;
}

### main

# Load the cache, rebuild the index, run the query given on the command
# line, dump the hits, and finally write the cache back.
read_cache();
# The index is rebuilt on every run; the guard below is disabled.
build_engine();    # if (! -r $store_file_name);
my @hits = search(@ARGV);
print Dumper(@hits);
save_cache();

