#!/usr/bin/perl -w
#

use strict;

# Root of the HTML tree to index; printed URLs are relative to this directory.
my $home = "/home/dpavlin/private/home_html";

# Queue of directories still to be scanned (breadth-first walk).
my @dirs = ( $home );

# Walk the tree under $home and, for every *.htm / *.html file, print three
# lines to STDOUT:
#   1. the page text with markup removed and whitespace collapsed
#   2. the page title (falls back to the file path when no <title> is found)
#   3. the file path relative to $home
while (@dirs) {
	my $path = shift @dirs;
	opendir(my $dh, $path) || die "can't opendir $path: $!";
	# Read all entries up front so the handle can be closed before any
	# file in this directory is processed.
	my @entries = readdir($dh);
	closedir($dh);

	foreach my $entry (@entries) {
		next if ($entry =~ /^\./);	# skip .dot files (this also skips . and ..)

		my $file = "$path/$entry";
		if (-d $file) {
			push @dirs, $file;
			next;
		}
		next unless (-f $file && $entry =~ /\.html?$/i);

		# Three-argument open with a lexical handle, so a file name with
		# leading/trailing mode characters cannot be misinterpreted.
		my $fh;
		unless (open($fh, '<', $file)) {
			# Skip unreadable files instead of emitting an empty record.
			warn "can't open $file: $!";
			next;
		}

		# Join all lines with single spaces so the patterns below can
		# match constructs that span several source lines.
		my $html = "";
		while (my $line = <$fh>) {
			chomp $line;
			$html .= "$line ";
		}
		close($fh);

		my $title = $file;
		$title = $1 if ($html =~ m/<title[^>]*>([^<]+)<\/title>/i);

		# The region removals must run BEFORE generic tag stripping:
		# once the tags are gone the markers can no longer be found.
		# .*? (rather than .+?) keeps an empty region from swallowing
		# everything up to the next closing marker.

		# remove noindex
		$html =~ s,<noindex>.*?</noindex>,,isg;
		$html =~ s,<!--\s*noindex\s*-->.*?<!--\s*/noindex\s*-->,,isg;
		# remove all script from indexing content (with or without attributes)
		$html =~ s,<script\b[^>]*>.*?</script>,,isg;

		# strip the remaining tags and collapse runs of whitespace
		$html =~ s/<\/?[^>]+>//g;
		$html =~ s/\s+/ /g;

		# Make the path relative to $home; \Q..\E keeps any regex
		# metacharacters in $home from being interpreted as a pattern.
		my $url = $file;
		$url =~ s/^\Q$home\E//;
		$url =~ s/^\///;

		print "$html\n$title\n$url\n";
	}
}

