#!/usr/bin/perl

use DBI;

sub load_data {
    $dbh = DBI->connect("dbi:Pg:dbname=search", "", "")
	or die "cannot connect to database!";

    # load filenames
    open FILE, "<files";
    @files = ();
    @n_chars = ();
    @n_words = ();

    my $sth = $dbh->prepare("SELECT doc_id, ri, n_words, n_chars FROM doc")
	or die "cannot prepare 1";
    $sth->execute
	or die "cannot execute 1";

    while (my $data = $sth->fetchrow_arrayref) {
	my ($doc_id, $ri, $n_words, $n_chars) = @$data;
	$files[$doc_id] = $ri;
	$n_chars[$doc_id] = $n_chars;
	$n_words[$doc_id] = $n_words;
    }

    # load words list
    @words = ();

    my $sth = $dbh->prepare("SELECT word FROM word")
    	or die "cannot prepare 2";
    $sth->execute
	or die "cannot execute 2";

    while (my ($word) = $sth->fetchrow_array) { 
	push @words, $word;
    }

    @words = sort @words;

    # prepare select word query
    $select_word = $dbh->prepare("SELECT docpos FROM word WHERE word = ?");
}

sub set_2_and {
    my ($words1, $words2, $cmp_fn) = @_;
    $cmp_fn ||= sub { return $_[0] <=> $_[1] };
    my ($i1, $i2) = (0, 0);
    my @and = ();
    while ($i1 < @$words1 and $i2 < @$words2) {
	my $cmp = &$cmp_fn($words1->[$i1], $words2->[$i2]);
	if ($cmp == 0) { push @and, $words1->[$i1]; ++$i1; ++$i2 }
	elsif ($cmp < 0) { ++$i1 }
	else { ++$i2 }
    }
    return [@and];
}

sub set_2_or {
    my ($words1, $words2, $cmp_fn) = @_;
    $cmp_fn ||= sub { return $_[0] <=> $_[1] };
    my ($i1, $i2) = (0, 0);
    my @or = ();
    while ($i1 < @$words1 and $i2 < @$words2) {
	my $cmp = &$cmp_fn($words1->[$i1], $words2->[$i2]);
	if ($cmp == 0) { push @or, $words1->[$i1]; ++$i1; ++$i2 }
	elsif ($cmp < 0) { push @or, $words1->[$i1]; ++$i1 }
	else { push @or, $words2->[$i2]; ++$i2 }
    }
    if ($i1 < @$words1) {
	push @or, @$words1[$i1..$#$words1];
    } elsif ($i2 < @$words2) {
	push @or, @$words2[$i2..$#$words2];
    }
    return [@or];
}

sub set_and {
    my ($rara_words, $cmp_fn) = @_;
    my $and = shift @$rara_words || [];
    for my $words (@$rara_words) {
	$and = set_2_and($and, $words, $cmp_fn);
    }
    return $and;
}

sub set_or {
    my ($rara_words, $cmp_fn) = @_;
    my $or = [];
    for $words (@$rara_words) {
	$or = set_2_or($or, $words, $cmp_fn);
    }
    return $or;
}

sub regexp {
    my $pattern = shift;
    my @matches;
    for $word (@words) {
	push @matches, $word if $word =~ /$pattern/;
    }
    return @matches;
}

sub partial {
    my $substr = shift;
    my @matches;
    for $word (@words) {
	push @matches, $word if index ($word, $substr) >= 0;
    }
    return @matches;
}

#broken (uses filepos_for_word)
sub filepos_for_word {
    my $word = shift;
    my %filepos = ();
    for $filepos (split ' ', $index{$word}) {
	my ($file, $pos) = split /:/, $filepos;
	push @{$filepos{$file}}, split /,/, $pos;
    }
    return \%filepos;
}

sub docpos {
    my $word = shift;
    $select_word->execute($word);
    my ($docpos) = $select_word->fetchrow_array;
}

sub unranked {
    my $word = shift;
    $docpos = docpos($word);
    return $docpos =~ /^(.*?):/, $docpos =~ / (.*?):/g;
}

#broken (uses filepos_for_word)
sub frequency_rank {
    my ($file, $words) = @_;
    my $rank = 0;
    for $word (@$words) {
	my $filepos = filepos_for_word($word);
	$rank += @{$filepos->{$file}};
    }
    return $rank;
}

#broken (uses filepos_for_word)
sub distribution_rank {
    my ($file, $words) = @_;
    my $average_pos = 0;
    my @pos = ();
    for my $word (@$words) {
	my $filepos = filepos_for_word($word);
	for my $pos (@{$filepos->{$file}}) {
	    push @pos, $pos;
	    $average_pos += $pos;
	}
    }
    $average_pos /= @pos;

    my $sd = 0;
    if ($#pos > 0) {
	for my $pos (@pos) {
	    my $d = $pos-$average_pos;
	    $sd += $d*$d;
	}
	$sd /= $#pos;
	$sd = sqrt($sd);
    }
    return $sd / $n_chars[$file];
}

#broken (uses filepos_for_word)
sub rank {
    my ($file, $words) = @_;
    my $fr = frequency_rank(@_);
#    my $dr = distribution_rank(@_);
#    my $rank = $fr * -5000 + $dr * 50 - log($n_words[$file]);
#    print $file, "\t", $fr * -5000, "\t", $dr * 50, "\t", log($n_words[$file]), "\t", $rank, "\n";
    return -$fr;
}

sub unranked_and {
    my @words = @_;
    return @{set_and([map {[sort { $a <=> $b } unranked($_)]} @words])};
}

sub unranked_or {
    my @words = @_;
    return @{set_or([map {[sort { $a <=> $b } unranked($_)]} @words])};
}

sub unranked_phrase {
    # for each matching document, get a list of all word positions of each match
    # sub 1 from positions for 2nd word, 2 for 3rd word, etc.
    # do a set_and to determine the positions of phrase matches in each document, and how many there are
    my $word = shift;
    $docpos = docpos($word);
    %docpos = $docpos =~ /(?:^| )(.*?):(.*?)( |$)/g;
    # etc...
}

#broken (uses filepos_for_word)
sub ranked_and {
    my @words = @_;
    my $files = set_and([map {[unranked($_)]} @words]);
    my $words = [@words];
    my @rank = map {rank($_, $words)} @$files;
    return @$files[sort {$rank[$a] <=> $rank[$b]} (0..$#$files)];
}

sub ranked_or {
    
}

sub names {
    my $nos;
    if (ref $_[0]) { $nos = shift; }
    else { $nos = \@_; }
    return @files[@$nos];
}

#  sub word_frequency {
#      my %freq;
#      for my $word (@words) {
#  	my @filepos = keys %{$index{$word}};
#  	my $freq = @filepos;
#  	$freq{$word} = $freq;
#      }
#      return sort {$b->[1] <=> $a->[1]} map {[$_, $freq{$_}]} @words;
#  }

sub start_server {
    use Socket;
    my $port = 2001;
    my $proto = getprotobyname 'tcp';
    $SIG{CHLD} = sub { wait };

    socket SERVER, PF_INET, SOCK_STREAM, $proto
	or die "socket: $!";

    setsockopt SERVER, SOL_SOCKET, SO_REUSEADDR, 1
	or die "setsockopt: $!";

    bind SERVER, sockaddr_in($port, INADDR_ANY)
	or die "bind: $!";

    listen SERVER, 5
	or die "listen: $!";

    print "$0 listening to port $port\n";

    for (;;) {
	my $addr = accept CLIENT, SERVER;
	my $client_host = gethostbyaddr((unpack_sockaddr_in $addr)[1], AF_INET);

	$words = <CLIENT>;
	print CLIENT join "\n", names(unranked_and(split " ", $words));
	close CLIENT;
#	die "can't fork: $!"
#	    unless defined (my $kid = fork());
#	if (not $kid) { # this is the child
#	    my $words = <CLIENT>;
#	    print CLIENT join "\n", names(ranked_and(split " ", $words));
#	    exit;
#	} else { # this is the parent
#	    close CLIENT;
#	}
    }
}

load_data;
start_server;

#print join "\n", names(ranked_and(''));

#ranked_and('lambda', 'lisp');

#print join ' ', @{set_or([[qw(a b c d e f g)], [qw(a c e g j z)], [qw(c d e j q)]], sub {$_[0] cmp $_[1]} )};

#ranked_and('lambda', 'sort');

#print join ' ', unranked('lambda'), "\n\n";
#print join ' ', unranked('calculus'), "\n\n";
#print join ' ', unranked('lisp'), "\n\n";
#print join ' ', names(set_and([unranked('lambda')], [unranked('lisp')])), "\n";
#for (word_frequency) {
#    print "$_->[0] $_->[1]\n";
#}