#!/usr/bin/perl use DBI; sub load_data { $dbh = DBI->connect("dbi:Pg:dbname=search", "", "") or die "cannot connect to database!"; # load filenames open FILE, "prepare("SELECT doc_id, ri, n_words, n_chars FROM doc") or die "cannot prepare 1"; $sth->execute or die "cannot execute 1"; while (my $data = $sth->fetchrow_arrayref) { my ($doc_id, $ri, $n_words, $n_chars) = @$data; $files[$doc_id] = $ri; $n_chars[$doc_id] = $n_chars; $n_words[$doc_id] = $n_words; } # load words list @words = (); my $sth = $dbh->prepare("SELECT word FROM word") or die "cannot prepare 2"; $sth->execute or die "cannot execute 2"; while (my ($word) = $sth->fetchrow_array) { push @words, $word; } @words = sort @words; # prepare select word query $select_word = $dbh->prepare("SELECT docpos FROM word WHERE word = ?"); } sub set_2_and { my ($words1, $words2, $cmp_fn) = @_; $cmp_fn ||= sub { return $_[0] <=> $_[1] }; my ($i1, $i2) = (0, 0); my @and = (); while ($i1 < @$words1 and $i2 < @$words2) { my $cmp = &$cmp_fn($words1->[$i1], $words2->[$i2]); if ($cmp == 0) { push @and, $words1->[$i1]; ++$i1; ++$i2 } elsif ($cmp < 0) { ++$i1 } else { ++$i2 } } return [@and]; } sub set_2_or { my ($words1, $words2, $cmp_fn) = @_; $cmp_fn ||= sub { return $_[0] <=> $_[1] }; my ($i1, $i2) = (0, 0); my @or = (); while ($i1 < @$words1 and $i2 < @$words2) { my $cmp = &$cmp_fn($words1->[$i1], $words2->[$i2]); if ($cmp == 0) { push @or, $words1->[$i1]; ++$i1; ++$i2 } elsif ($cmp < 0) { push @or, $words1->[$i1]; ++$i1 } else { push @or, $words2->[$i2]; ++$i2 } } if ($i1 < @$words1) { push @or, @$words1[$i1..$#$words1]; } elsif ($i2 < @$words2) { push @or, @$words2[$i2..$#$words2]; } return [@or]; } sub set_and { my ($rara_words, $cmp_fn) = @_; my $and = shift @$rara_words || []; for my $words (@$rara_words) { $and = set_2_and($and, $words, $cmp_fn); } return $and; } sub set_or { my ($rara_words, $cmp_fn) = @_; my $or = []; for $words (@$rara_words) { $or = set_2_or($or, $words, $cmp_fn); } return $or; } sub regexp { my $pattern = shift; my @matches; for $word (@words) { push @matches, $word if $word =~ /$pattern/; } return @matches; } sub partial { my $substr = shift; my @matches; for $word (@words) { push @matches, $word if index ($word, $substr) >= 0; } return @matches; } #broken (uses filepos_for_word) sub filepos_for_word { my $word = shift; my %filepos = (); for $filepos (split ' ', $index{$word}) { my ($file, $pos) = split /:/, $filepos; push @{$filepos{$file}}, split /,/, $pos; } return \%filepos; } sub docpos { my $word = shift; $select_word->execute($word); my ($docpos) = $select_word->fetchrow_array; } sub unranked { my $word = shift; $docpos = docpos($word); return $docpos =~ /^(.*?):/, $docpos =~ / (.*?):/g; } #broken (uses filepos_for_word) sub frequency_rank { my ($file, $words) = @_; my $rank = 0; for $word (@$words) { my $filepos = filepos_for_word($word); $rank += @{$filepos->{$file}}; } return $rank; } #broken (uses filepos_for_word) sub distribution_rank { my ($file, $words) = @_; my $average_pos = 0; my @pos = (); for my $word (@$words) { my $filepos = filepos_for_word($word); for my $pos (@{$filepos->{$file}}) { push @pos, $pos; $average_pos += $pos; } } $average_pos /= @pos; my $sd = 0; if ($#pos > 0) { for my $pos (@pos) { my $d = $pos-$average_pos; $sd += $d*$d; } $sd /= $#pos; $sd = sqrt($sd); } return $sd / $n_chars[$file]; } #broken (uses filepos_for_word) sub rank { my ($file, $words) = @_; my $fr = frequency_rank(@_); # my $dr = distribution_rank(@_); # my $rank = $fr * -5000 + $dr * 50 - log($n_words[$file]); # print $file, "\t", $fr * -5000, "\t", $dr * 50, "\t", log($n_words[$file]), "\t", $rank, "\n"; return -$fr; } sub unranked_and { my @words = @_; return @{set_and([map {[sort { $a <=> $b } unranked($_)]} @words])}; } sub unranked_or { my @words = @_; return @{set_or([map {[sort { $a <=> $b } unranked($_)]} @words])}; } sub unranked_phrase { # for each matching document, get a list of all word positions of each match # sub 1 from positions for 2nd word, 2 for 3rd word, etc. # do a set_and to determine the positions of phrase matches in each document, and how many there are my $word = shift; $docpos = docpos($word); %docpos = $docpos =~ /(?:^| )(.*?):(.*?)( |$)/g; # etc... } #broken (uses filepos_for_word) sub ranked_and { my @words = @_; my $files = set_and([map {[unranked($_)]} @words]); my $words = [@words]; my @rank = map {rank($_, $words)} @$files; return @$files[sort {$rank[$a] <=> $rank[$b]} (0..$#$files)]; } sub ranked_or { } sub names { my $nos; if (ref $_[0]) { $nos = shift; } else { $nos = \@_; } return @files[@$nos]; } # sub word_frequency { # my %freq; # for my $word (@words) { # my @filepos = keys %{$index{$word}}; # my $freq = @filepos; # $freq{$word} = $freq; # } # return sort {$b->[1] <=> $a->[1]} map {[$_, $freq{$_}]} @words; # } sub start_server { use Socket; my $port = 2001; my $proto = getprotobyname 'tcp'; $SIG{CHLD} = sub { wait }; socket SERVER, PF_INET, SOCK_STREAM, $proto or die "socket: $!"; setsockopt SERVER, SOL_SOCKET, SO_REUSEADDR, 1 or die "setsockopt: $!"; bind SERVER, sockaddr_in($port, INADDR_ANY) or die "bind: $!"; listen SERVER, 5 or die "listen: $!"; print "$0 listening to port $port\n"; for (;;) { my $addr = accept CLIENT, SERVER; my $client_host = gethostbyaddr((unpack_sockaddr_in $addr)[1], AF_INET); $words = ; print CLIENT join "\n", names(unranked_and(split " ", $words)); close CLIENT; # die "can't fork: $!" # unless defined (my $kid = fork()); # if (not $kid) { # this is the child # my $words = ; # print CLIENT join "\n", names(ranked_and(split " ", $words)); # exit; # } else { # this is the parent # close CLIENT; # } } } load_data; start_server; #print join "\n", names(ranked_and('')); #ranked_and('lambda', 'lisp'); #print join ' ', @{set_or([[qw(a b c d e f g)], [qw(a c e g j z)], [qw(c d e j q)]], sub {$_[0] cmp $_[1]} )}; #ranked_and('lambda', 'sort'); #print join ' ', unranked('lambda'), "\n\n"; #print join ' ', unranked('calculus'), "\n\n"; #print join ' ', unranked('lisp'), "\n\n"; #print join ' ', names(set_and([unranked('lambda')], [unranked('lisp')])), "\n"; #for (word_frequency) { # print "$_->[0] $_->[1]\n"; #}