#!/usr/bin/perl
#
# Build a positional word index over a tree of HTML files and store it in
# the PostgreSQL database "search".
#
# For every *.html file found under /home/sam/cltl:
#   * a doc_id is drawn from the sequence seq_doc,
#   * the file is tokenised into lower-cased words,
#   * one row (doc_id, path, word count, size in bytes) goes into "doc".
#
# Once all files are scanned, one row per distinct word is written to
# "word".  Its docpos column has the form
#
#     "<doc>:<pos>,<pos>,... <doc>:<pos>,..."
#
# where <doc> is the file's index (identical to its doc_id, see below) and
# <pos> is the zero-based ordinal of the word among the indexed words of
# that file.

use strict;
use warnings;

use DBI;

my $dbh = DBI->connect("dbi:Pg:dbname=search", "", "")
    or die "cannot connect to database!";

my $insert_word = $dbh->prepare("INSERT INTO word (word, docpos) VALUES (?, ?)")
    or die "cannot prepare 1";
my $insert_doc = $dbh->prepare("INSERT INTO doc (doc_id, ri, n_words, n_chars) VALUES (?, ?, ?, ?)")
    or die "cannot prepare 2";
my $next_doc_id = $dbh->prepare("SELECT NEXTVAL ('seq_doc')")
    or die "cannot prepare 3";

# The command line is a fixed string, so nothing untrusted reaches the shell.
my @files = `find /home/sam/cltl -name '*.html'`;
chomp @files;

# $index{$word}{$file_index} = [ word positions within that file ]
my %index;

for my $file_index (0 .. $#files) {
    $next_doc_id->execute or die "cannot get next doc_id";
    my ($doc_id) = $next_doc_id->fetchrow_array;
    $next_doc_id->finish;

    # The index is keyed by file index, so the sequence must hand out
    # exactly 0, 1, 2, ... in step with the file list.
    defined $doc_id && $doc_id == $file_index
        or die "seq_doc not working";

    my $file = $files[$file_index];
    print STDERR "$file\n";

    # Slurp the whole file.  Three-arg open with a lexical handle keeps
    # odd characters in a filename from being interpreted as an open mode,
    # and the handle is closed automatically when the block ends.
    my $text = do {
        local $/;
        open my $fh, '<', $file or die "cannot load file: $file ($!)";
        <$fh>;
    };
    $text = '' unless defined $text;

    my $word_pos = 0;
    while ($text =~ /([\w-]+)/g) {
        my $word = lc $1;

        # A word must contain at least one letter; tokens without one are
        # skipped and do not consume a position.
        next unless $word =~ /[a-z]/;

        push @{ $index{$word}{$file_index} }, $word_pos;
        $word_pos++;
    }

    # -s yields the size in bytes, stored in the n_chars column.
    my $n_chars = -s $file;
    $insert_doc->execute($doc_id, $file, $word_pos, $n_chars)
        or die "cannot insert doc $doc_id, $file, undef, $word_pos, $n_chars";
}
print "ok until end of scan\n";

my @words = sort keys %index;
print "ok until after sort\n";

# Flatten each word's postings into its docpos string, releasing the nested
# structure as we go so both representations are never fully held at once.
my %docpos;
for my $word (@words) {
    my $postings = delete $index{$word};

    # Documents are emitted in ascending numeric order so that the stored
    # string is the same from run to run (hash key order is arbitrary).
    $docpos{$word} = join ' ',
        map { $_ . ":" . join ',', @{ $postings->{$_} } }
        sort { $a <=> $b } keys %$postings;
}
undef %index;
print "ok until dump index\n";

# dump index
for my $word (@words) {
    $insert_word->execute($word, $docpos{$word})
        or die "cannot insert word $word, $docpos{$word}";
}

$dbh->disconnect;