#!/usr/bin/perl
#
# Build a positional word index over a tree of HTML files.
#
# Usage: perl SCRIPT [ROOT_DIR]
#
# ROOT_DIR defaults to /home/sam/cltl.  Every regular file under it whose
# name ends in ".html" is read and split into words (runs of word characters
# and hyphens, lower-cased; tokens containing no letter a-z are skipped).
#
# Two files are written to the current directory:
#
#   files - one line per input file:
#             <file index> <path> <size in bytes> <number of indexed words>
#   index - one line per distinct word, words sorted:
#             <word> <file index>:<pos>,<pos>,... <file index>:<pos>,...
#           where <pos> is the 0-based ordinal of the word among the indexed
#           words of that file.
#
# Progress (the path of each file being read) is printed to STDERR.

use strict;
use warnings;
use File::Find ();

my $root = @ARGV ? $ARGV[0] : '/home/sam/cltl';
-d $root or die "not a directory: $root\n";

# Collect the input files.  File::Find replaces a shell call to find(1), so
# the root path is never interpreted by a shell.  The list is sorted so that
# file indices are stable from one run to the next.
my @files;
File::Find::find(
    {
        no_chdir => 1,    # $_ holds the full path, so -f works without chdir
        wanted   => sub {
            push @files, $File::Find::name if /\.html\z/ && -f $_;
        },
    },
    $root,
);
@files = sort @files;

my %index;      # word => { file index => [ word positions ] }
my @n_words;    # file index => number of indexed words in that file
my @n_chars;    # file index => size of the file in bytes

for my $file_index (0 .. $#files) {
    my $file = $files[$file_index];
    print STDERR "$file\n";

    # Slurp the whole file; $/ is undefined only inside this do-block.
    my $text = do {
        open my $in, '<', $file or die "cannot load file: $file: $!";
        local $/;
        my $content = <$in>;
        close $in;
        defined $content ? $content : '';
    };

    # Positions restart at 0 for every file and count indexed words only.
    my $word_pos = 0;
    while ($text =~ /([\w-]+)/g) {
        my $word = lc $1;
        next unless $word =~ /[a-z]/;
        push @{ $index{$word}{$file_index} }, $word_pos++;
    }

    push @n_words, $word_pos;
    # -s yields an empty string for a zero-byte file; record 0 instead so the
    # "files" table never has an empty column.
    push @n_chars, (-s $file) || 0;
}

# dump filenames
open my $files_out, '>', 'files' or die "cannot write file: files: $!";
for my $file_index (0 .. $#files) {
    print {$files_out}
        "$file_index $files[$file_index] $n_chars[$file_index] $n_words[$file_index]\n";
}
close $files_out or die "cannot write file: files: $!";

# dump index
open my $index_out, '>', 'index' or die "cannot write file: index: $!";
for my $word (sort keys %index) {
    my $positions_in = $index{$word};

    # Postings are emitted in ascending file-index order for reproducibility.
    my @postings;
    for my $file_index (sort { $a <=> $b } keys %$positions_in) {
        push @postings,
            $file_index . ':' . join(',', @{ $positions_in->{$file_index} });
    }

    print {$index_out} "$word ", join(' ', @postings), "\n";
}
close $index_out or die "cannot write file: index: $!";