#!/usr/bin/perl
#
# Score dictionary lines by how many of their headword characters are "known".
#
# Usage: script KNOWN_FILE [DICT_FILE ...]
#
#   KNOWN_FILE  UTF-8 text; every character on every line (line feed
#               excluded) is recorded as a known character.
#   DICT_FILE   lines of the form
#                   HEADWORD [READING] [READING2] /gloss/gloss/
#               read from the named files, or from STDIN when none are given.
#               (Looks like an EDICT-style dictionary -- TODO confirm.)
#
# For each well-formed line, prints
#     COUNT KNOWN UNKNOWN FRACTION HEADWORD_LENGTH GLOSS_LENGTH ORIGINAL_LINE
# to STDOUT. Lines that do not match the expected form are reported on STDERR
# and skipped.
use strict;
use warnings;
use utf8;
use IO::File;

my $known_path = shift;
defined $known_path
    or die "usage: $0 KNOWN_FILE [DICT_FILE ...]\n";

my $known_fh = IO::File->new($known_path, "r")
    or die "cannot open $known_path: $!\n";

# Set of known characters, keyed by the decoded (character, not byte) value.
my %is_known;
while (defined(my $known_line = $known_fh->getline)) {
    chomp $known_line;
    utf8::decode($known_line);    # bytes -> characters, in place
    $is_known{$_} = 1 for $known_line =~ /./g;
}
$known_fh->close;

# Captures: 1 = headword, 2 and 3 = optional bracketed fields, 4 = /glosses/.
my $line_re = qr{^(.+?) (?:\[(.+?)\] )?(?:\[(.+?)\] )?(/.*/)[\r\n]*$};
#my $kanji_re = qr{[\x80-\xFF].};

while (defined(my $raw_line = <>)) {
    # Keep the raw bytes for output so STDOUT/STDERR never see wide characters.
    my $line = $raw_line;
    utf8::decode($line);

    # The two bracketed fields are matched but not used in the statistics.
    my ($kanji, undef, undef, $english) = $line =~ $line_re;

    # Test definedness, not truth: a headword of "0" is a valid match.
    if (!defined $kanji) {
        (my $shown = $raw_line) =~ s/[\r\n]+\z//;
        warn "bad line: $shown\n";
        next;
    }

    # $line_re guarantees at least one headword character, so $count >= 1
    # and the division below cannot be by zero.
    my $count   = 0;
    my $unknown = 0;
    for my $char ($kanji =~ /./g) {
        ++$count;
        ++$unknown unless $is_known{$char};
    }

    my $known_count  = $count - $unknown;
    my $frac         = $known_count / $count;
    my $kanji_len    = length($kanji);
    my $english_len  = length($english);

    # $raw_line still carries its own line ending.
    print "$count $known_count $unknown $frac $kanji_len $english_len $raw_line";
}