#!/bin/sh
# htmlsplit: reformat HTML so that every tag sits on a line of its own.
#
# Input:  HTML on stdin (the whole document is slurped into memory).
# Output: on stdout, one line per tag and one line per run of text between
#         tags; all whitespace runs are collapsed to a single space and
#         empty lines are dropped.
htmlsplit() {
  perl -e '
    # Slurp all of stdin into $_.
    $_ = join "", <STDIN>;
    # Turn every run of newlines, CRs, spaces and tabs into one space,
    # so the only newlines left afterwards are the ones added below.
    tr/\n\r \t/ /s;
    # Break the line before each tag opens and after each tag closes.
    s/</\n</g;
    s/>/>\n/g;
    # Drop lines that are empty or hold a single space (between two tags).
    s/\n ?\n/\n/g;
    # Trim the empty first line and the trailing space left by the steps above.
    s/^ ?\n//s;
    s/ $//s;
    print;
  '
}
# html2txt: extract the text content of an HTML document.
#
# Input:  HTML on stdin.
# Output: on stdout, the text found between tags, one run per line, with
#         leading whitespace removed, repeated newlines and spaces squeezed,
#         and a small set of character entities decoded
#         (&quot; &lt; &gt; &#NNN; &nbsp; &amp;).
# Relies on htmlsplit to put every tag on its own line, so that dropping
# lines starting with "<" removes the markup.
html2txt() {
  # NOTE(review): the delete set of the first tr is empty, so that stage
  # passes its input through unchanged; presumably a control character was
  # meant to be deleted here - confirm against the upstream script.
  tr -d '' | htmlsplit | grep -v '^<' |
    perl -pe '
      # Strip leading whitespace; a whitespace-only line loses its newline
      # as well and therefore disappears.
      s/^\s+//;
      s/&quot;/"/g;
      s/&lt;/</g;
      s/&gt;/>/g;
      # Decimal numeric character references, e.g. &#65; becomes A.
      s/&#(\d+);/chr($1)/ge;
    ' |
    tr -s '\n' | tr -s ' ' |
    # &nbsp; is decoded after the squeeze so runs of non-breaking spaces
    # survive; &amp; is decoded last so "&amp;lt;" yields the literal text
    # "&lt;" rather than being decoded a second time.
    perl -pe 's/&nbsp;/ /g; s/&amp;/&/g;'
}
# pair_lines: emit a sliding window over the lines of stdin.
#
# Arguments: $1 - window size (optional; defaults to 2 when missing or 0).
# Input:     text lines on stdin.
# Output:    for every run of N consecutive input lines, one output line
#            holding those N lines joined by tabs. Windows overlap: with
#            N=2 and input a,b,c the output is "a<TAB>b" then "b<TAB>c".
#            Nothing is printed when the input has fewer than N lines.
pair_lines() {
  # Lines are read from STDIN explicitly (not via the <> operator) so that
  # the window-size argument in @ARGV is never treated as a file name.
  perl -e '
    my $n = $ARGV[0] || 2;
    my @window;
    while (defined(my $line = <STDIN>)) {
      chomp $line;
      push @window, $line;
      if (@window == $n) {
        print join("\t", @window), "\n";
        # Slide forward by one line.
        shift @window;
      }
    }
  ' -- "$@"
}
N="$1"
URL="$2"
if [ -n "$URL" ]; then
wget "$URL" -O-
else
cat
fi |
htmlsplit |
perl -ne '/^