#!perl
use strict;
use warnings;

# Scan proxy access logs and report request counts and transferred bytes
# per domain.
#
# Usage: pass the log files as arguments. With no arguments the script falls
# back to the files matching $DEFAULT_LOG_GLOB in the current directory.
#
# Each host is counted once for itself and once for every parent domain
# ("www.example.com." -> "example.com." -> "com."), so the reports show
# totals at every level of the name hierarchy.
#
# Output: traffic_by_domain.txt and reqs_by_domain.txt in the current
# directory; progress and diagnostics go to STDERR.

# NOTE(review): the default file pattern was lost from the original source
# (it read "@ARGV = ;", which does not compile). 'access.log*' is an
# assumption based on the shape of the log lines -- confirm before relying
# on it.
my $DEFAULT_LOG_GLOB = 'access.log*';

my $error = 0;    # lines (or URLs/domains) that could not be parsed
my $count = 0;    # lines that were parsed successfully

my %reqs_by_domain;
my %traffic_by_domain;

# Per-host totals are accumulated but not currently written to any report.
my %reqs_by_host;
my %traffic_by_host;

if (@ARGV == 0) {
    @ARGV = glob $DEFAULT_LOG_GLOB;
    if (@ARGV == 0) {
        die "we need some logs to play with!\n";
    }
}
print STDERR "scanning logs: @ARGV\n";

# <> shifts each file name off @ARGV as it opens it, so a change in the
# size of @ARGV means a new file has just been started.
my $n_files_to_go = @ARGV;

LINE:
while (my $line = <>) {
    if (@ARGV != $n_files_to_go) {
        print STDERR $n_files_to_go . ".";
        $n_files_to_go = @ARGV;
    }

    chomp $line;
    $line =~ s/\r\z//;    # tolerate CRLF line endings

    # MAYBE the URLs can contain a space sometimes...?

    # ---- parse the line ----
    # Ten space-separated fields; only the size and the URL are used below.
    my ($time, $foo, $ip, $response, $size, $method, $url, $user, $baz, $mimetype)
        = $line =~ m{^(\d+\.\d+) +(\d+) ([\d\.]+) ([A-Z_]+/\d+) (\d+) ([A-Z]+) (\S+) (\S+) (\S+) (\S+)$};
    if (!defined $time) {
        print STDERR "can't parse line: $line\n";
        $error++;
        next LINE;
    }
    $count++;

    # ---- parse the URL ----
    # Deliberately permissive: optional scheme, host (anything up to the
    # first '/', ':' or '?'), optional port, then the rest.
    my ($proto, $host, $port, $path)
        = $url =~ m{^([a-z]+://|)([^/:?]*)(:[0-9]+|)(.*)$}i;
    if (!defined $proto) {
        print STDERR "can't parse URL with permissive regexp!!! : $url\n in line: $line\n";
        $error++;
        next LINE;
    }

    $reqs_by_host{$host}++;
    $traffic_by_host{$host} += $size;

    # Normalise the host to end in exactly one dot so that every label,
    # including the last one, is terminated by a '.'.
    my $domain = $host;
    if ($domain ne "") {
        $domain =~ s/\.$//;
        $domain .= ".";
    }

    # TODO: could add path components to the domain also..?

    # Count the full name, then strip the leftmost label and repeat until
    # nothing is left.
    DOMAIN:
    while ($domain ne "") {
        $reqs_by_domain{$domain}++;
        $traffic_by_domain{$domain} += $size;

        if ($domain !~ s/^[^\.]*\.//) {
            # Defensive: cannot happen while the name ends in a dot.
            print STDERR "cannot parse domain!!! \n: $domain\n in line: $line\n";
            $error++;
            last DOMAIN;
        }
    }

    # Per-user totals were sketched in the original but never enabled; add a
    # %traffic_by_user hash keyed on $user here if they are wanted.
}

print STDERR "\n\ndone scanning!\n\n";

write_report('traffic_by_domain.txt', \%traffic_by_domain);
write_report('reqs_by_domain.txt',    \%reqs_by_domain);

print STDERR "count: $count\n";
print STDERR "errors: $error\n";
print STDERR "output is in traffic_by_domain.txt and reqs_by_domain.txt\n";

# Write one report file: open $file for writing, dump $hash into it and
# close it, dying with the system error if the file cannot be opened or
# the buffered output cannot be flushed.
sub write_report {
    my ($file, $hash) = @_;

    open my $fh, '>', $file
        or die "cannot open $file for writing: $!\n";
    dump_hash_highest_value_first($fh, $hash);
    close $fh
        or die "cannot close $file: $!\n";

    return;
}

# Print the entries of a hash to a filehandle, largest value first.
#
#   $fh   - filehandle open for writing
#   $hash - reference to a hash of key => numeric value
#
# Entries with equal values are ordered by key so the output is stable.
# Each line is the value right-aligned in 15 columns, a tab, then the key.
sub dump_hash_highest_value_first {
    my ($fh, $hash) = @_;

    my @order = sort { $hash->{$b} <=> $hash->{$a} || $a cmp $b } keys %$hash;

    for my $k (@order) {
        # %15s pads exactly like the manual space padding did, but does not
        # warn about a negative repeat count when the value is wider than
        # 15 characters.
        printf {$fh} "%15s\t%s\n", $hash->{$k}, $k;
    }

    return;
}