#!perl
use strict; use warnings;

# ---- counters ----
# $lines: every input line read; $error: lines/URLs/domains that failed to
# parse; $count: lines whose fields were parsed successfully.
my ($lines, $error, $count) = (0, 0, 0);

# ---- accumulators (hash refs: key => running total) ----
# Per-user traffic; the only statement that would fill it is commented out
# in the scanning loop, so it currently stays empty.
my $traffic_by_user = {};

# Request counts and byte totals, keyed by domain suffix and by full host.
my ($reqs_by_domain, $reqs_by_host)       = ({}, {});
my ($traffic_by_domain, $traffic_by_host) = ({}, {});

# Not referenced anywhere else in the visible code.
my $dot_every = 100_000;

# With no file arguments, fall back to every squid-*.log in the current
# directory; give up if that finds nothing either.
@ARGV = glob('squid-*.log') unless @ARGV;
die "we need some logs to play with!\n" unless @ARGV;

print STDERR "scanning logs: @ARGV\n";

# Number of files still queued in @ARGV; the scanning loop compares against
# this to notice when <> has moved on to the next file.
my $n_files_to_go = scalar @ARGV;

# Main scan: read every line of the files named in @ARGV (through <>), parse
# it as a 10-field Squid access.log record, and add its request and byte
# count to the per-host totals and to every domain suffix of the host.
# Lines that do not parse are reported on STDERR and counted in $error.
while (my $line = <>) {
	$lines++;

	# <> shifts @ARGV each time it opens the next file, so a change in its
	# size means a new file was just started; print a countdown marker.
	if (@ARGV != $n_files_to_go) {
		print STDERR $n_files_to_go.".";
		$n_files_to_go = @ARGV;
	}

	chomp $line;
	$line =~ s/\r\z//;	# tolerate CRLF line endings
	# MAYBE the Urls can contain a space sometimes...?

	# ---- parse the line ----

	# Only the fields that are actually used are captured.  The client
	# address accepts IPv6 (hex digits and colons) as well as dotted IPv4,
	# so requests from IPv6 clients are no longer rejected as unparseable.
	my ($time, $size, $url) = $line =~ m{
		^   (\d+\.\d+)          # time
		[ ]+ \d+                # second field (unused)
		[ ] [\dA-Fa-f:.]+       # client address, IPv4 or IPv6
		[ ] [A-Z_]+/\d+         # response
		[ ] (\d+)               # size, added to the traffic totals
		[ ] [A-Z]+              # method
		[ ] (\S+)               # url
		[ ] \S+                 # user
		[ ] \S+                 # ninth field (unused)
		[ ] \S+ $               # mimetype
	}x;

	if (!defined $time) {
		print STDERR "can't parse line: $line\n";
		$error++;
		next;
	}

	$count++;

	# ---- parse the URL ----

	# Optional scheme, then the host, then optional :port and the rest.
	# A bracketed IPv6 literal ("[2001:db8::1]") is taken as one host;
	# without that alternative the host would be cut at its first colon.
	my ($proto, $host) =
		$url =~ m{^([a-z]+://|)(\[[^\]]*\]|[^/:?]*)(:[0-9]+|)(.*)$}i;

	if (!defined $proto) {
		print STDERR "can't parse URL with permissive regexp!!! : $url\n  in line: $line\n";
		$error++;
		next;
	}

	$reqs_by_host->{$host}++;
	$traffic_by_host->{$host} += $size;

	# Split into domains: normalise the host to end in exactly one added
	# dot, then count it and each suffix obtained by dropping the leftmost
	# label ("www.example.com." -> "example.com." -> "com.").
	# NOTE: numeric hosts are split on dots the same way as names.
	my $domain = $host;
	if ($domain ne "") {
		$domain =~ s/\.$//;
		$domain .= ".";
	}

	# TODO could add path components to the domain also..?

	while ($domain ne "") {
		$reqs_by_domain->{$domain}++;
		$traffic_by_domain->{$domain} += $size;

		# Defensive: a non-empty $domain always ends in ".", so this
		# substitution is expected to succeed every time.
		if ($domain !~ s/^[^\.]*\.//) {
			print STDERR "cannot parse domain!!! : $domain\n  in line: $line\n";
			$error++;
			last;
		}
	}

	# Per-user traffic accounting ($traffic_by_user) is currently disabled.
}

print STDERR "\n\ndone scanning!\n\n";

# Write one report per accumulator, highest value first.  Each file gets its
# own lexical handle, opened with three-argument open; both open and close
# are checked so an unwritable file or a failed (buffered) write aborts the
# run with a message instead of silently producing a missing or short report.
for my $report (
	[ 'traffic_by_domain.txt', $traffic_by_domain ],
	[ 'reqs_by_domain.txt',    $reqs_by_domain ],
) {
	my ($path, $hash) = @$report;

	open my $out, '>', $path
		or die "cannot open $path for writing: $!\n";
	dump_hash_highest_value_first($out, $hash);
	close $out
		or die "cannot finish writing $path: $!\n";
}

# Print every entry of a hash as "<value>\t<key>\n", one per line, with the
# value right-aligned in a 15-character column (wider values are printed in
# full, not truncated).
#
#   $fh   - output filehandle (glob ref or lexical handle)
#   $hash - hash ref of key => numeric value
#
# Entries are ordered by descending numeric value; ties are broken by
# ascending string comparison of the keys.
sub dump_hash_highest_value_first {
	my ($fh, $hash) = @_;

	my @ranked = sort {
		($hash->{$b} <=> $hash->{$a}) or ($a cmp $b)
	} keys %$hash;

	foreach my $key (@ranked) {
		printf {$fh} "%15s\t%s\n", $hash->{$key}, $key;
	}
}

#sub dump_hash {
#	my ($fh, $hash, $order) = @_;
#	for my $k (@$order) {
#		print $fh "$k\t$hash->{$k}\n";
#	}
#}

# Final summary on STDERR: number of successfully parsed lines, number of
# parse errors, and where the reports were written.
print STDERR
	"count: $count\n",
	"errors: $error\n",
	"\n",
	"output is in traffic_by_domain.txt and reqs_by_domain.txt\n";