#!/usr/bin/perl -w

# this script takes an apache logfile on stdin and parses it to figure
# out what's taking up so much time: by ip, domain or url
#
# this should be named time_per_domain and symlinked to time_per_ip
# and time_per_url to accomplish the three above functions.
#
# this assumes the following logfile format:
# LogFormat "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\" %T %{Host}i" alternc
# %D (microseconds) could be used instead of %T (seconds) to have more
# precise results
#
# output is a tab separated value format, with the seconds (or
# microseconds, if %D is used) in the first column, and whatever stat
# is requested on the second one.

use strict;
use warnings;

use Getopt::Std;

# Pull the aggregation key and the time taken out of one log line.
#
#   $mode - basename the script was invoked as: 'time_per_ip',
#           'time_per_url', or anything else for per-domain stats
#   $line - one raw log line, in the LogFormat this script expects
#           (time taken and Host header as the last two fields)
#
# Returns ($key, $time), or an empty list if the line does not match.
#
# The trailing Host field accepts '-' and ':' in addition to word
# characters and dots, so that hyphenated domains (my-site.example.org)
# and hosts carrying a port (example.org:8080) are accounted for
# instead of being silently skipped.
sub parse_line {
    my ($mode, $line) = @_;

    if ($mode eq 'time_per_ip') {
        # fairly simple: take the ip at the beginning, then skip as
        # little as possible (.*?) to reach the time and the domain at
        # the end
        if (my ($ip, $time) = $line =~ /^([\d.]*) .*? (\d+) [\w.:-]*$/) {
            return ($ip, $time);
        }
        return;
    }

    if ($mode eq 'time_per_url') {
        # funky regex: the url is located through the double-quotes
        # around the request line, assuming spaces in the url are
        # url-encoded
        if (my ($path, $time, $host) =
            $line =~ /^[^"]*"\w+ ([^ ]*) [^"]*".*? (\d+) ([\w.:-]*)$/)
        {
            # canonicalisation: collapse duplicated slashes
            (my $url = $host . $path) =~ s#//+#/#g;
            return ($url, $time);
        }
        return;
    }

    # the simpler regex: just look at the end for the time and the domain
    if (my ($time, $host) = $line =~ /(\d+) ([\w.:-]*)$/) {
        return ($host, $time);
    }
    return;
}

# Build the report for the accumulated statistics.
#
#   $stats     - hashref: key => total time
#   $hits      - hashref: key => number of hits (needs an entry for
#                every key present in $stats)
#   $average   - true to report the time per hit instead of the total
#   $show_hits - true to add the hit counts to every line
#
# Returns the output lines without trailing newline: one line per key,
# sorted by ascending value (ties broken by key so the output is
# reproducible), followed by a summary line.
sub report_lines {
    my ($stats, $hits, $average, $show_hits) = @_;

    my $total      = 0;
    my $total_hits = 0;
    my %value;
    for my $key (keys %$stats) {
        # the grand total is always based on the raw sums, even when
        # the per-key values are averaged
        $total      += $stats->{$key};
        $total_hits += $hits->{$key};
        $value{$key} = $average
            ? $stats->{$key} / $hits->{$key}
            : $stats->{$key};
    }

    my @lines;
    for my $key (sort { $value{$a} <=> $value{$b} or $a cmp $b } keys %value) {
        my $line = $average
            ? sprintf("%0.2f\t%s", $value{$key}, $key)
            : "$value{$key}\t$key";
        $line .= "\t$hits->{$key}" if $show_hits;
        push @lines, $line;
    }

    my $count = @lines;
    if ($average) {
        # without any matching line there are no hits to divide by
        $total = sprintf "%0.2f", $total_hits ? $total / $total_hits : 0;
    }
    my $summary = "$total\t$count unique matches";
    $summary .= " and $total_hits hits" if $show_hits;

    return (@lines, $summary);
}

# Command-line flags:
#   -t  print the number of hits next to each key and in the summary
#   -a  print the average time per hit instead of the total
my %opts;
getopts('at', \%opts);

# -a implies -t
my $average   = $opts{a};
my $show_hits = $opts{t} || $average;

# get the basename of this script to figure out the mode
(my $mode = $0) =~ s#.*/##;

my %stats;    # key => accumulated time
my %hits;     # key => number of matching lines

# iterate over all files or stdin
while (my $line = <>) {
    my ($key, $time) = parse_line($mode, $line);
    next unless defined $key;
    $stats{$key} += $time;
    $hits{$key}++;
}

print "$_\n" for report_lines(\%stats, \%hits, $average, $show_hits);