#!/usr/bin/env perl

use strict;
use warnings;
use Getopt::Long qw(:config autohelp);
use IRC::Utils qw(lc_irc strip_color strip_formatting decode_irc);
use Pod::Usage;

# Parse command-line options. --help is provided by Getopt::Long's
# "autohelp" configuration; on a parse error the usage text is printed.
GetOptions(
    'f|format=s'       => \(my $format = ''),
    # Casemapping handed to lc_irc() when comparing nicknames. The default
    # matches what lc_irc() uses when no casemapping is given.
    'c|case=s'         => \(my $case = 'rfc1459'),
    'b|bot=s'          => \my $bot,
    'r|ignore-regex=s' => \my @ignore_regexes,
    'n|ignore-nick=s'  => \my @ignore_nicks,
    'v|version'        => sub {
        # Load the plugin lazily; it is only needed to report its version.
        require POE::Component::IRC::Plugin::MegaHAL;
        my $version = defined $POE::Component::IRC::Plugin::MegaHAL::VERSION
            ? $POE::Component::IRC::Plugin::MegaHAL::VERSION
            : 'dev-git';
        print "irchal-seed version $version\n";
        exit;
    },
) or pod2usage();

# Pick the pair of patterns that recognise a normal message and an action
# line in the chosen log format. Both capture (nick, message text).
my ($privmsg, $action);
if ($format eq 'pocoirc') {
    $privmsg = qr/^(?:\S+ )?\S+ <(\S+)> (.*)/;
    $action = qr/^(?:\S+ )?\S+ \* (\S+) (.*)/;
}
elsif ($format eq 'irssi') {
    $privmsg = qr/^\S+\s+<.(\S+)> (.*)/;
    $action = qr/^\S+\s+\* (\S+) (.*)/;
}
elsif ($format eq '') {
    die "No log format specified with --format\n";
}
else {
    die "Unknown log format '$format' (expected pocoirc or irssi)\n";
}

# Compile the ignore patterns once, up front. This reports a malformed
# pattern before any output is produced, and it keeps an empty pattern from
# being treated by Perl as "reuse the last successfully matched pattern".
my @ignore_patterns = map { qr/$_/ } @ignore_regexes;

# Case-fold the ignored nicknames once instead of once per input line.
my %is_ignored_nick = map { ( lc_irc($_, $case) => 1 ) } @ignore_nicks;

binmode STDIN, ':bytes';
binmode STDOUT, ':utf8';
LINE: while (my $line = <STDIN>) {
    chomp $line;

    # decode cp1252/utf8 bytes to a text string
    $line = decode_irc($line);

    $line = strip_color($line);
    $line = strip_formatting($line);

    my ($nick, $msg);
    if (($nick, $msg) = $line =~ /$privmsg/) {
        # PRIVMSG: drop a leading "botname: " style address, if any
        $msg =~ s/^\s*\Q$bot\E[:,;.!?~]?\s//i if defined $bot;
    }
    elsif (($nick, $msg) = $line =~ /$action/) {
        # ACTION: emitted with a leading \x01 and a space.
        # NOTE(review): presumably the marker the consumer of the training
        # file expects for actions -- confirm.
        $msg = "\x01 $msg";
    }
    else {
        # neither a message nor an action (joins, parts, etc.)
        next LINE;
    }

    # skip it if we want to ignore this nick
    next LINE if $is_ignored_nick{ lc_irc($nick, $case) };

    # skip if it matches a regex
    for my $pattern (@ignore_patterns) {
        next LINE if $msg =~ $pattern;
    }

    # MegaHAL ignores lines in megahal.trn which start with '#'
    $msg = " $msg" if $msg =~ /^#/;

    print "$msg\n";
}

=head1 NAME

irchal-seed - Creates a L<MegaHAL|AI::MegaHAL> training file from logs
generated by L<POE::Component::IRC::Plugin::Logger|POE::Component::IRC::Plugin::Logger>
or irssi.

=head1 SYNOPSIS

B<irchal-seed> <options>

 Options:
   -f FORMAT, --format=FORMAT       Log format (pocoirc or irssi)
   -c CASE,   --case=CASE           The IRC server's case mapping
   -b NICK,   --bot=NICK            The nickname of the IRC bot
   -r RX,     --ignore-regex=RX     A regex for lines to ignore
   -n NICK,   --ignore-nick=NICK    A nickname to ignore
   -h,        --help                Print this help message
   -v,        --version             Print version number

Input is accepted on STDIN, and output is delivered to STDOUT.

You should use the C<--bot> option if your bot was present when the channel
was logged as it will strip the I<"botname: "> part from messages where
users addressed the bot.

If your bot (or any other bot in the channel) can take commands that all
start in a similar way, use the C<--ignore-regex> option to ignore those
lines, e.g. C<--ignore-regex='^!\w+'> for bots that react to commands
prefixed with '!'.

Example:

  # convert irssi log, ignoring "MyBot: " and lines with URLs
  cat ~/logs/\#chan.log | irchal-seed -f irssi -b MyBot -r '\w+://\w' > megahal.trn

The C<--case> option controls the second parameter passed to C<lc_irc>
from L<IRC::Utils|IRC::Utils>. The default is B<'rfc1459'>. You usually
don't have to change this. Consult that package's documentation for more
information.

=head1 AUTHOR

Hinrik E<Ouml>rn SigurE<eth>sson, hinrik.sig@gmail.com

=head1 LICENSE AND COPYRIGHT

Copyright 2009 Hinrik E<Ouml>rn SigurE<eth>sson

This program is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

=cut
