#!/usr/bin/env perl
use strict;
use warnings;

use PICA::Data;
use PICA::Schema;
use IO::File;
use Getopt::Long;
use Pod::Usage;

# Command line options. All of them start out undefined except $to, whose
# default of 0 means "do not write records"; --to is declared with an
# optional value (":s"), so giving it without a type yields the empty string.
my ( $from, $schema, $report_unknown, $count, $path, $help );
my $to = 0;

GetOptions(
    'from|f=s'   => \$from,
    'to|t:s'     => \$to,
    'schema|s=s' => \$schema,
    'unknown|u!' => \$report_unknown,
    'count|c'    => \$count,
    'path|p=s'   => \$path,
    'help|?'     => \$help,
) or pod2usage(2);

# Show the usage sections when help was requested, or when there is neither
# a file argument nor anything but a terminal on standard input.
if ( $help or ( !@ARGV and -t STDIN ) ) {
    pod2usage( -verbose => 99, -sections => "SYNOPSIS|OPTIONS|DESCRIPTION" );
}

# Accepted serialization type names (lower case) mapped to the suffix that
# is appended to "PICA::Parser::" / "PICA::Writer::" further down. The three
# spellings bin, dat and binary all select the Binary implementation.
my %types = (
    ( map { ( $_ => 'Binary' ) } qw(bin dat binary) ),
    plain => 'Plain',
    plus  => 'Plus',
    xml   => 'XML',
    ppxml => 'PPXML',
);

# Input handle. It stays undefined when no file argument is given and is
# passed to the parser as is (presumably the parser then reads standard
# input, as in the documented "< file" usage -- confirm against PICA::Parser).
my $fh;
if (@ARGV) {

    # Fail loudly instead of handing an undefined handle to the parser when
    # the file is missing or unreadable.
    $fh = IO::File->new( $ARGV[0] )
      or die "failed to open $ARGV[0]: $!\n";

    # Guess the serialization type from the file extension unless --from was
    # given. The extension is matched case-insensitively and stored in lower
    # case, which is the form used as key of %types.
    $from = lc $1
      if !$from && $ARGV[0] =~ /\.([a-z]+)$/i && $types{ lc $1 };
}

# Fall back to the plain serialization when no input type was given or
# guessed from the file name.
$from = 'plain' unless $from;
pod2usage("unknown serialization type: $from") unless $types{ lc $from };

# Type names are validated case-insensitively, so normalize them before they
# are used as keys of %types; otherwise e.g. "--from XML" would pass the check
# above but yield an undefined class name suffix below.
$from = lc $from;

# "--to" without a value arrives as empty string and means "same type as the
# input"; the default 0 (no --to at all) disables writing.
$to = $from if $to eq '';
pod2usage("unknown serialization type: $to") unless !$to or $types{ lc $to };
$to = lc $to if $to;

# Parser and (optional) writer classes are selected by name.
my $parser_class = 'PICA::Parser::' . $types{$from};
my $parser       = $parser_class->new( $fh, bless => 1 );

my $writer;
if ($to) {
    my $writer_class = 'PICA::Writer::' . $types{$to};
    $writer = $writer_class->new();
}

# Options handed to every schema check: unknown fields and subfields are
# ignored unless --unknown was given.
my %schema_options;
$schema_options{ignore_unknown} = !$report_unknown;

# With --count the boolean flag is replaced by a hash of counters, all
# starting at zero.
if ($count) {
    $count = { map { ( $_ => 0 ) } qw(records holdings items fields) };
}

# Number of records that failed validation. Left undefined when no schema is
# given, so that the summary printed for --count (which checks for
# definedness) does not report "0 invalid" for records that were never
# validated.
my $invalid;
if ($schema) {

    # Note: "use" takes effect at compile time regardless of this condition.
    use JSON::PP qw(decode_json);

    # $schema holds the file name here and is replaced by the schema object.
    my $schema_fh = IO::File->new($schema)
      or die "failed to open schema file $schema: $!\n";

    # Slurp the whole file at once.
    my $json = do { local $/; <$schema_fh> };
    $json = '' unless defined $json;
    $schema_fh->close;

    $schema  = PICA::Schema->new( decode_json($json) );
    $invalid = 0;
}

# Replace the --path expression string by a PICA::Path object; abort with a
# message quoting the original expression when it cannot be constructed.
if ($path) {
    my $expression = $path;
    $path = eval { PICA::Path->new($expression) };
    die "invalid pica path: $expression\n" unless $path;
}

# Main loop: read records one by one, optionally reduce them to the selected
# fields, write them, validate them and update the counters.
while ( my $record = $parser->next ) {

    # NOTE(review): everything below uses the result of fields() as a record
    # (method calls and hash access) -- confirm that fields() returns a record
    # object rather than a plain list of fields.
    $record = $record->fields($path) if $path;

    if ($writer) {
        $writer->write($record);
    }

    if ($schema) {
        my @problems = $schema->check( $record, %schema_options );
        if (@problems) {

            # One line per problem, prefixed with the record id if it has one.
            foreach my $problem (@problems) {
                if ( defined $record->{_id} ) {
                    print $record->{_id} . ": $problem" . "\n";
                }
                else {
                    print "$problem\n";
                }
            }
            $invalid++;
        }
    }

    next unless $count;

    $count->{records}  += 1;
    $count->{holdings} += scalar @{ $record->holdings };
    $count->{items}    += scalar @{ $record->items };
    $count->{fields}   += scalar @{ $record->{record} };
}

# Let the writer finish its output if it supports that.
if ( $writer && $writer->can('end') ) {
    $writer->end();
}

# Print the counters collected with --count, one "NUMBER NAME" line each,
# in a fixed order and skipping counters that are not defined.
if ($count) {
    $count->{invalid} = $invalid if defined $invalid;

    foreach my $name (qw(records invalid holdings items fields)) {
        next unless defined $count->{$name};
        print "$count->{$name} $name\n";
    }
}

__END__

=head1 NAME

pica-validate - parse and validate PICA+ data

=head1 SYNOPSIS

pica-validate [--from TYPE] [--schema FILE] [--to [TYPE]] {OPTIONS} [FILE]

=head1 DESCRIPTION

Parse, validate and/or serialize PICA+ data from the command line, e.g.:

  pica-validate pica.xml -s schema.json   # validate against Avram schema
  pica-validate pica.dat -t xml           # convert binary to XML
  pica-validate -c -f plain < pica.plain  # parse and count records
  pica-validate -p 003@ pica.xml -t       # extract field 003@

=head1 OPTIONS

=head2 --from, -f

PICA serialization type of the input (plain, plus, binary, xml, ppxml) with
plain as default. Guessed from the extension of the input filename unless
specified. See format documentation at L<http://format.gbv.de/pica>

=head2 --to, -t

PICA serialization type to enable writing parsed PICA data. If the option is
given without a type, the serialization type of the input is used.

=head2 --count, -c

Count number of records, holdings, items, and fields.

=head2 --schema, -s

L<Avram Schema|http://format.gbv.de/schema/avram/specification> to validate
against.

=head2 --unknown, -u

Report unknown fields and subfields when validating (disabled by default).

=head2 --path, -p

Limit the record to fields specified by a simple PICA Path expression.

=head1 SEE ALSO

See L<Catmandu::PICA> for more elaborate command line processing of PICA+ data.

=cut
