#!/usr/bin/perl -wT
use POSIX;
use Getopt::Long qw(:config bundling require_order auto_version);
use Pod::Usage;
use FAST;
use FAST::Bio::SeqIO;
use FAST::Bio::Tools::SeqPattern ();
use strict;

use vars qw($VERSION $DESC $NAME $COMMAND $DATE);
$VERSION = $FAST::VERSION; 
$DESC    = "Select sequences based on perl regular expression match on IDs, descriptions or sequences.\n";
$NAME    = $0;
$NAME    =~ s/^.*\///;
$COMMAND = join " ",$NAME,@ARGV;
$DATE = POSIX::strftime("%c",localtime());

use constant { true => 1, false => 0 };

## DEFAULT OPTION VALUES
my $def_format  = $FAST::DEF_FORMAT;  #7/1/13 "fasta";
my $def_logname = $FAST::DEF_LOGNAME; #7/1/13 "FAST.log.txt";

## OPTION VARIABLES
my $man                  = undef;  # --man
my $help                 = undef;  # -h
my $negate               = undef;  # -v
my $insensitive          = undef;  # -i
my $moltype              = undef;  # -m, in case bioperl can't tell

my $format               = $def_format;  # --format
my $log                  = undef;        # -l
my $logname              = $def_logname; # -L
my $comment              = undef;        # -C
my $split_on_regex       = undef;        # -S 

## in increasing priority:
my $description    = undef;  # -d
my $sequence       = undef;  # -s
my $field          = undef;  # -f
my $tag            = undef;  # -t

## in increasing priority:
my $iupac_ambiguity_expand_regex_prot        = undef; # -p, implies -s
my $iupac_ambiguity_expand_regex_nuc         = undef; # -e, implies -s
my $revcom_iupac_ambiguity_expand_regex_nuc  = undef; # -r, implies -s
my $fastq                = undef; #-q


GetOptions('help|h'         		 => \$help, 
	   'man'            		 => \$man,
	   'negate|v'                    => \$negate,
	   'insensitive|i'               => \$insensitive,
	   'moltype|m=s'                 => sub{  my (undef,$val) = @_; 
						  die "$NAME: --moltype or -m option argument must be \"dna\", \"rna\" or \"protein\"\n" 
						    unless $val =~ /dna|rna|protein/i; 
						  $moltype = $val;
						},
	   'format=s'                    => \$format,
	   'log|l'                       => \$log,
	   'logname|L=s'                 => \$logname,
	   'comment|C=s'                 => \$comment,

	   'description|d'         => \$description,
	   'sequence|s'            => \$sequence,
	   'field|f=i'             => sub{  my (undef,$val) = @_; 
						  die "$NAME: --field or -f option expects non-zero integer argument\n" 
						    unless $val != 0; 
						  $field = $val;
						},
	   'tag|t=s'             => sub{  my (undef,$val) = @_; 
						  die "$NAME: --tag or -t option expects string argument exclusively from [a-zA-Z0-9_-]\n" 
						    unless $val =~ /^[a-zA-Z0-9_-]+$/;  
						  $tag = $val;
						},
	   'split-on-regex|S=s'          => \$split_on_regex,

           'iupac-expand-prot|p'         => \$iupac_ambiguity_expand_regex_prot,
	   'iupac-expand-nuc|e'          => \$iupac_ambiguity_expand_regex_nuc,
	   'revcom-iupac-expand-nuc|r'   => \$revcom_iupac_ambiguity_expand_regex_nuc,
     'fastq|q'                     => sub{$format = 'fastq';},
	  ) 
  or exit(1);
		  
pod2usage(-verbose => 1) if $help;
pod2usage(-verbose => 2, -input => \*DATA) if $man;

my $fromSTDIN = ((-t STDIN) ? false : true);

pod2usage("$NAME: expects one regex argument and at least one input filename or glob. Try \"perldoc $NAME\"") if (!($fromSTDIN) && (@ARGV < 2));
pod2usage("$NAME: expects one regex argument when input is on STDIN. Try \"perldoc $NAME\"") if ($fromSTDIN && @ARGV != 1);

&FAST::log($logname, $DATE, $COMMAND, $comment, $fromSTDIN) if ($log); 

my $arg = shift @ARGV;
my $re_string;

if ($iupac_ambiguity_expand_regex_nuc  and not $revcom_iupac_ambiguity_expand_regex_nuc) {
  $sequence = true;
  $re_string = new FAST::Bio::Tools::SeqPattern(-SEQ => $arg, -TYPE =>'Dna')->expand;
}
elsif ($revcom_iupac_ambiguity_expand_regex_nuc) {
  $sequence = true;
  $re_string = new FAST::Bio::Tools::SeqPattern(-SEQ => $arg, -TYPE =>'Dna')->revcom(1)->str;
}
elsif ($iupac_ambiguity_expand_regex_prot) {
  $sequence = true;
  $re_string = new FAST::Bio::Tools::SeqPattern(-SEQ => $arg, -TYPE =>'Protein')->expand;
}

else {
  $re_string = $arg;
}

my $re;
if ($insensitive) {
  $re = qr/$re_string/i;
}
else {
  $re = qr/$re_string/;
}


my $split_re;
if ($split_on_regex) {
  $split_re = qr/$split_on_regex/;
}
else {
  $split_re = ' ';
}

my $index;
if (defined $field and $field > 0) {
  $index = $field - 1;
}
else { # $field < 0
  $index = $field;
}


my $OUT = FAST::Bio::SeqIO->newFh('-format' => $format);
my $IN;
unless (@ARGV) {
    if ($moltype) {
	$IN = FAST::Bio::SeqIO->new(-fh => *STDIN{IO}, '-format' => $format, '-alphabet' => $moltype);
    }
    else {
	$IN = FAST::Bio::SeqIO->new(-fh => *STDIN{IO}, '-format' => $format);
    }
}

while ($IN or @ARGV) {
  if (@ARGV) {
    my $file = shift (@ARGV);
    unless (-e $file) {
      warn "$NAME: Could not find file $file. Skipping.\n";
      next;
    }
    elsif ($moltype) {
      $IN = FAST::Bio::SeqIO->new(-file => $file, '-format' => $format, '-alphabet' => $moltype);
    }
    else {
      $IN = FAST::Bio::SeqIO->new(-file => $file, '-format' => $format);
    }
  }
  if ($IN) { 
    while (my $seq = $IN->next_seq()) {
      my $data = $seq->id();
      $data = $seq->seq()  if ($sequence);
      $data = $seq->desc() if ($description);
      if ($field) {
	 my $desc = $seq->desc();
	 my @fields;
	 if ($split_on_regex) {
	   @fields = split $split_re,$desc;
	}
	else {
	  @fields = split ' ',$desc;
	}
	$data = $fields[$index];
      }
      if ($tag) {
	my $desc = $seq->desc();
	my $re = qr/$tag[:=](\S+)/p;
	if ($desc =~ $re) {
	  $data = $1;
	}
	else {
	  $data = "";
	}
      }
      print $OUT $seq if (((not $negate) and $data =~ $re) or ($negate and $data !~ $re));
    }
    undef $IN;
  }
}



__END__

=head1 NAME

B<fasgrep> - print sequence records matching a pattern

=head1 SYNOPSIS

B<fasgrep> [OPTION]... [PERL-REGEX] [MULTIFASTA-FILE]...

=head1 DESCRIPTION

B<fasgrep> takes sequence or alignment data as input, and outputs
sequence records whose data match a perl regular expression (regex)
argument. By default, the regex is tested for matching against
sequence identifiers. B<fasgrep> takes one argument, a perl regular
expression. Regex arguments may require quoting to protect special
characters from interpretation by the shell.

Options specific to B<fasgrep>:
  -B<s>, B<--sequence>                    match on sequence 
  -B<d>, B<--description>                 match on description 
  -B<f>, B<--field>=<int>                 match on field <int> in description
  -B<t>, B<--tag>=<string>                match on tagged value in description
  -B<S>, B<--split-on-regex>=<regex>      use regex to split description for fields
  -B<v>, B<--negate>                      output sequences that do not match
  -B<i>, B<--insensitive>                 match case-insensitively
  -B<e>, B<--iupac-expand-nuc>            expand IUPAC ambiguities for DNA/RNA in query
  -B<r>, B<--revcom-iupac-expand-nuc>     reverse complement query and expand IUPAC
  -B<p>, B<--iupac-expand-prot>           expand ambiguities for protein in query

Options general to FAST:
  -B<h>, B<--help>                        print a brief help message
  B<--man>                             print full documentation
  B<--version>                         print version
  -B<l>, B<--log>                         create/append to logfile
  -B<L>, B<--logname>=<string>            use logfile name <string>
  -B<C>, B<--comment>=<string>            save comment <string> to log
  B<--format>=<format>                 use alternative format for input
  B<--moltype>=<[dna|rna|protein]>     specify input sequence type
  B<-q>, B<--fastq>                       use fastq format as input and output

=head1 INPUT AND OUTPUT

B<fasgrep> is part of FAST, the FAST Analysis of Sequences Toolbox, based
on Bioperl. Most core FAST utilities expect input and return output in
multifasta format. Input can occur in one or more files or on
STDIN. Output occurs to STDOUT. The FAST utility B<fasconvert> can
reformat other formats to and from multifasta.

=head1 DEFAULT AND OPTIONAL MATCHING BEHAVIOR

By default, B<fasgrep> tests regex matching on the identifiers of
sequence records. Options described below modify which parts of
sequence records get tested for regex matching. These options take
effect as follows with decreasing priority: 
B<-t> > B<-f> > B<-d> > B<-s>

=head1 OPTIONS

=over 8

=item B<-s>,
      B<--sequence> 		

Print records whose sequence data match the regex.
Default: [query identifiers]

=item B<-d>,
      B<--description> 		

Print records whose descriptions match the regex.
Default: [query identifiers]

=item B<-f [int]>,
      B<--field=[int]>     

Split descriptions into fields, and print records for which a specific
numbered field matches the regex.  With this option, the description
is split into fields using strings of white space as field delimiters
(the Perl default).

This option takes a mandatory integer option argument giving the index
for which field the regex should query. One-based indexing is used, so
the first field after the identifier has index 1. As standard in Perl,
negative indices count backwards from the last field in the
description; field "-1" is the last field, "-2" is the second-to-last
etc.

In fasta files, the identifier occurs between the record separator
(">") and the first whitespace on the identifier line, and the
description is everything after the first string of white space on the
identifier line. Therefore the identifier is counted as the 0th field,
which is what B<fasgrep> matches by default. 

=item B<-t [string]>,
      B<--tag=[string]>   

Query sequence records by values of a named tag in the description.
Name-value pairs in the description are expected to have the format
"name:value" as generated by FAST tools (such as faslen) or
"name=value" as common in General Feature Format. The "name" must
contain only characters from the set [a-zA-Z0-9_-]. The "value" is any
string of non-whitespace characters. Sequence records for which the
specified tag does not exist will not be written to output. Currently
only the first (left-most) occcurence of a tag is tested.

=item B<-S [regex]>,
      B<--split-on-regex=[regex]>   

Use regex <regex> to split the description for the -f option instead
of the perl default (which splits on one or more whitespace
characters). Special characters must be quoted to protect them from
the shell.

=item B<-v>,
      B<--negate> 	   

Output sequences that B<do not> match the regular expression argument.

=item B<-i>,
      B<--insensitive> 	   

Match data case-insensitively.

=item B<-e>,
      B<--iupac-expand-nuc> 	  

Expand IUPAC ambiguity symbols in the regex argument for DNA/RNA
sequence matching (implies -s, matching on sequence data). For
example, the symbol "B" in the regex is equivalent to "[CGTU]". This
option, -e, has priority over -p.

=item B<-r>,
      B<--revcom-iupac-expand-nuc> 	  

Reverse complement the regular-expression and expand IUPAC ambiguity
symbols for DNA/RNA sequence matching (implies -s, matching on
sequence data). For example, the symbol "B" in the regex is equivalent
to "[ACG]". This option, -r, takes priority over -e or -p.
priority.

=item B<-p>,
      B<--iupac-expand-prot> 	  

Expand IUPAC ambiguity symbols in the regex argyment for protein
sequence matching (implies -s, matching on sequence data).

=item B<-m [dna|rna|protein]>,
      B<--moltype=[dna|rna|protein]> 		  

Specify the type of sequence on input (should not be needed in most
cases, but sometimes Bioperl cannot guess and complains when
processing data).

=item B<-h>,
      B<--help>

Print a brief help message and exit.

=item B<--man>

Print the manual page and exit.

=item B<--version>

Print version information and exit.

=item B<-l>,
      B<--log>

Creates, or appends to, a generic FAST logfile in the current working
directory. The logfile records date/time of execution, full command
with options and arguments, and an optional comment.

=item B<-L [string]>,
      B<--logname=[string]>

Use [string] as the name of the logfile. Default is "FAST.log.txt".

=item B<-C [string]>,
      B<--comment=[string]>

Include comment [string] in logfile. No comment is saved by default.

=item B<--format=[format]> 		  

Use alternative format for input. See man page for "fasconvert" for
allowed formats. This is for convenience; the FAST tools are designed
to exchange data in Fasta format, and "fasta" is the default format
for this tool.

=item B<-q>
      B,--fastq>
Use fastq format as input and output.

=back

=head1 EXAMPLES

Print all sequences with "-DNA" in the ID:

=over 8

cat data.fas | B<fasgrep> "-DNA" > data.dna.fas

=back

Print all seqs with "mammal", "Mammal" or "MAMMAL" in the description:

=over 8

B<fasgrep> -di "mammal" < data.fas > mammal.fas     

=back

Print all sequences with the motif MSDQ in them:

=over 8

B<fasgrep> -s  "MSDQ"  < data.fas > MSDQ.fas       
    
=back

Print all sequences that don't start with S or R:
    
=over 8

B<fasgrep> -v "^[SR]" < data.fas > sub.fas         

=back

Print all sequences matching purine-purine-pyrimidine:

=over 8

B<fasgrep> -sem dna "RRY"  < data.fas > RRY.fas 
   
B<fasgrep> -se "RRY"  < data.fas > RRY.fas        

B<fasgrep> -s "[AG][AG][CT]" < data.fas > RRY.fas

B<fasgrep> -s "[AG]{2}[CT]" < data.fas > RRY.fas

=back

Print sequence records containing characters that are not IUPAC nucleotide ambiguities:

=over 8

B<fasgrep> -is '[^ACUTGRYWSMKBDHVN]'

=back

=head1 SEE ALSO

=over 8

=item C<man perlre>

=item C<perldoc perlre>

Documentation on perl regular expressions.

=item C<man FAST>

=item C<perldoc FAST>

Introduction and cookbook for FAST

=item L<The FAST Home Page|http://compbio.ucmerced.edu/ardell/FAST>"

=back 

=head1 CITING

If you use FAST, please cite I<Lawrence et al. (2015). FAST: FAST Analysis of
Sequences Toolbox.> and Bioperl I<Stajich et al.>. 

=cut
