#!/usr/local/bin/perl -w -s

# Copyright © 2003, Stowers Institute for Medical Research.  All
# rights reserved.

# Redistribution and use in source and binary forms, with or without
# modifications, are permitted provided that the following conditions
# are met:

# Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.

# Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.

# Neither the name of Stowers Institute for Medical Research nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written
# permission.

# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
# FOR A PARTICULAR PURPOSE, ARE DISCLAIMED.  IN NO EVENT SHALL THE
# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILTY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

# Written by Malcolm Cook, Stowers Institute for Medical
# Research. Please submit all inquiries regarding this software to
# Malcolm Cook, Stowers Institute for Medical Research, 1000 East 50th
# Street, Kansas City, MO 64110, mec@stowers-institute.org

=pod

=head1 SYNOPSIS

polyafind [-all] file1 file2 > polyaout.dat

For each file named on command line (presumed to contain a single DNA
sequence), produce a line of output naming the file and the 'poly A'
tail (defined as the longest stretch of A or N).

=head1 OPTIONS

=over 4

=item B<-all>

List the length of every match, in order of occurrence, instead of
only the longest one (the default).

=back

=head1 AUTHOR

Malcolm Cook (mec@stowers-institute.org)

=cut

# Notes:
#  uses -s for rudimentary switch processing for -all option
#  does NOT validate input
#  stated requirement
#  On Tue, 2003-09-09 at 17:00, Tristan Fiedler wrote:
# > I have over 500 files, named *.seq.  I would like to create a script which :
# > 
# > a.  runs through all the files,
# > b.  counts the length of the 
# > of A or N)
# > c. sends the output to a file, eg.

use strict;
use vars qw($all $help); 
use English;
use Pod::Usage;

# Show the full manual page and exit when -help is given or when no
# input files are named on the command line.
pod2usage(-exitstatus => 0, -verbose => 2) if $help or not @ARGV;

$OUTPUT_RECORD_SEPARATOR = "\n";    # every print emits one complete line
$OUTPUT_FIELD_SEPARATOR  = "\t";    # print's arguments become tab-separated columns

undef $INPUT_RECORD_SEPARATOR;      # slurp each file whole instead of line by line

# A 'poly A' run is one or more consecutive A or N characters, in either
# case.  (No comma inside the class: a comma there would be matched
# literally.)
my $pattern = qr/[NA]+/i;

# One output line per input file: the file name, then either the length
# of the longest run (default) or, with -all, the length of every run in
# order of occurrence.  A file with no run at all yields just its name.
while (<>) {
    s/\s+//g;    # drop all whitespace so runs continue across line breaks

    # Measure the matches directly rather than rewriting the sequence
    # with numbers and splitting on non-digits: that way digits already
    # present in the file cannot be mistaken for run lengths.
    my @match_length = map { length $_ } m/$pattern/g;

    my @report = $all          ? @match_length
               : @match_length ? (sort { $b <=> $a } @match_length)[0]
               :                 ();

    print $ARGV, @report;    # NB: $ARGV is the name of the file just slurped
}