#!/usr/local/bin/perl -w -s # Copyright © 2003, Stowers Institute for Medical Research. All # rights reserved. # Redistribution and use in source and binary forms, with or without # modifications, are permitted provided that the following conditions # are met: # Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # Neither the name of Stowers Institute for Medical Research nor the # names of its contributors may be used to endorse or promote products # derived from this software without specific prior written # permission. # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS # FOR A PARTICULAR PURPOSE, ARE DISCLAIMED. IN NO EVENT SHALL THE # COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT # LIABILTY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY # WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. # Written by Malcolm Cook, Stowers Institute for Medical # Research. 
# Please submit all inquiries regarding this software to
# Malcolm Cook, Stowers Institute for Medical Research, 1000 East 50th
# Street, Kansas City, MO 64110, mec@stowers-institute.org

=pod

=head1 SYNOPSIS

  polyafind [-all] file1 file2 > polyaout.dat

For each file named on the command line (presumed to contain a single DNA
sequence), produce a tab-separated line of output naming the file and the
length of its 'poly A' tail (defined as the longest stretch of A or N).
A file containing no A or N at all is reported with a length of 0.

=head1 OPTIONS

=over 4

=item B<-all>

output lists each match_length (in order of occurrence) as opposed to the
default of just the longest.

=item B<-help>

print this documentation and exit.

=back

=head1 AUTHOR

Malcolm Cook (mec@stowers-institute.org)

=cut

# Notes:
#   uses -s for rudimentary switch processing for the -all and -help options
#   does NOT validate input
#
# Stated requirement:
#   On Tue, 2003-09-09 at 17:00, Tristan Fiedler wrote:
#   > I have over 500 files, named *.seq.  I would like to create a script which :
#   >
#   > a. runs through all the files,
#   > b. counts the length of the 'poly A' tail (defined as the longest stretch
#   >    of A or N)
#   > c. sends the output to a file, eg.

use strict;
use warnings;
use English qw(-no_match_vars);    # long variable names without the regex slowdown
use List::Util qw(max);
use Pod::Usage;

# Populated by perl's -s switch processing (-all, -help) before the script body runs.
our ($all, $help);

# A 'poly A' stretch: one or more consecutive A or N bases, upper or lower case.
my $POLYA_RE = qr/[NA]+/i;

# polya_lengths($sequence)
#
# Returns the list of lengths of every stretch of A/N in $sequence, in order
# of occurrence.  Whitespace (including newlines) is removed first so that a
# stretch wrapped across several lines is counted as one.  Returns an empty
# list when $sequence is undefined or contains no A or N.
#
# The stretches are measured directly from the regex matches, so any other
# characters in the input (digits, punctuation, other bases) merely separate
# stretches and never end up in the result.
sub polya_lengths {
    my ($sequence) = @_;
    return unless defined $sequence;

    $sequence =~ s/\s+//g;
    return map { length $_ } $sequence =~ /$POLYA_RE/g;
}

# main()
#
# Script entry point: slurps each file named in @ARGV and prints one
# tab-separated, newline-terminated record per file.  The record holds the
# file name followed by either every stretch length (-all) or only the
# longest one (default; 0 when the file has no stretch).  With -help, or
# with no file arguments, the POD is printed and the script exits with
# status 0.
sub main {
    pod2usage(-exitstatus => 0, -verbose => 2) if $help or not @ARGV;

    local $OUTPUT_RECORD_SEPARATOR = "\n";
    local $OUTPUT_FIELD_SEPARATOR  = "\t";
    local $INPUT_RECORD_SEPARATOR;    # undef: slurp entire file instead of line by line

    while (defined(my $sequence = <>)) {
        my @lengths = polya_lengths($sequence);

        # NB: $ARGV is the name of the file currently being slurped.
        print $ARGV, $all ? @lengths : (@lengths ? max(@lengths) : 0);
    }
    return;
}

# Run only when executed as a script, so the file can also be loaded
# (require/do) to reach polya_lengths() without triggering any I/O.
main() unless caller;

1;