1 |
#!/usr/bin/perl |
2 |
use strict; |
3 |
use Getopt::Std; |
4 |
use POSIX "sys_wait_h"; # mostly for the handy uname() function |
5 |
|
6 |
my $usage = q/Usage: |
7 |
qsim4cc [-C] [-o <output.gff3>] [-q '<queryname>'] |
8 |
{query.fa|querydb.cidx} genomic.fa |
9 |
Option -C forces the extraction of the CDS range of query.fa creating |
10 |
a file <query.fa>.cds which is then searched with sim4cc against subj.fa. |
11 |
If <query.fa> is a multifasta file then sim4cc will be run for each of the |
12 |
query records. |
13 |
The output is converted to gff and optionally appended to file <output.gff3> |
14 |
(if -o is not given, the gff output is sent to stdout) |
15 |
/; |
16 |
umask 0002; |
17 |
getopts('DCo:c:p:q:') || die($usage."\n"); |
18 |
#die($usage."\n") unless -f $ARGV[1]; |
19 |
my ($mincov, $minpid, $fappend, $qpull, $cdsOnly)=($Getopt::Std::opt_c, $Getopt::Std::opt_p, |
20 |
$Getopt::Std::opt_o, $Getopt::Std::opt_q, $Getopt::Std::opt_C); |
21 |
my $debug=$Getopt::Std::opt_D; |
22 |
$mincov=1 unless $mincov; |
23 |
$minpid=20 unless $minpid; |
24 |
my $numaln=1; # ID suffix for each mapping |
25 |
|
26 |
my ($qf, $sfile)=@ARGV; |
27 |
die("$usage\n") unless $qf && $sfile; |
28 |
die("$usage Error: cannot locate query file ('$qf')!\n") unless -f $qf; |
29 |
die("$usage Error: cannot locate genomic fasta file ('$sfile')!\n") unless -f $sfile; |
30 |
my $host=lc((&POSIX::uname)[1]); |
31 |
chomp($host); |
32 |
($host)=($host=~m/^([\w\-]+)/); |
33 |
|
34 |
if ($qpull) { #only pull a sequence |
35 |
my $r=`cdbyank -a '$qpull' $qf`; |
36 |
my ($qname, $qdesc, $qseq)=($r=~m/^>(\S+)[ \t\x01]*(.*?)[\n\r](.+)/s); |
37 |
my @nr=split(/\x01/, $qdesc, 2); |
38 |
$qdesc=$nr[0] if (@nr>1); |
39 |
$qseq =~ tr/\t \n\r//d; |
40 |
die("Error: couldn't cdbyank fasta record for $qpull from $qf!\n") |
41 |
unless $qname && $qseq; |
42 |
$qseq=uc($qseq); |
43 |
$qf=~s/\.cidx$//; |
44 |
runSim4cc($qf, $qname, $qdesc, $qseq); |
45 |
} |
46 |
else { # run for each entry in $qf |
47 |
open(QFILE, $qf) || die("Error opening query file: $qf !\n"); |
48 |
my $qcounter=0; |
49 |
local $/="\n>"; |
50 |
while (<QFILE>) { |
51 |
s/^>//; |
52 |
chomp; |
53 |
my ($qname, $qdesc, $qseq)=(m/^(\S+)[ \t\x01]*(.*?)\n(.+)/s); |
54 |
my @nr=split(/\x01/, $qdesc, 2); |
55 |
$qdesc=$nr[0] if (@nr>1); |
56 |
$qseq =~ tr/\t \n\r//d; |
57 |
$qseq=uc($qseq); |
58 |
if ($qname && $qseq) { |
59 |
$qcounter++; |
60 |
runSim4cc($qf, $qname, $qdesc, $qseq, $qcounter); |
61 |
} |
62 |
} |
63 |
close(QFILE); |
64 |
} |
65 |
|
66 |
|
67 |
sub runSim4cc { |
68 |
my ($qfile, $qname, $qdesc, $qseq, $qcnt)=@_; |
69 |
#my $CDSextracted; |
70 |
#my $qrf=$qfile; |
71 |
#$qrf=$1 if $qfile=~m/\/([^\/]+)$/; |
72 |
my $qfsim='sim4cc.'.$host.'_'.$$; |
73 |
$qfsim.='_'.$qcnt if $qcnt; |
74 |
my $qrf=getfname($qfile); |
75 |
my $gfn=getfname($sfile); |
76 |
$qfsim.='_'.$qrf.'_'.$gfn.'.qry'; |
77 |
my $cmdopts; |
78 |
my $outseq=$qseq; |
79 |
my ($cds_start, $cds_end); |
80 |
if ($cdsOnly && $qdesc=~m/CDS[:=](\d+)[\-\.]+(\d+)/) { |
81 |
($cds_start, $cds_end)=($1,$2); |
82 |
($cds_start, $cds_end)=($cds_end, $cds_start) if $cds_start>$cds_end; |
83 |
$qfsim.='.cds'; |
84 |
$outseq=substr($qseq, $cds_start-1, $cds_end-$cds_start+1); |
85 |
#$CDSextracted=1; |
86 |
$qdesc=~s/\s*CDS[:=]\d+\-\d+//; |
87 |
} |
88 |
#write sequence to a tmp file for searching |
89 |
open(QSIM, '>'.$qfsim) || die("Error creating file '$qfsim'\n"); |
90 |
print QSIM ">$qname $qdesc\n"; |
91 |
print QSIM join("\n", (unpack('(A80)*',$outseq)))."\n"; |
92 |
close(QSIM); |
93 |
#debug only: |
94 |
#print STDERR "running: sim4cc $qfsim $sfile $cmdopts\n"; |
95 |
$cmdopts.=' A=6'; |
96 |
$cmdopts.=' >> '.$fappend if ($fappend); |
97 |
print STDERR "sim4cc $qfsim $sfile $cmdopts\n" if $debug; |
98 |
my $r=system("sim4cc $qfsim $sfile $cmdopts"); |
99 |
if ($r) { |
100 |
print STDERR "Warning: Error at running: sim4cc $qfsim $sfile $cmdopts\n"; |
101 |
} |
102 |
else { |
103 |
unlink($qfsim) unless $debug; |
104 |
} |
105 |
} |
106 |
|
107 |
sub getfname { |
108 |
my $f=$_[0]; |
109 |
$f=$1 if $f=~m/\/([^\/]+)$/; |
110 |
$f=~s/\.cidx$//; |
111 |
$f=~s/\.\w+$//; |
112 |
return $f |
113 |
} |