1 |
#!/usr/bin/perl |
2 |
use strict; |
3 |
use Getopt::Std; |
4 |
use POSIX "sys_wait_h"; # mostly for the handy uname() function |
5 |
|
6 |
my $usage = q/Usage: |
7 |
qexo_p2g [-o <output.gff3>] {-q '<qprotname>' | -m <ctgname>:<hitmap_fadb.cidx>} |
8 |
[-c <mincov> -p <minpid>] {qprot.fa|protdb.cidx} genomic.fa |
9 |
If <qprot.fa> is a multifasta file then exonerate p2g will be run for each of the |
10 |
proteins in that file. |
11 |
The output is converted to gff and optionally appended to file <output.gff3> |
12 |
(if -o is not given, the gff output is sent to stdout) |
13 |
/; |
14 |
|
15 |
umask 0002; |
16 |
getopts('Do:m:n:c:p:q:') || die($usage."\n"); |
17 |
#die($usage."\n") unless -f $ARGV[1]; |
18 |
my ($fappend, $qpull)=($Getopt::Std::opt_o, $Getopt::Std::opt_q); |
19 |
my ($mincov, $minpid)=($Getopt::Std::opt_c,$Getopt::Std::opt_p); |
20 |
$mincov=1 unless $mincov; |
21 |
$minpid=10 unless $minpid; |
22 |
my $debug=$Getopt::Std::opt_D; |
23 |
my ($hitmapdta)=($Getopt::Std::opt_m); |
24 |
my ($ctg, $hitmapcidx); |
25 |
if ($hitmapdta) { ($ctg, $hitmapcidx)=split(/:/,$hitmapdta); } |
26 |
my $numaln=1; # ID suffix for each mapping |
27 |
my $pulling=0; # temporary qry file created through db pulling |
28 |
my ($qf, $sfile)=@ARGV; |
29 |
die("$usage\n") unless $qf && $sfile; |
30 |
die("$usage Error: cannot locate query file ('$qf')!\n") unless -f $qf; |
31 |
die("$usage Error: cannot locate genomic fasta file ('$sfile')!\n") unless -f $sfile; |
32 |
my $host=lc((&POSIX::uname)[1]); |
33 |
my $counter=time(); |
34 |
chomp($host); |
35 |
($host)=($host=~m/^([\w\-]+)/); |
36 |
my $qrf=getfname($qf); |
37 |
my $gfn=getfname($sfile); |
38 |
my $pnum=$ENV{GRID_TASK} || $counter; |
39 |
my $qfexo='exop2g.'.$host.'_'.$$.".$pnum.$qrf.$gfn"; |
40 |
my $qfexout=$qfexo.'.exo'; |
41 |
if ($qpull || $ctg) { |
42 |
$pulling=1; |
43 |
#my $qrf=$qf; |
44 |
#$qrf=$1 if $qf=~m/\/([^\/]+)$/; |
45 |
#$qrf=~s/\.cidx$//; |
46 |
my $qfqry=$qfexo.'.qry'; |
47 |
if ($qpull) { #just pull one protein sequence |
48 |
my $r=system("cdbyank -a '$qpull' $qf > $qfqry"); |
49 |
} |
50 |
else { # pull a set of proteins -- in two steps |
51 |
die("Error: the hitmap cdb index not found!\n") unless -f $hitmapcidx; |
52 |
my $r=system("cdbyank -a '$ctg' $hitmapcidx | cdbyank $qf > $qfqry"); |
53 |
} |
54 |
runExonerate($qfqry); |
55 |
} |
56 |
else { |
57 |
runExonerate($qf, $qfexout); |
58 |
} |
59 |
|
60 |
sub runExonerate { |
61 |
my ($qfqry, $fexout)=@_; |
62 |
print STDERR ("Warning: 0 length protein input, mistake?\n") unless -s $qfqry; |
63 |
#my $CDSextracted; |
64 |
#write sequence to a tmp file for searching |
65 |
$fexout=$qfqry.'.exo' unless $fexout; |
66 |
# use exonerate.icc ? |
67 |
my $cmd="exonerate $qfqry $sfile -V 0 --model p2g --percent 20 -n 20 --showalignment no --saturatethreshold 80 ". |
68 |
"--ryo '".'%ti\t%qi\tmap_end\t%r\t%ql|%qab-%qae\t%ps\t%g\t%qd\n'."' --geneseed 60 -x 50 --proteinwordlen 4 ". |
69 |
"--seedrepeat 2 -i -40 -M 512 --showtargetgff yes --showvulgar no > $fexout "; |
70 |
|
71 |
#$cmd.=' >> '.$fappend if ($fappend); |
72 |
my $r=system($cmd); |
73 |
if ($r) { |
74 |
die("Error at running: \n $cmd\n"); |
75 |
} |
76 |
else { |
77 |
if ($fappend && -s $fexout) { |
78 |
my $excat=$fappend; |
79 |
$excat=~s/\.gff\d*$//i; |
80 |
$excat.='.exout'; |
81 |
system("cat $fexout >> $excat"); |
82 |
system("fltexonerate -c $mincov -p $minpid < $fexout >> $fappend"); |
83 |
unlink($fexout); |
84 |
} |
85 |
unlink($qfqry) if $pulling && !$debug; |
86 |
} |
87 |
} |
88 |
|
89 |
#------------ |
90 |
sub runCmd { |
91 |
my ($docmd, @todel) = @_; |
92 |
my $errmsg = `( $docmd ) 2>&1`; |
93 |
my $errcode=$?; |
94 |
my @cores=<core.*>; |
95 |
if ($errcode || @cores>0 || ($errmsg=~/error|segmentation|violation/si) || ($errmsg=~/\bfail/si)) { |
96 |
print STDERR "!Error at:\n$docmd\n"; |
97 |
print STDERR "$errmsg\n"; |
98 |
foreach (@todel) { |
99 |
unlink($_); |
100 |
} |
101 |
exit(1); |
102 |
} |
103 |
} |
104 |
|
105 |
sub getfname { |
106 |
my $f=$_[0]; |
107 |
$f=$1 if $f=~m/\/([^\/]+)$/; |
108 |
$f=~s/\.cidx$//; |
109 |
$f=~s/\.\w+$//; |
110 |
return $f |
111 |
} |