ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/qsim4cc
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (13 years, 2 months ago) by gpertea
File size: 3621 byte(s)
Log Message:
Line File contents
1 #!/usr/bin/perl
2 use strict;
3 use Getopt::Std;
4 use POSIX "sys_wait_h"; # mostly for the handy uname() function
5
6 my $usage = q/Usage:
7 qsim4cc [-C] [-o <output.gff3>] [-q '<queryname>']
8 {query.fa|querydb.cidx} genomic.fa
9 Option -C forces the extraction of the CDS range of query.fa creating
10 a file <query.fa>.cds which is then searched with sim4cc against subj.fa.
11 If <query.fa> is a multifasta file then sim4cc will be run for each of the
12 query records.
13 The output is converted to gff and optionally appended to file <output.gff3>
14 (if -o is not given, the gff output is sent to stdout)
15 /;
16 umask 0002;
17 getopts('DCo:c:p:q:') || die($usage."\n");
18 #die($usage."\n") unless -f $ARGV[1];
19 my ($mincov, $minpid, $fappend, $qpull, $cdsOnly)=($Getopt::Std::opt_c, $Getopt::Std::opt_p,
20 $Getopt::Std::opt_o, $Getopt::Std::opt_q, $Getopt::Std::opt_C);
21 my $debug=$Getopt::Std::opt_D;
22 $mincov=1 unless $mincov;
23 $minpid=20 unless $minpid;
24 my $numaln=1; # ID suffix for each mapping
25
26 my ($qf, $sfile)=@ARGV;
27 die("$usage\n") unless $qf && $sfile;
28 die("$usage Error: cannot locate query file ('$qf')!\n") unless -f $qf;
29 die("$usage Error: cannot locate genomic fasta file ('$sfile')!\n") unless -f $sfile;
30 my $host=lc((&POSIX::uname)[1]);
31 chomp($host);
32 ($host)=($host=~m/^([\w\-]+)/);
33
34 if ($qpull) { #only pull a sequence
35 my $r=`cdbyank -a '$qpull' $qf`;
36 my ($qname, $qdesc, $qseq)=($r=~m/^>(\S+)[ \t\x01]*(.*?)[\n\r](.+)/s);
37 my @nr=split(/\x01/, $qdesc, 2);
38 $qdesc=$nr[0] if (@nr>1);
39 $qseq =~ tr/\t \n\r//d;
40 die("Error: couldn't cdbyank fasta record for $qpull from $qf!\n")
41 unless $qname && $qseq;
42 $qseq=uc($qseq);
43 $qf=~s/\.cidx$//;
44 runSim4cc($qf, $qname, $qdesc, $qseq);
45 }
46 else { # run for each entry in $qf
47 open(QFILE, $qf) || die("Error opening query file: $qf !\n");
48 my $qcounter=0;
49 local $/="\n>";
50 while (<QFILE>) {
51 s/^>//;
52 chomp;
53 my ($qname, $qdesc, $qseq)=(m/^(\S+)[ \t\x01]*(.*?)\n(.+)/s);
54 my @nr=split(/\x01/, $qdesc, 2);
55 $qdesc=$nr[0] if (@nr>1);
56 $qseq =~ tr/\t \n\r//d;
57 $qseq=uc($qseq);
58 if ($qname && $qseq) {
59 $qcounter++;
60 runSim4cc($qf, $qname, $qdesc, $qseq, $qcounter);
61 }
62 }
63 close(QFILE);
64 }
65
66
67 sub runSim4cc {
68 my ($qfile, $qname, $qdesc, $qseq, $qcnt)=@_;
69 #my $CDSextracted;
70 #my $qrf=$qfile;
71 #$qrf=$1 if $qfile=~m/\/([^\/]+)$/;
72 my $qfsim='sim4cc.'.$host.'_'.$$;
73 $qfsim.='_'.$qcnt if $qcnt;
74 my $qrf=getfname($qfile);
75 my $gfn=getfname($sfile);
76 $qfsim.='_'.$qrf.'_'.$gfn.'.qry';
77 my $cmdopts;
78 my $outseq=$qseq;
79 my ($cds_start, $cds_end);
80 if ($cdsOnly && $qdesc=~m/CDS[:=](\d+)[\-\.]+(\d+)/) {
81 ($cds_start, $cds_end)=($1,$2);
82 ($cds_start, $cds_end)=($cds_end, $cds_start) if $cds_start>$cds_end;
83 $qfsim.='.cds';
84 $outseq=substr($qseq, $cds_start-1, $cds_end-$cds_start+1);
85 #$CDSextracted=1;
86 $qdesc=~s/\s*CDS[:=]\d+\-\d+//;
87 }
88 #write sequence to a tmp file for searching
89 open(QSIM, '>'.$qfsim) || die("Error creating file '$qfsim'\n");
90 print QSIM ">$qname $qdesc\n";
91 print QSIM join("\n", (unpack('(A80)*',$outseq)))."\n";
92 close(QSIM);
93 #debug only:
94 #print STDERR "running: sim4cc $qfsim $sfile $cmdopts\n";
95 $cmdopts.=' A=6';
96 $cmdopts.=' >> '.$fappend if ($fappend);
97 print STDERR "sim4cc $qfsim $sfile $cmdopts\n" if $debug;
98 my $r=system("sim4cc $qfsim $sfile $cmdopts");
99 if ($r) {
100 print STDERR "Warning: Error at running: sim4cc $qfsim $sfile $cmdopts\n";
101 }
102 else {
103 unlink($qfsim) unless $debug;
104 }
105 }
106
107 sub getfname {
108 my $f=$_[0];
109 $f=$1 if $f=~m/\/([^\/]+)$/;
110 $f=~s/\.cidx$//;
111 $f=~s/\.\w+$//;
112 return $f
113 }

Properties

Name Value
svn:executable *