ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/orf_stats.pl
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (13 years, 1 month ago) by gpertea
File size: 5633 byte(s)
Log Message:
Line File contents
1 #!/usr/bin/perl
2 use strict;
3 use Getopt::Std;
4 use FindBin;use lib $FindBin::Bin;
5
6 my $usage = q/Usage:
7 orf_stats.pl <orf_check.tab> <ref_mrnas.iit>
8
9 It will try to report various stats about the ORFs
10 and junctions of transfrags given in <orf_check.tab>
11
12 <orf_check.tab> is the output of orf_check.pl, with the columns:
13 <transcript_id>,<xlocus>,<chromosome><strand>,<exons>,<orfs>
14
15 ORF coordinates are local - i.e. relative to the transcript sequence
16 /;
17 umask 0002;
18 getopts('o:') || die($usage."\n");
19 my $outfile=$Getopt::Std::opt_o;
20 if ($outfile) {
21 open(OUTF, '>'.$outfile) || die("Error creating output file $outfile\n");
22 select(OUTF);
23 }
24 # --
25 my ($forfs, $fiit)=@ARGV;
26 die($usage."\n") unless -f $forfs; # && -f $fiit;
27 my %ci; #cufflinks isoforms data
28 # transcript_id => [$chr, $strand, $cstart, $cend, [ exondata..], [ orfdata.. ]]
29 my $fiitqry=$forfs.'.iitqry';
30
31 open(QDATA, '>'.$fiitqry) || die("Error creating iit query file $fiitqry\n");
32
33 open(FORFS, $forfs) || die("$usage\n");
34 while(<FORFS>) {
35 chomp;
36 next unless $_;
37 my ($t, $xloc, $chr, $exonstr, $orfstr)=split(/\t/);
38 my $strand=chop($chr);
39 $exonstr=~s/^exons=//;
40 $orfstr=~s/^orfs=//;
41 my @ex= map { [split(/\-/)] } split(/\,/,$exonstr);
42 my @orfs;
43 if ($orfstr) {
44 @orfs=map { [split(/\:/)] } split(/\,/,$orfstr);
45 }
46 my @t2g;
47 my @gorfs;
48 my $x=0; #position on transcript sequence
49 if ($strand eq '-') {
50 foreach my $d (reverse @ex) {
51 foreach my $gx (reverse($$d[0] .. $$d[1])) { $t2g[++$x]=$gx }
52 }
53 #@gorfs= map { [$t2g[$$_[0]+$$_[1]]-2, $t2g[$$_[0]]] } @orfs;
54 # keep the order so we can more easily match GFF CDS end coordinate
55 if ($orfstr) {
56 @gorfs= map { [$t2g[$$_[0]], $t2g[$$_[0]+$$_[1]]-2] } @orfs;
57 }
58 }
59 else {
60 foreach my $d (@ex) {
61 foreach my $gx ($$d[0] .. $$d[1]) { $t2g[++$x]=$gx }
62 }
63 if ($orfstr) {
64 @gorfs= map { [$t2g[$$_[0]], $t2g[$$_[0]+$$_[1]]+2] } @orfs;
65 }
66 }
67 # - debug print:
68 #print join("\t", $t, $chr.$strand, join(',', (map { $$_[0].'-'.$$_[1] } @ex)),
69 # join(',', (map { $$_[0].'|'.$$_[1] } @gorfs)))."\n";
70 print QDATA join(' ',$ex[0]->[0], $ex[-1]->[1], $chr.$strand, '<'.$t)."\n";
71 $ci{$t}=[$chr, $strand, $ex[0]->[0], $ex[-1]->[1], [@ex], [@gorfs]];
72 }
73 close(QDATA);
74 close(FORFS);
75 # -- now query the iit_file with $fiitqry
76 goto ENDP unless ($fiit && -f $fiit);
77
78 my $cmd="iit_get $fiit < $fiitqry";
79 open(IITGET, "$cmd |") || die ("Error opening pipe $cmd |\n");
80 my @refs; #refs overlapping query transfrag -- forming this cluster
81 my $tf; # current transfrag;
82 my $strand; #strand for current cluster
83 my $lref; #last ref id
84 my $lchrstrand; #last ref chr and strand
85 my %accs; #all acceptor positions for this cluster
86 my %dons; #all donor positions for this cluster
87 my %cdstops; #all CDS stop locations
88 my $jrecomb; # counter for transfrags that have all junctions present in the cluster
89 my $samestop; #counter for transfrags that have an orf stopping
90 # at a known stop in the cluster (e.g. in %cdstops)
91 my $haveORFs;
92 while (<IITGET>) {
93 chomp;
94 next unless $_;
95 #print ">> $_\n";
96 if (m/^# Query: /) {
97 ($strand, $tf)=(m/([\+\-]) <([^<]+)$/);
98 die("Error parsing current transfrag in: $_\n")
99 unless $strand && $tf && exists($ci{$tf});
100 $haveORFs=(@{$ci{$tf}->[5]}>0);
101 next;
102 }
103 if (m/>(\S+) \d+ \d+ (\S+)/) {
104 #header for a ref interval
105 ($lref, $lchrstrand)=($1, $2);
106 }
107 elsif (m/^\d+\-\d+/) {
108 #exons line:
109 if ($strand eq '-') {
110 my @acc=(m/\-(\d+)\,/g);
111 @accs{@acc}=();
112 my @don=(m/\,(\d+)\-/g);
113 @dons{@don}=();
114 }
115 else {
116 my @acc=(m/\,(\d+)\-/g);
117 @accs{@acc}=();
118 my @don=(m/\-(\d+)\,/g);
119 @dons{@don}=();
120 }
121 }
122 elsif($haveORFs && m/^C:\d+\-/) {
123 # CDS line
124 my $cdstop;
125 if ($strand eq '-') {
126 ($cdstop)=(m/^C:(\d+)/);
127 #print STDERR ">>>> storing stop $cdstop\n";
128 }
129 else {
130 ($cdstop)=(m/(\d+)$/);
131 }
132 $cdstops{$cdstop}=1;
133 }
134 if (m/^# End$/) {
135 # collect stats here
136 my ($tchr, $tstrand, $tmin, $tmax, $texs, $torfs)=@{$ci{$tf}};
137 die ("ChrStrand mismatch!\n") unless $tstrand eq $strand;
138 my @acc;
139 my @don;
140 if ($tstrand eq '-') {
141 @acc= map { $$_[1] } @$texs;
142 pop(@acc);
143 @don= map { $$_[0] } @$texs;
144 shift(@don);
145 }
146 else {
147 @don= map { $$_[1] } @$texs;
148 pop(@don);
149 @acc= map { $$_[0] } @$texs;
150 shift(@acc);
151 }
152 # check if all acceptors and all donors are in the cluster
153 my $alljmatch=0;
154 my $print=0;
155 my @pcol=('','');
156 foreach my $a (@acc) {
157 goto JNOTFOUND if !exists($accs{$a});
158 }
159 foreach my $d (@don) {
160 goto JNOTFOUND if !exists($dons{$d});
161 }
162 $alljmatch=1;
163 JNOTFOUND:
164 if ($alljmatch) {
165 #print "$tr\tjrecomb\n";
166 $jrecomb++;
167 $print=1;
168 $pcol[0]='jrecomb';
169 }
170 #now check if same stop codon
171 foreach my $od (@$torfs) {
172 #print STDERR ">>>> >> checking stop $$od[1]\n" if $strand eq '-';
173 if (exists($cdstops{$$od[1]})) {
174 $print=1;
175 $pcol[1]='samestop';
176 $samestop++;
177 last;
178 }
179 }
180 print join("\t",$tf,@pcol)."\n" if $print;
181 # -- clear for next run
182 ($strand, $tf)=();
183 %accs=();
184 %dons=();
185 %cdstops=();
186 @refs=();
187 }
188 }
189 close(IITGET);
190 print STDERR "$jrecomb transfrags are recombinations of known junctions.\n";
191 print STDERR "$samestop transfrags have an ORF that match a known stop codon.\n";
192
193 # --
194 ENDP:
195 if ($outfile) {
196 select(STDOUT);
197 close(OUTF);
198 }
199
200 #************ Subroutines **************

Properties

Name Value
svn:executable *