ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/utrpaste.pl
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (13 years, 1 month ago) by gpertea
File size: 6291 byte(s)
Log Message:
Line User Rev File contents
1 gpertea 23 #!/usr/bin/perl
2     use strict;
3     use Getopt::Std;
4     use FindBin;use lib $FindBin::Bin;
5    
6     my $usage = q/Usage:
7     utrpaste.pl [-a 'newID'] [-o <output.gff3>] <jigsaw_out.jgff> <exon_mappings.gff3>
8    
9     Augment the gene models found in <jigsaw_out.jgff> with
10     exon mappings found in <mappings.gff3> (only initial\/terminal exons,
11     if they overlap the CDS)
12    
13     /;
14     umask 0002;
15     getopts('a:o:') || die($usage."\n");
16     my $outfile=$Getopt::Std::opt_o;
17     my $newacc=$Getopt::Std::opt_a;
18     my ($jgff,$mapgff)=@ARGV;
19     foreach (@ARGV) {
20     die ("Error locating file $_\n") unless -f $_;
21     }
22     open(MGFF, $mapgff) || die ("Error opening mappings file $mapgff\n");
23     # 0 1 2 3 4
24     my @mapdata; #list of [ ID, strand, start, end, [ @exons] ]
25     # where each exon element is [exon_start, exon_end]
26     my $mi=-1; #current index in @mapdata
27     my $curid;
28     while (<MGFF>) {
29     my ($chr, $v, $f, $fstart, $fend, $fscore, $strand, $frame, $info)=split(/\t/);
30     next unless ($f eq 'exon'); #! ignore anything else but exons
31     ($fstart, $fend)=($fend, $fstart) if ($fstart>$fend);
32     my ($id)=($info=~m/Parent=([^;]+)/);
33     die("Error getting mapping ID at:\n$_\n") unless $id;
34     if ($id ne $curid) {
35     #start a new record;
36     push(@mapdata, [ $id, $strand, 0,0, []]);
37     $mi=$#mapdata;
38     $curid=$id;
39     }
40     push( @{$mapdata[$mi]->[4]} , [$fstart, $fend] );
41     }
42     close(MGFF);
43    
44     foreach my $md (@mapdata) {
45     my @exons=sort { $main::a->[0] <=> $main::b->[0] } @{$md->[4]};
46     $md->[2]=$exons[0]->[0];
47     $md->[3]=$exons[-1]->[1];
48     $md->[4]=[@exons];
49     }
50    
51     # sort mappings by start coordinate
52     @mapdata=sort { $main::a->[2] <=> $main::b->[2] } @mapdata;
53    
54     if ($outfile) {
55     open(FOUT, '>'.$outfile) || die("Error creating file $outfile\n");
56     select(FOUT);
57     }
58     #now parse and augment the jgff
59     open(JGFF, $jgff) || die ("Error opening mappings file $jgff\n");
60     $curid='';
61     my @cds=0; #current CDS chain: list of [$start, $end, $frame]
62     my @curdata;
63     while (<JGFF>) {
64     next if length($_)<4 || m/^# /;
65     my ($chr, $track, $f, $fstart, $fend, $fscore, $strand, $phase, $jdata)=split(/\t/);
66     next unless (($f eq 'CDS') || ($track=~m/jigsaw/ && $f=~m/\-exon$/));
67     my ($id)=($jdata=~m/^([^;]+); gene_score=/);
68     unless ($id) {
69     ($id)=($jdata=~m/Parent=([^;]+)/);
70     unless ($id) { #try GTF too
71     ($id)=($jdata=~m/transcript_id "([^"]+)/);
72     }
73     die("Error getting model ID at:\n$_\n") unless $id;
74     }
75     if ($id ne $curid) {
76     extendCDS(@curdata, \@cds) if $curid; #checks @mapdata
77     @cds=();
78     $curid=$id;
79     my $wid = $newacc || $curid;
80     my ($jnum)=($curid=~m/\.(\d+)$/);
81     $jnum=1 unless $jnum;
82     @curdata=($wid, $jnum, $chr, $track, $strand);
83     }
84     push(@cds, [$fstart, $fend, $phase]);
85     }
86     # last record:
87     extendCDS(@curdata, \@cds) if @cds>0; #checks @mapdata
88     close(JGFF);
89    
90     if ($outfile) { close(FOUT); }
91    
92     #================ SUBROUTINES ============
93    
94     sub extendCDS {
95     my ($id, $jnum, $chr, $track, $strand, $cdsl)=@_;
96     #find mapdata overlapping first and last exons of this model
97     my @cds = sort { $main::a->[0] <=> $main::b->[0] } @$cdsl;
98     my $cstart=$cds[0]->[0];
99     my $cend=$cds[-1]->[1];
100     my $mrnaid=$id.'.j'.$jnum;
101    
102     my $found=0;
103     my @exons;
104    
105     foreach my $m (@mapdata) {
106     my ($mid, $mstrand, $mstart, $mend, $mexons)=@$m;
107     #mapping should overlap on the same strand
108     #print STDERR "===> comparing $mstart-$mend ($mstrand) w $cstart-$cend ($strand)\n";
109     next if ($mstrand ne $strand || $mstart>$cend || $cstart>$mend);
110     #mapping ends should cover a larger interval than CDS
111     next if (($mstart>$cstart) || ($mend<$cend));
112     # at least one of the ends should stretch farther
113     next unless (($mstart<$cstart) || ($mend>$cend));
114     # -- found it:
115     $found=1;
116     # augment here, and print the mRNA line too, then exons
117     # left end :
118     my @lexons;
119     my $lovl;
120     for (my $i=0;$i<@$mexons;$i++) {
121     my ($estart, $eend)=@{$$mexons[$i]};
122     #print STDERR "left end cmp: $estart-$eend vs CDS $cstart-$cds[0]->[1]\n";
123     if ($estart<=$cstart && $eend>$cstart) {
124     # if they really overlapped properly, $estart is <= cds start
125     #print STDERR " YES, adding exon $estart-$cds[0]->[1]^^!\n";
126     push(@lexons, [$estart, $cds[0]->[1]]);
127     $lovl=1;
128     last;
129     }
130     push(@lexons,[$estart, $eend]);
131     }
132     #right end:
133     my @rexons;
134     my $rovl;
135     for (my $i=@$mexons-1;$i>=0;$i--) {
136     my ($estart, $eend)=@{$$mexons[$i]};
137     #print STDERR "right end cmp: $estart-$eend vs CDS $cds[-1]->[0]-$cend\n";
138     if ($eend>=$cend && $estart<$cend) {
139     # if they really overlapped properly, $eend is >= cds end
140     #print STDERR " YES, adding exon $cds[-1]->[0]-$eend!\n";
141     unshift(@rexons, [$cds[-1]->[0], $eend]);
142     $rovl=1;
143     last;
144     }
145     unshift(@rexons,[$estart, $eend]);
146     }
147    
148     if ($rovl && $lovl) {
149     #special single exon case:
150     if ( @cds==1 ) {
151     #we have to merge rexon and lexons
152     $lexons[-1]->[1]=$rexons[0]->[1];
153     shift(@rexons); #remove the first rexon
154     push(@exons, @lexons, @rexons);
155     }
156     else { #normal multi-exon case
157     push(@exons, @lexons);
158     foreach my $cd (@cds) {
159     push(@exons, [$$cd[0], $$cd[1]])
160     if ($cd->[0]>$lexons[-1]->[1] && $cd->[1]<$rexons[0]->[0])
161     }
162     #for (my $i=1;$i<@cds;$i++) {
163     # push(@exons, [$cds[$i]->[0], $cds[$i]->[1]]);
164     # }
165     push(@exons, @rexons);
166     }
167     # print mRNA and exons
168     print join("\t", $chr, $track, 'mRNA', $exons[0]->[0], $exons[-1]->[1], '.', $strand, '.',
169     'ID='.$mrnaid)."\n";
170     foreach my $e (@exons) {
171     print join("\t", $chr, $track, 'exon', $$e[0], $$e[1], '.', $strand, '.',
172     'Parent='.$mrnaid)."\n";
173     }
174     last;
175     }
176     else { $found = 0; }
177     }
178     if (!$found) {
179     #print mRNA line and the old CDS as is
180     print join("\t", $chr, $track, 'mRNA', $cstart, $cend, '.', $strand, '.',
181     'ID='.$mrnaid)."\n";
182     }
183     #print CDS segments:
184     foreach my $c (@cds) {
185     print join("\t", $chr, $track, 'CDS', $$c[0], $$c[1], '.', $strand, $$c[2],
186     'Parent='.$mrnaid)."\n";
187    
188     }
189     }

Properties

Name Value
svn:executable *