ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/hg19_getcons.pl
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (13 years, 2 months ago) by gpertea
File size: 9149 byte(s)
Log Message:
Line File contents
1 #!/usr/bin/perl
2 use strict;
3 use Getopt::Std;
4 use FindBin;use lib $FindBin::Bin;
5 my $wigbindir='/fs/szannotation/human_rnaseq/hg19_phastCons';
6 #my $fmid='phyloP46way'; #middle part of the file name
7
8 my $usage = qq/Usage:
9 hg19_getcons.pl {-r <chr>:<interval_list> | -g <gff>} \
10 [-d <wigfixbin_dir>]
11 Returns conservation average and sample standard deviation across
12 bases in given intervals (exons)
13
14 Requires a directory with .wigfixbin files for each chromosome.
15 (default: $wigbindir)
16 /;
17
18 umask 0002;
19 my %ch; # file access cache: chr->[filepath, start_cpos]
20 # the value for start_cpos is always at offset 9 in the file!
21
22 getopts('g:r:o:') || die($usage."\n");
23 my $outfile=$Getopt::Std::opt_o;
24 if ($outfile) {
25 open(OUTF, '>'.$outfile) || die("Error creating output file $outfile\n");
26 select(OUTF);
27 }
28
29 my $intv=$Getopt::Std::opt_r;
30 if ($intv) {
31 my ($chr,$rlst)=split(/\:/,$intv);
32 die("$usage Incorrect format for the interval list!\n") unless $chr && $rlst;
33 my $strand=chop($chr);
34 if ($strand ne '-' && $strand ne '+') {
35 $chr.=$strand;
36 $strand=undef;
37 }
38 my @rdata=map { [split(/[\-\.]+/)] } (split(/[\,\;\s]+/,$rlst));
39 foreach my $d (@rdata) {
40 ($$d[0], $$d[1])=($$d[1], $$d[0]) if $$d[0]>$$d[1];
41 }
42 my @ex = sort { $a->[0] <=> $b->[0] } @rdata;
43 my ($avg, $std)=fetchCons($chr, \@ex);
44 print STDOUT "$intv\t$avg\t$std\n";
45 exit;
46 }
47 #else {
48 # GFF/GTF input
49 my $fgff=$Getopt::Std::opt_g;
50 my %gffrecs;
51 # --------------------- 0 1 2 3 4 5 6 7 8
52 # recID => [ chr, strand, feat_type, gname, tdescr, fstart, fend, [@exons], [@cds] ]
53
54 # --
55 die("$usage Error: -r or -g required!\n") unless $fgff;
56 my $gffh;
57 if ($fgff eq '-') {
58 open($gffh, "<&=STDIN") || die("Error: couldn't alias STDIN. $!\n");
59 }
60 else {
61 open($gffh, $fgff) || die("Error: failed to open $fgff. $!\n");
62 }
63
64 loadGff($gffh, \%gffrecs);
65 # -- sort features by chromosome:
66 #print STDERR "GFF data loaded. Sorting by chromosome location..\n";
67 my @sorted_features=sort sortByLoc keys(%gffrecs);
68 # print STDERR "Writing BED file..\n";
69 processGffRecs(\%gffrecs, \@sorted_features);
70 # --
71 if ($outfile) {
72 select(STDOUT);
73 close(OUTF);
74 }
75
76 #************ Subroutines **************
77 sub loadGff {
78 my ($fh, $recs)=@_; # $hr=\%gffrecs
79 # --------------------- 0 1 2 3 4 5 6 7 8
80 # recID => [ chr, strand, feat_type, gname, tdescr, fstart, fend, [@exons], [@cds] ]
81 while (<$fh>) {
82 next if m/^\s*#/;
83 chomp;
84 my ($chr, $track, $f, $fstart, $fend, $fscore, $strand, $frame, $lnum)=split(/\t/);
85 next unless $fstart>1 && $lnum;
86 next if $f eq 'gene' || $f eq 'locus'; # Warning: skipping any 'gene' or 'locus' features, unconditionally
87 my $gff3_ID;
88 my $gff3_Parent;
89 my ($gname, $tdescr);
90 ($fstart, $fend)=($fend, $fstart) if $fend<$fstart;
91 ($gff3_ID)=($lnum=~m/\bID=([^;]+)/);
92 ($gff3_Parent)=($lnum=~m/\bParent=([^;]+)/);
93 if ($gff3_ID || $gff3_Parent) { # GFF format
94 $gff3_ID=~tr/"//d; #"
95 $gff3_Parent=~tr/"//d; #"
96 $gff3_Parent='' if ($f eq 'mRNA');
97 if ($gff3_ID && !$gff3_Parent) { #top level feature
98 if ($f=~m/RNA/i || $f=~/gene/) {
99 # try to parse the description, if any
100 $tdescr='';
101 $gname='';
102 if ($lnum=~m/\b(?:descr|tophit|info|product)\s*=\s*"?([^;"]+)/i) {
103 $tdescr=$1;
104 }
105 elsif ($lnum=~m/Name\s*=\s*"?([^;"]+)/) {
106 $tdescr=$1;
107 }
108 if ($lnum=~m/\bgene_name[\s=]+"?([^;"]+)/i) {
109 $gname=$1;
110 }
111 elsif ($lnum=~m/Name\s*=\s*"?([^;"]+)/) {
112 $gname=$1;
113 }
114 $tdescr='' if ($tdescr eq $gname);
115 $gname='' if $gname eq $gff3_ID;
116 }
117 die("Error: duplicate feature $gff3_ID on $chr\n") if (exists($$recs{"$chr|$gff3_ID"}));
118 my $recID="$chr|$gff3_ID";
119 $$recs{$recID} = [$chr, $strand, $f, $gname, $tdescr, $fstart, $fend, [], [] ];
120 next;
121 } # parent/top-level feature
122 } #GFF
123 else { #GTF format
124 next if ($f eq 'transcript'); #exception: GTF with parent 'transcript' feature
125 }
126 # -------------- exon/CDS line here:
127 my $recID;
128 if ($track=~/^jigsaw/ && $lnum=~m/^\d+$/) {
129 $recID=$chr.'.jsm.'.$lnum;
130 }
131 elsif ($lnum=~m/Parent=(['"\:\w\|\-\.]+)/) {
132 $recID=$1;
133 $recID=~tr/"//d; #"
134 }
135 elsif ($lnum=~m/transcript_id[= ]+(['"\:\w\.\|\-]+)/) {
136 $recID=$1;
137 $recID=~tr/"//d; #"
138 }
139 else {
140 die("Error: cannot parse locus/transcript name from input line:\n$_\n");
141 }
142 if (!$gname && $lnum=~m/gene_id[= ]+(['"\:\w\.\|\-]+)/) {
143 $gname=$1;
144 $gname=~tr/"//d; #"
145 }
146 $tdescr='' if index($recID, $tdescr)>=0;
147 $gname='' if index($recID, $gname)>=0;
148 $recID=$chr.'|'.$recID;
149 my $ld = $$recs{$recID};
150 if ($ld) { #existing entry
151 my $i=($f eq 'CDS') ? 8 : 7;
152 my ($lstart, $lend)=($$ld[5], $$ld[6]);
153 $$ld[5]=$fstart if $fstart<$lstart;
154 $$ld[6]=$fend if $fend>$lend;
155 push(@{$$ld[$i]}, [$fstart, $fend, $fscore]);
156 }
157 else { # first time seeing this locus/gene
158 $$recs{$recID} = ($f eq 'CDS') ?
159 [$chr, $strand, $f, $gname, $tdescr, $fstart, $fend, [], [[$fstart, $fend, $fscore]] ] :
160 [$chr, $strand, $f, $gname, $tdescr, $fstart, $fend, [[$fstart, $fend, $fscore]], [] ] ;
161 } # 0 1 2 3 4 5 6 7 (exons) 8 (CDS)
162 } #while <$fh>
163 } # loadGff
164
165 sub sortByLoc {
166 my $da=$gffrecs{$a};
167 my $db=$gffrecs{$b};
168 if ($$da[0] eq $$db[0]) {
169 return ($$da[5]==$$db[5]) ? $$da[6] <=> $$db[6] : $$da[5] <=> $$db[5] ;
170 }
171 else { return $$da[0] cmp $$db[1] ; }
172 }
173
174 sub processGffRecs {
175 #return if keys(%recs)==0;
176 my ($recs, $rlist)=@_;
177 my @recs_keys;
178 unless ($rlist) {
179 @recs_keys=keys(%$recs);
180 $rlist=\@recs_keys;
181 }
182 foreach my $gffid (@$rlist) {
183 my $ld=$$recs{$gffid};
184 # my $ld=$recs{$l} || die ("Error: locus $l found in list but not in hash!\n");
185 # 0 1 2 3 4 5 6 7 8
186 my ($chr, $strand, $ftype, $gname, $descr, $lstart, $lend, $er, $cr) = @$ld;
187 # my ($mstart,$mend)=($lstart, $lend);
188 my $CDexons=0;
189 my @ex;
190 my @cds;
191 #some records might lack exons, but have only CDS segments (e.g. mitochondrial genes)
192 if (@$er<1 && @$cr>0) {
193 @ex = sort { $a->[0] <=> $b->[0] } @$cr;
194 $CDexons=1;
195 }
196 else {
197 @ex = sort { $a->[0] <=> $b->[0] } @$er;
198 #if (@$cr>0) { # sort cds segments too
199 # @cds= sort { $a->[0] <=> $b->[0] } @$cr;
200 # }
201 }
202 # get the more accurate version of the start-end coords for the feature
203 #($mstart, $mend) = ($ex[0]->[0], $ex[-1]->[1]);
204 my @exonlst = map { $_->[0].'-'.$_->[1] } @ex;
205 my ($avg, $std)=fetchCons($chr, \@ex);
206 substr($gffid, 0, length($chr)+1)='';
207 print join("\t", $gffid, $chr.$strand.':'.join(',',@exonlst),$avg, $std)."\n";
208 } #for each stored transcript
209 }
210
211
212 # -- fetch conservation data for a sorted set of exons
213 sub fetchCons {
214 my ($chr, $exr)=@_; # $exr is \@exons (sorted)
215 my ($tstart, $tend)=($$exr[0]->[0], $$exr[-1]->[1]);
216 my $chd=$ch{$chr};
217 my ($cfname, $cfstart);
218 my $fh;
219 if ($chd) {
220 ($cfname, $cfstart)=@$chd;
221 open($fh, $cfname) || die("Error opening file $cfname!\n");
222 binmode($fh);
223 }
224 else {
225 $cfname="$wigbindir/$chr.wigfixbin";
226 open($fh, $cfname) || die("Error opening file $cfname!\n");
227 binmode($fh);
228 my ($tag, $r);
229 read($fh, $tag, 4);
230 die("Error: $cfname is not a wigbin file (tag=$tag)!\n") unless $tag eq 'WIGB';
231 read($fh, $r, 4);
232 my @v=unpack('I',$r);
233 $cfstart=$v[0];
234 #print STDERR "start=$cfstart\n";
235 $ch{$chr}=[$cfname, $cfstart];
236 }
237 my $tspan=($tend-$tstart+1);
238 my @rdata = (0) x $tspan;
239 if ($tend>=$cfstart) {
240 my ($rstart, $rspan)=($tstart, $tspan); # real start and read values
241 if ($rstart<$cfstart) {
242 $rstart=$cfstart;
243 $rspan=($tend-$rstart+1);
244 }
245 #print STDERR "..seeking to ".(8+2*($rstart-$cfstart))."\n";
246 seek($fh, 8+2*($rstart-$cfstart), 0);
247 my $rdata;
248 # my $rb;
249 # my $nr=0;
250 # my @v;
251 # for (my $i=0;$i<$rspan;$i++) {
252 # $rb=read($fh,$rdata,2);
253 # last if $rb<2;
254 # my @cv=unpack('s',$rdata);
255 # push(@v,$cv[0]);
256 # $nr++;
257 # }
258 my $rb=read($fh, $rdata, 2*$rspan);
259 $rb>>=1; # number of values read (each value is 2 bytes)
260 my @v=unpack('s'.$rb, $rdata);
261 #
262 # print STDERR "rspan=$rspan; unpacking s$rb values: (".join(',',@v).")\n";
263 @rdata[$rstart-$tstart .. $rstart-$tstart+$#v]=@v;
264 }
265 #now compute average&std for exons only
266 my ($sum, $vcount);
267 foreach my $e (@$exr) {
268 $vcount+=$$e[1]-$$e[0]+1;
269 map { $sum+=($_/1000) } @rdata[$$e[0]-$tstart .. $$e[1]-$tstart];
270 }
271 my $avg=$sum/$vcount;
272 $sum=0;
273 foreach my $e (@$exr) {
274 map { $sum+=((($_/1000)-$avg)**2) } @rdata[$$e[0]-$tstart .. $$e[1]-$tstart];
275 }
276 my $std=sqrt($sum/($vcount-1));
277 return(sprintf('%.3f',$avg), sprintf('%.3f',$std));
278 }

Properties

Name Value
svn:executable *