1 |
#!/usr/bin/perl |
2 |
use strict; |
3 |
use Getopt::Std; |
4 |
use FindBin;use lib $FindBin::Bin; |
5 |
my $wigbindir='/fs/szannotation/human_rnaseq/hg19_phastCons'; |
6 |
#my $fmid='phyloP46way'; #middle part of the file name |
7 |
|
8 |
my $usage = qq/Usage: |
9 |
hg19_getcons.pl {-r <chr>:<interval_list> | -g <gff>} \ |
10 |
[-d <wigfixbin_dir>] |
11 |
Returns conservation average and sample standard deviation across |
12 |
bases in given intervals (exons) |
13 |
|
14 |
Requires a directory with .wigfixbin files for each chromosome. |
15 |
(default: $wigbindir) |
16 |
/; |
17 |
|
18 |
umask 0002; |
19 |
my %ch; # file access cache: chr->[filepath, start_cpos] |
20 |
# the value for start_cpos is always at offset 9 in the file! |
21 |
|
22 |
getopts('g:r:o:') || die($usage."\n"); |
23 |
my $outfile=$Getopt::Std::opt_o; |
24 |
if ($outfile) { |
25 |
open(OUTF, '>'.$outfile) || die("Error creating output file $outfile\n"); |
26 |
select(OUTF); |
27 |
} |
28 |
|
29 |
my $intv=$Getopt::Std::opt_r; |
30 |
if ($intv) { |
31 |
my ($chr,$rlst)=split(/\:/,$intv); |
32 |
die("$usage Incorrect format for the interval list!\n") unless $chr && $rlst; |
33 |
my $strand=chop($chr); |
34 |
if ($strand ne '-' && $strand ne '+') { |
35 |
$chr.=$strand; |
36 |
$strand=undef; |
37 |
} |
38 |
my @rdata=map { [split(/[\-\.]+/)] } (split(/[\,\;\s]+/,$rlst)); |
39 |
foreach my $d (@rdata) { |
40 |
($$d[0], $$d[1])=($$d[1], $$d[0]) if $$d[0]>$$d[1]; |
41 |
} |
42 |
my @ex = sort { $a->[0] <=> $b->[0] } @rdata; |
43 |
my ($avg, $std)=fetchCons($chr, \@ex); |
44 |
print STDOUT "$intv\t$avg\t$std\n"; |
45 |
exit; |
46 |
} |
47 |
#else { |
48 |
# GFF/GTF input |
49 |
my $fgff=$Getopt::Std::opt_g; |
50 |
my %gffrecs; |
51 |
# --------------------- 0 1 2 3 4 5 6 7 8 |
52 |
# recID => [ chr, strand, feat_type, gname, tdescr, fstart, fend, [@exons], [@cds] ] |
53 |
|
54 |
# -- |
55 |
die("$usage Error: -r or -g required!\n") unless $fgff; |
56 |
my $gffh; |
57 |
if ($fgff eq '-') { |
58 |
open($gffh, "<&=STDIN") || die("Error: couldn't alias STDIN. $!\n"); |
59 |
} |
60 |
else { |
61 |
open($gffh, $fgff) || die("Error: failed to open $fgff. $!\n"); |
62 |
} |
63 |
|
64 |
loadGff($gffh, \%gffrecs); |
65 |
# -- sort features by chromosome: |
66 |
#print STDERR "GFF data loaded. Sorting by chromosome location..\n"; |
67 |
my @sorted_features=sort sortByLoc keys(%gffrecs); |
68 |
# print STDERR "Writing BED file..\n"; |
69 |
processGffRecs(\%gffrecs, \@sorted_features); |
70 |
# -- |
71 |
if ($outfile) { |
72 |
select(STDOUT); |
73 |
close(OUTF); |
74 |
} |
75 |
|
76 |
#************ Subroutines ************** |
77 |
sub loadGff { |
78 |
my ($fh, $recs)=@_; # $hr=\%gffrecs |
79 |
# --------------------- 0 1 2 3 4 5 6 7 8 |
80 |
# recID => [ chr, strand, feat_type, gname, tdescr, fstart, fend, [@exons], [@cds] ] |
81 |
while (<$fh>) { |
82 |
next if m/^\s*#/; |
83 |
chomp; |
84 |
my ($chr, $track, $f, $fstart, $fend, $fscore, $strand, $frame, $lnum)=split(/\t/); |
85 |
next unless $fstart>1 && $lnum; |
86 |
next if $f eq 'gene' || $f eq 'locus'; # Warning: skipping any 'gene' or 'locus' features, unconditionally |
87 |
my $gff3_ID; |
88 |
my $gff3_Parent; |
89 |
my ($gname, $tdescr); |
90 |
($fstart, $fend)=($fend, $fstart) if $fend<$fstart; |
91 |
($gff3_ID)=($lnum=~m/\bID=([^;]+)/); |
92 |
($gff3_Parent)=($lnum=~m/\bParent=([^;]+)/); |
93 |
if ($gff3_ID || $gff3_Parent) { # GFF format |
94 |
$gff3_ID=~tr/"//d; #" |
95 |
$gff3_Parent=~tr/"//d; #" |
96 |
$gff3_Parent='' if ($f eq 'mRNA'); |
97 |
if ($gff3_ID && !$gff3_Parent) { #top level feature |
98 |
if ($f=~m/RNA/i || $f=~/gene/) { |
99 |
# try to parse the description, if any |
100 |
$tdescr=''; |
101 |
$gname=''; |
102 |
if ($lnum=~m/\b(?:descr|tophit|info|product)\s*=\s*"?([^;"]+)/i) { |
103 |
$tdescr=$1; |
104 |
} |
105 |
elsif ($lnum=~m/Name\s*=\s*"?([^;"]+)/) { |
106 |
$tdescr=$1; |
107 |
} |
108 |
if ($lnum=~m/\bgene_name[\s=]+"?([^;"]+)/i) { |
109 |
$gname=$1; |
110 |
} |
111 |
elsif ($lnum=~m/Name\s*=\s*"?([^;"]+)/) { |
112 |
$gname=$1; |
113 |
} |
114 |
$tdescr='' if ($tdescr eq $gname); |
115 |
$gname='' if $gname eq $gff3_ID; |
116 |
} |
117 |
die("Error: duplicate feature $gff3_ID on $chr\n") if (exists($$recs{"$chr|$gff3_ID"})); |
118 |
my $recID="$chr|$gff3_ID"; |
119 |
$$recs{$recID} = [$chr, $strand, $f, $gname, $tdescr, $fstart, $fend, [], [] ]; |
120 |
next; |
121 |
} # parent/top-level feature |
122 |
} #GFF |
123 |
else { #GTF format |
124 |
next if ($f eq 'transcript'); #exception: GTF with parent 'transcript' feature |
125 |
} |
126 |
# -------------- exon/CDS line here: |
127 |
my $recID; |
128 |
if ($track=~/^jigsaw/ && $lnum=~m/^\d+$/) { |
129 |
$recID=$chr.'.jsm.'.$lnum; |
130 |
} |
131 |
elsif ($lnum=~m/Parent=(['"\:\w\|\-\.]+)/) { |
132 |
$recID=$1; |
133 |
$recID=~tr/"//d; #" |
134 |
} |
135 |
elsif ($lnum=~m/transcript_id[= ]+(['"\:\w\.\|\-]+)/) { |
136 |
$recID=$1; |
137 |
$recID=~tr/"//d; #" |
138 |
} |
139 |
else { |
140 |
die("Error: cannot parse locus/transcript name from input line:\n$_\n"); |
141 |
} |
142 |
if (!$gname && $lnum=~m/gene_id[= ]+(['"\:\w\.\|\-]+)/) { |
143 |
$gname=$1; |
144 |
$gname=~tr/"//d; #" |
145 |
} |
146 |
$tdescr='' if index($recID, $tdescr)>=0; |
147 |
$gname='' if index($recID, $gname)>=0; |
148 |
$recID=$chr.'|'.$recID; |
149 |
my $ld = $$recs{$recID}; |
150 |
if ($ld) { #existing entry |
151 |
my $i=($f eq 'CDS') ? 8 : 7; |
152 |
my ($lstart, $lend)=($$ld[5], $$ld[6]); |
153 |
$$ld[5]=$fstart if $fstart<$lstart; |
154 |
$$ld[6]=$fend if $fend>$lend; |
155 |
push(@{$$ld[$i]}, [$fstart, $fend, $fscore]); |
156 |
} |
157 |
else { # first time seeing this locus/gene |
158 |
$$recs{$recID} = ($f eq 'CDS') ? |
159 |
[$chr, $strand, $f, $gname, $tdescr, $fstart, $fend, [], [[$fstart, $fend, $fscore]] ] : |
160 |
[$chr, $strand, $f, $gname, $tdescr, $fstart, $fend, [[$fstart, $fend, $fscore]], [] ] ; |
161 |
} # 0 1 2 3 4 5 6 7 (exons) 8 (CDS) |
162 |
} #while <$fh> |
163 |
} # loadGff |
164 |
|
165 |
sub sortByLoc { |
166 |
my $da=$gffrecs{$a}; |
167 |
my $db=$gffrecs{$b}; |
168 |
if ($$da[0] eq $$db[0]) { |
169 |
return ($$da[5]==$$db[5]) ? $$da[6] <=> $$db[6] : $$da[5] <=> $$db[5] ; |
170 |
} |
171 |
else { return $$da[0] cmp $$db[1] ; } |
172 |
} |
173 |
|
174 |
sub processGffRecs { |
175 |
#return if keys(%recs)==0; |
176 |
my ($recs, $rlist)=@_; |
177 |
my @recs_keys; |
178 |
unless ($rlist) { |
179 |
@recs_keys=keys(%$recs); |
180 |
$rlist=\@recs_keys; |
181 |
} |
182 |
foreach my $gffid (@$rlist) { |
183 |
my $ld=$$recs{$gffid}; |
184 |
# my $ld=$recs{$l} || die ("Error: locus $l found in list but not in hash!\n"); |
185 |
# 0 1 2 3 4 5 6 7 8 |
186 |
my ($chr, $strand, $ftype, $gname, $descr, $lstart, $lend, $er, $cr) = @$ld; |
187 |
# my ($mstart,$mend)=($lstart, $lend); |
188 |
my $CDexons=0; |
189 |
my @ex; |
190 |
my @cds; |
191 |
#some records might lack exons, but have only CDS segments (e.g. mitochondrial genes) |
192 |
if (@$er<1 && @$cr>0) { |
193 |
@ex = sort { $a->[0] <=> $b->[0] } @$cr; |
194 |
$CDexons=1; |
195 |
} |
196 |
else { |
197 |
@ex = sort { $a->[0] <=> $b->[0] } @$er; |
198 |
#if (@$cr>0) { # sort cds segments too |
199 |
# @cds= sort { $a->[0] <=> $b->[0] } @$cr; |
200 |
# } |
201 |
} |
202 |
# get the more accurate version of the start-end coords for the feature |
203 |
#($mstart, $mend) = ($ex[0]->[0], $ex[-1]->[1]); |
204 |
my @exonlst = map { $_->[0].'-'.$_->[1] } @ex; |
205 |
my ($avg, $std)=fetchCons($chr, \@ex); |
206 |
substr($gffid, 0, length($chr)+1)=''; |
207 |
print join("\t", $gffid, $chr.$strand.':'.join(',',@exonlst),$avg, $std)."\n"; |
208 |
} #for each stored transcript |
209 |
} |
210 |
|
211 |
|
212 |
# -- fetch conservation data for a sorted set of exons |
213 |
sub fetchCons { |
214 |
my ($chr, $exr)=@_; # $exr is \@exons (sorted) |
215 |
my ($tstart, $tend)=($$exr[0]->[0], $$exr[-1]->[1]); |
216 |
my $chd=$ch{$chr}; |
217 |
my ($cfname, $cfstart); |
218 |
my $fh; |
219 |
if ($chd) { |
220 |
($cfname, $cfstart)=@$chd; |
221 |
open($fh, $cfname) || die("Error opening file $cfname!\n"); |
222 |
binmode($fh); |
223 |
} |
224 |
else { |
225 |
$cfname="$wigbindir/$chr.wigfixbin"; |
226 |
open($fh, $cfname) || die("Error opening file $cfname!\n"); |
227 |
binmode($fh); |
228 |
my ($tag, $r); |
229 |
read($fh, $tag, 4); |
230 |
die("Error: $cfname is not a wigbin file (tag=$tag)!\n") unless $tag eq 'WIGB'; |
231 |
read($fh, $r, 4); |
232 |
my @v=unpack('I',$r); |
233 |
$cfstart=$v[0]; |
234 |
#print STDERR "start=$cfstart\n"; |
235 |
$ch{$chr}=[$cfname, $cfstart]; |
236 |
} |
237 |
my $tspan=($tend-$tstart+1); |
238 |
my @rdata = (0) x $tspan; |
239 |
if ($tend>=$cfstart) { |
240 |
my ($rstart, $rspan)=($tstart, $tspan); # real start and read values |
241 |
if ($rstart<$cfstart) { |
242 |
$rstart=$cfstart; |
243 |
$rspan=($tend-$rstart+1); |
244 |
} |
245 |
#print STDERR "..seeking to ".(8+2*($rstart-$cfstart))."\n"; |
246 |
seek($fh, 8+2*($rstart-$cfstart), 0); |
247 |
my $rdata; |
248 |
# my $rb; |
249 |
# my $nr=0; |
250 |
# my @v; |
251 |
# for (my $i=0;$i<$rspan;$i++) { |
252 |
# $rb=read($fh,$rdata,2); |
253 |
# last if $rb<2; |
254 |
# my @cv=unpack('s',$rdata); |
255 |
# push(@v,$cv[0]); |
256 |
# $nr++; |
257 |
# } |
258 |
my $rb=read($fh, $rdata, 2*$rspan); |
259 |
$rb>>=1; # number of values read (each value is 2 bytes) |
260 |
my @v=unpack('s'.$rb, $rdata); |
261 |
# |
262 |
# print STDERR "rspan=$rspan; unpacking s$rb values: (".join(',',@v).")\n"; |
263 |
@rdata[$rstart-$tstart .. $rstart-$tstart+$#v]=@v; |
264 |
} |
265 |
#now compute average&std for exons only |
266 |
my ($sum, $vcount); |
267 |
foreach my $e (@$exr) { |
268 |
$vcount+=$$e[1]-$$e[0]+1; |
269 |
map { $sum+=($_/1000) } @rdata[$$e[0]-$tstart .. $$e[1]-$tstart]; |
270 |
} |
271 |
my $avg=$sum/$vcount; |
272 |
$sum=0; |
273 |
foreach my $e (@$exr) { |
274 |
map { $sum+=((($_/1000)-$avg)**2) } @rdata[$$e[0]-$tstart .. $$e[1]-$tstart]; |
275 |
} |
276 |
my $std=sqrt($sum/($vcount-1)); |
277 |
return(sprintf('%.3f',$avg), sprintf('%.3f',$std)); |
278 |
} |