ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/rnaseq_cufflinks.pl
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (13 years, 1 month ago) by gpertea
File size: 6804 byte(s)
Log Message:
Line File contents
1 #!/usr/bin/perl
2 use strict;
3 use Getopt::Std;
4 use FindBin;use lib $FindBin::Bin;
5
6 my $usage = q/Usage:
7 rnaseq_cufflinks.pl <libprefix>
8 Prepares and runs tophat+cufflinks on a pair of fastq files
9 e.g.
10 rnaseq_cufflinks.pl RIVA
11
12 /;
13
14 my %libs=('BJAB'=>['GCB',250],
15 'Dogum'=>['GCB', 250],
16 'DOHH2'=>['GCB', 250],
17 'Gumbus'=>['GCB',250],
18 'HT'=>['GCB',250],
19 'Karpas422'=>['GCB',250],
20 'Ly1'=>['GCB',150],
21 'Ly18'=>['GCB',250],
22 'Ly19'=>['GCB',250],
23 'Ly2'=>['GCB',250],
24 'Ly4'=>['GCB',250],
25 'Ly7'=>['GCB',250],
26 'Ly8'=>['GCB',150],
27 'Mieu'=>['GCB',250],
28 'Pfieffer'=>['GCB',250],
29 'ULA'=>['GCB',250],
30 # ABC:
31 'HBL1'=>['ABC',150],
32 'TMD8'=>['ABC',150],
33 'Ly10'=>['ABC',150],
34 'Ly3'=>['ABC',250],
35 'RIVA'=>['ABC',250],
36 'SUDHL2'=>['ABC',150],
37 'U2932'=>['ABC',150]
38 );
39
40 my $tcmd='tophat --solexa1.3-quals --coverage-search --microexon-search --closure-search -p 8 ';
41 #my $tophat1=$tcmd.' -r 36 -o RIVA_tophat_untrimmed hg19 RIVA_1.fq RIVA_2.fq > & log_tophat_untrimmed.log &';
42 #my $tophat2=$tcmd.' -r 80 -o RIVA_cut84_tophat hg19 RIVA_1.unmapped_cut84.fq RIVA_2.unmapped_cut84.fq > & log_tophat_cut84.log &';
43 my $gzdir='/fs/szattic-asmg6/NCI_rnaseq/Cell_line_seq';
44 my $workdir='/fs/szannotation/human_rnaseq/NCI';
45 umask 0002;
46 chdir($workdir) || die("Error switching to working directory: $workdir\n");
47
48 getopts('o:') || die($usage."\n");
49 my $outfile=$Getopt::Std::opt_o;
50 if ($outfile) {
51 open(OUTF, '>'.$outfile) || die("Error creating output file $outfile\n");
52 select(OUTF);
53 }
54 # --
55
56 my $fbase=$ARGV[0];
57 my $libd=$libs{$fbase};
58 die("$usage Error: unrecognized sample ($fbase)!\n") unless $libd;
59 my ($gz1, $gz2)=($fbase.'_1_sequence.txt.gz',$fbase.'_2_sequence.txt.gz');
60 foreach my $gz ($gz1, $gz2) {
61 die("Error: file $gzdir/$gz not found!\n") unless -f "$gzdir/$gz";
62 }
63 my ($fq1, $fq2)=($fbase.'_1.fq', $fbase.'_2.fq');
64 my ($fqu1, $fqu2)=($fbase.'_unmapped_1.fq', $fbase.'_unmapped_2.fq');
65 my ($fqustats1, $fqustats2)=($fbase.'_unmapped_1.qstats', $fbase.'_unmapped_2.qstats');
66 my ($fqcut1, $fqcut2)=($fbase.'_trimmed_1.fq', $fbase.'_trimmed_2.fq');
67 if (-f $fq1) {
68 mlog("$fq1 already exists, skipping decompression.");
69 }
70 else {
71 mlog("decompressing $gz1 ..");
72 system("gzip -cd $gzdir/$gz1 > $fq1");
73 }
74 unless (-f "$fq1.cidx") {
75 mlog("indexing $fq1..");
76 system("cdbfasta -Q $fq1");
77 system("cdbyank -l $fq1.cidx | sort -u > $fq1.lst");
78 }
79 # fq2
80 if (-f $fq2) {
81 mlog("$fq2 already exists, skipping decompression.");
82 }
83 else {
84 mlog("decompressing $gz2 ..");
85 system("gzip -cd $gzdir/$gz2 > $fq2");
86 }
87 unless (-f "$fq2.cidx") {
88 mlog("indexing $fq2..");
89 system("cdbfasta -Q $fq2");
90 system("cdbyank -l $fq2.cidx | sort -u > $fq2.lst");
91 }
92 unless (-s "$fbase.lst" ) {
93 system("sort -o $fbase.lst -m $fq1.lst $fq2.lst");
94 }
95 my $rlen=int(`head -2 $fq1 | tail -1 | wc -L`);
96 my $frgsize=$libd->[1]; #
97 my $innerdist=$frgsize-$rlen*2;
98 mlog("read length: $rlen, fragment len: $frgsize (inner dist=$innerdist)");
99 #die("Just for now, exit\n");
100 my $tophat_dir=$fbase.'_tophat_untrimmed';
101 if (-d $tophat_dir) {
102 mlog("dir $tophat_dir exists, skipping first tophat run.");
103 }
104 else {
105 mkdir($tophat_dir) || die("Error creating directory $tophat_dir ($!)\n");
106 my $tophatcmd="$tcmd -r $innerdist -o $tophat_dir hg19 $fq1 $fq2 >& $tophat_dir/tophat.log";
107 mlog('Starting tophat #1 run on the untrimmed data:', ' '.$tophatcmd);
108 system($tophatcmd);
109 mlog('tophat run #1 done.');
110 }
111 unless (-s "$tophat_dir/mapped_reads.lst") {
112 mlog("generating the list of mapped reads ($tophat_dir/mapped_reads.lst");
113 sysrun("samtools view $tophat_dir/accepted_hits.bam | sam2reads.pl | sort -u > $tophat_dir/mapped_reads.lst");
114 }
115 # - generate the fq file with the unmapped reads
116 unless (-f $fqu1) {
117 mlog("extracting unmapped reads from $fq1");
118 sysrun("comm -13 $tophat_dir/mapped_reads.lst $fq1.lst | cdbyank $fq1.cidx > $fqu1");
119 }
120 unless (-f $fqu2) {
121 mlog("extracting unmapped reads from $fq2");
122 sysrun("comm -13 $tophat_dir/mapped_reads.lst $fq2.lst | cdbyank $fq2.cidx > $fqu2");
123 }
124 #analyse the qv distribution for $fqu1 and $fqu2, find the cutoff point
125 unless (-f $fqustats1) {
126 mlog("getting quality stats for $fqu1");
127 sysrun("fastx_quality_stats -i $fqu1 -o $fqustats1");
128 }
129
130 unless (-f $fqustats2) {
131 mlog("getting quality stats for $fqu2");
132 sysrun("fastx_quality_stats -i $fqu2 -o $fqustats2");
133 }
134
135 my ($q1cut, $q2cut);
136 open(FQ, $fqustats1);
137 while (<FQ>) {
138 my @t=split();
139 next unless $t[0]>5;
140 $q1cut=$t[0];
141 last if $t[5]<10
142 }
143 close(FQ);
144
145 open(FQ, $fqustats2);
146 while (<FQ>) {
147 my @t=split();
148 next unless $t[0]>5;
149 $q2cut=$t[0];
150 last if $t[5]<10
151 }
152 close(FQ);
153
154 my $cutlen= ($q1cut<$q2cut ? $q1cut : $q2cut);
155 die("Error getting cut length ($q1cut, $q2cut)!\n") if $cutlen<20;
156 unless ( -f $fqcut1 ) {
157 mlog("trimming $fqu1 to length $cutlen (into $fqcut1)");
158 sysrun("fastx_trimmer -f 1 -l $cutlen -i $fqu1 -o $fqcut1");
159 }
160 unless ( -f $fqcut2 ) {
161 mlog("trimming $fqu2 to length $cutlen (into $fqcut2)");
162 sysrun("fastx_trimmer -f 1 -l $cutlen -i $fqu2 -o $fqcut2");
163 }
164
165 my $tophat_dir2=$fbase.'_tophat_cut';
166 my $cutinnerdist=$frgsize-$cutlen*2;
167 mlog("cut read length: $cutlen, fragment len: $frgsize (inner dist=$cutinnerdist)");
168
169 if (-d $tophat_dir2) {
170 mlog("dir $tophat_dir2 exists, skipping 2nd tophat run.");
171 }
172 else {
173 mkdir($tophat_dir2) || die("Error creating directory $tophat_dir ($!)\n");
174 my $tophatcmd="$tcmd -r $cutinnerdist -o $tophat_dir2 hg19 $fqcut1 $fqcut2 >& $tophat_dir2/tophat.log";
175 mlog('Starting tophat #2 run on the trimmed data:', ' '.$tophatcmd);
176 system($tophatcmd);
177 mlog("tophat run #2 done (exit code $?)");
178 }
179 #join resulting bam files for cufflinks
180 chdir($tophat_dir) || die("Error at chdir into $tophat_dir!\n");
181 unless (-f 'merged_hits.bam') {
182 mlog("merging mappings into $tophat_dir/merged_hits.bam");
183 sysrun("samtools merge merged_hits.bam accepted_hits.bam ".
184 "../$tophat_dir2/accepted_hits.bam");
185 }
186 unless (-d 'cufflinks_out') {
187 my $idist=int(($innerdist+$cutinnerdist)/2);
188 #my $cufflinks_cmd="cufflinks.new -m $idist -s 34 -p 8 -o cufflinks_out ".
189 # ' merged_hits.bam >& log_cufflinks.log';
190 my $cufflinks_cmd="cufflinks -m $frgsize -s 60 -p 8 -o cufflinks_out ".
191 ' merged_hits.bam >& log_cufflinks.log';
192 mlog("running cufflinks command:\n$cufflinks_cmd");
193 system($cufflinks_cmd);
194 mlog("cufflinks done (exit code $?)");
195 }
196 # --
197 if ($outfile) {
198 select(STDOUT);
199 close(OUTF);
200 }
201 print STDERR "done.\n";
202 #************ Subroutines **************
203
204 sub mlog {
205 foreach my $m (@_) {
206 print STDERR $m."\n";
207 }
208 }
209
210 sub sysrun {
211 my ($cmd)=@_;
212 my $errmsg = `($cmd) 2>&1`;
213 my $exitcode=$?;
214 if ($exitcode || ($errmsg=~/Error|Segmentation|Fail|Invalid|Cannot|Unable/si)) {
215 print STDERR "!Error at:\n$cmd\n";
216 print STDERR " Exit code: $exitcode, message:\n$errmsg\n";
217 #unlink(@todel);
218 exit(1);
219 }
220 }

Properties

Name Value
svn:executable *