#!/usr/bin/env python

# encoding: utf-8
"""
tophat.py

Created by Cole Trapnell on 2008-12-25.
Copyright (c) 2008 Cole Trapnell. All rights reserved.
"""

import sys
try:
    import psyco
    psyco.full()
except ImportError:
    pass

import getopt
import subprocess
import errno
import os
import warnings
import re
import signal
from datetime import datetime, date, time

use_message = '''
TopHat maps short sequences from spliced transcripts to whole genomes.

Usage:
    tophat [options] <bowtie_index> <reads1[,reads2,...]> [reads1[,reads2,...]] \\
           [quals1,[quals2,...]] [quals1[,quals2,...]]

Options:
    -v/--version
    -o/--output-dir              <string>   [ default: ./tophat_out ]
    -a/--min-anchor              <int>      [ default: 8            ]
    -m/--splice-mismatches       <0-2>      [ default: 0            ]
    -i/--min-intron-length       <int>      [ default: 50           ]
    -I/--max-intron-length       <int>      [ default: 500000       ]
    -g/--max-multihits           <int>      [ default: 20           ]
    -F/--min-isoform-fraction    <float>    [ default: 0.15         ]
    --max-insertion-length       <int>      [ default: 3            ]
    --max-deletion-length        <int>      [ default: 3            ]
    --solexa-quals
    --solexa1.3-quals            (same as phred64-quals)
    --phred64-quals              (same as solexa1.3-quals)
    -Q/--quals
    --integer-quals
    -C/--color                   (SOLiD - color space)
    --color-out
    --library-type               <string>   (fr-unstranded, fr-firststrand,
                                             fr-secondstrand)
    -p/--num-threads             <int>      [ default: 1            ]
    -G/--GTF                     <filename>
    -j/--raw-juncs               <filename>
    --insertions                 <filename>
    --deletions                  <filename>
    -r/--mate-inner-dist         <int>
    --mate-std-dev               <int>      [ default: 20           ]
    --no-novel-juncs
    --no-novel-indels
    --no-gtf-juncs
    --no-coverage-search
    --coverage-search
    --no-closure-search
    --closure-search
    --microexon-search
    --butterfly-search
    --no-butterfly-search
    --keep-tmp
    --tmp-dir                    <dirname>  [ default: <output_dir>/tmp ]
    -z/--zpacker                 <program>  [ default: gzip         ]
    -X/--unmapped-fifo           [use mkfifo to compress more temporary
                                  files for color space reads]

Advanced Options:
    --initial-read-mismatches    <int>      [ default: 2            ]
    --segment-mismatches         <int>      [ default: 2            ]
    --segment-length             <int>      [ default: 25           ]
    --bowtie-n                   [ default: bowtie -v    ]
    --min-closure-exon           <int>      [ default: 100          ]
    --min-closure-intron         <int>      [ default: 50           ]
    --max-closure-intron         <int>      [ default: 5000         ]
    --min-coverage-intron        <int>      [ default: 50           ]
    --max-coverage-intron        <int>      [ default: 20000        ]
    --min-segment-intron         <int>      [ default: 50           ]
    --max-segment-intron         <int>      [ default: 500000       ]
    --no-sort-bam                [Output BAM is not coordinate-sorted]
    --no-convert-bam             [Do not convert to BAM format.
                                  Output is <output_dir>/accepted_hits.sam.
                                  Implies --no-sort-bam.]

SAM Header Options (for embedding sequencing run metadata in output):
    --rg-id                      <string>   (read group ID)
    --rg-sample                  <string>   (sample ID)
    --rg-library                 <string>   (library ID)
    --rg-description             <string>   (descriptive string, no tabs allowed)
    --rg-platform-unit           <string>   (e.g. Illumina lane ID)
    --rg-center                  <string>   (sequencing center name)
    --rg-date                    <string>   (ISO 8601 date of the sequencing run)
    --rg-platform                <string>   (sequencing platform descriptor)
'''
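# For illustration only -- a hypothetical paired-end invocation (index prefix
# and read file names are made up):
#   tophat -p 4 -o my_out -r 80 --mate-std-dev 40 hg19_idx reads_1.fq reads_2.fq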

class Usage(Exception):
    def __init__(self, msg):
        self.msg = msg

output_dir = "./tophat_out/"
logging_dir = output_dir + "logs/"
run_log = None
run_cmd = None
tmp_dir = output_dir + "tmp/"
bin_dir = sys.path[0] + "/"
use_zpacker = False # this is set by -z/--zpacker option (-z0 leaves it False)

use_BAM_Unmapped = False # automatically set to True for non-SOLiD reads, handles unmapped reads in BAM format

use_BWT_FIFO = False # can only be set to True if use_zpacker is True and only with -C/--color
                     # enabled by -X/--unmapped-fifo option (unless -z0)
unmapped_reads_fifo = None # if use_BWT_FIFO is True, this is to trick bowtie into writing the
                           # unmapped reads into a compressed file

samtools_path = None
bowtie_path = None
fail_str = "\t[FAILED]\n"

sam_header = tmp_dir + "stub_header.sam"

# TopHatParams captures all of the runtime parameters used by TopHat, and many
# of these are passed as command line options to executables run by the pipeline

# This class and its nested classes also do options parsing through parse_options()
# and option validation via the member function check()

class TopHatParams:

    # SpliceConstraints is a group of runtime parameters that specify what
    # constraints to put on junctions discovered by the program. These constraints
    # are used to filter out spurious/false positive junctions.

    class SpliceConstraints:
        def __init__(self,
                     min_anchor_length,
                     min_intron_length,
                     max_intron_length,
                     splice_mismatches,
                     min_isoform_fraction):
            self.min_anchor_length = min_anchor_length
            self.min_intron_length = min_intron_length
            self.max_intron_length = max_intron_length
            self.splice_mismatches = splice_mismatches
            self.min_isoform_fraction = min_isoform_fraction

        def parse_options(self, opts):
            for option, value in opts:
                if option in ("-m", "--splice-mismatches"):
                    self.splice_mismatches = int(value)
                # getopt registers the long form as --min-anchor-length,
                # so accept it alongside the short usage spelling
                elif option in ("-a", "--min-anchor", "--min-anchor-length"):
                    self.min_anchor_length = int(value)
                elif option in ("-F", "--min-isoform-fraction"):
                    self.min_isoform_fraction = float(value)
                elif option in ("-i", "--min-intron-length"):
                    self.min_intron_length = int(value)
                elif option in ("-I", "--max-intron-length"):
                    self.max_intron_length = int(value)

        def check(self):
            if self.splice_mismatches not in [0,1,2]:
                die("Error: arg to --splice-mismatches must be 0, 1, or 2")
            if self.min_anchor_length < 4:
                die("Error: arg to --min-anchor must be at least 4")
            if self.min_isoform_fraction < 0.0 or self.min_isoform_fraction > 1.0:
                die("Error: arg to --min-isoform-fraction must be between 0.0 and 1.0")
            if self.min_intron_length <= 0:
                die("Error: arg to --min-intron-length must be greater than 0")
            if self.max_intron_length <= 0:
                die("Error: arg to --max-intron-length must be greater than 0")

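    # Example (illustrative): with the defaults above (-a 8, -m 0, -i 50,
    # -I 500000), a read must span a candidate junction by at least 8 bases
    # on each side, and the implied intron must be between 50 and 500000bp.
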
    # SystemParams is a group of runtime parameters that determine how to handle
    # temporary files produced during a run and how many threads to use for threaded
    # stages of the pipeline (e.g. Bowtie)

    class SystemParams:
        def __init__(self,
                     num_cpus,
                     keep_tmp):
            self.num_cpus = num_cpus
            self.keep_tmp = keep_tmp
            self.zipper = "gzip"
            self.zipper_opts = []

        def parse_options(self, opts):
            global use_zpacker
            global use_BWT_FIFO
            for option, value in opts:
                if option in ("-p", "--num-threads"):
                    self.num_cpus = int(value)
                elif option == "--keep-tmp":
                    self.keep_tmp = True
                elif option in ("-z", "--zpacker"):
                    if value.lower() in ["-", " ", ".", "0", "none", "f", "false", "no"]:
                        value = ""
                    self.zipper = value
                    #if not self.zipper:
                    #    self.zipper='gzip'
                elif option in ("-X", "--unmapped-fifo"):
                    use_BWT_FIFO = True
            if self.zipper:
                use_zpacker = True
                if self.num_cpus > 1 and not self.zipper_opts:
                    if self.zipper.endswith('pbzip2') or self.zipper.endswith('pigz'):
                        self.zipper_opts.append('-p'+str(self.num_cpus))
            else:
                use_zpacker = False
                if use_BWT_FIFO: use_BWT_FIFO = False

        def cmd(self):
            cmdline = []
            if self.zipper:
                cmdline.extend(['-z', self.zipper])
            if self.num_cpus > 1:
                cmdline.extend(['-p'+str(self.num_cpus)])
            return cmdline

        def check(self):
            if self.num_cpus < 1:
                die("Error: arg to --num-threads must be greater than 0")
            if self.zipper:
                xzip = which(self.zipper)
                if not xzip:
                    die("Error: cannot find compression program "+self.zipper)

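    # Illustration (not executed): "-z pigz -p 8" leaves zipper='pigz' and,
    # since pigz is multithreaded, parse_options appends '-p8' to zipper_opts;
    # cmd() would then report ['-z', 'pigz', '-p8'] for the pipeline's helper
    # programs.
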
    # ReadParams is a group of runtime parameters that specify various properties
    # of the user's reads (e.g. which quality scale they are on, how long the
    # fragments are, etc).
    class ReadParams:
        def __init__(self,
                     solexa_quals,
                     phred64_quals,
                     quals,
                     integer_quals,
                     color,
                     color_out,
                     library_type,
                     seed_length,
                     reads_format,
                     mate_inner_dist,
                     mate_inner_dist_std_dev,
                     read_group_id,
                     sample_id,
                     library_id,
                     description,
                     seq_platform_unit,
                     seq_center,
                     seq_run_date,
                     seq_platform):
            self.solexa_quals = solexa_quals
            self.phred64_quals = phred64_quals
            self.quals = quals
            self.integer_quals = integer_quals
            self.color = color
            self.color_out = color_out
            self.library_type = library_type
            self.seed_length = seed_length
            self.reads_format = reads_format
            self.mate_inner_dist = mate_inner_dist
            self.mate_inner_dist_std_dev = mate_inner_dist_std_dev
            self.read_group_id = read_group_id
            self.sample_id = sample_id
            self.library_id = library_id
            self.description = description
            self.seq_platform_unit = seq_platform_unit
            self.seq_center = seq_center
            self.seq_run_date = seq_run_date
            self.seq_platform = seq_platform

        def parse_options(self, opts):
            for option, value in opts:
                if option == "--solexa-quals":
                    self.solexa_quals = True
                elif option in ("--solexa1.3-quals", "--phred64-quals"):
                    self.phred64_quals = True
                elif option in ("-Q", "--quals"):
                    self.quals = True
                elif option == "--integer-quals":
                    self.integer_quals = True
                elif option in ("-C", "--color"):
                    self.color = True
                elif option == "--color-out":
                    self.color_out = True
                elif option == "--library-type":
                    self.library_type = value
                elif option in ("-s", "--seed-length"):
                    self.seed_length = int(value)
                elif option in ("-r", "--mate-inner-dist"):
                    self.mate_inner_dist = int(value)
                elif option == "--mate-std-dev":
                    self.mate_inner_dist_std_dev = int(value)
                elif option == "--rg-id":
                    self.read_group_id = value
                elif option == "--rg-sample":
                    self.sample_id = value
                elif option == "--rg-library":
                    self.library_id = value
                elif option == "--rg-description":
                    self.description = value
                elif option == "--rg-platform-unit":
                    self.seq_platform_unit = value
                elif option == "--rg-center":
                    self.seq_center = value
                elif option == "--rg-date":
                    self.seq_run_date = value
                elif option == "--rg-platform":
                    self.seq_platform = value

        def check(self):
            if self.seed_length != None and self.seed_length < 20:
                die("Error: arg to --seed-length must be at least 20")
            if self.mate_inner_dist_std_dev != None and self.mate_inner_dist_std_dev < 0:
                die("Error: arg to --mate-std-dev must be at least 0")
            if (not self.read_group_id and self.sample_id) or (self.read_group_id and not self.sample_id):
                die("Error: --rg-id and --rg-sample must be specified or omitted together")

    # SearchParams is a group of runtime parameters that specify how TopHat will
    # search for splice junctions

    class SearchParams:
        def __init__(self,
                     min_closure_exon,
                     min_closure_intron,
                     max_closure_intron,
                     min_coverage_intron,
                     max_coverage_intron,
                     min_segment_intron,
                     max_segment_intron):

            self.min_closure_exon_length = min_closure_exon
            self.min_closure_intron_length = min_closure_intron
            self.max_closure_intron_length = max_closure_intron
            self.min_coverage_intron_length = min_coverage_intron
            self.max_coverage_intron_length = max_coverage_intron
            self.min_segment_intron_length = min_segment_intron
            self.max_segment_intron_length = max_segment_intron

        def parse_options(self, opts):
            for option, value in opts:
                if option == "--min-closure-exon":
                    self.min_closure_exon_length = int(value)
                if option == "--min-closure-intron":
                    self.min_closure_intron_length = int(value)
                if option == "--max-closure-intron":
                    self.max_closure_intron_length = int(value)
                if option == "--min-coverage-intron":
                    self.min_coverage_intron_length = int(value)
                if option == "--max-coverage-intron":
                    self.max_coverage_intron_length = int(value)
                if option == "--min-segment-intron":
                    self.min_segment_intron_length = int(value)
                if option == "--max-segment-intron":
                    self.max_segment_intron_length = int(value)

        def check(self):
            # the error messages below now match the actual tests (negative
            # values are rejected)
            if self.min_closure_exon_length < 0:
                die("Error: arg to --min-closure-exon must be at least 0")
            if self.min_closure_intron_length < 0:
                die("Error: arg to --min-closure-intron must be at least 0")
            if self.max_closure_intron_length < 0:
                die("Error: arg to --max-closure-intron must be at least 0")
            if self.min_coverage_intron_length < 0:
                die("Error: arg to --min-coverage-intron must be at least 0")
            if self.max_coverage_intron_length < 0:
                die("Error: arg to --max-coverage-intron must be at least 0")
            if self.min_segment_intron_length < 0:
                die("Error: arg to --min-segment-intron must be at least 0")
            if self.max_segment_intron_length < 0:
                die("Error: arg to --max-segment-intron must be at least 0")

    class ReportParams:
        def __init__(self):
            self.sort_bam = True
            self.convert_bam = True

        def parse_options(self, opts):
            for option, value in opts:
                if option == "--no-sort-bam":
                    self.sort_bam = False
                if option == "--no-convert-bam":
                    self.convert_bam = False
                    self.sort_bam = False


    def __init__(self):
        self.splice_constraints = self.SpliceConstraints(8,       # min_anchor
                                                         50,      # min_intron
                                                         500000,  # max_intron
                                                         0,       # splice_mismatches
                                                         0.15)    # min_isoform_frac

        self.read_params = self.ReadParams(False,   # solexa_scale
                                           False,   # phred64_quals
                                           False,   # quals
                                           None,    # integer quals
                                           False,   # SOLiD - color space
                                           False,   # SOLiD - color out instead of base pair,
                                           "",      # library type (e.g. "illumina-stranded-pair-end")
                                           None,    # seed_length
                                           "fastq", # quality_format
                                           None,    # mate inner distance
                                           20,      # mate inner dist std dev
                                           None,    # read group id
                                           None,    # sample id
                                           None,    # library id
                                           None,    # description
                                           None,    # platform unit (i.e. lane)
                                           None,    # sequencing center
                                           None,    # run date
                                           None)    # sequencing platform

        self.system_params = self.SystemParams(1,      # bowtie_threads (num_cpus)
                                               False)  # keep_tmp

        self.search_params = self.SearchParams(100,     # min_closure_exon_length
                                               50,      # min_closure_intron_length
                                               5000,    # max_closure_intron_length
                                               50,      # min_coverage_intron_length
                                               20000,   # max_coverage_intron_length
                                               50,      # min_segment_intron_length
                                               500000)  # max_segment_intron_length

        self.report_params = self.ReportParams()
        self.gff_annotation = None
        self.raw_junctions = None
        self.find_novel_juncs = True
        self.find_novel_indels = True
        self.find_GFF_juncs = True
        self.skip_check_reads = False
        self.max_hits = 20
        self.initial_read_mismatches = 2
        self.segment_length = 25
        self.segment_mismatches = 2
        self.bowtie_alignment_option = "-v"
        self.max_insertion_length = 3
        self.max_deletion_length = 3
        self.raw_insertions = None
        self.raw_deletions = None
        self.closure_search = None
        self.coverage_search = None
        self.microexon_search = False
        self.butterfly_search = None

    def check(self):
        self.splice_constraints.check()
        self.read_params.check()
        self.system_params.check()

        if self.segment_length <= 4:
            die("Error: arg to --segment-length must be greater than 4")
        if self.segment_mismatches < 0 or self.segment_mismatches > 3:
            die("Error: arg to --segment-mismatches must be in [0, 3]")

        if self.read_params.color and self.butterfly_search:
            die("Error: butterfly-search in colorspace is not yet supported")

        library_types = ["fr-unstranded", "fr-firststrand", "fr-secondstrand"]

        if self.read_params.library_type and self.read_params.library_type not in library_types:
            die("Error: library-type should be one of: "+', '.join(library_types))

        self.search_params.max_closure_intron_length = min(self.splice_constraints.max_intron_length,
                                                           self.search_params.max_closure_intron_length)

        self.search_params.max_segment_intron_length = min(self.splice_constraints.max_intron_length,
                                                           self.search_params.max_segment_intron_length)

        self.search_params.max_coverage_intron_length = min(self.splice_constraints.max_intron_length,
                                                            self.search_params.max_coverage_intron_length)

        # note: these values are ints, so they must go through str() before
        # being concatenated into the error messages
        if self.max_insertion_length >= self.segment_length:
            die("Error: the max insertion length ("+str(self.max_insertion_length)+") can not be equal to or greater than the segment length ("+str(self.segment_length)+")")

        if self.max_insertion_length < 0:
            die("Error: the max insertion length ("+str(self.max_insertion_length)+") can not be less than 0")

        if self.max_deletion_length >= self.splice_constraints.min_intron_length:
            die("Error: the max deletion length ("+str(self.max_deletion_length)+") can not be equal to or greater than the min intron length ("+str(self.splice_constraints.min_intron_length)+")")

        if self.max_deletion_length < 0:
            die("Error: the max deletion length ("+str(self.max_deletion_length)+") can not be less than 0")

    def cmd(self):
        cmd = ["--min-anchor", str(self.splice_constraints.min_anchor_length),
               "--splice-mismatches", str(self.splice_constraints.splice_mismatches),
               "--min-report-intron", str(self.splice_constraints.min_intron_length),
               "--max-report-intron", str(self.splice_constraints.max_intron_length),
               "--min-isoform-fraction", str(self.splice_constraints.min_isoform_fraction),
               "--output-dir", output_dir,
               "--max-multihits", str(self.max_hits),
               "--segment-length", str(self.segment_length),
               "--segment-mismatches", str(self.segment_mismatches),
               "--min-closure-exon", str(self.search_params.min_closure_exon_length),
               "--min-closure-intron", str(self.search_params.min_closure_intron_length),
               "--max-closure-intron", str(self.search_params.max_closure_intron_length),
               "--min-coverage-intron", str(self.search_params.min_coverage_intron_length),
               "--max-coverage-intron", str(self.search_params.max_coverage_intron_length),
               "--min-segment-intron", str(self.search_params.min_segment_intron_length),
               "--max-segment-intron", str(self.search_params.max_segment_intron_length),
               "--sam-header", sam_header,
               "--max-insertion-length", str(self.max_insertion_length),
               "--max-deletion-length", str(self.max_deletion_length)]

        cmd.extend(self.system_params.cmd())

        if self.read_params.mate_inner_dist != None:
            cmd.extend(["--inner-dist-mean", str(self.read_params.mate_inner_dist),
                        "--inner-dist-std-dev", str(self.read_params.mate_inner_dist_std_dev)])
        if self.gff_annotation != None:
            cmd.extend(["--gtf-annotations", str(self.gff_annotation)])
        if not self.closure_search:
            cmd.append("--no-closure-search")
        if not self.coverage_search:
            cmd.append("--no-coverage-search")
        if not self.microexon_search:
            cmd.append("--no-microexon-search")
        if self.butterfly_search:
            cmd.append("--butterfly-search")
        if self.read_params.solexa_quals:
            cmd.append("--solexa-quals")
        if self.read_params.quals:
            cmd.append("--quals")
        if self.read_params.integer_quals:
            cmd.append("--integer-quals")
        if self.read_params.color:
            cmd.append("--color")
        if self.read_params.color_out:
            cmd.append("--color-out")
        if self.read_params.library_type:
            cmd.extend(["--library-type", self.read_params.library_type])
        if self.read_params.read_group_id:
            cmd.extend(["--rg-id", self.read_params.read_group_id])
        if self.read_params.phred64_quals:
            cmd.append("--phred64-quals")
        return cmd

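    # Illustration (not executed): with the defaults set in __init__, cmd()
    # begins ['--min-anchor', '8', '--splice-mismatches', '0',
    # '--min-report-intron', '50', '--max-report-intron', '500000', ...],
    # followed by whichever optional flags are enabled.
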
    # This is the master options parsing routine, which calls parse_options for
    # the delegate classes (e.g. SpliceConstraints) that handle certain groups
    # of options.
    def parse_options(self, argv):
        try:
            opts, args = getopt.getopt(argv[1:], "hvp:m:F:a:i:I:G:r:o:j:Xz:s:g:QC",
                                       ["version",
                                        "help",
                                        "output-dir=",
                                        "solexa-quals",
                                        "solexa1.3-quals",
                                        "phred64-quals",
                                        "quals",
                                        "integer-quals",
                                        "color",
                                        "color-out",
                                        "library-type=",
                                        "num-threads=",
                                        "splice-mismatches=",
                                        "max-multihits=",
                                        "min-isoform-fraction=",
                                        "min-anchor-length=",
                                        "min-intron-length=",
                                        "max-intron-length=",
                                        "GTF=",
                                        "raw-juncs=",
                                        "no-novel-juncs",
                                        "no-novel-indels",
                                        "no-gtf-juncs",
                                        "skip-check-reads",
                                        "mate-inner-dist=",
                                        "mate-std-dev=",
                                        "no-closure-search",
                                        "no-coverage-search",
                                        "closure-search",
                                        "coverage-search",
                                        "microexon-search",
                                        "min-closure-exon=",
                                        "min-closure-intron=",
                                        "max-closure-intron=",
                                        "min-coverage-intron=",
                                        "max-coverage-intron=",
                                        "min-segment-intron=",
                                        "max-segment-intron=",
                                        "seed-length=",
                                        "initial-read-mismatches=",
                                        "segment-length=",
                                        "segment-mismatches=",
                                        "bowtie-n",
                                        "butterfly-search",
                                        "no-butterfly-search",
                                        "keep-tmp",
                                        "rg-id=",
                                        "rg-sample=",
                                        "rg-library=",
                                        "rg-description=",
                                        "rg-platform-unit=",
                                        "rg-center=",
                                        "rg-date=",
                                        "rg-platform=",
                                        "tmp-dir=",
                                        "zpacker=",
                                        "unmapped-fifo",
                                        "max-insertion-length=",
                                        "max-deletion-length=",
                                        "insertions=",
                                        "deletions=",
                                        "no-sort-bam",
                                        "no-convert-bam"])
        except getopt.error, msg:
            raise Usage(msg)

        self.splice_constraints.parse_options(opts)
        self.system_params.parse_options(opts)
        self.read_params.parse_options(opts)
        self.search_params.parse_options(opts)
        self.report_params.parse_options(opts)
        global use_BWT_FIFO
        global use_BAM_Unmapped
        if not self.read_params.color:
            use_BWT_FIFO = False
            use_BAM_Unmapped = True
        global output_dir
        global logging_dir
        global tmp_dir
        global sam_header

        custom_tmp_dir = None
        custom_out_dir = None
        # option processing
        for option, value in opts:
            if option in ("-v", "--version"):
                print "TopHat v%s" % (get_version())
                sys.exit(0)
            if option in ("-h", "--help"):
                raise Usage(use_message)
            if option in ("-g", "--max-multihits"):
                self.max_hits = int(value)
            if option in ("-G", "--GTF"):
                self.gff_annotation = value
            if option in ("-j", "--raw-juncs"):
                self.raw_junctions = value
            if option == "--no-novel-juncs":
                self.find_novel_juncs = False
            if option == "--no-novel-indels":
                self.find_novel_indels = False
            if option == "--no-gtf-juncs":
                self.find_GFF_juncs = False
            if option == "--skip-check-reads":
                self.skip_check_reads = True
            if option == "--no-coverage-search":
                self.coverage_search = False
            if option == "--no-closure-search":
                self.closure_search = False
            if option == "--coverage-search":
                self.coverage_search = True
            if option == "--closure-search":
                self.closure_search = True
            if option == "--microexon-search":
                self.microexon_search = True
            if option == "--butterfly-search":
                self.butterfly_search = True
            if option == "--no-butterfly-search":
                self.butterfly_search = False
            if option == "--initial-read-mismatches":
                self.initial_read_mismatches = int(value)
            if option == "--segment-length":
                self.segment_length = int(value)
            if option == "--segment-mismatches":
                self.segment_mismatches = int(value)
            if option == "--bowtie-n":
                self.bowtie_alignment_option = "-n"
            if option == "--max-insertion-length":
                self.max_insertion_length = int(value)
            if option == "--max-deletion-length":
                self.max_deletion_length = int(value)
            if option == "--insertions":
                self.raw_insertions = value
            if option == "--deletions":
                self.raw_deletions = value
            if option in ("-o", "--output-dir"):
                custom_out_dir = value + "/"
            if option == "--tmp-dir":
                custom_tmp_dir = value + "/"

        if custom_out_dir != None:
            output_dir = custom_out_dir
            logging_dir = output_dir + "logs/"
            tmp_dir = output_dir + "tmp/"
            sam_header = tmp_dir + "stub_header.sam"
        if custom_tmp_dir != None:
            tmp_dir = custom_tmp_dir
            sam_header = tmp_dir + "stub_header.sam"
        if len(args) < 2:
            raise Usage(use_message)
        return args


def getFileDir(filepath):
    # if filepath includes a directory, return that directory with a trailing /
    fpath, fname = os.path.split(filepath)
    if fpath: fpath += '/'
    return fpath

def getFileBaseName(filepath):
    fpath, fname = os.path.split(filepath)
    fbase, fext = os.path.splitext(fname)
    fx = fext.lower()
    if (fx in ['.fq','.txt','.seq','.bwtout'] or fx.find('.fa') == 0) and len(fbase) > 0:
        return fbase
    elif fx == '.z' or fx.find('.gz') == 0 or fx.find('.bz') == 0:
        fb, fext = os.path.splitext(fbase)
        fx = fext.lower()
        if (fx in ['.fq','.txt','.seq','.bwtout'] or fx.find('.fa') == 0) and len(fb) > 0:
            return fb
        else:
            return fbase
    else:
        if len(fbase) > len(fext):
            return fbase
        else:
            return fname

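# For instance (illustrative): getFileBaseName("reads_1.fq.gz") strips the
# compression suffix and then the read-file suffix, returning "reads_1";
# "sample.fastq" matches the ".fa*" prefix test directly and yields "sample".
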
# Returns the current time in a nice format
def right_now():
    curr_time = datetime.now()
    return curr_time.strftime("%c")

# Ensures that the output, logging, and temp directories are present. If not,
# they are created
def prepare_output_dir():

    print >> sys.stderr, "[%s] Preparing output location %s" % (right_now(), output_dir)
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    if not os.path.exists(logging_dir):
        os.mkdir(logging_dir)

    if not os.path.exists(tmp_dir):
        try:
            os.makedirs(tmp_dir)
        except OSError, o:
            die("\nError creating directory %s (%s)" % (tmp_dir, o))


# to be added as preexec_fn for every subprocess.Popen() call:
# see http://bugs.python.org/issue1652
def subprocess_setup():
    # Python installs a SIGPIPE handler by default, which causes
    # gzip or other de/compression pipes to complain about "stdout: Broken pipe"
    signal.signal(signal.SIGPIPE, signal.SIG_DFL)

# Check that the Bowtie index specified by the user is present and all files
# are there.
def check_bowtie_index(idx_prefix):
    print >> sys.stderr, "[%s] Checking for Bowtie index files" % right_now()

    idx_fwd_1 = idx_prefix + ".1.ebwt"
    idx_fwd_2 = idx_prefix + ".2.ebwt"
    idx_rev_1 = idx_prefix + ".rev.1.ebwt"
    idx_rev_2 = idx_prefix + ".rev.2.ebwt"

    if os.path.exists(idx_fwd_1) and \
       os.path.exists(idx_fwd_2) and \
       os.path.exists(idx_rev_1) and \
       os.path.exists(idx_rev_2):
        return
    else:
        bowtie_idx_env_var = os.environ.get("BOWTIE_INDEXES")
        if bowtie_idx_env_var == None:
            die("Error: Could not find Bowtie index files " + idx_prefix + ".*")
        idx_prefix = bowtie_idx_env_var + idx_prefix
        idx_fwd_1 = idx_prefix + ".1.ebwt"
        idx_fwd_2 = idx_prefix + ".2.ebwt"
        idx_rev_1 = idx_prefix + ".rev.1.ebwt"
        idx_rev_2 = idx_prefix + ".rev.2.ebwt"

        if os.path.exists(idx_fwd_1) and \
           os.path.exists(idx_fwd_2) and \
           os.path.exists(idx_rev_1) and \
           os.path.exists(idx_rev_2):
            return
        else:
            die("Error: Could not find Bowtie index files " + idx_prefix + ".*")

# Reconstructs the multifasta file from which the Bowtie index was created, if
# it's not already there.
def bowtie_idx_to_fa(idx_prefix):
    idx_name = idx_prefix.split('/')[-1]
    print >> sys.stderr, "[%s] Reconstituting reference FASTA file from Bowtie index" % (right_now())

    try:
        tmp_fasta_file_name = tmp_dir + idx_name + ".fa"
        tmp_fasta_file = open(tmp_fasta_file_name, "w")

        inspect_log = open(logging_dir + "bowtie_inspect_recons.log", "w")

        inspect_cmd = [prog_path("bowtie-inspect"),
                       idx_prefix]

        print >> sys.stderr, "Executing: " + " ".join(inspect_cmd) + " > " + tmp_fasta_file_name
        ret = subprocess.call(inspect_cmd,
                              stdout=tmp_fasta_file,
                              stderr=inspect_log)

        # Bowtie reported an error
        if ret != 0:
            die(fail_str+"Error: bowtie-inspect returned an error")

    # Bowtie not found
    except OSError, o:
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            die(fail_str+"Error: bowtie-inspect not found on this system. Did you forget to include it in your PATH?")

    return tmp_fasta_file_name

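# The call above amounts to running, at the shell (paths illustrative):
#   bowtie-inspect <idx_prefix> > <tmp_dir>/<idx_name>.fa 2> logs/bowtie_inspect_recons.log
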
# Checks whether the multifasta file for the genome is present alongside the
# Bowtie index files for it.
def check_fasta(idx_prefix):
    print >> sys.stderr, "[%s] Checking for reference FASTA file" % right_now()
    idx_fasta = idx_prefix + ".fa"
    if os.path.exists(idx_fasta):
        return idx_fasta
    else:
        bowtie_idx_env_var = os.environ.get("BOWTIE_INDEXES")
        if bowtie_idx_env_var != None:
            idx_fasta = bowtie_idx_env_var + idx_prefix + ".fa"
            if os.path.exists(idx_fasta):
                return idx_fasta

        print >> sys.stderr, "\tWarning: Could not find FASTA file " + idx_fasta
        idx_fa = bowtie_idx_to_fa(idx_prefix)
        return idx_fa

# Check that both the Bowtie index and the genome's fasta file are present
def check_index(idx_prefix):
    check_bowtie_index(idx_prefix)
    ref_fasta_file = check_fasta(idx_prefix)

    return (ref_fasta_file, None)

# Retrieve a tuple containing the system's version of Bowtie. Parsed from
# `bowtie --version`
def get_bowtie_version():
    try:
        # Launch Bowtie to capture its version info
        proc = subprocess.Popen([bowtie_path, "--version"],
                                stdout=subprocess.PIPE)
        stdout_value = proc.communicate()[0]
        bowtie_version = None
        bowtie_out = repr(stdout_value)

        # Find the version identifier
        version_str = "bowtie version "
        ver_str_idx = bowtie_out.find(version_str)
        if ver_str_idx != -1:
            nl = bowtie_out.find("\\n", ver_str_idx)
            version_val = bowtie_out[ver_str_idx + len(version_str):nl]
            bowtie_version = [int(x) for x in version_val.split('.')]
            if len(bowtie_version) == 3:
                bowtie_version.append(0)

        return bowtie_version
    except OSError, o:
        errmsg = fail_str+str(o)+"\n"
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            errmsg += "Error: bowtie not found on this system"
        die(errmsg)

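# Illustration (not executed): if `bowtie --version` prints a first line like
# "bowtie version 0.12.7", the parser above yields [0, 12, 7, 0] (a fourth
# component is appended so three-part versions compare uniformly).
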
def get_index_sam_header(read_params, idx_prefix):
    try:
        bowtie_sam_header_filename = tmp_dir+idx_prefix.split('/')[-1]+".bwt.samheader.sam"
        bowtie_sam_header_file = open(bowtie_sam_header_filename, "w")

        bowtie_header_cmd = [bowtie_path, "--sam"]
        if read_params.color:
            bowtie_header_cmd.append('-C')
        bowtie_header_cmd.extend([idx_prefix, '/dev/null'])
        subprocess.call(bowtie_header_cmd,
                        stdout=bowtie_sam_header_file,
                        stderr=open('/dev/null', 'w')) # stderr must be opened for writing

        bowtie_sam_header_file.close()
        bowtie_sam_header_file = open(bowtie_sam_header_filename, "r")

        sam_header_file = open(sam_header, "w")

        preamble = []
        sq_dict_lines = []

        for line in bowtie_sam_header_file.readlines():
            line = line.strip()
            if line.find("@SQ") != -1:
                # Sequence dictionary record
                cols = line.split('\t')
                seq_name = None
                for col in cols:
                    fields = col.split(':')
                    #print fields
                    if len(fields) > 0 and fields[0] == "SN":
                        seq_name = fields[1]
                if seq_name == None:
                    die("Error: malformed sequence dictionary in sam header")
                sq_dict_lines.append([seq_name, line])
            elif line.find("CL") != -1:
                # skip the header line carrying bowtie's command line (CL);
                # note the original test `line.find("CL")` was truthy for any
                # line not starting with "CL"
                continue
            else:
                preamble.append(line)
        #for line in preamble:
        #    print >> sam_header_file, line
        print >> sam_header_file, "@HD\tVN:1.0\tSO:coordinate"

        if read_params.read_group_id and read_params.sample_id:
            rg_str = "@RG\tID:%s\tSM:%s" % (read_params.read_group_id,
                                            read_params.sample_id)
            if read_params.library_id:
                rg_str += "\tLB:%s" % read_params.library_id
            if read_params.description:
                rg_str += "\tDS:%s" % read_params.description
            if read_params.seq_platform_unit:
                rg_str += "\tPU:%s" % read_params.seq_platform_unit
            if read_params.seq_center:
                rg_str += "\tCN:%s" % read_params.seq_center
            if read_params.mate_inner_dist:
                rg_str += "\tPI:%s" % read_params.mate_inner_dist
            if read_params.seq_run_date:
                rg_str += "\tDT:%s" % read_params.seq_run_date
            if read_params.seq_platform:
                rg_str += "\tPL:%s" % read_params.seq_platform

            print >> sam_header_file, rg_str

        sq_dict_lines.sort(lambda x,y: cmp(x[0],y[0]))
        for [name, line] in sq_dict_lines:
            print >> sam_header_file, line
        print >> sam_header_file, "@PG\tID:TopHat\tVN:%s\tCL:%s" % (get_version(), run_cmd)

        sam_header_file.close()

        return sam_header

    except OSError, o:
        errmsg = fail_str+str(o)+"\n"
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            errmsg += "Error: bowtie not found on this system"
        die(errmsg)

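# Illustration (not executed) of the stub header written above; the names and
# lengths are made up, and fields are tab-separated in the real file:
#   @HD    VN:1.0    SO:coordinate
#   @RG    ID:run1    SM:liverA    PL:ILLUMINA
#   @SQ    SN:chr1    LN:247249719
#   @PG    ID:TopHat    VN:1.x    CL:tophat ...
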
# Make sure Bowtie is installed and is recent enough to be useful
def check_bowtie():
    print >> sys.stderr, "[%s] Checking for Bowtie" % right_now()
    global bowtie_path
    bowtie_path = prog_path("bowtie")
    bowtie_version = get_bowtie_version()
    if bowtie_version == None:
        die("Error: Bowtie not found on this system")
    # compare version components as a sequence; the old test
    # (v[0] < 1 and v[1] < 12 and v[2] < 3) wrongly accepted versions like 0.11.9
    elif bowtie_version[:3] < [0, 12, 3]:
        die("Error: TopHat requires Bowtie 0.12.3 or later")
    print >> sys.stderr, "\tBowtie version:\t\t\t %s" % ".".join([str(x) for x in bowtie_version])


# Retrieve a tuple containing the system's version of samtools. Parsed from
# the usage message samtools prints when run with no arguments
def get_samtools_version():
    try:
        # Launch samtools to capture its version info
        proc = subprocess.Popen(samtools_path, stderr=subprocess.PIPE)
        samtools_out = proc.communicate()[1]

        # Find the version identifier
        version_match = re.search(r'Version:\s+(\d+)\.(\d+).(\d+)([a-zA-Z]?)', samtools_out)
        samtools_version_arr = [int(version_match.group(x)) for x in [1,2,3]]
        if version_match.group(4):
            samtools_version_arr.append(version_match.group(4))
        else:
            samtools_version_arr.append(0)

        return version_match.group(), samtools_version_arr
    except OSError, o:
        errmsg = fail_str+str(o)+"\n"
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            errmsg += "Error: samtools not found on this system"
        die(errmsg)

# Make sure the SAM tools are installed and are recent enough to be useful
def check_samtools():
    print >> sys.stderr, "[%s] Checking for Samtools" % right_now()
    global samtools_path
    samtools_path = prog_path("samtools")
    samtools_version_str, samtools_version_arr = get_samtools_version()
    if samtools_version_str == None:
        die("Error: Samtools not found on this system")
    # compare version components as a sequence; the old test
    # (arr[1] < 1 or arr[2] < 7) wrongly rejected versions such as 0.2.0
    elif samtools_version_arr[:3] < [0, 1, 7]:
        die("Error: TopHat requires Samtools 0.1.7 or later")
    print >> sys.stderr, "\tSamtools %s" % samtools_version_str


class FastxReader:
    def __init__(self, i_file, is_color=0, fname=''):
        self.bufline = None
        self.format = None
        self.ifile = i_file
        self.nextRecord = None
        self.eof = None
        self.fname = fname
        self.lastline = None
        self.numrecords = 0
        self.isColor = 0
        if is_color: self.isColor = 1
        # determine file type
        # no records processed yet, skip custom header lines if any
        hlines = 10 # allow maximum 10 header lines
        self.lastline = " "
        # (guard: readline() returns "" at EOF, so stop rather than index into it)
        while hlines > 0 and self.lastline and self.lastline[0] not in "@>":
            self.lastline = self.ifile.readline()
            hlines -= 1
        if self.lastline and self.lastline[0] == '@':
            self.format = 'fastq'
            self.nextRecord = self.nextFastq
        elif self.lastline and self.lastline[0] == '>':
            self.format = 'fasta'
            self.nextRecord = self.nextFasta
        else:
            die("Error: cannot determine record type in input file %s" % fname)
        self.bufline = self.lastline
        self.lastline = None

    def nextFastq(self):
        # returning tuple: (seqID, sequence_string, seq_len, qv_string)
        seqid, seqstr, qstr, seq_len = '', '', '', 0
        if self.eof: return (seqid, seqstr, seq_len, qstr)
        fline = self.getLine # shortcut to save a bit of time
        line = fline()

        if not line: return (seqid, seqstr, seq_len, qstr)
        while len(line.rstrip()) == 0: # skip empty lines
            line = fline()
            if not line: return (seqid, seqstr, seq_len, qstr)
        try:
            if line[0] != "@":
                raise ValueError("Records in Fastq files should start with '@' character")
            seqid = line[1:].rstrip()
            seqstr = fline().rstrip()

            # There may now be more sequence lines, or the "+" quality marker line:
            while True:
                line = fline()
                if not line:
                    raise ValueError("Premature end of file (missing quality values for "+seqid+")")
                if line[0] == "+":
                    # -- sequence string ended
                    #qtitle = line[1:].rstrip()
                    #if qtitle and qtitle != seqid:
                    #    raise ValueError("Different read ID for sequence and quality (%s vs %s)" \
                    #                     % (seqid, qtitle))
                    break
                seqstr += line.rstrip() # removes trailing newlines
            # loop until + found
            seq_len = len(seqstr)
            # at least one line of quality data should follow
            qstrlen = 0
            # now read next lines as quality values until seq_len is reached
            while True:
                line = fline()
                if not line: break # end of file
                qstr += line.rstrip()
                qstrlen = len(qstr)
                if qstrlen + self.isColor >= seq_len:
                    break # qv string has reached the length of seq string
            # loop until qv has the same length as seq
            if self.isColor and qstr[0] == "!":
                # some color quality strings have a '!' character at the beginning; it should be stripped
                qstr = qstr[1:]
                qstrlen -= 1
            if seq_len != qstrlen + self.isColor:
                raise ValueError("Length mismatch between sequence and quality strings "+ \
                                 "for %s (%i vs %i)." \
                                 % (seqid, seq_len, qstrlen))
        except ValueError, err:
            die("\nError encountered parsing file "+self.fname+":\n "+str(err))
        # return the record
        self.numrecords += 1
        if self.isColor:
            seq_len -= 1
            seqstr = seqstr[1:]
        return (seqid, seqstr, seq_len, qstr)

    def nextFasta(self):
        # returning tuple: (seqID, sequence_string, seq_len)
        seqid, seqstr, seq_len = '', '', 0
        fline = self.getLine # shortcut to readline function of f
        line = fline() # this will use the buffer line if it's there
        if not line: return (seqid, seqstr, seq_len, None)
        while len(line.rstrip()) == 0: # skip empty lines
            line = fline()
            if not line: return (seqid, seqstr, seq_len, None)
        try:
            if line[0] != ">":
                raise ValueError("Records in Fasta files must start with '>' character")
            seqid = line[1:].split()[0]
            # more sequence lines may follow, until the next ">" record starts:
            while True:
                line = fline()
                if not line: break
                if line[0] == '>':
                    # next sequence starts here
                    self.ungetLine()
                    break
                seqstr += line.rstrip()
            # loop until '>' found
            seq_len = len(seqstr)
            if seq_len < 3:
                raise ValueError("Read %s too short (%i)." \
                                 % (seqid, seq_len))
        except ValueError, err:
            die("\nError encountered parsing fasta file "+self.fname+"\n "+str(err))
        # return the record and continue
        self.numrecords += 1
        if self.isColor:
            seq_len -= 1
            seqstr = seqstr[1:]
        return (seqid, seqstr, seq_len, None)

    def getLine(self):
        if self.bufline: # return previously buffered line
            r = self.bufline
            self.bufline = None
            return r
        else: # read a new line from stream and return it
            if self.eof: return None
            self.lastline = self.ifile.readline()
            if not self.lastline:
                self.eof = 1
                return None
            return self.lastline

    def ungetLine(self):
        if self.lastline is None:
            print >> sys.stderr, "Warning: FastxReader called ungetLine() with no prior line!"
        self.bufline = self.lastline
        self.lastline = None
#< class FastxReader

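# Illustration (not executed): given a FASTQ record like
#   @read1
#   ACGTACGT
#   +
#   IIIIIIII
# nextFastq() returns ('read1', 'ACGTACGT', 8, 'IIIIIIII'); nextFasta() returns
# the same shape of tuple with None in place of the quality string.
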
class ZReader:
    def __init__(self, filename, sysparams, guess=True):
        self.fname = filename
        self.file = None
        self.fsrc = None
        self.popen = None
        pipecmd = []
        s = filename.lower()
        if s.endswith(".bam"):
            pipecmd = ["bam2fastx", "-"]
        else:
            if guess:
                if s.endswith(".z") or s.endswith(".gz") or s.endswith(".gzip"):
                    pipecmd = ['gzip']
                else:
                    if s.endswith(".bz2") or s.endswith(".bzip2") or s.endswith(".bzip"):
                        pipecmd = ['bzip2']
                if len(pipecmd) > 0 and which(pipecmd[0]) is None:
                    die("Error: cannot find %s to decompress input file %s " % (pipecmd[0], filename))
                if len(pipecmd) > 0:
                    if pipecmd[0] == 'gzip' and sysparams.zipper.endswith('pigz'):
                        pipecmd[0] = sysparams.zipper
                        pipecmd.extend(sysparams.zipper_opts)
                    elif pipecmd[0] == 'bzip2' and sysparams.zipper.endswith('pbzip2'):
                        pipecmd[0] = sysparams.zipper
                        pipecmd.extend(sysparams.zipper_opts)
            else: # not guessing, but we must still be sure it's a compressed file
                if use_zpacker and filename.endswith(".z"):
                    pipecmd = [sysparams.zipper]
                    pipecmd.extend(sysparams.zipper_opts)
            if pipecmd:
                pipecmd += ['-cd']
        if pipecmd:
            try:
                self.fsrc = open(self.fname, 'rb')
                self.popen = subprocess.Popen(pipecmd,
                                              preexec_fn=subprocess_setup,
                                              stdin=self.fsrc,
                                              stdout=subprocess.PIPE, close_fds=True)
            except Exception:
                die("Error: could not open pipe "+' '.join(pipecmd)+' < '+ self.fname)
            self.file = self.popen.stdout
        else:
            self.file = open(filename)

    def close(self):
        if self.fsrc: self.fsrc.close()
        self.file.close()
        if self.popen:
            self.popen.wait()
            self.popen = None

class ZWriter:
    def __init__(self, filename, sysparams):
        self.fname = filename
        if use_zpacker:
            pipecmd = [sysparams.zipper, "-cf", "-"]
            self.ftarget = open(filename, "wb")
            try:
                self.popen = subprocess.Popen(pipecmd,
                                              preexec_fn=subprocess_setup,
                                              stdin=subprocess.PIPE,
                                              stdout=self.ftarget, close_fds=True)
            except Exception:
                die("Error: could not open writer pipe "+' '.join(pipecmd)+' < '+ self.fname)
            self.file = self.popen.stdin # client writes to this end of the pipe
        else: # no compression
            self.file = open(filename, "w")
            self.ftarget = None
            self.popen = None

    def close(self):
        self.file.close()
        if self.ftarget: self.ftarget.close()
        if self.popen:
            self.popen.wait() #! required to actually flush the pipes (eek!)
            self.popen = None

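# Illustration (not executed): ZReader("reads.fq.gz", sysparams) spawns roughly
#   gzip -cd < reads.fq.gz
# and exposes its stdout as self.file, while ZWriter("out.fq.z", sysparams)
# with a zpacker configured spawns `gzip -cf -` with stdout redirected into
# out.fq.z (file names here are made up).
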
# check_reads_format() examines the first few records in the user files
# to determine the file format
def check_reads_format(params, reads_files):
    #print >> sys.stderr, "[%s] Checking reads" % right_now()
    #
    seed_len = params.read_params.seed_length
    fileformat = params.read_params.reads_format

    observed_formats = set([])
    # observed_scales = set([])
    min_seed_len = 99999
    max_seed_len = 0
    files = reads_files.split(',')

    for f_name in files:
        #try:
        zf = ZReader(f_name, params.system_params)
        #except IOError:
        #    die("Error: could not open file "+f_name)
        freader = FastxReader(zf.file, params.read_params.color, zf.fname)
        toread = 4 # just sample the first 4 reads
        while toread > 0:
            seqid, seqstr, seq_len, qstr = freader.nextRecord()
            if not seqid: break
            toread -= 1
            if seq_len < 20:
                print >> sys.stderr, "Warning: found a read < 20bp in", f_name
            else:
                min_seed_len = min(seq_len, min_seed_len)
                max_seed_len = max(seq_len, max_seed_len)
        zf.close()
        observed_formats.add(freader.format)
    if len(observed_formats) > 1:
        die("Error: TopHat requires all reads be either FASTQ or FASTA. Mixing formats is not supported.")
    fileformat = list(observed_formats)[0]
    if seed_len != None:
        seed_len = max(seed_len, min_seed_len)
    else:
        seed_len = max_seed_len
    #print >> sys.stderr, "\tmin read length: %dbp, max read length: %dbp" % (min_seed_len, max_seed_len)
    print >> sys.stderr, "\tformat:\t\t %s" % fileformat
    if fileformat == "fastq":
        quality_scale = "phred33 (default)"
        if params.read_params.solexa_quals and not params.read_params.phred64_quals:
            quality_scale = "solexa33 (reads generated with GA pipeline version < 1.3)"
        elif params.read_params.phred64_quals:
            quality_scale = "phred64 (reads generated with GA pipeline version >= 1.3)"
        print >> sys.stderr, "\tquality scale:\t %s" % quality_scale
    elif fileformat == "fasta":
        if params.read_params.color:
            params.read_params.integer_quals = True

    #print seed_len, format, solexa_scale
    # NOTE: seed_len will be re-evaluated later by prep_reads
    return TopHatParams.ReadParams(params.read_params.solexa_quals,
                                   params.read_params.phred64_quals,
                                   params.read_params.quals,
                                   params.read_params.integer_quals,
                                   params.read_params.color,
                                   params.read_params.color_out,
                                   params.read_params.library_type,
                                   seed_len,
                                   fileformat,
                                   params.read_params.mate_inner_dist,
                                   params.read_params.mate_inner_dist_std_dev,
                                   params.read_params.read_group_id,
                                   params.read_params.sample_id,
                                   params.read_params.library_id,
                                   params.read_params.description,
                                   params.read_params.seq_platform_unit,
                                   params.read_params.seq_center,
                                   params.read_params.seq_run_date,
                                   params.read_params.seq_platform)

def log_tail(logfile, lines=1):
    f = open(logfile, "r")
    f.seek(0, 2)
    fbytes = f.tell()
    size = lines
    block = -1
    while size > 0 and fbytes+block*1024 > 0:
        if (fbytes+block*1024 > 0):
            # Seek back once more, if possible
            f.seek(block*1024, 2)
        else:
            # Seek to the beginning
            f.seek(0, 0)
        data = f.read(1024)
        linesFound = data.count('\n')
        size -= linesFound
        block -= 1
    if (fbytes + block*1024 > 0):
        f.seek(block*1024, 2)
    else:
        f.seek(0, 0)
    f.readline() # find a newline
    lastBlocks = list(f.readlines())
    f.close()
    return "\n".join(lastBlocks[-lines:])

# Format a timedelta as a pretty hh:mm:ss string.
# FIXME: Currently doesn't support days!
def formatTD(td):
    hours = td.seconds // 3600
    minutes = (td.seconds % 3600) // 60
    seconds = td.seconds % 60
    return '%02d:%02d:%02d' % (hours, minutes, seconds)

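# Illustration (not executed):
#   formatTD(datetime(2010,1,1,13,2,3) - datetime(2010,1,1,12,0,0)) == '01:02:03'
# Only td.seconds is used, so whole days would be dropped silently (hence the
# FIXME above).
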
class PrepReadsInfo:
    def __init__(self, fname, side):
        try:
            f = open(fname, "r")
            self.min_len = int(f.readline().split("=")[-1])
            self.max_len = int(f.readline().split("=")[-1])
            self.in_count = int(f.readline().split("=")[-1])
            self.out_count = int(f.readline().split("=")[-1])
            if (self.out_count == 0) or (self.max_len < 16):
                raise Exception()
        except Exception, e:
            die(fail_str+"Error retrieving prep_reads info.")
        print >> sys.stderr, "\t%s reads: min. length=%s, count=%s" % (side, self.min_len, self.out_count)
        if self.min_len < 20:
            print >> sys.stderr, "Warning: short reads (<20bp) will make TopHat quite slow and take a large amount of memory because they are likely to be mapped to too many places"

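# Illustration (not executed): the parser above keeps only the text after the
# last '=' on each of the first four lines, so an info file shaped like this
# (the field names are hypothetical) would parse:
#   min_read_len=50
#   max_read_len=75
#   reads_in=1000000
#   reads_out=998500
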
# Calls the prep_reads executable, which prepares an internal read library.
# The read library features reads with monotonically increasing integer IDs.
# prep_reads also filters out very low complexity or garbage reads as well as
# polyA reads.
# --> returns the tuple (internal_read_library_file, read_info)
def prep_reads(params, reads_list, quals_list, output_name, side="Left "):
    reads_suffix = ".fq"
    if use_zpacker: reads_suffix += ".z"
    kept_reads_filename = tmp_dir + output_name + reads_suffix

    if os.path.exists(kept_reads_filename):
        os.remove(kept_reads_filename)
    kept_reads = open(kept_reads_filename, "wb")
    log_fname = logging_dir + "prep_reads.log"
    filter_log = open(log_fname, "w")

    filter_cmd = [prog_path("prep_reads")]
    filter_cmd.extend(params.cmd())

    if params.read_params.reads_format == "fastq":
        filter_cmd += ["--fastq"]
    elif params.read_params.reads_format == "fasta":
        filter_cmd += ["--fasta"]
    info_file = output_dir+output_name+".info"
    filter_cmd += ["--aux-outfile="+info_file]
    filter_cmd.append(reads_list)
    if params.read_params.quals:
        filter_cmd.append(quals_list)
    shell_cmd = ' '.join(filter_cmd)
    # finally, add the compression pipe
    zip_cmd = []
    if use_zpacker:
        zip_cmd = [params.system_params.zipper]
        zip_cmd.extend(params.system_params.zipper_opts)
        zip_cmd.extend(['-c','-'])
        shell_cmd += ' | '+' '.join(zip_cmd)
    shell_cmd += ' >' + kept_reads_filename
    retcode = 0
    try:
        print >> run_log, shell_cmd
        if use_zpacker:
            filter_proc = subprocess.Popen(filter_cmd,
                                           stdout=subprocess.PIPE,
                                           stderr=filter_log)
            zip_proc = subprocess.Popen(zip_cmd,
                                        preexec_fn=subprocess_setup,
                                        stdin=filter_proc.stdout,
                                        stdout=kept_reads)
            filter_proc.stdout.close() # as per http://bugs.python.org/issue7678
            zip_proc.communicate()
            retcode = filter_proc.poll()
            if retcode == 0:
                retcode = zip_proc.poll()
        else:
            retcode = subprocess.call(filter_cmd,
                                      stdout=kept_reads,
                                      stderr=filter_log)
        if retcode:
            die(fail_str+"Error running 'prep_reads'\n"+log_tail(log_fname))

    except OSError, o:
        errmsg = fail_str+str(o)
        die(errmsg+"\n"+log_tail(log_fname))
    kept_reads.close()

    return kept_reads_filename, PrepReadsInfo(info_file, side)

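# Illustration (not executed): with a zpacker configured, the shell_cmd logged
# above looks roughly like this (paths illustrative):
#   prep_reads [params...] --fastq --aux-outfile=tophat_out/left_kept_reads.info \
#       reads_1.fq | gzip -c - > tophat_out/tmp/left_kept_reads.fq.z
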
# Call bowtie
def bowtie(params,
           bwt_idx_prefix,
           reads_list,
           reads_format,
           num_mismatches,
           mapped_reads,
           unmapped_reads = None, # not needed vs segment_juncs
           extra_output = ""):
    start_time = datetime.now()
    bwt_idx_name = bwt_idx_prefix.split('/')[-1]
    readfile_basename = getFileBaseName(reads_list)
    print >> sys.stderr, "[%s] Mapping %s against %s with Bowtie %s" % (start_time.strftime("%c"),
                         readfile_basename, bwt_idx_name, extra_output)

    bwt_log = open(logging_dir + 'bowtie.'+readfile_basename+'.fixmap.log', "w")
    unmapped_reads_out = None
    if unmapped_reads:
        unmapped_reads_out = unmapped_reads+".fq"
    use_FIFO = use_BWT_FIFO and use_zpacker and unmapped_reads and params.read_params.color
    if use_FIFO: # this should always be False unless color reads are used with -X option
        global unmapped_reads_fifo
        unmapped_reads_fifo = unmapped_reads+".fifo"
        if os.path.exists(unmapped_reads_fifo):
            os.remove(unmapped_reads_fifo)
        try:
            os.mkfifo(unmapped_reads_fifo)
        except OSError, o:
            die(fail_str+"Error at mkfifo("+unmapped_reads_fifo+'). '+str(o))
        unmapped_reads_out += ".z"

    # Launch Bowtie
    try:
        bowtie_cmd = [bowtie_path]
        bam_input = 0
        unzip_cmd = None
        if reads_list.endswith('.bam'):
            bam_input = 1
            unzip_cmd = [prog_path('bam2fastx')]
            if reads_format:
                unzip_cmd.append("--"+reads_format)
            unzip_cmd += [reads_list]

        if reads_format == "fastq":
            bowtie_cmd += ["-q"]
        elif reads_format == "fasta":
            bowtie_cmd += ["-f"]

        if params.read_params.color:
            bowtie_cmd += ["-C", "--col-keepends"]
        if unmapped_reads:
            bowtie_cmd += ["--un", unmapped_reads_fifo]
            # "--max", "/dev/null"]
        if use_zpacker and (unzip_cmd is None):
            unzip_cmd = [params.system_params.zipper]
            unzip_cmd.extend(params.system_params.zipper_opts)
            unzip_cmd += ['-cd']

        fifo_pid = None
        if use_FIFO:
            unm_zipcmd = [params.system_params.zipper]
            unm_zipcmd.extend(params.system_params.zipper_opts)
            unm_zipcmd += ['-c']
            print >> run_log, ' '.join(unm_zipcmd)+' < '+ unmapped_reads_fifo + ' > '+ unmapped_reads_out + ' & '
            fifo_pid = os.fork()
            if fifo_pid == 0:
                def on_sig_exit(sig, func=None):
                    os._exit(os.EX_OK)
                signal.signal(signal.SIGTERM, on_sig_exit)
                subprocess.call(unm_zipcmd,
                                stdin=open(unmapped_reads_fifo, "r"),
                                stdout=open(unmapped_reads_out, "wb"))
                os._exit(os.EX_OK)

1494 |
bowtie_cmd += [params.bowtie_alignment_option, str(num_mismatches), |
1495 |
"-p", str(params.system_params.num_cpus), |
1496 |
"-k", str(params.max_hits), |
1497 |
"-m", str(params.max_hits), |
1498 |
"--max", "/dev/null", |
1499 |
"-S", "--sam-nohead"] #always use headerless SAM file |
1500 |
fix_map_cmd = [prog_path('fix_map_ordering')] |
1501 |
sam_zipcmd=None |
1502 |
if unmapped_reads: |
1503 |
#mapping on the genome |
1504 |
#fix_map_ordering will write BAM file directly, and unmapped_reads as BAM file too |
1505 |
mapped_reads += ".bam" |
1506 |
if params.read_params.color: |
1507 |
fix_map_cmd += ["--sam-header", sam_header, "-", mapped_reads] |
1508 |
else: |
1509 |
unmapped_reads_out = unmapped_reads+".bam" |
1510 |
fix_map_cmd += ["--sam-header", sam_header, "-", mapped_reads, unmapped_reads_out] |
1511 |
else: |
1512 |
#mapping on segment_juncs (spliced potential junctions) |
1513 |
# don't use SAM headers, could be slow -- better compress SAM output directly |
1514 |
mapped_reads += ".sam" |
1515 |
fix_map_cmd += ["-"] |
1516 |
if use_zpacker: |
1517 |
fix_map_cmd += ["-"] |
1518 |
sam_zipcmd=[ params.system_params.zipper ] |
1519 |
sam_zipcmd.extend(params.system_params.zipper_opts) |
1520 |
sam_zipcmd += ["-c"] |
1521 |
mapped_reads += ".z" |
1522 |
else: |
1523 |
fix_map_cmd += [mapped_reads] |
1524 |
|
1525 |
bowtie_cmd+=[bwt_idx_prefix] |
1526 |
bowtie_proc=None |
1527 |
shellcmd="" |
1528 |
unzip_proc=None |
1529 |
mapzip_proc=None |
1530 |
if unzip_cmd: |
1531 |
# could be bam2fastx |
1532 |
if bam_input: |
1533 |
unzip_proc = subprocess.Popen(unzip_cmd, stdout=subprocess.PIPE) |
1534 |
shellcmd=' '.join(unzip_cmd) + "|" |
1535 |
else: |
1536 |
unzip_proc = subprocess.Popen(unzip_cmd, |
1537 |
stdin=open(reads_list, "rb"), |
1538 |
stdout=subprocess.PIPE) |
1539 |
shellcmd=' '.join(unzip_cmd) + "< " +reads_list +"|" |
1540 |
bowtie_cmd += ['-'] |
1541 |
bowtie_proc = subprocess.Popen(bowtie_cmd, |
1542 |
stdin=unzip_proc.stdout, |
1543 |
stdout=subprocess.PIPE, |
1544 |
stderr=bwt_log) |
1545 |
unzip_proc.stdout.close() # see http://bugs.python.org/issue7678 |
1546 |
else: |
1547 |
bowtie_cmd += [reads_list] |
1548 |
bowtie_proc = subprocess.Popen(bowtie_cmd, |
1549 |
stdout=subprocess.PIPE, |
1550 |
stderr=bwt_log) |
1551 |
shellcmd += " ".join(bowtie_cmd) + " | " + " ".join(fix_map_cmd) |
1552 |
if sam_zipcmd: |
1553 |
shellcmd += "|" + " ".join(sam_zipcmd)+" > " + mapped_reads |
1554 |
fix_order_proc = subprocess.Popen(fix_map_cmd, |
1555 |
stdin=bowtie_proc.stdout, |
1556 |
stdout=subprocess.PIPE) |
1557 |
bowtie_proc.stdout.close() |
1558 |
mapzip_proc = subprocess.Popen(sam_zipcmd, |
1559 |
stdin=fix_order_proc.stdout, |
1560 |
stdout=open(mapped_reads, "wb")) |
1561 |
fix_order_proc.stdout.close() |
1562 |
else: |
1563 |
fix_order_proc = subprocess.Popen(fix_map_cmd, |
1564 |
stdin=bowtie_proc.stdout) |
1565 |
bowtie_proc.stdout.close() |
1566 |
|
1567 |
print >> run_log, shellcmd |
1568 |
if mapzip_proc: |
1569 |
mapzip_proc.communicate() |
1570 |
else: |
1571 |
fix_order_proc.communicate() |
1572 |
if use_FIFO: |
1573 |
if fifo_pid and not os.path.exists(unmapped_reads_out): |
1574 |
try: |
1575 |
os.kill(fifo_pid, signal.SIGTERM) |
1576 |
except: |
1577 |
pass |
1578 |
except OSError, o: |
1579 |
die(fail_str+"Error: "+str(o)) |
1580 |
|
1581 |
# Success |
1582 |
#finish_time = datetime.now() |
1583 |
#duration = finish_time - start_time |
1584 |
#print >> sys.stderr, "\t\t\t[%s elapsed]" % formatTD(duration) |
1585 |
if use_FIFO: |
1586 |
try: |
1587 |
os.remove(unmapped_reads_fifo) |
1588 |
except: |
1589 |
pass |
1590 |
return (mapped_reads, unmapped_reads_out) |
1591 |
|
1592 |
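# Process topology assembled by bowtie() above (a sketch; the bracketed
# stages only appear for BAM/compressed input or when use_zpacker is on):
#
#   [zipper -cd | bam2fastx] -> bowtie -S --sam-nohead -> fix_map_ordering [-> zipper -c]
#
# When mapping against the genome, fix_map_ordering itself writes the mapped
# (and unmapped) reads as BAM; when mapping against the spliced index, its
# headerless SAM stream is optionally recompressed instead.
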
def bowtie_segment(params,
                   bwt_idx_prefix,
                   reads_list,
                   reads_format,
                   mapped_reads,
                   unmapped_reads = None,
                   extra_output = ""):

    backup_bowtie_alignment_option = params.bowtie_alignment_option
    params.bowtie_alignment_option = "-v"
    params.max_hits *= 2

    result = bowtie(params,
                    bwt_idx_prefix,
                    reads_list,
                    reads_format,
                    params.segment_mismatches,
                    mapped_reads,
                    unmapped_reads,
                    extra_output)

    params.bowtie_alignment_option = backup_bowtie_alignment_option
    params.max_hits /= 2
    return result

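# Note that bowtie_segment() forces "-v" alignment (mismatch counting only,
# ignoring base qualities) and temporarily doubles max_hits, since short
# segment hits are filtered and combined downstream; both settings are
# restored before returning so later bowtie() calls are unaffected.
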
# Retrieve a .juncs file from a GFF file by calling the gtf_juncs executable
def get_gtf_juncs(gff_annotation):
    print >> sys.stderr, "[%s] Reading known junctions from GTF file" % (right_now())
    gtf_juncs_log = open(logging_dir + "gtf_juncs.log", "w")

    gff_prefix = gff_annotation.split('/')[-1].split('.')[0]

    gtf_juncs_out_name = tmp_dir + gff_prefix + ".juncs"
    gtf_juncs_out = open(gtf_juncs_out_name, "w")

    gtf_juncs_cmd = [prog_path("gtf_juncs"), gff_annotation]
    try:
        print >> run_log, " ".join(gtf_juncs_cmd)
        retcode = subprocess.call(gtf_juncs_cmd,
                                  stderr=gtf_juncs_log,
                                  stdout=gtf_juncs_out)
        # gtf_juncs returned an error
        if retcode == 1:
            print >> sys.stderr, "\tWarning: TopHat did not find any junctions in GTF file"
            return (False, gtf_juncs_out_name)
        elif retcode != 0:
            die(fail_str+"Error: GTF junction extraction failed with err ="+str(retcode))
    # gtf_juncs not found
    except OSError, o:
        errmsg = fail_str+str(o)+"\n"
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            errmsg += "Error: gtf_juncs not found on this system"
        die(errmsg)
    return (True, gtf_juncs_out_name)

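# A .juncs file holds one junction per line, tab-separated; per the TopHat
# manual the format is:
#
#   <chromosome> <left> <right> <+/->
#
# where <left> and <right> are zero-based coordinates of the last base of the
# left exon and the first base of the right exon, respectively.
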
# Call bowtie-build on the FASTA file of synthetic splice junction sequences
def build_juncs_bwt_index(external_splice_prefix, color):
    print >> sys.stderr, "[%s] Indexing splices" % (right_now())
    bowtie_build_log = open(logging_dir + "bowtie_build.log", "w")

    #user_splices_out_prefix = output_dir + "user_splices_idx"

    bowtie_build_cmd = [prog_path("bowtie-build")]
    if color:
        bowtie_build_cmd += ["-C"]

    bowtie_build_cmd += [external_splice_prefix + ".fa",
                         external_splice_prefix]
    try:
        print >> run_log, " ".join(bowtie_build_cmd)
        retcode = subprocess.call(bowtie_build_cmd,
                                  stdout=bowtie_build_log)

        if retcode != 0:
            die(fail_str+"Error: Splice sequence indexing failed with err ="+ str(retcode))
    except OSError, o:
        errmsg = fail_str+str(o)+"\n"
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            errmsg += "Error: bowtie-build not found on this system"
        die(errmsg)
    return external_splice_prefix

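# The indexing step above is equivalent to running, for example:
#
#   bowtie-build [-C] <tmp_dir>/segment_juncs.fa <tmp_dir>/segment_juncs
#
# (-C only in colorspace mode); read segments are later mapped against this
# index of synthetic splice sequences.
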
# Build a splice index from a .juncs file, suitable for use with specified read
# (or read segment) lengths
def build_juncs_index(min_anchor_length,
                      read_length,
                      juncs_prefix,
                      external_juncs,
                      external_insertions,
                      external_deletions,
                      reference_fasta,
                      color):
    print >> sys.stderr, "[%s] Retrieving sequences for splices" % (right_now())

    juncs_file_list = ",".join(external_juncs)
    insertions_file_list = ",".join(external_insertions)
    deletions_file_list = ",".join(external_deletions)

    juncs_db_log = open(logging_dir + "juncs_db.log", "w")

    external_splices_out_prefix = tmp_dir + juncs_prefix
    external_splices_out_name = external_splices_out_prefix + ".fa"

    external_splices_out = open(external_splices_out_name, "w")
    # juncs_db_cmd = [bin_dir + "juncs_db",
    juncs_db_cmd = [prog_path("juncs_db"),
                    str(min_anchor_length),
                    str(read_length),
                    juncs_file_list,
                    insertions_file_list,
                    deletions_file_list,
                    reference_fasta]
    try:
        print >> run_log, " ".join(juncs_db_cmd)
        retcode = subprocess.call(juncs_db_cmd,
                                  stderr=juncs_db_log,
                                  stdout=external_splices_out)

        if retcode != 0:
            die(fail_str+"Error: Splice sequence retrieval failed with err ="+str(retcode))
    # juncs_db not found
    except OSError, o:
        errmsg = fail_str+str(o)+"\n"
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            errmsg += "Error: juncs_db not found on this system"
        die(errmsg)

    external_splices_out_prefix = build_juncs_bwt_index(external_splices_out_prefix, color)
    return external_splices_out_prefix

# Print out the SAM header, embedding the user's specified library properties.
# FIXME: also needs SQ dictionary lines
def write_sam_header(read_params, sam_file):
    print >> sam_file, "@HD\tVN:1.0\tSO:coordinate"
    if read_params.read_group_id and read_params.sample_id:
        rg_str = "@RG\tID:%s\tSM:%s" % (read_params.read_group_id,
                                        read_params.sample_id)
        if read_params.library_id:
            rg_str += "\tLB:%s" % read_params.library_id
        if read_params.description:
            rg_str += "\tDS:%s" % read_params.description
        if read_params.seq_platform_unit:
            rg_str += "\tPU:%s" % read_params.seq_platform_unit
        if read_params.seq_center:
            rg_str += "\tCN:%s" % read_params.seq_center
        if read_params.mate_inner_dist:
            rg_str += "\tPI:%s" % read_params.mate_inner_dist
        if read_params.seq_run_date:
            rg_str += "\tDT:%s" % read_params.seq_run_date
        if read_params.seq_platform:
            rg_str += "\tPL:%s" % read_params.seq_platform

        print >> sam_file, rg_str
    print >> sam_file, "@PG\tID:TopHat\tVN:%s\tCL:%s" % (get_version(), run_cmd)

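# Example of what write_sam_header() emits (tab-separated; the @RG values
# shown are hypothetical and depend on the --rg-* options given by the user):
#
#   @HD  VN:1.0  SO:coordinate
#   @RG  ID:run1  SM:sample1  LB:lib1  PL:ILLUMINA
#   @PG  ID:TopHat  VN:1.3.2  CL:tophat -o tophat_out my_index reads.fq
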
# Write final TopHat output, via tophat_reports and wiggles
def compile_reports(params, sam_header_filename, left_maps, left_reads, right_maps, right_reads, gff_annotation):
    print >> sys.stderr, "[%s] Reporting output tracks" % right_now()

    left_maps = [x for x in left_maps if (os.path.exists(x) and os.path.getsize(x) > 25)]
    left_maps = ','.join(left_maps)

    if len(right_maps) > 0:
        right_maps = [x for x in right_maps if (os.path.exists(x) and os.path.getsize(x) > 25)]
        right_maps = ','.join(right_maps)
    log_fname = logging_dir + "reports.log"
    report_log = open(log_fname, "w")
    junctions = output_dir + "junctions.bed"
    insertions = output_dir + "insertions.bed"
    deletions = output_dir + "deletions.bed"
    coverage = "coverage.wig"
    accepted_hits = output_dir + "accepted_hits"
    report_cmdpath = prog_path("tophat_reports")
    report_cmd = [report_cmdpath]
    report_cmd.extend(params.cmd())
    report_cmd.extend(["--samtools="+samtools_path])
    report_cmd.extend([junctions,
                       insertions,
                       deletions,
                       "-",
                       left_maps,
                       left_reads])
    if len(right_maps) > 0 and right_reads != None:
        report_cmd.append(right_maps)
        report_cmd.append(right_reads)
    # tophat_reports now produces an (uncompressed) BAM stream,
    # piped directly into samtools sort
    try:
        if params.report_params.convert_bam:
            if params.report_params.sort_bam:
                report_proc = subprocess.Popen(report_cmd,
                                               preexec_fn=subprocess_setup,
                                               stdout=subprocess.PIPE,
                                               stderr=report_log)

                # use the samtools binary located by check_samtools(), rather
                # than relying on "samtools" being first on the PATH
                bamsort_cmd = [samtools_path, "sort", "-", accepted_hits]
                bamsort_proc = subprocess.Popen(bamsort_cmd,
                                                preexec_fn=subprocess_setup,
                                                stderr=open(logging_dir + "reports.samtools_sort.log", "w"),
                                                stdin=report_proc.stdout)
                report_proc.stdout.close()
                print >> run_log, " ".join(report_cmd)+" | " + " ".join(bamsort_cmd)
                bamsort_proc.communicate()
                retcode = report_proc.poll()
                if retcode:
                    die(fail_str+"Error running tophat_reports\n"+log_tail(log_fname))
            else:
                print >> run_log, " ".join(report_cmd)
                report_proc = subprocess.call(report_cmd,
                                              preexec_fn=subprocess_setup,
                                              stdout=open(accepted_hits, "wb"),
                                              stderr=report_log)
                os.rename(accepted_hits, output_dir + "accepted_hits.bam")
        else:
            print >> run_log, " ".join(report_cmd)
            report_proc = subprocess.call(report_cmd,
                                          preexec_fn=subprocess_setup,
                                          stdout=open(accepted_hits, "wb"),
                                          stderr=report_log)
            tmp_sam = output_dir + "accepted_hits.sam"

            bam_to_sam_cmd = [samtools_path, "view", "-h", accepted_hits]
            print >> run_log, " ".join(bam_to_sam_cmd) + " > " + tmp_sam
            bam_to_sam_log = open(logging_dir + "accepted_hits_bam_to_sam.log", "w")
            tmp_sam_file = open(tmp_sam, "w")
            ret = subprocess.call(bam_to_sam_cmd,
                                  stdout=tmp_sam_file,
                                  stderr=bam_to_sam_log)
            tmp_sam_file.close()
            os.remove(accepted_hits)
            #print >> run_log, "mv %s %s" % (tmp_sam, output_dir + "accepted_hits.sam")
            #os.rename(tmp_sam, output_dir + "accepted_hits.sam")

    except OSError, o:
        die(fail_str+"Error: "+str(o)+"\n"+log_tail(log_fname))

    # FIXME: put wiggles back!
    # wig_cmd = [prog_path("wiggles"), output_dir + accepted_hits, output_dir + coverage]
    # print >> run_log, " ".join(wig_cmd)
    # subprocess.call(wig_cmd,
    #                 stderr=open("/dev/null"))
    return (coverage, junctions)

# Split up each read in a FASTQ file into multiple segments, creating a FASTQ
# file for each segment. This function needs to be fixed to support mixed
# read-length inputs.

def open_output_files(prefix, num_files_prev, num_files, out_segf, extension, params):
    i = num_files_prev + 1
    while i <= num_files:
        segfname = prefix + ("_seg%d" % i) + extension
        out_segf.append(ZWriter(segfname, params.system_params))
        i += 1

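# Example: open_output_files("left_kept_reads", 0, 3, segf, ".fq.z", params)
# opens left_kept_reads_seg1.fq.z through left_kept_reads_seg3.fq.z (names
# here are illustrative; split_reads() below picks the real prefix/extension).
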
def split_reads(reads_filename,
                prefix,
                fasta,
                params,
                segment_length):
    #reads_file = open(reads_filename)
    zreads = ZReader(reads_filename, params.system_params, False)
    out_segfiles = []

    if fasta:
        extension = ".fa"
    else:
        extension = ".fq"
    if use_zpacker: extension += ".z"

    def convert_color_to_bp(color_seq):
        decode_dic = { 'A0':'A', 'A1':'C', 'A2':'G', 'A3':'T', 'A4':'N', 'A.':'N', 'AN':'N',
                       'C0':'C', 'C1':'A', 'C2':'T', 'C3':'G', 'C4':'N', 'C.':'N', 'CN':'N',
                       'G0':'G', 'G1':'T', 'G2':'A', 'G3':'C', 'G4':'N', 'G.':'N', 'GN':'N',
                       'T0':'T', 'T1':'G', 'T2':'C', 'T3':'A', 'T4':'N', 'T.':'N', 'TN':'N',
                       'N0':'N', 'N1':'N', 'N2':'N', 'N3':'N', 'N4':'N', 'N.':'N', 'NN':'N',
                       '.0':'N', '.1':'N', '.2':'N', '.3':'N', '.4':'N', '..':'N', '.N':'N' }

        base = color_seq[0]
        bp_seq = base
        for ch in color_seq[1:]:
            base = decode_dic[base+ch]
            bp_seq += base
        return bp_seq

    def convert_bp_to_color(bp_seq):
        encode_dic = { 'AA':'0', 'CC':'0', 'GG':'0', 'TT':'0',
                       'AC':'1', 'CA':'1', 'GT':'1', 'TG':'1',
                       'AG':'2', 'CT':'2', 'GA':'2', 'TC':'2',
                       'AT':'3', 'CG':'3', 'GC':'3', 'TA':'3',
                       'A.':'4', 'C.':'4', 'G.':'4', 'T.':'4',
                       '.A':'4', '.C':'4', '.G':'4', '.T':'4',
                       '.N':'4', 'AN':'4', 'CN':'4', 'GN':'4',
                       'TN':'4', 'NA':'4', 'NC':'4', 'NG':'4',
                       'NT':'4', 'NN':'4', 'N.':'4', '..':'4' }

        base = bp_seq[0]
        color_seq = base
        for ch in bp_seq[1:]:
            color_seq += encode_dic[base + ch]
            base = ch

        return color_seq

    def split_record(read_name, read_seq, read_qual, out_segf, offsets, color):
        if color:
            color_offset = 1
            read_seq_temp = convert_color_to_bp(read_seq)

            seg_num = 1
            while seg_num + 1 < len(offsets):
                if read_seq[offsets[seg_num]+1] not in ['0', '1', '2', '3']:
                    return
                seg_num += 1
        else:
            color_offset = 0

        seg_num = 0
        last_seq_offset = 0
        while seg_num + 1 < len(offsets):
            f = out_segf[seg_num].file
            seg_seq = read_seq[last_seq_offset+color_offset:offsets[seg_num + 1]+color_offset]
            print >> f, "%s|%d:%d:%d" % (read_name, last_seq_offset, seg_num, len(offsets) - 1)
            if color:
                print >> f, "%s%s" % (read_seq_temp[last_seq_offset], seg_seq)
            else:
                print >> f, seg_seq
            if not fasta:
                seg_qual = read_qual[last_seq_offset:offsets[seg_num + 1]]
                print >> f, "+"
                print >> f, seg_qual
            seg_num += 1
            last_seq_offset = offsets[seg_num]

    line_state = 0
    read_name = ""
    read_seq = ""
    read_qual = ""
    num_segments = 0
    offsets = []
    for line in zreads.file:
        if line.strip() == "":
            continue
        if line_state == 0:
            read_name = line.strip()
        elif line_state == 1:
            read_seq = line.strip()

            read_length = len(read_seq)
            tmp_num_segments = read_length / segment_length
            offsets = [segment_length * i for i in range(0, tmp_num_segments + 1)]

            # Bowtie's minimum read length here is 20bp, so if the last segment
            # is between 20 and segment_length bp long, go ahead and write it out
            if read_length % segment_length >= 20:
                offsets.append(read_length)
                tmp_num_segments += 1
            else:
                offsets[-1] = read_length

            if tmp_num_segments == 1:
                offsets = [0, read_length]

            if tmp_num_segments > num_segments:
                open_output_files(prefix, num_segments, tmp_num_segments, out_segfiles, extension, params)
                num_segments = tmp_num_segments

            if fasta:
                split_record(read_name, read_seq, None, out_segfiles, offsets, params.read_params.color)
        elif line_state == 2:
            line = line.strip()
        else:
            read_quals = line.strip()
            if not fasta:
                split_record(read_name, read_seq, read_quals, out_segfiles, offsets, params.read_params.color)

        line_state += 1
        if fasta:
            line_state %= 2
        else:
            line_state %= 4
    zreads.close()
    out_fnames = []
    for zf in out_segfiles:
        zf.close()
        out_fnames.append(zf.fname)
    #return [o.fname for o in out_segfiles]
    return out_fnames

# Find possible splice junctions using the "closure search" strategy, and report
# them in closures.juncs. Calls the executable closure_juncs
def junctions_from_closures(params,
                            left_maps,
                            right_maps,
                            ref_fasta):
    #print >> sys.stderr, "[%s] " % right_now()
    print >> sys.stderr, "[%s] Searching for junctions via mate-pair closures" % right_now()

    #maps = [x for x in seg_maps if (os.path.exists(x) and os.path.getsize(x) > 0)]
    #if len(maps) == 0:
    #    return None
    #print >> sys.stderr, left_maps
    slash = left_maps[0].rfind('/')
    juncs_out = ""
    if slash != -1:
        juncs_out += left_maps[0][:slash+1]
    juncs_out += "closure.juncs"

    juncs_log = open(logging_dir + "closure.log", "w")
    juncs_cmdpath = prog_path("closure_juncs")
    juncs_cmd = [juncs_cmdpath]

    left_maps = ','.join(left_maps)
    right_maps = ','.join(right_maps)

    juncs_cmd.extend(params.cmd())
    juncs_cmd.extend([juncs_out,
                      ref_fasta,
                      left_maps,
                      right_maps])
    try:
        print >> run_log, ' '.join(juncs_cmd)
        retcode = subprocess.call(juncs_cmd,
                                  stderr=juncs_log)

        # closure_juncs returned an error
        if retcode != 0:
            die(fail_str+"Error: closure-based junction search failed with err ="+str(retcode))
    # closure_juncs not found
    except OSError, o:
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            print >> sys.stderr, fail_str, "Error: closure_juncs not found on this system"
        die(str(o))
    return [juncs_out]

# Find possible junctions by examining coverage and split segments in the initial
# map and segment maps. Reports junctions, insertions, and deletions in segment.juncs,
# segment.insertions, and segment.deletions. Calls the executable
# segment_juncs
def junctions_from_segments(params,
                            left_reads,
                            left_reads_map,
                            left_seg_maps,
                            right_reads,
                            right_reads_map,
                            right_seg_maps,
                            unmapped_reads,
                            reads_format,
                            ref_fasta):
    if left_reads_map != left_seg_maps[0]:
        print >> sys.stderr, "[%s] Searching for junctions via segment mapping" % right_now()
    out_path = getFileDir(left_seg_maps[0])
    juncs_out = out_path + "segment.juncs"
    insertions_out = out_path + "segment.insertions"
    deletions_out = out_path + "segment.deletions"

    left_maps = ','.join(left_seg_maps)
    align_log = open(logging_dir + "segment_juncs.log", "w")
    align_cmd = [prog_path("segment_juncs")]

    align_cmd.extend(params.cmd())

    align_cmd.extend(["--ium-reads", ",".join(unmapped_reads),
                      ref_fasta,
                      juncs_out,
                      insertions_out,
                      deletions_out,
                      left_reads,
                      left_reads_map,
                      left_maps])
    if right_seg_maps != None:
        right_maps = ','.join(right_seg_maps)
        align_cmd.extend([right_reads, right_reads_map, right_maps])
    try:
        print >> run_log, " ".join(align_cmd)
        retcode = subprocess.call(align_cmd,
                                  preexec_fn=subprocess_setup,
                                  stderr=align_log)

        # segment_juncs returned an error
        if retcode != 0:
            die(fail_str+"Error: segment-based junction search failed with err ="+str(retcode))
    # segment_juncs not found
    except OSError, o:
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            print >> sys.stderr, fail_str, "Error: segment_juncs not found on this system"
        die(str(o))

    return [juncs_out, insertions_out, deletions_out]

# Joins mapped segments into full-length read alignments via the executable
# long_spanning_reads
def join_mapped_segments(params,
                         sam_header_filename,
                         reads,
                         ref_fasta,
                         possible_juncs,
                         possible_insertions,
                         possible_deletions,
                         contig_seg_maps,
                         spliced_seg_maps,
                         alignments_out_name):
    if len(contig_seg_maps) > 1:
        print >> sys.stderr, "[%s] Joining segment hits" % right_now()
    else:
        print >> sys.stderr, "[%s] Processing bowtie hits" % right_now()
    contig_seg_maps = ','.join(contig_seg_maps)

    possible_juncs = ','.join(possible_juncs)
    possible_insertions = ",".join(possible_insertions)
    possible_deletions = ",".join(possible_deletions)

    align_log = open(logging_dir + "long_spanning_reads.log", "w")
    align_cmd = [prog_path("long_spanning_reads")]

    align_cmd.extend(params.cmd())

    align_cmd.append(ref_fasta)
    align_cmd.extend([ reads,
                       possible_juncs,
                       possible_insertions,
                       possible_deletions,
                       contig_seg_maps])

    if spliced_seg_maps != None:
        spliced_seg_maps = ','.join(spliced_seg_maps)
        align_cmd.append(spliced_seg_maps)

    try:
        print >> run_log, " ".join(align_cmd), ">", alignments_out_name
        join_proc = subprocess.Popen(align_cmd,
                                     preexec_fn=subprocess_setup,
                                     stdout=open(alignments_out_name, "wb"),
                                     stderr=align_log)
        join_proc.communicate()
    except OSError, o:
        die(fail_str+"Error: "+str(o))

# This class collects spliced and unspliced alignments for each of the
# left and right read files provided by the user.
class Maps:
    def __init__(self,
#                unspliced_bwt,
                 unspliced_sam,
                 seg_maps,
                 unmapped_segs,
                 segs):
        #self.unspliced_bwt = unspliced_bwt
        self.unspliced_sam = unspliced_sam
        self.seg_maps = seg_maps
        self.unmapped_segs = unmapped_segs
        self.segs = segs

# The main alignment routine of TopHat. This function executes most of the
# workflow, producing a set of candidate alignments for each cDNA fragment in
# a pair of SAM alignment files (for paired-end reads).
def spliced_alignment(params,
                      bwt_idx_prefix,
                      sam_header_filename,
                      ref_fasta,
                      read_len,
                      segment_len,
                      left_reads,
                      right_reads,
                      user_supplied_junctions,
                      user_supplied_insertions,
                      user_supplied_deletions):

    possible_juncs = []
    possible_juncs.extend(user_supplied_junctions)

    possible_insertions = []
    possible_insertions.extend(user_supplied_insertions)

    possible_deletions = []
    possible_deletions.extend(user_supplied_deletions)

    maps = { left_reads : [], right_reads : [] }
    #single_segments = False

    # Perform the first part of the TopHat work flow on the left and right
    # reads of paired ends separately - we'll use the pairing information later
    for reads in [left_reads, right_reads]:
        if reads == None or os.path.getsize(reads) < 25:
            continue
        fbasename = getFileBaseName(reads)
        unspliced_out = tmp_dir + fbasename + ".mapped"
        unmapped_unspliced = tmp_dir + fbasename + "_unmapped"

        num_segs = read_len / segment_len
        if read_len % segment_len >= min(segment_len - 2, 20):
            num_segs += 1

        # daehwan - check this out
        if num_segs <= 1:
            print >> sys.stderr, "Warning: you have only one segment per read.\n\tWe strongly recommend that you decrease --segment-length to about half the read length, because TopHat works better with multiple segments"

        # Perform the initial Bowtie mapping of the full length reads
        (unspliced_sam, unmapped_reads) = bowtie(params,
                                                 bwt_idx_prefix,
                                                 reads,
                                                 "fastq",
                                                 params.initial_read_mismatches,
                                                 unspliced_out,
                                                 unmapped_unspliced)

        # Converting the initial Bowtie maps into BAM format is
        # skipped here, as fix_map_ordering now does it directly:
        #join_mapped_segments(params,
        #                     sam_header_filename,
        #                     reads,
        #                     ref_fasta,
        #                     ["/dev/null"],
        #                     ["/dev/null"],
        #                     ["/dev/null"],
        #                     [unspliced_map],
        #                     [],
        #                     unspliced_sam)

        # Using the num_segs value returned by check_reads(), decide which
        # junction discovery strategy to use
        if num_segs == 1:
            segment_len = read_len
        elif num_segs >= 3:
            # if we have at least three segments, just use split segment search,
            # which is the most sensitive and specific, fastest, and
            # lightest-weight (search modes left at None, i.e. "auto", are
            # demoted to False here)
            if not params.closure_search:
                params.closure_search = False
            if not params.coverage_search:
                params.coverage_search = False
            if not params.butterfly_search:
                params.butterfly_search = False
        seg_maps = []
        unmapped_segs = []
        segs = []
        if num_segs > 1:
            # split up the IUM reads into segments
            # (unmapped_reads can be in BAM format)
            read_segments = split_reads(unmapped_reads,
                                        tmp_dir + fbasename,
                                        False,
                                        params,
                                        segment_len)

            # Map each segment file independently with Bowtie
            for i in range(len(read_segments)):
                seg = read_segments[i]
                fbasename = getFileBaseName(seg)
                seg_out = tmp_dir + fbasename + ".mapped"
                #if use_zpacker: seg_out += ".z"
                unmapped_seg = tmp_dir + fbasename + "_unmapped"
                extra_output = "(%d/%d)" % (i+1, len(read_segments))
                (seg_map, unmapped) = bowtie_segment(params,
                                                     bwt_idx_prefix,
                                                     seg,
                                                     "fastq",
                                                     seg_out,
                                                     unmapped_seg,
                                                     extra_output)
                seg_maps.append(seg_map)
                unmapped_segs.append(unmapped)
                segs.append(seg)

            # Collect the segment maps for left and right reads together
            #maps[reads] = Maps(unspliced_map, unspliced_sam, seg_maps, unmapped_segs, segs)
            maps[reads] = Maps(unspliced_sam, seg_maps, unmapped_segs, segs)
        else:
            # if there's only one segment, just collect the initial map as the only
            # map to be used downstream for coverage-based junction discovery
            read_segments = [reads]
            #maps[reads] = Maps(unspliced_map, unspliced_sam, [unspliced_map], [unmapped_reads], [unmapped_reads])
            maps[reads] = Maps(unspliced_sam, [unspliced_sam], [unmapped_reads], [unmapped_reads])

    # Unless the user asked not to discover new junctions, start that process
    # here

    if params.find_novel_juncs or params.find_novel_indels:
        #left_reads_map = maps[left_reads].unspliced_bwt
        left_reads_map = maps[left_reads].unspliced_sam
        left_seg_maps = maps[left_reads].seg_maps
        unmapped_reads = maps[left_reads].unmapped_segs
        if right_reads != None:
            #right_reads_map = maps[right_reads].unspliced_bwt
            right_reads_map = maps[right_reads].unspliced_sam
            right_seg_maps = maps[right_reads].seg_maps
            unmapped_reads.extend(maps[right_reads].unmapped_segs)
        else:
            right_reads_map = None
            right_seg_maps = None

        # Call segment_juncs to infer a list of possible splice junctions from
        # the regions of the genome covered in the initial and segment maps
        juncs = junctions_from_segments(params,
                                        left_reads,
                                        left_reads_map,
                                        left_seg_maps,
                                        right_reads,
                                        right_reads_map,
                                        right_seg_maps,
                                        unmapped_reads,
                                        "fastq",
                                        ref_fasta)
        if params.find_novel_juncs:
            if os.path.getsize(juncs[0]) != 0:
                possible_juncs.append(juncs[0])
        if params.find_novel_indels:
            if os.path.getsize(juncs[1]) != 0:
                possible_insertions.append(juncs[1])
            if os.path.getsize(juncs[2]) != 0:
                possible_deletions.append(juncs[2])
        # Optionally, and for paired reads only, use a closure search to
        # discover additional junctions
        if params.find_novel_juncs and params.closure_search and left_reads != None and right_reads != None:
            juncs = junctions_from_closures(params,
#                                           [maps[left_reads].unspliced_bwt, maps[left_reads].seg_maps[-1]],
#                                           [maps[right_reads].unspliced_bwt, maps[right_reads].seg_maps[-1]],
                                            [maps[left_reads].unspliced_sam, maps[left_reads].seg_maps[-1]],
                                            [maps[right_reads].unspliced_sam, maps[right_reads].seg_maps[-1]],
                                            ref_fasta)
            if os.path.getsize(juncs[0]) != 0:
                possible_juncs.extend(juncs)

    if len(possible_insertions) == 0 and len(possible_deletions) == 0 and len(possible_juncs) == 0:
        spliced_seg_maps = None
        junc_idx_prefix = None
    else:
        junc_idx_prefix = "segment_juncs"
        if len(possible_insertions) == 0:
            possible_insertions.append(os.devnull)
            # print >> sys.stderr, "Warning: insertions database is empty!"
        if len(possible_deletions) == 0:
            possible_deletions.append(os.devnull)
            # print >> sys.stderr, "Warning: deletions database is empty!"
        if len(possible_juncs) == 0:
            possible_juncs.append(os.devnull)
            print >> sys.stderr, "Warning: junction database is empty!"
    if junc_idx_prefix != None:
        build_juncs_index(3,
                          segment_len,
                          junc_idx_prefix,
                          possible_juncs,
                          possible_insertions,
                          possible_deletions,
                          ref_fasta,
                          params.read_params.color)

    # Now map read segments (or whole IUM reads, if num_segs == 1) to the splice
    # index with Bowtie
    for reads in [left_reads, right_reads]:
        spliced_seg_maps = []
        if reads == None or reads not in maps:
            continue

        if junc_idx_prefix != None:
            i = 0
            for seg in maps[reads].segs:
                fsegname = getFileBaseName(seg)
                seg_out = tmp_dir + fsegname + ".to_spliced"
                #if use_zpacker: seg_out += ".z"
                extra_output = "(%d/%d)" % (i+1, len(maps[reads].segs))
                (seg_map, unmapped) = bowtie_segment(params,
                                                     tmp_dir + junc_idx_prefix,
                                                     seg,
                                                     "fastq",
                                                     seg_out,
                                                     None,
                                                     extra_output)
                spliced_seg_maps.append(seg_map)
                i += 1

        # Join the contiguous and spliced segment hits into full-length read
        # alignments, or convert a spliced segment hit into a genomic-coordinate hit
        rfname = getFileBaseName(reads)
        rfdir = getFileDir(reads)
        mapped_reads = rfdir + rfname + ".candidates.bam"
        merged_map = rfdir + rfname + ".candidates_and_unspl.bam"
        # unspl_bwtfile = maps[reads].unspliced_bwt
        unspl_samfile = maps[reads].unspliced_sam

        join_mapped_segments(params,
                             sam_header_filename,
                             reads,
                             ref_fasta,
                             possible_juncs,
                             possible_insertions,
                             possible_deletions,
                             maps[reads].seg_maps,
                             spliced_seg_maps,
                             mapped_reads)

        if num_segs > 1:
            # Merge the spliced and unspliced full-length alignments into a
            # single SAM file.
            # The individual SAM files are all already sorted in
            # increasing read ID order.
            # NOTE: We also should be able to address bug #134 here, by replacing
            # contiguous alignments that poke into an intron by a small amount by
            # the correct spliced alignment.

            try:
                merge_cmd = [ prog_path("bam_merge"), merged_map,
                              maps[reads].unspliced_sam,
                              mapped_reads ]
                print >> run_log, " ".join(merge_cmd)
                ret = subprocess.call( merge_cmd,
                                       stderr=open(logging_dir + "sam_merge.log", "w") )
                if ret != 0:
                    die(fail_str+"Error executing: "+" ".join(merge_cmd))
            except OSError, o:
                die(fail_str+"Error: "+str(o))
            maps[reads] = [merged_map]

            if not params.system_params.keep_tmp:
                os.remove(mapped_reads)
                os.remove(unspl_samfile)
                #os.remove(unspl_bwtfile)
        else:
            os.rename(mapped_reads, merged_map)
            maps[reads] = [merged_map]
            if not params.system_params.keep_tmp:
                os.remove(unspl_samfile)
                #os.remove(unspl_bwtfile)

    return maps

def die(msg=None):
    if msg is not None:
        print >> sys.stderr, msg
    sys.exit(1)

# rough equivalent of the 'which' command to find external programs
# (current script path is tested first, then PATH envvar)
def which(program):
    def is_executable(fpath):
        return os.path.isfile(fpath) and os.access(fpath, os.X_OK)
    fpath, fname = os.path.split(program)
    if fpath:
        if is_executable(program):
            return program
    else:
        progpath = os.path.join(bin_dir, program)
        if is_executable(progpath):
            return progpath
        for path in os.environ["PATH"].split(os.pathsep):
            progpath = os.path.join(path, program)
            if is_executable(progpath):
                return progpath
    return None

def prog_path(program):
    progpath = which(program)
    if progpath == None:
        die("Error locating program: " + program)
    return progpath

# FIXME: this should get set during the make dist autotools phase of the build
def get_version():
    return "1.3.2"

def mlog(msg):
    print >> sys.stderr, "[DBGLOG]:" + msg

def test_input_file(filename):
    try:
        test_file = open(filename, "r")
    # input file could not be opened
    except IOError, o:
        die("Error: Opening file %s" % filename)
    return

def main(argv=None):
    warnings.filterwarnings("ignore", "tmpnam is a potential security risk")

    # Initialize default parameter values
    params = TopHatParams()

    try:
        if argv is None:
            argv = sys.argv
        args = params.parse_options(argv)
        params.check()

        bwt_idx_prefix = args[0]
        left_reads_list = args[1]
        left_quals_list, right_quals_list = [], []
        if (not params.read_params.quals and len(args) > 2) or (params.read_params.quals and len(args) > 3):
            if params.read_params.mate_inner_dist == None:
                params.read_params.mate_inner_dist = 50
                #die("Error: you must set the mean inner distance between mates with -r")

            right_reads_list = args[2]
            if params.read_params.quals:
                left_quals_list = args[3]
                right_quals_list = args[4]
        else:
            right_reads_list = None
            if params.read_params.quals:
                left_quals_list = args[2]

        print >> sys.stderr
        print >> sys.stderr, "[%s] Beginning TopHat run (v%s)" % (right_now(), get_version())
        print >> sys.stderr, "-----------------------------------------------"

        start_time = datetime.now()
        prepare_output_dir()

        global run_log
        run_log = open(logging_dir + "run.log", "w", 0)
        global run_cmd
        run_cmd = " ".join(argv)
        print >> run_log, run_cmd

        # Validate all the input files and check all prerequisites before
        # committing to the run
        (ref_fasta, ref_seq_dict) = check_index(bwt_idx_prefix)

        check_bowtie()
        check_samtools()
        print >> sys.stderr, "[%s] Generating SAM header for %s" % (right_now(), bwt_idx_prefix)
        sam_header_filename = get_index_sam_header(params.read_params, bwt_idx_prefix)
        print >> sys.stderr, "[%s] Preparing reads" % (right_now())

        if not params.skip_check_reads:
            reads_list = left_reads_list
            if right_reads_list:
                reads_list = reads_list + "," + right_reads_list
            params.read_params = check_reads_format(params, reads_list)

        user_supplied_juncs = []
        user_supplied_insertions = []
        user_supplied_deletions = []

        if params.gff_annotation and params.find_GFF_juncs:
            test_input_file(params.gff_annotation)
            (found_juncs, gtf_juncs) = get_gtf_juncs(params.gff_annotation)
            if found_juncs:
                user_supplied_juncs.append(gtf_juncs)
        if params.raw_junctions:
            test_input_file(params.raw_junctions)
            user_supplied_juncs.append(params.raw_junctions)

        if params.raw_insertions:
            test_input_file(params.raw_insertions)
            user_supplied_insertions.append(params.raw_insertions)

        if params.raw_deletions:
            test_input_file(params.raw_deletions)
            user_supplied_deletions.append(params.raw_deletions)

        global unmapped_reads_fifo
        unmapped_reads_fifo = tmp_dir + str(os.getpid()) + ".bwt_unmapped.z.fifo"

        # Now start the time-consuming stuff
        left_kept_reads, left_reads_info = prep_reads(params,
                                                      left_reads_list,
                                                      left_quals_list,
                                                      "left_kept_reads", "Left ")
        min_read_len = left_reads_info.min_len
        max_read_len = left_reads_info.max_len
        if right_reads_list != None:
            right_kept_reads, right_reads_info = prep_reads(params,
                                                            right_reads_list,
                                                            right_quals_list,
                                                            "right_kept_reads", "Right")
            min_read_len = min(right_reads_info.min_len, min_read_len)
            max_read_len = max(right_reads_info.max_len, max_read_len)
        else:
            right_kept_reads = None
        seed_len = params.read_params.seed_length
        if seed_len != None:
            seed_len = max(seed_len, min_read_len)
        else:
            seed_len = max_read_len
        params.read_params.seed_length = seed_len
        # turn off integer-quals
        if params.read_params.integer_quals:
            params.read_params.integer_quals = False

        mapping = spliced_alignment(params,
                                    bwt_idx_prefix,
                                    sam_header_filename,
                                    ref_fasta,
                                    params.read_params.seed_length,
                                    params.segment_length,
                                    left_kept_reads,
                                    right_kept_reads,
                                    user_supplied_juncs,
                                    user_supplied_insertions,
                                    user_supplied_deletions)

        left_maps = mapping[left_kept_reads]
        #left_unmapped_reads = mapping[1]
        right_maps = mapping[right_kept_reads]
        #right_unmapped_reads = mapping[3]

        compile_reports(params,
                        sam_header_filename,
                        left_maps,
                        left_kept_reads,
                        right_maps,
                        right_kept_reads,
                        params.gff_annotation)

        if not params.system_params.keep_tmp:
            for m in left_maps:
                os.remove(m)
            if left_kept_reads != None:
                os.remove(left_kept_reads)
            for m in right_maps:
                os.remove(m)
            if right_kept_reads != None:
                os.remove(right_kept_reads)
            tmp_files = os.listdir(tmp_dir)
            for t in tmp_files:
                os.remove(tmp_dir + t)
            os.rmdir(tmp_dir)

        finish_time = datetime.now()
        duration = finish_time - start_time
        print >> sys.stderr, "-----------------------------------------------"
        print >> sys.stderr, "Run complete [%s elapsed]" % formatTD(duration)

    except Usage, err:
        print >> sys.stderr, sys.argv[0].split("/")[-1] + ": " + str(err.msg)
        print >> sys.stderr, "    for detailed help see http://tophat.cbcb.umd.edu/manual.html"
        return 2

if __name__ == "__main__":
    sys.exit(main())