root/gclib/tophat_cpp/tophat
Revision: 67
Committed: Mon Sep 19 19:39:49 2011 UTC by gpertea
File size: 111262 byte(s)
Log Message:
fixed segment_juncs to work with .bam files

#!/usr/bin/env python

# encoding: utf-8
"""
tophat.py

Created by Cole Trapnell on 2008-12-25.
Copyright (c) 2008 Cole Trapnell. All rights reserved.
"""

import sys
try:
    import psyco
    psyco.full()
except ImportError:
    pass

import getopt
import subprocess
import errno
import os
import warnings
import re
import signal
from datetime import datetime, date, time

use_message = '''
TopHat maps short sequences from spliced transcripts to whole genomes.

Usage:
    tophat [options] <bowtie_index> <reads1[,reads2,...]> [reads1[,reads2,...]] \\
                     [quals1[,quals2,...]] [quals1[,quals2,...]]

Options:
    -v/--version
    -o/--output-dir                <string>   [ default: ./tophat_out ]
    -a/--min-anchor                <int>      [ default: 8            ]
    -m/--splice-mismatches         <0-2>      [ default: 0            ]
    -i/--min-intron-length         <int>      [ default: 50           ]
    -I/--max-intron-length         <int>      [ default: 500000       ]
    -g/--max-multihits             <int>      [ default: 20           ]
    -F/--min-isoform-fraction      <float>    [ default: 0.15         ]
    --max-insertion-length         <int>      [ default: 3            ]
    --max-deletion-length          <int>      [ default: 3            ]
    --solexa-quals
    --solexa1.3-quals              (same as phred64-quals)
    --phred64-quals                (same as solexa1.3-quals)
    -Q/--quals
    --integer-quals
    -C/--color                     (SOLiD - color space)
    --color-out
    --library-type                 <string>   (fr-unstranded, fr-firststrand,
                                               fr-secondstrand)
    -p/--num-threads               <int>      [ default: 1            ]
    -G/--GTF                       <filename>
    -j/--raw-juncs                 <filename>
    --insertions                   <filename>
    --deletions                    <filename>
    -r/--mate-inner-dist           <int>
    --mate-std-dev                 <int>      [ default: 20           ]
    --no-novel-juncs
    --no-novel-indels
    --no-gtf-juncs
    --no-coverage-search
    --coverage-search
    --no-closure-search
    --closure-search
    --microexon-search
    --butterfly-search
    --no-butterfly-search
    --keep-tmp
    --tmp-dir                      <dirname>  [ default: <output_dir>/tmp ]
    -z/--zpacker                   <program>  [ default: gzip         ]
    -X/--unmapped-fifo             [use mkfifo to compress more temporary
                                    files for color space reads]

Advanced Options:
    --initial-read-mismatches      <int>      [ default: 2            ]
    --segment-mismatches           <int>      [ default: 2            ]
    --segment-length               <int>      [ default: 25           ]
    --bowtie-n                     [ default: bowtie -v               ]
    --min-closure-exon             <int>      [ default: 100          ]
    --min-closure-intron           <int>      [ default: 50           ]
    --max-closure-intron           <int>      [ default: 5000         ]
    --min-coverage-intron          <int>      [ default: 50           ]
    --max-coverage-intron          <int>      [ default: 20000        ]
    --min-segment-intron           <int>      [ default: 50           ]
    --max-segment-intron           <int>      [ default: 500000       ]
    --no-sort-bam                  [Output BAM is not coordinate-sorted]
    --no-convert-bam               [Do not convert to BAM format.
                                    Output is <output_dir>/accepted_hits.sam.
                                    Implies --no-sort-bam.]

SAM Header Options (for embedding sequencing run metadata in output):
    --rg-id                        <string>   (read group ID)
    --rg-sample                    <string>   (sample ID)
    --rg-library                   <string>   (library ID)
    --rg-description               <string>   (descriptive string, no tabs allowed)
    --rg-platform-unit             <string>   (e.g. Illumina lane ID)
    --rg-center                    <string>   (sequencing center name)
    --rg-date                      <string>   (ISO 8601 date of the sequencing run)
    --rg-platform                  <string>   (sequencing platform descriptor)
'''

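# Example (illustrative, with hypothetical file names): a typical paired-end
# run using the options documented above:
#   tophat -p 4 -r 50 -o my_sample_out genome_bwt_idx sample_1.fq sample_2.fq
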
class Usage(Exception):
    def __init__(self, msg):
        self.msg = msg

output_dir = "./tophat_out/"
logging_dir = output_dir + "logs/"
run_log = None
run_cmd = None
tmp_dir = output_dir + "tmp/"
bin_dir = sys.path[0] + "/"
use_zpacker = False # this is set by the -z/--zpacker option (-z0 leaves it False)

use_BAM_Unmapped = False # automatically set to True for non-SOLiD reads; handles unmapped reads in BAM format

use_BWT_FIFO = False # can only be set to True if use_zpacker is True and only with -C/--color;
                     # enabled by the -X/--unmapped-fifo option (unless -z0)
unmapped_reads_fifo = None # if use_BWT_FIFO is True, this is used to trick bowtie into writing the
                           # unmapped reads into a compressed file

samtools_path = None
bowtie_path = None
fail_str = "\t[FAILED]\n"

sam_header = tmp_dir + "stub_header.sam"

# TopHatParams captures all of the runtime parameters used by TopHat, and many
# of these are passed as command line options to executables run by the pipeline

# This class and its nested classes also do options parsing through parse_options()
# and option validation via the member function check()

class TopHatParams:

    # SpliceConstraints is a group of runtime parameters that specify what
    # constraints to put on junctions discovered by the program. These constraints
    # are used to filter out spurious/false positive junctions.

    class SpliceConstraints:
        def __init__(self,
                     min_anchor_length,
                     min_intron_length,
                     max_intron_length,
                     splice_mismatches,
                     min_isoform_fraction):
            self.min_anchor_length = min_anchor_length
            self.min_intron_length = min_intron_length
            self.max_intron_length = max_intron_length
            self.splice_mismatches = splice_mismatches
            self.min_isoform_fraction = min_isoform_fraction

        def parse_options(self, opts):
            for option, value in opts:
                if option in ("-m", "--splice-mismatches"):
                    self.splice_mismatches = int(value)
                elif option in ("-a", "--min-anchor", "--min-anchor-length"):
                    # getopt expands the long form to --min-anchor-length,
                    # so that spelling must be matched here as well
                    self.min_anchor_length = int(value)
                elif option in ("-F", "--min-isoform-fraction"):
                    self.min_isoform_fraction = float(value)
                elif option in ("-i", "--min-intron-length"):
                    self.min_intron_length = int(value)
                elif option in ("-I", "--max-intron-length"):
                    self.max_intron_length = int(value)

        def check(self):
            if self.splice_mismatches not in [0,1,2]:
                die("Error: arg to --splice-mismatches must be 0, 1, or 2")
            if self.min_anchor_length < 4:
                die("Error: arg to --min-anchor must be at least 4")
            if self.min_isoform_fraction < 0.0 or self.min_isoform_fraction > 1.0:
                die("Error: arg to --min-isoform-fraction must be between 0.0 and 1.0")
            if self.min_intron_length <= 0:
                die("Error: arg to --min-intron-length must be greater than 0")
            if self.max_intron_length <= 0:
                die("Error: arg to --max-intron-length must be greater than 0")

    # SystemParams is a group of runtime parameters that determine how to handle
    # temporary files produced during a run and how many threads to use for threaded
    # stages of the pipeline (e.g. Bowtie)

    class SystemParams:
        def __init__(self,
                     num_cpus,
                     keep_tmp):
            self.num_cpus = num_cpus
            self.keep_tmp = keep_tmp
            self.zipper = "gzip"
            self.zipper_opts= []

        def parse_options(self, opts):
            global use_zpacker
            global use_BWT_FIFO
            for option, value in opts:
                if option in ("-p", "--num-threads"):
                    self.num_cpus = int(value)
                elif option == "--keep-tmp":
                    self.keep_tmp = True
                elif option in ("-z","--zpacker"):
                    if value.lower() in ["-", " ", ".", "0", "none", "f", "false", "no"]:
                        value=""
                    self.zipper = value
                    #if not self.zipper:
                    #   self.zipper='gzip'
                elif option in ("-X", "--unmapped-fifo"):
                    use_BWT_FIFO=True
            if self.zipper:
                use_zpacker=True
                if self.num_cpus>1 and not self.zipper_opts:
                    if self.zipper.endswith('pbzip2') or self.zipper.endswith('pigz'):
                        self.zipper_opts.append('-p'+str(self.num_cpus))
            else:
                use_zpacker=False
                if use_BWT_FIFO: use_BWT_FIFO=False

        def cmd(self):
            cmdline=[]
            if self.zipper:
                cmdline.extend(['-z',self.zipper])
            if self.num_cpus>1:
                cmdline.extend(['-p'+str(self.num_cpus)])
            return cmdline

        def check(self):
            if self.num_cpus<1 :
                die("Error: arg to --num-threads must be greater than 0")
            if self.zipper:
                xzip=which(self.zipper)
                if not xzip:
                    die("Error: cannot find compression program "+self.zipper)

    # ReadParams is a group of runtime parameters that specify various properties
    # of the user's reads (e.g. which quality scale they are on, how long the
    # fragments are, etc).
    class ReadParams:
        def __init__(self,
                     solexa_quals,
                     phred64_quals,
                     quals,
                     integer_quals,
                     color,
                     color_out,
                     library_type,
                     seed_length,
                     reads_format,
                     mate_inner_dist,
                     mate_inner_dist_std_dev,
                     read_group_id,
                     sample_id,
                     library_id,
                     description,
                     seq_platform_unit,
                     seq_center,
                     seq_run_date,
                     seq_platform):
            self.solexa_quals = solexa_quals
            self.phred64_quals = phred64_quals
            self.quals = quals
            self.integer_quals = integer_quals
            self.color = color
            self.color_out = color_out
            self.library_type = library_type
            self.seed_length = seed_length
            self.reads_format = reads_format
            self.mate_inner_dist = mate_inner_dist
            self.mate_inner_dist_std_dev = mate_inner_dist_std_dev
            self.read_group_id = read_group_id
            self.sample_id = sample_id
            self.library_id = library_id
            self.description = description
            self.seq_platform_unit = seq_platform_unit
            self.seq_center = seq_center
            self.seq_run_date = seq_run_date
            self.seq_platform = seq_platform

        def parse_options(self, opts):
            for option, value in opts:
                if option == "--solexa-quals":
                    self.solexa_quals = True
                elif option in ("--solexa1.3-quals", "--phred64-quals"):
                    self.phred64_quals = True
                elif option in ("-Q", "--quals"):
                    self.quals = True
                elif option == "--integer-quals":
                    self.integer_quals = True
                elif option in ("-C", "--color"):
                    self.color = True
                elif option == "--color-out":
                    self.color_out = True
                elif option == "--library-type":
                    self.library_type = value
                elif option in ("-s", "--seed-length"):
                    self.seed_length = int(value)
                elif option in ("-r", "--mate-inner-dist"):
                    self.mate_inner_dist = int(value)
                elif option == "--mate-std-dev":
                    self.mate_inner_dist_std_dev = int(value)
                elif option == "--rg-id":
                    self.read_group_id = value
                elif option == "--rg-sample":
                    self.sample_id = value
                elif option == "--rg-library":
                    self.library_id = value
                elif option == "--rg-description":
                    self.description = value
                elif option == "--rg-platform-unit":
                    self.seq_platform_unit = value
                elif option == "--rg-center":
                    self.seq_center = value
                elif option == "--rg-date":
                    self.seq_run_date = value
                elif option == "--rg-platform":
                    self.seq_platform = value

        def check(self):
            if self.seed_length != None and self.seed_length < 20:
                die("Error: arg to --seed-length must be at least 20")
            if self.mate_inner_dist_std_dev != None and self.mate_inner_dist_std_dev < 0:
                die("Error: arg to --mate-std-dev must be at least 0")
            if (not self.read_group_id and self.sample_id) or (self.read_group_id and not self.sample_id):
                die("Error: --rg-id and --rg-sample must be specified or omitted together")

    # SearchParams is a group of runtime parameters that specify how TopHat will
    # search for splice junctions

    class SearchParams:
        def __init__(self,
                     min_closure_exon,
                     min_closure_intron,
                     max_closure_intron,
                     min_coverage_intron,
                     max_coverage_intron,
                     min_segment_intron,
                     max_segment_intron):

            self.min_closure_exon_length = min_closure_exon
            self.min_closure_intron_length = min_closure_intron
            self.max_closure_intron_length = max_closure_intron
            self.min_coverage_intron_length = min_coverage_intron
            self.max_coverage_intron_length = max_coverage_intron
            self.min_segment_intron_length = min_segment_intron
            self.max_segment_intron_length = max_segment_intron

        def parse_options(self, opts):
            for option, value in opts:
                if option == "--min-closure-exon":
                    self.min_closure_exon_length = int(value)
                if option == "--min-closure-intron":
                    self.min_closure_intron_length = int(value)
                if option == "--max-closure-intron":
                    self.max_closure_intron_length = int(value)
                if option == "--min-coverage-intron":
                    self.min_coverage_intron_length = int(value)
                if option == "--max-coverage-intron":
                    self.max_coverage_intron_length = int(value)
                if option == "--min-segment-intron":
                    self.min_segment_intron_length = int(value)
                if option == "--max-segment-intron":
                    self.max_segment_intron_length = int(value)

        def check(self):
            if self.min_closure_exon_length < 0:
                die("Error: arg to --min-closure-exon cannot be negative")
            if self.min_closure_intron_length < 0:
                die("Error: arg to --min-closure-intron cannot be negative")
            if self.max_closure_intron_length < 0:
                die("Error: arg to --max-closure-intron cannot be negative")
            if self.min_coverage_intron_length < 0:
                die("Error: arg to --min-coverage-intron cannot be negative")
            if self.max_coverage_intron_length < 0:
                die("Error: arg to --max-coverage-intron cannot be negative")
            if self.min_segment_intron_length < 0:
                die("Error: arg to --min-segment-intron cannot be negative")
            if self.max_segment_intron_length < 0:
                die("Error: arg to --max-segment-intron cannot be negative")

    class ReportParams:
        def __init__(self):
            self.sort_bam = True
            self.convert_bam = True

        def parse_options(self, opts):
            for option, value in opts:
                if option == "--no-sort-bam":
                    self.sort_bam = False
                if option == "--no-convert-bam":
                    self.convert_bam = False
                    self.sort_bam = False


    def __init__(self):
        self.splice_constraints = self.SpliceConstraints(8,      # min_anchor
                                                         50,     # min_intron
                                                         500000, # max_intron
                                                         0,      # splice_mismatches
                                                         0.15)   # min_isoform_frac

        self.read_params = self.ReadParams(False,   # solexa_quals
                                           False,   # phred64_quals
                                           False,   # quals
                                           None,    # integer quals
                                           False,   # SOLiD - color space
                                           False,   # SOLiD - color out instead of base pair
                                           "",      # library type (e.g. "illumina-stranded-pair-end")
                                           None,    # seed_length
                                           "fastq", # quality_format
                                           None,    # mate inner distance
                                           20,      # mate inner dist std dev
                                           None,    # read group id
                                           None,    # sample id
                                           None,    # library id
                                           None,    # description
                                           None,    # platform unit (i.e. lane)
                                           None,    # sequencing center
                                           None,    # run date
                                           None)    # sequencing platform

        self.system_params = self.SystemParams(1,      # bowtie_threads (num_cpus)
                                               False)  # keep_tmp

        self.search_params = self.SearchParams(100,    # min_closure_exon_length
                                               50,     # min_closure_intron_length
                                               5000,   # max_closure_intron_length
                                               50,     # min_coverage_intron_length
                                               20000,  # max_coverage_intron_length
                                               50,     # min_segment_intron_length
                                               500000) # max_segment_intron_length

        self.report_params = self.ReportParams()
        self.gff_annotation = None
        self.raw_junctions = None
        self.find_novel_juncs = True
        self.find_novel_indels = True
        self.find_GFF_juncs = True
        self.skip_check_reads = False
        self.max_hits = 20
        self.initial_read_mismatches = 2
        self.segment_length = 25
        self.segment_mismatches = 2
        self.bowtie_alignment_option = "-v"
        self.max_insertion_length = 3
        self.max_deletion_length = 3
        self.raw_insertions = None
        self.raw_deletions = None
        self.closure_search = None
        self.coverage_search = None
        self.microexon_search = False
        self.butterfly_search = None

    def check(self):
        self.splice_constraints.check()
        self.read_params.check()
        self.system_params.check()

        if self.segment_length <= 4:
            die("Error: arg to --segment-length must be greater than 4")
        if self.segment_mismatches < 0 or self.segment_mismatches > 3:
            die("Error: arg to --segment-mismatches must be in [0, 3]")

        if self.read_params.color and self.butterfly_search:
            die("Error: butterfly-search in colorspace is not yet supported")

        library_types = ["fr-unstranded", "fr-firststrand", "fr-secondstrand"]

        if self.read_params.library_type and self.read_params.library_type not in library_types:
            die("Error: library-type should be one of: "+', '.join(library_types))

        self.search_params.max_closure_intron_length = min(self.splice_constraints.max_intron_length,
                                                           self.search_params.max_closure_intron_length)

        self.search_params.max_segment_intron_length = min(self.splice_constraints.max_intron_length,
                                                           self.search_params.max_segment_intron_length)

        self.search_params.max_coverage_intron_length = min(self.splice_constraints.max_intron_length,
                                                            self.search_params.max_coverage_intron_length)

        if self.max_insertion_length >= self.segment_length:
            die("Error: the max insertion length ("+str(self.max_insertion_length)+
                ") can not be equal to or greater than the segment length ("+str(self.segment_length)+")")

        if self.max_insertion_length < 0:
            die("Error: the max insertion length ("+str(self.max_insertion_length)+") can not be less than 0")

        if self.max_deletion_length >= self.splice_constraints.min_intron_length:
            die("Error: the max deletion length ("+str(self.max_deletion_length)+
                ") can not be equal to or greater than the min intron length ("+str(self.splice_constraints.min_intron_length)+")")

        if self.max_deletion_length < 0:
            die("Error: the max deletion length ("+str(self.max_deletion_length)+") can not be less than 0")

    def cmd(self):
        cmd = ["--min-anchor", str(self.splice_constraints.min_anchor_length),
               "--splice-mismatches", str(self.splice_constraints.splice_mismatches),
               "--min-report-intron", str(self.splice_constraints.min_intron_length),
               "--max-report-intron", str(self.splice_constraints.max_intron_length),
               "--min-isoform-fraction", str(self.splice_constraints.min_isoform_fraction),
               "--output-dir", output_dir,
               "--max-multihits", str(self.max_hits),
               "--segment-length", str(self.segment_length),
               "--segment-mismatches", str(self.segment_mismatches),
               "--min-closure-exon", str(self.search_params.min_closure_exon_length),
               "--min-closure-intron", str(self.search_params.min_closure_intron_length),
               "--max-closure-intron", str(self.search_params.max_closure_intron_length),
               "--min-coverage-intron", str(self.search_params.min_coverage_intron_length),
               "--max-coverage-intron", str(self.search_params.max_coverage_intron_length),
               "--min-segment-intron", str(self.search_params.min_segment_intron_length),
               "--max-segment-intron", str(self.search_params.max_segment_intron_length),
               "--sam-header", sam_header,
               "--max-insertion-length", str(self.max_insertion_length),
               "--max-deletion-length", str(self.max_deletion_length)]

        cmd.extend(self.system_params.cmd())

        if self.read_params.mate_inner_dist != None:
            cmd.extend(["--inner-dist-mean", str(self.read_params.mate_inner_dist),
                        "--inner-dist-std-dev", str(self.read_params.mate_inner_dist_std_dev)])
        if self.gff_annotation != None:
            cmd.extend(["--gtf-annotations", str(self.gff_annotation)])
        if not self.closure_search:
            cmd.append("--no-closure-search")
        if not self.coverage_search:
            cmd.append("--no-coverage-search")
        if not self.microexon_search:
            cmd.append("--no-microexon-search")
        if self.butterfly_search:
            cmd.append("--butterfly-search")
        if self.read_params.solexa_quals:
            cmd.append("--solexa-quals")
        if self.read_params.quals:
            cmd.append("--quals")
        if self.read_params.integer_quals:
            cmd.append("--integer-quals")
        if self.read_params.color:
            cmd.append("--color")
        if self.read_params.color_out:
            cmd.append("--color-out")
        if self.read_params.library_type:
            cmd.extend(["--library-type", self.read_params.library_type])
        if self.read_params.read_group_id:
            cmd.extend(["--rg-id", self.read_params.read_group_id])
        if self.read_params.phred64_quals:
            cmd.append("--phred64-quals")
        return cmd

    # This is the master options parsing routine, which calls parse_options for
    # the delegate classes (e.g. SpliceConstraints) that handle certain groups
    # of options.
    def parse_options(self, argv):
        try:
            opts, args = getopt.getopt(argv[1:], "hvp:m:F:a:i:I:G:r:o:j:Xz:s:g:QC",
                                       ["version",
                                        "help",
                                        "output-dir=",
                                        "solexa-quals",
                                        "solexa1.3-quals",
                                        "phred64-quals",
                                        "quals",
                                        "integer-quals",
                                        "color",
                                        "color-out",
                                        "library-type=",
                                        "num-threads=",
                                        "splice-mismatches=",
                                        "max-multihits=",
                                        "min-isoform-fraction=",
                                        "min-anchor-length=",
                                        "min-intron-length=",
                                        "max-intron-length=",
                                        "GTF=",
                                        "raw-juncs=",
                                        "no-novel-juncs",
                                        "no-novel-indels",
                                        "no-gtf-juncs",
                                        "skip-check-reads",
                                        "mate-inner-dist=",
                                        "mate-std-dev=",
                                        "no-closure-search",
                                        "no-coverage-search",
                                        "closure-search",
                                        "coverage-search",
                                        "microexon-search",
                                        "min-closure-exon=",
                                        "min-closure-intron=",
                                        "max-closure-intron=",
                                        "min-coverage-intron=",
                                        "max-coverage-intron=",
                                        "min-segment-intron=",
                                        "max-segment-intron=",
                                        "seed-length=",
                                        "initial-read-mismatches=",
                                        "segment-length=",
                                        "segment-mismatches=",
                                        "bowtie-n",
                                        "butterfly-search",
                                        "no-butterfly-search",
                                        "keep-tmp",
                                        "rg-id=",
                                        "rg-sample=",
                                        "rg-library=",
                                        "rg-description=",
                                        "rg-platform-unit=",
                                        "rg-center=",
                                        "rg-date=",
                                        "rg-platform=",
                                        "tmp-dir=",
                                        "zpacker=",
                                        "unmapped-fifo",
                                        "max-insertion-length=",
                                        "max-deletion-length=",
                                        "insertions=",
                                        "deletions=",
                                        "no-sort-bam",
                                        "no-convert-bam"])
        except getopt.error, msg:
            raise Usage(msg)

        self.splice_constraints.parse_options(opts)
        self.system_params.parse_options(opts)
        self.read_params.parse_options(opts)
        self.search_params.parse_options(opts)
        self.report_params.parse_options(opts)
        global use_BWT_FIFO
        global use_BAM_Unmapped
        if not self.read_params.color:
            use_BWT_FIFO=False
            use_BAM_Unmapped=True
        global output_dir
        global logging_dir
        global tmp_dir
        global sam_header

        custom_tmp_dir = None
        custom_out_dir = None
        # option processing
        for option, value in opts:
            if option in ("-v", "--version"):
                print "TopHat v%s" % (get_version())
                sys.exit(0)
            if option in ("-h", "--help"):
                raise Usage(use_message)
            if option in ("-g", "--max-multihits"):
                self.max_hits = int(value)
            if option in ("-G", "--GTF"):
                self.gff_annotation = value
            if option in ("-j", "--raw-juncs"):
                self.raw_junctions = value
            if option == "--no-novel-juncs":
                self.find_novel_juncs = False
            if option == "--no-novel-indels":
                self.find_novel_indels = False
            if option == "--no-gtf-juncs":
                self.find_GFF_juncs = False
            if option == "--skip-check-reads":
                self.skip_check_reads = True
            if option == "--no-coverage-search":
                self.coverage_search = False
            if option == "--no-closure-search":
                self.closure_search = False
            if option == "--coverage-search":
                self.coverage_search = True
            if option == "--closure-search":
                self.closure_search = True
            if option == "--microexon-search":
                self.microexon_search = True
            if option == "--butterfly-search":
                self.butterfly_search = True
            if option == "--no-butterfly-search":
                self.butterfly_search = False
            if option == "--initial-read-mismatches":
                self.initial_read_mismatches = int(value)
            if option == "--segment-length":
                self.segment_length = int(value)
            if option == "--segment-mismatches":
                self.segment_mismatches = int(value)
            if option == "--bowtie-n":
                self.bowtie_alignment_option = "-n"
            if option == "--max-insertion-length":
                self.max_insertion_length = int(value)
            if option == "--max-deletion-length":
                self.max_deletion_length = int(value)
            if option == "--insertions":
                self.raw_insertions = value
            if option == "--deletions":
                self.raw_deletions = value
            if option in ("-o", "--output-dir"):
                custom_out_dir = value + "/"
            if option == "--tmp-dir":
                custom_tmp_dir = value + "/"

        if custom_out_dir != None:
            output_dir = custom_out_dir
            logging_dir = output_dir + "logs/"
            tmp_dir = output_dir + "tmp/"
            sam_header = tmp_dir + "stub_header.sam"
        if custom_tmp_dir != None:
            tmp_dir = custom_tmp_dir
            sam_header = tmp_dir + "stub_header.sam"
        if len(args) < 2:
            raise Usage(use_message)
        return args

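# Example (illustrative): how TopHatParams is typically driven --
#   params = TopHatParams()
#   args = params.parse_options(sys.argv)  # raises Usage on bad or missing arguments
#   params.check()                         # validates every option group
#   flags = params.cmd()                   # option list forwarded to the pipeline executables
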
def getFileDir(filepath):
    #if fullpath, returns path including the ending /
    fpath, fname=os.path.split(filepath)
    if fpath: fpath+='/'
    return fpath

def getFileBaseName(filepath):
    fpath, fname=os.path.split(filepath)
    fbase, fext =os.path.splitext(fname)
    fx=fext.lower()
    if (fx in ['.fq','.txt','.seq','.bwtout'] or fx.find('.fa')==0) and len(fbase)>0:
        return fbase
    elif fx == '.z' or fx.find('.gz')==0 or fx.find('.bz')==0:
        fb, fext = os.path.splitext(fbase)
        fx=fext.lower()
        if (fx in ['.fq','.txt','.seq','.bwtout'] or fx.find('.fa')==0) and len(fb)>0:
            return fb
        else:
            return fbase
    else:
        if len(fbase)>len(fext):
            return fbase
        else:
            return fname

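# Examples (illustrative) for getFileBaseName(), which strips one data-file
# extension and, for compressed files, the compression extension as well:
#   getFileBaseName("reads_1.fq")    -> "reads_1"
#   getFileBaseName("reads_1.fq.gz") -> "reads_1"
#   getFileBaseName("genome.fa")     -> "genome"
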
# Returns the current time in a nice format
def right_now():
    curr_time = datetime.now()
    return curr_time.strftime("%c")

# Ensures that the output, logging, and temp directories are present. If not,
# they are created
def prepare_output_dir():

    print >> sys.stderr, "[%s] Preparing output location %s" % (right_now(), output_dir)
    if os.path.exists(output_dir):
        pass
    else:
        os.mkdir(output_dir)

    if os.path.exists(logging_dir):
        pass
    else:
        os.mkdir(logging_dir)

    if os.path.exists(tmp_dir):
        pass
    else:
        try:
            os.makedirs(tmp_dir)
        except OSError, o:
            die("\nError creating directory %s (%s)" % (tmp_dir, o))


# to be added as preexec_fn for every subprocess.Popen() call:
# see http://bugs.python.org/issue1652
def subprocess_setup():
    # Python installs a SIGPIPE handler by default, which causes
    # gzip or other de/compression pipes to complain about "stdout: Broken pipe"
    signal.signal(signal.SIGPIPE, signal.SIG_DFL)

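# Example (illustrative, hypothetical input file): every decompression pipe
# below is launched with this hook, e.g.:
#   proc = subprocess.Popen(["gzip", "-cd"], preexec_fn=subprocess_setup,
#                           stdin=open("reads.fq.gz", "rb"), stdout=subprocess.PIPE)
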
# Check that the Bowtie index specified by the user is present and all files
# are there.
def check_bowtie_index(idx_prefix):
    print >> sys.stderr, "[%s] Checking for Bowtie index files" % right_now()

    idx_fwd_1 = idx_prefix + ".1.ebwt"
    idx_fwd_2 = idx_prefix + ".2.ebwt"
    idx_rev_1 = idx_prefix + ".rev.1.ebwt"
    idx_rev_2 = idx_prefix + ".rev.2.ebwt"

    if os.path.exists(idx_fwd_1) and \
       os.path.exists(idx_fwd_2) and \
       os.path.exists(idx_rev_1) and \
       os.path.exists(idx_rev_2):
        return
    else:
        bowtie_idx_env_var = os.environ.get("BOWTIE_INDEXES")
        if bowtie_idx_env_var == None:
            die("Error: Could not find Bowtie index files " + idx_prefix + ".*")
        idx_prefix = bowtie_idx_env_var + idx_prefix
        idx_fwd_1 = idx_prefix + ".1.ebwt"
        idx_fwd_2 = idx_prefix + ".2.ebwt"
        idx_rev_1 = idx_prefix + ".rev.1.ebwt"
        idx_rev_2 = idx_prefix + ".rev.2.ebwt"

        if os.path.exists(idx_fwd_1) and \
           os.path.exists(idx_fwd_2) and \
           os.path.exists(idx_rev_1) and \
           os.path.exists(idx_rev_2):
            return
        else:
            die("Error: Could not find Bowtie index files " + idx_prefix + ".*")

# Reconstructs the multifasta file from which the Bowtie index was created, if
# it's not already there.
def bowtie_idx_to_fa(idx_prefix):
    idx_name = idx_prefix.split('/')[-1]
    print >> sys.stderr, "[%s] Reconstituting reference FASTA file from Bowtie index" % (right_now())

    try:
        tmp_fasta_file_name = tmp_dir + idx_name + ".fa"
        tmp_fasta_file = open(tmp_fasta_file_name, "w")

        inspect_log = open(logging_dir + "bowtie_inspect_recons.log", "w")

        inspect_cmd = [prog_path("bowtie-inspect"),
                       idx_prefix]

        print >> sys.stderr, "Executing: " + " ".join(inspect_cmd) + " > " + tmp_fasta_file_name
        ret = subprocess.call(inspect_cmd,
                              stdout=tmp_fasta_file,
                              stderr=inspect_log)

        # Bowtie reported an error
        if ret != 0:
            die(fail_str+"Error: bowtie-inspect returned an error")

    # Bowtie not found
    except OSError, o:
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            die(fail_str+"Error: bowtie-inspect not found on this system. Did you forget to include it in your PATH?")

    return tmp_fasta_file_name

# Checks whether the multifasta file for the genome is present alongside the
# Bowtie index files for it.
def check_fasta(idx_prefix):
    print >> sys.stderr, "[%s] Checking for reference FASTA file" % right_now()
    idx_fasta = idx_prefix + ".fa"
    if os.path.exists(idx_fasta):
        return idx_fasta
    else:
        bowtie_idx_env_var = os.environ.get("BOWTIE_INDEXES")
        if bowtie_idx_env_var != None:
            idx_fasta = bowtie_idx_env_var + idx_prefix + ".fa"
            if os.path.exists(idx_fasta):
                return idx_fasta

        print >> sys.stderr, "\tWarning: Could not find FASTA file " + idx_fasta
        idx_fa = bowtie_idx_to_fa(idx_prefix)
        return idx_fa

# Check that both the Bowtie index and the genome's fasta file are present
def check_index(idx_prefix):
    check_bowtie_index(idx_prefix)
    ref_fasta_file = check_fasta(idx_prefix)

    return (ref_fasta_file, None)

# Retrieve a tuple containing the system's version of Bowtie. Parsed from
# `bowtie --version`
def get_bowtie_version():
    try:
        # Launch Bowtie to capture its version info
        proc = subprocess.Popen([bowtie_path, "--version"],
                                stdout=subprocess.PIPE)
        stdout_value = proc.communicate()[0]
        bowtie_version = None
        bowtie_out = repr(stdout_value)

        # Find the version identifier
        version_str = "bowtie version "
        ver_str_idx = bowtie_out.find(version_str)
        if ver_str_idx != -1:
            nl = bowtie_out.find("\\n", ver_str_idx)
            version_val = bowtie_out[ver_str_idx + len(version_str):nl]
            bowtie_version = [int(x) for x in version_val.split('.')]
            if len(bowtie_version) == 3:
                bowtie_version.append(0)

        return bowtie_version
    except OSError, o:
        errmsg=fail_str+str(o)+"\n"
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            errmsg+="Error: bowtie not found on this system"
        die(errmsg)

def get_index_sam_header(read_params, idx_prefix):
    try:
        bowtie_sam_header_filename = tmp_dir+idx_prefix.split('/')[-1]+".bwt.samheader.sam"
        bowtie_sam_header_file = open(bowtie_sam_header_filename,"w")

        bowtie_header_cmd = [bowtie_path, "--sam"]
        if read_params.color:
            bowtie_header_cmd.append('-C')
        bowtie_header_cmd.extend([idx_prefix, '/dev/null'])
        subprocess.call(bowtie_header_cmd,
                        stdout=bowtie_sam_header_file,
                        stderr=open('/dev/null', 'w')) # /dev/null must be opened for writing here

        bowtie_sam_header_file.close()
        bowtie_sam_header_file = open(bowtie_sam_header_filename,"r")

        sam_header_file = open(sam_header, "w")

        preamble = []
        sq_dict_lines = []

        for line in bowtie_sam_header_file.readlines():
            line = line.strip()
            if line.find("@SQ") != -1:
                # Sequence dictionary record
                cols = line.split('\t')
                seq_name = None
                for col in cols:
                    fields = col.split(':')
                    #print fields
                    if len(fields) > 0 and fields[0] == "SN":
                        seq_name = fields[1]
                if seq_name == None:
                    die("Error: malformed sequence dictionary in sam header")
                sq_dict_lines.append([seq_name,line])
            elif line.find("CL") != -1: # skip CL (command line) header records;
                continue                # note: find() returns -1, not False, on a miss
            else:
                preamble.append(line)
        #for line in preamble:
        #    print >> sam_header_file, line
        print >> sam_header_file, "@HD\tVN:1.0\tSO:coordinate"

        if read_params.read_group_id and read_params.sample_id:
            rg_str = "@RG\tID:%s\tSM:%s" % (read_params.read_group_id,
                                            read_params.sample_id)
            if read_params.library_id:
                rg_str += "\tLB:%s" % read_params.library_id
            if read_params.description:
                rg_str += "\tDS:%s" % read_params.description
            if read_params.seq_platform_unit:
                rg_str += "\tPU:%s" % read_params.seq_platform_unit
            if read_params.seq_center:
                rg_str += "\tCN:%s" % read_params.seq_center
            if read_params.mate_inner_dist:
                rg_str += "\tPI:%s" % read_params.mate_inner_dist
            if read_params.seq_run_date:
                rg_str += "\tDT:%s" % read_params.seq_run_date
            if read_params.seq_platform:
                rg_str += "\tPL:%s" % read_params.seq_platform

            print >> sam_header_file, rg_str

        sq_dict_lines.sort(lambda x,y: cmp(x[0],y[0]))
        for [name, line] in sq_dict_lines:
            print >> sam_header_file, line
        print >> sam_header_file, "@PG\tID:TopHat\tVN:%s\tCL:%s" % (get_version(), run_cmd)

        sam_header_file.close()

        return sam_header

    except OSError, o:
        errmsg=fail_str+str(o)+"\n"
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            errmsg+="Error: bowtie not found on this system"
        die(errmsg)

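# Example (illustrative): with --rg-id L1 --rg-sample S1, the stub header
# written above looks roughly like this (SN/LN/VN/CL values depend on the
# actual index and command line):
#   @HD  VN:1.0  SO:coordinate
#   @RG  ID:L1  SM:S1
#   @SQ  SN:chr1  LN:...      (one line per reference sequence, sorted by name)
#   @PG  ID:TopHat  VN:...  CL:...
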
# Make sure Bowtie is installed and is recent enough to be useful
def check_bowtie():
    print >> sys.stderr, "[%s] Checking for Bowtie" % right_now()
    global bowtie_path
    bowtie_path=prog_path("bowtie")
    bowtie_version = get_bowtie_version()
    if bowtie_version == None:
        die("Error: Bowtie not found on this system")
    elif bowtie_version < [0, 12, 3, 0]: # lexicographic version comparison
        die("Error: TopHat requires Bowtie 0.12.3 or later")
    print >> sys.stderr, "\tBowtie version:\t\t\t %s" % ".".join([str(x) for x in bowtie_version])


# Retrieve a tuple containing the system's version of samtools. Parsed from
# `samtools`
def get_samtools_version():
    try:
        # Launch samtools to capture its version info
        proc = subprocess.Popen(samtools_path, stderr=subprocess.PIPE)
        samtools_out = proc.communicate()[1]

        # Find the version identifier
        version_match = re.search(r'Version:\s+(\d+)\.(\d+)\.(\d+)([a-zA-Z]?)', samtools_out)
        samtools_version_arr = [int(version_match.group(x)) for x in [1,2,3]]
        if version_match.group(4):
            samtools_version_arr.append(version_match.group(4))
        else:
            samtools_version_arr.append(0)

        return version_match.group(), samtools_version_arr
    except OSError, o:
        errmsg=fail_str+str(o)+"\n"
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            errmsg+="Error: samtools not found on this system"
        die(errmsg)

# Make sure the SAM tools are installed and are recent enough to be useful
def check_samtools():
    print >> sys.stderr, "[%s] Checking for Samtools" % right_now()
    global samtools_path
    samtools_path=prog_path("samtools")
    samtools_version_str, samtools_version_arr = get_samtools_version()
    if samtools_version_str == None:
        die("Error: Samtools not found on this system")
    elif samtools_version_arr[:3] < [0, 1, 7]: # lexicographic version comparison
        die("Error: TopHat requires Samtools 0.1.7 or later")
    print >> sys.stderr, "\tSamtools %s" % samtools_version_str



class FastxReader:
    def __init__(self, i_file, is_color=0, fname=''):
        self.bufline=None
        self.format=None
        self.ifile=i_file
        self.nextRecord=None
        self.eof=None
        self.fname=fname
        self.lastline=None
        self.numrecords=0
        self.isColor=0
        if is_color : self.isColor=1
        # determine file type
        #no records processed yet, skip custom header lines if any
        hlines=10 # allow maximum 10 header lines
        self.lastline=" "
        while hlines>0 and self.lastline and self.lastline[0] not in "@>":
            self.lastline=self.ifile.readline()
            hlines-=1
        if self.lastline and self.lastline[0] == '@':
            self.format='fastq'
            self.nextRecord=self.nextFastq
        elif self.lastline and self.lastline[0] == '>':
            self.format='fasta'
            self.nextRecord=self.nextFasta
        else:
            die("Error: cannot determine record type in input file %s" % fname)
        self.bufline=self.lastline
        self.lastline=None

    def nextFastq(self):
        # returning tuple: (seqID, sequence_string, seq_len, qv_string)
        seqid,seqstr,qstr,seq_len='','','',0
        if self.eof: return (seqid, seqstr, seq_len, qstr)
        fline=self.getLine #shortcut to save a bit of time
        line=fline()

        if not line : return (seqid, seqstr, seq_len, qstr)
        while len(line.rstrip())==0: # skip empty lines
            line=fline()
            if not line : return (seqid, seqstr,seq_len, qstr)
        try:
            if line[0] != "@":
                raise ValueError("Records in Fastq files should start with '@' character")
            seqid = line[1:].rstrip()
            seqstr = fline().rstrip()

            #There may now be more sequence lines, or the "+" quality marker line:
            while True:
                line = fline()
                if not line:
                    raise ValueError("Premature end of file (missing quality values for "+seqid+")")
                if line[0] == "+":
                    # -- sequence string ended
                    #qtitle = line[1:].rstrip()
                    #if qtitle and qtitle != seqid:
                    #    raise ValueError("Different read ID for sequence and quality (%s vs %s)" \
                    #                     % (seqid, qtitle))
                    break
                seqstr += line.rstrip() #removes trailing newlines
                #loop until + found
            seq_len = len(seqstr)
            #at least one line of quality data should follow
            qstrlen=0
            #now read next lines as quality values until seq_len is reached
            while True:
                line=fline()
                if not line : break #end of file
                qstr += line.rstrip()
                qstrlen=len(qstr)
                if qstrlen + self.isColor >= seq_len :
                    break # qv string has reached the length of seq string
                #loop until qv has the same length as seq
            if self.isColor and qstr[0]=="!":
                #some color quality strings have a '!' character at the beginning that should be stripped
                qstr = qstr[1:]
                qstrlen -= 1
            if seq_len != qstrlen+self.isColor :
                raise ValueError("Length mismatch between sequence and quality strings "+ \
                                 "for %s (%i vs %i)." \
                                 % (seqid, seq_len, qstrlen))
        except ValueError, err:
            die("\nError encountered parsing file "+self.fname+":\n "+str(err))
        #return the record
        self.numrecords+=1
        if self.isColor :
            seq_len-=1
            seqstr = seqstr[1:]
        return (seqid, seqstr, seq_len, qstr)

    def nextFasta(self):
        # returning tuple: (seqID, sequence_string, seq_len)
        seqid,seqstr,seq_len='','',0
        fline=self.getLine # shortcut to readline function of f
        line=fline() # this will use the buffer line if it's there
        if not line : return (seqid, seqstr, seq_len, None)
        while len(line.rstrip())==0: # skip empty lines
            line=fline()
            if not line : return (seqid, seqstr, seq_len, None)
        try:
            if line[0] != ">":
                raise ValueError("Records in Fasta files must start with '>' character")
            seqid = line[1:].split()[0]
            #read more sequence lines until the next '>' record marker:
            while True:
                line = fline()
                if not line: break
                if line[0] == '>':
                    #next sequence starts here
                    self.ungetLine()
                    break
                seqstr += line.rstrip()
                #loop until '>' found
            seq_len = len(seqstr)
            if seq_len < 3:
                raise ValueError("Read %s too short (%i)." \
                                 % (seqid, seq_len))
        except ValueError, err:
            die("\nError encountered parsing fasta file "+self.fname+"\n "+str(err))
        #return the record and continue
        self.numrecords+=1
        if self.isColor :
            seq_len-=1
            seqstr = seqstr[1:]
        return (seqid, seqstr, seq_len, None)

    def getLine(self):
        if self.bufline: #return previously buffered line
            r=self.bufline
            self.bufline=None
            return r
        else: #read a new line from stream and return it
            if self.eof: return None
            self.lastline=self.ifile.readline()
            if not self.lastline:
                self.eof=1
                return None
            return self.lastline

    def ungetLine(self):
        if self.lastline is None:
            print >> sys.stderr, "Warning: FastxReader called ungetLine() with no prior line!"
        self.bufline=self.lastline
        self.lastline=None
#< class FastxReader

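# Example (illustrative): reading the first record from a hypothetical FASTQ file --
#   reader = FastxReader(open("reads.fq"), is_color=0, fname="reads.fq")
#   seqid, seqstr, seq_len, qstr = reader.nextRecord()
# nextRecord is bound to nextFastq or nextFasta by the format auto-detection above.
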
class ZReader:
    def __init__(self, filename, sysparams, guess=True):
        self.fname=filename
        self.file=None
        self.fsrc=None
        self.popen=None
        pipecmd=[]
        s=filename.lower()
        if s.endswith(".bam"):
            pipecmd=["bam2fastx", "-"]
        else:
            if guess:
                if s.endswith(".z") or s.endswith(".gz") or s.endswith(".gzip"):
                    pipecmd=['gzip']
                else:
                    if s.endswith(".bz2") or s.endswith(".bzip2") or s.endswith(".bzip"):
                        pipecmd=['bzip2']
                if len(pipecmd)>0 and which(pipecmd[0]) is None:
                    die("Error: cannot find %s to decompress input file %s " % (pipecmd, filename))
                if len(pipecmd)>0:
                    if pipecmd[0]=='gzip' and sysparams.zipper.endswith('pigz'):
                        pipecmd[0]=sysparams.zipper
                        pipecmd.extend(sysparams.zipper_opts)
                    elif pipecmd[0]=='bzip2' and sysparams.zipper.endswith('pbzip2'):
                        pipecmd[0]=sysparams.zipper
                        pipecmd.extend(sysparams.zipper_opts)
            else: #not guessing, but we must still be sure it's a compressed file
                if use_zpacker and filename.endswith(".z"):
                    pipecmd=[sysparams.zipper]
                    pipecmd.extend(sysparams.zipper_opts)
            if pipecmd:
                pipecmd+=['-cd']
        if pipecmd:
            try:
                self.fsrc=open(self.fname, 'rb')
                self.popen=subprocess.Popen(pipecmd,
                                            preexec_fn=subprocess_setup,
                                            stdin=self.fsrc,
                                            stdout=subprocess.PIPE, close_fds=True)
            except Exception:
                die("Error: could not open pipe "+' '.join(pipecmd)+' < '+ self.fname)
            self.file=self.popen.stdout
        else:
            self.file=open(filename)

    def close(self):
        if self.fsrc: self.fsrc.close()
        self.file.close()
        if self.popen:
            self.popen.wait()
            self.popen=None

class ZWriter:
    def __init__(self, filename, sysparams):
        self.fname=filename
        if use_zpacker:
            pipecmd=[sysparams.zipper,"-cf", "-"]
            self.ftarget=open(filename, "wb")
            try:
                self.popen=subprocess.Popen(pipecmd,
                                            preexec_fn=subprocess_setup,
                                            stdin=subprocess.PIPE,
                                            stdout=self.ftarget, close_fds=True)
            except Exception:
                die("Error: could not open writer pipe "+' '.join(pipecmd)+' < '+ self.fname)
            self.file=self.popen.stdin # client writes to this end of the pipe
        else: #no compression
            self.file=open(filename, "w")
            self.ftarget=None
            self.popen=None

    def close(self):
        self.file.close()
        if self.ftarget: self.ftarget.close()
        if self.popen:
            self.popen.wait() #! required to actually flush the pipes (eek!)
            self.popen=None

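# Example (illustrative, hypothetical file name): when use_zpacker is on,
# ZWriter and ZReader form a transparent compressed round-trip --
#   zw = ZWriter("tmp/reads.fq.z", params.system_params)
#   zw.file.write("@r1\nACGT\n+\nIIII\n")
#   zw.close()
#   zr = ZReader("tmp/reads.fq.z", params.system_params)
#   reader = FastxReader(zr.file, 0, zr.fname)
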
# check_reads_format() examines the first few records in the user files
# to determine the file format
def check_reads_format(params, reads_files):
    #print >> sys.stderr, "[%s] Checking reads" % right_now()
    #
    seed_len = params.read_params.seed_length
    fileformat = params.read_params.reads_format

    observed_formats = set([])
    # observed_scales = set([])
    min_seed_len = 99999
    max_seed_len = 0
    files = reads_files.split(',')

    for f_name in files:
        #try:
        zf = ZReader(f_name, params.system_params)
        #except IOError:
        #    die("Error: could not open file "+f_name)
        freader=FastxReader(zf.file, params.read_params.color, zf.fname)
        toread=4 #just sample the first 4 reads
        while toread>0:
            seqid, seqstr, seq_len, qstr = freader.nextRecord()
            if not seqid: break
            toread-=1
            if seq_len < 20:
                print >> sys.stderr, "Warning: found a read < 20bp in", f_name
            else:
                min_seed_len = min(seq_len, min_seed_len)
                max_seed_len = max(seq_len, max_seed_len)
        zf.close()
        observed_formats.add(freader.format)
    if len(observed_formats) > 1:
        die("Error: TopHat requires all reads to be either FASTQ or FASTA. Mixing formats is not supported.")
    fileformat=list(observed_formats)[0]
    if seed_len != None:
        seed_len = max(seed_len, min_seed_len)
    else:
        seed_len = max_seed_len
    #print >> sys.stderr, "\tmin read length: %dbp, max read length: %dbp" % (min_seed_len, max_seed_len)
    print >> sys.stderr, "\tformat:\t\t %s" % fileformat
    if fileformat == "fastq":
        quality_scale = "phred33 (default)"
        if params.read_params.solexa_quals and not params.read_params.phred64_quals:
            quality_scale = "solexa33 (reads generated with GA pipeline version < 1.3)"
        elif params.read_params.phred64_quals:
            quality_scale = "phred64 (reads generated with GA pipeline version >= 1.3)"
        print >> sys.stderr, "\tquality scale:\t %s" % quality_scale
    elif fileformat == "fasta":
        if params.read_params.color:
            params.read_params.integer_quals = True

    #print seed_len, format, solexa_scale
    #NOTE: seed_len will be re-evaluated later by prep_reads
    return TopHatParams.ReadParams(params.read_params.solexa_quals,
                                   params.read_params.phred64_quals,
                                   params.read_params.quals,
                                   params.read_params.integer_quals,
                                   params.read_params.color,
                                   params.read_params.color_out,
                                   params.read_params.library_type,
                                   seed_len,
                                   fileformat,
                                   params.read_params.mate_inner_dist,
                                   params.read_params.mate_inner_dist_std_dev,
                                   params.read_params.read_group_id,
                                   params.read_params.sample_id,
                                   params.read_params.library_id,
                                   params.read_params.description,
                                   params.read_params.seq_platform_unit,
                                   params.read_params.seq_center,
                                   params.read_params.seq_run_date,
                                   params.read_params.seq_platform)

def log_tail(logfile, lines=1):
    f=open(logfile, "r")
    f.seek(0, 2)
    fbytes= f.tell()
    size=lines
    block=-1
    while size > 0 and fbytes+block*1024 > 0:
        if (fbytes+block*1024 > 0):
            ##Seek back once more, if possible
            f.seek( block*1024, 2 )
        else:
            #Seek to the beginning
            f.seek(0, 0)
        data= f.read( 1024 )
        linesFound= data.count('\n')
        size -= linesFound
        block -= 1
    if (fbytes + block*1024 > 0):
        f.seek(block*1024, 2)
    else:
        f.seek(0,0)
    f.readline() # find a newline
    lastBlocks= list( f.readlines() )
    f.close()
    return "\n".join(lastBlocks[-lines:])

# Format a DateTime as a pretty string.
# FIXME: Currently doesn't support days!
def formatTD(td):
    hours = td.seconds // 3600
    minutes = (td.seconds % 3600) // 60
    seconds = td.seconds % 60
    return '%02d:%02d:%02d' % (hours, minutes, seconds)

class PrepReadsInfo:
    def __init__(self, fname, side):
        try:
            f=open(fname,"r")
            self.min_len=int(f.readline().split("=")[-1])
            self.max_len=int(f.readline().split("=")[-1])
            self.in_count=int(f.readline().split("=")[-1])
            self.out_count=int(f.readline().split("=")[-1])
            if (self.out_count==0) or (self.max_len<16):
                raise Exception()
        except Exception, e:
            die(fail_str+"Error retrieving prep_reads info.")
        print >> sys.stderr, "\t%s reads: min. length=%s, count=%s" % (side, self.min_len, self.out_count)
        if self.min_len < 20:
            print >> sys.stderr, "Warning: short reads (<20bp) will make TopHat quite slow and use a large amount of memory because they are likely to map to too many places"

# Calls the prep_reads executable, which prepares an internal read library.
# The read library features reads with monotonically increasing integer IDs.
# prep_reads also filters out very low complexity or garbage reads as well as
# polyA reads.
#--> returns the tuple (internal_read_library_file, read_info)
def prep_reads(params, reads_list, quals_list, output_name, side="Left "):
    reads_suffix = ".fq"
    if use_zpacker: reads_suffix += ".z"
    kept_reads_filename = tmp_dir + output_name + reads_suffix

    if os.path.exists(kept_reads_filename):
        os.remove(kept_reads_filename)
    kept_reads = open(kept_reads_filename, "wb")
    log_fname=logging_dir + "prep_reads.log"
    filter_log = open(log_fname,"w")

    filter_cmd = [prog_path("prep_reads")]
    filter_cmd.extend(params.cmd())

    if params.read_params.reads_format == "fastq":
        filter_cmd += ["--fastq"]
    elif params.read_params.reads_format == "fasta":
        filter_cmd += ["--fasta"]
    info_file=output_dir+output_name+".info"
    filter_cmd += ["--aux-outfile="+info_file]
    filter_cmd.append(reads_list)
    if params.read_params.quals:
        filter_cmd.append(quals_list)
    shell_cmd = ' '.join(filter_cmd)
    #finally, add the compression pipe
    zip_cmd=[]
    if use_zpacker:
        zip_cmd=[ params.system_params.zipper ]
        zip_cmd.extend(params.system_params.zipper_opts)
        zip_cmd.extend(['-c','-'])
        shell_cmd +=' | '+' '.join(zip_cmd)
    shell_cmd += ' >' +kept_reads_filename
    retcode=0
    try:
        print >> run_log, shell_cmd
        if use_zpacker:
            filter_proc = subprocess.Popen(filter_cmd,
                                           stdout=subprocess.PIPE,
                                           stderr=filter_log)
            zip_proc=subprocess.Popen(zip_cmd,
                                      preexec_fn=subprocess_setup,
                                      stdin=filter_proc.stdout,
                                      stdout=kept_reads)
            filter_proc.stdout.close() #as per http://bugs.python.org/issue7678
            zip_proc.communicate()
            retcode=filter_proc.poll()
            if retcode==0:
                retcode=zip_proc.poll()
        else:
            retcode = subprocess.call(filter_cmd,
                                      stdout=kept_reads,
                                      stderr=filter_log)
        if retcode:
            die(fail_str+"Error running 'prep_reads'\n"+log_tail(log_fname))

    except OSError, o:
        errmsg=fail_str+str(o)
        die(errmsg+"\n"+log_tail(log_fname))
    kept_reads.close()

    return kept_reads_filename, PrepReadsInfo(info_file, side)

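# Example (illustrative, hypothetical file names): with gzip as the zpacker,
# the pipeline assembled above amounts to a shell command of this shape:
#   prep_reads --fastq --aux-outfile=tophat_out/left_kept_reads.info reads_1.fq \
#       | gzip -c - > tophat_out/tmp/left_kept_reads.fq.z
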
# Call bowtie
def bowtie(params,
           bwt_idx_prefix,
           reads_list,
           reads_format,
           num_mismatches,
           mapped_reads,
           unmapped_reads = None, # not needed when mapping against segment_juncs
           extra_output = ""):
    start_time = datetime.now()
    bwt_idx_name = bwt_idx_prefix.split('/')[-1]
    readfile_basename=getFileBaseName(reads_list)
    print >> sys.stderr, "[%s] Mapping %s against %s with Bowtie %s" % (start_time.strftime("%c"),
                          readfile_basename, bwt_idx_name, extra_output)

    bwt_log = open(logging_dir + 'bowtie.'+readfile_basename+'.fixmap.log', "w")
    unmapped_reads_out=None
    if unmapped_reads:
        unmapped_reads_out=unmapped_reads+".fq"
    use_FIFO = use_BWT_FIFO and use_zpacker and unmapped_reads and params.read_params.color
    if use_FIFO: #this should always be False unless color reads are used with the -X option
        global unmapped_reads_fifo
        unmapped_reads_fifo=unmapped_reads+".fifo"
        if os.path.exists(unmapped_reads_fifo):
            os.remove(unmapped_reads_fifo)
        try:
            os.mkfifo(unmapped_reads_fifo)
        except OSError, o:
            die(fail_str+"Error at mkfifo("+unmapped_reads_fifo+'). '+str(o))
        unmapped_reads_out+=".z"

    # Launch Bowtie
    try:
        bowtie_cmd = [bowtie_path]
        bam_input=0
        unzip_cmd=None
        if reads_list.endswith('.bam'):
            bam_input=1
            unzip_cmd=[ prog_path('bam2fastx') ]
            if reads_format:
                unzip_cmd.append("--"+reads_format)
            unzip_cmd+=[reads_list]

        if reads_format == "fastq":
            bowtie_cmd += ["-q"]
        elif reads_format == "fasta":
            bowtie_cmd += ["-f"]

        if params.read_params.color:
            bowtie_cmd += ["-C", "--col-keepends"]
            if unmapped_reads:
                bowtie_cmd += ["--un", unmapped_reads_fifo]
                          # "--max", "/dev/null"]
        if use_zpacker and (unzip_cmd is None):
            unzip_cmd=[ params.system_params.zipper ]
            unzip_cmd.extend(params.system_params.zipper_opts)
            unzip_cmd+=['-cd']

        fifo_pid=None
        if use_FIFO:
            unm_zipcmd=[ params.system_params.zipper ]
            unm_zipcmd.extend(params.system_params.zipper_opts)
            unm_zipcmd+=['-c']
            print >> run_log, ' '.join(unm_zipcmd)+' < '+ unmapped_reads_fifo + ' > '+ unmapped_reads_out + ' & '
            fifo_pid=os.fork()
            if fifo_pid==0:
                def on_sig_exit(sig, func=None):
                    os._exit(os.EX_OK)
                signal.signal(signal.SIGTERM, on_sig_exit)
                subprocess.call(unm_zipcmd,
                                stdin=open(unmapped_reads_fifo, "r"),
                                stdout=open(unmapped_reads_out, "wb"))
                os._exit(os.EX_OK)

        bowtie_cmd += [params.bowtie_alignment_option, str(num_mismatches),
                       "-p", str(params.system_params.num_cpus),
                       "-k", str(params.max_hits),
                       "-m", str(params.max_hits),
                       "--max", "/dev/null",
                       "-S", "--sam-nohead"] #always use headerless SAM file
        fix_map_cmd = [prog_path('fix_map_ordering')]
        sam_zipcmd=None
        if unmapped_reads:
            #mapping on the genome
            #fix_map_ordering will write the BAM file directly, and the unmapped reads as a BAM file too
            mapped_reads += ".bam"
            if params.read_params.color:
                fix_map_cmd += ["--sam-header", sam_header, "-", mapped_reads]
            else:
                unmapped_reads_out = unmapped_reads+".bam"
                fix_map_cmd += ["--sam-header", sam_header, "-", mapped_reads, unmapped_reads_out]
        else:
            #mapping on segment_juncs (spliced potential junctions)
            #don't use SAM headers, could be slow -- better to compress the SAM output directly
            mapped_reads += ".sam"
            fix_map_cmd += ["-"]
            if use_zpacker:
                fix_map_cmd += ["-"]
                sam_zipcmd=[ params.system_params.zipper ]
                sam_zipcmd.extend(params.system_params.zipper_opts)
                sam_zipcmd += ["-c"]
                mapped_reads += ".z"
            else:
                fix_map_cmd += [mapped_reads]

        bowtie_cmd+=[bwt_idx_prefix]
        bowtie_proc=None
        shellcmd=""
        unzip_proc=None
        mapzip_proc=None
        if unzip_cmd:
            # could be bam2fastx
            if bam_input:
                unzip_proc = subprocess.Popen(unzip_cmd, stdout=subprocess.PIPE)
                shellcmd=' '.join(unzip_cmd) + "|"
            else:
                unzip_proc = subprocess.Popen(unzip_cmd,
                                              stdin=open(reads_list, "rb"),
                                              stdout=subprocess.PIPE)
                shellcmd=' '.join(unzip_cmd) + "< " +reads_list +"|"
            bowtie_cmd += ['-']
            bowtie_proc = subprocess.Popen(bowtie_cmd,
                                           stdin=unzip_proc.stdout,
                                           stdout=subprocess.PIPE,
                                           stderr=bwt_log)
            unzip_proc.stdout.close() # see http://bugs.python.org/issue7678
        else:
            bowtie_cmd += [reads_list]
            bowtie_proc = subprocess.Popen(bowtie_cmd,
                                           stdout=subprocess.PIPE,
                                           stderr=bwt_log)
        shellcmd += " ".join(bowtie_cmd) + " | " + " ".join(fix_map_cmd)
        if sam_zipcmd:
            shellcmd += "|" + " ".join(sam_zipcmd)+" > " + mapped_reads
            fix_order_proc = subprocess.Popen(fix_map_cmd,
                                              stdin=bowtie_proc.stdout,
                                              stdout=subprocess.PIPE)
            bowtie_proc.stdout.close()
            mapzip_proc = subprocess.Popen(sam_zipcmd,
                                           stdin=fix_order_proc.stdout,
                                           stdout=open(mapped_reads, "wb"))
            fix_order_proc.stdout.close()
        else:
            fix_order_proc = subprocess.Popen(fix_map_cmd,
                                              stdin=bowtie_proc.stdout)
            bowtie_proc.stdout.close()

        print >> run_log, shellcmd
        if mapzip_proc:
            mapzip_proc.communicate()
        else:
            fix_order_proc.communicate()
        if use_FIFO:
            if fifo_pid and not os.path.exists(unmapped_reads_out):
                try:
                    os.kill(fifo_pid, signal.SIGTERM)
                except:
                    pass
    except OSError, o:
        die(fail_str+"Error: "+str(o))

    # Success
    #finish_time = datetime.now()
    #duration = finish_time - start_time
    #print >> sys.stderr, "\t\t\t[%s elapsed]" % formatTD(duration)
    if use_FIFO:
        try:
            os.remove(unmapped_reads_fifo)
        except:
            pass
    return (mapped_reads, unmapped_reads_out)

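# Example (illustrative, hypothetical file names): for an initial genome
# mapping of non-color FASTQ reads, the run log records a pipeline of
# roughly this shape:
#   bowtie -q -v 2 -p 4 -k 20 -m 20 --max /dev/null -S --sam-nohead genome_idx \
#       left_kept_reads.fq | fix_map_ordering --sam-header tmp/stub_header.sam \
#       - left_map.bam left_unmapped.bam
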
def bowtie_segment(params,
                   bwt_idx_prefix,
                   reads_list,
                   reads_format,
                   mapped_reads,
                   unmapped_reads = None,
                   extra_output = ""):

    backup_bowtie_alignment_option = params.bowtie_alignment_option
    params.bowtie_alignment_option = "-v"
    params.max_hits *= 2

    result = bowtie(params,
                    bwt_idx_prefix,
                    reads_list,
                    reads_format,
                    params.segment_mismatches,
                    mapped_reads,
                    unmapped_reads,
                    extra_output)

    params.bowtie_alignment_option = backup_bowtie_alignment_option
    params.max_hits /= 2
    return result

1617 # Retrieve a .juncs file from a GFF file by calling the gtf_juncs executable
1618 def get_gtf_juncs(gff_annotation):
1619 print >> sys.stderr, "[%s] Reading known junctions from GTF file" % (right_now())
1620 gtf_juncs_log = open(logging_dir + "gtf_juncs.log", "w")
1621
1622 gff_prefix = gff_annotation.split('/')[-1].split('.')[0]
1623
1624 gtf_juncs_out_name = tmp_dir + gff_prefix + ".juncs"
1625 gtf_juncs_out = open(gtf_juncs_out_name, "w")
1626
1627 gtf_juncs_cmd=[prog_path("gtf_juncs"), gff_annotation]
1628 try:
1629 print >> run_log, " ".join(gtf_juncs_cmd)
1630 retcode = subprocess.call(gtf_juncs_cmd,
1631 stderr=gtf_juncs_log,
1632 stdout=gtf_juncs_out)
1633 # gtf_juncs returned an error
1634 if retcode == 1:
1635 print >> sys.stderr, "\tWarning: TopHat did not find any junctions in GTF file"
1636 return (False, gtf_juncs_out_name)
1637 elif retcode != 0:
1638 die(fail_str+"Error: GTF junction extraction failed with err ="+str(retcode))
1639 # gtf_juncs not found
1640 except OSError, o:
1641 errmsg=fail_str+str(o)+"\n"
1642 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
1643 errmsg+="Error: gtf_juncs not found on this system"
1644 die(errmsg)
1645 return (True, gtf_juncs_out_name)
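# The logged command is simply (names illustrative):
#
#   gtf_juncs genes.gtf > <tmp_dir>/genes.juncs
#
# An exit status of 1 means "no junctions found" and is only a warning.
# The .juncs output is TopHat's raw junction format, one tab-separated
# "<chrom> <left> <right> <+/->" record per line (same format as
# user-supplied raw junction files).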
1646
1647 # Call bowtie-build on the FASTA file of synthetic splice junction sequences
1648 def build_juncs_bwt_index(external_splice_prefix, color):
1649 print >> sys.stderr, "[%s] Indexing splices" % (right_now())
1650 bowtie_build_log = open(logging_dir + "bowtie_build.log", "w")
1651
1652 #user_splices_out_prefix = output_dir + "user_splices_idx"
1653
1654 bowtie_build_cmd = [prog_path("bowtie-build")]
1655 if color:
1656 bowtie_build_cmd += ["-C"]
1657
1658 bowtie_build_cmd += [external_splice_prefix + ".fa",
1659 external_splice_prefix]
1660 try:
1661 print >> run_log, " ".join(bowtie_build_cmd)
1662 retcode = subprocess.call(bowtie_build_cmd,
1663 stdout=bowtie_build_log)
1664
1665 if retcode != 0:
1666 die(fail_str+"Error: Splice sequence indexing failed with err ="+ str(retcode))
1667 except OSError, o:
1668 errmsg=fail_str+str(o)+"\n"
1669 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
1670 errmsg+="Error: bowtie-build not found on this system"
1671 die(errmsg)
1672 return external_splice_prefix
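# Equivalent command line (the -C flag is added only for color-space runs):
#
#   bowtie-build [-C] <prefix>.fa <prefix>
#
# The returned prefix is then usable as an ordinary Bowtie index for
# mapping segments against the synthetic junction sequences.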
1673
1674 # Build a splice index from a .juncs file, suitable for use with specified read
1675 # (or read segment) lengths
1676 def build_juncs_index(min_anchor_length,
1677 read_length,
1678 juncs_prefix,
1679 external_juncs,
1680 external_insertions,
1681 external_deletions,
1682 reference_fasta,
1683 color):
1684 print >> sys.stderr, "[%s] Retrieving sequences for splices" % (right_now())
1685
1686 juncs_file_list = ",".join(external_juncs)
1687 insertions_file_list = ",".join(external_insertions)
1688 deletions_file_list = ",".join(external_deletions)
1689
1690
1691 juncs_db_log = open(logging_dir + "juncs_db.log", "w")
1692
1693 external_splices_out_prefix = tmp_dir + juncs_prefix
1694 external_splices_out_name = external_splices_out_prefix + ".fa"
1695
1696 external_splices_out = open(external_splices_out_name, "w")
1697 # juncs_db_cmd = [bin_dir + "juncs_db",
1698 juncs_db_cmd = [prog_path("juncs_db"),
1699 str(min_anchor_length),
1700 str(read_length),
1701 juncs_file_list,
1702 insertions_file_list,
1703 deletions_file_list,
1704 reference_fasta]
1705 try:
1706 print >> run_log, " ".join(juncs_db_cmd)
1707 retcode = subprocess.call(juncs_db_cmd,
1708 stderr=juncs_db_log,
1709 stdout=external_splices_out)
1710
1711 if retcode != 0:
1712 die(fail_str+"Error: Splice sequence retrieval failed with err ="+str(retcode))
1713 # juncs_db not found
1714 except OSError, o:
1715 errmsg=fail_str+str(o)+"\n"
1716 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
1717 errmsg+="Error: juncs_db not found on this system"
1718 die(errmsg)
1719
1720 external_splices_out_prefix = build_juncs_bwt_index(external_splices_out_prefix, color)
1721 return external_splices_out_prefix
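# End to end, splice-index construction amounts to (sketch):
#
#   juncs_db <min_anchor> <read_len> juncs1[,juncs2,...] ins1[,...] \
#            del1[,...] genome.fa > <tmp_dir>/<prefix>.fa
#   bowtie-build <tmp_dir>/<prefix>.fa <tmp_dir>/<prefix>
#
# juncs_db writes one synthetic FASTA record per junction/indel to
# stdout, which is captured into <prefix>.fa and indexed above.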
1722
1723 # Print out the SAM header, embedding the user's specified library properties.
1724 # FIXME: also needs SQ dictionary lines
1725 def write_sam_header(read_params, sam_file):
1726 print >> sam_file, "@HD\tVN:1.0\tSO:coordinate"
1727 if read_params.read_group_id and read_params.sample_id:
1728 rg_str = "@RG\tID:%s\tSM:%s" % (read_params.read_group_id,
1729 read_params.sample_id)
1730 if read_params.library_id:
1731 rg_str += "\tLB:%s" % read_params.library_id
1732 if read_params.description:
1733 rg_str += "\tDS:%s" % read_params.description
1734 if read_params.seq_platform_unit:
1735 rg_str += "\tPU:%s" % read_params.seq_platform_unit
1736 if read_params.seq_center:
1737 rg_str += "\tCN:%s" % read_params.seq_center
1738 if read_params.mate_inner_dist:
1739 rg_str += "\tPI:%s" % read_params.mate_inner_dist
1740 if read_params.seq_run_date:
1741 rg_str += "\tDT:%s" % read_params.seq_run_date
1742 if read_params.seq_platform:
1743 rg_str += "\tPL:%s" % read_params.seq_platform
1744
1745 print >> sam_file, rg_str
1746 print >> sam_file, "@PG\tID:TopHat\tVN:%s\tCL:%s" % (get_version(), run_cmd)
1747
1748 # Write final TopHat output, via tophat_reports and wiggles
1749 def compile_reports(params, sam_header_filename, left_maps, left_reads, right_maps, right_reads, gff_annotation):
1750 print >> sys.stderr, "[%s] Reporting output tracks" % right_now()
1751
1752 left_maps = [x for x in left_maps if (os.path.exists(x) and os.path.getsize(x) > 25)]
1753 left_maps = ','.join(left_maps)
1754
1755 if len(right_maps) > 0:
1756 right_maps = [x for x in right_maps if (os.path.exists(x) and os.path.getsize(x) > 25)]
1757 right_maps = ','.join(right_maps)
1758 log_fname = logging_dir + "reports.log"
1759 report_log = open(log_fname, "w")
1760 junctions = output_dir + "junctions.bed"
1761 insertions = output_dir + "insertions.bed"
1762 deletions = output_dir + "deletions.bed"
1763 coverage = "coverage.wig"
1764 accepted_hits = output_dir + "accepted_hits"
1765 report_cmdpath = prog_path("tophat_reports")
1766 report_cmd = [report_cmdpath]
1767 report_cmd.extend(params.cmd())
1768 report_cmd.extend(["--samtools="+samtools_path])
1769 report_cmd.extend([junctions,
1770 insertions,
1771 deletions,
1772 "-",
1773 left_maps,
1774 left_reads])
1775 if len(right_maps) > 0 and right_reads != None:
1776 report_cmd.append(right_maps)
1777 report_cmd.append(right_reads)
1778 # -- tophat_reports now produces (uncompressed) BAM stream,
1779 # directly piped into samtools sort
1780 try:
1781 if params.report_params.convert_bam:
1782 if params.report_params.sort_bam:
1783 report_proc=subprocess.Popen(report_cmd,
1784 preexec_fn=subprocess_setup,
1785 stdout=subprocess.PIPE,
1786 stderr=report_log)
1787
1788 bamsort_cmd = ["samtools", "sort", "-", accepted_hits]
1789 bamsort_proc= subprocess.Popen(bamsort_cmd,
1790 preexec_fn=subprocess_setup,
1791 stderr=open(logging_dir + "reports.samtools_sort.log", "w"),
1792 stdin=report_proc.stdout)
1793 report_proc.stdout.close()
1794 print >> run_log, " ".join(report_cmd)+" | " + " ".join(bamsort_cmd)
1795 bamsort_proc.communicate()
1796 retcode = report_proc.poll()
1797 if retcode:
1798 die(fail_str+"Error running tophat_reports\n"+log_tail(log_fname))
1799 else:
1800 print >> run_log, " ".join(report_cmd)
1801 report_proc=subprocess.call(report_cmd,
1802 preexec_fn=subprocess_setup,
1803 stdout=open(accepted_hits,"wb"),
1804 stderr=report_log)
1805 os.rename(accepted_hits, output_dir + "accepted_hits.bam")
1806 else:
1807 print >> run_log, " ".join(report_cmd)
1808 report_proc=subprocess.call(report_cmd,
1809 preexec_fn=subprocess_setup,
1810 stdout=open(accepted_hits,"wb"),
1811 stderr=report_log)
1812 tmp_sam = output_dir + "accepted_hits.sam"
1813
1814 bam_to_sam_cmd = ["samtools", "view", "-h", accepted_hits]
1815 print >> run_log, " ".join(bam_to_sam_cmd) + " > " + tmp_sam
1816 bam_to_sam_log = open(logging_dir + "accepted_hits_bam_to_sam.log", "w")
1817 tmp_sam_file = open(tmp_sam, "w")
1818 ret = subprocess.call(bam_to_sam_cmd,
1819 stdout=tmp_sam_file,
1820 stderr=bam_to_sam_log)
1821 tmp_sam_file.close()
1822 os.remove(accepted_hits)
1823 #print >> run_log, "mv %s %s" % (tmp_sam, output_dir + "accepted_hits.sam")
1824 #os.rename(tmp_sam, output_dir + "accepted_hits.sam")
1825
1826 except OSError, o:
1827 die(fail_str+"Error: "+str(o)+"\n"+log_tail(log_fname))
1828
1829 # FIXME: put wiggles back!
1830 # wig_cmd = [prog_path("wiggles"), output_dir + accepted_hits, output_dir + coverage]
1831 # print >> run_log, " ".join(wig_cmd)
1832 # subprocess.call(wig_cmd,
1833 # stderr=open("/dev/null"))
1834 return (coverage, junctions)
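# On the default path (convert_bam and sort_bam both true), the logged
# pipeline is, in sketch form:
#
#   tophat_reports <params> junctions.bed insertions.bed deletions.bed - \
#       left_maps left_reads [right_maps right_reads] \
#     | samtools sort - accepted_hits
#
# With the samtools 0.1.x sort syntax used here, the last argument is an
# output *prefix*, so the sorted result lands at
# <output_dir>/accepted_hits.bam.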
1835
1836
1837 # Split up each read in a FASTQ file into multiple segments. Creates a FASTQ file
1838 # for each segment. This function needs to be fixed to support mixed read-length
1839 # inputs.
1840
1841 def open_output_files(prefix, num_files_prev, num_files, out_segf, extension, params):
1842 i = num_files_prev + 1
1843 while i <= num_files:
1844 segfname=prefix+("_seg%d" % i)+extension
1845 out_segf.append(ZWriter(segfname,params.system_params))
1846 i += 1
1847
1848 def split_reads(reads_filename,
1849 prefix,
1850 fasta,
1851 params,
1852 segment_length):
1853 #reads_file = open(reads_filename)
1854 zreads = ZReader(reads_filename, params.system_params, False)
1855 out_segfiles = []
1856
1857 if fasta:
1858 extension = ".fa"
1859 else:
1860 extension = ".fq"
1861 if use_zpacker: extension += ".z"
1862 def convert_color_to_bp(color_seq):
1863 decode_dic = { 'A0':'A', 'A1':'C', 'A2':'G', 'A3':'T', 'A4':'N', 'A.':'N', 'AN':'N',
1864 'C0':'C', 'C1':'A', 'C2':'T', 'C3':'G', 'C4':'N', 'C.':'N', 'CN':'N',
1865 'G0':'G', 'G1':'T', 'G2':'A', 'G3':'C', 'G4':'N', 'G.':'N', 'GN':'N',
1866 'T0':'T', 'T1':'G', 'T2':'C', 'T3':'A', 'T4':'N', 'T.':'N', 'TN':'N',
1867 'N0':'N', 'N1':'N', 'N2':'N', 'N3':'N', 'N4':'N', 'N.':'N', 'NN':'N',
1868 '.0':'N', '.1':'N', '.2':'N', '.3':'N', '.4':'N', '..':'N', '.N':'N' }
1869
1870 base = color_seq[0]
1871 bp_seq = base
1872 for ch in color_seq[1:]:
1873 base = decode_dic[base+ch]
1874 bp_seq += base
1875 return bp_seq
1876
1877 def convert_bp_to_color(bp_seq):
1878 encode_dic = { 'AA':'0', 'CC':'0', 'GG':'0', 'TT':'0',
1879 'AC':'1', 'CA':'1', 'GT':'1', 'TG':'1',
1880 'AG':'2', 'CT':'2', 'GA':'2', 'TC':'2',
1881 'AT':'3', 'CG':'3', 'GC':'3', 'TA':'3',
1882 'A.':'4', 'C.':'4', 'G.':'4', 'T.':'4',
1883 '.A':'4', '.C':'4', '.G':'4', '.T':'4',
1884 '.N':'4', 'AN':'4', 'CN':'4', 'GN':'4',
1885 'TN':'4', 'NA':'4', 'NC':'4', 'NG':'4',
1886 'NT':'4', 'NN':'4', 'N.':'4', '..':'4' }
1887
1888 base = bp_seq[0]
1889 color_seq = base
1890 for ch in bp_seq[1:]:
1891 color_seq += encode_dic[base + ch]
1892 base = ch
1893
1894 return color_seq
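# Worked example of the two helpers above (the leading primer base is
# kept in front of the color calls):
#
#   convert_bp_to_color("ATGGC")  -> "A3103"
#     (A->T = 3, T->G = 1, G->G = 0, G->C = 3)
#   convert_color_to_bp("A3103")  -> "ATGGC"
#
# i.e. the two tables are inverses of each other except for N/'.' calls,
# which collapse to ambiguity codes in both directions.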
1895
1896 def split_record(read_name, read_seq, read_qual, out_segf, offsets, color):
1897 if color:
1898 color_offset = 1
1899 read_seq_temp = convert_color_to_bp(read_seq)
1900
1901 seg_num = 1
1902 while seg_num + 1 < len(offsets):
1903 if read_seq[offsets[seg_num]+1] not in ['0', '1', '2', '3']:
1904 return
1905 seg_num += 1
1906 else:
1907 color_offset = 0
1908
1909 seg_num = 0
1910 last_seq_offset = 0
1911 while seg_num + 1 < len(offsets):
1912 f = out_segf[seg_num].file
1913 seg_seq = read_seq[last_seq_offset+color_offset:offsets[seg_num + 1]+color_offset]
1914 print >> f, "%s|%d:%d:%d" % (read_name,last_seq_offset,seg_num, len(offsets) - 1)
1915 if color:
1916 print >> f, "%s%s" % (read_seq_temp[last_seq_offset], seg_seq)
1917 else:
1918 print >> f, seg_seq
1919 if not fasta:
1920 seg_qual = read_qual[last_seq_offset:offsets[seg_num + 1]]
1921 print >> f, "+"
1922 print >> f, seg_qual
1923 seg_num += 1
1924 last_seq_offset = offsets[seg_num]
1925
1926 line_state = 0
1927 read_name = ""
1928 read_seq = ""
1929 read_qual = ""
1930 num_segments = 0
1931 offsets = []
1932 for line in zreads.file:
1933 if line.strip() == "":
1934 continue
1935 if line_state == 0:
1936 read_name = line.strip()
1937 elif line_state == 1:
1938 read_seq = line.strip()
1939
1940 read_length = len(read_seq)
1941 tmp_num_segments = read_length / segment_length
1942 offsets = [segment_length * i for i in range(0, tmp_num_segments + 1)]
1943
1944 # Bowtie's minimum read length here is 20bp, so if the last segment
1945 # is between 20 and segment_length bp long, go ahead and write it out
1946 if read_length % segment_length >= 20:
1947 offsets.append(read_length)
1948 tmp_num_segments += 1
1949 else:
1950 offsets[-1] = read_length
1951
1952 if tmp_num_segments == 1:
1953 offsets = [0, read_length]
1954
1955 if tmp_num_segments > num_segments:
1956 open_output_files(prefix, num_segments, tmp_num_segments, out_segfiles, extension, params)
1957 num_segments = tmp_num_segments
1958
1959 if fasta:
1960 split_record(read_name, read_seq, None, out_segfiles, offsets, params.read_params.color)
1961 elif line_state == 2:
1962 line = line.strip()
1963 else:
1964 read_quals = line.strip()
1965 if not fasta:
1966 split_record(read_name, read_seq, read_quals, out_segfiles, offsets, params.read_params.color)
1967
1968 line_state += 1
1969 if fasta:
1970 line_state %= 2
1971 else:
1972 line_state %= 4
1973 zreads.close()
1974 out_fnames=[]
1975 for zf in out_segfiles:
1976 zf.close()
1977 out_fnames.append(zf.fname)
1978 #return [o.fname for o in out_segfiles]
1979 return out_fnames
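# Example of the splitting logic: a 75bp read with segment_length=25 gives
# offsets [0, 25, 50, 75] and three segment files; each segment gets
# "|<offset>:<seg_num>:<total_segs>" appended to the read name, e.g. for a
# (hypothetical) read @r1:
#
#   @r1|0:0:3    (seg_1, bases 0-24)
#   @r1|25:1:3   (seg_2, bases 25-49)
#   @r1|50:2:3   (seg_3, bases 50-74)
#
# A 70bp read leaves a 20bp remainder, which is long enough for Bowtie,
# so it becomes a third, shorter segment (offsets [0, 25, 50, 70]).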
1980
1981 # Find possible splice junctions using the "closure search" strategy, and report
1982 # them in closure.juncs. Calls the executable closure_juncs
1983 def junctions_from_closures(params,
1984 left_maps,
1985 right_maps,
1986 ref_fasta):
1987 #print >> sys.stderr, "[%s] " % right_now()
1988 print >> sys.stderr, "[%s] Searching for junctions via mate-pair closures" % right_now()
1989
1990 #maps = [x for x in seg_maps if (os.path.exists(x) and os.path.getsize(x) > 0)]
1991 #if len(maps) == 0:
1992 # return None
1993 #print >> sys.stderr, left_maps
1994 slash = left_maps[0].rfind('/')
1995 juncs_out = ""
1996 if slash != -1:
1997 juncs_out += left_maps[0][:slash+1]
1998 juncs_out += "closure.juncs"
1999
2000 juncs_log = open(logging_dir + "closure.log", "w")
2001 juncs_cmdpath=prog_path("closure_juncs")
2002 juncs_cmd = [juncs_cmdpath]
2003
2004 left_maps = ','.join(left_maps)
2005 right_maps = ','.join(right_maps)
2006
2007 juncs_cmd.extend(params.cmd())
2008 juncs_cmd.extend([juncs_out,
2009 ref_fasta,
2010 left_maps,
2011 right_maps])
2012 try:
2013 print >> run_log, ' '.join(juncs_cmd)
2014 retcode = subprocess.call(juncs_cmd,
2015 stderr=juncs_log)
2016
2017 # closure_juncs returned an error
2018 if retcode != 0:
2019 die(fail_str+"Error: closure-based junction search failed with err ="+str(retcode))
2020 # closure_juncs not found
2021 except OSError, o:
2022 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2023 print >> sys.stderr, fail_str, "Error: closure_juncs not found on this system"
2024 die(str(o))
2025 return [juncs_out]
2026
2027 # Find possible junctions by examining coverage and split segments in the initial
2028 # map and segment maps. Report junctions, insertions, and deletions in segment.juncs,
2029 # segment.insertions, and segment.deletions. Calls the executable
2030 # segment_juncs
2031 def junctions_from_segments(params,
2032 left_reads,
2033 left_reads_map,
2034 left_seg_maps,
2035 right_reads,
2036 right_reads_map,
2037 right_seg_maps,
2038 unmapped_reads,
2039 reads_format,
2040 ref_fasta):
2041 if left_reads_map != left_seg_maps[0]:
2042 print >> sys.stderr, "[%s] Searching for junctions via segment mapping" % right_now()
2043 out_path=getFileDir(left_seg_maps[0])
2044 juncs_out=out_path+"segment.juncs"
2045 insertions_out=out_path+"segment.insertions"
2046 deletions_out =out_path+"segment.deletions"
2047
2048 left_maps = ','.join(left_seg_maps)
2049 align_log = open(logging_dir + "segment_juncs.log", "w")
2050 align_cmd = [prog_path("segment_juncs")]
2051
2052 align_cmd.extend(params.cmd())
2053
2054 align_cmd.extend(["--ium-reads", ",".join(unmapped_reads),
2055 ref_fasta,
2056 juncs_out,
2057 insertions_out,
2058 deletions_out,
2059 left_reads,
2060 left_reads_map,
2061 left_maps])
2062 if right_seg_maps != None:
2063 right_maps = ','.join(right_seg_maps)
2064 align_cmd.extend([right_reads, right_reads_map, right_maps])
2065 try:
2066 print >> run_log, " ".join(align_cmd)
2067 retcode = subprocess.call(align_cmd,
2068 preexec_fn=subprocess_setup,
2069 stderr=align_log)
2070
2071 # segment_juncs returned an error
2072 if retcode != 0:
2073 die(fail_str+"Error: segment-based junction search failed with err ="+str(retcode))
2074 # segment_juncs not found
2075 except OSError, o:
2076 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2077 print >> sys.stderr, fail_str, "Error: segment_juncs not found on this system"
2078 die(str(o))
2079
2080 return [juncs_out, insertions_out, deletions_out]
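# The resulting command looks like this (sketch; plural arguments are
# comma-joined file lists, and the map files may be BAM):
#
#   segment_juncs <params> --ium-reads unmapped_1[,unmapped_2,...] \
#       genome.fa segment.juncs segment.insertions segment.deletions \
#       left_reads left_reads_map left_seg1[,left_seg2,...] \
#       [right_reads right_reads_map right_seg1[,...]]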
2081
2082 # Joins mapped segments into full-length read alignments via the executable
2083 # long_spanning_reads
2084 def join_mapped_segments(params,
2085 sam_header_filename,
2086 reads,
2087 ref_fasta,
2088 possible_juncs,
2089 possible_insertions,
2090 possible_deletions,
2091 contig_seg_maps,
2092 spliced_seg_maps,
2093 alignments_out_name):
2094 if len(contig_seg_maps)>1:
2095 print >> sys.stderr, "[%s] Joining segment hits" % right_now()
2096 else:
2097 print >> sys.stderr, "[%s] Processing bowtie hits" % right_now()
2098 contig_seg_maps = ','.join(contig_seg_maps)
2099
2100 possible_juncs = ','.join(possible_juncs)
2101 possible_insertions = ",".join(possible_insertions)
2102 possible_deletions = ",".join(possible_deletions)
2103
2104 align_log = open(logging_dir + "long_spanning_reads.log", "w")
2105 align_cmd = [prog_path("long_spanning_reads")]
2106
2107 align_cmd.extend(params.cmd())
2108
2109 align_cmd.append(ref_fasta)
2110 align_cmd.extend([ reads,
2111 possible_juncs,
2112 possible_insertions,
2113 possible_deletions,
2114 contig_seg_maps])
2115
2116 if spliced_seg_maps != None:
2117 spliced_seg_maps = ','.join(spliced_seg_maps)
2118 align_cmd.append(spliced_seg_maps)
2119
2120 try:
2121 print >> run_log, " ".join(align_cmd),">",alignments_out_name
2122 join_proc=subprocess.Popen(align_cmd,
2123 preexec_fn=subprocess_setup,
2124 stdout=open(alignments_out_name, "wb"),
2125 stderr=align_log)
2126 join_proc.communicate()
2127 except OSError, o:
2128 die(fail_str+"Error: "+str(o))
2129
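# Sketch of the logged command:
#
#   long_spanning_reads <params> genome.fa reads \
#       juncs1[,...] ins1[,...] del1[,...] \
#       seg_map1[,seg_map2,...] [spliced_seg_map1[,...]] > alignments_out
#
# Callers substitute os.devnull for any empty junction/insertion/deletion
# list, so the positional argument layout never changes.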
2130
2131 # This class collects spliced and unspliced alignments for each of the
2132 # left and right read files provided by the user.
2133 class Maps:
2134 def __init__(self,
2135 # unspliced_bwt,
2136 unspliced_sam,
2137 seg_maps,
2138 unmapped_segs,
2139 segs):
2140 #self.unspliced_bwt = unspliced_bwt
2141 self.unspliced_sam = unspliced_sam
2142 self.seg_maps = seg_maps
2143 self.unmapped_segs = unmapped_segs
2144 self.segs = segs
2145
2146 # The main alignment routine of TopHat. This function executes most of the
2147 # workflow, producing a set of candidate alignments for each cDNA fragment in a
2148 # pair of SAM alignment files (for paired-end reads).
2149 def spliced_alignment(params,
2150 bwt_idx_prefix,
2151 sam_header_filename,
2152 ref_fasta,
2153 read_len,
2154 segment_len,
2155 left_reads,
2156 right_reads,
2157 user_supplied_junctions,
2158 user_supplied_insertions,
2159 user_supplied_deletions):
2160
2161 possible_juncs = []
2162 possible_juncs.extend(user_supplied_junctions)
2163
2164 possible_insertions = []
2165 possible_insertions.extend(user_supplied_insertions)
2166
2167 possible_deletions = []
2168 possible_deletions.extend(user_supplied_deletions)
2169
2170 maps = { left_reads : [], right_reads : [] }
2171 #single_segments = False
2172
2173 # Perform the first part of the TopHat workflow on the left and right
2174 # reads of a paired-end run separately - we'll use the pairing information later
2175 for reads in [left_reads, right_reads]:
2176 if reads == None or os.path.getsize(reads)<25 :
2177 continue
2178 fbasename=getFileBaseName(reads)
2179 unspliced_out = tmp_dir + fbasename + ".mapped"
2180 unmapped_unspliced = tmp_dir + fbasename + "_unmapped"
2181
2182 num_segs = read_len / segment_len
2183 if read_len % segment_len >= min(segment_len -2, 20):
2184 num_segs += 1
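# e.g. read_len=75, segment_len=25 -> num_segs = 3 (no remainder);
# read_len=70 -> 70/25 = 2 plus a 20bp remainder, and since
# 20 >= min(25-2, 20) the remainder counts as a third segment;
# read_len=40 -> 40/25 = 1 with a 15bp remainder, too short, so
# num_segs stays 1 and the read is mapped as a single segment.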
2185
2186 # daehwan - check this out
2187 if num_segs <= 1:
2188 print >> sys.stderr, "Warning: you have only one segment per read\n\twe strongly recommend that you decrease --segment-length to about half the read length because TopHat will work better with multiple segments"
2189
2190 # Perform the initial Bowtie mapping of the full length reads
2191 (unspliced_sam, unmapped_reads) = bowtie(params,
2192 bwt_idx_prefix,
2193 reads,
2194 "fastq",
2195 params.initial_read_mismatches,
2196 unspliced_out,
2197 unmapped_unspliced)
2198
2199 # Convert the initial Bowtie maps into BAM format.
2200 # skip this step as fix_map_ordering does it now
2201 #join_mapped_segments(params,
2202 # sam_header_filename,
2203 # reads,
2204 # ref_fasta,
2205 # ["/dev/null"],
2206 # ["/dev/null"],
2207 # ["/dev/null"],
2208 # [unspliced_map],
2209 # [],
2210 # unspliced_sam)
2211 # Using the num_segs value returned by check_reads(), decide which
2212 # junction discovery strategy to use
2213 if num_segs == 1:
2214 segment_len = read_len
2215 elif num_segs >= 3:
2216 # if we have at least three segments, just use split segment search,
2217 # which is the most sensitive and specific, fastest, and lightest-weight.
2218 if not params.closure_search:
2219 params.closure_search = False
2220 if not params.coverage_search:
2221 params.coverage_search = False
2222 if not params.butterfly_search:
2223 params.butterfly_search = False
2224 if not params.closure_search:
2225 params.closure_search = False
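# Note on the idiom above: closure_search/coverage_search/butterfly_search
# start out unset (None) when the user gave no explicit option, so
# "if not X: X = False" promotes an unset value to an explicit False
# while leaving a user-requested True untouched.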
2226 seg_maps = []
2227 unmapped_segs = []
2228 segs = []
2229 if num_segs > 1:
2230 # split up the IUM reads into segments
2231 # unmapped_reads can be in BAM format
2232 read_segments = split_reads(unmapped_reads,
2233 tmp_dir + fbasename,
2234 False,
2235 params,
2236 segment_len)
2237
2238 # Map each segment file independently with Bowtie
2239 for i in range(len(read_segments)):
2240 seg = read_segments[i]
2241 fbasename=getFileBaseName(seg)
2242 seg_out = tmp_dir + fbasename + ".mapped"
2243 #if use_zpacker: seg_out += ".z"
2244 unmapped_seg = tmp_dir + fbasename + "_unmapped"
2245 extra_output = "(%d/%d)" % (i+1, len(read_segments))
2246 (seg_map, unmapped) = bowtie_segment(params,
2247 bwt_idx_prefix,
2248 seg,
2249 "fastq",
2250 seg_out,
2251 unmapped_seg,
2252 extra_output)
2253 seg_maps.append(seg_map)
2254 unmapped_segs.append(unmapped)
2255 segs.append(seg)
2256
2257 # Collect the segment maps for left and right reads together
2258 #maps[reads] = Maps(unspliced_map, unspliced_sam, seg_maps, unmapped_segs, segs)
2259 maps[reads] = Maps(unspliced_sam, seg_maps, unmapped_segs, segs)
2260 else:
2261 # if there's only one segment, just collect the initial map as the only
2262 # map to be used downstream for coverage-based junction discovery
2263 read_segments = [reads]
2264 #maps[reads] = Maps(unspliced_map, unspliced_sam, [unspliced_map], [unmapped_reads], [unmapped_reads])
2265 maps[reads] = Maps(unspliced_sam, [unspliced_sam], [unmapped_reads], [unmapped_reads])
2266
2267 # Unless the user asked not to discover new junctions, start that process
2268 # here
2269
2270 if params.find_novel_juncs or params.find_novel_indels:
2271 #left_reads_map = maps[left_reads].unspliced_bwt
2272 left_reads_map = maps[left_reads].unspliced_sam
2273 left_seg_maps = maps[left_reads].seg_maps
2274 unmapped_reads = maps[left_reads].unmapped_segs
2275 if right_reads != None:
2276 #right_reads_map = maps[right_reads].unspliced_bwt
2277 right_reads_map = maps[right_reads].unspliced_sam
2278 right_seg_maps = maps[right_reads].seg_maps
2279 unmapped_reads.extend(maps[right_reads].unmapped_segs)
2280 else:
2281 right_reads_map = None
2282 right_seg_maps = None
2283
2284 # Call segment_juncs to infer a list of possible splice junctions from
2285 # the regions of the genome covered in the initial and segment maps
2286 juncs = junctions_from_segments(params,
2287 left_reads,
2288 left_reads_map,
2289 left_seg_maps,
2290 right_reads,
2291 right_reads_map,
2292 right_seg_maps,
2293 unmapped_reads,
2294 "fastq",
2295 ref_fasta)
2296 if params.find_novel_juncs:
2297 if os.path.getsize(juncs[0]) != 0:
2298 possible_juncs.append(juncs[0])
2299 if params.find_novel_indels:
2300 if os.path.getsize(juncs[1]) != 0:
2301 possible_insertions.append(juncs[1])
2302 if os.path.getsize(juncs[2]) != 0:
2303 possible_deletions.append(juncs[2])
2304 # Optionally, and for paired reads only, use a closure search to
2305 # discover additional junctions
2306 if params.find_novel_juncs and params.closure_search and left_reads != None and right_reads != None:
2307 juncs = junctions_from_closures(params,
2308 # [maps[left_reads].unspliced_bwt, maps[left_reads].seg_maps[-1]],
2309 # [maps[right_reads].unspliced_bwt, maps[right_reads].seg_maps[-1]],
2310 [maps[left_reads].unspliced_sam, maps[left_reads].seg_maps[-1]],
2311 [maps[right_reads].unspliced_sam, maps[right_reads].seg_maps[-1]],
2312 ref_fasta)
2313 if os.path.getsize(juncs[0]) != 0:
2314 possible_juncs.extend(juncs)
2315
2316 if len(possible_insertions) == 0 and len(possible_deletions) == 0 and len(possible_juncs) == 0:
2317 spliced_seg_maps = None
2318 junc_idx_prefix = None
2319 else:
2320 junc_idx_prefix = "segment_juncs"
2321 if len(possible_insertions) == 0:
2322 possible_insertions.append(os.devnull)
2323 # print >> sys.stderr, "Warning: insertions database is empty!"
2324 if len(possible_deletions) == 0:
2325 possible_deletions.append(os.devnull)
2326 # print >> sys.stderr, "Warning: deletions database is empty!"
2327 if len(possible_juncs) == 0:
2328 possible_juncs.append(os.devnull)
2329 print >> sys.stderr, "Warning: junction database is empty!"
2330 if junc_idx_prefix != None:
2331 build_juncs_index(3,
2332 segment_len,
2333 junc_idx_prefix,
2334 possible_juncs,
2335 possible_insertions,
2336 possible_deletions,
2337 ref_fasta,
2338 params.read_params.color)
2339
2340 # Now map read segments (or whole IUM reads, if num_segs == 1) to the splice
2341 # index with Bowtie
2342 for reads in [left_reads, right_reads]:
2343 spliced_seg_maps = []
2344 if reads == None or reads not in maps:
2345 continue
2346
2347 if junc_idx_prefix != None:
2348 i = 0
2349 for seg in maps[reads].segs:
2350 fsegname = getFileBaseName(seg)
2351 seg_out = tmp_dir + fsegname + ".to_spliced"
2352 #if use_zpacker: seg_out += ".z"
2353 extra_output = "(%d/%d)" % (i+1, len(maps[reads].segs))
2354 (seg_map, unmapped) = bowtie_segment(params,
2355 tmp_dir + junc_idx_prefix,
2356 seg,
2357 "fastq",
2358 seg_out,
2359 None,
2360 extra_output)
2361 spliced_seg_maps.append(seg_map)
2362 i += 1
2363
2364 # Join the contiguous and spliced segment hits into full-length read
2365 # alignments or convert a spliced segment hit into a genomic-coordinate hit
2366 rfname = getFileBaseName(reads)
2367 rfdir = getFileDir(reads)
2368 mapped_reads = rfdir + rfname + ".candidates.bam"
2369 merged_map =rfdir + rfname + ".candidates_and_unspl.bam"
2370 # unspl_bwtfile = maps[reads].unspliced_bwt
2371 unspl_samfile = maps[reads].unspliced_sam
2372
2373 join_mapped_segments(params,
2374 sam_header_filename,
2375 reads,
2376 ref_fasta,
2377 possible_juncs,
2378 possible_insertions,
2379 possible_deletions,
2380 maps[reads].seg_maps,
2381 spliced_seg_maps,
2382 mapped_reads)
2383
2384 if num_segs > 1:
2385 # Merge the spliced and unspliced full length alignments into a
2386 # single SAM file.
2387 # The individual SAM files are all already sorted in
2388 # increasing read ID order.
2389 # NOTE: We also should be able to address bug #134 here, by replacing
2390 # contiguous alignments that poke into an intron by a small amount with
2391 # the correct spliced alignment.
2392
2393 try:
2394 merge_cmd = [ prog_path("bam_merge"), merged_map,
2395 maps[reads].unspliced_sam,
2396 mapped_reads ]
2397 print >> run_log, " ".join(merge_cmd)
2398 ret = subprocess.call( merge_cmd,
2399 stderr=open(logging_dir + "sam_merge.log", "w") )
2400 if ret != 0:
2401 die(fail_str+"Error executing: "+" ".join(merge_cmd))
2402 except OSError, o:
2403 die(fail_str+"Error: "+str(o))
2404 maps[reads] = [merged_map]
2405
2406 if not params.system_params.keep_tmp:
2407 os.remove(mapped_reads)
2408 os.remove(unspl_samfile)
2409 #os.remove(unspl_bwtfile)
2410 else:
2411 os.rename(mapped_reads, merged_map)
2412 maps[reads] = [merged_map]
2413 if not params.system_params.keep_tmp:
2414 os.remove(unspl_samfile)
2415 #os.remove(unspl_bwtfile)
2416
2417 return maps
2418
2419 def die(msg=None):
2420 if msg is not None:
2421 print >> sys.stderr, msg
2422 sys.exit(1)
2423
2424 # rough equivalent of the 'which' command to find external programs
2425 # (current script path is tested first, then PATH envvar)
2426 def which(program):
2427 def is_executable(fpath):
2428 return os.path.isfile(fpath) and os.access(fpath, os.X_OK)
2429 fpath, fname = os.path.split(program)
2430 if fpath:
2431 if is_executable(program):
2432 return program
2433 else:
2434 progpath = os.path.join(bin_dir, program)
2435 if is_executable(progpath):
2436 return progpath
2437 for path in os.environ["PATH"].split(os.pathsep):
2438 progpath = os.path.join(path, program)
2439 if is_executable(progpath):
2440 return progpath
2441 return None
2442
2443 def prog_path(program):
2444 progpath=which(program)
2445 if progpath == None:
2446 die("Error locating program: "+program)
2447 return progpath
2448
2449 # FIXME: this should get set during the make dist autotools phase of the build
2450 def get_version():
2451 return "1.3.2"
2452
2453 def mlog(msg):
2454 print >> sys.stderr, "[DBGLOG]:"+msg
2455
2456 def test_input_file(filename):
2457 try:
2458 test_file = open(filename, "r")
2459 # input file could not be opened
2460 except IOError, o:
2461 die("Error: cannot open file %s" % filename)
2462 return
2463
2464
2465 def main(argv=None):
2466 warnings.filterwarnings("ignore", "tmpnam is a potential security risk")
2467
2468 # Initialize default parameter values
2469 params = TopHatParams()
2470
2471 try:
2472 if argv is None:
2473 argv = sys.argv
2474 args = params.parse_options(argv)
2475 params.check()
2476
2477 bwt_idx_prefix = args[0]
2478 left_reads_list = args[1]
2479 left_quals_list, right_quals_list = [], []
2480 if (not params.read_params.quals and len(args) > 2) or (params.read_params.quals and len(args) > 3):
2481 if params.read_params.mate_inner_dist == None:
2482 params.read_params.mate_inner_dist = 50
2483 #die("Error: you must set the mean inner distance between mates with -r")
2484
2485 right_reads_list = args[2]
2486 if params.read_params.quals:
2487 left_quals_list = args[3]
2488 right_quals_list = args[4]
2489 else:
2490 right_reads_list = None
2491 if params.read_params.quals:
2492 left_quals_list = args[2]
2493
2494 print >> sys.stderr
2495 print >> sys.stderr, "[%s] Beginning TopHat run (v%s)" % (right_now(), get_version())
2496 print >> sys.stderr, "-----------------------------------------------"
2497
2498 start_time = datetime.now()
2499 prepare_output_dir()
2500
2501 global run_log
2502 run_log = open(logging_dir + "run.log", "w", 0)
2503 global run_cmd
2504 run_cmd = " ".join(argv)
2505 print >> run_log, run_cmd
2506
2507 # Validate all the input files, check all prereqs before committing
2508 # to the run
2509 (ref_fasta, ref_seq_dict) = check_index(bwt_idx_prefix)
2510
2511 check_bowtie()
2512 check_samtools()
2513 print >> sys.stderr, "[%s] Generating SAM header for %s" % (right_now(), bwt_idx_prefix)
2514 sam_header_filename = get_index_sam_header(params.read_params, bwt_idx_prefix)
2515 print >> sys.stderr, "[%s] Preparing reads" % (right_now())
2516
2517 if not params.skip_check_reads:
2518 reads_list = left_reads_list
2519 if right_reads_list:
2520 reads_list = reads_list + "," + right_reads_list
2521 params.read_params = check_reads_format(params, reads_list)
2522
2523 user_supplied_juncs = []
2524 user_supplied_insertions = []
2525 user_supplied_deletions = []
2526
2527 if params.gff_annotation and params.find_GFF_juncs:
2528 test_input_file(params.gff_annotation)
2529 (found_juncs, gtf_juncs) = get_gtf_juncs(params.gff_annotation)
2530 if found_juncs:
2531 user_supplied_juncs.append(gtf_juncs)
2532 if params.raw_junctions:
2533 test_input_file(params.raw_junctions)
2534 user_supplied_juncs.append(params.raw_junctions)
2535
2536 if params.raw_insertions:
2537 test_input_file(params.raw_insertions)
2538 user_supplied_insertions.append(params.raw_insertions)
2539
2540 if params.raw_deletions:
2541 test_input_file(params.raw_deletions)
2542 user_supplied_deletions.append(params.raw_deletions)
2543
2544 global unmapped_reads_fifo
2545 unmapped_reads_fifo = tmp_dir + str(os.getpid())+".bwt_unmapped.z.fifo"
2546
2547 # Now start the time consuming stuff
2548 left_kept_reads, left_reads_info = prep_reads(params,
2549 left_reads_list,
2550 left_quals_list,
2551 "left_kept_reads", "Left ")
2552 min_read_len=left_reads_info.min_len
2553 max_read_len=left_reads_info.max_len
2554 if right_reads_list != None:
2555 right_kept_reads, right_reads_info = prep_reads(params,
2556 right_reads_list,
2557 right_quals_list,
2558 "right_kept_reads", "Right")
2559 min_read_len=min(right_reads_info.min_len, min_read_len)
2560 max_read_len=max(right_reads_info.max_len, max_read_len)
2561 else:
2562 right_kept_reads = None
2563 seed_len=params.read_params.seed_length
2564 if seed_len != None:
2565 seed_len = max(seed_len, min_read_len)
2566 else:
2567 seed_len = max_read_len
2568 params.read_params.seed_length=seed_len
2569 # turn off integer-quals
2570 if params.read_params.integer_quals:
2571 params.read_params.integer_quals = False
2572
2573 mapping = spliced_alignment(params,
2574 bwt_idx_prefix,
2575 sam_header_filename,
2576 ref_fasta,
2577 params.read_params.seed_length,
2578 params.segment_length,
2579 left_kept_reads,
2580 right_kept_reads,
2581 user_supplied_juncs,
2582 user_supplied_insertions,
2583 user_supplied_deletions)
2584
2585 left_maps = mapping[left_kept_reads]
2586 #left_unmapped_reads = mapping[1]
2587 right_maps = mapping[right_kept_reads]
2588 #right_unmapped_reads = mapping[3]
2589
2590 compile_reports(params,
2591 sam_header_filename,
2592 left_maps,
2593 left_kept_reads,
2594 right_maps,
2595 right_kept_reads,
2596 params.gff_annotation)
2597
2598 if not params.system_params.keep_tmp:
2599 for m in left_maps:
2600 os.remove(m)
2601 if left_kept_reads != None:
2602 os.remove(left_kept_reads)
2603 for m in right_maps:
2604 os.remove(m)
2605 if right_kept_reads != None:
2606 os.remove(right_kept_reads)
2607 tmp_files = os.listdir(tmp_dir)
2608 for t in tmp_files:
2609 os.remove(tmp_dir+t)
2610 os.rmdir(tmp_dir)
2611
2612 finish_time = datetime.now()
2613 duration = finish_time - start_time
2614 print >> sys.stderr,"-----------------------------------------------"
2615 print >> sys.stderr, "Run complete [%s elapsed]" % formatTD(duration)
2616
2617 except Usage, err:
2618 print >> sys.stderr, sys.argv[0].split("/")[-1] + ": " + str(err.msg)
2619 print >> sys.stderr, " for detailed help see http://tophat.cbcb.umd.edu/manual.html"
2620 return 2
2621
2622
2623 if __name__ == "__main__":
2624 sys.exit(main())
