File: root/gclib/tophat_cpp/tophat
Revision: 263
Committed: Mon Jul 30 21:38:58 2012 UTC by gpertea
File size: 168081 bytes
Log Message:
resuming works

Line File contents
1 #!/usr/bin/env python
2
3 # encoding: utf-8
4 """
5 tophat.py
6
7 Created by Cole Trapnell on 2008-12-25.
8 Copyright (c) 2008 Cole Trapnell. All rights reserved.
9 Updated and maintained by Daehwan Kim and Geo Pertea since Jul 2010.
10 """
11 import sys
12 try:
13 import psyco
14 psyco.full()
15 except ImportError:
16 pass
17
18 import getopt
19 import subprocess
20 import errno
21 import os
22 import warnings
23 import re
24 import glob
25 import signal
26 from datetime import datetime, date, time
27 from shutil import copy
28 import logging
29
30 use_message = '''
31 TopHat maps short sequences from spliced transcripts to whole genomes.
32
33 Usage:
34 tophat [options] <bowtie_index> <reads1[,reads2,...]> [reads1[,reads2,...]] \\
35 [quals1[,quals2,...]] [quals1[,quals2,...]]
36
37 Options:
38 -v/--version
39 -o/--output-dir <string> [ default: ./tophat_out ]
40 --bowtie1 [ default: bowtie2 ]
41 -N/--read-mismatches <int> [ default: 2 ]
42 --read-gap-length <int> [ default: 2 ]
43 --read-edit-dist <int> [ default: 2 ]
44 --read-realign-edit-dist <int> [ default: 2 ]
45 -a/--min-anchor <int> [ default: 8 ]
46 -m/--splice-mismatches <0-2> [ default: 0 ]
47 -i/--min-intron-length <int> [ default: 50 ]
48 -I/--max-intron-length <int> [ default: 500000 ]
49 -g/--max-multihits <int> [ default: 20 ]
50 --suppress-hits
51 -x/--transcriptome-max-hits <int> [ default: 60 ]
52 -M/--prefilter-multihits ( for -G/--GTF option, enable
53 an initial bowtie search
54 against the genome )
55 --max-insertion-length <int> [ default: 3 ]
56 --max-deletion-length <int> [ default: 3 ]
57 --solexa-quals
58 --solexa1.3-quals (same as phred64-quals)
59 --phred64-quals (same as solexa1.3-quals)
60 -Q/--quals
61 --integer-quals
62 -C/--color (Solid - color space)
63 --color-out
64 --library-type <string> (fr-unstranded, fr-firststrand,
65 fr-secondstrand)
66 -p/--num-threads <int> [ default: 1 ]
67 -R/--resume <out_dir> ( try to resume execution )
68 -G/--GTF <filename> (GTF/GFF with known transcripts)
69 --transcriptome-index <bwtidx> (transcriptome bowtie index)
70 -T/--transcriptome-only (map only to the transcriptome)
71 -j/--raw-juncs <filename>
72 --insertions <filename>
73 --deletions <filename>
74 -r/--mate-inner-dist <int>
75 --mate-std-dev <int> [ default: 20 ]
76 --no-novel-juncs
77 --no-novel-indels
78 --no-gtf-juncs
79 --no-coverage-search
80 --coverage-search
81 --microexon-search
82 --keep-tmp
83 --tmp-dir <dirname> [ default: <output_dir>/tmp ]
84 -z/--zpacker <program> [ default: gzip ]
85 -X/--unmapped-fifo [use mkfifo to compress more temporary
86 files for color space reads]
87
88 Advanced Options:
89 --report-secondary-alignments
90 --no-discordant
91 --no-mixed
92
93 --segment-mismatches <int> [ default: 2 ]
94 --segment-length <int> [ default: 25 ]
95
96 --bowtie-n [ default: bowtie -v ]
97 --min-coverage-intron <int> [ default: 50 ]
98 --max-coverage-intron <int> [ default: 20000 ]
99 --min-segment-intron <int> [ default: 50 ]
100 --max-segment-intron <int> [ default: 500000 ]
101 --no-sort-bam (Output BAM is not coordinate-sorted)
102 --no-convert-bam (Do not output bam format.
103 Output is <output_dir>/accepted_hit.sam)
104 --keep-fasta-order
105 --allow-partial-mapping
106
107 Bowtie2 related options:
108 Preset options in --end-to-end mode (local alignment is not used in TopHat2)
109 --b2-very-fast
110 --b2-fast
111 --b2-sensitive
112 --b2-very-sensitive
113
114 Alignment options
115 --b2-N <int> [ default: 0 ]
116 --b2-L <int> [ default: 20 ]
117 --b2-i <func> [ default: S,1,1.25 ]
118 --b2-n-ceil <func> [ default: L,0,0.15 ]
119 --b2-gbar <int> [ default: 4 ]
120
121 Scoring options
122 --b2-mp <int>,<int> [ default: 6,2 ]
123 --b2-np <int> [ default: 1 ]
124 --b2-rdg <int>,<int> [ default: 5,3 ]
125 --b2-rfg <int>,<int> [ default: 5,3 ]
126 --b2-score-min <func> [ default: L,-0.6,-0.6 ]
127
128 Effort options
129 --b2-D <int> [ default: 15 ]
130 --b2-R <int> [ default: 2 ]
131
132 Fusion related options:
133 --fusion-search
134 --fusion-anchor-length <int> [ default: 20 ]
135 --fusion-min-dist <int> [ default: 10000000 ]
136 --fusion-read-mismatches <int> [ default: 2 ]
137 --fusion-multireads <int> [ default: 2 ]
138 --fusion-multipairs <int> [ default: 2 ]
139 --fusion-ignore-chromosomes <list> [ e.g., <chrM,chrX> ]
140
141 --fusion-do-not-resolve-conflicts [ for test purposes only ]
142
143 SAM Header Options (for embedding sequencing run metadata in output):
144 --rg-id <string> (read group ID)
145 --rg-sample <string> (sample ID)
146 --rg-library <string> (library ID)
147 --rg-description <string> (descriptive string, no tabs allowed)
148 --rg-platform-unit <string> (e.g., Illumina lane ID)
149 --rg-center <string> (sequencing center name)
150 --rg-date <string> (ISO 8601 date of the sequencing run)
151 --rg-platform <string> (Sequencing platform descriptor)
152 '''
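# Illustrative invocation (editor's sketch, hypothetical file names):
#   tophat -p 8 -G genes.gtf -o tophat_out genome_idx reads_1.fq reads_2.fq
# maps a paired-end FASTQ sample against the Bowtie index "genome_idx" using
# 8 threads and the known transcripts in genes.gtf.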
153
154 # Deprecated:
155 # --min-closure-exon <int> [ default: 100 ]
156 # --min-closure-intron <int> [ default: 50 ]
157 # --max-closure-intron <int> [ default: 5000 ]
158 # --no-closure-search
159 # --closure-search
160 # --butterfly-search
161 # --no-butterfly-search
162 # -F/--min-isoform-fraction <float> [ default: 0.15 ]
163
164 class Usage(Exception):
165 def __init__(self, msg):
166 self.msg = msg
167
168 output_dir = "./tophat_out/"
169 logging_dir = output_dir + "logs/"
170 run_log = None
171 tophat_log = None #main log file handle
172 tophat_logger = None # main logging object
173 run_cmd = None
174 tmp_dir = output_dir + "tmp/"
175 bin_dir = sys.path[0] + "/"
176 use_zpacker = False # this is set by -z/--zpacker option (-z0 leaves it False)
177
178 use_BAM_Unmapped = False # automatically set to True for non-Solid reads, handles unmapped reads in BAM format
179
180 use_BWT_FIFO = False # can only be set to True if use_zpacker is True and only with -C/--color
181 # enabled by -X/-unmapped-fifo option (unless -z0)
182 unmapped_reads_fifo = None # if use_BWT_FIFO is True, this tricks bowtie into writing the
183 # unmapped reads into a compressed file
184
185 samtools_path = None
186 bowtie_path = None
187 fail_str = "\t[FAILED]\n"
188 gtf_juncs = None #file name with junctions extracted from given GFF file
189
190 #mapping types:
191
192 _reads_vs_G, _reads_vs_T, _segs_vs_G, _segs_vs_J = range(1,5)
193
194 # execution resuming stages (for now, execution can be resumed only for stages
195 # after the pre-filter and transcriptome searches):
196 _stage_map_segments, _stage_find_juncs, _stage_juncs_db, _stage_map2juncs, _stage_tophat_reports = range(1,6)
197 stageNames = ["start", "map_segments", "find_juncs", "juncs_db", "map2juncs", "tophat_reports"]
198 # 0 1 2 3 4 5
199 runStages = dict([(stageNames[st], st) for st in range(0, 6)])
200 currentStage = 0
201 resumeStage = 0
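# --- Illustrative sketch (editor's addition, not part of TopHat itself):
# how a resume tag line written to run.log, e.g. "#>juncs_db:", maps back to
# a stage number via the tables above ("re" is imported at the top of the file).
def _example_parse_resume_tag(line):
    m = re.match(r"^#>(\w+):$", line.rstrip())
    if m and m.group(1) in runStages:
        return runStages[m.group(1)]  # e.g. "#>juncs_db:" -> 3
    return None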
202
203 def doResume(odir):
204 #must return the original list of arguments
205 rlog = odir+"/logs/run.log"
206 oldargv = None
207 try:
208 flog=open(rlog)
209 #first line must be the actual tophat command used
210 thcmd=None
211 try:
212 thcmd = next(flog)
213 except StopIteration as e:
214 die("Error: cannot resume, run.log is empty.")
215 oldargv=thcmd.split()
216 mlog("Tophat command to resume: "+thcmd)
217 resume_tag = None
218 for line in flog:
219 #scan for last resume code, if any
220 r=re.match(r"^#>(\w+):$", line)
221 if r:
222 resume_tag=r.group(1)
223 mlog("resume_tag found: "+resume_tag)
224 global resumeStage
225 if resume_tag:
226 if resume_tag in runStages:
227 resumeStage = runStages[resume_tag]
228 th_log("Resuming TopHat run from stage: " + resume_tag)
229 else:
230 die("Error: unrecognized run stage '"+resume_tag+"'")
231 else:
232 die("Error: resuming requested but no valid stage found in run.log")
233 flog.close()
234 except IOError as e:
235 die("Error: cannot resume, failed to open "+rlog)
236 return oldargv
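# Illustrative run.log layout that doResume() expects (hypothetical paths):
#   /usr/local/bin/tophat -p 8 -o tophat_out genome_idx reads_1.fq reads_2.fq
#   #>map_segments:
#   #>find_juncs:
# The first line is replayed as the original argv; the last "#>name:" tag wins,
# so this sketch would resume from the find_juncs stage.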
237
238 def setRunStage(stnum):
239 global currentStage
240 print >> run_log, "#>"+stageNames[stnum]+":"
241 currentStage = stnum
242
243 def init_logger(log_fname):
244 global tophat_logger
245 tophat_logger = logging.getLogger('project')
246 formatter = logging.Formatter('%(asctime)s %(message)s', '[%Y-%m-%d %H:%M:%S]')
247 # level = logging.__dict__.get(options.loglevel.upper(),logging.DEBUG)
248 tophat_logger.setLevel(logging.DEBUG)
249
250 hstream = logging.StreamHandler(sys.stderr)
251 hstream.setFormatter(formatter)
252 tophat_logger.addHandler(hstream)
253 #
254 # Output logging information to file
255 if os.path.isfile(log_fname):
256 os.remove(log_fname)
257 global tophat_log
258 logfh = logging.FileHandler(log_fname)
259 logfh.setFormatter(formatter)
260 tophat_logger.addHandler(logfh)
261 tophat_log=logfh.stream
262
263 # TopHatParams captures all of the runtime parameters used by TopHat; many
264 # of these are passed as command-line options to executables run by the pipeline.
265
266 # This class and its nested classes also do options parsing through parse_options()
267 # and option validation via the member function check()
268
269 class BowtieFltFiles:
270 def __init__(self,
271 seqfiles=None, qualfiles=None,
272 mappings=None,
273 unmapped_reads=None,
274 multihit_reads=None):
275 self.seqfiles=seqfiles
276 self.qualfiles=qualfiles
277 self.mappings=mappings
278 self.unmapped_reads=unmapped_reads
279 self.multihit_reads=multihit_reads
280
281 class TopHatParams:
282
283 # SpliceConstraints is a group of runtime parameters that specify what
284 # constraints to put on junctions discovered by the program. These constraints
285 # are used to filter out spurious/false positive junctions.
286
287 class SpliceConstraints:
288 def __init__(self,
289 min_anchor_length,
290 min_intron_length,
291 max_intron_length,
292 splice_mismatches,
293 min_isoform_fraction):
294 self.min_anchor_length = min_anchor_length
295 self.min_intron_length = min_intron_length
296 self.max_intron_length = max_intron_length
297 self.splice_mismatches = splice_mismatches
298 self.min_isoform_fraction = min_isoform_fraction
299
300 def parse_options(self, opts):
301 for option, value in opts:
302 if option in ("-m", "--splice-mismatches"):
303 self.splice_mismatches = int(value)
304 elif option in ("-a", "--min-anchor"):
305 self.min_anchor_length = int(value)
306 elif option in ("-F", "--min-isoform-fraction"):
307 self.min_isoform_fraction = float(value)
308 elif option in ("-i", "--min-intron-length"):
309 self.min_intron_length = int(value)
310 elif option in ("-I", "--max-intron-length"):
311 self.max_intron_length = int(value)
312
313 def check(self):
314 if self.splice_mismatches not in [0,1,2]:
315 die("Error: arg to --splice-mismatches must be 0, 1, or 2")
316 if self.min_anchor_length < 4:
317 die("Error: arg to --min-anchor-len must be greater than 4")
318 if self.min_isoform_fraction < 0.0 or self.min_isoform_fraction > 1.0:
319 die("Error: arg to --min-isoform-fraction must be between 0.0 and 1.0")
320 if self.min_intron_length <= 0:
321 die("Error: arg to --min-intron-length must be greater than 0")
322 if self.max_intron_length <= 0:
323 die("Error: arg to --max-intron-length must be greater than 0")
324
325 # SystemParams is a group of runtime parameters that determine how to handle
326 # temporary files produced during a run and how many threads to use for threaded
327 # stages of the pipeline (e.g. Bowtie)
328
329 class SystemParams:
330 def __init__(self,
331 num_threads,
332 keep_tmp):
333 self.num_threads = num_threads
334 self.keep_tmp = keep_tmp
335 self.zipper = "gzip"
336 self.zipper_opts= []
337
338 def parse_options(self, opts):
339 global use_zpacker
340 global use_BWT_FIFO
341 for option, value in opts:
342 if option in ("-p", "--num-threads"):
343 self.num_threads = int(value)
344 elif option == "--keep-tmp":
345 self.keep_tmp = True
346 elif option in ("-z","--zpacker"):
347 if value.lower() in ["-", " ", ".", "0", "none", "f", "false", "no"]:
348 value=""
349 self.zipper = value
350 #if not self.zipper:
351 # self.zipper='gzip'
352 elif option in ("-X", "--unmapped-fifo"):
353 use_BWT_FIFO=True
354 if self.zipper:
355 use_zpacker=True
356 if self.num_threads>1 and not self.zipper_opts:
357 if self.zipper.endswith('pbzip2') or self.zipper.endswith('pigz'):
358 self.zipper_opts.append('-p'+str(self.num_threads))
359 else:
360 use_zpacker=False
361 if use_BWT_FIFO: use_BWT_FIFO=False
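# e.g. (illustrative) invoked with "-z pigz -p 4": self.zipper == "pigz" and
# zipper_opts == ["-p4"], so (de)compression pipes run as "pigz -p4 ...".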
362 def cmd(self):
363 cmdline=[]
364 if self.zipper:
365 cmdline.extend(['-z',self.zipper])
366 if self.num_threads>1:
367 cmdline.extend(['-p'+str(self.num_threads)])
368 return cmdline
369
370 def check(self):
371 if self.num_threads<1 :
372 die("Error: arg to --num-threads must be greater than 0")
373 if self.zipper:
374 xzip=which(self.zipper)
375 if not xzip:
376 die("Error: cannot find compression program "+self.zipper)
377
378 # ReadParams is a group of runtime parameters that specify various properties
379 # of the user's reads (e.g. which quality scale they are on, how long the
380 # fragments are, etc).
381 class ReadParams:
382 def __init__(self,
383 solexa_quals,
384 phred64_quals,
385 quals,
386 integer_quals,
387 color,
388 library_type,
389 seed_length,
390 reads_format,
391 mate_inner_dist,
392 mate_inner_dist_std_dev,
393 read_group_id,
394 sample_id,
395 library_id,
396 description,
397 seq_platform_unit,
398 seq_center,
399 seq_run_date,
400 seq_platform):
401 self.solexa_quals = solexa_quals
402 self.phred64_quals = phred64_quals
403 self.quals = quals
404 self.integer_quals = integer_quals
405 self.color = color
406 self.library_type = library_type
407 self.seed_length = seed_length
408 self.reads_format = reads_format
409 self.mate_inner_dist = mate_inner_dist
410 self.mate_inner_dist_std_dev = mate_inner_dist_std_dev
411 self.read_group_id = read_group_id
412 self.sample_id = sample_id
413 self.library_id = library_id
414 self.description = description
415 self.seq_platform_unit = seq_platform_unit
416 self.seq_center = seq_center
417 self.seq_run_date = seq_run_date
418 self.seq_platform = seq_platform
419
420 def parse_options(self, opts):
421 for option, value in opts:
422 if option == "--solexa-quals":
423 self.solexa_quals = True
424 elif option in ("--solexa1.3-quals", "--phred64-quals"):
425 self.phred64_quals = True
426 elif option in ("-Q", "--quals"):
427 self.quals = True
428 elif option == "--integer-quals":
429 self.integer_quals = True
430 elif option in ("-C", "--color"):
431 self.color = True
432 elif option == "--library-type":
433 self.library_type = value
434 elif option in ("-s", "--seed-length"):
435 self.seed_length = int(value)
436 elif option in ("-r", "--mate-inner-dist"):
437 self.mate_inner_dist = int(value)
438 elif option == "--mate-std-dev":
439 self.mate_inner_dist_std_dev = int(value)
440 elif option == "--rg-id":
441 self.read_group_id = value
442 elif option == "--rg-sample":
443 self.sample_id = value
444 elif option == "--rg-library":
445 self.library_id = value
446 elif option == "--rg-description":
447 self.description = value
448 elif option == "--rg-platform-unit":
449 self.seq_platform_unit = value
450 elif option == "--rg-center":
451 self.seq_center = value
452 elif option == "--rg-date":
453 self.seq_run_date = value
454 elif option == "--rg-platform":
455 self.seq_platform = value
456
457 def check(self):
458 if self.seed_length and self.seed_length < 20:
459 die("Error: arg to --seed-length must be at least 20")
460
461 if self.mate_inner_dist_std_dev != None and self.mate_inner_dist_std_dev < 0:
462 die("Error: arg to --mate-std-dev must at least 0")
463 if (not self.read_group_id and self.sample_id) or (self.read_group_id and not self.sample_id):
464 die("Error: --rg-id and --rg-sample must be specified or omitted together")
465
466 # SearchParams is a group of runtime parameters that specify how TopHat will
467 # search for splice junctions
468
469 class SearchParams:
470 def __init__(self,
471 min_closure_exon,
472 min_closure_intron,
473 max_closure_intron,
474 min_coverage_intron,
475 max_coverage_intron,
476 min_segment_intron,
477 max_segment_intron):
478
479 self.min_closure_exon_length = min_closure_exon
480 self.min_closure_intron_length = min_closure_intron
481 self.max_closure_intron_length = max_closure_intron
482 self.min_coverage_intron_length = min_coverage_intron
483 self.max_coverage_intron_length = max_coverage_intron
484 self.min_segment_intron_length = min_segment_intron
485 self.max_segment_intron_length = max_segment_intron
486
487 def parse_options(self, opts):
488 for option, value in opts:
489 if option == "--min-closure-exon":
490 self.min_closure_exon_length = int(value)
491 if option == "--min-closure-intron":
492 self.min_closure_intron_length = int(value)
493 if option == "--max-closure-intron":
494 self.max_closure_intron_length = int(value)
495 if option == "--min-coverage-intron":
496 self.min_coverage_intron_length = int(value)
497 if option == "--max-coverage-intron":
498 self.max_coverage_intron_length = int(value)
499 if option == "--min-segment-intron":
500 self.min_segment_intron_length = int(value)
501 if option == "--max-segment-intron":
502 self.max_segment_intron_length = int(value)
503
504 def check(self):
505 if self.min_closure_exon_length < 0:
506 die("Error: arg to --min-closure-exon must be at least 0")
507 if self.min_closure_intron_length < 0:
508 die("Error: arg to --min-closure-intron must be at least 0")
509 if self.max_closure_intron_length < 0:
510 die("Error: arg to --max-closure-intron must be at least 0")
511 if self.min_coverage_intron_length < 0:
512 die("Error: arg to --min-coverage-intron must be at least 0")
513 if self.max_coverage_intron_length < 0:
514 die("Error: arg to --max-coverage-intron must be at least 0")
515 if self.min_segment_intron_length < 0:
516 die("Error: arg to --min-segment-intron must be at least 0")
517 if self.max_segment_intron_length < 0:
518 die("Error: arg to --max-segment-intron must be at least 0")
519
520 class ReportParams:
521 def __init__(self):
522 self.sort_bam = True
523 self.convert_bam = True
524
525 def parse_options(self, opts):
526 for option, value in opts:
527 if option == "--no-sort-bam":
528 self.sort_bam = False
529 if option == "--no-convert-bam":
530 self.convert_bam = False
531
532 class Bowtie2Params:
533 def __init__(self):
534 self.very_fast = False
535 self.fast = False
536 self.sensitive = False
537 self.very_sensitive = False
538
539 self.N = 0
540 self.L = 20
541 self.i = "S,1,1.25"
542 self.n_ceil = "L,0,0.15"
543 self.gbar = 4
544
545 self.mp = "6,2"
546 self.np = 1
547 self.rdg = "5,3"
548 self.rfg = "5,3"
549 # self.score_min = "L,-0.6,-0.6"
550 self.score_min = None
551
552 self.D = 15
553 self.R = 2
554
555 def parse_options(self, opts):
556 for option, value in opts:
557 if option == "--b2-very-fast":
558 self.very_fast = True
559 if option == "--b2-fast":
560 self.fast = True
561 if option == "--b2-sensitive":
562 self.sensitive = True
563 if option == "--b2-very-sensitive":
564 self.very_sensitive = True
565
566 if option == "--b2-N":
567 self.N = int(value)
568 if option == "--b2-L":
569 self.L = int(value)
570 if option == "--b2-i":
571 self.i = value
572 if option == "--b2-n-ceil":
573 self.n_ceil = value
574 if option == "--b2-gbar":
575 self.gbar = int(value)
576
577 if option == "--b2-mp":
578 self.mp = value
579 if option == "--b2-np":
580 self.np = int(value)
581 if option == "--b2-rdg":
582 self.rdg = value
583 if option == "--b2-rfg":
584 self.rfg = value
585 if option == "--b2-score-min":
586 self.score_min = value
587
588 if option == "--b2-D":
589 self.D = int(value)
590 if option == "--b2-R":
591 self.R = int(value)
592
593 def check(self):
594 more_than_once = False
595 if self.very_fast:
596 if self.fast or self.sensitive or self.very_sensitive:
597 more_than_once = True
598 else:
599 if self.fast:
600 if self.sensitive or self.very_sensitive:
601 more_than_once = True
602 else:
603 if self.sensitive and self.very_sensitive:
604 more_than_once = True
605
606 if more_than_once:
607 die("Error: use only one of --b2-very-fast, --b2-fast, --b2-sensitive, --b2-very-sensitive")
608
609 if self.N not in [0, 1]:
610 die("Error: arg to --b2-N must be either 0 or 1")
611
612 function_re = r'^[CLSG],-?\d+(\.\d+)?,-?\d+(\.\d+)?$'
613 function_match = re.search(function_re, self.i)
614
615 if not function_match:
616 die("Error: arg to --b2-i must be <func> (e.g. --b2-i S,1,1.25)")
617
618 function_match = re.search(function_re, self.n_ceil)
619 if not function_match:
620 die("Error: arg to --b2-n-ceil must be <func> (e.g. --b2-n-ceil L,0,0.15)")
621
622 if self.score_min:
623 function_match = re.search(function_re, self.score_min)
624 if not function_match:
625 die("Error: arg to --b2-score-min must be <func> (e.g. --b2-score-min L,-0.6,-0.6)")
626
627 pair_re = r'^\d+,\d+$'
628 pair_match = re.search(pair_re, self.mp)
629 if not pair_match:
630 die("Error: arg to --b2-mp must be <int>,<int> (e.g. --b2-mp 6,2)")
631
632 pair_match = re.search(pair_re, self.rdg)
633 if not pair_match:
634 die("Error: arg to --b2-rdg must be <int>,<int> (e.g. --b2-mp 5,3)")
635
636 pair_match = re.search(pair_re, self.rfg)
637 if not pair_match:
638 die("Error: arg to --b2-rfg must be <int>,<int> (e.g. --b2-mp 5,3)")
639
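# Illustrative (editor's note): strings accepted by function_re above, using
# Bowtie2's function notation "T,a,b" where f(x) = a + b * g(x):
#   "S,1,1.25"    -> match    (g = sqrt)
#   "L,-0.6,-0.6" -> match    (g = linear)
#   "L,0.6"       -> no match (both a constant and a coefficient are required)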
640
641 def __init__(self):
642 self.splice_constraints = self.SpliceConstraints(8, # min_anchor
643 50, # min_intron
644 500000, # max_intron
645 0, # splice_mismatches
646 0.15) # min_isoform_frac
647
648 self.preflt_data = [ BowtieFltFiles(), BowtieFltFiles() ]
649 self.sam_header = None
650 self.read_params = self.ReadParams(False, # solexa_scale
651 False,
652 False, # quals
653 None, # integer quals
654 False, # SOLiD - color space
655 "", # library type (e.g. "illumina-stranded-pair-end")
656 None, # seed_length
657 "fastq", # quality_format
658 None, # mate inner distance
659 20, # mate inner dist std dev
660 None, # read group id
661 None, # sample id
662 None, # library id
663 None, # description
664 None, # platform unit (i.e. lane)
665 None, # sequencing center
666 None, # run date
667 None) # sequencing platform
668
669 self.system_params = self.SystemParams(1, # bowtie_threads (num_threads)
670 False) # keep_tmp
671
672 self.search_params = self.SearchParams(100, # min_closure_exon_length
673 50, # min_closure_intron_length
674 5000, # max_closure_intron_length
675 50, # min_coverage_intron_length
676 20000, # max_coverage_intron_length
677 50, # min_segment_intron_length
678 500000) # max_segment_intron_length
679
680 self.report_params = self.ReportParams()
681
682 self.bowtie2_params = self.Bowtie2Params()
683
684 self.bowtie2 = True
685 self.gff_annotation = None
686 self.transcriptome_only = False
687 self.transcriptome_index = None
688 self.transcriptome_outdir = None
689 self.raw_junctions = None
690 self.resume_dir = None
691 self.find_novel_juncs = True
692 self.find_novel_indels = True
693 self.find_novel_fusions = True
694 self.find_GFF_juncs = True
695 self.max_hits = 20
696 self.suppress_hits = False
697 self.t_max_hits = 60
698 self.max_seg_hits = 40
699 self.prefilter_multi = False
700 self.read_mismatches = 2
701 self.read_gap_length = 2
702 self.read_edit_dist = 2
703 self.read_realign_edit_dist = None
704 self.segment_length = 25
705 self.segment_mismatches = 2
706 self.bowtie_alignment_option = "-v"
707 self.max_insertion_length = 3
708 self.max_deletion_length = 3
709 self.raw_insertions = None
710 self.raw_deletions = None
711 self.coverage_search = None
712 self.closure_search = False
713 #self.butterfly_search = None
714 self.butterfly_search = False
715 self.microexon_search = False
716 self.report_secondary_alignments = False
717 self.report_discordant_pair_alignments = True
718 self.report_mixed_alignments = True
719
720 # experimental -W option to activate score and edit distance filtering
721 # in fix_map_ordering (hits post processing)
722 self.b2scoreflt = False
723
724 self.keep_fasta_order = False
725 self.partial_mapping = False
726
727 self.fusion_search = False
728 self.fusion_anchor_length = 20
729 self.fusion_min_dist = 10000000
730 self.fusion_read_mismatches = 2
731 self.fusion_multireads = 2
732 self.fusion_multipairs = 2
733 self.fusion_ignore_chromosomes = []
734 self.fusion_do_not_resolve_conflicts = False
735
736 def check(self):
737 self.splice_constraints.check()
738 self.read_params.check()
739 self.system_params.check()
740 if self.segment_length < 10:
741 die("Error: arg to --segment-length must at least 10")
742 if self.segment_mismatches < 0 or self.segment_mismatches > 3:
743 die("Error: arg to --segment-mismatches must in [0, 3]")
744 if self.read_params.color:
745 if self.bowtie2:
746 th_log("Warning: bowtie2 in colorspace is not supported; --bowtie1 option assumed.")
747 self.bowtie2=False
748 if self.fusion_search:
749 die("Error: fusion-search in colorspace is not yet supported")
750 if self.butterfly_search:
751 die("Error: butterfly-search in colorspace is not yet supported")
752
753 self.bowtie2_params.check()
754
755 if self.bowtie2 and self.fusion_search:
756 th_logp("\tWarning: --fusion-search with Bowtie2 may not work well: it may require a large amount of memory and produce many spurious fusions. Please try the --bowtie1 option if this does not work.")
757
758 library_types = ["fr-unstranded", "fr-firststrand", "fr-secondstrand"]
759
760 if self.read_params.library_type and self.read_params.library_type not in library_types:
761 die("Error: library-type should be one of: "+', '.join(library_types))
762
763 self.search_params.max_closure_intron_length = min(self.splice_constraints.max_intron_length,
764 self.search_params.max_closure_intron_length)
765
766 self.search_params.max_segment_intron_length = min(self.splice_constraints.max_intron_length,
767 self.search_params.max_segment_intron_length)
768
769 self.search_params.max_coverage_intron_length = min(self.splice_constraints.max_intron_length,
770 self.search_params.max_coverage_intron_length)
771
772 if self.max_insertion_length >= self.segment_length:
773 die("Error: the max insertion length ("+str(self.max_insertion_length)+") cannot be equal to or greater than the segment length ("+str(self.segment_length)+")")
774
775 if self.max_insertion_length < 0:
776 die("Error: the max insertion length ("+str(self.max_insertion_length)+") cannot be less than 0")
777
778 if self.max_deletion_length >= self.splice_constraints.min_intron_length:
779 die("Error: the max deletion length ("+str(self.max_deletion_length)+") cannot be equal to or greater than the min intron length ("+str(self.splice_constraints.min_intron_length)+")")
780
781 if self.max_deletion_length < 0:
782 die("Error: the max deletion length ("+str(self.max_deletion_length)+") cannot be less than 0")
783
784 if self.read_mismatches > self.read_edit_dist or self.read_gap_length > self.read_edit_dist:
785 die("Error: the read mismatches (" + str(self.read_mismatches) + ") and the read gap length (" + str(self.read_edit_dist) + ") should be less than or equal to the read edit dist (" + str(self.read_edit_dist) + ")\n" + \
786 "Either decrease --read-mismatches or --read-gap-length, or increase --read-edit-dist")
787
788 def cmd(self):
789 cmd = ["--min-anchor", str(self.splice_constraints.min_anchor_length),
790 "--splice-mismatches", str(self.splice_constraints.splice_mismatches),
791 "--min-report-intron", str(self.splice_constraints.min_intron_length),
792 "--max-report-intron", str(self.splice_constraints.max_intron_length),
793 "--min-isoform-fraction", str(self.splice_constraints.min_isoform_fraction),
794 "--output-dir", output_dir,
795 "--max-multihits", str(self.max_hits),
796 "--max-seg-multihits", str(self.max_seg_hits),
797 "--segment-length", str(self.segment_length),
798 "--segment-mismatches", str(self.segment_mismatches),
799 "--min-closure-exon", str(self.search_params.min_closure_exon_length),
800 "--min-closure-intron", str(self.search_params.min_closure_intron_length),
801 "--max-closure-intron", str(self.search_params.max_closure_intron_length),
802 "--min-coverage-intron", str(self.search_params.min_coverage_intron_length),
803 "--max-coverage-intron", str(self.search_params.max_coverage_intron_length),
804 "--min-segment-intron", str(self.search_params.min_segment_intron_length),
805 "--max-segment-intron", str(self.search_params.max_segment_intron_length),
806 "--read-mismatches", str(self.read_mismatches),
807 "--read-gap-length", str(self.read_gap_length),
808 "--read-edit-dist", str(self.read_edit_dist),
809 "--read-realign-edit-dist", str(self.read_realign_edit_dist),
810 "--max-insertion-length", str(self.max_insertion_length),
811 "--max-deletion-length", str(self.max_deletion_length)]
812
813 if self.suppress_hits:
814 cmd.extend(["--suppress-hits"])
815
816 if not self.bowtie2:
817 cmd.extend(["--bowtie1"])
818
819 if self.fusion_search:
820 cmd.extend(["--fusion-search",
821 "--fusion-anchor-length", str(self.fusion_anchor_length),
822 "--fusion-min-dist", str(self.fusion_min_dist),
823 "--fusion-read-mismatches", str(self.fusion_read_mismatches),
824 "--fusion-multireads", str(self.fusion_multireads),
825 "--fusion-multipairs", str(self.fusion_multipairs)])
826
827 if self.fusion_ignore_chromosomes:
828 cmd.extend(["--fusion-ignore-chromosomes", ",".join(self.fusion_ignore_chromosomes)])
829
830 if self.fusion_do_not_resolve_conflicts:
831 cmd.extend(["--fusion-do-not-resolve-conflicts"])
832
833 cmd.extend(self.system_params.cmd())
834
835 if self.read_params.mate_inner_dist != None:
836 cmd.extend(["--inner-dist-mean", str(self.read_params.mate_inner_dist),
837 "--inner-dist-std-dev", str(self.read_params.mate_inner_dist_std_dev)])
838 if self.gff_annotation != None:
839 cmd.extend(["--gtf-annotations", str(self.gff_annotation)])
840 if gtf_juncs:
841 cmd.extend(["--gtf-juncs", gtf_juncs])
842 if self.closure_search == False:
843 cmd.append("--no-closure-search")
844 if not self.coverage_search:
845 cmd.append("--no-coverage-search")
846 if not self.microexon_search:
847 cmd.append("--no-microexon-search")
848 if self.butterfly_search:
849 cmd.append("--butterfly-search")
850 if self.read_params.solexa_quals:
851 cmd.append("--solexa-quals")
852 if self.read_params.quals:
853 cmd.append("--quals")
854 if self.read_params.integer_quals:
855 cmd.append("--integer-quals")
856 if self.read_params.color:
857 cmd.append("--color")
858 if self.read_params.library_type:
859 cmd.extend(["--library-type", self.read_params.library_type])
860 if self.read_params.read_group_id:
861 cmd.extend(["--rg-id", self.read_params.read_group_id])
862 if self.read_params.phred64_quals:
863 cmd.append("--phred64-quals")
864 return cmd
865
866 # This is the master options parsing routine, which calls parse_options for
867 # the delegate classes (e.g. SpliceConstraints) that handle certain groups
868 # of options.
869 def parse_options(self, argv):
870 try:
871 opts, args = getopt.getopt(argv[1:], "hvp:m:n:N:F:a:i:I:G:Tr:o:j:Xz:s:g:x:R:MQCW",
872 ["version",
873 "help",
874 "output-dir=",
875 "bowtie1",
876 "solexa-quals",
877 "solexa1.3-quals",
878 "phred64-quals",
879 "quals",
880 "integer-quals",
881 "color",
882 "library-type=",
883 "num-threads=",
884 "splice-mismatches=",
885 "max-multihits=",
886 "suppress-hits",
887 "min-isoform-fraction=",
888 "min-anchor-length=",
889 "min-intron-length=",
890 "max-intron-length=",
891 "GTF=",
892 "transcriptome-only",
893 "transcriptome-max-hits=",
894 "transcriptome-index=",
895 "raw-juncs=",
896 "no-novel-juncs",
897 "allow-fusions",
898 "fusion-search",
899 "fusion-anchor-length=",
900 "fusion-min-dist=",
901 "fusion-read-mismatches=",
902 "fusion-multireads=",
903 "fusion-multipairs=",
904 "fusion-ignore-chromosomes=",
905 "fusion-do-not-resolve-conflicts",
906 "no-novel-indels",
907 "no-gtf-juncs",
908 "mate-inner-dist=",
909 "mate-std-dev=",
910 "no-coverage-search",
911 "coverage-search",
912 "prefilter-multihits",
913 "microexon-search",
914 "min-coverage-intron=",
915 "max-coverage-intron=",
916 "min-segment-intron=",
917 "max-segment-intron=",
918 "resume=",
919 "seed-length=",
920 "read-mismatches=",
921 "read-gap-length=",
922 "read-edit-dist=",
923 "read-realign-edit-dist=",
924 "segment-length=",
925 "segment-mismatches=",
926 "bowtie-n",
927 "keep-tmp",
928 "rg-id=",
929 "rg-sample=",
930 "rg-library=",
931 "rg-description=",
932 "rg-platform-unit=",
933 "rg-center=",
934 "rg-date=",
935 "rg-platform=",
936 "tmp-dir=",
937 "zpacker=",
938 "unmapped-fifo",
939 "max-insertion-length=",
940 "max-deletion-length=",
941 "insertions=",
942 "deletions=",
943 "no-sort-bam",
944 "no-convert-bam",
945 "report-secondary-alignments",
946 "no-discordant",
947 "no-mixed",
948 "keep-fasta-order",
949 "allow-partial-mapping",
950 "b2-very-fast",
951 "b2-fast",
952 "b2-sensitive",
953 "b2-very-sensitive",
954 "b2-N=",
955 "b2-L=",
956 "b2-i=",
957 "b2-n-ceil=",
958 "b2-gbar=",
959 "b2-ma=",
960 "b2-mp=",
961 "b2-np=",
962 "b2-rdg=",
963 "b2-rfg=",
964 "b2-score-min=",
965 "b2-D=",
966 "b2-R="])
967 except getopt.error, msg:
968 raise Usage(msg)
969
970 self.splice_constraints.parse_options(opts)
971 self.system_params.parse_options(opts)
972 self.read_params.parse_options(opts)
973 self.search_params.parse_options(opts)
974 self.report_params.parse_options(opts)
975 self.bowtie2_params.parse_options(opts)
976 global use_BWT_FIFO
977 global use_BAM_Unmapped
978 if not self.read_params.color:
979 use_BWT_FIFO=False
980 use_BAM_Unmapped=True
981 global output_dir
982 global logging_dir
983 global tmp_dir
984
985 custom_tmp_dir = None
986 custom_out_dir = None
987 # option processing
988 for option, value in opts:
989 if option in ("-v", "--version"):
990 print "TopHat v%s" % (get_version())
991 sys.exit(0)
992 if option in ("-h", "--help"):
993 raise Usage(use_message)
994 if option == "--bowtie1":
995 self.bowtie2 = False
996 if option in ("-g", "--max-multihits"):
997 self.max_hits = int(value)
998 self.max_seg_hits = max(10, self.max_hits * 2)
999 if option == "--suppress-hits":
1000 self.suppress_hits = True
1001 if option in ("-x", "--transcriptome-max-hits"):
1002 self.t_max_hits = int(value)
1003 if option in ("-G", "--GTF"):
1004 self.gff_annotation = value
1005 if option in ("-T", "--transcriptome-only"):
1006 self.transcriptome_only = True
1007 if option == "--transcriptome-index":
1008 self.transcriptome_index = value
1009 if option in("-M", "--prefilter-multihits"):
1010 self.prefilter_multi = True
1011 if option in ("-j", "--raw-juncs"):
1012 self.raw_junctions = value
1013 if option == "--no-novel-juncs":
1014 self.find_novel_juncs = False
1015 if option == "--no-novel-indels":
1016 self.find_novel_indels = False
1017 if option == "--fusion-search":
1018 self.fusion_search = True
1019 if option == "--fusion-anchor-length":
1020 self.fusion_anchor_length = int(value)
1021 if option == "--fusion-min-dist":
1022 self.fusion_min_dist = int(value)
1023 if option == "--fusion-read-mismatches":
1024 self.fusion_read_mismatches = int(value)
1025 if option == "--fusion-multireads":
1026 self.fusion_multireads = int(value)
1027 if option == "--fusion-multipairs":
1028 self.fusion_multipairs = int(value)
1029 if option == "--fusion-ignore-chromosomes":
1030 self.fusion_ignore_chromosomes = value.split(",")
1031 if option == "--fusion-do-not-resolve-conflicts":
1032 self.fusion_do_not_resolve_conflicts = True
1033 if option == "--no-gtf-juncs":
1034 self.find_GFF_juncs = False
1035 if option == "--no-coverage-search":
1036 self.coverage_search = False
1037 if option == "--coverage-search":
1038 self.coverage_search = True
1039 # -W option : score and edit distance filtering in fix_map_ordering
1040 # this is *soft* post-processing of bowtie2 results, should be
1041 # more effectively implemented by using bowtie2's score function
1042 if option == "-W":
1043 self.b2scoreflt = True
1044 self.closure_search = False
1045 #if option == "--no-closure-search":
1046 # self.closure_search = False
1047 #if option == "--closure-search":
1048 # self.closure_search = True
1049 if option == "--microexon-search":
1050 self.microexon_search = True
1051
1052 self.butterfly_search = False
1053 #if option == "--butterfly-search":
1054 # self.butterfly_search = True
1055 #if option == "--no-butterfly-search":
1056 # self.butterfly_search = False
1057 if option in ("-N", "--read-mismatches"):
1058 self.read_mismatches = int(value)
1059 if option == "--read-gap-length":
1060 self.read_gap_length = int(value)
1061 if option == "--read-edit-dist":
1062 self.read_edit_dist = int(value)
1063 if option == "--read-realign-edit-dist":
1064 self.read_realign_edit_dist = int(value)
1065 if option == "--segment-length":
1066 self.segment_length = int(value)
1067 if option == "--segment-mismatches":
1068 self.segment_mismatches = int(value)
1069 if option == "--bowtie-n":
1070 self.bowtie_alignment_option = "-n"
1071 if option == "--max-insertion-length":
1072 self.max_insertion_length = int(value)
1073 if option == "--max-deletion-length":
1074 self.max_deletion_length = int(value)
1075 if option == "--insertions":
1076 self.raw_insertions = value
1077 if option == "--deletions":
1078 self.raw_deletions = value
1079 if option == "--report-secondary-alignments":
1080 self.report_secondary_alignments = True
1081 if option == "--no-discordant":
1082 self.report_discordant_pair_alignments = False
1083 if option == "--no-mixed":
1084 self.report_mixed_alignments = False
1085 if option == "--keep-fasta-order":
1086 self.keep_fasta_order = True
1087 if option == "--allow-partial-mapping":
1088 self.partial_mapping = True
1089 if option in ("-o", "--output-dir"):
1090 custom_out_dir = value + "/"
1091 if option in ("-R", "--resume"):
1092 self.resume_dir = value
1093 if option == "--tmp-dir":
1094 custom_tmp_dir = value + "/"
1095
1096 if self.transcriptome_only:
1097 self.find_novel_juncs=False
1098 self.find_novel_indels=False
1099 if custom_out_dir:
1100 output_dir = custom_out_dir
1101 logging_dir = output_dir + "logs/"
1102 tmp_dir = output_dir + "tmp/"
1103 sam_header = tmp_dir + "stub_header.sam"
1104 if custom_tmp_dir:
1105 tmp_dir = custom_tmp_dir
1106 sam_header = tmp_dir + "stub_header.sam"
1107 if len(args) < 2 and not self.resume_dir:
1108 raise Usage(use_message)
1109
1110 if self.read_realign_edit_dist is None:
1111 self.read_realign_edit_dist = self.read_edit_dist
1112
1113 return args
1114
1115
1116 def nonzeroFile(filepath):
1117 if os.path.exists(filepath):
1118 fpath, fname=os.path.split(filepath)
1119 fbase, fext =os.path.splitext(fname)
1120 if fext.lower() == ".bam":
1121 samtools_view_cmd = ["samtools", "view", filepath]
1122 samtools_view = subprocess.Popen(samtools_view_cmd, stdout=subprocess.PIPE)
1123 head_cmd = ["head", "-1"]
1124 head = subprocess.Popen(head_cmd, stdin=samtools_view.stdout, stdout=subprocess.PIPE)
1125 output = head.communicate()[0][:-1]
1126 if len(output) > 0:
1127 return True
1128 else:
1129 if os.path.getsize(filepath)>25:
1130 return True
1131 return False
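# e.g. (illustrative path) nonzeroFile("tophat_out/accepted_hits.bam") pipes
# the BAM through "samtools view | head -1" and is True only when at least one
# alignment record exists; for other files a size above 25 bytes is required.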
1132
1133
1134 # check if a file exists and has non-zero (or minimum) size
1135 def fileExists(filepath, minfsize=1):
1136 if os.path.exists(filepath) and os.path.getsize(filepath)>=minfsize:
1137 return True
1138 else:
1139 return False
1140
1141 def removeFileWithIndex(filepath):
1142 if os.path.exists(filepath):
1143 os.remove(filepath)
1144
1145 fileindexpath = filepath + ".index"
1146 if os.path.exists(fileindexpath):
1147 os.remove(fileindexpath)
1148
1149 def getFileDir(filepath):
1150 #if fullpath given, returns path including the ending /
1151 fpath, fname=os.path.split(filepath)
1152 if fpath: fpath+='/'
1153 return fpath
1154
1155 def getFileBaseName(filepath):
1156 fpath, fname=os.path.split(filepath)
1157 fbase, fext =os.path.splitext(fname)
1158 fx=fext.lower()
1159 if (fx in ['.fq','.txt','.seq','.bwtout'] or fx.find('.fa')==0) and len(fbase)>0:
1160 return fbase
1161 elif fx == '.z' or fx.find('.gz')==0 or fx.find('.bz')==0:
1162 fb, fext = os.path.splitext(fbase)
1163 fx=fext.lower()
1164 if (fx in ['.fq','.txt','.seq','.bwtout'] or fx.find('.fa')==0) and len(fb)>0:
1165 return fb
1166 else:
1167 return fbase
1168 else:
1169 if len(fbase)>0:
1170 return fbase
1171 else:
1172 return fname
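# e.g. getFileBaseName("reads_1.fq.gz") -> "reads_1" (both the compression
# suffix and the sequence-format suffix are stripped), while
# getFileBaseName("sample.bam") -> "sample" (illustrative names).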
1173
1174 # Returns the current time in a nice format
1175 def right_now():
1176 curr_time = datetime.now()
1177 return curr_time.strftime("%c")
1178
1179 # The TopHat logging formatter
1180 def th_log(out_str):
1181 #print >> sys.stderr, "[%s] %s" % (right_now(), out_str)
1182 if tophat_logger:
1183 tophat_logger.info(out_str)
1184
1185 def th_logp(out_str=""):
1186 print >> sys.stderr, out_str
1187 if tophat_log:
1188 print >> tophat_log, out_str
1189
1190 def die(msg=None):
1191 if msg is not None:
1192 th_logp(msg)
1193 sys.exit(1)
1194
1195 # Ensures that the output, logging, and temp directories are present. If not,
1196 # they are created
1197 def prepare_output_dir():
1198
1199 #th_log("Preparing output location "+output_dir)
1200 if os.path.exists(output_dir):
1201 pass
1202 else:
1203 os.mkdir(output_dir)
1204
1205 if os.path.exists(logging_dir):
1206 pass
1207 else:
1208 os.mkdir(logging_dir)
1209
1210 if os.path.exists(tmp_dir):
1211 pass
1212 else:
1213 try:
1214 os.makedirs(tmp_dir)
1215 except OSError, o:
1216 die("\nError creating directory %s (%s)" % (tmp_dir, o))
1217
1218
1219 # to be added as preexec_fn for every subprocess.Popen() call:
1220 # see http://bugs.python.org/issue1652
1221 def subprocess_setup():
1222 # Python installs a SIGPIPE handler by default, which causes
1223 # gzip or other de/compression pipes to complain about "stdout: Broken pipe"
1224 signal.signal(signal.SIGPIPE, signal.SIG_DFL)
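# Illustrative use (editor's sketch, hypothetical arguments):
#   subprocess.Popen(["gzip", "-cd"], preexec_fn=subprocess_setup,
#                    stdin=src, stdout=subprocess.PIPE)
# With SIGPIPE restored to SIG_DFL, the child exits quietly if its reader
# goes away instead of spewing "Broken pipe" errors.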
1225
1226 # Check that the Bowtie index specified by the user is present and all files
1227 # are there.
1228 def check_bowtie_index(idx_prefix, is_bowtie2):
1229 if currentStage >= resumeStage:
1230 th_log("Checking for Bowtie index files")
1231 idxext="ebwt"
1232 bowtie_ver=""
1233 if is_bowtie2:
1234 idxext="bt2"
1235 bowtie_ver="2 "
1236
1237 idx_fwd_1 = idx_prefix + ".1."+idxext
1238 idx_fwd_2 = idx_prefix + ".2."+idxext
1239 idx_rev_1 = idx_prefix + ".rev.1."+idxext
1240 idx_rev_2 = idx_prefix + ".rev.2."+idxext
1241
1242 if os.path.exists(idx_fwd_1) and \
1243 os.path.exists(idx_fwd_2) and \
1244 os.path.exists(idx_rev_1) and \
1245 os.path.exists(idx_rev_2):
1246 return
1247 else:
1248 bwtidxerr="Error: Could not find Bowtie "+bowtie_ver+"index files (" + idx_prefix + ".*."+idxext+")"
1249
1250 if is_bowtie2:
1251 bwtidx_env = os.environ.get("BOWTIE2_INDEXES")
1252 else:
1253 bwtidx_env = os.environ.get("BOWTIE_INDEXES")
1254
1255 if bwtidx_env == None:
1256 die(bwtidxerr)
1257 if os.path.exists(bwtidx_env+idx_fwd_1) and \
1258 os.path.exists(bwtidx_env+idx_fwd_2) and \
1259 os.path.exists(bwtidx_env+idx_rev_1) and \
1260 os.path.exists(bwtidx_env+idx_rev_2):
1261 return
1262 else:
1263 die(bwtidxerr)
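# e.g. for a Bowtie2 index prefix "genome_idx" (illustrative), the four files
# checked are genome_idx.1.bt2, genome_idx.2.bt2, genome_idx.rev.1.bt2 and
# genome_idx.rev.2.bt2 (.ebwt instead of .bt2 for Bowtie1).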
1264
1265 # Reconstructs the multifasta file from which the Bowtie index was created, if
1266 # it's not already there.
1267 def bowtie_idx_to_fa(idx_prefix, is_bowtie2):
1268 idx_name = idx_prefix.split('/')[-1]
1269 th_log("Reconstituting reference FASTA file from Bowtie index")
1270
1271 try:
1272 tmp_fasta_file_name = tmp_dir + idx_name + ".fa"
1273 tmp_fasta_file = open(tmp_fasta_file_name, "w")
1274
1275 inspect_log = open(logging_dir + "bowtie_inspect_recons.log", "w")
1276
1277 if is_bowtie2:
1278 inspect_cmd = [prog_path("bowtie2-inspect")]
1279 else:
1280 inspect_cmd = [prog_path("bowtie-inspect")]
1281
1282 inspect_cmd += [idx_prefix]
1283
1284 th_logp(" Executing: " + " ".join(inspect_cmd) + " > " + tmp_fasta_file_name)
1285 ret = subprocess.call(inspect_cmd,
1286 stdout=tmp_fasta_file,
1287 stderr=inspect_log)
1288 # Bowtie reported an error
1289 if ret != 0:
1290 die(fail_str+"Error: bowtie-inspect returned an error\n"+log_tail(logging_dir + "bowtie_inspect_recons.log"))
1291
1292 # Bowtie not found
1293 except OSError, o:
1294 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
1295 die(fail_str+"Error: bowtie-inspect not found on this system. Did you forget to include it in your PATH?")
1296
1297 return tmp_fasta_file_name
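# Shell equivalent of the reconstruction above (illustrative prefix):
#   bowtie2-inspect genome_idx > <tmp_dir>/genome_idx.fa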
1298
1299 # Checks whether the multifasta file for the genome is present alongside the
1300 # Bowtie index files for it.
1301 def check_fasta(idx_prefix, is_bowtie2):
1302 th_log("Checking for reference FASTA file")
1303 idx_fasta = idx_prefix + ".fa"
1304 if os.path.exists(idx_fasta):
1305 return idx_fasta
1306 else:
1307 if is_bowtie2:
1308 bowtie_idx_env_var = os.environ.get("BOWTIE2_INDEXES")
1309 else:
1310 bowtie_idx_env_var = os.environ.get("BOWTIE_INDEXES")
1311 if bowtie_idx_env_var:
1312 idx_fasta = bowtie_idx_env_var + idx_prefix + ".fa"
1313 if os.path.exists(idx_fasta):
1314 return idx_fasta
1315
1316 th_logp("\tWarning: Could not find FASTA file " + idx_fasta)
1317 idx_fa = bowtie_idx_to_fa(idx_prefix, is_bowtie2)
1318 return idx_fa
1319
1320 # Check that both the Bowtie index and the genome's fasta file are present
1321 def check_index(idx_prefix, is_bowtie2):
1322 check_bowtie_index(idx_prefix, is_bowtie2)
1323 ref_fasta_file = check_fasta(idx_prefix, is_bowtie2)
1324
1325 return (ref_fasta_file, None)
1326
1327 # Retrieve a tuple containing the system's version of Bowtie. Parsed from
1328 # `bowtie --version`
1329 def get_bowtie_version(is_bowtie2):
1330 try:
1331 # Launch Bowtie to capture its version info
1332 proc = subprocess.Popen([bowtie_path, "--version"],
1333 stdout=subprocess.PIPE)
1334
1335 stdout_value = proc.communicate()[0]
1336
1337 bowtie_version = None
1338 bowtie_out = repr(stdout_value)
1339
1340 # Find the version identifier
1341 if is_bowtie2:
1342 version_str = "bowtie2-align version "
1343 else:
1344 version_str = "bowtie version "
1345
1346 ver_str_idx = bowtie_out.find(version_str)
1347 if ver_str_idx != -1:
1348 nl = bowtie_out.find("\\n", ver_str_idx)
1349 version_val = bowtie_out[ver_str_idx + len(version_str):nl]
1350
1351 if is_bowtie2:
1352 ver_numbers = version_val.split('.')
1353 ver_numbers[2:] = ver_numbers[2].split('-')
1354 bowtie_version = [int(x) for x in ver_numbers[:3]] + [int(ver_numbers[3][4:])]
1355 else:
1356 bowtie_version = [int(x) for x in version_val.split('.')]
1357
1358 if len(bowtie_version) == 3:
1359 bowtie_version.append(0)
1360
1361 return bowtie_version
1362 except OSError, o:
1363 errmsg=fail_str+str(o)+"\n"
1364 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
1365 errmsg+="Error: bowtie not found on this system"
1366 die(errmsg)
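# e.g. (illustrative) "bowtie2-align version 2.0.0-beta7" parses to
# [2, 0, 0, 7]; "bowtie version 0.12.7" parses to [0, 12, 7] and is padded
# to [0, 12, 7, 0].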
1367
1368 def get_index_sam_header(params, idx_prefix, name = ""):
1369 noSkip = currentStage >= resumeStage
1370 try:
1371 temp_sam_header_filename = tmp_dir + "temp.samheader.sam"
1372 temp_sam_header_file = None
1373 if noSkip:
1374 temp_sam_header_file = open(temp_sam_header_filename, "w")
1375
1376 bowtie_header_cmd = [bowtie_path]
1377
1378 read_params = params.read_params
1379 if not params.bowtie2:
1380 bowtie_header_cmd += ["--sam"]
1381
1382 if read_params.color:
1383 bowtie_header_cmd.append('-C')
1384
1385 bowtie_header_cmd.extend([idx_prefix, '/dev/null'])
1386 if noSkip:
1387 subprocess.call(bowtie_header_cmd,
1388 stdout=temp_sam_header_file,
1389 stderr=open('/dev/null', 'w'))
1390
1391 temp_sam_header_file.close()
1392 temp_sam_header_file = open(temp_sam_header_filename, "r")
1393
1394 bowtie_sam_header_filename = tmp_dir + idx_prefix.split('/')[-1]
1395 if name != "":
1396 bowtie_sam_header_filename += ("_" + name)
1397 bowtie_sam_header_filename += ".bwt.samheader.sam"
1398 if not noSkip:
1399 return bowtie_sam_header_filename
1400 bowtie_sam_header_file = open(bowtie_sam_header_filename, "w")
1401
1402 preamble = []
1403 sq_dict_lines = []
1404
1405 for line in temp_sam_header_file.readlines():
1406 line = line.strip()
1407 if line.find("@SQ") != -1:
1408 # Sequence dictionary record
1409 cols = line.split('\t')
1410 seq_name = None
1411 for col in cols:
1412 fields = col.split(':')
1413 #print fields
1414 if len(fields) > 0 and fields[0] == "SN":
1415 seq_name = fields[1]
1416 if seq_name == None:
1417 die("Error: malformed sequence dictionary in sam header")
1418 sq_dict_lines.append([seq_name,line])
1419 elif line.find("CL") != -1:
1420 continue
1421 else:
1422 preamble.append(line)
1423
1424 print >> bowtie_sam_header_file, "@HD\tVN:1.0\tSO:coordinate"
1425 if read_params.read_group_id and read_params.sample_id:
1426 rg_str = "@RG\tID:%s\tSM:%s" % (read_params.read_group_id,
1427 read_params.sample_id)
1428 if read_params.library_id:
1429 rg_str += "\tLB:%s" % read_params.library_id
1430 if read_params.description:
1431 rg_str += "\tDS:%s" % read_params.description
1432 if read_params.seq_platform_unit:
1433 rg_str += "\tPU:%s" % read_params.seq_platform_unit
1434 if read_params.seq_center:
1435 rg_str += "\tCN:%s" % read_params.seq_center
1436 if read_params.mate_inner_dist:
1437 rg_str += "\tPI:%s" % read_params.mate_inner_dist
1438 if read_params.seq_run_date:
1439 rg_str += "\tDT:%s" % read_params.seq_run_date
1440 if read_params.seq_platform:
1441 rg_str += "\tPL:%s" % read_params.seq_platform
1442
1443 print >> bowtie_sam_header_file, rg_str
1444
1445 if not params.keep_fasta_order:
1446 sq_dict_lines.sort(lambda x,y: cmp(x[0],y[0]))
1447
1448 for [name, line] in sq_dict_lines:
1449 print >> bowtie_sam_header_file, line
1450 print >> bowtie_sam_header_file, "@PG\tID:TopHat\tVN:%s\tCL:%s" % (get_version(), run_cmd)
1451
1452 bowtie_sam_header_file.close()
1453 temp_sam_header_file.close()
1454 return bowtie_sam_header_filename
1455
1456 except OSError, o:
1457 errmsg=fail_str+str(o)+"\n"
1458 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
1459 errmsg+="Error: bowtie not found on this system"
1460 die(errmsg)
1461
1462 # Make sure Bowtie is installed and is recent enough to be useful
1463 def check_bowtie(params):
1464 bowtie_req=""
1465 if params.bowtie2:
1466 bowtie_req="2-align"
1467 log_msg = "Checking for Bowtie"
1468 th_log(log_msg)
1469
1470 bowtie_bin = "bowtie"+bowtie_req
1471
1472 global bowtie_path
1473 bowtie_version = None
1474 bowtie_path=which(bowtie_bin)
1475 if bowtie_path:
1476 bowtie_version = get_bowtie_version(params.bowtie2)
1477 if params.bowtie2 and bowtie_version == None:
1478 th_logp(" Bowtie 2 not found, checking for older version..")
1479 #try to fallback on bowtie 1
1480 params.bowtie2=False
1481 bowtie_path=which('bowtie')
1482 if bowtie_path:
1483 bowtie_version=get_bowtie_version(params.bowtie2)
1484 if bowtie_version == None:
1485 die("Error: Bowtie not found on this system.")
1486 if params.bowtie2:
1487 if bowtie_version < [2, 0, 0, 5]: # lexicographic compare of [major, minor, patch, beta]
1488 die("Error: TopHat requires Bowtie 2.0.0-beta5 or later")
1489 else:
1490 if bowtie_version < [0, 12, 3, 0]:
1491 die("Error: TopHat requires Bowtie 0.12.3 or later")
1492 th_logp("\t\t Bowtie version:\t %s" % ".".join([str(x) for x in bowtie_version]))
1493
1494
1495 # Retrieve a tuple containing the system's version of samtools. Parsed from
1496 # `samtools`
1497 def get_samtools_version():
1498 try:
1499 # Launch samtools to capture its version info
1500 proc = subprocess.Popen(samtools_path, stderr=subprocess.PIPE)
1501 samtools_out = proc.communicate()[1]
1502
1503 # Find the version identifier
1504 version_match = re.search(r'Version:\s+(\d+)\.(\d+)\.(\d+)([a-zA-Z]?)', samtools_out)
1505 samtools_version_arr = [int(version_match.group(x)) for x in [1,2,3]]
1506 if version_match.group(4):
1507 samtools_version_arr.append(version_match.group(4))
1508 else:
1509 samtools_version_arr.append(0)
1510
1511 return version_match.group(), samtools_version_arr
1512 except OSError, o:
1513 errmsg=fail_str+str(o)+"\n"
1514 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
1515 errmsg+="Error: samtools not found on this system"
1516 die(errmsg)
1517
1518 # Make sure the SAM tools are installed and are recent enough to be useful
1519 def check_samtools():
1520 th_log("Checking for Samtools")
1521 global samtools_path
1522 samtools_path=prog_path("samtools")
1523 samtools_version_str, samtools_version_arr = get_samtools_version()
1524 if samtools_version_str == None:
1525 die("Error: Samtools not found on this system")
1526 elif samtools_version_arr[:3] < [0, 1, 7]: # lexicographic compare of [major, minor, patch]
1527 die("Error: TopHat requires Samtools 0.1.7 or later")
1528 th_logp("\t\tSamtools version:\t %s" % ".".join([str(x) for x in samtools_version_arr]))
1529
1530
1531
1532 class FastxReader:
1533 def __init__(self, i_file, is_color=0, fname=''):
1534 self.bufline=None
1535 self.format=None
1536 self.ifile=i_file
1537 self.nextRecord=None
1538 self.eof=None
1539 self.fname=fname
1540 self.lastline=None
1541 self.numrecords=0
1542 self.isColor=0
1543 if is_color : self.isColor=1
1544 # determine file type
1545 #no records processed yet, skip custom header lines if any
1546 hlines=10 # allow maximum 10 header lines
1547 self.lastline=" "
1548 while hlines>0 and self.lastline and self.lastline[0] not in "@>" :
1549 self.lastline=self.ifile.readline()
1550 hlines-=1
1551 if self.lastline and self.lastline[0] == '@':
1552 self.format='fastq'
1553 self.nextRecord=self.nextFastq
1554 elif self.lastline and self.lastline[0] == '>':
1555 self.format='fasta'
1556 self.nextRecord=self.nextFasta
1557 else:
1558 die("Error: cannot determine record type in input file %s" % fname)
1559 self.bufline=self.lastline
1560 self.lastline=None
1561
1562 def nextFastq(self):
1563 # returning tuple: (seqID, sequence_string, seq_len, qv_string)
1564 seqid,seqstr,qstr,seq_len='','','',0
1565 if self.eof: return (seqid, seqstr, seq_len, qstr)
1566 fline=self.getLine #shortcut to save a bit of time
1567 line=fline()
1568
1569 if not line : return (seqid, seqstr, seq_len, qstr)
1570 while len(line.rstrip())==0: # skip empty lines
1571 line=fline()
1572 if not line : return (seqid, seqstr,seq_len, qstr)
1573 try:
1574 if line[0] != "@":
1575 raise ValueError("Records in Fastq files should start with '@' character")
1576
1577 seqid = line[1:].rstrip()
1578 seqstr = fline().rstrip()
1579
1580 #There may now be more sequence lines, or the "+" quality marker line:
1581 while True:
1582 line = fline()
1583 if not line:
1584 raise ValueError("Premature end of file (missing quality values for "+seqid+")")
1585 if line[0] == "+":
1586 # -- sequence string ended
1587 #qtitle = line[1:].rstrip()
1588 #if qtitle and qtitle != seqid:
1589 # raise ValueError("Different read ID for sequence and quality (%s vs %s)" \
1590 # % (seqid, qtitle))
1591 break
1592 seqstr += line.rstrip() #removes trailing newlines
1593 #loop until + found
1594 seq_len = len(seqstr)
1595 #at least one line of quality data should follow
1596 qstrlen=0
1597 #now read next lines as quality values until seq_len is reached
1598 while True:
1599 line=fline()
1600 if not line : break #end of file
1601 qstr += line.rstrip()
1602 qstrlen=len(qstr)
1603 if qstrlen + self.isColor >= seq_len :
1604 break # qv string has reached the length of seq string
1605 #loop until qv has the same length as seq
1606
1607 if self.isColor:
1608 # and qstrlen==seq_len :
1609 if qstrlen==seq_len:
1610 #qual string may have a dummy qv at the beginning, should be stripped
1611 qstr = qstr[1:]
1612 qstrlen -= 1
1613 if qstrlen!=seq_len-1:
1614 raise ValueError("Length mismatch between sequence and quality strings "+ \
1615 "for %s (%i vs %i)." % (seqid, seq_len, qstrlen))
1616 else:
1617 if seq_len != qstrlen :
1618 raise ValueError("Length mismatch between sequence and quality strings "+ \
1619 "for %s (%i vs %i)." % (seqid, seq_len, qstrlen))
1620 except ValueError, err:
1621 die("\nError encountered parsing file "+self.fname+":\n "+str(err))
1622 #return the record
1623 self.numrecords+=1
1624 ##--discard the primer base [NO]
1625 if self.isColor :
1626 seq_len-=1
1627 seqstr = seqstr[1:]
1628 return (seqid, seqstr, seq_len, qstr)
1629
1630 def nextFasta(self):
1631 # returning tuple: (seqID, sequence_string, seq_len)
1632 seqid,seqstr,seq_len='','',0
1633 fline=self.getLine # shortcut to readline function of f
1634 line=fline() # this will use the buffer line if it's there
1635 if not line : return (seqid, seqstr, seq_len, None)
1636 while len(line.rstrip())==0: # skip empty lines
1637 line=fline()
1638 if not line : return (seqid, seqstr, seq_len, None)
1639 try:
1640 if line[0] != ">":
1641 raise ValueError("Records in Fasta files must start with '>' character")
1642 seqid = line[1:].split()[0]
1643 #more sequence lines, or the ">" quality marker line:
1644 while True:
1645 line = fline()
1646 if not line: break
1647 if line[0] == '>':
1648 #next sequence starts here
1649 self.ungetLine()
1650 break
1651 seqstr += line.rstrip()
1652 #loop until '>' found
1653 seq_len = len(seqstr)
1654 if seq_len < 3:
1655 raise ValueError("Read %s too short (%i)." \
1656 % (seqid, seq_len))
1657 except ValueError, err:
1658 die("\nError encountered parsing fasta file "+self.fname+"\n "+str(err))
1659 #return the record and continue
1660 self.numrecords+=1
1661 if self.isColor : # -- discard primer base
1662 seq_len-=1
1663 seqstr=seqstr[1:]
1664 return (seqid, seqstr, seq_len, None)
1665
1666 def getLine(self):
1667 if self.bufline: #return previously buffered line
1668 r=self.bufline
1669 self.bufline=None
1670 return r
1671 else: #read a new line from stream and return it
1672 if self.eof: return None
1673 self.lastline=self.ifile.readline()
1674 if not self.lastline:
1675 self.eof=1
1676 return None
1677 return self.lastline
1678 def ungetLine(self):
1679 if self.lastline is None:
1680 th_logp("Warning: FastxReader called ungetLine() with no prior line!")
1681 self.bufline=self.lastline
1682 self.lastline=None
1683 #< class FastxReader
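# A minimal usage sketch for FastxReader, mirroring how check_reads_format()
# below drives it ("reads.fq" is an illustrative file name; the reader
# auto-detects FASTA/FASTQ from the first record and signals EOF with an
# empty seqid):
#   freader = FastxReader(open("reads.fq"), 0, "reads.fq")  # (file, color, name)
#   while True:
#       seqid, seqstr, seq_len, qstr = freader.nextRecord()
#       if not seqid: break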
1684
1685 def fa_write(fhandle, seq_id, seq):
1686 """
1687 Write to a file in the FASTA format.
1688
1689 Arguments:
1690 - `fhandle`: A file handle open for writing
1691 - `seq_id`: The sequence id string for this sequence
1692 - `seq`: An unformatted string of the sequence to write
1693 """
    line_len = 60
    fhandle.write(">" + seq_id + "\n")
    # write ceil(len(seq)/line_len) chunks; this avoids emitting a spurious
    # empty line when len(seq) is an exact multiple of line_len
    for i in xrange((len(seq) + line_len - 1) / line_len):
        start = i * line_len
        end = min(start + line_len, len(seq))
        fhandle.write( seq[ start:end ] + "\n")
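# Example (illustrative): fa_write(open("out.fa", "w"), "seq1", "A" * 130)
# produces a ">seq1" header followed by three lines of 60, 60 and 10 bases.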
1704
1705 class ZReader:
1706 def __init__(self, filename, params, guess=True):
1707 self.fname=filename
1708 self.file=None
1709 self.fsrc=None
1710 self.popen=None
1711 sys_params = params.system_params
1712 pipecmd=[]
1713 s=filename.lower()
1714 if s.endswith(".bam"):
1715 pipecmd=[prog_path("bam2fastx")]
1716 if params.read_params.color:
1717 pipecmd+=["--color"]
1718 pipecmd+=["--all", "-"]
1719 else:
1720 if guess:
                if s.endswith(".z") or s.endswith(".gz") or s.endswith(".gzip"):
                    pipecmd=['gzip']
                elif s.endswith(".bz2") or s.endswith(".bzip2") or s.endswith(".bzip"):
                    pipecmd=['bzip2']
                if len(pipecmd)>0 and which(pipecmd[0]) is None:
                    die("Error: cannot find %s to decompress input file %s " % (pipecmd[0], filename))
1728 if len(pipecmd)>0:
1729 if pipecmd[0]=='gzip' and sys_params.zipper.endswith('pigz'):
1730 pipecmd[0]=sys_params.zipper
1731 pipecmd.extend(sys_params.zipper_opts)
1732 elif pipecmd[0]=='bzip2' and sys_params.zipper.endswith('pbzip2'):
1733 pipecmd[0]=sys_params.zipper
1734 pipecmd.extend(sys_params.zipper_opts)
1735 else: #not guessing, but must still check if it's a compressed file
1736 if use_zpacker and filename.endswith(".z"):
1737 pipecmd=[sys_params.zipper]
1738 pipecmd.extend(sys_params.zipper_opts)
1739
        if pipecmd:
            pipecmd+=['-cd']
            try:
1744 self.fsrc=open(self.fname, 'rb')
1745 self.popen=subprocess.Popen(pipecmd,
1746 preexec_fn=subprocess_setup,
1747 stdin=self.fsrc,
1748 stdout=subprocess.PIPE, stderr=tophat_log, close_fds=True)
1749 except Exception:
1750 die("Error: could not open pipe "+' '.join(pipecmd)+' < '+ self.fname)
1751 self.file=self.popen.stdout
1752 else:
1753 self.file=open(filename)
1754 def close(self):
1755 if self.fsrc: self.fsrc.close()
1756 self.file.close()
1757 if self.popen:
1758 self.popen.wait()
1759 self.popen=None
1760
1761 class ZWriter:
1762 def __init__(self, filename, sysparams):
1763 self.fname=filename
1764 if use_zpacker:
1765 pipecmd=[sysparams.zipper,"-cf", "-"]
1766 self.ftarget=open(filename, "wb")
1767 try:
1768 self.popen=subprocess.Popen(pipecmd,
1769 preexec_fn=subprocess_setup,
1770 stdin=subprocess.PIPE,
1771 stderr=tophat_log, stdout=self.ftarget, close_fds=True)
1772 except Exception:
1773 die("Error: could not open writer pipe "+' '.join(pipecmd)+' < '+ self.fname)
1774 self.file=self.popen.stdin # client writes to this end of the pipe
1775 else: #no compression
1776 self.file=open(filename, "w")
1777 self.ftarget=None
1778 self.popen=None
1779 def close(self):
1780 self.file.close()
1781 if self.ftarget: self.ftarget.close()
1782 if self.popen:
1783 self.popen.wait() #! required to actually flush the pipes (eek!)
1784 self.popen=None
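# Sketch of a ZWriter/ZReader round trip (assumes use_zpacker is enabled with
# a gzip-compatible zipper and `params` is an initialized TopHatParams; the
# file name is illustrative):
#   zw = ZWriter("reads.fq.z", params.system_params)
#   zw.file.write("@r1\nACGT\n+\nIIII\n")
#   zw.close()                       # waits on the compressor process
#   zr = ZReader("reads.fq.z", params)
#   first_line = zr.file.readline()  # "@r1\n"
#   zr.close()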
1785
1786 # check_reads_format() examines the first few records in the user files
# to determine the file format
1788 def check_reads_format(params, reads_files):
1789 #seed_len = params.read_params.seed_length
1790 fileformat = params.read_params.reads_format
1791
1792 observed_formats = set([])
1793 # observed_scales = set([])
1794 min_seed_len = 99999
1795 max_seed_len = 0
1796 files = reads_files.split(',')
1797
1798 for f_name in files:
1799 #try:
1800 zf = ZReader(f_name, params)
1801 #except IOError:
1802 # die("Error: could not open file "+f_name)
1803 freader=FastxReader(zf.file, params.read_params.color, zf.fname)
1804 toread=4 #just sample the first 4 reads
1805 while toread>0:
1806 seqid, seqstr, seq_len, qstr = freader.nextRecord()
1807 if not seqid: break
1808 toread-=1
1809 if seq_len < 20:
1810 th_logp("Warning: found a read < 20bp in "+f_name)
1811 else:
1812 min_seed_len = min(seq_len, min_seed_len)
1813 max_seed_len = max(seq_len, max_seed_len)
1814 zf.close()
1815 observed_formats.add(freader.format)
1816 if len(observed_formats) > 1:
1817 die("Error: TopHat requires all reads be either FASTQ or FASTA. Mixing formats is not supported.")
1818 fileformat=list(observed_formats)[0]
1819 #if seed_len != None:
1820 # seed_len = max(seed_len, max_seed_len)
1821 #else:
1822 # seed_len = max_seed_len
1823 #print >> sys.stderr, "\tmin read length: %dbp, max read length: %dbp" % (min_seed_len, max_seed_len)
1824 th_logp("\tformat:\t\t %s" % fileformat)
1825 if fileformat == "fastq":
1826 quality_scale = "phred33 (default)"
1827 if params.read_params.solexa_quals and not params.read_params.phred64_quals:
1828 quality_scale = "solexa33 (reads generated with GA pipeline version < 1.3)"
1829 elif params.read_params.phred64_quals:
1830 quality_scale = "phred64 (reads generated with GA pipeline version >= 1.3)"
1831 th_logp("\tquality scale:\t %s" % quality_scale)
1832 elif fileformat == "fasta":
1833 if params.read_params.color:
1834 params.read_params.integer_quals = True
1835
1836 #print seed_len, format, solexa_scale
1837 #NOTE: seed_len will be re-evaluated later by prep_reads
1838 return TopHatParams.ReadParams(params.read_params.solexa_quals,
1839 params.read_params.phred64_quals,
1840 params.read_params.quals,
1841 params.read_params.integer_quals,
1842 params.read_params.color,
1843 params.read_params.library_type,
1844 #seed_len,
1845 params.read_params.seed_length,
1846 fileformat,
1847 params.read_params.mate_inner_dist,
1848 params.read_params.mate_inner_dist_std_dev,
1849 params.read_params.read_group_id,
1850 params.read_params.sample_id,
1851 params.read_params.library_id,
1852 params.read_params.description,
1853 params.read_params.seq_platform_unit,
1854 params.read_params.seq_center,
1855 params.read_params.seq_run_date,
1856 params.read_params.seq_platform)
1857
def grep_file(logfile, regex="warning"):
    f=open(logfile, "r")
    r=[]
    for line in f:
        if re.match(regex, line, re.IGNORECASE):
            r += [line.rstrip()]
    f.close()
    return r
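# e.g. grep_file(log_fname) collects the log lines that start with "warning"
# (case-insensitive), since re.match() anchors the pattern at the line start.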
1865
1866 def log_tail(logfile, lines=1):
1867 f=open(logfile, "r")
1868 f.seek(0, 2)
1869 fbytes= f.tell()
1870 size=lines
1871 block=-1
1872 while size > 0 and fbytes+block*1024 > 0:
1873 if (fbytes+block*1024 > 0):
1874 ##Seek back once more, if possible
1875 f.seek( block*1024, 2 )
1876 else:
1877 #Seek to the beginning
1878 f.seek(0, 0)
1879 data= f.read( 1024 )
1880 linesFound= data.count('\n')
1881 size -= linesFound
1882 block -= 1
1883 if (fbytes + block*1024 > 0):
1884 f.seek(block*1024, 2)
1885 else:
1886 f.seek(0,0)
1887 #f.readline() # find a newline
1888 lastBlocks= list( f.readlines() )
1889 f.close()
1890 return "".join(lastBlocks[-lines:])
1891
# Format a timedelta as a pretty string (HH:MM:SS, prefixed by the number of
# days when the interval spans more than a day).
1894 def formatTD(td):
1895 days = td.days
1896 hours = td.seconds // 3600
1897 minutes = (td.seconds % 3600) // 60
1898 seconds = td.seconds % 60
1899
1900 if days > 0:
1901 return '%d days %02d:%02d:%02d' % (days, hours, minutes, seconds)
1902 else:
1903 return '%02d:%02d:%02d' % (hours, minutes, seconds)
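# e.g. a timedelta of 21 minutes and 2 seconds formats as '00:21:02', while an
# interval of 2 days, 21 minutes and 2 seconds formats as '2 days 00:21:02'.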
1904
1905 class PrepReadsInfo:
1906 def __init__(self, fname, out_fname):
1907 self.min_len = [0, 0]
1908 self.max_len = [0, 0]
1909 self.in_count = [0, 0]
1910 self.out_count= [0, 0]
1911 self.kept_reads = [None, None]
1912 try:
1913 f=open(fname,"r")
1914 self.min_len[0]=int(f.readline().split("=")[-1])
1915 self.max_len[0]=int(f.readline().split("=")[-1])
1916 self.in_count[0]=int(f.readline().split("=")[-1])
1917 self.out_count[0]=int(f.readline().split("=")[-1])
1918 if (self.out_count[0]==0) or (self.max_len[0]<16):
1919 raise Exception()
1920 line=f.readline()
1921 if line and line.find("=") > 0:
1922 self.min_len[1]=int(line.split("=")[-1])
1923 self.max_len[1]=int(f.readline().split("=")[-1])
1924 self.in_count[1]=int(f.readline().split("=")[-1])
1925 self.out_count[1]=int(f.readline().split("=")[-1])
1926 if (self.out_count[1]==0) or (self.max_len[1]<16):
1927 raise Exception()
1928 except Exception, e:
1929 die(fail_str+"Error retrieving prep_reads info.")
1930 sides=["left", "right"]
1931 for ri in (0,1):
1932 if self.in_count[ri]==0: break
1933 trashed=self.in_count[ri]-self.out_count[ri]
1934 self.kept_reads[ri]=out_fname.replace("%side%", sides[ri])
1935 th_logp("\t%5s reads: min. length=%s, max. length=%s, %s kept reads (%s discarded)" % (sides[ri], self.min_len[ri], self.max_len[ri], self.out_count[ri], trashed))
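# The prep_reads.info file parsed above consists of key=value lines, four per
# read side (min/max read length, input and output read counts); the key names
# below are illustrative only, since the parser keeps just the values:
#   min_read_len=50
#   max_read_len=75
#   reads_in=1000000
#   reads_out=998500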
1936
1937 def prep_reads_cmd(params, l_reads_list, l_quals_list=None, r_reads_list=None, r_quals_list=None, out_file=None, aux_file=None,
1938 index_file=None, filter_reads=[], hits_to_filter=[]):
    #generate the prep_reads command (argument list)
1940 prep_cmd = [prog_path("prep_reads")]
1941
1942 prep_cmd.extend(params.cmd())
1943
1944 if params.read_params.reads_format == "fastq":
1945 prep_cmd += ["--fastq"]
1946 elif params.read_params.reads_format == "fasta":
1947 prep_cmd += ["--fasta"]
1948 if hits_to_filter:
1949 prep_cmd += ["--flt-hits=" + ",".join(hits_to_filter)]
1950 if aux_file:
1951 prep_cmd += ["--aux-outfile="+aux_file]
1952 if index_file:
1953 prep_cmd += ["--index-outfile="+index_file] # could be a template
1954 if filter_reads:
1955 prep_cmd += ["--flt-reads=" + ",".join(filter_reads)]
1956 if params.sam_header:
1957 prep_cmd += ["--sam-header="+params.sam_header]
1958 if out_file:
1959 prep_cmd += ["--outfile="+out_file] #could be a template
1960 prep_cmd.append(l_reads_list)
1961 if l_quals_list:
1962 prep_cmd.append(l_quals_list)
1963 if r_reads_list:
1964 prep_cmd.append(r_reads_list)
1965 if r_quals_list:
1966 prep_cmd.append(r_quals_list)
1967
1968 return prep_cmd
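# Example (illustrative) of a command list built above for single-end FASTQ
# input with BAM output; the exact flags depend on params.cmd() and the file
# names shown here are placeholders:
#   ['prep_reads', '--fastq', '--aux-outfile=prep_reads.info',
#    '--index-outfile=left_kept_reads.bam.index',
#    '--outfile=left_kept_reads.bam', 'reads_1.fq']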
1969
1970 # Calls the prep_reads executable, which prepares an internal read library.
1971 # The read library features reads with monotonically increasing integer IDs.
# prep_reads also filters out very low-complexity or garbage reads, as well as
1973 # polyA reads.
1974 #--> returns a PrepReadsInfo structure
1975 def prep_reads(params, l_reads_list, l_quals_list, r_reads_list, r_quals_list, prefilter_reads=[]):
1976 reads_suffix = ".bam"
1977 use_bam = True
1978
1979 #if params.read_params.color:
1980 # reads_suffix = ".fq"
1981 # use_bam = False
1982
1983 # for parallelization, we don't compress the read files
1984 do_use_zpacker = use_zpacker and not use_bam
1985 if do_use_zpacker and params.system_params.num_threads > 1:
1986 do_use_zpacker = False
1987
1988 if do_use_zpacker: reads_suffix += ".z"
1989
1990 out_suffix = "_kept_reads" + reads_suffix
1991 #kept_reads_filename = tmp_dir + output_name + reads_suffix
1992
1993 for side in ("left", "right"):
1994 kept_reads_filename = tmp_dir + side + out_suffix
1995 if resumeStage<1 and os.path.exists(kept_reads_filename):
1996 os.remove(kept_reads_filename)
1997 out_tmpl="left"
1998 out_fname=None
1999 kept_reads = None #output file handle
2000 if r_reads_list:
2001 out_tmpl="%side%"
2002 info_file = output_dir+"prep_reads.info"
2003 if fileExists(info_file,10) and resumeStage>0 :
2004 return PrepReadsInfo(info_file, tmp_dir + out_tmpl + out_suffix)
2005
2006 if use_bam:
2007 out_fname = tmp_dir + out_tmpl + out_suffix
2008 else:
2009 #assumed no right reads given here, only one side is being processed
2010 kept_reads = open(tmp_dir + out_tmpl + out_suffix, "wb")
2011 log_fname=logging_dir + "prep_reads.log"
2012 filter_log = open(log_fname,"w")
2013
2014 index_file = out_fname + ".index"
2015 if do_use_zpacker: index_file=None
2016
2017 prep_cmd=prep_reads_cmd(params, l_reads_list, l_quals_list, r_reads_list, r_quals_list,
2018 out_fname, info_file, index_file, prefilter_reads)
2019 shell_cmd = ' '.join(prep_cmd)
2020 #finally, add the compression pipe if needed
2021 zip_cmd=[]
2022 if do_use_zpacker:
2023 zip_cmd=[ params.system_params.zipper ]
2024 zip_cmd.extend(params.system_params.zipper_opts)
2025 zip_cmd.extend(['-c','-'])
2026 shell_cmd +=' | '+' '.join(zip_cmd)
2027 if not use_bam: shell_cmd += ' >' +kept_reads_filename
2028 retcode = None
2029 try:
2030 print >> run_log, shell_cmd
2031 if do_use_zpacker:
2032 filter_proc = subprocess.Popen(prep_cmd,
2033 stdout=subprocess.PIPE,
2034 stderr=filter_log)
2035 zip_proc=subprocess.Popen(zip_cmd,
2036 preexec_fn=subprocess_setup,
2037 stdin=filter_proc.stdout,
2038 stderr=tophat_log, stdout=kept_reads)
2039 filter_proc.stdout.close() #as per http://bugs.python.org/issue7678
2040 zip_proc.communicate()
2041 retcode=filter_proc.poll()
2042 if retcode==0:
2043 retcode=zip_proc.poll()
2044 else:
2045 if use_bam:
2046 retcode = subprocess.call(prep_cmd, stderr=filter_log)
2047 else:
2048 retcode = subprocess.call(prep_cmd,
2049 stdout=kept_reads, stderr=filter_log)
2050 if retcode:
2051 die(fail_str+"Error running 'prep_reads'\n"+log_tail(log_fname))
2052
2053 except OSError, o:
2054 errmsg=fail_str+str(o)
2055 die(errmsg+"\n"+log_tail(log_fname))
2056
2057 if kept_reads: kept_reads.close()
2058 warnings=grep_file(log_fname)
2059 if warnings:
2060 th_logp("\n"+"\n".join(warnings)+"\n")
2061 return PrepReadsInfo(info_file, tmp_dir + out_tmpl + out_suffix)
2062
2063 # Call bowtie
2064 def bowtie(params,
2065 bwt_idx_prefix,
2066 sam_headers,
2067 reads_list,
2068 reads_format,
2069 num_mismatches,
2070 gap_length,
2071 edit_dist,
2072 realign_edit_dist,
2073 mapped_reads,
2074 unmapped_reads,
2075 extra_output = "",
2076 mapping_type = _reads_vs_G,
2077 multihits_out = None): #only --prefilter-multihits should activate this parameter for the initial prefilter search
2078 start_time = datetime.now()
2079 bwt_idx_name = bwt_idx_prefix.split('/')[-1]
2080 reads_file=reads_list[0]
2081 readfile_basename=getFileBaseName(reads_file)
2082
2083 g_mapping, t_mapping, seg_mapping = False, False, False
2084 sam_header_filename = None
2085 genome_sam_header_filename = None
2086 if mapping_type == _reads_vs_T:
2087 t_mapping = True
2088 sam_header_filename = sam_headers[0]
2089 genome_sam_header_filename = sam_headers[1]
2090 else:
2091 sam_header_filename = sam_headers
2092 if mapping_type >= _segs_vs_G:
2093 seg_mapping = True
2094 else:
2095 g_mapping = True
2096
2097 bowtie_str = "Bowtie"
2098 if params.bowtie2:
2099 bowtie_str += "2"
2100
2101 if seg_mapping:
2102 if not params.bowtie2:
2103 backup_bowtie_alignment_option = params.bowtie_alignment_option
2104 params.bowtie_alignment_option = "-v"
2105
2106 resume_skip = resumeStage > currentStage
    unmapped_reads_out=None
    mapped_reads += ".bam"
    if unmapped_reads:
        unmapped_reads_out = unmapped_reads + ".bam"
2113 use_FIFO = use_BWT_FIFO and use_zpacker and unmapped_reads and params.read_params.color
2114 if use_FIFO:
2115 unmapped_reads_out+=".z"
2116 if resume_skip:
2117 #skipping this step
2118 return (mapped_reads, unmapped_reads_out)
2119
2120 bwt_logname=logging_dir + 'bowtie.'+readfile_basename+'.log'
2121
2122 if t_mapping:
2123 th_log("Mapping %s to transcriptome %s with %s %s" % (readfile_basename,
2124 bwt_idx_name, bowtie_str, extra_output))
2125 else:
2126 qryname = readfile_basename
2127 if len(reads_list) > 1:
2128 bnames=[]
2129 for fname in reads_list:
2130 bnames += [getFileBaseName(fname)]
2131 qryname = ",".join(bnames)
2132 th_log("Mapping %s to genome %s with %s %s" % (qryname,
2133 bwt_idx_name, bowtie_str, extra_output))
2134
2135 if use_FIFO:
2136 global unmapped_reads_fifo
2137 unmapped_reads_fifo=unmapped_reads+".fifo"
2138 if os.path.exists(unmapped_reads_fifo):
2139 os.remove(unmapped_reads_fifo)
2140 try:
2141 os.mkfifo(unmapped_reads_fifo)
2142 except OSError, o:
2143 die(fail_str+"Error at mkfifo("+unmapped_reads_fifo+'). '+str(o))
2144
2145 # Launch Bowtie
2146 try:
2147 bowtie_cmd = [bowtie_path]
2148 if reads_format == "fastq":
2149 bowtie_cmd += ["-q"]
2150 elif reads_format == "fasta":
2151 bowtie_cmd += ["-f"]
2152 if params.read_params.color:
2153 bowtie_cmd += ["-C", "--col-keepends"]
2154
2155 unzip_cmd=None
2156 bam_input=False
2157 if len(reads_list) > 0 and reads_list[0].endswith('.bam'):
2158 bam_input=True
2159 unzip_cmd=[ prog_path('bam2fastx'), "--all" ]
2160 if params.read_params.color:
2161 unzip_cmd.append("--color")
2162 if reads_format:
2163 unzip_cmd.append("--" + reads_format)
2164 unzip_cmd+=[reads_list[0]]
2165
2166 if use_zpacker and (unzip_cmd is None):
2167 unzip_cmd=[ params.system_params.zipper ]
2168 unzip_cmd.extend(params.system_params.zipper_opts)
2169 unzip_cmd+=['-cd']
2170
2171 fifo_pid=None
2172 if use_FIFO:
2173 unm_zipcmd=[ params.system_params.zipper ]
2174 unm_zipcmd.extend(params.system_params.zipper_opts)
2175 unm_zipcmd+=['-c']
2176 print >> run_log, ' '.join(unm_zipcmd)+' < '+ unmapped_reads_fifo + ' > '+ unmapped_reads_out + ' & '
2177 fifo_pid=os.fork()
2178 if fifo_pid==0:
2179 def on_sig_exit(sig, func=None):
2180 os._exit(os.EX_OK)
2181 signal.signal(signal.SIGTERM, on_sig_exit)
2182 subprocess.call(unm_zipcmd,
2183 stdin=open(unmapped_reads_fifo, "r"),
2184 stderr=tophat_log,
2185 stdout=open(unmapped_reads_out, "wb"))
2186 os._exit(os.EX_OK)
2187
2188 fix_map_cmd = [prog_path('fix_map_ordering')]
2189 if params.read_params.color:
2190 fix_map_cmd += ["--color"]
2191
2192 if params.bowtie2:
2193 #if t_mapping or g_mapping:
2194 max_penalty, min_penalty = params.bowtie2_params.mp.split(',')
2195 max_penalty, min_penalty = int(max_penalty), int(min_penalty)
2196 min_score = (max_penalty - 1) * realign_edit_dist
2197 fix_map_cmd += ["--bowtie2-min-score", str(min_score)]
2198 # testing score filtering
2199 if params.b2scoreflt:
2200 fix_map_cmd +=["-W"+str(min_score+max_penalty)]
2201 fix_map_cmd += ["--read-mismatches", str(params.read_mismatches),
2202 "--read-gap-length", str(params.read_gap_length),
2203 "--read-edit-dist", str(params.read_edit_dist),
2204 "--read-realign-edit-dist", str(params.read_realign_edit_dist)]
2205
2206 #write BAM file
2207 out_bam = mapped_reads
2208
2209 if not t_mapping:
2210 fix_map_cmd += ["--index-outfile", mapped_reads + ".index"]
2211 if not params.bowtie2:
2212 fix_map_cmd += ["--bowtie1"]
2213 if multihits_out != None:
2214 fix_map_cmd += ["--aux-outfile", params.preflt_data[multihits_out].multihit_reads]
2215 fix_map_cmd += ["--max-multihits", str(params.max_hits)]
2216 if t_mapping:
2217 out_bam = "-" # we'll pipe into map2gtf
2218 fix_map_cmd += ["--sam-header", sam_header_filename, "-", out_bam]
2219 if unmapped_reads:
2220 fix_map_cmd += [unmapped_reads_out]
2221 if t_mapping:
2222 max_hits = params.t_max_hits
2223 elif seg_mapping:
2224 max_hits = params.max_seg_hits
2225 else:
2226 max_hits = params.max_hits
2227
2228 if num_mismatches > 3:
2229 num_mismatches = 3
2230
2231 if params.bowtie2:
2232 if seg_mapping or multihits_out != None:
2233 # since bowtie2 does not suppress reads that map to too many places,
2234 # we suppress those in segment_juncs and long_spanning_reads.
2235 bowtie_cmd += ["-k", str(max_hits + 1)]
2236 else:
2237 bowtie_cmd += ["-k", str(max_hits)]
2238
2239 bowtie2_params = params.bowtie2_params
2240 if seg_mapping:
                # After intensive testing, the following parameters seem to run
                # faster than Bowtie1 while being just as sensitive, but there
                # is room for further improvement.
2244 bowtie_cmd += ["-N", str(min(num_mismatches, 1))]
2245 bowtie_cmd += ["-i", "C,2,0"]
2246 bowtie_cmd += ["-L", "20"]
2247 # bowtie_cmd += ["-L", str(min(params.segment_length, 20))]
2248 else:
2249 bowtie2_preset = ""
2250 if bowtie2_params.very_fast:
2251 bowtie2_preset = "--very-fast"
2252 elif bowtie2_params.fast:
2253 bowtie2_preset = "--fast"
2254 elif bowtie2_params.sensitive:
2255 bowtie2_preset = "--sensitive"
2256 elif bowtie2_params.very_sensitive:
2257 bowtie2_preset = "--very-sensitive"
2258
2259 if bowtie2_preset != "":
2260 bowtie_cmd += [bowtie2_preset]
2261 else:
2262 bowtie_cmd += ["-D", str(bowtie2_params.D),
2263 "-R", str(bowtie2_params.R),
2264 "-N", str(bowtie2_params.N),
2265 "-L", str(bowtie2_params.L),
2266 "-i", bowtie2_params.i]
2267
2268 score_min = bowtie2_params.score_min
2269 if not score_min:
2270 max_penalty, min_penalty = bowtie2_params.mp.split(',')
2271 score_min = "C,-%d,0" % (int(max_penalty) * edit_dist + 2)
2272
2273 # "--n-ceil" is not correctly parsed in Bowtie2,
2274 # I (daehwan) already talked to Ben who will fix the problem.
2275 bowtie_cmd += [# "--n-ceil", bowtie2_params.n_ceil,
2276 "--gbar", str(bowtie2_params.gbar),
2277 "--mp", bowtie2_params.mp,
2278 "--np", str(bowtie2_params.np),
2279 "--rdg", bowtie2_params.rdg,
2280 "--rfg", bowtie2_params.rfg,
2281 "--score-min", score_min]
2282
2283 else:
2284 bowtie_cmd += [params.bowtie_alignment_option, str(num_mismatches),
2285 "-k", str(max_hits),
2286 "-m", str(max_hits),
2287 "-S"]
2288
2289 bowtie_cmd += ["-p", str(params.system_params.num_threads)]
2290
2291 if params.bowtie2: #always use headerless SAM file
2292 bowtie_cmd += ["--sam-no-hd"]
2293 else:
2294 bowtie_cmd += ["--sam-nohead"]
2295
2296 if not params.bowtie2:
2297 if multihits_out != None:
2298 bowtie_cmd += ["--max", params.preflt_data[multihits_out].multihit_reads]
2299 else:
2300 bowtie_cmd += ["--max", "/dev/null"]
2301
2302 if params.bowtie2:
2303 bowtie_cmd += ["-x"]
2304
2305 bowtie_cmd += [ bwt_idx_prefix ]
2306 bowtie_proc=None
2307 shellcmd=""
2308 unzip_proc=None
2309
2310 if multihits_out != None:
2311 #special prefilter bowtie run: we use prep_reads on the fly
2312 #in order to get multi-mapped reads to exclude later
2313 prep_cmd = prep_reads_cmd(params, params.preflt_data[0].seqfiles, params.preflt_data[0].qualfiles,
2314 params.preflt_data[1].seqfiles, params.preflt_data[1].qualfiles)
2315 prep_cmd.insert(1,"--flt-side="+str(multihits_out))
2316 sides=["left", "right"]
2317 preplog_fname=logging_dir + "prep_reads.prefilter_%s.log" % sides[multihits_out]
2318 prepfilter_log = open(preplog_fname,"w")
2319 unzip_proc = subprocess.Popen(prep_cmd,
2320 stdout=subprocess.PIPE,
2321 stderr=prepfilter_log)
2322 shellcmd=' '.join(prep_cmd) + "|"
2323 else:
2324 z_input=use_zpacker and reads_file.endswith(".z")
2325 if z_input:
2326 unzip_proc = subprocess.Popen(unzip_cmd,
2327 stdin=open(reads_file, "rb"),
2328 stderr=tophat_log, stdout=subprocess.PIPE)
2329 shellcmd=' '.join(unzip_cmd) + "< " +reads_file +"|"
2330 else:
2331 #must be uncompressed fastq input (unmapped reads from a previous run)
2332 #or a BAM file with unmapped reads
2333 if bam_input:
2334 unzip_proc = subprocess.Popen(unzip_cmd, stderr=tophat_log, stdout=subprocess.PIPE)
2335 shellcmd=' '.join(unzip_cmd) + "|"
2336 else:
2337 bowtie_cmd += [reads_file]
2338 if not unzip_proc:
2339 bowtie_proc = subprocess.Popen(bowtie_cmd,
2340 stdout=subprocess.PIPE,
2341 stderr=open(bwt_logname, "w"))
2342 if unzip_proc:
2343 #input is compressed OR prep_reads is used as a filter
2344 bowtie_cmd += ['-']
2345 bowtie_proc = subprocess.Popen(bowtie_cmd,
2346 stdin=unzip_proc.stdout,
2347 stdout=subprocess.PIPE,
2348 stderr=open(bwt_logname, "w"))
2349 unzip_proc.stdout.close() # see http://bugs.python.org/issue7678
2350
2351 shellcmd += ' '.join(bowtie_cmd) + '|' + ' '.join(fix_map_cmd)
2352 pipeline_proc = None
2353 fix_order_proc = None
2354 #write BAM format directly
2355 if t_mapping:
2356 #pipe into map2gtf
2357 fix_order_proc = subprocess.Popen(fix_map_cmd,
2358 stdin=bowtie_proc.stdout,
2359 stdout=subprocess.PIPE,
2360 stderr=tophat_log)
2361 bowtie_proc.stdout.close()
2362 m2g_cmd = [prog_path("map2gtf")]
2363 m2g_cmd += ["--sam-header", genome_sam_header_filename]
2364 m2g_cmd.append(params.gff_annotation)
2365 m2g_cmd.append("-") #incoming uncompressed BAM stream
2366 m2g_cmd.append(mapped_reads)
2367 m2g_log = logging_dir + "m2g_"+readfile_basename+".out"
2368 m2g_err = logging_dir + "m2g_"+readfile_basename+".err"
2369 shellcmd += ' | '+' '.join(m2g_cmd)+ ' > '+m2g_log
2370 pipeline_proc = subprocess.Popen(m2g_cmd,
2371 stdin=fix_order_proc.stdout,
2372 stdout=open(m2g_log, "w"),
2373 stderr=open(m2g_err, "w"))
2374 fix_order_proc.stdout.close()
2375 else:
2376 fix_order_proc = subprocess.Popen(fix_map_cmd,
2377 stdin=bowtie_proc.stdout,
2378 stderr=tophat_log)
2379 bowtie_proc.stdout.close()
2380 pipeline_proc = fix_order_proc
2381
2382 print >> run_log, shellcmd
2383 retcode = None
2384 if pipeline_proc:
2385 pipeline_proc.communicate()
2386 retcode = pipeline_proc.returncode
2387 bowtie_proc.wait()
2388 r=bowtie_proc.returncode
2389 if r:
2390 die(fail_str+"Error running bowtie:\n"+log_tail(bwt_logname,100))
2391 if use_FIFO:
2392 if fifo_pid and not os.path.exists(unmapped_reads_out):
2393 try:
2394 os.kill(fifo_pid, signal.SIGTERM)
2395 except:
2396 pass
2397 if retcode:
2398 die(fail_str+"Error running:\n"+shellcmd)
2399 except OSError, o:
2400 die(fail_str+"Error: "+str(o))
2401
2402 # Success
2403 #finish_time = datetime.now()
2404 #duration = finish_time - start_time
2405 #print >> sys.stderr, "\t\t\t[%s elapsed]" % formatTD(duration)
2406 if use_FIFO:
2407 try:
2408 os.remove(unmapped_reads_fifo)
2409 except:
2410 pass
2411 if multihits_out != None and not os.path.exists(params.preflt_data[multihits_out].multihit_reads):
2412 open(params.preflt_data[multihits_out].multihit_reads, "w").close()
2413
2414 if seg_mapping:
2415 if not params.bowtie2:
2416 params.bowtie_alignment_option = backup_bowtie_alignment_option
2417
2418 return (mapped_reads, unmapped_reads_out)
2419
2420
2421 # Retrieve a .juncs file from a GFF file by calling the gtf_juncs executable
2422 def get_gtf_juncs(gff_annotation):
2423 th_log("Reading known junctions from GTF file")
2424 gtf_juncs_log = open(logging_dir + "gtf_juncs.log", "w")
2425
2426 gff_prefix = gff_annotation.split('/')[-1].split('.')[0]
2427
2428 gtf_juncs_out_name = tmp_dir + gff_prefix + ".juncs"
2429 gtf_juncs_out = open(gtf_juncs_out_name, "w")
2430
2431 gtf_juncs_cmd=[prog_path("gtf_juncs"), gff_annotation]
2432 try:
2433 print >> run_log, " ".join(gtf_juncs_cmd), " > "+gtf_juncs_out_name
2434 retcode = subprocess.call(gtf_juncs_cmd,
2435 stderr=gtf_juncs_log,
2436 stdout=gtf_juncs_out)
        # gtf_juncs returned an error
2438 if retcode == 1:
2439 th_logp("\tWarning: TopHat did not find any junctions in GTF file")
2440 return (False, gtf_juncs_out_name)
2441 elif retcode != 0:
2442 die(fail_str+"Error: GTF junction extraction failed with err ="+str(retcode))
2443
    # gtf_juncs not found
2445 except OSError, o:
2446 errmsg=fail_str+str(o)+"\n"
2447 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2448 errmsg+="Error: gtf_juncs not found on this system"
2449 die(errmsg)
2450 return (True, gtf_juncs_out_name)
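# Note: each line of the resulting .juncs file describes one junction as
# <chrom> <left> <right> <+/->, with zero-based coordinates pointing at the
# last base of the upstream exon and the first base of the downstream exon
# (as described in the TopHat manual for raw junction files).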
2451
# Call bowtie-build on the FASTA file of synthetic splice junction sequences
2453 def build_juncs_bwt_index(is_bowtie2, external_splice_prefix, color):
2454 th_log("Indexing splices")
2455 bowtie_build_log = open(logging_dir + "bowtie_build.log", "w")
2456
2457 #user_splices_out_prefix = output_dir + "user_splices_idx"
2458
2459 if is_bowtie2:
2460 bowtie_build_cmd = [prog_path("bowtie2-build")]
2461 else:
2462 bowtie_build_cmd = [prog_path("bowtie-build")]
2463
2464 if color:
2465 bowtie_build_cmd += ["-C"]
2466
2467 bowtie_build_cmd += [external_splice_prefix + ".fa",
2468 external_splice_prefix]
2469 try:
2470 print >> run_log, " ".join(bowtie_build_cmd)
2471 retcode = subprocess.call(bowtie_build_cmd,
2472 stdout=bowtie_build_log)
2473
2474 if retcode != 0:
2475 die(fail_str+"Error: Splice sequence indexing failed with err ="+ str(retcode))
2476 except OSError, o:
2477 errmsg=fail_str+str(o)+"\n"
2478 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2479 errmsg+="Error: bowtie-build not found on this system"
2480 die(errmsg)
2481 return external_splice_prefix
2482
2483 # Build a splice index from a .juncs file, suitable for use with specified read
2484 # (or read segment) lengths
2485 def build_juncs_index(is_bowtie2,
2486 min_anchor_length,
2487 max_seg_len,
2488 juncs_prefix,
2489 external_juncs,
2490 external_insertions,
2491 external_deletions,
2492 external_fusions,
2493 reference_fasta,
2494 color):
2495 th_log("Retrieving sequences for splices")
2496 juncs_file_list = ",".join(external_juncs)
2497 insertions_file_list = ",".join(external_insertions)
2498 deletions_file_list = ",".join(external_deletions)
2499 fusions_file_list = ",".join(external_fusions)
2500
2501 # do not use insertions and deletions in case of Bowtie2
2502 if is_bowtie2:
2503 insertions_file_list = "/dev/null"
2504 deletions_file_list = "/dev/null"
2505
2506 juncs_db_log = open(logging_dir + "juncs_db.log", "w")
2507
2508 external_splices_out_prefix = tmp_dir + juncs_prefix
2509 external_splices_out_name = external_splices_out_prefix + ".fa"
2510
2511 external_splices_out = open(external_splices_out_name, "w")
2512 # juncs_db_cmd = [bin_dir + "juncs_db",
2513 juncs_db_cmd = [prog_path("juncs_db"),
2514 str(min_anchor_length),
2515 str(max_seg_len),
2516 juncs_file_list,
2517 insertions_file_list,
2518 deletions_file_list,
2519 fusions_file_list,
2520 reference_fasta]
2521 try:
2522 print >> run_log, " ".join(juncs_db_cmd) + " > " + external_splices_out_name
2523 retcode = subprocess.call(juncs_db_cmd,
2524 stderr=juncs_db_log,
2525 stdout=external_splices_out)
2526
2527 if retcode != 0:
2528 die(fail_str+"Error: Splice sequence retrieval failed with err ="+str(retcode))
2529 # juncs_db not found
2530 except OSError, o:
2531 errmsg=fail_str+str(o)+"\n"
2532 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2533 errmsg+="Error: juncs_db not found on this system"
2534 die(errmsg)
2535
2536 external_splices_out_prefix = build_juncs_bwt_index(is_bowtie2, external_splices_out_prefix, color)
2537 return external_splices_out_prefix
2538
2539 def build_idx_from_fa(is_bowtie2, fasta_fname, out_dir, color):
2540 """ Build a bowtie index from a FASTA file.
2541
2542 Arguments:
2543 - `fasta_fname`: File path to FASTA file.
2544 - `out_dir`: Output directory to place index in. (includes os.sep)
2545
2546 Returns:
2547 - The path to the Bowtie index.
2548 """
2549 bwt_idx_path = out_dir + os.path.basename(fasta_fname).replace(".fa", "")
2550
2551 if is_bowtie2:
2552 bowtie_idx_cmd = [prog_path("bowtie2-build")]
2553 else:
2554 bowtie_idx_cmd = [prog_path("bowtie-build")]
2555
2556 if color:
2557 bowtie_idx_cmd += ["-C"]
2558
2559 bowtie_idx_cmd += [fasta_fname,
2560 bwt_idx_path]
2561 try:
2562 th_log("Building Bowtie index from " + os.path.basename(fasta_fname))
2563 print >> run_log, " ".join(bowtie_idx_cmd)
2564 retcode = subprocess.call(bowtie_idx_cmd,
2565 stdout=open(os.devnull, "w"),
2566 stderr=open(os.devnull, "w"))
2567 if retcode != 0:
2568 die(fail_str + "Error: Couldn't build bowtie index with err = "
2569 + str(retcode))
2570 except OSError, o:
2571 errmsg=fail_str+str(o)+"\n"
2572 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2573 errmsg+="Error: bowtie-build not found on this system"
2574 die(errmsg)
2575
2576 return bwt_idx_path
2577
2578 # Print out the sam header, embedding the user's specified library properties.
2579 # FIXME: also needs SQ dictionary lines
2580 def write_sam_header(read_params, sam_file):
2581 print >> sam_file, "@HD\tVN:1.0\tSO:coordinate"
2582 if read_params.read_group_id and read_params.sample_id:
2583 rg_str = "@RG\tID:%s\tSM:%s" % (read_params.read_group_id,
2584 read_params.sample_id)
2585 if read_params.library_id:
2586 rg_str += "\tLB:%s" % read_params.library_id
2587 if read_params.description:
2588 rg_str += "\tDS:%s" % read_params.description
2589 if read_params.seq_platform_unit:
2590 rg_str += "\tPU:%s" % read_params.seq_platform_unit
2591 if read_params.seq_center:
2592 rg_str += "\tCN:%s" % read_params.seq_center
2593 if read_params.mate_inner_dist:
2594 rg_str += "\tPI:%s" % read_params.mate_inner_dist
2595 if read_params.seq_run_date:
2596 rg_str += "\tDT:%s" % read_params.seq_run_date
2597 if read_params.seq_platform:
2598 rg_str += "\tPL:%s" % read_params.seq_platform
2599
2600 print >> sam_file, rg_str
2601 print >> sam_file, "@PG\tID:TopHat\tVN:%s\tCL:%s" % (get_version(), run_cmd)
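# Illustrative output for read_group_id="rg1", sample_id="smpl1" (fields are
# tab-separated; version and command line filled in at run time):
#   @HD  VN:1.0  SO:coordinate
#   @RG  ID:rg1  SM:smpl1
#   @PG  ID:TopHat  VN:<version>  CL:<command line>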
2602
2603 # Write final TopHat output, via tophat_reports and wiggles
2604 def compile_reports(params, sam_header_filename, ref_fasta, mappings, readfiles, gff_annotation):
2605 th_log("Reporting output tracks")
2606 left_maps, right_maps = mappings
2607 left_reads, right_reads = readfiles
2608 # left_maps = [x for x in left_maps if (os.path.exists(x) and os.path.getsize(x) > 25)]
2609 left_maps = ','.join(left_maps)
2610
2611 if len(right_maps) > 0:
2612 # right_maps = [x for x in right_maps if (os.path.exists(x) and os.path.getsize(x) > 25)]
2613 right_maps = ','.join(right_maps)
2614
2615 log_fname = logging_dir + "reports.log"
2616 report_log = open(log_fname, "w")
2617 junctions = output_dir + "junctions.bed"
2618 insertions = output_dir + "insertions.bed"
2619 deletions = output_dir + "deletions.bed"
2620 accepted_hits = output_dir + "accepted_hits"
2621 report_cmdpath = prog_path("tophat_reports")
2622 fusions = output_dir + "fusions.out"
2623 report_cmd = [report_cmdpath]
2624
2625 alignments_output_filename = tmp_dir + "accepted_hits"
2626
2627 report_cmd.extend(params.cmd())
2628 report_cmd += ["--sam-header", sam_header_filename]
2629 if params.report_secondary_alignments:
2630 report_cmd += ["--report-secondary-alignments"]
2631
2632 if params.report_discordant_pair_alignments:
2633 report_cmd += ["--report-discordant-pair-alignments"]
2634
2635 if params.report_mixed_alignments:
2636 report_cmd += ["--report-mixed-alignments"]
2637
2638 report_cmd.extend(["--samtools="+samtools_path])
2639
2640 b2_params = params.bowtie2_params
2641 max_penalty, min_penalty = b2_params.mp.split(',')
2642 report_cmd += ["--bowtie2-max-penalty", max_penalty,
2643 "--bowtie2-min-penalty", min_penalty]
2644
2645 report_cmd += ["--bowtie2-penalty-for-N", str(b2_params.np)]
2646
2647 read_gap_open, read_gap_cont = b2_params.rdg.split(',')
2648 report_cmd += ["--bowtie2-read-gap-open", read_gap_open,
2649 "--bowtie2-read-gap-cont", read_gap_cont]
2650
2651 ref_gap_open, ref_gap_cont = b2_params.rfg.split(',')
2652 report_cmd += ["--bowtie2-ref-gap-open", ref_gap_open,
2653 "--bowtie2-ref-gap-cont", ref_gap_cont]
2654
2655 report_cmd.extend([ref_fasta,
2656 junctions,
2657 insertions,
2658 deletions,
2659 fusions,
2660 alignments_output_filename,
2661 left_maps,
2662 left_reads])
2663
2664 if len(right_maps) > 0 and right_reads:
2665 report_cmd.append(right_maps)
2666 report_cmd.append(right_reads)
2667
2668 try:
2669 print >> run_log, " ".join(report_cmd)
2670 report_proc=subprocess.call(report_cmd,
2671 preexec_fn=subprocess_setup,
2672 stderr=report_log)
2673 if report_proc != 0:
2674 die(fail_str+"Error running "+" ".join(report_cmd)+"\n"+log_tail(log_fname))
2675 bam_parts = []
2676 for i in range(params.system_params.num_threads):
2677 bam_part_filename = "%s%d.bam" % (alignments_output_filename, i)
2678 if os.path.exists(bam_part_filename):
2679 bam_parts.append(bam_part_filename)
2680 else:
2681 break
2682 num_bam_parts = len(bam_parts)
2683
2684 if params.report_params.sort_bam:
2685 pids = [0 for i in range(num_bam_parts)]
2686 sorted_bam_parts = ["%s%d_sorted" % (alignments_output_filename, i) for i in range(num_bam_parts)]
2687 #left_um_parts = ["%s%s%d_sorted" % (alignments_output_filename, i) for i in range(num_bam_parts)]
2688 #right_um_parts = ["%s%d_sorted" % (alignments_output_filename, i) for i in range(num_bam_parts)]
2689 for i in range(num_bam_parts):
2690 bamsort_cmd = [samtools_path,
2691 "sort",
2692 bam_parts[i],
2693 sorted_bam_parts[i]]
2694
2695 sorted_bam_parts[i] += ".bam"
2696 print >> run_log, " ".join(bamsort_cmd)
2697
                if i + 1 < num_bam_parts: # sort all but the last part in forked children
2699 pid = os.fork()
2700 if pid == 0:
2701 subprocess.call(bamsort_cmd,
2702 stderr=open(logging_dir + "reports.samtools_sort.log%d" % i, "w"))
2703 os._exit(os.EX_OK)
2704 else:
2705 pids[i] = pid
2706 else:
2707 subprocess.call(bamsort_cmd,
2708 stderr=open(logging_dir + "reports.samtools_sort.log%d" % i, "w"))
2709
2710 for i in range(len(pids)):
2711 if pids[i] > 0:
2712 os.waitpid(pids[i], 0)
2713 pids[i] = 0
2714
2715 for bam_part in bam_parts:
2716 os.remove(bam_part)
2717 bam_parts = sorted_bam_parts[:]
2718 #-- endif sort_bam
2719
2720 if num_bam_parts > 1:
2721 if params.report_params.sort_bam:
2722 bammerge_cmd = [samtools_path,
2723 "merge","-f","-h", sam_header_filename]
2724 if not params.report_params.convert_bam:
2725 bammerge_cmd += ["-u"]
2726 else: #not sorted, so just raw merge
2727 bammerge_cmd = [prog_path("bam_merge"), "-Q",
2728 "--sam-header", sam_header_filename]
2729
2730 if params.report_params.convert_bam:
2731 bammerge_cmd += ["%s.bam" % accepted_hits]
2732 bammerge_cmd += bam_parts
2733 print >> run_log, " ".join(bammerge_cmd)
2734 subprocess.call(bammerge_cmd,
2735 stderr=open(logging_dir + "reports.merge_bam.log", "w"))
2736 else: #make .sam
2737 bammerge_cmd += ["-"]
2738 bammerge_cmd += bam_parts
2739 merge_proc = subprocess.Popen(bammerge_cmd,
2740 stdout=subprocess.PIPE,
2741 stderr=open(logging_dir + "reports.merge_bam.log", "w"))
2742 bam2sam_cmd = [samtools_path, "view", "-h", "-"]
2743 sam_proc = subprocess.Popen(bam2sam_cmd,
2744 stdin=merge_proc.stdout,
2745 stdout=open(accepted_hits + ".sam", "w"),
2746 stderr=open(logging_dir + "accepted_hits_bam_to_sam.log", "w"))
2747 merge_proc.stdout.close()
2748 shellcmd = " ".join(bammerge_cmd) + " | " + " ".join(bam2sam_cmd)
2749 print >> run_log, shellcmd
2750 sam_proc.communicate()
2751 retcode = sam_proc.returncode
2752 if retcode:
2753 die(fail_str+"Error running:\n"+shellcmd)
2754 else: # only one file
2755 os.rename(bam_parts[0], accepted_hits+".bam")
2756 if not params.report_params.convert_bam:
2757 #just convert to .sam
2758 bam2sam_cmd = [samtools_path, "view", "-h", accepted_hits+".bam"]
2759 shellcmd = " ".join(bam2sam_cmd) + " > " + accepted_hits + ".sam"
2760 print >> run_log, shellcmd
2761 r = subprocess.call(bam2sam_cmd,
2762 stdout=open(accepted_hits + ".sam", "w"),
2763 stderr=open(logging_dir + "accepted_hits_bam_to_sam.log", "w"))
2764 if r != 0:
2765 die(fail_str+"Error running: "+shellcmd)
2766 os.remove(accepted_hits+".bam")
2767
2768 except OSError, o:
2769 die(fail_str+"Error: "+str(o)+"\n"+log_tail(log_fname))
2770
2771 try:
2772 # -- merge the unmapped files
2773 um_parts = []
2774 um_merged = output_dir + "unmapped.bam"
2775 for i in range(params.system_params.num_threads):
2776 left_um_file = tmp_dir + "unmapped_left_%d.bam" % i
2777 right_um_file = tmp_dir + "unmapped_right_%d.bam" % i
2778 um_len = len(um_parts)
2779 if nonzeroFile(left_um_file):
2780 um_parts.append(left_um_file)
2781 if right_reads and nonzeroFile(right_um_file):
2782 um_parts.append(right_um_file)
2783
2784 if len(um_parts) > 0:
2785 if len(um_parts)==1:
2786 os.rename(um_parts[0], um_merged)
2787 else:
2788 merge_cmd=[prog_path("bam_merge"), "-Q",
2789 "--sam-header", sam_header_filename, um_merged]
2790 merge_cmd += um_parts
2791 print >> run_log, " ".join(merge_cmd)
2792 ret = subprocess.call( merge_cmd,
2793 stderr=open(logging_dir + "bam_merge_um.log", "w") )
2794 if ret != 0:
2795 die(fail_str+"Error executing: "+" ".join(merge_cmd)+"\n"+log_tail(logging_dir+"bam_merge_um.log"))
2796 except OSError, o:
2797 die(fail_str+"Error: "+str(o)+"\n"+log_tail(log_fname))
2798
2799 return junctions
2800
2801
# Split up each read in a FASTQ file into multiple segments, creating a FASTQ
# file for each segment. This function needs to be fixed to support mixed
# read-length inputs.
2805 def open_output_files(prefix, num_files_prev, num_files, out_segf, extension, params):
2806 i = num_files_prev + 1
2807 while i <= num_files:
2808 segfname=prefix+("_seg%d" % i)+extension
2809 out_segf.append(ZWriter(segfname,params.system_params))
2810 i += 1
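# e.g. open_output_files("left", 0, 3, out_segf, ".fq.z", params) appends
# ZWriter objects for left_seg1.fq.z, left_seg2.fq.z and left_seg3.fq.z
# (file names illustrative, assuming use_zpacker is on).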
2811
2812 def split_reads(reads_filename,
2813 prefix,
2814 fasta,
2815 params,
2816 segment_length):
2817 #reads_file = open(reads_filename)
2818 out_segfiles = []
2819 if fasta:
2820 extension = ".fa"
2821 else:
2822 extension = ".fq"
2823 if use_zpacker: extension += ".z"
2824 existing_seg_files = glob.glob(prefix+"_seg*"+extension)
2825 if resumeStage > currentStage and len(existing_seg_files)>0:
2826 #skip this, we are going to return the existing files
2827 return existing_seg_files
2828 zreads = ZReader(reads_filename, params, False)
2829
2830 def convert_color_to_bp(color_seq):
2831 decode_dic = { 'A0':'A', 'A1':'C', 'A2':'G', 'A3':'T', 'A4':'N', 'A.':'N', 'AN':'N',
2832 'C0':'C', 'C1':'A', 'C2':'T', 'C3':'G', 'C4':'N', 'C.':'N', 'CN':'N',
2833 'G0':'G', 'G1':'T', 'G2':'A', 'G3':'C', 'G4':'N', 'G.':'N', 'GN':'N',
2834 'T0':'T', 'T1':'G', 'T2':'C', 'T3':'A', 'T4':'N', 'T.':'N', 'TN':'N',
2835 'N0':'N', 'N1':'N', 'N2':'N', 'N3':'N', 'N4':'N', 'N.':'N', 'NN':'N',
2836 '.0':'N', '.1':'N', '.2':'N', '.3':'N', '.4':'N', '..':'N', '.N':'N' }
2837
2838 base = color_seq[0]
2839 bp_seq = base
2840 for ch in color_seq[1:]:
2841 base = decode_dic[base+ch]
2842 bp_seq += base
2843 return bp_seq
2844
2845 def convert_bp_to_color(bp_seq):
2846 encode_dic = { 'AA':'0', 'CC':'0', 'GG':'0', 'TT':'0',
2847 'AC':'1', 'CA':'1', 'GT':'1', 'TG':'1',
2848 'AG':'2', 'CT':'2', 'GA':'2', 'TC':'2',
2849 'AT':'3', 'CG':'3', 'GC':'3', 'TA':'3',
2850 'A.':'4', 'C.':'4', 'G.':'4', 'T.':'4',
2851 '.A':'4', '.C':'4', '.G':'4', '.T':'4',
2852 '.N':'4', 'AN':'4', 'CN':'4', 'GN':'4',
2853 'TN':'4', 'NA':'4', 'NC':'4', 'NG':'4',
2854 'NT':'4', 'NN':'4', 'N.':'4', '..':'4' }
2855
2856 base = bp_seq[0]
2857 color_seq = base
2858 for ch in bp_seq[1:]:
2859 color_seq += encode_dic[base + ch]
2860 base = ch
2861
2862 return color_seq
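# e.g. convert_color_to_bp("A0123") == "AACTA" and, inversely,
#      convert_bp_to_color("AACTA") == "A0123"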
2863
2864 def split_record(read_name, read_seq, read_qual, out_segf, offsets, color):
2865 if color:
2866 color_offset = 1
2867 read_seq_temp = convert_color_to_bp(read_seq)
2868
2869 seg_num = 1
2870 while seg_num + 1 < len(offsets):
2871 if read_seq[offsets[seg_num]+1] not in ['0', '1', '2', '3']:
2872 return
2873 seg_num += 1
2874 else:
2875 color_offset = 0
2876
2877 seg_num = 0
2878 last_seq_offset = 0
2879 while seg_num + 1 < len(offsets):
2880 f = out_segf[seg_num].file
2881 seg_seq = read_seq[last_seq_offset+color_offset:offsets[seg_num + 1]+color_offset]
2882 print >> f, "%s|%d:%d:%d" % (read_name,last_seq_offset,seg_num, len(offsets) - 1)
2883 if color:
2884 print >> f, "%s%s" % (read_seq_temp[last_seq_offset], seg_seq)
2885 else:
2886 print >> f, seg_seq
2887 if not fasta:
2888 seg_qual = read_qual[last_seq_offset:offsets[seg_num + 1]]
2889 print >> f, "+"
2890 print >> f, seg_qual
2891 seg_num += 1
2892 last_seq_offset = offsets[seg_num]
2893
2894 line_state = 0
2895 read_name = ""
2896 read_seq = ""
2897 read_quals = ""
2898 num_segments = 0
2899 offsets = []
2900 for line in zreads.file:
2901 if line.strip() == "":
2902 continue
2903 if line_state == 0:
2904 read_name = line.strip()
2905 elif line_state == 1:
2906 read_seq = line.strip()
2907
2908 read_length = len(read_seq)
2909 tmp_num_segments = read_length / segment_length
2910 offsets = [segment_length * i for i in range(0, tmp_num_segments + 1)]
2911
2912 # Bowtie's minimum read length here is 20bp, so if the last segment
2913 # is between 20 and segment_length bp long, go ahead and write it out
2914 if read_length % segment_length >= min(segment_length - 2, 20):
2915 offsets.append(read_length)
2916 tmp_num_segments += 1
2917 else:
2918 offsets[-1] = read_length
2919
2920 if tmp_num_segments == 1:
2921 offsets = [0, read_length]
2922
2923 if tmp_num_segments > num_segments:
2924 open_output_files(prefix, num_segments, tmp_num_segments, out_segfiles, extension, params)
2925 num_segments = tmp_num_segments
2926
2927 if fasta:
2928 split_record(read_name, read_seq, None, out_segfiles, offsets, params.read_params.color)
2929 elif line_state == 2:
2930 line = line.strip()
2931 else:
2932 read_quals = line.strip()
2933 if not fasta:
2934 split_record(read_name, read_seq, read_quals, out_segfiles, offsets, params.read_params.color)
2935
2936 line_state += 1
2937 if fasta:
2938 line_state %= 2
2939 else:
2940 line_state %= 4
2941 zreads.close()
2942 out_fnames=[]
2943 for zf in out_segfiles:
2944 zf.close()
2945 out_fnames.append(zf.fname)
2946 #return [o.fname for o in out_segfiles]
2947 return out_fnames
2948
2949 # Find possible splice junctions using the "closure search" strategy, and report
2950 # them in closures.juncs. Calls the executable closure_juncs
2951 def junctions_from_closures(params,
2952 sam_header_filename,
2953 left_maps,
2954 right_maps,
2955 ref_fasta):
2956 th_log("Searching for junctions via mate-pair closures")
2957
2958
2959 #maps = [x for x in seg_maps if (os.path.exists(x) and os.path.getsize(x) > 0)]
2960 #if len(maps) == 0:
2961 # return None
2962 slash = left_maps[0].rfind('/')
2963 juncs_out = ""
2964 if slash != -1:
2965 juncs_out += left_maps[0][:slash+1]
2966 fusions_out = juncs_out
2967
2968 juncs_out += "closure.juncs"
2969 fusions_out += "closure.fusions"
2970
2971 juncs_log = open(logging_dir + "closure.log", "w")
2972 juncs_cmdpath=prog_path("closure_juncs")
2973 juncs_cmd = [juncs_cmdpath]
2974
2975 left_maps = ','.join(left_maps)
2976 right_maps = ','.join(right_maps)
2977
2978 juncs_cmd.extend(params.cmd())
2979 juncs_cmd.extend(["--sam-header", sam_header_filename,
2980 juncs_out,
2981 fusions_out,
2982 ref_fasta,
2983 left_maps,
2984 right_maps])
2985 try:
2986 print >> run_log, ' '.join(juncs_cmd)
2987 retcode = subprocess.call(juncs_cmd,
2988 stderr=juncs_log)
2989
        # closure_juncs returned an error
2991 if retcode != 0:
2992 die(fail_str+"Error: closure-based junction search failed with err ="+str(retcode))
    # closure_juncs not found
2994 except OSError, o:
2995 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
2996 th_logp(fail_str + "Error: closure_juncs not found on this system")
2997 die(str(o))
2998 return [juncs_out]
2999
3000 # Find possible junctions by examining coverage and split segments in the initial
3001 # map and segment maps. Report junctions, insertions, and deletions in segment.juncs,
3002 # segment.insertions, and segment.deletions. Calls the executable
3003 # segment_juncs
3004 def junctions_from_segments(params,
3005 sam_header_filename,
3006 left_reads,
3007 left_reads_map,
3008 left_seg_maps,
3009 right_reads,
3010 right_reads_map,
3011 right_seg_maps,
3012 unmapped_reads,
3013 reads_format,
3014 ref_fasta):
3015 # if left_reads_map != left_seg_maps[0]:
3016
3017 out_path=getFileDir(left_seg_maps[0])
3018 juncs_out=out_path+"segment.juncs"
3019 insertions_out=out_path+"segment.insertions"
3020 deletions_out =out_path+"segment.deletions"
3021 fusions_out = out_path+"segment.fusions"
3022 if resumeStage>currentStage and fileExists(juncs_out):
3023 return [juncs_out, insertions_out, deletions_out, fusions_out]
3024 th_log("Searching for junctions via segment mapping")
3025 if params.coverage_search == True:
3026 print >> sys.stderr, "\tCoverage-search algorithm is turned on, making this step very slow"
3027 print >> sys.stderr, "\tPlease try running TopHat again with the option (--no-coverage-search) if this step takes too much time or memory."
3028
3029 left_maps = ','.join(left_seg_maps)
3030 log_fname = logging_dir + "segment_juncs.log"
3031 segj_log = open(log_fname, "w")
3032 segj_cmd = [prog_path("segment_juncs")]
3033
3034 segj_cmd.extend(params.cmd())
3035 segj_cmd.extend(["--sam-header", sam_header_filename,
3036 "--ium-reads", ",".join(unmapped_reads),
3037 ref_fasta,
3038 juncs_out,
3039 insertions_out,
3040 deletions_out,
3041 fusions_out,
3042 left_reads,
3043 left_reads_map,
3044 left_maps])
3045 if right_seg_maps:
3046 right_maps = ','.join(right_seg_maps)
3047 segj_cmd.extend([right_reads, right_reads_map, right_maps])
3048 try:
3049 print >> run_log, " ".join(segj_cmd)
3050 retcode = subprocess.call(segj_cmd,
3051 preexec_fn=subprocess_setup,
3052 stderr=segj_log)
3053
        # segment_juncs returned an error
3055 if retcode != 0:
3056 die(fail_str+"Error: segment-based junction search failed with err ="+str(retcode)+"\n"+log_tail(log_fname))
3057
    # segment_juncs not found
3059 except OSError, o:
3060 if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
3061 th_logp(fail_str + "Error: segment_juncs not found on this system")
3062 die(str(o))
3063
3064 return [juncs_out, insertions_out, deletions_out, fusions_out]
3065
3066 # Joins mapped segments into full-length read alignments via the executable
3067 # long_spanning_reads
3068 def join_mapped_segments(params,
3069 sam_header_filename,
3070 reads,
3071 ref_fasta,
3072 possible_juncs,
3073 possible_insertions,
3074 possible_deletions,
3075 possible_fusions,
3076 contig_seg_maps,
3077 spliced_seg_maps,
3078 alignments_out_name):
3079 rn=""
3080 if len(contig_seg_maps)>1:
3081 th_log("Joining segment hits")
3082 rn=".segs"
3083 else:
3084 th_log("Processing bowtie hits")
3085 contig_seg_maps = ','.join(contig_seg_maps)
3086
3087 possible_juncs = ','.join(possible_juncs)
3088 possible_insertions = ",".join(possible_insertions)
3089 possible_deletions = ",".join(possible_deletions)
3090 possible_fusions = ",".join(possible_fusions)
3091
3092 log_fname=logging_dir + "long_spanning_reads"+rn+".log"
3093 align_log = open(log_fname, "w")
3094 align_cmd = [prog_path("long_spanning_reads")]
3095
3096 align_cmd.extend(params.cmd())
3097 align_cmd += ["--sam-header", sam_header_filename]
3098
3099 b2_params = params.bowtie2_params
3100 max_penalty, min_penalty = b2_params.mp.split(',')
3101 align_cmd += ["--bowtie2-max-penalty", max_penalty,
3102 "--bowtie2-min-penalty", min_penalty]
3103
3104 align_cmd += ["--bowtie2-penalty-for-N", str(b2_params.np)]
3105
3106 read_gap_open, read_gap_cont = b2_params.rdg.split(',')
3107 align_cmd += ["--bowtie2-read-gap-open", read_gap_open,
3108 "--bowtie2-read-gap-cont", read_gap_cont]
3109
3110 ref_gap_open, ref_gap_cont = b2_params.rfg.split(',')
3111 align_cmd += ["--bowtie2-ref-gap-open", ref_gap_open,
3112 "--bowtie2-ref-gap-cont", ref_gap_cont]
3113
3114 align_cmd.append(ref_fasta)
3115 align_cmd.extend([reads,
3116 possible_juncs,
3117 possible_insertions,
3118 possible_deletions,
3119 possible_fusions,
3120 alignments_out_name,
3121 contig_seg_maps])
3122
3123 if spliced_seg_maps:
3124 spliced_seg_maps = ','.join(spliced_seg_maps)
3125 align_cmd.append(spliced_seg_maps)
3126
3127 try:
3128 print >> run_log, " ".join(align_cmd)
3129 ret = subprocess.call(align_cmd,
3130 stderr=align_log)
3131 if ret:
3132 die(fail_str+"Error running 'long_spanning_reads':"+log_tail(log_fname))
3133 except OSError, o:
3134 die(fail_str+"Error: "+str(o))
3135
3136 # This class collects spliced and unspliced alignments for each of the
3137 # left and right read files provided by the user.
3138 class Maps:
3139 def __init__(self,
3140 unspliced_sam,
3141 seg_maps,
3142 unmapped_segs,
3143 segs):
3144 self.unspliced_sam = unspliced_sam
3145 self.seg_maps = seg_maps
3146 self.unmapped_segs = unmapped_segs
3147 self.segs = segs
3148
3149 # Map2GTF stuff
3150 def m2g_convert_coords(params, sam_header_filename, gtf_fname, reads, out_fname):
    """ Convert read alignments from transcriptome coordinates back to genomic coordinates by calling map2gtf.
3152
3153 Arguments:
3154 - `params`: TopHat parameters
3155 - `gtf_fname`: File name pointing to the annotation.
3156 - `reads`: The reads to convert coords (in Bowtie format).
3157 - `out_fname`: The file name pointing to the output.
3158 """
3159 m2g_cmd = [prog_path("map2gtf")]
3160 m2g_cmd.extend(params.cmd())
3161 m2g_cmd += ["--sam-header", sam_header_filename]
3162 m2g_cmd.append(gtf_fname)
3163 m2g_cmd.append(reads) #could be BAM file
3164 m2g_cmd.append(out_fname)
3165 fbasename = getFileBaseName(reads)
3166 m2g_log = logging_dir + "m2g_" + fbasename + ".out"
3167 m2g_err = logging_dir + "m2g_" + fbasename + ".err"
3168
3169 try:
3170 th_log("Converting " + fbasename + " to genomic coordinates (map2gtf)")
3171 print >> run_log, " ".join(m2g_cmd) + " > " + m2g_log
3172 ret = subprocess.call(m2g_cmd,
3173 stdout=open(m2g_log, "w"),
3174 stderr=open(m2g_err, "w"))
3175 if ret != 0:
3176 die(fail_str + " Error: map2gtf returned an error")
3177 except OSError, o:
3178 err_msg = fail_str + str(o)
3179 die(err_msg + "\n")
3180
3181
3182 def gtf_to_fasta(params, trans_gtf, genome, out_fname):
    """ Build the transcriptome FASTA file from a GTF.

    Arguments:
    - `trans_gtf`: GTF/GFF file describing the transcripts to extract.
    - `genome`: the reference genome FASTA file.
    - `out_fname`: output FASTA file for the transcript sequences.
    """
3191 if resumeStage > currentStage and fileExists(out_fname):
3192 return
3193 g2f_cmd = [prog_path("gtf_to_fasta")]
3194 g2f_cmd.extend(params.cmd())
3195 g2f_cmd.append(trans_gtf)
3196 g2f_cmd.append(genome)
3197 g2f_cmd.append(out_fname)
3198
3199 g2f_log = logging_dir + "g2f.out"
3200 g2f_err = logging_dir + "g2f.err"
3201
3202 try:
3203 print >> run_log, " ".join(g2f_cmd)+" > " + g2f_log
3204 ret = subprocess.call(g2f_cmd,
3205 stdout = open(g2f_log, "w"),
3206 stderr = open(g2f_err, "w"))
3207 if ret != 0:
3208 die(fail_str + " Error: gtf_to_fasta returned an error.")
3209 except OSError, o:
3210 err_msg = fail_str + str(o)
3211 die(err_msg + "\n")
3212
3213 def map2gtf(params, genome_sam_header_filename, ref_fasta, left_reads, right_reads):
3214 """ Main GTF mapping function
3215
3216 Arguments:
3217 - `params`: The TopHat parameters.
3218 - `ref_fasta`: The reference genome.
3219 - `left_reads`: A list of reads.
3220 - `right_reads`: A list of reads (empty if single-end).
3221
3222 """
3223 test_input_file(params.gff_annotation)
3224
3225 # th_log("Reading in GTF file: " + params.gff_annotation)
3226 # transcripts = gtf_to_transcripts(params.gff_annotation)
3227
3228 gtf_name = getFileBaseName(params.gff_annotation)
3229 m2g_bwt_idx = None
3230 t_out_dir = tmp_dir
3231 if params.transcriptome_index and not params.transcriptome_outdir:
3232 m2g_bwt_idx = params.transcriptome_index
3233 th_log("Using pre-built transcriptome index..")
3234 else:
3235 th_log("Creating transcriptome data files..")
3236 if params.transcriptome_outdir:
3237 t_out_dir=params.transcriptome_outdir+"/"
3238 m2g_ref_fasta = t_out_dir + gtf_name + ".fa"
3239 gtf_to_fasta(params, params.gff_annotation, ref_fasta, m2g_ref_fasta)
3240 m2g_bwt_idx = build_idx_from_fa(params.bowtie2, m2g_ref_fasta, t_out_dir, params.read_params.color)
3241
3242 transcriptome_header_filename = get_index_sam_header(params, m2g_bwt_idx)
3243
3244 mapped_gtf_list = []
3245 unmapped_gtf_list = []
3246 # do the initial mapping in GTF coordinates
3247 for reads in [left_reads, right_reads]:
3248 if reads == None or os.path.getsize(reads) < 25 :
3249 continue
3250 fbasename = getFileBaseName(reads)
3251 mapped_gtf_out = tmp_dir + fbasename + ".m2g"
3252 #if use_zpacker:
3253 # mapped_gtf_out+=".z"
3254
3255 unmapped_gtf = tmp_dir + fbasename + ".m2g_um"
3256 #if use_BWT_FIFO:
3257 # unmapped_gtf += ".z"
3258
3259 (mapped_gtf_map, unmapped) = bowtie(params,
3260 m2g_bwt_idx,
3261 [transcriptome_header_filename, genome_sam_header_filename],
3262 [reads],
3263 "fastq",
3264 params.read_mismatches,
3265 params.read_gap_length,
3266 params.read_edit_dist,
3267 params.read_realign_edit_dist,
3268 mapped_gtf_out,
3269 unmapped_gtf,
3270 "", _reads_vs_T)
3271 mapped_gtf_list.append(mapped_gtf_map)
3272 unmapped_gtf_list.append(unmapped)
3273
3274 # bam_gtf_list = mapped_gtf_map[]
3275
3276 # for reads in mapped_gtf_list:
3277 # fbasename = getFileBaseName(reads)
3278 # bam_out_fname = tmp_dir + fbasename + "_converted.bam"
3279 # m2g_convert_coords(params,
3280 # genome_sam_header_filename,
3281 # params.gff_annotation,
3282 # reads,
3283 # bam_out_fname)
3284 #
3285 # bam_gtf_list.append(bam_out_fname)
3286 # if not params.system_params.keep_tmp:
3287 # removeFileWithIndex(reads)
3288
3289 # if len(bam_gtf_list) < 2:
3290 # bam_gtf_list.append(None)
3291 # unmapped_gtf_list.append(None)
3292 if len(mapped_gtf_list) < 2:
3293 mapped_gtf_list.append(None)
3294 if len(unmapped_gtf_list) < 2:
3295 unmapped_gtf_list.append(None)
3296 # return (bam_gtf_list, unmapped_gtf_list)
3297 return (mapped_gtf_list, unmapped_gtf_list)
3298 # end Map2GTF
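
# illustrative return shape (editor's note; names are hypothetical): for
# single-end data map2gtf() pads its results with None, e.g.
#   mapped, unmapped = map2gtf(params, hdr, ref_fa, left_reads, None)
#   # mapped   -> [<left>.m2g map,      None]
#   # unmapped -> [<left>.m2g_um reads, None]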
3299
3300 def get_preflt_data(params, ri, target_reads, out_mappings, out_unmapped):
3301 ## extract mappings and unmapped reads from the prefilter mappings
3302 ## and the prefilter IUM (initially unmapped) reads;
3303 ## this is accomplished by a special prep_reads usage (triggered by --flt-hits)
3304 out_bam=None
3305 #if params.read_params.color:
3306 # out_unmapped += ".fq"
3307 # #if use_zpacker: out_unmapped += ".z"
3308 #else:
3309 out_unmapped += ".bam"
3310 out_bam = out_unmapped
3311 # no colorspace reads
3312 if resumeStage:
3313 return (out_mappings, out_unmapped)
3314 do_use_zpacker = use_zpacker and not out_bam
3315 prep_cmd=prep_reads_cmd(params, params.preflt_data[ri].unmapped_reads, None,
3316 None, None, # right-side mates
3317 out_bam, # stdout file
3318 out_mappings, # aux file (filtered mappings)
3319 None, # no index for out_bam
3320 [target_reads], # prefilter reads
3321 [params.preflt_data[ri].mappings]) # mappings to filter
3322 if not out_bam: um_reads = open(out_unmapped, "wb")
3323 sides=["left","right"]
3324 log_fname=logging_dir + "prep_reads.from_preflt."+sides[ri]+".log"
3325 filter_log = open(log_fname,"w")
3326
3327 shell_cmd = " ".join(prep_cmd)
3328 #add the compression pipe
3329 zip_cmd=[]
3330 if do_use_zpacker:
3331 zip_cmd=[ params.system_params.zipper ]
3332 zip_cmd.extend(params.system_params.zipper_opts)
3333 zip_cmd.extend(['-c','-'])
3334 shell_cmd +=' | '+' '.join(zip_cmd)
3335 if not out_bam:
3336 shell_cmd += ' >' + out_unmapped
3337 retcode=0
3338 try:
3339 print >> run_log, shell_cmd
3340 if do_use_zpacker:
3341 prep_proc = subprocess.Popen(prep_cmd,
3342 stdout=subprocess.PIPE,
3343 stderr=filter_log)
3344 zip_proc = subprocess.Popen(zip_cmd,
3345 preexec_fn=subprocess_setup,
3346 stdin=prep_proc.stdout,
3347 stderr=tophat_log, stdout=um_reads)
3348 prep_proc.stdout.close() #as per http://bugs.python.org/issue7678
3349 zip_proc.communicate()
3350 retcode=prep_proc.poll()
3351 if retcode==0:
3352 retcode=zip_proc.poll()
3353 else:
3354 if out_bam:
3355 retcode = subprocess.call(prep_cmd, stderr=filter_log)
3356 else:
3357 retcode = subprocess.call(prep_cmd, stdout=um_reads,
3358 stderr=filter_log)
3359 if retcode:
3360 die(fail_str+"Error running 'prep_reads'\n"+log_tail(log_fname))
3361
3362 except OSError, o:
3363 errmsg=fail_str+str(o)
3364 die(errmsg+"\n"+log_tail(log_fname))
3365 if not out_bam: um_reads.close()
3366
3367 return (out_mappings, out_unmapped)
3368
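# --- editor's sketch (illustrative only, never called by the pipeline) ---
# get_preflt_data() above chains prep_reads into the external compressor
# shell-style (prep_reads | zipper > file) without spawning a shell; a
# minimal standalone version of the same Popen pattern, using hypothetical
# command names and file names, could look like this:
def _example_piped_compression(out_path):
    import subprocess  # already imported at module level; repeated for self-containment
    # hypothetical producer command writing records to stdout
    producer = subprocess.Popen(["prep_reads", "reads.fq"],
                                stdout=subprocess.PIPE)
    out_f = open(out_path, "wb")
    # the compressor drains the pipe and writes the packed stream to out_f
    consumer = subprocess.Popen(["gzip", "-c", "-"],
                                stdin=producer.stdout,
                                stdout=out_f)
    # close our copy of the pipe so the producer gets SIGPIPE if the
    # compressor dies early (see http://bugs.python.org/issue7678)
    producer.stdout.close()
    consumer.communicate()
    out_f.close()
    # non-zero if either process failed
    return producer.wait() or consumer.poll()
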
3369
3370 # The main alignment routine of TopHat. This function executes most of the
3371 # workflow, producing a set of candidate alignments for each cDNA fragment
3372 # in a pair of SAM alignment files (for paired-end reads).
3373 def spliced_alignment(params,
3374 bwt_idx_prefix,
3375 sam_header_filename,
3376 ref_fasta,
3377 read_len,
3378 segment_len,
3379 prepared_reads,
3380 user_supplied_junctions,
3381 user_supplied_insertions,
3382 user_supplied_deletions):
3383
3384 possible_juncs = []
3385 possible_juncs.extend(user_supplied_junctions)
3386
3387 possible_insertions = []
3388 possible_insertions.extend(user_supplied_insertions)
3389 possible_deletions = []
3390 possible_deletions.extend(user_supplied_deletions)
3391 possible_fusions = []
3392
3393 left_reads, right_reads = prepared_reads
3394
3395 maps = [[], []] # maps[0] = left_reads mapping data, maps[1] = right_reads mapping data
3396 # Before anything, map the reads using Map2GTF (if using annotation)
3397 m2g_maps = [ None, None ] # left, right
3398 initial_reads = [ left_reads, right_reads ]
3399
3400 if params.gff_annotation:
3401 (mapped_gtf_list, unmapped_gtf_list) = \
3402 map2gtf(params, sam_header_filename, ref_fasta, left_reads, right_reads)
3403
3404 m2g_left_maps, m2g_right_maps = mapped_gtf_list
3405 m2g_maps = [m2g_left_maps, m2g_right_maps]
3406 if params.transcriptome_only or not fileExists(unmapped_gtf_list[0]):
3407 # The case where the user doesn't want to map to anything other
3408 # than the transcriptome OR we have no unmapped reads
3409 maps[0] = [m2g_left_maps]
3410 if right_reads:
3411 maps[1] = [m2g_right_maps]
3412
3413 return maps
3414 # Continue the spliced alignment workflow with the reads left unmapped by Map2GTF
3415 initial_reads = unmapped_gtf_list[:]
3416 if currentStage >= resumeStage:
3417 th_log("Resuming TopHat pipeline with unmapped reads")
3418
3419 if not nonzeroFile(initial_reads[0]) and \
3420 (not initial_reads[1] or not nonzeroFile(initial_reads[1])):
3421
3422 if m2g_maps[1]:
3423 return [[m2g_maps[0]], [m2g_maps[1]]]
3424 else:
3425 return [[m2g_maps[0]], []]
3426
3427 max_seg_len = segment_len #this is the ref seq span on either side of the junctions
3428 #to be extracted into segment_juncs.fa
3429
3430 num_segs = int(read_len / segment_len)
3431 if (read_len % segment_len) >= min(segment_len-2, 20):
3432 #remainder is shorter but long enough to become a new segment
3433 num_segs += 1
3434 else:
3435 # the last segment is longer
3436 if num_segs>1: max_seg_len += (read_len % segment_len)
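# worked example (editor's note): read_len=75, segment_len=25 gives
# num_segs=3 (remainder 0); read_len=70 gives remainder 20 >= min(23,20),
# so an extra, shorter segment is added (25+25+20); read_len=60 gives
# remainder 10 < 20, which is absorbed by the last segment instead
# (num_segs=2, max_seg_len=35)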
3437
3438 if num_segs <= 1:
3439 th_logp("Warning: you have only one segment per read.\n\tIf the read length is greater than or equal to 45bp,\n\twe strongly recommend that you decrease --segment-length to about half the read length because TopHat will work better with multiple segments")
3440
3441 # Using the num_segs value returned by check_reads(),
3442 # decide which junction discovery strategy to use
3443 if num_segs < 3:
3444 #if params.butterfly_search != False:
3445 # params.butterfly_search = True
3446 if params.coverage_search != False:
3447 params.coverage_search = True
3448 if num_segs == 1:
3449 segment_len = read_len
3450 else: #num_segs >= 3:
3451 # if we have at least three segments, just use split segment search,
3452 # which is the most sensitive, most specific, fastest, and lightest-weight,
3453 # so unless specifically requested, disable the other junction searches
3454 if params.closure_search != True:
3455 params.closure_search = False
3456 if params.coverage_search != True:
3457 params.coverage_search = False
3458 if params.butterfly_search != True:
3459 params.butterfly_search = False
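# (editor's summary) net effect of the branches above:
#   num_segs < 3  -> coverage search stays enabled unless explicitly
#                    disabled; with a single segment the whole read is
#                    mapped (segment_len = read_len)
#   num_segs >= 3 -> only split-segment search runs, unless closure,
#                    coverage or butterfly search was explicitly requested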
3460
3461 # Perform the first part of the TopHat work flow on the left and right
3462 # reads of paired ends separately - we'll use the pairing information later
3463 have_left_IUM = False
3464 for ri in (0,1):
3465 reads=initial_reads[ri]
3466 if reads == None or not nonzeroFile(reads):
3467 continue
3468
3469 fbasename=getFileBaseName(reads)
3470 unspliced_out = tmp_dir + fbasename + ".mapped"
3471 unspliced_sam = None
3472 unmapped_reads = None
3473 #if use_zpacker: unspliced_out+=".z"
3474 unmapped_unspliced = tmp_dir + fbasename + "_unmapped"
3475 if params.prefilter_multi:
3476 #unmapped_unspliced += ".z"
3477 (unspliced_sam, unmapped_reads) = get_preflt_data(params, ri, reads, unspliced_out, unmapped_unspliced)
3478 else:
3479 # Perform the initial Bowtie mapping of the full length reads
3480 (unspliced_sam, unmapped_reads) = bowtie(params,
3481 bwt_idx_prefix,
3482 sam_header_filename,
3483 [reads],
3484 "fastq",
3485 params.read_mismatches,
3486 params.read_gap_length,
3487 params.read_edit_dist,
3488 params.read_realign_edit_dist,
3489 unspliced_out,
3490 unmapped_unspliced,
3491 "",
3492 _reads_vs_G)
3493
3494 seg_maps = []
3495 unmapped_segs = []
3496 segs = []
3497
3498 have_IUM = nonzeroFile(unmapped_reads)
3499 if ri==0 and have_IUM:
3500 have_left_IUM = True
3501 setRunStage(_stage_map_segments)
3502 if num_segs > 1 and have_IUM:
3503 # split up the IUM reads into segments
3504 # unmapped_reads can be in BAM format
3505 read_segments = split_reads(unmapped_reads,
3506 tmp_dir + fbasename,
3507 False,
3508 params,
3509 segment_len)
3510
3511 #if not params.system_params.keep_tmp:
3512 # removeFileWithIndex(unmapped_reads)
3513
3514 # Map each segment file independently with Bowtie
3515 for i in range(len(read_segments)):
3516 seg = read_segments[i]
3517 fbasename=getFileBaseName(seg)
3518 seg_out = tmp_dir + fbasename
3519 unmapped_seg = tmp_dir + fbasename + "_unmapped"
3520 extra_output = "(%d/%d)" % (i+1, len(read_segments))
3521 (seg_map, unmapped) = bowtie(params,
3522 bwt_idx_prefix,
3523 sam_header_filename,
3524 [seg],
3525 "fastq",
3526 params.segment_mismatches,
3527 params.segment_mismatches,
3528 params.segment_mismatches,
3529 params.segment_mismatches,
3530 seg_out,
3531 unmapped_seg,
3532 extra_output,
3533 _segs_vs_G)
3534 seg_maps.append(seg_map)
3535 unmapped_segs.append(unmapped)
3536 segs.append(seg)
3537
3538 # Collect the segment maps for left and right reads together
3539 maps[ri] = Maps(unspliced_sam, seg_maps, unmapped_segs, segs)
3540 else:
3541 # if there's only one segment, just collect the initial map as the only
3542 # map to be used downstream for coverage-based junction discovery
3543 read_segments = [reads]
3544 maps[ri] = Maps(unspliced_sam, [unspliced_sam], [unmapped_reads], [unmapped_reads])
3545
3546 # XXX: At this point, if using M2G, we have three sets of reads:
3547 # mapped to the transcriptome, mapped to the genome, and unmapped
3548 # (potentially spliced or with poly-A tails) - hp
3549 unmapped_reads = []
3550 if maps[0]:
3551 left_reads_map = maps[0].unspliced_sam
3552 left_seg_maps = maps[0].seg_maps
3553 unmapped_reads = maps[0].unmapped_segs
3554 else:
3555 left_reads_map = None
3556 left_seg_maps = None
3557
3558 if right_reads and maps[1]:
3559 right_reads_map = maps[1].unspliced_sam
3560 right_seg_maps = maps[1].seg_maps
3561 unmapped_reads.extend(maps[1].unmapped_segs)
3562 else:
3563 right_reads_map = None
3564 right_seg_maps = None
3565
3566 if params.find_novel_juncs and have_left_IUM: # or params.find_novel_indels:
3567 # Call segment_juncs to infer a list of possible splice junctions from
3568 # the regions of the genome covered in the initial and segment maps
3569 #if params.find_novel_juncs:
3570 #TODO: in m2g case, we might want to pass the m2g mappings as well,
3571 # or perhaps the GTF file directly
3572 # -> this could improve alternative junction detection?
3573 setRunStage(_stage_find_juncs)
3574 juncs = junctions_from_segments(params,
3575 sam_header_filename,
3576 left_reads,
3577 left_reads_map,
3578 left_seg_maps,
3579 right_reads,
3580 right_reads_map,
3581 right_seg_maps,
3582 unmapped_reads,
3583 "fastq",
3584 ref_fasta)
3585
3586 if not params.system_params.keep_tmp:
3587 for unmapped_seg in unmapped_reads:
3588 removeFileWithIndex(unmapped_seg)
3589
3590 if os.path.getsize(juncs[0]) != 0:
3591 possible_juncs.append(juncs[0])
3592 if params.find_novel_indels:
3593 if os.path.getsize(juncs[1]) != 0:
3594 possible_insertions.append(juncs[1])
3595 if os.path.getsize(juncs[2]) != 0:
3596 possible_deletions.append(juncs[2])
3597 if params.find_novel_fusions:
3598 if os.path.getsize(juncs[3]) != 0:
3599 possible_fusions.append(juncs[3])
3600 # Optionally, and for paired reads only, use a closure search to
3601 # discover additional junctions
3602 if currentStage >= resumeStage and params.closure_search and left_reads and right_reads:
3603 juncs = junctions_from_closures(params,
3604 sam_header_filename,
3605 [maps[0].unspliced_sam, maps[0].seg_maps[-1]],
3606 [maps[1].unspliced_sam, maps[1].seg_maps[-1]],
3607 ref_fasta)
3608 if os.path.getsize(juncs[0]) != 0:
3609 possible_juncs.extend(juncs)
3610
3611 if len(possible_insertions) == 0 and len(possible_deletions) == 0 and len(possible_juncs) == 0 and len(possible_fusions) == 0:
3612 spliced_seg_maps = None
3613 junc_idx_prefix = None
3614 else:
3615 junc_idx_prefix = "segment_juncs"
3616 if len(possible_insertions) == 0:
3617 possible_insertions.append(os.devnull)
3618 # print >> sys.stderr, "Warning: insertions database is empty!"
3619 if len(possible_deletions) == 0:
3620 possible_deletions.append(os.devnull)
3621 # print >> sys.stderr, "Warning: deletions database is empty!"
3622 if len(possible_juncs) == 0:
3623 possible_juncs.append(os.devnull)
3624 th_logp("Warning: junction database is empty!")
3625 if len(possible_fusions) == 0:
3626 possible_fusions.append(os.devnull)
3627
3628 setRunStage(_stage_juncs_db)
3629 juncs_bwt_samheader = None
3630 juncs_bwt_idx = None
3631 if junc_idx_prefix:
3632 jdb_prefix = tmp_dir + junc_idx_prefix
3633 if currentStage<resumeStage and fileExists(jdb_prefix + ".fa"):
3634 juncs_bwt_idx = jdb_prefix
3635 else:
3636 juncs_bwt_idx = build_juncs_index(params.bowtie2,
3637 3,
3638 max_seg_len,
3639 junc_idx_prefix,
3640 possible_juncs,
3641 possible_insertions,
3642 possible_deletions,
3643 possible_fusions,
3644 ref_fasta,
3645 params.read_params.color)
3646 juncs_bwt_samheader = get_index_sam_header(params, juncs_bwt_idx)
3647
3648 # Now map read segments (or whole IUM reads, if num_segs == 1) to the splice
3649 # index with Bowtie
3650 setRunStage(_stage_map2juncs)
3651 for ri in (0,1):
3652 reads = initial_reads[ri]
3653 if not reads:
3654 continue
3655
3656 spliced_seg_maps = []
3657 rfname=getFileBaseName(reads)
3658 rfdir=getFileDir(reads)
3659
3660 m2g_map = m2g_maps[ri]
3661 mapped_reads = rfdir + rfname + ".candidates.bam"
3662 merged_map = rfdir + rfname + ".candidates_and_unspl.bam"
3663
3664 if maps[ri]:
3665 unspl_samfile = maps[ri].unspliced_sam
3666 else:
3667 unspl_samfile = None
3668
3669 have_IUM = True
3670 if reads == None or not nonzeroFile(reads):
3671 have_IUM = False
3672
3673 if have_IUM:
3674 if junc_idx_prefix:
3675 i = 0
3676 for seg in maps[ri].segs:
3677 #search each segment
3678 fsegname = getFileBaseName(seg)
3679 seg_out = tmp_dir + fsegname + ".to_spliced"
3680 #if use_zpacker: seg_out += ".z"
3681 extra_output = "(%d/%d)" % (i+1, len(maps[ri].segs))
3682 (seg_map, unmapped) = bowtie(params,
3683 tmp_dir + junc_idx_prefix,
3684 juncs_bwt_samheader,
3685 [seg],
3686 "fastq",
3687 params.segment_mismatches,
3688 params.segment_mismatches,
3689 params.segment_mismatches,
3690 params.segment_mismatches,
3691 seg_out,
3692 None,
3693 extra_output,
3694 _segs_vs_J)
3695 spliced_seg_maps.append(seg_map)
3696 i += 1
3697
3698 if not params.system_params.keep_tmp:
3699 removeFileWithIndex(seg)
3700
3701
3702 # Join the contiguous and spliced segment hits into full-length
3703 # read alignments
3704 # -- spliced mappings built from all segment mappings vs genome and junc_db
3705 join_mapped_segments(params,
3706 sam_header_filename,
3707 reads,
3708 ref_fasta,
3709 possible_juncs,
3710 possible_insertions,
3711 possible_deletions,
3712 possible_fusions,
3713 maps[ri].seg_maps,
3714 spliced_seg_maps,
3715 mapped_reads)
3716
3717 if not params.system_params.keep_tmp:
3718 for seg_map in maps[ri].seg_maps:
3719 removeFileWithIndex(seg_map)
3720
3721 for spliced_seg_map in spliced_seg_maps:
3722 removeFileWithIndex(spliced_seg_map)
3723
3724 maps[ri] = []
3725 if m2g_map and \
3726 nonzeroFile(m2g_map):
3727 maps[ri].append(m2g_map)
3728
3729 if unspl_samfile and \
3730 nonzeroFile(unspl_samfile):
3731 maps[ri].append(unspl_samfile)
3732
3733 if mapped_reads and nonzeroFile(mapped_reads):
3734 maps[ri].append(mapped_reads)
3735 else:
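# (editor's note) if the join step wrote its output split per worker
# thread (<prefix>0.bam ... <prefix>N-1.bam), register the common
# prefix when any piece is non-empty so the pieces are picked up downstream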
3736 for bam_i in range(0, params.system_params.num_threads):
3737 temp_bam = mapped_reads[:-4] + str(bam_i) + ".bam"
3738 if nonzeroFile(temp_bam):
3739 maps[ri].append(mapped_reads[:-4])
3740 break
3741
3742 return maps
3743
3744 # rough equivalent of the 'which' command to find external programs
3745 # (current script path is tested first, then PATH envvar)
3746 def which(program):
3747 def is_executable(fpath):
3748 return os.path.isfile(fpath) and os.access(fpath, os.X_OK)
3749 fpath, fname = os.path.split(program)
3750 if fpath:
3751 if is_executable(program):
3752 return program
3753 else:
3754 progpath = os.path.join(bin_dir, program)
3755 if is_executable(progpath):
3756 return progpath
3757 for path in os.environ["PATH"].split(os.pathsep):
3758 progpath = os.path.join(path, program)
3759 if is_executable(progpath):
3760 return progpath
3761 return None
3762
3763 def prog_path(program):
3764 progpath=which(program)
3765 if progpath == None:
3766 die("Error locating program: "+program)
3767 return progpath
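
# illustrative usage (editor's note): this is how external helpers are
# located throughout the script, e.g.
#   g2f = prog_path("gtf_to_fasta")  # dies with an error if not found
#   bt2 = which("bowtie2")           # returns None when not found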
3768
3769 # FIXME: this should get set during the make dist autotools phase of the build
3770 def get_version():
3771 return "2.0.5"
3772
3773 def mlog(msg):
3774 print >> sys.stderr, "[DBGLOG]:"+msg
3775
3776 def test_input_file(filename):
3777 try:
3778 open(filename, "r").close()
3779 except IOError, o:
3780 die("Error: cannot open file %s (%s)" % (filename, str(o)))
3781 return
3782
3783 def main(argv=None):
3784 warnings.filterwarnings("ignore", "tmpnam is a potential security risk")
3785
3786 # Initialize default parameter values
3787 params = TopHatParams()
3788 try:
3789 if argv is None:
3790 argv = sys.argv
3791 args = params.parse_options(argv)
3792 if params.resume_dir:
3793 newargv=doResume(params.resume_dir)
3794 #if not args[0]:
3795 args = params.parse_options(newargv)
3796 params.check()
3797
3798 bwt_idx_prefix = args[0]
3799 left_reads_list = args[1]
3800 left_quals_list, right_quals_list = None, None
3801 if (not params.read_params.quals and len(args) > 2) or (params.read_params.quals and len(args) > 3):
3802 if params.read_params.mate_inner_dist == None:
3803 params.read_params.mate_inner_dist = 50
3804 #die("Error: you must set the mean inner distance between mates with -r")
3805
3806 right_reads_list = args[2]
3807 if params.read_params.quals:
3808 left_quals_list = args[3]
3809 right_quals_list = args[4]
3810 else:
3811 right_reads_list = None
3812 if params.read_params.quals:
3813 left_quals_list = args[2]
3814
3815 start_time = datetime.now()
3816 prepare_output_dir()
3817 init_logger(logging_dir + "tophat.log")
3818
3819 th_logp()
3820 if resumeStage>0:
3821 th_log("Resuming TopHat run in directory '"+output_dir+"', from stage: '"+stageNames[resumeStage]+"'")
3822 else:
3823 th_log("Beginning TopHat run (v"+get_version()+")")
3824 th_logp("-----------------------------------------------")
3825
3826 global run_log
3827 run_log = open(logging_dir + "run.log", "w", 0)
3828 global run_cmd
3829 run_cmd = " ".join(argv)
3830 print >> run_log, run_cmd
3831
3832 check_bowtie(params)
3833 check_samtools()
3834
3835 # Validate all the input files, check all prereqs before committing
3836 # to the run
3837 if params.gff_annotation:
3838 if not os.path.exists(params.gff_annotation):
3839 die("Error: cannot find transcript file %s" % params.gff_annotation)
3840 if os.path.getsize(params.gff_annotation)<10:
3841 die("Error: invalid transcript file %s" % params.gff_annotation)
3842
3843 if params.transcriptome_index:
3844 if params.gff_annotation:
3845 #gff file given, so transcriptome data will be written there
3846 gff_basename = getFileBaseName(params.gff_annotation)
3847 #just in case, check if it's not already there (-G/--GTF given again by mistake)
3848 tpath, tname = os.path.split(params.transcriptome_index)
3849 new_subdir=False
3850 if tpath in (".", "./") or not tpath :
3851 if not os.path.exists(params.transcriptome_index):
3852 os.makedirs(params.transcriptome_index)
3853 new_subdir=True
3854 if new_subdir or (os.path.exists(params.transcriptome_index) and os.path.isdir(params.transcriptome_index)):
3855 params.transcriptome_index = os.path.join(params.transcriptome_index, gff_basename)
3856 gff_out=params.transcriptome_index+".gff"
3857 if not (os.path.exists(gff_out) and os.path.getsize(gff_out)==os.path.getsize(params.gff_annotation)):
3858 #generate the transcriptome data files
3859 tpath, tname = os.path.split(params.transcriptome_index)
3860 params.transcriptome_outdir=tpath
3861 t_gff=params.transcriptome_index+".gff"
3862 if params.transcriptome_outdir:
3863 #will create the transcriptome data files
3864 if not os.path.exists(params.transcriptome_outdir):
3865 os.makedirs(params.transcriptome_outdir)
3866 copy(params.gff_annotation, t_gff)
3867 else:
3868 #try to use existing transcriptome data files
3869
3870 if not (os.path.exists(t_gff) and os.path.getsize(t_gff)>10):
3871 die("Error: GFF transcripts file not found or invalid (%s)" % t_gff)
3872 check_bowtie_index(params.transcriptome_index, params.bowtie2)
3873 params.gff_annotation=t_gff
3874 #end @ transcriptome_index given
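# worked example (editor's note; paths are hypothetical): given
#   -G genes.gtf --transcriptome-index ./tidx
# the index prefix expands to ./tidx/genes, the annotation is copied to
# ./tidx/genes.gff, and the transcriptome FASTA plus bowtie index files
# are (re)built under ./tidx/ unless a matching set is already present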
3875
3876 (ref_fasta, ref_seq_dict) = check_index(bwt_idx_prefix, params.bowtie2)
3877
3878 if currentStage >= resumeStage:
3879 th_log("Generating SAM header for "+bwt_idx_prefix)
3880 # we need to provide another name for this sam header as genome and transcriptome may have the same prefix.
3881 sam_header_filename = get_index_sam_header(params, bwt_idx_prefix, "genome")
3882 params.sam_header = sam_header_filename
3883 #if not params.skip_check_reads:
3884 reads_list = left_reads_list
3885 if right_reads_list:
3886 reads_list = reads_list + "," + right_reads_list
3887 params.read_params = check_reads_format(params, reads_list)
3888
3889 user_supplied_juncs = []
3890 user_supplied_insertions = []
3891 user_supplied_deletions = []
3892 user_supplied_fusions = []
3893 global gtf_juncs
3894 if params.gff_annotation and params.find_GFF_juncs:
3895 test_input_file(params.gff_annotation)
3896 (found_juncs, gtf_juncs) = get_gtf_juncs(params.gff_annotation)
3897 ##-- we shouldn't need these junctions in user_supplied_juncs anymore because now map2gtf does a much better job
3898 ## but we still need them loaded in gtf_juncs for later splice verification
3899 if found_juncs:
3900 ## and not params.gff_annotation:
3901 user_supplied_juncs.append(gtf_juncs)
3902 #else:
3903 # gtf_juncs = None
3904 if params.raw_junctions:
3905 test_input_file(params.raw_junctions)
3906 user_supplied_juncs.append(params.raw_junctions)
3907
3908 if params.raw_insertions:
3909 test_input_file(params.raw_insertions)
3910 user_supplied_insertions.append(params.raw_insertions)
3911
3912 if params.raw_deletions:
3913 test_input_file(params.raw_deletions)
3914 user_supplied_deletions.append(params.raw_deletions)
3915
3916 global unmapped_reads_fifo
3917 unmapped_reads_fifo = tmp_dir + str(os.getpid())+".bwt_unmapped.z.fifo"
3918
3919 # Now start the time-consuming stuff
3920 if params.prefilter_multi:
3921 sides=("left","right")
3922 read_lists=(left_reads_list, right_reads_list)
3923 qual_lists=(left_quals_list, right_quals_list)
3924 for ri in (0,1):
3925 reads_list=read_lists[ri]
3926 if not reads_list:
3927 continue
3928 fmulti_ext="bam"
3929 if not params.bowtie2:
3930 fmulti_ext="fq"
3931 params.preflt_data[ri].seqfiles = reads_list
3932 params.preflt_data[ri].qualfiles = qual_lists[ri]
3933 params.preflt_data[ri].multihit_reads = tmp_dir + sides[ri]+"_multimapped."+fmulti_ext
3934 side_imap = tmp_dir + sides[ri]+"_im"
3935 #if use_zpacker: side_imap+=".z"
3936 side_ium = tmp_dir + sides[ri]+"_ium"
3937 #if use_BWT_FIFO and not params.bowtie2:
3938 # side_ium += ".z"
3939 th_log("Pre-filtering multi-mapped "+sides[ri]+" reads")
3940 rdlist=reads_list.split(',')
3941 bwt=bowtie(params, bwt_idx_prefix, sam_header_filename, rdlist,
3942 params.read_params.reads_format,
3943 params.read_mismatches,
3944 params.read_gap_length,
3945 params.read_edit_dist,
3946 params.read_realign_edit_dist,
3947 side_imap, side_ium,
3948 "", _reads_vs_G, ri ) # multi-mapped reads will be in params.preflt_data[ri].multihit_reads
3949 params.preflt_data[ri].mappings = bwt[0] # initial mappings
3950 params.preflt_data[ri].unmapped_reads = bwt[1] # IUM reads
3951 if currentStage >= resumeStage:
3952 th_log("Preparing reads")
3953 else:
3954 th_log("Prepared reads:")
3955 prep_info=None
3956
3957 multihit_reads = []
3958 if params.preflt_data[0].multihit_reads:
3959 multihit_reads += [params.preflt_data[0].multihit_reads]
3960 if params.preflt_data[1].multihit_reads:
3961 multihit_reads += [params.preflt_data[1].multihit_reads]
3962 prep_info= prep_reads(params,
3963 left_reads_list, left_quals_list,
3964 right_reads_list, right_quals_list,
3965 multihit_reads)
3966
3967 min_read_len = prep_info.min_len[0]
3968 if prep_info.min_len[1] > 0 and min_read_len > prep_info.min_len[1]:
3969 min_read_len = prep_info.min_len[1]
3970 if min_read_len < 20:
3971 th_logp("Warning: short reads (<20bp) will make TopHat quite slow and take a large amount of memory because they are likely to map to too many places")
3972
3973 max_read_len=max(prep_info.max_len[0], prep_info.max_len[1])
3974
3975 seed_len=params.read_params.seed_length
3976 if seed_len: #if read len was explicitly given
3977 seed_len = max(seed_len, min_read_len)
3978 #can't be smaller than minimum length observed
3979 else:
3980 seed_len = max_read_len
3981 params.read_params.seed_length=seed_len
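# worked example (editor's note): reads of 50-75bp with no explicit seed
# length give seed_len = 75 (the longest read); an explicitly supplied
# seed length of 30 would be raised to 50, since the seed may not be
# shorter than the shortest observed read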
3982 # turn off integer-quals
3983 if params.read_params.integer_quals:
3984 params.read_params.integer_quals = False
3985
3986 input_reads = prep_info.kept_reads[:]
3987 mappings = spliced_alignment(params,
3988 bwt_idx_prefix,
3989 sam_header_filename,
3990 ref_fasta,
3991 params.read_params.seed_length,
3992 params.segment_length,
3993 input_reads,
3994 user_supplied_juncs,
3995 user_supplied_insertions,
3996 user_supplied_deletions)
3997 setRunStage(_stage_tophat_reports)
3998 compile_reports(params,
3999 sam_header_filename,
4000 ref_fasta,
4001 mappings,
4002 input_reads,
4003 params.gff_annotation)
4004
4005 if not params.system_params.keep_tmp:
4006 try:
4007 for m in mappings[0]:
4008 os.remove(m)
4009 for m in input_reads:
4010 if m and os.path.exists(m): os.remove(m)
4011 for m in mappings[1]:
4012 os.remove(m)
4013 tmp_files = os.listdir(tmp_dir)
4014 for t in tmp_files:
4015 os.remove(tmp_dir+t)
4016 os.rmdir(tmp_dir)
4017 except OSError, o:
4018 th_logp("Warning: couldn't remove all temporary files in "+tmp_dir)
4019
4020 finish_time = datetime.now()
4021 duration = finish_time - start_time
4022 th_logp("-----------------------------------------------")
4023 th_log("Run complete: %s elapsed" % formatTD(duration))
4024
4025 except Usage, err:
4026 th_logp(sys.argv[0].split("/")[-1] + ": " + str(err.msg))
4027 th_logp(" for detailed help see http://tophat.cbcb.umd.edu/manual.html")
4028 return 2
4029
4030
4031 if __name__ == "__main__":
4032 sys.exit(main())

Properties

Name Value
svn:executable *