#!/usr/bin/env python

# encoding: utf-8
"""
tophat.py

Created by Cole Trapnell on 2008-12-25.
Copyright (c) 2008 Cole Trapnell. All rights reserved.
Updated and maintained by Daehwan Kim and Geo Pertea since Jul 2010.
"""
import sys
try:
    import psyco
    psyco.full()
except ImportError:
    pass

import getopt
import subprocess
import errno
import os
import warnings
import re
import glob
import signal
from datetime import datetime, date, time
from shutil import copy
import logging

use_message = '''
TopHat maps short sequences from spliced transcripts to whole genomes.

Usage:
    tophat [options] <bowtie_index> <reads1[,reads2,...]> [reads1[,reads2,...]] \\
                                    [quals1,[quals2,...]] [quals1[,quals2,...]]

Options:
    -v/--version
    -o/--output-dir                <string>     [ default: ./tophat_out ]
    --bowtie1                                   [ default: bowtie2      ]
    -N/--read-mismatches           <int>        [ default: 2            ]
    --read-gap-length              <int>        [ default: 2            ]
    --read-edit-dist               <int>        [ default: 2            ]
    --read-realign-edit-dist       <int>        [ default: 2            ]
    -a/--min-anchor                <int>        [ default: 8            ]
    -m/--splice-mismatches         <0-2>        [ default: 0            ]
    -i/--min-intron-length         <int>        [ default: 50           ]
    -I/--max-intron-length         <int>        [ default: 500000       ]
    -g/--max-multihits             <int>        [ default: 20           ]
    --suppress-hits
    -x/--transcriptome-max-hits    <int>        [ default: 60           ]
    -M/--prefilter-multihits                    ( for -G/--GTF option, enable
                                                  an initial bowtie search
                                                  against the genome )
    --max-insertion-length         <int>        [ default: 3            ]
    --max-deletion-length          <int>        [ default: 3            ]
    --solexa-quals
    --solexa1.3-quals                           (same as phred64-quals)
    --phred64-quals                             (same as solexa1.3-quals)
    -Q/--quals
    --integer-quals
    -C/--color                                  (SOLiD - color space)
    --color-out
    --library-type                 <string>     (fr-unstranded, fr-firststrand,
                                                 fr-secondstrand)
    -p/--num-threads               <int>        [ default: 1            ]
    -R/--resume                    <out_dir>    ( try to resume execution )
    -G/--GTF                       <filename>   (GTF/GFF with known transcripts)
    --transcriptome-index          <bwtidx>     (transcriptome bowtie index)
    -T/--transcriptome-only                     (map only to the transcriptome)
    -j/--raw-juncs                 <filename>
    --insertions                   <filename>
    --deletions                    <filename>
    -r/--mate-inner-dist           <int>
    --mate-std-dev                 <int>        [ default: 20           ]
    --no-novel-juncs
    --no-novel-indels
    --no-gtf-juncs
    --no-coverage-search
    --coverage-search
    --microexon-search
    --keep-tmp
    --tmp-dir                      <dirname>    [ default: <output_dir>/tmp ]
    -z/--zpacker                   <program>    [ default: gzip         ]
    -X/--unmapped-fifo                          [use mkfifo to compress more temporary
                                                 files for color space reads]

Advanced Options:
    --report-secondary-alignments
    --no-discordant
    --no-mixed

    --segment-mismatches           <int>        [ default: 2            ]
    --segment-length               <int>        [ default: 25           ]

    --bowtie-n                                  [ default: bowtie -v    ]
    --min-coverage-intron          <int>        [ default: 50           ]
    --max-coverage-intron          <int>        [ default: 20000        ]
    --min-segment-intron           <int>        [ default: 50           ]
    --max-segment-intron           <int>        [ default: 500000       ]
    --no-sort-bam                               (Output BAM is not coordinate-sorted)
    --no-convert-bam                            (Do not output bam format.
                                                 Output is <output_dir>/accepted_hits.sam)
    --keep-fasta-order
    --allow-partial-mapping

Bowtie2 related options:
  Preset options in --end-to-end mode (local alignment is not used in TopHat2)
    --b2-very-fast
    --b2-fast
    --b2-sensitive
    --b2-very-sensitive

  Alignment options
    --b2-N                         <int>        [ default: 0            ]
    --b2-L                         <int>        [ default: 20           ]
    --b2-i                         <func>       [ default: S,1,1.25     ]
    --b2-n-ceil                    <func>       [ default: L,0,0.15     ]
    --b2-gbar                      <int>        [ default: 4            ]

  Scoring options
    --b2-mp                        <int>,<int>  [ default: 6,2          ]
    --b2-np                        <int>        [ default: 1            ]
    --b2-rdg                       <int>,<int>  [ default: 5,3          ]
    --b2-rfg                       <int>,<int>  [ default: 5,3          ]
    --b2-score-min                 <func>       [ default: L,-0.6,-0.6  ]

  Effort options
    --b2-D                         <int>        [ default: 15           ]
    --b2-R                         <int>        [ default: 2            ]

Fusion related options:
    --fusion-search
    --fusion-anchor-length         <int>        [ default: 20           ]
    --fusion-min-dist              <int>        [ default: 10000000     ]
    --fusion-read-mismatches       <int>        [ default: 2            ]
    --fusion-multireads            <int>        [ default: 2            ]
    --fusion-multipairs            <int>        [ default: 2            ]
    --fusion-ignore-chromosomes    <list>       [ e.g. <chrM,chrX>      ]

    --fusion-do-not-resolve-conflicts           [ this is for test purposes ]

SAM Header Options (for embedding sequencing run metadata in output):
    --rg-id                        <string>     (read group ID)
    --rg-sample                    <string>     (sample ID)
    --rg-library                   <string>     (library ID)
    --rg-description               <string>     (descriptive string, no tabs allowed)
    --rg-platform-unit             <string>     (e.g. Illumina lane ID)
    --rg-center                    <string>     (sequencing center name)
    --rg-date                      <string>     (ISO 8601 date of the sequencing run)
    --rg-platform                  <string>     (Sequencing platform descriptor)
'''
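
# Example invocation (illustrative only; the index prefix and read file names
# below are hypothetical placeholders):
#   tophat -p 8 -G genes.gtf -o tophat_out genome_bt2_index reads_1.fq reads_2.fq
# maps the paired-end reads in reads_1.fq/reads_2.fq against the Bowtie2 index
# "genome_bt2_index" with 8 threads, guided by the known transcripts in genes.gtf.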

# Deprecated:
#     --min-closure-exon           <int>        [ default: 100          ]
#     --min-closure-intron         <int>        [ default: 50           ]
#     --max-closure-intron         <int>        [ default: 5000         ]
#     --no-closure-search
#     --closure-search
#     --butterfly-search
#     --no-butterfly-search
#     -F/--min-isoform-fraction    <float>      [ default: 0.15         ]

class Usage(Exception):
    def __init__(self, msg):
        self.msg = msg

output_dir = "./tophat_out/"
logging_dir = output_dir + "logs/"
run_log = None
tophat_log = None    # main log file handle
tophat_logger = None # main logging object
run_cmd = None
tmp_dir = output_dir + "tmp/"
bin_dir = sys.path[0] + "/"
use_zpacker = False # this is set by the -z/--zpacker option (-z0 leaves it False)

use_BAM_Unmapped = False # automatically set to True for non-SOLiD reads, handles unmapped reads in BAM format

use_BWT_FIFO = False # can only be set to True if use_zpacker is True and only with -C/--color;
                     # enabled by the -X/--unmapped-fifo option (unless -z0)
unmapped_reads_fifo = None # if use_BWT_FIFO is True, this tricks bowtie into writing the
                           # unmapped reads into a compressed file

samtools_path = None
bowtie_path = None
fail_str = "\t[FAILED]\n"
gtf_juncs = None # file name with junctions extracted from the given GFF file

# mapping types:
_reads_vs_G, _reads_vs_T, _segs_vs_G, _segs_vs_J = range(1, 5)

# execution resuming stages (for now, execution can be resumed only for stages
# after the pre-filter and transcriptome searches):
_stage_map_segments, _stage_find_juncs, _stage_juncs_db, _stage_map2juncs, _stage_tophat_reports = range(1, 6)
stageNames = ["start", "map_segments", "find_juncs", "juncs_db", "map2juncs", "tophat_reports"]
#                0          1              2             3           4             5
runStages = dict([(stageNames[st], st) for st in range(0, 6)])
currentStage = 0
resumeStage = 0

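# For reference: runStages maps each stage name to its ordinal, i.e.
# {"start": 0, "map_segments": 1, "find_juncs": 2, "juncs_db": 3,
#  "map2juncs": 4, "tophat_reports": 5}; doResume() below looks up resume
# tags read from run.log in this dict.
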
def doResume(odir):
    # must return the original list of arguments
    rlog = odir + "/logs/run.log"
    oldargv = None
    try:
        flog = open(rlog)
        # the first line must be the actual tophat command used
        thcmd = None
        try:
            thcmd = next(flog)
        except StopIteration as e:
            die("Error: cannot resume, run.log is empty.")
        oldargv = thcmd.split()
        mlog("Tophat command to resume: " + thcmd)
        resume_tag = None
        for line in flog:
            # scan for the last resume code, if any
            r = re.match(r"^#>(\w+):$", line)
            if r:
                resume_tag = r.group(1)
                mlog("resume_tag found: " + resume_tag)
        global resumeStage
        if resume_tag:
            if resume_tag in runStages:
                resumeStage = runStages[resume_tag]
                th_log("Resuming TopHat run from stage: " + resume_tag)
            else:
                die("Error: unrecognized run stage '" + resume_tag + "'")
        else:
            die("Error: resuming requested but no valid stage found in run.log")
        flog.close()
    except IOError as e:
        die("Error: cannot resume, failed to open " + rlog)
    return oldargv

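# Stage markers in run.log: setRunStage() below appends a line of the form
#   #>stage_name:
# (e.g. "#>juncs_db:") as each stage is entered, and doResume() scans for
# the last such marker to pick the restart point.
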
def setRunStage(stnum):
    global currentStage
    print >> run_log, "#>" + stageNames[stnum] + ":"
    currentStage = stnum

def init_logger(log_fname):
    global tophat_logger
    tophat_logger = logging.getLogger('project')
    formatter = logging.Formatter('%(asctime)s %(message)s', '[%Y-%m-%d %H:%M:%S]')
    # level = logging.__dict__.get(options.loglevel.upper(), logging.DEBUG)
    tophat_logger.setLevel(logging.DEBUG)

    hstream = logging.StreamHandler(sys.stderr)
    hstream.setFormatter(formatter)
    tophat_logger.addHandler(hstream)
    #
    # Output logging information to file
    if os.path.isfile(log_fname):
        os.remove(log_fname)
    global tophat_log
    logfh = logging.FileHandler(log_fname)
    logfh.setFormatter(formatter)
    tophat_logger.addHandler(logfh)
    tophat_log = logfh.stream

# TopHatParams captures all of the runtime parameters used by TopHat, and many
# of these are passed as command line options to executables run by the pipeline

# This class and its nested classes also do option parsing through parse_options()
# and option validation via the member function check()

class BowtieFltFiles:
    def __init__(self,
                 seqfiles=None, qualfiles=None,
                 mappings=None,
                 unmapped_reads=None,
                 multihit_reads=None):
        self.seqfiles = seqfiles
        self.qualfiles = qualfiles
        self.mappings = mappings
        self.unmapped_reads = unmapped_reads
        self.multihit_reads = multihit_reads

class TopHatParams:

    # SpliceConstraints is a group of runtime parameters that specify what
    # constraints to put on junctions discovered by the program. These constraints
    # are used to filter out spurious/false positive junctions.

    class SpliceConstraints:
        def __init__(self,
                     min_anchor_length,
                     min_intron_length,
                     max_intron_length,
                     splice_mismatches,
                     min_isoform_fraction):
            self.min_anchor_length = min_anchor_length
            self.min_intron_length = min_intron_length
            self.max_intron_length = max_intron_length
            self.splice_mismatches = splice_mismatches
            self.min_isoform_fraction = min_isoform_fraction

        def parse_options(self, opts):
            for option, value in opts:
                if option in ("-m", "--splice-mismatches"):
                    self.splice_mismatches = int(value)
                elif option in ("-a", "--min-anchor"):
                    self.min_anchor_length = int(value)
                elif option in ("-F", "--min-isoform-fraction"):
                    self.min_isoform_fraction = float(value)
                elif option in ("-i", "--min-intron-length"):
                    self.min_intron_length = int(value)
                elif option in ("-I", "--max-intron-length"):
                    self.max_intron_length = int(value)

        def check(self):
            if self.splice_mismatches not in [0, 1, 2]:
                die("Error: arg to --splice-mismatches must be 0, 1, or 2")
            if self.min_anchor_length < 4:
                die("Error: arg to --min-anchor must be at least 4")
            if self.min_isoform_fraction < 0.0 or self.min_isoform_fraction > 1.0:
                die("Error: arg to --min-isoform-fraction must be between 0.0 and 1.0")
            if self.min_intron_length <= 0:
                die("Error: arg to --min-intron-length must be greater than 0")
            if self.max_intron_length <= 0:
                die("Error: arg to --max-intron-length must be greater than 0")

    # SystemParams is a group of runtime parameters that determine how to handle
    # temporary files produced during a run and how many threads to use for threaded
    # stages of the pipeline (e.g. Bowtie)

    class SystemParams:
        def __init__(self,
                     num_threads,
                     keep_tmp):
            self.num_threads = num_threads
            self.keep_tmp = keep_tmp
            self.zipper = "gzip"
            self.zipper_opts = []

        def parse_options(self, opts):
            global use_zpacker
            global use_BWT_FIFO
            for option, value in opts:
                if option in ("-p", "--num-threads"):
                    self.num_threads = int(value)
                elif option == "--keep-tmp":
                    self.keep_tmp = True
                elif option in ("-z", "--zpacker"):
                    if value.lower() in ["-", " ", ".", "0", "none", "f", "false", "no"]:
                        value = ""
                    self.zipper = value
                    #if not self.zipper:
                    #    self.zipper = 'gzip'
                elif option in ("-X", "--unmapped-fifo"):
                    use_BWT_FIFO = True
            if self.zipper:
                use_zpacker = True
                if self.num_threads > 1 and not self.zipper_opts:
                    if self.zipper.endswith('pbzip2') or self.zipper.endswith('pigz'):
                        self.zipper_opts.append('-p' + str(self.num_threads))
            else:
                use_zpacker = False
                if use_BWT_FIFO: use_BWT_FIFO = False

        def cmd(self):
            cmdline = []
            if self.zipper:
                cmdline.extend(['-z', self.zipper])
            if self.num_threads > 1:
                cmdline.extend(['-p' + str(self.num_threads)])
            return cmdline

        def check(self):
            if self.num_threads < 1:
                die("Error: arg to --num-threads must be greater than 0")
            if self.zipper:
                xzip = which(self.zipper)
                if not xzip:
                    die("Error: cannot find compression program " + self.zipper)

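    # For example (illustrative): running with "-z pigz -p 4" selects pigz as
    # the zipper and, because pbzip2/pigz are multithreaded compressors, adds
    # "-p4" to zipper_opts so compression uses the same thread count as TopHat.
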
    # ReadParams is a group of runtime parameters that specify various properties
    # of the user's reads (e.g. which quality scale they are on, how long the
    # fragments are, etc).
    class ReadParams:
        def __init__(self,
                     solexa_quals,
                     phred64_quals,
                     quals,
                     integer_quals,
                     color,
                     library_type,
                     seed_length,
                     reads_format,
                     mate_inner_dist,
                     mate_inner_dist_std_dev,
                     read_group_id,
                     sample_id,
                     library_id,
                     description,
                     seq_platform_unit,
                     seq_center,
                     seq_run_date,
                     seq_platform):
            self.solexa_quals = solexa_quals
            self.phred64_quals = phred64_quals
            self.quals = quals
            self.integer_quals = integer_quals
            self.color = color
            self.library_type = library_type
            self.seed_length = seed_length
            self.reads_format = reads_format
            self.mate_inner_dist = mate_inner_dist
            self.mate_inner_dist_std_dev = mate_inner_dist_std_dev
            self.read_group_id = read_group_id
            self.sample_id = sample_id
            self.library_id = library_id
            self.description = description
            self.seq_platform_unit = seq_platform_unit
            self.seq_center = seq_center
            self.seq_run_date = seq_run_date
            self.seq_platform = seq_platform

        def parse_options(self, opts):
            for option, value in opts:
                if option == "--solexa-quals":
                    self.solexa_quals = True
                elif option in ("--solexa1.3-quals", "--phred64-quals"):
                    self.phred64_quals = True
                elif option in ("-Q", "--quals"):
                    self.quals = True
                elif option == "--integer-quals":
                    self.integer_quals = True
                elif option in ("-C", "--color"):
                    self.color = True
                elif option == "--library-type":
                    self.library_type = value
                elif option in ("-s", "--seed-length"):
                    self.seed_length = int(value)
                elif option in ("-r", "--mate-inner-dist"):
                    self.mate_inner_dist = int(value)
                elif option == "--mate-std-dev":
                    self.mate_inner_dist_std_dev = int(value)
                elif option == "--rg-id":
                    self.read_group_id = value
                elif option == "--rg-sample":
                    self.sample_id = value
                elif option == "--rg-library":
                    self.library_id = value
                elif option == "--rg-description":
                    self.description = value
                elif option == "--rg-platform-unit":
                    self.seq_platform_unit = value
                elif option == "--rg-center":
                    self.seq_center = value
                elif option == "--rg-date":
                    self.seq_run_date = value
                elif option == "--rg-platform":
                    self.seq_platform = value

        def check(self):
            if self.seed_length and self.seed_length < 20:
                die("Error: arg to --seed-length must be at least 20")

            if self.mate_inner_dist_std_dev != None and self.mate_inner_dist_std_dev < 0:
                die("Error: arg to --mate-std-dev must be at least 0")
            if (not self.read_group_id and self.sample_id) or (self.read_group_id and not self.sample_id):
                die("Error: --rg-id and --rg-sample must be specified or omitted together")

    # SearchParams is a group of runtime parameters that specify how TopHat will
    # search for splice junctions

    class SearchParams:
        def __init__(self,
                     min_closure_exon,
                     min_closure_intron,
                     max_closure_intron,
                     min_coverage_intron,
                     max_coverage_intron,
                     min_segment_intron,
                     max_segment_intron):

            self.min_closure_exon_length = min_closure_exon
            self.min_closure_intron_length = min_closure_intron
            self.max_closure_intron_length = max_closure_intron
            self.min_coverage_intron_length = min_coverage_intron
            self.max_coverage_intron_length = max_coverage_intron
            self.min_segment_intron_length = min_segment_intron
            self.max_segment_intron_length = max_segment_intron

        def parse_options(self, opts):
            for option, value in opts:
                if option == "--min-closure-exon":
                    self.min_closure_exon_length = int(value)
                if option == "--min-closure-intron":
                    self.min_closure_intron_length = int(value)
                if option == "--max-closure-intron":
                    self.max_closure_intron_length = int(value)
                if option == "--min-coverage-intron":
                    self.min_coverage_intron_length = int(value)
                if option == "--max-coverage-intron":
                    self.max_coverage_intron_length = int(value)
                if option == "--min-segment-intron":
                    self.min_segment_intron_length = int(value)
                if option == "--max-segment-intron":
                    self.max_segment_intron_length = int(value)

        def check(self):
            # each intron/exon length bound only needs to be non-negative
            if self.min_closure_exon_length < 0:
                die("Error: arg to --min-closure-exon must be at least 0")
            if self.min_closure_intron_length < 0:
                die("Error: arg to --min-closure-intron must be at least 0")
            if self.max_closure_intron_length < 0:
                die("Error: arg to --max-closure-intron must be at least 0")
            if self.min_coverage_intron_length < 0:
                die("Error: arg to --min-coverage-intron must be at least 0")
            if self.max_coverage_intron_length < 0:
                die("Error: arg to --max-coverage-intron must be at least 0")
            if self.min_segment_intron_length < 0:
                die("Error: arg to --min-segment-intron must be at least 0")
            if self.max_segment_intron_length < 0:
                die("Error: arg to --max-segment-intron must be at least 0")

    class ReportParams:
        def __init__(self):
            self.sort_bam = True
            self.convert_bam = True

        def parse_options(self, opts):
            for option, value in opts:
                if option == "--no-sort-bam":
                    self.sort_bam = False
                if option == "--no-convert-bam":
                    self.convert_bam = False

    class Bowtie2Params:
        def __init__(self):
            self.very_fast = False
            self.fast = False
            self.sensitive = False
            self.very_sensitive = False

            self.N = 0
            self.L = 20
            self.i = "S,1,1.25"
            self.n_ceil = "L,0,0.15"
            self.gbar = 4

            self.mp = "6,2"
            self.np = 1
            self.rdg = "5,3"
            self.rfg = "5,3"
            # self.score_min = "L,-0.6,-0.6"
            self.score_min = None

            self.D = 15
            self.R = 2

        def parse_options(self, opts):
            for option, value in opts:
                if option == "--b2-very-fast":
                    self.very_fast = True
                if option == "--b2-fast":
                    self.fast = True
                if option == "--b2-sensitive":
                    self.sensitive = True
                if option == "--b2-very-sensitive":
                    self.very_sensitive = True

                if option == "--b2-N":
                    self.N = int(value)
                if option == "--b2-L":
                    self.L = int(value)
                if option == "--b2-i":
                    self.i = value
                if option == "--b2-n-ceil":
                    self.n_ceil = value
                if option == "--b2-gbar":
                    self.gbar = int(value)

                if option == "--b2-mp":
                    self.mp = value
                if option == "--b2-np":
                    self.np = int(value)
                if option == "--b2-rdg":
                    self.rdg = value
                if option == "--b2-rfg":
                    self.rfg = value
                if option == "--b2-score-min":
                    self.score_min = value

                if option == "--b2-D":
                    self.D = int(value)
                if option == "--b2-R":
                    self.R = int(value)

        def check(self):
            more_than_once = False
            if self.very_fast:
                if self.fast or self.sensitive or self.very_sensitive:
                    more_than_once = True
            else:
                if self.fast:
                    if self.sensitive or self.very_sensitive:
                        more_than_once = True
                else:
                    if self.sensitive and self.very_sensitive:
                        more_than_once = True

            if more_than_once:
                die("Error: use only one of --b2-very-fast, --b2-fast, --b2-sensitive, --b2-very-sensitive")

            if self.N not in [0, 1]:
                die("Error: arg to --b2-N must be either 0 or 1")

            function_re = r'^[CLSG],-?\d+(\.\d+)?,-?\d+(\.\d+)?$'
            function_match = re.search(function_re, self.i)

            if not function_match:
                die("Error: arg to --b2-i must be <func> (e.g. --b2-i S,1,1.25)")

            function_match = re.search(function_re, self.n_ceil)
            if not function_match:
                die("Error: arg to --b2-n-ceil must be <func> (e.g. --b2-n-ceil L,0,0.15)")

            if self.score_min:
                function_match = re.search(function_re, self.score_min)
                if not function_match:
                    die("Error: arg to --b2-score-min must be <func> (e.g. --b2-score-min L,-0.6,-0.6)")

            pair_re = r'^\d+,\d+$'
            pair_match = re.search(pair_re, self.mp)
            if not pair_match:
                die("Error: arg to --b2-mp must be <int>,<int> (e.g. --b2-mp 6,2)")

            pair_match = re.search(pair_re, self.rdg)
            if not pair_match:
                die("Error: arg to --b2-rdg must be <int>,<int> (e.g. --b2-rdg 5,3)")

            pair_match = re.search(pair_re, self.rfg)
            if not pair_match:
                die("Error: arg to --b2-rfg must be <int>,<int> (e.g. --b2-rfg 5,3)")

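    # A note on the <func> arguments validated above: this is Bowtie2's
    # documented function notation (summarized here for convenience, not
    # TopHat code). "F,B,A" denotes f(x) = B + A * g(x) over read length x,
    # where F chooses g: C (constant), L (linear), S (square root),
    # G (natural log); e.g. "L,-0.6,-0.6" means f(x) = -0.6 + -0.6 * x.
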
    def __init__(self):
        self.splice_constraints = self.SpliceConstraints(8,      # min_anchor
                                                         50,     # min_intron
                                                         500000, # max_intron
                                                         0,      # splice_mismatches
                                                         0.15)   # min_isoform_frac

        self.preflt_data = [BowtieFltFiles(), BowtieFltFiles()]
        self.sam_header = None
        self.read_params = self.ReadParams(False,   # solexa_scale
                                           False,   # phred64_quals
                                           False,   # quals
                                           None,    # integer quals
                                           False,   # SOLiD - color space
                                           "",      # library type (e.g. "illumina-stranded-pair-end")
                                           None,    # seed_length
                                           "fastq", # quality_format
                                           None,    # mate inner distance
                                           20,      # mate inner dist std dev
                                           None,    # read group id
                                           None,    # sample id
                                           None,    # library id
                                           None,    # description
                                           None,    # platform unit (i.e. lane)
                                           None,    # sequencing center
                                           None,    # run date
                                           None)    # sequencing platform

        self.system_params = self.SystemParams(1,      # bowtie_threads (num_threads)
                                               False)  # keep_tmp

        self.search_params = self.SearchParams(100,    # min_closure_exon_length
                                               50,     # min_closure_intron_length
                                               5000,   # max_closure_intron_length
                                               50,     # min_coverage_intron_length
                                               20000,  # max_coverage_intron_length
                                               50,     # min_segment_intron_length
                                               500000) # max_segment_intron_length

        self.report_params = self.ReportParams()

        self.bowtie2_params = self.Bowtie2Params()

        self.bowtie2 = True
        self.gff_annotation = None
        self.transcriptome_only = False
        self.transcriptome_index = None
        self.transcriptome_outdir = None
        self.raw_junctions = None
        self.resume_dir = None
        self.find_novel_juncs = True
        self.find_novel_indels = True
        self.find_novel_fusions = True
        self.find_GFF_juncs = True
        self.max_hits = 20
        self.suppress_hits = False
        self.t_max_hits = 60
        self.max_seg_hits = 40
        self.prefilter_multi = False
        self.read_mismatches = 2
        self.read_gap_length = 2
        self.read_edit_dist = 2
        self.read_realign_edit_dist = None
        self.segment_length = 25
        self.segment_mismatches = 2
        self.bowtie_alignment_option = "-v"
        self.max_insertion_length = 3
        self.max_deletion_length = 3
        self.raw_insertions = None
        self.raw_deletions = None
        self.coverage_search = None
        self.closure_search = False
        #self.butterfly_search = None
        self.butterfly_search = False
        self.microexon_search = False
        self.report_secondary_alignments = False
        self.report_discordant_pair_alignments = True
        self.report_mixed_alignments = True

        # experimental -W option to activate score and edit distance filtering
        # in fix_map_ordering (hits post-processing)
        self.b2scoreflt = False

        self.keep_fasta_order = False
        self.partial_mapping = False

        self.fusion_search = False
        self.fusion_anchor_length = 20
        self.fusion_min_dist = 10000000
        self.fusion_read_mismatches = 2
        self.fusion_multireads = 2
        self.fusion_multipairs = 2
        self.fusion_ignore_chromosomes = []
        self.fusion_do_not_resolve_conflicts = False

    def check(self):
        self.splice_constraints.check()
        self.read_params.check()
        self.system_params.check()
        if self.segment_length < 10:
            die("Error: arg to --segment-length must be at least 10")
        if self.segment_mismatches < 0 or self.segment_mismatches > 3:
            die("Error: arg to --segment-mismatches must be in [0, 3]")
        if self.read_params.color:
            if self.bowtie2:
                th_log("Warning: bowtie2 in colorspace is not supported; --bowtie1 option assumed.")
                self.bowtie2 = False
            if self.fusion_search:
                die("Error: fusion-search in colorspace is not yet supported")
            if self.butterfly_search:
                die("Error: butterfly-search in colorspace is not yet supported")

        self.bowtie2_params.check()

        if self.bowtie2 and self.fusion_search:
            th_logp("\tWarning: --fusion-search with Bowtie2 may not work well as it may require much memory space and produce many spurious fusions. Please try the --bowtie1 option if this doesn't work.")

        library_types = ["fr-unstranded", "fr-firststrand", "fr-secondstrand"]

        if self.read_params.library_type and self.read_params.library_type not in library_types:
            die("Error: library-type should be one of: " + ', '.join(library_types))

        self.search_params.max_closure_intron_length = min(self.splice_constraints.max_intron_length,
                                                           self.search_params.max_closure_intron_length)

        self.search_params.max_segment_intron_length = min(self.splice_constraints.max_intron_length,
                                                           self.search_params.max_segment_intron_length)

        self.search_params.max_coverage_intron_length = min(self.splice_constraints.max_intron_length,
                                                            self.search_params.max_coverage_intron_length)

        if self.max_insertion_length >= self.segment_length:
            die("Error: the max insertion length (" + str(self.max_insertion_length) + ") can not be equal to or greater than the segment length (" + str(self.segment_length) + ")")

        if self.max_insertion_length < 0:
            die("Error: the max insertion length (" + str(self.max_insertion_length) + ") can not be less than 0")

        if self.max_deletion_length >= self.splice_constraints.min_intron_length:
            die("Error: the max deletion length (" + str(self.max_deletion_length) + ") can not be equal to or greater than the min intron length (" + str(self.splice_constraints.min_intron_length) + ")")

        if self.max_deletion_length < 0:
            die("Error: the max deletion length (" + str(self.max_deletion_length) + ") can not be less than 0")

        if self.read_mismatches > self.read_edit_dist or self.read_gap_length > self.read_edit_dist:
            die("Error: the read mismatches (" + str(self.read_mismatches) + ") and the read gap length (" + str(self.read_gap_length) + ") should be less than or equal to the read edit dist (" + str(self.read_edit_dist) + ")\n" +
                "Either decrease --read-mismatches or --read-gap-length, or increase --read-edit-dist")

    def cmd(self):
        cmd = ["--min-anchor", str(self.splice_constraints.min_anchor_length),
               "--splice-mismatches", str(self.splice_constraints.splice_mismatches),
               "--min-report-intron", str(self.splice_constraints.min_intron_length),
               "--max-report-intron", str(self.splice_constraints.max_intron_length),
               "--min-isoform-fraction", str(self.splice_constraints.min_isoform_fraction),
               "--output-dir", output_dir,
               "--max-multihits", str(self.max_hits),
               "--max-seg-multihits", str(self.max_seg_hits),
               "--segment-length", str(self.segment_length),
               "--segment-mismatches", str(self.segment_mismatches),
               "--min-closure-exon", str(self.search_params.min_closure_exon_length),
               "--min-closure-intron", str(self.search_params.min_closure_intron_length),
               "--max-closure-intron", str(self.search_params.max_closure_intron_length),
               "--min-coverage-intron", str(self.search_params.min_coverage_intron_length),
               "--max-coverage-intron", str(self.search_params.max_coverage_intron_length),
               "--min-segment-intron", str(self.search_params.min_segment_intron_length),
               "--max-segment-intron", str(self.search_params.max_segment_intron_length),
               "--read-mismatches", str(self.read_mismatches),
               "--read-gap-length", str(self.read_gap_length),
               "--read-edit-dist", str(self.read_edit_dist),
               "--read-realign-edit-dist", str(self.read_realign_edit_dist),
               "--max-insertion-length", str(self.max_insertion_length),
               "--max-deletion-length", str(self.max_deletion_length)]

        if self.suppress_hits:
            cmd.extend(["--suppress-hits"])

        if not self.bowtie2:
            cmd.extend(["--bowtie1"])

        if self.fusion_search:
            cmd.extend(["--fusion-search",
                        "--fusion-anchor-length", str(self.fusion_anchor_length),
                        "--fusion-min-dist", str(self.fusion_min_dist),
                        "--fusion-read-mismatches", str(self.fusion_read_mismatches),
                        "--fusion-multireads", str(self.fusion_multireads),
                        "--fusion-multipairs", str(self.fusion_multipairs)])

        if self.fusion_ignore_chromosomes:
            cmd.extend(["--fusion-ignore-chromosomes", ",".join(self.fusion_ignore_chromosomes)])

        if self.fusion_do_not_resolve_conflicts:
            cmd.extend(["--fusion-do-not-resolve-conflicts"])

        cmd.extend(self.system_params.cmd())

        if self.read_params.mate_inner_dist != None:
            cmd.extend(["--inner-dist-mean", str(self.read_params.mate_inner_dist),
                        "--inner-dist-std-dev", str(self.read_params.mate_inner_dist_std_dev)])
        if self.gff_annotation != None:
            cmd.extend(["--gtf-annotations", str(self.gff_annotation)])
        if gtf_juncs:
            cmd.extend(["--gtf-juncs", gtf_juncs])
        if self.closure_search == False:
            cmd.append("--no-closure-search")
        if not self.coverage_search:
            cmd.append("--no-coverage-search")
        if not self.microexon_search:
            cmd.append("--no-microexon-search")
        if self.butterfly_search:
            cmd.append("--butterfly-search")
        if self.read_params.solexa_quals:
            cmd.append("--solexa-quals")
        if self.read_params.quals:
            cmd.append("--quals")
        if self.read_params.integer_quals:
            cmd.append("--integer-quals")
        if self.read_params.color:
            cmd.append("--color")
        if self.read_params.library_type:
            cmd.extend(["--library-type", self.read_params.library_type])
        if self.read_params.read_group_id:
            cmd.extend(["--rg-id", self.read_params.read_group_id])
        if self.read_params.phred64_quals:
            cmd.append("--phred64-quals")
        return cmd

    # This is the master options parsing routine, which calls parse_options for
    # the delegate classes (e.g. SpliceConstraints) that handle certain groups
    # of options.
    def parse_options(self, argv):
        try:
            opts, args = getopt.getopt(argv[1:], "hvp:m:n:N:F:a:i:I:G:Tr:o:j:Xz:s:g:x:R:MQCW",
                                       ["version",
                                        "help",
                                        "output-dir=",
                                        "bowtie1",
                                        "solexa-quals",
                                        "solexa1.3-quals",
                                        "phred64-quals",
                                        "quals",
                                        "integer-quals",
                                        "color",
                                        "library-type=",
                                        "num-threads=",
                                        "splice-mismatches=",
                                        "max-multihits=",
                                        "suppress-hits",
                                        "min-isoform-fraction=",
                                        "min-anchor-length=",
                                        "min-intron-length=",
                                        "max-intron-length=",
                                        "GTF=",
                                        "transcriptome-only",
                                        "transcriptome-max-hits=",
                                        "transcriptome-index=",
                                        "raw-juncs=",
                                        "no-novel-juncs",
                                        "allow-fusions",
                                        "fusion-search",
                                        "fusion-anchor-length=",
                                        "fusion-min-dist=",
                                        "fusion-read-mismatches=",
                                        "fusion-multireads=",
                                        "fusion-multipairs=",
                                        "fusion-ignore-chromosomes=",
                                        "fusion-do-not-resolve-conflicts",
                                        "no-novel-indels",
                                        "no-gtf-juncs",
                                        "mate-inner-dist=",
                                        "mate-std-dev=",
                                        "no-coverage-search",
                                        "coverage-search",
                                        "prefilter-multihits",
                                        "microexon-search",
                                        "min-coverage-intron=",
                                        "max-coverage-intron=",
                                        "min-segment-intron=",
                                        "max-segment-intron=",
                                        "resume=",
                                        "seed-length=",
                                        "read-mismatches=",
                                        "read-gap-length=",
                                        "read-edit-dist=",
                                        "read-realign-edit-dist=",
                                        "segment-length=",
                                        "segment-mismatches=",
                                        "bowtie-n",
                                        "keep-tmp",
                                        "rg-id=",
                                        "rg-sample=",
                                        "rg-library=",
                                        "rg-description=",
                                        "rg-platform-unit=",
                                        "rg-center=",
                                        "rg-date=",
                                        "rg-platform=",
                                        "tmp-dir=",
                                        "zpacker=",
                                        "unmapped-fifo",
                                        "max-insertion-length=",
                                        "max-deletion-length=",
                                        "insertions=",
                                        "deletions=",
                                        "no-sort-bam",
                                        "no-convert-bam",
                                        "report-secondary-alignments",
                                        "no-discordant",
                                        "no-mixed",
                                        "keep-fasta-order",
                                        "allow-partial-mapping",
                                        "b2-very-fast",
                                        "b2-fast",
                                        "b2-sensitive",
                                        "b2-very-sensitive",
                                        "b2-N=",
                                        "b2-L=",
                                        "b2-i=",
                                        "b2-n-ceil=",
                                        "b2-gbar=",
                                        "b2-ma=",
                                        "b2-mp=",
                                        "b2-np=",
                                        "b2-rdg=",
                                        "b2-rfg=",
                                        "b2-score-min=",
                                        "b2-D=",
                                        "b2-R="])
        except getopt.error as msg:
            raise Usage(msg)

        self.splice_constraints.parse_options(opts)
        self.system_params.parse_options(opts)
        self.read_params.parse_options(opts)
        self.search_params.parse_options(opts)
        self.report_params.parse_options(opts)
        self.bowtie2_params.parse_options(opts)
        global use_BWT_FIFO
        global use_BAM_Unmapped
        if not self.read_params.color:
            use_BWT_FIFO = False
            use_BAM_Unmapped = True
        global output_dir
        global logging_dir
        global tmp_dir

        custom_tmp_dir = None
        custom_out_dir = None
        # option processing
        for option, value in opts:
            if option in ("-v", "--version"):
                print "TopHat v%s" % (get_version())
                sys.exit(0)
            if option in ("-h", "--help"):
                raise Usage(use_message)
            if option == "--bowtie1":
                self.bowtie2 = False
            if option in ("-g", "--max-multihits"):
                self.max_hits = int(value)
                self.max_seg_hits = max(10, self.max_hits * 2)
            if option == "--suppress-hits":
                self.suppress_hits = True
            if option in ("-x", "--transcriptome-max-hits"):
                self.t_max_hits = int(value)
            if option in ("-G", "--GTF"):
                self.gff_annotation = value
            if option in ("-T", "--transcriptome-only"):
                self.transcriptome_only = True
            if option == "--transcriptome-index":
                self.transcriptome_index = value
            if option in ("-M", "--prefilter-multihits"):
                self.prefilter_multi = True
            if option in ("-j", "--raw-juncs"):
                self.raw_junctions = value
            if option == "--no-novel-juncs":
                self.find_novel_juncs = False
            if option == "--no-novel-indels":
                self.find_novel_indels = False
            if option == "--fusion-search":
                self.fusion_search = True
            if option == "--fusion-anchor-length":
                self.fusion_anchor_length = int(value)
            if option == "--fusion-min-dist":
                self.fusion_min_dist = int(value)
            if option == "--fusion-read-mismatches":
                self.fusion_read_mismatches = int(value)
            if option == "--fusion-multireads":
                self.fusion_multireads = int(value)
            if option == "--fusion-multipairs":
                self.fusion_multipairs = int(value)
            if option == "--fusion-ignore-chromosomes":
                self.fusion_ignore_chromosomes = value.split(",")
            if option == "--fusion-do-not-resolve-conflicts":
                self.fusion_do_not_resolve_conflicts = True
            if option == "--no-gtf-juncs":
                self.find_GFF_juncs = False
            if option == "--no-coverage-search":
                self.coverage_search = False
            if option == "--coverage-search":
                self.coverage_search = True
            # -W option: score and edit distance filtering in fix_map_ordering
            # (hits post-processing); this is *soft* post-processing of bowtie2
            # results and would be more effectively implemented by using
            # bowtie2's score function
            if option == "-W":
                self.b2scoreflt = True
            self.closure_search = False
            #if option == "--no-closure-search":
            #    self.closure_search = False
            #if option == "--closure-search":
            #    self.closure_search = True
            if option == "--microexon-search":
                self.microexon_search = True

            self.butterfly_search = False
            #if option == "--butterfly-search":
            #    self.butterfly_search = True
            #if option == "--no-butterfly-search":
            #    self.butterfly_search = False
            if option in ("-N", "--read-mismatches"):
                self.read_mismatches = int(value)
            if option == "--read-gap-length":
                self.read_gap_length = int(value)
            if option == "--read-edit-dist":
                self.read_edit_dist = int(value)
            if option == "--read-realign-edit-dist":
                self.read_realign_edit_dist = int(value)
            if option == "--segment-length":
                self.segment_length = int(value)
            if option == "--segment-mismatches":
                self.segment_mismatches = int(value)
            if option == "--bowtie-n":
                self.bowtie_alignment_option = "-n"
            if option == "--max-insertion-length":
                self.max_insertion_length = int(value)
            if option == "--max-deletion-length":
                self.max_deletion_length = int(value)
            if option == "--insertions":
                self.raw_insertions = value
            if option == "--deletions":
                self.raw_deletions = value
            if option == "--report-secondary-alignments":
                self.report_secondary_alignments = True
            if option == "--no-discordant":
                self.report_discordant_pair_alignments = False
            if option == "--no-mixed":
                self.report_mixed_alignments = False
            if option == "--keep-fasta-order":
                self.keep_fasta_order = True
            if option == "--allow-partial-mapping":
                self.partial_mapping = True
            if option in ("-o", "--output-dir"):
                custom_out_dir = value + "/"
            if option in ("-R", "--resume"):
                self.resume_dir = value
            if option == "--tmp-dir":
                custom_tmp_dir = value + "/"

        if self.transcriptome_only:
            self.find_novel_juncs = False
            self.find_novel_indels = False
        if custom_out_dir:
            output_dir = custom_out_dir
            logging_dir = output_dir + "logs/"
            tmp_dir = output_dir + "tmp/"
            sam_header = tmp_dir + "stub_header.sam"
        if custom_tmp_dir:
            tmp_dir = custom_tmp_dir
            sam_header = tmp_dir + "stub_header.sam"
        if len(args) < 2 and not self.resume_dir:
            raise Usage(use_message)

        if not self.read_realign_edit_dist:
            self.read_realign_edit_dist = self.read_edit_dist

        return args


def nonzeroFile(filepath):
    if os.path.exists(filepath):
        fpath, fname = os.path.split(filepath)
        fbase, fext = os.path.splitext(fname)
        if fext.lower() == ".bam":
            samtools_view_cmd = ["samtools", "view", filepath]
            samtools_view = subprocess.Popen(samtools_view_cmd, stdout=subprocess.PIPE)
            head_cmd = ["head", "-1"]
            head = subprocess.Popen(head_cmd, stdin=samtools_view.stdout, stdout=subprocess.PIPE)
            output = head.communicate()[0][:-1]
            if len(output) > 0:
                return True
        else:
            if os.path.getsize(filepath) > 25:
                return True
    return False


# check if a file exists and has non-zero (or minimum) size
def fileExists(filepath, minfsize=1):
    if os.path.exists(filepath) and os.path.getsize(filepath) >= minfsize:
        return True
    else:
        return False

def removeFileWithIndex(filepath):
    if os.path.exists(filepath):
        os.remove(filepath)

    fileindexpath = filepath + ".index"
    if os.path.exists(fileindexpath):
        os.remove(fileindexpath)

def getFileDir(filepath):
    # if a full path is given, returns the path including the trailing /
    fpath, fname = os.path.split(filepath)
    if fpath: fpath += '/'
    return fpath

def getFileBaseName(filepath):
    fpath, fname = os.path.split(filepath)
    fbase, fext = os.path.splitext(fname)
    fx = fext.lower()
    if (fx in ['.fq', '.txt', '.seq', '.bwtout'] or fx.find('.fa') == 0) and len(fbase) > 0:
        return fbase
    elif fx == '.z' or fx.find('.gz') == 0 or fx.find('.bz') == 0:
        fb, fext = os.path.splitext(fbase)
        fx = fext.lower()
        if (fx in ['.fq', '.txt', '.seq', '.bwtout'] or fx.find('.fa') == 0) and len(fb) > 0:
            return fb
        else:
            return fbase
    else:
        if len(fbase) > 0:
            return fbase
        else:
            return fname

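# For example (traced from the logic above; file names are hypothetical):
#   getFileBaseName("reads_1.fq")       -> "reads_1"
#   getFileBaseName("reads_1.fq.gz")    -> "reads_1"   (compression suffix peeled first)
#   getFileBaseName("genome.fasta")     -> "genome"
#   getFileBaseName("archive.tar.bz2")  -> "archive.tar"
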
# Returns the current time in a nice format
def right_now():
    curr_time = datetime.now()
    return curr_time.strftime("%c")

# The TopHat logging formatter
def th_log(out_str):
    #print >> sys.stderr, "[%s] %s" % (right_now(), out_str)
    if tophat_logger:
        tophat_logger.info(out_str)

def th_logp(out_str=""):
    print >> sys.stderr, out_str
    if tophat_log:
        print >> tophat_log, out_str

def die(msg=None):
    if msg is not None:
        th_logp(msg)
    sys.exit(1)

# Ensures that the output, logging, and temp directories are present. If not,
# they are created
def prepare_output_dir():

    #th_log("Preparing output location " + output_dir)
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    if not os.path.exists(logging_dir):
        os.mkdir(logging_dir)

    if not os.path.exists(tmp_dir):
        try:
            os.makedirs(tmp_dir)
        except OSError as o:
            die("\nError creating directory %s (%s)" % (tmp_dir, o))


# to be added as preexec_fn for every subprocess.Popen() call:
# see http://bugs.python.org/issue1652
def subprocess_setup():
    # Python installs a SIGPIPE handler by default, which causes
    # gzip or other de/compression pipes to complain about "stdout: Broken pipe"
    signal.signal(signal.SIGPIPE, signal.SIG_DFL)

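# Illustrative use (mirrors how the pipeline is meant to invoke its helper
# programs; "some_cmd" is a placeholder argument list):
#   subprocess.Popen(some_cmd, preexec_fn=subprocess_setup, stdout=subprocess.PIPE)
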
# Check that the Bowtie index specified by the user is present and all files
# are there.
def check_bowtie_index(idx_prefix, is_bowtie2):
    if currentStage >= resumeStage:
        th_log("Checking for Bowtie index files")
    idxext = "ebwt"
    bowtie_ver = ""
    if is_bowtie2:
        idxext = "bt2"
        bowtie_ver = "2 "

    idx_fwd_1 = idx_prefix + ".1." + idxext
    idx_fwd_2 = idx_prefix + ".2." + idxext
    idx_rev_1 = idx_prefix + ".rev.1." + idxext
    idx_rev_2 = idx_prefix + ".rev.2." + idxext

    if os.path.exists(idx_fwd_1) and \
       os.path.exists(idx_fwd_2) and \
       os.path.exists(idx_rev_1) and \
       os.path.exists(idx_rev_2):
        return
    else:
        bwtidxerr = "Error: Could not find Bowtie " + bowtie_ver + "index files (" + idx_prefix + ".*." + idxext + ")"

        if is_bowtie2:
            bwtidx_env = os.environ.get("BOWTIE2_INDEXES")
        else:
            bwtidx_env = os.environ.get("BOWTIE_INDEXES")

        if bwtidx_env == None:
            die(bwtidxerr)
        if os.path.exists(bwtidx_env + idx_fwd_1) and \
           os.path.exists(bwtidx_env + idx_fwd_2) and \
           os.path.exists(bwtidx_env + idx_rev_1) and \
           os.path.exists(bwtidx_env + idx_rev_2):
            return
        else:
            die(bwtidxerr)

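# For a Bowtie2 index built with prefix "genome" (a hypothetical name), the
# files checked above would be genome.1.bt2, genome.2.bt2, genome.rev.1.bt2
# and genome.rev.2.bt2 (with the .ebwt extension for a Bowtie1 index).
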
# Reconstructs the multifasta file from which the Bowtie index was created, if
# it's not already there.
def bowtie_idx_to_fa(idx_prefix, is_bowtie2):
    idx_name = idx_prefix.split('/')[-1]
    th_log("Reconstituting reference FASTA file from Bowtie index")

    try:
        tmp_fasta_file_name = tmp_dir + idx_name + ".fa"
        tmp_fasta_file = open(tmp_fasta_file_name, "w")

        inspect_log = open(logging_dir + "bowtie_inspect_recons.log", "w")

        if is_bowtie2:
            inspect_cmd = [prog_path("bowtie2-inspect")]
        else:
            inspect_cmd = [prog_path("bowtie-inspect")]

        inspect_cmd += [idx_prefix]

        th_logp(" Executing: " + " ".join(inspect_cmd) + " > " + tmp_fasta_file_name)
        ret = subprocess.call(inspect_cmd,
                              stdout=tmp_fasta_file,
                              stderr=inspect_log)
        # Bowtie reported an error
        if ret != 0:
            die(fail_str + "Error: bowtie-inspect returned an error\n" + log_tail(logging_dir + "bowtie_inspect_recons.log"))

    # Bowtie not found
    except OSError as o:
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            die(fail_str + "Error: bowtie-inspect not found on this system. Did you forget to include it in your PATH?")

    return tmp_fasta_file_name

# Checks whether the multifasta file for the genome is present alongside the
# Bowtie index files for it.
def check_fasta(idx_prefix, is_bowtie2):
    th_log("Checking for reference FASTA file")
    idx_fasta = idx_prefix + ".fa"
    if os.path.exists(idx_fasta):
        return idx_fasta
    else:
        if is_bowtie2:
            bowtie_idx_env_var = os.environ.get("BOWTIE2_INDEXES")
        else:
            bowtie_idx_env_var = os.environ.get("BOWTIE_INDEXES")
        if bowtie_idx_env_var:
            idx_fasta = bowtie_idx_env_var + idx_prefix + ".fa"
            if os.path.exists(idx_fasta):
                return idx_fasta

        th_logp("\tWarning: Could not find FASTA file " + idx_fasta)
        idx_fa = bowtie_idx_to_fa(idx_prefix, is_bowtie2)
        return idx_fa

# Check that both the Bowtie index and the genome's fasta file are present
def check_index(idx_prefix, is_bowtie2):
    check_bowtie_index(idx_prefix, is_bowtie2)
    ref_fasta_file = check_fasta(idx_prefix, is_bowtie2)

    return (ref_fasta_file, None)

# Retrieve a tuple containing the system's version of Bowtie. Parsed from
# `bowtie --version`
def get_bowtie_version(is_bowtie2):
    try:
        # Launch Bowtie to capture its version info
        proc = subprocess.Popen([bowtie_path, "--version"],
                                stdout=subprocess.PIPE)

        stdout_value = proc.communicate()[0]

        bowtie_version = None
        bowtie_out = repr(stdout_value)

        # Find the version identifier
        if is_bowtie2:
            version_str = "bowtie2-align version "
        else:
            version_str = "bowtie version "

        ver_str_idx = bowtie_out.find(version_str)
        if ver_str_idx != -1:
            nl = bowtie_out.find("\\n", ver_str_idx)
            version_val = bowtie_out[ver_str_idx + len(version_str):nl]

            if is_bowtie2:
                ver_numbers = version_val.split('.')
                ver_numbers[2:] = ver_numbers[2].split('-')
                bowtie_version = [int(x) for x in ver_numbers[:3]] + [int(ver_numbers[3][4:])]
            else:
                bowtie_version = [int(x) for x in version_val.split('.')]

            if len(bowtie_version) == 3:
                bowtie_version.append(0)

        return bowtie_version
    except OSError as o:
        errmsg = fail_str + str(o) + "\n"
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            errmsg += "Error: bowtie not found on this system"
        die(errmsg)

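# Example (traced from the parsing above, assuming a hypothetical install):
# a Bowtie2 banner line "bowtie2-align version 2.0.0-beta7" yields
# [2, 0, 0, 7], while Bowtie1's "bowtie version 0.12.8" yields [0, 12, 8, 0].
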
def get_index_sam_header(params, idx_prefix, name=""):
    noSkip = currentStage >= resumeStage
    try:
        temp_sam_header_filename = tmp_dir + "temp.samheader.sam"
        temp_sam_header_file = None
        if noSkip:
            temp_sam_header_file = open(temp_sam_header_filename, "w")

        bowtie_header_cmd = [bowtie_path]

        read_params = params.read_params
        if not params.bowtie2:
            bowtie_header_cmd += ["--sam"]

        if read_params.color:
            bowtie_header_cmd.append('-C')

        bowtie_header_cmd.extend([idx_prefix, '/dev/null'])
        if noSkip:
            subprocess.call(bowtie_header_cmd,
                            stdout=temp_sam_header_file,
                            stderr=open('/dev/null', 'w'))

            temp_sam_header_file.close()
            temp_sam_header_file = open(temp_sam_header_filename, "r")

        bowtie_sam_header_filename = tmp_dir + idx_prefix.split('/')[-1]
        if name != "":
            bowtie_sam_header_filename += ("_" + name)
        bowtie_sam_header_filename += ".bwt.samheader.sam"
        if not noSkip:
            return bowtie_sam_header_filename
        bowtie_sam_header_file = open(bowtie_sam_header_filename, "w")

        preamble = []
        sq_dict_lines = []

        for line in temp_sam_header_file.readlines():
            line = line.strip()
            if line.find("@SQ") != -1:
                # Sequence dictionary record
                cols = line.split('\t')
                seq_name = None
                for col in cols:
                    fields = col.split(':')
                    #print fields
                    if len(fields) > 0 and fields[0] == "SN":
                        seq_name = fields[1]
                if seq_name == None:
                    die("Error: malformed sequence dictionary in sam header")
                sq_dict_lines.append([seq_name, line])
            elif line.find("CL") != -1:
                continue
            else:
                preamble.append(line)

        print >> bowtie_sam_header_file, "@HD\tVN:1.0\tSO:coordinate"
        if read_params.read_group_id and read_params.sample_id:
            rg_str = "@RG\tID:%s\tSM:%s" % (read_params.read_group_id,
                                            read_params.sample_id)
            if read_params.library_id:
                rg_str += "\tLB:%s" % read_params.library_id
            if read_params.description:
                rg_str += "\tDS:%s" % read_params.description
            if read_params.seq_platform_unit:
                rg_str += "\tPU:%s" % read_params.seq_platform_unit
            if read_params.seq_center:
                rg_str += "\tCN:%s" % read_params.seq_center
            if read_params.mate_inner_dist:
                rg_str += "\tPI:%s" % read_params.mate_inner_dist
            if read_params.seq_run_date:
                rg_str += "\tDT:%s" % read_params.seq_run_date
            if read_params.seq_platform:
                rg_str += "\tPL:%s" % read_params.seq_platform

            print >> bowtie_sam_header_file, rg_str

        if not params.keep_fasta_order:
            sq_dict_lines.sort(lambda x, y: cmp(x[0], y[0]))

        for [name, line] in sq_dict_lines:
            print >> bowtie_sam_header_file, line
        print >> bowtie_sam_header_file, "@PG\tID:TopHat\tVN:%s\tCL:%s" % (get_version(), run_cmd)

        bowtie_sam_header_file.close()
        temp_sam_header_file.close()
        return bowtie_sam_header_filename

    except OSError as o:
        errmsg = fail_str + str(o) + "\n"
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            errmsg += "Error: bowtie not found on this system"
        die(errmsg)

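# Example of the read-group line built above (values are placeholders): with
# --rg-id lane1 --rg-sample sampleA --rg-library libA, the emitted header line
# is "@RG\tID:lane1\tSM:sampleA\tLB:libA" (tab-separated SAM fields).
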
# Make sure Bowtie is installed and is recent enough to be useful
def check_bowtie(params):
    bowtie_req = ""
    if params.bowtie2:
        bowtie_req = "2-align"
    log_msg = "Checking for Bowtie"
    th_log(log_msg)

    bowtie_bin = "bowtie" + bowtie_req

    global bowtie_path
    bowtie_version = None
    bowtie_path = which(bowtie_bin)
    if bowtie_path:
        bowtie_version = get_bowtie_version(params.bowtie2)
    if params.bowtie2 and bowtie_version == None:
        th_logp(" Bowtie 2 not found, checking for older version..")
        # try to fall back on bowtie 1
        params.bowtie2 = False
        bowtie_path = which('bowtie')
        if bowtie_path:
            bowtie_version = get_bowtie_version(params.bowtie2)
    if bowtie_version == None:
        die("Error: Bowtie not found on this system.")
    if params.bowtie2:
        if bowtie_version[0] < 3 and bowtie_version[1] < 1 and bowtie_version[2] < 1 and bowtie_version[3] < 5:
            die("Error: TopHat requires Bowtie 2.0.0-beta5 or later")
    else:
        if bowtie_version[0] < 1 and bowtie_version[1] < 12 and bowtie_version[2] < 3:
            die("Error: TopHat requires Bowtie 0.12.3 or later")
    th_logp("\t\t Bowtie version:\t %s" % ".".join([str(x) for x in bowtie_version]))


# Retrieve a tuple containing the system's version of samtools. Parsed from
# `samtools`
def get_samtools_version():
    try:
        # Launch samtools to capture its version info
        proc = subprocess.Popen(samtools_path, stderr=subprocess.PIPE)
        samtools_out = proc.communicate()[1]

        # Find the version identifier
        version_match = re.search(r'Version:\s+(\d+)\.(\d+).(\d+)([a-zA-Z]?)', samtools_out)
        samtools_version_arr = [int(version_match.group(x)) for x in [1, 2, 3]]
        if version_match.group(4):
            samtools_version_arr.append(version_match.group(4))
        else:
            samtools_version_arr.append(0)

        return version_match.group(), samtools_version_arr
    except OSError as o:
        errmsg = fail_str + str(o) + "\n"
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            errmsg += "Error: samtools not found on this system"
        die(errmsg)

# Make sure the SAM tools are installed and are recent enough to be useful |
1519 |
def check_samtools(): |
1520 |
th_log("Checking for Samtools") |
1521 |
global samtools_path |
1522 |
samtools_path=prog_path("samtools") |
1523 |
samtools_version_str, samtools_version_arr = get_samtools_version() |
1524 |
if samtools_version_str == None: |
1525 |
die("Error: Samtools not found on this system") |
1526 |
elif samtools_version_arr[1] < 1 or samtools_version_arr[2] < 7: |
1527 |
die("Error: TopHat requires Samtools 0.1.7 or later") |
1528 |
th_logp("\t\tSamtools version:\t %s" % ".".join([str(x) for x in samtools_version_arr])) |
1529 |
|
1530 |
|
1531 |
|
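# Illustrative only: samtools 0.1.x prints a usage message on stderr with a
# line like "Version: 0.1.18 (r982:295)"; the regex above parses that into
# samtools_version_arr == [0, 1, 18, 0].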
class FastxReader:
    def __init__(self, i_file, is_color=0, fname=''):
        self.bufline = None
        self.format = None
        self.ifile = i_file
        self.nextRecord = None
        self.eof = None
        self.fname = fname
        self.lastline = None
        self.numrecords = 0
        self.isColor = 0
        if is_color: self.isColor = 1
        # determine the file type;
        # no records processed yet, skip custom header lines if any
        hlines = 10 # allow at most 10 header lines
        self.lastline = " "
        while hlines > 0 and self.lastline and self.lastline[0] not in "@>":
            self.lastline = self.ifile.readline()
            hlines -= 1
        # guard self.lastline so an empty input file reports a parse error
        # below instead of raising IndexError
        if self.lastline and self.lastline[0] == '@':
            self.format = 'fastq'
            self.nextRecord = self.nextFastq
        elif self.lastline and self.lastline[0] == '>':
            self.format = 'fasta'
            self.nextRecord = self.nextFasta
        else:
            die("Error: cannot determine record type in input file %s" % fname)
        self.bufline = self.lastline
        self.lastline = None

    def nextFastq(self):
        # returns the tuple: (seqID, sequence_string, seq_len, qv_string)
        seqid, seqstr, qstr, seq_len = '', '', '', 0
        if self.eof: return (seqid, seqstr, seq_len, qstr)
        fline = self.getLine # shortcut to save a bit of time
        line = fline()

        if not line: return (seqid, seqstr, seq_len, qstr)
        while len(line.rstrip()) == 0: # skip empty lines
            line = fline()
            if not line: return (seqid, seqstr, seq_len, qstr)
        try:
            if line[0] != "@":
                raise ValueError("Records in Fastq files should start with '@' character")

            seqid = line[1:].rstrip()
            seqstr = fline().rstrip()

            # There may now be more sequence lines, or the "+" quality marker line:
            while True:
                line = fline()
                if not line:
                    raise ValueError("Premature end of file (missing quality values for " + seqid + ")")
                if line[0] == "+":
                    # -- sequence string ended
                    #qtitle = line[1:].rstrip()
                    #if qtitle and qtitle != seqid:
                    #    raise ValueError("Different read ID for sequence and quality (%s vs %s)" \
                    #                     % (seqid, qtitle))
                    break
                seqstr += line.rstrip() # removes trailing newlines
            # loop until "+" found
            seq_len = len(seqstr)
            # at least one line of quality data should follow
            qstrlen = 0
            # now read the next lines as quality values until seq_len is reached
            while True:
                line = fline()
                if not line: break # end of file
                qstr += line.rstrip()
                qstrlen = len(qstr)
                if qstrlen + self.isColor >= seq_len:
                    break # qv string has reached the length of the seq string
            # loop until qv has the same length as seq

            if self.isColor:
                # in colorspace, the quality string may have a dummy qv at the
                # beginning; strip it so the lengths can be compared
                if qstrlen == seq_len:
                    qstr = qstr[1:]
                    qstrlen -= 1
                if qstrlen != seq_len - 1:
                    raise ValueError("Length mismatch between sequence and quality strings " + \
                                     "for %s (%i vs %i)." % (seqid, seq_len, qstrlen))
            else:
                if seq_len != qstrlen:
                    raise ValueError("Length mismatch between sequence and quality strings " + \
                                     "for %s (%i vs %i)." % (seqid, seq_len, qstrlen))
        except ValueError, err:
            die("\nError encountered parsing file " + self.fname + ":\n " + str(err))
        # return the record
        self.numrecords += 1
        if self.isColor: # -- discard the primer base
            seq_len -= 1
            seqstr = seqstr[1:]
        return (seqid, seqstr, seq_len, qstr)

    def nextFasta(self):
        # returns the tuple: (seqID, sequence_string, seq_len, None)
        seqid, seqstr, seq_len = '', '', 0
        fline = self.getLine # shortcut to the readline function
        line = fline() # this will use the buffered line if it's there
        if not line: return (seqid, seqstr, seq_len, None)
        while len(line.rstrip()) == 0: # skip empty lines
            line = fline()
            if not line: return (seqid, seqstr, seq_len, None)
        try:
            if line[0] != ">":
                raise ValueError("Records in Fasta files must start with '>' character")
            seqid = line[1:].split()[0]
            # read more sequence lines until the next record's '>' line:
            while True:
                line = fline()
                if not line: break
                if line[0] == '>':
                    # next sequence starts here
                    self.ungetLine()
                    break
                seqstr += line.rstrip()
            # loop until '>' found
            seq_len = len(seqstr)
            if seq_len < 3:
                raise ValueError("Read %s too short (%i)." \
                                 % (seqid, seq_len))
        except ValueError, err:
            die("\nError encountered parsing fasta file " + self.fname + "\n " + str(err))
        # return the record and continue
        self.numrecords += 1
        if self.isColor: # -- discard the primer base
            seq_len -= 1
            seqstr = seqstr[1:]
        return (seqid, seqstr, seq_len, None)

    def getLine(self):
        if self.bufline: # return the previously buffered line
            r = self.bufline
            self.bufline = None
            return r
        else: # read a new line from the stream and return it
            if self.eof: return None
            self.lastline = self.ifile.readline()
            if not self.lastline:
                self.eof = 1
                return None
            return self.lastline

    def ungetLine(self):
        if self.lastline is None:
            th_logp("Warning: FastxReader called ungetLine() with no prior line!")
        self.bufline = self.lastline
        self.lastline = None
#< class FastxReader
|
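# Illustrative usage (assumes an uncompressed FASTQ file on disk):
#   fx = FastxReader(open("reads.fq"), is_color=0, fname="reads.fq")
#   while True:
#       seqid, seq, seq_len, quals = fx.nextRecord()
#       if not seqid: break
#       # ... process one read ...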
def fa_write(fhandle, seq_id, seq):
    """
    Write to a file in the FASTA format.

    Arguments:
    - `fhandle`: A file handle open for writing
    - `seq_id`: The sequence id string for this sequence
    - `seq`: An unformatted string of the sequence to write
    """
    line_len = 60
    fhandle.write(">" + seq_id + "\n")
    # write the sequence wrapped to line_len columns; stepping by line_len
    # also avoids emitting a trailing empty line when len(seq) is an exact
    # multiple of line_len
    for i in xrange(0, len(seq), line_len):
        fhandle.write(seq[i:i + line_len] + "\n")
|
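# Example (illustrative): fa_write(sys.stdout, "seq1", "A" * 130) prints the
# ">seq1" header followed by three lines of 60, 60 and 10 bases.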
class ZReader:
    def __init__(self, filename, params, guess=True):
        self.fname = filename
        self.file = None
        self.fsrc = None
        self.popen = None
        sys_params = params.system_params
        pipecmd = []
        s = filename.lower()
        if s.endswith(".bam"):
            pipecmd = [prog_path("bam2fastx")]
            if params.read_params.color:
                pipecmd += ["--color"]
            pipecmd += ["--all", "-"]
        else:
            if guess:
                if s.endswith(".z") or s.endswith(".gz") or s.endswith(".gzip"):
                    pipecmd = ['gzip']
                elif s.endswith(".bz2") or s.endswith(".bzip2") or s.endswith(".bzip"):
                    pipecmd = ['bzip2']
                if len(pipecmd) > 0 and which(pipecmd[0]) is None:
                    die("Error: cannot find %s to decompress input file %s " % (pipecmd[0], filename))
                if len(pipecmd) > 0:
                    # prefer the user's parallel (de)compressor when it matches
                    if pipecmd[0] == 'gzip' and sys_params.zipper.endswith('pigz'):
                        pipecmd[0] = sys_params.zipper
                        pipecmd.extend(sys_params.zipper_opts)
                    elif pipecmd[0] == 'bzip2' and sys_params.zipper.endswith('pbzip2'):
                        pipecmd[0] = sys_params.zipper
                        pipecmd.extend(sys_params.zipper_opts)
            else: # not guessing, but must still check if it's a compressed file
                if use_zpacker and filename.endswith(".z"):
                    pipecmd = [sys_params.zipper]
                    pipecmd.extend(sys_params.zipper_opts)

            if pipecmd:
                pipecmd += ['-cd']
        if pipecmd:
            try:
                self.fsrc = open(self.fname, 'rb')
                self.popen = subprocess.Popen(pipecmd,
                                              preexec_fn=subprocess_setup,
                                              stdin=self.fsrc,
                                              stdout=subprocess.PIPE, stderr=tophat_log, close_fds=True)
            except Exception:
                die("Error: could not open pipe " + ' '.join(pipecmd) + ' < ' + self.fname)
            self.file = self.popen.stdout
        else:
            self.file = open(filename)

    def close(self):
        if self.fsrc: self.fsrc.close()
        self.file.close()
        if self.popen:
            self.popen.wait()
            self.popen = None

class ZWriter:
    def __init__(self, filename, sysparams):
        self.fname = filename
        if use_zpacker:
            pipecmd = [sysparams.zipper, "-cf", "-"]
            self.ftarget = open(filename, "wb")
            try:
                self.popen = subprocess.Popen(pipecmd,
                                              preexec_fn=subprocess_setup,
                                              stdin=subprocess.PIPE,
                                              stderr=tophat_log, stdout=self.ftarget, close_fds=True)
            except Exception:
                die("Error: could not open writer pipe " + ' '.join(pipecmd) + ' < ' + self.fname)
            self.file = self.popen.stdin # client writes to this end of the pipe
        else: # no compression
            self.file = open(filename, "w")
            self.ftarget = None
            self.popen = None

    def close(self):
        self.file.close()
        if self.ftarget: self.ftarget.close()
        if self.popen:
            self.popen.wait() # required to actually flush the pipes
            self.popen = None
|
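# Illustrative round trip (assumes use_zpacker with a gzip-compatible zipper,
# so that the ".z" extension is decompressed correctly on the way back in):
#   zw = ZWriter("tmp/reads.fq.z", params.system_params)
#   zw.file.write("@r1\nACGT\n+\nIIII\n")
#   zw.close()
#   zr = ZReader("tmp/reads.fq.z", params)  # guesses decompression from the name
#   data = zr.file.read()
#   zr.close()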
# check_reads_format() examines the first few records in the user's files
# to determine the file format
def check_reads_format(params, reads_files):
    #seed_len = params.read_params.seed_length
    fileformat = params.read_params.reads_format

    observed_formats = set([])
    # observed_scales = set([])
    min_seed_len = 99999
    max_seed_len = 0
    files = reads_files.split(',')

    for f_name in files:
        #try:
        zf = ZReader(f_name, params)
        #except IOError:
        #    die("Error: could not open file "+f_name)
        freader = FastxReader(zf.file, params.read_params.color, zf.fname)
        toread = 4 # just sample the first 4 reads
        while toread > 0:
            seqid, seqstr, seq_len, qstr = freader.nextRecord()
            if not seqid: break
            toread -= 1
            if seq_len < 20:
                th_logp("Warning: found a read < 20bp in " + f_name)
            else:
                min_seed_len = min(seq_len, min_seed_len)
                max_seed_len = max(seq_len, max_seed_len)
        zf.close()
        observed_formats.add(freader.format)
    if len(observed_formats) > 1:
        die("Error: TopHat requires all reads to be either FASTQ or FASTA; mixing formats is not supported.")
    fileformat = list(observed_formats)[0]
    #if seed_len != None:
    #    seed_len = max(seed_len, max_seed_len)
    #else:
    #    seed_len = max_seed_len
    #print >> sys.stderr, "\tmin read length: %dbp, max read length: %dbp" % (min_seed_len, max_seed_len)
    th_logp("\tformat:\t\t %s" % fileformat)
    if fileformat == "fastq":
        quality_scale = "phred33 (default)"
        if params.read_params.solexa_quals and not params.read_params.phred64_quals:
            quality_scale = "solexa33 (reads generated with GA pipeline version < 1.3)"
        elif params.read_params.phred64_quals:
            quality_scale = "phred64 (reads generated with GA pipeline version >= 1.3)"
        th_logp("\tquality scale:\t %s" % quality_scale)
    elif fileformat == "fasta":
        if params.read_params.color:
            params.read_params.integer_quals = True

    #print seed_len, format, solexa_scale
    #NOTE: seed_len will be re-evaluated later by prep_reads
    return TopHatParams.ReadParams(params.read_params.solexa_quals,
                                   params.read_params.phred64_quals,
                                   params.read_params.quals,
                                   params.read_params.integer_quals,
                                   params.read_params.color,
                                   params.read_params.library_type,
                                   #seed_len,
                                   params.read_params.seed_length,
                                   fileformat,
                                   params.read_params.mate_inner_dist,
                                   params.read_params.mate_inner_dist_std_dev,
                                   params.read_params.read_group_id,
                                   params.read_params.sample_id,
                                   params.read_params.library_id,
                                   params.read_params.description,
                                   params.read_params.seq_platform_unit,
                                   params.read_params.seq_center,
                                   params.read_params.seq_run_date,
                                   params.read_params.seq_platform)

def grep_file(logfile, regex="warning"):
    f = open(logfile, "r")
    r = []
    for line in f:
        if re.match(regex, line, re.IGNORECASE):
            r += [line.rstrip()]
    f.close()
    return r

def log_tail(logfile, lines=1):
    f = open(logfile, "r")
    f.seek(0, 2)
    fbytes = f.tell()
    size = lines
    block = -1
    # read 1KB blocks backwards from the end of the file until enough
    # newlines have been seen
    while size > 0 and fbytes + block * 1024 > 0:
        if fbytes + block * 1024 > 0:
            # seek back one more block, since it's possible
            f.seek(block * 1024, 2)
        else:
            # seek to the beginning
            f.seek(0, 0)
        data = f.read(1024)
        linesFound = data.count('\n')
        size -= linesFound
        block -= 1
    if fbytes + block * 1024 > 0:
        f.seek(block * 1024, 2)
    else:
        f.seek(0, 0)
    #f.readline() # find a newline
    lastBlocks = list(f.readlines())
    f.close()
    return "".join(lastBlocks[-lines:])

# Format a timedelta as a pretty string, e.g. "01:02:03" or "2 days 01:02:03".
def formatTD(td):
    days = td.days
    hours = td.seconds // 3600
    minutes = (td.seconds % 3600) // 60
    seconds = td.seconds % 60

    if days > 0:
        return '%d days %02d:%02d:%02d' % (days, hours, minutes, seconds)
    else:
        return '%02d:%02d:%02d' % (hours, minutes, seconds)

class PrepReadsInfo:
    def __init__(self, fname, out_fname):
        self.min_len = [0, 0]
        self.max_len = [0, 0]
        self.in_count = [0, 0]
        self.out_count = [0, 0]
        self.kept_reads = [None, None]
        try:
            f = open(fname, "r")
            self.min_len[0] = int(f.readline().split("=")[-1])
            self.max_len[0] = int(f.readline().split("=")[-1])
            self.in_count[0] = int(f.readline().split("=")[-1])
            self.out_count[0] = int(f.readline().split("=")[-1])
            if (self.out_count[0] == 0) or (self.max_len[0] < 16):
                raise Exception()
            line = f.readline()
            if line and line.find("=") > 0:
                self.min_len[1] = int(line.split("=")[-1])
                self.max_len[1] = int(f.readline().split("=")[-1])
                self.in_count[1] = int(f.readline().split("=")[-1])
                self.out_count[1] = int(f.readline().split("=")[-1])
                if (self.out_count[1] == 0) or (self.max_len[1] < 16):
                    raise Exception()
        except Exception, e:
            die(fail_str + "Error retrieving prep_reads info.")
        sides = ["left", "right"]
        for ri in (0, 1):
            if self.in_count[ri] == 0: break
            trashed = self.in_count[ri] - self.out_count[ri]
            self.kept_reads[ri] = out_fname.replace("%side%", sides[ri])
            th_logp("\t%5s reads: min. length=%s, max. length=%s, %s kept reads (%s discarded)" % (sides[ri], self.min_len[ri], self.max_len[ri], self.out_count[ri], trashed))
|
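# Illustrative only: the parser above expects prep_reads.info to contain
# "key=value" lines, four per read side (min length, max length, reads in,
# reads out), e.g.:
#   min_read_len=50
#   max_read_len=50
#   reads_in =1000000
#   reads_out=998342
# (key names here are illustrative; only the value after '=' is parsed), with
# a second block of the same four values when right reads are present.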
def prep_reads_cmd(params, l_reads_list, l_quals_list=None, r_reads_list=None, r_quals_list=None, out_file=None, aux_file=None,
                   index_file=None, filter_reads=[], hits_to_filter=[]):
    # assemble the prep_reads command-line arguments
    prep_cmd = [prog_path("prep_reads")]

    prep_cmd.extend(params.cmd())

    if params.read_params.reads_format == "fastq":
        prep_cmd += ["--fastq"]
    elif params.read_params.reads_format == "fasta":
        prep_cmd += ["--fasta"]
    if hits_to_filter:
        prep_cmd += ["--flt-hits=" + ",".join(hits_to_filter)]
    if aux_file:
        prep_cmd += ["--aux-outfile=" + aux_file]
    if index_file:
        prep_cmd += ["--index-outfile=" + index_file] # could be a template
    if filter_reads:
        prep_cmd += ["--flt-reads=" + ",".join(filter_reads)]
    if params.sam_header:
        prep_cmd += ["--sam-header=" + params.sam_header]
    if out_file:
        prep_cmd += ["--outfile=" + out_file] # could be a template
    prep_cmd.append(l_reads_list)
    if l_quals_list:
        prep_cmd.append(l_quals_list)
    if r_reads_list:
        prep_cmd.append(r_reads_list)
    if r_quals_list:
        prep_cmd.append(r_quals_list)

    return prep_cmd
|
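# Illustrative only: for a single-end FASTQ run this builds something like
#   prep_reads [params...] --fastq --aux-outfile=<out_dir>/prep_reads.info \
#              --index-outfile=<tmp>/left_kept_reads.bam.index \
#              --outfile=<tmp>/left_kept_reads.bam reads_1.fq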
# Calls the prep_reads executable, which prepares an internal read library.
# The read library features reads with monotonically increasing integer IDs.
# prep_reads also filters out very low-complexity or garbage reads, as well as
# polyA reads.
#--> returns a PrepReadsInfo structure
def prep_reads(params, l_reads_list, l_quals_list, r_reads_list, r_quals_list, prefilter_reads=[]):
    reads_suffix = ".bam"
    use_bam = True

    #if params.read_params.color:
    #    reads_suffix = ".fq"
    #    use_bam = False

    # for parallelization, we don't compress the read files
    do_use_zpacker = use_zpacker and not use_bam
    if do_use_zpacker and params.system_params.num_threads > 1:
        do_use_zpacker = False

    if do_use_zpacker: reads_suffix += ".z"

    out_suffix = "_kept_reads" + reads_suffix
    #kept_reads_filename = tmp_dir + output_name + reads_suffix

    for side in ("left", "right"):
        kept_reads_filename = tmp_dir + side + out_suffix
        if resumeStage < 1 and os.path.exists(kept_reads_filename):
            os.remove(kept_reads_filename)
    out_tmpl = "left"
    out_fname = None
    kept_reads = None # output file handle
    if r_reads_list:
        out_tmpl = "%side%"
    info_file = output_dir + "prep_reads.info"
    if fileExists(info_file, 10) and resumeStage > 0:
        return PrepReadsInfo(info_file, tmp_dir + out_tmpl + out_suffix)

    if use_bam:
        out_fname = tmp_dir + out_tmpl + out_suffix
    else:
        # no right reads are assumed here, only one side is being processed
        kept_reads = open(tmp_dir + out_tmpl + out_suffix, "wb")
    log_fname = logging_dir + "prep_reads.log"
    filter_log = open(log_fname, "w")

    index_file = out_fname + ".index"
    if do_use_zpacker: index_file = None

    prep_cmd = prep_reads_cmd(params, l_reads_list, l_quals_list, r_reads_list, r_quals_list,
                              out_fname, info_file, index_file, prefilter_reads)
    shell_cmd = ' '.join(prep_cmd)
    # finally, add the compression pipe if needed
    zip_cmd = []
    if do_use_zpacker:
        zip_cmd = [params.system_params.zipper]
        zip_cmd.extend(params.system_params.zipper_opts)
        zip_cmd.extend(['-c', '-'])
        shell_cmd += ' | ' + ' '.join(zip_cmd)
    if not use_bam: shell_cmd += ' >' + kept_reads_filename
    retcode = None
    try:
        print >> run_log, shell_cmd
        if do_use_zpacker:
            filter_proc = subprocess.Popen(prep_cmd,
                                           stdout=subprocess.PIPE,
                                           stderr=filter_log)
            zip_proc = subprocess.Popen(zip_cmd,
                                        preexec_fn=subprocess_setup,
                                        stdin=filter_proc.stdout,
                                        stderr=tophat_log, stdout=kept_reads)
            filter_proc.stdout.close() # as per http://bugs.python.org/issue7678
            zip_proc.communicate()
            retcode = filter_proc.poll()
            if retcode == 0:
                retcode = zip_proc.poll()
        else:
            if use_bam:
                retcode = subprocess.call(prep_cmd, stderr=filter_log)
            else:
                retcode = subprocess.call(prep_cmd,
                                          stdout=kept_reads, stderr=filter_log)
        if retcode:
            die(fail_str + "Error running 'prep_reads'\n" + log_tail(log_fname))

    except OSError, o:
        errmsg = fail_str + str(o)
        die(errmsg + "\n" + log_tail(log_fname))

    if kept_reads: kept_reads.close()
    warnings = grep_file(log_fname)
    if warnings:
        th_logp("\n" + "\n".join(warnings) + "\n")
    return PrepReadsInfo(info_file, tmp_dir + out_tmpl + out_suffix)

# Call bowtie
def bowtie(params,
           bwt_idx_prefix,
           sam_headers,
           reads_list,
           reads_format,
           num_mismatches,
           gap_length,
           edit_dist,
           realign_edit_dist,
           mapped_reads,
           unmapped_reads,
           extra_output = "",
           mapping_type = _reads_vs_G,
           multihits_out = None): # only --prefilter-multihits should activate this parameter, for the initial prefilter search
    start_time = datetime.now()
    bwt_idx_name = bwt_idx_prefix.split('/')[-1]
    reads_file = reads_list[0]
    readfile_basename = getFileBaseName(reads_file)

    g_mapping, t_mapping, seg_mapping = False, False, False
    sam_header_filename = None
    genome_sam_header_filename = None
    if mapping_type == _reads_vs_T:
        t_mapping = True
        sam_header_filename = sam_headers[0]
        genome_sam_header_filename = sam_headers[1]
    else:
        sam_header_filename = sam_headers
        if mapping_type >= _segs_vs_G:
            seg_mapping = True
        else:
            g_mapping = True

    bowtie_str = "Bowtie"
    if params.bowtie2:
        bowtie_str += "2"

    if seg_mapping:
        if not params.bowtie2:
            backup_bowtie_alignment_option = params.bowtie_alignment_option
            params.bowtie_alignment_option = "-v"

    resume_skip = resumeStage > currentStage
    unmapped_reads_out = None
    mapped_reads += ".bam"
    if unmapped_reads:
        unmapped_reads_out = unmapped_reads + ".bam"
    use_FIFO = use_BWT_FIFO and use_zpacker and unmapped_reads and params.read_params.color
    if use_FIFO:
        unmapped_reads_out += ".z"
    if resume_skip:
        # this step was already completed, skip it
        return (mapped_reads, unmapped_reads_out)

    bwt_logname = logging_dir + 'bowtie.' + readfile_basename + '.log'

    if t_mapping:
        th_log("Mapping %s to transcriptome %s with %s %s" % (readfile_basename,
               bwt_idx_name, bowtie_str, extra_output))
    else:
        qryname = readfile_basename
        if len(reads_list) > 1:
            bnames = []
            for fname in reads_list:
                bnames += [getFileBaseName(fname)]
            qryname = ",".join(bnames)
        th_log("Mapping %s to genome %s with %s %s" % (qryname,
               bwt_idx_name, bowtie_str, extra_output))

    if use_FIFO:
        global unmapped_reads_fifo
        unmapped_reads_fifo = unmapped_reads + ".fifo"
        if os.path.exists(unmapped_reads_fifo):
            os.remove(unmapped_reads_fifo)
        try:
            os.mkfifo(unmapped_reads_fifo)
        except OSError, o:
            die(fail_str + "Error at mkfifo(" + unmapped_reads_fifo + '). ' + str(o))

    # Launch Bowtie
    try:
        bowtie_cmd = [bowtie_path]
        if reads_format == "fastq":
            bowtie_cmd += ["-q"]
        elif reads_format == "fasta":
            bowtie_cmd += ["-f"]
        if params.read_params.color:
            bowtie_cmd += ["-C", "--col-keepends"]

        unzip_cmd = None
        bam_input = False
        if len(reads_list) > 0 and reads_list[0].endswith('.bam'):
            bam_input = True
            unzip_cmd = [prog_path('bam2fastx'), "--all"]
            if params.read_params.color:
                unzip_cmd.append("--color")
            if reads_format:
                unzip_cmd.append("--" + reads_format)
            unzip_cmd += [reads_list[0]]

        if use_zpacker and (unzip_cmd is None):
            unzip_cmd = [params.system_params.zipper]
            unzip_cmd.extend(params.system_params.zipper_opts)
            unzip_cmd += ['-cd']

        fifo_pid = None
        if use_FIFO:
            unm_zipcmd = [params.system_params.zipper]
            unm_zipcmd.extend(params.system_params.zipper_opts)
            unm_zipcmd += ['-c']
            print >> run_log, ' '.join(unm_zipcmd) + ' < ' + unmapped_reads_fifo + ' > ' + unmapped_reads_out + ' & '
            fifo_pid = os.fork()
            if fifo_pid == 0:
                def on_sig_exit(sig, func=None):
                    os._exit(os.EX_OK)
                signal.signal(signal.SIGTERM, on_sig_exit)
                subprocess.call(unm_zipcmd,
                                stdin=open(unmapped_reads_fifo, "r"),
                                stderr=tophat_log,
                                stdout=open(unmapped_reads_out, "wb"))
                os._exit(os.EX_OK)

        fix_map_cmd = [prog_path('fix_map_ordering')]
        if params.read_params.color:
            fix_map_cmd += ["--color"]

        if params.bowtie2:
            #if t_mapping or g_mapping:
            max_penalty, min_penalty = params.bowtie2_params.mp.split(',')
            max_penalty, min_penalty = int(max_penalty), int(min_penalty)
            min_score = (max_penalty - 1) * realign_edit_dist
            fix_map_cmd += ["--bowtie2-min-score", str(min_score)]
            # testing score filtering
            if params.b2scoreflt:
                fix_map_cmd += ["-W" + str(min_score + max_penalty)]
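            # Worked example (illustrative): with the Bowtie2 default --mp 6,2
            # max_penalty is 6, so for realign_edit_dist = 2 the minimum score
            # passed to fix_map_ordering is (6 - 1) * 2 = 10.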
        fix_map_cmd += ["--read-mismatches", str(params.read_mismatches),
                        "--read-gap-length", str(params.read_gap_length),
                        "--read-edit-dist", str(params.read_edit_dist),
                        "--read-realign-edit-dist", str(params.read_realign_edit_dist)]

        # write BAM file
        out_bam = mapped_reads

        if not t_mapping:
            fix_map_cmd += ["--index-outfile", mapped_reads + ".index"]
        if not params.bowtie2:
            fix_map_cmd += ["--bowtie1"]
        if multihits_out != None:
            fix_map_cmd += ["--aux-outfile", params.preflt_data[multihits_out].multihit_reads]
            fix_map_cmd += ["--max-multihits", str(params.max_hits)]
        if t_mapping:
            out_bam = "-" # we'll pipe into map2gtf
        fix_map_cmd += ["--sam-header", sam_header_filename, "-", out_bam]
        if unmapped_reads:
            fix_map_cmd += [unmapped_reads_out]
        if t_mapping:
            max_hits = params.t_max_hits
        elif seg_mapping:
            max_hits = params.max_seg_hits
        else:
            max_hits = params.max_hits

        if num_mismatches > 3:
            num_mismatches = 3

        if params.bowtie2:
            if seg_mapping or multihits_out != None:
                # since bowtie2 does not suppress reads that map to too many places,
                # we suppress those in segment_juncs and long_spanning_reads.
                bowtie_cmd += ["-k", str(max_hits + 1)]
            else:
                bowtie_cmd += ["-k", str(max_hits)]

            bowtie2_params = params.bowtie2_params
            if seg_mapping:
                # after intensive testing, the following parameters seem to run
                # faster than Bowtie1 while remaining as sensitive as Bowtie1,
                # but room for further improvement remains.
                bowtie_cmd += ["-N", str(min(num_mismatches, 1))]
                bowtie_cmd += ["-i", "C,2,0"]
                bowtie_cmd += ["-L", "20"]
                # bowtie_cmd += ["-L", str(min(params.segment_length, 20))]
            else:
                bowtie2_preset = ""
                if bowtie2_params.very_fast:
                    bowtie2_preset = "--very-fast"
                elif bowtie2_params.fast:
                    bowtie2_preset = "--fast"
                elif bowtie2_params.sensitive:
                    bowtie2_preset = "--sensitive"
                elif bowtie2_params.very_sensitive:
                    bowtie2_preset = "--very-sensitive"

                if bowtie2_preset != "":
                    bowtie_cmd += [bowtie2_preset]
                else:
                    bowtie_cmd += ["-D", str(bowtie2_params.D),
                                   "-R", str(bowtie2_params.R),
                                   "-N", str(bowtie2_params.N),
                                   "-L", str(bowtie2_params.L),
                                   "-i", bowtie2_params.i]

                score_min = bowtie2_params.score_min
                if not score_min:
                    max_penalty, min_penalty = bowtie2_params.mp.split(',')
                    score_min = "C,-%d,0" % (int(max_penalty) * edit_dist + 2)
|
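                # Illustrative: with the default --mp 6,2 and edit_dist = 2 this
                # yields score_min "C,-14,0" (a constant threshold of -14).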
                # "--n-ceil" is not correctly parsed in Bowtie2;
                # I (daehwan) already talked to Ben, who will fix the problem.
                bowtie_cmd += [# "--n-ceil", bowtie2_params.n_ceil,
                               "--gbar", str(bowtie2_params.gbar),
                               "--mp", bowtie2_params.mp,
                               "--np", str(bowtie2_params.np),
                               "--rdg", bowtie2_params.rdg,
                               "--rfg", bowtie2_params.rfg,
                               "--score-min", score_min]

        else:
            bowtie_cmd += [params.bowtie_alignment_option, str(num_mismatches),
                           "-k", str(max_hits),
                           "-m", str(max_hits),
                           "-S"]

        bowtie_cmd += ["-p", str(params.system_params.num_threads)]

        if params.bowtie2: # always use a headerless SAM file
            bowtie_cmd += ["--sam-no-hd"]
        else:
            bowtie_cmd += ["--sam-nohead"]

        if not params.bowtie2:
            if multihits_out != None:
                bowtie_cmd += ["--max", params.preflt_data[multihits_out].multihit_reads]
            else:
                bowtie_cmd += ["--max", "/dev/null"]

        if params.bowtie2:
            bowtie_cmd += ["-x"]

        bowtie_cmd += [bwt_idx_prefix]
        bowtie_proc = None
        shellcmd = ""
        unzip_proc = None

        if multihits_out != None:
            # special prefilter bowtie run: we use prep_reads on the fly
            # in order to get multi-mapped reads to exclude later
            prep_cmd = prep_reads_cmd(params, params.preflt_data[0].seqfiles, params.preflt_data[0].qualfiles,
                                      params.preflt_data[1].seqfiles, params.preflt_data[1].qualfiles)
            prep_cmd.insert(1, "--flt-side=" + str(multihits_out))
            sides = ["left", "right"]
            preplog_fname = logging_dir + "prep_reads.prefilter_%s.log" % sides[multihits_out]
            prepfilter_log = open(preplog_fname, "w")
            unzip_proc = subprocess.Popen(prep_cmd,
                                          stdout=subprocess.PIPE,
                                          stderr=prepfilter_log)
            shellcmd = ' '.join(prep_cmd) + "|"
        else:
            z_input = use_zpacker and reads_file.endswith(".z")
            if z_input:
                unzip_proc = subprocess.Popen(unzip_cmd,
                                              stdin=open(reads_file, "rb"),
                                              stderr=tophat_log, stdout=subprocess.PIPE)
                shellcmd = ' '.join(unzip_cmd) + "< " + reads_file + "|"
            else:
                # must be uncompressed fastq input (unmapped reads from a previous run)
                # or a BAM file with unmapped reads
                if bam_input:
                    unzip_proc = subprocess.Popen(unzip_cmd, stderr=tophat_log, stdout=subprocess.PIPE)
                    shellcmd = ' '.join(unzip_cmd) + "|"
                else:
                    bowtie_cmd += [reads_file]
        if not unzip_proc:
            bowtie_proc = subprocess.Popen(bowtie_cmd,
                                           stdout=subprocess.PIPE,
                                           stderr=open(bwt_logname, "w"))
        if unzip_proc:
            # input is compressed OR prep_reads is used as a filter
            bowtie_cmd += ['-']
            bowtie_proc = subprocess.Popen(bowtie_cmd,
                                           stdin=unzip_proc.stdout,
                                           stdout=subprocess.PIPE,
                                           stderr=open(bwt_logname, "w"))
            unzip_proc.stdout.close() # see http://bugs.python.org/issue7678

        shellcmd += ' '.join(bowtie_cmd) + '|' + ' '.join(fix_map_cmd)
        pipeline_proc = None
        fix_order_proc = None
        # write BAM format directly
        if t_mapping:
            # pipe into map2gtf
            fix_order_proc = subprocess.Popen(fix_map_cmd,
                                              stdin=bowtie_proc.stdout,
                                              stdout=subprocess.PIPE,
                                              stderr=tophat_log)
            bowtie_proc.stdout.close()
            m2g_cmd = [prog_path("map2gtf")]
            m2g_cmd += ["--sam-header", genome_sam_header_filename]
            m2g_cmd.append(params.gff_annotation)
            m2g_cmd.append("-") # incoming uncompressed BAM stream
            m2g_cmd.append(mapped_reads)
            m2g_log = logging_dir + "m2g_" + readfile_basename + ".out"
            m2g_err = logging_dir + "m2g_" + readfile_basename + ".err"
            shellcmd += ' | ' + ' '.join(m2g_cmd) + ' > ' + m2g_log
            pipeline_proc = subprocess.Popen(m2g_cmd,
                                             stdin=fix_order_proc.stdout,
                                             stdout=open(m2g_log, "w"),
                                             stderr=open(m2g_err, "w"))
            fix_order_proc.stdout.close()
        else:
            fix_order_proc = subprocess.Popen(fix_map_cmd,
                                              stdin=bowtie_proc.stdout,
                                              stderr=tophat_log)
            bowtie_proc.stdout.close()
            pipeline_proc = fix_order_proc

        print >> run_log, shellcmd
        retcode = None
        if pipeline_proc:
            pipeline_proc.communicate()
            retcode = pipeline_proc.returncode
            bowtie_proc.wait()
            r = bowtie_proc.returncode
            if r:
                die(fail_str + "Error running bowtie:\n" + log_tail(bwt_logname, 100))
        if use_FIFO:
            if fifo_pid and not os.path.exists(unmapped_reads_out):
                try:
                    os.kill(fifo_pid, signal.SIGTERM)
                except:
                    pass
        if retcode:
            die(fail_str + "Error running:\n" + shellcmd)
    except OSError, o:
        die(fail_str + "Error: " + str(o))

    # Success
    #finish_time = datetime.now()
    #duration = finish_time - start_time
    #print >> sys.stderr, "\t\t\t[%s elapsed]" % formatTD(duration)
    if use_FIFO:
        try:
            os.remove(unmapped_reads_fifo)
        except:
            pass
    if multihits_out != None and not os.path.exists(params.preflt_data[multihits_out].multihit_reads):
        open(params.preflt_data[multihits_out].multihit_reads, "w").close()

    if seg_mapping:
        if not params.bowtie2:
            params.bowtie_alignment_option = backup_bowtie_alignment_option

    return (mapped_reads, unmapped_reads_out)

# Retrieve a .juncs file from a GFF file by calling the gtf_juncs executable
def get_gtf_juncs(gff_annotation):
    th_log("Reading known junctions from GTF file")
    gtf_juncs_log = open(logging_dir + "gtf_juncs.log", "w")

    gff_prefix = gff_annotation.split('/')[-1].split('.')[0]

    gtf_juncs_out_name = tmp_dir + gff_prefix + ".juncs"
    gtf_juncs_out = open(gtf_juncs_out_name, "w")

    gtf_juncs_cmd = [prog_path("gtf_juncs"), gff_annotation]
    try:
        print >> run_log, " ".join(gtf_juncs_cmd), " > " + gtf_juncs_out_name
        retcode = subprocess.call(gtf_juncs_cmd,
                                  stderr=gtf_juncs_log,
                                  stdout=gtf_juncs_out)
        # gtf_juncs returned an error
        if retcode == 1:
            th_logp("\tWarning: TopHat did not find any junctions in GTF file")
            return (False, gtf_juncs_out_name)
        elif retcode != 0:
            die(fail_str + "Error: GTF junction extraction failed with err =" + str(retcode))

    # gtf_juncs not found
    except OSError, o:
        errmsg = fail_str + str(o) + "\n"
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            errmsg += "Error: gtf_juncs not found on this system"
        die(errmsg)
    return (True, gtf_juncs_out_name)
|
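# Illustrative only: the .juncs file written above uses the raw-junction format
# also accepted by -j/--raw-juncs, one tab-separated record per line:
#   <chrom>  <left>  <right>  <+/->
# where left and right are zero-based coordinates of the last base of the
# donor exon and the first base of the acceptor exon.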
# Call bowtie-build on the FASTA file of synthetic splice junction sequences
def build_juncs_bwt_index(is_bowtie2, external_splice_prefix, color):
    th_log("Indexing splices")
    bowtie_build_log = open(logging_dir + "bowtie_build.log", "w")

    #user_splices_out_prefix = output_dir + "user_splices_idx"

    if is_bowtie2:
        bowtie_build_cmd = [prog_path("bowtie2-build")]
    else:
        bowtie_build_cmd = [prog_path("bowtie-build")]

    if color:
        bowtie_build_cmd += ["-C"]

    bowtie_build_cmd += [external_splice_prefix + ".fa",
                         external_splice_prefix]
    try:
        print >> run_log, " ".join(bowtie_build_cmd)
        retcode = subprocess.call(bowtie_build_cmd,
                                  stdout=bowtie_build_log)

        if retcode != 0:
            die(fail_str + "Error: Splice sequence indexing failed with err =" + str(retcode))
    except OSError, o:
        errmsg = fail_str + str(o) + "\n"
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            errmsg += "Error: bowtie-build not found on this system"
        die(errmsg)
    return external_splice_prefix

# Build a splice index from a .juncs file, suitable for use with specified read
# (or read segment) lengths
def build_juncs_index(is_bowtie2,
                      min_anchor_length,
                      max_seg_len,
                      juncs_prefix,
                      external_juncs,
                      external_insertions,
                      external_deletions,
                      external_fusions,
                      reference_fasta,
                      color):
    th_log("Retrieving sequences for splices")
    juncs_file_list = ",".join(external_juncs)
    insertions_file_list = ",".join(external_insertions)
    deletions_file_list = ",".join(external_deletions)
    fusions_file_list = ",".join(external_fusions)

    # do not use insertions and deletions in case of Bowtie2
    if is_bowtie2:
        insertions_file_list = "/dev/null"
        deletions_file_list = "/dev/null"

    juncs_db_log = open(logging_dir + "juncs_db.log", "w")

    external_splices_out_prefix = tmp_dir + juncs_prefix
    external_splices_out_name = external_splices_out_prefix + ".fa"

    external_splices_out = open(external_splices_out_name, "w")
    # juncs_db_cmd = [bin_dir + "juncs_db",
    juncs_db_cmd = [prog_path("juncs_db"),
                    str(min_anchor_length),
                    str(max_seg_len),
                    juncs_file_list,
                    insertions_file_list,
                    deletions_file_list,
                    fusions_file_list,
                    reference_fasta]
    try:
        print >> run_log, " ".join(juncs_db_cmd) + " > " + external_splices_out_name
        retcode = subprocess.call(juncs_db_cmd,
                                  stderr=juncs_db_log,
                                  stdout=external_splices_out)

        if retcode != 0:
            die(fail_str + "Error: Splice sequence retrieval failed with err =" + str(retcode))
    # juncs_db not found
    except OSError, o:
        errmsg = fail_str + str(o) + "\n"
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            errmsg += "Error: juncs_db not found on this system"
        die(errmsg)

    external_splices_out_prefix = build_juncs_bwt_index(is_bowtie2, external_splices_out_prefix, color)
    return external_splices_out_prefix

def build_idx_from_fa(is_bowtie2, fasta_fname, out_dir, color):
    """ Build a bowtie index from a FASTA file.

    Arguments:
    - `fasta_fname`: File path to FASTA file.
    - `out_dir`: Output directory to place index in. (includes os.sep)

    Returns:
    - The path to the Bowtie index.
    """
    bwt_idx_path = out_dir + os.path.basename(fasta_fname).replace(".fa", "")

    if is_bowtie2:
        bowtie_idx_cmd = [prog_path("bowtie2-build")]
    else:
        bowtie_idx_cmd = [prog_path("bowtie-build")]

    if color:
        bowtie_idx_cmd += ["-C"]

    bowtie_idx_cmd += [fasta_fname,
                       bwt_idx_path]
    try:
        th_log("Building Bowtie index from " + os.path.basename(fasta_fname))
        print >> run_log, " ".join(bowtie_idx_cmd)
        retcode = subprocess.call(bowtie_idx_cmd,
                                  stdout=open(os.devnull, "w"),
                                  stderr=open(os.devnull, "w"))
        if retcode != 0:
            die(fail_str + "Error: Couldn't build bowtie index with err = "
                + str(retcode))
    except OSError, o:
        errmsg = fail_str + str(o) + "\n"
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            errmsg += "Error: bowtie-build not found on this system"
        die(errmsg)

    return bwt_idx_path

# Print out the sam header, embedding the user's specified library properties.
# FIXME: also needs SQ dictionary lines
def write_sam_header(read_params, sam_file):
    print >> sam_file, "@HD\tVN:1.0\tSO:coordinate"
    if read_params.read_group_id and read_params.sample_id:
        rg_str = "@RG\tID:%s\tSM:%s" % (read_params.read_group_id,
                                        read_params.sample_id)
        if read_params.library_id:
            rg_str += "\tLB:%s" % read_params.library_id
        if read_params.description:
            rg_str += "\tDS:%s" % read_params.description
        if read_params.seq_platform_unit:
            rg_str += "\tPU:%s" % read_params.seq_platform_unit
        if read_params.seq_center:
            rg_str += "\tCN:%s" % read_params.seq_center
        if read_params.mate_inner_dist:
            rg_str += "\tPI:%s" % read_params.mate_inner_dist
        if read_params.seq_run_date:
            rg_str += "\tDT:%s" % read_params.seq_run_date
        if read_params.seq_platform:
            rg_str += "\tPL:%s" % read_params.seq_platform

        print >> sam_file, rg_str
    print >> sam_file, "@PG\tID:TopHat\tVN:%s\tCL:%s" % (get_version(), run_cmd)

# Write final TopHat output, via tophat_reports and wiggles
def compile_reports(params, sam_header_filename, ref_fasta, mappings, readfiles, gff_annotation):
    th_log("Reporting output tracks")
    left_maps, right_maps = mappings
    left_reads, right_reads = readfiles
    # left_maps = [x for x in left_maps if (os.path.exists(x) and os.path.getsize(x) > 25)]
    left_maps = ','.join(left_maps)

    if len(right_maps) > 0:
        # right_maps = [x for x in right_maps if (os.path.exists(x) and os.path.getsize(x) > 25)]
        right_maps = ','.join(right_maps)

    log_fname = logging_dir + "reports.log"
    report_log = open(log_fname, "w")
    junctions = output_dir + "junctions.bed"
    insertions = output_dir + "insertions.bed"
    deletions = output_dir + "deletions.bed"
    accepted_hits = output_dir + "accepted_hits"
    report_cmdpath = prog_path("tophat_reports")
    fusions = output_dir + "fusions.out"
    report_cmd = [report_cmdpath]

    alignments_output_filename = tmp_dir + "accepted_hits"

    report_cmd.extend(params.cmd())
    report_cmd += ["--sam-header", sam_header_filename]
    if params.report_secondary_alignments:
        report_cmd += ["--report-secondary-alignments"]

    if params.report_discordant_pair_alignments:
        report_cmd += ["--report-discordant-pair-alignments"]

    if params.report_mixed_alignments:
        report_cmd += ["--report-mixed-alignments"]

    report_cmd.extend(["--samtools=" + samtools_path])

    b2_params = params.bowtie2_params
    max_penalty, min_penalty = b2_params.mp.split(',')
    report_cmd += ["--bowtie2-max-penalty", max_penalty,
                   "--bowtie2-min-penalty", min_penalty]

    report_cmd += ["--bowtie2-penalty-for-N", str(b2_params.np)]

    read_gap_open, read_gap_cont = b2_params.rdg.split(',')
    report_cmd += ["--bowtie2-read-gap-open", read_gap_open,
                   "--bowtie2-read-gap-cont", read_gap_cont]

    ref_gap_open, ref_gap_cont = b2_params.rfg.split(',')
    report_cmd += ["--bowtie2-ref-gap-open", ref_gap_open,
                   "--bowtie2-ref-gap-cont", ref_gap_cont]

    report_cmd.extend([ref_fasta,
                       junctions,
                       insertions,
                       deletions,
                       fusions,
                       alignments_output_filename,
                       left_maps,
                       left_reads])

    if len(right_maps) > 0 and right_reads:
        report_cmd.append(right_maps)
        report_cmd.append(right_reads)

    try:
        print >> run_log, " ".join(report_cmd)
        report_proc = subprocess.call(report_cmd,
                                      preexec_fn=subprocess_setup,
                                      stderr=report_log)
        if report_proc != 0:
            die(fail_str + "Error running " + " ".join(report_cmd) + "\n" + log_tail(log_fname))
        bam_parts = []
        for i in range(params.system_params.num_threads):
            bam_part_filename = "%s%d.bam" % (alignments_output_filename, i)
            if os.path.exists(bam_part_filename):
                bam_parts.append(bam_part_filename)
            else:
                break
        num_bam_parts = len(bam_parts)

        if params.report_params.sort_bam:
            pids = [0 for i in range(num_bam_parts)]
            sorted_bam_parts = ["%s%d_sorted" % (alignments_output_filename, i) for i in range(num_bam_parts)]
            #left_um_parts = ["%s%s%d_sorted" % (alignments_output_filename, i) for i in range(num_bam_parts)]
            #right_um_parts = ["%s%d_sorted" % (alignments_output_filename, i) for i in range(num_bam_parts)]
            for i in range(num_bam_parts):
                bamsort_cmd = [samtools_path,
                               "sort",
                               bam_parts[i],
                               sorted_bam_parts[i]]

                sorted_bam_parts[i] += ".bam"
                print >> run_log, " ".join(bamsort_cmd)

                # sort all but the last part in forked children so the sorts
                # run in parallel; the original test "i + 1 < range(num_bam_parts)"
                # compared an int to a list, which is always True in Python 2
                if i + 1 < num_bam_parts:
                    pid = os.fork()
                    if pid == 0:
                        subprocess.call(bamsort_cmd,
                                        stderr=open(logging_dir + "reports.samtools_sort.log%d" % i, "w"))
                        os._exit(os.EX_OK)
                    else:
                        pids[i] = pid
                else:
                    subprocess.call(bamsort_cmd,
                                    stderr=open(logging_dir + "reports.samtools_sort.log%d" % i, "w"))

            for i in range(len(pids)):
                if pids[i] > 0:
                    os.waitpid(pids[i], 0)
                    pids[i] = 0

            for bam_part in bam_parts:
                os.remove(bam_part)
            bam_parts = sorted_bam_parts[:]
        #-- endif sort_bam

        if num_bam_parts > 1:
            if params.report_params.sort_bam:
                bammerge_cmd = [samtools_path,
                                "merge", "-f", "-h", sam_header_filename]
                if not params.report_params.convert_bam:
                    bammerge_cmd += ["-u"]
            else: # not sorted, so just raw merge
                bammerge_cmd = [prog_path("bam_merge"), "-Q",
                                "--sam-header", sam_header_filename]

            if params.report_params.convert_bam:
                bammerge_cmd += ["%s.bam" % accepted_hits]
                bammerge_cmd += bam_parts
                print >> run_log, " ".join(bammerge_cmd)
                subprocess.call(bammerge_cmd,
                                stderr=open(logging_dir + "reports.merge_bam.log", "w"))
            else: # make .sam
                bammerge_cmd += ["-"]
                bammerge_cmd += bam_parts
                merge_proc = subprocess.Popen(bammerge_cmd,
                                              stdout=subprocess.PIPE,
                                              stderr=open(logging_dir + "reports.merge_bam.log", "w"))
                bam2sam_cmd = [samtools_path, "view", "-h", "-"]
                sam_proc = subprocess.Popen(bam2sam_cmd,
                                            stdin=merge_proc.stdout,
                                            stdout=open(accepted_hits + ".sam", "w"),
                                            stderr=open(logging_dir + "accepted_hits_bam_to_sam.log", "w"))
                merge_proc.stdout.close()
                shellcmd = " ".join(bammerge_cmd) + " | " + " ".join(bam2sam_cmd)
                print >> run_log, shellcmd
                sam_proc.communicate()
                retcode = sam_proc.returncode
                if retcode:
                    die(fail_str + "Error running:\n" + shellcmd)
        else: # only one file
            os.rename(bam_parts[0], accepted_hits + ".bam")
            if not params.report_params.convert_bam:
                # just convert to .sam
                bam2sam_cmd = [samtools_path, "view", "-h", accepted_hits + ".bam"]
                shellcmd = " ".join(bam2sam_cmd) + " > " + accepted_hits + ".sam"
                print >> run_log, shellcmd
                r = subprocess.call(bam2sam_cmd,
                                    stdout=open(accepted_hits + ".sam", "w"),
                                    stderr=open(logging_dir + "accepted_hits_bam_to_sam.log", "w"))
                if r != 0:
                    die(fail_str + "Error running: " + shellcmd)
                os.remove(accepted_hits + ".bam")

    except OSError, o:
        die(fail_str + "Error: " + str(o) + "\n" + log_tail(log_fname))

    try:
        # -- merge the unmapped files
        um_parts = []
        um_merged = output_dir + "unmapped.bam"
        for i in range(params.system_params.num_threads):
            left_um_file = tmp_dir + "unmapped_left_%d.bam" % i
            right_um_file = tmp_dir + "unmapped_right_%d.bam" % i
            if nonzeroFile(left_um_file):
                um_parts.append(left_um_file)
            if right_reads and nonzeroFile(right_um_file):
                um_parts.append(right_um_file)

        if len(um_parts) > 0:
            if len(um_parts) == 1:
                os.rename(um_parts[0], um_merged)
            else:
                merge_cmd = [prog_path("bam_merge"), "-Q",
                             "--sam-header", sam_header_filename, um_merged]
                merge_cmd += um_parts
                print >> run_log, " ".join(merge_cmd)
                ret = subprocess.call(merge_cmd,
                                      stderr=open(logging_dir + "bam_merge_um.log", "w"))
                if ret != 0:
                    die(fail_str + "Error executing: " + " ".join(merge_cmd) + "\n" + log_tail(logging_dir + "bam_merge_um.log"))
    except OSError, o:
        die(fail_str + "Error: " + str(o) + "\n" + log_tail(log_fname))

    return junctions

# Split up each read in a FASTQ file into multiple segments and create a FASTQ
# file for each segment. This function needs to be fixed to support mixed read
# length inputs.
def open_output_files(prefix, num_files_prev, num_files, out_segf, extension, params):
    i = num_files_prev + 1
    while i <= num_files:
        segfname = prefix + ("_seg%d" % i) + extension
        out_segf.append(ZWriter(segfname, params.system_params))
        i += 1

def split_reads(reads_filename,
                prefix,
                fasta,
                params,
                segment_length):
    #reads_file = open(reads_filename)
    out_segfiles = []
    if fasta:
        extension = ".fa"
    else:
        extension = ".fq"
    if use_zpacker: extension += ".z"
    existing_seg_files = glob.glob(prefix + "_seg*" + extension)
    if resumeStage > currentStage and len(existing_seg_files) > 0:
        # skip this step and return the existing files
        return existing_seg_files
    zreads = ZReader(reads_filename, params, False)

    def convert_color_to_bp(color_seq):
        decode_dic = { 'A0':'A', 'A1':'C', 'A2':'G', 'A3':'T', 'A4':'N', 'A.':'N', 'AN':'N',
                       'C0':'C', 'C1':'A', 'C2':'T', 'C3':'G', 'C4':'N', 'C.':'N', 'CN':'N',
                       'G0':'G', 'G1':'T', 'G2':'A', 'G3':'C', 'G4':'N', 'G.':'N', 'GN':'N',
                       'T0':'T', 'T1':'G', 'T2':'C', 'T3':'A', 'T4':'N', 'T.':'N', 'TN':'N',
                       'N0':'N', 'N1':'N', 'N2':'N', 'N3':'N', 'N4':'N', 'N.':'N', 'NN':'N',
                       '.0':'N', '.1':'N', '.2':'N', '.3':'N', '.4':'N', '..':'N', '.N':'N' }

        base = color_seq[0]
        bp_seq = base
        for ch in color_seq[1:]:
            base = decode_dic[base + ch]
            bp_seq += base
        return bp_seq

    def convert_bp_to_color(bp_seq):
        encode_dic = { 'AA':'0', 'CC':'0', 'GG':'0', 'TT':'0',
                       'AC':'1', 'CA':'1', 'GT':'1', 'TG':'1',
                       'AG':'2', 'CT':'2', 'GA':'2', 'TC':'2',
                       'AT':'3', 'CG':'3', 'GC':'3', 'TA':'3',
                       'A.':'4', 'C.':'4', 'G.':'4', 'T.':'4',
                       '.A':'4', '.C':'4', '.G':'4', '.T':'4',
                       '.N':'4', 'AN':'4', 'CN':'4', 'GN':'4',
                       'TN':'4', 'NA':'4', 'NC':'4', 'NG':'4',
                       'NT':'4', 'NN':'4', 'N.':'4', '..':'4' }

        base = bp_seq[0]
        color_seq = base
        for ch in bp_seq[1:]:
            color_seq += encode_dic[base + ch]
            base = ch

        return color_seq
|
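    # Illustrative round trip between base space and SOLiD color space:
    #   convert_bp_to_color("ATCG")  -> "A323"
    #   convert_color_to_bp("A323")  -> "ATCG"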
    def split_record(read_name, read_seq, read_qual, out_segf, offsets, color):
        if color:
            color_offset = 1
            read_seq_temp = convert_color_to_bp(read_seq)

            seg_num = 1
            while seg_num + 1 < len(offsets):
                if read_seq[offsets[seg_num] + 1] not in ['0', '1', '2', '3']:
                    return
                seg_num += 1
        else:
            color_offset = 0

        seg_num = 0
        last_seq_offset = 0
        while seg_num + 1 < len(offsets):
            f = out_segf[seg_num].file
            seg_seq = read_seq[last_seq_offset + color_offset:offsets[seg_num + 1] + color_offset]
            print >> f, "%s|%d:%d:%d" % (read_name, last_seq_offset, seg_num, len(offsets) - 1)
            if color:
                print >> f, "%s%s" % (read_seq_temp[last_seq_offset], seg_seq)
            else:
                print >> f, seg_seq
            if not fasta:
                seg_qual = read_qual[last_seq_offset:offsets[seg_num + 1]]
                print >> f, "+"
                print >> f, seg_qual
            seg_num += 1
            last_seq_offset = offsets[seg_num]

    line_state = 0
    read_name = ""
    read_seq = ""
    read_quals = ""
    num_segments = 0
    offsets = []
    for line in zreads.file:
        if line.strip() == "":
            continue
        if line_state == 0:
            read_name = line.strip()
        elif line_state == 1:
            read_seq = line.strip()

            read_length = len(read_seq)
            tmp_num_segments = read_length / segment_length
            offsets = [segment_length * i for i in range(0, tmp_num_segments + 1)]

            # Bowtie's minimum read length here is 20bp, so if the last segment
            # is between 20 and segment_length bp long, go ahead and write it out
            if read_length % segment_length >= min(segment_length - 2, 20):
                offsets.append(read_length)
                tmp_num_segments += 1
            else:
                offsets[-1] = read_length

            if tmp_num_segments == 1:
                offsets = [0, read_length]
|
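            # Worked example (illustrative), with segment_length=25:
            #   read_length=100 -> offsets [0, 25, 50, 75, 100] (4 segments of 25bp)
            #   read_length=95  -> offsets [0, 25, 50, 75, 95]  (last segment 20bp)
            #   read_length=90  -> offsets [0, 25, 50, 90]      (the 15bp remainder is
            #                                                    merged into a 40bp
            #                                                    last segment)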
            if tmp_num_segments > num_segments:
                open_output_files(prefix, num_segments, tmp_num_segments, out_segfiles, extension, params)
                num_segments = tmp_num_segments

            if fasta:
                split_record(read_name, read_seq, None, out_segfiles, offsets, params.read_params.color)
        elif line_state == 2:
            line = line.strip()
        else:
            read_quals = line.strip()
            if not fasta:
                split_record(read_name, read_seq, read_quals, out_segfiles, offsets, params.read_params.color)

        line_state += 1
        if fasta:
            line_state %= 2
        else:
            line_state %= 4
    zreads.close()
    out_fnames = []
    for zf in out_segfiles:
        zf.close()
        out_fnames.append(zf.fname)
    #return [o.fname for o in out_segfiles]
    return out_fnames

2949 |
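# Worked example of the segmentation above (illustrative numbers): a 70bp read
# with segment_length=25 gives read_length / segment_length == 2 and initial
# offsets [0, 25, 50]; the 20bp remainder passes the min(segment_length - 2, 20)
# test, so offsets become [0, 25, 50, 70] and three segment records of 25, 25
# and 20bp are written. A 60bp read instead folds its 10bp remainder into the
# last segment: offsets [0, 25, 60].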
# Find possible splice junctions using the "closure search" strategy, and report
# them in closure.juncs. Calls the executable closure_juncs
def junctions_from_closures(params,
                            sam_header_filename,
                            left_maps,
                            right_maps,
                            ref_fasta):
    th_log("Searching for junctions via mate-pair closures")

    #maps = [x for x in seg_maps if (os.path.exists(x) and os.path.getsize(x) > 0)]
    #if len(maps) == 0:
    #    return None
    slash = left_maps[0].rfind('/')
    juncs_out = ""
    if slash != -1:
        juncs_out += left_maps[0][:slash+1]
    fusions_out = juncs_out

    juncs_out += "closure.juncs"
    fusions_out += "closure.fusions"

    juncs_log = open(logging_dir + "closure.log", "w")
    juncs_cmdpath = prog_path("closure_juncs")
    juncs_cmd = [juncs_cmdpath]

    left_maps = ','.join(left_maps)
    right_maps = ','.join(right_maps)

    juncs_cmd.extend(params.cmd())
    juncs_cmd.extend(["--sam-header", sam_header_filename,
                      juncs_out,
                      fusions_out,
                      ref_fasta,
                      left_maps,
                      right_maps])
    try:
        print >> run_log, ' '.join(juncs_cmd)
        retcode = subprocess.call(juncs_cmd,
                                  stderr=juncs_log)

        # closure_juncs returned an error
        if retcode != 0:
            die(fail_str+"Error: closure-based junction search failed with err ="+str(retcode))
    # closure_juncs executable not found
    except OSError, o:
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            th_logp(fail_str + "Error: closure_juncs not found on this system")
        die(str(o))
    return [juncs_out]

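# For reference, the command assembled above has the shape (paths illustrative):
#   closure_juncs <params.cmd() flags> --sam-header <header.sam> \
#       <dir>/closure.juncs <dir>/closure.fusions <ref.fa> \
#       <left_map1,left_map2,...> <right_map1,right_map2,...>
# where the map lists are the comma-joined left_maps/right_maps built above.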
# Find possible junctions by examining coverage and split segments in the initial
# map and segment maps. Report junctions, insertions, and deletions in segment.juncs,
# segment.insertions, and segment.deletions. Calls the executable
# segment_juncs
def junctions_from_segments(params,
                            sam_header_filename,
                            left_reads,
                            left_reads_map,
                            left_seg_maps,
                            right_reads,
                            right_reads_map,
                            right_seg_maps,
                            unmapped_reads,
                            reads_format,
                            ref_fasta):
    # if left_reads_map != left_seg_maps[0]:

    out_path = getFileDir(left_seg_maps[0])
    juncs_out = out_path+"segment.juncs"
    insertions_out = out_path+"segment.insertions"
    deletions_out  = out_path+"segment.deletions"
    fusions_out = out_path+"segment.fusions"
    if resumeStage > currentStage and fileExists(juncs_out):
        return [juncs_out, insertions_out, deletions_out, fusions_out]
    th_log("Searching for junctions via segment mapping")
    if params.coverage_search == True:
        print >> sys.stderr, "\tCoverage-search algorithm is turned on, making this step very slow"
        print >> sys.stderr, "\tPlease try running TopHat again with the option (--no-coverage-search) if this step takes too much time or memory."

    left_maps = ','.join(left_seg_maps)
    log_fname = logging_dir + "segment_juncs.log"
    segj_log = open(log_fname, "w")
    segj_cmd = [prog_path("segment_juncs")]

    segj_cmd.extend(params.cmd())
    segj_cmd.extend(["--sam-header", sam_header_filename,
                     "--ium-reads", ",".join(unmapped_reads),
                     ref_fasta,
                     juncs_out,
                     insertions_out,
                     deletions_out,
                     fusions_out,
                     left_reads,
                     left_reads_map,
                     left_maps])
    if right_seg_maps:
        right_maps = ','.join(right_seg_maps)
        segj_cmd.extend([right_reads, right_reads_map, right_maps])
    try:
        print >> run_log, " ".join(segj_cmd)
        retcode = subprocess.call(segj_cmd,
                                  preexec_fn=subprocess_setup,
                                  stderr=segj_log)

        # segment_juncs returned an error
        if retcode != 0:
            die(fail_str+"Error: segment-based junction search failed with err ="+str(retcode)+"\n"+log_tail(log_fname))

    # segment_juncs executable not found
    except OSError, o:
        if o.errno == errno.ENOTDIR or o.errno == errno.ENOENT:
            th_logp(fail_str + "Error: segment_juncs not found on this system")
        die(str(o))

    return [juncs_out, insertions_out, deletions_out, fusions_out]

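# The four files returned above feed the junction/insertion/deletion/fusion
# databases assembled in spliced_alignment(); callers check os.path.getsize()
# before using each one, since segment_juncs legitimately leaves a file empty
# when no events of that class were found.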
# Joins mapped segments into full-length read alignments via the executable
# long_spanning_reads
def join_mapped_segments(params,
                         sam_header_filename,
                         reads,
                         ref_fasta,
                         possible_juncs,
                         possible_insertions,
                         possible_deletions,
                         possible_fusions,
                         contig_seg_maps,
                         spliced_seg_maps,
                         alignments_out_name):
    rn = ""
    if len(contig_seg_maps) > 1:
        th_log("Joining segment hits")
        rn = ".segs"
    else:
        th_log("Processing bowtie hits")
    contig_seg_maps = ','.join(contig_seg_maps)

    possible_juncs = ','.join(possible_juncs)
    possible_insertions = ",".join(possible_insertions)
    possible_deletions = ",".join(possible_deletions)
    possible_fusions = ",".join(possible_fusions)

    log_fname = logging_dir + "long_spanning_reads"+rn+".log"
    align_log = open(log_fname, "w")
    align_cmd = [prog_path("long_spanning_reads")]

    align_cmd.extend(params.cmd())
    align_cmd += ["--sam-header", sam_header_filename]

    b2_params = params.bowtie2_params
    max_penalty, min_penalty = b2_params.mp.split(',')
    align_cmd += ["--bowtie2-max-penalty", max_penalty,
                  "--bowtie2-min-penalty", min_penalty]

    align_cmd += ["--bowtie2-penalty-for-N", str(b2_params.np)]

    read_gap_open, read_gap_cont = b2_params.rdg.split(',')
    align_cmd += ["--bowtie2-read-gap-open", read_gap_open,
                  "--bowtie2-read-gap-cont", read_gap_cont]

    ref_gap_open, ref_gap_cont = b2_params.rfg.split(',')
    align_cmd += ["--bowtie2-ref-gap-open", ref_gap_open,
                  "--bowtie2-ref-gap-cont", ref_gap_cont]

    align_cmd.append(ref_fasta)
    align_cmd.extend([reads,
                      possible_juncs,
                      possible_insertions,
                      possible_deletions,
                      possible_fusions,
                      alignments_out_name,
                      contig_seg_maps])

    if spliced_seg_maps:
        spliced_seg_maps = ','.join(spliced_seg_maps)
        align_cmd.append(spliced_seg_maps)

    try:
        print >> run_log, " ".join(align_cmd)
        ret = subprocess.call(align_cmd,
                              stderr=align_log)
        if ret:
            die(fail_str+"Error running 'long_spanning_reads':"+log_tail(log_fname))
    except OSError, o:
        die(fail_str+"Error: "+str(o))

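# Example of the penalty-string splitting above (the values shown are bowtie2's
# documented defaults, used here only for illustration): with b2_params.mp set
# to "6,2", mp.split(',') yields ("6", "2"), i.e. --bowtie2-max-penalty 6 and
# --bowtie2-min-penalty 2; rdg and rfg (default "5,3") split the same way into
# gap-open/gap-continue penalties.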
# This class collects spliced and unspliced alignments for each of the
# left and right read files provided by the user.
class Maps:
    def __init__(self,
                 unspliced_sam,
                 seg_maps,
                 unmapped_segs,
                 segs):
        self.unspliced_sam = unspliced_sam
        self.seg_maps = seg_maps
        self.unmapped_segs = unmapped_segs
        self.segs = segs

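# Usage sketch (file names hypothetical), for the left reads of a two-segment run:
#   Maps("left.mapped.bam",                                    # unspliced alignments
#        ["left_seg1.bam", "left_seg2.bam"],                   # per-segment alignments
#        ["left_seg1_unmapped.bam", "left_seg2_unmapped.bam"], # unmapped segments
#        ["left_seg1.fq", "left_seg2.fq"])                     # the segment reads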
# Map2GTF stuff
def m2g_convert_coords(params, sam_header_filename, gtf_fname, reads, out_fname):
    """Convert transcriptome-coordinate alignments back to genomic coordinates.

    Arguments:
    - `params`: TopHat parameters
    - `sam_header_filename`: SAM header to use for the converted output.
    - `gtf_fname`: File name pointing to the annotation.
    - `reads`: The reads to convert coords (in Bowtie format).
    - `out_fname`: The file name pointing to the output.
    """
    m2g_cmd = [prog_path("map2gtf")]
    m2g_cmd.extend(params.cmd())
    m2g_cmd += ["--sam-header", sam_header_filename]
    m2g_cmd.append(gtf_fname)
    m2g_cmd.append(reads) # could be a BAM file
    m2g_cmd.append(out_fname)
    fbasename = getFileBaseName(reads)
    m2g_log = logging_dir + "m2g_" + fbasename + ".out"
    m2g_err = logging_dir + "m2g_" + fbasename + ".err"

    try:
        th_log("Converting " + fbasename + " to genomic coordinates (map2gtf)")
        print >> run_log, " ".join(m2g_cmd) + " > " + m2g_log
        ret = subprocess.call(m2g_cmd,
                              stdout=open(m2g_log, "w"),
                              stderr=open(m2g_err, "w"))
        if ret != 0:
            die(fail_str + " Error: map2gtf returned an error")
    except OSError, o:
        err_msg = fail_str + str(o)
        die(err_msg + "\n")


def gtf_to_fasta(params, trans_gtf, genome, out_fname):
    """ Make the transcriptome from a GTF.

    Arguments:
    - `trans_gtf`: the GTF/GFF annotation file with the known transcripts.
    - `genome`: the reference genome (FASTA).
    - `out_fname`: output FASTA file with the transcript sequences.
    """
    if resumeStage > currentStage and fileExists(out_fname):
        return
    g2f_cmd = [prog_path("gtf_to_fasta")]
    g2f_cmd.extend(params.cmd())
    g2f_cmd.append(trans_gtf)
    g2f_cmd.append(genome)
    g2f_cmd.append(out_fname)

    g2f_log = logging_dir + "g2f.out"
    g2f_err = logging_dir + "g2f.err"

    try:
        print >> run_log, " ".join(g2f_cmd) + " > " + g2f_log
        ret = subprocess.call(g2f_cmd,
                              stdout=open(g2f_log, "w"),
                              stderr=open(g2f_err, "w"))
        if ret != 0:
            die(fail_str + " Error: gtf_to_fasta returned an error.")
    except OSError, o:
        err_msg = fail_str + str(o)
        die(err_msg + "\n")

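# The run.log entry written above looks like (paths illustrative):
#   gtf_to_fasta <params.cmd() flags> genes.gtf genome.fa <tmp_dir>/genes.fa > <logging_dir>/g2f.out
# with stderr captured separately in <logging_dir>/g2f.err.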
def map2gtf(params, genome_sam_header_filename, ref_fasta, left_reads, right_reads):
    """ Main GTF mapping function

    Arguments:
    - `params`: The TopHat parameters.
    - `genome_sam_header_filename`: SAM header for the genome index.
    - `ref_fasta`: The reference genome.
    - `left_reads`: A list of reads.
    - `right_reads`: A list of reads (empty if single-end).
    """
    test_input_file(params.gff_annotation)

    # th_log("Reading in GTF file: " + params.gff_annotation)
    # transcripts = gtf_to_transcripts(params.gff_annotation)

    gtf_name = getFileBaseName(params.gff_annotation)
    m2g_bwt_idx = None
    t_out_dir = tmp_dir
    if params.transcriptome_index and not params.transcriptome_outdir:
        m2g_bwt_idx = params.transcriptome_index
        th_log("Using pre-built transcriptome index...")
    else:
        th_log("Creating transcriptome data files...")
        if params.transcriptome_outdir:
            t_out_dir = params.transcriptome_outdir+"/"
        m2g_ref_fasta = t_out_dir + gtf_name + ".fa"
        gtf_to_fasta(params, params.gff_annotation, ref_fasta, m2g_ref_fasta)
        m2g_bwt_idx = build_idx_from_fa(params.bowtie2, m2g_ref_fasta, t_out_dir, params.read_params.color)

    transcriptome_header_filename = get_index_sam_header(params, m2g_bwt_idx)

    mapped_gtf_list = []
    unmapped_gtf_list = []
    # do the initial mapping in GTF coordinates
    for reads in [left_reads, right_reads]:
        if reads is None or os.path.getsize(reads) < 25:
            continue
        fbasename = getFileBaseName(reads)
        mapped_gtf_out = tmp_dir + fbasename + ".m2g"
        #if use_zpacker:
        #    mapped_gtf_out += ".z"

        unmapped_gtf = tmp_dir + fbasename + ".m2g_um"
        #if use_BWT_FIFO:
        #    unmapped_gtf += ".z"

        (mapped_gtf_map, unmapped) = bowtie(params,
                                            m2g_bwt_idx,
                                            [transcriptome_header_filename, genome_sam_header_filename],
                                            [reads],
                                            "fastq",
                                            params.read_mismatches,
                                            params.read_gap_length,
                                            params.read_edit_dist,
                                            params.read_realign_edit_dist,
                                            mapped_gtf_out,
                                            unmapped_gtf,
                                            "", _reads_vs_T)
        mapped_gtf_list.append(mapped_gtf_map)
        unmapped_gtf_list.append(unmapped)

    # for reads in mapped_gtf_list:
    #     fbasename = getFileBaseName(reads)
    #     bam_out_fname = tmp_dir + fbasename + "_converted.bam"
    #     m2g_convert_coords(params,
    #                        genome_sam_header_filename,
    #                        params.gff_annotation,
    #                        reads,
    #                        bam_out_fname)
    #
    #     bam_gtf_list.append(bam_out_fname)
    #     if not params.system_params.keep_tmp:
    #         removeFileWithIndex(reads)

    # if len(bam_gtf_list) < 2:
    #     bam_gtf_list.append(None)
    #     unmapped_gtf_list.append(None)
    if len(mapped_gtf_list) < 2:
        mapped_gtf_list.append(None)
    if len(unmapped_gtf_list) < 2:
        unmapped_gtf_list.append(None)
    # return (bam_gtf_list, unmapped_gtf_list)
    return (mapped_gtf_list, unmapped_gtf_list)
# end Map2GTF

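# Return shape of map2gtf() for a single-end run (names hypothetical): roughly
#   ([<tmp_dir>/left_kept_reads.m2g, None], [<tmp_dir>/left_kept_reads.m2g_um, None])
# i.e. both lists are padded to length two so left/right unpacking always works.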
def get_preflt_data(params, ri, target_reads, out_mappings, out_unmapped):
    ## extract mappings and unmapped reads from prefilter mappings and preflt_ium
    ##
    # this is accomplished by a special prep_reads usage (triggered by --flt-hits)
    out_bam = None
    #if params.read_params.color:
    #    out_unmapped += ".fq"
    #    #if use_zpacker: out_unmapped += ".z"
    #else:
    out_unmapped += ".bam"
    out_bam = out_unmapped
    # no colorspace reads
    if resumeStage:
        return (out_mappings, out_unmapped)
    do_use_zpacker = use_zpacker and not out_bam
    prep_cmd = prep_reads_cmd(params, params.preflt_data[ri].unmapped_reads, None,
                              None, None,        # right-side mates
                              out_bam,           # stdout file
                              out_mappings,      # aux file (filtered mappings)
                              None,              # no index for out_bam
                              [target_reads],    # prefilter reads
                              [params.preflt_data[ri].mappings]) # mappings to filter
    if not out_bam: um_reads = open(out_unmapped, "wb")
    sides = ["left", "right"]
    log_fname = logging_dir + "prep_reads.from_preflt."+sides[ri]+".log"
    filter_log = open(log_fname, "w")

    shell_cmd = " ".join(prep_cmd)
    # add the compression pipe
    zip_cmd = []
    if do_use_zpacker:
        zip_cmd = [params.system_params.zipper]
        zip_cmd.extend(params.system_params.zipper_opts)
        zip_cmd.extend(['-c', '-'])
        shell_cmd += ' | ' + ' '.join(zip_cmd)
    if not out_bam:
        shell_cmd += ' >' + out_unmapped
    retcode = 0
    try:
        print >> run_log, shell_cmd
        if do_use_zpacker:
            prep_proc = subprocess.Popen(prep_cmd,
                                         stdout=subprocess.PIPE,
                                         stderr=filter_log)
            zip_proc = subprocess.Popen(zip_cmd,
                                        preexec_fn=subprocess_setup,
                                        stdin=prep_proc.stdout,
                                        stderr=tophat_log, stdout=um_reads)
            prep_proc.stdout.close() # as per http://bugs.python.org/issue7678
            zip_proc.communicate()
            retcode = prep_proc.poll()
            if retcode == 0:
                retcode = zip_proc.poll()
        else:
            if out_bam:
                retcode = subprocess.call(prep_cmd, stderr=filter_log)
            else:
                retcode = subprocess.call(prep_cmd, stdout=um_reads,
                                          stderr=filter_log)
        if retcode:
            die(fail_str+"Error running 'prep_reads'\n"+log_tail(log_fname))

    except OSError, o:
        errmsg = fail_str+str(o)
        die(errmsg+"\n"+log_tail(log_fname))
    if not out_bam: um_reads.close()

    return (out_mappings, out_unmapped)

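# A minimal, self-contained sketch of the producer -> compressor pipe pattern
# used in get_preflt_data() above (the function and argument names here are
# illustrative, not part of TopHat). The parent must close its copy of the
# producer's stdout after wiring it to the compressor, per
# http://bugs.python.org/issue7678, so the producer gets SIGPIPE if the
# compressor exits early. This helper is never called by the pipeline.
def _example_pipe_to_zipper(producer_cmd, zipper_cmd, out_fname):
    out_f = open(out_fname, "wb")
    producer = subprocess.Popen(producer_cmd, stdout=subprocess.PIPE)
    zipper = subprocess.Popen(zipper_cmd, stdin=producer.stdout, stdout=out_f)
    producer.stdout.close() # the pipe now belongs to the zipper process
    zipper.communicate()    # wait for the compressor to drain its input and exit
    out_f.close()
    return producer.wait() or zipper.returncode
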
# The main alignment routine of TopHat. This function executes most of the
# workflow, producing a set of candidate alignments for each cDNA fragment in a
# pair of SAM alignment files (for paired-end reads).
def spliced_alignment(params,
                      bwt_idx_prefix,
                      sam_header_filename,
                      ref_fasta,
                      read_len,
                      segment_len,
                      prepared_reads,
                      user_supplied_junctions,
                      user_supplied_insertions,
                      user_supplied_deletions):

    possible_juncs = []
    possible_juncs.extend(user_supplied_junctions)

    possible_insertions = []
    possible_insertions.extend(user_supplied_insertions)
    possible_deletions = []
    possible_deletions.extend(user_supplied_deletions)
    possible_fusions = []

    left_reads, right_reads = prepared_reads

    maps = [[], []] # maps[0] = left reads mapping data, maps[1] = right reads mapping data
    # Before anything, map the reads using Map2GTF (if using annotation)
    m2g_maps = [None, None] # left, right
    initial_reads = [left_reads, right_reads]

    if params.gff_annotation:
        (mapped_gtf_list, unmapped_gtf_list) = \
            map2gtf(params, sam_header_filename, ref_fasta, left_reads, right_reads)

        m2g_left_maps, m2g_right_maps = mapped_gtf_list
        m2g_maps = [m2g_left_maps, m2g_right_maps]
        if params.transcriptome_only or not fileExists(unmapped_gtf_list[0]):
            # The case where the user doesn't want to map to anything other
            # than the transcriptome OR we have no unmapped reads
            maps[0] = [m2g_left_maps]
            if right_reads:
                maps[1] = [m2g_right_maps]
            return maps
        # Feed the unmapped reads into spliced_alignment()
        initial_reads = unmapped_gtf_list[:]
        if currentStage >= resumeStage:
            th_log("Resuming TopHat pipeline with unmapped reads")

        if not nonzeroFile(initial_reads[0]) and \
           (not initial_reads[1] or not nonzeroFile(initial_reads[1])):
            if m2g_maps[1]:
                return [[m2g_maps[0]], [m2g_maps[1]]]
            else:
                return [[m2g_maps[0]], []]

    max_seg_len = segment_len # this is the ref seq span on either side of the
                              # junctions to be extracted into segment_juncs.fa

    num_segs = int(read_len / segment_len)
    if (read_len % segment_len) >= min(segment_len - 2, 20):
        # the remainder is shorter, but long enough to become a new segment
        num_segs += 1
    else:
        # the last segment is longer
        if num_segs > 1: max_seg_len += (read_len % segment_len)

    if num_segs <= 1:
        th_logp("Warning: you have only one segment per read.\n\tIf the read length is greater than or equal to 45bp,\n\twe strongly recommend that you decrease --segment-length to about half the read length because TopHat will work better with multiple segments")
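    # Worked example of the arithmetic above (illustrative numbers): read_len=60
    # with segment_len=25 gives num_segs=2; the 10bp remainder fails the
    # min(segment_len - 2, 20) test, so it is folded into the last segment and
    # max_seg_len grows to 35. With read_len=70 the 20bp remainder passes the
    # test instead and becomes a third segment (num_segs=3).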
    # Using the num_segs value returned by check_reads(),
    # decide which junction discovery strategy to use
    if num_segs < 3:
        #if params.butterfly_search != False:
        #    params.butterfly_search = True
        if params.coverage_search != False:
            params.coverage_search = True
        if num_segs == 1:
            segment_len = read_len
    else: # num_segs >= 3
        # if we have at least three segments, just use split segment search,
        # which is the most sensitive and specific, fastest, and lightest-weight.
        # so unless specifically requested, disable the other junction searches
        if params.closure_search != True:
            params.closure_search = False
        if params.coverage_search != True:
            params.coverage_search = False
        if params.butterfly_search != True:
            params.butterfly_search = False

    # Perform the first part of the TopHat workflow on the left and right
    # reads of paired ends separately - we'll use the pairing information later
    have_left_IUM = False
    for ri in (0, 1):
        reads = initial_reads[ri]
        if reads is None or not nonzeroFile(reads):
            continue

        fbasename = getFileBaseName(reads)
        unspliced_out = tmp_dir + fbasename + ".mapped"
        unspliced_sam = None
        unmapped_reads = None
        #if use_zpacker: unspliced_out += ".z"
        unmapped_unspliced = tmp_dir + fbasename + "_unmapped"
        if params.prefilter_multi:
            #unmapped_unspliced += ".z"
            (unspliced_sam, unmapped_reads) = get_preflt_data(params, ri, reads, unspliced_out, unmapped_unspliced)
        else:
            # Perform the initial Bowtie mapping of the full length reads
            (unspliced_sam, unmapped_reads) = bowtie(params,
                                                     bwt_idx_prefix,
                                                     sam_header_filename,
                                                     [reads],
                                                     "fastq",
                                                     params.read_mismatches,
                                                     params.read_gap_length,
                                                     params.read_edit_dist,
                                                     params.read_realign_edit_dist,
                                                     unspliced_out,
                                                     unmapped_unspliced,
                                                     "",
                                                     _reads_vs_G)

        seg_maps = []
        unmapped_segs = []
        segs = []

        have_IUM = nonzeroFile(unmapped_reads)
        if ri == 0 and have_IUM:
            have_left_IUM = True
        setRunStage(_stage_map_segments)
        if num_segs > 1 and have_IUM:
            # split up the IUM reads into segments
            # unmapped_reads can be in BAM format
            read_segments = split_reads(unmapped_reads,
                                        tmp_dir + fbasename,
                                        False,
                                        params,
                                        segment_len)

            #if not params.system_params.keep_tmp:
            #    removeFileWithIndex(unmapped_reads)

            # Map each segment file independently with Bowtie
            for i in range(len(read_segments)):
                seg = read_segments[i]
                fbasename = getFileBaseName(seg)
                seg_out = tmp_dir + fbasename
                unmapped_seg = tmp_dir + fbasename + "_unmapped"
                extra_output = "(%d/%d)" % (i+1, len(read_segments))
                (seg_map, unmapped) = bowtie(params,
                                             bwt_idx_prefix,
                                             sam_header_filename,
                                             [seg],
                                             "fastq",
                                             params.segment_mismatches,
                                             params.segment_mismatches,
                                             params.segment_mismatches,
                                             params.segment_mismatches,
                                             seg_out,
                                             unmapped_seg,
                                             extra_output,
                                             _segs_vs_G)
                seg_maps.append(seg_map)
                unmapped_segs.append(unmapped)
                segs.append(seg)

            # Collect the segment maps for left and right reads together
            maps[ri] = Maps(unspliced_sam, seg_maps, unmapped_segs, segs)
        else:
            # if there's only one segment, just collect the initial map as the only
            # map to be used downstream for coverage-based junction discovery
            read_segments = [reads]
            maps[ri] = Maps(unspliced_sam, [unspliced_sam], [unmapped_reads], [unmapped_reads])

    # XXX: At this point if using M2G, have three sets of reads:
    # mapped to transcriptome, mapped to genome, and unmapped (potentially
    # spliced or poly-A tails) - hp
    unmapped_reads = []
    if maps[0]:
        left_reads_map = maps[0].unspliced_sam
        left_seg_maps = maps[0].seg_maps
        unmapped_reads = maps[0].unmapped_segs
    else:
        left_reads_map = None
        left_seg_maps = None

    if right_reads and maps[1]:
        right_reads_map = maps[1].unspliced_sam
        right_seg_maps = maps[1].seg_maps
        unmapped_reads.extend(maps[1].unmapped_segs)
    else:
        right_reads_map = None
        right_seg_maps = None

    if params.find_novel_juncs and have_left_IUM: # or params.find_novel_indels:
        # Call segment_juncs to infer a list of possible splice junctions from
        # the regions of the genome covered in the initial and segment maps
        #if params.find_novel_juncs:
        #TODO: in the m2g case, we might want to pass the m2g mappings as well,
        # or perhaps the GTF file directly
        # -> this could improve alternative junction detection?
        setRunStage(_stage_find_juncs)
        juncs = junctions_from_segments(params,
                                        sam_header_filename,
                                        left_reads,
                                        left_reads_map,
                                        left_seg_maps,
                                        right_reads,
                                        right_reads_map,
                                        right_seg_maps,
                                        unmapped_reads,
                                        "fastq",
                                        ref_fasta)

        if not params.system_params.keep_tmp:
            for unmapped_seg in unmapped_reads:
                removeFileWithIndex(unmapped_seg)

        if os.path.getsize(juncs[0]) != 0:
            possible_juncs.append(juncs[0])
        if params.find_novel_indels:
            if os.path.getsize(juncs[1]) != 0:
                possible_insertions.append(juncs[1])
            if os.path.getsize(juncs[2]) != 0:
                possible_deletions.append(juncs[2])
        if params.find_novel_fusions:
            if os.path.getsize(juncs[3]) != 0:
                possible_fusions.append(juncs[3])
    # Optionally, and for paired reads only, use a closure search to
    # discover additional junctions
    if currentStage >= resumeStage and params.closure_search and left_reads and right_reads:
        # closure inputs: the initial map plus the last segment map for each side
        juncs = junctions_from_closures(params,
                                        sam_header_filename,
                                        [maps[0].unspliced_sam, maps[0].seg_maps[-1]],
                                        [maps[1].unspliced_sam, maps[1].seg_maps[-1]],
                                        ref_fasta)
        if os.path.getsize(juncs[0]) != 0:
            possible_juncs.extend(juncs)

    if len(possible_insertions) == 0 and len(possible_deletions) == 0 and len(possible_juncs) == 0 and len(possible_fusions) == 0:
        spliced_seg_maps = None
        junc_idx_prefix = None
    else:
        junc_idx_prefix = "segment_juncs"
        if len(possible_insertions) == 0:
            possible_insertions.append(os.devnull)
            # print >> sys.stderr, "Warning: insertions database is empty!"
        if len(possible_deletions) == 0:
            possible_deletions.append(os.devnull)
            # print >> sys.stderr, "Warning: deletions database is empty!"
        if len(possible_juncs) == 0:
            possible_juncs.append(os.devnull)
            th_logp("Warning: junction database is empty!")
        if len(possible_fusions) == 0:
            possible_fusions.append(os.devnull)

    setRunStage(_stage_juncs_db)
    juncs_bwt_samheader = None
    juncs_bwt_idx = None
    if junc_idx_prefix:
        jdb_prefix = tmp_dir + junc_idx_prefix
        if currentStage < resumeStage and fileExists(jdb_prefix + ".fa"):
            juncs_bwt_idx = jdb_prefix
        else:
            juncs_bwt_idx = build_juncs_index(params.bowtie2,
                                              3,
                                              max_seg_len,
                                              junc_idx_prefix,
                                              possible_juncs,
                                              possible_insertions,
                                              possible_deletions,
                                              possible_fusions,
                                              ref_fasta,
                                              params.read_params.color)
        juncs_bwt_samheader = get_index_sam_header(params, juncs_bwt_idx)

    # Now map read segments (or whole IUM reads, if num_segs == 1) to the splice
    # index with Bowtie
    setRunStage(_stage_map2juncs)
    for ri in (0, 1):
        reads = initial_reads[ri]
        if not reads:
            continue

        spliced_seg_maps = []
        rfname = getFileBaseName(reads)
        rfdir = getFileDir(reads)

        m2g_map = m2g_maps[ri]
        mapped_reads = rfdir + rfname + ".candidates.bam"
        merged_map = rfdir + rfname + ".candidates_and_unspl.bam"

        if maps[ri]:
            unspl_samfile = maps[ri].unspliced_sam
        else:
            unspl_samfile = None

        have_IUM = True
        if reads is None or not nonzeroFile(reads):
            have_IUM = False

        if have_IUM:
            if junc_idx_prefix:
                i = 0
                for seg in maps[ri].segs:
                    # search each segment
                    fsegname = getFileBaseName(seg)
                    seg_out = tmp_dir + fsegname + ".to_spliced"
                    #if use_zpacker: seg_out += ".z"
                    extra_output = "(%d/%d)" % (i+1, len(maps[ri].segs))
                    (seg_map, unmapped) = bowtie(params,
                                                 tmp_dir + junc_idx_prefix,
                                                 juncs_bwt_samheader,
                                                 [seg],
                                                 "fastq",
                                                 params.segment_mismatches,
                                                 params.segment_mismatches,
                                                 params.segment_mismatches,
                                                 params.segment_mismatches,
                                                 seg_out,
                                                 None,
                                                 extra_output,
                                                 _segs_vs_J)
                    spliced_seg_maps.append(seg_map)
                    i += 1

                    if not params.system_params.keep_tmp:
                        removeFileWithIndex(seg)

            # Join the contiguous and spliced segment hits into full-length
            # read alignments
            # -- spliced mappings built from all segment mappings vs genome and junc_db
            join_mapped_segments(params,
                                 sam_header_filename,
                                 reads,
                                 ref_fasta,
                                 possible_juncs,
                                 possible_insertions,
                                 possible_deletions,
                                 possible_fusions,
                                 maps[ri].seg_maps,
                                 spliced_seg_maps,
                                 mapped_reads)

            if not params.system_params.keep_tmp:
                for seg_map in maps[ri].seg_maps:
                    removeFileWithIndex(seg_map)
                for spliced_seg_map in spliced_seg_maps:
                    removeFileWithIndex(spliced_seg_map)

        maps[ri] = []
        if m2g_map and \
           nonzeroFile(m2g_map):
            maps[ri].append(m2g_map)

        if unspl_samfile and \
           nonzeroFile(unspl_samfile):
            maps[ri].append(unspl_samfile)

        if mapped_reads and nonzeroFile(mapped_reads):
            maps[ri].append(mapped_reads)
        else:
            for bam_i in range(0, params.system_params.num_threads):
                temp_bam = mapped_reads[:-4] + str(bam_i) + ".bam"
                if nonzeroFile(temp_bam):
                    maps[ri].append(mapped_reads[:-4])
                    break

    return maps

# rough equivalent of the 'which' command to find external programs
# (current script path is tested first, then the PATH envvar)
def which(program):
    def is_executable(fpath):
        return os.path.isfile(fpath) and os.access(fpath, os.X_OK)
    fpath, fname = os.path.split(program)
    if fpath:
        if is_executable(program):
            return program
    else:
        progpath = os.path.join(bin_dir, program)
        if is_executable(progpath):
            return progpath
        for path in os.environ["PATH"].split(os.pathsep):
            progpath = os.path.join(path, program)
            if is_executable(progpath):
                return progpath
    return None

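# Usage sketch (paths illustrative): which("segment_juncs") checks the TopHat
# bin_dir first and then every entry in PATH, returning something like
# "/opt/tophat/bin/segment_juncs", or None if the program is not executable
# anywhere; prog_path() below wraps it and die()s on a failed lookup.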
def prog_path(program):
    progpath = which(program)
    if progpath is None:
        die("Error locating program: " + program)
    return progpath

# FIXME: this should get set during the make dist autotools phase of the build
def get_version():
    return "2.0.5"

def mlog(msg):
    print >> sys.stderr, "[DBGLOG]:" + msg

def test_input_file(filename):
    try:
        test_file = open(filename, "r")
        test_file.close()
    except IOError:
        die("Error opening file %s" % filename)
    return

def main(argv=None):
    warnings.filterwarnings("ignore", "tmpnam is a potential security risk")

    # Initialize default parameter values
    params = TopHatParams()
    try:
        if argv is None:
            argv = sys.argv
        args = params.parse_options(argv)
        if params.resume_dir:
            newargv = doResume(params.resume_dir)
            #if not args[0]:
            args = params.parse_options(newargv)
        params.check()

        bwt_idx_prefix = args[0]
        left_reads_list = args[1]
        left_quals_list, right_quals_list = None, None
        if (not params.read_params.quals and len(args) > 2) or (params.read_params.quals and len(args) > 3):
            if params.read_params.mate_inner_dist is None:
                params.read_params.mate_inner_dist = 50
                #die("Error: you must set the mean inner distance between mates with -r")

            right_reads_list = args[2]
            if params.read_params.quals:
                left_quals_list = args[3]
                right_quals_list = args[4]
        else:
            right_reads_list = None
            if params.read_params.quals:
                left_quals_list = args[2]

        start_time = datetime.now()
        prepare_output_dir()
        init_logger(logging_dir + "tophat.log")

        th_logp()
        if resumeStage > 0:
            th_log("Resuming TopHat run in directory '"+output_dir+"', from stage: '"+stageNames[resumeStage]+"'")
        else:
            th_log("Beginning TopHat run (v"+get_version()+")")
        th_logp("-----------------------------------------------")

        global run_log
        run_log = open(logging_dir + "run.log", "w", 0)
        global run_cmd
        run_cmd = " ".join(argv)
        print >> run_log, run_cmd

        check_bowtie(params)
        check_samtools()

        # Validate all the input files, check all prereqs before committing
        # to the run
        if params.gff_annotation:
            if not os.path.exists(params.gff_annotation):
                die("Error: cannot find transcript file %s" % params.gff_annotation)
            if os.path.getsize(params.gff_annotation) < 10:
                die("Error: invalid transcript file %s" % params.gff_annotation)

        if params.transcriptome_index:
            if params.gff_annotation:
                # gff file given, so transcriptome data will be written there
                gff_basename = getFileBaseName(params.gff_annotation)
                # just in case, check if it's not already there (-G/--GTF given again by mistake)
                tpath, tname = os.path.split(params.transcriptome_index)
                new_subdir = False
                if tpath in (".", "./") or not tpath:
                    if not os.path.exists(params.transcriptome_index):
                        os.makedirs(params.transcriptome_index)
                        new_subdir = True
                if new_subdir or (os.path.exists(params.transcriptome_index) and os.path.isdir(params.transcriptome_index)):
                    params.transcriptome_index = os.path.join(params.transcriptome_index, gff_basename)
                gff_out = params.transcriptome_index+".gff"
                if not (os.path.exists(gff_out) and os.path.getsize(gff_out) == os.path.getsize(params.gff_annotation)):
                    # generate the transcriptome data files
                    tpath, tname = os.path.split(params.transcriptome_index)
                    params.transcriptome_outdir = tpath
            t_gff = params.transcriptome_index+".gff"
            if params.transcriptome_outdir:
                # will create the transcriptome data files
                if not os.path.exists(params.transcriptome_outdir):
                    os.makedirs(params.transcriptome_outdir)
                copy(params.gff_annotation, t_gff)
            else:
                # try to use existing transcriptome data files
                if not (os.path.exists(t_gff) and os.path.getsize(t_gff) > 10):
                    die("Error: GFF transcripts file not found or invalid (%s)" % t_gff)
                check_bowtie_index(params.transcriptome_index, params.bowtie2)
            params.gff_annotation = t_gff
        #end @ transcriptome_index given

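        # Net effect of the block above (names illustrative): given
        # "-G genes.gtf --transcriptome-index tidx", TopHat ends up with
        # tidx/genes.gff (a copy of the annotation) plus, on the first run, a
        # Bowtie index built from tidx/genes.fa by map2gtf(), and
        # params.gff_annotation is re-pointed at the copied GFF so later runs
        # can reuse the same transcriptome files.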
        (ref_fasta, ref_seq_dict) = check_index(bwt_idx_prefix, params.bowtie2)

        if currentStage >= resumeStage:
            th_log("Generating SAM header for " + bwt_idx_prefix)
        # we need to provide another name for this sam header, as genome and
        # transcriptome may have the same prefix
        sam_header_filename = get_index_sam_header(params, bwt_idx_prefix, "genome")
        params.sam_header = sam_header_filename
        #if not params.skip_check_reads:
        reads_list = left_reads_list
        if right_reads_list:
            reads_list = reads_list + "," + right_reads_list
        params.read_params = check_reads_format(params, reads_list)

        user_supplied_juncs = []
        user_supplied_insertions = []
        user_supplied_deletions = []
        user_supplied_fusions = []
        global gtf_juncs
        if params.gff_annotation and params.find_GFF_juncs:
            test_input_file(params.gff_annotation)
            (found_juncs, gtf_juncs) = get_gtf_juncs(params.gff_annotation)
            ##-- we shouldn't need these junctions in user_supplied_juncs anymore because now map2gtf does a much better job
            ## but we still need them loaded in gtf_juncs for later splice verification
            if found_juncs:
                ## and not params.gff_annotation:
                user_supplied_juncs.append(gtf_juncs)
            #else:
            #    gtf_juncs = None
        if params.raw_junctions:
            test_input_file(params.raw_junctions)
            user_supplied_juncs.append(params.raw_junctions)

        if params.raw_insertions:
            test_input_file(params.raw_insertions)
            user_supplied_insertions.append(params.raw_insertions)

        if params.raw_deletions:
            test_input_file(params.raw_deletions)
            user_supplied_deletions.append(params.raw_deletions)

        global unmapped_reads_fifo
        unmapped_reads_fifo = tmp_dir + str(os.getpid()) + ".bwt_unmapped.z.fifo"

        # Now start the time-consuming stuff
        if params.prefilter_multi:
            sides = ("left", "right")
            read_lists = (left_reads_list, right_reads_list)
            qual_lists = (left_quals_list, right_quals_list)
            for ri in (0, 1):
                reads_list = read_lists[ri]
                if not reads_list:
                    continue
                fmulti_ext = "bam"
                if not params.bowtie2:
                    fmulti_ext = "fq"
                params.preflt_data[ri].seqfiles = reads_list
                params.preflt_data[ri].qualfiles = qual_lists[ri]
                params.preflt_data[ri].multihit_reads = tmp_dir + sides[ri] + "_multimapped." + fmulti_ext
                side_imap = tmp_dir + sides[ri] + "_im"
                #if use_zpacker: side_imap += ".z"
                side_ium = tmp_dir + sides[ri] + "_ium"
                #if use_BWT_FIFO and not params.bowtie2:
                #    side_ium += ".z"
                th_log("Pre-filtering multi-mapped " + sides[ri] + " reads")
                rdlist = reads_list.split(',')
                bwt = bowtie(params, bwt_idx_prefix, sam_header_filename, rdlist,
                             params.read_params.reads_format,
                             params.read_mismatches,
                             params.read_gap_length,
                             params.read_edit_dist,
                             params.read_realign_edit_dist,
                             side_imap, side_ium,
                             "", _reads_vs_G, ri) # multi-mapped reads will be in params.preflt_data[ri].multihit_reads
                params.preflt_data[ri].mappings = bwt[0] # initial mappings
                params.preflt_data[ri].unmapped_reads = bwt[1] # IUM reads
        if currentStage >= resumeStage:
            th_log("Preparing reads")
        else:
            th_log("Prepared reads:")
        prep_info = None

        multihit_reads = []
        if params.preflt_data[0].multihit_reads:
            multihit_reads += [params.preflt_data[0].multihit_reads]
        if params.preflt_data[1].multihit_reads:
            multihit_reads += [params.preflt_data[1].multihit_reads]
        prep_info = prep_reads(params,
                               left_reads_list, left_quals_list,
                               right_reads_list, right_quals_list,
                               multihit_reads)

        min_read_len = prep_info.min_len[0]
        if prep_info.min_len[1] > 0 and min_read_len > prep_info.min_len[1]:
            min_read_len = prep_info.min_len[1]
        if min_read_len < 20:
            th_logp("Warning: short reads (<20bp) will make TopHat quite slow and take a large amount of memory because they are likely to be mapped to too many places")

        max_read_len = max(prep_info.max_len[0], prep_info.max_len[1])

        seed_len = params.read_params.seed_length
        if seed_len: # if the read length was explicitly given
            # it can't be smaller than the minimum length observed
            seed_len = max(seed_len, min_read_len)
        else:
            seed_len = max_read_len
        params.read_params.seed_length = seed_len
        # turn off integer-quals
        if params.read_params.integer_quals:
            params.read_params.integer_quals = False

        input_reads = prep_info.kept_reads[:]
        mappings = spliced_alignment(params,
                                     bwt_idx_prefix,
                                     sam_header_filename,
                                     ref_fasta,
                                     params.read_params.seed_length,
                                     params.segment_length,
                                     input_reads,
                                     user_supplied_juncs,
                                     user_supplied_insertions,
                                     user_supplied_deletions)
        setRunStage(_stage_tophat_reports)
        compile_reports(params,
                        sam_header_filename,
                        ref_fasta,
                        mappings,
                        input_reads,
                        params.gff_annotation)

        if not params.system_params.keep_tmp:
            try:
                for m in mappings[0]:
                    os.remove(m)
                for m in input_reads:
                    if m and os.path.exists(m): os.remove(m)
                for m in mappings[1]:
                    os.remove(m)
                tmp_files = os.listdir(tmp_dir)
                for t in tmp_files:
                    os.remove(tmp_dir + t)
                os.rmdir(tmp_dir)
            except OSError, o:
                th_logp("Warning: couldn't remove all temporary files in " + tmp_dir)

        finish_time = datetime.now()
        duration = finish_time - start_time
        th_logp("-----------------------------------------------")
        th_log("Run complete: %s elapsed" % formatTD(duration))

    except Usage, err:
        th_logp(sys.argv[0].split("/")[-1] + ": " + str(err.msg))
        th_logp(" for detailed help see http://tophat.cbcb.umd.edu/manual.html")
        return 2


if __name__ == "__main__":
    sys.exit(main())