1 |
gpertea |
29 |
#ifndef COMMON_H |
2 |
|
|
#define COMMON_H |
3 |
|
|
/* |
4 |
|
|
* common.h |
5 |
|
|
* TopHat |
6 |
|
|
* |
7 |
|
|
* Created by Cole Trapnell on 11/26/08. |
8 |
|
|
* Copyright 2008 Cole Trapnell. All rights reserved. |
9 |
|
|
* |
10 |
|
|
*/ |
11 |
|
|
#include <stdint.h> |
12 |
|
|
#include <cassert> |
13 |
|
|
#include <cstring> |
14 |
|
|
#include <cstdlib> |
15 |
|
|
#include <cstdio> |
16 |
|
|
#include <string> |
17 |
|
|
#include <vector> |
18 |
|
|
#include "bam/bam.h" |
19 |
|
|
#include "bam/sam.h" |
20 |
|
|
|
21 |
|
|
|
22 |
|
|
#ifdef MEM_DEBUG |
23 |
|
|
void process_mem_usage(double& vm_usage, double& resident_set); |
24 |
|
|
void print_mem_usage(); |
25 |
|
|
#endif |
26 |
|
|
|
27 |
|
|
/* |
28 |
|
|
* Maximum allowable length of an |
29 |
|
|
* an insertion. Used mainly in |
30 |
|
|
* segment_juncs.cpp |
31 |
|
|
*/ |
32 |
|
|
extern unsigned int max_insertion_length; |
33 |
|
|
|
34 |
|
|
/* |
35 |
|
|
* Maximum allowable length of a |
36 |
|
|
* deletion. Used mainly in segment_juncs.cpp |
37 |
|
|
* and long_spanning_reads.cpp |
38 |
|
|
*/ |
39 |
|
|
extern unsigned int max_deletion_length; |
40 |
|
|
|
41 |
|
|
extern int inner_dist_mean; |
42 |
|
|
extern int inner_dist_std_dev; |
43 |
|
|
extern int max_mate_inner_dist; |
44 |
|
|
|
45 |
|
|
extern int min_anchor_len; |
46 |
|
|
extern int min_report_intron_length; |
47 |
|
|
extern int max_report_intron_length; |
48 |
|
|
|
49 |
|
|
extern int min_closure_intron_length; |
50 |
|
|
extern int max_closure_intron_length; |
51 |
|
|
|
52 |
|
|
extern int min_coverage_intron_length; |
53 |
|
|
extern int max_coverage_intron_length; |
54 |
|
|
|
55 |
|
|
extern int min_segment_intron_length; |
56 |
|
|
extern int max_segment_intron_length; |
57 |
|
|
|
58 |
|
|
extern uint32_t min_closure_exon_length; |
59 |
|
|
extern int island_extension; |
60 |
|
|
extern int num_cpus; |
61 |
|
|
extern int segment_length; // the read segment length used by the pipeline |
62 |
|
|
extern int segment_mismatches; |
63 |
|
|
|
64 |
|
|
extern int max_splice_mismatches; |
65 |
|
|
|
66 |
|
|
enum ReadFormat {FASTA, FASTQ}; |
67 |
|
|
extern ReadFormat reads_format; |
68 |
|
|
|
69 |
|
|
extern bool verbose; |
70 |
|
|
extern int max_multihits; |
71 |
|
|
extern bool no_closure_search; |
72 |
|
|
extern bool no_coverage_search; |
73 |
|
|
extern bool no_microexon_search; |
74 |
|
|
extern bool butterfly_search; |
75 |
|
|
|
76 |
|
|
extern float min_isoform_fraction; |
77 |
|
|
|
78 |
|
|
extern std::string output_dir; |
79 |
|
|
extern std::string gff_file; |
80 |
|
|
extern std::string gene_filter; |
81 |
|
|
|
82 |
|
|
extern std::string ium_reads; |
83 |
|
|
extern std::string sam_header; |
84 |
|
|
extern std::string sam_readgroup_id; |
85 |
|
|
extern std::string zpacker; //path to program to use for de/compression (gzip, pigz, bzip2, pbzip2) |
86 |
|
|
extern std::string samtools_path; //path to samtools executable |
87 |
|
|
extern std::string aux_outfile; //auxiliary output file name |
88 |
|
|
extern bool solexa_quals; |
89 |
|
|
extern bool phred64_quals; |
90 |
|
|
extern bool quals; |
91 |
|
|
extern bool integer_quals; |
92 |
|
|
extern bool color; |
93 |
|
|
extern bool color_out; |
94 |
|
|
|
95 |
|
|
extern std::string gtf_juncs; |
96 |
|
|
|
97 |
|
|
enum eLIBRARY_TYPE |
98 |
|
|
{ |
99 |
|
|
LIBRARY_TYPE_NONE = 0, |
100 |
|
|
|
101 |
|
|
FR_UNSTRANDED, |
102 |
|
|
FR_FIRSTSTRAND, |
103 |
|
|
FR_SECONDSTRAND, |
104 |
|
|
|
105 |
|
|
FF_UNSTRANDED, |
106 |
|
|
FF_FIRSTSTRAND, |
107 |
|
|
FF_SECONDSTRAND, |
108 |
|
|
|
109 |
|
|
NUM_LIBRARY_TYPE |
110 |
|
|
}; |
111 |
|
|
|
112 |
|
|
extern eLIBRARY_TYPE library_type; |
113 |
|
|
std::string getFext(const std::string& s); //returns file extension converted to lowercase |
114 |
|
|
bool str_endsWith(std::string& str, const char* suffix); |
115 |
|
|
void str_appendInt(std::string& str, int v); |
116 |
|
|
FILE* fzOpen(std::string& fname, const char* mode); |
117 |
|
|
|
118 |
|
|
int parseIntOpt(int lower, const char *errmsg, void (*print_usage)()); |
119 |
|
|
int parse_options(int argc, char** argv, void (*print_usage)()); |
120 |
|
|
|
121 |
|
|
void err_exit(const char* format,...); // exit with an error |
122 |
|
|
|
123 |
|
|
char* get_token(char** str, const char* delims); |
124 |
|
|
std::string guess_packer(const std::string& fname, bool use_all_cpus); |
125 |
|
|
std::string getUnpackCmd(const std::string& fname, bool use_all_cpus=false); |
126 |
|
|
|
127 |
|
|
void checkSamHeader(); |
128 |
|
|
void writeSamHeader(FILE* fout); |
129 |
|
|
|
130 |
|
|
struct FZPipe { |
131 |
|
|
FILE* file; |
132 |
|
|
std::string filename; |
133 |
|
|
std::string pipecmd; |
134 |
|
|
FZPipe():filename(),pipecmd() { |
135 |
|
|
file=NULL; |
136 |
|
|
} |
137 |
|
|
FZPipe(std::string& fname, std::string& pcmd):filename(fname),pipecmd(pcmd) { |
138 |
|
|
//open as a compressed file reader |
139 |
|
|
file=NULL; |
140 |
|
|
this->openRead(fname.c_str(), pipecmd); |
141 |
|
|
} |
142 |
|
|
void close() { |
143 |
|
|
if (file!=NULL) { |
144 |
|
|
if (pipecmd.empty()) fclose(file); |
145 |
|
|
else pclose(file); |
146 |
|
|
file=NULL; |
147 |
|
|
} |
148 |
|
|
} |
149 |
|
|
FILE* openWrite(const char* fname, std::string& popencmd); |
150 |
|
|
FILE* openWrite(const char* fname); |
151 |
|
|
FILE* openRead(const char* fname, std::string& popencmd); |
152 |
|
|
|
153 |
|
|
FILE* openRead(const char* fname); |
154 |
|
|
FILE* openRead(const std::string fname, std::string& popencmd) { |
155 |
|
|
return this->openRead(fname.c_str(),popencmd); |
156 |
|
|
} |
157 |
|
|
FILE* openRead(const std::string fname) { |
158 |
|
|
return this->openRead(fname.c_str()); |
159 |
|
|
} |
160 |
|
|
void rewind(); |
161 |
|
|
}; |
162 |
|
|
|
163 |
|
|
void err_die(const char* format,...); |
164 |
|
|
|
165 |
|
|
//uint8_t* realloc_bdata(bam1_t *b, int size); |
166 |
|
|
//uint8_t* dupalloc_bdata(bam1_t *b, int size); |
167 |
|
|
|
168 |
|
|
class GBamRecord { |
169 |
|
|
bam1_t* b; |
170 |
|
|
// b->data has the following strings concatenated: |
171 |
|
|
// qname (including the terminal \0) |
172 |
|
|
// +cigar (each event encoded on 32 bits) |
173 |
|
|
// +seq (4bit-encoded) |
174 |
|
|
// +qual |
175 |
|
|
// +aux |
176 |
|
|
bool novel; |
177 |
|
|
public: |
178 |
|
|
GBamRecord(bam1_t* from_b=NULL) { |
179 |
|
|
if (from_b==NULL) { |
180 |
|
|
b=bam_init1(); |
181 |
|
|
novel=true; |
182 |
|
|
} |
183 |
|
|
else { |
184 |
|
|
b=from_b; |
185 |
|
|
novel=false; |
186 |
|
|
} |
187 |
|
|
} |
188 |
|
|
|
189 |
|
|
void clear() { |
190 |
|
|
if (novel) { |
191 |
|
|
bam_destroy1(b); |
192 |
|
|
} |
193 |
|
|
novel=true; |
194 |
|
|
b=bam_init1(); |
195 |
|
|
} |
196 |
|
|
|
197 |
|
|
~GBamRecord() { |
198 |
|
|
if (novel) { bam_destroy1(b); } |
199 |
|
|
} |
200 |
|
|
|
201 |
|
|
void parse_error(const char* s) { |
202 |
|
|
err_die("BAM parsing error: %s\n", s); |
203 |
|
|
} |
204 |
|
|
|
205 |
|
|
bam1_t* get_b() { return b; } |
206 |
|
|
|
207 |
|
|
void set_mdata(int32_t mtid, int32_t m0pos, //0-based coordinate, -1 if not available |
208 |
|
|
int32_t isize=0) { //mate info for current record |
209 |
|
|
b->core.mtid=mtid; |
210 |
|
|
b->core.mpos=m0pos; // should be -1 if '*' |
211 |
|
|
b->core.isize=isize; //should be 0 if not available |
212 |
|
|
} |
213 |
|
|
|
214 |
|
|
void set_flags(uint16_t flags) { |
215 |
|
|
b->core.flag=flags; |
216 |
|
|
} |
217 |
|
|
|
218 |
|
|
//creates a new record from 1-based alignment coordinate |
219 |
|
|
//quals should be given as Phred33 |
220 |
|
|
//Warning: pos and mate_pos must be given 1-based! |
221 |
|
|
GBamRecord(const char* qname, int32_t gseq_tid, |
222 |
|
|
int pos, bool reverse, const char* qseq, const char* cigar=NULL, const char* quals=NULL); |
223 |
|
|
GBamRecord(const char* qname, int32_t flags, int32_t g_tid, |
224 |
|
|
int pos, int map_qual, const char* cigar, int32_t mg_tid, int mate_pos, |
225 |
|
|
int insert_size, const char* qseq, const char* quals=NULL, |
226 |
|
|
const std::vector<std::string>* aux_strings=NULL); |
227 |
|
|
void set_cigar(const char* cigar); //converts and adds CIGAR string given in plain SAM text format |
228 |
|
|
void add_sequence(const char* qseq, int slen=-1); //adds the DNA sequence given in plain text format |
229 |
|
|
void add_quals(const char* quals); //quality values string in Phred33 format |
230 |
|
|
void add_aux(const char* str); //adds one aux field in plain SAM text format (e.g. "NM:i:1") |
231 |
|
|
void add_aux(const char tag[2], char atype, int len, uint8_t *data) { |
232 |
|
|
int ori_len = b->data_len; |
233 |
|
|
b->data_len += 3 + len; |
234 |
|
|
b->l_aux += 3 + len; |
235 |
|
|
if (b->m_data < b->data_len) { |
236 |
|
|
b->m_data = b->data_len; |
237 |
|
|
kroundup32(b->m_data); |
238 |
|
|
b->data = (uint8_t*)realloc(b->data, b->m_data); |
239 |
|
|
} |
240 |
|
|
b->data[ori_len] = tag[0]; b->data[ori_len + 1] = tag[1]; |
241 |
|
|
b->data[ori_len + 2] = atype; |
242 |
|
|
memcpy(b->data + ori_len + 3, data, len); |
243 |
|
|
} |
244 |
|
|
}; |
245 |
|
|
|
246 |
|
|
class GBamWriter { |
247 |
|
|
samfile_t* bam_file; |
248 |
|
|
bam_header_t* bam_header; |
249 |
|
|
public: |
250 |
|
|
void create(const char* fname, bool uncompressed=false) { |
251 |
|
|
if (bam_header==NULL) |
252 |
|
|
err_die("Error: no bam_header for GBamWriter::create()!\n"); |
253 |
|
|
if (uncompressed) { |
254 |
|
|
bam_file=samopen(fname, "wbu", bam_header); |
255 |
|
|
} |
256 |
|
|
else { |
257 |
|
|
bam_file=samopen(fname, "wb", bam_header); |
258 |
|
|
} |
259 |
|
|
if (bam_file==NULL) |
260 |
|
|
err_die("Error: could not create BAM file %s!\n",fname); |
261 |
|
|
//do we need to call bam_header_write() ? |
262 |
|
|
} |
263 |
|
|
|
264 |
|
|
GBamWriter(const char* fname, bam_header_t* bh, bool uncompressed=false) { |
265 |
|
|
bam_header=bh; |
266 |
|
|
create(fname, uncompressed); |
267 |
|
|
} |
268 |
|
|
|
269 |
|
|
GBamWriter(const char* fname, const char* samfname, bool uncompressed=false) { |
270 |
|
|
tamFile samf_in=sam_open(samfname); |
271 |
|
|
if (samf_in==NULL) |
272 |
|
|
err_die("Error: could not open SAM file %s\n", samfname); |
273 |
|
|
bam_header=sam_header_read(samf_in); |
274 |
|
|
if (bam_header==NULL) |
275 |
|
|
err_die("Error: could not read SAM header from %s!\n",samfname); |
276 |
|
|
sam_close(samf_in); |
277 |
|
|
create(fname, uncompressed); |
278 |
|
|
} |
279 |
|
|
|
280 |
|
|
~GBamWriter() { |
281 |
|
|
samclose(bam_file); |
282 |
|
|
bam_header_destroy(bam_header); |
283 |
|
|
} |
284 |
|
|
bam_header_t* get_header() { return bam_header; } |
285 |
|
|
int32_t get_tid(const char *seq_name) { |
286 |
|
|
if (bam_header==NULL) |
287 |
|
|
err_die("Error: missing SAM header (get_tid())\n"); |
288 |
|
|
return bam_get_tid(bam_header, seq_name); |
289 |
|
|
} |
290 |
|
|
|
291 |
|
|
//just a convenience function for creating a new record, but it's NOT written |
292 |
|
|
//given pos must be 1-based (so it'll be stored as pos-1 because BAM is 0-based) |
293 |
|
|
GBamRecord* new_record(const char* qname, const char* gseqname, |
294 |
|
|
int pos, bool reverse, const char* qseq, const char* cigar=NULL, const char* qual=NULL) { |
295 |
|
|
int32_t gseq_tid=get_tid(gseqname); |
296 |
|
|
if (gseq_tid < 0 && strcmp(gseqname, "*")) { |
297 |
|
|
if (bam_header->n_targets == 0) { |
298 |
|
|
err_die("Error: missing/invalid SAM header\n"); |
299 |
|
|
} else |
300 |
|
|
fprintf(stderr, "Warning: reference '%s' not found in header, will consider it '*'.\n", |
301 |
|
|
gseqname); |
302 |
|
|
} |
303 |
|
|
|
304 |
|
|
return (new GBamRecord(qname, gseq_tid, pos, reverse, qseq, cigar, qual)); |
305 |
|
|
} |
306 |
|
|
|
307 |
|
|
GBamRecord* new_record(const char* qname, int32_t flags, const char* gseqname, |
308 |
|
|
int pos, int map_qual, const char* cigar, const char* mgseqname, int mate_pos, |
309 |
|
|
int insert_size, const char* qseq, const char* quals=NULL, |
310 |
|
|
const std::vector<std::string>* aux_strings=NULL) { |
311 |
|
|
int32_t gseq_tid=get_tid(gseqname); |
312 |
|
|
if (gseq_tid < 0 && strcmp(gseqname, "*")) { |
313 |
|
|
if (bam_header->n_targets == 0) { |
314 |
|
|
err_die("Error: missing/invalid SAM header\n"); |
315 |
|
|
} else |
316 |
|
|
fprintf(stderr, "Warning: reference '%s' not found in header, will consider it '*'.\n", |
317 |
|
|
gseqname); |
318 |
|
|
} |
319 |
|
|
int32_t mgseq_tid=-1; |
320 |
|
|
if (mgseqname!=NULL) { |
321 |
|
|
if (strcmp(mgseqname, "=")==0) { |
322 |
|
|
mgseq_tid=gseq_tid; |
323 |
|
|
} |
324 |
|
|
else { |
325 |
|
|
mgseq_tid=get_tid(mgseqname); |
326 |
|
|
if (mgseq_tid < 0 && strcmp(mgseqname, "*")) { |
327 |
|
|
fprintf(stderr, "Warning: reference '%s' not found in header, will consider it '*'.\n", |
328 |
|
|
mgseqname); |
329 |
|
|
} |
330 |
|
|
} |
331 |
|
|
} |
332 |
|
|
return (new GBamRecord(qname, flags, gseq_tid, pos, map_qual, cigar, |
333 |
|
|
mgseq_tid, mate_pos, insert_size, qseq, quals, aux_strings)); |
334 |
|
|
} |
335 |
|
|
|
336 |
|
|
void write(GBamRecord* brec) { |
337 |
|
|
if (brec!=NULL) |
338 |
|
|
samwrite(this->bam_file,brec->get_b()); |
339 |
|
|
} |
340 |
|
|
}; |
341 |
|
|
|
342 |
|
|
|
343 |
|
|
#endif |