ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/common.h
Revision: 33
Committed: Wed Aug 3 17:02:56 2011 UTC (13 years, 1 month ago) by gpertea
File size: 11267 byte(s)
Log Message:
wip conversion to SAM processing

Line File contents
1 #ifndef COMMON_H
2 #define COMMON_H
3 /*
4 * common.h
5 * TopHat
6 *
7 * Created by Cole Trapnell on 11/26/08.
8 * Copyright 2008 Cole Trapnell. All rights reserved.
9 *
10 */
11 #include <stdint.h>
12 #include <cassert>
13 #include <cstring>
14 #include <cstdlib>
15 #include <cstdio>
16 #include <string>
17 #include <vector>
18 #include "bam/bam.h"
19 #include "bam/sam.h"
20
21
22 #ifdef MEM_DEBUG
23 void process_mem_usage(double& vm_usage, double& resident_set);
24 void print_mem_usage();
25 #endif
26
27 /*
28 * Maximum allowable length of an
29 * an insertion. Used mainly in
30 * segment_juncs.cpp
31 */
32 extern unsigned int max_insertion_length;
33
34 /*
35 * Maximum allowable length of a
36 * deletion. Used mainly in segment_juncs.cpp
37 * and long_spanning_reads.cpp
38 */
39 extern unsigned int max_deletion_length;
40
41 extern int inner_dist_mean;
42 extern int inner_dist_std_dev;
43 extern int max_mate_inner_dist;
44
45 extern int min_anchor_len;
46 extern int min_report_intron_length;
47 extern int max_report_intron_length;
48
49 extern int min_closure_intron_length;
50 extern int max_closure_intron_length;
51
52 extern int min_coverage_intron_length;
53 extern int max_coverage_intron_length;
54
55 extern int min_segment_intron_length;
56 extern int max_segment_intron_length;
57
58 extern uint32_t min_closure_exon_length;
59 extern int island_extension;
60 extern int num_cpus;
61 extern int segment_length; // the read segment length used by the pipeline
62 extern int segment_mismatches;
63
64 extern int max_splice_mismatches;
65
66 enum ReadFormat {FASTA, FASTQ};
67 extern ReadFormat reads_format;
68
69 extern bool verbose;
70 extern int max_multihits;
71 extern bool no_closure_search;
72 extern bool no_coverage_search;
73 extern bool no_microexon_search;
74 extern bool butterfly_search;
75
76 extern float min_isoform_fraction;
77
78 extern std::string output_dir;
79 extern std::string gff_file;
80 extern std::string gene_filter;
81
82 extern std::string ium_reads;
83 extern std::string sam_header;
84 extern std::string sam_readgroup_id;
85 extern std::string zpacker; //path to program to use for de/compression (gzip, pigz, bzip2, pbzip2)
86 extern std::string samtools_path; //path to samtools executable
87 extern std::string aux_outfile; //auxiliary output file name
88 extern bool solexa_quals;
89 extern bool phred64_quals;
90 extern bool quals;
91 extern bool integer_quals;
92 extern bool color;
93 extern bool color_out;
94
95 extern std::string gtf_juncs;
96
97 enum eLIBRARY_TYPE
98 {
99 LIBRARY_TYPE_NONE = 0,
100
101 FR_UNSTRANDED,
102 FR_FIRSTSTRAND,
103 FR_SECONDSTRAND,
104
105 FF_UNSTRANDED,
106 FF_FIRSTSTRAND,
107 FF_SECONDSTRAND,
108
109 NUM_LIBRARY_TYPE
110 };
111
112 extern eLIBRARY_TYPE library_type;
113 std::string getFext(const std::string& s); //returns file extension converted to lowercase
114 bool str_endsWith(std::string& str, const char* suffix);
115 void str_appendInt(std::string& str, int v);
116 FILE* fzOpen(std::string& fname, const char* mode);
117
118 int parseIntOpt(int lower, const char *errmsg, void (*print_usage)());
119 int parse_options(int argc, char** argv, void (*print_usage)());
120
121 void err_exit(const char* format,...); // exit with an error
122
123 char* get_token(char** str, const char* delims);
124 std::string guess_packer(const std::string& fname, bool use_all_cpus);
125 std::string getUnpackCmd(const std::string& fname, bool use_all_cpus=false);
126
127 void checkSamHeader();
128 void writeSamHeader(FILE* fout);
129
130 class FZPipe {
131 public:
132 FILE* file;
133 std::string filename;
134 std::string pipecmd;
135 FZPipe(std::string& fname, bool is_mapping):filename(fname),pipecmd() {
136 //this constructor is only to use FZPipe as a READER
137 //also accepts/recognizes BAM files
138 //for which it only stores the filename, other fields/methods are unused
139 if (is_mapping && getFext(fname) == "bam") {
140 file=NULL;
141 return;
142 }
143 pipecmd=getUnpackCmd(fname); //also bam2fastx
144 this->openRead(fname.c_str(), pipecmd);
145 }
146
147 FZPipe():filename(),pipecmd() {
148 file=NULL;
149 }
150 FZPipe(std::string& fname, std::string& pcmd):filename(fname),pipecmd(pcmd) {
151 //open as a compressed file reader
152 file=NULL;
153 this->openRead(fname.c_str(), pipecmd);
154 }
155 void close() {
156 if (file!=NULL) {
157 if (pipecmd.empty()) fclose(file);
158 else pclose(file);
159 file=NULL;
160 }
161 }
162 FILE* openWrite(const char* fname, std::string& popencmd);
163 FILE* openWrite(const char* fname);
164 FILE* openRead(const char* fname, std::string& popencmd);
165
166 FILE* openRead(const char* fname);
167 FILE* openRead(const std::string fname, std::string& popencmd) {
168 return this->openRead(fname.c_str(),popencmd);
169 }
170 FILE* openRead(const std::string fname) {
171 return this->openRead(fname.c_str());
172 }
173 void rewind();
174 };
175
176 void err_die(const char* format,...);
177
178 //uint8_t* realloc_bdata(bam1_t *b, int size);
179 //uint8_t* dupalloc_bdata(bam1_t *b, int size);
180
181 class GBamRecord {
182 bam1_t* b;
183 // b->data has the following strings concatenated:
184 // qname (including the terminal \0)
185 // +cigar (each event encoded on 32 bits)
186 // +seq (4bit-encoded)
187 // +qual
188 // +aux
189 bool novel;
190 public:
191 GBamRecord(bam1_t* from_b=NULL) {
192 if (from_b==NULL) {
193 b=bam_init1();
194 novel=true;
195 }
196 else {
197 b=from_b;
198 novel=false;
199 }
200 }
201
202 void clear() {
203 if (novel) {
204 bam_destroy1(b);
205 }
206 novel=true;
207 b=bam_init1();
208 }
209
210 ~GBamRecord() {
211 if (novel) { bam_destroy1(b); }
212 }
213
214 void parse_error(const char* s) {
215 err_die("BAM parsing error: %s\n", s);
216 }
217
218 bam1_t* get_b() { return b; }
219
220 void set_mdata(int32_t mtid, int32_t m0pos, //0-based coordinate, -1 if not available
221 int32_t isize=0) { //mate info for current record
222 b->core.mtid=mtid;
223 b->core.mpos=m0pos; // should be -1 if '*'
224 b->core.isize=isize; //should be 0 if not available
225 }
226
227 void set_flags(uint16_t flags) {
228 b->core.flag=flags;
229 }
230
231 //creates a new record from 1-based alignment coordinate
232 //quals should be given as Phred33
233 //Warning: pos and mate_pos must be given 1-based!
234 GBamRecord(const char* qname, int32_t gseq_tid,
235 int pos, bool reverse, const char* qseq, const char* cigar=NULL, const char* quals=NULL);
236 GBamRecord(const char* qname, int32_t flags, int32_t g_tid,
237 int pos, int map_qual, const char* cigar, int32_t mg_tid, int mate_pos,
238 int insert_size, const char* qseq, const char* quals=NULL,
239 const std::vector<std::string>* aux_strings=NULL);
240 void set_cigar(const char* cigar); //converts and adds CIGAR string given in plain SAM text format
241 void add_sequence(const char* qseq, int slen=-1); //adds the DNA sequence given in plain text format
242 void add_quals(const char* quals); //quality values string in Phred33 format
243 void add_aux(const char* str); //adds one aux field in plain SAM text format (e.g. "NM:i:1")
244 void add_aux(const char tag[2], char atype, int len, uint8_t *data) {
245 int ori_len = b->data_len;
246 b->data_len += 3 + len;
247 b->l_aux += 3 + len;
248 if (b->m_data < b->data_len) {
249 b->m_data = b->data_len;
250 kroundup32(b->m_data);
251 b->data = (uint8_t*)realloc(b->data, b->m_data);
252 }
253 b->data[ori_len] = tag[0]; b->data[ori_len + 1] = tag[1];
254 b->data[ori_len + 2] = atype;
255 memcpy(b->data + ori_len + 3, data, len);
256 }
257 };
258
259 class GBamWriter {
260 samfile_t* bam_file;
261 bam_header_t* bam_header;
262 public:
263 void create(const char* fname, bool uncompressed=false) {
264 if (bam_header==NULL)
265 err_die("Error: no bam_header for GBamWriter::create()!\n");
266 if (uncompressed) {
267 bam_file=samopen(fname, "wbu", bam_header);
268 }
269 else {
270 bam_file=samopen(fname, "wb", bam_header);
271 }
272 if (bam_file==NULL)
273 err_die("Error: could not create BAM file %s!\n",fname);
274 //do we need to call bam_header_write() ?
275 }
276 void create(const char* fname, bam_header_t* bh, bool uncompressed=false) {
277 bam_header=bh;
278 create(fname,uncompressed);
279 }
280
281 GBamWriter(const char* fname, bam_header_t* bh, bool uncompressed=false) {
282 create(fname, bh, uncompressed);
283 }
284
285 GBamWriter(const char* fname, const char* samfname, bool uncompressed=false) {
286 tamFile samf_in=sam_open(samfname);
287 if (samf_in==NULL)
288 err_die("Error: could not open SAM file %s\n", samfname);
289 bam_header=sam_header_read(samf_in);
290 if (bam_header==NULL)
291 err_die("Error: could not read SAM header from %s!\n",samfname);
292 sam_close(samf_in);
293 create(fname, uncompressed);
294 }
295
296 ~GBamWriter() {
297 samclose(bam_file);
298 bam_header_destroy(bam_header);
299 }
300 bam_header_t* get_header() { return bam_header; }
301 int32_t get_tid(const char *seq_name) {
302 if (bam_header==NULL)
303 err_die("Error: missing SAM header (get_tid())\n");
304 return bam_get_tid(bam_header, seq_name);
305 }
306
307 //just a convenience function for creating a new record, but it's NOT written
308 //given pos must be 1-based (so it'll be stored as pos-1 because BAM is 0-based)
309 GBamRecord* new_record(const char* qname, const char* gseqname,
310 int pos, bool reverse, const char* qseq, const char* cigar=NULL, const char* qual=NULL) {
311 int32_t gseq_tid=get_tid(gseqname);
312 if (gseq_tid < 0 && strcmp(gseqname, "*")) {
313 if (bam_header->n_targets == 0) {
314 err_die("Error: missing/invalid SAM header\n");
315 } else
316 fprintf(stderr, "Warning: reference '%s' not found in header, will consider it '*'.\n",
317 gseqname);
318 }
319
320 return (new GBamRecord(qname, gseq_tid, pos, reverse, qseq, cigar, qual));
321 }
322
323 GBamRecord* new_record(const char* qname, int32_t flags, const char* gseqname,
324 int pos, int map_qual, const char* cigar, const char* mgseqname, int mate_pos,
325 int insert_size, const char* qseq, const char* quals=NULL,
326 const std::vector<std::string>* aux_strings=NULL) {
327 int32_t gseq_tid=get_tid(gseqname);
328 if (gseq_tid < 0 && strcmp(gseqname, "*")) {
329 if (bam_header->n_targets == 0) {
330 err_die("Error: missing/invalid SAM header\n");
331 } else
332 fprintf(stderr, "Warning: reference '%s' not found in header, will consider it '*'.\n",
333 gseqname);
334 }
335 int32_t mgseq_tid=-1;
336 if (mgseqname!=NULL) {
337 if (strcmp(mgseqname, "=")==0) {
338 mgseq_tid=gseq_tid;
339 }
340 else {
341 mgseq_tid=get_tid(mgseqname);
342 if (mgseq_tid < 0 && strcmp(mgseqname, "*")) {
343 fprintf(stderr, "Warning: reference '%s' not found in header, will consider it '*'.\n",
344 mgseqname);
345 }
346 }
347 }
348 return (new GBamRecord(qname, flags, gseq_tid, pos, map_qual, cigar,
349 mgseq_tid, mate_pos, insert_size, qseq, quals, aux_strings));
350 }
351
352 void write(GBamRecord* brec) {
353 if (brec!=NULL)
354 samwrite(this->bam_file,brec->get_b());
355 }
356 void write(bam1_t* b) {
357 samwrite(this->bam_file, b);
358 }
359 };
360
361
362 #endif