ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/common.h
Revision: 29
Committed: Tue Aug 2 21:24:54 2011 UTC (13 years, 2 months ago) by gpertea
File size: 10605 byte(s)
Log Message:
adding tophat source work

Line User Rev File contents
1 gpertea 29 #ifndef COMMON_H
2     #define COMMON_H
3     /*
4     * common.h
5     * TopHat
6     *
7     * Created by Cole Trapnell on 11/26/08.
8     * Copyright 2008 Cole Trapnell. All rights reserved.
9     *
10     */
11     #include <stdint.h>
12     #include <cassert>
13     #include <cstring>
14     #include <cstdlib>
15     #include <cstdio>
16     #include <string>
17     #include <vector>
18     #include "bam/bam.h"
19     #include "bam/sam.h"
20    
21    
22     #ifdef MEM_DEBUG
23     void process_mem_usage(double& vm_usage, double& resident_set);
24     void print_mem_usage();
25     #endif
26    
27     /*
28     * Maximum allowable length of an
29     * an insertion. Used mainly in
30     * segment_juncs.cpp
31     */
32     extern unsigned int max_insertion_length;
33    
34     /*
35     * Maximum allowable length of a
36     * deletion. Used mainly in segment_juncs.cpp
37     * and long_spanning_reads.cpp
38     */
39     extern unsigned int max_deletion_length;
40    
41     extern int inner_dist_mean;
42     extern int inner_dist_std_dev;
43     extern int max_mate_inner_dist;
44    
45     extern int min_anchor_len;
46     extern int min_report_intron_length;
47     extern int max_report_intron_length;
48    
49     extern int min_closure_intron_length;
50     extern int max_closure_intron_length;
51    
52     extern int min_coverage_intron_length;
53     extern int max_coverage_intron_length;
54    
55     extern int min_segment_intron_length;
56     extern int max_segment_intron_length;
57    
58     extern uint32_t min_closure_exon_length;
59     extern int island_extension;
60     extern int num_cpus;
61     extern int segment_length; // the read segment length used by the pipeline
62     extern int segment_mismatches;
63    
64     extern int max_splice_mismatches;
65    
66     enum ReadFormat {FASTA, FASTQ};
67     extern ReadFormat reads_format;
68    
69     extern bool verbose;
70     extern int max_multihits;
71     extern bool no_closure_search;
72     extern bool no_coverage_search;
73     extern bool no_microexon_search;
74     extern bool butterfly_search;
75    
76     extern float min_isoform_fraction;
77    
78     extern std::string output_dir;
79     extern std::string gff_file;
80     extern std::string gene_filter;
81    
82     extern std::string ium_reads;
83     extern std::string sam_header;
84     extern std::string sam_readgroup_id;
85     extern std::string zpacker; //path to program to use for de/compression (gzip, pigz, bzip2, pbzip2)
86     extern std::string samtools_path; //path to samtools executable
87     extern std::string aux_outfile; //auxiliary output file name
88     extern bool solexa_quals;
89     extern bool phred64_quals;
90     extern bool quals;
91     extern bool integer_quals;
92     extern bool color;
93     extern bool color_out;
94    
95     extern std::string gtf_juncs;
96    
97     enum eLIBRARY_TYPE
98     {
99     LIBRARY_TYPE_NONE = 0,
100    
101     FR_UNSTRANDED,
102     FR_FIRSTSTRAND,
103     FR_SECONDSTRAND,
104    
105     FF_UNSTRANDED,
106     FF_FIRSTSTRAND,
107     FF_SECONDSTRAND,
108    
109     NUM_LIBRARY_TYPE
110     };
111    
112     extern eLIBRARY_TYPE library_type;
113     std::string getFext(const std::string& s); //returns file extension converted to lowercase
114     bool str_endsWith(std::string& str, const char* suffix);
115     void str_appendInt(std::string& str, int v);
116     FILE* fzOpen(std::string& fname, const char* mode);
117    
118     int parseIntOpt(int lower, const char *errmsg, void (*print_usage)());
119     int parse_options(int argc, char** argv, void (*print_usage)());
120    
121     void err_exit(const char* format,...); // exit with an error
122    
123     char* get_token(char** str, const char* delims);
124     std::string guess_packer(const std::string& fname, bool use_all_cpus);
125     std::string getUnpackCmd(const std::string& fname, bool use_all_cpus=false);
126    
127     void checkSamHeader();
128     void writeSamHeader(FILE* fout);
129    
130     struct FZPipe {
131     FILE* file;
132     std::string filename;
133     std::string pipecmd;
134     FZPipe():filename(),pipecmd() {
135     file=NULL;
136     }
137     FZPipe(std::string& fname, std::string& pcmd):filename(fname),pipecmd(pcmd) {
138     //open as a compressed file reader
139     file=NULL;
140     this->openRead(fname.c_str(), pipecmd);
141     }
142     void close() {
143     if (file!=NULL) {
144     if (pipecmd.empty()) fclose(file);
145     else pclose(file);
146     file=NULL;
147     }
148     }
149     FILE* openWrite(const char* fname, std::string& popencmd);
150     FILE* openWrite(const char* fname);
151     FILE* openRead(const char* fname, std::string& popencmd);
152    
153     FILE* openRead(const char* fname);
154     FILE* openRead(const std::string fname, std::string& popencmd) {
155     return this->openRead(fname.c_str(),popencmd);
156     }
157     FILE* openRead(const std::string fname) {
158     return this->openRead(fname.c_str());
159     }
160     void rewind();
161     };
162    
163     void err_die(const char* format,...);
164    
165     //uint8_t* realloc_bdata(bam1_t *b, int size);
166     //uint8_t* dupalloc_bdata(bam1_t *b, int size);
167    
168     class GBamRecord {
169     bam1_t* b;
170     // b->data has the following strings concatenated:
171     // qname (including the terminal \0)
172     // +cigar (each event encoded on 32 bits)
173     // +seq (4bit-encoded)
174     // +qual
175     // +aux
176     bool novel;
177     public:
178     GBamRecord(bam1_t* from_b=NULL) {
179     if (from_b==NULL) {
180     b=bam_init1();
181     novel=true;
182     }
183     else {
184     b=from_b;
185     novel=false;
186     }
187     }
188    
189     void clear() {
190     if (novel) {
191     bam_destroy1(b);
192     }
193     novel=true;
194     b=bam_init1();
195     }
196    
197     ~GBamRecord() {
198     if (novel) { bam_destroy1(b); }
199     }
200    
201     void parse_error(const char* s) {
202     err_die("BAM parsing error: %s\n", s);
203     }
204    
205     bam1_t* get_b() { return b; }
206    
207     void set_mdata(int32_t mtid, int32_t m0pos, //0-based coordinate, -1 if not available
208     int32_t isize=0) { //mate info for current record
209     b->core.mtid=mtid;
210     b->core.mpos=m0pos; // should be -1 if '*'
211     b->core.isize=isize; //should be 0 if not available
212     }
213    
214     void set_flags(uint16_t flags) {
215     b->core.flag=flags;
216     }
217    
218     //creates a new record from 1-based alignment coordinate
219     //quals should be given as Phred33
220     //Warning: pos and mate_pos must be given 1-based!
221     GBamRecord(const char* qname, int32_t gseq_tid,
222     int pos, bool reverse, const char* qseq, const char* cigar=NULL, const char* quals=NULL);
223     GBamRecord(const char* qname, int32_t flags, int32_t g_tid,
224     int pos, int map_qual, const char* cigar, int32_t mg_tid, int mate_pos,
225     int insert_size, const char* qseq, const char* quals=NULL,
226     const std::vector<std::string>* aux_strings=NULL);
227     void set_cigar(const char* cigar); //converts and adds CIGAR string given in plain SAM text format
228     void add_sequence(const char* qseq, int slen=-1); //adds the DNA sequence given in plain text format
229     void add_quals(const char* quals); //quality values string in Phred33 format
230     void add_aux(const char* str); //adds one aux field in plain SAM text format (e.g. "NM:i:1")
231     void add_aux(const char tag[2], char atype, int len, uint8_t *data) {
232     int ori_len = b->data_len;
233     b->data_len += 3 + len;
234     b->l_aux += 3 + len;
235     if (b->m_data < b->data_len) {
236     b->m_data = b->data_len;
237     kroundup32(b->m_data);
238     b->data = (uint8_t*)realloc(b->data, b->m_data);
239     }
240     b->data[ori_len] = tag[0]; b->data[ori_len + 1] = tag[1];
241     b->data[ori_len + 2] = atype;
242     memcpy(b->data + ori_len + 3, data, len);
243     }
244     };
245    
246     class GBamWriter {
247     samfile_t* bam_file;
248     bam_header_t* bam_header;
249     public:
250     void create(const char* fname, bool uncompressed=false) {
251     if (bam_header==NULL)
252     err_die("Error: no bam_header for GBamWriter::create()!\n");
253     if (uncompressed) {
254     bam_file=samopen(fname, "wbu", bam_header);
255     }
256     else {
257     bam_file=samopen(fname, "wb", bam_header);
258     }
259     if (bam_file==NULL)
260     err_die("Error: could not create BAM file %s!\n",fname);
261     //do we need to call bam_header_write() ?
262     }
263    
264     GBamWriter(const char* fname, bam_header_t* bh, bool uncompressed=false) {
265     bam_header=bh;
266     create(fname, uncompressed);
267     }
268    
269     GBamWriter(const char* fname, const char* samfname, bool uncompressed=false) {
270     tamFile samf_in=sam_open(samfname);
271     if (samf_in==NULL)
272     err_die("Error: could not open SAM file %s\n", samfname);
273     bam_header=sam_header_read(samf_in);
274     if (bam_header==NULL)
275     err_die("Error: could not read SAM header from %s!\n",samfname);
276     sam_close(samf_in);
277     create(fname, uncompressed);
278     }
279    
280     ~GBamWriter() {
281     samclose(bam_file);
282     bam_header_destroy(bam_header);
283     }
284     bam_header_t* get_header() { return bam_header; }
285     int32_t get_tid(const char *seq_name) {
286     if (bam_header==NULL)
287     err_die("Error: missing SAM header (get_tid())\n");
288     return bam_get_tid(bam_header, seq_name);
289     }
290    
291     //just a convenience function for creating a new record, but it's NOT written
292     //given pos must be 1-based (so it'll be stored as pos-1 because BAM is 0-based)
293     GBamRecord* new_record(const char* qname, const char* gseqname,
294     int pos, bool reverse, const char* qseq, const char* cigar=NULL, const char* qual=NULL) {
295     int32_t gseq_tid=get_tid(gseqname);
296     if (gseq_tid < 0 && strcmp(gseqname, "*")) {
297     if (bam_header->n_targets == 0) {
298     err_die("Error: missing/invalid SAM header\n");
299     } else
300     fprintf(stderr, "Warning: reference '%s' not found in header, will consider it '*'.\n",
301     gseqname);
302     }
303    
304     return (new GBamRecord(qname, gseq_tid, pos, reverse, qseq, cigar, qual));
305     }
306    
307     GBamRecord* new_record(const char* qname, int32_t flags, const char* gseqname,
308     int pos, int map_qual, const char* cigar, const char* mgseqname, int mate_pos,
309     int insert_size, const char* qseq, const char* quals=NULL,
310     const std::vector<std::string>* aux_strings=NULL) {
311     int32_t gseq_tid=get_tid(gseqname);
312     if (gseq_tid < 0 && strcmp(gseqname, "*")) {
313     if (bam_header->n_targets == 0) {
314     err_die("Error: missing/invalid SAM header\n");
315     } else
316     fprintf(stderr, "Warning: reference '%s' not found in header, will consider it '*'.\n",
317     gseqname);
318     }
319     int32_t mgseq_tid=-1;
320     if (mgseqname!=NULL) {
321     if (strcmp(mgseqname, "=")==0) {
322     mgseq_tid=gseq_tid;
323     }
324     else {
325     mgseq_tid=get_tid(mgseqname);
326     if (mgseq_tid < 0 && strcmp(mgseqname, "*")) {
327     fprintf(stderr, "Warning: reference '%s' not found in header, will consider it '*'.\n",
328     mgseqname);
329     }
330     }
331     }
332     return (new GBamRecord(qname, flags, gseq_tid, pos, map_qual, cigar,
333     mgseq_tid, mate_pos, insert_size, qseq, quals, aux_strings));
334     }
335    
336     void write(GBamRecord* brec) {
337     if (brec!=NULL)
338     samwrite(this->bam_file,brec->get_b());
339     }
340     };
341    
342    
343     #endif