ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/juncs_db.cpp
Revision: 154
Committed: Tue Jan 24 02:29:21 2012 UTC (12 years, 7 months ago) by gpertea
File size: 18572 byte(s)
Log Message:
massive update with Daehwan's work

Line File contents
1 /*
2 * juncs_db.cpp
3 * TopHat
4 *
5 * Created by Cole Trapnell on 12/12/08.
6 * Copyright 2008 Cole Trapnell. All rights reserved.
7 *
8 */
9
10 #ifdef HAVE_CONFIG_H
11 #include <config.h>
12 #endif
13
14
15 #include <cassert>
16 #include <cstdio>
17 #include <cstring>
18 #include <vector>
19 #include <string>
20 #include <map>
21 #include <algorithm>
22 #include <set>
23 #include <stdexcept>
24 #include <iostream>
25 #include <fstream>
26 #include <seqan/sequence.h>
27 #include <seqan/find.h>
28 #include <seqan/file.h>
29 #include <seqan/modifier.h>
30 #include <getopt.h>
31
32 #include "common.h"
33 #include "bwt_map.h"
34 #include "tokenize.h"
35 #include "junctions.h"
36 #include "insertions.h"
37 #include "deletions.h"
38 #include "fusions.h"
39
40
41 using namespace std;
42 using namespace seqan;
43 using std::set;
44
45
46 // Length of the outer dimension of a single insert from the paired-end library
47
48 static int read_length = -1;
49 void print_usage()
50 {
51 fprintf(stderr, "Usage: juncs_db <min_anchor> <read_length> <splice_coords1,...,splice_coordsN> <insertion_coords1,...,insertion_coordsN> <deletion_coords1,...,deletion_coordsN> <ref.fa>\n");
52 }
53
54 typedef vector<string> Mapped;
55
56 bool splice_junc_lt(const pair<size_t, size_t>& lhs,
57 const pair<size_t, size_t>& rhs)
58 {
59 if (lhs.first < rhs.first)
60 return true;
61 else
62 return lhs.second < rhs.second;
63 }
64
65 /**
66 * Given an insertion set, this code will print FASTA entries
67 * for the surrounding sequence. The names of the FASTA entries
68 * contain information about the exact location and nature of the
69 * insertion. The entry is generally of the form
70 * <contig>|<left end of fasta sequence>|<position of insertion>-<sequence of insertion>|<right end of fasta sequence>|ins|<[fwd|rev]>
71 */
72 template<typename TStr>
73 void print_insertion(const Insertion& insertion,
74 int read_len,
75 TStr& ref_str,
76 const string& ref_name,
77 ostream& splice_db)
78 {
79 int half_splice_len = read_len - min_anchor_len;
80
81 size_t left_start, right_start;
82 size_t left_end, right_end;
83 if (insertion.left >= 0 && insertion.left <= length(ref_str))
84 {
85 left_start = (int)insertion.left - half_splice_len + 1 >= 0 ? (int)insertion.left - half_splice_len + 1 : 0;
86 left_end = left_start + half_splice_len;
87
88 right_start = (int)left_end;
89 right_end = right_start + half_splice_len < length(ref_str) ? right_start + half_splice_len : length(ref_str);
90
91 Infix<RefSequenceTable::Sequence>::Type left_splice = infix(ref_str, left_start, left_end);
92 Infix<RefSequenceTable::Sequence>::Type right_splice = infix(ref_str, right_start, right_end);
93
94 splice_db << ">" << ref_name << "|" << left_start << "|" << insertion.left << "-" << insertion.sequence
95 << "|" << right_end << "|ins|" << ("fwd") << endl;
96
97 splice_db << left_splice << insertion.sequence << right_splice << endl;
98 }
99 }
100
101
102 template<typename TStr>
103 void print_splice(const Junction& junction,
104 int read_len,
105 const string& tag,
106 TStr& ref_str,
107 const string& ref_name,
108 ostream& splice_db)
109 {
110 // daehwan - this is tentative, let's think about this more :)
111 // int half_splice_len = read_len - min_anchor_len;
112 int half_splice_len = read_len;
113 size_t left_start, right_start;
114 size_t left_end, right_end;
115
116 if (junction.left >= 0 && junction.left <= length(ref_str) &&
117 junction.right >= 0 && junction.right <= length(ref_str))
118 {
119 left_start = (int)junction.left - half_splice_len + 1 >= 0 ? (int)junction.left - half_splice_len + 1 : 0;
120 left_end = left_start + half_splice_len;
121
122 right_start = junction.right;
123 right_end = right_start + half_splice_len < length(ref_str) ? right_start + half_splice_len : length(ref_str) - right_start;
124
125 Infix<RefSequenceTable::Sequence>::Type left_splice = infix(ref_str,
126 left_start,
127 left_end);
128 Infix<RefSequenceTable::Sequence>::Type right_splice = infix(ref_str,
129 right_start,
130 right_end);
131
132 splice_db << ">" << ref_name << "|" << left_start << "|" << junction.left <<
133 "-" << junction.right << "|" << right_end << "|" << tag << endl;
134
135 splice_db << left_splice << right_splice << endl;
136 }
137 }
138
139 template<typename TStr>
140 void print_fusion(const Fusion& fusion,
141 int read_len,
142 TStr& left_ref_str,
143 TStr& right_ref_str,
144 const char* left_ref_name,
145 const char* right_ref_name,
146 ostream& fusion_db)
147 {
148 int half_splice_len = read_len - min_anchor_len;
149
150 size_t left_start, right_start;
151 size_t left_end, right_end;
152
153 if (fusion.left >= 0 && fusion.left < length(left_ref_str) &&
154 fusion.right >= 0 && fusion.right < length(right_ref_str))
155 {
156 if (fusion.dir == FUSION_FF || fusion.dir == FUSION_FR)
157 {
158 left_start = fusion.left + 1 >= half_splice_len ? fusion.left - half_splice_len + 1 : 0;
159 left_end = left_start + half_splice_len;
160 }
161 else
162 {
163 left_start = fusion.left;
164 left_end = left_start + half_splice_len < length(left_ref_str) ? left_start + half_splice_len : length(left_ref_str);
165 }
166
167 if (fusion.dir == FUSION_FF || fusion.dir == FUSION_RF)
168 {
169 right_start = fusion.right;
170 right_end = right_start + half_splice_len < length(right_ref_str) ? right_start + half_splice_len : length(right_ref_str);
171 }
172 else
173 {
174 right_end = fusion.right + 1;
175 right_start = right_end >= half_splice_len ? right_end - half_splice_len : 0;
176 }
177
178 seqan::Dna5String left_splice = infix(left_ref_str, left_start, left_end);
179 seqan::Dna5String right_splice = infix(right_ref_str, right_start, right_end);
180
181 if (fusion.dir == FUSION_RF || fusion.dir == FUSION_RR)
182 {
183 seqan::convertInPlace(left_splice, seqan::FunctorComplement<Dna>());
184 seqan::reverseInPlace(left_splice);
185
186 left_start = left_end - 1;
187 }
188
189 if (fusion.dir == FUSION_FR || fusion.dir == FUSION_RR)
190 {
191 seqan::convertInPlace(right_splice, seqan::FunctorComplement<Dna>());
192 seqan::reverseInPlace(right_splice);
193
194 right_end = right_start - 1;
195 }
196
197 const char* dir = "ff";
198 if (fusion.dir == FUSION_FR)
199 dir = "fr";
200 else if (fusion.dir == FUSION_RF)
201 dir = "rf";
202 else if (fusion.dir == FUSION_RR)
203 dir = "rr";
204
205 fusion_db << ">" << left_ref_name << "-" << right_ref_name << "|"
206 << left_start << "|"
207 << fusion.left << "-" << fusion.right << "|"
208 << right_end << "|fus|" << dir << endl;
209
210 fusion_db << left_splice << right_splice << endl;
211 }
212 }
213
214
215 /**
216 * Parse an int out of optarg and enforce that it be at least 'lower';
217 * if it is less than 'lower', than output the given error message and
218 * exit with an error and a usage message.
219 */
220 static int parse_oInt(int lower, char* arg, const char *errmsg) {
221 long l;
222 char *endPtr= NULL;
223 l = strtol(arg, &endPtr, 10);
224 if (endPtr != NULL) {
225 if (l < lower) {
226 cerr << errmsg << endl;
227 print_usage();
228 exit(1);
229 }
230 return (int32_t)l;
231 }
232 cerr << errmsg << endl;
233 print_usage();
234 exit(1);
235 return -1;
236 }
237 //
238 //int parse_options(int argc, char** argv)
239 //{
240 // int option_index = 0;
241 // int next_option;
242 // do {
243 // next_option = getopt_long(argc, argv, short_options, long_options, &option_index);
244 // switch (next_option) {
245 // case 'v':
246 // verbose = true;
247 // break;
248 // case -1: /* Done with options. */
249 // break;
250 // default:
251 // print_usage();
252 // return 1;
253 // }
254 // } while(next_option != -1);
255 //
256 // return 0;
257 //}
258
259 void get_seqs(istream& ref_stream,
260 RefSequenceTable& rt,
261 bool keep_seqs = true)
262 {
263 while(ref_stream.good() &&
264 !ref_stream.eof())
265 {
266 RefSequenceTable::Sequence* ref_str = new RefSequenceTable::Sequence();
267 string name;
268 readMeta(ref_stream, name, Fasta());
269 string::size_type space_pos = name.find_first_of(" \t\r");
270 if (space_pos != string::npos)
271 {
272 name.resize(space_pos);
273 }
274 fprintf(stderr, "\tLoading %s...", name.c_str());
275 seqan::read(ref_stream, *ref_str, Fasta());
276 fprintf(stderr, "done\n");
277 rt.get_id(name, keep_seqs ? ref_str : NULL, 0);
278 if (!keep_seqs)
279 delete ref_str;
280 }
281 }
282
283 void driver(const vector<FILE*>& splice_coords_files,
284 const vector<FILE*>& insertion_coords_files,
285 const vector<FILE*>& deletion_coords_files,
286 const vector<FILE*>& fusion_coords_files,
287 ifstream& ref_stream)
288 {
289 char splice_buf[2048];
290 RefSequenceTable rt(sam_header, true);
291 get_seqs(ref_stream, rt, true);
292
293 JunctionSet junctions;
294 for (size_t i = 0; i < splice_coords_files.size(); ++i)
295 {
296 FILE* splice_coords = splice_coords_files[i];
297 if (!splice_coords)
298 continue;
299 while (fgets(splice_buf, 2048, splice_coords))
300 {
301 char* nl = strrchr(splice_buf, '\n');
302 char* buf = splice_buf;
303 if (nl) *nl = 0;
304
305 /**
306 Fields are:
307 1) reference name
308 2) left coord of splice (last char of the left exon)
309 3) right coord of splice (first char of the right exon)
310 */
311
312 char* ref_name = get_token((char**)&buf, "\t");
313 char* scan_left_coord = get_token((char**)&buf, "\t");
314 char* scan_right_coord = get_token((char**)&buf, "\t");
315 char* orientation = get_token((char**)&buf, "\t");
316
317 if (!scan_left_coord || !scan_right_coord || !orientation)
318 {
319 fprintf(stderr,"Error: malformed splice coordinate record\n");
320 exit(1);
321 }
322 uint32_t ref_id = rt.get_id(ref_name, NULL, 0);
323 uint32_t left_coord = atoi(scan_left_coord);
324 uint32_t right_coord = atoi(scan_right_coord);
325 bool antisense = *orientation == '-';
326 junctions.insert(make_pair<Junction, JunctionStats>(Junction(ref_id, left_coord, right_coord, antisense), JunctionStats()));
327 }
328 }
329
330
331 /*
332 * Read in the deletion coordinates
333 * and store in a set
334 */
335 std::set<Deletion> deletions;
336 for(size_t i=0; i < deletion_coords_files.size(); ++i){
337 FILE* deletion_coords = deletion_coords_files[i];
338 if(!deletion_coords){
339 continue;
340 }
341 while (fgets(splice_buf, 2048, deletion_coords))
342 {
343 char* nl = strrchr(splice_buf, '\n');
344 char* buf = splice_buf;
345 if (nl) *nl = 0;
346
347 /**
348 Fields are:
349 1) reference name
350 2) left coord of splice (last char of the left exon)
351 3) right coord of splice (first char of the right exon)
352 */
353
354 char* ref_name = get_token((char**)&buf, "\t");
355 char* scan_left_coord = get_token((char**)&buf, "\t");
356 char* scan_right_coord = get_token((char**)&buf, "\t");
357
358 if (!scan_left_coord || !scan_right_coord)
359 {
360 fprintf(stderr,"Error: malformed deletion coordinate record\n");
361 exit(1);
362 }
363
364 /*
365 * Note that when reading in a deletion, the left co-ord is the position of the
366 * first deleted based. Since we are co-opting the junction data structure, need
367 * to fix up this location
368 */
369 uint32_t ref_id = rt.get_id(ref_name, NULL, 0);
370 uint32_t left_coord = atoi(scan_left_coord);
371 uint32_t right_coord = atoi(scan_right_coord);
372 deletions.insert(Deletion(ref_id, left_coord - 1, right_coord, false));
373 }
374 }
375
376 /*
377 * Read in the insertion coordinates
378 * and store in a set
379 */
380 std::set<Insertion> insertions;
381 for(size_t i=0; i < insertion_coords_files.size(); ++i){
382 FILE* insertion_coords = insertion_coords_files[i];
383 if(!insertion_coords){
384 continue;
385 }
386 while(fgets(splice_buf, 2048, insertion_coords)){
387 char* nl = strrchr(splice_buf, '\n');
388 char* buf = splice_buf;
389 if (nl) *nl = 0;
390
391 char* ref_name = get_token((char**)&buf, "\t");
392 char* scan_left_coord = get_token((char**)&buf, "\t");
393 char* scan_right_coord = get_token((char**)&buf, "\t");
394 char* scan_sequence = get_token((char**)&buf, "\t");
395
396 if (!scan_left_coord || !scan_sequence || !scan_right_coord)
397 {
398 fprintf(stderr,"Error: malformed insertion coordinate record\n");
399 exit(1);
400 }
401
402 seqan::Dna5String sequence = seqan::Dna5String(scan_sequence);
403 bool containsN = false;
404 for(size_t index = 0; index < seqan::length(sequence); index += 1){
405 /*
406 * Don't allow any ambiguities in the insertion
407 */
408 if(sequence[index] == 'N'){
409 containsN = true;
410 break;
411 }
412 }
413 if(containsN){
414 continue;
415 }
416 seqan::CharString charSequence = sequence;
417 uint32_t ref_id = rt.get_id(ref_name,NULL,0);
418 uint32_t left_coord = atoi(scan_left_coord);
419 insertions.insert(Insertion(ref_id, left_coord, seqan::toCString(charSequence)));
420 }
421 }
422
423 std::set<Fusion> fusions;
424 for(size_t i=0; i < fusion_coords_files.size(); ++i){
425 FILE* fusion_coords = fusion_coords_files[i];
426 if(!fusion_coords){
427 continue;
428 }
429 while(fgets(splice_buf, 2048, fusion_coords)){
430 char* nl = strrchr(splice_buf, '\n');
431 char* buf = splice_buf;
432 if (nl) *nl = 0;
433
434 char* ref_name1 = strsep((char**)&buf, "\t");
435 char* scan_left_coord = strsep((char**)&buf, "\t");
436 char* ref_name2 = strsep((char**)&buf, "\t");
437 char* scan_right_coord = strsep((char**)&buf, "\t");
438 char* scan_dir = strsep((char**)&buf, "\t");
439
440 if (!ref_name1 || !scan_left_coord || !ref_name2 || !scan_right_coord || !scan_dir)
441 {
442 fprintf(stderr,"Error: malformed insertion coordinate record\n");
443 exit(1);
444 }
445
446 uint32_t ref_id1 = rt.get_id(ref_name1, NULL, 0);
447 uint32_t ref_id2 = rt.get_id(ref_name2, NULL, 0);
448 uint32_t left_coord = atoi(scan_left_coord);
449 uint32_t right_coord = atoi(scan_right_coord);
450 uint32_t dir = FUSION_FF;
451 if (strcmp(scan_dir, "fr") == 0)
452 dir = FUSION_FR;
453 else if(strcmp(scan_dir, "rf") == 0)
454 dir = FUSION_RF;
455 else if(strcmp(scan_dir, "rr") == 0)
456 dir = FUSION_RR;
457
458 fusions.insert(Fusion(ref_id1, ref_id2, left_coord, right_coord, dir));
459 }
460 }
461
462 {
463 JunctionSet::iterator itr = junctions.begin();
464 while(itr != junctions.end())
465 {
466 RefSequenceTable::Sequence* ref_str = rt.get_seq(itr->first.refid);
467 const char* name = rt.get_name(itr->first.refid);
468 print_splice(itr->first, read_length, itr->first.antisense ? "GTAG|rev" : "GTAG|fwd", *ref_str, name, cout);
469 ++itr;
470 }
471 }
472
473 {
474 std::set<Deletion>::iterator itr = deletions.begin();
475 while(itr != deletions.end())
476 {
477 RefSequenceTable::Sequence* ref_str = rt.get_seq(itr->refid);
478 const char* name = rt.get_name(itr->refid);
479 print_splice((Junction)*itr, read_length, itr->antisense ? "del|rev" : "del|fwd", *ref_str, name, cout);
480 ++itr;
481 }
482 }
483
484 {
485 std::set<Insertion>::iterator itr = insertions.begin();
486 while(itr != insertions.end()){
487 RefSequenceTable::Sequence* ref_str = rt.get_seq(itr->refid);
488 const char* name = rt.get_name(itr->refid);
489 print_insertion(*itr, read_length, *ref_str, name, cout);
490 ++itr;
491 }
492 }
493
494 {
495 std::set<Fusion>::iterator itr = fusions.begin();
496 while(itr != fusions.end()){
497 RefSequenceTable::Sequence* left_ref_str = rt.get_seq(itr->refid1);
498 RefSequenceTable::Sequence* right_ref_str = rt.get_seq(itr->refid2);
499 const char* left_ref_name = rt.get_name(itr->refid1);
500 const char* right_ref_name = rt.get_name(itr->refid2);
501 print_fusion(*itr, read_length, *left_ref_str, *right_ref_str, left_ref_name, right_ref_name, cout);
502 ++itr;
503 }
504 }
505 }
506
507 int main(int argc, char** argv)
508 {
509 fprintf(stderr, "juncs_db v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION);
510 fprintf(stderr, "---------------------------\n");
511
512 int parse_ret = parse_options(argc, argv, print_usage);
513 if (parse_ret)
514 return parse_ret;
515
516 if(optind >= argc)
517 {
518 print_usage();
519 return 1;
520 }
521
522 min_anchor_len = parse_oInt(3, argv[optind++], "anchor length must be at least 3");
523
524 if(optind >= argc)
525 {
526 print_usage();
527 return 1;
528 }
529
530 read_length = parse_oInt(4, argv[optind++], "read length must be at least 4");
531
532 if(optind >= argc)
533 {
534 print_usage();
535 return 1;
536 }
537
538 string splice_coords_file_list = argv[optind++];
539 vector<string> splice_coords_file_names;
540 vector<FILE*> coords_files;
541 tokenize(splice_coords_file_list, ",", splice_coords_file_names);
542 for (size_t s = 0; s < splice_coords_file_names.size(); ++s)
543 {
544
545 FILE* coords_file = fopen(splice_coords_file_names[s].c_str(), "r");
546
547 if (!coords_file)
548 {
549 fprintf(stderr, "Warning: cannot open %s for reading\n",
550 splice_coords_file_names[s].c_str());
551 continue;
552 }
553 coords_files.push_back(coords_file);
554 }
555 if(optind >= argc)
556 {
557 print_usage();
558 return 1;
559 }
560
561 /*
562 * Read in the insertion co-ordinates
563 */
564 string insertion_coords_file_list = argv[optind++];
565 vector<string> insertion_coords_file_names;
566 vector<FILE*> insertion_coords_files;
567 tokenize(insertion_coords_file_list, ",", insertion_coords_file_names);
568 for(size_t s = 0; s < insertion_coords_file_names.size(); ++s)
569 {
570 FILE* insertion_coords_file = fopen(insertion_coords_file_names[s].c_str(),"r");
571 if(!insertion_coords_file)
572 {
573 fprintf(stderr, "Warning: cannot open %s for reading\n",
574 insertion_coords_file_names[s].c_str());
575 continue;
576 }
577 insertion_coords_files.push_back(insertion_coords_file);
578 }
579 if(optind >= argc)
580 {
581 print_usage();
582 return 1;
583 }
584
585
586 /*
587 * Read in the deletion co-ordinates
588 */
589 string deletion_coords_file_list = argv[optind++];
590 vector<string> deletion_coords_file_names;
591 vector<FILE*> deletion_coords_files;
592 tokenize(deletion_coords_file_list, ",", deletion_coords_file_names);
593 for(size_t s = 0; s < deletion_coords_file_names.size(); ++s)
594 {
595 FILE* deletion_coords_file = fopen(deletion_coords_file_names[s].c_str(),"r");
596 if(!deletion_coords_file)
597 {
598 fprintf(stderr, "Warning: cannot open %s for reading\n",
599 deletion_coords_file_names[s].c_str());
600 continue;
601 }
602 deletion_coords_files.push_back(deletion_coords_file);
603 }
604 if(optind >= argc)
605 {
606 print_usage();
607 return 1;
608 }
609
610
611 /*
612 */
613 string fusion_coords_file_list = argv[optind++];
614 vector<string> fusion_coords_file_names;
615 vector<FILE*> fusion_coords_files;
616 tokenize(fusion_coords_file_list, ",", fusion_coords_file_names);
617 for(size_t s = 0; s < fusion_coords_file_names.size(); ++s)
618 {
619 FILE* fusion_coords_file = fopen(fusion_coords_file_names[s].c_str(),"r");
620 if(!fusion_coords_file)
621 {
622 fprintf(stderr, "Warning: cannot open %s for reading\n",
623 fusion_coords_file_names[s].c_str());
624 continue;
625 }
626 fusion_coords_files.push_back(fusion_coords_file);
627 }
628 if(optind >= argc)
629 {
630 print_usage();
631 return 1;
632 }
633
634
635 string ref_file_name = argv[optind++];
636 ifstream ref_stream(ref_file_name.c_str());
637
638 if (!ref_stream.good())
639 {
640 fprintf(stderr, "Error: cannot open %s for reading\n",
641 ref_file_name.c_str());
642 exit(1);
643 }
644
645 driver(coords_files, insertion_coords_files, deletion_coords_files, fusion_coords_files, ref_stream);
646 return 0;
647 }