ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/juncs_db.cpp
Revision: 245
Committed: Wed May 16 21:51:01 2012 UTC (12 years, 3 months ago) by gpertea
File size: 19058 byte(s)
Log Message:
merge-sync attempt

Line File contents
1 /*
2 * juncs_db.cpp
3 * TopHat
4 *
5 * Created by Cole Trapnell on 12/12/08.
6 * Copyright 2008 Cole Trapnell. All rights reserved.
7 *
8 */
9
10 #ifdef HAVE_CONFIG_H
11 #include <config.h>
12 #endif
13
14
15 #include <cassert>
16 #include <cstdio>
17 #include <cstring>
18 #include <vector>
19 #include <string>
20 #include <map>
21 #include <algorithm>
22 #include <set>
23 #include <stdexcept>
24 #include <iostream>
25 #include <fstream>
26 #include <seqan/sequence.h>
27 #include <seqan/find.h>
28 #include <seqan/file.h>
29 #include <seqan/modifier.h>
30 #include <getopt.h>
31
32 #include "common.h"
33 #include "bwt_map.h"
34 #include "tokenize.h"
35 #include "junctions.h"
36 #include "insertions.h"
37 #include "deletions.h"
38 #include "fusions.h"
39
40
41 using namespace std;
42 using namespace seqan;
43 using std::set;
44
45
46 // Length of the outer dimension of a single insert from the paired-end library
47
48 static int read_length = -1;
49 void print_usage()
50 {
51 fprintf(stderr, "Usage: juncs_db <min_anchor> <read_length> <splice_coords1,...,splice_coordsN> <insertion_coords1,...,insertion_coordsN> <deletion_coords1,...,deletion_coordsN> <ref.fa>\n");
52 }
53
54 typedef vector<string> Mapped;
55
56 bool splice_junc_lt(const pair<size_t, size_t>& lhs,
57 const pair<size_t, size_t>& rhs)
58 {
59 if (lhs.first < rhs.first)
60 return true;
61 else
62 return lhs.second < rhs.second;
63 }
64
65 /**
66 * Given an insertion set, this code will print FASTA entries
67 * for the surrounding sequence. The names of the FASTA entries
68 * contain information about the exact location and nature of the
69 * insertion. The entry is generally of the form
70 * <contig>|<left end of fasta sequence>|<position of insertion>-<sequence of insertion>|<right end of fasta sequence>|ins|<[fwd|rev]>
71 */
72 template<typename TStr>
73 void print_insertion(const Insertion& insertion,
74 int read_len,
75 TStr& ref_str,
76 const string& ref_name,
77 ostream& splice_db)
78 {
79 int half_splice_len = read_len - min_anchor_len;
80
81 size_t left_start, right_start;
82 size_t left_end, right_end;
83
84 size_t ref_len = length(ref_str);
85 if (insertion.left >= 0 && insertion.left <= ref_len)
86 {
87 left_start = (int)insertion.left - half_splice_len + 1 >= 0 ? (int)insertion.left - half_splice_len + 1 : 0;
88 left_end = left_start + half_splice_len;
89
90 right_start = (int)left_end;
91 right_end = right_start + half_splice_len < ref_len ? right_start + half_splice_len : ref_len;
92
93 if (left_start < left_end && left_end <= ref_len &&
94 right_start < right_end && right_end <= ref_len)
95 {
96 Infix<RefSequenceTable::Sequence>::Type left_splice = infix(ref_str, left_start, left_end);
97 Infix<RefSequenceTable::Sequence>::Type right_splice = infix(ref_str, right_start, right_end);
98
99 splice_db << ">" << ref_name << "|" << left_start << "|" << insertion.left << "-" << insertion.sequence
100 << "|" << right_end << "|ins|" << ("fwd") << endl;
101
102 splice_db << left_splice << insertion.sequence << right_splice << endl;
103 }
104 }
105 }
106
107
108 template<typename TStr>
109 void print_splice(const Junction& junction,
110 int read_len,
111 const string& tag,
112 TStr& ref_str,
113 const string& ref_name,
114 ostream& splice_db)
115 {
116 // daehwan - this is tentative, let's think about this more :)
117 // int half_splice_len = read_len - min_anchor_len;
118 int half_splice_len = read_len;
119 size_t left_start, right_start;
120 size_t left_end, right_end;
121
122 size_t ref_len = length(ref_str);
123
124 if (junction.left >= 0 && junction.left <= ref_len &&
125 junction.right >= 0 && junction.right <= ref_len)
126 {
127 left_start = (int)junction.left - half_splice_len + 1 >= 0 ? (int)junction.left - half_splice_len + 1 : 0;
128 left_end = left_start + half_splice_len;
129
130 right_start = junction.right;
131 right_end = right_start + half_splice_len < ref_len ? right_start + half_splice_len : ref_len;
132
133 if (left_start < left_end && left_end <= ref_len &&
134 right_start < right_end && right_end <= ref_len)
135 {
136 Infix<RefSequenceTable::Sequence>::Type left_splice = infix(ref_str,
137 left_start,
138 left_end);
139 Infix<RefSequenceTable::Sequence>::Type right_splice = infix(ref_str,
140 right_start,
141 right_end);
142
143 splice_db << ">" << ref_name << "|" << left_start << "|" << junction.left <<
144 "-" << junction.right << "|" << right_end << "|" << tag << endl;
145
146 splice_db << left_splice << right_splice << endl;
147 }
148 }
149 }
150
151 template<typename TStr>
152 void print_fusion(const Fusion& fusion,
153 int read_len,
154 TStr& left_ref_str,
155 TStr& right_ref_str,
156 const char* left_ref_name,
157 const char* right_ref_name,
158 ostream& fusion_db)
159 {
160 int half_splice_len = read_len - min_anchor_len;
161
162 size_t left_start, right_start;
163 size_t left_end, right_end;
164
165 size_t left_ref_len = length(left_ref_str);
166 size_t right_ref_len = length(right_ref_str);
167
168 if (fusion.left >= 0 && fusion.left < left_ref_len &&
169 fusion.right >= 0 && fusion.right < right_ref_len)
170 {
171 if (fusion.dir == FUSION_FF || fusion.dir == FUSION_FR)
172 {
173 left_start = fusion.left + 1 >= (size_t)half_splice_len ? fusion.left - half_splice_len + 1 : 0;
174 left_end = left_start + half_splice_len;
175 }
176 else
177 {
178 left_start = fusion.left;
179 left_end = left_start + half_splice_len < left_ref_len ? left_start + half_splice_len : left_ref_len;
180 }
181
182 if (fusion.dir == FUSION_FF || fusion.dir == FUSION_RF)
183 {
184 right_start = fusion.right;
185 right_end = right_start + half_splice_len < right_ref_len ? right_start + half_splice_len : right_ref_len;
186 }
187 else
188 {
189 right_end = fusion.right + 1;
190 right_start = right_end >= (size_t)half_splice_len ? right_end - half_splice_len : 0;
191 }
192
193 if (left_start < left_end && left_end <= left_ref_len &&
194 right_start < right_end && right_end <= right_ref_len)
195 {
196 seqan::Dna5String left_splice = infix(left_ref_str, left_start, left_end);
197 seqan::Dna5String right_splice = infix(right_ref_str, right_start, right_end);
198
199 if (fusion.dir == FUSION_RF || fusion.dir == FUSION_RR)
200 {
201 seqan::reverseComplement(left_splice);
202 left_start = left_end - 1;
203 }
204
205 if (fusion.dir == FUSION_FR || fusion.dir == FUSION_RR)
206 {
207 seqan::reverseComplement(right_splice);
208 right_end = right_start - 1;
209 }
210
211 const char* dir = "ff";
212 if (fusion.dir == FUSION_FR)
213 dir = "fr";
214 else if (fusion.dir == FUSION_RF)
215 dir = "rf";
216 else if (fusion.dir == FUSION_RR)
217 dir = "rr";
218
219 fusion_db << ">" << left_ref_name << "-" << right_ref_name << "|"
220 << left_start << "|"
221 << fusion.left << "-" << fusion.right << "|"
222 << right_end << "|fus|" << dir << endl;
223
224 fusion_db << left_splice << right_splice << endl;
225 }
226 }
227 }
228
229
230 /**
231 * Parse an int out of optarg and enforce that it be at least 'lower';
232 * if it is less than 'lower', than output the given error message and
233 * exit with an error and a usage message.
234 */
235 static int parse_oInt(int lower, char* arg, const char *errmsg) {
236 long l;
237 char *endPtr= NULL;
238 l = strtol(arg, &endPtr, 10);
239 if (endPtr != NULL) {
240 if (l < lower) {
241 cerr << errmsg << endl;
242 print_usage();
243 exit(1);
244 }
245 return (int32_t)l;
246 }
247 cerr << errmsg << endl;
248 print_usage();
249 exit(1);
250 return -1;
251 }
252 //
253 //int parse_options(int argc, char** argv)
254 //{
255 // int option_index = 0;
256 // int next_option;
257 // do {
258 // next_option = getopt_long(argc, argv, short_options, long_options, &option_index);
259 // switch (next_option) {
260 // case 'v':
261 // verbose = true;
262 // break;
263 // case -1: /* Done with options. */
264 // break;
265 // default:
266 // print_usage();
267 // return 1;
268 // }
269 // } while(next_option != -1);
270 //
271 // return 0;
272 //}
273
274 void get_seqs(istream& ref_stream,
275 RefSequenceTable& rt,
276 bool keep_seqs = true)
277 {
278 while(ref_stream.good() &&
279 !ref_stream.eof())
280 {
281 RefSequenceTable::Sequence* ref_str = new RefSequenceTable::Sequence();
282 string name;
283 readMeta(ref_stream, name, Fasta());
284 string::size_type space_pos = name.find_first_of(" \t\r");
285 if (space_pos != string::npos)
286 {
287 name.resize(space_pos);
288 }
289 fprintf(stderr, "\tLoading %s...", name.c_str());
290 seqan::read(ref_stream, *ref_str, Fasta());
291 fprintf(stderr, "done\n");
292 rt.get_id(name, keep_seqs ? ref_str : NULL, 0);
293 if (!keep_seqs)
294 delete ref_str;
295 }
296 }
297
298 void driver(const vector<FILE*>& splice_coords_files,
299 const vector<FILE*>& insertion_coords_files,
300 const vector<FILE*>& deletion_coords_files,
301 const vector<FILE*>& fusion_coords_files,
302 ifstream& ref_stream)
303 {
304 char splice_buf[2048];
305 RefSequenceTable rt(sam_header, true);
306 get_seqs(ref_stream, rt, true);
307
308 JunctionSet junctions;
309 for (size_t i = 0; i < splice_coords_files.size(); ++i)
310 {
311 FILE* splice_coords = splice_coords_files[i];
312 if (!splice_coords)
313 continue;
314 while (fgets(splice_buf, 2048, splice_coords))
315 {
316 char* nl = strrchr(splice_buf, '\n');
317 char* buf = splice_buf;
318 if (nl) *nl = 0;
319
320 /**
321 Fields are:
322 1) reference name
323 2) left coord of splice (last char of the left exon)
324 3) right coord of splice (first char of the right exon)
325 */
326
327 char* ref_name = get_token((char**)&buf, "\t");
328 char* scan_left_coord = get_token((char**)&buf, "\t");
329 char* scan_right_coord = get_token((char**)&buf, "\t");
330 char* orientation = get_token((char**)&buf, "\t");
331
332 if (!scan_left_coord || !scan_right_coord || !orientation)
333 {
334 fprintf(stderr,"Error: malformed splice coordinate record\n");
335 exit(1);
336 }
337 uint32_t ref_id = rt.get_id(ref_name, NULL, 0);
338 uint32_t left_coord = atoi(scan_left_coord);
339 uint32_t right_coord = atoi(scan_right_coord);
340 bool antisense = *orientation == '-';
341 junctions.insert(make_pair<Junction, JunctionStats>(Junction(ref_id, left_coord, right_coord, antisense), JunctionStats()));
342 }
343 }
344
345
346 /*
347 * Read in the deletion coordinates
348 * and store in a set
349 */
350 std::set<Deletion> deletions;
351 for(size_t i=0; i < deletion_coords_files.size(); ++i){
352 FILE* deletion_coords = deletion_coords_files[i];
353 if(!deletion_coords){
354 continue;
355 }
356 while (fgets(splice_buf, 2048, deletion_coords))
357 {
358 char* nl = strrchr(splice_buf, '\n');
359 char* buf = splice_buf;
360 if (nl) *nl = 0;
361
362 /**
363 Fields are:
364 1) reference name
365 2) left coord of splice (last char of the left exon)
366 3) right coord of splice (first char of the right exon)
367 */
368
369 char* ref_name = get_token((char**)&buf, "\t");
370 char* scan_left_coord = get_token((char**)&buf, "\t");
371 char* scan_right_coord = get_token((char**)&buf, "\t");
372
373 if (!scan_left_coord || !scan_right_coord)
374 {
375 fprintf(stderr,"Error: malformed deletion coordinate record\n");
376 exit(1);
377 }
378
379 /*
380 * Note that when reading in a deletion, the left co-ord is the position of the
381 * first deleted based. Since we are co-opting the junction data structure, need
382 * to fix up this location
383 */
384 uint32_t ref_id = rt.get_id(ref_name, NULL, 0);
385 uint32_t left_coord = atoi(scan_left_coord);
386 uint32_t right_coord = atoi(scan_right_coord);
387 deletions.insert(Deletion(ref_id, left_coord - 1, right_coord, false));
388 }
389 }
390
391 /*
392 * Read in the insertion coordinates
393 * and store in a set
394 */
395 std::set<Insertion> insertions;
396 for(size_t i=0; i < insertion_coords_files.size(); ++i){
397 FILE* insertion_coords = insertion_coords_files[i];
398 if(!insertion_coords){
399 continue;
400 }
401 while(fgets(splice_buf, 2048, insertion_coords)){
402 char* nl = strrchr(splice_buf, '\n');
403 char* buf = splice_buf;
404 if (nl) *nl = 0;
405
406 char* ref_name = get_token((char**)&buf, "\t");
407 char* scan_left_coord = get_token((char**)&buf, "\t");
408 char* scan_right_coord = get_token((char**)&buf, "\t");
409 char* scan_sequence = get_token((char**)&buf, "\t");
410
411 if (!scan_left_coord || !scan_sequence || !scan_right_coord)
412 {
413 fprintf(stderr,"Error: malformed insertion coordinate record\n");
414 exit(1);
415 }
416
417 seqan::Dna5String sequence = seqan::Dna5String(scan_sequence);
418 bool containsN = false;
419 for(size_t index = 0; index < seqan::length(sequence); index += 1){
420 /*
421 * Don't allow any ambiguities in the insertion
422 */
423 if(sequence[index] == 'N'){
424 containsN = true;
425 break;
426 }
427 }
428 if(containsN){
429 continue;
430 }
431 seqan::CharString charSequence = sequence;
432 uint32_t ref_id = rt.get_id(ref_name,NULL,0);
433 uint32_t left_coord = atoi(scan_left_coord);
434 insertions.insert(Insertion(ref_id, left_coord, seqan::toCString(charSequence)));
435 }
436 }
437
438 std::set<Fusion> fusions;
439 for(size_t i=0; i < fusion_coords_files.size(); ++i){
440 FILE* fusion_coords = fusion_coords_files[i];
441 if(!fusion_coords){
442 continue;
443 }
444 while(fgets(splice_buf, 2048, fusion_coords)){
445 char* nl = strrchr(splice_buf, '\n');
446 char* buf = splice_buf;
447 if (nl) *nl = 0;
448
449 char* ref_name1 = strsep((char**)&buf, "\t");
450 char* scan_left_coord = strsep((char**)&buf, "\t");
451 char* ref_name2 = strsep((char**)&buf, "\t");
452 char* scan_right_coord = strsep((char**)&buf, "\t");
453 char* scan_dir = strsep((char**)&buf, "\t");
454
455 if (!ref_name1 || !scan_left_coord || !ref_name2 || !scan_right_coord || !scan_dir)
456 {
457 fprintf(stderr,"Error: malformed insertion coordinate record\n");
458 exit(1);
459 }
460
461 uint32_t ref_id1 = rt.get_id(ref_name1, NULL, 0);
462 uint32_t ref_id2 = rt.get_id(ref_name2, NULL, 0);
463 uint32_t left_coord = atoi(scan_left_coord);
464 uint32_t right_coord = atoi(scan_right_coord);
465 uint32_t dir = FUSION_FF;
466 if (strcmp(scan_dir, "fr") == 0)
467 dir = FUSION_FR;
468 else if(strcmp(scan_dir, "rf") == 0)
469 dir = FUSION_RF;
470 else if(strcmp(scan_dir, "rr") == 0)
471 dir = FUSION_RR;
472
473 fusions.insert(Fusion(ref_id1, ref_id2, left_coord, right_coord, dir));
474 }
475 }
476
477 {
478 JunctionSet::iterator itr = junctions.begin();
479 for (; itr != junctions.end(); ++itr)
480 {
481 RefSequenceTable::Sequence* ref_str = rt.get_seq(itr->first.refid);
482 if (ref_str == NULL) continue;
483
484 const char* name = rt.get_name(itr->first.refid);
485 print_splice(itr->first, read_length, itr->first.antisense ? "GTAG|rev" : "GTAG|fwd", *ref_str, name, cout);
486 }
487 }
488
489 {
490 std::set<Deletion>::iterator itr = deletions.begin();
491 for (; itr != deletions.end(); ++itr)
492 {
493 RefSequenceTable::Sequence* ref_str = rt.get_seq(itr->refid);
494 if (ref_str == NULL) continue;
495
496 const char* name = rt.get_name(itr->refid);
497 print_splice((Junction)*itr, read_length, itr->antisense ? "del|rev" : "del|fwd", *ref_str, name, cout);
498 }
499 }
500
501 {
502 std::set<Insertion>::iterator itr = insertions.begin();
503 for (; itr != insertions.end(); ++itr){
504 RefSequenceTable::Sequence* ref_str = rt.get_seq(itr->refid);
505 if (ref_str == NULL) continue;
506
507 const char* name = rt.get_name(itr->refid);
508 print_insertion(*itr, read_length, *ref_str, name, cout);
509 }
510 }
511
512 {
513 std::set<Fusion>::iterator itr = fusions.begin();
514 for (; itr != fusions.end(); ++itr){
515 RefSequenceTable::Sequence* left_ref_str = rt.get_seq(itr->refid1);
516 RefSequenceTable::Sequence* right_ref_str = rt.get_seq(itr->refid2);
517
518 if (left_ref_str == NULL || right_ref_str == NULL) continue;
519
520 const char* left_ref_name = rt.get_name(itr->refid1);
521 const char* right_ref_name = rt.get_name(itr->refid2);
522 print_fusion(*itr, read_length, *left_ref_str, *right_ref_str, left_ref_name, right_ref_name, cout);
523 }
524 }
525 }
526
527 int main(int argc, char** argv)
528 {
529 fprintf(stderr, "juncs_db v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION);
530 fprintf(stderr, "---------------------------\n");
531
532 int parse_ret = parse_options(argc, argv, print_usage);
533 if (parse_ret)
534 return parse_ret;
535
536 if(optind >= argc)
537 {
538 print_usage();
539 return 1;
540 }
541
542 min_anchor_len = parse_oInt(3, argv[optind++], "anchor length must be at least 3");
543
544 if(optind >= argc)
545 {
546 print_usage();
547 return 1;
548 }
549
550 read_length = parse_oInt(4, argv[optind++], "read length must be at least 4");
551
552 if(optind >= argc)
553 {
554 print_usage();
555 return 1;
556 }
557
558 string splice_coords_file_list = argv[optind++];
559 vector<string> splice_coords_file_names;
560 vector<FILE*> coords_files;
561 tokenize(splice_coords_file_list, ",", splice_coords_file_names);
562 for (size_t s = 0; s < splice_coords_file_names.size(); ++s)
563 {
564
565 FILE* coords_file = fopen(splice_coords_file_names[s].c_str(), "r");
566
567 if (!coords_file)
568 {
569 fprintf(stderr, "Warning: cannot open %s for reading\n",
570 splice_coords_file_names[s].c_str());
571 continue;
572 }
573 coords_files.push_back(coords_file);
574 }
575 if(optind >= argc)
576 {
577 print_usage();
578 return 1;
579 }
580
581 /*
582 * Read in the insertion co-ordinates
583 */
584 string insertion_coords_file_list = argv[optind++];
585 vector<string> insertion_coords_file_names;
586 vector<FILE*> insertion_coords_files;
587 tokenize(insertion_coords_file_list, ",", insertion_coords_file_names);
588 for(size_t s = 0; s < insertion_coords_file_names.size(); ++s)
589 {
590 FILE* insertion_coords_file = fopen(insertion_coords_file_names[s].c_str(),"r");
591 if(!insertion_coords_file)
592 {
593 fprintf(stderr, "Warning: cannot open %s for reading\n",
594 insertion_coords_file_names[s].c_str());
595 continue;
596 }
597 insertion_coords_files.push_back(insertion_coords_file);
598 }
599 if(optind >= argc)
600 {
601 print_usage();
602 return 1;
603 }
604
605
606 /*
607 * Read in the deletion co-ordinates
608 */
609 string deletion_coords_file_list = argv[optind++];
610 vector<string> deletion_coords_file_names;
611 vector<FILE*> deletion_coords_files;
612 tokenize(deletion_coords_file_list, ",", deletion_coords_file_names);
613 for(size_t s = 0; s < deletion_coords_file_names.size(); ++s)
614 {
615 FILE* deletion_coords_file = fopen(deletion_coords_file_names[s].c_str(),"r");
616 if(!deletion_coords_file)
617 {
618 fprintf(stderr, "Warning: cannot open %s for reading\n",
619 deletion_coords_file_names[s].c_str());
620 continue;
621 }
622 deletion_coords_files.push_back(deletion_coords_file);
623 }
624 if(optind >= argc)
625 {
626 print_usage();
627 return 1;
628 }
629
630
631 /*
632 */
633 string fusion_coords_file_list = argv[optind++];
634 vector<string> fusion_coords_file_names;
635 vector<FILE*> fusion_coords_files;
636 tokenize(fusion_coords_file_list, ",", fusion_coords_file_names);
637 for(size_t s = 0; s < fusion_coords_file_names.size(); ++s)
638 {
639 FILE* fusion_coords_file = fopen(fusion_coords_file_names[s].c_str(),"r");
640 if(!fusion_coords_file)
641 {
642 fprintf(stderr, "Warning: cannot open %s for reading\n",
643 fusion_coords_file_names[s].c_str());
644 continue;
645 }
646 fusion_coords_files.push_back(fusion_coords_file);
647 }
648 if(optind >= argc)
649 {
650 print_usage();
651 return 1;
652 }
653
654
655 string ref_file_name = argv[optind++];
656 ifstream ref_stream(ref_file_name.c_str());
657
658 if (!ref_stream.good())
659 {
660 fprintf(stderr, "Error: cannot open %s for reading\n",
661 ref_file_name.c_str());
662 exit(1);
663 }
664
665 driver(coords_files, insertion_coords_files, deletion_coords_files, fusion_coords_files, ref_stream);
666 return 0;
667 }