ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/mask_sam.cpp
Revision: 29
Committed: Tue Aug 2 21:24:54 2011 UTC (13 years, 1 month ago) by gpertea
File size: 3598 byte(s)
Log Message:
adding tophat source work

Line File contents
1 /*
2 * mask_sam.cpp
3 * TopHat
4 *
5 * Created by Cole Trapnell on 9/10/09.
6 * Copyright 2009 Cole Trapnell. All rights reserved.
7 *
8 */
9
10 #include <cstdlib>
11 #include <cstdio>
12 #include <getopt.h>
13
14 #include "common.h"
15 #include "bwt_map.h"
16
17 void print_usage()
18 {
19 fprintf(stderr, "Usage: mask_sam hits.sam repeatmask.out\n");
20 }
21
22 struct Repeat
23 {
24 Repeat(uint32_t i, int l, int r) : ref_id(i), left(l), right(r) {}
25 uint32_t ref_id;
26 int left;
27 int right;
28
29 bool operator<(const Repeat& rhs) const
30 {
31 if (ref_id != rhs.ref_id)
32 return ref_id < rhs.ref_id;
33 if (left != rhs.left)
34 return left < rhs.left;
35 if (right != rhs.right)
36 return right < rhs.right;
37 return false;
38 }
39 };
40
41 void driver(FILE* map_file, FILE* repeat_file)
42 {
43 RefSequenceTable rt(true, true);
44 ReadTable it;
45
46 SAMHitFactory factory(it, rt);
47
48 vector<Repeat > repeats;
49
50 char buf[1024 * 16];
51 while(!feof(repeat_file) &&
52 fgets(buf, sizeof(buf), repeat_file))
53 {
54 char chrom[1024];
55
56 int left;
57 int right;
58 int score;
59 float t1,t2,t3;
60 int ret = sscanf(buf, "%d %f %f %f %s %d %d", &score,&t1,&t2,&t3,chrom,&left,&right);
61 if (ret < 7)
62 continue;
63
64 int id = rt.get_id(chrom, NULL, 0);
65 repeats.push_back(Repeat(id, left, right));
66 }
67
68
69 sort(repeats.begin(), repeats.end());
70
71 vector<Repeat>::iterator curr_repeat = repeats.begin();
72
73 static int buf_size = 4096;
74 char bwt_buf[buf_size];
75
76 uint32_t last_ref_id_seen = 0;
77 int last_pos_seen = 0;
78
79 while (fgets(bwt_buf, 4096, map_file))
80 {
81 // Chomp the newline
82 char* nl = strrchr(bwt_buf, '\n');
83 if (nl) *nl = 0;
84
85 if (*bwt_buf == 0)
86 continue;
87
88 string clean_buf = bwt_buf;
89 // Get a new record from the tab-delimited Bowtie map
90 BowtieHit bh;
91
92 if (factory.get_hit_from_buf(bwt_buf, bh, true))
93 {
94
95 if (bh.left() < last_pos_seen)
96 {
97 fprintf(stderr, "Error: this SAM file doesn't appear to be correctly sorted!\n");
98 fprintf(stderr, "\tcurrent hit is at %s:%d, last one was at %s:%d\n",
99 rt.get_name(bh.ref_id()),
100 bh.left(),
101 rt.get_name(last_ref_id_seen),
102 last_pos_seen);
103
104 exit(1);
105 }
106
107 while (curr_repeat != repeats.end() &&
108 (curr_repeat->ref_id < bh.ref_id() ||
109 curr_repeat->ref_id == bh.ref_id() && curr_repeat->right < bh.left()))
110 {
111 curr_repeat++;
112 }
113
114 vector<Repeat>::iterator next_repeat = curr_repeat;
115
116 bool found_mask = false;
117 while (next_repeat != repeats.end() &&
118 next_repeat->left < bh.right())
119 {
120 if (next_repeat->ref_id == bh.ref_id() &&
121 next_repeat->left <= bh.left() && next_repeat->right >= bh.right())
122 {
123 found_mask = true;
124 break;
125 }
126 next_repeat++;
127 }
128 if (!found_mask)
129 {
130 printf("%s\n", clean_buf.c_str());
131 }
132 }
133 }
134 }
135
136
137 int main(int argc, char** argv)
138 {
139 int parse_ret = parse_options(argc, argv, print_usage);
140 if (parse_ret)
141 return parse_ret;
142
143 if(optind >= argc)
144 {
145 print_usage();
146 return 1;
147 }
148
149 string map_file_name = argv[optind++];
150
151 if(optind >= argc)
152 {
153 print_usage();
154 return 1;
155 }
156
157 string repeat_file_name = argv[optind++];
158
159
160 FILE* map_file = fopen(map_file_name.c_str(), "r");
161 if (map_file == NULL)
162 {
163 fprintf(stderr, "Error: cannot open SAM map file %s for reading\n",
164 map_file_name.c_str());
165 exit(1);
166 }
167
168 FILE* repeat_file = fopen(repeat_file_name.c_str(), "r");
169 if (repeat_file == NULL)
170 {
171 fprintf(stderr, "Error: cannot open SAM map file %s for reading\n",
172 repeat_file_name.c_str());
173 exit(1);
174 }
175
176 driver(map_file, repeat_file);
177
178 return 0;
179 }