ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/library_stats.cpp
Revision: 29
Committed: Tue Aug 2 21:24:54 2011 UTC (13 years, 1 month ago) by gpertea
File size: 7116 byte(s)
Log Message:
adding tophat source work

Line File contents
1 /*
2 * library_stats.cpp
3 * TopHat
4 *
5 * Created by Cole Trapnell on 11/17/08.
6 * Copyright 2008 Cole Trapnell. All rights reserved.
7 *
8 */
9
10 #ifdef HAVE_CONFIG_H
11 #include <config.h>
12 #endif
13
14 #include <cassert>
15 #include <cstdio>
16 #include <vector>
17 #include <string>
18 #include <map>
19 #include <algorithm>
20 #include <set>
21 #include <stdexcept>
22 #include <iostream>
23 #include <fstream>
24 #include <getopt.h>
25
26 #include "common.h"
27 #include "bwt_map.h"
28 #include "inserts.h"
29 #include "tokenize.h"
30
31 using namespace std;
32
33 struct LibraryStats
34 {
35 LibraryStats() :
36 m1_aligned_reads(0),
37 m2_aligned_reads(0),
38 m1_singletons(0),
39 m2_singletons(0),
40 spliced_reads(0),
41 too_far_inserts(0),
42 too_close_inserts(0),
43 same_strand_inserts(0),
44 happy_inserts(0) {}
45
46 // A name for this statistics object
47 string ref_region;
48
49 // Read-level alignment counters
50 uint32_t m1_aligned_reads;
51 uint32_t m2_aligned_reads;
52 uint32_t m1_singletons;
53 uint32_t m2_singletons;
54 uint32_t spliced_reads;
55
56 // Insert category counts
57 uint32_t too_far_inserts;
58 uint32_t too_close_inserts;
59 uint32_t same_strand_inserts;
60 uint32_t happy_inserts;
61
62
63
64 // Roll the stats for rhs into this object
65 LibraryStats& operator+=(const LibraryStats& rhs)
66 {
67 m1_aligned_reads += rhs.m1_aligned_reads;
68 m2_aligned_reads += rhs.m2_aligned_reads;
69 m1_singletons += rhs.m1_singletons;
70 m2_singletons += rhs.m2_singletons;
71
72 too_far_inserts += rhs.too_far_inserts;
73 too_close_inserts += rhs.too_close_inserts;
74 same_strand_inserts += rhs.same_strand_inserts;
75 happy_inserts += rhs.happy_inserts;
76
77 spliced_reads += rhs.spliced_reads;
78
79 return *this;
80 }
81
82 uint32_t contiguously_aligned_reads() const
83 {
84 return m2_aligned_reads + m1_aligned_reads - spliced_reads;
85 }
86 };
87
88
89 ostream& operator<<(ostream& os, const LibraryStats& L)
90 {
91 os << L.ref_region << ":" << endl;
92 os << "\t" << "Aligned reads:\t" << L.m2_aligned_reads + L.m1_aligned_reads << endl;
93 os << "\t" << "Singletons:\t" << L.m2_singletons + L.m1_singletons << endl;
94 os << "\t" << "Happy reads:\t" << 2 * L.happy_inserts << endl;
95 os << "\t" << "Unhappy inserts, too far:\t" << 2 * L.too_far_inserts << endl;
96 os << "\t" << "Unhappy inserts, too close:\t" << 2 * L.too_close_inserts << endl;
97 os << "\t" << "Unhappy inserts, same strand:\t" << 2 * L.same_strand_inserts << endl;
98 os << "\t" << "Spliced read alignments:\t" << L.spliced_reads << endl;
99 os << "\t" << "Contiguous read alignments:\t" << L.contiguously_aligned_reads() << endl;
100
101 return os;
102 }
103
104
105 // Print a simple listing of inner distances on the best insert mappings.
106 // Useful if you want to build a histogram of insert sizes, or determine
107 // the mean and std dev of your library insert size empirically.
108 //void print_mate_distances(const string& dist_file_name,
109 // const vector<InsertStatus>& best_status_for_inserts)
110 //{
111 // ofstream dist_file(dist_file_name.c_str());
112 // for (size_t i = 0; i < best_status_for_inserts.size(); ++i)
113 // {
114 // if (best_status_for_inserts[i].mask < SINGLETON)
115 // {
116 // dist_file << best_status_for_inserts[i].inner_dist << endl;
117 // }
118 // }
119 //}
120
121 // Compute some statistics about the mapping for this library, and return them
122 // in a LibraryStats object.
123
124
125
126 LibraryStats compute_stats(const BestInsertAlignmentTable best_status_for_inserts)
127 {
128 LibraryStats best_pairing_stats;
129 //best_pairing_stats.ref_region = "Best alignment pairings";
130
131 // Analyze the table of best pairings for each insert and report the same
132 // statistics
133 for (size_t i = 0; i < best_status_for_inserts.size(); ++i)
134 {
135 const pair<InsertAlignmentGrade, vector<InsertAlignment> >& st = best_status_for_inserts[i];
136
137 for (size_t j = 0; j < st.second.size(); ++j)
138 {
139
140 InsertAlignment a = st.second[j];
141 if (a.left_alignment && a.right_alignment)
142 {
143 pair<int, int> distances = pair_distances(*(a.left_alignment),
144 *(a.right_alignment));
145 int inner_dist = distances.second;
146 printf("%d\n", inner_dist);
147 }
148 }
149 }
150 return best_pairing_stats;
151 }
152
153 void load_hits(const vector<string>& filenames,
154 RefSequenceTable& rt,
155 ReadTable& it,
156 HitTable& hits)
157 {
158 for (size_t i = 0; i < filenames.size(); ++i)
159 {
160 const string& filename = filenames[i];
161 HitFactory* hit_factory = NULL;
162
163 if (filename.rfind(".sam") != string::npos)
164 {
165 hit_factory = new SAMHitFactory(it,rt);
166 }
167 else
168 {
169 hit_factory = new BowtieHitFactory(it,rt);
170 }
171
172
173 FILE* map = fopen(filename.c_str(), "r");
174
175 if (map == NULL)
176 {
177 fprintf(stderr, "Error: could not open %s\n", filename.c_str());
178 exit(1);
179 }
180 fprintf(stderr, "Loading hits from %s\n", filename.c_str());
181 size_t num_hits_before_load = hits.total_hits();
182 get_mapped_reads(map, hits, *hit_factory, true);
183 fprintf(stderr, "Loaded %d hits from %s\n",
184 (int)hits.total_hits() - (int)num_hits_before_load,
185 filename.c_str());
186 delete hit_factory;
187 }
188 }
189
190 void load_hits(RefSequenceTable& rt,
191 ReadTable& it,
192 const string& left_read_maplist,
193 HitTable& left_hits,
194 const string& right_read_maplist,
195 HitTable& right_hits)
196 {
197 vector<string> left_filenames;
198 tokenize(left_read_maplist, ",", left_filenames);
199 load_hits(left_filenames, rt, it, left_hits);
200 vector<string> right_filenames;
201 tokenize(right_read_maplist, ",", right_filenames);
202 load_hits(right_filenames, rt, it, right_hits);
203 }
204
205 void driver(const string& left_maps,
206 const string& right_maps)
207 {
208 // Load the set of left maps, and if provided, the set of right maps
209 ReadTable it;
210 RefSequenceTable rt(true);
211
212 //HitFactory hit_factory(it,rt);
213
214 HitTable left_hits;
215 HitTable right_hits;
216
217 load_hits(rt, it, left_maps, left_hits, right_maps, right_hits);
218
219 BestInsertAlignmentTable best_pairings(it.size() + 1);
220
221 insert_best_pairings(rt, it, left_hits, right_hits, best_pairings, true);
222
223 // Print a simple listing of inner distances on the best insert mappings.
224 // Useful if you want to build a histogram of insert sizes, or determine
225 // the mean and std dev of your library insert size empirically.
226 //print_mate_distances("mate_dist.txt", insert_best_pairings);
227
228 // Compute and report some statistics about the mapping for this library.
229 LibraryStats best_pairing_stats = compute_stats(best_pairings);
230 //cout << best_pairing_stats;
231 }
232
233
234 void print_usage()
235 {
236 fprintf(stderr, "Usage: library_stats <left_map1,...,left_mapN> <right_map1,...,right_mapN>\n");
237
238 // fprintf(stderr, "Usage: tophat_reports <coverage.wig> <junctions.bed> <accepted_hits.sam> <map1.bwtout> [splice_map1.sbwtout]\n");
239 }
240
241 int main(int argc, char** argv)
242 {
243 int parse_ret = parse_options(argc, argv, print_usage);
244 if (parse_ret)
245 return parse_ret;
246
247 if(optind >= argc)
248 {
249 print_usage();
250 return 1;
251 }
252
253 string map1_file_name = argv[optind++];
254
255 if(optind >= argc)
256 {
257 print_usage();
258 return 1;
259 }
260
261 string map2_file_name = argv[optind++];
262
263 driver(map1_file_name, map2_file_name);
264
265 return 0;
266 }