ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/gcdb.h
Revision: 16
Committed: Mon Jul 18 20:56:02 2011 UTC (13 years, 1 month ago) by gpertea
File size: 13136 byte(s)
Log Message:
sync with local source

Line File contents
1 #ifndef __GCDB_H
2 #define __GCDB_H
3 #include "GBase.h"
4 #include <stddef.h>
5 #include <fcntl.h>
6
// Memory-mapping support.
// Windows builds get the constants and prototypes declared right here;
// every other platform takes them from the system header.
#ifdef __WIN32__
  // protection flags
  #define PROT_READ 1
  #define PROT_WRITE 2
  #define PROT_READWRITE 3
  // mapping flags
  #define MAP_SHARED 1
  #define MAP_PRIVATE 2
  // access() style mode bits
  #define F_OK 0
  #define R_OK 4
  #define W_OK 2
  #define RW_OK 6

  #ifndef MAP_FAILED
    #define MAP_FAILED ((void *) -1)
  #endif
  // only declared in this header; the definitions live elsewhere
  void *mmap(char *,size_t,int,int,int,off_t);
  int munmap(void *,size_t);
#else
  #include <sys/mman.h>
#endif
26
27 //=====================================================
28 //------------- buffer stuff -------------------
29 //=====================================================
// Buffer sizes (in bytes) offered for input and output buffers.
#define GCDBUFFER_INSIZE 8192
#define GCDBUFFER_OUTSIZE 8192

// I/O primitive stored in a GCDBuffer (its `op` member).
// NOTE(review): the (int, char*, size_t) shape looks like read(2)/write(2);
// confirm against the implementation file.
typedef int (*opfunc)(int, char*, size_t);

// File offsets use off_t (this used to be: unsigned long).
typedef off_t gcdb_seek_pos;

// Conversion function pointer types, one per value type produced:
typedef unsigned int (*uint_conv_func)(void*);  // unsigned int conversion
typedef off_t (*offt_conv_func)(void*);         // off_t (file offset) conversion
typedef int16_t (*int16_conv_func)(void*);      // int16 conversion

// Conversion functions --> to platform independent values.
// Only declared here; they are defined in another translation unit.
extern uint_conv_func gcvt_uint;
extern offt_conv_func gcvt_offt;
extern int16_conv_func gcvt_int16;

/* Prototypes kept for reference (not declared by this header):
 unsigned int uint32_sun(void* x86int);
 unsigned int uint32_x86(void* x86int);
 //for file offsets: off_t runtime conversions:
 off_t offt_sun(void* offt);
 off_t offt_x86(void* offt);
 int16_t int16_sun(void* i16);
 int16_t int16_x86(void* i16);
*/

// Called by the GCDBuffer(opfunc,...) constructor as its endianness check.
// NOTE(review): presumably this is what assigns the gcvt_* pointers above;
// verify in the implementation file.
void gcvt_endian_setup();
57
58 class GCDBuffer {
59 public:
60 char *x;
61 unsigned int p;
62 unsigned int n;
63 int fd;
64 opfunc op;
65 //methods:
66 GCDBuffer() {
67 x=NULL;
68 fd=0;
69 op=NULL;
70 n=0;
71 }
72 GCDBuffer(opfunc aop,int afd,char *buf,unsigned int len) {
73 //check endianness
74 gcvt_endian_setup();
75 init(aop, afd, buf, len);
76 }
77 void init(opfunc aop,int afd,char *buf,unsigned int len) {
78 x=buf;
79 fd=afd;
80 op=aop;
81 p=0;
82 n=len;
83 }
84 int flush();
85 int write_all(char* buf, unsigned int pt);
86 int put(char* buf,unsigned int len);
87 int putalign(char* buf,unsigned int len);
88 int putflush(char* buf,unsigned int len);
89 int puts(char *buf);
90 int putsalign(char *buf);
91 int putsflush(char *buf);
92 int oneRead(char* buf, unsigned int len);
93 int getthis(char* buf,unsigned int len);
94 int get(char* buf,unsigned int len);
95 int bget(char* buf,unsigned int len);
96 int feed();
97 char *peek();
98 void seek(unsigned int len);
99 int copy(GCDBuffer* bin);
100 };
101
102
103 //=====================================================
104 //------------- cdb utils -------------------
105 //=====================================================
// errno must come from <errno.h>: the C and POSIX standards allow it to be
// a macro (and it is commonly thread-local), so declaring it by hand as
// `extern int errno;` is undefined behaviour and can refer to the wrong
// object or fail to link.
#include <errno.h>

// Error codes used by the cdb routines; defined in another translation unit.
extern int error_intr;
extern int error_nomem;
extern int error_proto;
113 //additional data to be appended to the cdb file:
// Option bits; each constant occupies its own bit so they can be OR-ed.
// NOTE(review): presumably these are the values stored in
// cdbInfo::idxflags -- confirm against the code that writes the index.
#define CDBMSK_OPT_MULTI    0x00000001
#define CDBMSK_OPT_C        0x00000002
#define CDBMSK_OPT_CADD     0x00000004
#define CDBMSK_OPT_COMPRESS 0x00000008
#define CDBMSK_OPT_GSEQ     0x00000010
119 //creates a compressed version of the database
120 //uses plenty of unions for ensuring compatibility with
121 // the old 'CIDX' info structure
122
123 //trying to prevent [64bit] machines from aligning these records to 64bit -- otherwise sizeof() gets it wrong!
124 #pragma pack(4)
125 // eek, gcc 2.95.3 alpha-decosf version does not
126 // recognize this pragma directive
127
128
129 struct cdbInfo {
130 uint32 num_keys;
131 union {
132 uint32 num_records;
133 char oldtag[4]; // 'CIDX' for old tag style
134 };
135 // data file size -- used to be uint32, now it could be 64bit
136 union {
137 off_t dbsize;
138 uint32 oldnum[2]; //num_keys, num_records
139 };
140 union {
141 uint32 idxflags;
142 uint32 old_dbsize;
143 };
144 union {
145 int dbnamelen;
146 int old_idxflags;
147 };
148 // -- the actual db name precedes this fixed-size record
149 union {
150 char tag[4]; //'CDBX' for new files with LFS
151 uint32 old_dbnamelen;
152 };
153 };
154
155 // for passing around index data:
156 struct CIdxData32 {
157 uint32 fpos;
158 uint32 reclen;
159 };
160 /*
161 struct CIdxSeqData32 { //4+4+2+1 = 11 bytes
162 uint32 fpos;
163 uint32 reclen;
164 uint16_t linelen; //line length for FASTA-formatted seq
165 byte elen; //length of end-of-line delimiter: 1 (unix/mac) or 2 (Windows)
166 };
167 */
168 struct CIdxData {
169 off_t fpos; //64bit value on Linux
170 uint32 reclen;
171 };
172 /*
173 struct CIdxSeqData { //8+4+2+1 = 15 bytes
174 off_t fpos; //64bit value on Linux
175 uint32 reclen;
176 uint16_t linelen; //line length for FASTA-formatted seq
177 byte elen; //length of end-of-line delimiter: 1 (unix/mac) or 2 (Windows)
178 };
179 */
180 #pragma pack()
181
182 extern int cdbInfoSIZE;
183 extern int IdxDataSIZE;
184 extern int IdxDataSIZE32;
185 /*
186 extern int IdxSeqDataSIZE;
187 extern int IdxSeqDataSIZE32;
188 */
189
190 void uint32_pack(char *,uint32);
191 void uint32_pack_big(char *,uint32);
192 void uint32_unpack(char *,uint32 *);
193 void uint32_unpack_big(char *,uint32 *);
194
195 //=====================================================
196 //------------- cdb index -------------------
197 //=====================================================
198
199 #define CDB_HPLIST 1000
200
201 struct cdb_hp { uint32 h; uint32 p; } ;
202
203 struct cdb_hplist {
204 struct cdb_hp hp[CDB_HPLIST];
205 struct cdb_hplist *next;
206 int num;
207 };
208
209 //the index file should always be smaller than 4GB !
210
211 class GCdbWrite {
212 GCDBuffer* cdbuf;
213 char bspace[8192];
214 char fname[1024];
215 char final[2048];
216 uint32 count[256];
217 uint32 start[256];
218 struct cdb_hplist *head;
219 struct cdb_hp *split; /* includes space for hash */
220 struct cdb_hp *hash;
221 uint32 numentries;
222 uint32 pos; //file position
223 int posplus(uint32 len);
224 int fd; //file descriptor
225 public:
226 //methods:
227 GCdbWrite(int afd); //was: init
228 GCdbWrite(char* fname);
229 ~GCdbWrite();
230 int addbegin(unsigned int keylen,unsigned int datalen);
231 int addend(unsigned int keylen,unsigned int datalen,uint32 h);
232 int addrec(const char *key,unsigned int keylen,char *data,unsigned int datalen);
233 int add(const char *key, char *data, unsigned int datalen);
234 int getNumEntries() { return numentries; }
235 int finish();
236 int close();
237 int getfd() { return fd; }
238 char* getfile() { return fname; }
239 };
240
241
242 //=====================================================
243 //------------- cdb -------------------
244 //=====================================================
245
246 #define CDB_HASHSTART 5381
247
248 uint32 cdb_hashadd(uint32,unsigned char);
249 uint32 cdb_hash(const char *,unsigned int);
250
251 class GCdbRead {
252 uint32 size; // initialized if map is nonzero
253 uint32 loop; // number of hash slots searched under this key
254 uint32 khash; // initialized if loop is nonzero
255 uint32 kpos; // initialized if loop is nonzero
256 uint32 hpos; // initialized if loop is nonzero
257 uint32 hslots; // initialized if loop is nonzero
258 uint32 dpos; // initialized if cdb_findnext() returns 1
259 uint32 dlen; // initialized if cdb_findnext() returns 1
260 char fname[1024];
261 char *map; // 0 if no map is available
262 int fd;
263 public:
264 //methods:
265 GCdbRead(int fd); //was cdb_init
266 GCdbRead(char* afname); //was cdb_init
267 ~GCdbRead(); //was cdb_free
268 int read(char *,unsigned int,uint32);
269 int match(const char *key, unsigned int len, uint32 pos);
270 void findstart() { loop =0; }
271 int findnext(const char *key,unsigned int len);
272 int find(const char *key);
273 int datapos() { return dpos; }
274 int datalen() { return dlen; }
275 int getfd() { return fd; }
276 char* getfile() { return fname; }
277 };
278
279 class GReadBuf {
280 protected:
281 FILE* f;
282 uchar* buf;
283 int buflen;
284 int bufused; //
285 int bufpos;
286 off_t fpos;
287 bool eof;
288 bool eob;
289
290 int refill(bool repos=false) {
291 //refill the buffer-----------
292 if (repos && bufpos==0) return 0; //no need to repos
293 if (eof) return 0;
294 int fr=0;
295 if (repos && bufpos<bufused) {
296 int kept=bufused-bufpos;
297 memmove((void*)buf, (void*)(buf+bufpos),kept);
298 fr=(int)fread((void *)(buf+kept), 1, buflen-kept, f);
299 if (fr<buflen-kept) eof=true;
300 buf[kept+fr]='\0';
301 bufused=kept+fr;
302 }
303 else {
304 fr=(int)fread((void *)buf, 1, buflen, f);
305 if (fr<buflen) eof=true;
306 buf[fr]='\0'; //only for text record parsers
307 bufused=fr;
308 }
309 if (feof(f)) eof=true;
310 if (ferror(f)) {
311 GMessage("GReadBuf::refill - error at fread!\n");
312 eof=true;
313 }
314 bufpos=0;
315 fpos+=fr; //bytes read from file so far
316 return fr;
317 }
318 public:
319 GReadBuf(FILE* fin, int bsize=4096) {
320 f=fin;
321 buflen=bsize;
322 GMALLOC(buf,buflen+1);
323 bufpos=0; //current pointer for get function
324 bufused=0;
325 fpos=0;
326 eof=false;
327 eob=false;
328 refill();
329 }
330 ~GReadBuf() { GFREE(buf); }
331
332 //reads len chars from stream into the outbuf
333 //updates bufpos
334 //->returns the number of bytes read
335 int get(uchar *outbuf, int len) {
336 if (eob) return 0;
337 int rd=0; //bytes read
338 while (!eob && rd<len) {
339 int to_read=GMIN((bufused-bufpos),(len-rd));
340 memcpy((void*)(outbuf+rd),(void*)(buf+bufpos), to_read);
341 bufpos+=to_read;
342 rd+=to_read;
343 if (bufpos>=bufused) {
344 if (eof) eob=true;
345 else refill();
346 }
347 }//while
348 return rd;
349 }
350
351 uchar* getStr(uchar *outbuf, int len) {
352 int rd=get(outbuf,len);
353 if (rd==0) return NULL;
354 else {
355 outbuf[rd]='\0';
356 return outbuf;
357 }
358 }
359
360 // getc equivalent
361 int getch() {
362 if (eob) return -1;
363 int ch=(int)(uchar)buf[bufpos];
364 bufpos++;
365 if (bufpos>=bufused) {
366 if (eof) eob=true;
367 else refill();
368 }
369 return ch;
370 }
371
372 //---
373 bool isEof() { return eob; }
374 bool ended() { return eob; }
375 off_t getPos() {
376 //returns the virtual file position
377 // = the actual file offset of the byte at bufpos
378 return fpos-(bufused-bufpos);
379 }
380 //skip into the stream the specified number of bytes
381 int skip(int skiplen) {
382 if (eob) return 0;
383 int r=0; //the actual number of bytes skipped
384 while (skiplen && !eob) {
385 int dif=GMIN(bufused-bufpos,skiplen);
386 skiplen-=dif;
387 bufpos+=dif;
388 r+=dif;
389 if (bufpos>=bufused) {
390 if (eof) { eob=true; return r; }
391 refill();
392 }
393 }
394 return r;
395 }
396 //look ahead without updating the read pointer (bufpos)
397 //Cannot peek more than buflen!
398 int peek(uchar* outbuf, int len) {
399 if (eob) return -1;
400 //if (eob || len>buflen) return -1;
401 if (len>bufused-bufpos) refill(true);
402 int mlen=GMIN((bufused-bufpos),len);
403 memcpy((void*)outbuf, (void*)(buf+bufpos), mlen);
404 return mlen;
405 }
406 char peekChar() {
407 if (eob) return -1;
408 //if (eob || len>buflen) return -1;
409 if (1>bufused-bufpos) refill(true);
410 return *(buf+bufpos);
411 }
412 uchar* peekStr(uchar* outbuf, int len) {
413 int rd=peek(outbuf,len);
414 if (rd>0) { outbuf[rd]='\0'; return outbuf; }
415 else return NULL;
416 }
417 //looks ahead to check if what follows matches
418 int peekCmp(char* cmpstr, int cmplen=-1) {
419 if (cmplen==0) return 0;
420 if (eob) //GError("GReadBuf::peekcmp error: eob!\n");
421 return -2;
422 if (cmplen<0) cmplen=strlen(cmpstr);
423 if (cmplen>bufused-bufpos) {
424 refill(true);
425 if (cmplen>bufused-bufpos) return -2;
426 }
427 //use memcmp
428 return memcmp((void*)(buf+bufpos), cmpstr, cmplen);
429 }
430
431 };
432
433 //circular line buffer, with read-ahead (peeking) capability
434 class GReadBufLine {
435 protected:
436 struct BufLine {
437 off_t fpos;
438 int len;
439 char* chars;
440 };
441 int bufcap; //total number of lines in the buf array
442 int bufidx; // the "current line" index in buf array
443 bool isEOF;
444 int lno;
445 FILE* file;
446 off_t filepos; //current file/stream offset for the first char of buf[bufidx]
447 BufLine* buf; //array of bufferred lines
448 char* readline(int idx);//read line from file into the buffer
449 int fillbuf();
450 bool isEOB;
451 public:
452 const char* line(); //gets current line and advances the "current line" pointer
453 //use putLine() to revert/undo this advancement
454 off_t fpos(); //gets current line's byte offset in the file
455 // does NOT advance the "current line" pointer
456 int len(); //gets current line's length
457 // does NOT advance the "current line" pointer
458 bool isEof() { return isEOB; }
459 bool eof() { return isEOB; }
460 off_t getfpos() { return fpos(); }
461 const char* getline() { return line(); }
462 const char* getLine() { return line(); }
463 int getLen() { return len(); }
464 int linenumber() { return lno; }
465 int lineno() { return lno; }
466 int getLineNo() { return lno; }
467 void putLine();
468 GReadBufLine(FILE* stream, int bcap=20) {
469 if (bcap<2) bcap=2; //at least 1 prev line is needed for putLine()
470 bufcap=bcap;
471 bufidx=-1;
472 isEOB=false;
473 isEOF=false;
474 lno=0;
475 GMALLOC(buf, bufcap * sizeof(BufLine));
476 for (int i=0;i<bufcap;i++) {
477 buf[i].chars=NULL;
478 buf[i].fpos=-1;
479 buf[i].len=0;
480 }
481 file=stream;
482 fillbuf();
483 }
484 ~GReadBufLine() {
485 for (int i=0;i<bufcap;i++) {
486 GFREE(buf[i].chars);
487 }
488 GFREE(buf);
489 }
490 };
491
492 #endif