ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/cdbfasta/cdbyank.cpp
Revision: 15
Committed: Mon Jul 18 20:53:45 2011 UTC (13 years, 1 month ago) by gpertea
File size: 24131 byte(s)
Log Message:
sync with local src

Line File contents
1 #include "GBase.h"
2 #include "gcdb.h"
3 #include "GArgs.h"
4 #include "ctype.h"
5 #include <fcntl.h>
6 #include <string.h>
7
#if defined(ENABLE_COMPRESSION)
#  include "gcdbz.h"
#else
// Message passed to GError() when a compressed database is requested but
// this binary was built without ENABLE_COMPRESSION.
const char err_COMPRESSION[] =
    "Error: compression requested but not compiled in!\n";
#endif
13
// Version banner printed by the -v option; Windows builds carry a "w" suffix.
#ifdef __WIN32__
#  define VERSION_SUFFIX "w"
#else
#  define VERSION_SUFFIX ""
#endif
#define VERSION "cdbyank version 0.995" VERSION_SUFFIX
19
20
21
// Help text printed (via a "%s" argument, so the literal '%' is safe) when the
// command line is invalid. The synopsis lists every option accepted by the
// GArgs format string used in main(), except the hidden -E variant.
#define USAGE \
  "Usage:\n" \
  "cdbyank <index_file> [-d <fasta_file>] [-a <key>|-n|-l|-s]\n" \
  "[-o <outfile>] [-q <char>|-Q][-F] [-R] [-P] [-x] [-i] [-w] [-v]\n" \
  "[-z <dbfasta.cdbz>]\n\n" \
  "<index_file> is the index file created previously with cdbfasta\n" \
  "(usually having a \".cidx\" suffix)\n" \
  "-a <key> the sequence name (accession) for a fasta record to be\n" \
  "retrieved; if not given, a list of accessions is expected\n" \
  "at stdin\n" \
  "-d <fasta_file> is the fasta file to pull records from; \n" \
  "if not specified, cdbyank will look in the same directory\n" \
  "where <index_file> resides, for a file with the same name\n" \
  "but without the \".cidx\" suffix\n" \
  "-o the records found are written to file <outfile> instead of stdout\n" \
  "-x allows retrieval of multiple records per key, if the indexed \n" \
  "database had records with the same key (non-unique keys);\n" \
  "(without -x only one record for a given key is retrieved)\n" \
  "-i case insensitive query (expects the <index_file> to have been \n" \
  "created with cdbfasta -i option)\n" \
  "-Q output the query key surrounded by character '%' before the\n" \
  "corresponding record\n" \
  "-q same as -Q but use character <char> instead of '%'\n" \
  "-w enable warnings (sent to stderr) when a key is not found\n" \
  "-F pulls only the defline for each record (discard the sequence)\n" \
  "-P only displays the position(s) (file offset) within the \n" \
  "database file, for the requested record(s)\n" \
  "-R sequence range extraction: expects the input <key(s)> to have \n" \
  "the format: '<seq_name> <start> <end>'\n" \
  "and pulls only the specified sequence range\n" \
  "-z decompress the entire file <dbfasta.cdbz>\n" \
  "(assumes it was built using cdbfasta with '-z' option)\n" \
  "-v show version number and exit\n" \
  "\n" \
  "Index file statistics (no database file needed):\n" \
  "-n display the number of records indexed\n" \
  "-l list all keys stored in <index_file>\n" \
  "-s display indexing summary info\n\n"
59
60 /*
61 -E same as -R but assumes FASTA records have a fixed line length\n\
62 (faster extraction of distant ranges for long records)\n\
63 */
64
// Diagnostic format strings handed to GError().
#define ERR_READ          "cdbyank: error reading from file.\n"
#define ERR_READFMT       "cdbyank read error: incorrect file format.\n"
#define ERR_RANGEFMT      "Sequence range parsing error for key '%s'\n"
#define ERR_RANGE_INVALID "Invalid range (%d-%d) specified for sequence '%s' of length %d\n"

// Records up to this size (1MB) are loaded into memory in one piece;
// larger ones are streamed from the database file.
#define MAX_MEM_RECSIZE (1024*1024)

// Platforms without a text/binary distinction do not define O_BINARY.
#if !defined(O_BINARY)
#  define O_BINARY 0
#endif
74
75 char* idxfile;
76 int warnings;
77 bool is_compressed=false;
78 bool has_gseqs=false; //fasta records are genomic sequence with fixed line length
79 bool defline_only=false;
80 bool rec_pos_only=false;
81 bool use_range=false;
82 bool fixed_linelen=false;
83 bool caseInsensitive=false;
84 bool showQuery=false;
85 char delimQuery='%';
86 uint32 irec_size32=8; //default size of the index record for records with 32bit offsets
87
88 off_t lastfpos=-1; //to avoid pulling the same record twice in a row..
89
90 FILE* fout=NULL;
91 GCdbRead* cdb=NULL;
92 #ifdef ENABLE_COMPRESSION
93 GCdbz* cdbz=NULL;
94 #endif
95 int fdb=-1;
96 FILE* fz=NULL;
97
// Converts the NUL-terminated string `c` to lower case in place.
// A NULL pointer is ignored.
void inplace_Lower(char* c) {
  if (!c) return;
  for (char* p = c; *p != '\0'; ++p) {
    // tolower() is only defined for EOF and unsigned char values; a plain
    // (possibly negative) char must be converted first.
    *p = (char)tolower((unsigned char)*p);
  }
}
102
103 void buf_get(GCDBuffer* b, uint32& pos, char *buf, unsigned int len) {
104 int r;
105 while (len > 0) {
106 r = b->get(buf,len);
107 if (r == -1) GError(ERR_READ);
108 if (r == 0)
109 GError(ERR_READFMT);
110 pos += r;
111 buf += r;
112 len -= r;
113 }
114 }
115
116 void buf_getnum(GCDBuffer* b, uint32& pos, uint32 *num) {
117 char buf[4];
118 buf_get(b, pos, buf, 4);
119 uint32_unpack(buf,num);
120 }
121
122
123 int fetch_record(char* key, char* dbname, int many, int r_start=0, int r_end=0) {
124 //assumes fdb is open, cdb was created on the index file
125 if (caseInsensitive) inplace_Lower(key);
126 int r=cdb->find(key);
127 if (r==0 && warnings) {
128 GMessage("cdbyank: key \"%s\" not found in %s\n", key, idxfile);
129 return 0;
130 }
131 if (r==-1)
132 GError("cdbyank: error searching for key %s in %s\n", key, idxfile);
133 while (r>0) {
134 off_t pos = cdb->datapos(); //position of this key's record in the index file
135 unsigned int len=cdb->datalen(); // length of this key's record
136 char bbuf[64]; // data buffer -- should just accomodate fastarec_pos, fastarec_length
137 if (cdb->read(bbuf,len,pos) == -1)
138 GError("cdbyank: error at GCbd::read (%s)!\n", idxfile);
139
140 off_t fpos; //this will be the fastadb offset
141 uint32 reclen; //this will be the fasta record offset
142 //int16_t linelen=0; //for genomic sequences, length of FASTA line
143 //byte elen=0; //size of end-of-line delimiter
144 if (len>irec_size32) { //64 bit file offset was used
145 fpos=gcvt_offt(bbuf);
146 if (rec_pos_only) {
147 fprintf(fout, "%lld\n", (long long)fpos);
148 return 1;
149 }
150 reclen=gcvt_uint(&bbuf[offsetof(CIdxData, reclen)]);
151 /*
152 if (has_gseqs) {
153 linelen=gcvt_int16(&bbuf[offsetof(CIdxSeqData, linelen)]);
154 elen=bbuf[offsetof(CIdxSeqData, elen)];
155 }
156 */
157 }
158 else { //32bit offset used
159 fpos=gcvt_uint(bbuf);
160 if (rec_pos_only) {
161 fprintf(fout, "%lld\n", (long long)fpos);
162 return 1;
163 }
164 reclen=gcvt_uint(&bbuf[offsetof(CIdxData32, reclen)]);
165 /*
166 if (has_gseqs) {
167 linelen=gcvt_int16(&bbuf[offsetof(CIdxSeqData32, linelen)]);
168 elen=bbuf[offsetof(CIdxSeqData32, elen)];
169 }
170 */
171 }
172 //GMessage("reclen=%d\n", reclen);
173
174
175 if (fpos == lastfpos) {
176 if (many) r=cdb->findnext(key, strlen(key));
177 else r=0;
178 continue;
179 }
180 lastfpos=fpos;
181 if (showQuery)
182 fprintf(fout, "%c%s%c\t", delimQuery, key, delimQuery);
183 if (is_compressed) {
184 #ifdef ENABLE_COMPRESSION
185 //for now: ignore special retrievals, just print the whole record
186 cdbz->decompress(fout, reclen, fpos);
187 if (many) r=cdb->findnext(key, strlen(key));
188 else r=0;
189 #endif
190 continue;
191 }
192 lseek(fdb, fpos, SEEK_SET);
193 if (reclen<=MAX_MEM_RECSIZE) {
194 char* p;
195 GMALLOC(p,reclen+1);
196 //errno=0;
197 r=read(fdb, p, reclen);
198 if (r<=0)
199 GError("cdbyank: Error reading from database file [%s] for %s (returned %d, offset %d) !\n",
200 dbname, idxfile, r, fpos);
201 p[reclen]='\0';
202 //--- now we have the whole record, check if some special options were given:
203 if (defline_only) {
204 char* q=strchr(p,'\n');
205 if (q!=NULL) *q='\0';
206 //skip '>' char
207 fprintf(fout, "%s\n",p+1);
208 }
209 else
210 if (use_range && r_start>0) { //range case
211 if (r_end<=0) r_end=reclen;
212 //extract only a substring of the sequence
213 char* r=strchr(p,'\n');
214 if (r!=NULL) *r='\0'; //now p only has the defline
215 fprintf(fout, "%s\n", p); //output the defline
216 r++;
217 unsigned int recpos=r-p; //p[recpos] MUST be a nucleotide or aminoacid now!
218 int seqpos=0;
219 char linebuf[61];
220 int linelen=0;
221 while (recpos<reclen) {
222 if (isspace(p[recpos])) recpos++; //skip newlines, etc. in the fasta sequence
223 else {
224 seqpos++;
225 if (seqpos>=r_start && seqpos<=r_end) {
226 linebuf[linelen]=p[recpos];
227 linelen++;
228 if (linelen==60 || seqpos==r_end) {
229 linebuf[linelen]='\0';
230 linelen=0;
231 fprintf(fout, "%s\n", linebuf);
232 if (seqpos==r_end) break;
233 }
234 }
235 recpos++;
236 }
237 }//while
238 if (linelen>0) {
239 linebuf[linelen]='\0';
240 linelen=0;
241 fprintf(fout, "%s\n", linebuf);
242 }
243 }
244 else { //not range display
245 fprintf(fout, "%s\n",p);
246 }
247 GFREE(p);
248 } //small record
249 else { //large record, read it char by char and return it as output
250 char c='\0';
251 if (defline_only) {
252 reclen--;
253 read(fdb, &c, 1);
254 }
255 while (reclen-- && read(fdb, &c, 1)==1) {
256 fprintf(fout, "%c", c);
257 if (c=='\n') break;
258 }
259 //defline written
260 if (!defline_only) {
261 int seqpos=1;
262 if (use_range) {
263 while (reclen-- && read(fdb, &c, 1)==1 && seqpos<=r_end) {
264 if (isspace(c)) continue;
265 if (seqpos>=r_start) {
266 int written=seqpos-r_start;
267 if (written && written%60 == 0)
268 fprintf(fout,"\n");
269 fprintf(fout, "%c", c);
270 }
271 seqpos++;
272 }//while
273 } //range case
274 else { //no range, just copy all chars to output
275 while (reclen-- && read(fdb, &c, 1)==1) {
276 fprintf(fout, "%c", c);
277 }
278 }
279 fprintf(fout, "\n");
280 }
281 }
282 if (many) r=cdb->findnext(key, strlen(key));
283 else r=0;
284 }
285 return 1;
286 }
287
288 int read_dbinfo(int fd, char** fnameptr, cdbInfo& dbstat) {
289 //this is messy due to the need of compatibility with the
290 //old 32bit file-length
291 char* dbname=*fnameptr;
292 //read just the tag first: 4 bytes ID
293 lseek(fd, -cdbInfoSIZE, SEEK_END);
294 int r=read(fd, &dbstat, cdbInfoSIZE );
295 if (r!=cdbInfoSIZE) return 2;
296 //GMessage("Size of dbstat=%d\n", cdbInfoSIZE);
297 if (strncmp(dbstat.oldtag, "CIDX", 4)==0) {
298 //old dbstat structure -- convert it
299 dbstat.num_keys=gcvt_uint(&dbstat.oldnum[0]);
300 dbstat.num_records=gcvt_uint(&dbstat.oldnum[1]);
301 dbstat.dbsize=gcvt_uint(&dbstat.old_dbsize);
302 dbstat.idxflags = gcvt_uint(&dbstat.old_idxflags);
303 //position on the dbnamelen entry
304 dbstat.dbnamelen = gcvt_uint(&dbstat.old_dbnamelen);
305 //GMessage("dbnamelen=%d\n", dbstat.dbnamelen);
306 lseek(fd, -(off_t)(cdbInfoSIZE-4+dbstat.dbnamelen), SEEK_END);
307 }
308 else if (strncmp(dbstat.tag, "CDBX", 4)!=0) {
309 GMessage("Error: this doesn't appear to be a cdbfasta created file!\n");
310 return 1;
311 }
312 else { // new CDBX type:
313 dbstat.dbsize = gcvt_offt(&dbstat.dbsize);
314 dbstat.num_keys=gcvt_uint(&dbstat.num_keys);
315 dbstat.num_records=gcvt_uint(&dbstat.num_records);
316 dbstat.idxflags = gcvt_uint(&dbstat.idxflags);
317 //position on the dbnamelen entry
318 dbstat.dbnamelen = gcvt_uint(&dbstat.dbnamelen);
319 //GMessage("dbnamelen=%d\n", dbstat.dbnamelen);
320 lseek(fd, -(off_t)(cdbInfoSIZE+dbstat.dbnamelen), SEEK_END);
321 }
322
323 GMALLOC(dbname, dbstat.dbnamelen+1);
324 dbname[dbstat.dbnamelen]='\0';
325 r=read(fd, dbname, dbstat.dbnamelen);
326 *fnameptr=dbname;
327 if (r!=dbstat.dbnamelen)
328 return 2;
329 return 0;
330 }
331
// Parses the next whitespace-delimited integer from stream `f`.
//
//   f   - stream to read from
//   buf - scratch space that receives the token text; at least 16 bytes
//   key - key being processed (kept for the callers, not used here)
//   e   - in: the last character already read from `f` (or EOF);
//         out: the first character after the token (whitespace or EOF)
//
// Returns the parsed value, or 0 if the end of line / end of file is reached
// before a token starts, if the token is not a number, if it is longer than
// 15 characters or if it does not fit in an int.
int parse_int(FILE* f, char* buf, char* key, int& e) {
  const int max_token=15; //more than enough for any int
  (void)key;
  while (e!=EOF && isspace(e)) { //skip any spaces
    if (e=='\n') return 0;
    e=fgetc(f);
  }
  if (e==EOF) return 0;
  //now e is the first non-space
  int toklen=0;
  bool too_long=false;
  while (e!=EOF && !isspace(e)) {
    // keep consuming an over-long token, but never write past the scratch
    // space
    if (toklen<max_token) buf[toklen++]=(char)e;
    else too_long=true;
    e=fgetc(f);
  }
  buf[toklen]='\0';
  //now the file pointer should be on the first space after the parsed value
  if (too_long) return 0;
  long v=strtol(buf, NULL, 10);
  // reject values that do not survive the round trip through int
  if ((long)(int)v != v) return 0;
  return (int)v;
}
351
// Parses the next whitespace-delimited integer from the string at `f`.
//
//   f   - in: current position in the string; out: advanced to the first
//         character after the token (whitespace or the terminating '\0')
//   key - key being processed (kept for the callers, not used here)
//   e   - in: the character at `f`; out: the character at the new `f`
//
// Returns the parsed value, or 0 if a newline / the end of the string is
// reached before a token starts, if the token is not a number, if it is
// longer than 15 characters or if it does not fit in an int.
int parse_int(char*& f, char* key, int& e) {
  char buf[16];
  (void)key;
  // e comes from a plain char and may be negative; the ctype functions are
  // only defined for unsigned char values
  while (e!='\0' && isspace((unsigned char)e)) { //skip any spaces
    if (e=='\n') return 0;
    f++;
    e=*f;
  }
  if (e=='\0') return 0;
  //now e is the first non-space char
  size_t toklen=0;
  bool too_long=false;
  while (e!='\0' && !isspace((unsigned char)e)) {
    // keep consuming an over-long token, but never write past buf
    if (toklen<sizeof(buf)-1) buf[toklen++]=(char)e;
    else too_long=true;
    f++;
    e=*f;
  }
  buf[toklen]='\0';
  //now f and e should be on the first space after the parsed value (or '\0')
  if (too_long) return 0;
  long v=strtol(buf, NULL, 10);
  // reject values that do not survive the round trip through int
  if ((long)(int)v != v) return 0;
  return (int)v;
}
376
377 #ifdef ENABLE_COMPRESSION
378
379 GCdbz* openCdbz(char* p) {
380 //in case this was not done before:
381 //gcvt_uint=(endian_test())? &uint32_sun : &uint32_x86;
382 gcvt_endian_setup();
383 FILE* zf=fopen(p, "rb");
384 if (zf==NULL) {
385 GMessage("Error: cannot open compressed file '%s'!\n",p);
386 return NULL;
387 }
388 //check if the file is valid and read the length of the first record
389 //
390 char ztag[5];
391 ztag[4]='\0';
392 if (fread(ztag, 1, 4, zf)<4) {
393 GMessage("Error reading header of compressed file '%s'\n",p);
394 return NULL;
395 }
396 if (strcmp(ztag, "CDBZ")!=0) {
397 GMessage("Error: file '%s' doesn't appear to be a zlib compressed cdb?\n",p);
398 return NULL;
399 }
400 unsigned int zrecsize;
401 if (fread((void*) &zrecsize,1,4,zf)<4) {
402 GMessage("Error reading 1st compressed record size for file '%s'!\n",p);
403 return NULL;
404 }
405 zrecsize=gcvt_uint(&zrecsize);
406 return new GCdbz(zf, true, zrecsize);
407 }
408 #endif
409
410 int main(int argc, char **argv) {
411 char namebuf[1024];
412 int r_start, r_end;
413 char* p;
414 char* dbname=NULL;
415 int result=0;
416 int r=0;
417 cdbInfo dbstat;
418 dbstat.dbsize=0;
419 GArgs args(argc, argv, "a:d:o:z:q:nlsxwvFREiPQ");
420 int e=args.isError();
421 if (e>0)
422 GError("%s Invalid argument: %s\n", USAGE, argv[e]);
423 if (args.getOpt('v')!=NULL) {
424 printf("%s\n",VERSION);
425 return 0;
426 }
427 char* outfile=(char*)args.getOpt('o');
428 if (outfile!=NULL) {
429 if ((fout=fopen(outfile, "wb"))==NULL)
430 GError("Cannot create file '%s'!", outfile);
431 }
432 else fout=stdout;
433
434 if ((p=(char*)args.getOpt('z'))!=NULL) { //simply stream-decompress cdbz
435 #ifndef ENABLE_COMPRESSION
436 GError(err_COMPRESSION);
437 #else
438 GCdbz* cdbz=openCdbz(p);
439 if (cdbz==NULL)
440 GError("Error opening the cdbz file '%s'\n");
441 FILE* zf=cdbz->getZFile();
442 int numrecs=0;
443 int xcode;
444 while ((xcode=cdbz->decompress(fout))>0) numrecs++;
445 delete cdbz;
446 fclose(zf);
447 #endif
448 return 0;
449 }
450 int numfiles = args.startNonOpt();
451 if (numfiles==0)
452 GError("%s Error: the cdb index file must be provided !\n", USAGE);
453 idxfile=Gstrdup((char*)args.nextNonOpt()); //first fasta file given
454 char* idxfile_cidx=NULL;
455 GMALLOC(idxfile_cidx, strlen(idxfile)+6);
456 strcpy(idxfile_cidx,idxfile);
457 strcat(idxfile_cidx, ".cidx");
458 if (fileExists(idxfile_cidx)==2) {
459 GFREE(idxfile);
460 idxfile=idxfile_cidx;
461 }
462 else {
463 GFREE(idxfile_cidx);
464 }
465 char* key=(char*)args.getOpt('a');
466
467 defline_only=(args.getOpt('F')!=NULL);
468 rec_pos_only=(args.getOpt('P')!=NULL);
469 showQuery=(args.getOpt('Q')!=NULL);
470 const char* q;
471 if ((q=args.getOpt('q'))!=NULL) {
472 delimQuery=*q;
473 showQuery=true;
474 }
475 use_range=((args.getOpt('R')!=NULL) || (args.getOpt('E')!=NULL));
476 fixed_linelen=(args.getOpt('E')!=NULL);
477 caseInsensitive=(args.getOpt('i')!=NULL);
478 /*is_compressed=((args.getOpt('Z')!=NULL) ||
479 (strstr(idxfile,".cidxz")!=NULL));*/
480 int listQuery=(args.getOpt('l')!=NULL);
481 warnings=(args.getOpt('w')!=NULL);
482 int dataQuery=(!listQuery && args.getOpt('n')==NULL
483 && args.getOpt('l')==NULL &&args.getOpt('s')==NULL);
484 //exclude the possibility of index-only stats query
485 dbname=(char*)args.getOpt('d');
486 int fd;
487 cdb=new GCdbRead(idxfile);
488 fd=cdb->getfd();
489 char* info_dbname=NULL;
490 off_t db_size=0;
491 dbstat.dbsize=0;
492
493 r=read_dbinfo(fd, &info_dbname, dbstat);
494 lseek(fd, 0, SEEK_SET);
495 if (r==1) GError("This file does not seem to be a cdbfasta generated file.\n");
496 else if (r==2)
497 GError("Error reading info chunk!\n");
498 if (dbstat.idxflags & CDBMSK_OPT_GSEQ) {
499 has_gseqs=true;
500 irec_size32=12;
501 }
502 if (dataQuery) {
503 //--------------- DB QUERY MODE: (always read the cdb stored info!)
504 /*try to find the database file
505 rules: if given, only the -d given filename is used
506 otherwise:
507 1) the same directory with the given index file(stripping the suffix)
508 2) the dbstat filepath/name stored by cdbfasta
509 */
510
511 if (!rec_pos_only && dbname==NULL) { // no -d database given, find it
512 // 1) try to rip the suffix:
513 p = rstrchr(idxfile, '.');
514 if (p!=NULL) {
515 /*GError("%s\ncdbyank error: cannot use %s as an index file. When no -d is\n\
516 given, so the database file can be located in the same directory \n\
517 by removing the index file suffix (.cidx)\n", USAGE, idxfile);*/
518 int nlen=p-idxfile;
519 strncpy(namebuf, idxfile, nlen);
520 namebuf[nlen]='\0';
521 if (fileExists(namebuf))
522 dbname=namebuf;
523 }
524 // 2) try the stored dbstat name
525 if (dbname==NULL) {
526 if (fileExists(info_dbname)) dbname=info_dbname;
527 else GError("Cannot locate the database file for this index\n");
528 }
529 }
530 if (!rec_pos_only) {
531 if (!is_compressed) {
532 if (r==0 && (dbstat.idxflags & CDBMSK_OPT_COMPRESS))
533 is_compressed=true;
534 }
535 if (is_compressed) {
536 //try to open the dbname as a compressed file
537 #ifndef ENABLE_COMPRESSION
538 GError(err_COMPRESSION);
539 #endif
540 fz=fopen(dbname, "rb");
541 }
542 else fdb=open(dbname, O_RDONLY|O_BINARY);
543 if (fdb==-1 && fz==NULL)
544 GError("Error: cannot open database file %s\n",dbname);
545 if (is_compressed) {
546 fclose(fz);//just to start fresh here
547 if (use_range)
548 GError("Error: cannot use range extraction with compressed records, sorry.\n");
549 if (defline_only)
550 GError("Error: cannot use defline-only retrieval with compressed records (sorry).\n");
551 //determine size:
552 int ftmp = open(dbname, O_RDONLY|O_BINARY);
553 if (ftmp == -1) GError("Error reopening db '%s'?\n",dbname);
554 struct stat fdbstat;
555 fstat(ftmp, &fdbstat);
556 db_size=fdbstat.st_size;
557 close(ftmp);
558 //-------- reopen here
559 #ifdef ENABLE_COMPRESSION
560 cdbz=openCdbz(dbname);
561 if (cdbz==NULL)
562 GError("Error opening the cdbz file '%s'\n");
563 fz=cdbz->getZFile();
564 #endif
565 }
566 else {
567 struct stat fdbstat;
568 if (stat(dbname, &fdbstat)!=0) {
569 perror("stat()");
570 exit(1);
571 }
572 db_size=fdbstat.st_size;
573 }
574 //abort if the database size was read and it doesn't match the cdbfasta stored size
575 if (dbstat.dbsize>0 && dbstat.dbsize!=db_size)
576 GError("Error: invalid %d database size - (%lld vs %lld) please rerun cdbfasta for '%s'\n",
577 fdb, (long long)dbstat.dbsize, (long long)db_size, dbname);
578 }
579 int many=(args.getOpt('x')!=NULL);
580 int keypos=0;
581 if (key==NULL) { //key not given
582 GMALLOC(key, 2048);
583 //get the keys at stdin
584 if (use_range) {
585 //expects the key and its sequence range on a single line!
586 while ((e=fgetc(stdin)) != EOF) {
587 if (isspace(e)) { //word end, close it
588 key[keypos]='\0';
589 if (keypos==0) continue;
590 r_start=parse_int(stdin, &key[keypos+1], key, e);
591 if (r_start<=0) GError(ERR_RANGEFMT, key);
592 //if (e==EOF || e=='\n') GError(ERR_RANGEFMT, key);
593 r_end=0;
594 r_end=parse_int(stdin, &key[keypos+1], key, e);
595 //if (r_end<=0 || r_end<=r_start) GError(ERR_RANGEFMT, key);
596 fetch_record(key, dbname, many, r_start, r_end);
597 //if (rec_pos_only) break;
598 if (e==EOF) break;
599 keypos=0;
600 }
601 else { //extend the key string
602 key[keypos]=e;
603 keypos++;
604 }
605 } //while
606 } //range case
607 else { //no range, accept any space delimiter
608 while ((e=fgetc(stdin)) != EOF) {
609 if (isspace(e)) { //word end, close it
610 key[keypos]='\0';
611 fetch_record(key, dbname, many);
612 //if (rec_pos_only) break;
613 keypos=0;
614 }
615 else { //extend the key string
616 key[keypos]=e;
617 keypos++;
618 }
619 } //while
620 }
621 GFREE(key);
622 } //stdin case
623 else { //key given already on command line
624 //get only the first word of it:
625 size_t keylen=strlen(key);
626 p=key; while (!isspace(*p) && *p!='\0') p++;
627 if (*p!='\0') *p='\0';
628 if (use_range) {
629 //parse the range from the query string
630 if (keylen==strlen(p)) GError(ERR_RANGEFMT, key);
631 p++;e=*p;
632 r_start=parse_int(p, key, e);
633 if (r_start<=0) GError(ERR_RANGEFMT, key);
634 //if (e=='\0' || e=='\n') GError(ERR_RANGEFMT, key);
635 r_end=parse_int(p, key, e);
636 //if (r_end<=0 || r_end<=r_start) GError(ERR_RANGEFMT, key);
637 }
638 else {
639 r_start=0;
640 r_end=0;
641 }
642 if (fetch_record(key, dbname, many, r_start, r_end)==0)
643 result=1; //the only key given not found
644 }
645 //end data query:
646 if (!rec_pos_only) {
647 if (is_compressed) {
648 fclose(fz);
649 #ifdef ENABLE_COMPRESSION
650 delete cdbz;
651 #endif
652 }
653 else close(fdb);
654 }
655 if (fout!=NULL) fclose(fout);
656 }
657 //--------------- INDEX ONLY QUERY MODE:
658 else { //index query mode: just retrieve some statistics or key names
659 if (listQuery) { //request for list keys
660 uint32 eod;
661 uint32 pos=0;
662 uint32 klen;
663 uint32 dlen;
664 char* bufspace;
665 GMALLOC(bufspace, GCDBUFFER_INSIZE);
666 GCDBuffer* readbuf=new GCDBuffer((opfunc)&read,
667 fd, bufspace, GCDBUFFER_INSIZE);
668
669 buf_getnum(readbuf, pos, &eod);
670 GMALLOC(key, 1024); //!!! hopefully we don't have keys larger than that
671 while (pos < 2048)
672 buf_getnum(readbuf, pos, &dlen);
673 while (pos < eod) {
674 buf_getnum(readbuf, pos,&klen);
675 buf_getnum(readbuf, pos,&dlen);
676 //read key:
677 buf_get(readbuf, pos, key, klen);
678 key[klen]='\0';
679 printf("%s\n", key);
680 //read data (and ignore it)
681 //assume that data is always shorter than 1K (should be just 4 bytes)
682 buf_get(readbuf, pos, key, dlen);
683 }
684 GFREE(key);
685 GFREE(bufspace);
686 delete readbuf;
687 }
688 else { //dig up the info written at the end of the database file
689 if (args.getOpt('n')!=NULL) {
690 printf("%d\n",dbstat.num_records);
691 }
692 else {//must be -s
693 printf("-= Indexing information: =-\n");
694 printf("Number of records:%12d\n", dbstat.num_records);
695 printf("Number of keys :%12d\n", dbstat.num_keys);
696 if (dbstat.idxflags & CDBMSK_OPT_COMPRESS)
697 printf("Database records are compressed.\n");
698 if (dbstat.idxflags & CDBMSK_OPT_MULTI)
699 printf("Index was built with \"multi-key\" option enabled.\n");
700 if (dbstat.idxflags & CDBMSK_OPT_GSEQ)
701 printf("Line length information is stored for each record.\n");
702 if (dbstat.idxflags & CDBMSK_OPT_C)
703 printf("Index was built with \"shortcut keys\" only.\n");
704 else if (dbstat.idxflags & CDBMSK_OPT_CADD)
705 printf("The index was built with full keys and \"shortcut keys\".\n");
706 printf("Database file: %s\n", info_dbname);
707 printf("Database size: %lld bytes\n", (long long)dbstat.dbsize);
708 }
709 }
710 }
711 GFREE(info_dbname);
712 delete cdb;
713 close(fd);
714 GFREE(idxfile);
715 //getc(stdin);
716 return result;
717 }