// ============================================================================= // CD-HIT // http://bioinformatics.burnham-inst.org/cd-hi // // program written by // Weizhong Li // UCSD, San Diego Supercomputer Center // La Jolla, CA, 92093 // Email liwz@sdsc.edu // at // Adam Godzik's lab // The Burnham Institute // La Jolla, CA, 92037 // Email adam@burnham-inst.org // ============================================================================= #include "cd-hi.h" #include "cd-hi-init.h" //////////////////////////////////// MAIN ///////////////////////////////////// int main(int argc, char **argv) { int i, j, k, i1, j1, k1, i0, j0, k0, sggi, sgj; int si, sj, sk; char db_in[MAX_FILE_NAME]; char db_out[MAX_FILE_NAME]; char db_clstr[MAX_FILE_NAME]; char db_clstr_bak[MAX_FILE_NAME]; char db_clstr_old[MAX_FILE_NAME]; // *********************************** parse command line and open file if (argc < 5) print_usage(argv[0]); for (i=1; i 1.0) || (NR_clstr < 0.4)) bomb_error("invalid clstr"); NR_clstr100 = (int) (NR_clstr * 100 ); } else if (strcmp(argv[i], "-L") == 0) { NR_cov = atof(argv[++i]); if ((NR_cov > 1.0) || (NR_cov < 0.0)) bomb_error("invalid coverage cutoff"); } else if (strcmp(argv[i], "-b") == 0) { BAND_width = atoi(argv[++i]); if (BAND_width < 0 ) bomb_error("invalid band width"); } else if (strcmp(argv[i], "-n") == 0) { NAA = atoi(argv[++i]); if ( NAA < 2 || NAA > 5 ) bomb_error("invalid word length"); } else if (strcmp(argv[i], "-d") == 0) { des_len = atoi(argv[++i]); if ( des_len < 15 ) bomb_error("too short description, not enough to identify sequences"); } else if (strcmp(argv[i], "-t") == 0) { tolerance = atoi(argv[++i]); if ( tolerance < 0 || tolerance > 5 ) bomb_error("invalid tolerance"); } else print_usage(argv[0]); } db_clstr[0]=0; strcat(db_clstr,db_out); strcat(db_clstr,".clstr"); db_clstr_bak[0]=0; strcat(db_clstr_bak,db_out); strcat(db_clstr_bak,".bak.clstr"); if ( NAA == 2 ) { NAAN = NAA2; } else if ( NAA == 3 ) { NAAN = NAA3; } else if ( NAA == 4 ) { NAAN = NAA4; } else if ( NAA == 5 ) { NAAN = NAA5; } else bomb_error("invalid -n parameter!"); word_table.init(NAA, NAAN); if ( tolerance ) { int clstr_idx = (int) (NR_clstr * 100) - naa_stat_start_percent; int tcutoff = naa_stat[tolerance-1][clstr_idx][5-NAA]; if (tcutoff < 5 ) bomb_error("Too short word length, increase it or the tolerance"); for ( i=5; i>NAA; i--) { if ( naa_stat[tolerance-1][clstr_idx][5-i] > 10 ) { cout << "Your word length is " << NAA << ", using " << i << " may be faster!" < 0.85 && NAA < 5) cout << "Your word length is " << NAA << ", using 5 may be faster!" < 0.80 && NAA < 4 ) cout << "Your word length is " << NAA << ", using 4 may be faster!" < 0.75 && NAA < 3 ) cout << "Your word length is " << NAA << ", using 3 may be faster!" < 0 ? db_read_in2(in1, length_of_throw, NR_no, NR_seq, NR_len, NRo_no, NRo_idx, NRo_id1, NRo_id2, NRo_NR_idx): db_read_in(in1, length_of_throw, NR_no, NR_seq, NR_len); in1.close(); cout << "total seq: " << NR_no << endl; // ********************************************* init NR_flag for (i=0; i aa2_cutoff ? d2 : aa2_cutoff; aan_cutoff = dn > aan_cutoff ? dn : aan_cutoff; } NR90_no = 0; for (sggi=0; sggi1) cout << "SEG " << sggi << " " << SEG_b[sggi] << " " << SEG_e[sggi] <=0; sgj--) { cout << "Reading swap" << endl; if ( sgj != sggi-1) word_table.read_tbl(db_swap[sgj]); // reading old segment cout << "Comparing with SEG " << sgj << endl; for (i1=SEG_b[sggi]; i1<=SEG_e[sggi]; i1++) { i = NR_idx[i1]; if (NR_flag[i] & IS_REDUNDANT ) continue; if ( (NR_flag[i] & IS_OLD_REDUNDANT) && (NR_flag[ NR_clstr_no[i] ] & IS_REP) ) { NR_clstr_no[i] = - (NR_clstr_no[ NR_clstr_no[i] ]) - 1; NR_flag[i] |= IS_REDUNDANT ; delete [] NR_seq[i]; continue; } len = NR_len[i]; seqi = NR_seq[i]; has_aa2 = 0; int flag = check_this(len, seqi, has_aa2, NAA, aan_no, aan_list, aan_list_no, look_and_count, hit_no, SEG90_b[sgj], SEG90_e[sgj], iden_no, aa1_cutoff, aa2_cutoff, aan_cutoff, NR_flag[i], NR_flag); if ( flag == 1) { // if similar to old one delete it delete [] NR_seq[i]; NR_clstr_no[i] = -hit_no-1; // (-hit_no-1) for non representatives NR_iden[i] = iden_no * 100 / len; NR_flag[i] |= IS_REDUNDANT ; } } //for (i1=SEG_b[sggi]; i1<=SEG_e[sggi]; i1++) } // for (sgj=0; sgj1) cout << "Refresh Memory" << endl; word_table.clean(); if (SEG_no >1) cout << "Self comparing" << endl; segb = NR90_no; for (i1=SEG_b[sggi]; i1<=SEG_e[sggi]; i1++) { i = NR_idx[i1]; if ( ! (NR_flag[i] & IS_REDUNDANT) ) { if ( (NR_flag[i] & IS_OLD_REDUNDANT) && (NR_flag[ NR_clstr_no[i] ] & IS_REP) ) { NR_clstr_no[i] = - (NR_clstr_no[ NR_clstr_no[i] ]) - 1; NR_flag[i] |= IS_REDUNDANT ; delete [] NR_seq[i]; } else { len = NR_len[i]; seqi = NR_seq[i]; has_aa2 = 0; int flag = check_this(len, seqi, has_aa2, NAA, aan_no, aan_list, aan_list_no, look_and_count, hit_no, segb, NR90_no-1, iden_no, aa1_cutoff, aa2_cutoff, aan_cutoff, NR_flag[i], NR_flag); if ( flag == 1) { // if similar to old one delete it delete [] NR_seq[i]; NR_clstr_no[i] = -hit_no-1; // (-hit_no-1) for non representatives NR_iden[i] = iden_no * 100 / len; NR_flag[i] |= IS_REDUNDANT ; } else { // else add to NR90 db NR90_idx[NR90_no] = i; NR_clstr_no[i] = NR90_no; // positive value for representatives NR_iden[i] = 0; NR_flag[i] |= IS_REP; word_table.add_word_list(aan_no, aan_list, aan_list_no, NR90_no); NR90_no++; } // else } // else } // if ( ! (NR_flag[i] & IS_REDUNDANT) ) if ( (i1+1) % 100 == 0 ) { cerr << "."; if ( (i1+1) % 1000 == 0 ) cout << i1+1 << " finished\t" << NR90_no << " clusters" << endl; } } // for (i1=SEG_b[sggi]; i1<=SEG_e[sggi]; i1++) { SEG90_b[sggi] = segb; SEG90_e[sggi] = NR90_no-1; if ( sggi < SEG_no-2 ) word_table.write_tbl( db_swap[sggi] ); // if not last segment } // for (sggi=0; sggi0 ) out2_bak << " at " << int(NR_iden[i]) << "%" << endl; else out2_bak << " *" << endl; } out2_bak.close(); cout << "writing clustering information" << endl; // write clstr information // I mask following 3 lines, because it crash when clusters NR // I thought maybe there is not a big block memory now, so // move the new statement to the begining of program, but because I // don't know the NR90_no, I just use DB_no instead // int *Clstr_no, *(*Clstr_list); // if ((Clstr_no = new int[NR90_no]) == NULL) bomb_error("Memory"); // if ((Clstr_list = new int*[NR90_no]) == NULL) bomb_error("Memory"); for (i=0; iCluster " << i << endl; for (k=0; k0 ) out2 << " at " << int(NR_iden[j]) << "%" << endl; else out2 << " *" << endl; } } out2.close(); cout << "program completed !" << endl << endl; } // END int main ///////////////////////FUNCTION of common tools//////////////////////////// int check_this(int len, char *seqi, int &has_aa2, int NAA, int& aan_no, int *aan_list, INTs *aan_list_no, INTs *look_and_count, int &hit_no, int libb, int libe, int &iden_no, double aa1_cutoff, double aa2_cutoff, double aan_cutoff, char this_flag, char *NR_flag) { static int taap[MAX_UAA*MAX_UAA]; static INTs aap_list[MAX_SEQ]; static INTs aap_begin[MAX_UAA*MAX_UAA]; int i, j, k, i1, j1, k1, i0, j0, k0, c22, sk, mm; int required_aa1 = int (aa1_cutoff* (double) len); int required_aa2 = (aa1_cutoff > 0.95) ? len-2 +1-(len-required_aa1)*2 : int (aa2_cutoff* (double) len); int required_aan = (aa1_cutoff > 0.95) ? len-NAA+1-(len-required_aa1)*NAA : int (aan_cutoff* (double) len); // check_aan_list aan_no = len - NAA + 1; if ( NAA == 2) for (j=0; j=libb; j--) look_and_count[j]=0; word_table.count_word_no(aan_no, aan_list, aan_list_no, look_and_count); // contained_in_old_lib() int band_left, band_right, best_score, band_width1, best_sum, len2; int len1 = len - 1; char *seqj; int flag = 0; // compare to old lib for (j=libe; j>=libb; j--) { if ( look_and_count[j] < required_aan ) continue; if ( (this_flag & IS_OLD_REP ) && (NR_flag[NR90_idx[j]] & IS_OLD_REP) ) continue; len2 = NR_len[NR90_idx[j]]; seqj = NR_seq[NR90_idx[j]]; if ( has_aa2 == 0 ) { // calculate AAP array for (sk=0; sk