// ============================================================================= // CD-HI // // Cluster Database at High Identity // // CD-HI clusters protein sequence database at high sequence identity threshold. // This program can remove the high sequence redundance efficiently. // // program written by // Weizhong Li // UCSD, San Diego Supercomputer Center // La Jolla, CA, 92093 // Email liwz@sdsc.edu // // at // Adam Godzik's lab // The Burnham Institute // La Jolla, CA, 92037 // Email adam@burnham-inst.org // ============================================================================= #include "cd-hi.h" int DIAG_score[MAX_DIAG]; void bomb_error(char *message) { cerr << "\nFatal Error\n"; cerr << message << endl; cerr << "\nProgram halted !! \n\n"; exit (1); } // END void bomb_error //quick_sort calling (a, 0, no-1) int quick_sort (int *a, int lo0, int hi0 ) { int lo = lo0; int hi = hi0; int mid; int tmp; if ( hi0 > lo0) { mid = a[ ( lo0 + hi0 ) / 2 ]; while( lo <= hi ) { while( ( lo < hi0 ) && ( a[lo] < mid ) ) lo++; while( ( hi > lo0 ) && ( a[hi] > mid ) ) hi--; if( lo <= hi ) { tmp=a[lo]; a[lo]=a[hi]; a[hi]=tmp; lo++; hi--; } } // while if( lo0 < hi ) quick_sort(a, lo0, hi ); if( lo < hi0 ) quick_sort(a, lo, hi0 ); } // if ( hi0 > lo0) return 0; } // quick_sort //quick_sort_idx calling (a, idx, 0, no-1) //sort a with another array idx //so that idx rearranged int quick_sort_idx (int *a, int *idx, int lo0, int hi0 ) { int lo = lo0; int hi = hi0; int mid; int tmp; if ( hi0 > lo0) { mid = a[ ( lo0 + hi0 ) / 2 ]; while( lo <= hi ) { while( ( lo < hi0 ) && ( a[lo] < mid ) ) lo++; while( ( hi > lo0 ) && ( a[hi] > mid ) ) hi--; if( lo <= hi ) { tmp=a[lo]; a[lo]=a[hi]; a[hi]=tmp; tmp=idx[lo]; idx[lo]=idx[hi]; idx[hi]=tmp; lo++; hi--; } } // while if( lo0 < hi ) quick_sort_idx(a, idx, lo0, hi ); if( lo < hi0 ) quick_sort_idx(a, idx, lo, hi0 ); } // if ( hi0 > lo0) return 0; } // quick_sort_idx //quick_sort_idx calling (a, idx, 0, no-1) //sort a with another array idx //so that idx rearranged int quick_sort_idx2 (int *a, int *b, int *idx, int lo0, int hi0 ) { int lo = lo0; int hi = hi0; int mid; int tmp; if ( hi0 > lo0) { mid = a[ ( lo0 + hi0 ) / 2 ]; while( lo <= hi ) { while( ( lo < hi0 ) && ( a[lo] < mid ) ) lo++; while( ( hi > lo0 ) && ( a[hi] > mid ) ) hi--; if( lo <= hi ) { tmp=a[lo]; a[lo]=a[hi]; a[hi]=tmp; tmp=b[lo]; b[lo]=b[hi]; b[hi]=tmp; tmp=idx[lo]; idx[lo]=idx[hi]; idx[hi]=tmp; lo++; hi--; } } // while if( lo0 < hi ) quick_sort_idx2(a, b, idx, lo0, hi ); if( lo < hi0 ) quick_sort_idx2(a, b, idx, lo, hi0 ); } // if ( hi0 > lo0) return 0; } // quick_sort_idx2 //quick_sort_a_b_idx //sort a list by a first priority // and b second priority //another idx go with them int quick_sort_a_b_idx (int *a, int *b, int *idx, int lo0, int hi0 ) { //first sort list by a quick_sort_idx2(a, b, idx, lo0, hi0); //then sort segments where elements in a is same int i, j, k; int bb = lo0; for (i=bb+1; i<=hi0; i++) { if ( a[i] == a[i-1] ) { ; } else { if ( i-1 > bb ) quick_sort_idx(b, idx, bb, i-1); bb = i; } } // last segment if ( hi0 > bb ) quick_sort_idx(b, idx, bb, hi0); return 0; } // quick_sort_a_b_idx void format_seq(char *seq) { int i, j, k; char c1; int len = strlen(seq); for (i=0,j=0; i (1,1) //// 1 means direction (0,1) -> (1,2) //// -1 means direction (1,0) -> (2,1) int diag_test_aapn(char iseq2[], int len1, int len2, int *taap, INTs *aap_begin, INTs *aap_list, int &best_sum, int band_width, int &band_left, int &band_right, int required_aa1) { int i, i1, j, k, l, m, n; int *pp; int nall = len1+len2-1; static int diag_score[MAX_DIAG]; for (pp=diag_score, i=nall; i; i--, pp++) *pp=0; int bi, bj, c22; INTs *bip; int len11 = len1-1; int len22 = len2-1; i1 = len11; for (i=0; i= 0 ? required_aa1-1:0; // on dec 21 2001 int band_e = nall - required_aa1; int band_m = ( band_b+band_width-1 < band_e ) ? band_b+band_width-1 : band_e; int best_score=0; for (i=band_b; i<=band_m; i++) best_score += diag_score[i]; int from=band_b; int end =band_m; int score = best_score; for (k=from, j=band_m+1; j best_score ) { from = k; end = j; best_score = score; } } for (j=from; j<=end; j++) { // if aap pairs fail to open gap if ( diag_score[j] < 5 ) { best_score -= diag_score[j]; from++;} else break; } for (j=end; j>=from; j--) { // if aap pairs fail to open gap if ( diag_score[j] < 5 ) { best_score -= diag_score[j]; end--;} else break; } // delete [] diag_score; band_left = from-len1+1; band_right= end-len1+1; best_sum = best_score; return OK_FUNC; } // END diag_test_aapn ////local alignment of two sequence within a diag band ////for band 0 means direction (0,0) -> (1,1) //// 1 means direction (0,1) -> (1,2) //// -1 means direction (1,0) -> (2,1) ////iseq len are integer sequence and its length, ////mat is matrix, return ALN_PAIR class int local_band_align(char iseq1[], char iseq2[], int len1, int len2, AA_MATRIX &mat, int &best_score, int &iden_no, int band_left, int band_right) { int i, j, k, l, m, n, j1; int ii, jj, kk; int best_score1, iden_no1, best_i, best_j, best_j1; int *gap_array; iden_no = 0; if ( (band_right >= len2 ) || (band_left <= -len1) || (band_left > band_right) ) return FAILED_FUNC; // allocate mem for score_mat[len1][len2] etc int band_width = band_right - band_left + 1; int *(*score_mat), *(*iden_mat); if ((score_mat = new int * [len1]) == NULL) bomb_error("Memory"); if ((iden_mat = new int * [len1]) == NULL) bomb_error("Memory"); for (i=0; i best_score) best_score = score_mat[i][j1]; iden_mat[i][j1] = (iseq1[i] == iseq2[0]) ? 1 : 0; } } if (band_right >=0) { //set score to top border of the matrix within band int tband = (band_left > 0) ? band_left : 0; for (i=0,j=tband; j<=band_right; j++) { j1 = j-band_left; if ( ( score_mat[i][j1] = mat.matrix[iseq1[i]][iseq2[j]] ) > best_score) best_score = score_mat[i][j1]; iden_mat[i][j1] = (iseq1[i] == iseq2[j]) ? 1 : 0; } } for (i=1; i=len2) continue; int sij = mat.matrix[iseq1[i]][iseq2[j]]; int iden_ij = (iseq1[i] == iseq2[j] ) ? 1 : 0; int s1, *mat_row; int k0, k_idx; // from (i-1,j-1) if ( (best_score1 = score_mat[i-1][j1] )> 0 ) { iden_no1 = iden_mat[i-1][j1]; } else { best_score1 = 0; iden_no1 = 0; } // from last row mat_row = score_mat[i-1]; k0 = (-band_left+1-i > 0) ? -band_left+1-i : 0; for (k=j1-1, kk=0; k>=k0; k--, kk++) { if ( (s1 = mat_row[k] + gap_array[kk] ) > best_score1 ){ best_score1 = s1; iden_no1 = iden_mat[i-1][k]; } } k0 = (j-band_right-1 > 0) ? j-band_right-1 : 0; for(k=i-2, jj=j1+1,kk=0; k>=k0; k--,kk++,jj++) { if ( (s1 = score_mat[k][jj] + gap_array[kk] ) > best_score1 ){ best_score1 = s1; iden_no1 = iden_mat[k][jj]; } } best_score1 += sij; iden_no1 += iden_ij; score_mat[i][j1] = best_score1; iden_mat[i][j1] = iden_no1; if ( best_score1 > best_score ) { best_score = best_score1; iden_no = iden_no1; } } // END for (j=1; j idx_of_X, eg aa2idx['A' - 'A'] => 0, and aa2idx['M'-'A'] => 12 int BLOSUM62[] = { 4, // A -1, 5, // R -2, 0, 6, // N -2,-2, 1, 6, // D 0,-3,-3,-3, 9, // C -1, 1, 0, 0,-3, 5, // Q -1, 0, 0, 2,-4, 2, 5, // E 0,-2, 0,-1,-3,-2,-2, 6, // G -2, 0, 1,-1,-3, 0, 0,-2, 8, // H -1,-3,-3,-3,-1,-3,-3,-4,-3, 4, // I -1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4, // L -1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5, // K -1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5, // M -2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6, // F -1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7, // P 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4, // S 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5, // T -3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11, // W -2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7, // Y 0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4, // V -2,-1, 3, 4,-3, 0, 1,-1, 0,-3,-4, 0,-3,-3,-2, 0,-1,-4,-3,-3, 4, // B -1, 0, 0, 1,-3, 3, 4,-2, 0,-3,-3, 1,-1,-3,-1, 0,-1,-3,-2,-2, 1, 4, // Z 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,-1,-1,-1 // X //A R N D C Q E G H I L K M F P S T W Y V B Z X //0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 2 6 20 }; int outiseq(char iseq[], int len) { int i; char seq[MAX_SEQ]; for (i=0; i>" << seq << endl; return 0; } int setiseq(char *seq, int len) { for (int i=0; i 0 ) { delete [] seq_idx[i]; delete [] word_no[i]; } fswap.read( &size[i], sizeof(int)); capacity[i] = size[i]; if (size[i] == 0 ) continue; if ((seq_idx[i] = new int[size[i]]) == NULL) bomb_error("Memory"); if ((word_no[i] = new INTs[size[i]]) == NULL) bomb_error("Memory"); fswap.read(seq_idx[i], sizeof(int) * size[i]); fswap.read(word_no[i], sizeof(INTs) * size[i]); } fswap.close(); return OK_FUNC; } // END int IDX_TBL::read_tbl int IDX_TBL::write_tbl(char *filename) { int i, j, k; ofstream fswap(filename); if (! fswap) bomb_error("Can not open file"); for (i=0; i buffer_size ) { delete [] buffer; buffer_size = capacity[j]; if ((buffer = new int[buffer_size]) == NULL) bomb_error("Memory"); } for (k=0; k0 ) delete [] seq_idx[j]; if ((seq_idx[j] = new int[mem_size+capacity[j]]) == NULL) bomb_error("Memory"); for (k=0; k0 ) delete [] word_no[j]; if ((word_no[j] = new INTs[mem_size+capacity[j]]) == NULL) bomb_error("Memory"); for (k=0; k' && c0 == '\n') no++; c0 = c1; } in1.close(); return no; } int old_clstr_seq_no_test(ifstream &in1) { char c0, c1; int no = 0; c0 = '\n'; while(1) { if ( in1.eof()) break; in1.read(&c1, 1); if ( c1 != '>' && c0 == '\n') no++; c0 = c1; } in1.close(); return no; } int db_read_in (ifstream &in1, int length_of_throw, int & NR_no, char *NR_seq[], int *NR_len) { char raw_seq[MAX_SEQ], raw_des[MAX_DES], raw_seq1[MAX_SEQ]; char buffer1[MAX_LINE_SIZE]; raw_seq[0] = raw_des[0] = buffer1[0] = 0; int read_in = 0; NR_no = 0; while(1) { if ( in1.eof()) break; in1.getline(buffer1, MAX_LINE_SIZE-2, '\n'); if ( buffer1[0] == '>') { if ( read_in ) { // write previous record if ( strlen(raw_seq) >= MAX_SEQ-1) bomb_error("Long sequence found, enlarge Macro MAX_SEQ"); format_seq(raw_seq); if ( strlen(raw_seq) > length_of_throw ) { // if (strlen(raw_seq) > 32766) raw_seq[32766]=0; // temp if ( (NR_seq[NR_no] = new char[strlen(raw_seq)+2] ) == NULL ) bomb_error("memory"); strcpy( NR_seq[NR_no], raw_seq); NR_len[NR_no] = strlen(raw_seq); NR_no++; } } strncpy(raw_des, buffer1, MAX_DES-2); raw_seq[0] = 0; } else { read_in = 1; strcat(raw_seq, buffer1); } } // END while(1); if (1) { // the last record if ( strlen(raw_seq) >= MAX_SEQ-1) bomb_error("Long sequence found, enlarge Macro MAX_SEQ"); format_seq(raw_seq); if ( strlen(raw_seq) > length_of_throw ) { if ( (NR_seq[NR_no] = new char[strlen(raw_seq)+2] ) == NULL ) bomb_error("memory"); strcpy( NR_seq[NR_no], raw_seq); NR_len[NR_no] = strlen(raw_seq); NR_no++; } } in1.close(); return 0; } // END db_read_in int db_read_in2(ifstream &in1, int length_of_throw, int & NR_no, char *NR_seq[], int *NR_len, int NRo_no, int * NRo_idx, int * NRo_id1, int * NRo_id2, int * NRo_NR_idx) { int i, j, k; char raw_seq[MAX_SEQ], raw_des[MAX_DES], raw_seq1[MAX_SEQ]; char buffer1[MAX_LINE_SIZE]; raw_seq[0] = raw_des[0] = buffer1[0] = 0; int read_in = 0; int id1, id2; NR_no = 0; while(1) { if ( in1.eof()) break; in1.getline(buffer1, MAX_LINE_SIZE-2, '\n'); if ( buffer1[0] == '>' || buffer1[0] == ';') { if ( read_in ) { // write last record if ( strlen(raw_seq) >= MAX_SEQ-1) bomb_error("Long sequence found, enlarge Macro MAX_SEQ"); format_seq(raw_seq); if ( strlen(raw_seq) > length_of_throw ) { if ( (NR_seq[NR_no] = new char[strlen(raw_seq)+2] ) == NULL ) bomb_error("memory"); strcpy( NR_seq[NR_no], raw_seq); NR_len[NR_no] = strlen(raw_seq); des_to_idx(id1, id2, raw_des); i = get_index_of_2_sorted_list(NRo_id1, NRo_id2, 0, NRo_no-1, id1, id2); if ( i != -1 ) NRo_NR_idx[ NRo_idx[i] ] = NR_no; // else { cout << id1 << " " << id2 << raw_des << endl; } NR_no++; } } strncpy(raw_des, buffer1, MAX_DES-2); raw_seq[0] = 0; } else { read_in = 1; strcat(raw_seq, buffer1); } } // END while(1); if (1) { // the last record if ( strlen(raw_seq) >= MAX_SEQ-1) bomb_error("Long sequence found, enlarge Macro MAX_SEQ"); format_seq(raw_seq); if ( strlen(raw_seq) > length_of_throw ) { if ( (NR_seq[NR_no] = new char[strlen(raw_seq)+2] ) == NULL ) bomb_error("memory"); strcpy( NR_seq[NR_no], raw_seq); NR_len[NR_no] = strlen(raw_seq); des_to_idx(id1, id2, raw_des); i = get_index_of_2_sorted_list(NRo_id1, NRo_id2, 0, NRo_no-1, id1, id2); if ( i != -1 ) NRo_NR_idx[ NRo_idx[i] ] = NR_no; NR_no++; } } in1.close(); return 0; } // END db_read_in2 int sort_seqs_divide_segs (int NR_no, int *NR_len, int *NR_idx, char *NR_seq[], int mem_limit, int NAAN, int &SEG_no, int *SEG_b, int *SEG_e, char db_swap[MAX_SEG][MAX_FILE_NAME]) { int i, j, k, i1; // ************************************* change all the NR_seq to iseq int len, len1, len2, len22, total_letter=0; int max_len = 0, min_len = 99999; for (i=0; i max_len) max_len = len; if (len < min_len) min_len = len; setiseq(NR_seq[i], len); } cout << "longest and shortest : " << max_len << " and " << min_len << endl; cout << "Total letters: " << total_letter << endl; // END change all the NR_seq to iseq // **************************** Form NR_idx[], Sort them from Long to short int *size_no; int *size_begin; if ((size_no = new int[max_len-min_len+1]) == NULL ) bomb_error("Memory"); if ((size_begin = new int[max_len-min_len+1]) == NULL ) bomb_error("Memory"); for (i=max_len; i>=min_len; i--) { size_no[max_len - i] = 0; size_begin[max_len - i] = 0; } for (i=0; i=min_len; i--) for (j=max_len; j>i; j--) size_begin[max_len-i] += size_no[max_len-j]; for (i=max_len; i>=min_len; i--) size_no[max_len - i] = 0; for (i=0; imem_limit ) { SEG_b[SEG_no] = k; SEG_e[SEG_no] = i1; sprintf(db_swap[SEG_no], "SWAP.%d",SEG_no); j=0; k=i1+1; SEG_no++; if ( SEG_no >= MAX_SEG ) bomb_error("Too many segments, enlarge Macro MAX_SEG or change -M option"); } } if ( SEG_no == 0 ) { SEG_b[SEG_no] = 0; SEG_e[SEG_no] = NR_no-1; sprintf(db_swap[SEG_no], "SWAP.%d",SEG_no); SEG_no++; } else if ( SEG_e[SEG_no-1] != NR_no-1 ) { // last Segment SEG_b[SEG_no] = k; SEG_e[SEG_no] = NR_no-1; sprintf(db_swap[SEG_no], "SWAP.%d",SEG_no); SEG_no++; } if (SEG_no > 1) cout << "Sequences divided into " << SEG_no << " parts\n"; return 0; }// END sort_seqs_divide_segs int db_read_and_write (ifstream &in1, ofstream &out1, int length_of_throw, int des_len, char *NR_seq[], int *NR_clstr_no) { char raw_seq[MAX_SEQ], raw_des[MAX_DES], raw_seq1[MAX_SEQ]; char buffer1[MAX_LINE_SIZE]; raw_seq[0] = raw_des[0] = buffer1[0] = 0; int read_in = 0; int NR_no1 = 0; while(1) { if ( in1.eof()) break; in1.getline(buffer1, MAX_LINE_SIZE-2, '\n'); if ( buffer1[0] == '>' || buffer1[0] == ';') { if ( read_in ) { // write last record strcpy(raw_seq1, raw_seq); format_seq(raw_seq1); if ( strlen(raw_seq1) > length_of_throw ) { if (NR_clstr_no[NR_no1] >= 0 ) out1 << raw_des << "\n" << raw_seq; if ((NR_seq[NR_no1] = new char[des_len] ) == NULL ) bomb_error("memory"); strncpy(NR_seq[NR_no1], raw_des, des_len-2); NR_seq[NR_no1][des_len-2]=0; NR_no1++; } } strncpy(raw_des, buffer1, MAX_DES-2); raw_seq[0] = 0; } else { read_in = 1; strcat(raw_seq, buffer1); strcat(raw_seq,"\n"); } } // END while(1); if (1) { // the last record strcpy(raw_seq1, raw_seq); format_seq(raw_seq1); if ( strlen(raw_seq1) > length_of_throw ) { if (NR_clstr_no[NR_no1] >= 0 ) out1 << raw_des << "\n" << raw_seq; if ((NR_seq[NR_no1] = new char[des_len] ) == NULL ) bomb_error("memory"); strncpy(NR_seq[NR_no1], raw_des, des_len-2); NR_seq[NR_no1][des_len-2]=0; NR_no1++; } } return 0; } // END db_read_and_write int old_clstr_read_in(ifstream &in_clstr, int &NRo_no, int &NRo90_no, int *NRo_idx, int *NRo_id1, int *NRo_id2, char *NRo_iden, int *NRo_clstr_no, int *NRo_NR_idx) { int i, j, k, i1; char buffer1[MAX_LINE_SIZE]; int is_rep, len; char iden, iden1[4]; NRo_no = 0; NRo90_no = 0; while(1) { if (in_clstr.eof()) break; in_clstr.getline(buffer1, MAX_LINE_SIZE-2, '\n'); if ( buffer1[0] == '>') { //read in line like >Cluster 10 NRo90_no ++; } else { //read in line like //1 225aa, >gi|4099051|gb|AAD... at 80% //2 9448aa, >gi|8249467|emb|CA... * len = strlen(buffer1); if (len<12) continue; iden = 0; for (i=len-1; i>=0; i--) { if (buffer1[i] == '*') { is_rep = 1; break; } else if (buffer1[i] == '%') { is_rep = 0; for (j=i-1; isdigit(buffer1[j]); j--) ; strncpy(iden1, buffer1+j, i-j); iden1[i-j]=0; iden = atoi(iden1); break; } } for (i=0; i' ) break; NRo_idx[NRo_no] = NRo_no; des_to_idx(NRo_id1[NRo_no], NRo_id2[NRo_no], buffer1+i); NRo_clstr_no[NRo_no] = is_rep ? NRo90_no -1 : -1 - (NRo90_no-1); NRo_NR_idx[NRo_no] = -1; NRo_iden[NRo_no] = iden; NRo_no ++; } } for (i=0; i=0; j--) { if ( i1 == -1-NRo_clstr_no[j] ) NRo_clstr_no[j] = i; else break; } k=i; for (j=i+1; j b+1 ) { mid = (b+e) / 2; mid_v = list[mid]; if (element > mid_v) { b = mid; } else if (element < mid_v) { e = mid; } else { break; } } if (element == mid_v ) { return mid; } else if (element == list[e] ) { return e; } else if (element == list[b] ) { return b; } else { return -1; } } // END get_index_of_sorted_list // get index of a element of a sorted list using 2-div method // calling get_index_of_2_sorted_list (list, list2, begin_no, end_no, element) // list is a sorted list in order of increasing // if index of list is same, check list2 int get_index_of_2_sorted_list (int *list, int *list2, int b, int e, int element, int element2) { int i = get_index_of_sorted_list(list, b, e, element); if ( i == -1 ) { return -1; } int bb = i; int ee = i; while (1) { if (bb == b) break; if (list[bb] == list[bb-1] ) {bb--;} else {break;} } while(1) { if (ee == e) break; if (list[ee] == list[ee+1] ) {ee++;} else {break;} } i = get_index_of_sorted_list(list2, bb, ee, element2); return i; } // END get_index_of_2_sorted_list /////////////////////////// END ALL ////////////////////////