/*
 *
 *
 *
 * fmtseq.java
 *
 * Fmtseq is a C program which can convert biological database sequences
 * from one file format to another.  This is a Java implementation of
 * some of that program as an applet.
 *
 * Author:  James Knight at CuraGen Corporation
 *          322 E. Main St.
 *          Branford, CT  06405
 *
 * E-mail:  jknight@curagen.com
 *
 * Copyright (c)1996 James Knight at CuraGen Corporation
 *
 * This software may be used, copied, modified and/or distributed
 * freely by anyone for any purpose, commercial or non-commercial,
 * subject to the restrictions listed in this copyright notice.
 * All materials or software developed as a consequence of the
 * use of this software or software derived from it must duly
 * acknowledge such use, in accordance with the usual standards
 * of acknowledging credit in academic research.
 *
 * Although all reasonable efforts have been taken to ensure
 * the accuracy and reliability of the software and data, James
 * Knight and CuraGen Corporation do not and cannot warrant the
 * performance or results that may be obtained by using this
 * software or data.  James Knight and CuraGen Corporation
 * disclaim all warranties, express or implied, including
 * warranties of performance, merchantability or fitness for
 * any particular purpose.
 *
 * HISTORY
 *   9 Nov 1996 - 23 Nov 1996
 *      Original Implementation.
 */
import java.awt.*;
import java.applet.*;
import java.util.*;

/*
 * The applet itself: an input text area, a set of transformation
 * options, and an output text area.  The "Perform Conversion" button
 * triggers do_conversion(), the "Clear Input" button empties the input.
 */
public class fmtseq extends Applet {
  /* Drop-down lists selecting the input and output file formats. */
  Choice infmt_choice, outfmt_choice;
  /* Independent transformation toggles. */
  Checkbox reverse_box, complement_box, tu_box, ut_box;
  /* Mutually exclusive case options (they share one CheckboxGroup). */
  Checkbox nocase_box, toupper_box, tolower_box;
  /* Mutually exclusive mode options (they share one CheckboxGroup). */
  Checkbox normalmode_box, rawmode_box, stripmode_box;
  /* The text to convert and the conversion result. */
  TextArea input_text, output_text;
  /* One-column fields holding the input and output gap characters. */
  TextField gapin_text, gapout_text;

  /*
   * init
   *
   * Builds the user interface.  The statement order matters: components
   * are laid out in the order in which they are added.
   */
  public void init()
  {
    Label l;
    Canvas space;
    Panel p1, p2, p3;
    fmtseqHRule hr;
    CheckboxGroup g1;
    GridBagLayout layout;
    GridBagConstraints c;
    Font f;

    /*
     * Create the font used by the text areas.
     */
    f = new Font("Courier", Font.PLAIN, 14);

    /*
     * Set the layout for the applet so that it flows vertically down the
     * page (using GridBagLayout, since Java doesn't have a vertical
     * flow layout).  Note that with this layout, each row of the page
     * must be encapsulated in a panel of its own.
     *
     * Also, set the constraints so that none of the rows except the
     * output text area can resize itself (so that the output text area
     * grows to fill up the available space).
     */
    layout = new GridBagLayout();
    setLayout(layout);

    c = new GridBagConstraints();
    c.fill = GridBagConstraints.HORIZONTAL;
    c.gridwidth = GridBagConstraints.REMAINDER;

    /*
     * Build the Input section of the applet.
     *
     * Each IN_* field is set to the index at which the matching item
     * is added, because do_conversion() compares the selected index of
     * the choice against those fields.
     */
    p1 = new Panel();
    p1.setLayout(new FlowLayout(FlowLayout.LEFT, 0, 3));
    p1.add(new Label("Input Format:"));
    infmt_choice = new Choice();
    infmt_choice.addItem("* (Any)");            IN_ANY = 0;
    infmt_choice.addItem("Raw/Plain");          IN_PLAIN = 1;
    infmt_choice.addItem("GenBank");            IN_GENBANK = 2;
    infmt_choice.addItem("EMBL/Swiss-Prot");    IN_EMBL = 3;
    infmt_choice.addItem("PIR (CODATA)");       IN_PIR = 4;
    infmt_choice.addItem("FASTA");              IN_FASTA = 5;
    infmt_choice.addItem("NBRF");               IN_NBRF = 6;
    infmt_choice.addItem("IG/Stanford");        IN_IG = 7;
    p1.add(infmt_choice);
    /* The button label doubles as the command string tested in action(). */
    p1.add(new Button("Clear Input"));
    layout.setConstraints(p1, c);
    add(p1);

    input_text = new TextArea(8, 60);
    input_text.setFont(f);
    layout.setConstraints(input_text, c);
    add(input_text);

    /* Horizontal rule separating the input and transformation sections. */
    hr = new fmtseqHRule();
    hr.resize(20, 30);
    layout.setConstraints(hr, c);
    add(hr);

    /*
     * Build the transformation section of the applet.
     *
     * (Note:  To work around the AWT's annoying habit of centering
     *         the labels of checkboxes that appear inside a grid
     *         layout, I enclosed each checkbox in its own Panel.)
     *
     * NOTE(review): toupper_box, tolower_box and normalmode_box are
     * added to the grid directly, without a wrapping Panel.
     */
    l = new Label("Transformations:");
    layout.setConstraints(l, c);
    add(l);

    p1 = new Panel();
    p1.setLayout(new FlowLayout(FlowLayout.LEFT, 50, 8));

    /* Column 1: the case options. */
    p2 = new Panel();
    p2.setLayout(new GridLayout(3, 1));
    g1 = new CheckboxGroup();
    p3 = new Panel();
    p3.setLayout(new FlowLayout(FlowLayout.LEFT, 0, 0));
    nocase_box = new Checkbox("Retain Case", g1, true);
    p3.add(nocase_box);
    p2.add(p3);
    toupper_box = new Checkbox("To Upper Case", g1, false);
    p2.add(toupper_box);
    tolower_box = new Checkbox("To Lower Case", g1, false);
    p2.add(tolower_box);
    p1.add(p2);

    /* Column 2: reverse, complement and the T<->U toggles. */
    p2 = new Panel();
    p2.setLayout(new GridLayout(3, 1));
    p3 = new Panel();
    p3.setLayout(new FlowLayout(FlowLayout.LEFT, 0, 0));
    reverse_box = new Checkbox("Reverse");
    p3.add(reverse_box);
    p2.add(p3);
    p3 = new Panel();
    p3.setLayout(new FlowLayout(FlowLayout.LEFT, 0, 0));
    complement_box = new Checkbox("Complement");
    p3.add(complement_box);
    p2.add(p3);
    p3 = new Panel();
    p3.setLayout(new FlowLayout(FlowLayout.LEFT, 0, 0));
    tu_box = new Checkbox("T -> U");
    p3.add(tu_box);
    ut_box = new Checkbox("U -> T");
    p3.add(ut_box);
    p2.add(p3);
    p1.add(p2);

    /* Column 3: the mode options (a second, separate CheckboxGroup). */
    p2 = new Panel();
    p2.setLayout(new GridLayout(3, 1));
    g1 = new CheckboxGroup();
    normalmode_box = new Checkbox("Normal Mode", g1, true);
    p2.add(normalmode_box);
    p3 = new Panel();
    p3.setLayout(new FlowLayout(FlowLayout.LEFT, 0, 0));
    rawmode_box = new Checkbox("Raw Mode", g1, false);
    p3.add(rawmode_box);
    p2.add(p3);
    p3 = new Panel();
    p3.setLayout(new FlowLayout(FlowLayout.LEFT, 0, 0));
    stripmode_box = new Checkbox("Strip Mode", g1, false);
    p3.add(stripmode_box);
    p2.add(p3);
    p1.add(p2);

    layout.setConstraints(p1, c);
    add(p1);

    /* Row holding the two gap-character fields, both defaulting to "-". */
    p1 = new Panel();
    p1.setLayout(new FlowLayout(FlowLayout.LEFT, 70, 3));
    p2 = new Panel();
    p2.setLayout(new FlowLayout(FlowLayout.LEFT, 0, 0));
    p2.add(new Label("Input Gap Char:"));
    gapin_text = new TextField(1);
    gapin_text.setText("-");
    p2.add(gapin_text);
    p1.add(p2);
    p2 = new Panel();
    p2.setLayout(new FlowLayout(FlowLayout.LEFT, 0, 0));
    p2.add(new Label("Output Gap Char:"));
    gapout_text = new TextField(1);
    gapout_text.setText("-");
    p2.add(gapout_text);
    p1.add(p2);
    layout.setConstraints(p1, c);
    add(p1);

    /* Horizontal rule separating the transformation and output sections. */
    hr = new fmtseqHRule();
    hr.resize(20, 24);
    layout.setConstraints(hr, c);
    add(hr);

    /*
     * Build the output section of the page.  As with the input choice,
     * each OUT_* field records the index of the item added beside it.
     */
    p1 = new Panel();
    p1.setLayout(new FlowLayout(FlowLayout.LEFT, 0, 3));
    p1.add(new Label("Output Format:"));
    outfmt_choice = new Choice();
    outfmt_choice.addItem("Raw");               OUT_RAW = 0;
    outfmt_choice.addItem("Plain");             OUT_PLAIN = 1;
    outfmt_choice.addItem("GenBank");           OUT_GENBANK = 2;
    outfmt_choice.addItem("EMBL");              OUT_EMBL = 3;
    outfmt_choice.addItem("Swiss-Prot");        OUT_SPROT = 4;
    outfmt_choice.addItem("PIR (CODATA)");      OUT_PIR = 5;
    outfmt_choice.addItem("FASTA");             OUT_FASTA = 6;
    outfmt_choice.addItem("FASTA-old");         OUT_FASTAOLD = 7;
    outfmt_choice.addItem("NBRF");              OUT_NBRF = 8;
    outfmt_choice.addItem("IG/Stanford");       OUT_IG = 9;
    p1.add(outfmt_choice);
    /* The button label doubles as the command string tested in action(). */
    p1.add(new Button("Perform Conversion"));
    layout.setConstraints(p1, c);
    add(p1);

    /*
     * The output area is the only component given a non-zero weight and
     * a BOTH fill, so it is the one that absorbs any extra space.
     */
    output_text = new TextArea();
    output_text.setFont(f);
    c.fill = GridBagConstraints.BOTH;
    c.weightx = 1.0;
    c.weighty = 1.0;
    layout.setConstraints(output_text, c);
    add(output_text);
  }

  /*
   * paint
   *
   * Draws a 3-pixel bevelled border around the applet: a light polygon
   * along the top and left edges, a dark one along the bottom and right.
   */
  public void paint(Graphics g)
  {
    int x[] = new int[6];
    int y[] = new int[6];
    Color light_border = new Color(224, 224, 224);
    Color dark_border = new Color(128, 128, 128);
    Dimension d = size();

    super.paint(g);

    /*
     * Paint the 3-D border around the edge of the applet using two
     * fillPolygon calls (since the 3D rectangle drawing wasn't working
     * for me).
     */
    /* Top and left edges: an L-shaped hexagon, 3 pixels thick. */
    x[0] = 0;            y[0] = 0;
    x[1] = d.width;      y[1] = 0;
    x[2] = d.width - 3;  y[2] = 3;
    x[3] = 3;            y[3] = 3;
    x[4] = 3;            y[4] = d.height - 3;
    x[5] = 0;            y[5] = d.height;
    g.setColor(light_border);
    g.fillPolygon(x, y, 6);

    /* Bottom and right edges: the mirror-image hexagon. */
    x[0] = d.width;      y[0] = d.height;
    x[1] = 0;            y[1] = d.height;
    x[2] = 3;            y[2] = d.height - 3;
    x[3] = d.width - 3;  y[3] = d.height - 3;
    x[4] = d.width - 3;  y[4] = 3;
    x[5] = d.width;      y[5] = 0;
    g.setColor(dark_border);
    g.fillPolygon(x, y, 6);
  }

  /*
   * insets
   *
   * Reserves an 8-pixel margin on every side of the applet.
   */
  public Insets insets()
  {
    return new Insets(8, 8, 8, 8);
  }

  /*
   * action
   *
   * Handles the two buttons, which are recognised by comparing arg with
   * their labels.  Returns true when the event was handled here and
   * false otherwise.
   */
  public boolean action(Event evt, Object arg)
  {
    /*
     * The only actions which the applet performs (other than the default
     * input/checkbox event handling) are the clearing of the input text
     * area and a conversion.
     *
     * NOTE(review): arg is dereferenced without a null check.
     */
    if (arg.equals("Clear Input")) {
      input_text.setText("");
      return true;
    }
    else if (arg.equals("Perform Conversion")) {
      do_conversion();
      return true;
    }

    return false;
  }

  /*
   *
   *
   * The Section handling all of the parsing and outputting of the
   * various file formats.
   *
   *
   * (Note:  Normally, I don't create this many global variables, but
   *         there's no real way to pass all of these values as
   *         parameters to each of the parsing and output routines
   *         and making it a structure requires a completely new
   *         class (which is not really worth the delay in load time
   *         over a network).)
* */ private String instring; private char[] inbuffer, seq; private int seqlen, seqbuflen, inbuflen, seqcount; private int entrystart, seqstart, entryend, nextentry; private int informat_num, outformat_num, case_mode; private boolean raw_mode, strip_mode, tu_mode, ut_mode; private boolean reverse_mode, complement_mode, gcg_flag; private char gapin, gapout; private String date, description, comment, organism; private String mainid, acclist, nid, pid; private boolean isfragment, iscircular; private int alphabet, acount, ccount, gcount, tcount, ucount, ocount; private int truelen; private int nbrf_header; private boolean sprot_flag; private boolean error_flag; private final static int NOCASE = 0; private final static int LOWERCASE = 1; private final static int UPPERCASE = 2; private final static int UNKNOWN = 0; private final static int DNA = 1; private final static int RNA = 2; private final static int PROTEIN = 3; private int IN_ANY; private int IN_PLAIN; private int IN_GENBANK; private int IN_EMBL; private int IN_PIR; private int IN_FASTA; private int IN_NBRF; private int IN_IG; private int OUT_RAW; private int OUT_PLAIN; private int OUT_GENBANK; private int OUT_EMBL; private int OUT_SPROT; private int OUT_PIR; private int OUT_FASTA; private int OUT_FASTAOLD; private int OUT_NBRF; private int OUT_IG; /* * do_conversion * * The main routine to perform the file format conversion. It first * gets the input text, determines its format and the option settings, * then loops through calls to parse the input and produce the converted * output. */ private void do_conversion() { String text; output_text.setText(""); error_flag = false; /* * Get the input. The applet keeps both a string and character array * version of the input, because the parsing works best using a * combination of the both of them. 
*/ instring = input_text.getText(); if (instring.length() == 0) { error_message("There is no input."); return; } inbuffer = instring.toCharArray(); inbuflen = instring.length(); seqcount = nextentry = 0; informat_num = infmt_choice.getSelectedIndex(); if (informat_num == IN_ANY) informat_num = autoDetermine(); outformat_num = outfmt_choice.getSelectedIndex(); /* * Determine the format conversion options. */ case_mode = NOCASE; if (toupper_box.getState()) case_mode = UPPERCASE; else if (tolower_box.getState()) case_mode = LOWERCASE; raw_mode = rawmode_box.getState(); strip_mode = stripmode_box.getState(); reverse_mode = reverse_box.getState(); complement_mode = complement_box.getState(); tu_mode = tu_box.getState(); ut_mode = ut_box.getState(); text = gapin_text.getText(); gapin = (text.length() == 0 ? '\0' : text.charAt(0)); text = gapout_text.getText(); gapout = (text.length() == 0 ? '\0' : text.charAt(0)); /* * The main loop which parses the input and does the conversion. */ if (informat_num == IN_PLAIN) { while (parseRaw()) convert(); } else if (informat_num == IN_GENBANK) { while (parseGenBank()) convert(); } else if (informat_num == IN_EMBL) { while (parseEMBL()) convert(); } else if (informat_num == IN_PIR) { while (parsePIR()) convert(); } else if (informat_num == IN_FASTA) { while (parseFASTA()) convert(); } else if (informat_num == IN_NBRF) { while (parseNBRF()) convert(); } else if (informat_num == IN_IG) { while (parseIG()) convert(); } output_text.select(0, 0); } /* * autoDetermine * * Automatically determine the file format from the initial text * in the file. This is, perhaps, too simple a version of the * autoDetermine function (a more complete version appears in the * seqio package), but I think it should work in most cases. 
*/
  private int autoDetermine()
  {
    if (isKeyword(0, "LOCUS"))
      return IN_GENBANK;
    else if (isKeyword(0, "ID "))
      return IN_EMBL;
    else if (isKeyword(0, "ENTRY"))
      return IN_PIR;
    /*
     * The NBRF test looks at the fourth character, so make sure the
     * input is long enough first (an input such as ">ab" used to
     * index past the end of the buffer).
     */
    else if (inbuflen > 3 && inbuffer[0] == '>' && inbuffer[3] == ';')
      return IN_NBRF;
    else if (inbuffer[0] == '>')
      return IN_FASTA;
    else if (inbuffer[0] == ';')
      return IN_IG;
    else
      return IN_PLAIN;
  }

  /*
   * convert
   *
   * Once an input entry has been parsed and its values are stored in
   * the global variables of the applet, perform any transformations
   * of the sequence and then call the output routine.
   *
   * The transformations are applied to seq[0..seqlen) in this order:
   * case change, reverse, complement, stripping of non-alphabetic
   * characters, gap character replacement/removal, and finally the
   * T<->U conversion (done while counting the residues).
   */
  private void convert()
  {
    int i, j, gapcount, notalphas, dnacount, residues;
    char ch, gapchar;
    boolean tuflag, fromDatabank, toDatabank;

    /*
     * Change the case of the sequence.
     */
    if (case_mode == LOWERCASE)
      for (i=0; i < seqlen; i++)
        seq[i] = toLowerCase(seq[i]);
    if (case_mode == UPPERCASE)
      for (i=0; i < seqlen; i++)
        seq[i] = toUpperCase(seq[i]);

    /*
     * Reverse and/or Complement the sequence.
     */
    if (reverse_mode) {
      for (i=0,j=seqlen-1; i < j; i++,j--) {
        ch = seq[i];
        seq[i] = seq[j];
        seq[j] = ch;
      }
    }

    if (complement_mode) {
      /*
       * If the sequence contains U's and no T's, the complement of
       * A is written as U instead of T.
       */
      tcount = ucount = 0;
      for (i=0; i < seqlen; i++) {
        if (seq[i] == 't' || seq[i] == 'T')
          tcount++;
        else if (seq[i] == 'u' || seq[i] == 'U')
          ucount++;
      }
      tuflag = (tcount == 0 && ucount > 0);

      for (i=0; i < seqlen; i++) {
        switch (seq[i]) {
        case 'A':  seq[i] = (tuflag ? 'U' : 'T');  break;
        case 'a':  seq[i] = (tuflag ? 'u' : 't');  break;
        case 'C':  seq[i] = 'G';  break;
        case 'c':  seq[i] = 'g';  break;
        case 'G':  seq[i] = 'C';  break;
        case 'g':  seq[i] = 'c';  break;
        case 'T':  seq[i] = 'A';  break;
        case 't':  seq[i] = 'a';  break;
        case 'U':  seq[i] = 'A';  break;
        case 'u':  seq[i] = 'a';  break;
        case 'R':  seq[i] = 'Y';  break;
        case 'r':  seq[i] = 'y';  break;
        case 'Y':  seq[i] = 'R';  break;
        case 'y':  seq[i] = 'r';  break;
        case 'W':  seq[i] = 'W';  break;
        case 'w':  seq[i] = 'w';  break;
        case 'S':  seq[i] = 'S';  break;
        case 's':  seq[i] = 's';  break;
        case 'M':  seq[i] = 'K';  break;
        case 'm':  seq[i] = 'k';  break;
        case 'K':  seq[i] = 'M';  break;
        case 'k':  seq[i] = 'm';  break;
        case 'H':  seq[i] = 'D';  break;
        case 'h':  seq[i] = 'd';  break;
        case 'B':  seq[i] = 'V';  break;
        case 'b':  seq[i] = 'v';  break;
        case 'V':  seq[i] = 'B';  break;
        case 'v':  seq[i] = 'b';  break;
        case 'D':  seq[i] = 'H';  break;
        case 'd':  seq[i] = 'h';  break;
        case 'N':  seq[i] = 'N';  break;
        case 'n':  seq[i] = 'n';  break;
        }
      }
    }

    /*
     * In normal mode, if the output format is one of the databank formats
     * (GenBank, EMBL, Swiss-Prot or PIR) and the input format was not
     * one of those formats (so the sequence may have gap and annotation
     * characters), remove all of the non-alphabetic characters from the
     * sequence (since the databank formats don't really expect them).
     * Strip mode removes them whatever the formats are.
     */
    if (!raw_mode) {
      fromDatabank = (informat_num == IN_GENBANK ||
                      informat_num == IN_EMBL ||
                      informat_num == IN_PIR);
      toDatabank = (outformat_num == OUT_GENBANK ||
                    outformat_num == OUT_EMBL ||
                    outformat_num == OUT_SPROT ||
                    outformat_num == OUT_PIR);
      if (strip_mode || (!fromDatabank && toDatabank)) {
        for (i=0,j=0; i < seqlen; i++) {
          if (isAlpha(seq[i])) {
            if (i != j)
              seq[j] = seq[i];
            j++;
          }
        }
        seqlen = j;
      }
    }

    /*
     * Either change the gap character of the sequence, or degap the
     * sequence, as appropriate.  gapchar remembers which character
     * represents a gap in the sequence once this step is done, so that
     * the counting loop below still recognises the gaps after they have
     * been rewritten (it used to compare against gapin, which no longer
     * occurs in the sequence at that point).
     */
    gapchar = gapin;
    if (gapin != '\0' && gapin != gapout) {
      for (i=0,j=0; i < seqlen; i++) {
        if (seq[i] == gapin) {
          if (gapout != '\0')
            seq[j++] = gapout;
        }
        else {
          if (i != j)
            seq[j] = seq[i];
          j++;
        }
      }
      seqlen = j;
      gapchar = gapout;
    }

    /*
     * As a final step of the parsing, determine the alphabet for
     * the sequence, along with counts of the A's, C's, G's, T's and
     * U's (used when the alphabet is DNA or RNA).
     *
     * Also in this step, perform the T->U and U->T conversion, so that
     * the alphabet is set to the post-transformation alphabet.
     */
    acount = ccount = gcount = tcount = ucount = ocount = 0;
    gapcount = notalphas = 0;
    for (i=0; i < seqlen; i++) {
      switch (seq[i]) {
      case 'a':  case 'A':  acount++;  break;
      case 'c':  case 'C':  ccount++;  break;
      case 'g':  case 'G':  gcount++;  break;

      case 't':  case 'T':
        if (!tu_mode)
          tcount++;
        else {
          seq[i] = (seq[i] == 't' ? 'u' : 'U');
          ucount++;
        }
        break;

      case 'u':  case 'U':
        if (!ut_mode)
          ucount++;
        else {
          seq[i] = (seq[i] == 'u' ? 't' : 'T');
          tcount++;
        }
        break;

      default:
        if (seq[i] == gapchar)
          gapcount++;
        else if (!isAlpha(seq[i]))
          notalphas++;
        ocount++;
      }
    }

    if (alphabet == UNKNOWN) {
      /*
       * Call it a nucleotide sequence when at least 85% of the non-gap
       * characters are A, C, G, T or U.  A sequence with no non-gap
       * characters never counts as nucleotide.
       */
      dnacount = acount + ccount + gcount + tcount + ucount;
      residues = seqlen - gapcount;
      if (residues > 0 &&
          ((float) dnacount) / ((float) residues) >= 0.85)
        alphabet = (tcount == 0 && ucount > 0 ? RNA : DNA);
      else if (notalphas == 0)
        alphabet = PROTEIN;
    }

    /*
     * Now that the sequence has been transformed, output an entry in the
     * output format.  When the input and output formats match, the
     * original header text is copied through with print() instead of
     * being regenerated (for GenBank and EMBL only if no transformation
     * that changes the residues was requested).
     */
    initPrint();
    if (outformat_num == OUT_RAW) {
      outputRaw();
    }
    else if (outformat_num == OUT_PLAIN) {
      outputPlain();
    }
    else if (outformat_num == OUT_GENBANK) {
      if (informat_num == IN_GENBANK && !complement_mode &&
          !tu_mode && !ut_mode)
        print(inbuffer, entrystart, seqstart);
      else
        outputGenBankHeader();
      outputGenBankSequence();
    }
    else if (outformat_num == OUT_EMBL) {
      if (informat_num == IN_EMBL && !sprot_flag && !complement_mode &&
          !tu_mode && !ut_mode)
        print(inbuffer, entrystart, seqstart);
      else
        outputEMBLHeader();
      outputEMBLSequence();
    }
    else if (outformat_num == OUT_SPROT) {
      if (informat_num == IN_EMBL && sprot_flag)
        print(inbuffer, entrystart, seqstart);
      else
        outputSProtHeader();
      outputSProtSequence();
    }
    else if (outformat_num == OUT_PIR) {
      if (informat_num == IN_PIR)
        print(inbuffer, entrystart, seqstart);
      else
        outputPIRHeader();
      outputPIRSequence();
    }
    else if (outformat_num == OUT_FASTA) {
      if (informat_num == IN_FASTA)
        print(inbuffer, entrystart, seqstart);
      else
        outputFASTAHeader();
      outputFASTASequence();
    }
    else if (outformat_num == OUT_FASTAOLD) {
      outputFASTAoldHeader();
      outputFASTASequence();
    }
    else if (outformat_num == OUT_NBRF) {
      if (informat_num == IN_NBRF) {
        print(inbuffer, entrystart, seqstart);
        outputNBRFSequence();
        if (nbrf_header > 0 && nbrf_header + 1 < entryend)
          print(inbuffer, nbrf_header, entryend);
      }
      else {
        outputNBRFHeader();
        outputNBRFSequence();
      }
    }
    else if (outformat_num == OUT_IG) {
      if (informat_num == IN_IG)
        print(inbuffer, entrystart, seqstart);
      else
        outputIGHeader();
      outputIGSequence();
    }
    else {
      error_message("Unknown Output Format:  " + outformat_num);
    }
    endPrint();
  }

  /*
   *
   *
   * The utility functions involved with the input parsers.
   *
   *
   *
   */

  /*
   * initParse
   *
   * Initialize all of the variables set by the parsing procedure.  This
   * function is called at the beginning of each parsing procedure.
*/ private void initParse() { seqcount++; seqlen = 0; entrystart = seqstart = entryend = -1; date = ""; mainid = ""; acclist = ""; nid = ""; pid = ""; description = ""; comment = ""; organism = ""; isfragment = false; iscircular = false; alphabet = UNKNOWN; nbrf_header = 0; sprot_flag = false; } /* * get EntryBounds * * For some formats, gets the beginning and end of the entry (setting * global variables entrystart and entryend). */ private boolean getEntryBounds(String start_key, String end_key) { entrystart = nextentry; if (!isKeyword(entrystart, start_key)) { error_message("Entry " + seqcount + " does not begin with a " + start_key + " line."); return false; } if ((entryend = instring.indexOf("\n" + end_key, entrystart)) == -1) { error_message("Entry " + seqcount + " does not end with a " + end_key + " line."); return false; } return true; } /* * getAccessions * * Parses a section of the text for accession numbers and stores them * in acclist. */ private void getAccessions(int pos, int end) { int start; while (pos < end) { while (pos < end && !isAlNum(inbuffer[pos])) pos++; for (start=pos; pos < end && isAlNum(inbuffer[pos]); pos++) ; if (start < pos) { if (acclist.length() == 0) acclist = instring.substring(start, pos); else acclist += "," + instring.substring(start, pos); } } } /* * getComment * * Parses a section of the text for comments and stores them in comment. */ private void getComment(int pos, int end, int headlen) { int i, start; for (start=pos; pos < end && inbuffer[pos] != '\n'; pos++) ; comment += instring.substring(start, pos) + "\n"; for (pos++; pos < end; pos++) { for (i=0; pos < end && i < headlen && inbuffer[pos] == ' '; pos++,i++) ; if (pos == end) break; for (start=pos; pos < end && inbuffer[pos] != '\n'; pos++) ; comment += instring.substring(start, pos) + "\n"; } } /* * getDescription * * Parses a section of the text for comments and stores them in comment. 
*/ private void getDescription(int pos, int end, int headlen) { int i, start; for (start=pos; pos < end && inbuffer[pos] != '\n'; pos++) ; if (start < pos) { if (description.length() == 0) description = instring.substring(start, pos); else description += " " + instring.substring(start, pos); } for (pos++; pos < end; pos++) { for (i=0; pos < end && i < headlen && inbuffer[pos] == ' '; pos++,i++) ; if (pos == end) break; for (start=pos; pos < end && inbuffer[pos] != '\n'; pos++) ; if (start < pos) { if (description.length() == 0) description = instring.substring(start, pos); else description += " " + instring.substring(start, pos); } } if (description.endsWith(".")) description = description.substring(0, description.length()-1); } /* * getSequence * * Some formats use this to extract the sequence from the text (and * reallocate the seq buffer if necessary). */ private void getSequence(int pos, int end, boolean databank_flag) { if (seqbuflen < end - pos) { seqbuflen = end - pos + 1; seq = new char[seqbuflen]; } seqlen = 0; if (databank_flag && !raw_mode) { for ( ; pos < end; pos++) if (isAlpha(inbuffer[pos])) seq[seqlen++] = inbuffer[pos]; } else { for ( ; pos < end; pos++) if (!isDigit(inbuffer[pos]) && !isSpace(inbuffer[pos])) seq[seqlen++] = inbuffer[pos]; } } /* * getIDList * * This is used to extract the main identifier from an identifier * list formatted text, where the format of the identifier list could * be one of three formats, described below. Unless the type of the * identifier is given in the identifier list, the "prefix" parameter * specifies the type of the main identifier (i.e., "gb" for GenBank * LOCUS, "acc" for an accession number, "embl" for an EMBL identifier, * and so on). 
*/ private void getIDList(String prefix, int start, int end) { int i; String idlist, token, token2, token3, token4; StringTokenizer st; while (start < end && isSpace(inbuffer[start])) start++; while (start < end && isSpace(inbuffer[end-1])) end--; if (start >= end) return; /* * The format of the idlist can be as follows: * * 1) It may begin with a '~', followed by 5-12 alphanumeric * characters. That string is treated as an accession number. * That string (and a following '|') is skipped before checking * for parts 2a or 2b. * 2a) If 2-4 alphanumeric characters are then followed by a ':', * the initial non-whitespace segment is treated as a SEQIO * identifier. * 2b) If 2-3 alphanumeric characters are then followed by a '|' * the initial non-whitespace segment is treated as an NCBI * Search Format identifier. * 3) None of 1 or 2 match, and the whole string is taken as * a single identifier. */ idlist = instring.substring(start, end); st = new StringTokenizer(idlist, "|"); while (st.hasMoreTokens()) { token = st.nextToken(); if (token.startsWith("~") || token.startsWith("acc:")) { token = token.substring(token.startsWith("~") ? 
1 : 4); if (acclist.length() == 0) acclist = token; else acclist += "," + token; } else if ((i = token.indexOf(":")) == 2 || i == 3 || i == 4) { if (mainid.length() > 0) break; mainid = token; } else if (token.length() == 2 || token.length() == 3) { if (mainid.length() > 0) break; if (token.equals("bbm") || token.equals("bbs") || token.equals("gi") || token.equals("lcl") || token.equals("pdb")) { if (token.equals("lcl")) token = "oth"; if (st.hasMoreTokens()) { token2 = st.nextToken(); mainid = token + ":" + token2; } } else if (token.equals("dbj") || token.equals("emb") || token.equals("gb") || token.equals("gp") || token.equals("oth") || token.equals("pir") || token.equals("prf") || token.equals("sp")) { if (token.equals("emb")) token = "embl"; else if (token.equals("dbj")) token = "ddbj"; if (st.hasMoreTokens()) { token2 = st.nextToken(); if (st.hasMoreTokens()) { token3 = st.nextToken(); if (acclist.length() == 0) acclist = token2; else acclist += "," + token2; mainid = token + ":" + token3; } } } else if (token.equals("gnl")) { if (st.hasMoreTokens()) { token2 = st.nextToken(); if (st.hasMoreTokens()) { token3 = st.nextToken(); mainid = "gnl:" + token2 + ":" + token3; } } } else if (token.equals("pat")) { if (st.hasMoreTokens()) { token2 = st.nextToken(); if (st.hasMoreTokens()) { token3 = st.nextToken(); if (st.hasMoreTokens()) { token4 = st.nextToken(); mainid = "oth:" + token2 + "." + token3 + "." 
+ token4; } } } } else if (!st.hasMoreTokens()) mainid = prefix + ":" + token; break; } else { if (mainid.length() > 0) break; mainid = prefix + ":" + token; break; } } } private void getOneline(int start, int end) { int i, idend, pos, pos1, pos2; String s; while (start < end && isSpace(inbuffer[start])) start++; while (start < end && isSpace(inbuffer[end-1])) end--; if (start < end && inbuffer[end-1] == '.') end--; if (start >= end) return; /* * First, look to see if the line begins with an identifier list * by checking for an initial '~' (specifying an accession number), * a '|' at the second or third character (specifying an NCBI ident list), * or a ':' at the second, third or fourth character (for a SEQIO list). */ for (idend=start; idend < end && !isSpace(inbuffer[idend]); idend++) ; if (idend - start >= 4 && (inbuffer[start] == '~' || inbuffer[start+2] == '|' || inbuffer[start+3] == '|' || inbuffer[start+2] == ':' || inbuffer[start+3] == ':' || inbuffer[start+4] == ':')) { getIDList("oth", start, idend); while (idend < end && isSpace(inbuffer[idend])) idend++; start = idend; } /* * Next, search the rest of the string for the two dividing points * in the oneline format: * 1) a ' - ' divides the description from the organism name, * 2) the suffix of the string consists of * - a comma * - a number (the sequence length) * - one of 'bp', 'aa' or 'ch' * - an optional string in parentheses, i.e. '(circular DNA)' * - an optional period * That separates the pieces of the line. 
*/ pos1 = 0; pos2 = 0; for (pos=start; pos < end; pos++) { if (pos1 == 0 && inbuffer[pos] == '-' && pos > start && inbuffer[pos-1] == ' ' && pos+1 < end && inbuffer[pos+1] == ' ') pos1 = pos; else if (pos2 == 0 && inbuffer[pos] == ',') { for (i=pos+1; i < end && isSpace(inbuffer[i]); i++) ; if (i < end && isDigit(inbuffer[i])) { while (i < end && isDigit(inbuffer[i])) i++; while (i < end && isSpace(inbuffer[i])) i++; if (i + 1 < end && (isKeyword(i, "bp") || isKeyword(i, "aa") || isKeyword(i, "ch"))) { for (i+=2; i < end && isSpace(inbuffer[i]); i++) ; if (i < end && inbuffer[i] == '(') { while (i < end && inbuffer[i] != ')') i++; i++; } while (i < end && isSpace(inbuffer[i])) i++; if (i < end && inbuffer[i] == '.') i++; if (i == end) { pos2 = pos; break; } } } } } if (pos1 > 0) { description = instring.substring(start, pos1-1).trim(); if (pos2 > 0) organism = instring.substring(pos1+1, pos2).trim(); else organism = instring.substring(pos1+1, end).trim(); } else if (pos2 > 0) description = instring.substring(start, pos2).trim(); else description = instring.substring(start, end); if (pos2 > 0) { s = instring.substring(pos2, end); if (s.indexOf("DNA") != -1) alphabet = DNA; else if (s.indexOf("RNA") != -1) alphabet = RNA; else if (s.indexOf("aa") != -1) alphabet = PROTEIN; if (s.indexOf("circular") != -1) iscircular = true; if (s.indexOf("fragment") != -1) isfragment = true; } } /* * * * The file format input parsing routines. * * * */ /* * parseRaw * * Parse the Raw and Plain formats. */ private boolean parseRaw() { int i; if (seqcount != 0) return false; initParse(); entrystart = seqstart = 0; getSequence(0, inbuflen, false); entryend = inbuflen; return true; } /* * parseGenBank * * Parse the GenBank flat-file format. */ private boolean parseGenBank() { int i, pos, start, end; String alphastr; if (nextentry >= inbuflen) return false; /* * Begin the parse, getting the bounds of the entry and checking * to make sure the entry is in the correct format. 
*/
    initParse();
    if (!getEntryBounds("LOCUS", "//"))
      return false;

    /*
     * Process the LOCUS line.  The fields are read from fixed columns,
     * so the line must be at least 73 characters long.
     */
    pos = entrystart;
    for (start=pos; pos < entryend && inbuffer[pos] != '\n'; pos++)
      ;
    if (pos - start < 73) {
      error_message("Entry " + seqcount + " begins with an invalid LOCUS " +
                    "line.");
      return false;
    }

    getIDList("gb", start+12, start+22);
    alphastr = instring.substring(start+36, start+40);
    if (alphastr.indexOf("RNA") != -1)
      alphabet = RNA;
    else if (alphastr.indexOf("DNA") != -1)
      alphabet = DNA;
    else if (alphastr.indexOf("PRT") != -1)
      alphabet = PROTEIN;
    iscircular = isKeyword(start+42, "circular");
    date = instring.substring(start+62, start+73);

    /*
     * Process the other header lines.  On entry pos is at the newline
     * ending the LOCUS line; after each section it is moved to the
     * start of the next one.
     */
    while (pos < entryend && !isKeyword(pos, "ORIGIN")) {
      /*
       * Find the end of the section (the next line that begins with
       * a non-space character).
       */
      for (end=pos+1; end < entryend; end++)
        if (inbuffer[end-1] == '\n' && !isSpace(inbuffer[end]))
          break;

      /*
       * Dispatch on the first letter of the section keyword.  The cases
       * may advance pos freely, since it is reset to end afterwards.
       */
      switch (toUpperCase(inbuffer[pos])) {
      case 'A':
        if (isKeyword(pos, "ACCESSION"))
          getAccessions(pos+12, end);
        break;

      case 'C':
        if (isKeyword(pos, "COMMENT"))
          getComment(pos+12, end, 12);
        break;

      case 'D':
        if (isKeyword(pos, "DEFINITION"))
          getDescription(pos+12, end, 12);
        break;

      case 'N':
        if (isKeyword(pos, "NID")) {
          /* Take the first alphanumeric run after the keyword column. */
          for (pos+=12; pos < end && !isAlNum(inbuffer[pos]); pos++)
            ;
          for (start=pos; pos < end && isAlNum(inbuffer[pos]); pos++)
            ;
          if (start < pos)
            nid = instring.substring(start, pos);
        }
        break;

      case 'P':
        if (isKeyword(pos, "PID")) {
          /* Take the first alphanumeric run after the keyword column. */
          for (pos+=12; pos < end && !isAlNum(inbuffer[pos]); pos++)
            ;
          for (start=pos; pos < end && isAlNum(inbuffer[pos]); pos++)
            ;
          if (start < pos)
            pid = instring.substring(start, pos);
        }
        break;

      case 'S':
        if (isKeyword(pos, "SOURCE")) {
          /*
           * Skip the SOURCE line itself, then look at the start of each
           * following line of the section for the ORGANISM sub-keyword
           * and take the rest of that line as the organism name.
           *
           * NOTE(review): the keyword literal below starts with a
           * single space in this copy of the file; confirm that no
           * padding was lost.
           */
          for (start=pos; pos < end && inbuffer[pos] != '\n'; pos++)
            ;
          for (pos++; pos < end; pos++) {
            if (isKeyword(pos, " ORGANISM")) {
              for (pos+=12; pos < end && inbuffer[pos] == ' '; pos++)
                ;
              for (start=pos; pos < end && inbuffer[pos] != '\n'; pos++)
                ;
              if (start < pos)
                organism = instring.substring(start, pos);
              break;
            }
            while (pos < end && inbuffer[pos] != '\n')
              pos++;
          }
        }
        break;
      }

      pos = end;
    }

    /*
     * Process the sequence.  The sequence starts on the line after the
     * ORIGIN line and runs up to the end of the entry.
     */
    if (pos < entryend) {
      for (pos+=6; pos < entryend && inbuffer[pos] != '\n'; pos++)
        ;
      seqstart = pos + 1;
      getSequence(seqstart, entryend, true);
    }

    /*
     * entryend is at the newline before the "//" terminator.  Step over
     * that newline and the two slashes, and over one more character if
     * the input continues, to reach the start of the next entry.
     */
    entryend += 3;
    if (entryend < inbuflen)
      entryend++;
    nextentry = entryend;

    return true;
  }

  /*
   * parseEMBL
   *
   * Parse an EMBL, Swiss-Prot or EMBL/Swiss-Prot variant entry.
   * Returns false when there is no further entry or the entry is
   * malformed.
   */
  private boolean parseEMBL()
  {
    int i, pos, start, end, count, index;
    boolean period;
    String prefix;

    if (nextentry >= inbuflen)
      return false;

    /*
     * Begin the parse, getting the bounds of the entry and checking
     * to make sure the entry is in the correct format.
     *
     * NOTE(review): the "ID " literal has a single trailing space in
     * this copy of the file, while the code below skips 5 characters of
     * line header; confirm that no padding was lost.
     */
    initParse();
    if (!getEntryBounds("ID ", "//"))
      return false;

    /*
     * Process the ID line, first distinguishing between the
     * EMBL, EPD and Swiss-Prot entries.  The decision is based on the
     * number of semicolons on the line and on whether it contains a
     * period; index ends up 3 characters before the last semicolon.
     */
    pos = entrystart + 5;
    for (end=pos; end < entryend && inbuffer[end] != '\n'; end++)
      ;

    count = index = 0;
    period = false;
    for (i=pos; i < end; i++) {
      if (inbuffer[i] == ';') {
        count++;
        index = i - 3;
      }
      else if (inbuffer[i] == '.')
        period = true;
    }
    if (count == 2 && period) {
      prefix = "sp";
      sprot_flag = true;
    }
    else if (count == 3 && period && isKeyword(index, "EPD;"))
      prefix = "epd";
    else
      prefix = "embl";

    /*
     * Then, extract the identifier, circularity and alphabet information
     * from the line.
     */
    while (pos < end && inbuffer[pos] == ' ')
      pos++;
    for (start=pos; pos < end && !isSpace(inbuffer[pos]); pos++)
      ;
    if (start < pos)
      getIDList(prefix, start, pos);

    for ( ; pos < end; pos++) {
      switch (inbuffer[pos]) {
      case 'c':
        if (isKeyword(pos, "circular"))
          iscircular = true;
        break;

      case 'd':  case 'D':
        if (isKeyword(pos, "DNA"))
          alphabet = DNA;
        break;

      case 'r':  case 'R':
        if (isKeyword(pos, "RNA"))
          alphabet = RNA;
        break;

      case 'p':  case 'P':
        if (isKeyword(pos, "PRT"))
          alphabet = PROTEIN;
        break;
      }
    }

    /*
     * Process the other header lines, one line at a time, until the
     * line header tested by the loop condition is found.
     *
     * NOTE(review): that literal is a single space in this copy of the
     * file; confirm that no padding was lost.
     */
    while (pos < entryend && !isKeyword(pos, " ")) {
      /* Set end to the start of the next line. */
      for (end=pos+1; end < entryend && inbuffer[end-1] != '\n'; end++)
        ;

      /*
       * Dispatch on the first letter of the line code.  The cases may
       * advance pos freely, since it is reset to end afterwards.
       */
      switch (toUpperCase(inbuffer[pos])) {
      case 'A':
        if (isKeyword(pos, "AC "))
          getAccessions(pos+5, end);
        break;

      case 'C':
      case 'X':
        /* Any line starting with 'X' clears the Swiss-Prot flag. */
        if (inbuffer[pos] == 'X')
          sprot_flag = false;

        if (isKeyword(pos, "CC ") || isKeyword(pos, "XX ")) {
          /*
           * Append the line's text to comment.  An extra newline is
           * inserted first when a comment already exists and the three
           * characters before this line are "XX" and a newline.
           */
          if (comment.length() > 0 && isKeyword(pos-3, "XX\n"))
            comment += "\n";

          pos += 5;
          for (start=pos; pos < end && inbuffer[pos] != '\n'; pos++)
            ;
          comment += instring.substring(start, pos) + "\n";
        }
        break;

      case 'D':
        if (isKeyword(pos, "DE ")) {
          getDescription(pos+5, end, 5);
          /* A "(FRAGMENT" marker is cut off and recorded as a flag. */
          if ((i = description.indexOf("(FRAGMENT")) != -1 ||
              (i = description.indexOf("(fragment")) != -1) {
            isfragment = true;
            description = description.substring(0, i).trim();
          }
        }
        else if (isKeyword(pos, "DT ")) {
          /* The date is the first word, ending at space, ';' or '.'. */
          for (pos+=5; pos < end && inbuffer[pos] == ' '; pos++)
            ;
          for (start=pos; pos < end; pos++) {
            if (isSpace(inbuffer[pos]) || inbuffer[pos] == ';' ||
                inbuffer[pos] == '.')
              break;
          }
          date = instring.substring(start, pos);
        }
        break;

      case 'N':
        if (isKeyword(pos, "NI ")) {
          /* Take the first alphanumeric run after the line header. */
          for (pos+=5; pos < end && !isAlNum(inbuffer[pos]); pos++)
            ;
          for (start=pos; pos < end && isAlNum(inbuffer[pos]); pos++)
            ;
          if (start < pos)
            nid = instring.substring(start, pos);
        }
        break;

      case 'P':
        if (isKeyword(pos, "PI ")) {
          /* Take the first alphanumeric run after the line header. */
          for (pos+=5; pos < end && !isAlNum(inbuffer[pos]); pos++)
            ;
          for (start=pos; pos < end && isAlNum(inbuffer[pos]); pos++)
            ;
          if (start < pos)
            pid = instring.substring(start, pos);
        }
        break;

      case 'O':
        if (isKeyword(pos, "OS ")) {
          /* The organism is the rest of the line, minus a final period. */
          for (pos+=5; pos < end && inbuffer[pos] == ' '; pos++)
            ;
          for (start=pos; pos < end && inbuffer[pos] != '\n'; pos++)
            ;
          if (start < pos) {
            if (inbuffer[pos-1] == '.')
              pos--;
            organism = instring.substring(start, pos);
          }
        }
        break;
      }

      pos = end;
    }

    /*
     * Process the sequence, which runs from the current line up to the
     * end of the entry.
     */
    if (pos < entryend) {
      seqstart = pos;
      getSequence(pos, entryend, true);
    }

    /*
     * entryend is at the newline before the "//" terminator.  Step over
     * that newline and the two slashes, and over one more character if
     * the input continues, to reach the start of the next entry.
     */
    entryend += 3;
    if (entryend < inbuflen)
      entryend++;
    nextentry = entryend;

    return true;
  }

  private boolean parsePIR()
  {
    int i, pos, start, end;

    if (nextentry >= inbuflen)
      return false;

    /*
     * Begin the parse, getting the bounds of the entry and checking
     * to make sure the entry is in the correct format.
     */
    initParse();
    if (!getEntryBounds("ENTRY", "///"))
      return false;

    /*
     * Process the ENTRY line.
     */
    pos = entrystart;
    for (pos+=6; pos < entryend && inbuffer[pos] == ' '; pos++)
      ;
    if (inbuffer[pos] == '\n') {
      error_message("Entry " + seqcount + " begins with an invalid ENTRY " +
                    "line");
      return false;
    }

    for (start=pos; pos < entryend && !isSpace(inbuffer[pos]); pos++)
      ;
    if (start < pos)
      getIDList("pir", start, pos);

    for ( ; pos < entryend && inbuffer[pos] != '\n'; pos++)
      if (inbuffer[pos] == 'f' && isKeyword(pos, "fragment"))
        isfragment = true;

    /*
     * Process the other header lines.
     */
    while (pos < entryend && !isKeyword(pos, "SEQUENCE")) {
      /*
       * Find the end of the section (the next line that begins with
       * a non-space character).
*/ for (end=pos+1; end < entryend; end++) if (inbuffer[end-1] == '\n' && !isSpace(inbuffer[end])) break; switch (toUpperCase(inbuffer[pos])) { case 'A': if (isKeyword(pos, "ACCESSION")) getAccessions(pos+10, end); break; case 'C': if (isKeyword(pos, "COMMENT")) getComment(pos+11, end, 11); break; case 'D': if (isKeyword(pos, "DATE")) { for (pos+=4; pos < end && inbuffer[pos] == ' '; pos++) ; start = pos; for ( ; pos < end; pos++) { if (inbuffer[pos] == '#') { while (pos < end && !isSpace(inbuffer[pos])) pos++; while (pos < end && isSpace(inbuffer[pos])) pos++; if (pos + 11 < end) start = pos; } } date = instring.substring(start, start+11); } break; case 'N': if (isKeyword(pos, "NID")) { for (pos+=11; pos < end && !isAlNum(inbuffer[pos]); pos++) ; for (start=pos; pos < end && isAlNum(inbuffer[pos]); pos++) ; if (start < pos) nid = instring.substring(start, pos); } break; case 'P': if (isKeyword(pos, "PID")) { for (pos+=11; pos < end && !isAlNum(inbuffer[pos]); pos++) ; for (start=pos; pos < end && isAlNum(inbuffer[pos]); pos++) ; if (start < pos) pid = instring.substring(start, pos); } break; case 'O': if (isKeyword(pos, "ORGANISM")) { for (pos+=8; pos < end; pos++) { if (inbuffer[pos] == '#' && isKeyword(pos, "#formal_name")) { while (pos < end && !isSpace(inbuffer[pos])) pos++; while (pos < end && isSpace(inbuffer[pos])) pos++; boolean flag = true; while (pos < end && inbuffer[pos] != '#') { for (start=pos; pos < end && !isSpace(inbuffer[pos]); pos++) ; organism += (flag ? "" : " ") + instring.substring(start, pos); while (pos < end && isSpace(inbuffer[pos])) pos++; flag = false; } break; } } } break; case 'T': if (isKeyword(pos, "TITLE")) { getDescription(pos + 11, end, 11); if ((i = description.indexOf("(fragment)")) != -1) { isfragment = true; description = description.substring(0, i).trim(); } if ((i = description.indexOf(" - ")) != -1) description = description.substring(0, i).trim(); } break; } pos = end; } /* * Process the sequence. 
*/ if (pos < entryend) { for (pos+=7; pos < entryend && inbuffer[pos] != '\n'; pos++) ; seqstart = pos + 1; getSequence(seqstart, entryend, true); } entryend += 4; if (entryend < inbuflen) entryend++; nextentry = entryend; return true; } private boolean parseNBRF() { int pos, start, end; char ch; if (nextentry >= inbuflen) return false; initParse(); entrystart = nextentry; /* * Check to make sure the entry is in the correct format. */ if (inbuffer[entrystart] != '>') { error_message("Entry " + seqcount + " does not begin with a '>' " + "header line."); return false; } if (entrystart + 5 >= inbuflen) { error_message("Entry " + seqcount + " begins with an invalid " + "header line."); return false; } pos = entrystart; /* * Process the header line. */ isfragment = (toUpperCase(inbuffer[pos+1]) == 'F'); iscircular = (toUpperCase(inbuffer[pos+2]) == 'C'); pos+=4; for (end=pos; end < inbuflen && inbuffer[end] != '\n'; end++) ; getIDList("oth", pos, end); pos = end + 1; /* * Process the description line. */ for (start=pos; pos < inbuflen && inbuffer[pos] != '\n'; pos++) ; if (start < pos) getOneline(start, pos); /* * Process the sequence. It must end with an asterisk. */ seqstart = pos + 1; for (end=pos; end < inbuflen && inbuffer[end] != '*'; end++) ; getSequence(seqstart, end, false); /* * Process any other header lines. 
*/ for (pos=end; pos < inbuflen && inbuffer[pos] != '\n'; pos++) ; nbrf_header = pos + 1; while (pos < inbuflen && inbuffer[pos] != '>') { for (end=pos; end < inbuflen && inbuffer[end] != '\n'; end++) ; if (toUpperCase(inbuffer[pos]) == 'C' && inbuffer[pos+1] == ';') { pos += 2; switch (toUpperCase(inbuffer[pos])) { case 'A': if (isKeyword(pos, "ACCESSION:")) getAccessions(pos + 10, end); break; case 'C': if (isKeyword(pos, "COMMENT:")) getComment(pos + 9, end, 11); break; case 'D': if (isKeyword(pos, "DATE:")) { for (pos+=5; pos < end && inbuffer[pos] == ' '; pos++) ; start = pos; for ( ; pos < end; pos++) { if (inbuffer[pos] == '#') { while (pos < end && !isSpace(inbuffer[pos])) pos++; while (pos < end && isSpace(inbuffer[pos])) pos++; if (pos + 11 < end) start = pos; } } date = instring.substring(start, start+11); } break; } } pos = (end < inbuflen ? end + 1 : inbuflen); } nextentry = entryend = pos; return true; } private boolean parseFASTA() { int pos, start, end; if (nextentry >= inbuflen) return false; initParse(); entrystart = nextentry; /* * Check to make sure the entry is in the correct format. */ if (inbuffer[entrystart] != '>') { error_message("Entry " + seqcount + " does not begin with a '>' " + "header line."); return false; } pos = entrystart; /* * Process the header line. */ pos++; for (start=pos; pos < inbuflen && inbuffer[pos] != '\n'; pos++) ; if (start < pos) getOneline(start, pos); pos++; /* * Process any comment lines. */ while (pos < inbuflen && (inbuffer[pos] == '>' || inbuffer[pos] == ';')) { for (start=pos+1; pos < inbuflen && inbuffer[pos] != '\n'; pos++) ; comment += instring.substring(start, pos) + "\n"; pos++; } /* * Process the sequence. 
*/
    /*
     * The sequence runs up to the next line that begins with '>' (or to
     * the end of the input).
     */
    seqstart = pos;
    for (end=pos; end < inbuflen; end++)
      if (inbuffer[end-1] == '\n' && inbuffer[end] == '>')
        break;

    /* Grow the sequence buffer if this span could not fit in it. */
    if (seqbuflen < end - pos) {
      seqbuflen = end - pos;
      seq = new char[seqbuflen];
    }

    /*
     * Copy the sequence characters, skipping digits, white space and
     * the remainder of any line from a ';' onward.
     */
    for( ; pos < end; pos++) {
      if (inbuffer[pos] == ';') {
        while (pos < end && inbuffer[pos] != '\n') pos++;
      }
      else if (!isDigit(inbuffer[pos]) && !isSpace(inbuffer[pos]))
        seq[seqlen++] = inbuffer[pos];
    }

    nextentry = entryend = pos;
    return true;
  }

  /*
   * parseIG
   *
   * Parse an IG/Stanford entry: leading ';' comment lines, a one-line
   * description, and a sequence terminated by a '1' or '2'.
   *
   * Returns false if nextentry is already at or past inbuflen, or (after
   * reporting an error) if the entry does not begin with ';'.  Otherwise
   * nextentry/entryend are set to the next line that begins with ';'
   * (or inbuflen) and true is returned.
   */
  private boolean parseIG()
  {
    int pos, start, end;

    if (nextentry >= inbuflen)
      return false;

    initParse();
    entrystart = nextentry;

    /*
     * Check to make sure the entry is in the correct format.
     */
    if (inbuffer[entrystart] != ';') {
      error_message("Entry " + seqcount + " does not begin with a ';' " +
                    "header line.");
      return false;
    }
    pos = entrystart;

    /*
     * Process any comment lines.  The text after the leading ';' of each
     * line is appended to the comment.
     */
    while (pos < inbuflen && inbuffer[pos] == ';') {
      for (start=pos+1; pos < inbuflen && inbuffer[pos] != '\n'; pos++) ;
      comment += instring.substring(start, pos) + "\n";
      pos++;
    }

    /*
     * Process the description line.
     */
    for (start=pos; pos < inbuflen && inbuffer[pos] != '\n'; pos++) ;
    if (start < pos)
      getOneline(start, pos);
    pos++;

    /*
     * Process the sequence.  It must end with a '1' or '2'.
     * A '2' terminator marks the sequence as circular.
     */
    seqstart = pos;
    for (end=pos; end < inbuflen; end++)
      if (inbuffer[end] == '1' || inbuffer[end] == '2')
        break;
    getSequence(pos, end, false);
    iscircular = (end < inbuflen && inbuffer[end] == '2');

    /*
     * Advance to the beginning of the next entry.
     */
    while (pos < inbuflen && (inbuffer[pos-1] != '\n' || inbuffer[pos] != ';'))
      pos++;

    nextentry = entryend = pos;
    return true;
  }

  /*
   * parseGCG
   *
   * Parse a GCG entry.  Only one entry is read from the input: the
   * method returns false once seqcount is greater than zero.
   */
  private boolean parseGCG()
  {
    int pos;

    if (seqcount > 0)
      return false;

    initParse();
    entrystart = 0;

    /*
     * Check to make sure the entry is in the correct format.
*/ if ((pos = instring.indexOf("..\n")) == -1) { error_message("Invalid format for a GCG entry."); return false; } seqstart = pos + 3; while (pos >= 0 && inbuffer[pos] != '\n') pos--; pos++; if (entrystart < pos) getComment(entrystart, pos, 0); /* Parse GCG line and then the sequence. */ return true; } /* * * * * The utility functions involved with the output generators. * * * */ /* * initPrint * * Set up an internal buffer to buffer the output produced by the * output functions (which is sent to the output textarea). */ private char outbuffer[] = new char[2048]; private int outbuflen = 0, outbufsize = 2048; private void initPrint() { outbuflen = 0; } private void print(String s) { if (outbuflen + s.length() <= outbufsize) { s.getChars(0, s.length(), outbuffer, outbuflen); outbuflen += s.length(); } else { if (outbuflen > 0) output_text.appendText(String.copyValueOf(outbuffer, 0, outbuflen)); if (s.length() <= 1800) { s.getChars(0, s.length(), outbuffer, 0); outbuflen = s.length(); } else { output_text.appendText(s); outbuflen = 0; } } } private void print(char buf[], int len) { print(buf, 0, len); } private void print(char buf[], int start, int end) { int len = end - start; if (outbuflen + len <= outbufsize) { System.arraycopy(buf, start, outbuffer, outbuflen, len); outbuflen += len; } else { if (outbuflen > 0) output_text.appendText(String.copyValueOf(outbuffer, 0, outbuflen)); if (len <= 1800) { System.arraycopy(buf, start, outbuffer, 0, len); outbuflen = len; } else { output_text.appendText(String.copyValueOf(buf, start, len)); outbuflen = 0; } } } private void print(int num, String s) { int i; boolean flag; /* * A negative length puts the string on the left (right padded), * a positive length puts the string on the right (left pads). 
*/ flag = (num < 0); if (flag) num = -num; if (s.length() <= num) { if (!flag) for (i=s.length(); i < num; i++) print(" "); print(s); if (flag) for (i=s.length(); i < num; i++) print(" "); } else { print(s.substring(1, num)); } } private void endPrint() { if (error_flag) outbuflen = 0; else if (outbuflen > 0) { output_text.appendText(String.copyValueOf(outbuffer, 0, outbuflen)); outbuflen = 0; } } private void putline(String s, String line1, String line2, int maxlen) { int len, pos; boolean firstflag; String line, token; StringTokenizer st1; print(line1); len = line1.length(); st1 = new StringTokenizer(s, "\n", true); while (st1.hasMoreTokens()) { if (len == 0) { print(line2); len = line2.length(); } line = st1.nextToken(); if (line.equals("\n")) { print(line); len = 0; } else { while (maxlen > 0 && len + line.length() > maxlen) { if ((pos = line.lastIndexOf(' ', maxlen - len)) == -1 && (pos = line.indexOf(' ', maxlen - len)) == -1) { print(line); line = ""; } else { print(line.substring(0, pos)); print("\n" + line2); len = line2.length(); if (pos + 1 < line.length()) line = line.substring(pos + 1); else line = ""; } } print(line); len += line.length(); } } if (len > 0) print("\n"); } private void put_oneline(int format) { int i, truelen; boolean flag; StringTokenizer st; if (format == OUT_NBRF) { if (acclist.length() > 0) { st = new StringTokenizer(acclist, ","); print("~" + st.nextToken() + " "); } } else if (mainid.length() > 0) { print(mainid); if (acclist.length() > 0) { st = new StringTokenizer(acclist, ","); print("|acc:" + st.nextToken()); } print(" "); } if (description.length() > 0) { print(description); if (organism.length() > 0) { print(" - "); print(organism); } for (i=0,truelen=0; i < seqlen; i++) if (isAlpha(seq[i])) truelen++; print(", " + truelen); if (alphabet == DNA || alphabet == RNA) print(" bp"); else if (alphabet == PROTEIN) print(" aa"); else print(" ch"); if (isfragment || iscircular || alphabet == RNA) { print(" ("); flag = false; if 
(iscircular) {
          print("circular");
          flag = true;
        }
        /* `flag' records whether a blank separator is needed. */
        if (alphabet == RNA) {
          print((flag ? " " : "") + "RNA");
          flag = true;
        }
        if (isfragment)
          print((flag ? " " : "") + "fragment");
        print(")");
      }
      print(".");
    }
    print("\n");
  }

  /*
   * outputRaw
   *
   * Output the Raw sequence.  The sequence is appended straight to the
   * output textarea (not through the print buffer); an error is
   * reported if there is no sequence.
   */
  private void outputRaw()
  {
    if (seqlen > 0)
      output_text.appendText(String.copyValueOf(seq, 0, seqlen));
    else
      error_message("No sequence to output.");
  }

  /*
   * outputPlain
   *
   * Output the sequence 60 characters to a line.  An error is reported
   * if there is no sequence.
   */
  private void outputPlain()
  {
    int len;
    char line[] = new char[80];

    if (seqlen == 0)
      error_message("No sequence to output.");
    else {
      len = 0;
      /* k counts the characters on the current line; len indexes line[]. */
      for (int i=0,k=0; i < seqlen; i++,k++) {
        if (k == 60) {
          line[len++] = '\n';
          print(line, len);
          k = len = 0;
        }
        line[len++] = seq[i];
      }
      line[len++] = '\n';
      print(line, len);
    }
  }

  /*
   * outputGenBank
   *
   * Output a GenBank flat-file entry.  This method prints the header
   * lines (LOCUS through ORIGIN) from the current entry's fields.
   *
   * NOTE(review): the padding inside the literals below is a single
   * blank each; confirm against the upstream file that runs of spaces
   * were not collapsed, since the continuation prefixes are counted as
   * 12 columns in the ACCESSION wrapping logic.
   */
  private void outputGenBankHeader()
  {
    int length;
    boolean flag;
    String line;

    print("LOCUS ");

    /*
     * Print the main identifier, left-justified in 10 columns.  A "gb:"
     * prefix is removed; any other identifier longer than 10 characters
     * is printed from just after its first ':'.
     */
    if (mainid.length() == 0)
      print(-10, "Unknown");
    else if (mainid.startsWith("gb:"))
      print(-10, mainid.substring(3));
    else if (mainid.length() <= 10)
      print(-10, mainid);
    else
      print(-10, mainid.substring(mainid.indexOf(":") + 1));

    /*
     * Print the rest of the LOCUS line.
     */
    print(7, "" + seqlen);
    if (alphabet == PROTEIN)
      print(" aa PRT ");
    else if (alphabet == DNA)
      print(" bp DNA ");
    else if (alphabet == RNA)
      print(" bp RNA ");
    else
      print(" ch ");

    if (iscircular)
      print(" circular ");
    else
      print(" ");

    print("UNC ");
    if (date.length() > 0)
      print(-11, date);
    else
      print("01-JAN-0000");
    print("\n");

    /*
     * Print the definition line, if it exists.  A period is appended
     * unless the description already ends with one.
     */
    if (description.length() > 0) {
      line = (description.endsWith(".") ? description : description + ".");
      putline(line, "DEFINITION ", " ", 80);
    }

    /*
     * Print the accession numbers, separated by blanks and wrapped onto
     * a continuation line when the next one would pass column 80.
     */
    if (acclist.length() > 0) {
      StringTokenizer st = new StringTokenizer(acclist, ",");
      String token;

      token = st.nextToken();
      print("ACCESSION " + token);
      length = 12 + token.length();
      while (st.hasMoreTokens()) {
        token = st.nextToken();
        if (length + token.length() + 1 > 80) {
          print("\n " + token);
          length = 12 + token.length();
        }
        else {
          print(" " + token);
          length += 1 + token.length();
        }
      }
      print("\n");
    }

    if (nid.length() > 0)
      print("NID " + nid + "\n");

    if (pid.length() > 0)
      print("PID " + pid + "\n");

    if (organism.length() > 0) {
      print("SOURCE .\n");
      putline(organism, " ORGANISM ", " ", 80);
    }

    /* `flag' is set here but not read again in this method. */
    flag = false;
    if (comment.length() > 0) {
      putline(comment, "COMMENT ", " ", 80);
      flag = true;
    }

    /*
     * Print the BASE COUNT line.  For RNA with a non-zero u count the
     * fourth column is "u"; otherwise it is "t".
     */
    if (alphabet == DNA || alphabet == RNA) {
      print("BASE COUNT ");
      print(7, "" + acount);
      print(" a");
      print(7, "" + ccount);
      print(" c");
      print(7, "" + gcount);
      print(" g");
      if (alphabet == RNA && ucount > 0) {
        print(7, "" + ucount);
        print(" u");
      }
      else {
        print(7, "" + tcount);
        print(" t");
      }
      if (ocount > 0) {
        print(7, "" + ocount);
        print(" others");
      }
      print("\n");
    }
    print("ORIGIN \n");
  }

  /*
   * outputGenBankSequence
   *
   * Output the sequence lines for a GenBank flat-file entry.
   * Each line starts with the 1-based position of its first character
   * and holds up to six blank-separated groups of ten characters.  The
   * entry is closed with a "//" line.
   */
  private void outputGenBankSequence()
  {
    int i, j, k, l, count;
    char line[] = new char[80];

    for (i=0,count=1; i < seqlen; count+=60) {
      print(" ");
      print(6, "" + count);
      for (j=0,l=0; i < seqlen && j < 6; j++) {
        line[l++] = ' ';
        for (k=0; i < seqlen && k < 10; k++)
          line[l++] = seq[i++];
      }
      line[l++] = '\n';
      print(line, l);
    }
    print("//\n");
  }

  /*
   * outputEMBLHeader
   *
   * Output the header lines of an EMBL flat-file entry (ID through SQ),
   * with an "XX" separator line after each section.
   */
  private void outputEMBLHeader()
  {
    int length;
    String line;

    /*
     * Print the ID line.  "embl:" and "epd:" prefixes are removed from
     * the identifier; other identifiers are printed unpadded.
     */
    print("ID ");
    if (mainid.length() == 0)
      print(-10, "Unknown");
    else if (mainid.startsWith("embl:"))
      print(-10, mainid.substring(5));
    else if (mainid.startsWith("epd:"))
      print(-10, mainid.substring(4));
    else
      print(mainid);

    print(" converted; ");
    if (iscircular)
      print("circular ");

    if (alphabet == DNA)
      print("DNA; ");
    else if (alphabet == RNA)
      print("RNA; ");
    else if (alphabet == PROTEIN)
      print("PRT; ");
    else
      print("UNK; ");

    if (mainid.length() > 0 && mainid.startsWith("epd:"))
      print("EPD; ");
    else
      print("UNC; ");

    if (alphabet == DNA || alphabet == RNA)
      print("" + seqlen + " BP.\n");
    else if (alphabet == PROTEIN)
      print("" + seqlen + " AA.\n");
    else
      print("" + seqlen + " CH.\n");
    print("XX\n");

    /*
     * Print the accession numbers, each followed by ';', starting a new
     * AC line when the next one would pass column 80.
     */
    if (acclist.length() > 0) {
      StringTokenizer st = new StringTokenizer(acclist, ",");
      String token;

      token = st.nextToken();
      print("AC " + token + ";");
      length = 5 + token.length() + 1;
      while (st.hasMoreTokens()) {
        token = st.nextToken();
        if (length + token.length() + 2 > 80) {
          print("\nAC " + token + ";");
          length = 5 + token.length() + 1;
        }
        else {
          print(" " + token + ";");
          length += 2 + token.length();
        }
      }
      print("\nXX\n");
    }

    if (nid.length() > 0)
      print("NI " + nid + "\nXX\n");

    if (pid.length() > 0)
      print("PI " + pid + "\nXX\n");

    if (date.length() > 0)
      print("DT " + date + "\nXX\n");

    if (description.length() > 0) {
      line = description;
      if (isfragment)
        line += " (fragment)";
      putline(line, "DE ", "DE ", 80);
      print("XX\n");
    }

    if (organism.length() > 0) {
      putline(organism, "OS ", "OS ", 80);
      print("XX\n");
    }

    if (comment.length() > 0) {
      putline(comment, "CC ", "CC ", 80);
      print("XX\n");
    }
print("SQ Sequence " + seqlen); if (alphabet == DNA || alphabet == RNA) { print(" BP; " + acount + " A; " + ccount + " C; " + gcount + " G; "); if (tcount > 0) print(tcount + " T; "); else print(ucount + " U; "); print(ocount + " other;\n"); } else if (alphabet == PROTEIN) print(" AA;\n"); else print(" CH;\n"); } private void outputEMBLSequence() { int i, j, k, l, count; char line[] = new char[81]; for (i=0,count=60; i < seqlen; count+=60) { print(" "); for (j=0,l=0; j < 6; j++) { line[l++] = ' '; for (k=0; k < 10; k++) line[l++] = (i < seqlen ? seq[i++] : ' '); } print(line, l); print(10, "" + (i < seqlen ? count : seqlen)); print("\n"); } print("//\n"); } private void outputSProtHeader() { int length; String line; print("ID "); if (mainid.length() == 0) print(-10, "Unknown"); else if (mainid.startsWith("sp:")) print(-10, mainid.substring(3)); else print(mainid); print(" CONVERTED; "); if (iscircular) print("circular "); else print(" "); if (alphabet == DNA) print("DNA; "); else if (alphabet == RNA) print("RNA; "); else if (alphabet == PROTEIN) print("PRT; "); else print("UNK; "); print(5, "" + seqlen); if (alphabet == DNA || alphabet == RNA) print(" BP.\n"); else if (alphabet == PROTEIN) print(" AA.\n"); else print(" CH.\n"); if (acclist.length() > 0) { StringTokenizer st = new StringTokenizer(acclist, ","); String token; token = st.nextToken(); print("AC " + token + ";"); length = 5 + token.length() + 1; while (st.hasMoreTokens()) { token = st.nextToken(); if (length + token.length() + 2 > 80) { print("\nAC " + token + ";"); length = 5 + token.length() + 1; } else { print(" " + token + ";"); length += 2 + token.length(); } } print("\n"); } if (nid.length() > 0) print("NI " + nid + "\n"); if (pid.length() > 0) print("PI " + pid + "\n"); if (date.length() > 0) print("DT " + date + "\n"); if (description.length() > 0) { line = description; if (isfragment) line += " (FRAGMENT)"; if (!line.endsWith(".")) line += "."; putline(line, "DE ", "DE ", 80); } if 
(organism.length() > 0) { line = organism + "."; putline(organism, "OS ", "OS ", 80); } if (comment.length() > 0) putline(comment, "CC ", "CC ", 80); print("SQ SEQUENCE " + seqlen); if (alphabet == DNA || alphabet == RNA) print(" BP;\n"); else if (alphabet == PROTEIN) print(" AA;\n"); else print(" CH;\n"); } private void outputSProtSequence() { int i, j, k, l, count; char line[] = new char[80]; for (i=0,count=60; i < seqlen; count+=60) { print(" "); for (j=0,l=0; j < 6; j++) { line[l++] = ' '; for (k=0; i < seqlen && k < 10; k++) line[l++] = seq[i++]; } line[l++] = '\n'; print(line, l); } print("//\n"); } private void outputPIRHeader() { int length; String line; print("ENTRY "); if (mainid.length() == 0) print("UNKNWN"); else if (mainid.startsWith("pir:")) print(mainid.substring(4)); else print(mainid); if (isfragment) print(" #type fragment\n"); else print(" #type complete\n"); if (description.length() > 0) { line = description; if (organism.length() > 0) line += " - " + organism; if (isfragment) line += " (fragment)"; putline(line, "TITLE ", " ", 80); } if (organism.length() > 0) putline(organism, "ORGANISM #formal_name ", " ", 80); if (date.length() > 0) print("DATE " + date + "\n"); if (acclist.length() > 0) { StringTokenizer st = new StringTokenizer(acclist, ","); String token; token = st.nextToken(); print("ACCESSIONS " + token); length = 17 + token.length(); while (st.hasMoreTokens()) { token = st.nextToken(); if (length + token.length() + 2 > 80) { print(";\n " + token); length = 17 + token.length(); } else { print("; " + token); length += 2 + token.length(); } } print("\n"); } if (comment.length() > 0) putline(comment, "COMMENT ", " ", 80); print("SUMMARY #length " + seqlen + "\n"); print("SEQUENCE\n"); } private void outputPIRSequence() { int i, j, k, l, count; char line[] = new char[80]; print(" 5 10 15" + " 20 25 30\n"); for (i=0,count=1; i < seqlen; count+=30) { print(7, "" + count); for (j=0,l=0; i < seqlen && j < 30; j++) { line[l++] = ' '; line[l++] 
= seq[i++]; } line[l++] = '\n'; print(line, l); } print("///\n"); } private void outputFASTAHeader() { print(">"); put_oneline(OUT_FASTA); if (comment.length() > 0) putline(comment, ";", ";", 0); } private void outputFASTAoldHeader() { print(">"); put_oneline(OUT_FASTA); } private void outputFASTASequence() { int len; char line[] = new char[80]; len = 0; for (int i=0,k=0; i < seqlen; i++,k++) { if (k == 60) { line[len++] = '\n'; print(line, len); k = len = 0; } line[len++] = seq[i]; } line[len++] = '\n'; print(line, len); } private void outputNBRFHeader() { print(">"); if (alphabet == DNA) print(iscircular ? "DC" : "DL"); else if (alphabet == RNA) print(iscircular ? "RC" : "RL"); else if (alphabet == PROTEIN) print(isfragment ? "F1" : "P1"); else print("XX"); print(";"); if (mainid.length() > 0) print(mainid); else print("Unknown"); print("\n");