/*
 * Copyright (C) Lennart Martens
 *
 * Contact: lennart.martens AT UGent.be (' AT ' to be replaced with '@')
 */
/*
 * Created by IntelliJ IDEA.
 * User: Lennart
 * Date: 7-okt-02
 * Time: 13:43:28
 */
package com.compomics.util.protein;

import com.compomics.util.experiment.identification.protein_sequences.SequenceFactory;
import java.io.Serializable;
import org.apache.log4j.Logger;
import java.util.StringTokenizer;

/**
 * This class represents the header for a Protein instance. It is meant to work
 * closely with FASTA format notation. The Header class knows how to handle
 * certain often-used headers such as SwissProt and NCBI formatted FASTA
 * headers.<br> Note that the Header class is its own factory, and should be
 * used as such.
 *
 * @author Lennart Martens
 * @author Harald Barsnes
 * @author Marc Vaudel
 */
public class Header implements Cloneable, Serializable {

    /**
     * The version UID for Serialization/Deserialization compatibility.
     */
    static final long serialVersionUID = 7665784733371863163L;
    /**
     * Class specific log4j logger for Header instances.
     */
    static Logger logger = Logger.getLogger(Header.class);

    /**
     * Private constructor to force use of factory methods.
     */
    private Header() {
    }
    /**
     * The ID String corresponds to the String that is present as the first
     * element following the opening '>'. It is most notably 'sw' for
     * SwissProt, and 'gi' for NCBI. <br> ID is the first element in the
     * abbreviated header String.
     */
    private String iID = null;
    /**
     * The foreign ID is the ID of another database this entry is originally
     * from. Most notably used for SwissProt entries in NCBI. <br> The foreign
     * ID String is an addendum to the accession String in the abbreviated
     * header String.
     */
    private String iForeignID = null;
    /**
     * The accession String is the unique identifier for the sequence in the
     * respective database. Note that for NCBI, the accession number also
     * defines a unique moment in time.
<br> Accession String is the second
     * element in the abbreviated header String.
     */
    private String iAccession = null;
    /**
     * The database type extracted from the header. As there are no standard
     * database names, this is only an internally consistent naming scheme
     * included to be able to later separate the databases, for example when
     * linking to the online version of the database. The links themselves are
     * not included as these might change outside the control of the
     * compomics-utilities library. The type starts out as
     * {@code DatabaseType.Unknown} and is set to the recognized type while the
     * header is parsed.
     */
    private DatabaseType databaseType = DatabaseType.Unknown;
    /**
     * A list of the database types. As there are no standard database names,
     * this is only an internally consistent naming scheme included to be able
     * to later separate the databases. For example when linking to the online
     * version of the database. The links themselves are not included as these
     * might change outside the control of the compomics-utilities library.
*/ public enum DatabaseType { UniProt("UniProtKB", "14681372"), EnsemblGenomes("Ensembl Genomes", "26578574"), SGD("Saccharomyces Genome Database (SGD)", "9399804"), Arabidopsis_thaliana_TAIR("The Arabidopsis Information Resource (TAIR)", "12519987"), PSB_Arabidopsis_thaliana("PSB Arabidopsis thaliana", null), Drosophile("Drosophile", null), Flybase("Flybase", null), NCBI("NCBI Reference Sequences (RefSeq)", "22121212"), M_Tuberculosis("TBDatabase (TBDB)", "18835847"), H_Invitation("H_Invitation", null), Halobacterium("Halobacterium", null), H_Influenza("H_Influenza", null), C_Trachomatis("C_Trachomatis", null), GenomeTranslation("Genome Translation", null), Listeria("Listeria", null), GAFFA("GAFFA", null), UPS("Universal Proteomic Standard (UPS)", null), Generic_Header(null, null), IPI("International Protein Index (IPI)", "15221759"), Generic_Split_Header(null, null), NextProt("neXtProt", "22139911"), UniRef("UniRef", null), Unknown(null, null); // @TODO: add support for Ensembl headers? /** * The full name of the database. */ String fullName; /** * The PubMed id of the database. */ String pmid; /** * Constructor. * * @param fullName the full name * @param pmid the PubMed ID. */ private DatabaseType(String fullName, String pmid) { this.fullName = fullName; this.pmid = pmid; } /** * Returns the full name of the database, null if not set. * * @return the full name of the database */ public String getFullName() { return fullName; } /** * Returns the PubMed id of the database, null if not set. * * @return the PubMed id of the database */ public String getPmid() { return pmid; } } /** * The foreign accession String is an accession String in another database * of significance. Most notably used for SwissProt accessions that are kept * in the NCBI database. <br> The foreign accession String is an addendum to * the foreign ID String in the abbreviated header String. 
*/
    private String iForeignAccession = null;
    /**
     * The description is a more or less elaborate description of the protein in
     * question. <br> The description is the third (and final) element in the
     * abbreviated header String.
     */
    private String iDescription = null;
    /**
     * A short protein description, removing all but the protein description
     * itself. For example: "GRP78_HUMAN 78 kDa glucose-regulated protein
     * OS=Homo sapiens GN=HSPA5 PE=1 SV=2" becomes "78 kDa glucose-regulated
     * protein".
     */
    private String iDescriptionShort = null;
    /**
     * Protein name, the protein name extracted from the protein description.
     * For example: "GRP78_HUMAN 78 kDa glucose-regulated protein OS=Homo
     * sapiens GN=HSPA5 PE=1 SV=2" returns "GRP78_HUMAN".
     */
    private String iDescriptionProteinName = null;
    /**
     * The name of the gene the protein comes from. Note that this is only
     * available for UniProt and NextProt based databases.
     */
    private String iGeneName = null;
    /**
     * The protein evidence for the protein. Note that this is only available
     * for UniProt-based databases.
     */
    private String iProteinEvidence = null;
    /**
     * The name of the taxonomy the protein comes from. Note that this is only
     * available for UniProt-based databases.
     */
    private String iTaxonomy = null;
    /**
     * The foreign Description is a description for an entry in another DB. Most
     * notably, the SwissProt short description for an entry that is found
     * within NCBI. <br> The foreign description is an addendum to the foreign
     * accession String in the abbreviated header String.
     */
    private String iForeignDescription = null;
    /**
     * This variable holds all unidentified parts for the Header. If the String
     * was not (recognized as) a standard SwissProt or NCBI header, this
     * variable holds the entire header.
     */
    private String iRest = null;
    /**
     * This variable holds the raw complete unformatted header. Leading and
     * trailing white space is removed, as parseFromFASTA trims the header line
     * before storing it here; a leading '>' (if present) is kept, because the
     * value is stored before that character is stripped.
     */
    private String iRawHeader = null;
    /**
     * This StringBuffer holds all the addenda for this header.
*/ private StringBuffer iAddenda = null; /** * This variable holds a possible start index for the associated sequence. */ private int iStart = -1; /** * This variable holds a possible end index for the associated sequence. */ private int iEnd = -1; /** * Factory method that constructs a Header instance based on a FASTA header * line. * * @param aFASTAHeader the String with the original FASTA header line. * @return Header with the Header instance representing the given header. * The object returned will have been parsed correctly if it is a standard * SwissProt or NCBI formatted header, and will be plain in all other cases. * @throws StringIndexOutOfBoundsException thrown if issues occur during the * parsing */ public static Header parseFromFASTA(String aFASTAHeader) throws StringIndexOutOfBoundsException { Header result = null; if (aFASTAHeader == null) { // Do nothing, just return 'null'. } else if (aFASTAHeader.trim().equals("")) { result = new Header(); result.iRest = ""; result.iRawHeader = ""; } else { result = new Header(); // remove leading and trailing white space aFASTAHeader = aFASTAHeader.trim(); // save the raw unformatted header result.iRawHeader = aFASTAHeader; // remove leading '>', if present if (aFASTAHeader.startsWith(">")) { aFASTAHeader = aFASTAHeader.substring(1); } // Now check for the possible presence of addenda in the header. // First check the description for addenda, and if that should fail, give 'Rest' a chance. int liPos; if ((liPos = aFASTAHeader.indexOf("^A")) >= 0) { result.iAddenda = new StringBuffer(aFASTAHeader.substring(liPos)); aFASTAHeader = aFASTAHeader.substring(0, liPos); } try { // First determine what kind of Header we've got. if (aFASTAHeader.startsWith("sw|") || aFASTAHeader.startsWith("SW|")) { // SwissProt. // We need to find three elements: // - the ID (sw, we already know that one). // - the accession String (easily retrieved as the next String). 
// - the description (composed of the short description and the longer, // verbose description) StringTokenizer lSt = new StringTokenizer(aFASTAHeader, "|"); // There should be at least three tokens. if (lSt.countTokens() < 3) { throw new IllegalArgumentException("Non-standard or false SwissProt header passed. " + "Expecting something like: '>sw|Pxxxx|ACTB_HUMAN xxxx xxx xxxx ...', received '" + aFASTAHeader + "'."); } else { result.databaseType = DatabaseType.UniProt; result.iID = lSt.nextToken(); result.iAccession = lSt.nextToken(); // Check for the presence of a location. int index; if ((index = result.iAccession.indexOf(" (")) > 0) { String temp = result.iAccession.substring(index); result.iAccession = result.iAccession.substring(0, index); int open = 2; int minus = temp.indexOf("-"); int end = temp.indexOf(")"); result.iStart = Integer.parseInt(temp.substring(open, minus)); result.iEnd = Integer.parseInt(temp.substring(minus + 1, end)); } // get the description result.iDescription = lSt.nextToken(); // try to get the gene name and taxonomy from the description parseUniProtDescription(result); // If there are any more elements, add them to the 'rest' section. if (lSt.hasMoreTokens()) { StringBuilder lBuffer = new StringBuilder(); while (lSt.hasMoreTokens()) { lBuffer.append(lSt.nextToken()); } result.iRest = lBuffer.toString(); } } } else if (aFASTAHeader.startsWith("gi|") || aFASTAHeader.startsWith("GI|")) { // NCBI. // We need to check for a number of things here: // - first of all, we should get the ID (which we already have, 'gi') // - second is the NCBI accession String // - third we need to check for a foreign ID and accession // - If there is a foreign accession, there could also be a description // associated. Get that one too. // - finally, get the full NCBI description. StringTokenizer lSt = new StringTokenizer(aFASTAHeader, "|"); // We expect to see either two or at least four or more tokens. 
int tokenCount = lSt.countTokens(); if (tokenCount == 3) { result.databaseType = DatabaseType.NCBI; result.iID = lSt.nextToken(); result.iAccession = lSt.nextToken(); // Check for the presence of a location. int index; if ((index = result.iAccession.indexOf(" (")) > 0) { String temp = result.iAccession.substring(index); result.iAccession = result.iAccession.substring(0, index); int open = 2; int minus = temp.indexOf("-"); int end = temp.indexOf(")"); result.iStart = Integer.parseInt(temp.substring(open, minus)); result.iEnd = Integer.parseInt(temp.substring(minus + 1, end)); } result.iDescription = lSt.nextToken().trim(); } else if (tokenCount < 4) { throw new IllegalArgumentException("Non-standard or false NCBInr header passed. " + "Expecting something like: '>gi|xxxxx|xx|xxxxx|(x) xxxx xxx xxxx ...', received '" + aFASTAHeader + "'."); } else { result.databaseType = DatabaseType.NCBI; result.iID = lSt.nextToken(); result.iAccession = lSt.nextToken(); // Check for the presence of a location. int index; if ((index = result.iAccession.indexOf(" (")) > 0) { String temp = result.iAccession.substring(index); result.iAccession = result.iAccession.substring(0, index); int open = 2; int minus = temp.indexOf("-"); int end = temp.indexOf(")"); result.iStart = Integer.parseInt(temp.substring(open, minus)); result.iEnd = Integer.parseInt(temp.substring(minus + 1, end)); } result.iForeignID = lSt.nextToken(); // Only retrieve the foreign accession if it is specifed (meaning a token count of 5). if (tokenCount >= 5) { result.iForeignAccession = lSt.nextToken(); } StringBuilder lSB = new StringBuilder(); while (lSt.hasMoreTokens()) { lSB.append(lSt.nextToken()); } String temp = lSB.toString(); if (temp.startsWith(" ")) { // Only description present. result.iDescription = temp.substring(1); } else { // Up to the first space is foreign description. 
int location = temp.indexOf(" "); result.iForeignDescription = temp.substring(0, location); result.iDescription = temp.substring(location + 1); } } } else if (aFASTAHeader.startsWith("IPI:") || aFASTAHeader.startsWith("ipi:") || aFASTAHeader.startsWith("IPI|") || aFASTAHeader.startsWith("ipi|")) { // An IPI header looks like: // >IPI:IPIxxxxxx.y|REFSEQ_XP:XP_aaaaa[|many more like this can be present] Tax_Id=9606 descr result.databaseType = DatabaseType.IPI; result.iID = "IPI"; result.iAccession = aFASTAHeader.substring(4, aFASTAHeader.indexOf("|", 4)); // Check for the presence of a location. int index; if ((index = result.iAccession.indexOf(" (")) > 0) { String temp = result.iAccession.substring(index); result.iAccession = result.iAccession.substring(0, index); int open = 2; int minus = temp.indexOf("-"); int end = temp.indexOf(")"); result.iStart = Integer.parseInt(temp.substring(open, minus)); result.iEnd = Integer.parseInt(temp.substring(minus + 1, end)); } // Take everything from the first '|' we meet after the accession number. result.iDescription = aFASTAHeader.substring(aFASTAHeader.indexOf("|", 5) + 1); } else if (aFASTAHeader.startsWith("HIT")) { try { //http://www.h-invitational.jp/ // A H-Invitation database entry looks like: // >HIT000000001.10|HIX0021591.10|AB002292.2|NO|NO|HC|cds 185..4219|DH domain containing protein. result.databaseType = DatabaseType.H_Invitation; result.iID = ""; result.iAccession = aFASTAHeader.substring(0, aFASTAHeader.indexOf("|")); // Check for the presence of a location. int index; if ((index = result.iAccession.indexOf(" (")) > 0) { String temp = result.iAccession.substring(index); result.iAccession = result.iAccession.substring(0, index); int open = 2; int minus = temp.indexOf("-"); int end = temp.indexOf(")"); result.iStart = Integer.parseInt(temp.substring(open, minus)); result.iEnd = Integer.parseInt(temp.substring(minus + 1, end)); } // Take everything from the first '|' we meet after the accession number. 
result.iDescription = aFASTAHeader.substring(aFASTAHeader.indexOf("|") + 1); } catch (Exception excep) { logger.error(excep.getMessage(), excep); logger.info(aFASTAHeader); } } else if (aFASTAHeader.startsWith("OE")) { // Halobacterium header from the Max Planck people. // We need to find two elements: // - the accession String (easily retrieved as the next String until a space is encountered). // - the description int accessionEndLoc = aFASTAHeader.indexOf(" "); if (accessionEndLoc < 0 || aFASTAHeader.length() < (accessionEndLoc + 4)) { throw new IllegalArgumentException("Non-standard Halobacterium (Max Planck) header passed. " + "Expecting something like '>OExyz (OExyz) xxx xxx xxx', but was '" + aFASTAHeader + "'!"); } // Now we have to see if there is location information present. // This is a bit tricky here, because the accession number itself is repeated between '()' after the space. if (aFASTAHeader.charAt(accessionEndLoc + 1) == '(' && Character.isDigit(aFASTAHeader.charAt(accessionEndLoc + 2))) { // start and end found. Add it to the accession number and remove it from the description. accessionEndLoc = aFASTAHeader.indexOf(")", accessionEndLoc) + 1; } result.databaseType = DatabaseType.Halobacterium; result.iID = ""; result.iAccession = aFASTAHeader.substring(0, accessionEndLoc).trim(); // Check for the presence of a location. int index; if ((index = result.iAccession.indexOf(" (")) > 0) { String temp = result.iAccession.substring(index); result.iAccession = result.iAccession.substring(0, index); int open = 2; int minus = temp.indexOf("-"); int end = temp.indexOf(")"); result.iStart = Integer.parseInt(temp.substring(open, minus)); result.iEnd = Integer.parseInt(temp.substring(minus + 1, end)); } result.iDescription = aFASTAHeader.substring(accessionEndLoc).trim(); } else if (aFASTAHeader.startsWith("hflu_")) { // H Influenza header from Novartis. 
// We need to find two elements: // - the accession String (easily retrieved as the next String until a space is encountered). // - the description int accessionEndLoc = aFASTAHeader.indexOf(" "); if (accessionEndLoc < 0) { throw new IllegalArgumentException("Non-standard H Influenza (Novartis) header passed. " + "Expecting something like '>hflu_lsi_xxxx xxx xxx xxx', but was '" + aFASTAHeader + "'!"); } // Now we have to see if there is location information present. if (aFASTAHeader.charAt(accessionEndLoc + 1) == '(' && Character.isDigit(aFASTAHeader.charAt(accessionEndLoc + 2))) { // start and end found. Add it to the accession number and remove it from the description. accessionEndLoc = aFASTAHeader.indexOf(")", accessionEndLoc) + 1; } result.databaseType = DatabaseType.H_Influenza; result.iID = ""; result.iAccession = aFASTAHeader.substring(0, accessionEndLoc).trim(); // Check for the presence of a location. int index; if ((index = result.iAccession.indexOf(" (")) > 0) { String temp = result.iAccession.substring(index); result.iAccession = result.iAccession.substring(0, index); int open = 2; int minus = temp.indexOf("-"); int end = temp.indexOf(")"); result.iStart = Integer.parseInt(temp.substring(open, minus)); result.iEnd = Integer.parseInt(temp.substring(minus + 1, end)); } result.iDescription = aFASTAHeader.substring(accessionEndLoc).trim(); } else if (aFASTAHeader.startsWith("C.tr_") || aFASTAHeader.startsWith("C_trachomatis_")) { // C. Trachomatis header. // We need to find two elements: // - the accession String (retrieved as the actual accession String which lasts up to the first space). // - the description (everything after the first space). int accessionEndLoc = aFASTAHeader.indexOf(" "); if (accessionEndLoc < 0) { throw new IllegalArgumentException("Non-standard C trachomatis header passed. 
" + "Expecting something like '>C_tr_Lx_x [xxx - xxx] | xxx xxx ', but was '" + aFASTAHeader + "'!"); } // Now we have to see if there is location information present. if (aFASTAHeader.charAt(accessionEndLoc + 1) == '(' && Character.isDigit(aFASTAHeader.charAt(accessionEndLoc + 2))) { // start and end found. Add it to the accession number and remove it from the description. accessionEndLoc = aFASTAHeader.indexOf(")", accessionEndLoc) + 1; } result.databaseType = DatabaseType.C_Trachomatis; result.iID = ""; result.iAccession = aFASTAHeader.substring(0, accessionEndLoc).trim(); // Check for the presence of a location. int index; if ((index = result.iAccession.indexOf(" (")) > 0) { String temp = result.iAccession.substring(index); result.iAccession = result.iAccession.substring(0, index); int open = 2; int minus = temp.indexOf("-"); int end = temp.indexOf(")"); result.iStart = Integer.parseInt(temp.substring(open, minus)); result.iEnd = Integer.parseInt(temp.substring(minus + 1, end)); } result.iDescription = aFASTAHeader.substring(accessionEndLoc).trim(); } else if (aFASTAHeader.startsWith(" M. tub.")) { // M. Tuberculosis header. // We need to find two elements: // - the accession String (retrieved as the first pipe-delimited String). // - the description (everything after the pipe that closes the accession String). int accessionStartLoc = aFASTAHeader.indexOf("|") + 1; int accessionEndLoc = aFASTAHeader.indexOf("|", accessionStartLoc); if (accessionEndLoc < 0) { throw new IllegalArgumentException("Non-standard M tuberculosis header passed. " + "Expecting something like '>M. tub.xxx|Rvxxx| xxx xxx', but was '" + aFASTAHeader + "'!"); } result.databaseType = DatabaseType.M_Tuberculosis; result.iID = aFASTAHeader.substring(0, accessionStartLoc - 1); result.iAccession = aFASTAHeader.substring(accessionStartLoc, accessionEndLoc).trim(); // Check for the presence of a location. 
int index; if ((index = result.iAccession.indexOf(" (")) > 0) { String temp = result.iAccession.substring(index); result.iAccession = result.iAccession.substring(0, index); int open = 2; int minus = temp.indexOf("-"); int end = temp.indexOf(")"); result.iStart = Integer.parseInt(temp.substring(open, minus)); result.iEnd = Integer.parseInt(temp.substring(minus + 1, end)); } result.iDescription = aFASTAHeader.substring(accessionEndLoc + 1).trim(); } else if (aFASTAHeader.matches("^CG.* pep:.*")) { // Drosophile DB. // We need to find two elements: // - the accession String (retrieved as the trimmed version of everything // up to (and NOT including) " pep:" // - the description (everything (trimmed) starting from (and including) the " pep:". int pepLoc = aFASTAHeader.indexOf(" pep:"); result.databaseType = DatabaseType.Drosophile; result.iID = ""; result.iAccession = aFASTAHeader.substring(0, pepLoc).trim(); String possibleDescriptionPrefix = ""; // See if there is "(*xE*)" information wrongly assigned to the accession number. if (result.iAccession.indexOf("(*") > 0) { possibleDescriptionPrefix = result.iAccession.substring(result.iAccession.indexOf("(*"), result.iAccession.indexOf("*)") + 2) + " "; result.iAccession = result.iAccession.substring(0, result.iAccession.indexOf("(*")); } // Check for the presence of a location. int index; if ((index = result.iAccession.indexOf(" (")) > 0) { String temp = result.iAccession.substring(index); result.iAccession = result.iAccession.substring(0, index); int open = 2; int minus = temp.indexOf("-"); int end = temp.indexOf(")"); result.iStart = Integer.parseInt(temp.substring(open, minus)); result.iEnd = Integer.parseInt(temp.substring(minus + 1, end)); } result.iDescription = possibleDescriptionPrefix + aFASTAHeader.substring(pepLoc).trim(); } else if (aFASTAHeader.matches(".*SGDID:[^\\s]+,.*")) { // OK, SGD entry. The text up to but not including the first space is deemed accession, // everything else is taken as description. 
// So we need to find two elements: // - the accession String (taking into account possible location info). // - the description int accessionEndLoc = aFASTAHeader.indexOf(" "); if (accessionEndLoc < 0) { throw new IllegalArgumentException("Non-standard SGD header passed. " + "Expecting something like '>xxxx xxx SGDID:xxxx xxx', but was '" + aFASTAHeader + "'!"); } // Now we have to see if there is location information present. if (aFASTAHeader.charAt(accessionEndLoc + 1) == '(' && Character.isDigit(aFASTAHeader.charAt(accessionEndLoc + 2))) { // start and end found. Add it to the accession number and remove it from the description. accessionEndLoc = aFASTAHeader.indexOf(")", accessionEndLoc) + 1; } result.databaseType = DatabaseType.SGD; result.iID = ""; result.iAccession = aFASTAHeader.substring(0, accessionEndLoc).trim(); // Check for the presence of a location. int index; if ((index = result.iAccession.indexOf(" (")) > 0) { String temp = result.iAccession.substring(index); result.iAccession = result.iAccession.substring(0, index); int open = 2; int minus = temp.indexOf("-"); int end = temp.indexOf(")"); result.iStart = Integer.parseInt(temp.substring(open, minus)); result.iEnd = Integer.parseInt(temp.substring(minus + 1, end)); } result.iDescription = aFASTAHeader.substring(accessionEndLoc).trim(); } else if (aFASTAHeader.startsWith("generic")) { // try to parse as a generic header with splitters // should look something like this: // >generic_some_tag|proten_accession|a description for this protein result.databaseType = DatabaseType.Generic_Split_Header; result.iID = aFASTAHeader.substring(0, aFASTAHeader.indexOf("|")); String subHeader = aFASTAHeader.substring(aFASTAHeader.indexOf("|") + 1); if (subHeader.contains("|")) { result.iAccession = subHeader.substring(0, subHeader.indexOf("|")); result.iDescription = subHeader.substring(subHeader.indexOf("|") + 1).trim(); } else { result.iAccession = subHeader; result.iDescription = ""; } } else if 
(aFASTAHeader.matches("^[^\\s]+_[^\\s]+ \\([PQOA][^\\s]+\\) .*") && aFASTAHeader.lastIndexOf("|") == -1) { // Old (everything before 9.0 release (31 Oct 2006)) standard SwissProt header as // present in the Expasy FTP FASTA file. // Is formatted something like this: // >XXX_YYYY (acc) rest int start = aFASTAHeader.indexOf(" ("); int end = aFASTAHeader.indexOf(") "); result.iAccession = aFASTAHeader.substring(start + 2, end); result.databaseType = DatabaseType.UniProt; result.iID = "sw"; // @TODO: remove hardcoding? result.iDescription = aFASTAHeader.substring(0, start) + " " + aFASTAHeader.substring(end + 2); // try to get the gene name and taxonomy //parseUniProtDescription(result); // @TOOD: not sure if the header has the right format... } else if (aFASTAHeader.matches("^sp\\|[^|]*\\|[^\\s]+_[^\\s]+ .*")) { // New (September 2008 and beyond) standard SwissProt header as // present in the Expasy FTP FASTA file. // Is formatted something like this: // >sp|accession|ID descr rest (including taxonomy, if available) String tempHeader = aFASTAHeader.substring(3); result.iAccession = tempHeader.substring(0, tempHeader.indexOf("|")).trim(); // See if there is location information. 
if (result.iAccession.matches("[^\\(]+\\([\\d]+ [\\d]\\)$")) { int openBracket = result.iAccession.indexOf("("); result.iStart = Integer.parseInt(result.iAccession.substring(openBracket, result.iAccession.indexOf(" ", openBracket)).trim()); result.iEnd = Integer.parseInt(result.iAccession.substring(result.iAccession.indexOf(" ", openBracket), result.iAccession.indexOf(")")).trim()); result.iAccession = result.iAccession.substring(0, openBracket).trim(); } else if (result.iAccession.matches("[^\\(]+\\([\\d]+-[\\d]+\\)$")) { int openBracket = result.iAccession.indexOf("("); result.iStart = Integer.parseInt(result.iAccession.substring(openBracket + 1, result.iAccession.indexOf("-", openBracket)).trim()); result.iEnd = Integer.parseInt(result.iAccession.substring(result.iAccession.indexOf("-", openBracket) + 1, result.iAccession.indexOf(")")).trim()); result.iAccession = result.iAccession.substring(0, openBracket).trim(); } result.databaseType = DatabaseType.UniProt; result.iID = "sp"; result.iDescription = tempHeader.substring(tempHeader.indexOf("|") + 1); // try to get the gene name and taxonomy parseUniProtDescription(result); } else if (aFASTAHeader.matches("^tr\\|[^|]*\\|[^\\s]+_[^\\s]+ .*")) { // New (September 2008 and beyond) standard SwissProt header as // present in the Expasy FTP FASTA file. // Is formatted something like this: // >tr|accession|ID descr rest (including taxonomy, if available) String tempHeader = aFASTAHeader.substring(3); result.iAccession = tempHeader.substring(0, tempHeader.indexOf("|")).trim(); // See if there is location information. 
if (result.iAccession.matches("[^\\(]+\\([\\d]+ [\\d]+\\)$")) { int openBracket = result.iAccession.indexOf("("); result.iStart = Integer.parseInt(result.iAccession.substring(openBracket + 1, result.iAccession.indexOf(" ", openBracket)).trim()); result.iEnd = Integer.parseInt(result.iAccession.substring(result.iAccession.indexOf(" ", openBracket), result.iAccession.indexOf(")")).trim()); result.iAccession = result.iAccession.substring(0, openBracket).trim(); } else if (result.iAccession.matches("[^\\(]+\\([\\d]+-[\\d]+\\)$")) { int openBracket = result.iAccession.indexOf("("); result.iStart = Integer.parseInt(result.iAccession.substring(openBracket + 1, result.iAccession.indexOf("-", openBracket)).trim()); result.iEnd = Integer.parseInt(result.iAccession.substring(result.iAccession.indexOf("-", openBracket) + 1, result.iAccession.indexOf(")")).trim()); result.iAccession = result.iAccession.substring(0, openBracket).trim(); } result.databaseType = DatabaseType.UniProt; result.iID = "tr"; result.iDescription = tempHeader.substring(tempHeader.indexOf("|") + 1); // try to get the gene name and taxonomy parseUniProtDescription(result); } else if (aFASTAHeader.matches("^en\\|[^|]*\\|.*")) { // Ensembl Genomes header // Is formatted something like this: // >en|CCF76815|pCol1B9_SL1344:3971-4420 conserved hypothetical plasmid protein String tempHeader = aFASTAHeader.substring(3); result.iAccession = tempHeader.substring(0, tempHeader.indexOf("|")).trim(); // See if there is location information. 
if (result.iAccession.matches("[^\\(]+\\([\\d]+ [\\d]+\\)$")) { int openBracket = result.iAccession.indexOf("("); result.iStart = Integer.parseInt(result.iAccession.substring(openBracket + 1, result.iAccession.indexOf(" ", openBracket)).trim()); result.iEnd = Integer.parseInt(result.iAccession.substring(result.iAccession.indexOf(" ", openBracket), result.iAccession.indexOf(")")).trim()); result.iAccession = result.iAccession.substring(0, openBracket).trim(); } else if (result.iAccession.matches("[^\\(]+\\([\\d]+-[\\d]+\\)$")) { int openBracket = result.iAccession.indexOf("("); result.iStart = Integer.parseInt(result.iAccession.substring(openBracket + 1, result.iAccession.indexOf("-", openBracket)).trim()); result.iEnd = Integer.parseInt(result.iAccession.substring(result.iAccession.indexOf("-", openBracket) + 1, result.iAccession.indexOf(")")).trim()); result.iAccession = result.iAccession.substring(0, openBracket).trim(); } result.databaseType = DatabaseType.EnsemblGenomes; result.iID = "en"; result.iDescription = tempHeader.substring(tempHeader.indexOf("|") + 1); // try to get the gene name and taxonomy parseUniProtDescription(result); } else if (aFASTAHeader.startsWith("nxp|NX_") && aFASTAHeader.split("\\|").length == 5) { // @TODO: replace by regular expression? // header should look like this: // >nxp|NX_P02768-1|ALB|Serum albumin|Iso 1 result.databaseType = DatabaseType.NextProt; result.iID = "nxp"; String[] headerElements = aFASTAHeader.split("\\|"); result.iAccession = headerElements[1]; result.iGeneName = headerElements[2]; result.iDescription = headerElements[3] + "|" + headerElements[4]; } else if (aFASTAHeader.startsWith("UniRef") && aFASTAHeader.contains(" ")) { // @TODO: replace by regular expression? // header should look like this: // >UniRef100_U3PVA8 Protein IroK n=22 Tax=Escherichia coli RepID=IROK_ECOL result.databaseType = DatabaseType.UniRef; result.iID = ""; // @TODO: could be UniRef or UniRef100 etc? 
result.iAccession = aFASTAHeader.substring(0, aFASTAHeader.indexOf(" ")); result.iDescription = aFASTAHeader.substring(aFASTAHeader.indexOf(" ") + 1); } else if (aFASTAHeader.matches("^[^\\s]*\\|[^\\s]+_[^\\s]+ .*")) { // New (9.0 release (31 Oct 2006) and beyond) standard SwissProt header as // present in the Expasy FTP FASTA file. // Is formatted something like this: // >accession|ID descr rest (including taxonomy, if available) result.iAccession = aFASTAHeader.substring(0, aFASTAHeader.indexOf("|")).trim(); // See if there is location information. if (aFASTAHeader.matches("[^\\(]+\\([\\d]+ [\\d]\\)$")) { int openBracket = aFASTAHeader.indexOf("("); result.iAccession = aFASTAHeader.substring(0, openBracket).trim(); result.iStart = Integer.parseInt(aFASTAHeader.substring(openBracket, aFASTAHeader.indexOf(" ", openBracket)).trim()); result.iEnd = Integer.parseInt(aFASTAHeader.substring(aFASTAHeader.indexOf(" ", openBracket), aFASTAHeader.indexOf(")")).trim()); } result.databaseType = DatabaseType.UniProt; result.iID = "sw"; // @TODO: remove hardcoding? result.iDescription = aFASTAHeader.substring(aFASTAHeader.indexOf("|") + 1); // try to get the gene name and taxonomy parseUniProtDescription(result); } else if (aFASTAHeader.matches("^FB.+\\stype=.*")) { // Flybase FASTA format. 
// Accession number result.iAccession = aFASTAHeader.substring(0, aFASTAHeader.indexOf("type")).trim(); if (result.iAccession.matches("[^\\(]+\\([\\d]+-[\\d]+\\)$")) { int openBracket = result.iAccession.indexOf("("); result.iStart = Integer.parseInt(result.iAccession.substring(openBracket + 1, result.iAccession.indexOf("-", openBracket)).trim()); result.iEnd = Integer.parseInt(result.iAccession.substring(result.iAccession.indexOf("-", openBracket) + 1, result.iAccession.indexOf(")")).trim()); result.iAccession = result.iAccession.substring(0, openBracket).trim(); } result.databaseType = DatabaseType.Flybase; result.iID = ""; result.iDescription = aFASTAHeader.substring(aFASTAHeader.indexOf("type=")); } else if (aFASTAHeader.matches(".* [.]*\\[[\\d]+[ ]?\\-[ ]?[\\d]+\\].*")) { // A header translating a genome sequence into a protein sequences. // We need to find two elements, separated by a space: // - the accession string (retrieved as the first part of a space delimited String). // - the nucleic acid start and stop site (between brackets, separated by a '-'). // // ex: >dm345_3L-sense [234353534-234353938] // >dmic_c_1_469 Dialister micraerophilus DSM 19965 [161699 - 160872] aspartate-semialdehyde dehydrogenase Database // >synsp_j_c_8_5 Synergistes[G-2] sp. oral taxon 357 W5455 (JCVI) [820 - 1089] ORF int accessionEndLoc = aFASTAHeader.indexOf(" "); if (accessionEndLoc < 0) { throw new IllegalArgumentException("Incorrect genome to protein sequence header. " + "Expected something like '>dm345_3L-sense (something) [234353-234359] (something)', but found '" + aFASTAHeader + "'!"); } result.databaseType = DatabaseType.GenomeTranslation; result.iID = aFASTAHeader.substring(0, accessionEndLoc).trim(); result.iAccession = aFASTAHeader.substring(0, accessionEndLoc).trim(); // Parse the location. 
int index1 = aFASTAHeader.lastIndexOf("["); // @TODO: should check for [number-number] or [number - number], as the current test will fail if the part after the indexes contains [... int index2 = aFASTAHeader.indexOf("]", index1); int separation = aFASTAHeader.indexOf("-", index1); if (index1 > 0 && index2 > 0 && separation > 0) { try { result.iStart = Integer.parseInt(aFASTAHeader.substring(index1 + 1, separation).trim()); result.iEnd = Integer.parseInt(aFASTAHeader.substring(separation + 1, index2).trim()); } catch (NumberFormatException e) { throw new IllegalArgumentException("Incorrect genome to protein sequence header. " + "Expected something like '>dm345_3L-sense (something) [234353-234359] (something)', but found '" + aFASTAHeader + "'!"); } } result.iDescription = aFASTAHeader.substring(accessionEndLoc + 1).trim(); } else if (aFASTAHeader.matches("^[^|\t]* [|] Symbol[^|]*[|] [^|]* [|].*")) { // The Arabidopsis thaliana database; TAIR format // We need to find two elements, separated by pipes: // - the accession number with version (retrieved as the part before the first pipe). // - the description (retrieved as the part between the second and third pipe). // // ex: >AT1G08520.1 | Symbol: PDE166 | magnesium-chelatase subunit chlD, chloroplast, putative / Mg-protoporphyrin IX chelatase, putative (CHLD), similar to Mg-chelatase SP:O24133 from Nicotiana tabacum, GB:AF014399 GI:2318116 from (Pisum sativum) | chr1:2696415-2700961 FORWARD | Aliases: T27G7.20, T27G7_20, PDE166, PIGMENT DEFECTIVE 166 int firstPipeLoc = aFASTAHeader.indexOf("|"); result.databaseType = DatabaseType.Arabidopsis_thaliana_TAIR; result.iAccession = aFASTAHeader.substring(0, firstPipeLoc).trim(); result.iID = ""; int secondPipeLoc = aFASTAHeader.indexOf("|", firstPipeLoc + 1); int thirdPipeLoc = aFASTAHeader.indexOf("|", secondPipeLoc + 1); result.iDescription = aFASTAHeader.substring(secondPipeLoc + 1, thirdPipeLoc).trim(); // Check for the presence of a location. 
int index; if ((index = result.iAccession.indexOf(" (")) > 0) { String temp = result.iAccession.substring(index); result.iAccession = result.iAccession.substring(0, index); int open = 2; int minus = temp.indexOf("-"); int end = temp.indexOf(")"); result.iStart = Integer.parseInt(temp.substring(open, minus)); result.iEnd = Integer.parseInt(temp.substring(minus + 1, end)); } } else if (aFASTAHeader.matches("^nrAt[^\t]*\t.*")) { // The PSB Arabidopsis thaliana database; proprietary format // We need to find three elements: // - the internal accession (at the start, separated by 'tab' and space from the next part). // - the external accession (between '()', after the internal accession). // - the description (retrieved as the rest of the header). // // ex: nrAt0.2_1 (TR:Q8HT11_ARATH) Photosystem II CP43 protein (Fragment).- Arabidopsis thaliana (Mouse-ear cress). int openBracketLoc = aFASTAHeader.indexOf("("); int closeBracketLoc = aFASTAHeader.indexOf(")"); // If there is a location, there will be a closing bracket at 'closeBracketLoc+1' as well. // If so, use this one. int tempLoc = closeBracketLoc + 1; if (aFASTAHeader.length() > tempLoc && aFASTAHeader.charAt(tempLoc) == ')') { closeBracketLoc = tempLoc; } result.databaseType = DatabaseType.PSB_Arabidopsis_thaliana; result.iAccession = aFASTAHeader.substring(openBracketLoc + 1, closeBracketLoc).trim(); result.iID = aFASTAHeader.substring(0, openBracketLoc).trim(); result.iDescription = aFASTAHeader.substring(closeBracketLoc + 1).trim(); // Check for the presence of a location. int index; if ((index = result.iAccession.indexOf(" (")) > 0) { String temp = result.iAccession.substring(index); result.iAccession = result.iAccession.substring(0, index); int open = 2; int minus = temp.indexOf("-"); int end = temp.indexOf(")"); result.iStart = Integer.parseInt(temp.substring(open, minus)); result.iEnd = Integer.parseInt(temp.substring(minus + 1, end)); } } else if (aFASTAHeader.matches("^L. 
monocytogenes[^|]*[|][^|]*[|].*")) { // The Listeria database; proprietary format // We need to find three elements: // - the leader element (at the start, separated by '|' from the next part). // - the accession number (between '||', after the leader). // - the description (retrieved as the rest of the header). // // ex: L. monocytogenes EGD-e|LMO02333|'comK: 158 aa - competence transcription factor (C-terminal part) int firstPipe = aFASTAHeader.indexOf("|"); int secondPipe = aFASTAHeader.indexOf("|", firstPipe + 1); result.databaseType = DatabaseType.Listeria; result.iID = aFASTAHeader.substring(0, firstPipe).trim(); result.iAccession = aFASTAHeader.substring(firstPipe + 1, secondPipe).trim(); result.iDescription = aFASTAHeader.substring(secondPipe + 1).trim(); // Check for the presence of a location. int index; if ((index = result.iAccession.indexOf(" (")) > 0) { String temp = result.iAccession.substring(index); result.iAccession = result.iAccession.substring(0, index); int open = 2; int minus = temp.indexOf("-"); int end = temp.indexOf(")"); result.iStart = Integer.parseInt(temp.substring(open, minus)); result.iEnd = Integer.parseInt(temp.substring(minus + 1, end)); } } else if (aFASTAHeader.toLowerCase().startsWith("gaffa")) { // A Genome Annotation Framework for Flexible Analysis (GAFFA) header. 
// Should look like this: // >GAFFA|"accession"|"species"/unknown // Example: // >GAFFA|cgb_GMPQSG401A00X3_1_cgb_pilot_F1_1|unknown result.databaseType = DatabaseType.GAFFA; try { result.iAccession = aFASTAHeader.substring(aFASTAHeader.indexOf("|") + 1, aFASTAHeader.lastIndexOf("|")); result.iDescription = aFASTAHeader.substring(aFASTAHeader.lastIndexOf("|") + 1); } catch (IndexOutOfBoundsException e) { result.iAccession = aFASTAHeader.substring(aFASTAHeader.indexOf("|") + 1); result.iDescription = ""; } result.iID = "GAFFA"; } else if (aFASTAHeader.contains("_HUMAN_UPS")) { // UPS sequences, processed like SGD int accessionEndLoc = aFASTAHeader.indexOf(" "); if (accessionEndLoc < 0) { throw new IllegalArgumentException("Non-standard UPS header passed. " + "Expecting something like '>xxxx xxxxx_HUMAN_UPS xxxxxxx xxx', but was '" + aFASTAHeader + "'."); } // Now we have to see if there is location information present. if (aFASTAHeader.charAt(accessionEndLoc + 1) == '(' && Character.isDigit(aFASTAHeader.charAt(accessionEndLoc + 2))) { // start and end found. Add it to the accession number and remove it from the description. accessionEndLoc = aFASTAHeader.indexOf(")", accessionEndLoc) + 1; } result.databaseType = DatabaseType.UPS; result.iID = ""; result.iAccession = aFASTAHeader.substring(0, accessionEndLoc).trim(); // Check for the presence of a location. int index; if ((index = result.iAccession.indexOf(" (")) > 0) { String temp = result.iAccession.substring(index); result.iAccession = result.iAccession.substring(0, index); int open = 2; int minus = temp.indexOf("-"); int end = temp.indexOf(")"); result.iStart = Integer.parseInt(temp.substring(open, minus)); result.iEnd = Integer.parseInt(temp.substring(minus + 1, end)); } result.iDescription = aFASTAHeader.substring(accessionEndLoc).trim(); } else { // Okay, try the often-used 'generic' approach. If this fails, we go to the worse-case scenario, ie. do not process at all. 
// Testing for this is somewhat more complicated. // Often used simple header; looks like: // >NP0465 (NP0465) A description for this protein. // We need to find two elements: // - the accession String (easily retrieved as the next String until a space is encountered). // - the description result.databaseType = DatabaseType.Generic_Header; int accessionEndLoc = aFASTAHeader.indexOf(" "); // Temporary storage variables. int startSecAcc = -1; int endSecAcc = -1; String testAccession = null; String testDescription = null; int testStart = -1; int testEnd = -1; if ((accessionEndLoc > 0) && (aFASTAHeader.contains("(")) && (aFASTAHeader.indexOf(")", aFASTAHeader.indexOf("(") + 1) >= 0)) { // Now we have to see if there is location information present. if (aFASTAHeader.substring(accessionEndLoc + 1, aFASTAHeader.indexOf(")", accessionEndLoc + 2) + 1).matches("[(][0-9]+-[0-9]+[)]") && !aFASTAHeader.substring(accessionEndLoc + 2, aFASTAHeader.indexOf(")", accessionEndLoc + 2)).equals(aFASTAHeader.substring(0, accessionEndLoc).trim())) { // start and end found. Add it to the accession number and remove it from the description. accessionEndLoc = aFASTAHeader.indexOf(")", accessionEndLoc) + 1; } testAccession = aFASTAHeader.substring(0, accessionEndLoc).trim(); // Check for the presence of a location. int index; if ((index = testAccession.indexOf(" (")) > 0) { String temp = testAccession.substring(index); testAccession = testAccession.substring(0, index); int open = 2; int minus = temp.indexOf("-"); int end = temp.indexOf(")"); testStart = Integer.parseInt(temp.substring(open, minus)); testEnd = Integer.parseInt(temp.substring(minus + 1, end)); } testDescription = aFASTAHeader.substring(accessionEndLoc).trim(); // Find the second occurrence of the accession number, which should be in the description. 
int enzymicity = -1; if (testDescription.contains("(*") && testDescription.indexOf("*)", testDescription.indexOf("(*" + 4)) > 0) { enzymicity = testDescription.indexOf("*)") + 2; } startSecAcc = testDescription.indexOf("(", enzymicity); endSecAcc = testDescription.indexOf(")", startSecAcc + 2); } // See if the accessions match up. if (startSecAcc >= 0 && endSecAcc >= 0 && testDescription != null && testDescription.substring(startSecAcc + 1, endSecAcc).trim().equals(testAccession.trim())) { result.iID = ""; result.iAccession = testAccession; result.iDescription = testDescription; if (testStart >= 0 && testEnd >= 0) { result.iStart = testStart; result.iEnd = testEnd; } } else { //try >nonsense|accession|description if (aFASTAHeader.lastIndexOf("|") >= 0) { String end = aFASTAHeader.substring(aFASTAHeader.indexOf("|") + 1); if (end.contains("|")) { result.iAccession = end.substring(0, end.indexOf("|")); result.iDescription = end.substring(end.indexOf("|") + 1); } } // Unknown. // Everything is rest. result.iRest = aFASTAHeader; // Check for the presence of a location. int index; if (((index = result.iRest.lastIndexOf(" (")) > 0) && (result.iRest.lastIndexOf(")") > 0) && (result.iRest.lastIndexOf("-") > index)) { String temp = result.iRest.substring(index); int open = 2; int minus = temp.indexOf("-"); int end = temp.lastIndexOf(")"); try { int tempStart = Integer.parseInt(temp.substring(open, minus)); int tempEnd = Integer.parseInt(temp.substring(minus + 1, end)); result.iStart = tempStart; result.iEnd = tempEnd; result.iRest = result.iRest.substring(0, index); } catch (Exception e) { // apparently not location info. 
}
                        }
                    }
                }
            } catch (StringIndexOutOfBoundsException e) {
                // Re-throw with a message naming the offending header line and the
                // database type that was assigned to the result when parsing failed.
                throw new StringIndexOutOfBoundsException("Unable to process FASTA header line:\n" + "'" + aFASTAHeader + "'\n" + "as a '" + result.databaseType + "' header.\n" + "Process cancelled.");
            } catch (RuntimeException excep) {
                // Log the offending header line, then propagate the exception unchanged.
                logger.error(" * Unable to process FASTA header line:\n\t" + aFASTAHeader + "\n\n");
                // @TODO: throw a proper exception!!!
                throw excep;
            }
        }
        return result;
    }

    /**
     * Returns the ID. Null if not set.
     *
     * @return the ID
     */
    public String getID() {
        return this.iID;
    }

    /**
     * Sets the ID.
     *
     * @param aID the ID
     */
    public void setID(String aID) {
        iID = aID;
    }

    /**
     * Returns the foreign ID. Null if not set.
     *
     * @return the foreign ID
     */
    public String getForeignID() {
        return iForeignID;
    }

    /**
     * Sets the foreign ID.
     *
     * @param aForeignID the foreign ID
     */
    public void setForeignID(String aForeignID) {
        iForeignID = aForeignID;
    }

    /**
     * Returns the accession. Null if not set.
     *
     * @return the accession
     */
    public String getAccession() {
        return iAccession;
    }

    /**
     * Sets the accession.
     *
     * @param aAccession the accession
     */
    public void setAccession(String aAccession) {
        iAccession = aAccession;
    }

    /**
     * Returns the accession or if this is null the rest. This is a quick fix
     * for unsupported custom headers.
     *
     * @return the accession or if this is null the rest
     */
    public String getAccessionOrRest() {
        // Headers that could not be parsed have no accession; fall back to
        // the 'rest' field in that case.
        if (iAccession == null) {
            return iRest;
        } else {
            return iAccession;
        }
    }

    /**
     * Returns the database type as inferred from the header structure.
     *
     * @return the database type
     */
    public DatabaseType getDatabaseType() {
        return databaseType;
    }

    /**
     * Sets the database type.
     *
     * @param aDatabaseType the database type
     */
    public void setDatabaseType(DatabaseType aDatabaseType) {
        databaseType = aDatabaseType;
    }

    /**
     * Returns the foreign accession. Null if not set.
     *
     * @return the foreign accession
     */
    public String getForeignAccession() {
        return iForeignAccession;
    }

    /**
     * Sets the foreign accession.
* * @param aForeignAccession the foreign accession */ public void setForeignAccession(String aForeignAccession) { iForeignAccession = aForeignAccession; } /** * Returns the description. Null if not set. * * @return the description */ public String getDescription() { return iDescription; } /** * Sets the description. * * @param aDescription the description */ public void setDescription(String aDescription) { iDescription = aDescription; } /** * Returns the short description. Null if not set. * * @return the short description */ public String getDescriptionShort() { return iDescriptionShort; } /** * Sets the short description. * * @param aDescriptionShort the short description */ public void setDescriptionShort(String aDescriptionShort) { iDescriptionShort = aDescriptionShort; } /** * Returns the protein name as inferred from the description. * * @return the protein name */ public String getDescriptionProteinName() { return iDescriptionProteinName; } /** * Sets the protein name. * * @param aDescriptionProteinName the protein name */ public void setDescriptionProteinName(String aDescriptionProteinName) { iDescriptionProteinName = aDescriptionProteinName; } /** * Returns the gene name. * * @return the gene name */ public String getGeneName() { return iGeneName; } /** * Set the gene name. * * @param aGeneName the gene name */ public void setGeneName(String aGeneName) { iGeneName = aGeneName; } /** * Returns the protein evidence level. * * @return the protein evidence level */ public String getProteinEvidence() { return iProteinEvidence; } /** * Sets the protein evidence level. * * @param aProteinEvidence the protein evidence level */ public void setProteinEvidence(String aProteinEvidence) { iProteinEvidence = aProteinEvidence; } /** * Returns the taxonomy. * * @return the taxonomy */ public String getTaxonomy() { return iTaxonomy; } /** * Sets the taxonomy. 
* * @param aTaxonomy the taxonomy */ public void setTaxonomy(String aTaxonomy) { iTaxonomy = aTaxonomy; } /** * Returns the foreign description. * * @return the foreign description */ public String getForeignDescription() { return iForeignDescription; } /** * Sets the foreign description. * * @param aForeignDescription the foreign description */ public void setForeignDescription(String aForeignDescription) { iForeignDescription = aForeignDescription; } /** * Returns the rest of the header. * * @return the rest of the header */ public String getRest() { return iRest; } /** * Sets the rest of the header. * * @param aRest the rest of the header */ public void setRest(String aRest) { iRest = aRest; } /** * Returns the entire header. * * @return the entire header */ public String getRawHeader() { return iRawHeader; } /** * Sets the entire header. * * @param aRawHeader the entire header */ public void setRawHeader(String aRawHeader) { iRawHeader = aRawHeader; } /** * Returns a simplified protein description for a UniProt header. For * example "GRP78_HUMAN 78 kDa glucose-regulated protein OS=Homo sapiens * GN=HSPA5 PE=1 SV=2" becomes "78 kDa glucose-regulated protein * [GRP78_HUMAN]". For non UniProt headers the normal protein description is * returned. * * @return a simplified protein description for a UniProt header */ public String getSimpleProteinDescription() { if (databaseType == DatabaseType.UniProt) { // get the default simple header String temp = iDescriptionShort + " (" + iDescriptionProteinName + ")"; // see if we need to add a decoy flag if (SequenceFactory.getInstance().isDecoyAccession(iAccession)) { temp = SequenceFactory.getDefaultDecoyDescription(temp); } return temp; } else if (iDescription != null) { return iDescription; } else { return ""; } } /** * This method returns an abbreviated version of the Header, suitable for * inclusion in FASTA formatted files. 
<br> The abbreviated header is * composed in the following way: <br> * >[ID]|[accession_string]|([foreign_ID]|[foreign_accession_string]|[foreign_description] * )[description] * * @return String with the abbreviated header. */ public String getAbbreviatedFASTAHeader() { return getAbbreviatedFASTAHeader(""); } /** * This method returns an abbreviated version of the Header, suitable for * inclusion in FASTA formatted files. <br> The abbreviated header is * composed in the following way: <br> * >[ID]|[accession_string]|([foreign_ID]|[foreign_accession_string]|[foreign_description] * )[description] * * * @param decoyTag the decoy tag to add * @return String with the abbreviated header. */ public String getAbbreviatedFASTAHeader(String decoyTag) { StringBuffer result = new StringBuffer(">" + this.getCoreHeader() + decoyTag); if (this.iID == null || this.databaseType == DatabaseType.Unknown) { // Apparently we have not been able to identify and parse this header. // In that case, the core header already contains everything, so don't do anything. } else { // Some more appending to be done here. if (!this.iID.equals("")) { if (this.databaseType == DatabaseType.UniProt || this.databaseType == DatabaseType.IPI || this.databaseType == DatabaseType.Listeria || this.databaseType == DatabaseType.NextProt || this.databaseType == DatabaseType.EnsemblGenomes) { // FASTA entry with pipe ('|') separating core header from description. result.append("|").append(this.iDescription); } else if (this.databaseType == DatabaseType.NCBI) { // NCBI entry. result.append("|"); // See if we have a foreign ID. if (iForeignID != null) { result.append(this.iForeignID).append("|").append(this.iForeignAccession).append("|"); // See if we also have a description. if (this.iForeignDescription != null) { result.append(this.iForeignDescription); } } // Add the Description. 
result.append(" ").append(this.iDescription); } else if (this.databaseType == DatabaseType.M_Tuberculosis) { // Mycobacterium tuberculosis entry. result.append("|").append(this.iDescription); } else if (this.databaseType == DatabaseType.GenomeTranslation) { // Genome to protein sequnece translation. result = new StringBuffer(">" + this.iAccession + decoyTag + " " + this.iDescription); } else if (this.databaseType == DatabaseType.PSB_Arabidopsis_thaliana) { // Proprietary PSB A. thaliana entry result.append(" ").append(this.iDescription); } } else { if (this.databaseType == DatabaseType.H_Invitation) { result.append("|").append(this.iDescription); } else { // Just add a space and the description. result.append(" ").append(this.iDescription); } } } return result.toString(); } /** * This method reports on the entire processed(!) header. To get the raw * header use getRawHeader instead. * * @return String with the full header. */ public String toString() { return toString(""); } /** * This method reports on the entire processed(!) header, with the given * decoy tag added. To get the raw header use getRawHeader instead. * * @param decoyTag the decoy tag to add * @return String with the full header. 
*/ public String toString(String decoyTag) { String result; if (databaseType == DatabaseType.Generic_Split_Header) { result = ">" + this.iID + decoyTag + "|" + this.iAccession + "|" + this.iDescription; } else { if (this.iID == null) { result = this.getAbbreviatedFASTAHeader(decoyTag); } else { result = this.getAbbreviatedFASTAHeader(decoyTag); if (this.iRest != null) { result += " " + this.iRest; } } } result += decoyTag; return result; } /** * This method will attribute a score to the current header, based on the * following scoring list: <ul> <li> SwissProt : 4 </li> <li> IPI, SwissProt * reference : 3 </li> <li> IPI, TrEMBL or REFSEQ_NP reference : 2 </li> * <li> IPI, without SwissProt, TrEMBL or REFSEQ_NP reference : 1 </li> <li> * NCBI, SwissProt reference : 2</li> <li> NCBI, other reference : 1</li> * <li> Unknown header format : 0</li> </ul> * * @return int with the header score. The higher the score, the more * interesting a Header is. */ public int getScore() { int score = -1; // @TODO: should rely in database type instead of the ID tag? // Score the header... if (this.iID == null || this.iID.equals("") || this.iID.startsWith(" M. tub.") || this.iID.startsWith("nrAt") || this.iID.startsWith("L. 
monocytogenes")) { score = 0; } else if (this.iID.equalsIgnoreCase("sw") || this.iID.equalsIgnoreCase("sp")) { score = 4; } else if (this.iID.equalsIgnoreCase("tr")) { score = 2; } else if (this.iID.equalsIgnoreCase("ipi")) { if (this.iDescription != null && this.iDescription.toUpperCase().contains("SWISS-PROT")) { score = 3; } else if (this.iDescription != null && ((this.iDescription.toUpperCase().contains("TREMBL")) || (this.iDescription.toUpperCase().contains("REFSEQ_NP")))) { score = 2; } else { score = 1; } } else if (this.iID.equalsIgnoreCase("gi")) { if (this.iForeignID != null && this.iForeignID.equals("sp")) { score = 2; } else { score = 1; } } else if (this.iID.equalsIgnoreCase("en")) { score = 3; } return score; } /** * This method reports on the core information for the header, which is * comprised of the ID and the accession String: * <pre> * [ID]|[accession_string] * </pre> This is mostly useful for appending this core as an addendum to * another header. * * @return String with the header core data ([ID]|[accession_string]). */ public String getCoreHeader() { String result = null; if (iID != null && iID.startsWith("nrAt")) { // @TODO: should rely in database type instead of the ID tag? result = this.getID() + " \t(" + this.getAccession(); } else if (iID != null && !iID.equals("")) { result = this.getID() + "|" + this.getAccession(); } else if (iID != null && iID.equals("")) { // No ID given, so just take the accession. result = this.getAccession(); } else if (iID == null) { result = this.iRest; } // See if we need to add information about the location. if (iStart >= 0) { result += " (" + Integer.toString(iStart) + "-" + Integer.toString(iEnd) + ")"; } // For the PSB A. Thaliana, we need to include the closing ')'. if (iID != null && iID.startsWith("nrAt")) { result += ")"; } return result; } /** * This method allows the addition of an addendum to the list. 
If the * addendum is already preceded with '^A', it is added as is, otherwise '^A' * is prepended before addition to the list. * * @param aAddendum String with the addendum, facultatively preceded by * '^A'. */ public void addAddendum(String aAddendum) { // First see if we have addenda already. if (this.iAddenda == null) { iAddenda = new StringBuffer(); } // Now check for the presence of the '^A' sequence. if (aAddendum.startsWith("^A")) { iAddenda.append(aAddendum); } else { iAddenda.append("^A").append(aAddendum); } } /** * This method allows the caller to retrieve all addenda for the current * header, or 'null' if there aren't any. * * @return String with the addenda, or 'null' if there aren't any. */ public String getAddenda() { String result = null; if (this.iAddenda != null) { result = iAddenda.toString(); } return result; } /** * This method reports on the presence of addenda for this header. * * @return boolean whether addenda are present. */ public boolean hasAddenda() { boolean result = false; if (this.iAddenda != null) { result = true; } return result; } /** * This method reports on the full header, with the addenda (if present). If * no addenda are present, this method reports the same information as the * 'toString()' method. * * @return String with the header and addenda (if any). */ public String getFullHeaderWithAddenda() { String result = this.toString(); if (this.iAddenda != null) { result += iAddenda.toString(); } return result; } /** * This method returns an abbreviated version of the Header, suitable for * inclusion in FASTA formatted files. <br> The abbreviated header is * composed in the following way: <br> * >[ID]|[accession_string]|([foreign_ID]|[foreign_accession_string]|[foreign_description] * )[description]([addenda]) * <br> * Note that the output of this method is identical to that of the * getAbbreviatedFASTAHeader() if no addenda are present. * * @return String with the abbreviated header and addenda (if any). 
*/
    public String getAbbreviatedFASTAHeaderWithAddenda() {
        String result = this.getAbbreviatedFASTAHeader();
        if (this.iAddenda != null) {
            result += iAddenda.toString();
        }
        return result;
    }

    /**
     * This method allows the caller to add information to the header about
     * location of the sequence in a certain master sequence. <br> This
     * information is typically specified right after the accession number:
     * <pre>
     * [id]|[accession_string] ([startindex]-[endindex])|...
     * </pre> <b>Please note the following:</b> <ul> <li>If an index is already
     * present, it is replaced.</li> <li>If the header is of unknown format,
     * the indices are appended to the end of the header.</li> </ul>
     *
     * @param aStart int with the startindex.
     * @param aEnd int with the endindex.
     */
    public void setLocation(int aStart, int aEnd) {
        this.iStart = aStart;
        this.iEnd = aEnd;
    }

    /**
     * This method reports on the start index of the header. It returns '-1' if
     * no location is specified.
     *
     * @return int with the start location, or '-1' if none was defined.
     */
    public int getStartLocation() {
        return iStart;
    }

    /**
     * This method reports on the end index of the header. It returns '-1' if no
     * location is specified.
     *
     * @return int with the end location, or '-1' if none was defined.
     */
    public int getEndLocation() {
        return iEnd;
    }

    /**
     * This method provides a copy of the Header instance that can be modified
     * independently of the original. The field-by-field copy made by
     * Object.clone() is completed with a private copy of the addenda buffer,
     * so that adding an addendum to the clone does not alter this Header (and
     * vice versa).
     *
     * @return Object Header that is a copy of this Header, or null if cloning
     * is not supported (the failure is logged).
     */
    public Object clone() {
        Header copy = null;
        try {
            copy = (Header) super.clone();
            // super.clone() only copies the reference to the mutable addenda
            // buffer; give the clone its own buffer with the same content.
            // NOTE(review): confirm that no other mutable field is declared
            // in the part of the class not visible here.
            if (copy.iAddenda != null) {
                copy.iAddenda = new StringBuffer(copy.iAddenda.toString());
            }
        } catch (CloneNotSupportedException cnse) {
            logger.error(cnse.getMessage(), cnse);
        }
        return copy;
    }

    /**
     * Returns the implemented database types as an array of String.
* * @return the implemented database types as an array of String */ public static String[] getDatabaseTypesAsString() { DatabaseType[] enumValues = DatabaseType.values(); String[] result = new String[enumValues.length]; for (int i = 0; i < enumValues.length; i++) { result[i] = getDatabaseTypeAsString(enumValues[i]); } return result; } /** * Convenience method returning the database name as a String. * * @param databaseType the database type * @return the name */ public static String getDatabaseTypeAsString(DatabaseType databaseType) { switch (databaseType) { case UniProt: return "UniProtKB"; case Unknown: return "Unknown"; case NCBI: return "NCBI"; case IPI: return "IPI (deprecated)"; case H_Invitation: return "H_Invitation"; case Halobacterium: return "Halobacterium"; case H_Influenza: return "H_Influenza"; case C_Trachomatis: return "C_Trachomatis"; case M_Tuberculosis: return "M_Tuberculosis"; case Drosophile: return "Drosophile"; case SGD: return "SGD"; case Flybase: return "Flybase"; case GenomeTranslation: return "Genome to protein translation"; case Arabidopsis_thaliana_TAIR: return "Arabidopsis thaliana TAIR"; case PSB_Arabidopsis_thaliana: return "PSB Arabidopsis thaliana"; case Listeria: return "Listeria"; case Generic_Header: return "User Defined"; case Generic_Split_Header: return "Generic Header"; case GAFFA: return "GAFFA"; case UPS: return "Universal Proteomic Standard"; case NextProt: return "neXtProt"; case UniRef: return "UniRef"; default: throw new UnsupportedOperationException("Database type not implemented: " + databaseType + "."); } } /** * Tries to extract the gene name, taxonomy and the protein evidence level * from a UniProt description. * * @param header the header to parse. 
*/ private static void parseUniProtDescription(Header header) { // try to get the gene name from the description if (header.iDescription.contains(" GN=")) { int geneStartIndex = header.iDescription.indexOf(" GN=") + 4; int geneEndIndex = header.iDescription.indexOf(" ", geneStartIndex); if (geneEndIndex != -1) { header.iGeneName = header.iDescription.substring(geneStartIndex, geneEndIndex); } else { header.iGeneName = header.iDescription.substring(geneStartIndex); } } // try to get the protein evidence level from the description if (header.iDescription.contains(" PE=")) { int evidenceStartIndex = header.iDescription.indexOf(" PE=") + 4; int evidenceEndIndex = header.iDescription.indexOf(" ", evidenceStartIndex); if (evidenceEndIndex != -1) { header.iProteinEvidence = header.iDescription.substring(evidenceStartIndex, evidenceEndIndex); } else { header.iProteinEvidence = header.iDescription.substring(evidenceStartIndex); } // http://www.uniprot.org/manual/protein_existence } // try to get the taxonomy name from the description if (header.iDescription.contains(" OS=")) { int taxonomyStartIndex = header.iDescription.indexOf(" OS=") + 4; int taxonomyEndIndex = header.iDescription.indexOf(" GN="); // have to check if gene name is in the header if (taxonomyEndIndex == -1) { if (header.iDescription.contains(" PE=")) { taxonomyEndIndex = header.iDescription.indexOf(" PE="); } else { taxonomyEndIndex = header.iDescription.length(); } } header.iTaxonomy = header.iDescription.substring(taxonomyStartIndex, taxonomyEndIndex); // now we can also shorten the protein description String tempShortHeader = header.iDescription.substring(0, taxonomyStartIndex - 3); header.iDescriptionShort = tempShortHeader.substring(tempShortHeader.indexOf(" ") + 1).trim(); header.iDescriptionProteinName = tempShortHeader.substring(0, tempShortHeader.indexOf(" ")); } } /** * Return the Uniprot protein evidence type as text. 
*
     * @param type the type of evidence; the values 1 to 5 are recognized,
     * null is allowed
     *
     * @return the protein evidence type as text, or null if the type is null
     * or not one of the recognized values
     */
    public static String getProteinEvidencAsString(Integer type) {
        // Switching on a null Integer would fail while unboxing; treat a
        // missing evidence level like an unrecognized one.
        if (type == null) {
            return null;
        }
        switch (type) {
            case 1:
                return "Protein";
            case 2:
                return "Transcript";
            case 3:
                return "Homology";
            case 4:
                return "Predicted";
            case 5:
                return "Uncertain";
            default:
                return null;
        }
    }
}