/* * Copyright (c) 2003-2012 Fred Hutchinson Cancer Research Center * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.fhcrc.cpl.toolbox.proteomics; import java.io.PrintWriter; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.util.*; /** * User: migra * Date: Jun 23, 2004 * Time: 10:28:04 PM * */ public class Protein { private String _header; private byte[] _bytes; private double _mass; private String _lookup; private String _origHeader; private Map _identifierMap; //known identifier types. Multiple identifiers found in fasta files can often //boil down to the same thing public static HashMap<String, String> IdentTypeMap = new HashMap<String, String>(); /* for parsing header lines of FASTA files */ public static final String SEPARATOR_PATTERN = "\\|"; public static final String SEPARATOR_CHAR = "|"; //populate the hashmap of known identifier types static { IdentTypeMap.put("GI", "GI"); IdentTypeMap.put("REF", "REFSEQ"); IdentTypeMap.put("GB", "Genbank"); IdentTypeMap.put("EMB", "Genbank"); IdentTypeMap.put("SPROT_NAME", "SwissProt"); IdentTypeMap.put("DBJ", "Genbank"); IdentTypeMap.put("SP", "SwissProtAccn"); IdentTypeMap.put("IPI", "IPI"); IdentTypeMap.put("COG", "COG"); IdentTypeMap.put("ENSEMBL", "ENSEMBL"); IdentTypeMap.put("REFSEQ_NP", "REFSEQ"); IdentTypeMap.put("PDB", "PDB"); IdentTypeMap.put("UNIPROT/TREMBL", "SwissProtAccn"); IdentTypeMap.put("TREMBL", "SwissProtAccn"); IdentTypeMap.put("REFSEQ_XP", "REFSEQ"); IdentTypeMap.put("ORFP", "SGD_LOCUS"); IdentTypeMap.put("UNIPROT/SPROT", "SwissProtAccn"); IdentTypeMap.put("SWISS-PROT", "SwissProtAccn"); IdentTypeMap.put("TPG", "Genbank"); IdentTypeMap.put("UG", "Unigene"); IdentTypeMap.put("SI", "SI"); IdentTypeMap.put("UPTR", "SwissProt"); IdentTypeMap.put("UPSP", "SwissProt"); IdentTypeMap.put("GP", "Genbank"); IdentTypeMap.put("PIR", "PIR"); IdentTypeMap.put("PIR2", "PIR"); IdentTypeMap.put("UNIREF100", "UniRef100"); IdentTypeMap.put("REFSEQ", "REFSEQ"); IdentTypeMap.put("SGDID", "SGDID"); IdentTypeMap.put("SGD_GN", "GeneName"); IdentTypeMap.put("GN", "GeneName"); } public Protein(String header, byte[] bytes) { _bytes = bytes; int firstAliasIndex = 0; _origHeader = header; // Check for special case of repeated gi| at start... if so, remove the initial text, but use it for lookup string if (header.startsWith("gi|")) { firstAliasIndex = header.indexOf(" gi|", 2) + 1; if (firstAliasIndex < 0 || firstAliasIndex > 30) firstAliasIndex = 0; } if (0 == firstAliasIndex) { header = header.replaceAll("\t", " "); // Some annoying FASTA files have tabs instead of spaces int firstSpace = header.indexOf(" "); if (-1 != firstSpace) _lookup = header.substring(0, firstSpace).trim(); else _lookup = header; if (_lookup.length() > 79) _lookup = _lookup.substring(0, 79); // Comet truncates protein after first 79 characters } else _lookup = header.substring(0, firstAliasIndex).trim(); int massStart = header.lastIndexOf("[MASS="); if (massStart >= 0) { try { int massEnd = header.indexOf(']', massStart); _mass = Double.parseDouble(header.substring(massStart + 6, massEnd)); } catch(Exception e) { // fall through } } else massStart = header.length(); if (0 == _mass) _mass = PeptideGenerator.computeMass(_bytes, 0, _bytes.length, PeptideGenerator.AMINO_ACID_AVERAGE_MASSES); _header = header.substring(firstAliasIndex, massStart); } public String getHeader() { return _header; } public void setHeader(String header) { _header = header; } public String getOrigHeader() { return _origHeader; } public void setOrigHeader(String h) { this._origHeader = h; } public String getName() { return getHeader().substring(0, Math.min(getHeader().length(), 80)); } public String toString() { return getName(); } public byte[] getBytes() { return _bytes; } public void setBytes(byte[] bytes) { _bytes = bytes; } public String getSequenceAsString() { return new String(getBytes()); } public Alias[] getAliases() { String[] aliasStrings = _header.split("\01"); Alias[] aliases = new Alias[aliasStrings.length]; for (int i=0; i<aliasStrings.length; i++) aliases[i] = new Alias(aliasStrings[i]); return aliases; } public double getMass() { return _mass; } public String getLookup() { return _lookup; } public void setLookup(String lookup) { _lookup = lookup; } //lazily parse the header for identifiers public Map getIdentifierMap() { if (_identifierMap == null) { String lookupString = _lookup; if (lookupString.startsWith("IPI") && !lookupString.contains("|") && _header.contains(" ")) { lookupString = _header.substring(_header.indexOf(" ") + 1); } _identifierMap = identParse(lookupString,_header); } return _identifierMap; } /** * Save out to a PrintWriter in fasta format * @param out */ public void saveFastaFormat(PrintWriter out) { out.println(">" + _header); out.println(new String(_bytes)); out.flush(); } /** * Save a protein array in fasta format * @param proteins * @param outFastaFile * @return */ public static void saveProteinArrayToFasta(Protein[] proteins, File outFastaFile) throws IOException { PrintWriter pw = null; try { pw = new PrintWriter(new FileOutputStream(outFastaFile)); saveProteinArrayToFasta(proteins,pw); } catch (IOException x) { throw x; } finally { if (null != pw) pw.close(); } } /** * Save a protein array in fasta format * @param proteins * @param pw */ public static void saveProteinArrayToFasta(Protein[] proteins, PrintWriter pw) { for (int i=0; i<proteins.length; i++) { proteins[i].saveFastaFormat(pw); } } /** * New version of parseIdent using regular expressions. Identifiers found in the lookup string * portion of the header come in two basic flavors-- typed and untyped. Typed ids look like * <typename>|<idvalue>|<typename>|<idvalue>... * Untyped ids are not separated into typename and value, but sometimes have a leading character * sequence that identifies them (e.g. IPI id) or a reasonably well defined pattern (e.g. SwissProt * names like HPH2_YEAST). Regular expressions are used by the IdPattern class to recognize * untyped identiers and to transform them as necessary. The IdPattern class is also used * to validate and transform typed identifiers in a small number of cases.. * * 2/09/2008 added a third mechanism, a single regex that looks at the whole header rather than tokens * * @param fastaIdentifierString * @param wholeHeader * @return a map of identifiers parsed from the header; might be empty */ public static Map<String, Set<String>> identParse(String fastaIdentifierString, String wholeHeader) { Map<String, Set<String>> identifiers = new HashMap<String, Set<String>>(); if (fastaIdentifierString == null) return identifiers; if (fastaIdentifierString.indexOf(" ") != -1) fastaIdentifierString = fastaIdentifierString.substring(0, fastaIdentifierString.indexOf(" ")); fastaIdentifierString = fastaIdentifierString.replaceAll(":", "|"); fastaIdentifierString = fastaIdentifierString.replace("|$", ""); String tokens[] = fastaIdentifierString.split(SEPARATOR_PATTERN); for (int i = 0; i < tokens.length; i++) { Map<String, Set<String>> additionalIds=null; // if the current token is the last or only token, or the token is // not recognized as an Identtype name, see if it matches a pattern in the list if ((i == tokens.length - 1) || (!IdentTypeMap.containsKey(tokens[i].toUpperCase()))) { for (String typeName : IdPattern.UNTYPED_ID_PATTERN_LIST) { additionalIds = IdPattern.ID_PATTERN_MAP.get(typeName).getIdFromPattern(tokens, i); if (null!=additionalIds) break; } } // if the pattern matching found identifiers, add them to the map and // go to the next token. if the pattern matching found multiple identifiers, // bump the token an extra bump for each identifier beyond 1 if (null!=additionalIds && additionalIds.size() > 0) { identifiers = IdPattern.addIdMap(identifiers, additionalIds); i = i + additionalIds.size() -1 ; continue; } String key = tokens[i]; if (key.equalsIgnoreCase("gnl")) { if (i < (tokens.length - 2)) { key = tokens[++i]; } } String value = null; if (i + 1 < tokens.length) { value = tokens[++i]; } if (value != null && IdentTypeMap.containsKey(key.toUpperCase())) { String newKey = IdentTypeMap.get(key.toUpperCase()); if (IdPattern.TYPED_ID_PATTERN_LIST.contains(newKey)) additionalIds = IdPattern.ID_PATTERN_MAP.get(newKey).getIdFromPattern(new String[]{value}, 0); else additionalIds = IdPattern.createIdMap(newKey, value); identifiers = IdPattern.addIdMap(identifiers, additionalIds); } } if (wholeHeader != null) { Map<String, Set<String>> additionalIds=null; for (String typeName : IdPattern.WHOLE_HEADER_ID_PATTERN_LIST) { additionalIds = IdPattern.ID_PATTERN_MAP.get(typeName).getIdFromPattern(new String[]{wholeHeader}, 0); if (null!=additionalIds && additionalIds.size() > 0) { identifiers = IdPattern.addIdMap(identifiers, additionalIds); } } } return identifiers; } public class Alias { private String _remaining = ""; private String _description = ""; public Alias(String s) { _description = s; } public String getRemaining() { return _remaining; } public String getDescription() { return _description; } } public static class SequenceComparator implements Comparator { public int compare(Object o1, Object o2) { return ((Protein) o1).getSequenceAsString().compareTo(((Protein) o2).getSequenceAsString()); } } public static class LookupComparator implements Comparator { public int compare(Object o1, Object o2) { return ((Protein) o1).getLookup().compareTo(((Protein) o2).getLookup()); } } public static class HeaderComparator implements Comparator { public int compare(Object o1, Object o2) { return ((Protein) o1).getHeader().compareTo(((Protein) o2).getHeader()); } } }