// Protein.java example

// Explorer
// msInspect-master
/*
 * Copyright (c) 2003-2012 Fred Hutchinson Cancer Research Center
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.fhcrc.cpl.toolbox.proteomics;

import java.io.PrintWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.*;

/**
 * A single protein entry as found in a FASTA file: the header line, the raw
 * sequence bytes, a mass, and a short "lookup" string derived from the header.
 * Identifiers embedded in the header can be extracted with
 * {@link #getIdentifierMap()} / {@link #identParse(String, String)}.
 *
 * User: migra
 * Date: Jun 23, 2004
 * Time: 10:28:04 PM
 */
public class Protein
{
    /** Tag that introduces an explicit mass in a header, e.g. {@code [MASS=1234.5]}. */
    private static final String MASS_TAG = "[MASS=";

    /** Comet truncates protein names after the first 79 characters, so lookups are too. */
    private static final int MAX_LOOKUP_LENGTH = 79;

    /** Maximum number of header characters returned by {@link #getName()}. */
    private static final int MAX_NAME_LENGTH = 80;

    /** A repeated " gi|" is only treated as the first alias if it starts this early in the header. */
    private static final int MAX_FIRST_ALIAS_INDEX = 30;

    private String _header;
    private byte[] _bytes;
    private double _mass;
    private String _lookup;

    private String _origHeader;

    // Built lazily by getIdentifierMap(); stays null until first requested
    private Map<String, Set<String>> _identifierMap;

    //known identifier types.  Multiple identifiers found in fasta files can often
    //boil down to the same thing.  Keys are upper-case type names as they appear in
    //headers; values are the canonical type names.
    public static HashMap<String, String> IdentTypeMap = new HashMap<String, String>();

    /* for parsing header lines of FASTA files */
    /** Regular expression that matches the literal separator character. */
    public static final String SEPARATOR_PATTERN = "\\|";
    /** The separator between identifier tokens in a header. */
    public static final String SEPARATOR_CHAR = "|";

    //populate the hashmap of known identifier types
    static
    {
        IdentTypeMap.put("GI", "GI");
        IdentTypeMap.put("REF", "REFSEQ");
        IdentTypeMap.put("GB", "Genbank");
        IdentTypeMap.put("EMB", "Genbank");
        IdentTypeMap.put("SPROT_NAME", "SwissProt");
        IdentTypeMap.put("DBJ", "Genbank");
        IdentTypeMap.put("SP", "SwissProtAccn");
        IdentTypeMap.put("IPI", "IPI");
        IdentTypeMap.put("COG", "COG");
        IdentTypeMap.put("ENSEMBL", "ENSEMBL");
        IdentTypeMap.put("REFSEQ_NP", "REFSEQ");
        IdentTypeMap.put("PDB", "PDB");
        IdentTypeMap.put("UNIPROT/TREMBL", "SwissProtAccn");
        IdentTypeMap.put("TREMBL", "SwissProtAccn");
        IdentTypeMap.put("REFSEQ_XP", "REFSEQ");
        IdentTypeMap.put("ORFP", "SGD_LOCUS");
        IdentTypeMap.put("UNIPROT/SPROT", "SwissProtAccn");
        IdentTypeMap.put("SWISS-PROT", "SwissProtAccn");
        IdentTypeMap.put("TPG", "Genbank");
        IdentTypeMap.put("UG", "Unigene");
        IdentTypeMap.put("SI", "SI");
        IdentTypeMap.put("UPTR", "SwissProt");
        IdentTypeMap.put("UPSP", "SwissProt");
        IdentTypeMap.put("GP", "Genbank");
        IdentTypeMap.put("PIR", "PIR");
        IdentTypeMap.put("PIR2", "PIR");
        IdentTypeMap.put("UNIREF100", "UniRef100");
        IdentTypeMap.put("REFSEQ", "REFSEQ");
        IdentTypeMap.put("SGDID", "SGDID");
        IdentTypeMap.put("SGD_GN", "GeneName");
        IdentTypeMap.put("GN", "GeneName");
    }

    /**
     * Builds a protein from a FASTA header and its sequence bytes.
     * <p>
     * The lookup string is the text before the first space (tabs count as spaces),
     * truncated to 79 characters, unless the header starts with "gi|" and contains
     * a second " gi|" within the first 30 characters, in which case the text before
     * that second alias is used as the lookup and removed from the stored header.
     * <p>
     * If the header carries a trailing {@code [MASS=...]} tag with a parseable,
     * non-zero value, that value is used as the mass; otherwise the mass is computed
     * from the sequence using average amino acid masses.  The stored header is cut
     * off where the last {@code [MASS=} tag begins.
     *
     * @param header the FASTA header line (without the leading '&gt;'); must not be null
     * @param bytes the sequence bytes; the array is stored, not copied
     */
    public Protein(String header, byte[] bytes)
    {
        _bytes = bytes;
        _origHeader = header;

        // Check for special case of repeated gi| at start... if so, remove the initial text,
        // but use it for lookup string
        int firstAliasIndex = 0;
        if (header.startsWith("gi|"))
        {
            int repeatedGi = header.indexOf(" gi|", 2);
            // +1 skips the space so the index points at the second "gi|" itself
            if (repeatedGi >= 0 && repeatedGi + 1 <= MAX_FIRST_ALIAS_INDEX)
                firstAliasIndex = repeatedGi + 1;
        }

        if (0 == firstAliasIndex)
        {
            // Some annoying FASTA files have tabs instead of spaces
            header = header.replace('\t', ' ');

            int firstSpace = header.indexOf(' ');
            if (-1 != firstSpace)
                _lookup = header.substring(0, firstSpace).trim();
            else
                _lookup = header;

            // Comet truncates protein after first 79 characters
            if (_lookup.length() > MAX_LOOKUP_LENGTH)
                _lookup = _lookup.substring(0, MAX_LOOKUP_LENGTH);
        }
        else
        {
            _lookup = header.substring(0, firstAliasIndex).trim();
        }

        int massStart = header.lastIndexOf(MASS_TAG);
        if (massStart >= 0)
        {
            int massEnd = header.indexOf(']', massStart);
            // A tag with no closing bracket is ignored; the mass is computed below instead
            if (massEnd >= 0)
            {
                try
                {
                    _mass = Double.parseDouble(header.substring(massStart + MASS_TAG.length(), massEnd));
                }
                catch (NumberFormatException ignored)
                {
                    // Unparseable mass text: deliberately fall through to the computed mass
                }
            }
        }
        else
        {
            // No mass tag: keep the header through to its end
            massStart = header.length();
        }

        if (0 == _mass)
            _mass = PeptideGenerator.computeMass(_bytes, 0, _bytes.length, PeptideGenerator.AMINO_ACID_AVERAGE_MASSES);

        _header = header.substring(firstAliasIndex, massStart);
    }

    /**
     * @return the header with any leading repeated-gi text and trailing mass tag removed
     */
    public String getHeader()
    {
        return _header;
    }

    /**
     * Replaces the stored header.  The lookup string, mass and any already-built
     * identifier map are not recomputed.
     *
     * @param header the new header
     */
    public void setHeader(String header)
    {
        _header = header;
    }

    /**
     * @return the header exactly as it was passed to the constructor
     *         (or as last set with {@link #setOrigHeader(String)})
     */
    public String getOrigHeader()
    {
        return _origHeader;
    }

    /**
     * @param h the value to report from {@link #getOrigHeader()}
     */
    public void setOrigHeader(String h)
    {
        this._origHeader = h;
    }

    /**
     * @return the first 80 characters of the header, or the whole header if it is shorter
     */
    public String getName()
    {
        String header = getHeader();
        return header.substring(0, Math.min(header.length(), MAX_NAME_LENGTH));
    }

    /**
     * @return the same value as {@link #getName()}
     */
    @Override
    public String toString()
    {
        return getName();
    }

    /**
     * @return the sequence bytes; this is the internal array, not a copy
     */
    public byte[] getBytes()
    {
        return _bytes;
    }

    /**
     * Replaces the sequence bytes.  The mass is not recomputed.
     *
     * @param bytes the new sequence bytes; the array is stored, not copied
     */
    public void setBytes(byte[] bytes)
    {
        _bytes = bytes;
    }

    /**
     * @return the sequence bytes decoded to a String using the platform default charset
     */
    public String getSequenceAsString()
    {
        return new String(getBytes());
    }

    /**
     * Splits the header into its aliases.  Aliases are separated in the header by
     * the control character with code 1 (Ctrl-A).
     *
     * @return one Alias per header segment; a header without the separator yields a single Alias
     */
    public Alias[] getAliases()
    {
        String[] aliasStrings = _header.split("\01");
        Alias[] aliases = new Alias[aliasStrings.length];

        for (int i = 0; i < aliasStrings.length; i++)
            aliases[i] = new Alias(aliasStrings[i]);

        return aliases;
    }

    /**
     * @return the mass taken from the header's mass tag, or computed from the sequence
     */
    public double getMass()
    {
        return _mass;
    }

    /**
     * @return the lookup string derived from the header
     */
    public String getLookup()
    {
        return _lookup;
    }

    /**
     * Replaces the lookup string.  An identifier map that has already been built
     * is not rebuilt.
     *
     * @param lookup the new lookup string
     */
    public void setLookup(String lookup)
    {
        _lookup = lookup;
    }

    /**
     * Lazily parses the header for identifiers; the result is cached after the first call.
     * <p>
     * Normally the lookup string is parsed.  When the lookup starts with "IPI", has no
     * separator character, and the header contains a space, the text after the header's
     * first space is parsed instead.
     * <p>
     * The raw {@code Map} return type is kept for compatibility with existing callers;
     * the map is the {@code Map<String, Set<String>>} produced by
     * {@link #identParse(String, String)}.
     *
     * @return the identifiers found; empty if the lookup string is null
     */
    public Map getIdentifierMap()
    {
        if (_identifierMap == null)
        {
            String lookupString = _lookup;
            // Null checks: setLookup/setHeader accept null, and identParse tolerates null arguments
            if (lookupString != null && _header != null
                    && lookupString.startsWith("IPI")
                    && !lookupString.contains(SEPARATOR_CHAR)
                    && _header.contains(" "))
            {
                lookupString = _header.substring(_header.indexOf(" ") + 1);
            }
            _identifierMap = identParse(lookupString, _header);
        }
        return _identifierMap;
    }

    /**
     * Save out to a PrintWriter in fasta format: a '&gt;' line with the header followed
     * by a line with the sequence.  The writer is flushed but not closed.
     *
     * @param out the writer to print to
     */
    public void saveFastaFormat(PrintWriter out)
    {
        out.println(">" + _header);
        out.println(new String(_bytes));
        out.flush();
    }

    /**
     * Save a protein array in fasta format to a file.  The file is written with the
     * platform default charset.
     *
     * @param proteins the proteins to write, in order
     * @param outFastaFile the file to create or overwrite
     * @throws IOException if the file cannot be opened, or if an error occurred while writing
     */
    public static void saveProteinArrayToFasta(Protein[] proteins,
                                               File outFastaFile)
            throws IOException
    {
        PrintWriter pw = null;
        try
        {
            pw = new PrintWriter(new FileOutputStream(outFastaFile));
            saveProteinArrayToFasta(proteins, pw);

            // PrintWriter never throws on write failure; it only records an error flag.
            // Surface that here so a truncated file is not reported as success.
            if (pw.checkError())
                throw new IOException("Error writing FASTA file " + outFastaFile);
        }
        finally
        {
            if (null != pw)
                pw.close();
        }
    }

    /**
     * Save a protein array in fasta format to a writer.  The writer is not closed.
     *
     * @param proteins the proteins to write, in order
     * @param pw the writer to print to
     */
    public static void saveProteinArrayToFasta(Protein[] proteins, PrintWriter pw)
    {
        for (Protein protein : proteins)
        {
            protein.saveFastaFormat(pw);
        }
    }

    /**
     *  New version of parseIdent using regular expressions.  Identifiers found in the lookup string
     *  portion of the header come in two basic flavors-- typed and untyped.  Typed ids look like
     *          {@code <typename>|<idvalue>|<typename>|<idvalue>...}
     *  Untyped ids are not separated into typename and value, but sometimes have a leading character
     *  sequence that identifies them (e.g. IPI id) or a reasonably well defined pattern (e.g. SwissProt
     *  names like HPH2_YEAST).  Regular expressions are used by the IdPattern class to recognize
     *  untyped identifiers and to transform them as necessary.  The IdPattern class is also used
     *  to validate and transform typed identifiers in a small number of cases.
     *
     * 2/09/2008 added a third mechanism, a single regex that looks at the whole header rather than tokens
     *
     * Before tokenizing, the identifier string is cut at its first space and every ':' is
     * treated as a separator.
     *
     * @param fastaIdentifierString the identifier portion of the header; if null an empty map is
     *                              returned and wholeHeader is not examined
     * @param wholeHeader the complete header, for the whole-header patterns; may be null
     * @return a map of identifiers parsed from the header;  might be empty
     */
    public static Map<String, Set<String>> identParse(String fastaIdentifierString, String wholeHeader)
    {
        Map<String, Set<String>> identifiers = new HashMap<String, Set<String>>();
        if (fastaIdentifierString == null)
            return identifiers;

        int firstSpace = fastaIdentifierString.indexOf(' ');
        if (firstSpace != -1)
            fastaIdentifierString = fastaIdentifierString.substring(0, firstSpace);
        fastaIdentifierString = fastaIdentifierString.replace(':', '|');

        // No explicit stripping of a trailing separator is needed: split() discards trailing
        // empty tokens.  (An earlier literal replace of "|$" deleted that two-character
        // sequence from the middle of identifiers instead of trimming the end.)
        String[] tokens = fastaIdentifierString.split(SEPARATOR_PATTERN);
        for (int i = 0; i < tokens.length; i++)
        {
            Map<String, Set<String>> additionalIds = null;

            // if the current token is the last or only token, or the token is
            // not recognized as an Identtype name, see if it matches a pattern in the list
            boolean isLastToken = (i == tokens.length - 1);
            if (isLastToken || !IdentTypeMap.containsKey(toTypeKey(tokens[i])))
            {
                for (String typeName : IdPattern.UNTYPED_ID_PATTERN_LIST)
                {
                    additionalIds = IdPattern.ID_PATTERN_MAP.get(typeName).getIdFromPattern(tokens, i);
                    if (null != additionalIds)
                        break;
                }
            }

            // if the pattern matching found identifiers, add them to the map and
            // go to the next token.  if the pattern matching found multiple identifiers,
            // bump the token an extra bump for each identifier beyond 1
            if (null != additionalIds && additionalIds.size() > 0)
            {
                identifiers = IdPattern.addIdMap(identifiers, additionalIds);
                i = i + additionalIds.size() - 1;
                continue;
            }

            // Typed identifier: this token is the type name and the next one is the value.
            // A leading "gnl" is skipped when at least two more tokens follow it.
            String key = tokens[i];
            if (key.equalsIgnoreCase("gnl") && i < (tokens.length - 2))
            {
                key = tokens[++i];
            }

            String value = null;
            if (i + 1 < tokens.length)
            {
                value = tokens[++i];
            }

            String typeKey = toTypeKey(key);
            if (value != null && IdentTypeMap.containsKey(typeKey))
            {
                String newKey = IdentTypeMap.get(typeKey);
                if (IdPattern.TYPED_ID_PATTERN_LIST.contains(newKey))
                    additionalIds = IdPattern.ID_PATTERN_MAP.get(newKey).getIdFromPattern(new String[]{value}, 0);
                else
                    additionalIds = IdPattern.createIdMap(newKey, value);

                identifiers = IdPattern.addIdMap(identifiers, additionalIds);
            }
        }

        if (wholeHeader != null)
        {
            // Every whole-header pattern is tried; matches from all of them are accumulated
            for (String typeName : IdPattern.WHOLE_HEADER_ID_PATTERN_LIST)
            {
                Map<String, Set<String>> additionalIds =
                        IdPattern.ID_PATTERN_MAP.get(typeName).getIdFromPattern(new String[]{wholeHeader}, 0);
                if (null != additionalIds && additionalIds.size() > 0)
                {
                    identifiers = IdPattern.addIdMap(identifiers, additionalIds);
                }
            }
        }
        return identifiers;
    }

    /**
     * Converts a header token to the form used for IdentTypeMap keys.  A fixed locale is
     * used so the result does not depend on the default locale's casing rules.
     */
    private static String toTypeKey(String token)
    {
        return token.toUpperCase(Locale.ENGLISH);
    }

    /**
     * One alias (segment) of a protein header, as produced by {@link Protein#getAliases()}.
     */
    public class Alias
    {
        // Never assigned after initialization in this class, so getRemaining() returns ""
        private String _remaining = "";
        private String _description = "";

        /**
         * @param s the text of the alias; stored as the description
         */
        public Alias(String s)
        {
            _description = s;
        }

        /**
         * @return the remaining text; currently always the empty string
         */
        public String getRemaining()
        {
            return _remaining;
        }

        /**
         * @return the alias text passed to the constructor
         */
        public String getDescription()
        {
            return _description;
        }
    }

    /**
     * Orders proteins by their sequence, compared as Strings.
     * Both arguments must be Protein instances.
     */
    public static class SequenceComparator implements Comparator
    {
        public int compare(Object o1, Object o2)
        {
            String first = ((Protein) o1).getSequenceAsString();
            String second = ((Protein) o2).getSequenceAsString();
            return first.compareTo(second);
        }
    }

    /**
     * Orders proteins by their lookup string.
     * Both arguments must be Protein instances.
     */
    public static class LookupComparator implements Comparator
    {
        public int compare(Object o1, Object o2)
        {
            String first = ((Protein) o1).getLookup();
            String second = ((Protein) o2).getLookup();
            return first.compareTo(second);
        }
    }

    /**
     * Orders proteins by their header.
     * Both arguments must be Protein instances.
     */
    public static class HeaderComparator implements Comparator
    {
        public int compare(Object o1, Object o2)
        {
            String first = ((Protein) o1).getHeader();
            String second = ((Protein) o2).getHeader();
            return first.compareTo(second);
        }
    }

}