/*
* Copyright (C) Lennart Martens
*
* Contact: lennart.martens AT UGent.be (' AT ' to be replaced with '@')
*/
package com.compomics.util.general;
import org.apache.log4j.Logger;
import java.io.*;
import java.util.Vector;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Properties;
import java.text.StringCharacterIterator;
/*
* CVS information:
*
* $Revision: 1.4 $
* $Date: 2007/07/06 09:41:53 $
*/
/**
 * This class takes care of mass calculation, based on a sequence
 * in IUPAC format. <br>
 * By default it can handle monoisotopic biochemical element masses,
 * monoisotopic single-letter amino acid masses and monoisotopic
 * nucleotide masses. <br>
 * The object can be customized with your own mass lists through the
 * use of a properties file, or by directly passing a HashMap.
 * <br><br>
 * <b>Please note:</b> if you decide on your own lists, be sure to
 * follow the following guidelines:
 * <ul>
 * <li>
 * An element (key in properties file or HashMap) can be ONE or TWO
 * letters. The first ALWAYS has to be UPPERCASE, the optional second
 * ALWAYS has to be <i>lowercase</i>. A single-letter element may be
 * directly followed by a modification tag between '&lt;' and '&gt;';
 * the tag (including the brackets) is then part of the key.
 * </li>
 * <li>
 * A value has to be a parseable number for a properties file, or
 * a Double instance for the HashMap. No other formats are accepted!
 * </li>
 * </ul>
 *
 * @author Lennart Martens
 */
public class MassCalc {

    // Class specific log4j logger for MassCalc instances.
    static Logger logger = Logger.getLogger(MassCalc.class);

    /**
     * This variable highlights which map was chosen for the element masses.
     * It keeps the value SELFDEFINED unless a built-in list was selected.
     */
    private int iChosen = SELFDEFINED;

    /**
     * This hash will contain all the masses for the currently selected
     * element list.
     */
    private HashMap masses = null;

    /**
     * This Vector stores all the HashMaps that can be selected as element
     * lists. <br>
     * Please only access them via the proper index, as defined in static vars
     * on this class. <br>
     * NOTE(review): the Vector is filled lazily by the first
     * {@link #MassCalc(int)} call without any additional locking; confirm
     * that first use is single-threaded.
     */
    private static Vector allMaps = new Vector(2);

    /**
     * Index of the element symbol in the array returned by getElement().
     */
    private static final int ELEMENT = 0;

    /**
     * Index of the multiplicity in the array returned by getElement().
     */
    private static final int MULTIPLICITY = 1;

    /**
     * Index for the monoisotopic masses of the biochemically relevant
     * elements.
     */
    public static final int MONOELEMENTS = 0;

    /**
     * Index for the monoisotopic aminoacid masses.
     */
    public static final int MONOAA = 1;

    /**
     * Index for the monoisotopic nucleotide masses.
     */
    public static final int MONONUCLEOTIDES = 2;

    /**
     * Value for the self-defined masses.
     */
    public static final int SELFDEFINED = -1;

    /**
     * Default constructor. The mass list to be used defaults to the
     * monoisotopic masses for biochemically relevant elements.
     */
    public MassCalc() {
        this(MONOELEMENTS);
    }

    /**
     * This constructor allows you to specify an identifier to select a
     * element list to use for calculating a mass. <br>
     * Please use the final variables on this class as identifiers!
     *
     * @param aMassListIdentifier int with the identifier for the
     *                            elementlist to use.
     * @exception IllegalArgumentException when the identifier does not
     *                                     correspond to a built-in elementlist
     *                                     (this includes SELFDEFINED).
     */
    public MassCalc(int aMassListIdentifier) {
        // Lazily load the built-in lists the first time any of them is requested.
        if (allMaps.isEmpty()) {
            allMaps.add(MONOELEMENTS, this.loadMassesFromPropFile("MonoElementMasses.properties"));
            allMaps.add(MONOAA, this.loadMassesFromPropFile("MonoAAMasses.properties"));
            allMaps.add(MONONUCLEOTIDES, this.loadMassesFromPropFile("MonoNucleotideMasses.properties"));
        }
        // Valid identifiers are 0 .. size-1; anything else (negative values
        // included) is rejected here instead of surfacing as an
        // ArrayIndexOutOfBoundsException from the Vector.
        if (aMassListIdentifier < 0 || aMassListIdentifier >= allMaps.size()) {
            throw new IllegalArgumentException("No such elementlist defined (" + aMassListIdentifier + ").\n");
        }
        // Work on a private copy so the shared built-in list is never modified.
        masses = new HashMap((HashMap) allMaps.elementAt(aMassListIdentifier));
        iChosen = aMassListIdentifier;
    }

    /**
     * This constructor allows the caller to use an elementlist of
     * its own making. Simply passing the filename of the file suffices. <br>
     *
     * @param aFilename String with the name of the file to be loaded.
     *                  <b>NOTE!</b> this file must be located in the
     *                  classpath and must be a simple properties file!
     */
    public MassCalc(String aFilename) {
        masses = this.loadMassesFromPropFile(aFilename);
    }

    /**
     * This constructor allows the caller to initialize the elementlist
     * with a HashMap of its own design. <br>
     * This HashMap needs be structured in the following way: <br>
     * <ul>
     * <li>KEY can be one or two letters, the first has to be uppercase,
     * the second and optional letter has to be lowercase.</li>
     * <li>VALUE must be a Double value</li>
     * </ul>
     * The HashMap is used as-is (it is not copied).
     *
     * @param aElementList HashMap with the elementlist to use.
     */
    public MassCalc(HashMap aElementList) {
        masses = aElementList;
    }

    /**
     * This constructor allows the caller to supplement (or to replace
     * elements in) a built-in elementlist with a HashMap of its own
     * design. <br>
     * This HashMap needs be structured in the following way: <br>
     * <ul>
     * <li>KEY can be one or two letters, the first has to be uppercase,
     * the second and optional letter has to be lowercase.</li>
     * <li>VALUE must be a Double value</li>
     * </ul>
     *
     * @param aMassListIdentifier int with the identifier for the
     *                            built-in elementlist to use.
     * @param aSupplElementList   HashMap with the supplementary
     *                            elementlist to use.
     */
    public MassCalc(int aMassListIdentifier, HashMap aSupplElementList) {
        this(aMassListIdentifier);
        this.masses.putAll(aSupplElementList);
    }

    /**
     * This method attempts to calculate the mass of a chemical formula.
     * It cannot calculate the mass of an element if it is not known to
     * this class (i.e.: if it does not occur in the selected mass list).
     * In that case it will flag an exception. <br>
     * When the built-in amino acid list is in use, the mass of H2O is added
     * once to the result; when the built-in nucleotide list is in use, the
     * mass of H is added and that of PO2 is subtracted once.
     *
     * @param aFormula String with the chemical formula (or bruto
     *                 formula) of the compound in question.
     * @return double with the mass of the compound.
     * @exception UnknownElementMassException when one of the composing elements'
     *                                        mass is unknown to the class.
     * @exception IllegalArgumentException when the formula cannot be parsed
     *                                     (unrecognized character or a missing
     *                                     closing bracket).
     */
    public double calculateMass(String aFormula) throws UnknownElementMassException {
        // Sum of all parts, WITHOUT any terminal correction.
        double mass = this.calculateFormulaMass(aFormula);
        // The terminal correction is applied exactly once per call, here,
        // and never to the bracketed sub-formulae (those are handled by
        // calculateFormulaMass). The helper calculator is only created when
        // it is really needed.
        if (iChosen == MONOAA) {
            MassCalc elementCalc = new MassCalc(MassCalc.MONOELEMENTS);
            mass += elementCalc.calculateMass("H2O");
        } else if (iChosen == MONONUCLEOTIDES) {
            // For nucleotides, add hydrogen (for 5') and subtract PO2 (for 3').
            MassCalc elementCalc = new MassCalc(MassCalc.MONOELEMENTS);
            mass += elementCalc.calculateMass("H");
            mass -= elementCalc.calculateMass("PO2");
        }
        return mass;
    }

    /**
     * This method parses a (sub)formula and sums the masses of all its
     * elements and bracketed inner formulae. No terminal correction
     * (H2O, H/PO2) is applied here, which makes it safe to use for nested
     * inner formulae.
     *
     * @param aFormula String with the (sub)formula to calculate the mass for.
     * @return double with the summed mass of the formula.
     * @exception UnknownElementMassException when one of the composing elements'
     *                                        mass is unknown to the class.
     */
    private double calculateFormulaMass(String aFormula) throws UnknownElementMassException {
        StringCharacterIterator sci = new StringCharacterIterator(aFormula);
        double mass = 0.0;
        // Bruto formula: element symbol -> total multiplicity.
        HashMap<String, Integer> bruto = new HashMap<String, Integer>(sci.getEndIndex());
        char currentChar;
        // Note that iterator advancement is taken care of in the
        // processing methods called from this one!
        while ((currentChar = sci.current()) != StringCharacterIterator.DONE) {
            if (Character.isLetter(currentChar) || ('_' == currentChar) || ('*' == currentChar)) {
                // Start of an element: read symbol and multiplicity and
                // accumulate them in the bruto formula.
                Object[] result = this.getElement(sci);
                this.addResultToBrutoFormula((String) result[ELEMENT],
                        ((Integer) result[MULTIPLICITY]).intValue(),
                        bruto);
            } else if (currentChar == '(') {
                // Start of a bracketed inner formula: its mass is added directly.
                mass += this.getInnerFormulaMass(sci);
            } else {
                // No element start and no bracket: it has got to be an error.
                throw new IllegalArgumentException("Formula '" + aFormula
                        + "' could not be parsed due to the following unrecognized character: '" + currentChar + "'!\n");
            }
        }
        // Add the masses of all elements collected in the bruto formula.
        for (String key : bruto.keySet()) {
            if (!masses.containsKey(key)) {
                // Unknown element! Flag it and be done with it.
                throw new UnknownElementMassException(key);
            }
            double elementMass = ((Number) masses.get(key)).doubleValue();
            int multiplicity = bruto.get(key).intValue();
            mass += multiplicity * elementMass;
        }
        return mass;
    }

    /**
     * This method adds the element with given multiplicity to the
     * HashMap; the element as the key, multiplicity as the value.
     * If the element was already present as a key, the multiplicity
     * is added to the existing multiplicity.
     *
     * @param aElement      String with the element symbol.
     * @param aMultiplicity int with the multiplicity of the element.
     * @param aBruto        HashMap with the bruto formula to add the element and
     *                      multiplicity to.
     */
    private void addResultToBrutoFormula(String aElement, int aMultiplicity, HashMap<String, Integer> aBruto) {
        Integer existing = aBruto.get(aElement);
        int total = (existing == null) ? aMultiplicity : existing.intValue() + aMultiplicity;
        aBruto.put(aElement, Integer.valueOf(total));
    }

    /**
     * This method will read an element symbol and its multiplicity from an
     * SCI. The SCI must be positioned on the first character of the element;
     * afterwards it is positioned on the first character that does not
     * belong to the element or its multiplicity.
     *
     * @param aSCI StringCharacterIterator to read from.
     * @return Object[] with the element behind index ELEMENT and the
     *         multiplicity behind index MULTIPLICITY.
     */
    private Object[] getElement(StringCharacterIterator aSCI) {
        // The element name can consist of one or two letters, the second
        // being lowercase if present.
        StringBuilder element = new StringBuilder();
        element.append(aSCI.current());
        int multiplicity = 1;
        // The element could well be the last in line, in which case the
        // multiplicity is 1 and we're done.
        char next = aSCI.next();
        if (next != StringCharacterIterator.DONE) {
            if (Character.isLetter(next) && Character.isLowerCase(next)) {
                // Second letter of the symbol; move one step further.
                element.append(next);
                next = aSCI.next();
            } else if (next == '<') {
                // It's the start of a modification tag; it becomes part of
                // the element key (brackets included).
                element.append(this.isolateInnerPartString(aSCI, '<', '>', true));
                // The iterator now sits just behind the closing '>', so
                // refresh the look-ahead; otherwise a multiplicity following
                // the tag would be missed.
                next = aSCI.current();
            }
            // A multiplicity is only present if a digit follows.
            if ((next != StringCharacterIterator.DONE) && Character.isDigit(next)) {
                multiplicity = this.getMultiplicity(aSCI);
            }
        }
        Object[] result = new Object[2];
        result[ELEMENT] = element.toString();
        result[MULTIPLICITY] = Integer.valueOf(multiplicity);
        return result;
    }

    /**
     * This method attempts to read a multiplicity starting from the current
     * position in the SCI parameter. If the current character is not a digit,
     * the multiplicity is 1 and the position is left untouched. Otherwise
     * the position is moved to the first non-numerical character that follows
     * the multiplicity (or to the end of the text).
     *
     * @param aSCI StringCharacterIterator to read from.
     * @return int the multiplicity.
     */
    private int getMultiplicity(StringCharacterIterator aSCI) {
        char current = aSCI.current();
        if (!Character.isDigit(current)) {
            // No number here: multiplicity is simply '1'.
            return 1;
        }
        // Collect all consecutive digits that make up the number.
        StringBuilder number = new StringBuilder();
        number.append(current);
        char next = aSCI.next();
        while ((next != StringCharacterIterator.DONE) && Character.isDigit(next)) {
            number.append(next);
            next = aSCI.next();
        }
        return Integer.parseInt(number.toString());
    }

    /**
     * This method will isolate and calculate the mass for the inner formula
     * presented here, starting from the current position in the aSCI.
     * This means that a call to current yields an opening bracket.
     * The position of the SCI will be moved behind the closing bracket of
     * the inner formula and behind its multiplicity, if one is present.
     *
     * @param aSCI StringCharacterIterator with the inner formula.
     * @return double with the mass of the inner formula, multiplied by its
     *         multiplicity.
     * @exception UnknownElementMassException when one of the composing elements'
     *                                        mass is unknown to the class.
     */
    private double getInnerFormulaMass(StringCharacterIterator aSCI) throws UnknownElementMassException {
        // Isolate inner formula String (brackets stripped).
        String inner = this.isolateInnerPartString(aSCI, '(', ')', false);
        // Only the raw mass is wanted here: terminal corrections belong to
        // the complete formula and must not be repeated per bracketed group.
        double mass = this.calculateFormulaMass(inner);
        // A multiplicity may directly follow the closing bracket.
        int multiplicity = this.getMultiplicity(aSCI);
        return mass * multiplicity;
    }

    /**
     * This method will isolate an inner part, if the SCI is
     * currently positioned on the opening token of that inner
     * part. <br>
     * It also allows for nested inner parts! <br>
     * Afterwards the SCI is positioned just behind the matching closing
     * token.
     *
     * @param aSCI        StringCharacterIterator from which to read the
     *                    inner formula.
     * @param aOpener     char with the opening token for the inner part.
     * @param aCloser     char with the closing token for the inner part.
     * @param aKeepTokens boolean that indicates whether the tokens
     *                    should be included in the return String.
     * @return String with the inner formula.
     * @exception IllegalArgumentException when the end of the text is reached
     *                                     before the matching closing token.
     */
    private String isolateInnerPartString(StringCharacterIterator aSCI, char aOpener, char aCloser, boolean aKeepTokens) {
        StringBuilder inner = new StringBuilder();
        // The current character is the opening token; it counts as depth 1
        // and is skipped.
        int depth = 1;
        char c = aSCI.next();
        while (true) {
            if (aSCI.getIndex() >= aSCI.getEndIndex()) {
                // Ran off the end without finding the matching closing
                // token; without this check the loop would never end.
                throw new IllegalArgumentException("Missing closing '" + aCloser
                        + "' for opening '" + aOpener + "'!\n");
            }
            if (c == aOpener) {
                // Nested opening token.
                depth++;
            } else if (c == aCloser) {
                depth--;
                if (depth == 0) {
                    // Matching closing token found; it is not part of the
                    // inner string.
                    break;
                }
            }
            inner.append(c);
            c = aSCI.next();
        }
        // Step past the closing token.
        aSCI.next();
        if (aKeepTokens) {
            return Character.toString(aOpener) + inner + Character.toString(aCloser);
        }
        return inner.toString();
    }

    /**
     * This method loads a properties file and creates a HashMap
     * from this file where elements are keys and values are the
     * masses for the elements. <br>
     * It is assumed (and thus <i>necessary</i>) that these files are located
     * <b>in the classpath</b>. <br>
     * When the file cannot be found or read, an error is logged and an
     * empty HashMap is returned.
     *
     * @param aFilename String with the name of the file.
     * @return HashMap with the key-value pairs (element - mass).
     * @exception NumberFormatException when a value in the file is not a
     *                                  parseable number.
     */
    private HashMap loadMassesFromPropFile(String aFilename) {
        HashMap lMasses = new HashMap();
        InputStream is = this.getClass().getClassLoader().getResourceAsStream(aFilename);
        if (is == null) {
            // Resource is not on the classpath.
            this.logLoadFailure(aFilename);
            return lMasses;
        }
        try {
            Properties tMasses = new Properties();
            tMasses.load(is);
            // Transform the values from Strings into Doubles.
            Iterator iter = tMasses.keySet().iterator();
            while (iter.hasNext()) {
                Object key = iter.next();
                lMasses.put(key, Double.valueOf((String) tMasses.get(key)));
            }
        } catch (IOException ioe) {
            this.logLoadFailure(aFilename);
        } finally {
            // Always release the stream, whatever happened above.
            try {
                is.close();
            } catch (IOException ignored) {
                // Nothing sensible can be done when closing fails.
            }
        }
        return lMasses;
    }

    /**
     * This method logs the standard error message for a mass file that
     * could not be loaded.
     *
     * @param aFilename String with the name of the file that failed to load.
     */
    private void logLoadFailure(String aFilename) {
        logger.error("\n**********************\nUnable to load file '" + aFilename + "' from the classpath.");
        logger.error("All mass calculations based on these masses will throw Exceptions!\n**********************\n");
    }

    // For easy access.
    /**
     * The main method can be used for command-line usage of this class.
     * The parameters should be (a) chemical (or bruto) formula(e) to
     * calculate the mass for. An optional first argument '-a' selects the
     * amino acid masses, '-n' selects the nucleotide masses.
     *
     * @param args String[] at least one chemical formula should be specified,
     *             up to as much as you can cramp into a single
     *             command-line.
     */
    public static void main(String[] args) {
        if (args == null || args.length == 0) {
            logger.error("\nUsage: MassCalc [-a|n] <formula1> [<formula2> ...]\n");
            return;
        }
        int start = 0;
        int elementlist = MassCalc.MONOELEMENTS;
        if (args[0].equals("-a")) {
            start = 1;
            elementlist = MassCalc.MONOAA;
            if (args.length < 2) {
                // Only the flag was given: nothing to calculate.
                logger.error("\nUsage: MassCalc [-a] <formula1> [<formula2> ...]\n");
                return;
            }
        } else if (args[0].equals("-n")) {
            start = 1;
            elementlist = MassCalc.MONONUCLEOTIDES;
            if (args.length < 2) {
                // Only the flag was given: nothing to calculate.
                logger.error("\nUsage: MassCalc [-n] <formula1> [<formula2> ...]\n");
                return;
            }
        }
        MassCalc mc = new MassCalc(elementlist);
        try {
            for (int i = start; i < args.length; i++) {
                logger.info("\nMass for '" + args[i] + "': " + mc.calculateMass(args[i]) + ".");
            }
        } catch (Exception e) {
            logger.error(e.getMessage(), e);
        }
    }
}