/*
* EuroCarbDB, a framework for carbohydrate bioinformatics
*
* Copyright (c) 2006-2009, Eurocarb project, or third-party contributors as
* indicated by the @author tags or express copyright attribution
* statements applied by the authors.
*
* This copyrighted material is made available to anyone wishing to use, modify,
* copy, or redistribute it subject to the terms and conditions of the GNU
* Lesser General Public License, as published by the Free Software Foundation.
* A copy of this license accompanies this distribution in the file LICENSE.txt.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* Last commit: $Rev: 1273 $ by $Author: glycoslave $ on $Date:: 2009-06-26 #$
*/
package org.eurocarbdb.sugar;
// stdlib imports
import java.util.Map;
import java.util.Set;
import java.util.List;
import java.util.HashMap;
import java.util.HashSet;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Arrays;
// 3rd party imports
import org.apache.log4j.Logger;
import com.google.common.collect.Multimap;
import com.google.common.collect.ArrayListMultimap;
// eurocarb imports
import org.eurocarbdb.util.BitSet;
import org.eurocarbdb.sugar.Monosaccharide;
// static imports
import static org.eurocarbdb.sugar.CommonBasetype.*;
import static org.eurocarbdb.sugar.Superclass.*;
import static org.eurocarbdb.sugar.StereoConfig.*;
import static org.eurocarbdb.sugar.RingConformation.*;
import static org.eurocarbdb.sugar.CommonSubstituent.*;
import static org.eurocarbdb.util.StringUtils.join;
import static org.eurocarbdb.util.StringUtils.split;
import static org.eurocarbdb.util.StringUtils.CR;
/**
* Provides various functions for creating, manipulating, and
* querying {@link Basetype}s.
*
* @see CommonBasetype
* @see CustomBasetype
* @author mjh
*/
public final class Basetypes
{
/**
* Constant indicating an unknown basetype. It is an anonymous
* {@link CustomBasetype} subclass whose name is reported as
* <tt>"UNKNOWN"</tt>, and is what {@link #getBasetype(List)} returns
* for an empty list.
*/
public static final Basetype UnknownBasetype = new CustomBasetype()
{
// the name is the only behaviour overridden for this placeholder
public final String getName()
{
return "UNKNOWN";
}
};
/** Logging handle (log4j) for this class. */
static Logger log = Logger.getLogger( Basetypes.class );
/** Uninstantiable: every member of this class is static. */
private Basetypes() {}
/**
* Pre-defined basetypes keyed by {@link #getBasetypeId(Basetype)}, ie: by
* stereochemistry plus chiral positions. Filled by the static initialiser
* of this class.
*/
static final Multimap<Integer,Basetype> basetypesById
= ArrayListMultimap.create( CommonBasetype.values().length * 2, 1 );
/**
* Pre-defined basetypes keyed by {@link #getStereochemicalId(Basetype)},
* ie: by stereochemistry alone. Filled by the static initialiser of this
* class. Lookups into this map must use a stereochemical ID, not a
* basetype ID.
*/
static final Multimap<Integer,Basetype> basetypesByStereochemId
= ArrayListMultimap.create( CommonBasetype.values().length * 2, 2 );
// Index every CommonBasetype that has a defined stereochemistry, together
// with its stereo-inverted counterpart, in both lookup tables.
static
{
    for ( CommonBasetype common : CommonBasetype.values() )
    {
        // basetypes without stereochemistry cannot be given an ID
        if ( common.getStereochemistry() != null )
        {
            basetypesById.put( getBasetypeId( common ), common );
            basetypesByStereochemId.put( getStereochemicalId( common ), common );

            Basetype inverted = getInvertedBasetype( common );
            basetypesById.put( getBasetypeId( inverted ), inverted );
            basetypesByStereochemId.put( getStereochemicalId( inverted ), inverted );
        }
    }
}
/**
 * Returns a {@link Basetype} corresponding to the given String name.
 * <p>
 * The name is first looked up with {@link CommonBasetype#forName}; if that
 * yields nothing, the name is handed to a {@link BasetypeParser}. Note that
 * any substituents the parser extracts from the name are not part of the
 * returned value -- only the parsed basetype is returned.
 * </p>
 *
 * @param name the basetype name; may not be null or empty
 * @return the matching {@link CommonBasetype}, or else the basetype
 *   produced by parsing <tt>name</tt>
 * @throws IllegalArgumentException if <tt>name</tt> is null or zero-length
 */
public static Basetype getBasetype( String name )
{
    if ( name == null || name.length() == 0 )
        throw new IllegalArgumentException(
            "Basetype name can't be null or zero-length");

    Basetype common = CommonBasetype.forName( name );
    if ( common != null )
        return common;

    log.warn("**** still need to move this code ****");

    BasetypeParser parser = new BasetypeParser( name );
    parser.parse();

    // parser.getSubstituents() is deliberately not consulted here: this
    // method's contract only covers the basetype itself.
    return parser.getBasetype();
}
/**
 * Parser for textual basetype names of the general form
 * <tt>&lt;basetype-text&gt;[;&lt;position&gt;-&lt;substituent-name&gt;]...</tt>.
 * <p>
 * The basetype text (everything before the first ';') consists of one or
 * two '-'-separated {@link CommonBasetype} names, each optionally preceded
 * by a single-letter {@link StereoConfig} and a '-'. Everything after the
 * first ';' is a ';'-separated list of substituents.
 * </p>
 * <p>
 * Usage: construct, call {@link #parse()}, then read {@link #getBasetype()}
 * and {@link #getSubstituents()}.
 * </p>
 */
static class BasetypeParser
{
    /** Current offset into the text; drives parsing and error positions. */
    int pos = 0;

    /** The complete text passed to the constructor. */
    final String input;

    /** The part of {@link #input} before the first ';' (or all of it). */
    final String basetypeText;

    /** The part of {@link #input} after the first ';', or null if none. */
    final String substituentText;

    /** Parsed basetype; null until {@link #parse()} has run. */
    Basetype basetype = null;

    /**
     * Positional substituent list, initialised from the parsed basetype's
     * functional groups; null if the input had no substituent section.
     */
    List<Substituent> substituents = null;

    BasetypeParser( String text )
    {
        this.input = text;

        int separator = text.indexOf(';');
        if ( separator == -1 )
        {
            this.basetypeText = text;
            this.substituentText = null;
        }
        else
        {
            this.basetypeText = text.substring( 0, separator );
            this.substituentText = text.substring( separator + 1 );
        }
    }

    Basetype getBasetype() { return basetype; }

    List<Substituent> getSubstituents() { return substituents; }

    /**
     * Parses the basetype text and then, if present, each substituent.
     *
     * @throws SequenceFormatException if the text is malformed
     */
    void parse()
    {
        this.basetype = parseBasetype();

        if ( substituentText == null )
            return;

        substituents = new ArrayList<Substituent>( basetype.getFunctionalGroups() );

        // error offsets for substituents are relative to the whole input
        pos = basetypeText.length() + 1;

        for ( String substit : split( ';', substituentText ) )
        {
            if ( substit.length() < 3 )
                throw new SequenceFormatException(
                    input, pos + 2, "Expected a Substituent name");

            // exactly two parts are required: a token without any '-' would
            // otherwise fall through to bits.get(1) and fail with an
            // IndexOutOfBoundsException.
            List<String> bits = split( '-', substit );
            if ( bits.size() != 2 )
                throw new SequenceFormatException(
                    input, pos, pos + substit.length() - 1,
                    "Invalid Substituent: expected '<position>-<substituent-name>'");

            int position;
            try
            {
                position = Integer.parseInt( bits.get(0) );
            }
            catch ( NumberFormatException ex )
            {
                throw new SequenceFormatException(
                    input, pos, pos + bits.get(0).length(),
                    "Expected a numeric substituent position" );
            }

            // NOTE(review): assumes forName signals an unknown name by
            // returning null, as CommonBasetype.forName does -- confirm.
            Substituent s = CommonSubstituent.forName( bits.get(1) );
            if ( s == null )
                throw new SequenceFormatException(
                    input, pos, pos + substit.length() - 1,
                    "Unknown substituent name" );

            substitute( position, s );

            pos += substit.length() + 1;
        }
    }

    /**
     * Places substituent <tt>s</tt> at the given position of the
     * substituent list. Only positions currently holding a hydroxyl can be
     * substituted.
     * <p>
     * NOTE(review): <tt>position</tt> is used here as a 0-based index into
     * the functional group list and is also passed unchanged to
     * <tt>removeChiralPosition</tt>, whereas
     * {@link Basetypes#getNormalisedBasetype} passes <tt>index + 1</tt> to
     * that method -- confirm which convention is intended.
     * </p>
     */
    private void substitute( int position, Substituent s )
    {
        if ( position < 0 || position >= basetype.getStereochemistry().length() )
            throw new SequenceFormatException(
                input, pos, "Invalid substituent position");

        Substituent existing = basetype.getFunctionalGroups().get( position );
        if ( existing != OH )
        {
            throw new UnsupportedOperationException(
                "Substitution of non-hydroxy positions not implemented; position="
                + position
                + ", existing substituent="
                + existing
                + ", desired substituent="
                + s
            );
        }

        boolean multiple_basetypes
            = ( (basetype instanceof CustomBasetype)
                && ((CustomBasetype) basetype).getComponentBasetypes().length > 1 );

        if ( multiple_basetypes && s.causesStereoloss() )
        {
            ((CustomBasetype) basetype).removeChiralPosition( position, false );
        }

        // replace in place: List.add(int, E) would insert and shift every
        // later position of this positional list by one.
        substituents.set( position, s );
    }

    /**
     * Parses {@link #basetypeText} into one or two (stereo-config,
     * basetype) pairs and combines them into a single {@link Basetype}.
     */
    private Basetype parseBasetype()
    {
        List<StereoConfig> scs = new ArrayList<StereoConfig>( 4 );
        List<CommonBasetype> bts = new ArrayList<CommonBasetype>( 4 );

        while ( pos < basetypeText.length() )
        {
            StereoConfig sc = null;

            // a single character followed by '-' is a stereo-config prefix
            if ( pos + 1 < basetypeText.length() && basetypeText.charAt( pos + 1 ) == '-' )
            {
                sc = parseStereoConfig();
                pos += 2;
            }

            CommonBasetype cb = parseSingleBasetype();

            // no explicit prefix: fall back to the basetype's own config
            if ( sc == null )
                sc = cb.getStereoConfig();

            scs.add( sc );
            bts.add( cb );
        }

        if ( bts.size() == 1 )
        {
            return Basetypes.getBasetype( scs.get(0), bts.get(0) );
        }
        else if ( bts.size() == 2 )
        {
            return Basetypes.getBasetype(
                scs.get(0), bts.get(0),
                scs.get(1), bts.get(1)
            );
        }
        else
        {
            throw new UnsupportedOperationException(
                "compound basetypes with > 2 bts not yet supported");
        }
    }

    /** Reads the stereo-config letter at the current position. */
    private StereoConfig parseStereoConfig()
    {
        try
        {
            return StereoConfig.forName( basetypeText.charAt(pos) );
        }
        catch ( IllegalArgumentException ex )
        {
            throw new SequenceFormatException(
                input, pos, "Expected a StereoConfig letter" );
        }
    }

    /**
     * Reads one basetype name starting at the current position and ending
     * at the next '-' (or the end of the basetype text), then advances
     * past it.
     */
    private CommonBasetype parseSingleBasetype()
    {
        String name;
        int pos2 = basetypeText.indexOf( '-', pos );
        if ( pos2 == -1 )
        {
            pos2 = basetypeText.length() - 1;
            name = basetypeText.substring( pos );
        }
        else
        {
            name = basetypeText.substring( pos, pos2 );
        }

        if ( name.length() < 3 || name.length() > 8 )
            throw new SequenceFormatException(
                input, pos, pos2, "Invalid basetype name" );

        CommonBasetype cb = CommonBasetype.forName( name );
        if ( cb == null )
            throw new SequenceFormatException(
                input, pos, pos2, "Unknown basetype name" );

        // advance by the text actually consumed (plus the separator), not
        // by the enum constant's name, which need not have the same length
        // as the token that was matched.
        pos += name.length() + 1;

        return cb;
    }
} // end class
/**
 * Returns a {@link Basetype} for the given {@link CommonBasetype} in the
 * requested {@link StereoConfig}.
 *
 * @param sc the desired stereo configuration
 * @param bt the basetype to express in that configuration
 * @return <tt>bt</tt> itself when it is already defined in terms of
 *   <tt>sc</tt>; otherwise a new {@link CustomBasetype} built from
 *   <tt>bt</tt> with its stereochemistry complemented and masked to
 *   <tt>bt</tt>'s chiral positions
 */
public static Basetype getBasetype( StereoConfig sc, CommonBasetype bt )
{
    assert sc != null;
    assert bt != null;

    if ( sc != bt.getStereoConfig() )
    {
        // the enum constant is defined in the other configuration, so
        // build its mirror image: flip every bit, then keep only the
        // positions that are actually chiral.
        StereoConfig[] configs = { sc };
        CommonBasetype[] components = { bt };

        return new CustomBasetype(
            configs,
            components,
            bt.getSuperclass(),
            bt.getStereochemistry().bitComplementEquals().bitwiseAndEquals( bt.getChiralPositions() ),
            bt.getChiralPositions(),
            bt.getFunctionalGroups()
        );
    }

    // already in the requested configuration: the enum constant will do
    return bt;
}
/**
 * Returns a {@link Basetype} with a hydroxyl configuration corresponding
 * to the given conjoined {@link CommonBasetype}s with the given
 * {@link StereoConfig}s, as specified by
 * <a href="http://www.chem.qmul.ac.uk/iupac/2carb/08n09.html#083">
 * IUPAC conventions</a>, with the first CommonBasetype being closest
 * to the carbonyl end.
 * <p>
 * The result is a new {@link CustomBasetype} whose first functional group
 * is a carbonyl and whose remaining functional groups are hydroxyls; all
 * positions except the first and last are marked chiral.
 * </p>
 *
 * @param sc1 stereo configuration for <tt>bt1</tt>
 * @param bt1 first component basetype; expected to be an aldose
 * @param sc2 stereo configuration for <tt>bt2</tt>
 * @param bt2 second component basetype; expected to be an aldose
 * @return the combined basetype
 */
public static Basetype getBasetype(
    StereoConfig sc1, CommonBasetype bt1,
    StereoConfig sc2, CommonBasetype bt2
)
{
    assert sc1 != null;
    assert bt1 != null;
    assert sc2 != null;
    assert bt2 != null;
    assert bt1.isAldose();
    assert bt2.isAldose();

    // getBasetype(sc, bt) already returns bt itself when the config matches
    Basetype b1 = getBasetype( sc1, bt1 );
    Basetype b2 = getBasetype( sc2, bt2 );

    BitSet stereochem1 = chiralStereochemistrySlice( b1 );
    BitSet stereochem2 = chiralStereochemistrySlice( b2 );

    // append stereochems: b1 goes into the highest bits.
    // chiral positions start from the second carbon, as per IUPAC,
    // see http://www.chem.qmul.ac.uk/iupac/2carb/08n09.html#083
    // NOTE(review): the Javadoc says bt1 is closest to the carbonyl end;
    // confirm that agrees with "highest bits" under BitSet.append's ordering.
    BitSet stereochem = new BitSet( 1 );
    stereochem.append( stereochem2, stereochem1, new BitSet( 1 ) );

    // every position bar the two terminal ones is chiral
    BitSet chiral_pos = new BitSet( stereochem.length() );
    chiral_pos.set( 1, chiral_pos.length() - 1 );

    // all-hydroxyl backbone with the carbonyl at the first position
    int size = stereochem.length();
    List<Substituent> func_groups = new ArrayList<Substituent>( size );
    for ( int i = 0; i < size; i++ )
        func_groups.add( OH );
    func_groups.set( 0, Carbonyl );

    return new CustomBasetype(
        new StereoConfig[] { sc1, sc2 },
        new CommonBasetype[] { bt1, bt2 },
        Superclass.forSize( size ),
        stereochem,
        chiral_pos,
        func_groups
    );
}

/**
 * Returns the slice of the given basetype's stereochemistry that spans
 * its lowest to its highest chiral position (inclusive).
 */
private static BitSet chiralStereochemistrySlice( Basetype b )
{
    BitSet chiral = b.getChiralPositions();
    int lo_index = chiral.lowestSetBit();
    int hi_index = chiral.highestSetBit() + 1;

    // the chiral positions are expected to form one contiguous run
    assert hi_index - lo_index == chiral.size();

    return b.getStereochemistry().bitSlice( lo_index, hi_index );
}
/**
 * Combines a list of basetypes into a single {@link Basetype}.
 *
 * @param basetypes zero, one or two basetypes; when two are given, both
 *   are cast to {@link CommonBasetype}
 * @return {@link #UnknownBasetype} for an empty list, the sole element of
 *   a one-element list, or the conjoined basetype of a two-element list
 * @throws UnsupportedOperationException if more than two basetypes are given
 */
public static final Basetype getBasetype( List<Basetype> basetypes )
{
    switch ( basetypes.size() )
    {
        case 0:
            return UnknownBasetype;

        case 1:
            return basetypes.get( 0 );

        case 2:
        {
            // bit of a hack here: the casts to CommonBasetype only exist
            // to satisfy javac. what should really happen is a new method
            // that accepts any arbitrary list of basetypes.
            Basetype first = basetypes.get( 0 );
            Basetype second = basetypes.get( 1 );
            return getBasetype(
                first.getStereoConfig(),
                (CommonBasetype) first,
                second.getStereoConfig(),
                (CommonBasetype) second
            );
        }

        default:
            throw new UnsupportedOperationException(
                "more than 2 conjoined basetype temporarily not supported");
    }
}
/**
 * Returns the given basetype expressed in the opposite
 * {@link StereoConfig}, as computed by {@link StereoConfig#invert}.
 *
 * @param b the basetype to invert
 * @return the result of {@link #getBasetype(StereoConfig, CommonBasetype)}
 *   for the inverted configuration
 */
public static final Basetype getInvertedBasetype( CommonBasetype b )
{
    return getBasetype( StereoConfig.invert( b.getStereoConfig() ), b );
}
/**
 * Inverts a stereochemistry by XOR-ing it with the chiral position mask,
 * so that only chiral positions are flipped.
 * NOTE(review): <tt>bitwiseXorEquals</tt> presumably modifies
 * <tt>stereochem</tt> in place -- confirm before passing shared instances.
 *
 * @param stereochem the stereochemistry bits to invert
 * @param chiralPos mask of the positions that are chiral
 * @return the value returned by <tt>stereochem.bitwiseXorEquals( chiralPos )</tt>
 */
static final BitSet getInvertedStereochemistry( BitSet stereochem, BitSet chiralPos )
{
    BitSet inverted = stereochem.bitwiseXorEquals( chiralPos );
    return inverted;
}
/**
* Whether trace-level logging was enabled when this class was initialised.
* The value is captured once; later changes to the logger's level are not
* reflected here.
*/
static final boolean tracing = log.isTraceEnabled();
/**
* Whether debug-level logging was enabled when this class was initialised.
* The value is captured once; later changes to the logger's level are not
* reflected here.
*/
static final boolean debugging = log.isDebugEnabled();
/**
*<p>
* Normalises an input {@link Basetype} and list of
* {@link Substituent}s, preserving Basetype stereochemistry
* at the expense of altering the {@link Superclass} (if
* substituents in the passed list would cause a loss of
* stereochemistry).
*</p>
*<p>
* The returned Basetype may also be different than the given
* Basetype if:
*<ol>
* <li>Substituents in the parameter list cause a gain or loss
* of stereocentres, in which case the Basetype stereochemistry
* is preserved and the {@link Superclass} adjusted accordingly</li>
*
* <li>Substituents extracted from the parameter list
* would result in a Basetype that better matches the list of
* known basetypes (ie: the enum of {@link CommonBasetype}s).
* If such a match occurs, Substituents that match the new,
* returned Basetype will be removed from the passed list.</li>
*</ol>
*</p>
*<p>
* For example:
*<pre>
* import static org.eurocarbdb.sugar.CommonBasetype.Glc;
* import static org.eurocarbdb.sugar.CommonSubstituent.NAc;
* import static org.eurocarbdb.sugar.Basetypes.getNormalisedBasetype;
*
* List&lt;Substituent&gt; subs = Arrays.asList( null, NAc, null, null, null, null );
* Basetype b = getNormalisedBasetype( Glc, subs );
*
* System.out.println( b ); // prints "GlcNAc"
* System.out.println( subs ); // prints "[null, null, null, null, null, null]"
*</pre>
*</p>
*
* @param b the basetype to normalise; returned unchanged when
* <tt>substits</tt> is null, empty, or holds only nulls
* @param substits positional list of substituents, where index <tt>i</tt>
* lines up with index <tt>i</tt> of the basetype's functional groups and
* null means "no substituent". This list is modified: entries
* absorbed into the returned basetype are set to null, so it must
* support {@link List#set}.
* @return the passed basetype, a modified copy of it, or a pre-defined
* basetype with matching stereochemistry and functional groups
*/
public static final Basetype
getNormalisedBasetype( Basetype b, List<Substituent> substits )//, boolean preserveSuperclass )
{
// nothing to normalise against
if ( substits == null || substits.size() == 0 )
return b;
if ( debugging )
{
log.debug(
"normalising basetype " + b + " to substituents: "
+ substits
);
if ( tracing )
{
log.trace(
"before considering substituents, basetype is:\n"
+ describe( b )
);
}
}
// NOTE(review): this initial value of fgs is never read; fgs is
// reassigned from the working copy before its first use further down.
List<Substituent> fgs = b.getFunctionalGroups();
// all modifications are made to a working copy, never to b itself
CustomBasetype cb = CustomBasetype.clone( b );
boolean basetype_changed = false;
boolean substits_is_all_nulls = true;
// pass 1: fold stereoloss-causing substituents at chiral positions
// into the working copy.
for ( int i = 0; i < substits.size(); i++ )
{
if ( substits.get(i) == null )
continue;
substits_is_all_nulls = false;
boolean need_to_alter_superclass
= ( substits.get(i).causesStereoloss()
&& cb.getChiralPositions().get(i) );
if ( need_to_alter_superclass )
{
// removeChiralPosition is given i+1 whereas the functional group
// list is indexed with i.
cb.removeChiralPosition( i+1, false );
cb.getFunctionalGroups().set( i, substits.get(i) );
basetype_changed = true;
// this next line doesn't really affect this method,
// but has important implications for the caller.
// if not nulled out, then the caller sees all the
// substituents that have been effectively added to
// the basetype still in their substituent list.
substits.set( i, null );
}
}
// if the passed substituents list has no substituents in it,
// just return the originally passed basetype.
if ( substits_is_all_nulls )
{
if ( tracing )
log.trace("no substituents defined in array");
return b;
}
// (substits_is_all_nulls is necessarily false from here on)
if ( tracing && ! substits_is_all_nulls )
{
log.trace(
"after considering stereoloss substits, derived basetype is:\n"
+ describe( cb )
);
}
// else examine the current working basetype to see if there is
// an existing CommonBasetype that matches the stereochemistry
// of the passed basetype plus passed substituents.
int id = getBasetypeId( cb );
Collection<Basetype> equivalent_bts = basetypesById.get( id );
if ( debugging )
{
log.debug(
"CommonBasetypes with matching stereochemistry: "
+ equivalent_bts
);
}
// no pre-defined basetype shares this stereochemistry: hand back the
// working copy if pass 1 changed it, otherwise the original.
if ( equivalent_bts.size() == 0 )
return basetype_changed ? cb : b;
// CommonBasetype best_match = null;
Basetype best_match = null;
// record which basetypes hit and use the one that matches the
// most number of substituents in the passed list.
int max_hits = 0;
fgs = cb.getFunctionalGroups();
BitSet best_match_indexes = null;
// scratch set of substituent indexes matched by the current candidate
BitSet matched = new BitSet( substits.size() );
basetypes: for ( Basetype known_bt : equivalent_bts )
{
matched.clear();
// for ( int i : not_null_indexes )
func_groups: for ( int i = 0; i < fgs.size(); i++ )
{
if ( i < substits.size() && substits.get(i) != null )
{
// position carries a pending substituent: it counts as a hit
// only if the candidate already has that very group there.
// if ( fgs.get(i) == substits[i] )
if ( known_bt.getFunctionalGroups().get(i) == substits.get(i) )
matched.set(i);
}
else
{
// position without a pending substituent: the candidate must
// agree with the working copy, else the candidate is rejected.
if ( fgs.get(i) != known_bt.getFunctionalGroups().get(i) )
// break;
continue basetypes;
}
}
// strictly greater than: a candidate matching no substituent at all
// never becomes the best match (max_hits starts at 0).
// presumably BitSet.size() is the number of set bits -- TODO confirm
if ( matched.size() > max_hits )
{
best_match = known_bt;
max_hits = matched.size();
best_match_indexes = matched.clone();
}
}
// if no better match, return the Basetype we have been working with
if ( best_match == null )
{
if ( tracing )
log.trace("no CommonBasetypes with matching stereochemistry "
+ "as well as functional groups, returning derived basetype");
return basetype_changed ? cb : b;
}
if ( debugging )
{
log.debug(
"derived basetype matches "
+ best_match
+ ", returning it..."
);
}
// else there is a better match: remove the substituents from the
// passed list that match the matched basetype, and return the
// matched basetype.
for ( int i : best_match_indexes )
substits.set( i, null );
return best_match;
}
/**
 * Returns an ID for a {@link Basetype} derived from that Basetype's
 * stereochemistry and chiral positions only -- basetypes that differ
 * merely in substitutions that do not affect the stereochemistry
 * share the same ID.
 *
 * @param b the basetype to identify
 * @return the ID, or 0 if the basetype has no stereochemistry defined
 * @see #getEquivalentBasetypes
 */
public static final int getBasetypeId( Basetype b )
{
    BitSet stereochem = b.getStereochemistry();

    // some CommonBasetypes have null values (temporary)
    return ( stereochem == null )
        ? 0
        : getBasetypeId( stereochem, b.getChiralPositions() );
}
/**
 * Derives a basetype ID from the given stereochemistry and chiral
 * positions: the integer value of a copy of <tt>stereochem</tt> with
 * <tt>chiralPositions</tt> appended. The stereochemistry argument is
 * cloned first, so it is left untouched.
 */
static final int getBasetypeId( BitSet stereochem, BitSet chiralPositions )
{
    BitSet combined = stereochem.clone();
    combined.append( chiralPositions );

    int id = combined.intValue();
    return id;
}
/**
 * Returns an ID that identifies the stereochemistry of the
 * passed {@link Basetype}, irrespective of functional groups and
 * the location of chiral positions.
 * <p>
 * Basetypes without a defined stereochemistry yield 0, mirroring
 * {@link #getBasetypeId(Basetype)}, rather than failing with a
 * NullPointerException. IDs derived from an actual stereochemistry
 * always carry a marker bit, so they are not expected to be 0.
 * </p>
 *
 * @param b the basetype to identify
 * @return the stereochemical ID, or 0 if <tt>b</tt> has no stereochemistry
 */
public static final int getStereochemicalId( Basetype b )
{
    BitSet stereochem = b.getStereochemistry();

    // some CommonBasetypes have null values (temporary)
    if ( stereochem == null )
        return 0;

    return getStereochemicalId( stereochem, b.getChiralPositions() );
}
/**
 * Derives a stereochemical ID from the given arguments: the integer value
 * of the stereochemistry sliced by the chiral positions, OR-ed with a
 * marker bit just above the slice (<tt>1 &lt;&lt; length</tt>) so that
 * slices of different lengths yield different IDs.
 */
static final int getStereochemicalId( BitSet stereochem, BitSet chiralPositions )
{
    BitSet chiralBits = stereochem.bitSlice( chiralPositions );

    int value = chiralBits.intValue();
    int lengthMarker = 1 << chiralBits.length();

    return value | lengthMarker;
}
/**
 * Returns the pre-defined {@link Basetype}s that share both the chiral
 * position layout and the stereochemistry of the passed {@link Basetype}.
 * Functional groups are not considered at all: only (1) which positions
 * are chiral, and (2) the stereochemistry of those positions. Accordingly,
 * the D forms of {@code Glc}, {@code GlcNAc} and {@code GlcN} are
 * equivalent according to this method, but D-{@code Ara} and
 * D-{@code Fru} are not, despite the latter having identical
 * stereochemistry.
 *
 * @param b the basetype to find equivalents for
 * @return a new {@link Set} of the basetypes registered under
 *   {@link #getBasetypeId(Basetype)} of <tt>b</tt>; possibly empty
 */
public static final Set<Basetype> getEquivalentBasetypes( Basetype b )
{
    int id = getBasetypeId( b );
    Collection<Basetype> registered = basetypesById.get( id );

    return new HashSet<Basetype>( registered );
}
/**
 * Returns a {@link Set} of pre-defined {@link Basetype}s that have identical
 * stereochemistry, irrespective of actual chiral positions and functional
 * groups. So {@code Glc}, {@code GlcNAc}, and {@code GlcN} are
 * stereochemically identical, as are D-{@code Ara} and D-{@code Fru}.
 *
 * @param b the basetype to find stereochemical equals for
 * @return a new {@link Set} of matching basetypes; empty if there are
 *   none, or if <tt>b</tt> has no stereochemistry defined
 */
public static final Set<Basetype> getStereochemicallyEqualBasetypes( Basetype b )
{
    // basetypes without stereochemistry are never indexed
    if ( b.getStereochemistry() == null )
        return new HashSet<Basetype>();

    // the index is keyed by stereochemical ID (see the static initialiser),
    // so the lookup key must be a stereochemical ID too -- a basetype ID
    // lives in a different key space and would not find the right entries.
    int stereochem_id = getStereochemicalId(
        b.getStereochemistry(), b.getChiralPositions() );

    return new HashSet<Basetype>(
        basetypesByStereochemId.get( stereochem_id ) );
}
/**
 * Returns the {@link StereoConfig} of the given basetype. This currently
 * just delegates to the basetype's own <tt>getStereoConfig()</tt>.
 */
static final StereoConfig determineStereoConfig( Basetype b )
{
    StereoConfig config = b.getStereoConfig();
    return config;
}
/**
 * Returns the {@link Superclass} whose size equals the length of the
 * given basetype's stereochemistry.
 */
static final Superclass determineSuperclass( Basetype b )
{
    int size = b.getStereochemistry().length();
    return Superclass.forSize( size );
}
/**
 * Builds a multi-line, human-readable summary of the passed
 * {@link Basetype}, mainly useful for debugging purposes.
 * The String returned has the following form:
 *<pre>
 *   CommonBasetype=D-GlcNAc
 *   name: D-GlcNAc, fullname: N-acetylglucosamine, superclass: Hexose=6
 *   stereochemistry : [false, true, false, true, true, false]
 *   chiral indexes : [false, true, true, true, true, false]
 *   functional groups : [Carbonyl, NAc, OH, OH, OH, OH]
 *   fischer:
 *
 *          1: Carbonyl
 *          |
 *          +---- 2
 *          |
 *     3 ----+
 *          |
 *          +---- 4
 *          |
 *          +---- 5
 *          |
 *          6
 *
 *</pre>
 * Some notes: a stereochemical value of 'true' means projected RIGHT
 * in a Fischer projection (same side as C2 of glyceraldehyde).
 *
 * @param b the basetype to describe
 * @return the description, with lines separated by <tt>CR</tt>
 */
public static final String describe( Basetype b )
{
    StringBuilder out = new StringBuilder();

    // header lines: implementing class, then naming details
    out.append( b.getClass().getSimpleName() + "=" + b ).append( CR );
    out.append( "name: " + b.getName() )
       .append( ", fullname: " + b.getFullName() )
       .append( ", superclass: " + b.getSuperclass() )
       .append( CR );

    boolean[] stereo = b.getStereochemistry().toBitArray();
    boolean[] chiral = b.getChiralPositions().toBitArray();
    List<Substituent> chem = b.getFunctionalGroups();

    // raw per-position data
    out.append( "stereochemistry : " ).append( Arrays.toString(stereo) ).append( CR );
    out.append( "chiral indexes : " ).append( Arrays.toString(chiral) ).append( CR );
    out.append( "functional groups : " + chem ).append( CR );
    out.append( "fischer: " ).append( CR ).append( CR );

    assert stereo.length == chiral.length;
    assert stereo.length == chem.size();

    // fischer projection, one carbon per row, joined by a backbone line
    for ( int i = 0; i < stereo.length; i++ )
    {
        final int carbon = i + 1;

        if ( i > 0 )
            out.append( " |" ).append( CR );

        if ( ! chiral[i] )
        {
            // achiral: just the carbon number, annotated with its
            // functional group when that is anything but a hydroxyl
            out.append( " " ).append( carbon );

            Substituent group = chem.get(i);
            if ( group != OH )
                out.append( ": " + group );

            out.append( CR );
            continue;
        }

        if ( stereo[i] )
        {
            // fischer right
            out.append( " +---- " ).append( carbon ).append( CR );
        }
        else
        {
            // fischer left
            out.append( " " ).append( carbon ).append( " ----+ " ).append( CR );
        }
    }

    out.append( CR );
    return out.toString();
}
} // end class